Learning and deploying counting grids service code

This commit is contained in:
Spencer Buja 2019-06-18 17:54:38 -07:00
Parent 230c06ccee
Commit 99f4d5367a
28 changed files: 3409 additions and 0 deletions

5
Batch/.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,5 @@
{
"python.linting.pylintEnabled": true,
"python.linting.enabled": true,
"python.linting.lintOnSave": true,
}

292
Batch/Batch/.gitignore vendored Normal file
View File

@@ -0,0 +1,292 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
src/keys.json
src/metadata.json
src/metadata_other.json
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
**/Properties/launchSettings.json
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to checkin your web deploy settings
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/packages/*
# except build/, which is used as an MSBuild target.
!**/packages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/packages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Typescript v1 declaration files
typings/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# CodeRush
.cr/
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs

View File

@@ -0,0 +1,34 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from jobStatus import JobStatus
class BatchJob():
def __init__(self, id_in: str, jobStatus_in, progress_in: int):
self.id = id_in
self.jobStatus = jobStatus_in
self.progress = progress_in
def next(self):
if self.jobStatus.value in [JobStatus.NotStarted, JobStatus.PreProcessing, JobStatus.Training]:
self.jobStatus = JobStatus(self.jobStatus.value + 1)
else:
raise ValueError("Invalid job status value.")
def makeProgress(self, progress: int):
if progress < 0 or progress > 100:
raise ValueError("Invalid progress value.")
else:
self.progress = progress
if __name__ == "__main__":
b = BatchJob("", JobStatus.NotStarted, 0)
b.next()
assert(b.jobStatus == JobStatus.PreProcessing)
b.makeProgress(5)
assert(b.progress == 5)
import json
print(json.dumps(b.__dict__))

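The BatchJob class above, together with JobStatus defined further down in jobStatus.py, forms a small state machine (NotStarted, PreProcessing, Training, then Success or Failure) whose __dict__ is serialized as the heartbeat payload. A minimal usage sketch, not part of the commit, assuming the files are importable as the batchJob and jobStatus modules referenced elsewhere in this change:

import json

from batchJob import BatchJob
from jobStatus import JobStatus

job = BatchJob("job-123", JobStatus.NotStarted, 0)   # "job-123" is an illustrative id
for expected in (JobStatus.PreProcessing, JobStatus.Training):
    job.next()                        # NotStarted -> PreProcessing -> Training
    assert job.jobStatus == expected
job.makeProgress(40)                  # progress is validated to stay within [0, 100]
print(json.dumps(job.__dict__))       # IntEnum members serialize as plain integers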
View File

@@ -0,0 +1,272 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import datetime
import io
import os
import sys
import time
from datetime import timedelta
import uuid
import azure.storage.blob as azureblob
import azure.batch.batch_service_client as batch
import azure.batch.models as batchmodels
sys.path.append('.')
sys.path.append('..')
def query_yes_no(question, default="yes"):
"""
Prompts the user for yes/no input, displaying the specified question text.
:param str question: The text of the prompt for input.
:param str default: The default if the user hits <ENTER>. Acceptable values
are 'yes', 'no', and None.
:rtype: str
:return: 'yes' or 'no'
"""
valid = {'y': 'yes', 'n': 'no'}
if default is None:
prompt = ' [y/n] '
elif default == 'yes':
prompt = ' [Y/n] '
elif default == 'no':
prompt = ' [y/N] '
else:
raise ValueError("Invalid default answer: '{}'".format(default))
while True:
choice = input(question + prompt).lower()
if default and not choice:
return default
try:
return valid[choice[0]]
except (KeyError, IndexError):
print("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
def print_batch_exception(batch_exception):
"""
Prints the contents of the specified Batch exception.
:param batch_exception: The Batch exception (e.g. a BatchErrorException) whose details should be printed.
"""
print('-------------------------------------------')
print('Exception encountered:')
if batch_exception.error and \
batch_exception.error.message and \
batch_exception.error.message.value:
print(batch_exception.error.message.value)
if batch_exception.error.values:
print()
for mesg in batch_exception.error.values:
print('{}:\t{}'.format(mesg.key, mesg.value))
print('-------------------------------------------')
def download_file_from_container(block_blob_client, container_name, file_path, blob_name):
"""
Downloads the named blob from the given Azure Blob storage container to the local path `file_path`.
"""
print("\nDownloading blob to " + file_path)
block_blob_client.get_blob_to_path(container_name, blob_name, file_path)
def upload_file_to_container(block_blob_client, container_name, file_path):
"""
Uploads a local file to an Azure Blob storage container.
:param block_blob_client: A blob service client.
:type block_blob_client: `azure.storage.blob.BlockBlobService`
:param str container_name: The name of the Azure Blob storage container.
:param str file_path: The local path to the file.
:rtype: `azure.batch.models.ResourceFile`
:return: A ResourceFile initialized with a SAS URL appropriate for Batch
tasks.
"""
blob_name = os.path.basename(file_path)
print('Uploading file {} to container [{}]...'.format(file_path,
container_name))
block_blob_client.create_blob_from_path(container_name,
blob_name,
file_path)
# Obtain the SAS token for the container.
sas_token = get_container_sas_token(block_blob_client,
container_name, azureblob.BlobPermissions.READ)
sas_url = block_blob_client.make_blob_url(container_name,
blob_name,
sas_token=sas_token)
return batchmodels.ResourceFile(file_path=blob_name,
blob_source=sas_url)
def get_container_sas_token(block_blob_client,
container_name, blob_permissions):
"""
Obtains a shared access signature granting the specified permissions to the
container.
:param block_blob_client: A blob service client.
:type block_blob_client: `azure.storage.blob.BlockBlobService`
:param str container_name: The name of the Azure Blob storage container.
:param BlobPermissions blob_permissions:
:rtype: str
:return: A SAS token granting the specified permissions to the container.
"""
# Obtain the SAS token for the container, setting the expiry time and
# permissions. In this case, no start time is specified, so the shared
# access signature becomes valid immediately. Expiration is in 2 hours.
container_sas_token = \
block_blob_client.generate_container_shared_access_signature(
container_name,
permission=blob_permissions,
expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=2))
return container_sas_token
def get_container_sas_url(block_blob_client,
container_name, blob_permissions):
"""
Obtains a shared access signature URL that provides write access to the
output container to which the tasks will upload their output.
:param block_blob_client: A blob service client.
:type block_blob_client: `azure.storage.blob.BlockBlobService`
:param str container_name: The name of the Azure Blob storage container.
:param BlobPermissions blob_permissions:
:rtype: str
:return: A SAS URL granting the specified permissions to the container.
"""
# Obtain the SAS token for the container.
sas_token = get_container_sas_token(block_blob_client,
container_name, azureblob.BlobPermissions.WRITE)
# Construct the SAS URL for the container.
# Note: _STORAGE_ACCOUNT_NAME must be defined at module level before this is called.
container_sas_url = "https://{}.blob.core.windows.net/{}?{}".format(
_STORAGE_ACCOUNT_NAME, container_name, sas_token)
return container_sas_url
def create_pool(batch_service_client, pool_id, vm_size, imageName, versions, auto_scale_formula):
"""
Creates a pool of compute nodes with the specified OS settings.
:param batch_service_client: A Batch service client.
:type batch_service_client: `azure.batch.BatchServiceClient`
:param str pool_id: An ID for the new pool.
:param str vm_size: The size of the VMs in the pool.
:param str imageName: The name of the custom VM image used for the pool nodes.
:param list versions: The application package versions to deploy to the pool.
:param str auto_scale_formula: The auto-scale formula applied to the pool.
"""
print('Creating pool [{}]...'.format(pool_id))
new_pool = batch.models.PoolAddParameter(
id=pool_id,
virtual_machine_configuration=batchmodels.VirtualMachineConfiguration(
image_reference=batchmodels.ImageReference(
virtual_machine_image_id="/subscriptions/{}/resourceGroups/{}/providers/Microsoft.Compute/images/{}".format(
"ad49354a-6ce2-4dae-a51d-b6907372f608", "BrowseCloud", imageName)
),
node_agent_sku_id="batch.node.windows amd64"),
vm_size=vm_size,
start_task=None,
enable_auto_scale=True,
auto_scale_formula=auto_scale_formula,
application_package_references=[batchmodels.ApplicationPackageReference(
application_id="browsecloudtrainer", version=version) for version in versions],
auto_scale_evaluation_interval=timedelta(
minutes=5) # the smallest evaluation interval
)
batch_service_client.pool.add(new_pool)
def create_job(batch_service_client, job_id, pool_id):
"""
Creates a job with the specified ID, associated with the specified pool.
:param batch_service_client: A Batch service client.
:type batch_service_client: `azure.batch.BatchServiceClient`
:param str job_id: The ID for the job.
:param str pool_id: The ID for the pool.
"""
print('Creating job [{}]...'.format(job_id))
job = batch.models.JobAddParameter(
id=job_id,
pool_info=batch.models.PoolInformation(pool_id=pool_id))
batch_service_client.job.add(job)
def add_training_task(batch_service_client, job_id, command, displayName, containerName, blobName):
"""
Adds a task to the specified job.
:param batch_service_client: A Batch service client.
:type batch_service_client: `azure.batch.BatchServiceClient`
:param str job_id: The ID of the job to which to add the task.
:param str command: The command line the task should run.
:param str displayName: The display name of the task.
:param str containerName: The name of the container with the training data.
:param str blobName: The name of the training data blob.
"""
# sample Linux command "/bin/bash -c \"ffmpeg -i {} {} \"
# sample Windows command "cmd /c cd .. & cd C:\Users\chbuja"
GUID = str(uuid.uuid4())[:63]
autouser = batchmodels.AutoUserSpecification(
scope='task', elevation_level='admin')
userId = batchmodels.UserIdentity(auto_user=autouser)
tasks = list()
tasks.append(batch.models.TaskAddParameter(
id='{}'.format(GUID),
command_line=command,
display_name=displayName,
user_identity=userId
))
batch_service_client.task.add_collection(job_id, tasks)
def wait_for_tasks_to_complete(batch_service_client, job_id, timeout):
"""
Returns when all tasks in the specified job reach the Completed state.
:param batch_service_client: A Batch service client.
:type batch_service_client: `azure.batch.BatchServiceClient`
:param str job_id: The id of the job whose tasks should be monitored.
:param timedelta timeout: The duration to wait for task completion. If all
tasks in the specified job do not reach Completed state within this time
period, an exception will be raised.
"""
timeout_expiration = datetime.datetime.now() + timeout
print("Monitoring all tasks for 'Completed' state, timeout in {}..."
.format(timeout), end='')
while datetime.datetime.now() < timeout_expiration:
print('.', end='')
sys.stdout.flush()
tasks = batch_service_client.task.list(job_id)
incomplete_tasks = [task for task in tasks if
task.state != batchmodels.TaskState.completed]
if not incomplete_tasks:
print()
return True
else:
time.sleep(1)
print()
raise RuntimeError("ERROR: Tasks did not reach 'Completed' state within "
"timeout period of " + str(timeout))

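A hypothetical end-to-end sketch, not part of the commit, of how the helpers above compose when run in the same module: upload an input blob, create a job on an existing pool, queue a training task, and wait for completion. The account names, keys, pool and job ids, endpoint URL, and the shared-key auth choice are placeholders; the task command mirrors the example CLI documented in generateCountingGridsFromAzure.py later in this commit, and the real task command may differ.

import datetime

import azure.batch.batch_auth as batchauth
import azure.batch.batch_service_client as batch
import azure.storage.blob as azureblob

# Placeholder credentials; the deployment script in this commit uses
# ServicePrincipalCredentials instead of a shared key.
blob_client = azureblob.BlockBlobService(
    account_name="<storage-account>", account_key="<storage-key>")
batch_client = batch.BatchServiceClient(
    batchauth.SharedKeyCredentials("<batch-account>", "<batch-key>"),
    base_url="https://<batch-account>.<region>.batch.azure.com")

# Upload the training data; returns a ResourceFile with a read SAS URL.
resource_file = upload_file_to_container(
    blob_client, "trainingdata", "dictionaryVerySmallSample.txt")

# Queue one training task on an existing pool ("<pool-id>" and "<job-id>" are illustrative).
create_job(batch_client, "<job-id>", "<pool-id>")
add_training_task(
    batch_client, "<job-id>",
    command=("cmd /c python generateCountingGridsFromAzure.py trainingdata 24 5 "
             "numpyEngine simpleInput dictionaryVerySmallSample.txt bighash"),
    displayName="counting-grids-training",
    containerName="trainingdata",
    blobName="dictionaryVerySmallSample.txt")
wait_for_tasks_to_complete(batch_client, "<job-id>", datetime.timedelta(minutes=30))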
View File

@@ -0,0 +1,58 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import json
import adal
from msrestazure.azure_active_directory import AADTokenCredentials
import requests
class BrowseCloudServiceAuthorizer():
'''
Retrieves credentials for backend to send incremental results to the trainer.
Example:
url = 'https://contoso-browsecloud-service.azurewebsites.net/api/v1/documents'
bcsa = BrowseCloudServiceAuthorizer(resource_uri='https://contoso.oncontoso.com/BrowseCloud.Service')
headers = {'Content-Type': "application/json", 'Accept': "application/json", 'Authorization': bcsa.authOutput()}
res = requests.get(url, headers=headers)
print(res.text)
'''
def __init__(self, resource_uri: str):
self.token = ""
self.resource_uri = resource_uri
def retrieveToken(self):
"""
Authenticate using service principal w/ key.
"""
if self.resource_uri != "":
client_id = ""
client_secret = ""
authority_host_uri = ""
tenant = ""
# TODO: get this from key vault
with open("metadata.json", "r") as f:
data = json.load(f)
client_id = data['CLIENT_ID']
client_secret = data['CLIENT_SECRET']
authority_host_uri = data["AUTHORITY_HOST_URI"]
tenant = data["TENANT"]
authority_uri = authority_host_uri + '/' + tenant
resource_uri = self.resource_uri
context = adal.AuthenticationContext(
authority_uri, api_version=None)
self.token = context.acquire_token_with_client_credentials(
resource_uri, client_id, client_secret)
credentials = AADTokenCredentials(self.token, client_id)
# Returns:'Bearer <MY_TOKEN>'
def authOutput(self):
self.retrieveToken()
return 'Bearer ' + self.token['accessToken']

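retrieveToken() above reads its AAD client credentials from a local metadata.json, flagged in the code as a TODO to move into Key Vault. A hedged sketch, not part of the commit, that writes a placeholder file containing the four keys the class reads; every value is illustrative.

import json

# All values are placeholders; retrieveToken() reads exactly these keys.
placeholder_metadata = {
    "CLIENT_ID": "<aad-application-client-id>",
    "CLIENT_SECRET": "<aad-application-client-secret>",
    "AUTHORITY_HOST_URI": "https://login.microsoftonline.com",  # assumed typical ADAL authority host
    "TENANT": "<tenant-id-or-domain>",
}

with open("metadata.json", "w") as f:
    json.dump(placeholder_metadata, f, indent=2)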
View File

@@ -0,0 +1,54 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from jobStatus import JobStatus
import threading
import requests
class CountingGridsHeartBeater():
def __init__(self, url, batchJob, bcServiceAuthorizer):
self.url = url
self.batchJob = batchJob
self.bcServiceAuthorizer = bcServiceAuthorizer
self.RESET_PROGRESS_NUMBER = 0
self.FINISHED_PROGRESS_NUMBER = 100
def workerFactory(self, batchJob, bcServiceAuthorizer):
url = self.url
def worker():
data = batchJob.__dict__
if url != '':
headers = {'Content-Type': "application/json", 'Accept': "application/json",
'Authorization': self.bcServiceAuthorizer.authOutput()}
res = requests.put(self.url, json=data, headers=headers)
print(res)
print(res.text)
else:
print("Faking heartbeat with following batchJob:")
print(data)
return worker
def beat(self):
t = threading.Thread(target=self.workerFactory(
self.batchJob, self.bcServiceAuthorizer), args=())
t.daemon = False
t.start()
def next(self):
self.batchJob.next()
self.batchJob.makeProgress(self.RESET_PROGRESS_NUMBER)
self.beat()
def makeProgress(self, progress: int):
self.batchJob.makeProgress(progress)
self.beat()
def done(self, success: bool):
if success:
self.batchJob.jobStatus = JobStatus.Success
else:
self.batchJob.jobStatus = JobStatus.Failure
self.makeProgress(self.FINISHED_PROGRESS_NUMBER)

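A minimal sketch, not part of the commit, of the heartbeat lifecycle that the training scripts below drive; passing an empty url makes the worker print the serialized BatchJob instead of PUTting it to the service.

from batchJob import BatchJob
from browseCloudServiceAuthorizer import BrowseCloudServiceAuthorizer
from countingGridsHeartBeater import CountingGridsHeartBeater
from jobStatus import JobStatus

job = BatchJob("job-123", JobStatus.NotStarted, 0)            # illustrative id
beater = CountingGridsHeartBeater("", job, BrowseCloudServiceAuthorizer(""))
beater.next()              # NotStarted -> PreProcessing, progress reset to 0
beater.makeProgress(50)    # report partial progress
beater.next()              # PreProcessing -> Training
beater.done(success=True)  # jobStatus set to Success, progress set to 100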
View File

@@ -0,0 +1,127 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from __future__ import print_function
import datetime
import io
import os
import sys
import time
from datetime import timedelta
import json
from browseCloudAzureUtilities import *
import azure.storage.blob as azureblob
import azure.batch.batch_service_client as batch
import azure.batch.batch_auth as batchauth
import azure.batch.models as batchmodels
from azure.batch import BatchServiceClient
from azure.common.credentials import ServicePrincipalCredentials
sys.path.append('.')
sys.path.append('..')
# Update the Batch and Storage account credential strings below with the values
# unique to your accounts. These are used when constructing connection strings
# for the Batch and Storage client objects.
_BATCH_ACCOUNT_NAME = ""
_BATCH_ACCOUNT_KEY = ""
_BATCH_ACCOUNT_URL = ""
_SERVICE_PRINCIPAL_KEY = ""
with open("keys.json", "r") as f:
data = json.load(f)
_BATCH_ACCOUNT_KEY = data['_BATCH_ACCOUNT_KEY']
_SERVICE_PRINCIPAL_KEY = data['_SERVICE_PRINCIPAL_KEY_DEV']
if __name__ == '__main__':
start_time = datetime.datetime.now().replace(microsecond=0)
print('Sample start: {}'.format(start_time))
print()
# Create a Batch service client using Azure AD (service principal) credentials.
# The shared-key alternative below is kept commented out for reference:
'''
credentials = batchauth.SharedKeyCredentials(_BATCH_ACCOUNT_NAME,
_BATCH_ACCOUNT_KEY)
'''
# Have separate applications, deployment locations, scaling formulae, etc., for dev and prod
# This is read from a file
JOB_ID = ""
POOL_VM_SIZE = ""
POOL_ID = ""
IMAGE_NAME = ""
SCALING_FORMULA = ""
VERSIONS = []
TENANT_ID = ""
CLIENT_ID = ""
with open('metadata.json', 'r') as fMeta:
dataMeta = json.load(fMeta)
TENANT_ID = dataMeta["TENANT_ID_DEPLOYMENT_AND_TESTING"]
CLIENT_ID = dataMeta["CLIENT_ID_DEPLOYMENT_AND_TESTING"]
_BATCH_ACCOUNT_NAME = dataMeta["_BATCH_ACCOUNT_NAME"]
_BATCH_ACCOUNT_URL = dataMeta["_BATCH_ACCOUNT_URL"]
if dataMeta['ENV'] == 'DEV':
JOB_ID = dataMeta['JOB_ID_DEV']
POOL_VM_SIZE = dataMeta['POOL_VM_SIZE_DEV']
POOL_ID = dataMeta['POOL_ID_DEV']
IMAGE_NAME = dataMeta['IMAGE_NAME_DEV']
SCALING_FORMULA = dataMeta['SCALING_FORMULA_DEV']
VERSIONS = dataMeta['VERSIONS_DEV']
elif dataMeta['ENV'] == 'PROD':
JOB_ID = dataMeta['JOB_ID_PROD']
POOL_VM_SIZE = dataMeta['POOL_VM_SIZE_PROD']
POOL_ID = dataMeta['POOL_ID_PROD']
IMAGE_NAME = dataMeta['IMAGE_NAME_PROD']
SCALING_FORMULA = dataMeta['SCALING_FORMULA_PROD']
VERSIONS = dataMeta['VERSIONS_PROD']
else:
raise ValueError("Environment type in metadata.json is invalid.")
SECRET = _SERVICE_PRINCIPAL_KEY
credentials = ServicePrincipalCredentials(
client_id=CLIENT_ID,
secret=SECRET,
tenant=TENANT_ID,
resource="https://batch.core.windows.net/"
)
batch_client = batch.BatchServiceClient(
credentials,
base_url=_BATCH_ACCOUNT_URL)
try:
# Create the pool that will contain the compute nodes that will execute the
# tasks.
create_pool(batch_client, POOL_ID, POOL_VM_SIZE,
IMAGE_NAME, VERSIONS, SCALING_FORMULA)
# Create the job that will run the tasks.
create_job(batch_client, JOB_ID, POOL_ID)
print(" Success! All tasks reached the 'Completed' state within the "
"specified timeout period.")
except batchmodels.BatchErrorException as err:
print_batch_exception(err)
raise
# Print out some timing info
end_time = datetime.datetime.now().replace(microsecond=0)
print()
print('Sample end: {}'.format(end_time))
print('Elapsed time: {}'.format(end_time - start_time))
print()
print()
input('Press ENTER to exit...')

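The deployment script above expects two local JSON files that are not checked in: keys.json for secrets and metadata.json for environment settings. A hedged sketch, not part of the commit, that writes placeholder files with the keys the DEV code path reads; every value, including the VM size and auto-scale formula, is illustrative.

import json

# Secrets read near the top of the script; values are placeholders.
placeholder_keys = {
    "_BATCH_ACCOUNT_KEY": "<batch-account-key>",
    "_SERVICE_PRINCIPAL_KEY_DEV": "<service-principal-secret>",
}

# Environment settings read from metadata.json on the DEV branch; values are placeholders.
placeholder_metadata = {
    "ENV": "DEV",
    "TENANT_ID_DEPLOYMENT_AND_TESTING": "<tenant-id>",
    "CLIENT_ID_DEPLOYMENT_AND_TESTING": "<client-id>",
    "_BATCH_ACCOUNT_NAME": "<batch-account-name>",
    "_BATCH_ACCOUNT_URL": "https://<batch-account>.<region>.batch.azure.com",
    "JOB_ID_DEV": "<job-id>",
    "POOL_VM_SIZE_DEV": "STANDARD_D2_V2",           # assumed example VM size
    "POOL_ID_DEV": "<pool-id>",
    "IMAGE_NAME_DEV": "<custom-image-name>",
    "SCALING_FORMULA_DEV": "$TargetDedicatedNodes = 1;",  # assumed minimal auto-scale formula
    "VERSIONS_DEV": ["1.0"],
}

with open("keys.json", "w") as f:
    json.dump(placeholder_keys, f, indent=2)
with open("metadata.json", "w") as f:
    json.dump(placeholder_metadata, f, indent=2)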
View File

@@ -0,0 +1,229 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import hashlib
import json
import datetime
import pandas as pd
import os
import numpy as np
from CountingGridsPy.models import CountingGridModel
import traceback
from browseCloudServiceAuthorizer import BrowseCloudServiceAuthorizer
from countingGridsHeartBeater import CountingGridsHeartBeater
from jobStatus import JobStatus
from batchJob import BatchJob
import azure.storage.blob as azureblob
from browseCloudAzureUtilities import upload_file_to_container, download_file_from_container
import matplotlib.pyplot as plt
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, CGEngineWrapper, NLPCleaner, PipelineTimer
import sys
sys.path.append("../../..")
# CLI: python generateCountingGridsFromAzure.py <input_containername> <extent_size_of_grid_hyperparameter>
# <window_size_of_grid_hyperparameter> <engine_type> <inputfile_type> <inputfile_name> <output_containername>
#
# Example CLI: python generateCountingGridsFromAzure.py trainingdata 24 5 numpyEngine simpleInput dictionaryVerySmallSample.txt bighash
# Assumes: input_containername and output_containername must be <64 characters long.
if __name__ == "__main__":
HEART_BEATER = None
try:
# ---------------------------------------------------------------------------------------
# Input
# ---------------------------------------------------------------------------------------
errStr = "Please give valid command-line arguments."
if len(sys.argv) != 8:
raise ValueError(errStr)
containerNameIn = sys.argv[1]
EXTENT_SIZE = int(sys.argv[2])
WINDOW_SIZE = int(sys.argv[3])
engine_type = sys.argv[4]
inputfile_type = sys.argv[5]
blobName = sys.argv[6]
containerNameOut = sys.argv[7]
if engine_type != "numpyEngine" and engine_type != "matlabEngine":
raise ValueError(
"The {0} engine does not exist. Please use 'matlabEngine' or 'numpyEngine'.".format(engine_type))
engine_type = engine_type[:-6] # 6 characters in the word "Engine"
if inputfile_type != "metadataInput" and inputfile_type != "simpleInput":
raise ValueError(
"The {0} input type does not exist. Please use 'simpleInput' or 'metadataInput'.".format(inputfile_type))
inputfile_type = inputfile_type[:-5] # remove "Input"
# ---------------------------------------------------------------------------------------
# Authentication
# ---------------------------------------------------------------------------------------
AUTH_URL = ""
SERVICE_URL = ""
_STORAGE_ACCOUNT_NAME_IN = 'browsecloudtrainingdata'
_STORAGE_ACCOUNT_KEY_IN = ""
_STORAGE_ACCOUNT_NAME_OUT = 'browsecloudmodelfiles'
_STORAGE_ACCOUNT_KEY_OUT = ""
jobId = containerNameOut
docId = containerNameIn
with open('metadata.json', 'r') as fMeta:
dataMeta = json.load(fMeta)
AUTH_URL = dataMeta["AUTH_URL"]
_STORAGE_ACCOUNT_NAME_IN = dataMeta["_STORAGE_ACCOUNT_NAME_TRAININGDATA"]
_STORAGE_ACCOUNT_NAME_OUT = dataMeta["_STORAGE_ACCOUNT_NAME_MODELS"]
if dataMeta['ENV'] == 'DEV':
# TODO: Use Key Vault and a certificate to retrieve this information
# instead of a temporary keys file.
# Note that keys.json is not checked in.
with open("keys.json", "r") as f:
data = json.load(f)
_STORAGE_ACCOUNT_KEY_IN = data['_STORAGE_ACCOUNT_KEY_DEV']
_STORAGE_ACCOUNT_KEY_OUT = data['_STORAGE_ACCOUNT_KEY_OUT_DEV']
_STORAGE_ACCOUNT_NAME_OUT = data['_STORAGE_ACCOUNT_NAME_OUT_DEV'] if (
'_STORAGE_ACCOUNT_NAME_OUT_DEV' in data
) else _STORAGE_ACCOUNT_NAME_OUT
_STORAGE_ACCOUNT_NAME_IN = data['_STORAGE_ACCOUNT_NAME_IN_DEV'] if (
'_STORAGE_ACCOUNT_NAME_IN_DEV' in data
) else _STORAGE_ACCOUNT_NAME_IN
SERVICE_URL = dataMeta["SERVICE_URL_DEV"] + "/api/v1/jobs/" + \
jobId if "SERVICE_URL_DEV" in dataMeta else SERVICE_URL
elif dataMeta['ENV'] == 'PROD':
with open("keys.json", "r") as f:
data = json.load(f)
_STORAGE_ACCOUNT_KEY_IN = data['_STORAGE_ACCOUNT_KEY_PROD']
_STORAGE_ACCOUNT_KEY_OUT = data['_STORAGE_ACCOUNT_KEY_OUT_PROD']
_STORAGE_ACCOUNT_NAME_OUT = data['_STORAGE_ACCOUNT_NAME_OUT_PROD'] if (
'_STORAGE_ACCOUNT_NAME_OUT_PROD' in data
) else _STORAGE_ACCOUNT_NAME_OUT
_STORAGE_ACCOUNT_NAME_IN = data['_STORAGE_ACCOUNT_NAME_IN_PROD'] if (
'_STORAGE_ACCOUNT_NAME_IN_PROD' in data
) else _STORAGE_ACCOUNT_NAME_IN
SERVICE_URL = dataMeta["SERVICE_URL_PROD"] + "/api/v1/jobs/" + \
jobId if "SERVICE_URL_PROD" in dataMeta else SERVICE_URL
else:
raise ValueError(
"Environment type in metadata.json is invalid.")
BATCH_JOB = BatchJob(jobId, JobStatus.NotStarted, 0)
SERVICE_AUTHORIZER = BrowseCloudServiceAuthorizer(AUTH_URL)
HEART_BEATER = CountingGridsHeartBeater(
SERVICE_URL, BATCH_JOB, SERVICE_AUTHORIZER)
DIRECTORY_SUFFIX = hashlib.sha3_256(
(docId+blobName+str(datetime.datetime.now())).encode()).hexdigest()
DIRECTORY_DATA = blobName.split(".")[0] + "_" + DIRECTORY_SUFFIX
if not os.path.isdir(DIRECTORY_DATA):
os.mkdir(DIRECTORY_DATA)
'''
Algorithm:
1. Get training data from Azure.
2. Run learning code.
3. Write model files to Azure.
Changes between this and dumpCountingGrids.py:
1. DIRECTORY_DIR must be unique.
2. Fetching and writing to Azure. Idea is to fetch into directory and then write it to Azure
'''
blob_client = azureblob.BlockBlobService(
account_name=_STORAGE_ACCOUNT_NAME_IN,
account_key=_STORAGE_ACCOUNT_KEY_IN)
download_file_from_container(
blob_client, containerNameIn, DIRECTORY_DATA+"/"+blobName, blobName)
FILE_NAME = DIRECTORY_DATA + "/" + blobName
CLEAN_DATA_FILE_NAME, MIN_FREQUENCY, MIN_WORDS = [
"/cg-processed.tsv", 2, 5]
# ---------------------------------------------------------------------------------------
# Data Cleaning
# ---------------------------------------------------------------------------------------
cleaner = NLPCleaner()
HEART_BEATER.next()
correspondences = None
CACHED_CORRESPONDENCES_FILE_NAME = "/cached_correspondences.tsv"
pTimer = PipelineTimer()
pTimer("Reading data file.")
df, keep = cleaner.read(
FILE_NAME, inputfile_type, MIN_FREQUENCY, MIN_WORDS)
if not (os.path.exists(DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME) and os.path.exists(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME)):
pTimer("Starting data cleaning.")
cleaner.handle_negation_tokens()
cleaner.removePunctuation()
HEART_BEATER.makeProgress(50)
correspondences = cleaner.lemmatize()
cleaner.write_cached_correspondences(
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
cleaner.write(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME)
else:
pTimer("Skipping data cleaning.")
correspondences = cleaner.read_cached_correspondences(
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
pTimer("Learning counting grid.")
LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
# ---------------------------------------------------------------------------------------
# Learning
# ---------------------------------------------------------------------------------------
engine = CGEngineWrapper(
extent_size=EXTENT_SIZE, window_size=WINDOW_SIZE, heartBeaters=[HEART_BEATER])
HEART_BEATER.next()
vocabulary = None
if not os.path.exists(DIRECTORY_DATA + LEARNED_GRID_FILE_NAME):
vocabulary, keep = engine.fit(
DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS, MIN_FREQUENCY, keep, engine=engine_type)
else:
vocabulary, keep = engine.get_vocab(
DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS, MIN_FREQUENCY, keep)
# ---------------------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------------------
pTimer("Generating counting grid artifacts.")
LINK_FILE_NAME = ""
bcag = BrowseCloudArtifactGenerator(DIRECTORY_DATA)
bcag.read(LEARNED_GRID_FILE_NAME)
bcag.write_docmap(engine.wd_size, engine=engine_type)
bcag.write_counts()
bcag.write_vocabulary(vocabulary)
bcag.write_top_pi()
bcag.write_top_pi_layers()
bcag.write_colors() # write default blue colors
bcag.write_database(df, keep)
bcag.write_correspondences(correspondences, vocabulary)
bcag.write_keep(keep)
pTimer("Done.")
blob_client = azureblob.BlockBlobService(
account_name=_STORAGE_ACCOUNT_NAME_OUT,
account_key=_STORAGE_ACCOUNT_KEY_OUT)
# TODO: if the container name is not unique, fall back to creating additional containers.
blob_client.create_container(containerNameOut)
# Upload each generated file, except the input file, to the output container.
FILES = [f.path for f in os.scandir(DIRECTORY_DATA) if not f.is_dir()]
for modelfile in FILES:
if not modelfile.endswith(blobName):
upload_file_to_container(
blob_client, containerNameOut, modelfile)
except BaseException:
if HEART_BEATER is not None:
HEART_BEATER.done(success=False)
print("Script failed.")
print(traceback.format_exc())
else:
HEART_BEATER.done(success=True)
print("Script succeeded.")

View File

@@ -0,0 +1,201 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import hashlib
import json
import datetime
import pandas as pd
import os
import numpy as np
import requests
import re
import traceback
from itertools import islice
from browseCloudServiceAuthorizer import BrowseCloudServiceAuthorizer
from countingGridsHeartBeater import CountingGridsHeartBeater
from jobStatus import JobStatus
from batchJob import BatchJob
import azure.storage.blob as azureblob
from browseCloudAzureUtilities import upload_file_to_container, download_file_from_container
import matplotlib.pyplot as plt
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, PipelineTimer
import sys
sys.path.append("../../..")
if __name__ == "__main__":
HEART_BEATER = None
try:
errStr = "Please give valid command-line arguments."
if len(sys.argv) != 5:
raise ValueError(errStr)
containerNameIn = sys.argv[1] # targetId
metadataColumnName = sys.argv[2] # metadata column to color by
containerNameOut = sys.argv[3] # id of current job
windowSize = int(sys.argv[4]) # window size
AUTH_URL = ""
SERVICE_URL = ""
_STORAGE_ACCOUNT_NAME = 'browsecloudmodelfiles'
_STORAGE_ACCOUNT_KEY = ''
jobId = containerNameOut
targetId = containerNameIn
with open("keys.json", "r") as f:
data = json.load(f)
_SENTIMENT_ANALYSIS_KEY = data['_SENTIMENT_ANALYSIS_KEY']
with open('metadata.json', 'r') as fMeta:
dataMeta = json.load(fMeta)
AUTH_URL = dataMeta["AUTH_URL"]
if dataMeta['ENV'] == 'DEV':
# TODO: Use Key Vault and a certificate to retrieve this information
# instead of a temporary keys file.
# Note that keys.json is not checked in.
with open("keys.json", "r") as f:
data = json.load(f)
_STORAGE_ACCOUNT_KEY = data['_STORAGE_ACCOUNT_KEY_OUT_DEV']
_STORAGE_ACCOUNT_NAME = data['_STORAGE_ACCOUNT_NAME_OUT_DEV'] if (
'_STORAGE_ACCOUNT_NAME_OUT_DEV' in data
) else _STORAGE_ACCOUNT_NAME
SERVICE_URL = dataMeta["SERVICE_URL_DEV"] + "/api/v1/jobs/" + \
jobId if "SERVICE_URL_DEV" in dataMeta else SERVICE_URL
elif dataMeta['ENV'] == 'PROD':
with open("keys.json", "r") as f:
data = json.load(f)
_STORAGE_ACCOUNT_KEY = data['_STORAGE_ACCOUNT_KEY_OUT_PROD']
_STORAGE_ACCOUNT_NAME = data['_STORAGE_ACCOUNT_NAME_OUT_PROD'] if (
'_STORAGE_ACCOUNT_NAME_OUT_PROD' in data
) else _STORAGE_ACCOUNT_NAME
SERVICE_URL = dataMeta["SERVICE_URL_PROD"] + "/api/v1/jobs/" + \
jobId if "SERVICE_URL_PROD" in dataMeta else SERVICE_URL
else:
raise ValueError(
"Environment type in metadata.json is invalid.")
BATCH_JOB = BatchJob(jobId, JobStatus.NotStarted, 0)
SERVICE_AUTHORIZER = BrowseCloudServiceAuthorizer(AUTH_URL)
HEART_BEATER = CountingGridsHeartBeater(
SERVICE_URL, BATCH_JOB, SERVICE_AUTHORIZER)
# Setup Working Directory.
DIRECTORY_SUFFIX = hashlib.sha3_256(
(targetId+str(datetime.datetime.now())).encode()).hexdigest()
DIRECTORY_DATA = targetId + "_" + DIRECTORY_SUFFIX
if not os.path.isdir(DIRECTORY_DATA):
os.mkdir(DIRECTORY_DATA)
'''
Algorithm:
1. Copy everything over from target container to job container.
2. Separate and properly encode the feature column.
3. Weight documents on grid.
4. Output colors file.
5. Write legend file.
'''
pTimer = PipelineTimer()
pTimer("Downloading Target Job Files.")
# Advance the heart beater to the Training state.
HEART_BEATER.next()
HEART_BEATER.next()
blob_client = azureblob.BlockBlobService(
account_name=_STORAGE_ACCOUNT_NAME,
account_key=_STORAGE_ACCOUNT_KEY)
all_blobs = blob_client.list_blobs(containerNameIn)
for blob in all_blobs:
download_file_from_container(
blob_client, containerNameIn, DIRECTORY_DATA+"/"+blob.name, blob.name)
HEART_BEATER.makeProgress(20)
pTimer("Getting sentiment from Azure.")
# Feature Mapping data
raw_feature = []
def dictionary_from_line(row):
prop_dict = {}
row_components = row.split('\t')
for row_component in row_components:
row_key_value = row_component.split(':')
prop_dict[row_key_value[0]] = row_key_value[1]
return prop_dict
with open(DIRECTORY_DATA+"/database.txt", 'r') as f:
for row in f:
prop_dict = dictionary_from_line(row)
raw_feature.append(prop_dict[metadataColumnName])
# first, try to convert all values to numbers
feature_map = []
labelTuples = []
cm = None
try:
feature_map = np.array([float(x) for x in raw_feature])
labelTuples = [('{0:.3g}'.format(np.min(feature_map)) + " " + metadataColumnName,
'{0:.3g}'.format(np.max(feature_map)) + " " + metadataColumnName)]
feature_map = (feature_map - np.min(feature_map)) / \
(np.max(feature_map) - np.min(feature_map))
cm = plt.get_cmap('cool')
except:
# Not all numbers. Try to do a 2-categorical
set_of_values = set(raw_feature)
if len(set_of_values) != 2:
raise ValueError(
"Selected metadata row is neither numerical nor 2-categorical")
mask = [int(x == raw_feature[0]) for x in raw_feature]
feature_map = np.array(mask)
cm = plt.get_cmap('PiYG')
labelTuples = [
(list(set_of_values - set([raw_feature[0]]))[0], raw_feature[0])]
HEART_BEATER.makeProgress(60)
colorTuples = [(tuple(list(cm(0))[:-1]),
tuple(list(cm(127))[:-1]), tuple(list(cm(255))[:-1]))]
pTimer("Generating counting grid artifacts.")
LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
DOCMAP_FILE_NAME = "/docmap.txt"
bcag = BrowseCloudArtifactGenerator(DIRECTORY_DATA)
bcag.W = [windowSize, windowSize]
bcag.read(LEARNED_GRID_FILE_NAME)
bcag.read_docmap(DOCMAP_FILE_NAME, engine="numpy")
bcag.write_colors(cm=cm, feature_map=feature_map,
stretch_the_truth=True)
bcag.write_legends(labelTuples=labelTuples, colorTuples=colorTuples)
bcag.add_feature_map_to_database(feature_map)
HEART_BEATER.makeProgress(80)
pTimer("Done.")
blob_client.create_container(containerNameOut)
# Upload each generated file to the output container.
FILES = [f.path for f in os.scandir(DIRECTORY_DATA) if not f.is_dir()]
for modelfile in FILES:
upload_file_to_container(blob_client, containerNameOut, modelfile)
HEART_BEATER.makeProgress(100)
except BaseException:
if HEART_BEATER is not None:
HEART_BEATER.done(success=False)
print("Script failed.")
print(traceback.format_exc())
else:
HEART_BEATER.done(success=True)
print("Script succeeded.")

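For reference, a tiny worked example, not part of the commit, of the tab-separated key:value line format in database.txt that dictionary_from_line() above parses; the sample line is illustrative. Note that this parser keeps only the text between the first and second colon, so a value containing ':' would be truncated.

# Same parsing logic as dictionary_from_line() in the script above.
def dictionary_from_line(row):
    prop_dict = {}
    for row_component in row.split('\t'):
        row_key_value = row_component.split(':')
        prop_dict[row_key_value[0]] = row_key_value[1]
    return prop_dict

line = "id:1\ttitle:Counting grids\tlayer:2"
assert dictionary_from_line(line) == {"id": "1", "title": "Counting grids", "layer": "2"}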
View File

@@ -0,0 +1,210 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import hashlib
import json
import datetime
import pandas as pd
import os
import numpy as np
import requests
import re
import traceback
from itertools import islice
from browseCloudServiceAuthorizer import BrowseCloudServiceAuthorizer
from countingGridsHeartBeater import CountingGridsHeartBeater
from jobStatus import JobStatus
from batchJob import BatchJob
import azure.storage.blob as azureblob
from browseCloudAzureUtilities import upload_file_to_container, download_file_from_container
import matplotlib.pyplot as plt
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, PipelineTimer
import sys
sys.path.append("../../..")
if __name__ == "__main__":
HEART_BEATER = None
try:
errStr = "Please give valid command-line arguments."
if len(sys.argv) != 4:
raise ValueError(errStr)
containerNameIn = sys.argv[1] # targetId
containerNameOut = sys.argv[2] # id of current job
windowSize = int(sys.argv[3]) # window size
AUTH_URL = ""
SERVICE_URL = ""
_STORAGE_ACCOUNT_NAME = ""
_STORAGE_ACCOUNT_KEY = ""
jobId = containerNameOut
targetId = containerNameIn
with open("keys.json", "r") as f:
data = json.load(f)
_SENTIMENT_ANALYSIS_KEY = data['_SENTIMENT_ANALYSIS_KEY']
with open('metadata.json', 'r') as fMeta:
dataMeta = json.load(fMeta)
AUTH_URL = dataMeta["AUTH_URL"]
_STORAGE_ACCOUNT_NAME = dataMeta["_STORAGE_ACCOUNT_NAME_MODELS"]
SENTIMENT_ANALYSIS_ENDPOINT = dataMeta["SENTIMENT_ANALYSIS_ENDPOINT"]
if dataMeta['ENV'] == 'DEV':
# TODO: Use Key Vault and a certificate to retrieve this information
# instead of a temporary keys file.
# Note that keys.json is not checked in.
with open("keys.json", "r") as f:
data = json.load(f)
_STORAGE_ACCOUNT_KEY = data['_STORAGE_ACCOUNT_KEY_OUT_DEV']
_STORAGE_ACCOUNT_NAME = data['_STORAGE_ACCOUNT_NAME_OUT_DEV'] if (
'_STORAGE_ACCOUNT_NAME_OUT_DEV' in data
) else _STORAGE_ACCOUNT_NAME
SERVICE_URL = dataMeta["SERVICE_URL_DEV"] + "/api/v1/jobs/" + \
jobId if "SERVICE_URL_DEV" in dataMeta else SERVICE_URL
elif dataMeta['ENV'] == 'PROD':
with open("keys.json", "r") as f:
data = json.load(f)
_STORAGE_ACCOUNT_KEY = data['_STORAGE_ACCOUNT_KEY_OUT_PROD']
_STORAGE_ACCOUNT_NAME = data['_STORAGE_ACCOUNT_NAME_OUT_PROD'] if (
'_STORAGE_ACCOUNT_NAME_OUT_PROD' in data
) else _STORAGE_ACCOUNT_NAME
SERVICE_URL = dataMeta["SERVICE_URL_PROD"] + "/api/v1/jobs/" + \
jobId if "SERVICE_URL_PROD" in dataMeta else SERVICE_URL
else:
raise ValueError(
"Environment type in metadata.json is invalid.")
BATCH_JOB = BatchJob(jobId, JobStatus.NotStarted, 0)
SERVICE_AUTHORIZER = BrowseCloudServiceAuthorizer(AUTH_URL)
HEART_BEATER = CountingGridsHeartBeater(
SERVICE_URL, BATCH_JOB, SERVICE_AUTHORIZER)
# Setup Working Directory.
DIRECTORY_SUFFIX = hashlib.sha3_256(
(targetId+str(datetime.datetime.now())).encode()).hexdigest()
DIRECTORY_DATA = targetId + "_" + DIRECTORY_SUFFIX
if not os.path.isdir(DIRECTORY_DATA):
os.mkdir(DIRECTORY_DATA)
'''
Algorithm:
1. Copy everything over from target container to job container.
2. Get sentiment analysis results for all documents.
3. Weight documents on grid.
4. Output colors file.
5. Write legend file.
'''
pTimer = PipelineTimer()
pTimer("Downloading Target Job Files.")
# Advance the heart beater to the Training state.
HEART_BEATER.next()
HEART_BEATER.next()
blob_client = azureblob.BlockBlobService(
account_name=_STORAGE_ACCOUNT_NAME,
account_key=_STORAGE_ACCOUNT_KEY)
all_blobs = blob_client.list_blobs(containerNameIn)
for blob in all_blobs:
download_file_from_container(
blob_client, containerNameIn, DIRECTORY_DATA + "/"+blob.name, blob.name)
HEART_BEATER.makeProgress(20)
pTimer("Getting sentiment from Azure.")
# Feature Mapping data
# Generate body to send to text analytics.
base_object = {
'documents': []
}
documents = []
def chunk(it, size):
it = iter(it)
return iter(lambda: tuple(islice(it, size)), ())
def dictionary_from_line(row):
prop_dict = {}
row_components = row.split('\t')
for row_component in row_components:
row_key_value = row_component.split(':')
prop_dict[row_key_value[0]] = row_key_value[1]
return prop_dict
with open(DIRECTORY_DATA+"/database.txt", 'r') as f:
for row in f:
prop_dict = dictionary_from_line(row)
documents.append({
'id': prop_dict['id'],
# Text analytics has a 5120 character limit.
'text': (prop_dict['title'] + " " + prop_dict['abstract'])[:5120]
})
# Call text analytics API
requestHeaders = {
'Content-Type': 'application/json',
'Ocp-Apim-Subscription-Key': _SENTIMENT_ANALYSIS_KEY
}
# To stay under the Text Analytics request size limit, send documents in chunks of 500.
feature_map = [None] * len(documents)
for doc_chunk in chunk(documents, 500):
base_object['documents'] = doc_chunk
response = requests.post(
SENTIMENT_ANALYSIS_ENDPOINT, json=base_object, headers=requestHeaders)
# Raise for non 200 response codes
response.raise_for_status()
json_response = response.json()
for element in json_response['documents']:
feature_map[int(element['id']) - 1] = float(element['score'])
HEART_BEATER.makeProgress(60)
# Legend data
cm = plt.get_cmap('coolwarm_r')
labelTuples = [("Negative Sentiment", "Positive Sentiment")]
colorTuples = [(tuple(list(cm(0))[:-1]),
tuple(list(cm(127))[:-1]), tuple(list(cm(255))[:-1]))]
pTimer("Generating counting grid artifacts.")
LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
DOCMAP_FILE_NAME = "/docmap.txt"
bcag = BrowseCloudArtifactGenerator(DIRECTORY_DATA)
bcag.W = [windowSize, windowSize]
bcag.read(LEARNED_GRID_FILE_NAME)
bcag.read_docmap(DOCMAP_FILE_NAME, engine="numpy")
bcag.write_colors(cm=cm, feature_map=feature_map)
bcag.write_legends(labelTuples=labelTuples, colorTuples=colorTuples)
bcag.add_feature_map_to_database(feature_map)
HEART_BEATER.makeProgress(80)
pTimer("Done.")
blob_client.create_container(containerNameOut)
# Upload each generated file to the output container.
FILES = [f.path for f in os.scandir(DIRECTORY_DATA) if not f.is_dir()]
for modelfile in FILES:
upload_file_to_container(blob_client, containerNameOut, modelfile)
HEART_BEATER.makeProgress(100)
except BaseException:
if HEART_BEATER is not None:
HEART_BEATER.done(success=False)
print("Script failed.")
print(traceback.format_exc())
else:
HEART_BEATER.done(success=True)
print("Script succeeded.")

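A quick illustrative check, not part of the commit, of the chunk() helper used above to stay under the Text Analytics request size limit; it yields fixed-size tuples, with a shorter final tuple for the remainder.

from itertools import islice

# Same implementation as chunk() in the script above.
def chunk(it, size):
    it = iter(it)
    return iter(lambda: tuple(islice(it, size)), ())

assert list(chunk(range(7), 3)) == [(0, 1, 2), (3, 4, 5), (6,)]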
View File

@@ -0,0 +1,12 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from enum import IntEnum
class JobStatus(IntEnum):
NotStarted = 0
PreProcessing = 1
Training = 2
Success = 3
Failure = 4

View File

@@ -0,0 +1,4 @@
azure-batch==5.1.1
azure-storage-blob==1.4.0
adal==1.2.0
azure-mgmt-datalake-analytics==0.2.0

292
CountingGridsPy/.gitignore vendored Normal file
View File

@@ -0,0 +1,292 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
src/keys.json
src/metadata.json
src/metadata_other.json
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
**/Properties/launchSettings.json
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to checkin your web deploy settings
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/packages/*
# except build/, which is used as an MSBuild target.
!**/packages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/packages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Typescript v1 declaration files
typings/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# CodeRush
.cr/
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs

View File

@@ -0,0 +1,10 @@
from .morphologicalTightener import MorphologicalTightener
from .slidingWindowTrainer import SlidingWindowTrainer
from .browseCloudArtifactGenerator import BrowseCloudArtifactGenerator
from .cgEngineWrapper import CGEngineWrapper
from .nlpCleaner import NLPCleaner
from .pipelineTimer import PipelineTimer
__all__ = ['MorphologicalTightener', 'BrowseCloudArtifactGenerator',
'CGEngineWrapper', 'NLPCleaner', 'PipelineTimer', 'SlidingWindowTrainer']

View File

@@ -0,0 +1,353 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import numpy as np
import scipy.io as io
import pandas as pd
import matplotlib.pyplot as plt
from CountingGridsPy.EngineToBrowseCloudPipeline import MorphologicalTightener
class BrowseCloudArtifactGenerator(object):
def __init__(self, DIRECTORY_DATA):
self.DIRECTORY_DATA = DIRECTORY_DATA
def read(self, LEARNED_GRID_FILE_NAME):
MAT = io.loadmat(self.DIRECTORY_DATA + LEARNED_GRID_FILE_NAME)
self.pi2_idf = MAT['pi2_idf']
self.counts_to_show = MAT['counts_to_show'] # COUNTS MATRIX
if type(self.counts_to_show) is not np.ndarray:
try:
self.counts_to_show = self.counts_to_show.toarray()
except Exception as e:
raise ValueError(e)
# MAPPING AFTER LAYERS CODE Q( Location | Document ) TxE
self.ql2 = MAT['ql2']
# ARRAY WITH THE LAYER NUMBER FOR EACH DOCUMENT, argmax over the layers
self.id_layer = MAT['id_layer'][0]
# LAYERED PI WEIGHTED BY IDF. E1xE2xZxLA
self.pi_la_idf = MAT['pi_la_idf']
try:
# We don't need it for now. Simply 1:T
self.indices_to_show = MAT['indices_to_show'][0]
except Exception as e:
# Not implemented in matlab version
self.indices_to_show = np.array(range(MAT['ql2'].shape[0])) + 1
del MAT
cgsz = np.zeros(2)
cgsz[0], cgsz[1], self.Z = self.pi2_idf.shape
self.cgsz = cgsz.astype(int)
self.indexR = np.arange(0, self.cgsz[0]).astype(int)
self.indexC = np.arange(0, self.cgsz[1]).astype(int)
def write_top_pi(self):
MAXZ = 80
self.pi2_idf = MorphologicalTightener.tighten_pi(self.pi2_idf)
pi_max = np.argsort(-self.pi2_idf, axis=2)[:, :, :MAXZ]
pi_max_vals = -np.sort(-self.pi2_idf, axis=2)[:, :, :MAXZ]
missing_words = set(range(self.Z)).difference(set(pi_max.flatten()))
locations_missing_words = np.zeros([len(missing_words), 4])
for m_id, m in enumerate(missing_words):
loc = np.unravel_index(
np.argmax(self.pi2_idf[:, :, m]), self.cgsz.astype('int'))
locations_missing_words[m_id, :] = [
int(m), self.pi2_idf[loc[0], loc[1], m], loc[0], loc[1]]
with open(self.DIRECTORY_DATA + '/top_pi.txt', 'w') as the_file:
for r in self.indexR:
for c in self.indexC:
tmp = "row:" + ("%1d" % (r+1)) + "\t" + "col:" + ("%1d" % (c+1)) + "\t" + "\t".join(
["%1d" % a + ":" + "%1.3f" % b for a, b in zip(pi_max[r, c, :], pi_max_vals[r, c, :])])
if any((locations_missing_words[:, 2] == r) & (locations_missing_words[:, 3] == c)):
tmp = tmp + "\t" + "\t".join([("%1d" % a) + ":" + ("%1.3f" % b) for a, b in locations_missing_words[(
locations_missing_words[:, 2] == r) & (locations_missing_words[:, 3] == c), :2]])
the_file.write(tmp + "\n")
def write_top_pi_layers(self):
no_layers = self.pi_la_idf.shape[3]
with open(self.DIRECTORY_DATA + '/top_pi_layers.txt', 'w') as the_file:
for layer in range(no_layers):
MAXZ = 80
self.pi_la_idf[:, :, :, layer] = MorphologicalTightener.tighten_pi(
self.pi_la_idf[:, :, :, layer])
pi_max = np.argsort(-self.pi_la_idf[:,
:, :, layer], axis=2)[:, :, :MAXZ]
pi_max_vals = - \
np.sort(-self.pi_la_idf[:, :, :,
layer], axis=2)[:, :, :MAXZ]
missing_words = set(range(self.Z)).difference(
set(pi_max.flatten()))
locations_missing_words = np.zeros([len(missing_words), 4])
for m_id, m in enumerate(missing_words):
loc = np.unravel_index(
np.argmax(self.pi_la_idf[:, :, m, layer]), self.cgsz.astype(int))
locations_missing_words[m_id, :] = [
int(m), self.pi_la_idf[loc[0], loc[1], m, layer], loc[0], loc[1]]
for r in self.indexR:
for c in self.indexC:
tmp = "layer:" + ("%1d" % (layer+1)) + "\t" + "row:" + ("%1d" % (r+1)) + "\t" + "col:" + ("%1d" % (c+1)) + "\t" + "\t".join(
["%1d" % a + ":" + "%1.3f" % b for a, b in zip(pi_max[r, c, :], pi_max_vals[r, c, :])])
if any((locations_missing_words[:, 2] == r) & (locations_missing_words[:, 3] == c)):
tmp = tmp + "\t" + "\t".join([("%1d" % a) + ":" + ("%1.3f" % b) for a, b in locations_missing_words[(
locations_missing_words[:, 2] == r) & (locations_missing_words[:, 3] == c), :2]])
the_file.write(tmp + "\n")
def write_database(self, df, keep):
dfSave = df.copy()
dfSave = dfSave[keep]
dfSave.reset_index(drop=True, inplace=True)
dfSave["id"] = np.arange(len(dfSave)) + 1
dfSave["layer"] = self.id_layer
def format_full_row(row):
row_property_strings = []
for column_name in set(dfSave.columns):
row_property_strings.append(
column_name + ":" + str(row[column_name]))
return str.join('\t', row_property_strings) + '\n'
databaselist = dfSave.apply(format_full_row, axis=1).tolist()
with open(self.DIRECTORY_DATA + '/database.txt', 'w', encoding="utf-8") as the_file:
the_file.writelines(databaselist)
def add_feature_map_to_database(self, feature_map):
file_text = ""
with open(self.DIRECTORY_DATA + '/database.txt', "r") as f:
for index, line in enumerate(f):
file_text += line.strip() + '\tfeature:' + \
str(feature_map[index]) + "\n"
with open(self.DIRECTORY_DATA + '/database.txt', "w") as f:
f.writelines(file_text)
def write_keep(self, keep):
with open(self.DIRECTORY_DATA + '/keep.txt', 'w') as the_file:
the_file.writelines("\n".join([str(int(x)) for x in keep]))
def read_docmap(self, fileName, engine="numpy"):
if not (engine == "numpy" or engine == "matlab"):
raise ValueError("The {} engine does not exist.".format(engine))
self.ql2 = np.zeros(self.ql2.shape)
with open(self.DIRECTORY_DATA + fileName) as f:
for line in f:
arr = line.split("\t")
e1Index = int(arr[0].replace("row:", ""))-1
e2Index = int(arr[1].replace("col:", ""))-1
# since there are layers, the max probability over layers was previously picked for each document
i = 2
while i < len(arr):
docId, qVal, layer = arr[i].split(":")
docId = int(docId)
qVal = float(qVal)
layer = int(layer)
t = docId - 1
if engine == "matlab":
self.ql2[e1Index, e2Index, t] = qVal
elif engine == "numpy":
self.ql2[t, e1Index, e2Index] = qVal
i += 1
def write_docmap(self, wd_size, engine="numpy"):
docToGridMapping = np.copy(self.ql2)
if engine == "matlab":
pass
elif engine == "numpy":
docToGridMapping = np.moveaxis(docToGridMapping, 0, -1)
else:
raise ValueError("The {} engine does not exist.".format(engine))
thr = 0.01
mask = np.zeros(self.cgsz)
mask[:wd_size, :wd_size] = 1
qlSmooth = np.real(np.fft.ifft2(np.fft.fft2(docToGridMapping, axes=(
0, 1)) * np.fft.fft2(np.expand_dims(mask, 2), axes=(0, 1)), axes=(0, 1)))
tmp = list()
with open(self.DIRECTORY_DATA + '/docmap.txt', 'w') as f:
for r in self.indexR:
for c in self.indexC:
ids = np.where(qlSmooth[r, c, :] > thr)[0]
vals = qlSmooth[r, c, ids]
lay = self.id_layer[ids]
tmp.append("row:" + ("%1d" % (r+1)) + "\tcol:" + ("%1d" % (c+1)) + "\t" + "\t".join(
[str(theid + 1)+":"+str(val)+":"+str(l) for theid, val, l in zip(ids, vals, lay)]) + "\n")
f.writelines(tmp)
def write_correspondences(self, correspondences, vocabulary):
'''
Correspondences maps the lemmatized words to the original text.
Example correspondences:
{'adopt': ',adopted,adopted,adopted,adopted', 'work': ',work,work,work,work', 'i': ',i,i,i,i,i,i,i,i
', 'wish': ',wish,wish,wish,wish'}
'''
li = list()
with open(self.DIRECTORY_DATA + '/correspondences.txt', 'w') as the_file:
for k, v in correspondences.items():
unique_values = list(set([w for w in v.split(",") if w != '']))
N = len(unique_values)
li = li + list(zip(unique_values, [k]*N))
tmp = list()
for w1, w2 in li:
try:
i = vocabulary.index(w2)
tmp.append(w1 + "\t" + w2 + "\t" + str(i+1) + "\n")
except Exception as e:
pass
the_file.writelines(tmp)
def write_cooccurences(self):
raise Exception(
"The coccurrences function should not be called because it's not guaranteed to be a correct artifact for BrowseCloud.")
def write_counts(self):
tmp = list()
with open(self.DIRECTORY_DATA + '/words.txt', 'w') as the_file:
for z in range(self.counts_to_show.shape[0]):
docIds = np.where(self.counts_to_show[z, :] != 0)[0]
vals = np.array(self.counts_to_show[z, docIds]).flatten()
tmp.append("id:"+str(z+1) + "\t" + "\t".join(
[str(i + 1) + ":" + "%1d" % v for i, v in zip(docIds, vals)]) + "\n")
the_file.writelines(tmp)
def write_vocabulary(self, vocabulary):
with open(self.DIRECTORY_DATA + '/vocabulary.txt', 'w') as the_file:
the_file.writelines(
[str(id + 1) + "\t" + str(word) + "\n" for id, word in enumerate(vocabulary)])
def write_legends(self, colorTuples=None, labelTuples=None):
if (colorTuples is not None and labelTuples is not None):
with open(self.DIRECTORY_DATA + '/legend.txt', 'w') as the_file:
for ct, lt in zip(colorTuples, labelTuples):
r1, g1, b1 = ct[0]
rm, gm, bm = ct[1]
r2, g2, b2 = ct[2]
l1, l2 = lt
data = [l1, r1, g1, b1, rm, gm, bm, l2, r2, g2, b2]
data = [str(x) for x in data]
the_file.write("\t".join(data)+"\n")
    # 0. Optionally re-normalize ql into a distribution over the grid indices -
    #    by default we do not, because the final map used for visualization would be distorted:
    #    docToGridMapping / np.sum(np.sum(docToGridMapping, axis=0), axis=0)
    # 1. Calculate the weighted average between ql and the feature mapping for each index - the weights are ql.
    # 2. Do 0-1 normalization and multiply by 255 to map the range [0, 1] to the range [0, 255].
    # 3. Use this new range to map onto the RGB color scale.
def mapSentiment(self, docToGridMapping, feature_map, W=[5, 5], doNormalizeQOverGrid=True, stretch_the_truth=False):
normalizedDocToGridMapping = None
if doNormalizeQOverGrid:
normalizedDocToGridMapping = docToGridMapping / \
np.sum(docToGridMapping, axis=(0, 1))
else:
normalizedDocToGridMapping = np.copy(docToGridMapping)
e0, e1, T = docToGridMapping.shape
# toroidal- top, left, and top left
Q = np.pad(normalizedDocToGridMapping, [
(W[0]-1, 0), (W[1]-1, 0), (0, 0)], 'wrap').cumsum(axis=0).cumsum(axis=1)
# sum area table trick
normalizedDocToGridMapping = Q[(
W[0]-1):, (W[1]-1):, :] - Q[(W[0]-1):, :e1, :] - Q[:e0, (W[1]-1):, :] + Q[:e0, :e1, :]
normalizedDocToGridMapping = np.moveaxis(np.moveaxis(
normalizedDocToGridMapping, -1, 0) / np.sum(normalizedDocToGridMapping, axis=-1), 0, -1)
sentimentMapping = np.dot(normalizedDocToGridMapping, feature_map)
weights = None
if stretch_the_truth:
            weights = 255*(sentimentMapping - np.min(sentimentMapping.flatten())) / (np.max(
                sentimentMapping.flatten()) - np.min(sentimentMapping.flatten()))  # weights between 0 and 255
else:
weights = 255*(sentimentMapping)
return (sentimentMapping, weights)
def write_colors(self, colors=None, feature_map=None, engine="numpy", cm=None, stretch_the_truth=False):
def valid_color_comp(c):
            return 0.0 < c < 1
if colors is not None:
for color in colors:
if len(color) != 3 or not valid_color_comp(color[0]) or not valid_color_comp(color[1]) or not valid_color_comp(color[2]):
                    raise Exception(
                        "Invalid RGB color for BrowseCloud input. Each component must be strictly between 0 and 1, and exactly 3 components must be given.")
elif feature_map is not None:
colors = [0 for d in range(len(self.indexR)*len(self.indexC))]
docToGridMapping = np.copy(self.ql2)
if engine == "matlab":
pass
elif engine == "numpy":
# move the first axis to the third
docToGridMapping = np.moveaxis(docToGridMapping, 0, -1)
else:
raise ValueError(
"The {} engine does not exist.".format(engine))
W = None
if self.W is None:
W = [5, 5]
else:
W = self.W.copy()
sentimentMapping, weights = self.mapSentiment(
docToGridMapping, feature_map, W, stretch_the_truth=stretch_the_truth)
if cm is None:
cm = plt.get_cmap('PuRd')
colors = [(c[0], c[1], c[2])
for c in cm([int(np.round(w)) for w in weights.flatten()])]
else:
colors = [(1.0, 1.0, 1.0)
for d in range(len(self.indexR)*len(self.indexC))]
with open(self.DIRECTORY_DATA + '/colors_browser.txt', 'w') as the_file:
for r in self.indexR:
for c in self.indexC:
i = len(self.indexR)*r + c
tmp = ("%1d" % (r+1)) + "\t" + ("%1d" % (c+1)) + "\t" + str(
(colors[i][0])) + "\t"+str((colors[i][1])) + "\t" + str((colors[i][2]))
the_file.write(tmp + "\n")
return colors
if __name__ == "__main__":
bcag = BrowseCloudArtifactGenerator("")
    # 3 documents with sentiments [0, .5, 1], from negative to positive
    # 3x3 grid
doc1Q = [
[0.25, 0.25, 0],
[0.25, 0.25, 0],
[0, 0, 0]
]
doc2Q = [
[0, 0.25, 0.25],
[0, 0.25, 0.25],
[0, 0, 0]
]
doc3Q = [
[0, 0, 0],
[0, 0.25, 0.25],
[0, 0.25, 0.25]
]
q = np.array([doc1Q, doc2Q, doc3Q])
feature_map = np.array([0, 0.5, 1])
W = [1, 1]
qMatlab = np.moveaxis(q, 0, -1)
Q = np.pad(q, [(0, 0), (W[1]-1, 0), (W[0]-1, 0)],
'wrap').cumsum(1).cumsum(2)
normalizedDocToGridMapping = Q[:, (W[0]-1):, (W[1]-1):] - \
Q[:, (W[0]-1):, :3] - Q[:, :3, (W[1]-1):] + Q[:, :3, :3]
print(normalizedDocToGridMapping)
result = bcag.mapSentiment(qMatlab, feature_map)[0]
print('DONE')
print(result)
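    # --- Illustrative sketch (an assumption, not part of the original pipeline code):
    # mapSentiment aggregates the per-document grid posterior over a wrapped window using
    # a summed-area table (integral image) built from cumulative sums. The self-contained
    # check below demonstrates the general technique on random data, comparing the
    # cumulative-sum window sum against a naive quadruple loop; names here are illustrative only.
    w0, w1 = 3, 3
    qDemo = np.random.rand(6, 6, 2)
    e0, e1 = qDemo.shape[:2]
    padded = np.pad(qDemo, [(0, w0 - 1), (0, w1 - 1), (0, 0)], 'wrap')
    S = np.pad(padded.cumsum(axis=0).cumsum(axis=1),
               [(1, 0), (1, 0), (0, 0)], 'constant')
    fast = S[w0:, w1:, :] - S[:-w0, w1:, :] - S[w0:, :-w1, :] + S[:-w0, :-w1, :]
    naive = np.zeros_like(qDemo)
    for r in range(e0):
        for c in range(e1):
            for dr in range(w0):
                for dc in range(w1):
                    naive[r, c, :] += qDemo[(r + dr) % e0, (c + dc) % e1, :]
    assert np.allclose(fast, naive)
    print("Toroidal summed-area-table window sums match the naive loop.")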


@ -0,0 +1,135 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from CountingGridsPy.models import CountingGridModel
from CountingGridsPy.EngineToBrowseCloudPipeline import SlidingWindowTrainer
from scipy import io
import pandas as pd
import numpy as np
import scipy as sp
import os
from sklearn.feature_extraction.text import CountVectorizer
class CGEngineWrapper(object):
def __init__(self, extent_size=32, window_size=5, layers=2, heartBeaters=None):
self.cg_size = extent_size
self.wd_size = window_size
self.no_layers = layers
self.heartBeaters = heartBeaters
def ready_the_matlab_engine(self, X, labels_filtered, DIRECTORY_DATA):
m, n = X.shape
s = X.data
i = X.tocoo().row
j = X.indices
io.savemat(DIRECTORY_DATA + '/counts.mat',
{'m': m, 'n': n, 's': s, 'i': i, 'j': j})
if labels_filtered is not None:
io.savemat(DIRECTORY_DATA + '/labels.mat',
{'labels': labels_filtered})
def __fitCountVectorizor(self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, labels, MIN_FREQUENCY):
df = pd.read_table(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME)
df = df[df.columns[1:]]
TEXT = df['pos_filtered'].tolist()
id_grid = np.array(
[i for i, t in enumerate(TEXT) if len(str(t).split(" ")) > 3]
)
addl_keep = np.zeros(len(TEXT))
addl_keep[id_grid] += 1
addl_keep = addl_keep.astype(bool)
        df = df.iloc[id_grid].reset_index(drop=True)
self.labelsS = np.array(
labels)[id_grid] if labels is not None else None
vect = CountVectorizer(decode_error="ignore", min_df=MIN_FREQUENCY)
X = vect.fit_transform(df['pos_filtered'].tolist())
return (vect, X, addl_keep)
def get_vocab(self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, labels, MIN_FREQUENCY, keep):
vect, _, addl_keep = self.__fitCountVectorizor(
DIRECTORY_DATA,
CLEAN_DATA_FILE_NAME,
labels,
MIN_FREQUENCY
)
return (vect.get_feature_names(), np.array(keep) & np.array(addl_keep))
def incremental_fit(
self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, labels,
MIN_FREQUENCY, keep, engine, initial_max_iter=100,
w=2000, s=1000, runInitialTrain=True
):
vect, X, addl_keep = self.__fitCountVectorizor(
DIRECTORY_DATA,
CLEAN_DATA_FILE_NAME,
labels,
MIN_FREQUENCY
)
if engine != "numpy":
raise ValueError("Not implemented!")
extent = np.array([self.cg_size, self.cg_size])
window = np.array([self.wd_size, self.wd_size])
model = CountingGridModel(extent, window)
T = X.shape[0]
training_max_iter_vec = [1 for x in range(int(np.ceil((T-w)/s)) + 1)]
train = model.fit
kwargs = {
"data": X.toarray(),
"max_iter": initial_max_iter,
"returnSumSquareDifferencesOfPi": False,
"layers": self.no_layers,
"noise": .00001,
"output_directory": DIRECTORY_DATA
}
print("Iteration vectors:")
print("Running for this number of iteration:" +
str(len([initial_max_iter] + training_max_iter_vec)))
print([initial_max_iter] + training_max_iter_vec)
SlidingWindowTrainer.SlidingTrainer(
w,
s,
training_max_iter_vec,
train,
kwargs,
runInitialTrain=runInitialTrain
)
return (vect.get_feature_names(), np.array(keep) & np.array(addl_keep))
def fit(self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, labels, MIN_FREQUENCY, keep, engine):
vect, X, addl_keep = self.__fitCountVectorizor(
DIRECTORY_DATA,
CLEAN_DATA_FILE_NAME,
labels,
MIN_FREQUENCY
)
if engine == "matlab":
self.ready_the_matlab_engine(X, labels, DIRECTORY_DATA)
os.system(r"cg_exe_buja.exe {0} counts.mat ".format(
DIRECTORY_DATA) + str(self.cg_size) + " " + str(self.wd_size) + " " + str(self.no_layers))
elif engine == "numpy":
extent = np.array([self.cg_size, self.cg_size])
window = np.array([self.wd_size, self.wd_size])
model = CountingGridModel(extent, window)
model.fit(
X.toarray(),
max_iter=100,
returnSumSquareDifferencesOfPi=False,
layers=self.no_layers,
noise=.00000001,
output_directory=DIRECTORY_DATA,
heartBeaters=self.heartBeaters
)
elif engine == "torch":
raise ValueError("Not implemented yet.")
else:
raise ValueError("The {} engine does not exist.".format(engine))
return (vect.get_feature_names(), np.array(keep) & np.array(addl_keep))
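# --- Illustrative sketch (an assumption, not part of the original file): the
# (m, n, s, i, j) fields saved by ready_the_matlab_engine are a COO-style triplet
# view of the sparse counts matrix. The round-trip below extracts the triplets the
# same way and checks that they rebuild the original matrix; names are illustrative.
if __name__ == "__main__":
    from scipy.sparse import csr_matrix, coo_matrix
    X_demo = csr_matrix(np.array([[0, 2, 0],
                                  [1, 0, 3]]))
    s_demo, i_demo, j_demo = X_demo.data, X_demo.tocoo().row, X_demo.indices
    rebuilt = coo_matrix((s_demo, (i_demo, j_demo)), shape=X_demo.shape).toarray()
    assert (rebuilt == X_demo.toarray()).all()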


@ -0,0 +1,102 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, CGEngineWrapper, NLPCleaner, PipelineTimer
import sys
import numpy as np
import os
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from CountingGridsPy.models import CountingGridModel
pTimer = PipelineTimer()
# Example CLI: python dumpCountingGrids.py CountingGridInput_MarchSurvey 24 5 matlabEngine metadataInput channelDump.csv
errStr = '''
Please give valid command-line arguments.
Instructions found here: https://github.com/microsoft/browsecloud/wiki/Data-Pipeline-Documentation.
'''
# ---------------------------------------------------------------------------------------
# Input
# ---------------------------------------------------------------------------------------
if len(sys.argv) != 7:
    print(sys.argv)
raise ValueError(errStr)
DIRECTORY_DATA = sys.argv[1]
EXTENT_SIZE = int(sys.argv[2])
WINDOW_SIZE = int(sys.argv[3])
engine_type = sys.argv[4]
inputfile_type = sys.argv[5]
inputfile_name = sys.argv[6]
if engine_type != "numpyEngine" and engine_type != "matlabEngine":
raise ValueError(
"The {0} engine does not exist. Please use 'matlabEngine' or 'numpyEngine'.".format(engine_type))
engine_type = engine_type[:-6] # 6 characters in the word "Engine"
if inputfile_type != "metadataInput" and inputfile_type != "simpleInput":
raise ValueError(
"The {0} input type does not exist. Please use 'simpleInput' or 'metadataInput'.".format(inputfile_type))
inputfile_type = inputfile_type[:-5] # remove "Input"
if not os.path.isdir(DIRECTORY_DATA):
raise ValueError(
"Undefined local directory where digital channel is dumped!\n" + errStr)
FILE_NAME = DIRECTORY_DATA + "\\" + inputfile_name
CLEAN_DATA_FILE_NAME, MIN_FREQUENCY, MIN_WORDS = ["\\cg-processed.tsv", 2, 5]
# ---------------------------------------------------------------------------------------
# Data Cleaning
# ---------------------------------------------------------------------------------------
cleaner = NLPCleaner()
correspondences = None
CACHED_CORRESPONDENCES_FILE_NAME = "\\cached_correspondences.tsv"
pTimer("Reading data file.")
df, keep = cleaner.read(FILE_NAME, inputfile_type, MIN_FREQUENCY, MIN_WORDS)
if not (os.path.exists(DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME) and os.path.exists(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME)):
pTimer("Starting data cleaning.")
cleaner.handle_negation_tokens()
cleaner.removePunctuation()
correspondences = cleaner.lemmatize()
cleaner.write_cached_correspondences(
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
cleaner.write(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME)
else:
pTimer("Skipping data cleaning.")
correspondences = cleaner.read_cached_correspondences(
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
pTimer("Learning counting grid.")
LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
engine = CGEngineWrapper(extent_size=EXTENT_SIZE, window_size=WINDOW_SIZE)
vocabulary = None
# ---------------------------------------------------------------------------------------
# Learning
# ---------------------------------------------------------------------------------------
if not os.path.exists(DIRECTORY_DATA + LEARNED_GRID_FILE_NAME):
vocabulary, keep = engine.fit(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME,
cleaner.labelsS, MIN_FREQUENCY, keep, engine=engine_type)
else:
vocabulary, keep = engine.get_vocab(
DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS, MIN_FREQUENCY, keep)
# ---------------------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------------------
pTimer("Generating counting grid artifacts.")
LINK_FILE_NAME = ""
bcag = BrowseCloudArtifactGenerator(DIRECTORY_DATA)
bcag.read(LEARNED_GRID_FILE_NAME)
bcag.write_docmap(engine.wd_size, engine=engine_type)
bcag.write_counts()
bcag.write_vocabulary(vocabulary)
bcag.write_top_pi()
bcag.write_top_pi_layers()
bcag.write_colors() # write default blue colors
bcag.write_database(df, keep)
bcag.write_correspondences(correspondences, vocabulary)
bcag.write_keep(keep)
pTimer("Done.")


@ -0,0 +1,111 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, CGEngineWrapper, NLPCleaner, PipelineTimer
import sys
import numpy as np
import os
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from CountingGridsPy.models import CountingGridModel
pTimer = PipelineTimer()
# Example CLI: python dumpSlidingWindowCountingGrids.py CalculatorDataExperiment6 24 5 numpyEngine simpleTimeInput CalculatorUIFData.txt
errStr = '''
Please give valid command-line arguments.
Instructions found here: https://github.com/microsoft/browsecloud/wiki/Data-Pipeline-Documentation.
'''
# ---------------------------------------------------------------------------------------
# Input
# ---------------------------------------------------------------------------------------
if len(sys.argv) != 7:
    print(sys.argv)
raise ValueError(errStr)
DIRECTORY_DATA = sys.argv[1]
EXTENT_SIZE = int(sys.argv[2])
WINDOW_SIZE = int(sys.argv[3])
engine_type = sys.argv[4]
inputfile_type = sys.argv[5]
inputfile_name = sys.argv[6]
if engine_type != "numpyEngine":
raise ValueError("The {0} engine does not exist.".format(engine_type))
engine_type = engine_type[:-6] # 6 characters in the word "Engine"
if inputfile_type != "simpleTimeInput":
raise ValueError(
"The {0} input type does not exist.".format(inputfile_type))
inputfile_type = inputfile_type[:-5] # remove "Input"
if not os.path.isdir(DIRECTORY_DATA):
raise ValueError(
"Undefined local directory where digital channel is dumped!\n" + errStr)
FILE_NAME = DIRECTORY_DATA + "\\" + inputfile_name
CLEAN_DATA_FILE_NAME, MIN_FREQUENCY, MIN_WORDS = ["\\cg-processed.tsv", 2, 5]
# ---------------------------------------------------------------------------------------
# Data Cleaning
# ---------------------------------------------------------------------------------------
cleaner = NLPCleaner()
correspondences = None
CACHED_CORRESPONDENCES_FILE_NAME = "\\cached_correspondences.tsv"
pTimer("Reading data file.")
df, keep = cleaner.read(FILE_NAME, inputfile_type, MIN_FREQUENCY, MIN_WORDS)
if not (os.path.exists(DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME) and os.path.exists(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME)):
pTimer("Starting data cleaning.")
cleaner.handle_negation_tokens()
cleaner.removePunctuation()
correspondences = cleaner.lemmatize()
cleaner.write_cached_correspondences(
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
cleaner.write(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME)
else:
pTimer("Skipping data cleaning.")
correspondences = cleaner.read_cached_correspondences(
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
pTimer("Learning counting grid.")
LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
engine = CGEngineWrapper(extent_size=EXTENT_SIZE, window_size=WINDOW_SIZE)
vocabulary = None
dataFolders = []  # initialized up front so the finally block never references an unbound name
try:
# ---------------------------------------------------------------------------------------
# Learning
# ---------------------------------------------------------------------------------------
if not os.path.exists(DIRECTORY_DATA + LEARNED_GRID_FILE_NAME):
vocabulary, keep = engine.incremental_fit(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS,
MIN_FREQUENCY, keep, engine=engine_type, initial_max_iter=100, w=2000, s=1, runInitialTrain=True)
else:
vocabulary, keep = engine.get_vocab(
DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS, MIN_FREQUENCY, keep)
pTimer("Generating counting grid artifacts.")
dataFolders = [DIRECTORY_DATA] + \
[f.path for f in os.scandir(DIRECTORY_DATA) if f.is_dir()]
    # we know a priori it should be [DIRECTORY_DATA, DIRECTORY_DATA + "/iter0", DIRECTORY_DATA + "/iter1", ...]
dataFolders.sort()
except Exception as e:
print(e)
finally:
# ---------------------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------------------
for i, folder in enumerate(dataFolders):
LINK_FILE_NAME = ""
bcag = BrowseCloudArtifactGenerator(folder)
if os.path.exists(folder + LEARNED_GRID_FILE_NAME):
bcag.read(LEARNED_GRID_FILE_NAME)
bcag.write_docmap(engine.wd_size, engine=engine_type)
bcag.write_counts()
bcag.write_vocabulary(vocabulary)
bcag.write_top_pi()
bcag.write_top_pi_layers()
bcag.write_colors() # write default blue colors
bcag.write_correspondences(correspondences, vocabulary)
if i == 0:
bcag.write_database(df, keep)
bcag.write_keep(keep)
pTimer("Done.")


@ -0,0 +1,102 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from skimage.measure import label, regionprops
import numpy as np
class MorphologicalTightener():
@staticmethod
def tighten_pi(pi):
'''
Algorithm:
1. Go through each word in the vocabulary stored in pi params.
2. Find all connected components within the
the 2-d plane for the given feature, which is like an image.
3. Within each component find the index with highest probability.
4. Set the probability of all other elements
in the component set to be 0.
'''
pi = np.copy(pi)
Z = pi.shape[2]
for z in range(Z):
tmp_pi = pi[:, :, z]
            labelled_components = label(
                tmp_pi > 1e-3,
                background=None,
                return_num=False,
                connectivity=1
            )
for region in regionprops(labelled_components):
x_coords = region.coords[:, 0]
y_coords = region.coords[:, 1]
vals = tmp_pi[x_coords, y_coords]
i_max = np.argmax(vals)
                max_r, max_c = region.coords[i_max]
                maxval = tmp_pi[max_r, max_c]
                tmp_pi[x_coords, y_coords] = 0
                tmp_pi[max_r, max_c] = maxval
pi[:, :, z] = tmp_pi
return pi
@staticmethod
def tighten_q():
pass
@staticmethod
def print_results(input, output):
assert(np.all(input.shape == output.shape))
for z in range(input.shape[2]):
print("Feature index " + str(z) + ":")
print("Input:")
print(input[:, :, z])
print("Output:")
print(output[:, :, z])
print("")
if __name__ == "__main__":
# test1
INPUT = np.array(
[
[
[0.5, 0.5, 0, 0],
[0.7, 0, 0, 0.5],
[0.5, 0, 0, 0.6],
[0.5, 0, 0.5, 0]
],
[
[0.004, 0.5, 0, 0],
[0.5, 0, 0, 0],
[100, 0, 0, 0],
[0, 0, 0, 0.5]
]
]).astype(float).transpose((1, 2, 0))
OUTPUT = np.array(
[
[
[0, 0, 0, 0],
[0.7, 0, 0, 0],
[0, 0, 0, 0.6],
[0, 0, 0.5, 0]
],
[
[0, 0, 0, 0],
[0, 0, 0, 0],
[100, 0, 0, 0],
[0, 0, 0, 0.5]
]
]
).astype(float).transpose((1, 2, 0))
assert(np.all(MorphologicalTightener.tighten_pi(INPUT) == OUTPUT))
MorphologicalTightener.print_results(
INPUT,
MorphologicalTightener.tighten_pi(INPUT)
)
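    # --- Illustrative sketch (an assumption, not part of the original file): how
    # skimage's label/regionprops expose the connected components that tighten_pi
    # walks over. Two separate blobs yield two regions, each listing its pixel coords.
    demo = np.array([
        [1, 1, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 1]
    ])
    for region in regionprops(label(demo > 0, connectivity=1)):
        print("component coordinates:", region.coords.tolist())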


@ -0,0 +1,207 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import re
import functools
import string
from collections import defaultdict
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tag.stanford import StanfordPOSTagger
import pandas as pd
import os
import numpy as np
from CountingGridsPy.EngineToBrowseCloudPipeline.surveyData import SurveyData
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import stop_words
def tokenizer(x, wordnet_lemmatizer, corrispondences, tagger):
try:
def get_wordnet_pos(treebank_tag):
if treebank_tag.startswith('J'):
return wordnet.ADJ
elif treebank_tag.startswith('V'):
return wordnet.VERB
elif treebank_tag.startswith('N') or treebank_tag.startswith('FW'):
return wordnet.NOUN
elif treebank_tag.startswith('R'):
return wordnet.ADV
else:
return wordnet.NOUN
# tagged = tagger.tag(x.split()) #e.g. [('what', 'WP')]
tagged = [(d, 'VB') if d not in stop_words.ENGLISH_STOP_WORDS else (
d, '') for d in x.split()] # stopword removal here
tagged = [(wordnet_lemmatizer.lemmatize(re.sub("[^\w\d]", "", t[0]), get_wordnet_pos(t[1])), t[1], t[0])
for t in tagged] # the regular expression removes characters that aren't digits or alphanumeric characters
# tagged - (lemmatized word, part of speech, original word) e.g. [('what', 'WP', 'what')
for t in tagged:
corrispondences[t[0]] = corrispondences.get(t[0], "") + "," + t[2]
return [t[0] for t in tagged], [t[1] for t in tagged]
except Exception as e:
print(e)
return [], []
class NLPCleaner(object):
def __init__(self):
self.corrispondences = dict()
self.textS = None
self.labelsS = None
def initial_clean_after_read(self, df, MIN_WORDS):
self.text = [(
str(aTuple[0]) + " " + str(aTuple[1]) + " " + str(aTuple[1]) + " " + str(aTuple[1])
) if not pd.isnull(aTuple[0]) and not pd.isnull(aTuple[1]) else (
str(aTuple[1]) if not pd.isnull(aTuple[1]) else (
str(aTuple[0]) if not pd.isnull(aTuple[0]) else ""
)
) for aTuple in zip(df.title.tolist(), df.abstract.tolist())]
text_extremely_cleaned = [a for a in map(lambda x: re.sub("\\b[A-Za-z]{1,3}\\b", " ", re.sub(
" +", " ", re.sub("[^A-Za-z\\s]", " ", re.sub("\[[^\]]*\]", " ", x)))), self.text)]
keep = np.array([True if len(doc.split()) >=
MIN_WORDS else False for doc in text_extremely_cleaned])
self.textS = [re.sub(" +", " ", re.sub("\|", " ", re.sub("\[[^\]]*\]",
" ", t.lower()))) if k else "" for k, t in zip(keep, self.text)]
return keep
def read(self, FILE_NAME, inputfile_type, MIN_FREQUENCY=2, MIN_WORDS=5):
df = None
if inputfile_type == "metadata":
df = pd.read_csv(FILE_NAME, keep_default_na=False)
if 'title' not in df:
df['title'] = ''
if 'abstract' not in df:
df['abstract'] = ''
elif inputfile_type == "simple":
df = pd.read_csv(FILE_NAME, sep="\t", header=None,
keep_default_na=False)
df.columns = ["title", "abstract", "link"]
elif inputfile_type == "simpleTime":
df = pd.read_csv(FILE_NAME, sep="\t", header=None,
keep_default_na=False)
df.columns = ["title", "abstract", "link", "time"]
else:
raise ValueError(
"Input_type " + str(inputfile_type) + " is not valid.")
# Validation now happens in BrowseCloud.Service. Assuming correct input, so no validation needed.
keep = self.initial_clean_after_read(df, MIN_WORDS)
'''
le = LabelEncoder()
labels = le.fit_transform(df.Corpus.tolist())
self.labelsS = [t for k,t in zip(keep,labels) if k] #not used
'''
return (df, np.copy(keep))
def handle_negation_tokens(self):
negation_handling_functions = [lambda x: re.sub(" +", " ", x),
lambda x: re.sub(
"(^|\W|\.)can[\W']*[o]*(t|not)(\W|$|\.)", " can not ", x),
lambda x: re.sub(
"(^|\W|\.)could[n]*[\W']*[no]*(t|not)(\W|$|\.)", " could not ", x),
lambda x: re.sub(
"(^|\W|\.)should[n]*[\W']*[no]*(t|not)(\W|$|\.)", " should not ", x),
lambda x: re.sub(
"(^|\W|\.)would[n]*[\W']*[no]*(t|not)(\W|$|\.)", " would not ", x),
lambda x: re.sub(
"(^|\W|\.)won[\W']*(t)(\W|$|\.)", " wont ", x),
lambda x: re.sub(
"(^|\W|\.)do[n]*[\W']*[no]*t(\W|$|\.)", " do not ", x),
lambda x: re.sub(
"(^|\W|\.)does[n]*[\W']*[no]*t(\W|$|\.)", " does not ", x),
lambda x: re.sub(
"(^|\W|\.)did[n]*[\W']*[no]*t(\W|$|\.)", " did not ", x),
lambda x: re.sub(
"(^|\W|\.)is[n]*[\W']*[no]*t(\W|$|\.)", " is not ", x),
lambda x: re.sub(
"(^|\W|\.)are[n]*[\W']*[no]*t(\W|$|\.)", " are not ", x),
lambda x: re.sub(
"(^|\W|\.)was[n]*[\W']*[no]*t(\W|$|\.)", " was not ", x),
lambda x: re.sub(
"(^|\W|\.)were[n]*[\W']*[no]*t(\W|$|\.)", " were not ", x),
lambda x: re.sub(
"(^|\W|\.)ain[\W']*[no]*t(\W|$|\.)", " aint ", x),
lambda x: re.sub(
"(^|\W|\.)had[n]*[\W']*[no]*(t|not)(\W|$|\.)", " had not ", x),
lambda x: re.sub(
"(^|\W|\.)has[n]*[\W']*[no]*(t|not)(\W|$|\.)", " has not ", x),
lambda x: re.sub(
"(^|\W|\.)have[n]*[\W']*[no]*(t|not)(\W|$|\.)", " have not ", x),
lambda x: x.lower()]
def compose(*functions):
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
normalize_negations = compose(*negation_handling_functions)
self.textS = [a for a in map(
lambda x: normalize_negations(x), self.textS)]
def removePunctuation(self):
regexPunkt = re.compile('[%s]' % re.escape(string.punctuation))
self.textSb = [a for a in map(lambda x: re.sub(
" +", " ", regexPunkt.sub(' ', x)), self.textS)]
def lemmatize(self):
path_to_model = "./stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger"
path_to_jar = "./stanford-postagger-full-2017-06-09/stanford-postagger.jar"
        # Keep the constructor call here as a comment, just in case. We removed the Stanford tagger call from the tokenizer's part-of-speech tagging.
        # StanfordPOSTagger(path_to_model, path_to_jar); tagger.java_options='-mx10G'
tagger = None
wordnet_lemmatizer = WordNetLemmatizer()
self.cleaned_featurized = [a for a in map(lambda x: tokenizer(
x, wordnet_lemmatizer, self.corrispondences, tagger) if x != "" else ([], []), self.textSb)]
return self.corrispondences
def write(self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, start=None, finish=None):
assert(len(self.text) == len(self.cleaned_featurized))
if start is None or finish is None:
start = 0
finish = len(self.text)
df = pd.DataFrame(
columns=['original', 'cleaned', 'pos', 'pos_filtered'])
df['original'] = [w for w in map(
lambda x: x.replace("\t", " "), self.text[start:finish])]
try: # if correspondences runs
df['cleaned'] = [" ".join(d[0]).replace("\t", " ")
for d in self.cleaned_featurized[start:finish]]
df['pos'] = [" ".join(d[1]).replace("\t", " ")
for d in self.cleaned_featurized[start:finish]]
df['pos_filtered'] = [" ".join([w for (w, t) in zip(d[0], d[1]) if t in (
['JJR', 'JJS', 'JJ', 'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WRB', 'FW']
)]) for d in self.cleaned_featurized[start:finish]]
except Exception as e:
df['cleaned'] = ["" for w in self.text[start:finish]]
df['pos'] = ["" for w in self.text[start:finish]]
df['pos_filtered'] = ["" for w in self.text[start:finish]]
if not os.path.isdir(DIRECTORY_DATA):
os.mkdir(DIRECTORY_DATA)
df.to_csv(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME,
sep="\t", encoding="utf-8")
def write_cached_correspondences(self, DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME):
df = pd.DataFrame(columns=['lemma', 'words'])
df['lemma'] = [lemma for lemma in self.corrispondences]
df['words'] = [self.corrispondences[lemma]
for lemma in self.corrispondences]
df.to_csv(DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME,
sep="\t", encoding="utf-8")
def read_cached_correspondences(self, DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME):
df = pd.read_csv(
DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME, sep="\t", encoding="utf-8")
correspondences = dict()
for key, val in zip(df['lemma'], df['words']):
correspondences[key] = val
return correspondences
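# --- Illustrative sketch (an assumption, not part of the original file): the compose()
# helper inside handle_negation_tokens applies the *last* function in the list first,
# which is why the lowercasing lambda sits at the end of negation_handling_functions.
# A tiny reproduction of that right-to-left ordering:
if __name__ == "__main__":
    def compose(*functions):
        return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
    exclaim_after_upper = compose(lambda x: x + "!", str.upper)
    print(exclaim_after_upper("can not"))  # str.upper runs first, then "!" is appended: "CAN NOT!"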


@ -0,0 +1,46 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import datetime
import time
class PipelineTimer():
def __init__(self):
self.last_s = time.time()
self.init_time = self.last_s
def __call__(self, description):
curr_s = time.time()
self.printSectionDescription(description)
self.printCurrentTime()
self.printPrettyTime(curr_s - self.last_s)
self.last_s = curr_s
if description == "Done.":
self.prettyPrintTotal(curr_s - self.init_time)
def printSectionDescription(self, description):
print(description)
def prettyPrintHelper(self, s: float):
hours = s // 3600
s -= (hours * 3600)
minutes = s//60
s -= (minutes * 60)
seconds = s
return [hours, minutes, seconds]
def printPrettyTime(self, s: float):
hours, minutes, seconds = self.prettyPrintHelper(s)
string = 'Previous section took {}h {}m {}s.'.format(
int(hours), int(minutes), float(int(seconds*100))/100)
print(string)
def printCurrentTime(self):
print(datetime.datetime.now())
def prettyPrintTotal(self, s: float):
hours, minutes, seconds = self.prettyPrintHelper(s)
string = 'Entire program took {}h {}m {}s.'.format(
int(hours), int(minutes), float(int(seconds*100))/100)
print(string)
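# --- Illustrative usage sketch (an assumption, not part of the original file): calling
# the timer between pipeline stages prints the stage description, the current wall-clock
# time, and the time elapsed since the previous call; the literal description "Done."
# additionally prints the total running time.
if __name__ == "__main__":
    timer = PipelineTimer()
    time.sleep(0.2)
    timer("Finished the first stage.")
    timer("Done.")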


@ -0,0 +1,44 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import numpy as np
import os
class SlidingWindowTrainer():
@staticmethod
def SlidingTrainer(w: int, s: int, training_max_iter_vec: list, train, kwargs: dict, runInitialTrain=True):
        '''
        Assumes: kwargs["data"] is sorted by date.
        w is the sliding-window width (documents per training pass) and s is the stride between passes.
        '''
print("Learning Initial Grid.")
pi = None
if runInitialTrain:
pi = train(**kwargs)
assert("data" in kwargs and "max_iter" in kwargs and "output_directory" in kwargs)
data = kwargs['data']
root_dir = kwargs["output_directory"].replace(
".", "").replace("/", "").replace("\\", "")
T = len(data)
assert(w < T)
max_sliding_window_size = int(np.ceil((T-w)/s) + 1)
assert(len(training_max_iter_vec) == max_sliding_window_size)
first_index = 0
last_index = min(w, T)
for i in range(max_sliding_window_size):
kwargs['max_iter'] = training_max_iter_vec[i]
kwargs['data'] = data[first_index:last_index]
kwargs['output_directory'] = "./" + root_dir + "/iter" + str(i)
kwargs['pi'] = pi
if not (os.path.exists(kwargs['output_directory']) and os.path.isdir(kwargs['output_directory'])):
os.mkdir(kwargs['output_directory'])
print("Learning grid window from first index: " +
str(first_index) + " to second index: " + str(last_index))
pi = train(**kwargs)
last_index = min(last_index + s, T)
first_index = min(first_index + s, T)
assert(last_index >= T)
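# --- Illustrative sketch (an assumption, not part of the original file): the window
# bookkeeping used above. With T documents, window width w and stride s there are
# ceil((T - w) / s) + 1 training passes; the toy numbers below print the
# [first_index, last_index) slice of documents each pass would train on.
if __name__ == "__main__":
    T, w, s = 10, 4, 3
    first_index, last_index = 0, min(w, T)
    for i in range(int(np.ceil((T - w) / s)) + 1):
        print("pass", i, "covers documents", first_index, "to", last_index)
        last_index = min(last_index + s, T)
        first_index = min(first_index + s, T)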


@ -0,0 +1,16 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
class SurveyData(object):
def __init__(self, alias, title, abstract, responseId, surveyId, link, image):
self.alias = alias
self.title = title
self.abstract = abstract
self.responseId = responseId
self.surveyId = surveyId
self.link = link
self.image = image
def toCols(self):
return [colname for colname in self.__dict__]


@ -0,0 +1,441 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import math
import os
import numpy as np
import scipy.io
import scipy.stats
np.random.seed(0)
# Two functions for use by users of the library:
# 1.
# fit(data, ...) - Fits the counting grid model to the data set. If layers > 1,
# a hierarchical, layered counting grid is learned.
# 2.
# predict_probabilities(bagofwordcounts, k) - computes the likelihood,
# i.e. the probability of the data given the parameters and the choice of window.
class CountingGridModel():
def __init__(self, extent, window):
"""
extent is a 1-D array of size D in the paper.
D is often 2, since it makes the model easily visualizable.
        window is a one-dimensional array of size D in the paper.
"""
# E in the paper
self.extent = extent
# W in the paper
self.window = window
self.D = len(extent)
# Each word is some value between 1 & the size of vocabulary
self.vocabulary = np.array([])
self.extent_volume = 1
self.window_volume = 1
for v in self.window:
self.window_volume *= v
for v in self.extent:
self.extent_volume *= v
self.capacity = self.extent_volume / self.window_volume
# Assumes: self.pi, self.q,self.extent are set properly
def cg_layers(self, data, L, noise=1e-10):
T, Z = data.shape
pi_la = np.zeros([self.extent[0], self.extent[1], Z, L])
h_la = np.zeros([self.extent[0], self.extent[1], Z, L])
# Modifies: h_la
def layer_compute_h(pi_la, h_la):
h = self.compute_h(pi_la[:, :, :, l], self.window)
h_la[:, :, :, l] = np.transpose(
np.transpose(h) / np.transpose(np.sum(h, axis=2))
)
# Add noise to pi
for l in range(L):
pi_la[:, :, :, l] = (
self.pi + np.random.uniform(size=self.pi.shape)*noise
)
# Normalize
pi_la[:, :, :, l] = np.transpose(
np.transpose(pi_la[:, :, :, l]) /
np.transpose(np.sum(pi_la[:, :, :, l], axis=2))
)
layer_compute_h(pi_la, h_la)
P = np.prod(self.extent)
# lq/lql is the log of q
# lq/lql is shaped as ([P,T])
        # self.q is stored as T x E1 x E2, which is why we move the time axis to the end
        # before reshaping to (P, T), and move it back after normalizing.
        def normalize_q(lql):
            lqlmax = np.amax(lql, axis=0)
precompute_lql_minus_max = lql - lqlmax
# qla has the same formula, essentially, but not in the log form
Lq = np.reshape(
precompute_lql_minus_max -
np.log(np.sum(np.exp(precompute_lql_minus_max), axis=0)),
[self.extent[0], self.extent[1], T]
)
return np.moveaxis(np.exp(Lq), 2, 0)
def update_summand(self, toroidal_F, M, N):
A = toroidal_F.cumsum(axis=0).cumsum(axis=1)
c = np.pad(
A,
pad_width=tuple([(1, 0) for x in list(self.extent)] + [(0, 0)]), mode='constant', constant_values=0
)
w0 = self.window[0]
w1 = self.window[1]
return (
c[slice(w0, toroidal_F.shape[0]+1), slice(w1, toroidal_F.shape[1]+1), :] -
c[slice(0, self.extent[0]), slice(w1, toroidal_F.shape[1] + 1), :] -
c[slice(w0, toroidal_F.shape[0]+1), slice(0, self.extent[1]), :] +
c[slice(0, self.extent[0]), slice(0, self.extent[1]), :]
)[slice(0, toroidal_F.shape[0]), slice(0, toroidal_F.shape[1]), :]
# Adding noise to q, which was not originally in the matlab code
self.q = self.q + 0.25*noise
lql = np.log(np.reshape(np.moveaxis(self.q, 0, -1), (P, T)))
self.q = normalize_q(lql)
        qlsm = np.fft.ifft2(np.fft.fft2(
np.reshape(self.q, (T, P))
)).real.astype(np.float64)
lql = np.log(np.reshape(np.moveaxis(self.q, 0, -1), (P, T)))
# Check the distribution properties of ql
'''
assert( np.all(np.isclose(np.sum(self.q,axis=0)[:,:],1 ) )==True)
assert(np.any((self.q) <0 ) == False)
'''
alpha = 1e-10
miter = 1
SCALING_FACTOR = 2.5
        # ~1/2 the average number of words per document; make this number smaller as the counting grid gets bigger
pseudocounts = np.mean(np.sum(data, axis=1)) / (P*SCALING_FACTOR)
# This is the posterior for each layer Q( Layer | document ).
qla = np.ones([L, T])
lqla = np.log(qla)
plal = np.ones([L, P]) / L # P(layer | position in counting grid)
dirichlet_prior = np.ones([L, Z])
# Didn't implement the fft to smooth the posterior probabilities
# of picking a location, differing from previous code by choice.
every_iter = 1
nmax = 2 # maximum number of iterations
start_ql = 1
minp = 1e-10
TOROID_ARGUMENTS = [(self.window[0], 0), (self.window[1], 0), (0, 0)]
eps = (np.finfo(h_la.dtype).eps)
for iter in range(nmax):
# assert( np.all(np.isclose(np.sum(np.reshape(np.transpose(self.q),(P,T)),axis=0),1 ) ))
# assert(np.any((self.q) <0 ) == False)
# assert(np.all(np.isclose(np.sum(qla,axis=0),1 )))
# assert(np.any((qla) <0 ) == False)
if iter >= start_ql:
lql = np.zeros([P, T])
for l in range(L):
tmp = np.reshape(np.log(eps + h_la[:, :, :, l]), [P, Z])
# qla is the Q(layer) in the mean field posterior factorization, its structure is L,T
lql = lql + np.dot(tmp, np.transpose(data))*qla[l, :]
self.q = np.reshape(
normalize_q(lql),
[T, self.extent[0], self.extent[1]]
)
# Didn't use the smoothened ql
qlsm = np.fft.ifft2(np.fft.fft2(
np.reshape(self.q, (T, P))
)).real.astype(np.float64)
# update Q(layer)
tmpq = np.reshape(np.moveaxis(np.copy(self.q), 0, -1), [P, T])
for l in range(L):
tmp = np.reshape(np.log(alpha + h_la[:, :, :, l]), [P, Z])
lqla[l, :] = np.sum(
tmpq*(np.dot(tmp, np.transpose(data)) +
np.reshape(np.transpose(np.log(plal[l, :])), [P, 1])),
axis=0
)
lqlamax = np.amax(lqla, axis=0)
qla = np.exp(
(lqla - lqlamax) - np.log(np.sum(np.exp((lqla - lqlamax)), axis=0))
)
# M-STEP. Basically the normal CG M-Step, repeated #Layers times.
# Reimplemented it to include the dirichlet prior in the mix
for l in range(L):
# Dirichlet prior. Does very little in practice, may be useful only in the early stages of learning... leave it.
tmpdirip = dirichlet_prior[l, :] - 1
tmpdirip[np.isnan(tmpdirip)] = 0
# Recall that self.q is T x E1 x E2 tensor
# Original Matlab code here for reference:
#
# nrm = bsxfun(@plus, reshape(tmpdirip,[1 1 Z]),
# reshape( reshape( padarray( ql, W, 'circular','pre'),
# [prod(cg_size+W),T])*bsxfun( @times, WD, qla(l,:))', [ cg_size+W,Z ]));
first = np.reshape(np.pad(np.moveaxis(
self.q, 0, -1), TOROID_ARGUMENTS, 'wrap'), [np.prod(self.extent+self.window), T])
            # using the transpose function makes sense here because we're just swapping 2 dimensions
D = np.dot(first, np.transpose(np.transpose(data) * qla[l, :]))
nrm = np.reshape(tmpdirip, [1, 1, Z]) + np.reshape(
D, [self.extent[0]+self.window[0], self.extent[1]+self.window[1], Z])
QH = nrm / \
np.pad(h_la[:, :, :, l] + np.prod(self.window) * alpha, TOROID_ARGUMENTS, 'wrap')
QH = QH[slice(1, self.extent[0]+self.window[0]),
slice(1, self.extent[1]+self.window[1]), :]
QH = update_summand(self, QH, T, Z)
QH[QH < 0] = 0
un_pi = pseudocounts + QH*(pi_la[:, :, :, l]+alpha)
mask = np.sum(un_pi, 2) != 0
nmask = np.sum(un_pi, 2) == 0
M1 = np.transpose(np.transpose(
un_pi)*np.transpose(mask.astype(np.float64)) / np.transpose(np.sum(un_pi, 2)))
M2 = np.ones([Z, self.extent[0], self.extent[1]]) * \
(nmask).astype(np.float64)
pi_la[:, :, :, l] = M1 + np.moveaxis((1.0/Z) * M2, 0, 2)
layer_compute_h(pi_la, h_la)
qlsm = np.fft.ifft2(np.fft.fft2(np.reshape(
self.q, (T, P)))).real.astype(np.float64)
A = np.sum(qlsm, axis=0)
if np.any(np.isclose(A, 0)):
A += eps
plal = np.transpose(np.reshape(np.sum(np.reshape(np.moveaxis(
qlsm, 0, -1), [self.extent[0], self.extent[1], T, 1]) * np.reshape(np.transpose(qla), [1, 1, T, L]), axis=2), [P, L]))/A
plal[plal < 1e-100] = 1e-100
plal = plal / np.sum(plal, axis=0) # sum over the layers
INVERSE_DOCUMENT_FREQUENCY = np.log(
data.shape[0] + eps) - np.log(np.sum((data > 0).astype(np.float64), axis=0) + eps)
pi_la_idf = np.zeros(pi_la.shape)
for l in range(L):
pi_la_idf[:, :, :, l] = pi_la[:, :, :, l] * \
INVERSE_DOCUMENT_FREQUENCY
id_layers = np.argmax(qla, axis=0) + 1
wg_ep = eps
mask = np.pad(np.ones(self.window), [
(0, x) for x in self.extent-self.window], 'constant', constant_values=0)
wg = np.zeros([self.extent[0], self.extent[1], L])
for l in range(L):
idl = np.where(id_layers-1 == l)[0] # id_layer in matlab format
for t in range(len(idl)):
wg[:, :, l] = wg[:, :, l] + np.fft.ifft2(np.fft.fft2(mask)*np.fft.fft2(
self.q[idl[t], :, :])).real.astype(np.float64) # m x E structure
# This makes wg not a distribution.
wg = np.transpose(np.transpose(
wg) / (np.sum(np.transpose(wg), axis=0) + wg_ep))
# sum over the layers
pi2_idf = np.sum(
pi_la_idf*np.reshape(wg, [self.extent[0], self.extent[1], 1, L]), axis=3)
# Renormalize Pi after using inverse document frequency.
pi2_idf = np.transpose(np.transpose(pi2_idf) /
np.transpose(np.sum(pi2_idf, axis=2)))
return {
"pi2_idf": pi2_idf, "pi_la_idf": pi_la_idf, "id_layer": [id_layers],
"ql2": self.q, "counts_to_show": np.transpose(data), "indices_to_show": [np.array(list(range(T))) + 1],
"pi": self.pi, "pi_la": pi_la
}
def fit(
self, data, max_iter=100, returnSumSquareDifferencesOfPi=False,
noise=.000001, learn_pi=True, pi=None, layers=1, output_directory="./", heartBeaters=None
):
"""
Implements variational expectation maximization for the Counting Grid model
Assumes: data is an m x n matrix
        TODO: return early if the fit converges; don't just run for max_iter.
"""
if not os.path.exists(str(output_directory)):
raise Exception(
"output_directory does not exist for counting grids trainer."
)
def SSD(pi, piHat):
A = np.abs(pi - piHat)
return np.sum(A * A)
alpha = 1e-10
SSDPi = []
data = data.astype(np.float64)
if pi is None:
self.initializePi(data)
else:
self.pi = pi
self.h = self.compute_h(self.pi, self.window)
self.check_model()
extentProduct = np.prod(self.extent)
T, _ = data.shape
pseudocounts = np.mean(np.sum(data, axis=1) / extentProduct) / 2.5
# q is an m x dim(extent) structure
qshape = [len(data)]
for v in self.extent:
qshape.append(v)
self.q = np.zeros(tuple(qshape))
i = 0
while i < max_iter:
# E-Step
self.q = self.q_update(data)
# M-Step
if learn_pi:
if returnSumSquareDifferencesOfPi:
pi = self.pi
self.pi = self.pi_update(data, pseudocounts, alpha)
if returnSumSquareDifferencesOfPi:
piHat = self.pi
SSDPi.append(SSD(pi, piHat))
self.h = self.compute_h(self.pi, self.window)
i = i + 1
[(h.makeProgress(int(100*i/max_iter)) if h is not None else False)
for h in heartBeaters] if heartBeaters is not None else False
if layers > 1:
self.layercgdata = self.cg_layers(data, L=layers, noise=noise)
scipy.io.savemat(str(output_directory) +
"/CountingGridDataMatrices.mat", self.layercgdata)
else:
scipy.io.savemat(str(output_directory) +
"/CGData.mat", {"pi": self.pi, "q": self.q})
return self.pi
# assumptions that we need for the model to be valid
def check_model(self):
assert(len(self.extent) == len(self.window))
for v in self.extent:
assert(int(v) == v)
assert(v > 0)
for v in self.window:
assert(int(v) == v)
assert(v > 0)
# dimensions of pi is one more than the window
assert(self.pi.ndim - 1 == len(self.window))
# w0 and w1 are window size
def compute_h_noLoopFull(self, PI, w0, w1):
return PI[w0:, w1:, :] - PI[:-w0, w1:, :] - PI[w0:, :-w1, :] + PI[:-w0, :-w1, :]
# no side effects
def compute_h(self, pi, W):
PI = np.pad(pi, [(0, W[0]), (0, W[1]), (0, 0)],
'wrap').cumsum(axis=0).cumsum(axis=1)
PI = np.pad(PI, [(1, 0), (1, 0), (0, 0)], 'constant')
w0 = W[0]
w1 = W[1]
cumsum_output = self.compute_h_noLoopFull(PI, w0, w1)
return np.moveaxis(np.moveaxis(cumsum_output[:-1, :-1, :], 2, 0)/np.sum(cumsum_output[:-1, :-1, :], axis=2), 0, -1)
# How to initialize pi
# Note that we don't want pi to be 0, since our update equations depend on a multiplication by pi
def initializePi(self, data, technique="uniform"):
if technique is "uniform":
size = [x for x in self.extent]
size.append(data.shape[1])
self.pi = np.random.random(size=tuple(size)).astype(np.float64)
else:
raise ValueError("No initialize strategy given")
def pi_update(self, data, pseudocounts, alpha):
'''
Modifies: pi
Assumes: self.q has been initialized (in the fit function) and that h has been initialized
Assumes: a two dimensional extent
Recall: q is M x E tensor
'''
T, Z = data.shape
W = self.window
# QdotConH is called nrm in matlab engine, but padding is done beforehand in matlab
QdotConH = np.dot(np.moveaxis(self.q, 0, -1,), data)
QH = np.pad(QdotConH / (self.h + np.prod(self.window)*alpha),
[(W[0], 0), (W[1], 0), (0, 0)], 'wrap').cumsum(axis=0).cumsum(axis=1)
w0 = W[0]
w1 = W[1]
QH = self.compute_h_noLoopFull(QH, w0, w1)
QH[QH < 0] = 0
un_pi = pseudocounts + QH*(self.pi+alpha)
mask = (np.sum(un_pi, axis=2) != 0).astype(np.float64)
not_mask = (np.sum(un_pi, axis=2) == 0).astype(np.float64)
denom = np.sum(un_pi, axis=2)
self.pi = np.transpose(np.transpose(mask)*(np.transpose(un_pi) / np.transpose(denom))) + \
(1.0/Z) * np.transpose(np.transpose(
np.ones([self.extent[0], self.extent[1], Z]))*np.transpose(not_mask))
return self.pi
def get_indices_for_window_indexed_by_k(self, k, z):
indices = [[]]*len(self.extent)
for j, v in enumerate(indices):
indices[j] = slice(k[j], k[j] + self.window[j])
indices.append(z) # the z index in the paper
return tuple(indices)
def get_h(self):
return self.h
def q_update(self, data):
L = np.prod(self.extent)
lql = np.dot(np.log(self.h).reshape(
(L, data.shape[1])), np.transpose(data))
lqlmax = np.amax(lql, axis=0)
min_prob = 1.0/(10*L)
Lq = ((lql-lqlmax)-np.log(np.sum(np.exp(lql-lqlmax), axis=0))
).reshape(tuple(list(self.extent) + [data.shape[0]]))
q = np.exp(Lq)
q[q < min_prob] = min_prob
q = q / np.sum(np.sum(q, axis=0), axis=0)
return np.moveaxis(q, 2, 0)
# Assumes: bagofwordcounts are integers
def predict_probabilities(self, bagofwordcounts, k):
assert(len(k) == len(self.window))
log_p = 0 # log of the products is the sum of the logs
for (i, count) in enumerate(bagofwordcounts):
if count > 0:
indices = self.get_indices_for_window_indexed_by_k(k, i)
log_p += np.log(np.sum(self.pi[indices].reshape(
self.pi[indices].size)))*float(count)
# 1/normalizationfactor * probability
# e ^ (log(prob) - log(normalization factor))
return math.pow(
math.e,
log_p - np.sum(bagofwordcounts)*np.sum(np.log(self.window))
)
if __name__ == "__main__":
data = np.array([
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
[1, 2, 1, 0, 1, 0, 0, 0, 0, 0],
[0, 1, 1, 0, 0, 1, 1, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 1, 1, 1]
])
extent = np.array([3, 3])
window = np.array([2, 2])
model = CountingGridModel(extent, window)
model.fit(data, max_iter=1000,
returnSumSquareDifferencesOfPi=False, layers=2)
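    # --- Illustrative sketch (an assumption, not part of the original file): after
    # fitting, predict_probabilities scores a bag of word counts against the window
    # anchored at grid position k; scanning every position finds the window under
    # which the first toy document is most likely.
    best_k, best_p = None, -1.0
    for r in range(extent[0]):
        for c in range(extent[1]):
            p = model.predict_probabilities(data[0], [r, c])
            if p > best_p:
                best_k, best_p = [r, c], p
    print("Most likely window position for document 0:", best_k)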


@ -0,0 +1,3 @@
from .CountingGridModel import CountingGridModel
__all__ = ['CountingGridModel']


@ -0,0 +1,29 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import unittest
import numpy as np
from CountingGridsPy.models import CountingGridModel
class TestCorrectnessOfNontrivialDesignMatrix(unittest.TestCase):
def setUp(self):
M, N = [5, 7]
self.N = N
self.data = np.array(
list(range(1, 8))+list(range(7, 0, -1)) +
[1]*7+[0, 1, 0, 1, 0, 1, 0]+list(range(1, 8))
).reshape((M, N))
# note: after one iteration h distribution is the same regardless of position on matrix or window size
self.extent = np.array([5, 5])
window = np.array([2, 3])
self.pi_init = np.ones([5]*2+[N])/1000
self.model = CountingGridModel(self.extent, window)
def test_correct_data(self):
assert(sum(sum(self.data)) == 94)
def test_fitted_model(self):
self.model.fit(self.data, max_iter=1,
returnSumSquareDifferencesOfPi=False, pi=np.copy(self.pi_init))
assert(np.all(np.isclose(self.model.q, .04)))
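# --- Illustrative addition (an assumption, not part of the original file): wire the test
# case into unittest's own runner so this file can also be executed directly.
if __name__ == "__main__":
    unittest.main()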

20
pipeline.yaml Normal file

@ -0,0 +1,20 @@
- job:
pool:
vmImage: 'ubuntu-16.04'
strategy:
matrix:
Python36:
python.version: '3.6'
steps:
- task: UsePythonVersion@0
displayName: 'Use Python $(python.version)'
inputs:
versionSpec: '$(python.version)'
- script: pip install pep8
displayName: 'Install pep8 linter'
- script: pep8 . --max-line-length=150
displayName: 'Run pep8 linter'