Learning and deploying counting grids service code
Parent: 230c06ccee
Commit: 99f4d5367a
@@ -0,0 +1,5 @@
{
    "python.linting.pylintEnabled": true,
    "python.linting.enabled": true,
    "python.linting.lintOnSave": true
}
@@ -0,0 +1,292 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore

src/keys.json
src/metadata.json
src/metadata_other.json

# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates

# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs

# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/

# Visual Studio 2015 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

# NUNIT
*.VisualState.xml
TestResult.xml

# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c

# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
**/Properties/launchSettings.json

*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc

# Chutzpah Test files
_Chutzpah*

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb

# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap

# TFS 2012 Local Workspace
$tf/

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user

# JustCode is a .NET coding add-in
.JustCode

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# Visual Studio code coverage results
*.coverage
*.coveragexml

# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*

# MightyMoose
*.mm.*
AutoTest.Net/

# Web workbench (sass)
.sass-cache/

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to checkin your web deploy settings
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj

# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/

# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/packages/*
# except build/, which is used as an MSBuild target.
!**/packages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/packages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets

# Microsoft Azure Build Output
csx/
*.build.csdef

# Microsoft Azure Emulator
ecf/
rcf/

# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt

# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/

# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs

# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm

# SQL Server files
*.mdf
*.ldf
*.ndf

# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings

# Microsoft Fakes
FakesAssemblies/

# GhostDoc plugin setting file
*.GhostDoc.xml

# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/

# Typescript v1 declaration files
typings/

# Visual Studio 6 build log
*.plg

# Visual Studio 6 workspace options file
*.opt

# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw

# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions

# Paket dependency manager
.paket/paket.exe
paket-files/

# FAKE - F# Make
.fake/

# JetBrains Rider
.idea/
*.sln.iml

# CodeRush
.cr/

# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc

# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config

# Telerik's JustMock configuration file
*.jmconfig

# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
@@ -0,0 +1,34 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from jobStatus import JobStatus


class BatchJob():
    def __init__(self, id_in: str, jobStatus_in, progress_in: int):
        self.id = id_in
        self.jobStatus = jobStatus_in
        self.progress = progress_in

    def next(self):
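        # Advance the job one state: NotStarted -> PreProcessing -> Training -> Success.
        # Calling next() from Success or Failure raises a ValueError.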
        if self.jobStatus.value in [JobStatus.NotStarted, JobStatus.PreProcessing, JobStatus.Training]:
            self.jobStatus = JobStatus(self.jobStatus.value + 1)
        else:
            raise ValueError("Invalid job status value.")

    def makeProgress(self, progress: int):
        if progress < 0 or progress > 100:
            raise ValueError("Invalid progress value.")
        else:
            self.progress = progress


if __name__ == "__main__":
    b = BatchJob("", JobStatus.NotStarted, 0)
    b.next()
    assert(b.jobStatus == JobStatus.PreProcessing)
    b.makeProgress(5)
    assert(b.progress == 5)

    import json
    print(json.dumps(b.__dict__))
@@ -0,0 +1,272 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import datetime
import io
import os
import sys
import time
from datetime import timedelta
import uuid

import azure.storage.blob as azureblob
import azure.batch.batch_service_client as batch
import azure.batch.models as batchmodels

sys.path.append('.')
sys.path.append('..')


def query_yes_no(question, default="yes"):
    """
    Prompts the user for yes/no input, displaying the specified question text.

    :param str question: The text of the prompt for input.
    :param str default: The default if the user hits <ENTER>. Acceptable values
        are 'yes', 'no', and None.
    :rtype: str
    :return: 'yes' or 'no'
    """
    valid = {'y': 'yes', 'n': 'no'}
    if default is None:
        prompt = ' [y/n] '
    elif default == 'yes':
        prompt = ' [Y/n] '
    elif default == 'no':
        prompt = ' [y/N] '
    else:
        raise ValueError("Invalid default answer: '{}'".format(default))

    while True:
        choice = input(question + prompt).lower()
        if default and not choice:
            return default
        try:
            return valid[choice[0]]
        except (KeyError, IndexError):
            print("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")


def print_batch_exception(batch_exception):
    """
    Prints the contents of the specified Batch exception.

    :param batch_exception: The BatchErrorException to print.
    """
    print('-------------------------------------------')
    print('Exception encountered:')
    if batch_exception.error and \
            batch_exception.error.message and \
            batch_exception.error.message.value:
        print(batch_exception.error.message.value)
        if batch_exception.error.values:
            print()
            for mesg in batch_exception.error.values:
                print('{}:\t{}'.format(mesg.key, mesg.value))
    print('-------------------------------------------')


def download_file_from_container(block_blob_client, container_name, file_path, blob_name):
    print("\nDownloading blob to " + file_path)
    block_blob_client.get_blob_to_path(container_name, blob_name, file_path)


def upload_file_to_container(block_blob_client, container_name, file_path):
    """
    Uploads a local file to an Azure Blob storage container.

    :param block_blob_client: A blob service client.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str container_name: The name of the Azure Blob storage container.
    :param str file_path: The local path to the file.
    :rtype: `azure.batch.models.ResourceFile`
    :return: A ResourceFile initialized with a SAS URL appropriate for Batch
        tasks.
    """
    blob_name = os.path.basename(file_path)

    print('Uploading file {} to container [{}]...'.format(file_path,
                                                          container_name))

    block_blob_client.create_blob_from_path(container_name,
                                            blob_name,
                                            file_path)

    # Obtain the SAS token for the container.
    sas_token = get_container_sas_token(block_blob_client,
                                        container_name, azureblob.BlobPermissions.READ)

    sas_url = block_blob_client.make_blob_url(container_name,
                                              blob_name,
                                              sas_token=sas_token)

    return batchmodels.ResourceFile(file_path=blob_name,
                                    blob_source=sas_url)


def get_container_sas_token(block_blob_client,
                            container_name, blob_permissions):
    """
    Obtains a shared access signature granting the specified permissions to the
    container.

    :param block_blob_client: A blob service client.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str container_name: The name of the Azure Blob storage container.
    :param BlobPermissions blob_permissions: The permissions the SAS token should grant.
    :rtype: str
    :return: A SAS token granting the specified permissions to the container.
    """
    # Obtain the SAS token for the container, setting the expiry time and
    # permissions. In this case, no start time is specified, so the shared
    # access signature becomes valid immediately. Expiration is in 2 hours.
    container_sas_token = \
        block_blob_client.generate_container_shared_access_signature(
            container_name,
            permission=blob_permissions,
            expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=2))

    return container_sas_token


def get_container_sas_url(block_blob_client,
                          container_name, blob_permissions):
    """
    Obtains a shared access signature URL that provides write access to the
    output container to which the tasks will upload their output.

    :param block_blob_client: A blob service client.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str container_name: The name of the Azure Blob storage container.
    :param BlobPermissions blob_permissions: The permissions the SAS URL should grant.
    :rtype: str
    :return: A SAS URL granting the specified permissions to the container.
    """
    # Obtain the SAS token for the container.
    sas_token = get_container_sas_token(block_blob_client,
                                        container_name, azureblob.BlobPermissions.WRITE)

    # Construct SAS URL for the container
    container_sas_url = "https://{}.blob.core.windows.net/{}?{}".format(
        _STORAGE_ACCOUNT_NAME, container_name, sas_token)

    return container_sas_url


def create_pool(batch_service_client, pool_id, vm_size, imageName, versions, auto_scale_formula):
    """
    Creates a pool of compute nodes with the specified OS settings.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str pool_id: An ID for the new pool.
    :param str vm_size: The VM size for the pool's nodes.
    :param str imageName: The name of the custom managed image to boot nodes from.
    :param list versions: The application package versions to deploy to the pool.
    :param str auto_scale_formula: The auto-scale formula for the pool.
    """
    print('Creating pool [{}]...'.format(pool_id))
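
    # The pool boots nodes from a custom managed image in the BrowseCloud resource group
    # and auto-scales with the supplied formula, evaluated every 5 minutes.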
    new_pool = batch.models.PoolAddParameter(
        id=pool_id,
        virtual_machine_configuration=batchmodels.VirtualMachineConfiguration(
            image_reference=batchmodels.ImageReference(
                virtual_machine_image_id="/subscriptions/{}/resourceGroups/{}/providers/Microsoft.Compute/images/{}".format(
                    "ad49354a-6ce2-4dae-a51d-b6907372f608", "BrowseCloud", imageName)
            ),
            node_agent_sku_id="batch.node.windows amd64"),
        vm_size=vm_size,
        start_task=None,
        enable_auto_scale=True,
        auto_scale_formula=auto_scale_formula,
        application_package_references=[batchmodels.ApplicationPackageReference(
            application_id="browsecloudtrainer", version=version) for version in versions],
        auto_scale_evaluation_interval=timedelta(
            minutes=5)  # the smallest evaluation interval
    )

    batch_service_client.pool.add(new_pool)


def create_job(batch_service_client, job_id, pool_id):
    """
    Creates a job with the specified ID, associated with the specified pool.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param str pool_id: The ID for the pool.
    """
    print('Creating job [{}]...'.format(job_id))

    job = batch.models.JobAddParameter(
        id=job_id,
        pool_info=batch.models.PoolInformation(pool_id=pool_id))

    batch_service_client.job.add(job)


def add_training_task(batch_service_client, job_id, command, displayName, containerName, blobName):
    """
    Adds a task to the specified job.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID of the job to which to add the task.
    :param str command: The command line the task runs.
    :param str displayName: The display name of the task.
    :param str containerName: The name of the container with the training data.
    :param str blobName: The name of the training data blob.
    """

    # sample Linux command "/bin/bash -c \"ffmpeg -i {} {} \"
    # sample Windows command "cmd /c cd .. & cd C:\Users\chbuja"

    GUID = str(uuid.uuid4())[:63]
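    # str(uuid.uuid4()) is 36 characters, so the slice above is just a guard that keeps
    # the task id within Azure Batch's 64-character id limit.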

    autouser = batchmodels.AutoUserSpecification(
        scope='task', elevation_level='admin')
    userId = batchmodels.UserIdentity(auto_user=autouser)

    tasks = list()
    tasks.append(batch.models.TaskAddParameter(
        id='{}'.format(GUID),
        command_line=command,
        display_name=displayName,
        user_identity=userId
    ))
    batch_service_client.task.add_collection(job_id, tasks)


def wait_for_tasks_to_complete(batch_service_client, job_id, timeout):
    """
    Returns when all tasks in the specified job reach the Completed state.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The id of the job whose tasks should be monitored.
    :param timedelta timeout: The duration to wait for task completion. If all
        tasks in the specified job do not reach Completed state within this time
        period, an exception will be raised.
    """
    timeout_expiration = datetime.datetime.now() + timeout

    print("Monitoring all tasks for 'Completed' state, timeout in {}..."
          .format(timeout), end='')

    while datetime.datetime.now() < timeout_expiration:
        print('.', end='')
        sys.stdout.flush()
        tasks = batch_service_client.task.list(job_id)

        incomplete_tasks = [task for task in tasks if
                            task.state != batchmodels.TaskState.completed]
        if not incomplete_tasks:
            print()
            return True
        else:
            time.sleep(1)

    print()
    raise RuntimeError("ERROR: Tasks did not reach 'Completed' state within "
                       "timeout period of " + str(timeout))
@@ -0,0 +1,58 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import json
import adal
from msrestazure.azure_active_directory import AADTokenCredentials
import requests


class BrowseCloudServiceAuthorizer():
    '''
    Retrieves credentials that let the trainer send incremental results to the
    BrowseCloud service.

    Example:

        url = 'https://contoso-browsecloud-service.azurewebsites.net/api/v1/documents'
        bcsa = BrowseCloudServiceAuthorizer(resource_uri='https://contoso.oncontoso.com/BrowseCloud.Service')
        headers = {'Content-Type': "application/json", 'Accept': "application/json", 'Authorization': bcsa.authOutput()}
        res = requests.get(url, headers=headers)
        print(res.text)
    '''

    def __init__(self, resource_uri: str):
        self.token = ""
        self.resource_uri = resource_uri

    def retrieveToken(self):
        """
        Authenticate using a service principal with a key.
        """
        if self.resource_uri != "":

            client_id = ""
            client_secret = ""
            authority_host_uri = ""
            tenant = ""
            # TODO: get this from key vault
            with open("metadata.json", "r") as f:
                data = json.load(f)
                client_id = data['CLIENT_ID']
                client_secret = data['CLIENT_SECRET']
                authority_host_uri = data["AUTHORITY_HOST_URI"]
                tenant = data["TENANT"]

            authority_uri = authority_host_uri + '/' + tenant
            resource_uri = self.resource_uri

            context = adal.AuthenticationContext(
                authority_uri, api_version=None)
            self.token = context.acquire_token_with_client_credentials(
                resource_uri, client_id, client_secret)
            credentials = AADTokenCredentials(self.token, client_id)

    # Returns: 'Bearer <MY_TOKEN>'
    def authOutput(self):
        self.retrieveToken()
        return 'Bearer ' + self.token['accessToken']
@@ -0,0 +1,54 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from jobStatus import JobStatus
import threading
import requests


class CountingGridsHeartBeater():
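    # Reports the BatchJob's current status and progress to the BrowseCloud service by
    # PUTting the job as JSON from a background thread on every beat.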
    def __init__(self, url, batchJob, bcServiceAuthorizer):
        self.url = url
        self.batchJob = batchJob
        self.bcServiceAuthorizer = bcServiceAuthorizer
        self.RESET_PROGRESS_NUMBER = 0
        self.FINISHED_PROGRESS_NUMBER = 100

    def workerFactory(self, batchJob, bcServiceAuthorizer):
        url = self.url

        def worker():
            data = batchJob.__dict__
            if url != '':
                headers = {'Content-Type': "application/json", 'Accept': "application/json",
                           'Authorization': self.bcServiceAuthorizer.authOutput()}
                res = requests.put(self.url, json=data, headers=headers)
                print(res)
                print(res.text)
            else:
                print("Faking heartbeat with following batchJob:")
                print(data)

        return worker

    def beat(self):
        t = threading.Thread(target=self.workerFactory(
            self.batchJob, self.bcServiceAuthorizer), args=())
        t.setDaemon(False)
        t.start()

    def next(self):
        self.batchJob.next()
        self.batchJob.makeProgress(self.RESET_PROGRESS_NUMBER)
        self.beat()

    def makeProgress(self, progress: int):
        self.batchJob.makeProgress(progress)
        self.beat()

    def done(self, success: bool):
        if success:
            self.batchJob.jobStatus = JobStatus.Success
        else:
            self.batchJob.jobStatus = JobStatus.Failure
        self.makeProgress(self.FINISHED_PROGRESS_NUMBER)
@@ -0,0 +1,127 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from __future__ import print_function
import datetime
import io
import os
import sys
import time
from datetime import timedelta
import json
from browseCloudAzureUtilities import *

import azure.storage.blob as azureblob
import azure.batch.batch_service_client as batch
import azure.batch.batch_auth as batchauth
import azure.batch.models as batchmodels


from azure.batch import BatchServiceClient
from azure.common.credentials import ServicePrincipalCredentials

sys.path.append('.')
sys.path.append('..')


# Update the Batch and Storage account credential strings below with the values
# unique to your accounts. These are used when constructing connection strings
# for the Batch and Storage client objects.

_BATCH_ACCOUNT_NAME = ""
_BATCH_ACCOUNT_KEY = ""
_BATCH_ACCOUNT_URL = ""
_SERVICE_PRINCIPAL_KEY = ""
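
# keys.json is not checked into source control (see .gitignore); it is expected to
# provide at least "_BATCH_ACCOUNT_KEY" and "_SERVICE_PRINCIPAL_KEY_DEV".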
with open("keys.json", "r") as f:
    data = json.load(f)
    _BATCH_ACCOUNT_KEY = data['_BATCH_ACCOUNT_KEY']
    _SERVICE_PRINCIPAL_KEY = data['_SERVICE_PRINCIPAL_KEY_DEV']


if __name__ == '__main__':

    start_time = datetime.datetime.now().replace(microsecond=0)
    print('Sample start: {}'.format(start_time))
    print()

    # Create a Batch service client. We'll now be interacting with the Batch
    # service in addition to Storage.
    '''
    credentials = batchauth.SharedKeyCredentials(_BATCH_ACCOUNT_NAME,
                                                 _BATCH_ACCOUNT_KEY)
    '''
    # Have separate applications, deployment locations, scaling formulae, etc., for dev and prod.
    # This is read from a file.

    JOB_ID = ""
    POOL_VM_SIZE = ""
    POOL_ID = ""
    IMAGE_NAME = ""
    SCALING_FORMULA = ""
    VERSIONS = []
    TENANT_ID = ""
    CLIENT_ID = ""

    with open('metadata.json', 'r') as fMeta:
        dataMeta = json.load(fMeta)
        TENANT_ID = dataMeta["TENANT_ID_DEPLOYMENT_AND_TESTING"]
        CLIENT_ID = dataMeta["CLIENT_ID_DEPLOYMENT_AND_TESTING"]
        _BATCH_ACCOUNT_NAME = dataMeta["_BATCH_ACCOUNT_NAME"]
        _BATCH_ACCOUNT_URL = dataMeta["_BATCH_ACCOUNT_URL"]

        if dataMeta['ENV'] == 'DEV':
            JOB_ID = dataMeta['JOB_ID_DEV']
            POOL_VM_SIZE = dataMeta['POOL_VM_SIZE_DEV']
            POOL_ID = dataMeta['POOL_ID_DEV']
            IMAGE_NAME = dataMeta['IMAGE_NAME_DEV']
            SCALING_FORMULA = dataMeta['SCALING_FORMULA_DEV']
            VERSIONS = dataMeta['VERSIONS_DEV']
        elif dataMeta['ENV'] == 'PROD':
            JOB_ID = dataMeta['JOB_ID_PROD']
            POOL_VM_SIZE = dataMeta['POOL_VM_SIZE_PROD']
            POOL_ID = dataMeta['POOL_ID_PROD']
            IMAGE_NAME = dataMeta['IMAGE_NAME_PROD']
            SCALING_FORMULA = dataMeta['SCALING_FORMULA_PROD']
            VERSIONS = dataMeta['VERSIONS_PROD']
        else:
            raise ValueError("Environment type in metadata.json is invalid.")

    SECRET = _SERVICE_PRINCIPAL_KEY

    credentials = ServicePrincipalCredentials(
        client_id=CLIENT_ID,
        secret=SECRET,
        tenant=TENANT_ID,
        resource="https://batch.core.windows.net/"
    )

    batch_client = batch.BatchServiceClient(
        credentials,
        base_url=_BATCH_ACCOUNT_URL)

    try:
        # Create the pool that will contain the compute nodes that will execute the
        # tasks.
        create_pool(batch_client, POOL_ID, POOL_VM_SIZE,
                    IMAGE_NAME, VERSIONS, SCALING_FORMULA)

        # Create the job that will run the tasks.
        create_job(batch_client, JOB_ID, POOL_ID)

        print("Success! Pool [{}] and job [{}] were created.".format(
            POOL_ID, JOB_ID))

    except batchmodels.BatchErrorException as err:
        print_batch_exception(err)
        raise

    # Print out some timing info
    end_time = datetime.datetime.now().replace(microsecond=0)
    print()
    print('Sample end: {}'.format(end_time))
    print('Elapsed time: {}'.format(end_time - start_time))
    print()
    print()
    input('Press ENTER to exit...')
@@ -0,0 +1,229 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import hashlib
import json
import datetime
import pandas as pd
import os
import numpy as np
from CountingGridsPy.models import CountingGridModel
import traceback
from browseCloudServiceAuthorizer import BrowseCloudServiceAuthorizer
from countingGridsHeartBeater import CountingGridsHeartBeater
from jobStatus import JobStatus
from batchJob import BatchJob
import azure.storage.blob as azureblob
from browseCloudAzureUtilities import upload_file_to_container, download_file_from_container
import matplotlib.pyplot as plt
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, CGEngineWrapper, NLPCleaner, PipelineTimer
import sys
sys.path.append("../../..")


# CLI: python generateCountingGridsFromAzure.py <input_containername> <extent_size_of_grid_hyperparameter>
# <window_size_of_grid_hyperparameter> <engine_type> <inputfile_type> <inputfile_name> <output_containername>
#
# Example CLI: python generateCountingGridsFromAzure.py trainingdata 24 5 numpyEngine simpleInput dictionaryVerySmallSample.txt bighash

# Assumes: input_containername and output_containername must be <64 characters long.
if __name__ == "__main__":
    HEART_BEATER = None
    try:
        # ---------------------------------------------------------------------------------------
        # Input
        # ---------------------------------------------------------------------------------------

        errStr = "Please give valid command-line arguments."
        if len(sys.argv) != 8:
            raise ValueError(errStr)

        containerNameIn = sys.argv[1]
        EXTENT_SIZE = int(sys.argv[2])
        WINDOW_SIZE = int(sys.argv[3])
        engine_type = sys.argv[4]
        inputfile_type = sys.argv[5]
        blobName = sys.argv[6]
        containerNameOut = sys.argv[7]
        if engine_type != "numpyEngine" and engine_type != "matlabEngine":
            raise ValueError(
                "The {0} engine does not exist. Please use 'matlabEngine' or 'numpyEngine'.".format(engine_type))
        engine_type = engine_type[:-6]  # 6 characters in the word "Engine"
        if inputfile_type != "metadataInput" and inputfile_type != "simpleInput":
            raise ValueError(
                "The {0} input type does not exist. Please use 'simpleInput' or 'metadataInput'.".format(inputfile_type))
        inputfile_type = inputfile_type[:-5]  # remove "Input"

        # ---------------------------------------------------------------------------------------
        # Authentication
        # ---------------------------------------------------------------------------------------
        AUTH_URL = ""
        SERVICE_URL = ""
        _STORAGE_ACCOUNT_NAME_IN = 'browsecloudtrainingdata'
        _STORAGE_ACCOUNT_KEY_IN = ""
        _STORAGE_ACCOUNT_NAME_OUT = 'browsecloudmodelfiles'
        _STORAGE_ACCOUNT_KEY_OUT = ""

        jobId = containerNameOut
        docId = containerNameIn
        with open('metadata.json', 'r') as fMeta:
            dataMeta = json.load(fMeta)
            AUTH_URL = dataMeta["AUTH_URL"]
            _STORAGE_ACCOUNT_NAME_IN = dataMeta["_STORAGE_ACCOUNT_NAME_TRAININGDATA"]
            _STORAGE_ACCOUNT_NAME_OUT = dataMeta["_STORAGE_ACCOUNT_NAME_MODELS"]
            if dataMeta['ENV'] == 'DEV':
                # TODO: Use key vault and certificate
                # to retrieve that information instead of a temp file for keys.
                # Note that keys.json is not checked in.
                with open("keys.json", "r") as f:
                    data = json.load(f)
                    _STORAGE_ACCOUNT_KEY_IN = data['_STORAGE_ACCOUNT_KEY_DEV']
                    _STORAGE_ACCOUNT_KEY_OUT = data['_STORAGE_ACCOUNT_KEY_OUT_DEV']
                    _STORAGE_ACCOUNT_NAME_OUT = data['_STORAGE_ACCOUNT_NAME_OUT_DEV'] if (
                        '_STORAGE_ACCOUNT_NAME_OUT_DEV' in data
                    ) else _STORAGE_ACCOUNT_NAME_OUT
                    _STORAGE_ACCOUNT_NAME_IN = data['_STORAGE_ACCOUNT_NAME_IN_DEV'] if (
                        '_STORAGE_ACCOUNT_NAME_IN_DEV' in data
                    ) else _STORAGE_ACCOUNT_NAME_IN
                SERVICE_URL = dataMeta["SERVICE_URL_DEV"] + "/api/v1/jobs/" + \
                    jobId if "SERVICE_URL_DEV" in dataMeta else SERVICE_URL
            elif dataMeta['ENV'] == 'PROD':
                with open("keys.json", "r") as f:
                    data = json.load(f)
                    _STORAGE_ACCOUNT_KEY_IN = data['_STORAGE_ACCOUNT_KEY_PROD']
                    _STORAGE_ACCOUNT_KEY_OUT = data['_STORAGE_ACCOUNT_KEY_OUT_PROD']
                    _STORAGE_ACCOUNT_NAME_OUT = data['_STORAGE_ACCOUNT_NAME_OUT_PROD'] if (
                        '_STORAGE_ACCOUNT_NAME_OUT_PROD' in data
                    ) else _STORAGE_ACCOUNT_NAME_OUT
                    _STORAGE_ACCOUNT_NAME_IN = data['_STORAGE_ACCOUNT_NAME_IN_PROD'] if (
                        '_STORAGE_ACCOUNT_NAME_IN_PROD' in data
                    ) else _STORAGE_ACCOUNT_NAME_IN
                SERVICE_URL = dataMeta["SERVICE_URL_PROD"] + "/api/v1/jobs/" + \
                    jobId if "SERVICE_URL_PROD" in dataMeta else SERVICE_URL
            else:
                raise ValueError(
                    "Environment type in metadata.json is invalid.")

        BATCH_JOB = BatchJob(jobId, JobStatus.NotStarted, 0)
        SERVICE_AUTHORIZER = BrowseCloudServiceAuthorizer(AUTH_URL)
        HEART_BEATER = CountingGridsHeartBeater(
            SERVICE_URL, BATCH_JOB, SERVICE_AUTHORIZER)
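
        # Each run works in a unique directory derived from a SHA3-256 hash of the
        # document id, blob name, and current timestamp.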
        DIRECTORY_SUFFIX = hashlib.sha3_256(
            (docId+blobName+str(datetime.datetime.now())).encode()).hexdigest()
        DIRECTORY_DATA = blobName.split(".")[0] + "_" + DIRECTORY_SUFFIX

        if not os.path.isdir(DIRECTORY_DATA):
            os.mkdir(DIRECTORY_DATA)

        '''
        Algorithm:
        1. Get training data from Azure.
        2. Run learning code.
        3. Write model files to Azure.


        Changes between this and dumpCountingGrids.py:
        1. DIRECTORY_DIR must be unique.
        2. Fetching and writing to Azure. Idea is to fetch into a directory and then write it to Azure.
        '''
        blob_client = azureblob.BlockBlobService(
            account_name=_STORAGE_ACCOUNT_NAME_IN,
            account_key=_STORAGE_ACCOUNT_KEY_IN)

        download_file_from_container(
            blob_client, containerNameIn, DIRECTORY_DATA+"/"+blobName, blobName)
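
        # The backslash separator below is fine here: training tasks run on Windows Batch
        # nodes (node_agent_sku_id="batch.node.windows amd64" in create_pool).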
        FILE_NAME = DIRECTORY_DATA + "\\" + blobName

        CLEAN_DATA_FILE_NAME, MIN_FREQUENCY, MIN_WORDS = [
            "\\cg-processed.tsv", 2, 5]

        # ---------------------------------------------------------------------------------------
        # Data Cleaning
        # ---------------------------------------------------------------------------------------

        cleaner = NLPCleaner()
        HEART_BEATER.next()
        correspondences = None
        CACHED_CORRESPONDENCES_FILE_NAME = "\\cached_correspondences.tsv"
        pTimer = PipelineTimer()
        pTimer("Reading data file.")
        df, keep = cleaner.read(
            FILE_NAME, inputfile_type, MIN_FREQUENCY, MIN_WORDS)
        if not (os.path.exists(DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME) and os.path.exists(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME)):
            pTimer("Starting data cleaning.")
            cleaner.handle_negation_tokens()
            cleaner.removePunctuation()
            HEART_BEATER.makeProgress(50)
            correspondences = cleaner.lemmatize()
            cleaner.write_cached_correspondences(
                DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
            cleaner.write(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME)
        else:
            pTimer("Skipping data cleaning.")
            correspondences = cleaner.read_cached_correspondences(
                DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)

        pTimer("Learning counting grid.")
        LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"

        # ---------------------------------------------------------------------------------------
        # Learning
        # ---------------------------------------------------------------------------------------
        engine = CGEngineWrapper(
            extent_size=EXTENT_SIZE, window_size=WINDOW_SIZE, heartBeaters=[HEART_BEATER])
        HEART_BEATER.next()
        vocabulary = None
        if not os.path.exists(DIRECTORY_DATA + LEARNED_GRID_FILE_NAME):
            vocabulary, keep = engine.fit(
                DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS, MIN_FREQUENCY, keep, engine=engine_type)
        else:
            vocabulary, keep = engine.get_vocab(
                DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS, MIN_FREQUENCY, keep)

        # ---------------------------------------------------------------------------------------
        # Output
        # ---------------------------------------------------------------------------------------
        pTimer("Generating counting grid artifacts.")
        LINK_FILE_NAME = ""
        bcag = BrowseCloudArtifactGenerator(DIRECTORY_DATA)
        bcag.read(LEARNED_GRID_FILE_NAME)
        bcag.write_docmap(engine.wd_size, engine=engine_type)
        bcag.write_counts()
        bcag.write_vocabulary(vocabulary)
        bcag.write_top_pi()
        bcag.write_top_pi_layers()
        bcag.write_colors()  # write default blue colors
        bcag.write_database(df, keep)
        bcag.write_correspondences(correspondences, vocabulary)
        bcag.write_keep(keep)

        pTimer("Done.")

        blob_client = azureblob.BlockBlobService(
            account_name=_STORAGE_ACCOUNT_NAME_OUT,
            account_key=_STORAGE_ACCOUNT_KEY_OUT)

        # Apply some recursion to create multiple containers once uniqueness becomes a problem.
        blob_client.create_container(containerNameOut)

        # Upload each file, aside from the input file, into the output container.
        FILES = [f.path for f in os.scandir(DIRECTORY_DATA) if not f.is_dir()]
        for modelfile in FILES:
            if not modelfile.endswith(blobName):
                upload_file_to_container(
                    blob_client, containerNameOut, modelfile)
    except Exception as e:
        HEART_BEATER.done(success=False) if HEART_BEATER is not None else False
        print("Script failed.")
        print(traceback.format_exc())
    except:
        HEART_BEATER.done(success=False) if HEART_BEATER is not None else False
        print("Script failed.")
        print(traceback.format_exc())

    else:
        HEART_BEATER.done(success=True)
        print("Script succeeded.")
@@ -0,0 +1,201 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import hashlib
import json
import datetime
import pandas as pd
import os
import numpy as np
import requests
import re
import traceback
from itertools import islice
from browseCloudServiceAuthorizer import BrowseCloudServiceAuthorizer
from countingGridsHeartBeater import CountingGridsHeartBeater
from jobStatus import JobStatus
from batchJob import BatchJob
import azure.storage.blob as azureblob
from browseCloudAzureUtilities import upload_file_to_container, download_file_from_container
import matplotlib.pyplot as plt
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, PipelineTimer
import sys
sys.path.append("../../..")

if __name__ == "__main__":
    HEART_BEATER = None
    try:
        errStr = "Please give valid command-line arguments."
        if len(sys.argv) != 5:
            raise ValueError(errStr)

        containerNameIn = sys.argv[1]  # targetId
        metadataColumnName = sys.argv[2]  # metadata column to color by
        containerNameOut = sys.argv[3]  # id of current job
        windowSize = int(sys.argv[4])  # window size

        AUTH_URL = ""
        SERVICE_URL = ""

        _STORAGE_ACCOUNT_NAME = 'browsecloudmodelfiles'
        _STORAGE_ACCOUNT_KEY = ''

        jobId = containerNameOut
        targetId = containerNameIn

        with open("keys.json", "r") as f:
            data = json.load(f)
            _SENTIMENT_ANALYSIS_KEY = data['_SENTIMENT_ANALYSIS_KEY']

        with open('metadata.json', 'r') as fMeta:
            dataMeta = json.load(fMeta)
            AUTH_URL = dataMeta["AUTH_URL"]
            if dataMeta['ENV'] == 'DEV':
                # TODO: Use key vault and certificate to retrieve that information
                # instead of a temp file for keys.
                # Note that keys.json is not checked in.
                with open("keys.json", "r") as f:
                    data = json.load(f)
                    _STORAGE_ACCOUNT_KEY = data['_STORAGE_ACCOUNT_KEY_OUT_DEV']
                    _STORAGE_ACCOUNT_NAME = data['_STORAGE_ACCOUNT_NAME_OUT_DEV'] if (
                        '_STORAGE_ACCOUNT_NAME_OUT_DEV' in data
                    ) else _STORAGE_ACCOUNT_NAME
                SERVICE_URL = dataMeta["SERVICE_URL_DEV"] + "/api/v1/jobs/" + \
                    jobId if "SERVICE_URL_DEV" in dataMeta else SERVICE_URL
            elif dataMeta['ENV'] == 'PROD':
                with open("keys.json", "r") as f:
                    data = json.load(f)
                    _STORAGE_ACCOUNT_KEY = data['_STORAGE_ACCOUNT_KEY_OUT_PROD']
                    _STORAGE_ACCOUNT_NAME = data['_STORAGE_ACCOUNT_NAME_OUT_PROD'] if (
                        '_STORAGE_ACCOUNT_NAME_OUT_PROD' in data
                    ) else _STORAGE_ACCOUNT_NAME
                SERVICE_URL = dataMeta["SERVICE_URL_PROD"] + "/api/v1/jobs/" + \
                    jobId if "SERVICE_URL_PROD" in dataMeta else SERVICE_URL
            else:
                raise ValueError(
                    "Environment type in metadata.json is invalid.")

        BATCH_JOB = BatchJob(jobId, JobStatus.NotStarted, 0)
        SERVICE_AUTHORIZER = BrowseCloudServiceAuthorizer(AUTH_URL)
        HEART_BEATER = CountingGridsHeartBeater(
            SERVICE_URL, BATCH_JOB, SERVICE_AUTHORIZER)

        # Setup Working Directory.
        DIRECTORY_SUFFIX = hashlib.sha3_256(
            (targetId+str(datetime.datetime.now())).encode()).hexdigest()
        DIRECTORY_DATA = targetId + "_" + DIRECTORY_SUFFIX

        if not os.path.isdir(DIRECTORY_DATA):
            os.mkdir(DIRECTORY_DATA)

        '''
        Algorithm:
        1. Copy everything over from target container to job container.
        2. Separate and properly encode the feature column.
        3. Weight documents on grid.
        4. Output colors file.
        5. Write legend file.
        '''
        pTimer = PipelineTimer()

        pTimer("Downloading Target Job Files.")

        # Heart beater to training state.
        HEART_BEATER.next()
        HEART_BEATER.next()

        blob_client = azureblob.BlockBlobService(
            account_name=_STORAGE_ACCOUNT_NAME,
            account_key=_STORAGE_ACCOUNT_KEY)

        all_blobs = blob_client.list_blobs(containerNameIn)

        for blob in all_blobs:
            download_file_from_container(
                blob_client, containerNameIn, DIRECTORY_DATA+"/"+blob.name, blob.name)

        HEART_BEATER.makeProgress(20)
        pTimer("Extracting the metadata feature column.")

        # Feature Mapping data
        raw_feature = []

        def dictionary_from_line(row):
            prop_dict = {}
            row_components = row.split('\t')
            for row_component in row_components:
                row_key_value = row_component.split(':')
                prop_dict[row_key_value[0]] = row_key_value[1]
            return prop_dict

        with open(DIRECTORY_DATA+"/database.txt", 'r') as f:
            for row in f:
                prop_dict = dictionary_from_line(row)
                raw_feature.append(prop_dict[metadataColumnName])

        # first, try to convert all values to numbers
        feature_map = []
        labelTuples = []
        cm = None
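
        # If every value in the selected column parses as a float, color on a continuous
        # 'cool' colormap; otherwise fall back to a two-category 'PiYG' coloring.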
        try:
            feature_map = np.array([float(x) for x in raw_feature])
            labelTuples = [('{0:.3g}'.format(np.min(feature_map)) + " " + metadataColumnName,
                            '{0:.3g}'.format(np.max(feature_map)) + " " + metadataColumnName)]
            feature_map = (feature_map - np.min(feature_map)) / \
                (np.max(feature_map) - np.min(feature_map))
            cm = plt.get_cmap('cool')
        except:
            # Not all numbers. Try to do a 2-categorical
            set_of_values = set(raw_feature)

            if len(set_of_values) != 2:
                raise ValueError(
                    "Selected metadata row is neither numerical nor 2-categorical")

            mask = [int(x == raw_feature[0]) for x in raw_feature]
            feature_map = np.array(mask)

            cm = plt.get_cmap('PiYG')
            labelTuples = [
                (list(set_of_values - set([raw_feature[0]]))[0], raw_feature[0])]

        HEART_BEATER.makeProgress(60)

        colorTuples = [(tuple(list(cm(0))[:-1]),
                        tuple(list(cm(127))[:-1]), tuple(list(cm(255))[:-1]))]

        pTimer("Generating counting grid artifacts.")
        LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
        DOCMAP_FILE_NAME = "/docmap.txt"
        bcag = BrowseCloudArtifactGenerator(DIRECTORY_DATA)
        bcag.W = [windowSize, windowSize]
        bcag.read(LEARNED_GRID_FILE_NAME)
        bcag.read_docmap(DOCMAP_FILE_NAME, engine="numpy")
        bcag.write_colors(cm=cm, feature_map=feature_map,
                          stretch_the_truth=True)
        bcag.write_legends(labelTuples=labelTuples, colorTuples=colorTuples)
        bcag.add_feature_map_to_database(feature_map)

        HEART_BEATER.makeProgress(80)
        pTimer("Done.")

        blob_client.create_container(containerNameOut)

        # Upload each model file into the output container.
        FILES = [f.path for f in os.scandir(DIRECTORY_DATA) if not f.is_dir()]
        for modelfile in FILES:
            upload_file_to_container(blob_client, containerNameOut, modelfile)

        HEART_BEATER.makeProgress(100)
    except Exception as e:
        HEART_BEATER.done(success=False) if HEART_BEATER is not None else False
        print("Script failed.")
        print(traceback.format_exc())
    except:
        HEART_BEATER.done(success=False) if HEART_BEATER is not None else False
        print("Script failed.")
        print(traceback.format_exc())

    else:
        HEART_BEATER.done(success=True)
        print("Script succeeded.")
@@ -0,0 +1,210 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import hashlib
import json
import datetime
import pandas as pd
import os
import numpy as np
import requests
import re
import traceback
from itertools import islice
from browseCloudServiceAuthorizer import BrowseCloudServiceAuthorizer
from countingGridsHeartBeater import CountingGridsHeartBeater
from jobStatus import JobStatus
from batchJob import BatchJob
import azure.storage.blob as azureblob
from browseCloudAzureUtilities import upload_file_to_container, download_file_from_container
import matplotlib.pyplot as plt
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, PipelineTimer
import sys
sys.path.append("../../..")

if __name__ == "__main__":
    HEART_BEATER = None
    try:
        errStr = "Please give valid command-line arguments."
        if len(sys.argv) != 4:
            raise ValueError(errStr)

        containerNameIn = sys.argv[1]  # targetId
        containerNameOut = sys.argv[2]  # id of current job
        windowSize = int(sys.argv[3])  # window size

        AUTH_URL = ""
        SERVICE_URL = ""

        _STORAGE_ACCOUNT_NAME = ""
        _STORAGE_ACCOUNT_KEY = ""

        jobId = containerNameOut
        targetId = containerNameIn

        with open("keys.json", "r") as f:
            data = json.load(f)
            _SENTIMENT_ANALYSIS_KEY = data['_SENTIMENT_ANALYSIS_KEY']

        with open('metadata.json', 'r') as fMeta:
            dataMeta = json.load(fMeta)
            AUTH_URL = dataMeta["AUTH_URL"]
            _STORAGE_ACCOUNT_NAME = dataMeta["_STORAGE_ACCOUNT_NAME_MODELS"]
            SENTIMENT_ANALYSIS_ENDPOINT = dataMeta["SENTIMENT_ANALYSIS_ENDPOINT"]
            if dataMeta['ENV'] == 'DEV':
                # TODO: Use key vault and certificate to retrieve that information
                # instead of a temp file for keys.
                # Note that keys.json is not checked in.
                with open("keys.json", "r") as f:
                    data = json.load(f)
                    _STORAGE_ACCOUNT_KEY = data['_STORAGE_ACCOUNT_KEY_OUT_DEV']
                    _STORAGE_ACCOUNT_NAME = data['_STORAGE_ACCOUNT_NAME_OUT_DEV'] if (
                        '_STORAGE_ACCOUNT_NAME_OUT_DEV' in data
                    ) else _STORAGE_ACCOUNT_NAME
                SERVICE_URL = dataMeta["SERVICE_URL_DEV"] + "/api/v1/jobs/" + \
                    jobId if "SERVICE_URL_DEV" in dataMeta else SERVICE_URL
            elif dataMeta['ENV'] == 'PROD':
                with open("keys.json", "r") as f:
                    data = json.load(f)
                    _STORAGE_ACCOUNT_KEY = data['_STORAGE_ACCOUNT_KEY_OUT_PROD']
                    _STORAGE_ACCOUNT_NAME = data['_STORAGE_ACCOUNT_NAME_OUT_PROD'] if (
                        '_STORAGE_ACCOUNT_NAME_OUT_PROD' in data
                    ) else _STORAGE_ACCOUNT_NAME
                SERVICE_URL = dataMeta["SERVICE_URL_PROD"] + "/api/v1/jobs/" + \
                    jobId if "SERVICE_URL_PROD" in dataMeta else SERVICE_URL
            else:
                raise ValueError(
                    "Environment type in metadata.json is invalid.")

        BATCH_JOB = BatchJob(jobId, JobStatus.NotStarted, 0)
        SERVICE_AUTHORIZER = BrowseCloudServiceAuthorizer(AUTH_URL)
        HEART_BEATER = CountingGridsHeartBeater(
            SERVICE_URL, BATCH_JOB, SERVICE_AUTHORIZER)

        # Setup Working Directory.
        DIRECTORY_SUFFIX = hashlib.sha3_256(
            (targetId+str(datetime.datetime.now())).encode()).hexdigest()
        DIRECTORY_DATA = targetId + "_" + DIRECTORY_SUFFIX

        if not os.path.isdir(DIRECTORY_DATA):
            os.mkdir(DIRECTORY_DATA)

        '''
        Algorithm:
        1. Copy everything over from target container to job container.
        2. Get sentiment analysis results for all documents.
        3. Weight documents on grid.
        4. Output colors file.
        5. Write legend file.
        '''
        pTimer = PipelineTimer()

        pTimer("Downloading Target Job Files.")

        # Heart beater to training state.
        HEART_BEATER.next()
        HEART_BEATER.next()

        blob_client = azureblob.BlockBlobService(
            account_name=_STORAGE_ACCOUNT_NAME,
            account_key=_STORAGE_ACCOUNT_KEY)

        all_blobs = blob_client.list_blobs(containerNameIn)

        for blob in all_blobs:
            download_file_from_container(
                blob_client, containerNameIn, DIRECTORY_DATA + "/"+blob.name, blob.name)

        HEART_BEATER.makeProgress(20)
        pTimer("Getting sentiment from Azure.")

        # Feature Mapping data
        # Generate body to send to text analytics.
        base_object = {
            'documents': []
        }
        documents = []

        def chunk(it, size):
            it = iter(it)
            return iter(lambda: tuple(islice(it, size)), ())

        def dictionary_from_line(row):
            prop_dict = {}
            row_components = row.split('\t')
            for row_component in row_components:
                row_key_value = row_component.split(':')
                prop_dict[row_key_value[0]] = row_key_value[1]
            return prop_dict

        with open(DIRECTORY_DATA+"/database.txt", 'r') as f:
            for row in f:
                prop_dict = dictionary_from_line(row)
                documents.append({
                    'id': prop_dict['id'],
                    # Text analytics has a 5120 character limit.
                    'text': (prop_dict['title'] + " " + prop_dict['abstract'])[:5120]
                })

        # Call text analytics API
        requestHeaders = {
            'Content-Type': 'application/json',
            'Ocp-Apim-Subscription-Key': _SENTIMENT_ANALYSIS_KEY
        }

        # To avoid the API size limit, send in chunks of 500
        feature_map = [None] * len(documents)
        for doc_chunk in chunk(documents, 500):
            base_object['documents'] = doc_chunk
            response = requests.post(
                SENTIMENT_ANALYSIS_ENDPOINT, json=base_object, headers=requestHeaders)

            # Raise for non-200 response codes
            response.raise_for_status()

            json_response = response.json()
            for element in json_response['documents']:
                feature_map[int(element['id']) - 1] = float(element['score'])
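        # The response is keyed by document id; ids are assumed to be 1-based row
        # positions in database.txt, hence the id - 1 index above.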

        HEART_BEATER.makeProgress(60)

        # Legend data
        cm = plt.get_cmap('coolwarm_r')
        labelTuples = [("Negative Sentiment", "Positive Sentiment")]
        colorTuples = [(tuple(list(cm(0))[:-1]),
                        tuple(list(cm(127))[:-1]), tuple(list(cm(255))[:-1]))]

        pTimer("Generating counting grid artifacts.")
        LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
        DOCMAP_FILE_NAME = "/docmap.txt"
        bcag = BrowseCloudArtifactGenerator(DIRECTORY_DATA)
        bcag.W = [windowSize, windowSize]
        bcag.read(LEARNED_GRID_FILE_NAME)
        bcag.read_docmap(DOCMAP_FILE_NAME, engine="numpy")
        bcag.write_colors(cm=cm, feature_map=feature_map)
        bcag.write_legends(labelTuples=labelTuples, colorTuples=colorTuples)
        bcag.add_feature_map_to_database(feature_map)

        HEART_BEATER.makeProgress(80)
        pTimer("Done.")

        blob_client.create_container(containerNameOut)

        # Upload each model file into the output container.
        FILES = [f.path for f in os.scandir(DIRECTORY_DATA) if not f.is_dir()]
        for modelfile in FILES:
            upload_file_to_container(blob_client, containerNameOut, modelfile)

        HEART_BEATER.makeProgress(100)
    except Exception as e:
        HEART_BEATER.done(success=False) if HEART_BEATER is not None else False
        print("Script failed.")
        print(traceback.format_exc())
    except:
        HEART_BEATER.done(success=False) if HEART_BEATER is not None else False
        print("Script failed.")
        print(traceback.format_exc())

    else:
        HEART_BEATER.done(success=True)
        print("Script succeeded.")
@@ -0,0 +1,12 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from enum import IntEnum


class JobStatus(IntEnum):
    NotStarted = 0
    PreProcessing = 1
    Training = 2
    Success = 3
    Failure = 4
@@ -0,0 +1,4 @@
azure-batch==5.1.1
azure-storage-blob==1.4.0
adal==1.2.0
azure-mgmt-datalake-analytics==0.2.0
@ -0,0 +1,10 @@
|
|||
from .morphologicalTightener import MorphologicalTightener
from .slidingWindowTrainer import SlidingWindowTrainer
from .browseCloudArtifactGenerator import BrowseCloudArtifactGenerator
from .cgEngineWrapper import CGEngineWrapper
from .nlpCleaner import NLPCleaner
from .pipelineTimer import PipelineTimer


__all__ = ['MorphologicalTightener', 'BrowseCloudArtifactGenerator',
           'CGEngineWrapper', 'NLPCleaner', 'PipelineTimer', 'SlidingWindowTrainer']
|
|
@ -0,0 +1,353 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import numpy as np
|
||||
import scipy.io as io
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from CountingGridsPy.EngineToBrowseCloudPipeline import MorphologicalTightener
|
||||
|
||||
|
||||
class BrowseCloudArtifactGenerator(object):
|
||||
def __init__(self, DIRECTORY_DATA):
|
||||
self.DIRECTORY_DATA = DIRECTORY_DATA
|
||||
|
||||
def read(self, LEARNED_GRID_FILE_NAME):
|
||||
MAT = io.loadmat(self.DIRECTORY_DATA + LEARNED_GRID_FILE_NAME)
|
||||
self.pi2_idf = MAT['pi2_idf']
|
||||
self.counts_to_show = MAT['counts_to_show'] # COUNTS MATRIX
|
||||
if type(self.counts_to_show) is not np.ndarray:
|
||||
try:
|
||||
self.counts_to_show = self.counts_to_show.toarray()
|
||||
except Exception as e:
|
||||
raise ValueError(e)
|
||||
# MAPPING AFTER LAYERS CODE Q( Location | Document ) TxE
|
||||
self.ql2 = MAT['ql2']
|
||||
# ARRAY WITH THE LAYER NUMBER FOR EACH DOCUMENT, argmax over the layers
|
||||
self.id_layer = MAT['id_layer'][0]
|
||||
# LAYERED PI WEIGHTED BY IDF. E1xE2xZxLA
|
||||
self.pi_la_idf = MAT['pi_la_idf']
|
||||
try:
|
||||
# We don't need it for now. Simply 1:T
|
||||
self.indices_to_show = MAT['indices_to_show'][0]
|
||||
except Exception as e:
|
||||
# Not implemented in matlab version
|
||||
self.indices_to_show = np.array(range(MAT['ql2'].shape[0])) + 1
|
||||
del MAT
|
||||
|
||||
cgsz = np.zeros(2)
|
||||
cgsz[0], cgsz[1], self.Z = self.pi2_idf.shape
|
||||
self.cgsz = cgsz.astype(int)
|
||||
self.indexR = np.arange(0, self.cgsz[0]).astype(int)
|
||||
self.indexC = np.arange(0, self.cgsz[1]).astype(int)
|
||||
|
||||
def write_top_pi(self):
|
||||
MAXZ = 80
|
||||
self.pi2_idf = MorphologicalTightener.tighten_pi(self.pi2_idf)
|
||||
|
||||
pi_max = np.argsort(-self.pi2_idf, axis=2)[:, :, :MAXZ]
|
||||
pi_max_vals = -np.sort(-self.pi2_idf, axis=2)[:, :, :MAXZ]
|
||||
missing_words = set(range(self.Z)).difference(set(pi_max.flatten()))
|
||||
locations_missing_words = np.zeros([len(missing_words), 4])
|
||||
for m_id, m in enumerate(missing_words):
|
||||
loc = np.unravel_index(
|
||||
np.argmax(self.pi2_idf[:, :, m]), self.cgsz.astype('int'))
|
||||
locations_missing_words[m_id, :] = [
|
||||
int(m), self.pi2_idf[loc[0], loc[1], m], loc[0], loc[1]]
|
||||
|
||||
with open(self.DIRECTORY_DATA + '/top_pi.txt', 'w') as the_file:
|
||||
for r in self.indexR:
|
||||
for c in self.indexC:
|
||||
tmp = "row:" + ("%1d" % (r+1)) + "\t" + "col:" + ("%1d" % (c+1)) + "\t" + "\t".join(
|
||||
["%1d" % a + ":" + "%1.3f" % b for a, b in zip(pi_max[r, c, :], pi_max_vals[r, c, :])])
|
||||
if any((locations_missing_words[:, 2] == r) & (locations_missing_words[:, 3] == c)):
|
||||
tmp = tmp + "\t" + "\t".join([("%1d" % a) + ":" + ("%1.3f" % b) for a, b in locations_missing_words[(
|
||||
locations_missing_words[:, 2] == r) & (locations_missing_words[:, 3] == c), :2]])
|
||||
the_file.write(tmp + "\n")
|
||||
|
||||
def write_top_pi_layers(self):
|
||||
no_layers = self.pi_la_idf.shape[3]
|
||||
with open(self.DIRECTORY_DATA + '/top_pi_layers.txt', 'w') as the_file:
|
||||
for layer in range(no_layers):
|
||||
MAXZ = 80
|
||||
self.pi_la_idf[:, :, :, layer] = MorphologicalTightener.tighten_pi(
|
||||
self.pi_la_idf[:, :, :, layer])
|
||||
pi_max = np.argsort(-self.pi_la_idf[:,
|
||||
:, :, layer], axis=2)[:, :, :MAXZ]
|
||||
pi_max_vals = - \
|
||||
np.sort(-self.pi_la_idf[:, :, :,
|
||||
layer], axis=2)[:, :, :MAXZ]
|
||||
missing_words = set(range(self.Z)).difference(
|
||||
set(pi_max.flatten()))
|
||||
locations_missing_words = np.zeros([len(missing_words), 4])
|
||||
for m_id, m in enumerate(missing_words):
|
||||
loc = np.unravel_index(
|
||||
np.argmax(self.pi_la_idf[:, :, m, layer]), self.cgsz.astype(int))
|
||||
locations_missing_words[m_id, :] = [
|
||||
int(m), self.pi_la_idf[loc[0], loc[1], m, layer], loc[0], loc[1]]
|
||||
|
||||
for r in self.indexR:
|
||||
for c in self.indexC:
|
||||
tmp = "layer:" + ("%1d" % (layer+1)) + "\t" + "row:" + ("%1d" % (r+1)) + "\t" + "col:" + ("%1d" % (c+1)) + "\t" + "\t".join(
|
||||
["%1d" % a + ":" + "%1.3f" % b for a, b in zip(pi_max[r, c, :], pi_max_vals[r, c, :])])
|
||||
if any((locations_missing_words[:, 2] == r) & (locations_missing_words[:, 3] == c)):
|
||||
tmp = tmp + "\t" + "\t".join([("%1d" % a) + ":" + ("%1.3f" % b) for a, b in locations_missing_words[(
|
||||
locations_missing_words[:, 2] == r) & (locations_missing_words[:, 3] == c), :2]])
|
||||
the_file.write(tmp + "\n")
|
||||
|
||||
def write_database(self, df, keep):
|
||||
dfSave = df.copy()
|
||||
|
||||
dfSave = dfSave[keep]
|
||||
dfSave.reset_index(drop=True, inplace=True)
|
||||
dfSave["id"] = np.arange(len(dfSave)) + 1
|
||||
dfSave["layer"] = self.id_layer
|
||||
|
||||
def format_full_row(row):
|
||||
row_property_strings = []
|
||||
for column_name in set(dfSave.columns):
|
||||
row_property_strings.append(
|
||||
column_name + ":" + str(row[column_name]))
|
||||
return str.join('\t', row_property_strings) + '\n'
|
||||
databaselist = dfSave.apply(format_full_row, axis=1).tolist()
|
||||
with open(self.DIRECTORY_DATA + '/database.txt', 'w', encoding="utf-8") as the_file:
|
||||
the_file.writelines(databaselist)
|
||||
|
||||
def add_feature_map_to_database(self, feature_map):
|
||||
file_text = ""
|
||||
with open(self.DIRECTORY_DATA + '/database.txt', "r") as f:
|
||||
for index, line in enumerate(f):
|
||||
file_text += line.strip() + '\tfeature:' + \
|
||||
str(feature_map[index]) + "\n"
|
||||
|
||||
with open(self.DIRECTORY_DATA + '/database.txt', "w") as f:
|
||||
f.writelines(file_text)
|
||||
|
||||
def write_keep(self, keep):
|
||||
with open(self.DIRECTORY_DATA + '/keep.txt', 'w') as the_file:
|
||||
the_file.writelines("\n".join([str(int(x)) for x in keep]))
|
||||
|
||||
def read_docmap(self, fileName, engine="numpy"):
|
||||
if not (engine == "numpy" or engine == "matlab"):
|
||||
raise ValueError("The {} engine does not exist.".format(engine))
|
||||
|
||||
self.ql2 = np.zeros(self.ql2.shape)
|
||||
|
||||
with open(self.DIRECTORY_DATA + fileName) as f:
|
||||
for line in f:
|
||||
arr = line.split("\t")
|
||||
e1Index = int(arr[0].replace("row:", ""))-1
|
||||
e2Index = int(arr[1].replace("col:", ""))-1
|
||||
|
||||
                # since there are layers, we previously picked the max probability over the i-th layer
|
||||
i = 2
|
||||
while i < len(arr):
|
||||
docId, qVal, layer = arr[i].split(":")
|
||||
docId = int(docId)
|
||||
qVal = float(qVal)
|
||||
layer = int(layer)
|
||||
t = docId - 1
|
||||
if engine == "matlab":
|
||||
self.ql2[e1Index, e2Index, t] = qVal
|
||||
elif engine == "numpy":
|
||||
self.ql2[t, e1Index, e2Index] = qVal
|
||||
i += 1
|
||||
|
||||
def write_docmap(self, wd_size, engine="numpy"):
|
||||
docToGridMapping = np.copy(self.ql2)
|
||||
|
||||
if engine == "matlab":
|
||||
pass
|
||||
elif engine == "numpy":
|
||||
docToGridMapping = np.moveaxis(docToGridMapping, 0, -1)
|
||||
else:
|
||||
raise ValueError("The {} engine does not exist.".format(engine))
|
||||
|
||||
thr = 0.01
|
||||
mask = np.zeros(self.cgsz)
|
||||
mask[:wd_size, :wd_size] = 1
|
||||
qlSmooth = np.real(np.fft.ifft2(np.fft.fft2(docToGridMapping, axes=(
|
||||
0, 1)) * np.fft.fft2(np.expand_dims(mask, 2), axes=(0, 1)), axes=(0, 1)))
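        # Note (descriptive comment, not in the original code): multiplying 2-D FFTs and
        # inverting is a circular (toroidal) convolution, so qlSmooth is each document's
        # q map summed over a wd_size x wd_size box window that wraps around the grid edges.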
|
||||
tmp = list()
|
||||
with open(self.DIRECTORY_DATA + '/docmap.txt', 'w') as f:
|
||||
for r in self.indexR:
|
||||
for c in self.indexC:
|
||||
ids = np.where(qlSmooth[r, c, :] > thr)[0]
|
||||
vals = qlSmooth[r, c, ids]
|
||||
lay = self.id_layer[ids]
|
||||
|
||||
tmp.append("row:" + ("%1d" % (r+1)) + "\tcol:" + ("%1d" % (c+1)) + "\t" + "\t".join(
|
||||
[str(theid + 1)+":"+str(val)+":"+str(l) for theid, val, l in zip(ids, vals, lay)]) + "\n")
|
||||
f.writelines(tmp)
|
||||
|
||||
def write_correspondences(self, correspondences, vocabulary):
|
||||
'''
|
||||
Correspondences maps the lemmatized words to the original text.
|
||||
|
||||
        Example correspondences:
        {'adopt': ',adopted,adopted,adopted,adopted', 'work': ',work,work,work,work',
         'i': ',i,i,i,i,i,i,i,i', 'wish': ',wish,wish,wish,wish'}
|
||||
|
||||
'''
|
||||
li = list()
|
||||
|
||||
with open(self.DIRECTORY_DATA + '/correspondences.txt', 'w') as the_file:
|
||||
for k, v in correspondences.items():
|
||||
unique_values = list(set([w for w in v.split(",") if w != '']))
|
||||
N = len(unique_values)
|
||||
li = li + list(zip(unique_values, [k]*N))
|
||||
|
||||
tmp = list()
|
||||
|
||||
for w1, w2 in li:
|
||||
try:
|
||||
i = vocabulary.index(w2)
|
||||
tmp.append(w1 + "\t" + w2 + "\t" + str(i+1) + "\n")
|
||||
except Exception as e:
|
||||
pass
|
||||
the_file.writelines(tmp)
|
||||
|
||||
def write_cooccurences(self):
|
||||
raise Exception(
|
||||
"The coccurrences function should not be called because it's not guaranteed to be a correct artifact for BrowseCloud.")
|
||||
|
||||
def write_counts(self):
|
||||
tmp = list()
|
||||
with open(self.DIRECTORY_DATA + '/words.txt', 'w') as the_file:
|
||||
for z in range(self.counts_to_show.shape[0]):
|
||||
docIds = np.where(self.counts_to_show[z, :] != 0)[0]
|
||||
vals = np.array(self.counts_to_show[z, docIds]).flatten()
|
||||
tmp.append("id:"+str(z+1) + "\t" + "\t".join(
|
||||
[str(i + 1) + ":" + "%1d" % v for i, v in zip(docIds, vals)]) + "\n")
|
||||
the_file.writelines(tmp)
|
||||
|
||||
def write_vocabulary(self, vocabulary):
|
||||
with open(self.DIRECTORY_DATA + '/vocabulary.txt', 'w') as the_file:
|
||||
the_file.writelines(
|
||||
[str(id + 1) + "\t" + str(word) + "\n" for id, word in enumerate(vocabulary)])
|
||||
|
||||
def write_legends(self, colorTuples=None, labelTuples=None):
|
||||
if (colorTuples is not None and labelTuples is not None):
|
||||
with open(self.DIRECTORY_DATA + '/legend.txt', 'w') as the_file:
|
||||
for ct, lt in zip(colorTuples, labelTuples):
|
||||
r1, g1, b1 = ct[0]
|
||||
rm, gm, bm = ct[1]
|
||||
r2, g2, b2 = ct[2]
|
||||
l1, l2 = lt
|
||||
data = [l1, r1, g1, b1, rm, gm, bm, l2, r2, g2, b2]
|
||||
data = [str(x) for x in data]
|
||||
the_file.write("\t".join(data)+"\n")
|
||||
|
||||
    # 0. Make sure ql is a distribution over the indices again -
    #    chose not to do this because the final map used for visualization would be distorted:
    #    docToGridMapping/np.sum(np.sum(docToGridMapping,axis=0),axis=0)
    # 1. Calculate the weighted average between ql and the feature mapping for each index - weights are ql.
    # 2. Do [0, 1] normalization of the result and multiply by 255 to map the range [0, 1] to the range [0, 255].
    # 3. Use this new range to map to the RGB color scale.
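    #
    # Illustrative numbers (an example, not from the original code): if a grid cell's smoothed q
    # puts weight 0.8 on a document with sentiment 1.0 and 0.2 on a document with sentiment 0.0,
    # the cell's sentimentMapping is 0.8, and with stretch_the_truth=False its color weight is
    # 255 * 0.8 = 204.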
|
||||
def mapSentiment(self, docToGridMapping, feature_map, W=[5, 5], doNormalizeQOverGrid=True, stretch_the_truth=False):
|
||||
normalizedDocToGridMapping = None
|
||||
if doNormalizeQOverGrid:
|
||||
normalizedDocToGridMapping = docToGridMapping / \
|
||||
np.sum(docToGridMapping, axis=(0, 1))
|
||||
else:
|
||||
normalizedDocToGridMapping = np.copy(docToGridMapping)
|
||||
e0, e1, T = docToGridMapping.shape
|
||||
# toroidal- top, left, and top left
|
||||
Q = np.pad(normalizedDocToGridMapping, [
|
||||
(W[0]-1, 0), (W[1]-1, 0), (0, 0)], 'wrap').cumsum(axis=0).cumsum(axis=1)
|
||||
|
||||
        # summed-area table trick
|
||||
normalizedDocToGridMapping = Q[(
|
||||
W[0]-1):, (W[1]-1):, :] - Q[(W[0]-1):, :e1, :] - Q[:e0, (W[1]-1):, :] + Q[:e0, :e1, :]
|
||||
normalizedDocToGridMapping = np.moveaxis(np.moveaxis(
|
||||
normalizedDocToGridMapping, -1, 0) / np.sum(normalizedDocToGridMapping, axis=-1), 0, -1)
|
||||
sentimentMapping = np.dot(normalizedDocToGridMapping, feature_map)
|
||||
|
||||
weights = None
|
||||
if stretch_the_truth:
|
||||
weights = 255*(sentimentMapping - np.min(sentimentMapping.flatten())) / (np.max(
|
||||
sentimentMapping.flatten()) - np.min(sentimentMapping.flatten())) # weights between 0 and 256
|
||||
else:
|
||||
weights = 255*(sentimentMapping)
|
||||
return (sentimentMapping, weights)
|
||||
|
||||
def write_colors(self, colors=None, feature_map=None, engine="numpy", cm=None, stretch_the_truth=False):
|
||||
def valid_color_comp(c):
|
||||
return 0.0 < c and c < 1
|
||||
if colors is not None:
|
||||
for color in colors:
|
||||
if len(color) != 3 or not valid_color_comp(color[0]) or not valid_color_comp(color[1]) or not valid_color_comp(color[2]):
|
||||
raise Exception(
|
||||
"Invalid RGB color for BrowseCloud input. Must be between 0 and 1 and only 3 dimensions are given.")
|
||||
elif feature_map is not None:
|
||||
colors = [0 for d in range(len(self.indexR)*len(self.indexC))]
|
||||
docToGridMapping = np.copy(self.ql2)
|
||||
if engine == "matlab":
|
||||
pass
|
||||
elif engine == "numpy":
|
||||
# move the first axis to the third
|
||||
docToGridMapping = np.moveaxis(docToGridMapping, 0, -1)
|
||||
else:
|
||||
raise ValueError(
|
||||
"The {} engine does not exist.".format(engine))
|
||||
            # NOTE: self.W is never set in __init__, so read it defensively to avoid an AttributeError.
            W = getattr(self, "W", None)
            if W is None:
                W = [5, 5]
            else:
                W = W.copy()
|
||||
sentimentMapping, weights = self.mapSentiment(
|
||||
docToGridMapping, feature_map, W, stretch_the_truth=stretch_the_truth)
|
||||
|
||||
if cm is None:
|
||||
cm = plt.get_cmap('PuRd')
|
||||
colors = [(c[0], c[1], c[2])
|
||||
for c in cm([int(np.round(w)) for w in weights.flatten()])]
|
||||
else:
|
||||
colors = [(1.0, 1.0, 1.0)
|
||||
for d in range(len(self.indexR)*len(self.indexC))]
|
||||
with open(self.DIRECTORY_DATA + '/colors_browser.txt', 'w') as the_file:
|
||||
for r in self.indexR:
|
||||
for c in self.indexC:
|
||||
i = len(self.indexR)*r + c
|
||||
tmp = ("%1d" % (r+1)) + "\t" + ("%1d" % (c+1)) + "\t" + str(
|
||||
(colors[i][0])) + "\t"+str((colors[i][1])) + "\t" + str((colors[i][2]))
|
||||
the_file.write(tmp + "\n")
|
||||
return colors
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
bcag = BrowseCloudArtifactGenerator("")
|
||||
|
||||
    # 3 documents, each with sentiment [0, .5, 1] from negative to positive
    # 3x3 grid (9 cells)
|
||||
doc1Q = [
|
||||
[0.25, 0.25, 0],
|
||||
[0.25, 0.25, 0],
|
||||
[0, 0, 0]
|
||||
]
|
||||
doc2Q = [
|
||||
[0, 0.25, 0.25],
|
||||
[0, 0.25, 0.25],
|
||||
[0, 0, 0]
|
||||
]
|
||||
doc3Q = [
|
||||
[0, 0, 0],
|
||||
[0, 0.25, 0.25],
|
||||
[0, 0.25, 0.25]
|
||||
]
|
||||
q = np.array([doc1Q, doc2Q, doc3Q])
|
||||
feature_map = np.array([0, 0.5, 1])
|
||||
|
||||
W = [1, 1]
|
||||
|
||||
qMatlab = np.moveaxis(q, 0, -1)
|
||||
|
||||
Q = np.pad(q, [(0, 0), (W[1]-1, 0), (W[0]-1, 0)],
|
||||
'wrap').cumsum(1).cumsum(2)
|
||||
normalizedDocToGridMapping = Q[:, (W[0]-1):, (W[1]-1):] - \
|
||||
Q[:, (W[0]-1):, :3] - Q[:, :3, (W[1]-1):] + Q[:, :3, :3]
|
||||
print(normalizedDocToGridMapping)
|
||||
|
||||
result = bcag.mapSentiment(qMatlab, feature_map)[0]
|
||||
print('DONE')
|
||||
print(result)
|
|
@ -0,0 +1,135 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from CountingGridsPy.models import CountingGridModel
|
||||
from CountingGridsPy.EngineToBrowseCloudPipeline import SlidingWindowTrainer
|
||||
from scipy import io
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import scipy as sp
|
||||
import os
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
|
||||
|
||||
class CGEngineWrapper(object):
|
||||
def __init__(self, extent_size=32, window_size=5, layers=2, heartBeaters=None):
|
||||
self.cg_size = extent_size
|
||||
self.wd_size = window_size
|
||||
self.no_layers = layers
|
||||
self.heartBeaters = heartBeaters
|
||||
|
||||
def ready_the_matlab_engine(self, X, labels_filtered, DIRECTORY_DATA):
|
||||
m, n = X.shape
|
||||
s = X.data
|
||||
i = X.tocoo().row
|
||||
j = X.indices
|
||||
io.savemat(DIRECTORY_DATA + '/counts.mat',
|
||||
{'m': m, 'n': n, 's': s, 'i': i, 'j': j})
|
||||
if labels_filtered is not None:
|
||||
io.savemat(DIRECTORY_DATA + '/labels.mat',
|
||||
{'labels': labels_filtered})
|
||||
|
||||
def __fitCountVectorizor(self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, labels, MIN_FREQUENCY):
|
||||
df = pd.read_table(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME)
|
||||
df = df[df.columns[1:]]
|
||||
TEXT = df['pos_filtered'].tolist()
|
||||
id_grid = np.array(
|
||||
[i for i, t in enumerate(TEXT) if len(str(t).split(" ")) > 3]
|
||||
)
|
||||
addl_keep = np.zeros(len(TEXT))
|
||||
addl_keep[id_grid] += 1
|
||||
addl_keep = addl_keep.astype(bool)
|
||||
        # .ix was removed from pandas; id_grid holds positional indices into the default RangeIndex, so .iloc is equivalent
        df = df.iloc[id_grid].reset_index(drop=True)
|
||||
self.labelsS = np.array(
|
||||
labels)[id_grid] if labels is not None else None
|
||||
vect = CountVectorizer(decode_error="ignore", min_df=MIN_FREQUENCY)
|
||||
X = vect.fit_transform(df['pos_filtered'].tolist())
|
||||
return (vect, X, addl_keep)
|
||||
|
||||
def get_vocab(self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, labels, MIN_FREQUENCY, keep):
|
||||
vect, _, addl_keep = self.__fitCountVectorizor(
|
||||
DIRECTORY_DATA,
|
||||
CLEAN_DATA_FILE_NAME,
|
||||
labels,
|
||||
MIN_FREQUENCY
|
||||
)
|
||||
return (vect.get_feature_names(), np.array(keep) & np.array(addl_keep))
|
||||
|
||||
def incremental_fit(
|
||||
self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, labels,
|
||||
MIN_FREQUENCY, keep, engine, initial_max_iter=100,
|
||||
w=2000, s=1000, runInitialTrain=True
|
||||
):
|
||||
vect, X, addl_keep = self.__fitCountVectorizor(
|
||||
DIRECTORY_DATA,
|
||||
CLEAN_DATA_FILE_NAME,
|
||||
labels,
|
||||
MIN_FREQUENCY
|
||||
)
|
||||
|
||||
if engine != "numpy":
|
||||
raise ValueError("Not implemented!")
|
||||
|
||||
extent = np.array([self.cg_size, self.cg_size])
|
||||
window = np.array([self.wd_size, self.wd_size])
|
||||
model = CountingGridModel(extent, window)
|
||||
|
||||
T = X.shape[0]
|
||||
training_max_iter_vec = [1 for x in range(int(np.ceil((T-w)/s)) + 1)]
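        # For example (illustrative numbers only): with T = 5000 documents, w = 2000 and
        # s = 1000, this schedules int(np.ceil((5000 - 2000) / 1000)) + 1 = 4 single-iteration
        # window passes after the initial full training run.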
|
||||
train = model.fit
|
||||
kwargs = {
|
||||
"data": X.toarray(),
|
||||
"max_iter": initial_max_iter,
|
||||
"returnSumSquareDifferencesOfPi": False,
|
||||
"layers": self.no_layers,
|
||||
"noise": .00001,
|
||||
"output_directory": DIRECTORY_DATA
|
||||
}
|
||||
|
||||
print("Iteration vectors:")
|
||||
print("Running for this number of iteration:" +
|
||||
str(len([initial_max_iter] + training_max_iter_vec)))
|
||||
print([initial_max_iter] + training_max_iter_vec)
|
||||
|
||||
SlidingWindowTrainer.SlidingTrainer(
|
||||
w,
|
||||
s,
|
||||
training_max_iter_vec,
|
||||
train,
|
||||
kwargs,
|
||||
runInitialTrain=runInitialTrain
|
||||
)
|
||||
|
||||
return (vect.get_feature_names(), np.array(keep) & np.array(addl_keep))
|
||||
|
||||
def fit(self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, labels, MIN_FREQUENCY, keep, engine):
|
||||
vect, X, addl_keep = self.__fitCountVectorizor(
|
||||
DIRECTORY_DATA,
|
||||
CLEAN_DATA_FILE_NAME,
|
||||
labels,
|
||||
MIN_FREQUENCY
|
||||
)
|
||||
|
||||
if engine == "matlab":
|
||||
self.ready_the_matlab_engine(X, labels, DIRECTORY_DATA)
|
||||
os.system(r"cg_exe_buja.exe {0} counts.mat ".format(
|
||||
DIRECTORY_DATA) + str(self.cg_size) + " " + str(self.wd_size) + " " + str(self.no_layers))
|
||||
elif engine == "numpy":
|
||||
extent = np.array([self.cg_size, self.cg_size])
|
||||
window = np.array([self.wd_size, self.wd_size])
|
||||
model = CountingGridModel(extent, window)
|
||||
model.fit(
|
||||
X.toarray(),
|
||||
max_iter=100,
|
||||
returnSumSquareDifferencesOfPi=False,
|
||||
layers=self.no_layers,
|
||||
noise=.00000001,
|
||||
output_directory=DIRECTORY_DATA,
|
||||
heartBeaters=self.heartBeaters
|
||||
)
|
||||
elif engine == "torch":
|
||||
raise ValueError("Not implemented yet.")
|
||||
else:
|
||||
raise ValueError("The {} engine does not exist.".format(engine))
|
||||
|
||||
return (vect.get_feature_names(), np.array(keep) & np.array(addl_keep))
|
|
@ -0,0 +1,102 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, CGEngineWrapper, NLPCleaner, PipelineTimer
|
||||
import sys
|
||||
import numpy as np
|
||||
import os
|
||||
import pandas as pd
|
||||
import datetime
|
||||
import matplotlib.pyplot as plt
|
||||
from CountingGridsPy.models import CountingGridModel
|
||||
|
||||
pTimer = PipelineTimer()
|
||||
# Example CLI: python dumpCountingGrids.py CountingGridInput_MarchSurvey 24 5 matlabEngine traditionalInput channelDump.csv
|
||||
errStr = '''
|
||||
Please give valid command-line arguments.
|
||||
Instructions found here: https://github.com/microsoft/browsecloud/wiki/Data-Pipeline-Documentation.
|
||||
'''
|
||||
# ---------------------------------------------------------------------------------------
|
||||
# Input
|
||||
# ---------------------------------------------------------------------------------------
|
||||
if len(sys.argv) != 7:
|
||||
    print(sys.argv)
|
||||
raise ValueError(errStr)
|
||||
DIRECTORY_DATA = sys.argv[1]
|
||||
EXTENT_SIZE = int(sys.argv[2])
|
||||
WINDOW_SIZE = int(sys.argv[3])
|
||||
engine_type = sys.argv[4]
|
||||
inputfile_type = sys.argv[5]
|
||||
inputfile_name = sys.argv[6]
|
||||
if engine_type != "numpyEngine" and engine_type != "matlabEngine":
|
||||
raise ValueError(
|
||||
"The {0} engine does not exist. Please use 'matlabEngine' or 'numpyEngine'.".format(engine_type))
|
||||
engine_type = engine_type[:-6] # 6 characters in the word "Engine"
|
||||
if inputfile_type != "metadataInput" and inputfile_type != "simpleInput":
|
||||
raise ValueError(
|
||||
"The {0} input type does not exist. Please use 'simpleInput' or 'metadataInput'.".format(inputfile_type))
|
||||
inputfile_type = inputfile_type[:-5] # remove "Input"
|
||||
|
||||
if not os.path.isdir(DIRECTORY_DATA):
|
||||
raise ValueError(
|
||||
"Undefined local directory where digital channel is dumped!\n" + errStr)
|
||||
|
||||
FILE_NAME = DIRECTORY_DATA + "\\" + inputfile_name
|
||||
|
||||
CLEAN_DATA_FILE_NAME, MIN_FREQUENCY, MIN_WORDS = [r"\cg-processed.tsv", 2, 5]  # raw string avoids the invalid "\c" escape warning
|
||||
|
||||
# ---------------------------------------------------------------------------------------
|
||||
# Data Cleaning
|
||||
# ---------------------------------------------------------------------------------------
|
||||
|
||||
cleaner = NLPCleaner()
|
||||
correspondences = None
|
||||
CACHED_CORRESPONDENCES_FILE_NAME = r"\cached_correspondences.tsv"  # raw string avoids the invalid "\c" escape warning
|
||||
pTimer("Reading data file.")
|
||||
df, keep = cleaner.read(FILE_NAME, inputfile_type, MIN_FREQUENCY, MIN_WORDS)
|
||||
if not (os.path.exists(DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME) and os.path.exists(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME)):
|
||||
pTimer("Starting data cleaning.")
|
||||
cleaner.handle_negation_tokens()
|
||||
cleaner.removePunctuation()
|
||||
correspondences = cleaner.lemmatize()
|
||||
cleaner.write_cached_correspondences(
|
||||
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
|
||||
cleaner.write(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME)
|
||||
else:
|
||||
pTimer("Skipping data cleaning.")
|
||||
correspondences = cleaner.read_cached_correspondences(
|
||||
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
|
||||
|
||||
pTimer("Learning counting grid.")
|
||||
LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
|
||||
engine = CGEngineWrapper(extent_size=EXTENT_SIZE, window_size=WINDOW_SIZE)
|
||||
vocabulary = None
|
||||
|
||||
# ---------------------------------------------------------------------------------------
|
||||
# Learning
|
||||
# ---------------------------------------------------------------------------------------
|
||||
if not os.path.exists(DIRECTORY_DATA + LEARNED_GRID_FILE_NAME):
|
||||
vocabulary, keep = engine.fit(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME,
|
||||
cleaner.labelsS, MIN_FREQUENCY, keep, engine=engine_type)
|
||||
else:
|
||||
vocabulary, keep = engine.get_vocab(
|
||||
DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS, MIN_FREQUENCY, keep)
|
||||
|
||||
# ---------------------------------------------------------------------------------------
|
||||
# Output
|
||||
# ---------------------------------------------------------------------------------------
|
||||
pTimer("Generating counting grid artifacts.")
|
||||
LINK_FILE_NAME = ""
|
||||
bcag = BrowseCloudArtifactGenerator(DIRECTORY_DATA)
|
||||
bcag.read(LEARNED_GRID_FILE_NAME)
|
||||
bcag.write_docmap(engine.wd_size, engine=engine_type)
|
||||
bcag.write_counts()
|
||||
bcag.write_vocabulary(vocabulary)
|
||||
bcag.write_top_pi()
|
||||
bcag.write_top_pi_layers()
|
||||
bcag.write_colors() # write default blue colors
|
||||
bcag.write_database(df, keep)
|
||||
bcag.write_correspondences(correspondences, vocabulary)
|
||||
bcag.write_keep(keep)
|
||||
|
||||
pTimer("Done.")
|
|
@ -0,0 +1,111 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, CGEngineWrapper, NLPCleaner, PipelineTimer
|
||||
import sys
|
||||
import numpy as np
|
||||
import os
|
||||
import pandas as pd
|
||||
import datetime
|
||||
import matplotlib.pyplot as plt
|
||||
from CountingGridsPy.models import CountingGridModel
|
||||
|
||||
pTimer = PipelineTimer()
|
||||
# Example CLI: python dumpSlidingWindowCountingGrids.py CalculatorDataExperiment6 24 5 numpyEngine simpleTimeInput CalculatorUIFData.txt
|
||||
errStr = '''
|
||||
Please give a valid command-line arguments.
|
||||
Instructions found here: https://github.com/microsoft/browsecloud/wiki/Data-Pipeline-Documentation.
|
||||
'''
|
||||
|
||||
# ---------------------------------------------------------------------------------------
|
||||
# Input
|
||||
# ---------------------------------------------------------------------------------------
|
||||
|
||||
if len(sys.argv) != 7:
|
||||
    print(sys.argv)
|
||||
raise ValueError(errStr)
|
||||
DIRECTORY_DATA = sys.argv[1]
|
||||
EXTENT_SIZE = int(sys.argv[2])
|
||||
WINDOW_SIZE = int(sys.argv[3])
|
||||
engine_type = sys.argv[4]
|
||||
inputfile_type = sys.argv[5]
|
||||
inputfile_name = sys.argv[6]
|
||||
if engine_type != "numpyEngine":
|
||||
raise ValueError("The {0} engine does not exist.".format(engine_type))
|
||||
engine_type = engine_type[:-6] # 6 characters in the word "Engine"
|
||||
if inputfile_type != "simpleTimeInput":
|
||||
raise ValueError(
|
||||
"The {0} input type does not exist.".format(inputfile_type))
|
||||
inputfile_type = inputfile_type[:-5] # remove "Input"
|
||||
|
||||
if not os.path.isdir(DIRECTORY_DATA):
|
||||
raise ValueError(
|
||||
"Undefined local directory where digital channel is dumped!\n" + errStr)
|
||||
|
||||
FILE_NAME = DIRECTORY_DATA + "\\" + inputfile_name
|
||||
|
||||
CLEAN_DATA_FILE_NAME, MIN_FREQUENCY, MIN_WORDS = [r"\cg-processed.tsv", 2, 5]  # raw string avoids the invalid "\c" escape warning
|
||||
|
||||
# ---------------------------------------------------------------------------------------
|
||||
# Data Cleaning
|
||||
# ---------------------------------------------------------------------------------------
|
||||
cleaner = NLPCleaner()
|
||||
correspondences = None
|
||||
CACHED_CORRESPONDENCES_FILE_NAME = r"\cached_correspondences.tsv"  # raw string avoids the invalid "\c" escape warning
|
||||
pTimer("Reading data file.")
|
||||
df, keep = cleaner.read(FILE_NAME, inputfile_type, MIN_FREQUENCY, MIN_WORDS)
|
||||
if not (os.path.exists(DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME) and os.path.exists(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME)):
|
||||
pTimer("Starting data cleaning.")
|
||||
cleaner.handle_negation_tokens()
|
||||
cleaner.removePunctuation()
|
||||
correspondences = cleaner.lemmatize()
|
||||
cleaner.write_cached_correspondences(
|
||||
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
|
||||
cleaner.write(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME)
|
||||
else:
|
||||
pTimer("Skipping data cleaning.")
|
||||
correspondences = cleaner.read_cached_correspondences(
|
||||
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
|
||||
|
||||
pTimer("Learning counting grid.")
|
||||
LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
|
||||
engine = CGEngineWrapper(extent_size=EXTENT_SIZE, window_size=WINDOW_SIZE)
|
||||
vocabulary = None
|
||||
try:
|
||||
# ---------------------------------------------------------------------------------------
|
||||
# Learning
|
||||
# ---------------------------------------------------------------------------------------
|
||||
if not os.path.exists(DIRECTORY_DATA + LEARNED_GRID_FILE_NAME):
|
||||
vocabulary, keep = engine.incremental_fit(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS,
|
||||
MIN_FREQUENCY, keep, engine=engine_type, initial_max_iter=100, w=2000, s=1, runInitialTrain=True)
|
||||
else:
|
||||
vocabulary, keep = engine.get_vocab(
|
||||
DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS, MIN_FREQUENCY, keep)
|
||||
pTimer("Generating counting grid artifacts.")
|
||||
|
||||
dataFolders = [DIRECTORY_DATA] + \
|
||||
[f.path for f in os.scandir(DIRECTORY_DATA) if f.is_dir()]
|
||||
# we know a priori it should be [".\\","\\iter0","./iter1",...]
|
||||
dataFolders.sort()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
finally:
|
||||
# ---------------------------------------------------------------------------------------
|
||||
# Output
|
||||
# ---------------------------------------------------------------------------------------
|
||||
for i, folder in enumerate(dataFolders):
|
||||
LINK_FILE_NAME = ""
|
||||
bcag = BrowseCloudArtifactGenerator(folder)
|
||||
if os.path.exists(folder + LEARNED_GRID_FILE_NAME):
|
||||
bcag.read(LEARNED_GRID_FILE_NAME)
|
||||
bcag.write_docmap(engine.wd_size, engine=engine_type)
|
||||
bcag.write_counts()
|
||||
bcag.write_vocabulary(vocabulary)
|
||||
bcag.write_top_pi()
|
||||
bcag.write_top_pi_layers()
|
||||
bcag.write_colors() # write default blue colors
|
||||
bcag.write_correspondences(correspondences, vocabulary)
|
||||
if i == 0:
|
||||
bcag.write_database(df, keep)
|
||||
bcag.write_keep(keep)
|
||||
pTimer("Done.")
|
|
@ -0,0 +1,102 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from skimage.morphology import label
|
||||
from skimage.measure import regionprops
|
||||
import numpy as np
|
||||
|
||||
|
||||
class MorphologicalTightener():
|
||||
@staticmethod
|
||||
def tighten_pi(pi):
|
||||
'''
|
||||
Algorithm:
|
||||
1. Go through each word in the vocabulary stored in pi params.
|
||||
        2. Find all connected components within the
           2-d plane for the given feature, which is like an image.
|
||||
3. Within each component find the index with highest probability.
|
||||
4. Set the probability of all other elements
|
||||
in the component set to be 0.
|
||||
'''
|
||||
pi = np.copy(pi)
|
||||
Z = pi.shape[2]
|
||||
for z in range(Z):
|
||||
tmp_pi = pi[:, :, z]
|
||||
labelled_components = label(
|
||||
tmp_pi > 1e-3,
|
||||
neighbors=None,
|
||||
background=None,
|
||||
return_num=False,
|
||||
connectivity=1
|
||||
)
|
||||
for region in regionprops(labelled_components):
|
||||
x_coords = region.coords[:, 0]
|
||||
y_coords = region.coords[:, 1]
|
||||
vals = tmp_pi[x_coords, y_coords]
|
||||
i_max = np.argmax(vals)
|
||||
maxval = tmp_pi[np.array(region.coords[i_max, :][0]), np.array(
|
||||
region.coords[i_max, :][1])]
|
||||
tmp_pi[x_coords, y_coords] = 0
|
||||
tmp_pi[np.array(region.coords[i_max, :][0]), np.array(
|
||||
region.coords[i_max, :][1])] = maxval
|
||||
|
||||
pi[:, :, z] = tmp_pi
|
||||
return pi
|
||||
|
||||
@staticmethod
|
||||
def tighten_q():
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def print_results(input, output):
|
||||
assert(np.all(input.shape == output.shape))
|
||||
for z in range(input.shape[2]):
|
||||
print("Feature index " + str(z) + ":")
|
||||
print("Input:")
|
||||
print(input[:, :, z])
|
||||
print("Output:")
|
||||
print(output[:, :, z])
|
||||
print("")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# test1
|
||||
INPUT = np.array(
|
||||
[
|
||||
[
|
||||
[0.5, 0.5, 0, 0],
|
||||
[0.7, 0, 0, 0.5],
|
||||
[0.5, 0, 0, 0.6],
|
||||
[0.5, 0, 0.5, 0]
|
||||
],
|
||||
[
|
||||
[0.004, 0.5, 0, 0],
|
||||
[0.5, 0, 0, 0],
|
||||
[100, 0, 0, 0],
|
||||
[0, 0, 0, 0.5]
|
||||
]
|
||||
]).astype(float).transpose((1, 2, 0))
|
||||
|
||||
OUTPUT = np.array(
|
||||
[
|
||||
[
|
||||
[0, 0, 0, 0],
|
||||
[0.7, 0, 0, 0],
|
||||
[0, 0, 0, 0.6],
|
||||
[0, 0, 0.5, 0]
|
||||
],
|
||||
[
|
||||
[0, 0, 0, 0],
|
||||
[0, 0, 0, 0],
|
||||
[100, 0, 0, 0],
|
||||
[0, 0, 0, 0.5]
|
||||
]
|
||||
]
|
||||
).astype(float).transpose((1, 2, 0))
|
||||
|
||||
assert(np.all(MorphologicalTightener.tighten_pi(INPUT) == OUTPUT))
|
||||
|
||||
MorphologicalTightener.print_results(
|
||||
INPUT,
|
||||
MorphologicalTightener.tighten_pi(INPUT)
|
||||
)
|
|
@ -0,0 +1,207 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import re
|
||||
import functools
|
||||
import string
|
||||
from collections import defaultdict
|
||||
import nltk
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
from nltk.corpus import wordnet
|
||||
from nltk.tag.stanford import StanfordPOSTagger
|
||||
import pandas as pd
|
||||
import os
|
||||
import numpy as np
|
||||
from CountingGridsPy.EngineToBrowseCloudPipeline.surveyData import SurveyData
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.feature_extraction import stop_words
|
||||
|
||||
|
||||
def tokenizer(x, wordnet_lemmatizer, corrispondences, tagger):
|
||||
try:
|
||||
def get_wordnet_pos(treebank_tag):
|
||||
if treebank_tag.startswith('J'):
|
||||
return wordnet.ADJ
|
||||
elif treebank_tag.startswith('V'):
|
||||
return wordnet.VERB
|
||||
elif treebank_tag.startswith('N') or treebank_tag.startswith('FW'):
|
||||
return wordnet.NOUN
|
||||
elif treebank_tag.startswith('R'):
|
||||
return wordnet.ADV
|
||||
else:
|
||||
return wordnet.NOUN
|
||||
|
||||
# tagged = tagger.tag(x.split()) #e.g. [('what', 'WP')]
|
||||
tagged = [(d, 'VB') if d not in stop_words.ENGLISH_STOP_WORDS else (
|
||||
d, '') for d in x.split()] # stopword removal here
|
||||
tagged = [(wordnet_lemmatizer.lemmatize(re.sub("[^\w\d]", "", t[0]), get_wordnet_pos(t[1])), t[1], t[0])
|
||||
for t in tagged] # the regular expression removes characters that aren't digits or alphanumeric characters
|
||||
# tagged - (lemmatized word, part of speech, original word) e.g. [('what', 'WP', 'what')
|
||||
|
||||
for t in tagged:
|
||||
corrispondences[t[0]] = corrispondences.get(t[0], "") + "," + t[2]
|
||||
|
||||
return [t[0] for t in tagged], [t[1] for t in tagged]
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return [], []
|
||||
|
||||
|
||||
class NLPCleaner(object):
|
||||
def __init__(self):
|
||||
self.corrispondences = dict()
|
||||
self.textS = None
|
||||
self.labelsS = None
|
||||
|
||||
def initial_clean_after_read(self, df, MIN_WORDS):
|
||||
self.text = [(
|
||||
str(aTuple[0]) + " " + str(aTuple[1]) + " " + str(aTuple[1]) + " " + str(aTuple[1])
|
||||
) if not pd.isnull(aTuple[0]) and not pd.isnull(aTuple[1]) else (
|
||||
str(aTuple[1]) if not pd.isnull(aTuple[1]) else (
|
||||
str(aTuple[0]) if not pd.isnull(aTuple[0]) else ""
|
||||
)
|
||||
) for aTuple in zip(df.title.tolist(), df.abstract.tolist())]
|
||||
text_extremely_cleaned = [a for a in map(lambda x: re.sub("\\b[A-Za-z]{1,3}\\b", " ", re.sub(
|
||||
" +", " ", re.sub("[^A-Za-z\\s]", " ", re.sub("\[[^\]]*\]", " ", x)))), self.text)]
|
||||
keep = np.array([True if len(doc.split()) >=
|
||||
MIN_WORDS else False for doc in text_extremely_cleaned])
|
||||
self.textS = [re.sub(" +", " ", re.sub("\|", " ", re.sub("\[[^\]]*\]",
|
||||
" ", t.lower()))) if k else "" for k, t in zip(keep, self.text)]
|
||||
return keep
|
||||
|
||||
def read(self, FILE_NAME, inputfile_type, MIN_FREQUENCY=2, MIN_WORDS=5):
|
||||
df = None
|
||||
|
||||
if inputfile_type == "metadata":
|
||||
df = pd.read_csv(FILE_NAME, keep_default_na=False)
|
||||
if 'title' not in df:
|
||||
df['title'] = ''
|
||||
if 'abstract' not in df:
|
||||
df['abstract'] = ''
|
||||
|
||||
elif inputfile_type == "simple":
|
||||
df = pd.read_csv(FILE_NAME, sep="\t", header=None,
|
||||
keep_default_na=False)
|
||||
df.columns = ["title", "abstract", "link"]
|
||||
|
||||
elif inputfile_type == "simpleTime":
|
||||
df = pd.read_csv(FILE_NAME, sep="\t", header=None,
|
||||
keep_default_na=False)
|
||||
df.columns = ["title", "abstract", "link", "time"]
|
||||
|
||||
else:
|
||||
raise ValueError(
|
||||
"Input_type " + str(inputfile_type) + " is not valid.")
|
||||
|
||||
# Validation now happens in BrowseCloud.Service. Assuming correct input, so no validation needed.
|
||||
keep = self.initial_clean_after_read(df, MIN_WORDS)
|
||||
'''
|
||||
le = LabelEncoder()
|
||||
labels = le.fit_transform(df.Corpus.tolist())
|
||||
self.labelsS = [t for k,t in zip(keep,labels) if k] #not used
|
||||
'''
|
||||
|
||||
return (df, np.copy(keep))
|
||||
|
||||
def handle_negation_tokens(self):
|
||||
negation_handling_functions = [lambda x: re.sub(" +", " ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)can[\W']*[o]*(t|not)(\W|$|\.)", " can not ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)could[n]*[\W']*[no]*(t|not)(\W|$|\.)", " could not ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)should[n]*[\W']*[no]*(t|not)(\W|$|\.)", " should not ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)would[n]*[\W']*[no]*(t|not)(\W|$|\.)", " would not ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)won[\W']*(t)(\W|$|\.)", " wont ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)do[n]*[\W']*[no]*t(\W|$|\.)", " do not ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)does[n]*[\W']*[no]*t(\W|$|\.)", " does not ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)did[n]*[\W']*[no]*t(\W|$|\.)", " did not ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)is[n]*[\W']*[no]*t(\W|$|\.)", " is not ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)are[n]*[\W']*[no]*t(\W|$|\.)", " are not ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)was[n]*[\W']*[no]*t(\W|$|\.)", " was not ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)were[n]*[\W']*[no]*t(\W|$|\.)", " were not ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)ain[\W']*[no]*t(\W|$|\.)", " aint ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)had[n]*[\W']*[no]*(t|not)(\W|$|\.)", " had not ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)has[n]*[\W']*[no]*(t|not)(\W|$|\.)", " has not ", x),
|
||||
lambda x: re.sub(
|
||||
"(^|\W|\.)have[n]*[\W']*[no]*(t|not)(\W|$|\.)", " have not ", x),
|
||||
lambda x: x.lower()]
|
||||
|
||||
def compose(*functions):
|
||||
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
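        # e.g. compose(f, g)(x) == f(g(x)); the negation regexes above are applied
        # right-to-left, so x.lower() runs first and the whitespace squeeze (the first
        # lambda in the list) runs last.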
|
||||
normalize_negations = compose(*negation_handling_functions)
|
||||
self.textS = [a for a in map(
|
||||
lambda x: normalize_negations(x), self.textS)]
|
||||
|
||||
def removePunctuation(self):
|
||||
regexPunkt = re.compile('[%s]' % re.escape(string.punctuation))
|
||||
self.textSb = [a for a in map(lambda x: re.sub(
|
||||
" +", " ", regexPunkt.sub(' ', x)), self.textS)]
|
||||
|
||||
def lemmatize(self):
|
||||
path_to_model = "./stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger"
|
||||
path_to_jar = "./stanford-postagger-full-2017-06-09/stanford-postagger.jar"
|
||||
# Keep the constructor call here as a comment, just in case. We removed the HMM in tokenizer for part of speech tagging.
|
||||
# StanfordPOSTagger(path_to_model, path_to_jar); tagger.java_options='-mx10G'
|
||||
tagger = None
|
||||
wordnet_lemmatizer = WordNetLemmatizer()
|
||||
self.cleaned_featurized = [a for a in map(lambda x: tokenizer(
|
||||
x, wordnet_lemmatizer, self.corrispondences, tagger) if x != "" else ([], []), self.textSb)]
|
||||
return self.corrispondences
|
||||
|
||||
def write(self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, start=None, finish=None):
|
||||
assert(len(self.text) == len(self.cleaned_featurized))
|
||||
if start is None or finish is None:
|
||||
start = 0
|
||||
finish = len(self.text)
|
||||
|
||||
df = pd.DataFrame(
|
||||
columns=['original', 'cleaned', 'pos', 'pos_filtered'])
|
||||
df['original'] = [w for w in map(
|
||||
lambda x: x.replace("\t", " "), self.text[start:finish])]
|
||||
|
||||
try: # if correspondences runs
|
||||
df['cleaned'] = [" ".join(d[0]).replace("\t", " ")
|
||||
for d in self.cleaned_featurized[start:finish]]
|
||||
df['pos'] = [" ".join(d[1]).replace("\t", " ")
|
||||
for d in self.cleaned_featurized[start:finish]]
|
||||
df['pos_filtered'] = [" ".join([w for (w, t) in zip(d[0], d[1]) if t in (
|
||||
['JJR', 'JJS', 'JJ', 'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WRB', 'FW']
|
||||
)]) for d in self.cleaned_featurized[start:finish]]
|
||||
except Exception as e:
|
||||
df['cleaned'] = ["" for w in self.text[start:finish]]
|
||||
df['pos'] = ["" for w in self.text[start:finish]]
|
||||
df['pos_filtered'] = ["" for w in self.text[start:finish]]
|
||||
if not os.path.isdir(DIRECTORY_DATA):
|
||||
os.mkdir(DIRECTORY_DATA)
|
||||
|
||||
df.to_csv(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME,
|
||||
sep="\t", encoding="utf-8")
|
||||
|
||||
def write_cached_correspondences(self, DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME):
|
||||
df = pd.DataFrame(columns=['lemma', 'words'])
|
||||
df['lemma'] = [lemma for lemma in self.corrispondences]
|
||||
df['words'] = [self.corrispondences[lemma]
|
||||
for lemma in self.corrispondences]
|
||||
df.to_csv(DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME,
|
||||
sep="\t", encoding="utf-8")
|
||||
|
||||
def read_cached_correspondences(self, DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME):
|
||||
df = pd.read_csv(
|
||||
DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME, sep="\t", encoding="utf-8")
|
||||
correspondences = dict()
|
||||
for key, val in zip(df['lemma'], df['words']):
|
||||
correspondences[key] = val
|
||||
return correspondences
|
|
@ -0,0 +1,46 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import datetime
|
||||
import time
|
||||
|
||||
|
||||
class PipelineTimer():
|
||||
def __init__(self):
|
||||
self.last_s = time.time()
|
||||
self.init_time = self.last_s
|
||||
|
||||
def __call__(self, description):
|
||||
curr_s = time.time()
|
||||
self.printSectionDescription(description)
|
||||
self.printCurrentTime()
|
||||
self.printPrettyTime(curr_s - self.last_s)
|
||||
self.last_s = curr_s
|
||||
if description == "Done.":
|
||||
self.prettyPrintTotal(curr_s - self.init_time)
|
||||
|
||||
def printSectionDescription(self, description):
|
||||
print(description)
|
||||
|
||||
def prettyPrintHelper(self, s: float):
|
||||
hours = s // 3600
|
||||
s -= (hours * 3600)
|
||||
minutes = s//60
|
||||
s -= (minutes * 60)
|
||||
seconds = s
|
||||
return [hours, minutes, seconds]
|
||||
|
||||
def printPrettyTime(self, s: float):
|
||||
hours, minutes, seconds = self.prettyPrintHelper(s)
|
||||
string = 'Previous section took {}h {}m {}s.'.format(
|
||||
int(hours), int(minutes), float(int(seconds*100))/100)
|
||||
print(string)
|
||||
|
||||
def printCurrentTime(self):
|
||||
print(datetime.datetime.now())
|
||||
|
||||
def prettyPrintTotal(self, s: float):
|
||||
hours, minutes, seconds = self.prettyPrintHelper(s)
|
||||
string = 'Entire program took {}h {}m {}s.'.format(
|
||||
int(hours), int(minutes), float(int(seconds*100))/100)
|
||||
print(string)
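
# A minimal usage sketch (illustrative only; mirrors how the dump scripts drive the timer):
#
#   pTimer = PipelineTimer()
#   pTimer("Reading data file.")   # prints the description, the wall-clock time, and the previous section's duration
#   ...
#   pTimer("Done.")                # additionally prints the total runtime of the program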
|
|
@ -0,0 +1,44 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
|
||||
class SlidingWindowTrainer():
|
||||
@staticmethod
|
||||
def SlidingTrainer(w: int, s: int, training_max_iter_vec: list, train, kwargs: dict, runInitialTrain=True):
|
||||
'''
|
||||
Assume: data is sorted by date
|
||||
|
||||
'''
|
||||
print("Learning Initial Grid.")
|
||||
pi = None
|
||||
if runInitialTrain:
|
||||
pi = train(**kwargs)
|
||||
assert("data" in kwargs and "max_iter" in kwargs and "output_directory" in kwargs)
|
||||
data = kwargs['data']
|
||||
root_dir = kwargs["output_directory"].replace(
|
||||
".", "").replace("/", "").replace("\\", "")
|
||||
T = len(data)
|
||||
assert(w < T)
|
||||
max_sliding_window_size = int(np.ceil((T-w)/s) + 1)
|
||||
assert(len(training_max_iter_vec) == max_sliding_window_size)
|
||||
first_index = 0
|
||||
last_index = min(w, T)
|
||||
for i in range(max_sliding_window_size):
|
||||
kwargs['max_iter'] = training_max_iter_vec[i]
|
||||
kwargs['data'] = data[first_index:last_index]
|
||||
kwargs['output_directory'] = "./" + root_dir + "/iter" + str(i)
|
||||
kwargs['pi'] = pi
|
||||
|
||||
if not (os.path.exists(kwargs['output_directory']) and os.path.isdir(kwargs['output_directory'])):
|
||||
os.mkdir(kwargs['output_directory'])
|
||||
|
||||
print("Learning grid window from first index: " +
|
||||
str(first_index) + " to second index: " + str(last_index))
|
||||
pi = train(**kwargs)
|
||||
|
||||
last_index = min(last_index + s, T)
|
||||
first_index = min(first_index + s, T)
|
||||
assert(last_index >= T)
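        # Illustrative schedule (example numbers, not from the code): with T = 5000 documents,
        # w = 2000 and s = 1000, the loop above trains on the index windows [0, 2000),
        # [1000, 3000), [2000, 4000) and [3000, 5000), warm-starting each fit from the pi
        # learned on the previous window via kwargs['pi'].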
|
|
@ -0,0 +1,16 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
|
||||
class SurveyData(object):
|
||||
def __init__(self, alias, title, abstract, responseId, surveyId, link, image):
|
||||
self.alias = alias
|
||||
self.title = title
|
||||
self.abstract = abstract
|
||||
self.responseId = responseId
|
||||
self.surveyId = surveyId
|
||||
self.link = link
|
||||
self.image = image
|
||||
|
||||
def toCols(self):
|
||||
return [colname for colname in self.__dict__]
|
|
@ -0,0 +1,441 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import math
|
||||
import os
|
||||
import numpy as np
|
||||
import scipy.io
|
||||
import scipy.stats
|
||||
|
||||
np.random.seed(0)
|
||||
# Two functions for use by users of the library:
|
||||
# 1.
|
||||
# fit(data, ...) - Fits the counting grids model to the data set. If layers > 1,
# then a hierarchical, layered counting grid is learned.
|
||||
|
||||
# 2.
|
||||
# predict_probabilities(bagofwordcounts, k) - computes the likelihood,
# the probability of the data given the parameters and choice of window.
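#
# A minimal usage sketch (the variable names and sizes below are illustrative, not part
# of the library):
#
#   counts = np.random.poisson(1.0, size=(500, 300))           # T=500 documents, Z=300-word vocabulary
#   model = CountingGridModel(np.array([24, 24]), np.array([5, 5]))
#   model.fit(counts, max_iter=100, layers=2, output_directory="./")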
|
||||
|
||||
|
||||
class CountingGridModel():
|
||||
def __init__(self, extent, window):
|
||||
"""
|
||||
extent is a 1-D array of size D in the paper.
|
||||
D is often 2, since it makes the model easily visualizable.
|
||||
        window is a 1-D array of size D in the paper.
|
||||
"""
|
||||
|
||||
# E in the paper
|
||||
self.extent = extent
|
||||
# W in the paper
|
||||
self.window = window
|
||||
self.D = len(extent)
|
||||
# Each word is some value between 1 & the size of vocabulary
|
||||
self.vocabulary = np.array([])
|
||||
|
||||
self.extent_volume = 1
|
||||
self.window_volume = 1
|
||||
for v in self.window:
|
||||
self.window_volume *= v
|
||||
for v in self.extent:
|
||||
self.extent_volume *= v
|
||||
|
||||
self.capacity = self.extent_volume / self.window_volume
|
||||
|
||||
# Assumes: self.pi, self.q,self.extent are set properly
|
||||
def cg_layers(self, data, L, noise=1e-10):
|
||||
T, Z = data.shape
|
||||
pi_la = np.zeros([self.extent[0], self.extent[1], Z, L])
|
||||
h_la = np.zeros([self.extent[0], self.extent[1], Z, L])
|
||||
|
||||
# Modifies: h_la
|
||||
def layer_compute_h(pi_la, h_la):
|
||||
h = self.compute_h(pi_la[:, :, :, l], self.window)
|
||||
h_la[:, :, :, l] = np.transpose(
|
||||
np.transpose(h) / np.transpose(np.sum(h, axis=2))
|
||||
)
|
||||
|
||||
# Add noise to pi
|
||||
for l in range(L):
|
||||
pi_la[:, :, :, l] = (
|
||||
self.pi + np.random.uniform(size=self.pi.shape)*noise
|
||||
)
|
||||
# Normalize
|
||||
pi_la[:, :, :, l] = np.transpose(
|
||||
np.transpose(pi_la[:, :, :, l]) /
|
||||
np.transpose(np.sum(pi_la[:, :, :, l], axis=2))
|
||||
)
|
||||
layer_compute_h(pi_la, h_la)
|
||||
P = np.prod(self.extent)
|
||||
|
||||
# lq/lql is the log of q
|
||||
# lq/lql is shaped as ([P,T])
|
||||
        # self.q is stored as T x E1 x E2, which is why we move its first axis to the end before reshaping to (P, T)
|
||||
def normalize_q(lq):
|
||||
lqlmax = np.amax(lql, axis=0)
|
||||
precompute_lql_minus_max = lql - lqlmax
|
||||
# qla has the same formula, essentially, but not in the log form
|
||||
Lq = np.reshape(
|
||||
precompute_lql_minus_max -
|
||||
np.log(np.sum(np.exp(precompute_lql_minus_max), axis=0)),
|
||||
[self.extent[0], self.extent[1], T]
|
||||
)
|
||||
return np.moveaxis(np.exp(Lq), 2, 0)
|
||||
|
||||
def update_summand(self, toroidal_F, M, N):
|
||||
A = toroidal_F.cumsum(axis=0).cumsum(axis=1)
|
||||
c = np.pad(
|
||||
A,
|
||||
pad_width=tuple([(1, 0) for x in list(self.extent)] + [(0, 0)]), mode='constant', constant_values=0
|
||||
)
|
||||
w0 = self.window[0]
|
||||
w1 = self.window[1]
|
||||
return (
|
||||
c[slice(w0, toroidal_F.shape[0]+1), slice(w1, toroidal_F.shape[1]+1), :] -
|
||||
c[slice(0, self.extent[0]), slice(w1, toroidal_F.shape[1] + 1), :] -
|
||||
c[slice(w0, toroidal_F.shape[0]+1), slice(0, self.extent[1]), :] +
|
||||
c[slice(0, self.extent[0]), slice(0, self.extent[1]), :]
|
||||
)[slice(0, toroidal_F.shape[0]), slice(0, toroidal_F.shape[1]), :]
|
||||
|
||||
# Adding noise to q, which was not originally in the matlab code
|
||||
self.q = self.q + 0.25*noise
|
||||
lql = np.log(np.reshape(np.moveaxis(self.q, 0, -1), (P, T)))
|
||||
self.q = normalize_q(lql)
|
||||
        qlsm = np.fft.ifft2(np.fft.fft2(
            np.reshape(self.q, (T, P))
        )).real.astype(np.float64)
|
||||
lql = np.log(np.reshape(np.moveaxis(self.q, 0, -1), (P, T)))
|
||||
|
||||
# Check the distribution properties of ql
|
||||
'''
|
||||
assert( np.all(np.isclose(np.sum(self.q,axis=0)[:,:],1 ) )==True)
|
||||
assert(np.any((self.q) <0 ) == False)
|
||||
'''
|
||||
|
||||
alpha = 1e-10
|
||||
miter = 1
|
||||
|
||||
SCALING_FACTOR = 2.5
|
||||
        # ~1/2 the average number of words per document; make this number smaller as the counting grid gets bigger
|
||||
pseudocounts = np.mean(np.sum(data, axis=1)) / (P*SCALING_FACTOR)
|
||||
# This is the posterior for each layer Q( Layer | document ).
|
||||
qla = np.ones([L, T])
|
||||
lqla = np.log(qla)
|
||||
|
||||
plal = np.ones([L, P]) / L # P(layer | position in counting grid)
|
||||
dirichlet_prior = np.ones([L, Z])
|
||||
|
||||
# Didn't implement the fft to smooth the posterior probabilities
|
||||
# of picking a location, differing from previous code by choice.
|
||||
every_iter = 1
|
||||
nmax = 2 # maximum number of iterations
|
||||
start_ql = 1
|
||||
|
||||
minp = 1e-10
|
||||
TOROID_ARGUMENTS = [(self.window[0], 0), (self.window[1], 0), (0, 0)]
|
||||
eps = (np.finfo(h_la.dtype).eps)
|
||||
for iter in range(nmax):
|
||||
# assert( np.all(np.isclose(np.sum(np.reshape(np.transpose(self.q),(P,T)),axis=0),1 ) ))
|
||||
# assert(np.any((self.q) <0 ) == False)
|
||||
# assert(np.all(np.isclose(np.sum(qla,axis=0),1 )))
|
||||
# assert(np.any((qla) <0 ) == False)
|
||||
|
||||
if iter >= start_ql:
|
||||
lql = np.zeros([P, T])
|
||||
for l in range(L):
|
||||
tmp = np.reshape(np.log(eps + h_la[:, :, :, l]), [P, Z])
|
||||
# qla is the Q(layer) in the mean field posterior factorization, its structure is L,T
|
||||
lql = lql + np.dot(tmp, np.transpose(data))*qla[l, :]
|
||||
self.q = np.reshape(
|
||||
normalize_q(lql),
|
||||
[T, self.extent[0], self.extent[1]]
|
||||
)
|
||||
|
||||
# Didn't use the smoothened ql
|
||||
qlsm = np.fft.ifft2(np.fft.fft2(
|
||||
np.reshape(self.q, (T, P))
|
||||
)).real.astype(np.float64)
|
||||
|
||||
# update Q(layer)
|
||||
tmpq = np.reshape(np.moveaxis(np.copy(self.q), 0, -1), [P, T])
|
||||
for l in range(L):
|
||||
tmp = np.reshape(np.log(alpha + h_la[:, :, :, l]), [P, Z])
|
||||
lqla[l, :] = np.sum(
|
||||
tmpq*(np.dot(tmp, np.transpose(data)) +
|
||||
np.reshape(np.transpose(np.log(plal[l, :])), [P, 1])),
|
||||
axis=0
|
||||
)
|
||||
|
||||
lqlamax = np.amax(lqla, axis=0)
|
||||
qla = np.exp(
|
||||
(lqla - lqlamax) - np.log(np.sum(np.exp((lqla - lqlamax)), axis=0))
|
||||
)
|
||||
|
||||
# M-STEP. Basically the normal CG M-Step, repeated #Layers times.
|
||||
# Reimplemented it to include the dirichlet prior in the mix
|
||||
for l in range(L):
|
||||
# Dirichlet prior. Does very little in practice, may be useful only in the early stages of learning... leave it.
|
||||
tmpdirip = dirichlet_prior[l, :] - 1
|
||||
tmpdirip[np.isnan(tmpdirip)] = 0
|
||||
|
||||
# Recall that self.q is T x E1 x E2 tensor
|
||||
# Original Matlab code here for reference:
|
||||
#
|
||||
# nrm = bsxfun(@plus, reshape(tmpdirip,[1 1 Z]),
|
||||
# reshape( reshape( padarray( ql, W, 'circular','pre'),
|
||||
# [prod(cg_size+W),T])*bsxfun( @times, WD, qla(l,:))', [ cg_size+W,Z ]));
|
||||
first = np.reshape(np.pad(np.moveaxis(
|
||||
self.q, 0, -1), TOROID_ARGUMENTS, 'wrap'), [np.prod(self.extent+self.window), T])
|
||||
                # using the transpose function makes sense here because we're just swapping 2 dimensions
|
||||
D = np.dot(first, np.transpose(np.transpose(data) * qla[l, :]))
|
||||
nrm = np.reshape(tmpdirip, [1, 1, Z]) + np.reshape(
|
||||
D, [self.extent[0]+self.window[0], self.extent[1]+self.window[1], Z])
|
||||
|
||||
QH = nrm / \
|
||||
np.pad(h_la[:, :, :, l] + np.prod(self.window) * alpha, TOROID_ARGUMENTS, 'wrap')
|
||||
QH = QH[slice(1, self.extent[0]+self.window[0]),
|
||||
slice(1, self.extent[1]+self.window[1]), :]
|
||||
QH = update_summand(self, QH, T, Z)
|
||||
QH[QH < 0] = 0
|
||||
|
||||
un_pi = pseudocounts + QH*(pi_la[:, :, :, l]+alpha)
|
||||
mask = np.sum(un_pi, 2) != 0
|
||||
nmask = np.sum(un_pi, 2) == 0
|
||||
M1 = np.transpose(np.transpose(
|
||||
un_pi)*np.transpose(mask.astype(np.float64)) / np.transpose(np.sum(un_pi, 2)))
|
||||
M2 = np.ones([Z, self.extent[0], self.extent[1]]) * \
|
||||
(nmask).astype(np.float64)
|
||||
pi_la[:, :, :, l] = M1 + np.moveaxis((1.0/Z) * M2, 0, 2)
|
||||
layer_compute_h(pi_la, h_la)
|
||||
|
||||
qlsm = np.fft.ifft2(np.fft.fft2(np.reshape(
|
||||
self.q, (T, P)))).real.astype(np.float64)
|
||||
A = np.sum(qlsm, axis=0)
|
||||
if np.any(np.isclose(A, 0)):
|
||||
A += eps
|
||||
plal = np.transpose(np.reshape(np.sum(np.reshape(np.moveaxis(
|
||||
qlsm, 0, -1), [self.extent[0], self.extent[1], T, 1]) * np.reshape(np.transpose(qla), [1, 1, T, L]), axis=2), [P, L]))/A
|
||||
plal[plal < 1e-100] = 1e-100
|
||||
plal = plal / np.sum(plal, axis=0) # sum over the layers
|
||||
|
||||
INVERSE_DOCUMENT_FREQUENCY = np.log(
|
||||
data.shape[0] + eps) - np.log(np.sum((data > 0).astype(np.float64), axis=0) + eps)
|
||||
|
||||
pi_la_idf = np.zeros(pi_la.shape)
|
||||
for l in range(L):
|
||||
pi_la_idf[:, :, :, l] = pi_la[:, :, :, l] * \
|
||||
INVERSE_DOCUMENT_FREQUENCY
|
||||
|
||||
id_layers = np.argmax(qla, axis=0) + 1
|
||||
wg_ep = eps
|
||||
mask = np.pad(np.ones(self.window), [
|
||||
(0, x) for x in self.extent-self.window], 'constant', constant_values=0)
|
||||
wg = np.zeros([self.extent[0], self.extent[1], L])
|
||||
for l in range(L):
|
||||
idl = np.where(id_layers-1 == l)[0] # id_layer in matlab format
|
||||
for t in range(len(idl)):
|
||||
wg[:, :, l] = wg[:, :, l] + np.fft.ifft2(np.fft.fft2(mask)*np.fft.fft2(
|
||||
self.q[idl[t], :, :])).real.astype(np.float64) # m x E structure
|
||||
|
||||
# This makes wg not a distribution.
|
||||
wg = np.transpose(np.transpose(
|
||||
wg) / (np.sum(np.transpose(wg), axis=0) + wg_ep))
|
||||
# sum over the layers
|
||||
pi2_idf = np.sum(
|
||||
pi_la_idf*np.reshape(wg, [self.extent[0], self.extent[1], 1, L]), axis=3)
|
||||
|
||||
# Renormalize Pi after using inverse document frequency.
|
||||
pi2_idf = np.transpose(np.transpose(pi2_idf) /
|
||||
np.transpose(np.sum(pi2_idf, axis=2)))
|
||||
|
||||
return {
|
||||
"pi2_idf": pi2_idf, "pi_la_idf": pi_la_idf, "id_layer": [id_layers],
|
||||
"ql2": self.q, "counts_to_show": np.transpose(data), "indices_to_show": [np.array(list(range(T))) + 1],
|
||||
"pi": self.pi, "pi_la": pi_la
|
||||
}
|
||||
|
||||
def fit(
|
||||
self, data, max_iter=100, returnSumSquareDifferencesOfPi=False,
|
||||
noise=.000001, learn_pi=True, pi=None, layers=1, output_directory="./", heartBeaters=None
|
||||
):
|
||||
"""
|
||||
Implements variational expectation maximization for the Counting Grid model
|
||||
Assumes: data is an m x n matrix
|
||||
TO DO: return early if fitness converges. don't just run it for max iter.
|
||||
"""
|
||||
|
||||
if not os.path.exists(str(output_directory)):
|
||||
raise Exception(
|
||||
"output_directory does not exist for counting grids trainer."
|
||||
)
|
||||
|
||||
def SSD(pi, piHat):
|
||||
A = np.abs(pi - piHat)
|
||||
return np.sum(A * A)
|
||||
alpha = 1e-10
|
||||
SSDPi = []
|
||||
data = data.astype(np.float64)
|
||||
if pi is None:
|
||||
self.initializePi(data)
|
||||
else:
|
||||
self.pi = pi
|
||||
self.h = self.compute_h(self.pi, self.window)
|
||||
self.check_model()
|
||||
extentProduct = np.prod(self.extent)
|
||||
T, _ = data.shape
|
||||
|
||||
pseudocounts = np.mean(np.sum(data, axis=1) / extentProduct) / 2.5
|
||||
|
||||
# q is an m x dim(extent) structure
|
||||
qshape = [len(data)]
|
||||
for v in self.extent:
|
||||
qshape.append(v)
|
||||
self.q = np.zeros(tuple(qshape))
|
||||
i = 0
|
||||
while i < max_iter:
|
||||
# E-Step
|
||||
self.q = self.q_update(data)
|
||||
|
||||
# M-Step
|
||||
if learn_pi:
|
||||
if returnSumSquareDifferencesOfPi:
|
||||
pi = self.pi
|
||||
|
||||
self.pi = self.pi_update(data, pseudocounts, alpha)
|
||||
if returnSumSquareDifferencesOfPi:
|
||||
piHat = self.pi
|
||||
SSDPi.append(SSD(pi, piHat))
|
||||
self.h = self.compute_h(self.pi, self.window)
|
||||
i = i + 1
|
||||
[(h.makeProgress(int(100*i/max_iter)) if h is not None else False)
|
||||
for h in heartBeaters] if heartBeaters is not None else False
|
||||
|
||||
if layers > 1:
|
||||
self.layercgdata = self.cg_layers(data, L=layers, noise=noise)
|
||||
scipy.io.savemat(str(output_directory) +
|
||||
"/CountingGridDataMatrices.mat", self.layercgdata)
|
||||
else:
|
||||
scipy.io.savemat(str(output_directory) +
|
||||
"/CGData.mat", {"pi": self.pi, "q": self.q})
|
||||
return self.pi
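
    # Illustrative note (added, not part of the original file): fit() persists its
    # results with scipy.io.savemat, so a downstream consumer could reload the
    # single-layer output roughly like this (a sketch only):
    #
    #   mats = scipy.io.loadmat(output_directory + "/CGData.mat")
    #   pi, q = mats["pi"], mats["q"]
    #
    # The layered file CountingGridDataMatrices.mat carries the keys returned by
    # cg_layers above (e.g. "pi", "pi_la", "ql2", "pi2_idf").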

    # assumptions that we need for the model to be valid
    def check_model(self):
        assert(len(self.extent) == len(self.window))
        for v in self.extent:
            assert(int(v) == v)
            assert(v > 0)
        for v in self.window:
            assert(int(v) == v)
            assert(v > 0)

        # the number of dimensions of pi is one more than the number of window dimensions
        assert(self.pi.ndim - 1 == len(self.window))

    # w0 and w1 are the window sizes
    def compute_h_noLoopFull(self, PI, w0, w1):
        return PI[w0:, w1:, :] - PI[:-w0, w1:, :] - PI[w0:, :-w1, :] + PI[:-w0, :-w1, :]

    # no side effects
    def compute_h(self, pi, W):
        PI = np.pad(pi, [(0, W[0]), (0, W[1]), (0, 0)],
                    'wrap').cumsum(axis=0).cumsum(axis=1)
        PI = np.pad(PI, [(1, 0), (1, 0), (0, 0)], 'constant')
        w0 = W[0]
        w1 = W[1]
        cumsum_output = self.compute_h_noLoopFull(PI, w0, w1)
        return np.moveaxis(np.moveaxis(cumsum_output[:-1, :-1, :], 2, 0)/np.sum(cumsum_output[:-1, :-1, :], axis=2), 0, -1)
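
    # Added explanatory comment (not in the original source): compute_h relies on the
    # summed-area-table (integral image) identity. After the double cumsum, the sum of
    # pi over any w0 x w1 window is
    #     S[i+w0, j+w1] - S[i, j+w1] - S[i+w0, j] + S[i, j]
    # which is exactly what compute_h_noLoopFull evaluates for every window at once.
    # For example, with a 2x2 window on a grid of ones, every per-channel window sum
    # is 4 before the final normalization over the z axis.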

    # How to initialize pi
    # Note that we don't want pi to be 0, since our update equations depend on a multiplication by pi
    def initializePi(self, data, technique="uniform"):
        if technique == "uniform":
            size = [x for x in self.extent]
            size.append(data.shape[1])
            self.pi = np.random.random(size=tuple(size)).astype(np.float64)
        else:
            raise ValueError("No initialize strategy given")

    def pi_update(self, data, pseudocounts, alpha):
        '''
        Modifies: pi
        Assumes: self.q has been initialized (in the fit function) and that h has been initialized
        Assumes: a two-dimensional extent
        Recall: q is an M x E tensor
        '''
        T, Z = data.shape
        W = self.window
        # QdotConH is called nrm in the MATLAB implementation, where the padding is done beforehand
        QdotConH = np.dot(np.moveaxis(self.q, 0, -1), data)
        QH = np.pad(QdotConH / (self.h + np.prod(self.window)*alpha),
                    [(W[0], 0), (W[1], 0), (0, 0)], 'wrap').cumsum(axis=0).cumsum(axis=1)
        w0 = W[0]
        w1 = W[1]
        QH = self.compute_h_noLoopFull(QH, w0, w1)
        QH[QH < 0] = 0

        un_pi = pseudocounts + QH*(self.pi+alpha)
        mask = (np.sum(un_pi, axis=2) != 0).astype(np.float64)
        not_mask = (np.sum(un_pi, axis=2) == 0).astype(np.float64)
        denom = np.sum(un_pi, axis=2)
        self.pi = np.transpose(np.transpose(mask)*(np.transpose(un_pi) / np.transpose(denom))) + \
            (1.0/Z) * np.transpose(np.transpose(
                np.ones([self.extent[0], self.extent[1], Z]))*np.transpose(not_mask))
        return self.pi

    def get_indices_for_window_indexed_by_k(self, k, z):
        indices = [[]]*len(self.extent)
        for j, v in enumerate(indices):
            indices[j] = slice(k[j], k[j] + self.window[j])
        indices.append(z)  # the z index in the paper
        return tuple(indices)

    def get_h(self):
        return self.h

    def q_update(self, data):
        L = np.prod(self.extent)
        lql = np.dot(np.log(self.h).reshape(
            (L, data.shape[1])), np.transpose(data))
        lqlmax = np.amax(lql, axis=0)
        min_prob = 1.0/(10*L)
        Lq = ((lql-lqlmax)-np.log(np.sum(np.exp(lql-lqlmax), axis=0))
              ).reshape(tuple(list(self.extent) + [data.shape[0]]))
        q = np.exp(Lq)
        q[q < min_prob] = min_prob
        q = q / np.sum(np.sum(q, axis=0), axis=0)
        return np.moveaxis(q, 2, 0)
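
    # Added explanatory comment (not in the original source): q_update is the plain
    # single-layer E-step. It computes the log posterior over grid locations for each
    # document with the same log-sum-exp trick used in cg_layers, then floors each
    # entry at min_prob = 1/(10*E1*E2) and renormalizes, so no grid location ever
    # receives exactly zero responsibility.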

    # Assumes: bagofwordcounts are integers
    def predict_probabilities(self, bagofwordcounts, k):
        assert(len(k) == len(self.window))

        log_p = 0  # the log of a product is the sum of the logs
        for (i, count) in enumerate(bagofwordcounts):
            if count > 0:
                indices = self.get_indices_for_window_indexed_by_k(k, i)
                log_p += np.log(np.sum(self.pi[indices].reshape(
                    self.pi[indices].size)))*float(count)

        # 1/normalizationfactor * probability
        # e ^ (log(prob) - log(normalization factor))
        return math.pow(
            math.e,
            log_p - np.sum(bagofwordcounts)*np.sum(np.log(self.window))
        )
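
    # Added explanatory comment (not in the original source): the subtracted term
    # np.sum(bagofwordcounts) * np.sum(np.log(self.window)) equals log((W1*W2)**N)
    # for N total word tokens, i.e. each token's windowed sum of pi is divided by the
    # number of grid cells in the window, so the returned value behaves like a
    # normalized probability rather than an unnormalized sum.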


if __name__ == "__main__":
    data = np.array([
        [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 2, 1, 0, 1, 0, 0, 0, 0, 0],
        [0, 1, 1, 0, 0, 1, 1, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 1, 1, 1]
    ])
    extent = np.array([3, 3])
    window = np.array([2, 2])
    model = CountingGridModel(extent, window)
    model.fit(data, max_iter=1000,
              returnSumSquareDifferencesOfPi=False, layers=2)
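    # Illustrative follow-up (added, not in the original file): one possible way to
    # inspect the fitted model. model.q holds the T x E1 x E2 location posterior for
    # each of the T=5 documents, and predict_probabilities scores a bag of word
    # counts at a given grid position k.
    print(model.q.shape)  # expected: (5, 3, 3)
    print(model.predict_probabilities(data[0], k=[0, 0]))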
@ -0,0 +1,3 @@
from .CountingGridModel import CountingGridModel

__all__ = ['CountingGridModel']
@ -0,0 +1,29 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import unittest
import numpy as np
from CountingGridsPy.models import CountingGridModel


class TestCorrectnessOfNontrivialDesignMatrix(unittest.TestCase):
    def setUp(self):
        M, N = [5, 7]
        self.N = N
        self.data = np.array(
            list(range(1, 8))+list(range(7, 0, -1)) +
            [1]*7+[0, 1, 0, 1, 0, 1, 0]+list(range(1, 8))
        ).reshape((M, N))
        # Note: after one iteration, the h distribution is the same regardless of position on the grid or window size.
        self.extent = np.array([5, 5])
        window = np.array([2, 3])
        self.pi_init = np.ones([5]*2+[N])/1000
        self.model = CountingGridModel(self.extent, window)

    def test_correct_data(self):
        assert(sum(sum(self.data)) == 94)

    def test_fitted_model(self):
        self.model.fit(self.data, max_iter=1,
                       returnSumSquareDifferencesOfPi=False, pi=np.copy(self.pi_init))
        assert(np.all(np.isclose(self.model.q, .04)))
@ -0,0 +1,20 @@
- job:

  pool:
    vmImage: 'ubuntu-16.04'
  strategy:
    matrix:
      Python36:
        python.version: '3.6'

  steps:
  - task: UsePythonVersion@0
    displayName: 'Use Python $(python.version)'
    inputs:
      versionSpec: '$(python.version)'

  - script: pip install pep8
    displayName: 'Install pep8 linter'

  - script: pep8 . --max-line-length=150
    displayName: 'Run pep8 linter'