Updates for final working version

This commit is contained in:
msalvaris 2018-05-04 17:48:28 +00:00
Родитель d279779908
Коммит dec79e118b
7 изменённых файлов: 220 добавлений и 930 удалений

Разница между файлами не показана из-за своего большого размера. Загрузить разницу

Просмотреть файл

@@ -41,6 +41,8 @@ create-service-principal:
select-subscription:
az login -o table
az account set --subscription "$(SELECTED_SUBSCRIPTION)"
ln -s /anaconda/envs/py35/bin/conda /home/mat/repos/deep_bait/envs/default/bin/conda
create-storage:
@echo "Creating storage account"

Просмотреть файл

@@ -45,6 +45,7 @@ variables:
TENANT:
SUBSCRIPTION_ID:
STORAGE_ACCOUNT_KEY:
downloads:
DATA:
filename: data/cifar-10-python.tar.gz
@@ -75,7 +76,9 @@ env_specs:
packages:
- anaconda-project
- pip:
- msrest==0.4.29
- olefile
- keyring
- msrestazure
- fire==0.1.2
- toolz==0.8.2
- requests==2.18.4

Просмотреть файл

@@ -395,7 +395,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
"version": "3.6.5"
}
},
"nbformat": 4,

Просмотреть файл

@@ -41,7 +41,7 @@
"from os import path\n",
"from utils import cifar_for_library, yield_mb, create_logger, Timer\n",
"from nb_logging import NotebookLogger, output_to, error_to\n",
"from gpumon import db_log_context\n",
"from gpumon.influxdb import log_context\n",
"import codecs\n",
"\n",
"from influxdb import InfluxDBClient"
@@ -301,8 +301,8 @@
],
"source": [
"with Timer() as t:\n",
" with db_log_context(LOGGER_URL, '8086', LOGGER_USRENAME, LOGGER_PASSWORD, LOGGER_DB, LOGGER_SERIES, \n",
" node_id=node_id, task_id=task_id, job_id=job_id):\n",
" with log_context(LOGGER_URL, LOGGER_USRENAME, LOGGER_PASSWORD, LOGGER_DB, LOGGER_SERIES, \n",
" node_id=node_id, task_id=task_id, job_id=job_id):\n",
" for j in range(EPOCHS):\n",
" for data, label in yield_mb(x_train, y_train, BATCHSIZE, shuffle=True):\n",
" sess.run(model, feed_dict={X: data, y: label, training: True})\n",
@@ -377,7 +377,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.4"
"version": "3.6.5"
}
},
"nbformat": 4,

Просмотреть файл

@@ -1,18 +1,18 @@
''' Script that sets everything up and introduces helper functions into the namespace
'''
import logging
logging.basicConfig(level=logging.ERROR)
import os
from glob import iglob
from itertools import chain
from os import path
from pprint import pprint
import utilities as ut
import azure.mgmt.batchai.models as models
import utilities as ut
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
NODE_COUNT = 10
CLUSTER_NAME = 'mync6'
@@ -274,7 +274,7 @@ def download_files(job_name, output_id, output_folder=None):
if output_folder:
logger.info('Downloading files to {}'.format(output_folder))
files = client.jobs.list_output_files(config.group_name, job_name, models.JobsListOutputFilesOptions(output_id))
files = client.jobs.list_output_files(config.group_name, job_name, models.JobsListOutputFilesOptions(outputdirectoryid=output_id))
for file in files:
logger.info('Downloading {}'.format(file.name))
file_name = path.join(output_folder, file.name) if output_folder else file.name

Просмотреть файл

@@ -12,8 +12,8 @@ import requests
from azure.common.credentials import ServicePrincipalCredentials
from azure.storage.file import FileService
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
POLLING_INTERVAL_SEC = 5
@@ -72,7 +72,7 @@ class OutputStreamer:
files = self.client.jobs.list_output_files(
self.resource_group, self.job_name,
models.JobsListOutputFilesOptions(
self.output_directory_id))
outputdirectoryid=self.output_directory_id))
if not files:
return
else:
@@ -248,12 +248,12 @@ def create_job(config, cluster_id, job_name, image_name, command, number_of_vms=
parameters = models.job_create_parameters.JobCreateParameters(
location=config.location,
cluster=models.ResourceId(cluster_id),
cluster=models.ResourceId(id=cluster_id),
node_count=number_of_vms,
input_directories=input_directories,
std_out_err_path_prefix=std_output_path_prefix,
output_directories=output_directories,
container_settings=models.ContainerSettings(models.ImageSourceRegistry(image=image_name)),
container_settings=models.ContainerSettings(image_source_registry=models.ImageSourceRegistry(image=image_name)),
custom_toolkit_settings=models.CustomToolkitSettings(command_line=command))
@@ -268,7 +268,7 @@ def wait_for_job(config, job_name):
def setup_cluster(config):
client = client_from(config)
container_setting_for = lambda img: models.ContainerSettings(image_source_registry=models.ImageSourceRegistry(img))
container_setting_for = lambda img: models.ContainerSettings(image_source_registry=models.ImageSourceRegistry(image=img))
container_settings = [container_setting_for(img) for img in config.image_names]
volumes = create_volume(config.storage_account['name'],config.storage_account['key'], config.fileshare_name, config.fileshare_mount_point)