Updates for final working version

This commit is contained in:
msalvaris 2018-05-04 17:48:28 +00:00
Родитель d279779908
Коммит dec79e118b
7 изменённых файлов: 220 добавлений и 930 удалений

Разница между файлами не показана из-за своего большого размера. Загрузить разницу

Просмотреть файл

@ -41,6 +41,8 @@ create-service-principal:
select-subscription: select-subscription:
az login -o table az login -o table
az account set --subscription "$(SELECTED_SUBSCRIPTION)" az account set --subscription "$(SELECTED_SUBSCRIPTION)"
ln -s /anaconda/envs/py35/bin/conda /home/mat/repos/deep_bait/envs/default/bin/conda
create-storage: create-storage:
@echo "Creating storage account" @echo "Creating storage account"

Просмотреть файл

@ -45,6 +45,7 @@ variables:
TENANT: TENANT:
SUBSCRIPTION_ID: SUBSCRIPTION_ID:
STORAGE_ACCOUNT_KEY: STORAGE_ACCOUNT_KEY:
downloads: downloads:
DATA: DATA:
filename: data/cifar-10-python.tar.gz filename: data/cifar-10-python.tar.gz
@ -75,7 +76,9 @@ env_specs:
packages: packages:
- anaconda-project - anaconda-project
- pip: - pip:
- msrest==0.4.29 - olefile
- keyring
- msrestazure
- fire==0.1.2 - fire==0.1.2
- toolz==0.8.2 - toolz==0.8.2
- requests==2.18.4 - requests==2.18.4

Просмотреть файл

@ -395,7 +395,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.5.2" "version": "3.6.5"
} }
}, },
"nbformat": 4, "nbformat": 4,

Просмотреть файл

@ -41,7 +41,7 @@
"from os import path\n", "from os import path\n",
"from utils import cifar_for_library, yield_mb, create_logger, Timer\n", "from utils import cifar_for_library, yield_mb, create_logger, Timer\n",
"from nb_logging import NotebookLogger, output_to, error_to\n", "from nb_logging import NotebookLogger, output_to, error_to\n",
"from gpumon import db_log_context\n", "from gpumon.influxdb import log_context\n",
"import codecs\n", "import codecs\n",
"\n", "\n",
"from influxdb import InfluxDBClient" "from influxdb import InfluxDBClient"
@ -301,8 +301,8 @@
], ],
"source": [ "source": [
"with Timer() as t:\n", "with Timer() as t:\n",
" with db_log_context(LOGGER_URL, '8086', LOGGER_USRENAME, LOGGER_PASSWORD, LOGGER_DB, LOGGER_SERIES, \n", " with log_context(LOGGER_URL, LOGGER_USRENAME, LOGGER_PASSWORD, LOGGER_DB, LOGGER_SERIES, \n",
" node_id=node_id, task_id=task_id, job_id=job_id):\n", " node_id=node_id, task_id=task_id, job_id=job_id):\n",
" for j in range(EPOCHS):\n", " for j in range(EPOCHS):\n",
" for data, label in yield_mb(x_train, y_train, BATCHSIZE, shuffle=True):\n", " for data, label in yield_mb(x_train, y_train, BATCHSIZE, shuffle=True):\n",
" sess.run(model, feed_dict={X: data, y: label, training: True})\n", " sess.run(model, feed_dict={X: data, y: label, training: True})\n",
@ -377,7 +377,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.5.4" "version": "3.6.5"
} }
}, },
"nbformat": 4, "nbformat": 4,

Просмотреть файл

@ -1,18 +1,18 @@
''' Script that sets everything up and introduces helper functions into the namespace ''' Script that sets everything up and introduces helper functions into the namespace
''' '''
import logging import logging
logging.basicConfig(level=logging.ERROR)
import os import os
from glob import iglob from glob import iglob
from itertools import chain from itertools import chain
from os import path from os import path
from pprint import pprint from pprint import pprint
import utilities as ut
import azure.mgmt.batchai.models as models import azure.mgmt.batchai.models as models
import utilities as ut
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
NODE_COUNT = 10 NODE_COUNT = 10
CLUSTER_NAME = 'mync6' CLUSTER_NAME = 'mync6'
@ -274,7 +274,7 @@ def download_files(job_name, output_id, output_folder=None):
if output_folder: if output_folder:
logger.info('Downloading files to {}'.format(output_folder)) logger.info('Downloading files to {}'.format(output_folder))
files = client.jobs.list_output_files(config.group_name, job_name, models.JobsListOutputFilesOptions(output_id)) files = client.jobs.list_output_files(config.group_name, job_name, models.JobsListOutputFilesOptions(outputdirectoryid=output_id))
for file in files: for file in files:
logger.info('Downloading {}'.format(file.name)) logger.info('Downloading {}'.format(file.name))
file_name = path.join(output_folder, file.name) if output_folder else file.name file_name = path.join(output_folder, file.name) if output_folder else file.name

Просмотреть файл

@ -12,8 +12,8 @@ import requests
from azure.common.credentials import ServicePrincipalCredentials from azure.common.credentials import ServicePrincipalCredentials
from azure.storage.file import FileService from azure.storage.file import FileService
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
POLLING_INTERVAL_SEC = 5 POLLING_INTERVAL_SEC = 5
@ -72,7 +72,7 @@ class OutputStreamer:
files = self.client.jobs.list_output_files( files = self.client.jobs.list_output_files(
self.resource_group, self.job_name, self.resource_group, self.job_name,
models.JobsListOutputFilesOptions( models.JobsListOutputFilesOptions(
self.output_directory_id)) outputdirectoryid=self.output_directory_id))
if not files: if not files:
return return
else: else:
@ -248,12 +248,12 @@ def create_job(config, cluster_id, job_name, image_name, command, number_of_vms=
parameters = models.job_create_parameters.JobCreateParameters( parameters = models.job_create_parameters.JobCreateParameters(
location=config.location, location=config.location,
cluster=models.ResourceId(cluster_id), cluster=models.ResourceId(id=cluster_id),
node_count=number_of_vms, node_count=number_of_vms,
input_directories=input_directories, input_directories=input_directories,
std_out_err_path_prefix=std_output_path_prefix, std_out_err_path_prefix=std_output_path_prefix,
output_directories=output_directories, output_directories=output_directories,
container_settings=models.ContainerSettings(models.ImageSourceRegistry(image=image_name)), container_settings=models.ContainerSettings(image_source_registry=models.ImageSourceRegistry(image=image_name)),
custom_toolkit_settings=models.CustomToolkitSettings(command_line=command)) custom_toolkit_settings=models.CustomToolkitSettings(command_line=command))
@ -268,7 +268,7 @@ def wait_for_job(config, job_name):
def setup_cluster(config): def setup_cluster(config):
client = client_from(config) client = client_from(config)
container_setting_for = lambda img: models.ContainerSettings(image_source_registry=models.ImageSourceRegistry(img)) container_setting_for = lambda img: models.ContainerSettings(image_source_registry=models.ImageSourceRegistry(image=img))
container_settings = [container_setting_for(img) for img in config.image_names] container_settings = [container_setting_for(img) for img in config.image_names]
volumes = create_volume(config.storage_account['name'],config.storage_account['key'], config.fileshare_name, config.fileshare_mount_point) volumes = create_volume(config.storage_account['name'],config.storage_account['key'], config.fileshare_name, config.fileshare_mount_point)