Updates for final working version
This commit is contained in:
Родитель
d279779908
Коммит
dec79e118b
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
2
Makefile
2
Makefile
|
@ -41,6 +41,8 @@ create-service-principal:
|
||||||
select-subscription:
|
select-subscription:
|
||||||
az login -o table
|
az login -o table
|
||||||
az account set --subscription "$(SELECTED_SUBSCRIPTION)"
|
az account set --subscription "$(SELECTED_SUBSCRIPTION)"
|
||||||
|
ln -s /anaconda/envs/py35/bin/conda /home/mat/repos/deep_bait/envs/default/bin/conda
|
||||||
|
|
||||||
|
|
||||||
create-storage:
|
create-storage:
|
||||||
@echo "Creating storage account"
|
@echo "Creating storage account"
|
||||||
|
|
|
@ -45,6 +45,7 @@ variables:
|
||||||
TENANT:
|
TENANT:
|
||||||
SUBSCRIPTION_ID:
|
SUBSCRIPTION_ID:
|
||||||
STORAGE_ACCOUNT_KEY:
|
STORAGE_ACCOUNT_KEY:
|
||||||
|
|
||||||
downloads:
|
downloads:
|
||||||
DATA:
|
DATA:
|
||||||
filename: data/cifar-10-python.tar.gz
|
filename: data/cifar-10-python.tar.gz
|
||||||
|
@ -75,7 +76,9 @@ env_specs:
|
||||||
packages:
|
packages:
|
||||||
- anaconda-project
|
- anaconda-project
|
||||||
- pip:
|
- pip:
|
||||||
- msrest==0.4.29
|
- olefile
|
||||||
|
- keyring
|
||||||
|
- msrestazure
|
||||||
- fire==0.1.2
|
- fire==0.1.2
|
||||||
- toolz==0.8.2
|
- toolz==0.8.2
|
||||||
- requests==2.18.4
|
- requests==2.18.4
|
||||||
|
|
|
@ -395,7 +395,7 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.2"
|
"version": "3.6.5"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|
|
@ -41,7 +41,7 @@
|
||||||
"from os import path\n",
|
"from os import path\n",
|
||||||
"from utils import cifar_for_library, yield_mb, create_logger, Timer\n",
|
"from utils import cifar_for_library, yield_mb, create_logger, Timer\n",
|
||||||
"from nb_logging import NotebookLogger, output_to, error_to\n",
|
"from nb_logging import NotebookLogger, output_to, error_to\n",
|
||||||
"from gpumon import db_log_context\n",
|
"from gpumon.influxdb import log_context\n",
|
||||||
"import codecs\n",
|
"import codecs\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from influxdb import InfluxDBClient"
|
"from influxdb import InfluxDBClient"
|
||||||
|
@ -301,8 +301,8 @@
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"with Timer() as t:\n",
|
"with Timer() as t:\n",
|
||||||
" with db_log_context(LOGGER_URL, '8086', LOGGER_USRENAME, LOGGER_PASSWORD, LOGGER_DB, LOGGER_SERIES, \n",
|
" with log_context(LOGGER_URL, LOGGER_USRENAME, LOGGER_PASSWORD, LOGGER_DB, LOGGER_SERIES, \n",
|
||||||
" node_id=node_id, task_id=task_id, job_id=job_id):\n",
|
" node_id=node_id, task_id=task_id, job_id=job_id):\n",
|
||||||
" for j in range(EPOCHS):\n",
|
" for j in range(EPOCHS):\n",
|
||||||
" for data, label in yield_mb(x_train, y_train, BATCHSIZE, shuffle=True):\n",
|
" for data, label in yield_mb(x_train, y_train, BATCHSIZE, shuffle=True):\n",
|
||||||
" sess.run(model, feed_dict={X: data, y: label, training: True})\n",
|
" sess.run(model, feed_dict={X: data, y: label, training: True})\n",
|
||||||
|
@ -377,7 +377,7 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.4"
|
"version": "3.6.5"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|
|
@ -1,18 +1,18 @@
|
||||||
''' Script that sets everything up and introduces helper functions into the namespace
|
''' Script that sets everything up and introduces helper functions into the namespace
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
logging.basicConfig(level=logging.ERROR)
|
||||||
import os
|
import os
|
||||||
from glob import iglob
|
from glob import iglob
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from os import path
|
from os import path
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
import utilities as ut
|
||||||
import azure.mgmt.batchai.models as models
|
import azure.mgmt.batchai.models as models
|
||||||
|
|
||||||
import utilities as ut
|
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
NODE_COUNT = 10
|
NODE_COUNT = 10
|
||||||
CLUSTER_NAME = 'mync6'
|
CLUSTER_NAME = 'mync6'
|
||||||
|
@ -274,7 +274,7 @@ def download_files(job_name, output_id, output_folder=None):
|
||||||
if output_folder:
|
if output_folder:
|
||||||
logger.info('Downloading files to {}'.format(output_folder))
|
logger.info('Downloading files to {}'.format(output_folder))
|
||||||
|
|
||||||
files = client.jobs.list_output_files(config.group_name, job_name, models.JobsListOutputFilesOptions(output_id))
|
files = client.jobs.list_output_files(config.group_name, job_name, models.JobsListOutputFilesOptions(outputdirectoryid=output_id))
|
||||||
for file in files:
|
for file in files:
|
||||||
logger.info('Downloading {}'.format(file.name))
|
logger.info('Downloading {}'.format(file.name))
|
||||||
file_name = path.join(output_folder, file.name) if output_folder else file.name
|
file_name = path.join(output_folder, file.name) if output_folder else file.name
|
||||||
|
|
10
utilities.py
10
utilities.py
|
@ -12,8 +12,8 @@ import requests
|
||||||
from azure.common.credentials import ServicePrincipalCredentials
|
from azure.common.credentials import ServicePrincipalCredentials
|
||||||
from azure.storage.file import FileService
|
from azure.storage.file import FileService
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
POLLING_INTERVAL_SEC = 5
|
POLLING_INTERVAL_SEC = 5
|
||||||
|
|
||||||
|
@ -72,7 +72,7 @@ class OutputStreamer:
|
||||||
files = self.client.jobs.list_output_files(
|
files = self.client.jobs.list_output_files(
|
||||||
self.resource_group, self.job_name,
|
self.resource_group, self.job_name,
|
||||||
models.JobsListOutputFilesOptions(
|
models.JobsListOutputFilesOptions(
|
||||||
self.output_directory_id))
|
outputdirectoryid=self.output_directory_id))
|
||||||
if not files:
|
if not files:
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
|
@ -248,12 +248,12 @@ def create_job(config, cluster_id, job_name, image_name, command, number_of_vms=
|
||||||
|
|
||||||
parameters = models.job_create_parameters.JobCreateParameters(
|
parameters = models.job_create_parameters.JobCreateParameters(
|
||||||
location=config.location,
|
location=config.location,
|
||||||
cluster=models.ResourceId(cluster_id),
|
cluster=models.ResourceId(id=cluster_id),
|
||||||
node_count=number_of_vms,
|
node_count=number_of_vms,
|
||||||
input_directories=input_directories,
|
input_directories=input_directories,
|
||||||
std_out_err_path_prefix=std_output_path_prefix,
|
std_out_err_path_prefix=std_output_path_prefix,
|
||||||
output_directories=output_directories,
|
output_directories=output_directories,
|
||||||
container_settings=models.ContainerSettings(models.ImageSourceRegistry(image=image_name)),
|
container_settings=models.ContainerSettings(image_source_registry=models.ImageSourceRegistry(image=image_name)),
|
||||||
custom_toolkit_settings=models.CustomToolkitSettings(command_line=command))
|
custom_toolkit_settings=models.CustomToolkitSettings(command_line=command))
|
||||||
|
|
||||||
|
|
||||||
|
@ -268,7 +268,7 @@ def wait_for_job(config, job_name):
|
||||||
|
|
||||||
def setup_cluster(config):
|
def setup_cluster(config):
|
||||||
client = client_from(config)
|
client = client_from(config)
|
||||||
container_setting_for = lambda img: models.ContainerSettings(image_source_registry=models.ImageSourceRegistry(img))
|
container_setting_for = lambda img: models.ContainerSettings(image_source_registry=models.ImageSourceRegistry(image=img))
|
||||||
container_settings = [container_setting_for(img) for img in config.image_names]
|
container_settings = [container_setting_for(img) for img in config.image_names]
|
||||||
|
|
||||||
volumes = create_volume(config.storage_account['name'],config.storage_account['key'], config.fileshare_name, config.fileshare_mount_point)
|
volumes = create_volume(config.storage_account['name'],config.storage_account['key'], config.fileshare_name, config.fileshare_mount_point)
|
||||||
|
|
Загрузка…
Ссылка в новой задаче