* initial, remove unused imports

* run yapf

* remove unused imports and variables, fix declaration outside init

* fix some pylint warnings, add ssh_into_master

* remove unused imports

* unused variables

* string and function normalization

* stop using list comprehension for side effects, make method function

* stop using protected member

* various pylint fixes

* formatting

* formatting

* add retry decorator with tests

* start adding retry decorator, retry docker compose download

* update pip and tests

* logic fix

* change no delete if

* factor out reused functions

* fix wait_for_all_nodes

* fix download return type bug

* test vsts ci update

* temporarily disable integration tests

* syntax fix

* update vsts build

* add back integration tests, remove debug branch

* remove parallel unit tests

* more verbose clis

* update pylint

* typo

* fix imports

* function returns nothing, don't return

* make iterator list

* change debug value
This commit is contained in:
Jacob Freck 2018-08-24 17:21:22 -07:00 committed by GitHub
Parent 0a9ce94104
Commit 828162ef10
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
142 changed files: 1984 additions and 1728 deletions

View file

@ -1,7 +1,6 @@
trigger:
- master
phases:
- phase: Test
queue: Hosted Linux Preview
@ -24,16 +23,22 @@ phases:
displayName: yapf
- script: |
pylint -j 2 -E aztk aztk_cli
pylint --jobs 2 --errors-only aztk aztk_cli
condition: succeeded()
displayName: pylint
displayName: pylint error check
- script: |
pytest -n 20 --ignore=tests/integration_tests
pytest --ignore=tests/integration_tests
condition: succeeded()
displayName: unit tests
- script: |
pytest -n 75
pytest --numprocesses=75
condition: succeeded()
displayName: integration tests
- script: |
pylint --jobs 2 --disable=fixme aztk aztk_cli
continueOnError: true
condition: succeeded()
displayName: pylint report

View file

@ -1,10 +1,19 @@
from aztk import models
from aztk.internal import cluster_data
from aztk.utils import ssh as ssh_lib
from .helpers import (create_user_on_cluster, create_user_on_node, delete_user_on_cluster, delete_user_on_node,
generate_user_on_cluster, generate_user_on_node, get_application_log, get_remote_login_settings,
node_run, run, ssh_into_node)
from .helpers import (
create_user_on_cluster,
create_user_on_node,
delete_user_on_cluster,
delete_user_on_node,
generate_user_on_cluster,
generate_user_on_node,
get_application_log,
get_remote_login_settings,
node_run,
run,
ssh_into_node,
)
class BaseOperations:
@ -15,14 +24,14 @@ class BaseOperations:
Azure Batch service.
blob_client (:obj:`azure.storage.blob.BlockBlobService`): Client used to interact with the Azure Storage
Blob service.
secrets_configuration (:obj:`aztk.models.SecretsConfiguration`): Model that holds AZTK secrets used to authenticate
with Azure and the clusters.
secrets_configuration (:obj:`aztk.models.SecretsConfiguration`):
Model that holds AZTK secrets used to authenticate with Azure and the clusters.
"""
def __init__(self, context):
self.batch_client = context['batch_client']
self.blob_client = context['blob_client']
self.secrets_configuration = context['secrets_configuration']
self.batch_client = context["batch_client"]
self.blob_client = context["blob_client"]
self.secrets_configuration = context["secrets_configuration"]
def get_cluster_configuration(self, id: str) -> models.ClusterConfiguration:
"""Open an ssh tunnel to a node
@ -62,7 +71,8 @@ class BaseOperations:
id (:obj:`str`): the id of the cluster the node is in
node_id (:obj:`str`): the id of the node to open the ssh tunnel to
username (:obj:`str`): the username to authenticate the ssh session
ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None.
ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password.
Defaults to None.
password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None.
port_forward_list (:obj:`List[PortForwardingSpecification`, optional): list of PortForwardingSpecifications.
The defined ports will be forwarded to the client.
@ -89,7 +99,7 @@ class BaseOperations:
"""
return create_user_on_node.create_user_on_node(self, id, node_id, username, ssh_key, password)
#TODO: remove nodes as param
# TODO: remove nodes as param
def create_user_on_cluster(self, id, nodes, username, ssh_pub_key=None, password=None):
"""Create a user on every node in the cluster
@ -97,7 +107,8 @@ class BaseOperations:
username (:obj:`str`): name of the user to create.
id (:obj:`str`): id of the cluster to create the user on.
nodes (:obj:`List[ComputeNode]`): list of nodes to create the user on
ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None.
ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password.
Defaults to None.
password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None.
Returns:
@ -117,7 +128,7 @@ class BaseOperations:
"""
return generate_user_on_node.generate_user_on_node(self, id, node_id)
#TODO: remove nodes as param
# TODO: remove nodes as param
def generate_user_on_cluster(self, id, nodes):
"""Create a user with an autogenerated username and ssh_key on the cluster
@ -143,7 +154,7 @@ class BaseOperations:
"""
return delete_user_on_node.delete_user(self, id, node_id, username)
#TODO: remove nodes as param
# TODO: remove nodes as param
def delete_user_on_cluster(self, username, id, nodes):
"""Delete a user on every node in the cluster
@ -212,10 +223,11 @@ class BaseOperations:
Args:
id (:obj:`str`): the id of the cluster to run the command on.
application_name (:obj:`str`): str
tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes. Otherwise, the whole log will be retrieved.
Only use this if streaming the log as it is being written. Defaults to False.
current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes are retrieved.
Only useful is streaming the log as it is being written. Only used if tail is True.
tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes.
Otherwise, the whole log will be retrieved. Only use this if streaming the log as it is being written.
Defaults to False.
current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes
are retrieved. Only useful is streaming the log as it is being written. Only used if tail is True.
Returns:
:obj:`aztk.models.ApplicationLog`: a model representing the output of the application.

View file

@ -1,7 +1,7 @@
import concurrent.futures
#TODO: remove nodes param
# TODO: remove nodes param
def create_user_on_cluster(base_operations, id, nodes, username, ssh_pub_key=None, password=None):
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {

View file

@ -3,7 +3,6 @@ from datetime import datetime, timedelta, timezone
import azure.batch.models as batch_models
import azure.batch.models.batch_error as batch_error
from aztk import models
from aztk.utils import get_ssh_key

View file

@ -1,7 +1,7 @@
import concurrent.futures
#TODO: remove nodes param
# TODO: remove nodes param
def delete_user_on_cluster(base_client, id, nodes, username):
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = [executor.submit(base_client.delete_user_on_node, id, node.id, username) for node in nodes]

View file

@ -5,11 +5,11 @@ from Cryptodome.PublicKey import RSA
from aztk.utils import secure_utils
#TODO: remove nodes param
# TODO: remove nodes param
def generate_user_on_cluster(base_operations, id, nodes):
generated_username = secure_utils.generate_random_string()
ssh_key = RSA.generate(2048)
ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8')
ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8")
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {
executor.submit(base_operations.create_user_on_node, id, node.id, generated_username, ssh_pub_key): node

View file

@ -6,6 +6,6 @@ from aztk.utils import secure_utils
def generate_user_on_node(base_client, pool_id, node_id):
generated_username = secure_utils.generate_random_string()
ssh_key = RSA.generate(2048)
ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8')
ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8")
base_client.create_user_on_node(pool_id, node_id, generated_username, ssh_pub_key)
return generated_username, ssh_key

View file

@ -4,12 +4,10 @@ import azure
import azure.batch.models as batch_models
import azure.batch.models.batch_error as batch_error
from aztk import error
from aztk import models
from aztk import error, models
from aztk.utils import constants, helpers
output_file = constants.TASK_WORKING_DIR + \
"/" + constants.SPARK_SUBMIT_LOGS_FILE
output_file = constants.TASK_WORKING_DIR + "/" + constants.SPARK_SUBMIT_LOGS_FILE
def __check_task_node_exist(batch_client, cluster_id: str, task: batch_models.CloudTask) -> bool:
@ -50,17 +48,18 @@ def __get_output_file_properties(batch_client, cluster_id: str, application_name
def get_log_from_storage(blob_client, container_name, application_name, task):
try:
blob = blob_client.get_blob_to_text(container_name, application_name + '/' + constants.SPARK_SUBMIT_LOGS_FILE)
blob = blob_client.get_blob_to_text(container_name, application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE)
except azure.common.AzureMissingResourceHttpError:
raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.")
return models.ApplicationLog(
name=application_name,
cluster_id=container_name,
application_state=task.state._value_,
application_state=task.state.name,
log=blob.content,
total_bytes=blob.properties.content_length,
exit_code=task.execution_info.exit_code)
exit_code=task.execution_info.exit_code,
)
def get_log(batch_client, blob_client, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0):
@ -88,18 +87,20 @@ def get_log(batch_client, blob_client, cluster_id: str, application_name: str, t
return models.ApplicationLog(
name=application_name,
cluster_id=cluster_id,
application_state=task.state._value_,
application_state=task.state.name,
log=content,
total_bytes=target_bytes,
exit_code=task.execution_info.exit_code)
exit_code=task.execution_info.exit_code,
)
else:
return models.ApplicationLog(
name=application_name,
cluster_id=cluster_id,
application_state=task.state._value_,
log='',
application_state=task.state.name,
log="",
total_bytes=target_bytes,
exit_code=task.execution_info.exit_code)
exit_code=task.execution_info.exit_code,
)
def get_application_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0):
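The application_state change from `task.state._value_` to `task.state.name` drops the reliance on a protected Enum attribute ("stop using protected member" in the commit list). A small stand-in illustration; `TaskState` below is a placeholder, not the azure.batch model:

```python
from enum import Enum


class TaskState(Enum):
    # stand-in for azure.batch.models.TaskState
    active = "active"
    completed = "completed"


state = TaskState.completed
print(state.name)    # "completed" -- public attribute, what the code uses after this change
print(state.value)   # "completed" -- also public
# state._value_ holds the same data but is a protected attribute, which pylint flags
```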

View file

@ -22,9 +22,10 @@ def node_run(base_client, cluster_id, node_id, command, internal, container_name
generated_username,
node_rls.ip_address,
node_rls.port,
ssh_key=ssh_key.exportKey().decode('utf-8'),
ssh_key=ssh_key.exportKey().decode("utf-8"),
container_name=container_name,
timeout=timeout)
timeout=timeout,
)
return output
finally:
base_client.delete_user_on_node(cluster_id, node.id, generated_username)

View file

@ -26,9 +26,10 @@ def cluster_run(base_operations, cluster_id, command, internal, container_name=N
command,
generated_username,
cluster_nodes,
ssh_key=ssh_key.exportKey().decode('utf-8'),
ssh_key=ssh_key.exportKey().decode("utf-8"),
container_name=container_name,
timeout=timeout))
timeout=timeout,
))
return output
except OSError as exc:
raise exc

View file

@ -13,8 +13,6 @@ import aztk.utils.constants as constants
import aztk.utils.get_ssh_key as get_ssh_key
import aztk.utils.helpers as helpers
import aztk.utils.ssh as ssh_lib
from aztk.client.cluster import CoreClusterOperations
from aztk.client.job import CoreJobOperations
from aztk.internal import cluster_data
from aztk.utils import deprecated, secure_utils
@ -27,6 +25,11 @@ class CoreClient:
"""
def __init__(self):
self.secrets_configuration = None
self.batch_client = None
self.blob_client = None
def _get_context(self, secrets_configuration: models.SecretsConfiguration):
self.secrets_configuration = secrets_configuration
@ -34,9 +37,9 @@ class CoreClient:
self.batch_client = azure_api.make_batch_client(secrets_configuration)
self.blob_client = azure_api.make_blob_client(secrets_configuration)
context = {
'batch_client': self.batch_client,
'blob_client': self.blob_client,
'secrets_configuration': self.secrets_configuration,
"batch_client": self.batch_client,
"blob_client": self.blob_client,
"secrets_configuration": self.secrets_configuration,
}
return context
@ -52,9 +55,9 @@ class CoreClient:
"""
return cluster_data.ClusterData(self.blob_client, cluster_id)
'''
"""
General Batch Operations
'''
"""
@deprecated("0.10.0")
def __delete_pool_and_job(self, pool_id: str, keep_logs: bool = False):
@ -104,9 +107,8 @@ class CoreClient:
job_id = cluster_conf.cluster_id
# Get a verified node agent sku
sku_to_use, image_ref_to_use = \
helpers.select_latest_verified_vm_image_with_node_agent_sku(
VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, self.batch_client)
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, self.batch_client)
network_conf = None
if cluster_conf.subnet_id is not None:
@ -130,8 +132,9 @@ class CoreClient:
metadata=[
batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key),
batch_models.MetadataItem(
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA)
])
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA),
],
)
# Create the pool + create user for the pool
helpers.create_pool_if_not_exist(pool, self.batch_client)
@ -184,13 +187,16 @@ class CoreClient:
"""
# Create new ssh user for the given node
self.batch_client.compute_node.add_user(
pool_id, node_id,
pool_id,
node_id,
batch_models.ComputeNodeUser(
name=username,
is_admin=True,
password=password,
ssh_public_key=get_ssh_key.get_user_public_key(ssh_key, self.secrets_configuration),
expiry_time=datetime.now(timezone.utc) + timedelta(days=365)))
expiry_time=datetime.now(timezone.utc) + timedelta(days=365),
),
)
@deprecated("0.10.0")
def __delete_user(self, pool_id: str, node_id: str, username: str) -> str:
@ -229,7 +235,7 @@ class CoreClient:
def __generate_user_on_node(self, pool_id, node_id):
generated_username = secure_utils.generate_random_string()
ssh_key = RSA.generate(2048)
ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8')
ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8")
self.__create_user_on_node(generated_username, pool_id, node_id, ssh_pub_key)
return generated_username, ssh_key
@ -237,7 +243,7 @@ class CoreClient:
def __generate_user_on_pool(self, pool_id, nodes):
generated_username = secure_utils.generate_random_string()
ssh_key = RSA.generate(2048)
ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8')
ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8")
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {
executor.submit(self.__create_user_on_node, generated_username, pool_id, node.id, ssh_pub_key): node
@ -283,9 +289,10 @@ class CoreClient:
generated_username,
node_rls.ip_address,
node_rls.port,
ssh_key=ssh_key.exportKey().decode('utf-8'),
ssh_key=ssh_key.exportKey().decode("utf-8"),
container_name=container_name,
timeout=timeout)
timeout=timeout,
)
return output
finally:
self.__delete_user(cluster_id, node.id, generated_username)
@ -306,9 +313,10 @@ class CoreClient:
command,
generated_username,
cluster_nodes,
ssh_key=ssh_key.exportKey().decode('utf-8'),
ssh_key=ssh_key.exportKey().decode("utf-8"),
container_name=container_name,
timeout=timeout))
timeout=timeout,
))
return output
except OSError as exc:
raise exc
@ -316,14 +324,16 @@ class CoreClient:
self.__delete_user_on_pool(generated_username, pool.id, nodes)
@deprecated("0.10.0")
def __cluster_copy(self,
cluster_id,
source_path,
destination_path=None,
container_name=None,
internal=False,
get=False,
timeout=None):
def __cluster_copy(
self,
cluster_id,
source_path,
destination_path=None,
container_name=None,
internal=False,
get=False,
timeout=None,
):
pool, nodes = self.__get_pool_details(cluster_id)
nodes = list(nodes)
if internal:
@ -340,9 +350,10 @@ class CoreClient:
nodes=cluster_nodes,
source_path=source_path,
destination_path=destination_path,
ssh_key=ssh_key.exportKey().decode('utf-8'),
ssh_key=ssh_key.exportKey().decode("utf-8"),
get=get,
timeout=timeout))
timeout=timeout,
))
return output
except (OSError, batch_error.BatchErrorException) as exc:
raise exc
@ -375,8 +386,16 @@ class CoreClient:
)
@deprecated("0.10.0")
def __submit_job(self, job_configuration, start_task, job_manager_task, autoscale_formula,
software_metadata_key: str, vm_image_model, application_metadata):
def __submit_job(
self,
job_configuration,
start_task,
job_manager_task,
autoscale_formula,
software_metadata_key: str,
vm_image_model,
application_metadata,
):
"""
Job Submission
:param job_configuration -> aztk_sdk.spark.models.JobConfiguration
@ -390,9 +409,8 @@ class CoreClient:
self._get_cluster_data(job_configuration.id).save_cluster_config(job_configuration.to_cluster_config())
# get a verified node agent sku
sku_to_use, image_ref_to_use = \
helpers.select_latest_verified_vm_image_with_node_agent_sku(
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, self.batch_client)
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, self.batch_client)
# set up subnet if necessary
network_conf = None
@ -419,8 +437,10 @@ class CoreClient:
metadata=[
batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key),
batch_models.MetadataItem(
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA)
]))
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA),
],
),
)
# define job specification
job_spec = batch_models.JobSpecification(
@ -428,7 +448,8 @@ class CoreClient:
display_name=job_configuration.id,
on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job,
job_manager_task=job_manager_task,
metadata=[batch_models.MetadataItem(name='applications', value=application_metadata)])
metadata=[batch_models.MetadataItem(name="applications", value=application_metadata)],
)
# define schedule
schedule = batch_models.Schedule(

View file

@ -8,14 +8,16 @@ from aztk.utils import ssh as ssh_lib
from aztk.utils import helpers
def cluster_copy(cluster_operations,
cluster_id,
source_path,
destination_path=None,
container_name=None,
internal=False,
get=False,
timeout=None):
def cluster_copy(
cluster_operations,
cluster_id,
source_path,
destination_path=None,
container_name=None,
internal=False,
get=False,
timeout=None,
):
cluster = cluster_operations.get(cluster_id)
pool, nodes = cluster.pool, list(cluster.nodes)
if internal:
@ -36,9 +38,10 @@ def cluster_copy(cluster_operations,
nodes=cluster_nodes,
source_path=source_path,
destination_path=destination_path,
ssh_key=ssh_key.exportKey().decode('utf-8'),
ssh_key=ssh_key.exportKey().decode("utf-8"),
get=get,
timeout=timeout))
timeout=timeout,
))
return output
except (OSError, batch_error.BatchErrorException) as exc:
raise exc

View file

@ -5,8 +5,13 @@ from aztk import models
from aztk.utils import helpers, constants
def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterConfiguration, software_metadata_key: str,
start_task, VmImageModel):
def create_pool_and_job(
core_cluster_operations,
cluster_conf: models.ClusterConfiguration,
software_metadata_key: str,
start_task,
VmImageModel,
):
"""
Create a pool and job
:param cluster_conf: the configuration object used to create the cluster
@ -22,9 +27,8 @@ def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterCon
job_id = cluster_conf.cluster_id
# Get a verified node agent sku
sku_to_use, image_ref_to_use = \
helpers.select_latest_verified_vm_image_with_node_agent_sku(
VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, core_cluster_operations.batch_client)
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, core_cluster_operations.batch_client)
network_conf = None
if cluster_conf.subnet_id is not None:
@ -48,8 +52,9 @@ def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterCon
metadata=[
batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key),
batch_models.MetadataItem(
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA)
])
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA),
],
)
# Create the pool + create user for the pool
helpers.create_pool_if_not_exist(pool, core_cluster_operations.batch_client)

View file

@ -1,4 +1,7 @@
import azure.batch.models as batch_models
from msrest.exceptions import ClientRequestError
from aztk.utils import BackOffPolicy, retry
def delete_pool_and_job(core_cluster_operations, pool_id: str, keep_logs: bool = False):
@ -19,13 +22,18 @@ def delete_pool_and_job(core_cluster_operations, pool_id: str, keep_logs: bool =
pool_exists = core_cluster_operations.batch_client.pool.exists(pool_id)
if job_exists:
core_cluster_operations.batch_client.job.delete(job_id)
delete_batch_object(core_cluster_operations.batch_client.job.delete, job_id)
if pool_exists:
core_cluster_operations.batch_client.pool.delete(pool_id)
delete_batch_object(core_cluster_operations.batch_client.pool.delete, pool_id)
if not keep_logs:
cluster_data = core_cluster_operations.get_cluster_data(pool_id)
cluster_data.delete_container(pool_id)
return job_exists or pool_exists
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
def delete_batch_object(function, *args, **kwargs):
return function(*args, **kwargs)
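`delete_batch_object` wraps the Batch delete calls in the new `retry` decorator from `aztk.utils`. The decorator's implementation is not part of this hunk; below is a minimal sketch of what a decorator with this signature could look like, assuming `BackOffPolicy.exponential` doubles the wait between attempts (both names are stand-ins for the real `aztk.utils` versions):

```python
import time
from enum import Enum
from functools import wraps


class BackOffPolicy(Enum):
    # stand-in for aztk.utils.BackOffPolicy
    linear = "linear"
    exponential = "exponential"


def retry(retry_count=3, retry_interval=0, backoff_policy=BackOffPolicy.linear, exceptions=(Exception,)):
    # stand-in for aztk.utils.retry: call `function` up to `retry_count` times,
    # sleeping between attempts, and only swallowing the listed exception types
    def decorator(function):
        @wraps(function)
        def wrapper(*args, **kwargs):
            for attempt in range(retry_count):
                try:
                    return function(*args, **kwargs)
                except exceptions:
                    if attempt == retry_count - 1:
                        raise    # out of retries: surface the original error
                    if backoff_policy == BackOffPolicy.exponential:
                        time.sleep(retry_interval * 2**attempt)
                    else:
                        time.sleep(retry_interval)
        return wrapper
    return decorator
```

Note that `exceptions=(ClientRequestError)` at the call sites is the bare class rather than a one-element tuple (no trailing comma); an `except` clause accepts either form.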

View file

@ -1,4 +1,4 @@
#TODO: return Cluster instead of (pool, nodes)
# TODO: return Cluster instead of (pool, nodes)
from aztk import models

View file

@ -13,7 +13,8 @@ class CoreClusterOperations(BaseOperations):
cluster_configuration (:obj:`aztk.models.ClusterConfiguration`): Configuration for the cluster to be created
software_metadata_key (:obj:`str`): the key for the primary software that will be run on the cluster
start_task (:obj:`azure.batch.models.StartTask`): Batch StartTask defintion to configure the Batch Pool
vm_image_model (:obj:`azure.batch.models.VirtualMachineConfiguration`): Configuration of the virtual machine image and settings
vm_image_model (:obj:`azure.batch.models.VirtualMachineConfiguration`):
Configuration of the virtual machine image and settings
Returns:
:obj:`aztk.models.Cluster`: A Cluster object representing the state and configuration of the cluster.
@ -52,7 +53,8 @@ class CoreClusterOperations(BaseOperations):
Defaults to None.
Returns:
:obj:`List[aztk.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command.
:obj:`List[aztk.models.NodeOutput]`:
A list of NodeOutput objects representing the output of the copy command.
"""
return copy.cluster_copy(self, id, source_path, destination_path, container_name, internal, get, timeout)
@ -65,7 +67,8 @@ class CoreClusterOperations(BaseOperations):
Defaults to False.
Returns:
:obj:`List[aztk.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command.
:obj:`List[aztk.models.NodeOutput]`:
A list of NodeOutput objects representing the output of the copy command.
"""
return delete.delete_pool_and_job(self, id, keep_logs)

View file

@ -1,11 +1,20 @@
from datetime import timedelta
import azure.batch.models as batch_models
from aztk.utils import helpers, constants
from aztk.utils import constants, helpers
def submit_job(job_client, job_configuration, start_task, job_manager_task, autoscale_formula,
software_metadata_key: str, vm_image_model, application_metadata):
def submit_job(
job_client,
job_configuration,
start_task,
job_manager_task,
autoscale_formula,
software_metadata_key: str,
vm_image_model,
application_metadata,
):
"""
Job Submission
:param job_configuration -> aztk_sdk.spark.models.JobConfiguration
@ -19,9 +28,8 @@ def submit_job(job_client, job_configuration, start_task, job_manager_task, auto
job_client.get_cluster_data(job_configuration.id).save_cluster_config(job_configuration.to_cluster_config())
# get a verified node agent sku
sku_to_use, image_ref_to_use = \
helpers.select_latest_verified_vm_image_with_node_agent_sku(
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, job_client.batch_client)
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, job_client.batch_client)
# set up subnet if necessary
network_conf = None
@ -48,8 +56,10 @@ def submit_job(job_client, job_configuration, start_task, job_manager_task, auto
metadata=[
batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key),
batch_models.MetadataItem(
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA)
]))
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA),
],
),
)
# define job specification
job_spec = batch_models.JobSpecification(
@ -57,7 +67,8 @@ def submit_job(job_client, job_configuration, start_task, job_manager_task, auto
display_name=job_configuration.id,
on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job,
job_manager_task=job_manager_task,
metadata=[batch_models.MetadataItem(name='applications', value=application_metadata)])
metadata=[batch_models.MetadataItem(name="applications", value=application_metadata)],
)
# define schedule
schedule = batch_models.Schedule(

View file

@ -4,8 +4,16 @@ from .helpers import submit
class CoreJobOperations(BaseOperations):
def submit(self, job_configuration, start_task, job_manager_task, autoscale_formula, software_metadata_key: str,
vm_image_model, application_metadata):
def submit(
self,
job_configuration,
start_task,
job_manager_task,
autoscale_formula,
software_metadata_key: str,
vm_image_model,
application_metadata,
):
"""Submit a job
Jobs are a cluster definition and one or many application definitions which run on the cluster. The job's
@ -26,5 +34,13 @@ class CoreJobOperations(BaseOperations):
Returns:
:obj:`azure.batch.models.CloudJobSchedule`: Model representing the Azure Batch JobSchedule state.
"""
return submit.submit_job(self, job_configuration, start_task, job_manager_task, autoscale_formula,
software_metadata_key, vm_image_model, application_metadata)
return submit.submit_job(
self,
job_configuration,
start_task,
job_manager_task,
autoscale_formula,
software_metadata_key,
vm_image_model,
application_metadata,
)

View file

@ -35,8 +35,8 @@ class Field:
"""
def __init__(self, *validators, **kwargs):
self.default = kwargs.get('default')
self.required = 'default' not in kwargs
self.default = kwargs.get("default")
self.required = "default" not in kwargs
self.validators = []
if self.required:
@ -44,7 +44,7 @@ class Field:
self.validators.extend(validators)
choices = kwargs.get('choices')
choices = kwargs.get("choices")
if choices:
self.validators.append(aztk_validators.In(choices))
@ -134,11 +134,11 @@ class List(Field):
def __init__(self, model=None, **kwargs):
self.model = model
kwargs.setdefault('default', list)
self.merge_strategy = kwargs.get('merge_strategy', ListMergeStrategy.Append)
self.skip_none = kwargs.get('skip_none', True)
kwargs.setdefault("default", list)
self.merge_strategy = kwargs.get("merge_strategy", ListMergeStrategy.Append)
self.skip_none = kwargs.get("skip_none", True)
super().__init__(aztk_validators.List(*kwargs.get('inner_validators', [])), **kwargs)
super().__init__(aztk_validators.List(*kwargs.get("inner_validators", [])), **kwargs)
def __set__(self, instance, value):
if isinstance(value, collections.MutableSequence):
@ -175,7 +175,7 @@ class List(Field):
output = []
if items is not None:
for item in items:
if hasattr(item, 'to_dict'):
if hasattr(item, "to_dict"):
output.append(item.to_dict())
else:
output.append(item)
@ -196,7 +196,7 @@ class Model(Field):
super().__init__(aztk_validators.Model(model), *args, **kwargs)
self.model = model
self.merge_strategy = kwargs.get('merge_strategy', ModelMergeStrategy.Merge)
self.merge_strategy = kwargs.get("merge_strategy", ModelMergeStrategy.Merge)
def __set__(self, instance, value):
if isinstance(value, collections.MutableMapping):

View file

@ -11,19 +11,19 @@ class ModelMeta(type):
"""
def __new__(mcs, name, bases, attrs):
attrs['_fields'] = {}
attrs["_fields"] = {}
for base in bases:
if hasattr(base, '_fields'):
if hasattr(base, "_fields"):
for k, v in base._fields.items():
attrs['_fields'][k] = v
attrs["_fields"][k] = v
for k, v in base.__dict__.items():
if isinstance(v, fields.Field):
attrs['_fields'][k] = v
attrs["_fields"][k] = v
for k, v in attrs.items():
if isinstance(v, fields.Field):
attrs['_fields'][k] = v
attrs["_fields"][k] = v
return super().__new__(mcs, name, bases, attrs)
@ -84,7 +84,7 @@ class Model(metaclass=ModelMeta):
e.model = self
raise e
if hasattr(self, '__validate__'):
if hasattr(self, "__validate__"):
self.__validate__()
def merge(self, other):
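`ModelMeta` gathers every `fields.Field` descriptor from a class and its bases into `_fields`, which is what lets the models later in this diff (SecretsConfiguration, ClusterConfiguration, PluginConfiguration) be written declaratively. A hypothetical model following that pattern; the import path and the construct-by-kwargs usage are assumptions based on how the models in this diff are declared:

```python
from aztk import error
from aztk.core.models import Model, fields   # import path assumed


class StorageAccount(Model):
    # every Field descriptor here is collected into StorageAccount._fields by ModelMeta
    name = fields.String()
    key = fields.String()
    retries = fields.Integer(default=3)    # a 'default' kwarg makes the field optional

    def __validate__(self):
        # picked up by the hasattr(self, "__validate__") check in Model.validate()
        if self.retries < 0:
            raise error.InvalidModelError("retries must be non-negative")


account = StorageAccount(name="mystorage", key="...", retries=5)    # hypothetical usage
```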

View file

@ -24,7 +24,7 @@ class Required(Validator):
def validate(self, value):
if value is None:
raise InvalidModelFieldError('is required')
raise InvalidModelFieldError("is required")
class String(Validator):
@ -37,7 +37,7 @@ class String(Validator):
return
if not isinstance(value, str):
raise InvalidModelFieldError('{0} should be a string'.format(value))
raise InvalidModelFieldError("{0} should be a string".format(value))
class Integer(Validator):
@ -50,7 +50,7 @@ class Integer(Validator):
return
if not isinstance(value, int):
raise InvalidModelFieldError('{0} should be an integer'.format(value))
raise InvalidModelFieldError("{0} should be an integer".format(value))
class Float(Validator):
@ -63,7 +63,7 @@ class Float(Validator):
return
if not isinstance(value, float):
raise InvalidModelFieldError('{0} should be a float'.format(value))
raise InvalidModelFieldError("{0} should be a float".format(value))
class Boolean(Validator):
@ -74,7 +74,7 @@ class Boolean(Validator):
return
if not isinstance(value, bool):
raise InvalidModelFieldError('{0} should be a boolean'.format(value))
raise InvalidModelFieldError("{0} should be a boolean".format(value))
class In(Validator):
@ -90,7 +90,7 @@ class In(Validator):
return
if value not in self.choices:
raise InvalidModelFieldError('{0} should be in {1}'.format(value, self.choices))
raise InvalidModelFieldError("{0} should be in {1}".format(value, self.choices))
class InstanceOf(Validator):
@ -140,7 +140,7 @@ class List(Validator):
return
if not isinstance(value, collections.MutableSequence):
raise InvalidModelFieldError('should be a list')
raise InvalidModelFieldError("should be a list")
for i in value:
for validator in self.validators:

View file

@ -1,6 +1,7 @@
import azure.batch.models as batch_models
import datetime
from azure.storage.blob import BlockBlobService, BlobPermissions
import azure.batch.models as batch_models
from azure.storage.blob import BlobPermissions, BlockBlobService
class BlobData:
@ -19,7 +20,8 @@ class BlobData:
self.container,
self.blob,
permission=BlobPermissions.READ,
expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365))
expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365),
)
sas_url = self.blob_client.make_blob_url(self.container, self.blob, sas_token=sas_token)

View file

@ -3,8 +3,10 @@ import logging
import azure.common
import yaml
from msrest.exceptions import ClientRequestError
from aztk.models import ClusterConfiguration
from aztk.utils import BackOffPolicy, retry
from .blob_data import BlobData
from .node_data import NodeData
@ -14,6 +16,7 @@ class ClusterData:
"""
Class handling the management of data for a cluster
"""
# ALl data related to cluster(config, metadata, etc.) should be under this folder
CLUSTER_DIR = "cluster"
APPLICATIONS_DIR = "applications"
@ -24,26 +27,30 @@ class ClusterData:
self.cluster_id = cluster_id
self._ensure_container()
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
def save_cluster_config(self, cluster_config):
blob_path = self.CLUSTER_DIR + "/" + self.CLUSTER_CONFIG_FILE
content = yaml.dump(cluster_config)
container_name = cluster_config.cluster_id
self.blob_client.create_blob_from_text(container_name, blob_path, content)
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
def read_cluster_config(self):
blob_path = self.CLUSTER_DIR + "/" + self.CLUSTER_CONFIG_FILE
try:
result = self.blob_client.get_blob_to_text(self.cluster_id, blob_path)
return yaml.load(result.content)
except azure.common.AzureMissingResourceHttpError:
logging.warn("Cluster %s doesn't have cluster configuration in storage", self.cluster_id)
logging.warning("Cluster %s doesn't have cluster configuration in storage", self.cluster_id)
except yaml.YAMLError:
logging.warn("Cluster %s contains invalid cluster configuration in blob", self.cluster_id)
logging.warning("Cluster %s contains invalid cluster configuration in blob", self.cluster_id)
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
def upload_file(self, blob_path: str, local_path: str) -> BlobData:
self.blob_client.create_blob_from_path(self.cluster_id, blob_path, local_path)
return BlobData(self.blob_client, self.cluster_id, blob_path)
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
def upload_bytes(self, blob_path: str, bytes_io: io.BytesIO) -> BlobData:
self.blob_client.create_blob_from_bytes(self.cluster_id, blob_path, bytes_io.getvalue())
return BlobData(self.blob_client, self.cluster_id, blob_path)
@ -61,8 +68,10 @@ class ClusterData:
def upload_node_data(self, node_data: NodeData) -> BlobData:
return self.upload_cluster_file("node-scripts.zip", node_data.zip_path)
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
def _ensure_container(self):
self.blob_client.create_container(self.cluster_id, fail_on_exist=False)
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
def delete_container(self, container_name: str):
self.blob_client.delete_container(container_name)

View file

@ -44,11 +44,11 @@ class NodeData:
return
if isinstance(file, (str, bytes)):
full_file_path = Path(file)
with io.open(file, 'r', encoding='UTF-8') as f:
with io.open(file, "r", encoding="UTF-8") as f:
if binary:
self.zipf.write(file, os.path.join(zip_dir, full_file_path.name))
else:
self.zipf.writestr(os.path.join(zip_dir, full_file_path.name), f.read().replace('\r\n', '\n'))
self.zipf.writestr(os.path.join(zip_dir, full_file_path.name), f.read().replace("\r\n", "\n"))
elif isinstance(file, models.File):
self.zipf.writestr(os.path.join(zip_dir, file.name), file.payload.getvalue())
@ -77,36 +77,38 @@ class NodeData:
return
self.add_files(
[
spark_configuration.spark_defaults_conf, spark_configuration.spark_env_sh,
spark_configuration.core_site_xml
spark_configuration.spark_defaults_conf,
spark_configuration.spark_env_sh,
spark_configuration.core_site_xml,
],
'conf',
binary=False)
"conf",
binary=False,
)
# add ssh keys for passwordless ssh
self.zipf.writestr('id_rsa.pub', spark_configuration.ssh_key_pair['pub_key'])
self.zipf.writestr('id_rsa', spark_configuration.ssh_key_pair['priv_key'])
self.zipf.writestr("id_rsa.pub", spark_configuration.ssh_key_pair["pub_key"])
self.zipf.writestr("id_rsa", spark_configuration.ssh_key_pair["priv_key"])
if spark_configuration.jars:
for jar in spark_configuration.jars:
self.add_file(jar, 'jars', binary=True)
self.add_file(jar, "jars", binary=True)
def _add_user_conf(self):
user_conf = self.cluster_config.user_configuration
if not user_conf:
return
encrypted_aes_session_key, cipher_aes_nonce, tag, ciphertext = secure_utils.encrypt_password(
self.cluster_config.spark_configuration.ssh_key_pair['pub_key'], user_conf.password)
self.cluster_config.spark_configuration.ssh_key_pair["pub_key"], user_conf.password)
user_conf = yaml.dump({
'username': user_conf.username,
'password': ciphertext,
'ssh-key': user_conf.ssh_key,
'aes_session_key': encrypted_aes_session_key,
'cipher_aes_nonce': cipher_aes_nonce,
'tag': tag,
'cluster_id': self.cluster_config.cluster_id
"username": user_conf.username,
"password": ciphertext,
"ssh-key": user_conf.ssh_key,
"aes_session_key": encrypted_aes_session_key,
"cipher_aes_nonce": cipher_aes_nonce,
"tag": tag,
"cluster_id": self.cluster_config.cluster_id,
})
self.zipf.writestr('user.yaml', user_conf)
self.zipf.writestr("user.yaml", user_conf)
def _add_plugins(self):
if not self.cluster_config.plugins:
@ -115,23 +117,22 @@ class NodeData:
data = []
for plugin in self.cluster_config.plugins:
for file in plugin.files:
zipf = self.zipf.writestr('plugins/{0}/{1}'.format(plugin.name, file.target), file.content())
self.zipf.writestr("plugins/{0}/{1}".format(plugin.name, file.target), file.content())
if plugin.execute:
data.append(
dict(
name=plugin.name,
execute='{0}/{1}'.format(plugin.name, plugin.execute),
execute="{0}/{1}".format(plugin.name, plugin.execute),
args=plugin.args,
env=plugin.env,
target=plugin.target.value,
target_role=plugin.target_role.value,
))
self.zipf.writestr(os.path.join('plugins', 'plugins-manifest.yaml'), yaml.dump(data))
return zipf
self.zipf.writestr(os.path.join("plugins", "plugins-manifest.yaml"), yaml.dump(data))
def _add_node_scripts(self):
self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), NODE_SCRIPT_FOLDER, exclude=['*.pyc*', '*.png'])
self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), NODE_SCRIPT_FOLDER, exclude=["*.pyc*", "*.png"])
def _includeFile(self, filename: str, exclude: List[str]) -> bool:
exclude = exclude or []

View file

@ -1,4 +1,3 @@
import os
from aztk.utils.command_builder import CommandBuilder
@ -9,30 +8,30 @@ class DockerCmd:
def __init__(self, name: str, docker_repo: str, docker_run_options: str, cmd: str, gpu_enabled=False):
if gpu_enabled:
self.cmd = CommandBuilder('nvidia-docker run')
self.cmd = CommandBuilder("nvidia-docker run")
else:
self.cmd = CommandBuilder('docker run')
self.cmd.add_option('--net', 'host')
self.cmd.add_option('--name', name)
self.cmd.add_argument('-d')
self.cmd = CommandBuilder("docker run")
self.cmd.add_option("--net", "host")
self.cmd.add_option("--name", name)
self.cmd.add_argument("-d")
self.cmd.add_argument(docker_run_options)
self.cmd.add_argument(docker_repo)
self.cmd.add_argument(cmd)
def add_env(self, env: str, value: str):
self.cmd.add_option('-e', '{0}={1}'.format(env, value))
self.cmd.add_option("-e", "{0}={1}".format(env, value))
def pass_env(self, env: str):
"""
Give the value of an environment variable in the main process to the docker image
"""
self.cmd.add_option('-e', '{0}'.format(env))
self.cmd.add_option("-e", "{0}".format(env))
def share_folder(self, folder: str):
self.cmd.add_option('-v', '{0}:{0}'.format(folder))
self.cmd.add_option("-v", "{0}:{0}".format(folder))
def open_port(self, port: int):
self.cmd.add_option('-p', '{0}:{0}'.format(port)) # Spark Master UI
self.cmd.add_option("-p", "{0}:{0}".format(port)) # Spark Master UI
def to_str(self):
return self.cmd.to_str()
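`DockerCmd` only accumulates `docker run` arguments through `CommandBuilder`, so its output is a single command string. A hypothetical usage sketch based solely on the methods visible in this hunk; the image tag, entrypoint, and import path are placeholders:

```python
from docker_cmd import DockerCmd    # module path depends on where this file sits under node_scripts

cmd = DockerCmd(
    name="spark",
    docker_repo="aztk/spark:placeholder-tag",    # placeholder image
    docker_run_options="--shm-size=256m",        # arbitrary extra run options
    cmd="/bin/bash /mnt/aztk/setup_node.sh",     # placeholder container command
    gpu_enabled=False,
)
cmd.add_env("AZTK_WORKING_DIR", "/mnt/aztk")
cmd.pass_env("AZ_BATCH_POOL_ID")    # forward the host's value into the container
cmd.share_folder("/mnt")
cmd.open_port(8080)                 # Spark Master UI
print(cmd.to_str())                 # the assembled 'docker run ...' command line
```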

View file

@ -1,9 +1,16 @@
import azure.batch.models as batch_models
class ApplicationLog():
def __init__(self, name: str, cluster_id: str, log: str, total_bytes: int,
application_state: batch_models.TaskState, exit_code: int):
class ApplicationLog:
def __init__(
self,
name: str,
cluster_id: str,
log: str,
total_bytes: int,
application_state: batch_models.TaskState,
exit_code: int,
):
self.name = name
self.cluster_id = cluster_id # TODO: change to something cluster/job agnostic
self.log = log

View file

@ -11,10 +11,8 @@ class Cluster:
self.visible_state = pool.allocation_state.value
else:
self.visible_state = pool.state.value
self.total_current_nodes = pool.current_dedicated_nodes + \
pool.current_low_priority_nodes
self.total_target_nodes = pool.target_dedicated_nodes + \
pool.target_low_priority_nodes
self.total_current_nodes = pool.current_dedicated_nodes + pool.current_low_priority_nodes
self.total_target_nodes = pool.target_dedicated_nodes + pool.target_low_priority_nodes
self.current_dedicated_nodes = pool.current_dedicated_nodes
self.current_low_pri_nodes = pool.current_low_priority_nodes
self.target_dedicated_nodes = pool.target_dedicated_nodes

View file

@ -61,8 +61,8 @@ class ClusterConfiguration(Model):
def __validate__(self) -> bool:
if self.size == 0 and self.size_low_priority == 0:
raise error.InvalidModelError(
"Please supply a valid (greater than 0) size or size_low_priority value either in the cluster.yaml configuration file or with a parameter (--size or --size-low-priority)"
)
"Please supply a valid (greater than 0) size or size_low_priority value either "
"in the cluster.yaml configuration file or with a parameter (--size or --size-low-priority)")
if self.vm_size is None:
raise error.InvalidModelError(
@ -70,8 +70,8 @@ class ClusterConfiguration(Model):
if self.mixed_mode() and not self.subnet_id:
raise error.InvalidModelError(
"You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes). Set the VNET's subnet_id in your cluster.yaml or with a parameter (--subnet-id)."
)
"You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes). "
"Set the VNET's subnet_id in your cluster.yaml or with a parameter (--subnet-id).")
if self.scheduling_target == SchedulingTarget.Dedicated and self.size == 0:
raise error.InvalidModelError("Scheduling target cannot be Dedicated if dedicated vm size is 0")

View file

@ -1,7 +1,4 @@
import os
import inspect
import importlib.util
from aztk.utils import constants
from aztk.error import InvalidPluginReferenceError
from aztk.spark.models import plugins
@ -28,7 +25,8 @@ class PluginManager:
nvblas=plugins.NvBLASPlugin,
apt_get=plugins.AptGetPlugin,
pip_install=plugins.PipPlugin,
conda_install=plugins.CondaPlugin)
conda_install=plugins.CondaPlugin,
)
def __init__(self):
self.loaded = False

View file

@ -50,7 +50,5 @@ class PluginReference(Model):
execute=script_filename,
target=self.target,
target_role=self.target_role or PluginConfiguration,
files=[
PluginFile(script_filename, self.script),
],
files=[PluginFile(script_filename, self.script)],
)

View file

@ -9,6 +9,7 @@ class PluginTarget(Enum):
"""
Where this plugin should run
"""
SparkContainer = "spark-container"
Host = "host"
@ -26,6 +27,7 @@ class PluginPort(Model):
:param public: [Optional] Port available to the user. If none won't open any port to the user
:param name: [Optional] name to differentiate ports if you have multiple
"""
internal = fields.Integer()
public = fields.Field(default=None)
name = fields.Integer()
@ -55,6 +57,7 @@ class PluginConfiguration(Model):
args: List of arguments to pass to the executing script
env: Dict of environment variables to pass to the script
"""
name = fields.String()
files = fields.List(PluginFile)
execute = fields.String()

View file

@ -15,7 +15,7 @@ class PluginFile(Model):
super().__init__(target=target, local_path=local_path)
def content(self):
with open(self.local_path, "r", encoding='UTF-8') as f:
with open(self.local_path, "r", encoding="UTF-8") as f:
return f.read()

View file

@ -6,6 +6,7 @@ class ServicePrincipalConfiguration(Model):
"""
Container class for AAD authentication
"""
tenant_id = fields.String()
client_id = fields.String()
credential = fields.String()
@ -17,6 +18,7 @@ class SharedKeyConfiguration(Model):
"""
Container class for shared key authentication
"""
batch_account_name = fields.String()
batch_account_key = fields.String()
batch_service_url = fields.String()
@ -34,6 +36,7 @@ class DockerConfiguration(Model):
username (str): Docker endpoint username
password (str): Docker endpoint password
"""
endpoint = fields.String(default=None)
username = fields.String(default=None)
password = fields.String(default=None)

View file

@ -2,4 +2,5 @@ class Software:
"""
Enum with list of available softwares
"""
spark = "spark"

View file

@ -1,4 +1,4 @@
class SSHLog():
class SSHLog:
def __init__(self, output, node_id):
self.output = output
self.node_id = node_id

View file

@ -25,8 +25,8 @@ TOOLKIT_MAP = dict(
r=ToolkitEnvironmentDefinition(),
miniconda=ToolkitEnvironmentDefinition(),
anaconda=ToolkitEnvironmentDefinition(),
)),
)
),
))
class Toolkit(Model):
@ -74,12 +74,12 @@ class Toolkit(Model):
self.environment, self.environment_version, self.software, env_def.versions))
if self.docker_run_options:
invalid_character = re.search('[^A-Za-z0-9 _./:=\-\"]', self.docker_run_options)
invalid_character = re.search(r'[^A-Za-z0-9 _./:=\-"]', self.docker_run_options)
if invalid_character:
raise InvalidModelError(
"Docker run options contains invalid character '{0}'. Only A-Z, a-z, 0-9, space, hyphen (-), "
"underscore (_), period (.), forward slash (/), colon (:), equals(=), comma (,), and "
"double quote (\") are allowed.".format(invalid_character.group(0)))
'double quote (") are allowed.'.format(invalid_character.group(0)))
def get_docker_repo(self, gpu: bool):
if self.docker_repo:
@ -87,10 +87,7 @@ class Toolkit(Model):
repo = "aztk/{0}".format(self.software)
return "{repo}:{tag}".format(
repo=repo,
tag=self._get_docker_tag(gpu),
)
return "{repo}:{tag}".format(repo=repo, tag=self._get_docker_tag(gpu))
def get_docker_run_options(self):
return self.docker_run_options
@ -109,7 +106,7 @@ class Toolkit(Model):
array.append("gpu" if gpu else "base")
return '-'.join(array)
return "-".join(array)
def _get_environment_definition(self) -> ToolkitEnvironmentDefinition:
toolkit = TOOLKIT_MAP.get(self.software)

View file

@ -1,19 +1,20 @@
import os
import re
import logging
import azure.batch.batch_auth as batchauth
import azure.batch.batch_service_client as batch
import azure.storage.blob as blob
import azure.batch.batch_auth as batchauth
from core import log
from azure.common.credentials import ServicePrincipalCredentials
from azure.mgmt.batch import BatchManagementClient
from azure.mgmt.storage import StorageManagementClient
from azure.storage.common import CloudStorageAccount
RESOURCE_ID_PATTERN = re.compile('^/subscriptions/(?P<subscription>[^/]+)'
'/resourceGroups/(?P<resourcegroup>[^/]+)'
'/providers/[^/]+'
'/[^/]+Accounts/(?P<account>[^/]+)$')
from core import log
RESOURCE_ID_PATTERN = re.compile("^/subscriptions/(?P<subscription>[^/]+)"
"/resourceGroups/(?P<resourcegroup>[^/]+)"
"/providers/[^/]+"
"/[^/]+Accounts/(?P<account>[^/]+)$")
batch_account_name = os.environ.get("AZ_BATCH_ACCOUNT_NAME")
batch_account_key = os.environ.get("BATCH_ACCOUNT_KEY")
@ -44,14 +45,14 @@ def get_blob_client() -> blob.BlockBlobService:
account_name=storage_account_name, account_key=storage_account_key, endpoint_suffix=storage_account_suffix)
else:
credentials = ServicePrincipalCredentials(
client_id=client_id, secret=credential, tenant=tenant_id, resource='https://management.core.windows.net/')
client_id=client_id, secret=credential, tenant=tenant_id, resource="https://management.core.windows.net/")
m = RESOURCE_ID_PATTERN.match(storage_resource_id)
accountname = m.group('account')
subscription = m.group('subscription')
resourcegroup = m.group('resourcegroup')
accountname = m.group("account")
subscription = m.group("subscription")
resourcegroup = m.group("resourcegroup")
mgmt_client = StorageManagementClient(credentials, subscription)
key = mgmt_client.storage_accounts.list_keys(
resource_group_name=resourcegroup, account_name=accountname).keys[0].value
key = (mgmt_client.storage_accounts.list_keys(resource_group_name=resourcegroup, account_name=accountname)
.keys[0].value)
storage_client = CloudStorageAccount(accountname, key)
return storage_client.create_block_blob_service()
@ -62,13 +63,13 @@ def get_batch_client() -> batch.BatchServiceClient:
credentials = batchauth.SharedKeyCredentials(batch_account_name, batch_account_key)
else:
credentials = ServicePrincipalCredentials(
client_id=client_id, secret=credential, tenant=tenant_id, resource='https://management.core.windows.net/')
client_id=client_id, secret=credential, tenant=tenant_id, resource="https://management.core.windows.net/")
m = RESOURCE_ID_PATTERN.match(batch_resource_id)
batch_client = BatchManagementClient(credentials, m.group('subscription'))
account = batch_client.batch_account.get(m.group('resourcegroup'), m.group('account'))
base_url = 'https://%s/' % account.account_endpoint
batch_client = BatchManagementClient(credentials, m.group("subscription"))
account = batch_client.batch_account.get(m.group("resourcegroup"), m.group("account"))
base_url = "https://%s/" % account.account_endpoint
credentials = ServicePrincipalCredentials(
client_id=client_id, secret=credential, tenant=tenant_id, resource='https://batch.core.windows.net/')
client_id=client_id, secret=credential, tenant=tenant_id, resource="https://batch.core.windows.net/")
return batch.BatchServiceClient(credentials, base_url=base_url)
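`RESOURCE_ID_PATTERN` pulls the subscription, resource group, and account name out of a fully qualified Azure resource ID, as the `.group(...)` calls above show. A standalone check with a made-up resource ID:

```python
import re

RESOURCE_ID_PATTERN = re.compile("^/subscriptions/(?P<subscription>[^/]+)"
                                 "/resourceGroups/(?P<resourcegroup>[^/]+)"
                                 "/providers/[^/]+"
                                 "/[^/]+Accounts/(?P<account>[^/]+)$")

# made-up resource ID, purely for illustration
resource_id = ("/subscriptions/00000000-0000-0000-0000-000000000000"
               "/resourceGroups/my-rg/providers/Microsoft.Batch/batchAccounts/mybatch")

match = RESOURCE_ID_PATTERN.match(resource_id)
print(match.group("subscription"))    # 00000000-0000-0000-0000-000000000000
print(match.group("resourcegroup"))   # my-rg
print(match.group("account"))         # mybatch
```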

View file

@ -3,7 +3,7 @@ import logging
log = logging.getLogger("aztk.node-agent")
DEFAULT_FORMAT = '%(message)s'
DEFAULT_FORMAT = "%(message)s"
def setup_logging():

View file

@ -5,45 +5,47 @@ from Cryptodome.PublicKey import RSA
from Cryptodome.Cipher import AES, PKCS1_OAEP
from datetime import datetime, timezone, timedelta
import yaml
'''
"""
Creates a user if the user configuration file at $AZTK_WORKING_DIR/user.yaml exists
'''
"""
def create_user(batch_client):
path = os.path.join(os.environ['AZTK_WORKING_DIR'], "user.yaml")
path = os.path.join(os.environ["AZTK_WORKING_DIR"], "user.yaml")
if not os.path.isfile(path):
print("No user to create.")
return
with open(path, 'r', encoding='UTF-8') as file:
with open(path, "r", encoding="UTF-8") as file:
user_conf = yaml.load(file.read())
try:
password = None if user_conf['ssh-key'] else decrypt_password(user_conf)
password = None if user_conf["ssh-key"] else decrypt_password(user_conf)
batch_client.compute_node.add_user(
pool_id=os.environ['AZ_BATCH_POOL_ID'],
node_id=os.environ['AZ_BATCH_NODE_ID'],
pool_id=os.environ["AZ_BATCH_POOL_ID"],
node_id=os.environ["AZ_BATCH_NODE_ID"],
user=batch_models.ComputeNodeUser(
name=user_conf['username'],
name=user_conf["username"],
is_admin=True,
password=password,
ssh_public_key=str(user_conf['ssh-key']),
expiry_time=datetime.now(timezone.utc) + timedelta(days=365)))
ssh_public_key=str(user_conf["ssh-key"]),
expiry_time=datetime.now(timezone.utc) + timedelta(days=365),
),
)
except batch_error.BatchErrorException as e:
print(e)
def decrypt_password(user_conf):
cipher_text = user_conf['password']
encrypted_aes_session_key = user_conf['aes_session_key']
cipher_aes_nonce = user_conf['cipher_aes_nonce']
tag = user_conf['tag']
cipher_text = user_conf["password"]
encrypted_aes_session_key = user_conf["aes_session_key"]
cipher_aes_nonce = user_conf["cipher_aes_nonce"]
tag = user_conf["tag"]
# Read private key
with open(os.path.join(os.environ['AZTK_WORKING_DIR'], 'id_rsa'), encoding='UTF-8') as f:
with open(os.path.join(os.environ["AZTK_WORKING_DIR"], "id_rsa"), encoding="UTF-8") as f:
private_key = RSA.import_key(f.read())
# Decrypt the session key with the public RSA key
cipher_rsa = PKCS1_OAEP.new(private_key)
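`decrypt_password` undoes the hybrid encryption done on the client by `secure_utils.encrypt_password`, whose 4-tuple result is unpacked in the node_data.py hunk above. That implementation is not in this diff; a sketch of what the client side could look like with PyCryptodome, assuming AES EAX mode (consistent with the nonce/tag pair handled here):

```python
from Cryptodome.Cipher import AES, PKCS1_OAEP
from Cryptodome.PublicKey import RSA
from Cryptodome.Random import get_random_bytes


def encrypt_password(ssh_pub_key: str, password: str):
    # returns the same 4-tuple unpacked in node_data.py:
    # (encrypted_aes_session_key, cipher_aes_nonce, tag, ciphertext)
    recipient_key = RSA.import_key(ssh_pub_key)
    session_key = get_random_bytes(16)

    # encrypt the AES session key with the user's RSA public key
    cipher_rsa = PKCS1_OAEP.new(recipient_key)
    encrypted_aes_session_key = cipher_rsa.encrypt(session_key)

    # encrypt the password itself with AES-EAX, which produces a nonce and a tag
    cipher_aes = AES.new(session_key, AES.MODE_EAX)
    ciphertext, tag = cipher_aes.encrypt_and_digest(password.encode())
    return encrypted_aes_session_key, cipher_aes.nonce, tag, ciphertext
```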

View file

@ -25,7 +25,7 @@ def setup_host(docker_repo: str, docker_run_options: str):
client = config.batch_client
create_user.create_user(batch_client=client)
if os.environ['AZ_BATCH_NODE_IS_DEDICATED'] == "true" or os.environ['AZTK_MIXED_MODE'] == "false":
if os.environ["AZ_BATCH_NODE_IS_DEDICATED"] == "true" or os.environ["AZTK_MIXED_MODE"] == "false":
is_master = pick_master.find_master(client)
else:
is_master = False
@ -50,7 +50,7 @@ def setup_host(docker_repo: str, docker_run_options: str):
setup_node_scheduling(client, cluster_conf, is_master)
#TODO pass azure file shares
# TODO pass azure file shares
spark_container.start_spark_container(
docker_repo=docker_repo,
docker_run_options=docker_run_options,
@ -82,4 +82,4 @@ def setup_spark_container():
plugins.setup_plugins(target=PluginTarget.SparkContainer, is_master=is_master, is_worker=is_worker)
open("/tmp/setup_complete", 'a').close()
open("/tmp/setup_complete", "a").close()

View file

@ -37,8 +37,8 @@ def try_assign_self_as_master(client: batch.BatchServiceClient, pool: batchmodel
client.pool.patch(
config.pool_id,
batchmodels.PoolPatchParameter(metadata=new_metadata),
batchmodels.PoolPatchOptions(if_match=pool.e_tag,
))
batchmodels.PoolPatchOptions(if_match=pool.e_tag),
)
return True
except (batcherror.BatchErrorException, ClientRequestError):
print("Couldn't assign itself as master the pool because the pool was modified since last get.")

View file

@ -1,18 +1,19 @@
import os
import json
import yaml
import os
import subprocess
from pathlib import Path
import yaml
from aztk.models.plugins import PluginTarget, PluginTargetRole
log_folder = os.path.join(os.environ['AZTK_WORKING_DIR'], 'logs', 'plugins')
log_folder = os.path.join(os.environ["AZTK_WORKING_DIR"], "logs", "plugins")
def _read_manifest_file(path=None):
if not os.path.isfile(path):
print("Plugins manifest file doesn't exist at {0}".format(path))
else:
with open(path, 'r', encoding='UTF-8') as stream:
with open(path, "r", encoding="UTF-8") as stream:
try:
return yaml.load(stream)
except json.JSONDecodeError as err:
@ -22,7 +23,7 @@ def _read_manifest_file(path=None):
def setup_plugins(target: PluginTarget, is_master: bool = False, is_worker: bool = False):
plugins_dir = _plugins_dir()
plugins_manifest = _read_manifest_file(os.path.join(plugins_dir, 'plugins-manifest.yaml'))
plugins_manifest = _read_manifest_file(os.path.join(plugins_dir, "plugins-manifest.yaml"))
if not os.path.exists(log_folder):
os.makedirs(log_folder)
@ -32,28 +33,41 @@ def setup_plugins(target: PluginTarget, is_master: bool = False, is_worker: bool
def _plugins_dir():
return os.path.join(os.environ['AZTK_WORKING_DIR'], 'plugins')
return os.path.join(os.environ["AZTK_WORKING_DIR"], "plugins")
def _run_on_this_node(plugin_obj, target: PluginTarget, is_master, is_worker):
print("Loading plugin {} in {} on {}".format(plugin_obj["execute"], plugin_obj['target'],
plugin_obj['target_role']))
print("Loading plugin {} in {} on {}".format(plugin_obj["execute"], plugin_obj["target"],
plugin_obj["target_role"]))
if plugin_obj['target'] != target.value:
print("Ignoring ", plugin_obj["execute"], "as target is for ", plugin_obj['target'],
"but is currently running in ", target.value)
if plugin_obj["target"] != target.value:
print(
"Ignoring ",
plugin_obj["execute"],
"as target is for ",
plugin_obj["target"],
"but is currently running in ",
target.value,
)
return False
if plugin_obj['target_role'] == PluginTargetRole.Master.value and is_master is True:
if plugin_obj["target_role"] == PluginTargetRole.Master.value and is_master is True:
return True
if plugin_obj['target_role'] == PluginTargetRole.Worker.value and is_worker is True:
if plugin_obj["target_role"] == PluginTargetRole.Worker.value and is_worker is True:
return True
if plugin_obj['target_role'] == PluginTargetRole.All.value:
if plugin_obj["target_role"] == PluginTargetRole.All.value:
return True
print("Ignoring plugin", plugin_obj["execute"], "as target role is ", plugin_obj['target_role'],
"and node is master: ", is_master, is_worker)
print(
"Ignoring plugin",
plugin_obj["execute"],
"as target role is ",
plugin_obj["target_role"],
"and node is master: ",
is_master,
is_worker,
)
return False
@ -63,8 +77,8 @@ def _setup_plugins(plugins_manifest, target: PluginTarget, is_master, is_worker)
for plugin in plugins_manifest:
if _run_on_this_node(plugin, target, is_master, is_worker):
path = os.path.join(plugins_dir, plugin['execute'])
_run_script(plugin.get("name"), path, plugin.get('args'), plugin.get('env'))
path = os.path.join(plugins_dir, plugin["execute"])
_run_script(plugin.get("name"), path, plugin.get("args"), plugin.get("env"))
def _run_script(name: str, script_path: str = None, args: dict = None, env: dict = None):
@ -84,7 +98,7 @@ def _run_script(name: str, script_path: str = None, args: dict = None, env: dict
if args is None:
args = []
out_file = open(os.path.join(log_folder, '{0}.txt'.format(name)), 'w', encoding='UTF-8')
out_file = open(os.path.join(log_folder, "{0}.txt".format(name)), "w", encoding="UTF-8")
try:
subprocess.call([script_path] + args, env=my_env, stdout=out_file, stderr=out_file)
print("Finished running")


@ -2,13 +2,14 @@
Code that handle spark configuration
"""
import datetime
import time
import os
import json
import shutil
from subprocess import call, Popen, check_output
import time
from subprocess import call
from typing import List
import azure.batch.models as batchmodels
from core import config
from install import pick_master
@ -55,7 +56,7 @@ def setup_connection():
master_node = get_node(master_node_id)
master_config_file = os.path.join(spark_conf_folder, "master")
master_file = open(master_config_file, 'w', encoding='UTF-8')
master_file = open(master_config_file, "w", encoding="UTF-8")
print("Adding master node ip {0} to config file '{1}'".format(master_node.ip_address, master_config_file))
master_file.write("{0}\n".format(master_node.ip_address))
@ -127,9 +128,9 @@ def setup_conf():
def setup_ssh_keys():
pub_key_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'id_rsa.pub')
priv_key_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'id_rsa')
ssh_key_dest = '/root/.ssh'
pub_key_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "id_rsa.pub")
priv_key_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "id_rsa")
ssh_key_dest = "/root/.ssh"
if not os.path.exists(ssh_key_dest):
os.mkdir(ssh_key_dest)
@ -139,27 +140,27 @@ def setup_ssh_keys():
def copy_spark_env():
spark_env_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'conf/spark-env.sh')
spark_env_path_dest = os.path.join(spark_home, 'conf/spark-env.sh')
spark_env_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "conf/spark-env.sh")
spark_env_path_dest = os.path.join(spark_home, "conf/spark-env.sh")
copyfile(spark_env_path_src, spark_env_path_dest)
def copy_spark_defaults():
spark_default_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'conf/spark-defaults.conf')
spark_default_path_dest = os.path.join(spark_home, 'conf/spark-defaults.conf')
spark_default_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "conf/spark-defaults.conf")
spark_default_path_dest = os.path.join(spark_home, "conf/spark-defaults.conf")
copyfile(spark_default_path_src, spark_default_path_dest)
def copy_core_site():
spark_core_site_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'conf/core-site.xml')
spark_core_site_dest = os.path.join(spark_home, 'conf/core-site.xml')
spark_core_site_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "conf/core-site.xml")
spark_core_site_dest = os.path.join(spark_home, "conf/core-site.xml")
copyfile(spark_core_site_src, spark_core_site_dest)
def copy_jars():
# Copy jars to $SPARK_HOME/jars
spark_default_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'jars')
spark_default_path_dest = os.path.join(spark_home, 'jars')
spark_default_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "jars")
spark_default_path_dest = os.path.join(spark_home, "jars")
try:
jar_files = os.listdir(spark_default_path_src)
@ -175,10 +176,10 @@ def copy_jars():
def parse_configuration_file(path_to_file: str):
try:
file = open(path_to_file, 'r', encoding='UTF-8')
file = open(path_to_file, "r", encoding="UTF-8")
properties = {}
for line in file:
if (not line.startswith('#') and len(line) > 1):
if not line.startswith("#") and len(line) > 1:
split = line.split()
properties[split[0]] = split[1]
return properties
@ -189,10 +190,10 @@ def parse_configuration_file(path_to_file: str):
def start_history_server():
# configure the history server
spark_event_log_enabled_key = 'spark.eventLog.enabled'
spark_event_log_directory_key = 'spark.eventLog.dir'
spark_history_fs_log_directory = 'spark.history.fs.logDirectory'
path_to_spark_defaults_conf = os.path.join(spark_home, 'conf/spark-defaults.conf')
spark_event_log_enabled_key = "spark.eventLog.enabled"
spark_event_log_directory_key = "spark.eventLog.dir"
spark_history_fs_log_directory = "spark.history.fs.logDirectory"
path_to_spark_defaults_conf = os.path.join(spark_home, "conf/spark-defaults.conf")
properties = parse_configuration_file(path_to_spark_defaults_conf)
required_keys = [spark_event_log_enabled_key, spark_event_log_directory_key, spark_history_fs_log_directory]
@ -208,17 +209,17 @@ def start_history_server():
def configure_history_server_log_path(path_to_log_file):
# Check if the file path starts with a local file extension
# If so, create the path on disk otherwise ignore
print('Configuring spark history server log directory {}.'.format(path_to_log_file))
if path_to_log_file.startswith('file:/'):
print("Configuring spark history server log directory {}.".format(path_to_log_file))
if path_to_log_file.startswith("file:/"):
# create the local path on disk
directory = path_to_log_file.replace('file:', '')
directory = path_to_log_file.replace("file:", "")
if os.path.exists(directory):
print('Skipping. Directory {} already exists.'.format(directory))
print("Skipping. Directory {} already exists.".format(directory))
else:
print('Create directory {}.'.format(directory))
print("Create directory {}.".format(directory))
os.makedirs(directory)
# Make sure the directory can be accessed by all users
os.chmod(directory, mode=0o777)
else:
print('Skipping. The eventLog directory is not local.')
print("Skipping. The eventLog directory is not local.")


@ -15,42 +15,43 @@ def start_spark_container(docker_repo: str = None,
docker_repo=docker_repo,
docker_run_options=docker_run_options,
cmd="/bin/bash /mnt/batch/tasks/startup/wd/aztk/node_scripts/docker_main.sh",
gpu_enabled=gpu_enabled)
gpu_enabled=gpu_enabled,
)
if file_mounts:
for mount in file_mounts:
cmd.share_folder(mount.mount_path)
cmd.share_folder('/mnt')
cmd.share_folder("/mnt")
cmd.pass_env('AZTK_WORKING_DIR')
cmd.pass_env('AZ_BATCH_ACCOUNT_NAME')
cmd.pass_env('BATCH_ACCOUNT_KEY')
cmd.pass_env('BATCH_SERVICE_URL')
cmd.pass_env('STORAGE_ACCOUNT_NAME')
cmd.pass_env('STORAGE_ACCOUNT_KEY')
cmd.pass_env('STORAGE_ACCOUNT_SUFFIX')
cmd.pass_env("AZTK_WORKING_DIR")
cmd.pass_env("AZ_BATCH_ACCOUNT_NAME")
cmd.pass_env("BATCH_ACCOUNT_KEY")
cmd.pass_env("BATCH_SERVICE_URL")
cmd.pass_env("STORAGE_ACCOUNT_NAME")
cmd.pass_env("STORAGE_ACCOUNT_KEY")
cmd.pass_env("STORAGE_ACCOUNT_SUFFIX")
cmd.pass_env('SP_TENANT_ID')
cmd.pass_env('SP_CLIENT_ID')
cmd.pass_env('SP_CREDENTIAL')
cmd.pass_env('SP_BATCH_RESOURCE_ID')
cmd.pass_env('SP_STORAGE_RESOURCE_ID')
cmd.pass_env("SP_TENANT_ID")
cmd.pass_env("SP_CLIENT_ID")
cmd.pass_env("SP_CREDENTIAL")
cmd.pass_env("SP_BATCH_RESOURCE_ID")
cmd.pass_env("SP_STORAGE_RESOURCE_ID")
cmd.pass_env('AZ_BATCH_POOL_ID')
cmd.pass_env('AZ_BATCH_NODE_ID')
cmd.pass_env('AZ_BATCH_NODE_IS_DEDICATED')
cmd.pass_env("AZ_BATCH_POOL_ID")
cmd.pass_env("AZ_BATCH_NODE_ID")
cmd.pass_env("AZ_BATCH_NODE_IS_DEDICATED")
cmd.pass_env('AZTK_WORKER_ON_MASTER')
cmd.pass_env('AZTK_MIXED_MODE')
cmd.pass_env('AZTK_IS_MASTER')
cmd.pass_env('AZTK_IS_WORKER')
cmd.pass_env('AZTK_MASTER_IP')
cmd.pass_env("AZTK_WORKER_ON_MASTER")
cmd.pass_env("AZTK_MIXED_MODE")
cmd.pass_env("AZTK_IS_MASTER")
cmd.pass_env("AZTK_IS_WORKER")
cmd.pass_env("AZTK_MASTER_IP")
cmd.pass_env('SPARK_WEB_UI_PORT')
cmd.pass_env('SPARK_WORKER_UI_PORT')
cmd.pass_env('SPARK_CONTAINER_NAME')
cmd.pass_env('SPARK_SUBMIT_LOGS_FILE')
cmd.pass_env('SPARK_JOB_UI_PORT')
cmd.pass_env("SPARK_WEB_UI_PORT")
cmd.pass_env("SPARK_WORKER_UI_PORT")
cmd.pass_env("SPARK_CONTAINER_NAME")
cmd.pass_env("SPARK_SUBMIT_LOGS_FILE")
cmd.pass_env("SPARK_JOB_UI_PORT")
cmd.open_port(8080) # Spark Master UI
cmd.open_port(7077) # Spark Master
@ -69,5 +70,5 @@ def start_spark_container(docker_repo: str = None,
print("-" * 60)
print(cmd.to_str())
print("=" * 60)
subprocess.call(['/bin/bash', '-c', 'echo Is master?: $AZTK_IS_MASTER _ $AZTK_IS_WORKER'])
subprocess.call(['/bin/bash', '-c', cmd.to_str()])
subprocess.call(["/bin/bash", "-c", "echo Is master?: $AZTK_IS_MASTER _ $AZTK_IS_WORKER"])
subprocess.call(["/bin/bash", "-c", cmd.to_str()])


@ -1,12 +1,8 @@
import datetime
import os
import subprocess
import sys
from typing import List
import azure.batch.models as batch_models
import azure.storage.blob as blob
import yaml
from aztk.utils.command_builder import CommandBuilder
from core import config
from install.pick_master import get_master_node_id
@ -20,14 +16,13 @@ def affinitize_task_to_master(batch_client, cluster_id, task):
def schedule_tasks(tasks_path):
'''
"""
Handle the request to submit a task
'''
"""
batch_client = config.batch_client
blob_client = config.blob_client
for task_definition in tasks_path:
with open(task_definition, 'r', encoding='UTF-8') as stream:
with open(task_definition, "r", encoding="UTF-8") as stream:
try:
task = yaml.load(stream)
except yaml.YAMLError as exc:
@ -36,13 +31,13 @@ def schedule_tasks(tasks_path):
# affinitize task to master
task = affinitize_task_to_master(batch_client, os.environ["AZ_BATCH_POOL_ID"], task)
# schedule the task
batch_client.task.add(job_id=os.environ['AZ_BATCH_JOB_ID'], task=task)
batch_client.task.add(job_id=os.environ["AZ_BATCH_JOB_ID"], task=task)
if __name__ == "__main__":
tasks_path = []
for file in os.listdir(os.environ['AZ_BATCH_TASK_WORKING_DIR']):
for file in os.listdir(os.environ["AZ_BATCH_TASK_WORKING_DIR"]):
if file.endswith(".yaml"):
tasks_path.append(os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], file))
tasks_path.append(os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], file))
schedule_tasks(tasks_path)


@ -42,7 +42,9 @@ install_prerequisites () {
install_docker_compose () {
echo "Installing Docker-Compose"
sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose
for i in {1..5}; do
sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose && break || sleep 2;
done
sudo chmod +x /usr/local/bin/docker-compose
echo "Finished installing Docker-Compose"
}
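
The loop above retries the Docker Compose download up to five times with a short sleep between attempts; the Python side of this changeset adds a retry decorator in the same spirit (its actual signature is not shown in this diff, so the sketch below is only indicative):

import time
from functools import wraps

def retry(attempts=5, delay=2, exceptions=(Exception,)):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    if attempt == attempts:
                        raise
                    time.sleep(delay)
        return wrapper
    return decorator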
@ -64,9 +66,9 @@ pull_docker_container () {
install_python_dependencies () {
echo "Installing python dependencies"
pipenv install --python /usr/bin/python3.5m
pipenv run pip install --upgrade setuptools wheel #TODO: add pip when pipenv is compatible with pip10
pipenv run pip install --upgrade pip setuptools wheel
pip --version
echo "Finished installing python dependencies"
}
run_docker_container () {


@ -1,20 +1,22 @@
import sys
import os
import logging
import yaml
import subprocess
import datetime
import logging
import os
import subprocess
import sys
from typing import List
import azure.storage.blob as blob
import azure.batch.models as batch_models
import azure.storage.blob as blob
import yaml
from aztk.utils.command_builder import CommandBuilder
from core import config
# limit azure.storage logging
logging.getLogger("azure.storage").setLevel(logging.CRITICAL)
'''
"""
Submit helper methods
'''
"""
def upload_file_to_container(container_name,
@ -40,7 +42,7 @@ def upload_file_to_container(container_name,
blob_name = file_path.strip("/")
else:
blob_name = os.path.basename(file_path)
blob_path = application_name + '/' + blob_name
blob_path = application_name + "/" + blob_name
if not node_path:
node_path = blob_name
@ -53,47 +55,60 @@ def upload_file_to_container(container_name,
container_name,
blob_path,
permission=blob.BlobPermissions.READ,
expiry=datetime.datetime.utcnow() + datetime.timedelta(days=7))
expiry=datetime.datetime.utcnow() + datetime.timedelta(days=7),
)
sas_url = blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token)
return batch_models.ResourceFile(file_path=node_path, blob_source=sas_url)
def __app_submit_cmd(name: str, app: str, app_args: List[str], main_class: str, jars: List[str], py_files: List[str],
files: List[str], driver_java_options: str, driver_library_path: str, driver_class_path: str,
driver_memory: str, executor_memory: str, driver_cores: int, executor_cores: int):
cluster_id = os.environ['AZ_BATCH_POOL_ID']
spark_home = os.environ['SPARK_HOME']
with open(os.path.join(spark_home, 'conf', 'master')) as f:
def __app_submit_cmd(
name: str,
app: str,
app_args: List[str],
main_class: str,
jars: List[str],
py_files: List[str],
files: List[str],
driver_java_options: str,
driver_library_path: str,
driver_class_path: str,
driver_memory: str,
executor_memory: str,
driver_cores: int,
executor_cores: int,
):
spark_home = os.environ["SPARK_HOME"]
with open(os.path.join(spark_home, "conf", "master")) as f:
master_ip = f.read().rstrip()
# set file paths to correct path on container
files_path = os.environ['AZ_BATCH_TASK_WORKING_DIR']
files_path = os.environ["AZ_BATCH_TASK_WORKING_DIR"]
jars = [os.path.join(files_path, os.path.basename(jar)) for jar in jars]
py_files = [os.path.join(files_path, os.path.basename(py_file)) for py_file in py_files]
files = [os.path.join(files_path, os.path.basename(f)) for f in files]
# 2>&1 redirect stdout and stderr to be in the same file
spark_submit_cmd = CommandBuilder('{0}/bin/spark-submit'.format(spark_home))
spark_submit_cmd.add_option('--master', 'spark://{0}:7077'.format(master_ip))
spark_submit_cmd.add_option('--name', name)
spark_submit_cmd.add_option('--class', main_class)
spark_submit_cmd.add_option('--jars', jars and ','.join(jars))
spark_submit_cmd.add_option('--py-files', py_files and ','.join(py_files))
spark_submit_cmd.add_option('--files', files and ','.join(files))
spark_submit_cmd.add_option('--driver-java-options', driver_java_options)
spark_submit_cmd.add_option('--driver-library-path', driver_library_path)
spark_submit_cmd.add_option('--driver-class-path', driver_class_path)
spark_submit_cmd.add_option('--driver-memory', driver_memory)
spark_submit_cmd.add_option('--executor-memory', executor_memory)
spark_submit_cmd = CommandBuilder("{0}/bin/spark-submit".format(spark_home))
spark_submit_cmd.add_option("--master", "spark://{0}:7077".format(master_ip))
spark_submit_cmd.add_option("--name", name)
spark_submit_cmd.add_option("--class", main_class)
spark_submit_cmd.add_option("--jars", jars and ",".join(jars))
spark_submit_cmd.add_option("--py-files", py_files and ",".join(py_files))
spark_submit_cmd.add_option("--files", files and ",".join(files))
spark_submit_cmd.add_option("--driver-java-options", driver_java_options)
spark_submit_cmd.add_option("--driver-library-path", driver_library_path)
spark_submit_cmd.add_option("--driver-class-path", driver_class_path)
spark_submit_cmd.add_option("--driver-memory", driver_memory)
spark_submit_cmd.add_option("--executor-memory", executor_memory)
if driver_cores:
spark_submit_cmd.add_option('--driver-cores', str(driver_cores))
spark_submit_cmd.add_option("--driver-cores", str(driver_cores))
if executor_cores:
spark_submit_cmd.add_option('--executor-cores', str(executor_cores))
spark_submit_cmd.add_option("--executor-cores", str(executor_cores))
spark_submit_cmd.add_argument(
os.path.expandvars(app) + ' ' + ' '.join(['\'' + str(app_arg) + '\'' for app_arg in (app_args or [])]))
os.path.expandvars(app) + " " + " ".join(["'" + str(app_arg) + "'" for app_arg in (app_args or [])]))
with open("spark-submit.txt", mode="w", encoding="UTF-8") as stream:
stream.write(spark_submit_cmd.to_str())
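
The final `add_argument` call above is the one place where user input is spliced into the command line, and it single-quotes every application argument individually; for example:

import os

app, app_args = "wordcount.py", ["wasbs://input@account.blob.core.windows.net/", "2"]    # illustrative values
arg_string = os.path.expandvars(app) + " " + " ".join(["'" + str(arg) + "'" for arg in app_args])
# -> "wordcount.py 'wasbs://input@account.blob.core.windows.net/' '2'"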
@ -102,50 +117,51 @@ def __app_submit_cmd(name: str, app: str, app_args: List[str], main_class: str,
def load_application(application_file_path):
'''
"""
Read and parse the application from file
'''
with open(application_file_path, encoding='UTF-8') as f:
"""
with open(application_file_path, encoding="UTF-8") as f:
application = yaml.load(f)
return application
def upload_log(blob_client, application):
'''
"""
upload output.log to storage account
'''
log_file = os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], os.environ['SPARK_SUBMIT_LOGS_FILE'])
"""
log_file = os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], os.environ["SPARK_SUBMIT_LOGS_FILE"])
upload_file_to_container(
container_name=os.environ['STORAGE_LOGS_CONTAINER'],
application_name=application['name'],
container_name=os.environ["STORAGE_LOGS_CONTAINER"],
application_name=application["name"],
file_path=log_file,
blob_client=blob_client,
use_full_path=False)
use_full_path=False,
)
def receive_submit_request(application_file_path):
'''
"""
Handle the request to submit a task
'''
batch_client = config.batch_client
"""
blob_client = config.blob_client
application = load_application(application_file_path)
cmd = __app_submit_cmd(
name=application['name'],
app=application['application'],
app_args=application['application_args'],
main_class=application['main_class'],
jars=application['jars'],
py_files=application['py_files'],
files=application['files'],
driver_java_options=application['driver_java_options'],
driver_library_path=application['driver_library_path'],
driver_class_path=application['driver_class_path'],
driver_memory=application['driver_memory'],
executor_memory=application['executor_memory'],
driver_cores=application['driver_cores'],
executor_cores=application['executor_cores'])
name=application["name"],
app=application["application"],
app_args=application["application_args"],
main_class=application["main_class"],
jars=application["jars"],
py_files=application["py_files"],
files=application["files"],
driver_java_options=application["driver_java_options"],
driver_library_path=application["driver_library_path"],
driver_class_path=application["driver_class_path"],
driver_memory=application["driver_memory"],
executor_memory=application["executor_memory"],
driver_cores=application["driver_cores"],
executor_cores=application["executor_cores"],
)
return_code = subprocess.call(cmd.to_str(), shell=True)
upload_log(blob_client, application)
@ -157,24 +173,25 @@ def upload_error_log(error, application_file_path):
blob_client = config.blob_client
error_log_path = os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "error.log")
with open(error_log_path, "w", encoding='UTF-8') as error_log:
with open(error_log_path, "w", encoding="UTF-8") as error_log:
error_log.write(error)
upload_file_to_container(
container_name=os.environ['STORAGE_LOGS_CONTAINER'],
application_name=application['name'],
container_name=os.environ["STORAGE_LOGS_CONTAINER"],
application_name=application["name"],
file_path=os.path.realpath(error_log.name),
blob_client=blob_client,
use_full_path=False)
use_full_path=False,
)
upload_log(blob_client, application)
if __name__ == "__main__":
return_code = 1
try:
return_code = receive_submit_request(os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], 'application.yaml'))
return_code = receive_submit_request(os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "application.yaml"))
except Exception as e:
upload_error_log(str(e), os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], 'application.yaml'))
upload_error_log(str(e), os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "application.yaml"))
# force batch task exit code to match spark exit code
sys.exit(return_code)


@ -1,8 +1,8 @@
import time
import os
while not os.path.exists('/tmp/setup_complete'):
while not os.path.exists("/tmp/setup_complete"):
time.sleep(1)
print("SETUP FINISHED")
os.remove('/tmp/setup_complete')
os.remove("/tmp/setup_complete")


@ -17,12 +17,13 @@ def generate_application_task(core_base_operations, container_id, application, r
application_name=application.name,
file_path=application.application,
blob_client=core_base_operations.blob_client,
use_full_path=False)
use_full_path=False,
)
# Upload application file
resource_files.append(app_resource_file)
application.application = '$AZ_BATCH_TASK_WORKING_DIR/' + os.path.basename(application.application)
application.application = "$AZ_BATCH_TASK_WORKING_DIR/" + os.path.basename(application.application)
# Upload dependent JARS
jar_resource_file_paths = []
@ -32,7 +33,8 @@ def generate_application_task(core_base_operations, container_id, application, r
application_name=application.name,
file_path=jar,
blob_client=core_base_operations.blob_client,
use_full_path=False)
use_full_path=False,
)
jar_resource_file_paths.append(current_jar_resource_file_path)
resource_files.append(current_jar_resource_file_path)
@ -44,7 +46,8 @@ def generate_application_task(core_base_operations, container_id, application, r
application_name=application.name,
file_path=py_file,
blob_client=core_base_operations.blob_client,
use_full_path=False)
use_full_path=False,
)
py_files_resource_file_paths.append(current_py_files_resource_file_path)
resource_files.append(current_py_files_resource_file_path)
@ -56,7 +59,8 @@ def generate_application_task(core_base_operations, container_id, application, r
application_name=application.name,
file_path=file,
blob_client=core_base_operations.blob_client,
use_full_path=False)
use_full_path=False,
)
files_resource_file_paths.append(files_resource_file_path)
resource_files.append(files_resource_file_path)
@ -67,21 +71,23 @@ def generate_application_task(core_base_operations, container_id, application, r
application_definition_file = helpers.upload_text_to_container(
container_name=container_id,
application_name=application.name,
file_path='application.yaml',
file_path="application.yaml",
content=yaml.dump(vars(application)),
blob_client=core_base_operations.blob_client)
blob_client=core_base_operations.blob_client,
)
resource_files.append(application_definition_file)
# create command to submit task
task_cmd = CommandBuilder('sudo docker exec')
task_cmd.add_argument('-i')
task_cmd.add_option('-e', 'AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR')
task_cmd.add_option('-e', 'STORAGE_LOGS_CONTAINER={0}'.format(container_id))
task_cmd.add_argument('spark /bin/bash >> output.log 2>&1')
task_cmd.add_argument('-c "source ~/.bashrc; ' \
'export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; ' \
'cd \$AZ_BATCH_TASK_WORKING_DIR; ' \
'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"')
task_cmd = CommandBuilder("sudo docker exec")
task_cmd.add_argument("-i")
task_cmd.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
task_cmd.add_option("-e", "STORAGE_LOGS_CONTAINER={0}".format(container_id))
task_cmd.add_argument("spark /bin/bash >> output.log 2>&1")
task_cmd.add_argument(
r'-c "source ~/.bashrc; '
r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"')
# Create task
task = batch_models.TaskAddParameter(
@ -91,6 +97,7 @@ def generate_application_task(core_base_operations, container_id, application, r
constraints=batch_models.TaskConstraints(max_task_retry_count=application.max_retry_count),
user_identity=batch_models.UserIdentity(
auto_user=batch_models.AutoUserSpecification(
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)))
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)),
)
return task


@ -1,14 +1,9 @@
from typing import List
import azure.batch.models as batch_models
import azure.batch.models.batch_error as batch_error
from aztk import error
from aztk.internal.cluster_data import NodeData
from aztk.spark import models
from aztk.spark.utils import util
from aztk.utils import constants, helpers
from aztk.spark import models
POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity(
auto_user=batch_models.AutoUserSpecification(
@ -60,14 +55,13 @@ def __get_secrets_env(core_base_operations):
]
def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
plugins=None,
worker_on_master: bool = True,
file_mounts=None,
mixed_mode: bool = False):
def __cluster_install_cmd(
zip_resource_file: batch_models.ResourceFile,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
file_mounts=None,
):
"""
For Docker on ubuntu 16.04 - return the command line
to be run on the start task of the pool to setup spark.
@ -80,41 +74,42 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
if file_mounts:
for mount in file_mounts:
# Create the directory on the node
shares.append('mkdir -p {0}'.format(mount.mount_path))
shares.append("mkdir -p {0}".format(mount.mount_path))
# Mount the file share
shares.append(
'mount -t cifs //{0}.file.core.windows.net/{2} {3} -o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp'.
"mount -t cifs //{0}.file.core.windows.net/{2} {3} -o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp".
format(mount.storage_account_name, mount.storage_account_key, mount.file_share_path, mount.mount_path))
setup = [
'time('\
'apt-get -y update;'\
'apt-get -y --no-install-recommends install unzip;'\
'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\
'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\
') 2>&1'.format(zip_resource_file.file_path),
"time("
"apt-get -y update;"
"apt-get -y --no-install-recommends install unzip;"
"unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};"
"chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;"
") 2>&1".format(zip_resource_file.file_path),
'/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1} "{2}"'.format(
constants.DOCKER_SPARK_CONTAINER_NAME,
docker_repo,
"" if docker_run_options is None else docker_run_options.replace('"', '\\\"')
)
"" if docker_run_options is None else docker_run_options.replace('"', '\\"'),
),
]
commands = shares + setup
return commands
def generate_cluster_start_task(core_base_operations,
zip_resource_file: batch_models.ResourceFile,
cluster_id: str,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
file_shares: List[models.FileShare] = None,
plugins: List[models.PluginConfiguration] = None,
mixed_mode: bool = False,
worker_on_master: bool = True):
def generate_cluster_start_task(
core_base_operations,
zip_resource_file: batch_models.ResourceFile,
cluster_id: str,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
file_shares: List[models.FileShare] = None,
mixed_mode: bool = False,
worker_on_master: bool = True,
):
"""
This will return the start task object for the pool to be created.
:param cluster_id str: Id of the cluster(Used for uploading the resource files)
@ -130,22 +125,23 @@ def generate_cluster_start_task(core_base_operations,
spark_submit_logs_file = constants.SPARK_SUBMIT_LOGS_FILE
# TODO use certificate
environment_settings = __get_secrets_env(core_base_operations) + [
environment_settings = (__get_secrets_env(core_base_operations) + [
batch_models.EnvironmentSetting(name="SPARK_WEB_UI_PORT", value=spark_web_ui_port),
batch_models.EnvironmentSetting(name="SPARK_WORKER_UI_PORT", value=spark_worker_ui_port),
batch_models.EnvironmentSetting(name="SPARK_JOB_UI_PORT", value=spark_job_ui_port),
batch_models.EnvironmentSetting(name="SPARK_CONTAINER_NAME", value=spark_container_name),
batch_models.EnvironmentSetting(name="SPARK_SUBMIT_LOGS_FILE", value=spark_submit_logs_file),
batch_models.EnvironmentSetting(name="AZTK_GPU_ENABLED", value=helpers.bool_env(gpu_enabled)),
] + __get_docker_credentials(core_base_operations) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode)
] + __get_docker_credentials(core_base_operations) + _get_aztk_environment(cluster_id, worker_on_master,
mixed_mode))
# start task command
command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, docker_run_options, plugins,
worker_on_master, file_shares, mixed_mode)
command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, docker_run_options, file_shares)
return batch_models.StartTask(
command_line=helpers.wrap_commands_in_shell(command),
resource_files=resource_files,
environment_settings=environment_settings,
user_identity=POOL_ADMIN_USER_IDENTITY,
wait_for_success=True)
wait_for_success=True,
)
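
After the refactor the generator takes one argument per line, which keeps call sites readable; a hedged sketch of how it is driven (every value below is a placeholder, including the image name):

start_task = generate_cluster_start_task(
    core_base_operations,                    # exposes the batch/blob clients and secrets
    zip_resource_file=zip_resource_files,    # ResourceFile from upload_node_data(...).to_resource_file()
    cluster_id="my-cluster",
    gpu_enabled=False,
    docker_repo="aztk/spark:latest",         # placeholder image
    docker_run_options=None,
    file_shares=None,
    mixed_mode=False,
    worker_on_master=True,
)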


@ -2,7 +2,6 @@ from typing import List
import azure.batch.models as batch_models
from aztk.client.base import BaseOperations as CoreBaseOperations
from aztk.spark import models
from .helpers import generate_application_task, generate_cluster_start_task
@ -12,18 +11,19 @@ class SparkBaseOperations:
"""Spark Base operations object that all other Spark operations objects inherit from
"""
#TODO: make this private or otherwise not public
def _generate_cluster_start_task(self,
core_base_operations,
zip_resource_file: batch_models.ResourceFile,
id: str,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
file_shares: List[models.FileShare] = None,
plugins: List[models.PluginConfiguration] = None,
mixed_mode: bool = False,
worker_on_master: bool = True):
# TODO: make this private or otherwise not public
def _generate_cluster_start_task(
self,
core_base_operations,
zip_resource_file: batch_models.ResourceFile,
id: str,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
file_shares: List[models.FileShare] = None,
mixed_mode: bool = False,
worker_on_master: bool = True,
):
"""Generate the Azure Batch Start Task to provision a Spark cluster.
Args:
@ -35,10 +35,8 @@ class SparkBaseOperations:
If None, the default Docker image will be used. Defaults to None.
file_shares (:obj:`aztk.spark.models.FileShare`, optional): a list of FileShares to mount on the cluster.
Defaults to None.
plugins (:obj:`aztk.spark.models.PluginConfiguration`, optional): a list of plugins to set up on the cluster.
Defaults to None.
mixed_mode (:obj:`bool`, optional): If True, the cluster is configured to use both dedicated and low priority VMs.
Defaults to False.
mixed_mode (:obj:`bool`, optional): If True, the cluster is configured to use both dedicated
and low priority VMs. Defaults to False.
worker_on_master (:obj:`bool`, optional): If True, the cluster is configured to provision a Spark worker
on the VM that runs the Spark master. Defaults to True.
@ -46,10 +44,18 @@ class SparkBaseOperations:
:obj:`azure.batch.models.StartTask`: the StartTask definition to provision the cluster.
"""
return generate_cluster_start_task.generate_cluster_start_task(
core_base_operations, zip_resource_file, id, gpu_enabled, docker_repo, docker_run_options, file_shares,
plugins, mixed_mode, worker_on_master)
core_base_operations,
zip_resource_file,
id,
gpu_enabled,
docker_repo,
docker_run_options,
file_shares,
mixed_mode,
worker_on_master,
)
#TODO: make this private or otherwise not public
# TODO: make this private or otherwise not public
def _generate_application_task(self, core_base_operations, container_id, application, remote=False):
"""Generate the Azure Batch Start Task to provision a Spark cluster.


@ -2,21 +2,15 @@ from typing import List
import azure.batch.models.batch_error as batch_error
import aztk
from aztk import error
from aztk import models as base_models
from aztk.client import CoreClient
from aztk.internal.cluster_data import NodeData
from aztk.spark import models
from aztk.spark.client.cluster import ClusterOperations
from aztk.spark.client.job import JobOperations
from aztk.spark.helpers import cluster_diagnostic_helper
from aztk.spark.helpers import create_cluster as create_cluster_helper
from aztk.spark.helpers import get_log as get_log_helper
from aztk.spark.helpers import job_submission as job_submit_helper
from aztk.spark.helpers import submit as cluster_submit_helper
from aztk.spark.utils import util
from aztk.utils import azure_api, deprecated, deprecate, helpers
from aztk.utils import deprecate, deprecated, helpers
class Client(CoreClient):
@ -28,13 +22,14 @@ class Client(CoreClient):
"""
def __init__(self, secrets_configuration: models.SecretsConfiguration = None, **kwargs):
self.secrets_configuration = None
super().__init__()
context = None
if kwargs.get("secrets_config"):
deprecate(
version="0.10.0",
message="secrets_config key is deprecated in secrets.yaml",
advice="Please use secrets_configuration key instead.")
advice="Please use secrets_configuration key instead.",
)
context = self._get_context(kwargs.get("secrets_config"))
else:
context = self._get_context(secrets_configuration)
@ -133,36 +128,42 @@ class Client(CoreClient):
id=cluster_id, node_id=node_id, command=command, host=host, internal=internal, timeout=timeout)
@deprecated("0.10.0")
def cluster_copy(self,
cluster_id: str,
source_path: str,
destination_path: str,
host: bool = False,
internal: bool = False,
timeout: int = None):
def cluster_copy(
self,
cluster_id: str,
source_path: str,
destination_path: str,
host: bool = False,
internal: bool = False,
timeout: int = None,
):
return self.cluster.copy(
id=cluster_id,
source_path=source_path,
destination_path=destination_path,
host=host,
internal=internal,
timeout=timeout)
timeout=timeout,
)
@deprecated("0.10.0")
def cluster_download(self,
cluster_id: str,
source_path: str,
destination_path: str = None,
host: bool = False,
internal: bool = False,
timeout: int = None):
def cluster_download(
self,
cluster_id: str,
source_path: str,
destination_path: str = None,
host: bool = False,
internal: bool = False,
timeout: int = None,
):
return self.cluster.download(
id=cluster_id,
source_path=source_path,
destination_path=destination_path,
host=host,
internal=internal,
timeout=timeout)
timeout=timeout,
)
@deprecated("0.10.0")
def cluster_ssh_into_master(self,
@ -176,9 +177,9 @@ class Client(CoreClient):
return self.cluster._core_cluster_operations.ssh_into_node(cluster_id, node_id, username, ssh_key, password,
port_forward_list, internal)
'''
"""
job submission
'''
"""
@deprecated("0.10.0")
def submit_job(self, job_configuration: models.JobConfiguration):


@ -4,15 +4,17 @@ from aztk import error
from aztk.utils import helpers
def cluster_copy(core_cluster_operations,
cluster_id: str,
source_path: str,
destination_path: str,
host: bool = False,
internal: bool = False,
timeout: int = None):
def cluster_copy(
core_cluster_operations,
cluster_id: str,
source_path: str,
destination_path: str,
host: bool = False,
internal: bool = False,
timeout: int = None,
):
try:
container_name = None if host else 'spark'
container_name = None if host else "spark"
return core_cluster_operations.copy(
cluster_id,
source_path,
@ -20,6 +22,7 @@ def cluster_copy(core_cluster_operations,
container_name=container_name,
get=False,
internal=internal,
timeout=timeout)
timeout=timeout,
)
except batch_error.BatchErrorException as e:
raise error.AztkError(helpers.format_batch_exception(e))


@ -52,9 +52,16 @@ def create_cluster(core_cluster_operations,
zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()
start_task = spark_cluster_operations._generate_cluster_start_task(
core_cluster_operations, zip_resource_files, cluster_conf.cluster_id, cluster_conf.gpu_enabled(),
cluster_conf.get_docker_repo(), cluster_conf.get_docker_run_options(), cluster_conf.file_shares,
cluster_conf.plugins, cluster_conf.mixed_mode(), cluster_conf.worker_on_master)
core_cluster_operations,
zip_resource_files,
cluster_conf.cluster_id,
cluster_conf.gpu_enabled(),
cluster_conf.get_docker_repo(),
cluster_conf.get_docker_run_options(),
cluster_conf.file_shares,
cluster_conf.mixed_mode(),
cluster_conf.worker_on_master,
)
software_metadata_key = base_models.Software.spark


@ -4,12 +4,14 @@ from aztk import error
from aztk.utils import helpers
def create_user(core_cluster_operations,
spark_cluster_operations,
cluster_id: str,
username: str,
password: str = None,
ssh_key: str = None) -> str:
def create_user(
core_cluster_operations,
spark_cluster_operations,
cluster_id: str,
username: str,
password: str = None,
ssh_key: str = None,
) -> str:
try:
cluster = spark_cluster_operations.get(cluster_id)
master_node_id = cluster.master_node_id


@ -6,18 +6,13 @@ from aztk import error
from aztk.utils import helpers
def _write_error(stream, node_output):
stream.write(node_output.error)
def _write_output(stream, node_output):
stream.write(node_output.output)
def _run(spark_cluster_operations, cluster_id, output_directory=None, brief=False):
# copy debug program to each node
output = spark_cluster_operations.copy(
copy_output = spark_cluster_operations.copy(
cluster_id, os.path.abspath("./aztk/spark/utils/debug.py"), "/tmp/debug.py", host=True)
for node_output in copy_output:
if node_output.error:
raise error.AztkError("Failed to copy diagnostic script to cluster.")
ssh_cmd = _build_diagnostic_ssh_command(brief)
run_output = spark_cluster_operations.run(cluster_id, ssh_cmd, host=True)
remote_path = "/tmp/debug.zip"
@ -27,9 +22,9 @@ def _run(spark_cluster_operations, cluster_id, output_directory=None, brief=Fals
result = spark_cluster_operations.download(cluster_id, remote_path, local_path, host=True)
# write run output or error to debug/ directory
with open(os.path.join(output_directory, "debug-output.txt"), 'w', encoding="UTF-8") as stream:
with open(os.path.join(output_directory, "debug-output.txt"), "w", encoding="UTF-8") as stream:
for node_output in run_output:
_write_error(stream, node_output) if node_output.error else _write_output(stream, node_output)
stream.write(node_output.error) if node_output.error else stream.write(node_output.output)
else:
result = spark_cluster_operations.download(cluster_id, remote_path, host=True)
@ -37,11 +32,11 @@ def _run(spark_cluster_operations, cluster_id, output_directory=None, brief=Fals
def _build_diagnostic_ssh_command(brief):
return "sudo rm -rf /tmp/debug.zip; "\
"sudo apt-get install -y python3-pip; "\
"sudo -H pip3 install --upgrade pip; "\
"sudo -H pip3 install docker; "\
"sudo python3 /tmp/debug.py {}".format(brief)
return ("sudo rm -rf /tmp/debug.zip; "
"sudo apt-get install -y python3-pip; "
"sudo -H pip3 install --upgrade pip; "
"sudo -H pip3 install docker; "
"sudo python3 /tmp/debug.py {}".format(brief))
def run_cluster_diagnostics(spark_cluster_operations, cluster_id, output_directory=None, brief=False):


@ -4,15 +4,17 @@ from aztk import error
from aztk.utils import helpers
def cluster_download(core_cluster_operations,
cluster_id: str,
source_path: str,
destination_path: str = None,
host: bool = False,
internal: bool = False,
timeout: int = None):
def cluster_download(
core_cluster_operations,
cluster_id: str,
source_path: str,
destination_path: str = None,
host: bool = False,
internal: bool = False,
timeout: int = None,
):
try:
container_name = None if host else 'spark'
container_name = None if host else "spark"
return core_cluster_operations.copy(
cluster_id,
source_path,
@ -20,6 +22,7 @@ def cluster_download(core_cluster_operations,
container_name=container_name,
get=True,
internal=internal,
timeout=timeout)
timeout=timeout,
)
except batch_error.BatchErrorException as e:
raise error.AztkError(helpers.format_batch_exception(e))


@ -7,6 +7,6 @@ from aztk.utils import helpers
def get_application_status(core_cluster_operations, cluster_id: str, app_name: str):
try:
task = core_cluster_operations.batch_client.task.get(cluster_id, app_name)
return task.state._value_
return task.state.name
except batch_error.BatchErrorException as e:
raise error.AztkError(helpers.format_batch_exception(e))
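
Switching from the protected `_value_` attribute to `.name` returns the same strings for the task states used here, since `TaskState` is a plain enum whose member names match their values; for instance:

import azure.batch.models as batch_models

assert batch_models.TaskState.completed.name == "completed"
assert batch_models.TaskState.running.name == "running"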


@ -4,15 +4,17 @@ from aztk import error
from aztk.utils import helpers
def node_run(core_cluster_operations,
cluster_id: str,
node_id: str,
command: str,
host=False,
internal: bool = False,
timeout=None):
def node_run(
core_cluster_operations,
cluster_id: str,
node_id: str,
command: str,
host=False,
internal: bool = False,
timeout=None,
):
try:
return core_cluster_operations.node_run(
cluster_id, node_id, command, internal, container_name='spark' if not host else None, timeout=timeout)
cluster_id, node_id, command, internal, container_name="spark" if not host else None, timeout=timeout)
except batch_error.BatchErrorException as e:
raise error.AztkError(helpers.format_batch_exception(e))


@ -12,6 +12,6 @@ def cluster_run(core_cluster_operations,
timeout=None):
try:
return core_cluster_operations.run(
cluster_id, command, internal, container_name='spark' if not host else None, timeout=timeout)
cluster_id, command, internal, container_name="spark" if not host else None, timeout=timeout)
except batch_error.BatchErrorException as e:
raise error.AztkError(helpers.format_batch_exception(e))


@ -4,16 +4,19 @@ from aztk import error
from aztk.utils import helpers
def cluster_ssh_into_master(spark_cluster_operations,
cluster_id,
node_id,
username,
ssh_key=None,
password=None,
port_forward_list=None,
internal=False):
def ssh_into_master(
spark_cluster_operations,
core_cluster_operations,
cluster_id,
username,
ssh_key=None,
password=None,
port_forward_list=None,
internal=False,
):
try:
spark_cluster_operations.ssh_into_node(cluster_id, node_id, username, ssh_key, password, port_forward_list,
internal)
master_node_id = spark_cluster_operations.get(cluster_id).master_node_id
core_cluster_operations.ssh_into_node(cluster_id, master_node_id, username, ssh_key, password,
port_forward_list, internal)
except batch_error.BatchErrorException as e:
raise error.AztkError(helpers.format_batch_exception(e))


@ -42,12 +42,14 @@ def submit_application(core_cluster_operations,
job_id=job_id, task_id=task.id, batch_client=core_cluster_operations.batch_client)
def submit(core_cluster_operations,
spark_cluster_operations,
cluster_id: str,
application: models.ApplicationConfiguration,
remote: bool = False,
wait: bool = False):
def submit(
core_cluster_operations,
spark_cluster_operations,
cluster_id: str,
application: models.ApplicationConfiguration,
remote: bool = False,
wait: bool = False,
):
try:
submit_application(core_cluster_operations, spark_cluster_operations, cluster_id, application, remote, wait)
except batch_error.BatchErrorException as e:


@ -2,9 +2,25 @@ from aztk.client.cluster import CoreClusterOperations
from aztk.spark import models
from aztk.spark.client.base import SparkBaseOperations
from .helpers import (copy, create, create_user, delete, diagnostics, download, get, get_application_log,
get_application_status, get_configuration, get_remote_login_settings, list, node_run, run, submit,
wait)
from .helpers import (
copy,
create,
create_user,
delete,
diagnostics,
download,
get,
get_application_log,
get_application_status,
get_configuration,
get_remote_login_settings,
list,
node_run,
run,
ssh_into_master,
submit,
wait,
)
class ClusterOperations(SparkBaseOperations):
@ -58,7 +74,8 @@ class ClusterOperations(SparkBaseOperations):
"""List all clusters.
Returns:
:obj:`List[aztk.spark.models.Cluster]`: List of Cluster objects each representing the state and configuration of the cluster.
:obj:`List[aztk.spark.models.Cluster]`: List of Cluster objects each representing the state
and configuration of the cluster.
"""
return list.list_clusters(self._core_cluster_operations)
@ -71,7 +88,8 @@ class ClusterOperations(SparkBaseOperations):
remote (:obj:`bool`): If True, the application file will not be uploaded, it is assumed to be reachable
by the cluster already. This is useful when your application is stored in a mounted Azure File Share
and not the client. Defaults to False.
wait (:obj:`bool`, optional): If True, this function blocks until the application has completed. Defaults to False.
wait (:obj:`bool`, optional): If True, this function blocks until the application has completed.
Defaults to False.
Returns:
:obj:`None`
@ -84,7 +102,8 @@ class ClusterOperations(SparkBaseOperations):
Args:
username (:obj:`str`): name of the user to create.
pool_id (:obj:`str`): id of the cluster to create the user on.
ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None.
ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password.
Defaults to None.
password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None.
Returns:
@ -118,7 +137,8 @@ class ClusterOperations(SparkBaseOperations):
Defaults to None.
Returns:
:obj:`List[aztk.spark.models.NodeOutput]`: list of NodeOutput objects containing the output of the run command
:obj:`List[aztk.spark.models.NodeOutput]`:
list of NodeOutput objects containing the output of the run command
"""
return run.cluster_run(self._core_cluster_operations, id, command, host, internal, timeout)
@ -141,13 +161,15 @@ class ClusterOperations(SparkBaseOperations):
"""
return node_run.node_run(self._core_cluster_operations, id, node_id, command, host, internal, timeout)
def copy(self,
id: str,
source_path: str,
destination_path: str,
host: bool = False,
internal: bool = False,
timeout: int = None):
def copy(
self,
id: str,
source_path: str,
destination_path: str,
host: bool = False,
internal: bool = False,
timeout: int = None,
):
"""Copy a file to every node in a cluster.
Args:
@ -162,18 +184,21 @@ class ClusterOperations(SparkBaseOperations):
Defaults to None.
Returns:
:obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command.
:obj:`List[aztk.spark.models.NodeOutput]`:
A list of NodeOutput objects representing the output of the copy command.
"""
return copy.cluster_copy(self._core_cluster_operations, id, source_path, destination_path, host, internal,
timeout)
def download(self,
id: str,
source_path: str,
destination_path: str = None,
host: bool = False,
internal: bool = False,
timeout: int = None):
def download(
self,
id: str,
source_path: str,
destination_path: str = None,
host: bool = False,
internal: bool = False,
timeout: int = None,
):
"""Download a file from every node in a cluster.
Args:
@ -190,7 +215,8 @@ class ClusterOperations(SparkBaseOperations):
Defaults to None.
Returns:
:obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command.
:obj:`List[aztk.spark.models.NodeOutput]`:
A list of NodeOutput objects representing the output of the copy command.
"""
return download.cluster_download(self._core_cluster_operations, id, source_path, destination_path, host,
internal, timeout)
@ -205,7 +231,8 @@ class ClusterOperations(SparkBaseOperations):
written to this path. Defaults to None.
Returns:
:obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command.
:obj:`List[aztk.spark.models.NodeOutput]`:
A list of NodeOutput objects representing the output of the copy command.
"""
return diagnostics.run_cluster_diagnostics(self, id, output_directory, brief)
@ -215,10 +242,11 @@ class ClusterOperations(SparkBaseOperations):
Args:
id (:obj:`str`): the id of the cluster to run the command on.
application_name (:obj:`str`): the name of the application to retrieve the log of.
tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes. Otherwise, the whole log will be retrieved.
Only use this if streaming the log as it is being written. Defaults to False.
current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes are retrieved.
Only useful is streaming the log as it is being written. Only used if tail is True.
tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes.
Otherwise, the whole log will be retrieved. Only use this if streaming the log as it is being written.
Defaults to False.
current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes are
retrieved. Only useful if streaming the log as it is being written. Only used if tail is True.
Returns:
:obj:`aztk.spark.models.ApplicationLog`: a model representing the output of the application.
@ -234,7 +262,8 @@ class ClusterOperations(SparkBaseOperations):
node_id (:obj:`str`): the id of the node in the cluster
Returns:
:obj:`aztk.spark.models.RemoteLogin`: Object that contains the ip address and port combination to login to a node
:obj:`aztk.spark.models.RemoteLogin`:
Object that contains the ip address and port combination to login to a node
"""
return get_remote_login_settings.get_remote_login_settings(self._core_cluster_operations, id, node_id)
@ -260,3 +289,21 @@ class ClusterOperations(SparkBaseOperations):
:obj:`aztk.spark.models.ClusterConfiguration`
"""
return get_configuration.get_configuration(self._core_cluster_operations, id)
def ssh_into_master(self, id, username, ssh_key=None, password=None, port_forward_list=None, internal=False):
"""Open an SSH tunnel to the Spark master node and forward the specified ports
Args:
id (:obj:`str`): the id of the cluster
username (:obj:`str`): the name of the user to open the ssh session with
ssh_key (:obj:`str`, optional): the ssh_key to authenticate the ssh user with.
Must specify either `ssh_key` or `password`.
password (:obj:`str`, optional): the password to authenticate the ssh user with.
Must specify either `password` or `ssh_key`.
port_forward_list (:obj:`aztk.spark.models.PortForwardingSpecification`, optional):
List of the ports to forward.
internal (:obj:`bool`, optional): if True, this will connect to the node using its internal IP.
Only use this if running within the same VNET as the cluster. Defaults to False.
"""
return ssh_into_master.ssh_into_master(self, self._core_cluster_operations, id, username, ssh_key, password,
port_forward_list, internal)
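
A possible call from client code, assuming the documented `PortForwardingSpecification` takes `remote_port` and `local_port` fields (the model itself is not shown in this diff):

from aztk.spark import Client, models

client = Client(secrets_configuration)    # secrets_configuration assembled elsewhere
client.cluster.ssh_into_master(
    id="my-cluster",
    username="spark",
    password="<password>",                # or pass ssh_key= instead
    port_forward_list=[models.PortForwardingSpecification(remote_port=8080, local_port=8080)],
)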


@ -2,7 +2,6 @@ import azure.batch.models as batch_models
import azure.batch.models.batch_error as batch_error
from aztk import error
from aztk.spark import models
from aztk.utils import helpers
from .get_recent_job import get_recent_job


@ -5,7 +5,6 @@ from aztk import error
from aztk.spark import models
from aztk.utils import helpers
from .list_applications import list_applications
from .get_recent_job import get_recent_job
@ -25,8 +24,11 @@ def _get_application_log(core_job_operations, spark_job_operations, job_id, appl
raise error.AztkError("The application {0} has not yet been created.".format(application))
raise error.AztkError("The application {0} does not exist".format(application_name))
else:
if task.state in (batch_models.TaskState.active, batch_models.TaskState.running,
batch_models.TaskState.preparing):
if task.state in (
batch_models.TaskState.active,
batch_models.TaskState.running,
batch_models.TaskState.preparing,
):
raise error.AztkError("The application {0} has not yet finished executing.".format(application_name))
return core_job_operations.get_application_log(job_id, application_name)


@ -13,7 +13,7 @@ def _list_applications(core_job_operations, job_id):
applications = {}
for metadata_item in recent_run_job.metadata:
if metadata_item.name == "applications":
for app_name in metadata_item.value.split('\n'):
for app_name in metadata_item.value.split("\n"):
applications[app_name] = None
# get tasks from Batch job


@ -1,7 +1,6 @@
import azure.batch.models.batch_error as batch_error
from aztk import error
from aztk.spark import models
from aztk.utils import helpers
from .get_recent_job import get_recent_job


@ -1,8 +1,5 @@
import azure.batch.models.batch_error as batch_error
from aztk import error
from aztk.spark import models
from aztk.utils import helpers
from .get_recent_job import get_recent_job


@ -15,11 +15,12 @@ def __app_cmd():
docker_exec.add_argument("-i")
docker_exec.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID")
docker_exec.add_argument("spark /bin/bash >> output.log 2>&1 -c \"" \
"source ~/.bashrc; " \
"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; " \
"cd \$AZ_BATCH_TASK_WORKING_DIR; " \
"\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py\"")
docker_exec.add_argument(
r'spark /bin/bash >> output.log 2>&1 -c "'
r"source ~/.bashrc; "
r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py"')
return docker_exec.to_str()
@ -28,10 +29,11 @@ def generate_job_manager_task(core_job_operations, job, application_tasks):
for application, task in application_tasks:
task_definition_resource_file = helpers.upload_text_to_container(
container_name=job.id,
application_name=application.name + '.yaml',
file_path=application.name + '.yaml',
application_name=application.name + ".yaml",
file_path=application.name + ".yaml",
content=yaml.dump(task),
blob_client=core_job_operations.blob_client)
blob_client=core_job_operations.blob_client,
)
resource_files.append(task_definition_resource_file)
task_cmd = __app_cmd()
@ -45,7 +47,8 @@ def generate_job_manager_task(core_job_operations, job, application_tasks):
allow_low_priority_node=True,
user_identity=batch_models.UserIdentity(
auto_user=batch_models.AutoUserSpecification(
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)))
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)),
)
return task
@ -83,24 +86,24 @@ def submit_job(core_job_operations,
job_configuration.get_docker_repo(),
job_configuration.get_docker_run_options(),
mixed_mode=job_configuration.mixed_mode(),
worker_on_master=job_configuration.worker_on_master)
worker_on_master=job_configuration.worker_on_master,
)
application_tasks = []
for application in job_configuration.applications:
application_tasks.append((application,
spark_job_operations._generate_application_task(
core_job_operations, job_configuration.id, application)))
application_tasks.append((
application,
spark_job_operations._generate_application_task(core_job_operations, job_configuration.id, application),
))
job_manager_task = generate_job_manager_task(core_job_operations, job_configuration, application_tasks)
software_metadata_key = base_models.Software.spark
vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04')
vm_image = models.VmImage(publisher="Canonical", offer="UbuntuServer", sku="16.04")
autoscale_formula = "$TargetDedicatedNodes = {0}; " \
"$TargetLowPriorityNodes = {1}".format(
job_configuration.max_dedicated_nodes,
job_configuration.max_low_pri_nodes)
autoscale_formula = "$TargetDedicatedNodes = {0}; " "$TargetLowPriorityNodes = {1}".format(
job_configuration.max_dedicated_nodes, job_configuration.max_low_pri_nodes)
job = core_job_operations.submit(
job_configuration=job_configuration,
@ -109,7 +112,8 @@ def submit_job(core_job_operations,
autoscale_formula=autoscale_formula,
software_metadata_key=software_metadata_key,
vm_image_model=vm_image,
application_metadata='\n'.join(application.name for application in (job_configuration.applications or [])))
application_metadata="\n".join(application.name for application in (job_configuration.applications or [])),
)
if wait:
spark_job_operations.wait(id=job_configuration.id)
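
For reference, the joined literal above renders into a single Batch autoscale expression; with two dedicated and three low-priority nodes it evaluates to:

autoscale_formula = "$TargetDedicatedNodes = {0}; " "$TargetLowPriorityNodes = {1}".format(2, 3)
# -> "$TargetDedicatedNodes = 2; $TargetLowPriorityNodes = 3"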


@ -2,8 +2,18 @@ from aztk.client.job import CoreJobOperations
from aztk.spark import models
from aztk.spark.client.base import SparkBaseOperations
from .helpers import (delete, get, get_application, get_application_log, list, list_applications, stop,
stop_application, submit, wait_until_complete)
from .helpers import (
delete,
get,
get_application,
get_application_log,
list,
list_applications,
stop,
stop_application,
submit,
wait_until_complete,
)
class JobOperations(SparkBaseOperations):


@ -1,8 +1,4 @@
import os
from aztk.utils import ssh
from aztk.utils.command_builder import CommandBuilder
from aztk import models as aztk_models
import azure.batch.models as batch_models
def run(spark_client, cluster_id, output_directory=None):
@ -17,8 +13,8 @@ def run(spark_client, cluster_id, output_directory=None):
output = spark_client.cluster_download(cluster_id, remote_path, local_path, host=True)
# write run output to debug/ directory
with open(os.path.join(os.path.dirname(local_path), "debug-output.txt"), 'w', encoding="UTF-8") as f:
[f.write(line + '\n') for node_output in run_output for line in node_output.output]
with open(os.path.join(os.path.dirname(local_path), "debug-output.txt"), "w", encoding="UTF-8") as f:
[f.write(line + "\n") for node_output in run_output for line in node_output.output]
else:
output = spark_client.cluster_download(cluster_id, remote_path, host=True)
@ -26,8 +22,4 @@ def run(spark_client, cluster_id, output_directory=None):
def _build_diagnostic_ssh_command():
return "sudo rm -rf /tmp/debug.zip; "\
"sudo apt-get install -y python3-pip; "\
"sudo -H pip3 install --upgrade pip; "\
"sudo -H pip3 install docker; "\
"sudo python3 /tmp/debug.py"
return "sudo rm -rf /tmp/debug.zip; " "sudo apt-get install -y python3-pip; " "sudo -H pip3 install --upgrade pip; " "sudo -H pip3 install docker; " "sudo python3 /tmp/debug.py"

View file

@ -1,9 +1,7 @@
from typing import List
from aztk.utils.command_builder import CommandBuilder
from aztk.utils import helpers
from aztk.utils import constants
from aztk import models as aztk_models
from aztk.spark.models import ClusterConfiguration
import azure.batch.models as batch_models
POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity(
@ -56,14 +54,16 @@ def __get_secrets_env(spark_client):
]
def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
plugins=None,
worker_on_master: bool = True,
file_mounts=None,
mixed_mode: bool = False):
def __cluster_install_cmd(
zip_resource_file: batch_models.ResourceFile,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
plugins=None,
worker_on_master: bool = True,
file_mounts=None,
mixed_mode: bool = False,
):
"""
For Docker on ubuntu 16.04 - return the command line
to be run on the start task of the pool to setup spark.
@ -77,41 +77,41 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
if file_mounts:
for mount in file_mounts:
# Create the directory on the node
shares.append('mkdir -p {0}'.format(mount.mount_path))
shares.append("mkdir -p {0}".format(mount.mount_path))
# Mount the file share
shares.append(
'mount -t cifs //{0}.file.core.windows.net/{2} {3} -o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp'.
format(mount.storage_account_name, mount.storage_account_key, mount.file_share_path, mount.mount_path))
shares.append("mount -t cifs //{0}.file.core.windows.net/{2} {3} "
"-o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp".format(
mount.storage_account_name, mount.storage_account_key, mount.file_share_path,
mount.mount_path))
setup = [
'time('\
'apt-get -y update;'\
'apt-get -y --no-install-recommends install unzip;'\
'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\
'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\
') 2>&1'.format(zip_resource_file.file_path),
"time("
"apt-get -y update;"
"apt-get -y --no-install-recommends install unzip;"
"unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};"
"chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;"
") 2>&1".format(zip_resource_file.file_path),
'/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1} "{2}"'.format(
constants.DOCKER_SPARK_CONTAINER_NAME,
docker_repo,
docker_run_options.replace('"', '\\\"')
)
constants.DOCKER_SPARK_CONTAINER_NAME, docker_repo, docker_run_options.replace('"', '\\"')),
]
commands = shares + setup
return commands
def generate_cluster_start_task(spark_client,
zip_resource_file: batch_models.ResourceFile,
cluster_id: str,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
file_shares: List[aztk_models.FileShare] = None,
plugins: List[aztk_models.PluginConfiguration] = None,
mixed_mode: bool = False,
worker_on_master: bool = True):
def generate_cluster_start_task(
spark_client,
zip_resource_file: batch_models.ResourceFile,
cluster_id: str,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
file_shares: List[aztk_models.FileShare] = None,
plugins: List[aztk_models.PluginConfiguration] = None,
mixed_mode: bool = False,
worker_on_master: bool = True,
):
"""
This will return the start task object for the pool to be created.
:param cluster_id str: Id of the cluster(Used for uploading the resource files)
@ -127,22 +127,31 @@ def generate_cluster_start_task(spark_client,
spark_submit_logs_file = constants.SPARK_SUBMIT_LOGS_FILE
# TODO use certificate
environment_settings = __get_secrets_env(spark_client) + [
environment_settings = (__get_secrets_env(spark_client) + [
batch_models.EnvironmentSetting(name="SPARK_WEB_UI_PORT", value=spark_web_ui_port),
batch_models.EnvironmentSetting(name="SPARK_WORKER_UI_PORT", value=spark_worker_ui_port),
batch_models.EnvironmentSetting(name="SPARK_JOB_UI_PORT", value=spark_job_ui_port),
batch_models.EnvironmentSetting(name="SPARK_CONTAINER_NAME", value=spark_container_name),
batch_models.EnvironmentSetting(name="SPARK_SUBMIT_LOGS_FILE", value=spark_submit_logs_file),
batch_models.EnvironmentSetting(name="AZTK_GPU_ENABLED", value=helpers.bool_env(gpu_enabled)),
] + __get_docker_credentials(spark_client) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode)
] + __get_docker_credentials(spark_client) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode))
# start task command
command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, docker_run_options, plugins,
worker_on_master, file_shares, mixed_mode)
command = __cluster_install_cmd(
zip_resource_file,
gpu_enabled,
docker_repo,
docker_run_options,
plugins,
worker_on_master,
file_shares,
mixed_mode,
)
return batch_models.StartTask(
command_line=helpers.wrap_commands_in_shell(command),
resource_files=resource_files,
environment_settings=environment_settings,
user_identity=POOL_ADMIN_USER_IDENTITY,
wait_for_success=True)
wait_for_success=True,
)
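
The cifs mount command above is easier to read once rendered; the values below are made-up placeholders, only the format string comes from the code in this hunk:

# Hypothetical account, key, share and mount path, for illustration only.
mount_cmd = ("mount -t cifs //{0}.file.core.windows.net/{2} {3} "
             "-o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp".format(
                 "mystorageacct", "<storage-account-key>", "myshare", "/mnt/myshare"))
print(mount_cmd)
# mount -t cifs //mystorageacct.file.core.windows.net/myshare /mnt/myshare -o vers=3.0,username=mystorageacct,...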

View file

@ -9,8 +9,7 @@ from aztk import models as base_models
from aztk.spark import models
from aztk.utils import constants, helpers
output_file = constants.TASK_WORKING_DIR + \
"/" + constants.SPARK_SUBMIT_LOGS_FILE
output_file = constants.TASK_WORKING_DIR + "/" + constants.SPARK_SUBMIT_LOGS_FILE
def __check_task_node_exist(batch_client, cluster_id: str, task: batch_models.CloudTask) -> bool:
@ -51,16 +50,17 @@ def __get_output_file_properties(batch_client, cluster_id: str, application_name
def get_log_from_storage(blob_client, container_name, application_name, task):
try:
blob = blob_client.get_blob_to_text(container_name, application_name + '/' + constants.SPARK_SUBMIT_LOGS_FILE)
blob = blob_client.get_blob_to_text(container_name, application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE)
except azure.common.AzureMissingResourceHttpError:
raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.")
base_model = base_models.ApplicationLog(
name=application_name,
cluster_id=container_name,
application_state=task.state._value_,
application_state=task.state.name,
log=blob.content,
total_bytes=blob.properties.content_length,
exit_code=task.execution_info.exit_code)
exit_code=task.execution_info.exit_code,
)
return models.ApplicationLog(base_model)
@ -88,17 +88,19 @@ def get_log(batch_client, blob_client, cluster_id: str, application_name: str, t
base_model = base_models.ApplicationLog(
name=application_name,
cluster_id=cluster_id,
application_state=task.state._value_,
application_state=task.state.name,
log=content,
total_bytes=target_bytes,
exit_code=task.execution_info.exit_code)
exit_code=task.execution_info.exit_code,
)
return models.ApplicationLog(base_model)
else:
base_model = base_models.ApplicationLog(
name=application_name,
cluster_id=cluster_id,
application_state=task.state._value_,
log='',
application_state=task.state.name,
log="",
total_bytes=target_bytes,
exit_code=task.execution_info.exit_code)
exit_code=task.execution_info.exit_code,
)
return models.ApplicationLog(base_model)
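
This hunk (and several later ones) replaces task.state._value_ with task.state.name, trading a protected Enum attribute for the public API; for the Batch state enums the two usually yield the same string. A stand-in enum, not the real azure.batch model, illustrates the point:

from enum import Enum

class FakeTaskState(Enum):    # illustrative stand-in only
    active = "active"
    completed = "completed"

state = FakeTaskState.active
print(state.name)     # "active" -- public attribute, what the new code uses
print(state.value)    # "active" -- public accessor backed by the internal _value_ slot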

View file

@ -1,17 +1,11 @@
import datetime
import os
import time
from typing import List
import azure.batch.models as batch_models
import yaml
import aztk.error as error
from aztk.utils import constants, helpers
from aztk.utils import helpers
from aztk.utils.command_builder import CommandBuilder
'''
Job Submission helper methods
'''
def __app_cmd():
@ -19,11 +13,12 @@ def __app_cmd():
docker_exec.add_argument("-i")
docker_exec.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID")
docker_exec.add_argument("spark /bin/bash >> output.log 2>&1 -c \"" \
"source ~/.bashrc; " \
"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; " \
"cd \$AZ_BATCH_TASK_WORKING_DIR; " \
"\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py\"")
docker_exec.add_argument(
r'spark /bin/bash >> output.log 2>&1 -c "'
r"source ~/.bashrc; "
r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py"')
return docker_exec.to_str()
@ -32,10 +27,11 @@ def generate_task(spark_client, job, application_tasks):
for application, task in application_tasks:
task_definition_resource_file = helpers.upload_text_to_container(
container_name=job.id,
application_name=application.name + '.yaml',
file_path=application.name + '.yaml',
application_name=application.name + ".yaml",
file_path=application.name + ".yaml",
content=yaml.dump(task),
blob_client=spark_client.blob_client)
blob_client=spark_client.blob_client,
)
resource_files.append(task_definition_resource_file)
task_cmd = __app_cmd()
@ -49,7 +45,8 @@ def generate_task(spark_client, job, application_tasks):
allow_low_priority_node=True,
user_identity=batch_models.UserIdentity(
auto_user=batch_models.AutoUserSpecification(
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)))
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)),
)
return task
@ -69,7 +66,7 @@ def list_applications(spark_client, job_id):
applications = {}
for metadata_item in recent_run_job.metadata:
if metadata_item.name == "applications":
for app_name in metadata_item.value.split('\n'):
for app_name in metadata_item.value.split("\n"):
applications[app_name] = None
# get tasks from Batch job
@ -177,8 +174,11 @@ def get_application_log(spark_client, job_id, application_name):
raise error.AztkError("The application {0} has not yet been created.".format(application))
raise error.AztkError("The application {0} does not exist".format(application_name))
else:
if task.state in (batch_models.TaskState.active, batch_models.TaskState.running,
batch_models.TaskState.preparing):
if task.state in (
batch_models.TaskState.active,
batch_models.TaskState.running,
batch_models.TaskState.preparing,
):
raise error.AztkError("The application {0} has not yet finished executing.".format(application_name))
return spark_client.get_application_log(job_id, application_name)
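
The __app_cmd helper above leans on CommandBuilder; a reduced sketch, assuming the builder simply joins the base command with the added options and arguments in the order given, shows the shape of the command it produces (the payload here is simplified):

from aztk.utils.command_builder import CommandBuilder

docker_exec = CommandBuilder("sudo docker exec")
docker_exec.add_argument("-i")
docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID")
docker_exec.add_argument('spark /bin/bash -c "echo hello"')    # simplified payload
print(docker_exec.to_str())
# roughly: sudo docker exec -i -e AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID spark /bin/bash -c "echo hello"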

View file

@ -1,14 +1,11 @@
import datetime
import os
from typing import List
import yaml
import azure.batch.models as batch_models
import yaml
from aztk.error import AztkError
from aztk.utils import constants, helpers
from aztk.utils import helpers
from aztk.utils.command_builder import CommandBuilder
'''
Submit helper methods
'''
def __get_node(spark_client, node_id: str, cluster_id: str) -> batch_models.ComputeNode:
@ -25,12 +22,13 @@ def generate_task(spark_client, container_id, application, remote=False):
application_name=application.name,
file_path=application.application,
blob_client=spark_client.blob_client,
use_full_path=False)
use_full_path=False,
)
# Upload application file
resource_files.append(app_resource_file)
application.application = '$AZ_BATCH_TASK_WORKING_DIR/' + os.path.basename(application.application)
application.application = "$AZ_BATCH_TASK_WORKING_DIR/" + os.path.basename(application.application)
# Upload dependent JARS
jar_resource_file_paths = []
@ -40,7 +38,8 @@ def generate_task(spark_client, container_id, application, remote=False):
application_name=application.name,
file_path=jar,
blob_client=spark_client.blob_client,
use_full_path=False)
use_full_path=False,
)
jar_resource_file_paths.append(current_jar_resource_file_path)
resource_files.append(current_jar_resource_file_path)
@ -52,7 +51,8 @@ def generate_task(spark_client, container_id, application, remote=False):
application_name=application.name,
file_path=py_file,
blob_client=spark_client.blob_client,
use_full_path=False)
use_full_path=False,
)
py_files_resource_file_paths.append(current_py_files_resource_file_path)
resource_files.append(current_py_files_resource_file_path)
@ -64,7 +64,8 @@ def generate_task(spark_client, container_id, application, remote=False):
application_name=application.name,
file_path=file,
blob_client=spark_client.blob_client,
use_full_path=False)
use_full_path=False,
)
files_resource_file_paths.append(files_resource_file_path)
resource_files.append(files_resource_file_path)
@ -75,21 +76,23 @@ def generate_task(spark_client, container_id, application, remote=False):
application_definition_file = helpers.upload_text_to_container(
container_name=container_id,
application_name=application.name,
file_path='application.yaml',
file_path="application.yaml",
content=yaml.dump(vars(application)),
blob_client=spark_client.blob_client)
blob_client=spark_client.blob_client,
)
resource_files.append(application_definition_file)
# create command to submit task
task_cmd = CommandBuilder('sudo docker exec')
task_cmd.add_argument('-i')
task_cmd.add_option('-e', 'AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR')
task_cmd.add_option('-e', 'STORAGE_LOGS_CONTAINER={0}'.format(container_id))
task_cmd.add_argument('spark /bin/bash >> output.log 2>&1')
task_cmd.add_argument('-c "source ~/.bashrc; ' \
'export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; ' \
'cd \$AZ_BATCH_TASK_WORKING_DIR; ' \
'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"')
task_cmd = CommandBuilder("sudo docker exec")
task_cmd.add_argument("-i")
task_cmd.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
task_cmd.add_option("-e", "STORAGE_LOGS_CONTAINER={0}".format(container_id))
task_cmd.add_argument("spark /bin/bash >> output.log 2>&1")
task_cmd.add_argument(
r'-c "source ~/.bashrc; '
r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"')
# Create task
task = batch_models.TaskAddParameter(
@ -99,7 +102,8 @@ def generate_task(spark_client, container_id, application, remote=False):
constraints=batch_models.TaskConstraints(max_task_retry_count=application.max_retry_count),
user_identity=batch_models.UserIdentity(
auto_user=batch_models.AutoUserSpecification(
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)))
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)),
)
return task
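
The -c "..." argument above moves from backslash-escaped plain strings to raw strings; both spell the same characters, since \$ is not a recognized escape sequence, but the raw form makes the intent explicit and avoids linter warnings about invalid escapes:

plain = "cd \$AZ_BATCH_TASK_WORKING_DIR"
raw = r"cd \$AZ_BATCH_TASK_WORKING_DIR"
assert plain == raw    # same characters: the backslash survives in both
print(raw)             # cd \$AZ_BATCH_TASK_WORKING_DIR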

View file

@ -12,11 +12,7 @@ from aztk.utils import constants, helpers
class SparkToolkit(aztk.models.Toolkit):
def __init__(self, version: str, environment: str = None, environment_version: str = None):
super().__init__(
software="spark",
version=version,
environment=environment,
environment_version=environment_version,
)
software="spark", version=version, environment=environment, environment_version=environment_version)
class Cluster(aztk.models.Cluster):
@ -74,9 +70,9 @@ class SparkConfiguration(Model):
def __generate_ssh_key_pair(self):
key = RSA.generate(2048)
priv_key = key.exportKey('PEM')
pub_key = key.publickey().exportKey('OpenSSH')
return {'pub_key': pub_key, 'priv_key': priv_key}
priv_key = key.exportKey("PEM")
pub_key = key.publickey().exportKey("OpenSSH")
return {"pub_key": pub_key, "priv_key": priv_key}
class CustomScript(aztk.models.CustomScript):
@ -124,22 +120,24 @@ class VmImage(aztk.models.VmImage):
class ApplicationConfiguration:
def __init__(self,
name=None,
application=None,
application_args=None,
main_class=None,
jars=None,
py_files=None,
files=None,
driver_java_options=None,
driver_library_path=None,
driver_class_path=None,
driver_memory=None,
executor_memory=None,
driver_cores=None,
executor_cores=None,
max_retry_count=None):
def __init__(
self,
name=None,
application=None,
application_args=None,
main_class=None,
jars=None,
py_files=None,
files=None,
driver_java_options=None,
driver_library_path=None,
driver_class_path=None,
driver_memory=None,
executor_memory=None,
driver_cores=None,
executor_cores=None,
max_retry_count=None,
):
self.name = name
self.application = application
self.application_args = application_args
@ -162,11 +160,11 @@ class Application:
self.name = cloud_task.id
self.last_modified = cloud_task.last_modified
self.creation_time = cloud_task.creation_time
self.state = cloud_task.state._value_
self.state = cloud_task.state.name
self.state_transition_time = cloud_task.state_transition_time
self.exit_code = cloud_task.execution_info.exit_code
if cloud_task.previous_state:
self.previous_state = cloud_task.previous_state._value_
self.previous_state = cloud_task.previous_state.name
self.previous_state_transition_time = cloud_task.previous_state_transition_time
self._execution_info = cloud_task.execution_info
@ -190,17 +188,19 @@ class Application:
class JobConfiguration:
def __init__(self,
id=None,
applications=None,
vm_size=None,
spark_configuration=None,
toolkit=None,
max_dedicated_nodes=0,
max_low_pri_nodes=0,
subnet_id=None,
scheduling_target: SchedulingTarget = None,
worker_on_master=None):
def __init__(
self,
id=None,
applications=None,
vm_size=None,
spark_configuration=None,
toolkit=None,
max_dedicated_nodes=0,
max_low_pri_nodes=0,
subnet_id=None,
scheduling_target: SchedulingTarget = None,
worker_on_master=None,
):
self.id = id
self.applications = applications
@ -252,24 +252,23 @@ class JobConfiguration:
raise error.AztkError("Please supply an ID for the Job in your configuration.")
if self.max_dedicated_nodes == 0 and self.max_low_pri_nodes == 0:
raise error.AztkError(
"Please supply a valid (greater than 0) value for either max_dedicated_nodes or max_low_pri_nodes in your configuration."
)
raise error.AztkError("Please supply a valid (greater than 0) value for either max_dedicated_nodes "
"or max_low_pri_nodes in your configuration.")
if self.vm_size is None:
raise error.AztkError("Please supply a vm_size in your configuration.")
if self.mixed_mode() and not self.subnet_id:
raise error.AztkError(
"You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes) and pass the subnet_id in your configuration.."
)
"You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes) "
"and pass the subnet_id in your configuration..")
if self.scheduling_target == SchedulingTarget.Dedicated and self.max_dedicated_nodes == 0:
raise error.InvalidModelError("Scheduling target cannot be Dedicated if dedicated vm size is 0")
class JobState():
complete = 'completed'
class JobState:
complete = "completed"
active = "active"
completed = "completed"
disabled = "disabled"
@ -277,15 +276,17 @@ class JobState():
deleting = "deleting"
class Job():
def __init__(self,
cloud_job_schedule: batch_models.CloudJobSchedule,
cloud_tasks: List[batch_models.CloudTask] = None,
pool: batch_models.CloudPool = None,
nodes: batch_models.ComputeNodePaged = None):
class Job:
def __init__(
self,
cloud_job_schedule: batch_models.CloudJobSchedule,
cloud_tasks: List[batch_models.CloudTask] = None,
pool: batch_models.CloudPool = None,
nodes: batch_models.ComputeNodePaged = None,
):
self.id = cloud_job_schedule.id
self.last_modified = cloud_job_schedule.last_modified
self.state = cloud_job_schedule.state._value_
self.state = cloud_job_schedule.state.name
self.state_transition_time = cloud_job_schedule.state_transition_time
self.creation_time = cloud_job_schedule.creation_time
self.applications = [Application(task) for task in (cloud_tasks or [])]
@ -297,9 +298,11 @@ class Job():
class ApplicationLog(aztk.models.ApplicationLog):
def __init__(self, application_log: aztk.models.ApplicationLog):
self.name = application_log.name
self.cluster_id = application_log.cluster_id # TODO: change to something cluster/job agnostic
self.log = application_log.log
self.total_bytes = application_log.total_bytes
self.application_state = application_log.application_state
self.exit_code = application_log.exit_code
super().__init__(
name=application_log.name,
cluster_id=application_log.cluster_id, # TODO: change to something cluster/job agnostic
log=application_log.log,
total_bytes=application_log.total_bytes,
application_state=application_log.application_state,
exit_code=application_log.exit_code,
)
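
The __generate_ssh_key_pair change above only normalizes quotes; pulled out as a stand-alone helper using the same PyCrypto/pycryptodome calls shown in the hunk, it reads as:

from Crypto.PublicKey import RSA    # provided by pycrypto/pycryptodome

def generate_ssh_key_pair(bits=2048):
    key = RSA.generate(bits)
    priv_key = key.exportKey("PEM")
    pub_key = key.publickey().exportKey("OpenSSH")
    return {"pub_key": pub_key, "priv_key": priv_key}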

View file

@ -1,7 +1,6 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))
@ -11,36 +10,14 @@ class HDFSPlugin(PluginConfiguration):
super().__init__(
name="hdfs",
ports=[
PluginPort(
name="File system metadata operations",
internal=8020,
),
PluginPort(
name="File system metadata operations(Backup)",
internal=9000,
),
PluginPort(
name="Datanode data transfer",
internal=50010,
),
PluginPort(
name="Datanode IPC metadata operations",
internal=50020,
),
PluginPort(
name="Namenode",
internal=50070,
public=True,
),
PluginPort(
name="Datanodes",
internal=50075,
public=True,
),
PluginPort(name="File system metadata operations", internal=8020),
PluginPort(name="File system metadata operations(Backup)", internal=9000),
PluginPort(name="Datanode data transfer", internal=50010),
PluginPort(name="Datanode IPC metadata operations", internal=50020),
PluginPort(name="Namenode", internal=50070, public=True),
PluginPort(name="Datanodes", internal=50075, public=True),
],
target_role=PluginTargetRole.All,
execute="hdfs.sh",
files=[
PluginFile("hdfs.sh", os.path.join(dir_path, "hdfs.sh")),
],
files=[PluginFile("hdfs.sh", os.path.join(dir_path, "hdfs.sh"))],
)

View file

@ -1,8 +1,5 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.spark.models.plugins.install import InstallPlugin
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))

View file

@ -1,8 +1,5 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.spark.models.plugins.install import InstallPlugin
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))

View file

@ -1,7 +1,6 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))
@ -13,4 +12,5 @@ def InstallPlugin(name, command, packages=None):
execute="install.sh",
files=[PluginFile("install.sh", os.path.join(dir_path, "install.sh"))],
args=packages,
env=dict(COMMAND=command))
env=dict(COMMAND=command),
)

View file

@ -1,8 +1,5 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.spark.models.plugins.install import InstallPlugin
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))

View file

@ -8,15 +8,8 @@ dir_path = os.path.dirname(os.path.realpath(__file__))
def JupyterPlugin():
return PluginConfiguration(
name="jupyter",
ports=[
PluginPort(
internal=8888,
public=True,
),
],
ports=[PluginPort(internal=8888, public=True)],
target_role=PluginTargetRole.All,
execute="jupyter.sh",
files=[
PluginFile("jupyter.sh", os.path.join(dir_path, "jupyter.sh")),
],
files=[PluginFile("jupyter.sh", os.path.join(dir_path, "jupyter.sh"))],
)

View file

@ -1,7 +1,6 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))
@ -9,15 +8,8 @@ dir_path = os.path.dirname(os.path.realpath(__file__))
def JupyterLabPlugin():
return PluginConfiguration(
name="jupyterlab",
ports=[
PluginPort(
internal=8889,
public=True,
),
],
ports=[PluginPort(internal=8889, public=True)],
target_role=PluginTargetRole.All,
execute="jupyter_lab.sh",
files=[
PluginFile("jupyter_lab.sh", os.path.join(dir_path, "jupyter_lab.sh")),
],
files=[PluginFile("jupyter_lab.sh", os.path.join(dir_path, "jupyter_lab.sh"))],
)

View file

@ -1,7 +1,6 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))
@ -12,6 +11,5 @@ def NvBLASPlugin():
ports=[],
target_role=PluginTargetRole.All,
execute="nvblas.sh",
files=[
PluginFile("nvblas.sh", os.path.join(dir_path, "nvblas.sh")),
])
files=[PluginFile("nvblas.sh", os.path.join(dir_path, "nvblas.sh"))],
)

View file

@ -1,7 +1,6 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))
@ -12,7 +11,5 @@ def OpenBLASPlugin():
ports=[],
target_role=PluginTargetRole.All,
execute="openblas.sh",
files=[
PluginFile("openblas.sh", os.path.join(dir_path, "openblas.sh")),
],
files=[PluginFile("openblas.sh", os.path.join(dir_path, "openblas.sh"))],
)

View file

@ -1,7 +1,6 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTarget, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))
@ -10,12 +9,7 @@ class ResourceMonitorPlugin(PluginConfiguration):
def __init__(self):
super().__init__(
name="resource_monitor",
ports=[
PluginPort(
internal=8890,
public=True,
),
],
ports=[PluginPort(internal=8890, public=True)],
target=PluginTarget.Host,
target_role=PluginTargetRole.All,
execute="start_monitor.sh",
@ -23,4 +17,5 @@ class ResourceMonitorPlugin(PluginConfiguration):
PluginFile("start_monitor.sh", os.path.join(dir_path, "start_monitor.sh")),
PluginFile("etc/telegraf.conf", os.path.join(dir_path, "telegraf.conf")),
PluginFile("docker-compose.yml", os.path.join(dir_path, "docker-compose.yml")),
])
],
)

View file

@ -8,16 +8,9 @@ dir_path = os.path.dirname(os.path.realpath(__file__))
def RStudioServerPlugin(version="1.1.383"):
return PluginConfiguration(
name="rstudio_server",
ports=[
PluginPort(
internal=8787,
public=True,
),
],
ports=[PluginPort(internal=8787, public=True)],
target_role=PluginTargetRole.Master,
execute="rstudio_server.sh",
files=[
PluginFile("rstudio_server.sh", os.path.join(dir_path, "rstudio_server.sh")),
],
files=[PluginFile("rstudio_server.sh", os.path.join(dir_path, "rstudio_server.sh"))],
env=dict(RSTUDIO_SERVER_VERSION=version),
)

View file

@ -1,7 +1,6 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole, PluginTarget
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTarget, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))
@ -13,7 +12,5 @@ class SimplePlugin(PluginConfiguration):
target_role=PluginTargetRole.All,
target=PluginTarget.Host,
execute="simple.sh",
files=[
PluginFile("simple.sh", os.path.join(dir_path, "simple.sh")),
],
files=[PluginFile("simple.sh", os.path.join(dir_path, "simple.sh"))],
)

View file

@ -1,7 +1,6 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))

View file

@ -29,7 +29,7 @@ from http.server import BaseHTTPRequestHandler, HTTPServer
BIND_ADDR = os.environ.get("BIND_ADDR", "0.0.0.0")
SERVER_PORT = int(os.environ.get("SERVER_PORT", "80"))
URL_PREFIX = os.environ.get("URL_PREFIX", "").rstrip('/') + '/'
URL_PREFIX = os.environ.get("URL_PREFIX", "").rstrip("/") + "/"
SPARK_MASTER_HOST = ""
@ -44,7 +44,7 @@ class ProxyHandler(BaseHTTPRequestHandler):
self.proxyRequest(None)
def do_POST(self):
length = int(self.headers.getheader('content-length'))
length = int(self.headers.getheader("content-length"))
postData = self.rfile.read(length)
self.proxyRequest(postData)
@ -84,17 +84,19 @@ class ProxyHandler(BaseHTTPRequestHandler):
def rewriteLinks(self, page, targetHost):
target = "{0}proxy:{1}/".format(URL_PREFIX, targetHost).encode()
page = page.replace(b'href="/', b'href="' + target)
page = page.replace(b"'<div><a href=' + logUrl + '>'",
b"'<div><a href=' + location.origin + logUrl.replace('http://', '/proxy:') + '>'")
page = page.replace(b'href="log', b'href="' + target + b'log')
page = page.replace(b'href="http://', b'href="' + URL_PREFIX.encode() + b'proxy:')
page = page.replace(
b"'<div><a href=' + logUrl + '>'",
b"'<div><a href=' + location.origin + logUrl.replace('http://', '/proxy:') + '>'",
)
page = page.replace(b'href="log', b'href="' + target + b"log")
page = page.replace(b'href="http://', b'href="' + URL_PREFIX.encode() + b"proxy:")
page = page.replace(b'src="/', b'src="' + target)
page = page.replace(b'action="', b'action="' + target)
page = page.replace(b'"/api/v1/', b'"' + target + b'api/v1/')
page = page.replace(b'"/api/v1/', b'"' + target + b"api/v1/")
return page
if __name__ == '__main__':
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: <proxied host:port> [<proxy port>]")
sys.exit(1)
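
The rewriteLinks method above works on raw bytes; a reduced example with a hypothetical prefix and target host shows the effect of the first replace call:

URL_PREFIX = "/"                 # hypothetical value
target_host = "10.0.0.4:8080"    # hypothetical value
target = "{0}proxy:{1}/".format(URL_PREFIX, target_host).encode()

page = b'<a href="/jobs/">Jobs</a>'
page = page.replace(b'href="/', b'href="' + target)
print(page)    # b'<a href="/proxy:10.0.0.4:8080/jobs/">Jobs</a>'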

View file

@ -1,7 +1,6 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.utils import constants
dir_path = os.path.dirname(os.path.realpath(__file__))
@ -11,7 +10,5 @@ def TensorflowOnSparkPlugin():
name="tensorflow_on_spark",
target_role=PluginTargetRole.Master,
execute="tensorflow_on_spark.sh",
files=[
PluginFile("tensorflow_on_spark.sh", os.path.join(dir_path, "tensorflow_on_spark.sh")),
],
files=[PluginFile("tensorflow_on_spark.sh", os.path.join(dir_path, "tensorflow_on_spark.sh"))],
)

View file

@ -1,3 +1,3 @@
from aztk.spark import models
SPARK_VM_IMAGE = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04')
SPARK_VM_IMAGE = models.VmImage(publisher="Canonical", offer="UbuntuServer", sku="16.04")

View file

@ -50,9 +50,7 @@ def cmd_check_output(cmd):
try:
output = check_output(cmd, shell=True, stderr=STDOUT)
except CalledProcessError as e:
return "CMD: {0}\n"\
"returncode: {1}"\
"output: {2}".format(e.cmd, e.returncode, e.output)
return "CMD: {0}\n" "returncode: {1}" "output: {2}".format(e.cmd, e.returncode, e.output)
else:
return output
@ -62,9 +60,9 @@ def get_disk_free():
def get_docker_diagnostics(docker_client):
'''
"""
returns list of tuples (filename, data) to be written in the zip
'''
"""
output = []
output.append(get_docker_images(docker_client))
logs = get_docker_containers(docker_client)
@ -95,7 +93,7 @@ def get_docker_containers(docker_client):
# get docker container logs
logs.append((container.name + "/docker.log", container.logs()))
logs.append(get_docker_process_status(container))
if container.name == "spark": #TODO: find a more robust way to get specific info off specific containers
if container.name == "spark": # TODO: find a more robust way to get specific info off specific containers
logs.extend(get_container_aztk_script(container))
logs.extend(get_spark_logs(container))
logs.extend(get_spark_app_logs(container))
@ -158,13 +156,13 @@ def filter_members(members):
def extract_tar_in_memory(container, data):
data = io.BytesIO(b''.join([item for item in data]))
data = io.BytesIO(b"".join([item for item in data]))
tarf = tarfile.open(fileobj=data)
logs = []
for member in filter_members(tarf):
file_bytes = tarf.extractfile(member)
if file_bytes is not None:
logs.append((container.name + "/" + member.name, b''.join(file_bytes.readlines())))
logs.append((container.name + "/" + member.name, b"".join(file_bytes.readlines())))
return logs
@ -174,7 +172,7 @@ def get_brief_diagnostics():
logs = []
for file_name in files:
try:
logs.append((file_name, open(batch_dir + file_name, 'rb').read()))
logs.append((file_name, open(batch_dir + file_name, "rb").read()))
# print("LOG:", (file_name, open(batch_dir+file_name, 'rb').read()))
except FileNotFoundError as e:
print("file not found", e)

View file

@ -1,16 +1,11 @@
from __future__ import print_function
import datetime
import io
import os
import time
import azure.batch.batch_service_client as batch
import azure.batch.batch_auth as batch_auth
import azure.batch.models as batch_models
import azure.storage.blob as blob
from aztk.version import __version__
from aztk.utils import constants
from aztk import error
import aztk.models
class MasterInvalidStateError(Exception):

View file

@ -1,8 +1,3 @@
from .deprecation import deprecated, deprecate
from . import azure_api
from . import command_builder
from . import constants
from . import helpers
from . import file_utils
from . import get_ssh_key
from . import secure_utils
from . import (azure_api, command_builder, constants, file_utils, get_ssh_key, helpers, secure_utils)
from .deprecation import deprecate, deprecated
from .retry import BackOffPolicy, retry
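
The retry decorator and BackOffPolicy imported here are added elsewhere in this commit, and their implementation does not appear in this diff; purely as an illustration, a generic retry decorator with an optional exponential back-off might be sketched as below. The names and defaults are assumptions, not the aztk API.

import functools
import time

def retry_sketch(retry_count=3, retry_interval=1, exponential=False, exceptions=(Exception,)):
    # Illustrative only; not the actual aztk.utils.retry implementation.
    def decorator(function):
        @functools.wraps(function)
        def wrapper(*args, **kwargs):
            for attempt in range(retry_count):
                try:
                    return function(*args, **kwargs)
                except exceptions:
                    if attempt == retry_count - 1:
                        raise
                    delay = retry_interval * (2 ** attempt) if exponential else retry_interval
                    time.sleep(delay)
        return wrapper
    return decorator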

View file

@ -1,5 +1,4 @@
import re
from typing import Optional
import azure.batch.batch_auth as batch_auth
import azure.batch.batch_service_client as batch
@ -12,10 +11,10 @@ from azure.storage.common import CloudStorageAccount
from aztk import error
from aztk.version import __version__
RESOURCE_ID_PATTERN = re.compile('^/subscriptions/(?P<subscription>[^/]+)'
'/resourceGroups/(?P<resourcegroup>[^/]+)'
'/providers/[^/]+'
'/[^/]+Accounts/(?P<account>[^/]+)$')
RESOURCE_ID_PATTERN = re.compile("^/subscriptions/(?P<subscription>[^/]+)"
"/resourceGroups/(?P<resourcegroup>[^/]+)"
"/providers/[^/]+"
"/[^/]+Accounts/(?P<account>[^/]+)$")
def validate_secrets(secrets):
@ -48,23 +47,25 @@ def make_batch_client(secrets):
client_id=secrets.service_principal.client_id,
secret=secrets.service_principal.credential,
tenant=secrets.service_principal.tenant_id,
resource='https://management.core.windows.net/')
resource="https://management.core.windows.net/",
)
m = RESOURCE_ID_PATTERN.match(secrets.service_principal.batch_account_resource_id)
arm_batch_client = BatchManagementClient(arm_credentials, m.group('subscription'))
account = arm_batch_client.batch_account.get(m.group('resourcegroup'), m.group('account'))
base_url = 'https://{0}/'.format(account.account_endpoint)
arm_batch_client = BatchManagementClient(arm_credentials, m.group("subscription"))
account = arm_batch_client.batch_account.get(m.group("resourcegroup"), m.group("account"))
base_url = "https://{0}/".format(account.account_endpoint)
credentials = ServicePrincipalCredentials(
client_id=secrets.service_principal.client_id,
secret=secrets.service_principal.credential,
tenant=secrets.service_principal.tenant_id,
resource='https://batch.core.windows.net/')
resource="https://batch.core.windows.net/",
)
# Set up Batch Client
batch_client = batch.BatchServiceClient(credentials, base_url=base_url)
# Set retry policy
batch_client.config.retry_policy.retries = 5
batch_client.config.add_user_agent('aztk/{}'.format(__version__))
batch_client.config.add_user_agent("aztk/{}".format(__version__))
return batch_client
@ -82,26 +83,29 @@ def make_blob_client(secrets):
blob_client = blob.BlockBlobService(
account_name=secrets.shared_key.storage_account_name,
account_key=secrets.shared_key.storage_account_key,
endpoint_suffix=secrets.shared_key.storage_account_suffix)
endpoint_suffix=secrets.shared_key.storage_account_suffix,
)
else:
# Set up ServicePrincipalCredentials
arm_credentials = ServicePrincipalCredentials(
client_id=secrets.service_principal.client_id,
secret=secrets.service_principal.credential,
tenant=secrets.service_principal.tenant_id,
resource='https://management.core.windows.net/')
resource="https://management.core.windows.net/",
)
m = RESOURCE_ID_PATTERN.match(secrets.service_principal.storage_account_resource_id)
accountname = m.group('account')
subscription = m.group('subscription')
resourcegroup = m.group('resourcegroup')
accountname = m.group("account")
subscription = m.group("subscription")
resourcegroup = m.group("resourcegroup")
mgmt_client = StorageManagementClient(arm_credentials, subscription)
key = retry_function(
key = (retry_function(
mgmt_client.storage_accounts.list_keys,
10,
1,
Exception,
resource_group_name=resourcegroup,
account_name=accountname).keys[0].value
account_name=accountname,
).keys[0].value)
storage_client = CloudStorageAccount(accountname, key)
blob_client = storage_client.create_block_blob_service()
@ -110,6 +114,7 @@ def make_blob_client(secrets):
def retry_function(function, retry_attempts: int, retry_interval: int, exception: Exception, *args, **kwargs):
import time
for i in range(retry_attempts):
try:
return function(*args, **kwargs)

Просмотреть файл

@ -1,4 +1,4 @@
class CommandOption():
class CommandOption:
def __init__(self, name: str, value: str):
self.name = name
self.value = value

View file

@ -18,33 +18,33 @@ DOCKER_SPARK_HOME = "/home/spark-current"
"""
Root path of this repository
"""
ROOT_PATH = os.path.normpath(os.path.join(os.path.dirname(__file__), '..', '..'))
ROOT_PATH = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", ".."))
"""
User home directory path
"""
HOME_DIRECTORY_PATH = os.path.expanduser('~')
HOME_DIRECTORY_PATH = os.path.expanduser("~")
"""
Path to the secrets file
"""
DEFAULT_SECRETS_PATH = os.path.join(os.getcwd(), '.aztk/secrets.yaml')
DEFAULT_SECRETS_PATH = os.path.join(os.getcwd(), ".aztk/secrets.yaml")
"""
Paths to the cluster configuration files
"""
GLOBAL_CONFIG_PATH = os.path.join(HOME_DIRECTORY_PATH, '.aztk')
DEFAULT_SSH_CONFIG_PATH = os.path.join(os.getcwd(), '.aztk/ssh.yaml')
DEFAULT_CLUSTER_CONFIG_PATH = os.path.join(os.getcwd(), '.aztk/cluster.yaml')
DEFAULT_SPARK_CONF_SOURCE = os.path.join(os.getcwd(), '.aztk')
DEFAULT_SPARK_CONF_DEST = os.path.join(ROOT_PATH, 'node_scripts', 'conf')
DEFAULT_SPARK_JARS_SOURCE = os.path.join(os.getcwd(), '.aztk', 'jars')
DEFAULT_SPARK_JARS_DEST = os.path.join(ROOT_PATH, 'node_scripts', 'jars')
DEFAULT_SPARK_JOB_CONFIG = os.path.join(os.getcwd(), '.aztk', 'job.yaml')
GLOBAL_SPARK_JOB_CONFIG = os.path.join(HOME_DIRECTORY_PATH, '.aztk', 'job.yaml')
GLOBAL_CONFIG_PATH = os.path.join(HOME_DIRECTORY_PATH, ".aztk")
DEFAULT_SSH_CONFIG_PATH = os.path.join(os.getcwd(), ".aztk/ssh.yaml")
DEFAULT_CLUSTER_CONFIG_PATH = os.path.join(os.getcwd(), ".aztk/cluster.yaml")
DEFAULT_SPARK_CONF_SOURCE = os.path.join(os.getcwd(), ".aztk")
DEFAULT_SPARK_CONF_DEST = os.path.join(ROOT_PATH, "node_scripts", "conf")
DEFAULT_SPARK_JARS_SOURCE = os.path.join(os.getcwd(), ".aztk", "jars")
DEFAULT_SPARK_JARS_DEST = os.path.join(ROOT_PATH, "node_scripts", "jars")
DEFAULT_SPARK_JOB_CONFIG = os.path.join(os.getcwd(), ".aztk", "job.yaml")
GLOBAL_SPARK_JOB_CONFIG = os.path.join(HOME_DIRECTORY_PATH, ".aztk", "job.yaml")
"""
Source and destination paths for spark init
"""
INIT_DIRECTORY_SOURCE = os.path.join(ROOT_PATH, "aztk_cli", 'config')
LOCAL_INIT_DIRECTORY_DEST = os.path.join(os.getcwd(), '.aztk')
GLOBAL_INIT_DIRECTORY_DEST = os.path.join(HOME_DIRECTORY_PATH, '.aztk')
INIT_DIRECTORY_SOURCE = os.path.join(ROOT_PATH, "aztk_cli", "config")
LOCAL_INIT_DIRECTORY_DEST = os.path.join(os.getcwd(), ".aztk")
GLOBAL_INIT_DIRECTORY_DEST = os.path.join(HOME_DIRECTORY_PATH, ".aztk")
"""
Key of the metadata entry for the pool that is used to store the master node id
"""

View file

@ -39,9 +39,10 @@ def deprecate(version: str, message: str, advice: str = ""):
advice (str): Sentence explaining alternatives to the deprecated functionality.
"""
warnings.simplefilter('always', DeprecationWarning) # turn off filter
warnings.simplefilter("always", DeprecationWarning) # turn off filter
warnings.warn(
"{0} It will be removed in Aztk version {1}. {2}".format(message, version, advice),
category=DeprecationWarning,
stacklevel=2)
warnings.simplefilter('default', DeprecationWarning) # reset filter
stacklevel=2,
)
warnings.simplefilter("default", DeprecationWarning) # reset filter

View file

@ -29,6 +29,6 @@ def __read_ssh_key_from_file(path: str) -> str:
"""
Read the content of the given file
"""
with open(os.path.expanduser(path), 'r', encoding='UTF-8') as content_file:
with open(os.path.expanduser(path), "r", encoding="UTF-8") as content_file:
content = content_file.read()
return content

Some files were not shown because too many files have changed in this diff.