From 828162ef10a80e8b9da8c1345bd12c5235bc1e1f Mon Sep 17 00:00:00 2001
From: Jacob Freck
Date: Fri, 24 Aug 2018 17:21:22 -0700
Subject: [PATCH] Internal: fix pylint warnings (#651)

* initial, remove unused imports
* run yapf
* remove unused imports and variables, fix declaration outside init
* fix some pylint warnings, add ssh_into_master
* remove unused imports
* unused variables
* string and function normalization
* stop using list comprehension for side effects, make method function
* stop using protected member
* various pylint fixes
* formatting
* formatting
* add retry decorator with tests
* start adding retry decorator, retry docker compose download
* update pip and tests
* logic fix
* change no delete if
* factor out reused functions
* fix wait_for_all_nodes
* fix download return type bug
* test vsts ci update
* temporarily disable integration tests
* syntax fix
* update vsts build
* add back integration tests, remove debug branch
* remove parallel unit tests
* more verbose clis
* update pylint
* typo
* fix imports
* function returns nothing, don't return
* make iterator list
* change debug value
---
 .vsts-ci.yml                                  | 15 +-
 aztk/client/base/base_operations.py           | 48 +++--
 .../base/helpers/create_user_on_cluster.py    | 2 +-
 .../base/helpers/create_user_on_node.py       | 1 -
 .../base/helpers/delete_user_on_cluster.py    | 2 +-
 .../base/helpers/generate_user_on_cluster.py  | 4 +-
 .../base/helpers/generate_user_on_node.py     | 2 +-
 .../base/helpers/get_application_log.py       | 25 +--
 aztk/client/base/helpers/node_run.py          | 5 +-
 aztk/client/base/helpers/run.py               | 5 +-
 aztk/client/client.py                         | 97 ++++++----
 aztk/client/cluster/helpers/copy.py           | 23 ++-
 aztk/client/cluster/helpers/create.py         | 19 +-
 aztk/client/cluster/helpers/delete.py         | 12 +-
 aztk/client/cluster/helpers/get.py            | 2 +-
 aztk/client/cluster/operations.py             | 9 +-
 aztk/client/job/helpers/submit.py             | 29 ++-
 aztk/client/job/operations.py                 | 24 ++-
 aztk/core/models/fields.py                    | 18 +-
 aztk/core/models/model.py                     | 12 +-
 aztk/core/models/validators.py                | 14 +-
 aztk/internal/cluster_data/blob_data.py       | 8 +-
 aztk/internal/cluster_data/cluster_data.py    | 13 +-
 aztk/internal/cluster_data/node_data.py       | 47 ++---
 aztk/internal/docker_cmd.py                   | 19 +-
 aztk/models/application_log.py                | 13 +-
 aztk/models/cluster.py                        | 6 +-
 aztk/models/cluster_configuration.py          | 8 +-
 .../models/plugins/internal/plugin_manager.py | 6 +-
 .../plugins/internal/plugin_reference.py      | 4 +-
 aztk/models/plugins/plugin_configuration.py   | 3 +
 aztk/models/plugins/plugin_file.py            | 2 +-
 aztk/models/secrets_configuration.py          | 3 +
 aztk/models/software.py                       | 1 +
 aztk/models/ssh_log.py                        | 2 +-
 aztk/models/toolkit.py                        | 15 +-
 aztk/node_scripts/core/config.py              | 37 ++--
 aztk/node_scripts/core/logger.py              | 2 +-
 aztk/node_scripts/install/create_user.py      | 32 ++--
 aztk/node_scripts/install/install.py          | 6 +-
 aztk/node_scripts/install/pick_master.py      | 4 +-
 aztk/node_scripts/install/plugins.py          | 54 ++++--
 aztk/node_scripts/install/spark.py            | 55 +++---
 aztk/node_scripts/install/spark_container.py  | 59 +++---
 aztk/node_scripts/job_submission.py           | 21 +--
 aztk/node_scripts/setup_host.sh               | 8 +-
 aztk/node_scripts/submit.py                   | 145 +++++++-------
 .../node_scripts/wait_until_setup_complete.py | 4 +-
 .../base/helpers/generate_application_task.py | 41 ++--
 .../helpers/generate_cluster_start_task.py    | 72 ++++---
 aztk/spark/client/base/operations.py          | 46 +++--
 aztk/spark/client/client.py                   | 55 +++---
 aztk/spark/client/cluster/helpers/copy.py     | 21 ++-
 aztk/spark/client/cluster/helpers/create.py   | 13 +-
 .../client/cluster/helpers/create_user.py
| 14 +- .../client/cluster/helpers/diagnostics.py | 27 ++- aztk/spark/client/cluster/helpers/download.py | 21 ++- .../cluster/helpers/get_application_status.py | 2 +- aztk/spark/client/cluster/helpers/node_run.py | 18 +- aztk/spark/client/cluster/helpers/run.py | 2 +- .../client/cluster/helpers/ssh_into_master.py | 23 ++- aztk/spark/client/cluster/helpers/submit.py | 14 +- aztk/spark/client/cluster/operations.py | 105 ++++++++--- aztk/spark/client/job/helpers/delete.py | 1 - .../client/job/helpers/get_application_log.py | 8 +- .../client/job/helpers/list_applications.py | 2 +- aztk/spark/client/job/helpers/stop.py | 1 - .../client/job/helpers/stop_application.py | 3 - aztk/spark/client/job/helpers/submit.py | 42 +++-- aztk/spark/client/job/operations.py | 14 +- .../helpers/cluster_diagnostic_helper.py | 14 +- aztk/spark/helpers/create_cluster.py | 87 +++++---- aztk/spark/helpers/get_log.py | 22 ++- aztk/spark/helpers/job_submission.py | 38 ++-- aztk/spark/helpers/submit.py | 52 ++--- aztk/spark/models/models.py | 117 ++++++------ .../models/plugins/hdfs/configuration.py | 37 +--- .../plugins/install/apt_get/configuration.py | 3 - .../plugins/install/conda/configuration.py | 3 - .../models/plugins/install/configuration.py | 6 +- .../plugins/install/pip/configuration.py | 3 - .../models/plugins/jupyter/configuration.py | 11 +- .../plugins/jupyter_lab/configuration.py | 12 +- .../models/plugins/nvblas/configuration.py | 8 +- .../models/plugins/openblas/configuration.py | 7 +- .../plugins/resource_monitor/configuration.py | 11 +- .../plugins/rstudio_server/configuration.py | 11 +- .../models/plugins/simple/configuration.py | 7 +- .../plugins/spark_ui_proxy/configuration.py | 1 - .../plugins/spark_ui_proxy/spark_ui_proxy.py | 18 +- .../tensorflow_on_spark/configuration.py | 7 +- aztk/spark/utils/constants.py | 2 +- aztk/spark/utils/debug.py | 16 +- aztk/spark/utils/util.py | 11 +- aztk/utils/__init__.py | 11 +- aztk/utils/azure_api.py | 41 ++-- aztk/utils/command_builder.py | 2 +- aztk/utils/constants.py | 30 +-- aztk/utils/deprecation.py | 7 +- aztk/utils/get_ssh_key.py | 2 +- aztk/utils/helpers.py | 70 +++---- aztk/utils/retry.py | 29 +++ aztk/utils/secure_utils.py | 2 +- aztk/utils/ssh.py | 164 +++++++++------- aztk/version.py | 9 +- aztk_cli/__init__.py | 1 + aztk_cli/config.py | 161 ++++++++-------- aztk_cli/constants.py | 2 +- aztk_cli/entrypoint.py | 13 +- aztk_cli/logger.py | 47 ++--- aztk_cli/plugins.py | 5 +- .../endpoints/cluster/cluster_add_user.py | 31 +-- .../endpoints/cluster/cluster_app_logs.py | 13 +- .../spark/endpoints/cluster/cluster_copy.py | 25 ++- .../spark/endpoints/cluster/cluster_create.py | 45 ++--- .../spark/endpoints/cluster/cluster_debug.py | 6 +- .../spark/endpoints/cluster/cluster_delete.py | 24 +-- .../spark/endpoints/cluster/cluster_get.py | 13 +- .../spark/endpoints/cluster/cluster_list.py | 2 +- .../spark/endpoints/cluster/cluster_run.py | 27 +-- .../spark/endpoints/cluster/cluster_ssh.py | 50 ++--- .../spark/endpoints/cluster/cluster_submit.py | 177 ++++++++++-------- aztk_cli/spark/endpoints/init.py | 32 ++-- aztk_cli/spark/endpoints/job/delete.py | 24 +-- aztk_cli/spark/endpoints/job/get.py | 3 +- aztk_cli/spark/endpoints/job/get_app.py | 5 +- aztk_cli/spark/endpoints/job/get_app_logs.py | 11 +- aztk_cli/spark/endpoints/job/list.py | 3 +- aztk_cli/spark/endpoints/job/list_apps.py | 2 +- aztk_cli/spark/endpoints/job/stop.py | 5 +- aztk_cli/spark/endpoints/job/stop_app.py | 11 +- aztk_cli/spark/endpoints/job/submit.py | 18 +- aztk_cli/toolkit.py | 16 +- 
aztk_cli/utils.py | 162 ++++++++-------- pylintrc | 2 +- requirements.txt | 2 +- .../spark/sdk/clean_up_cluster.py | 26 +++ .../spark/sdk/cluster/test_cluster.py | 116 ++---------- .../sdk/cluster/test_cluster_deprecated.py | 133 +++---------- .../spark/sdk/ensure_spark_processes.py | 34 ++++ .../spark/sdk/wait_for_all_nodes.py | 23 +++ tests/utils/test_retry.py | 81 ++++++++ 142 files changed, 1984 insertions(+), 1728 deletions(-) create mode 100644 aztk/utils/retry.py create mode 100644 tests/integration_tests/spark/sdk/clean_up_cluster.py create mode 100644 tests/integration_tests/spark/sdk/ensure_spark_processes.py create mode 100644 tests/integration_tests/spark/sdk/wait_for_all_nodes.py create mode 100644 tests/utils/test_retry.py diff --git a/.vsts-ci.yml b/.vsts-ci.yml index a4c9f47a..d9e95c06 100644 --- a/.vsts-ci.yml +++ b/.vsts-ci.yml @@ -1,7 +1,6 @@ trigger: - master - phases: - phase: Test queue: Hosted Linux Preview @@ -24,16 +23,22 @@ phases: displayName: yapf - script: | - pylint -j 2 -E aztk aztk_cli + pylint -jobs 2 --errors-only aztk aztk_cli condition: succeeded() - displayName: pylint + displayName: pylint error check - script: | - pytest -n 20 --ignore=tests/integration_tests + pytest --ignore=tests/integration_tests condition: succeeded() displayName: unit tests - script: | - pytest -n 75 + pytest --numprocesses=75 condition: succeeded() displayName: integration tests + + - script: | + pylint -jobs 2 --disable=fixme aztk aztk_cli + continueOnError: true + condition: succeeded() + displayName: pylint report diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index ea2652ef..29180f66 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -1,10 +1,19 @@ from aztk import models from aztk.internal import cluster_data -from aztk.utils import ssh as ssh_lib -from .helpers import (create_user_on_cluster, create_user_on_node, delete_user_on_cluster, delete_user_on_node, - generate_user_on_cluster, generate_user_on_node, get_application_log, get_remote_login_settings, - node_run, run, ssh_into_node) +from .helpers import ( + create_user_on_cluster, + create_user_on_node, + delete_user_on_cluster, + delete_user_on_node, + generate_user_on_cluster, + generate_user_on_node, + get_application_log, + get_remote_login_settings, + node_run, + run, + ssh_into_node, +) class BaseOperations: @@ -15,14 +24,14 @@ class BaseOperations: Azure Batch service. blob_client (:obj:`azure.storage.blob.BlockBlobService`): Client used to interact with the Azure Storage Blob service. - secrets_configuration (:obj:`aztk.models.SecretsConfiguration`): Model that holds AZTK secrets used to authenticate - with Azure and the clusters. + secrets_configuration (:obj:`aztk.models.SecretsConfiguration`): + Model that holds AZTK secrets used to authenticate with Azure and the clusters. 
""" def __init__(self, context): - self.batch_client = context['batch_client'] - self.blob_client = context['blob_client'] - self.secrets_configuration = context['secrets_configuration'] + self.batch_client = context["batch_client"] + self.blob_client = context["blob_client"] + self.secrets_configuration = context["secrets_configuration"] def get_cluster_configuration(self, id: str) -> models.ClusterConfiguration: """Open an ssh tunnel to a node @@ -62,7 +71,8 @@ class BaseOperations: id (:obj:`str`): the id of the cluster the node is in node_id (:obj:`str`): the id of the node to open the ssh tunnel to username (:obj:`str`): the username to authenticate the ssh session - ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None. + ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. + Defaults to None. password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None. port_forward_list (:obj:`List[PortForwardingSpecification`, optional): list of PortForwardingSpecifications. The defined ports will be forwarded to the client. @@ -89,7 +99,7 @@ class BaseOperations: """ return create_user_on_node.create_user_on_node(self, id, node_id, username, ssh_key, password) - #TODO: remove nodes as param + # TODO: remove nodes as param def create_user_on_cluster(self, id, nodes, username, ssh_pub_key=None, password=None): """Create a user on every node in the cluster @@ -97,7 +107,8 @@ class BaseOperations: username (:obj:`str`): name of the user to create. id (:obj:`str`): id of the cluster to create the user on. nodes (:obj:`List[ComputeNode]`): list of nodes to create the user on - ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None. + ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. + Defaults to None. password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None. Returns: @@ -117,7 +128,7 @@ class BaseOperations: """ return generate_user_on_node.generate_user_on_node(self, id, node_id) - #TODO: remove nodes as param + # TODO: remove nodes as param def generate_user_on_cluster(self, id, nodes): """Create a user with an autogenerated username and ssh_key on the cluster @@ -143,7 +154,7 @@ class BaseOperations: """ return delete_user_on_node.delete_user(self, id, node_id, username) - #TODO: remove nodes as param + # TODO: remove nodes as param def delete_user_on_cluster(self, username, id, nodes): """Delete a user on every node in the cluster @@ -212,10 +223,11 @@ class BaseOperations: Args: id (:obj:`str`): the id of the cluster to run the command on. application_name (:obj:`str`): str - tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes. Otherwise, the whole log will be retrieved. - Only use this if streaming the log as it is being written. Defaults to False. - current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes are retrieved. - Only useful is streaming the log as it is being written. Only used if tail is True. + tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes. + Otherwise, the whole log will be retrieved. Only use this if streaming the log as it is being written. + Defaults to False. 
+ current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes + are retrieved. Only useful is streaming the log as it is being written. Only used if tail is True. Returns: :obj:`aztk.models.ApplicationLog`: a model representing the output of the application. diff --git a/aztk/client/base/helpers/create_user_on_cluster.py b/aztk/client/base/helpers/create_user_on_cluster.py index 0764a509..2a63f6c0 100644 --- a/aztk/client/base/helpers/create_user_on_cluster.py +++ b/aztk/client/base/helpers/create_user_on_cluster.py @@ -1,7 +1,7 @@ import concurrent.futures -#TODO: remove nodes param +# TODO: remove nodes param def create_user_on_cluster(base_operations, id, nodes, username, ssh_pub_key=None, password=None): with concurrent.futures.ThreadPoolExecutor() as executor: futures = { diff --git a/aztk/client/base/helpers/create_user_on_node.py b/aztk/client/base/helpers/create_user_on_node.py index 76c76625..a08fd7d4 100644 --- a/aztk/client/base/helpers/create_user_on_node.py +++ b/aztk/client/base/helpers/create_user_on_node.py @@ -3,7 +3,6 @@ from datetime import datetime, timedelta, timezone import azure.batch.models as batch_models import azure.batch.models.batch_error as batch_error -from aztk import models from aztk.utils import get_ssh_key diff --git a/aztk/client/base/helpers/delete_user_on_cluster.py b/aztk/client/base/helpers/delete_user_on_cluster.py index 22d968cb..29ca8a8a 100644 --- a/aztk/client/base/helpers/delete_user_on_cluster.py +++ b/aztk/client/base/helpers/delete_user_on_cluster.py @@ -1,7 +1,7 @@ import concurrent.futures -#TODO: remove nodes param +# TODO: remove nodes param def delete_user_on_cluster(base_client, id, nodes, username): with concurrent.futures.ThreadPoolExecutor() as executor: futures = [executor.submit(base_client.delete_user_on_node, id, node.id, username) for node in nodes] diff --git a/aztk/client/base/helpers/generate_user_on_cluster.py b/aztk/client/base/helpers/generate_user_on_cluster.py index aa9a2563..416af785 100644 --- a/aztk/client/base/helpers/generate_user_on_cluster.py +++ b/aztk/client/base/helpers/generate_user_on_cluster.py @@ -5,11 +5,11 @@ from Cryptodome.PublicKey import RSA from aztk.utils import secure_utils -#TODO: remove nodes param +# TODO: remove nodes param def generate_user_on_cluster(base_operations, id, nodes): generated_username = secure_utils.generate_random_string() ssh_key = RSA.generate(2048) - ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') + ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8") with concurrent.futures.ThreadPoolExecutor() as executor: futures = { executor.submit(base_operations.create_user_on_node, id, node.id, generated_username, ssh_pub_key): node diff --git a/aztk/client/base/helpers/generate_user_on_node.py b/aztk/client/base/helpers/generate_user_on_node.py index c984f080..9dbb2bec 100644 --- a/aztk/client/base/helpers/generate_user_on_node.py +++ b/aztk/client/base/helpers/generate_user_on_node.py @@ -6,6 +6,6 @@ from aztk.utils import secure_utils def generate_user_on_node(base_client, pool_id, node_id): generated_username = secure_utils.generate_random_string() ssh_key = RSA.generate(2048) - ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') + ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8") base_client.create_user_on_node(pool_id, node_id, generated_username, ssh_pub_key) return generated_username, ssh_key diff --git a/aztk/client/base/helpers/get_application_log.py 
b/aztk/client/base/helpers/get_application_log.py index a5353a37..7ff937fe 100644 --- a/aztk/client/base/helpers/get_application_log.py +++ b/aztk/client/base/helpers/get_application_log.py @@ -4,12 +4,10 @@ import azure import azure.batch.models as batch_models import azure.batch.models.batch_error as batch_error -from aztk import error -from aztk import models +from aztk import error, models from aztk.utils import constants, helpers -output_file = constants.TASK_WORKING_DIR + \ - "/" + constants.SPARK_SUBMIT_LOGS_FILE +output_file = constants.TASK_WORKING_DIR + "/" + constants.SPARK_SUBMIT_LOGS_FILE def __check_task_node_exist(batch_client, cluster_id: str, task: batch_models.CloudTask) -> bool: @@ -50,17 +48,18 @@ def __get_output_file_properties(batch_client, cluster_id: str, application_name def get_log_from_storage(blob_client, container_name, application_name, task): try: - blob = blob_client.get_blob_to_text(container_name, application_name + '/' + constants.SPARK_SUBMIT_LOGS_FILE) + blob = blob_client.get_blob_to_text(container_name, application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE) except azure.common.AzureMissingResourceHttpError: raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.") return models.ApplicationLog( name=application_name, cluster_id=container_name, - application_state=task.state._value_, + application_state=task.state.name, log=blob.content, total_bytes=blob.properties.content_length, - exit_code=task.execution_info.exit_code) + exit_code=task.execution_info.exit_code, + ) def get_log(batch_client, blob_client, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): @@ -88,18 +87,20 @@ def get_log(batch_client, blob_client, cluster_id: str, application_name: str, t return models.ApplicationLog( name=application_name, cluster_id=cluster_id, - application_state=task.state._value_, + application_state=task.state.name, log=content, total_bytes=target_bytes, - exit_code=task.execution_info.exit_code) + exit_code=task.execution_info.exit_code, + ) else: return models.ApplicationLog( name=application_name, cluster_id=cluster_id, - application_state=task.state._value_, - log='', + application_state=task.state.name, + log="", total_bytes=target_bytes, - exit_code=task.execution_info.exit_code) + exit_code=task.execution_info.exit_code, + ) def get_application_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): diff --git a/aztk/client/base/helpers/node_run.py b/aztk/client/base/helpers/node_run.py index 80003db6..41c3f30b 100644 --- a/aztk/client/base/helpers/node_run.py +++ b/aztk/client/base/helpers/node_run.py @@ -22,9 +22,10 @@ def node_run(base_client, cluster_id, node_id, command, internal, container_name generated_username, node_rls.ip_address, node_rls.port, - ssh_key=ssh_key.exportKey().decode('utf-8'), + ssh_key=ssh_key.exportKey().decode("utf-8"), container_name=container_name, - timeout=timeout) + timeout=timeout, + ) return output finally: base_client.delete_user_on_node(cluster_id, node.id, generated_username) diff --git a/aztk/client/base/helpers/run.py b/aztk/client/base/helpers/run.py index bd279b64..d90b2e98 100644 --- a/aztk/client/base/helpers/run.py +++ b/aztk/client/base/helpers/run.py @@ -26,9 +26,10 @@ def cluster_run(base_operations, cluster_id, command, internal, container_name=N command, generated_username, cluster_nodes, - ssh_key=ssh_key.exportKey().decode('utf-8'), + ssh_key=ssh_key.exportKey().decode("utf-8"), 
container_name=container_name, - timeout=timeout)) + timeout=timeout, + )) return output except OSError as exc: raise exc diff --git a/aztk/client/client.py b/aztk/client/client.py index 6aafe7f6..4401f0d3 100644 --- a/aztk/client/client.py +++ b/aztk/client/client.py @@ -13,8 +13,6 @@ import aztk.utils.constants as constants import aztk.utils.get_ssh_key as get_ssh_key import aztk.utils.helpers as helpers import aztk.utils.ssh as ssh_lib -from aztk.client.cluster import CoreClusterOperations -from aztk.client.job import CoreJobOperations from aztk.internal import cluster_data from aztk.utils import deprecated, secure_utils @@ -27,6 +25,11 @@ class CoreClient: """ + def __init__(self): + self.secrets_configuration = None + self.batch_client = None + self.blob_client = None + def _get_context(self, secrets_configuration: models.SecretsConfiguration): self.secrets_configuration = secrets_configuration @@ -34,9 +37,9 @@ class CoreClient: self.batch_client = azure_api.make_batch_client(secrets_configuration) self.blob_client = azure_api.make_blob_client(secrets_configuration) context = { - 'batch_client': self.batch_client, - 'blob_client': self.blob_client, - 'secrets_configuration': self.secrets_configuration, + "batch_client": self.batch_client, + "blob_client": self.blob_client, + "secrets_configuration": self.secrets_configuration, } return context @@ -52,9 +55,9 @@ class CoreClient: """ return cluster_data.ClusterData(self.blob_client, cluster_id) - ''' + """ General Batch Operations - ''' + """ @deprecated("0.10.0") def __delete_pool_and_job(self, pool_id: str, keep_logs: bool = False): @@ -104,9 +107,8 @@ class CoreClient: job_id = cluster_conf.cluster_id # Get a verified node agent sku - sku_to_use, image_ref_to_use = \ - helpers.select_latest_verified_vm_image_with_node_agent_sku( - VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, self.batch_client) + sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku( + VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, self.batch_client) network_conf = None if cluster_conf.subnet_id is not None: @@ -130,8 +132,9 @@ class CoreClient: metadata=[ batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), batch_models.MetadataItem( - name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA) - ]) + name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA), + ], + ) # Create the pool + create user for the pool helpers.create_pool_if_not_exist(pool, self.batch_client) @@ -184,13 +187,16 @@ class CoreClient: """ # Create new ssh user for the given node self.batch_client.compute_node.add_user( - pool_id, node_id, + pool_id, + node_id, batch_models.ComputeNodeUser( name=username, is_admin=True, password=password, ssh_public_key=get_ssh_key.get_user_public_key(ssh_key, self.secrets_configuration), - expiry_time=datetime.now(timezone.utc) + timedelta(days=365))) + expiry_time=datetime.now(timezone.utc) + timedelta(days=365), + ), + ) @deprecated("0.10.0") def __delete_user(self, pool_id: str, node_id: str, username: str) -> str: @@ -229,7 +235,7 @@ class CoreClient: def __generate_user_on_node(self, pool_id, node_id): generated_username = secure_utils.generate_random_string() ssh_key = RSA.generate(2048) - ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') + ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8") self.__create_user_on_node(generated_username, pool_id, 
node_id, ssh_pub_key) return generated_username, ssh_key @@ -237,7 +243,7 @@ class CoreClient: def __generate_user_on_pool(self, pool_id, nodes): generated_username = secure_utils.generate_random_string() ssh_key = RSA.generate(2048) - ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') + ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8") with concurrent.futures.ThreadPoolExecutor() as executor: futures = { executor.submit(self.__create_user_on_node, generated_username, pool_id, node.id, ssh_pub_key): node @@ -283,9 +289,10 @@ class CoreClient: generated_username, node_rls.ip_address, node_rls.port, - ssh_key=ssh_key.exportKey().decode('utf-8'), + ssh_key=ssh_key.exportKey().decode("utf-8"), container_name=container_name, - timeout=timeout) + timeout=timeout, + ) return output finally: self.__delete_user(cluster_id, node.id, generated_username) @@ -306,9 +313,10 @@ class CoreClient: command, generated_username, cluster_nodes, - ssh_key=ssh_key.exportKey().decode('utf-8'), + ssh_key=ssh_key.exportKey().decode("utf-8"), container_name=container_name, - timeout=timeout)) + timeout=timeout, + )) return output except OSError as exc: raise exc @@ -316,14 +324,16 @@ class CoreClient: self.__delete_user_on_pool(generated_username, pool.id, nodes) @deprecated("0.10.0") - def __cluster_copy(self, - cluster_id, - source_path, - destination_path=None, - container_name=None, - internal=False, - get=False, - timeout=None): + def __cluster_copy( + self, + cluster_id, + source_path, + destination_path=None, + container_name=None, + internal=False, + get=False, + timeout=None, + ): pool, nodes = self.__get_pool_details(cluster_id) nodes = list(nodes) if internal: @@ -340,9 +350,10 @@ class CoreClient: nodes=cluster_nodes, source_path=source_path, destination_path=destination_path, - ssh_key=ssh_key.exportKey().decode('utf-8'), + ssh_key=ssh_key.exportKey().decode("utf-8"), get=get, - timeout=timeout)) + timeout=timeout, + )) return output except (OSError, batch_error.BatchErrorException) as exc: raise exc @@ -375,8 +386,16 @@ class CoreClient: ) @deprecated("0.10.0") - def __submit_job(self, job_configuration, start_task, job_manager_task, autoscale_formula, - software_metadata_key: str, vm_image_model, application_metadata): + def __submit_job( + self, + job_configuration, + start_task, + job_manager_task, + autoscale_formula, + software_metadata_key: str, + vm_image_model, + application_metadata, + ): """ Job Submission :param job_configuration -> aztk_sdk.spark.models.JobConfiguration @@ -390,9 +409,8 @@ class CoreClient: self._get_cluster_data(job_configuration.id).save_cluster_config(job_configuration.to_cluster_config()) # get a verified node agent sku - sku_to_use, image_ref_to_use = \ - helpers.select_latest_verified_vm_image_with_node_agent_sku( - vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, self.batch_client) + sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku( + vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, self.batch_client) # set up subnet if necessary network_conf = None @@ -419,8 +437,10 @@ class CoreClient: metadata=[ batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), batch_models.MetadataItem( - name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA) - ])) + name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA), + ], + ), + ) # define job specification job_spec = 
batch_models.JobSpecification( @@ -428,7 +448,8 @@ class CoreClient: display_name=job_configuration.id, on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job, job_manager_task=job_manager_task, - metadata=[batch_models.MetadataItem(name='applications', value=application_metadata)]) + metadata=[batch_models.MetadataItem(name="applications", value=application_metadata)], + ) # define schedule schedule = batch_models.Schedule( diff --git a/aztk/client/cluster/helpers/copy.py b/aztk/client/cluster/helpers/copy.py index 4148dd28..223ca23a 100644 --- a/aztk/client/cluster/helpers/copy.py +++ b/aztk/client/cluster/helpers/copy.py @@ -8,14 +8,16 @@ from aztk.utils import ssh as ssh_lib from aztk.utils import helpers -def cluster_copy(cluster_operations, - cluster_id, - source_path, - destination_path=None, - container_name=None, - internal=False, - get=False, - timeout=None): +def cluster_copy( + cluster_operations, + cluster_id, + source_path, + destination_path=None, + container_name=None, + internal=False, + get=False, + timeout=None, +): cluster = cluster_operations.get(cluster_id) pool, nodes = cluster.pool, list(cluster.nodes) if internal: @@ -36,9 +38,10 @@ def cluster_copy(cluster_operations, nodes=cluster_nodes, source_path=source_path, destination_path=destination_path, - ssh_key=ssh_key.exportKey().decode('utf-8'), + ssh_key=ssh_key.exportKey().decode("utf-8"), get=get, - timeout=timeout)) + timeout=timeout, + )) return output except (OSError, batch_error.BatchErrorException) as exc: raise exc diff --git a/aztk/client/cluster/helpers/create.py b/aztk/client/cluster/helpers/create.py index 736c79fa..6ae4f853 100644 --- a/aztk/client/cluster/helpers/create.py +++ b/aztk/client/cluster/helpers/create.py @@ -5,8 +5,13 @@ from aztk import models from aztk.utils import helpers, constants -def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterConfiguration, software_metadata_key: str, - start_task, VmImageModel): +def create_pool_and_job( + core_cluster_operations, + cluster_conf: models.ClusterConfiguration, + software_metadata_key: str, + start_task, + VmImageModel, +): """ Create a pool and job :param cluster_conf: the configuration object used to create the cluster @@ -22,9 +27,8 @@ def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterCon job_id = cluster_conf.cluster_id # Get a verified node agent sku - sku_to_use, image_ref_to_use = \ - helpers.select_latest_verified_vm_image_with_node_agent_sku( - VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, core_cluster_operations.batch_client) + sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku( + VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, core_cluster_operations.batch_client) network_conf = None if cluster_conf.subnet_id is not None: @@ -48,8 +52,9 @@ def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterCon metadata=[ batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), batch_models.MetadataItem( - name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA) - ]) + name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA), + ], + ) # Create the pool + create user for the pool helpers.create_pool_if_not_exist(pool, core_cluster_operations.batch_client) diff --git a/aztk/client/cluster/helpers/delete.py b/aztk/client/cluster/helpers/delete.py index 7f242def..0c6878c0 100644 --- 
a/aztk/client/cluster/helpers/delete.py +++ b/aztk/client/cluster/helpers/delete.py @@ -1,4 +1,7 @@ import azure.batch.models as batch_models +from msrest.exceptions import ClientRequestError + +from aztk.utils import BackOffPolicy, retry def delete_pool_and_job(core_cluster_operations, pool_id: str, keep_logs: bool = False): @@ -19,13 +22,18 @@ def delete_pool_and_job(core_cluster_operations, pool_id: str, keep_logs: bool = pool_exists = core_cluster_operations.batch_client.pool.exists(pool_id) if job_exists: - core_cluster_operations.batch_client.job.delete(job_id) + delete_batch_object(core_cluster_operations.batch_client.job.delete, job_id) if pool_exists: - core_cluster_operations.batch_client.pool.delete(pool_id) + delete_batch_object(core_cluster_operations.batch_client.pool.delete, pool_id) if not keep_logs: cluster_data = core_cluster_operations.get_cluster_data(pool_id) cluster_data.delete_container(pool_id) return job_exists or pool_exists + + +@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) +def delete_batch_object(function, *args, **kwargs): + return function(*args, **kwargs) diff --git a/aztk/client/cluster/helpers/get.py b/aztk/client/cluster/helpers/get.py index 1965a296..d8674afd 100644 --- a/aztk/client/cluster/helpers/get.py +++ b/aztk/client/cluster/helpers/get.py @@ -1,4 +1,4 @@ -#TODO: return Cluster instead of (pool, nodes) +# TODO: return Cluster instead of (pool, nodes) from aztk import models diff --git a/aztk/client/cluster/operations.py b/aztk/client/cluster/operations.py index 5ba831c2..8831ec65 100644 --- a/aztk/client/cluster/operations.py +++ b/aztk/client/cluster/operations.py @@ -13,7 +13,8 @@ class CoreClusterOperations(BaseOperations): cluster_configuration (:obj:`aztk.models.ClusterConfiguration`): Configuration for the cluster to be created software_metadata_key (:obj:`str`): the key for the primary software that will be run on the cluster start_task (:obj:`azure.batch.models.StartTask`): Batch StartTask defintion to configure the Batch Pool - vm_image_model (:obj:`azure.batch.models.VirtualMachineConfiguration`): Configuration of the virtual machine image and settings + vm_image_model (:obj:`azure.batch.models.VirtualMachineConfiguration`): + Configuration of the virtual machine image and settings Returns: :obj:`aztk.models.Cluster`: A Cluster object representing the state and configuration of the cluster. @@ -52,7 +53,8 @@ class CoreClusterOperations(BaseOperations): Defaults to None. Returns: - :obj:`List[aztk.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. + :obj:`List[aztk.models.NodeOutput]`: + A list of NodeOutput objects representing the output of the copy command. """ return copy.cluster_copy(self, id, source_path, destination_path, container_name, internal, get, timeout) @@ -65,7 +67,8 @@ class CoreClusterOperations(BaseOperations): Defaults to False. Returns: - :obj:`List[aztk.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. + :obj:`List[aztk.models.NodeOutput]`: + A list of NodeOutput objects representing the output of the copy command. 
""" return delete.delete_pool_and_job(self, id, keep_logs) diff --git a/aztk/client/job/helpers/submit.py b/aztk/client/job/helpers/submit.py index 1021201f..f5b6380b 100644 --- a/aztk/client/job/helpers/submit.py +++ b/aztk/client/job/helpers/submit.py @@ -1,11 +1,20 @@ from datetime import timedelta import azure.batch.models as batch_models -from aztk.utils import helpers, constants + +from aztk.utils import constants, helpers -def submit_job(job_client, job_configuration, start_task, job_manager_task, autoscale_formula, - software_metadata_key: str, vm_image_model, application_metadata): +def submit_job( + job_client, + job_configuration, + start_task, + job_manager_task, + autoscale_formula, + software_metadata_key: str, + vm_image_model, + application_metadata, +): """ Job Submission :param job_configuration -> aztk_sdk.spark.models.JobConfiguration @@ -19,9 +28,8 @@ def submit_job(job_client, job_configuration, start_task, job_manager_task, auto job_client.get_cluster_data(job_configuration.id).save_cluster_config(job_configuration.to_cluster_config()) # get a verified node agent sku - sku_to_use, image_ref_to_use = \ - helpers.select_latest_verified_vm_image_with_node_agent_sku( - vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, job_client.batch_client) + sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku( + vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, job_client.batch_client) # set up subnet if necessary network_conf = None @@ -48,8 +56,10 @@ def submit_job(job_client, job_configuration, start_task, job_manager_task, auto metadata=[ batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), batch_models.MetadataItem( - name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA) - ])) + name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA), + ], + ), + ) # define job specification job_spec = batch_models.JobSpecification( @@ -57,7 +67,8 @@ def submit_job(job_client, job_configuration, start_task, job_manager_task, auto display_name=job_configuration.id, on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job, job_manager_task=job_manager_task, - metadata=[batch_models.MetadataItem(name='applications', value=application_metadata)]) + metadata=[batch_models.MetadataItem(name="applications", value=application_metadata)], + ) # define schedule schedule = batch_models.Schedule( diff --git a/aztk/client/job/operations.py b/aztk/client/job/operations.py index e0fb1185..36adfd48 100644 --- a/aztk/client/job/operations.py +++ b/aztk/client/job/operations.py @@ -4,8 +4,16 @@ from .helpers import submit class CoreJobOperations(BaseOperations): - def submit(self, job_configuration, start_task, job_manager_task, autoscale_formula, software_metadata_key: str, - vm_image_model, application_metadata): + def submit( + self, + job_configuration, + start_task, + job_manager_task, + autoscale_formula, + software_metadata_key: str, + vm_image_model, + application_metadata, + ): """Submit a job Jobs are a cluster definition and one or many application definitions which run on the cluster. The job's @@ -26,5 +34,13 @@ class CoreJobOperations(BaseOperations): Returns: :obj:`azure.batch.models.CloudJobSchedule`: Model representing the Azure Batch JobSchedule state. 
""" - return submit.submit_job(self, job_configuration, start_task, job_manager_task, autoscale_formula, - software_metadata_key, vm_image_model, application_metadata) + return submit.submit_job( + self, + job_configuration, + start_task, + job_manager_task, + autoscale_formula, + software_metadata_key, + vm_image_model, + application_metadata, + ) diff --git a/aztk/core/models/fields.py b/aztk/core/models/fields.py index f7cf41c4..955b03bd 100644 --- a/aztk/core/models/fields.py +++ b/aztk/core/models/fields.py @@ -35,8 +35,8 @@ class Field: """ def __init__(self, *validators, **kwargs): - self.default = kwargs.get('default') - self.required = 'default' not in kwargs + self.default = kwargs.get("default") + self.required = "default" not in kwargs self.validators = [] if self.required: @@ -44,7 +44,7 @@ class Field: self.validators.extend(validators) - choices = kwargs.get('choices') + choices = kwargs.get("choices") if choices: self.validators.append(aztk_validators.In(choices)) @@ -134,11 +134,11 @@ class List(Field): def __init__(self, model=None, **kwargs): self.model = model - kwargs.setdefault('default', list) - self.merge_strategy = kwargs.get('merge_strategy', ListMergeStrategy.Append) - self.skip_none = kwargs.get('skip_none', True) + kwargs.setdefault("default", list) + self.merge_strategy = kwargs.get("merge_strategy", ListMergeStrategy.Append) + self.skip_none = kwargs.get("skip_none", True) - super().__init__(aztk_validators.List(*kwargs.get('inner_validators', [])), **kwargs) + super().__init__(aztk_validators.List(*kwargs.get("inner_validators", [])), **kwargs) def __set__(self, instance, value): if isinstance(value, collections.MutableSequence): @@ -175,7 +175,7 @@ class List(Field): output = [] if items is not None: for item in items: - if hasattr(item, 'to_dict'): + if hasattr(item, "to_dict"): output.append(item.to_dict()) else: output.append(item) @@ -196,7 +196,7 @@ class Model(Field): super().__init__(aztk_validators.Model(model), *args, **kwargs) self.model = model - self.merge_strategy = kwargs.get('merge_strategy', ModelMergeStrategy.Merge) + self.merge_strategy = kwargs.get("merge_strategy", ModelMergeStrategy.Merge) def __set__(self, instance, value): if isinstance(value, collections.MutableMapping): diff --git a/aztk/core/models/model.py b/aztk/core/models/model.py index 6f016f49..d58a348c 100644 --- a/aztk/core/models/model.py +++ b/aztk/core/models/model.py @@ -11,19 +11,19 @@ class ModelMeta(type): """ def __new__(mcs, name, bases, attrs): - attrs['_fields'] = {} + attrs["_fields"] = {} for base in bases: - if hasattr(base, '_fields'): + if hasattr(base, "_fields"): for k, v in base._fields.items(): - attrs['_fields'][k] = v + attrs["_fields"][k] = v for k, v in base.__dict__.items(): if isinstance(v, fields.Field): - attrs['_fields'][k] = v + attrs["_fields"][k] = v for k, v in attrs.items(): if isinstance(v, fields.Field): - attrs['_fields'][k] = v + attrs["_fields"][k] = v return super().__new__(mcs, name, bases, attrs) @@ -84,7 +84,7 @@ class Model(metaclass=ModelMeta): e.model = self raise e - if hasattr(self, '__validate__'): + if hasattr(self, "__validate__"): self.__validate__() def merge(self, other): diff --git a/aztk/core/models/validators.py b/aztk/core/models/validators.py index 3e050270..37478b97 100644 --- a/aztk/core/models/validators.py +++ b/aztk/core/models/validators.py @@ -24,7 +24,7 @@ class Required(Validator): def validate(self, value): if value is None: - raise InvalidModelFieldError('is required') + raise InvalidModelFieldError("is 
required") class String(Validator): @@ -37,7 +37,7 @@ class String(Validator): return if not isinstance(value, str): - raise InvalidModelFieldError('{0} should be a string'.format(value)) + raise InvalidModelFieldError("{0} should be a string".format(value)) class Integer(Validator): @@ -50,7 +50,7 @@ class Integer(Validator): return if not isinstance(value, int): - raise InvalidModelFieldError('{0} should be an integer'.format(value)) + raise InvalidModelFieldError("{0} should be an integer".format(value)) class Float(Validator): @@ -63,7 +63,7 @@ class Float(Validator): return if not isinstance(value, float): - raise InvalidModelFieldError('{0} should be a float'.format(value)) + raise InvalidModelFieldError("{0} should be a float".format(value)) class Boolean(Validator): @@ -74,7 +74,7 @@ class Boolean(Validator): return if not isinstance(value, bool): - raise InvalidModelFieldError('{0} should be a boolean'.format(value)) + raise InvalidModelFieldError("{0} should be a boolean".format(value)) class In(Validator): @@ -90,7 +90,7 @@ class In(Validator): return if value not in self.choices: - raise InvalidModelFieldError('{0} should be in {1}'.format(value, self.choices)) + raise InvalidModelFieldError("{0} should be in {1}".format(value, self.choices)) class InstanceOf(Validator): @@ -140,7 +140,7 @@ class List(Validator): return if not isinstance(value, collections.MutableSequence): - raise InvalidModelFieldError('should be a list') + raise InvalidModelFieldError("should be a list") for i in value: for validator in self.validators: diff --git a/aztk/internal/cluster_data/blob_data.py b/aztk/internal/cluster_data/blob_data.py index ceaf5f06..6e3fe117 100644 --- a/aztk/internal/cluster_data/blob_data.py +++ b/aztk/internal/cluster_data/blob_data.py @@ -1,6 +1,7 @@ -import azure.batch.models as batch_models import datetime -from azure.storage.blob import BlockBlobService, BlobPermissions + +import azure.batch.models as batch_models +from azure.storage.blob import BlobPermissions, BlockBlobService class BlobData: @@ -19,7 +20,8 @@ class BlobData: self.container, self.blob, permission=BlobPermissions.READ, - expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365)) + expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365), + ) sas_url = self.blob_client.make_blob_url(self.container, self.blob, sas_token=sas_token) diff --git a/aztk/internal/cluster_data/cluster_data.py b/aztk/internal/cluster_data/cluster_data.py index e099c9ce..47928ca1 100644 --- a/aztk/internal/cluster_data/cluster_data.py +++ b/aztk/internal/cluster_data/cluster_data.py @@ -3,8 +3,10 @@ import logging import azure.common import yaml +from msrest.exceptions import ClientRequestError from aztk.models import ClusterConfiguration +from aztk.utils import BackOffPolicy, retry from .blob_data import BlobData from .node_data import NodeData @@ -14,6 +16,7 @@ class ClusterData: """ Class handling the management of data for a cluster """ + # ALl data related to cluster(config, metadata, etc.) 
should be under this folder CLUSTER_DIR = "cluster" APPLICATIONS_DIR = "applications" @@ -24,26 +27,30 @@ class ClusterData: self.cluster_id = cluster_id self._ensure_container() + @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def save_cluster_config(self, cluster_config): blob_path = self.CLUSTER_DIR + "/" + self.CLUSTER_CONFIG_FILE content = yaml.dump(cluster_config) container_name = cluster_config.cluster_id self.blob_client.create_blob_from_text(container_name, blob_path, content) + @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def read_cluster_config(self): blob_path = self.CLUSTER_DIR + "/" + self.CLUSTER_CONFIG_FILE try: result = self.blob_client.get_blob_to_text(self.cluster_id, blob_path) return yaml.load(result.content) except azure.common.AzureMissingResourceHttpError: - logging.warn("Cluster %s doesn't have cluster configuration in storage", self.cluster_id) + logging.warning("Cluster %s doesn't have cluster configuration in storage", self.cluster_id) except yaml.YAMLError: - logging.warn("Cluster %s contains invalid cluster configuration in blob", self.cluster_id) + logging.warning("Cluster %s contains invalid cluster configuration in blob", self.cluster_id) + @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def upload_file(self, blob_path: str, local_path: str) -> BlobData: self.blob_client.create_blob_from_path(self.cluster_id, blob_path, local_path) return BlobData(self.blob_client, self.cluster_id, blob_path) + @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def upload_bytes(self, blob_path: str, bytes_io: io.BytesIO) -> BlobData: self.blob_client.create_blob_from_bytes(self.cluster_id, blob_path, bytes_io.getvalue()) return BlobData(self.blob_client, self.cluster_id, blob_path) @@ -61,8 +68,10 @@ class ClusterData: def upload_node_data(self, node_data: NodeData) -> BlobData: return self.upload_cluster_file("node-scripts.zip", node_data.zip_path) + @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def _ensure_container(self): self.blob_client.create_container(self.cluster_id, fail_on_exist=False) + @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) def delete_container(self, container_name: str): self.blob_client.delete_container(container_name) diff --git a/aztk/internal/cluster_data/node_data.py b/aztk/internal/cluster_data/node_data.py index f09c6047..3b381198 100644 --- a/aztk/internal/cluster_data/node_data.py +++ b/aztk/internal/cluster_data/node_data.py @@ -44,11 +44,11 @@ class NodeData: return if isinstance(file, (str, bytes)): full_file_path = Path(file) - with io.open(file, 'r', encoding='UTF-8') as f: + with io.open(file, "r", encoding="UTF-8") as f: if binary: self.zipf.write(file, os.path.join(zip_dir, full_file_path.name)) else: - self.zipf.writestr(os.path.join(zip_dir, full_file_path.name), f.read().replace('\r\n', '\n')) + self.zipf.writestr(os.path.join(zip_dir, full_file_path.name), f.read().replace("\r\n", "\n")) elif isinstance(file, models.File): self.zipf.writestr(os.path.join(zip_dir, file.name), file.payload.getvalue()) @@ -77,36 +77,38 @@ class NodeData: return self.add_files( [ - spark_configuration.spark_defaults_conf, spark_configuration.spark_env_sh, 
- spark_configuration.core_site_xml + spark_configuration.spark_defaults_conf, + spark_configuration.spark_env_sh, + spark_configuration.core_site_xml, ], - 'conf', - binary=False) + "conf", + binary=False, + ) # add ssh keys for passwordless ssh - self.zipf.writestr('id_rsa.pub', spark_configuration.ssh_key_pair['pub_key']) - self.zipf.writestr('id_rsa', spark_configuration.ssh_key_pair['priv_key']) + self.zipf.writestr("id_rsa.pub", spark_configuration.ssh_key_pair["pub_key"]) + self.zipf.writestr("id_rsa", spark_configuration.ssh_key_pair["priv_key"]) if spark_configuration.jars: for jar in spark_configuration.jars: - self.add_file(jar, 'jars', binary=True) + self.add_file(jar, "jars", binary=True) def _add_user_conf(self): user_conf = self.cluster_config.user_configuration if not user_conf: return encrypted_aes_session_key, cipher_aes_nonce, tag, ciphertext = secure_utils.encrypt_password( - self.cluster_config.spark_configuration.ssh_key_pair['pub_key'], user_conf.password) + self.cluster_config.spark_configuration.ssh_key_pair["pub_key"], user_conf.password) user_conf = yaml.dump({ - 'username': user_conf.username, - 'password': ciphertext, - 'ssh-key': user_conf.ssh_key, - 'aes_session_key': encrypted_aes_session_key, - 'cipher_aes_nonce': cipher_aes_nonce, - 'tag': tag, - 'cluster_id': self.cluster_config.cluster_id + "username": user_conf.username, + "password": ciphertext, + "ssh-key": user_conf.ssh_key, + "aes_session_key": encrypted_aes_session_key, + "cipher_aes_nonce": cipher_aes_nonce, + "tag": tag, + "cluster_id": self.cluster_config.cluster_id, }) - self.zipf.writestr('user.yaml', user_conf) + self.zipf.writestr("user.yaml", user_conf) def _add_plugins(self): if not self.cluster_config.plugins: @@ -115,23 +117,22 @@ class NodeData: data = [] for plugin in self.cluster_config.plugins: for file in plugin.files: - zipf = self.zipf.writestr('plugins/{0}/{1}'.format(plugin.name, file.target), file.content()) + self.zipf.writestr("plugins/{0}/{1}".format(plugin.name, file.target), file.content()) if plugin.execute: data.append( dict( name=plugin.name, - execute='{0}/{1}'.format(plugin.name, plugin.execute), + execute="{0}/{1}".format(plugin.name, plugin.execute), args=plugin.args, env=plugin.env, target=plugin.target.value, target_role=plugin.target_role.value, )) - self.zipf.writestr(os.path.join('plugins', 'plugins-manifest.yaml'), yaml.dump(data)) - return zipf + self.zipf.writestr(os.path.join("plugins", "plugins-manifest.yaml"), yaml.dump(data)) def _add_node_scripts(self): - self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), NODE_SCRIPT_FOLDER, exclude=['*.pyc*', '*.png']) + self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), NODE_SCRIPT_FOLDER, exclude=["*.pyc*", "*.png"]) def _includeFile(self, filename: str, exclude: List[str]) -> bool: exclude = exclude or [] diff --git a/aztk/internal/docker_cmd.py b/aztk/internal/docker_cmd.py index 83eccfb6..ad88e8d3 100644 --- a/aztk/internal/docker_cmd.py +++ b/aztk/internal/docker_cmd.py @@ -1,4 +1,3 @@ -import os from aztk.utils.command_builder import CommandBuilder @@ -9,30 +8,30 @@ class DockerCmd: def __init__(self, name: str, docker_repo: str, docker_run_options: str, cmd: str, gpu_enabled=False): if gpu_enabled: - self.cmd = CommandBuilder('nvidia-docker run') + self.cmd = CommandBuilder("nvidia-docker run") else: - self.cmd = CommandBuilder('docker run') - self.cmd.add_option('--net', 'host') - self.cmd.add_option('--name', name) - self.cmd.add_argument('-d') + self.cmd = CommandBuilder("docker run") + 
self.cmd.add_option("--net", "host") + self.cmd.add_option("--name", name) + self.cmd.add_argument("-d") self.cmd.add_argument(docker_run_options) self.cmd.add_argument(docker_repo) self.cmd.add_argument(cmd) def add_env(self, env: str, value: str): - self.cmd.add_option('-e', '{0}={1}'.format(env, value)) + self.cmd.add_option("-e", "{0}={1}".format(env, value)) def pass_env(self, env: str): """ Give the value of an environment variable in the main process to the docker image """ - self.cmd.add_option('-e', '{0}'.format(env)) + self.cmd.add_option("-e", "{0}".format(env)) def share_folder(self, folder: str): - self.cmd.add_option('-v', '{0}:{0}'.format(folder)) + self.cmd.add_option("-v", "{0}:{0}".format(folder)) def open_port(self, port: int): - self.cmd.add_option('-p', '{0}:{0}'.format(port)) # Spark Master UI + self.cmd.add_option("-p", "{0}:{0}".format(port)) # Spark Master UI def to_str(self): return self.cmd.to_str() diff --git a/aztk/models/application_log.py b/aztk/models/application_log.py index 58c215ec..9da3bb37 100644 --- a/aztk/models/application_log.py +++ b/aztk/models/application_log.py @@ -1,9 +1,16 @@ import azure.batch.models as batch_models -class ApplicationLog(): - def __init__(self, name: str, cluster_id: str, log: str, total_bytes: int, - application_state: batch_models.TaskState, exit_code: int): +class ApplicationLog: + def __init__( + self, + name: str, + cluster_id: str, + log: str, + total_bytes: int, + application_state: batch_models.TaskState, + exit_code: int, + ): self.name = name self.cluster_id = cluster_id # TODO: change to something cluster/job agnostic self.log = log diff --git a/aztk/models/cluster.py b/aztk/models/cluster.py index 24f78394..ca48f887 100644 --- a/aztk/models/cluster.py +++ b/aztk/models/cluster.py @@ -11,10 +11,8 @@ class Cluster: self.visible_state = pool.allocation_state.value else: self.visible_state = pool.state.value - self.total_current_nodes = pool.current_dedicated_nodes + \ - pool.current_low_priority_nodes - self.total_target_nodes = pool.target_dedicated_nodes + \ - pool.target_low_priority_nodes + self.total_current_nodes = pool.current_dedicated_nodes + pool.current_low_priority_nodes + self.total_target_nodes = pool.target_dedicated_nodes + pool.target_low_priority_nodes self.current_dedicated_nodes = pool.current_dedicated_nodes self.current_low_pri_nodes = pool.current_low_priority_nodes self.target_dedicated_nodes = pool.target_dedicated_nodes diff --git a/aztk/models/cluster_configuration.py b/aztk/models/cluster_configuration.py index af52690a..d1429f6b 100644 --- a/aztk/models/cluster_configuration.py +++ b/aztk/models/cluster_configuration.py @@ -61,8 +61,8 @@ class ClusterConfiguration(Model): def __validate__(self) -> bool: if self.size == 0 and self.size_low_priority == 0: raise error.InvalidModelError( - "Please supply a valid (greater than 0) size or size_low_priority value either in the cluster.yaml configuration file or with a parameter (--size or --size-low-priority)" - ) + "Please supply a valid (greater than 0) size or size_low_priority value either " + "in the cluster.yaml configuration file or with a parameter (--size or --size-low-priority)") if self.vm_size is None: raise error.InvalidModelError( @@ -70,8 +70,8 @@ class ClusterConfiguration(Model): if self.mixed_mode() and not self.subnet_id: raise error.InvalidModelError( - "You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes). Set the VNET's subnet_id in your cluster.yaml or with a parameter (--subnet-id)." 
- ) + "You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes). " + "Set the VNET's subnet_id in your cluster.yaml or with a parameter (--subnet-id).") if self.scheduling_target == SchedulingTarget.Dedicated and self.size == 0: raise error.InvalidModelError("Scheduling target cannot be Dedicated if dedicated vm size is 0") diff --git a/aztk/models/plugins/internal/plugin_manager.py b/aztk/models/plugins/internal/plugin_manager.py index 1b4c68c2..2eb99190 100644 --- a/aztk/models/plugins/internal/plugin_manager.py +++ b/aztk/models/plugins/internal/plugin_manager.py @@ -1,7 +1,4 @@ -import os import inspect -import importlib.util -from aztk.utils import constants from aztk.error import InvalidPluginReferenceError from aztk.spark.models import plugins @@ -28,7 +25,8 @@ class PluginManager: nvblas=plugins.NvBLASPlugin, apt_get=plugins.AptGetPlugin, pip_install=plugins.PipPlugin, - conda_install=plugins.CondaPlugin) + conda_install=plugins.CondaPlugin, + ) def __init__(self): self.loaded = False diff --git a/aztk/models/plugins/internal/plugin_reference.py b/aztk/models/plugins/internal/plugin_reference.py index 2c8c1fd8..bc75565d 100644 --- a/aztk/models/plugins/internal/plugin_reference.py +++ b/aztk/models/plugins/internal/plugin_reference.py @@ -50,7 +50,5 @@ class PluginReference(Model): execute=script_filename, target=self.target, target_role=self.target_role or PluginConfiguration, - files=[ - PluginFile(script_filename, self.script), - ], + files=[PluginFile(script_filename, self.script)], ) diff --git a/aztk/models/plugins/plugin_configuration.py b/aztk/models/plugins/plugin_configuration.py index 72b2229b..d3fc4cbb 100644 --- a/aztk/models/plugins/plugin_configuration.py +++ b/aztk/models/plugins/plugin_configuration.py @@ -9,6 +9,7 @@ class PluginTarget(Enum): """ Where this plugin should run """ + SparkContainer = "spark-container" Host = "host" @@ -26,6 +27,7 @@ class PluginPort(Model): :param public: [Optional] Port available to the user. 
If none won't open any port to the user :param name: [Optional] name to differentiate ports if you have multiple """ + internal = fields.Integer() public = fields.Field(default=None) name = fields.Integer() @@ -55,6 +57,7 @@ class PluginConfiguration(Model): args: List of arguments to pass to the executing script env: Dict of environment variables to pass to the script """ + name = fields.String() files = fields.List(PluginFile) execute = fields.String() diff --git a/aztk/models/plugins/plugin_file.py b/aztk/models/plugins/plugin_file.py index 277281c0..6c562825 100644 --- a/aztk/models/plugins/plugin_file.py +++ b/aztk/models/plugins/plugin_file.py @@ -15,7 +15,7 @@ class PluginFile(Model): super().__init__(target=target, local_path=local_path) def content(self): - with open(self.local_path, "r", encoding='UTF-8') as f: + with open(self.local_path, "r", encoding="UTF-8") as f: return f.read() diff --git a/aztk/models/secrets_configuration.py b/aztk/models/secrets_configuration.py index 4559d1cd..c30f9c19 100644 --- a/aztk/models/secrets_configuration.py +++ b/aztk/models/secrets_configuration.py @@ -6,6 +6,7 @@ class ServicePrincipalConfiguration(Model): """ Container class for AAD authentication """ + tenant_id = fields.String() client_id = fields.String() credential = fields.String() @@ -17,6 +18,7 @@ class SharedKeyConfiguration(Model): """ Container class for shared key authentication """ + batch_account_name = fields.String() batch_account_key = fields.String() batch_service_url = fields.String() @@ -34,6 +36,7 @@ class DockerConfiguration(Model): username (str): Docker endpoint username password (str): Docker endpoint password """ + endpoint = fields.String(default=None) username = fields.String(default=None) password = fields.String(default=None) diff --git a/aztk/models/software.py b/aztk/models/software.py index 1e1e2c46..bdf67e7a 100644 --- a/aztk/models/software.py +++ b/aztk/models/software.py @@ -2,4 +2,5 @@ class Software: """ Enum with list of available softwares """ + spark = "spark" diff --git a/aztk/models/ssh_log.py b/aztk/models/ssh_log.py index b0ed1d7a..bd7f3f7f 100644 --- a/aztk/models/ssh_log.py +++ b/aztk/models/ssh_log.py @@ -1,4 +1,4 @@ -class SSHLog(): +class SSHLog: def __init__(self, output, node_id): self.output = output self.node_id = node_id diff --git a/aztk/models/toolkit.py b/aztk/models/toolkit.py index b301d527..406df88b 100644 --- a/aztk/models/toolkit.py +++ b/aztk/models/toolkit.py @@ -25,8 +25,8 @@ TOOLKIT_MAP = dict( r=ToolkitEnvironmentDefinition(), miniconda=ToolkitEnvironmentDefinition(), anaconda=ToolkitEnvironmentDefinition(), - )), -) + ), + )) class Toolkit(Model): @@ -74,12 +74,12 @@ class Toolkit(Model): self.environment, self.environment_version, self.software, env_def.versions)) if self.docker_run_options: - invalid_character = re.search('[^A-Za-z0-9 _./:=\-\"]', self.docker_run_options) + invalid_character = re.search(r'[^A-Za-z0-9 _./:=\-"]', self.docker_run_options) if invalid_character: raise InvalidModelError( "Docker run options contains invalid character '{0}'. 
Only A-Z, a-z, 0-9, space, hyphen (-), " "underscore (_), period (.), forward slash (/), colon (:), equals(=), comma (,), and "
- "double quote (\") are allowed.".format(invalid_character.group(0)))
+ 'double quote (") are allowed.'.format(invalid_character.group(0)))
def get_docker_repo(self, gpu: bool):
if self.docker_repo:
@@ -87,10 +87,7 @@ class Toolkit(Model):
repo = "aztk/{0}".format(self.software)
- return "{repo}:{tag}".format(
- repo=repo,
- tag=self._get_docker_tag(gpu),
- )
+ return "{repo}:{tag}".format(repo=repo, tag=self._get_docker_tag(gpu))
def get_docker_run_options(self):
return self.docker_run_options
@@ -109,7 +106,7 @@ class Toolkit(Model):
array.append("gpu" if gpu else "base")
- return '-'.join(array)
+ return "-".join(array)
def _get_environment_definition(self) -> ToolkitEnvironmentDefinition:
toolkit = TOOLKIT_MAP.get(self.software)
diff --git a/aztk/node_scripts/core/config.py b/aztk/node_scripts/core/config.py
index d4a8efe6..94582056 100644
--- a/aztk/node_scripts/core/config.py
+++ b/aztk/node_scripts/core/config.py
@@ -1,19 +1,20 @@
import os
import re
-import logging
+
+import azure.batch.batch_auth as batchauth
import azure.batch.batch_service_client as batch
import azure.storage.blob as blob
-import azure.batch.batch_auth as batchauth
-from core import log
from azure.common.credentials import ServicePrincipalCredentials
from azure.mgmt.batch import BatchManagementClient
from azure.mgmt.storage import StorageManagementClient
from azure.storage.common import CloudStorageAccount
-RESOURCE_ID_PATTERN = re.compile('^/subscriptions/(?P<subscription>[^/]+)' '/resourceGroups/(?P<resourcegroup>[^/]+)' '/providers/[^/]+' '/[^/]+Accounts/(?P<account>[^/]+)$')
+from core import log
+
+RESOURCE_ID_PATTERN = re.compile("^/subscriptions/(?P<subscription>[^/]+)" "/resourceGroups/(?P<resourcegroup>[^/]+)" "/providers/[^/]+" "/[^/]+Accounts/(?P<account>[^/]+)$")
batch_account_name = os.environ.get("AZ_BATCH_ACCOUNT_NAME")
batch_account_key = os.environ.get("BATCH_ACCOUNT_KEY")
@@ -44,14 +45,14 @@ def get_blob_client() -> blob.BlockBlobService:
account_name=storage_account_name, account_key=storage_account_key, endpoint_suffix=storage_account_suffix)
else:
credentials = ServicePrincipalCredentials(
- client_id=client_id, secret=credential, tenant=tenant_id, resource='https://management.core.windows.net/')
+ client_id=client_id, secret=credential, tenant=tenant_id, resource="https://management.core.windows.net/")
m = RESOURCE_ID_PATTERN.match(storage_resource_id)
- accountname = m.group('account')
- subscription = m.group('subscription')
- resourcegroup = m.group('resourcegroup')
+ accountname = m.group("account")
+ subscription = m.group("subscription")
+ resourcegroup = m.group("resourcegroup")
mgmt_client = StorageManagementClient(credentials, subscription)
- key = mgmt_client.storage_accounts.list_keys(
- resource_group_name=resourcegroup, account_name=accountname).keys[0].value
+ key = (mgmt_client.storage_accounts.list_keys(resource_group_name=resourcegroup, account_name=accountname) .keys[0].value)
storage_client = CloudStorageAccount(accountname, key)
return storage_client.create_block_blob_service()
@@ -62,13 +63,13 @@ def get_batch_client() -> batch.BatchServiceClient:
credentials = batchauth.SharedKeyCredentials(batch_account_name, batch_account_key)
else:
credentials = ServicePrincipalCredentials(
- client_id=client_id, secret=credential, tenant=tenant_id, resource='https://management.core.windows.net/')
+ client_id=client_id, secret=credential, tenant=tenant_id, resource="https://management.core.windows.net/")
m =
RESOURCE_ID_PATTERN.match(batch_resource_id) - batch_client = BatchManagementClient(credentials, m.group('subscription')) - account = batch_client.batch_account.get(m.group('resourcegroup'), m.group('account')) - base_url = 'https://%s/' % account.account_endpoint + batch_client = BatchManagementClient(credentials, m.group("subscription")) + account = batch_client.batch_account.get(m.group("resourcegroup"), m.group("account")) + base_url = "https://%s/" % account.account_endpoint credentials = ServicePrincipalCredentials( - client_id=client_id, secret=credential, tenant=tenant_id, resource='https://batch.core.windows.net/') + client_id=client_id, secret=credential, tenant=tenant_id, resource="https://batch.core.windows.net/") return batch.BatchServiceClient(credentials, base_url=base_url) diff --git a/aztk/node_scripts/core/logger.py b/aztk/node_scripts/core/logger.py index 9e75731d..dab61457 100644 --- a/aztk/node_scripts/core/logger.py +++ b/aztk/node_scripts/core/logger.py @@ -3,7 +3,7 @@ import logging log = logging.getLogger("aztk.node-agent") -DEFAULT_FORMAT = '%(message)s' +DEFAULT_FORMAT = "%(message)s" def setup_logging(): diff --git a/aztk/node_scripts/install/create_user.py b/aztk/node_scripts/install/create_user.py index 0e3f6762..5ba80b92 100644 --- a/aztk/node_scripts/install/create_user.py +++ b/aztk/node_scripts/install/create_user.py @@ -5,45 +5,47 @@ from Cryptodome.PublicKey import RSA from Cryptodome.Cipher import AES, PKCS1_OAEP from datetime import datetime, timezone, timedelta import yaml -''' +""" Creates a user if the user configuration file at $AZTK_WORKING_DIR/user.yaml exists -''' +""" def create_user(batch_client): - path = os.path.join(os.environ['AZTK_WORKING_DIR'], "user.yaml") + path = os.path.join(os.environ["AZTK_WORKING_DIR"], "user.yaml") if not os.path.isfile(path): print("No user to create.") return - with open(path, 'r', encoding='UTF-8') as file: + with open(path, "r", encoding="UTF-8") as file: user_conf = yaml.load(file.read()) try: - password = None if user_conf['ssh-key'] else decrypt_password(user_conf) + password = None if user_conf["ssh-key"] else decrypt_password(user_conf) batch_client.compute_node.add_user( - pool_id=os.environ['AZ_BATCH_POOL_ID'], - node_id=os.environ['AZ_BATCH_NODE_ID'], + pool_id=os.environ["AZ_BATCH_POOL_ID"], + node_id=os.environ["AZ_BATCH_NODE_ID"], user=batch_models.ComputeNodeUser( - name=user_conf['username'], + name=user_conf["username"], is_admin=True, password=password, - ssh_public_key=str(user_conf['ssh-key']), - expiry_time=datetime.now(timezone.utc) + timedelta(days=365))) + ssh_public_key=str(user_conf["ssh-key"]), + expiry_time=datetime.now(timezone.utc) + timedelta(days=365), + ), + ) except batch_error.BatchErrorException as e: print(e) def decrypt_password(user_conf): - cipher_text = user_conf['password'] - encrypted_aes_session_key = user_conf['aes_session_key'] - cipher_aes_nonce = user_conf['cipher_aes_nonce'] - tag = user_conf['tag'] + cipher_text = user_conf["password"] + encrypted_aes_session_key = user_conf["aes_session_key"] + cipher_aes_nonce = user_conf["cipher_aes_nonce"] + tag = user_conf["tag"] # Read private key - with open(os.path.join(os.environ['AZTK_WORKING_DIR'], 'id_rsa'), encoding='UTF-8') as f: + with open(os.path.join(os.environ["AZTK_WORKING_DIR"], "id_rsa"), encoding="UTF-8") as f: private_key = RSA.import_key(f.read()) # Decrypt the session key with the public RSA key cipher_rsa = PKCS1_OAEP.new(private_key) diff --git a/aztk/node_scripts/install/install.py 
b/aztk/node_scripts/install/install.py index 7ea7c61c..d441baa9 100644 --- a/aztk/node_scripts/install/install.py +++ b/aztk/node_scripts/install/install.py @@ -25,7 +25,7 @@ def setup_host(docker_repo: str, docker_run_options: str): client = config.batch_client create_user.create_user(batch_client=client) - if os.environ['AZ_BATCH_NODE_IS_DEDICATED'] == "true" or os.environ['AZTK_MIXED_MODE'] == "false": + if os.environ["AZ_BATCH_NODE_IS_DEDICATED"] == "true" or os.environ["AZTK_MIXED_MODE"] == "false": is_master = pick_master.find_master(client) else: is_master = False @@ -50,7 +50,7 @@ def setup_host(docker_repo: str, docker_run_options: str): setup_node_scheduling(client, cluster_conf, is_master) - #TODO pass azure file shares + # TODO pass azure file shares spark_container.start_spark_container( docker_repo=docker_repo, docker_run_options=docker_run_options, @@ -82,4 +82,4 @@ def setup_spark_container(): plugins.setup_plugins(target=PluginTarget.SparkContainer, is_master=is_master, is_worker=is_worker) - open("/tmp/setup_complete", 'a').close() + open("/tmp/setup_complete", "a").close() diff --git a/aztk/node_scripts/install/pick_master.py b/aztk/node_scripts/install/pick_master.py index 0fca6b19..5114acd0 100644 --- a/aztk/node_scripts/install/pick_master.py +++ b/aztk/node_scripts/install/pick_master.py @@ -37,8 +37,8 @@ def try_assign_self_as_master(client: batch.BatchServiceClient, pool: batchmodel client.pool.patch( config.pool_id, batchmodels.PoolPatchParameter(metadata=new_metadata), - batchmodels.PoolPatchOptions(if_match=pool.e_tag, - )) + batchmodels.PoolPatchOptions(if_match=pool.e_tag), + ) return True except (batcherror.BatchErrorException, ClientRequestError): print("Couldn't assign itself as master the pool because the pool was modified since last get.") diff --git a/aztk/node_scripts/install/plugins.py b/aztk/node_scripts/install/plugins.py index f9734511..d342a033 100644 --- a/aztk/node_scripts/install/plugins.py +++ b/aztk/node_scripts/install/plugins.py @@ -1,18 +1,19 @@ -import os import json -import yaml +import os import subprocess -from pathlib import Path + +import yaml + from aztk.models.plugins import PluginTarget, PluginTargetRole -log_folder = os.path.join(os.environ['AZTK_WORKING_DIR'], 'logs', 'plugins') +log_folder = os.path.join(os.environ["AZTK_WORKING_DIR"], "logs", "plugins") def _read_manifest_file(path=None): if not os.path.isfile(path): print("Plugins manifest file doesn't exist at {0}".format(path)) else: - with open(path, 'r', encoding='UTF-8') as stream: + with open(path, "r", encoding="UTF-8") as stream: try: return yaml.load(stream) except json.JSONDecodeError as err: @@ -22,7 +23,7 @@ def _read_manifest_file(path=None): def setup_plugins(target: PluginTarget, is_master: bool = False, is_worker: bool = False): plugins_dir = _plugins_dir() - plugins_manifest = _read_manifest_file(os.path.join(plugins_dir, 'plugins-manifest.yaml')) + plugins_manifest = _read_manifest_file(os.path.join(plugins_dir, "plugins-manifest.yaml")) if not os.path.exists(log_folder): os.makedirs(log_folder) @@ -32,28 +33,41 @@ def setup_plugins(target: PluginTarget, is_master: bool = False, is_worker: bool def _plugins_dir(): - return os.path.join(os.environ['AZTK_WORKING_DIR'], 'plugins') + return os.path.join(os.environ["AZTK_WORKING_DIR"], "plugins") def _run_on_this_node(plugin_obj, target: PluginTarget, is_master, is_worker): - print("Loading plugin {} in {} on {}".format(plugin_obj["execute"], plugin_obj['target'], - plugin_obj['target_role'])) + print("Loading 
plugin {} in {} on {}".format(plugin_obj["execute"], plugin_obj["target"], + plugin_obj["target_role"])) - if plugin_obj['target'] != target.value: - print("Ignoring ", plugin_obj["execute"], "as target is for ", plugin_obj['target'], - "but is currently running in ", target.value) + if plugin_obj["target"] != target.value: + print( + "Ignoring ", + plugin_obj["execute"], + "as target is for ", + plugin_obj["target"], + "but is currently running in ", + target.value, + ) return False - if plugin_obj['target_role'] == PluginTargetRole.Master.value and is_master is True: + if plugin_obj["target_role"] == PluginTargetRole.Master.value and is_master is True: return True - if plugin_obj['target_role'] == PluginTargetRole.Worker.value and is_worker is True: + if plugin_obj["target_role"] == PluginTargetRole.Worker.value and is_worker is True: return True - if plugin_obj['target_role'] == PluginTargetRole.All.value: + if plugin_obj["target_role"] == PluginTargetRole.All.value: return True - print("Ignoring plugin", plugin_obj["execute"], "as target role is ", plugin_obj['target_role'], - "and node is master: ", is_master, is_worker) + print( + "Ignoring plugin", + plugin_obj["execute"], + "as target role is ", + plugin_obj["target_role"], + "and node is master: ", + is_master, + is_worker, + ) return False @@ -63,8 +77,8 @@ def _setup_plugins(plugins_manifest, target: PluginTarget, is_master, is_worker) for plugin in plugins_manifest: if _run_on_this_node(plugin, target, is_master, is_worker): - path = os.path.join(plugins_dir, plugin['execute']) - _run_script(plugin.get("name"), path, plugin.get('args'), plugin.get('env')) + path = os.path.join(plugins_dir, plugin["execute"]) + _run_script(plugin.get("name"), path, plugin.get("args"), plugin.get("env")) def _run_script(name: str, script_path: str = None, args: dict = None, env: dict = None): @@ -84,7 +98,7 @@ def _run_script(name: str, script_path: str = None, args: dict = None, env: dict if args is None: args = [] - out_file = open(os.path.join(log_folder, '{0}.txt'.format(name)), 'w', encoding='UTF-8') + out_file = open(os.path.join(log_folder, "{0}.txt".format(name)), "w", encoding="UTF-8") try: subprocess.call([script_path] + args, env=my_env, stdout=out_file, stderr=out_file) print("Finished running") diff --git a/aztk/node_scripts/install/spark.py b/aztk/node_scripts/install/spark.py index 4ce6967f..bd3911d9 100644 --- a/aztk/node_scripts/install/spark.py +++ b/aztk/node_scripts/install/spark.py @@ -2,13 +2,14 @@ Code that handle spark configuration """ import datetime -import time import os -import json import shutil -from subprocess import call, Popen, check_output +import time +from subprocess import call from typing import List + import azure.batch.models as batchmodels + from core import config from install import pick_master @@ -55,7 +56,7 @@ def setup_connection(): master_node = get_node(master_node_id) master_config_file = os.path.join(spark_conf_folder, "master") - master_file = open(master_config_file, 'w', encoding='UTF-8') + master_file = open(master_config_file, "w", encoding="UTF-8") print("Adding master node ip {0} to config file '{1}'".format(master_node.ip_address, master_config_file)) master_file.write("{0}\n".format(master_node.ip_address)) @@ -127,9 +128,9 @@ def setup_conf(): def setup_ssh_keys(): - pub_key_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'id_rsa.pub') - priv_key_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'id_rsa') - ssh_key_dest = '/root/.ssh' + pub_key_path_src = 
os.path.join(os.environ["AZTK_WORKING_DIR"], "id_rsa.pub") + priv_key_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "id_rsa") + ssh_key_dest = "/root/.ssh" if not os.path.exists(ssh_key_dest): os.mkdir(ssh_key_dest) @@ -139,27 +140,27 @@ def setup_ssh_keys(): def copy_spark_env(): - spark_env_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'conf/spark-env.sh') - spark_env_path_dest = os.path.join(spark_home, 'conf/spark-env.sh') + spark_env_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "conf/spark-env.sh") + spark_env_path_dest = os.path.join(spark_home, "conf/spark-env.sh") copyfile(spark_env_path_src, spark_env_path_dest) def copy_spark_defaults(): - spark_default_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'conf/spark-defaults.conf') - spark_default_path_dest = os.path.join(spark_home, 'conf/spark-defaults.conf') + spark_default_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "conf/spark-defaults.conf") + spark_default_path_dest = os.path.join(spark_home, "conf/spark-defaults.conf") copyfile(spark_default_path_src, spark_default_path_dest) def copy_core_site(): - spark_core_site_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'conf/core-site.xml') - spark_core_site_dest = os.path.join(spark_home, 'conf/core-site.xml') + spark_core_site_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "conf/core-site.xml") + spark_core_site_dest = os.path.join(spark_home, "conf/core-site.xml") copyfile(spark_core_site_src, spark_core_site_dest) def copy_jars(): # Copy jars to $SPARK_HOME/jars - spark_default_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'jars') - spark_default_path_dest = os.path.join(spark_home, 'jars') + spark_default_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "jars") + spark_default_path_dest = os.path.join(spark_home, "jars") try: jar_files = os.listdir(spark_default_path_src) @@ -175,10 +176,10 @@ def copy_jars(): def parse_configuration_file(path_to_file: str): try: - file = open(path_to_file, 'r', encoding='UTF-8') + file = open(path_to_file, "r", encoding="UTF-8") properties = {} for line in file: - if (not line.startswith('#') and len(line) > 1): + if not line.startswith("#") and len(line) > 1: split = line.split() properties[split[0]] = split[1] return properties @@ -189,10 +190,10 @@ def parse_configuration_file(path_to_file: str): def start_history_server(): # configure the history server - spark_event_log_enabled_key = 'spark.eventLog.enabled' - spark_event_log_directory_key = 'spark.eventLog.dir' - spark_history_fs_log_directory = 'spark.history.fs.logDirectory' - path_to_spark_defaults_conf = os.path.join(spark_home, 'conf/spark-defaults.conf') + spark_event_log_enabled_key = "spark.eventLog.enabled" + spark_event_log_directory_key = "spark.eventLog.dir" + spark_history_fs_log_directory = "spark.history.fs.logDirectory" + path_to_spark_defaults_conf = os.path.join(spark_home, "conf/spark-defaults.conf") properties = parse_configuration_file(path_to_spark_defaults_conf) required_keys = [spark_event_log_enabled_key, spark_event_log_directory_key, spark_history_fs_log_directory] @@ -208,17 +209,17 @@ def start_history_server(): def configure_history_server_log_path(path_to_log_file): # Check if the file path starts with a local file extension # If so, create the path on disk otherwise ignore - print('Configuring spark history server log directory {}.'.format(path_to_log_file)) - if path_to_log_file.startswith('file:/'): + print("Configuring spark history server log directory 
{}.".format(path_to_log_file)) + if path_to_log_file.startswith("file:/"): # create the local path on disk - directory = path_to_log_file.replace('file:', '') + directory = path_to_log_file.replace("file:", "") if os.path.exists(directory): - print('Skipping. Directory {} already exists.'.format(directory)) + print("Skipping. Directory {} already exists.".format(directory)) else: - print('Create directory {}.'.format(directory)) + print("Create directory {}.".format(directory)) os.makedirs(directory) # Make sure the directory can be accessed by all users os.chmod(directory, mode=0o777) else: - print('Skipping. The eventLog directory is not local.') + print("Skipping. The eventLog directory is not local.") diff --git a/aztk/node_scripts/install/spark_container.py b/aztk/node_scripts/install/spark_container.py index 84565f52..222bc476 100644 --- a/aztk/node_scripts/install/spark_container.py +++ b/aztk/node_scripts/install/spark_container.py @@ -15,42 +15,43 @@ def start_spark_container(docker_repo: str = None, docker_repo=docker_repo, docker_run_options=docker_run_options, cmd="/bin/bash /mnt/batch/tasks/startup/wd/aztk/node_scripts/docker_main.sh", - gpu_enabled=gpu_enabled) + gpu_enabled=gpu_enabled, + ) if file_mounts: for mount in file_mounts: cmd.share_folder(mount.mount_path) - cmd.share_folder('/mnt') + cmd.share_folder("/mnt") - cmd.pass_env('AZTK_WORKING_DIR') - cmd.pass_env('AZ_BATCH_ACCOUNT_NAME') - cmd.pass_env('BATCH_ACCOUNT_KEY') - cmd.pass_env('BATCH_SERVICE_URL') - cmd.pass_env('STORAGE_ACCOUNT_NAME') - cmd.pass_env('STORAGE_ACCOUNT_KEY') - cmd.pass_env('STORAGE_ACCOUNT_SUFFIX') + cmd.pass_env("AZTK_WORKING_DIR") + cmd.pass_env("AZ_BATCH_ACCOUNT_NAME") + cmd.pass_env("BATCH_ACCOUNT_KEY") + cmd.pass_env("BATCH_SERVICE_URL") + cmd.pass_env("STORAGE_ACCOUNT_NAME") + cmd.pass_env("STORAGE_ACCOUNT_KEY") + cmd.pass_env("STORAGE_ACCOUNT_SUFFIX") - cmd.pass_env('SP_TENANT_ID') - cmd.pass_env('SP_CLIENT_ID') - cmd.pass_env('SP_CREDENTIAL') - cmd.pass_env('SP_BATCH_RESOURCE_ID') - cmd.pass_env('SP_STORAGE_RESOURCE_ID') + cmd.pass_env("SP_TENANT_ID") + cmd.pass_env("SP_CLIENT_ID") + cmd.pass_env("SP_CREDENTIAL") + cmd.pass_env("SP_BATCH_RESOURCE_ID") + cmd.pass_env("SP_STORAGE_RESOURCE_ID") - cmd.pass_env('AZ_BATCH_POOL_ID') - cmd.pass_env('AZ_BATCH_NODE_ID') - cmd.pass_env('AZ_BATCH_NODE_IS_DEDICATED') + cmd.pass_env("AZ_BATCH_POOL_ID") + cmd.pass_env("AZ_BATCH_NODE_ID") + cmd.pass_env("AZ_BATCH_NODE_IS_DEDICATED") - cmd.pass_env('AZTK_WORKER_ON_MASTER') - cmd.pass_env('AZTK_MIXED_MODE') - cmd.pass_env('AZTK_IS_MASTER') - cmd.pass_env('AZTK_IS_WORKER') - cmd.pass_env('AZTK_MASTER_IP') + cmd.pass_env("AZTK_WORKER_ON_MASTER") + cmd.pass_env("AZTK_MIXED_MODE") + cmd.pass_env("AZTK_IS_MASTER") + cmd.pass_env("AZTK_IS_WORKER") + cmd.pass_env("AZTK_MASTER_IP") - cmd.pass_env('SPARK_WEB_UI_PORT') - cmd.pass_env('SPARK_WORKER_UI_PORT') - cmd.pass_env('SPARK_CONTAINER_NAME') - cmd.pass_env('SPARK_SUBMIT_LOGS_FILE') - cmd.pass_env('SPARK_JOB_UI_PORT') + cmd.pass_env("SPARK_WEB_UI_PORT") + cmd.pass_env("SPARK_WORKER_UI_PORT") + cmd.pass_env("SPARK_CONTAINER_NAME") + cmd.pass_env("SPARK_SUBMIT_LOGS_FILE") + cmd.pass_env("SPARK_JOB_UI_PORT") cmd.open_port(8080) # Spark Master UI cmd.open_port(7077) # Spark Master @@ -69,5 +70,5 @@ def start_spark_container(docker_repo: str = None, print("-" * 60) print(cmd.to_str()) print("=" * 60) - subprocess.call(['/bin/bash', '-c', 'echo Is master?: $AZTK_IS_MASTER _ $AZTK_IS_WORKER']) - subprocess.call(['/bin/bash', '-c', cmd.to_str()]) + 
subprocess.call(["/bin/bash", "-c", "echo Is master?: $AZTK_IS_MASTER _ $AZTK_IS_WORKER"]) + subprocess.call(["/bin/bash", "-c", cmd.to_str()]) diff --git a/aztk/node_scripts/job_submission.py b/aztk/node_scripts/job_submission.py index 434b7345..d4bd4e4b 100644 --- a/aztk/node_scripts/job_submission.py +++ b/aztk/node_scripts/job_submission.py @@ -1,12 +1,8 @@ -import datetime import os -import subprocess -import sys -from typing import List + import azure.batch.models as batch_models -import azure.storage.blob as blob import yaml -from aztk.utils.command_builder import CommandBuilder + from core import config from install.pick_master import get_master_node_id @@ -20,14 +16,13 @@ def affinitize_task_to_master(batch_client, cluster_id, task): def schedule_tasks(tasks_path): - ''' + """ Handle the request to submit a task - ''' + """ batch_client = config.batch_client - blob_client = config.blob_client for task_definition in tasks_path: - with open(task_definition, 'r', encoding='UTF-8') as stream: + with open(task_definition, "r", encoding="UTF-8") as stream: try: task = yaml.load(stream) except yaml.YAMLError as exc: @@ -36,13 +31,13 @@ def schedule_tasks(tasks_path): # affinitize task to master task = affinitize_task_to_master(batch_client, os.environ["AZ_BATCH_POOL_ID"], task) # schedule the task - batch_client.task.add(job_id=os.environ['AZ_BATCH_JOB_ID'], task=task) + batch_client.task.add(job_id=os.environ["AZ_BATCH_JOB_ID"], task=task) if __name__ == "__main__": tasks_path = [] - for file in os.listdir(os.environ['AZ_BATCH_TASK_WORKING_DIR']): + for file in os.listdir(os.environ["AZ_BATCH_TASK_WORKING_DIR"]): if file.endswith(".yaml"): - tasks_path.append(os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], file)) + tasks_path.append(os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], file)) schedule_tasks(tasks_path) diff --git a/aztk/node_scripts/setup_host.sh b/aztk/node_scripts/setup_host.sh index 2a8dbfca..4a075291 100644 --- a/aztk/node_scripts/setup_host.sh +++ b/aztk/node_scripts/setup_host.sh @@ -42,7 +42,9 @@ install_prerequisites () { install_docker_compose () { echo "Installing Docker-Compose" - sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose + for i in {1..5}; do + sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose && break || sleep 2; + done sudo chmod +x /usr/local/bin/docker-compose echo "Finished installing Docker-Compose" } @@ -64,9 +66,9 @@ pull_docker_container () { install_python_dependencies () { echo "Installing python dependencies" pipenv install --python /usr/bin/python3.5m - pipenv run pip install --upgrade setuptools wheel #TODO: add pip when pipenv is compatible with pip10 + pipenv run pip install --upgrade pip setuptools wheel + pip --version echo "Finished installing python dependencies" - } run_docker_container () { diff --git a/aztk/node_scripts/submit.py b/aztk/node_scripts/submit.py index 730cca28..fb64aa5c 100644 --- a/aztk/node_scripts/submit.py +++ b/aztk/node_scripts/submit.py @@ -1,20 +1,22 @@ -import sys -import os -import logging -import yaml -import subprocess import datetime +import logging +import os +import subprocess +import sys from typing import List -import azure.storage.blob as blob + import azure.batch.models as batch_models +import azure.storage.blob as blob +import yaml + from aztk.utils.command_builder import CommandBuilder from core import 
config # limit azure.storage logging logging.getLogger("azure.storage").setLevel(logging.CRITICAL) -''' +""" Submit helper methods -''' +""" def upload_file_to_container(container_name, @@ -40,7 +42,7 @@ def upload_file_to_container(container_name, blob_name = file_path.strip("/") else: blob_name = os.path.basename(file_path) - blob_path = application_name + '/' + blob_name + blob_path = application_name + "/" + blob_name if not node_path: node_path = blob_name @@ -53,47 +55,60 @@ def upload_file_to_container(container_name, container_name, blob_path, permission=blob.BlobPermissions.READ, - expiry=datetime.datetime.utcnow() + datetime.timedelta(days=7)) + expiry=datetime.datetime.utcnow() + datetime.timedelta(days=7), + ) sas_url = blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token) return batch_models.ResourceFile(file_path=node_path, blob_source=sas_url) -def __app_submit_cmd(name: str, app: str, app_args: List[str], main_class: str, jars: List[str], py_files: List[str], - files: List[str], driver_java_options: str, driver_library_path: str, driver_class_path: str, - driver_memory: str, executor_memory: str, driver_cores: int, executor_cores: int): - cluster_id = os.environ['AZ_BATCH_POOL_ID'] - spark_home = os.environ['SPARK_HOME'] - with open(os.path.join(spark_home, 'conf', 'master')) as f: +def __app_submit_cmd( + name: str, + app: str, + app_args: List[str], + main_class: str, + jars: List[str], + py_files: List[str], + files: List[str], + driver_java_options: str, + driver_library_path: str, + driver_class_path: str, + driver_memory: str, + executor_memory: str, + driver_cores: int, + executor_cores: int, +): + spark_home = os.environ["SPARK_HOME"] + with open(os.path.join(spark_home, "conf", "master")) as f: master_ip = f.read().rstrip() # set file paths to correct path on container - files_path = os.environ['AZ_BATCH_TASK_WORKING_DIR'] + files_path = os.environ["AZ_BATCH_TASK_WORKING_DIR"] jars = [os.path.join(files_path, os.path.basename(jar)) for jar in jars] py_files = [os.path.join(files_path, os.path.basename(py_file)) for py_file in py_files] files = [os.path.join(files_path, os.path.basename(f)) for f in files] # 2>&1 redirect stdout and stderr to be in the same file - spark_submit_cmd = CommandBuilder('{0}/bin/spark-submit'.format(spark_home)) - spark_submit_cmd.add_option('--master', 'spark://{0}:7077'.format(master_ip)) - spark_submit_cmd.add_option('--name', name) - spark_submit_cmd.add_option('--class', main_class) - spark_submit_cmd.add_option('--jars', jars and ','.join(jars)) - spark_submit_cmd.add_option('--py-files', py_files and ','.join(py_files)) - spark_submit_cmd.add_option('--files', files and ','.join(files)) - spark_submit_cmd.add_option('--driver-java-options', driver_java_options) - spark_submit_cmd.add_option('--driver-library-path', driver_library_path) - spark_submit_cmd.add_option('--driver-class-path', driver_class_path) - spark_submit_cmd.add_option('--driver-memory', driver_memory) - spark_submit_cmd.add_option('--executor-memory', executor_memory) + spark_submit_cmd = CommandBuilder("{0}/bin/spark-submit".format(spark_home)) + spark_submit_cmd.add_option("--master", "spark://{0}:7077".format(master_ip)) + spark_submit_cmd.add_option("--name", name) + spark_submit_cmd.add_option("--class", main_class) + spark_submit_cmd.add_option("--jars", jars and ",".join(jars)) + spark_submit_cmd.add_option("--py-files", py_files and ",".join(py_files)) + spark_submit_cmd.add_option("--files", files and ",".join(files)) + 
spark_submit_cmd.add_option("--driver-java-options", driver_java_options) + spark_submit_cmd.add_option("--driver-library-path", driver_library_path) + spark_submit_cmd.add_option("--driver-class-path", driver_class_path) + spark_submit_cmd.add_option("--driver-memory", driver_memory) + spark_submit_cmd.add_option("--executor-memory", executor_memory) if driver_cores: - spark_submit_cmd.add_option('--driver-cores', str(driver_cores)) + spark_submit_cmd.add_option("--driver-cores", str(driver_cores)) if executor_cores: - spark_submit_cmd.add_option('--executor-cores', str(executor_cores)) + spark_submit_cmd.add_option("--executor-cores", str(executor_cores)) spark_submit_cmd.add_argument( - os.path.expandvars(app) + ' ' + ' '.join(['\'' + str(app_arg) + '\'' for app_arg in (app_args or [])])) + os.path.expandvars(app) + " " + " ".join(["'" + str(app_arg) + "'" for app_arg in (app_args or [])])) with open("spark-submit.txt", mode="w", encoding="UTF-8") as stream: stream.write(spark_submit_cmd.to_str()) @@ -102,50 +117,51 @@ def __app_submit_cmd(name: str, app: str, app_args: List[str], main_class: str, def load_application(application_file_path): - ''' + """ Read and parse the application from file - ''' - with open(application_file_path, encoding='UTF-8') as f: + """ + with open(application_file_path, encoding="UTF-8") as f: application = yaml.load(f) return application def upload_log(blob_client, application): - ''' + """ upload output.log to storage account - ''' - log_file = os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], os.environ['SPARK_SUBMIT_LOGS_FILE']) + """ + log_file = os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], os.environ["SPARK_SUBMIT_LOGS_FILE"]) upload_file_to_container( - container_name=os.environ['STORAGE_LOGS_CONTAINER'], - application_name=application['name'], + container_name=os.environ["STORAGE_LOGS_CONTAINER"], + application_name=application["name"], file_path=log_file, blob_client=blob_client, - use_full_path=False) + use_full_path=False, + ) def receive_submit_request(application_file_path): - ''' + """ Handle the request to submit a task - ''' - batch_client = config.batch_client + """ blob_client = config.blob_client application = load_application(application_file_path) cmd = __app_submit_cmd( - name=application['name'], - app=application['application'], - app_args=application['application_args'], - main_class=application['main_class'], - jars=application['jars'], - py_files=application['py_files'], - files=application['files'], - driver_java_options=application['driver_java_options'], - driver_library_path=application['driver_library_path'], - driver_class_path=application['driver_class_path'], - driver_memory=application['driver_memory'], - executor_memory=application['executor_memory'], - driver_cores=application['driver_cores'], - executor_cores=application['executor_cores']) + name=application["name"], + app=application["application"], + app_args=application["application_args"], + main_class=application["main_class"], + jars=application["jars"], + py_files=application["py_files"], + files=application["files"], + driver_java_options=application["driver_java_options"], + driver_library_path=application["driver_library_path"], + driver_class_path=application["driver_class_path"], + driver_memory=application["driver_memory"], + executor_memory=application["executor_memory"], + driver_cores=application["driver_cores"], + executor_cores=application["executor_cores"], + ) return_code = subprocess.call(cmd.to_str(), shell=True) upload_log(blob_client, 
application) @@ -157,24 +173,25 @@ def upload_error_log(error, application_file_path): blob_client = config.blob_client error_log_path = os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "error.log") - with open(error_log_path, "w", encoding='UTF-8') as error_log: + with open(error_log_path, "w", encoding="UTF-8") as error_log: error_log.write(error) upload_file_to_container( - container_name=os.environ['STORAGE_LOGS_CONTAINER'], - application_name=application['name'], + container_name=os.environ["STORAGE_LOGS_CONTAINER"], + application_name=application["name"], file_path=os.path.realpath(error_log.name), blob_client=blob_client, - use_full_path=False) + use_full_path=False, + ) upload_log(blob_client, application) if __name__ == "__main__": return_code = 1 try: - return_code = receive_submit_request(os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], 'application.yaml')) + return_code = receive_submit_request(os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "application.yaml")) except Exception as e: - upload_error_log(str(e), os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], 'application.yaml')) + upload_error_log(str(e), os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "application.yaml")) # force batch task exit code to match spark exit code sys.exit(return_code) diff --git a/aztk/node_scripts/wait_until_setup_complete.py b/aztk/node_scripts/wait_until_setup_complete.py index 19b0ad71..ca9c8d8f 100644 --- a/aztk/node_scripts/wait_until_setup_complete.py +++ b/aztk/node_scripts/wait_until_setup_complete.py @@ -1,8 +1,8 @@ import time import os -while not os.path.exists('/tmp/setup_complete'): +while not os.path.exists("/tmp/setup_complete"): time.sleep(1) print("SETUP FINISHED") -os.remove('/tmp/setup_complete') +os.remove("/tmp/setup_complete") diff --git a/aztk/spark/client/base/helpers/generate_application_task.py b/aztk/spark/client/base/helpers/generate_application_task.py index 183adfd4..b6559f2b 100644 --- a/aztk/spark/client/base/helpers/generate_application_task.py +++ b/aztk/spark/client/base/helpers/generate_application_task.py @@ -17,12 +17,13 @@ def generate_application_task(core_base_operations, container_id, application, r application_name=application.name, file_path=application.application, blob_client=core_base_operations.blob_client, - use_full_path=False) + use_full_path=False, + ) # Upload application file resource_files.append(app_resource_file) - application.application = '$AZ_BATCH_TASK_WORKING_DIR/' + os.path.basename(application.application) + application.application = "$AZ_BATCH_TASK_WORKING_DIR/" + os.path.basename(application.application) # Upload dependent JARS jar_resource_file_paths = [] @@ -32,7 +33,8 @@ def generate_application_task(core_base_operations, container_id, application, r application_name=application.name, file_path=jar, blob_client=core_base_operations.blob_client, - use_full_path=False) + use_full_path=False, + ) jar_resource_file_paths.append(current_jar_resource_file_path) resource_files.append(current_jar_resource_file_path) @@ -44,7 +46,8 @@ def generate_application_task(core_base_operations, container_id, application, r application_name=application.name, file_path=py_file, blob_client=core_base_operations.blob_client, - use_full_path=False) + use_full_path=False, + ) py_files_resource_file_paths.append(current_py_files_resource_file_path) resource_files.append(current_py_files_resource_file_path) @@ -56,7 +59,8 @@ def generate_application_task(core_base_operations, container_id, application, r 
application_name=application.name, file_path=file, blob_client=core_base_operations.blob_client, - use_full_path=False) + use_full_path=False, + ) files_resource_file_paths.append(files_resource_file_path) resource_files.append(files_resource_file_path) @@ -67,21 +71,23 @@ def generate_application_task(core_base_operations, container_id, application, r application_definition_file = helpers.upload_text_to_container( container_name=container_id, application_name=application.name, - file_path='application.yaml', + file_path="application.yaml", content=yaml.dump(vars(application)), - blob_client=core_base_operations.blob_client) + blob_client=core_base_operations.blob_client, + ) resource_files.append(application_definition_file) # create command to submit task - task_cmd = CommandBuilder('sudo docker exec') - task_cmd.add_argument('-i') - task_cmd.add_option('-e', 'AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR') - task_cmd.add_option('-e', 'STORAGE_LOGS_CONTAINER={0}'.format(container_id)) - task_cmd.add_argument('spark /bin/bash >> output.log 2>&1') - task_cmd.add_argument('-c "source ~/.bashrc; ' \ - 'export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; ' \ - 'cd \$AZ_BATCH_TASK_WORKING_DIR; ' \ - '\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"') + task_cmd = CommandBuilder("sudo docker exec") + task_cmd.add_argument("-i") + task_cmd.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR") + task_cmd.add_option("-e", "STORAGE_LOGS_CONTAINER={0}".format(container_id)) + task_cmd.add_argument("spark /bin/bash >> output.log 2>&1") + task_cmd.add_argument( + r'-c "source ~/.bashrc; ' + r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; " + r"cd \$AZ_BATCH_TASK_WORKING_DIR; " + r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"') # Create task task = batch_models.TaskAddParameter( @@ -91,6 +97,7 @@ def generate_application_task(core_base_operations, container_id, application, r constraints=batch_models.TaskConstraints(max_task_retry_count=application.max_retry_count), user_identity=batch_models.UserIdentity( auto_user=batch_models.AutoUserSpecification( - scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin))) + scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)), + ) return task diff --git a/aztk/spark/client/base/helpers/generate_cluster_start_task.py b/aztk/spark/client/base/helpers/generate_cluster_start_task.py index e3670b95..c4eb23c0 100644 --- a/aztk/spark/client/base/helpers/generate_cluster_start_task.py +++ b/aztk/spark/client/base/helpers/generate_cluster_start_task.py @@ -1,14 +1,9 @@ from typing import List import azure.batch.models as batch_models -import azure.batch.models.batch_error as batch_error -from aztk import error -from aztk.internal.cluster_data import NodeData from aztk.spark import models -from aztk.spark.utils import util from aztk.utils import constants, helpers -from aztk.spark import models POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity( auto_user=batch_models.AutoUserSpecification( @@ -60,14 +55,13 @@ def __get_secrets_env(core_base_operations): ] -def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, - gpu_enabled: bool, - docker_repo: str = None, - docker_run_options: str = None, - plugins=None, - worker_on_master: bool = True, - file_mounts=None, - mixed_mode: bool = False): +def __cluster_install_cmd( + zip_resource_file: batch_models.ResourceFile, + 
gpu_enabled: bool, + docker_repo: str = None, + docker_run_options: str = None, + file_mounts=None, +): """ For Docker on ubuntu 16.04 - return the command line to be run on the start task of the pool to setup spark. @@ -80,41 +74,42 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, if file_mounts: for mount in file_mounts: # Create the directory on the node - shares.append('mkdir -p {0}'.format(mount.mount_path)) + shares.append("mkdir -p {0}".format(mount.mount_path)) # Mount the file share shares.append( - 'mount -t cifs //{0}.file.core.windows.net/{2} {3} -o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp'. + "mount -t cifs //{0}.file.core.windows.net/{2} {3} -o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp". format(mount.storage_account_name, mount.storage_account_key, mount.file_share_path, mount.mount_path)) setup = [ - 'time('\ - 'apt-get -y update;'\ - 'apt-get -y --no-install-recommends install unzip;'\ - 'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\ - 'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\ - ') 2>&1'.format(zip_resource_file.file_path), + "time(" + "apt-get -y update;" + "apt-get -y --no-install-recommends install unzip;" + "unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};" + "chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;" + ") 2>&1".format(zip_resource_file.file_path), '/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1} "{2}"'.format( constants.DOCKER_SPARK_CONTAINER_NAME, docker_repo, - "" if docker_run_options is None else docker_run_options.replace('"', '\\\"') - ) + "" if docker_run_options is None else docker_run_options.replace('"', '\\"'), + ), ] commands = shares + setup return commands -def generate_cluster_start_task(core_base_operations, - zip_resource_file: batch_models.ResourceFile, - cluster_id: str, - gpu_enabled: bool, - docker_repo: str = None, - docker_run_options: str = None, - file_shares: List[models.FileShare] = None, - plugins: List[models.PluginConfiguration] = None, - mixed_mode: bool = False, - worker_on_master: bool = True): +def generate_cluster_start_task( + core_base_operations, + zip_resource_file: batch_models.ResourceFile, + cluster_id: str, + gpu_enabled: bool, + docker_repo: str = None, + docker_run_options: str = None, + file_shares: List[models.FileShare] = None, + mixed_mode: bool = False, + worker_on_master: bool = True, +): """ This will return the start task object for the pool to be created. 
:param cluster_id str: Id of the cluster(Used for uploading the resource files) @@ -130,22 +125,23 @@ def generate_cluster_start_task(core_base_operations, spark_submit_logs_file = constants.SPARK_SUBMIT_LOGS_FILE # TODO use certificate - environment_settings = __get_secrets_env(core_base_operations) + [ + environment_settings = (__get_secrets_env(core_base_operations) + [ batch_models.EnvironmentSetting(name="SPARK_WEB_UI_PORT", value=spark_web_ui_port), batch_models.EnvironmentSetting(name="SPARK_WORKER_UI_PORT", value=spark_worker_ui_port), batch_models.EnvironmentSetting(name="SPARK_JOB_UI_PORT", value=spark_job_ui_port), batch_models.EnvironmentSetting(name="SPARK_CONTAINER_NAME", value=spark_container_name), batch_models.EnvironmentSetting(name="SPARK_SUBMIT_LOGS_FILE", value=spark_submit_logs_file), batch_models.EnvironmentSetting(name="AZTK_GPU_ENABLED", value=helpers.bool_env(gpu_enabled)), - ] + __get_docker_credentials(core_base_operations) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode) + ] + __get_docker_credentials(core_base_operations) + _get_aztk_environment(cluster_id, worker_on_master, + mixed_mode)) # start task command - command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, docker_run_options, plugins, - worker_on_master, file_shares, mixed_mode) + command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, docker_run_options, file_shares) return batch_models.StartTask( command_line=helpers.wrap_commands_in_shell(command), resource_files=resource_files, environment_settings=environment_settings, user_identity=POOL_ADMIN_USER_IDENTITY, - wait_for_success=True) + wait_for_success=True, + ) diff --git a/aztk/spark/client/base/operations.py b/aztk/spark/client/base/operations.py index ef95e75d..9c4bf375 100644 --- a/aztk/spark/client/base/operations.py +++ b/aztk/spark/client/base/operations.py @@ -2,7 +2,6 @@ from typing import List import azure.batch.models as batch_models -from aztk.client.base import BaseOperations as CoreBaseOperations from aztk.spark import models from .helpers import generate_application_task, generate_cluster_start_task @@ -12,18 +11,19 @@ class SparkBaseOperations: """Spark Base operations object that all other Spark operations objects inherit from """ - #TODO: make this private or otherwise not public - def _generate_cluster_start_task(self, - core_base_operations, - zip_resource_file: batch_models.ResourceFile, - id: str, - gpu_enabled: bool, - docker_repo: str = None, - docker_run_options: str = None, - file_shares: List[models.FileShare] = None, - plugins: List[models.PluginConfiguration] = None, - mixed_mode: bool = False, - worker_on_master: bool = True): + # TODO: make this private or otherwise not public + def _generate_cluster_start_task( + self, + core_base_operations, + zip_resource_file: batch_models.ResourceFile, + id: str, + gpu_enabled: bool, + docker_repo: str = None, + docker_run_options: str = None, + file_shares: List[models.FileShare] = None, + mixed_mode: bool = False, + worker_on_master: bool = True, + ): """Generate the Azure Batch Start Task to provision a Spark cluster. Args: @@ -35,10 +35,8 @@ class SparkBaseOperations: If None, the default Docker image will be used. Defaults to None. file_shares (:obj:`aztk.spark.models.FileShare`, optional): a list of FileShares to mount on the cluster. Defaults to None. - plugins (:obj:`aztk.spark.models.PluginConfiguration`, optional): a list of plugins to set up on the cluster. - Defaults to None. 
- mixed_mode (:obj:`bool`, optional): If True, the cluster is configured to use both dedicated and low priority VMs. - Defaults to False. + mixed_mode (:obj:`bool`, optional): If True, the cluster is configured to use both dedicated + and low priority VMs. Defaults to False. worker_on_master (:obj:`bool`, optional): If True, the cluster is configured to provision a Spark worker on the VM that runs the Spark master. Defaults to True. @@ -46,10 +44,18 @@ class SparkBaseOperations: :obj:`azure.batch.models.StartTask`: the StartTask definition to provision the cluster. """ return generate_cluster_start_task.generate_cluster_start_task( - core_base_operations, zip_resource_file, id, gpu_enabled, docker_repo, docker_run_options, file_shares, - plugins, mixed_mode, worker_on_master) + core_base_operations, + zip_resource_file, + id, + gpu_enabled, + docker_repo, + docker_run_options, + file_shares, + mixed_mode, + worker_on_master, + ) - #TODO: make this private or otherwise not public + # TODO: make this private or otherwise not public def _generate_application_task(self, core_base_operations, container_id, application, remote=False): """Generate the Azure Batch Start Task to provision a Spark cluster. diff --git a/aztk/spark/client/client.py b/aztk/spark/client/client.py index 01743b6c..a78c7f81 100644 --- a/aztk/spark/client/client.py +++ b/aztk/spark/client/client.py @@ -2,21 +2,15 @@ from typing import List import azure.batch.models.batch_error as batch_error -import aztk from aztk import error from aztk import models as base_models from aztk.client import CoreClient -from aztk.internal.cluster_data import NodeData from aztk.spark import models from aztk.spark.client.cluster import ClusterOperations from aztk.spark.client.job import JobOperations -from aztk.spark.helpers import cluster_diagnostic_helper -from aztk.spark.helpers import create_cluster as create_cluster_helper -from aztk.spark.helpers import get_log as get_log_helper from aztk.spark.helpers import job_submission as job_submit_helper -from aztk.spark.helpers import submit as cluster_submit_helper from aztk.spark.utils import util -from aztk.utils import azure_api, deprecated, deprecate, helpers +from aztk.utils import deprecate, deprecated, helpers class Client(CoreClient): @@ -28,13 +22,14 @@ class Client(CoreClient): """ def __init__(self, secrets_configuration: models.SecretsConfiguration = None, **kwargs): - self.secrets_configuration = None + super().__init__() context = None if kwargs.get("secrets_config"): deprecate( version="0.10.0", message="secrets_config key is deprecated in secrets.yaml", - advice="Please use secrets_configuration key instead.") + advice="Please use secrets_configuration key instead.", + ) context = self._get_context(kwargs.get("secrets_config")) else: context = self._get_context(secrets_configuration) @@ -133,36 +128,42 @@ class Client(CoreClient): id=cluster_id, node_id=node_id, command=command, host=host, internal=internal, timeout=timeout) @deprecated("0.10.0") - def cluster_copy(self, - cluster_id: str, - source_path: str, - destination_path: str, - host: bool = False, - internal: bool = False, - timeout: int = None): + def cluster_copy( + self, + cluster_id: str, + source_path: str, + destination_path: str, + host: bool = False, + internal: bool = False, + timeout: int = None, + ): return self.cluster.copy( id=cluster_id, source_path=source_path, destination_path=destination_path, host=host, internal=internal, - timeout=timeout) + timeout=timeout, + ) @deprecated("0.10.0") - def 
cluster_download(self, - cluster_id: str, - source_path: str, - destination_path: str = None, - host: bool = False, - internal: bool = False, - timeout: int = None): + def cluster_download( + self, + cluster_id: str, + source_path: str, + destination_path: str = None, + host: bool = False, + internal: bool = False, + timeout: int = None, + ): return self.cluster.download( id=cluster_id, source_path=source_path, destination_path=destination_path, host=host, internal=internal, - timeout=timeout) + timeout=timeout, + ) @deprecated("0.10.0") def cluster_ssh_into_master(self, @@ -176,9 +177,9 @@ class Client(CoreClient): return self.cluster._core_cluster_operations.ssh_into_node(cluster_id, node_id, username, ssh_key, password, port_forward_list, internal) - ''' + """ job submission - ''' + """ @deprecated("0.10.0") def submit_job(self, job_configuration: models.JobConfiguration): diff --git a/aztk/spark/client/cluster/helpers/copy.py b/aztk/spark/client/cluster/helpers/copy.py index 2b9cdbc0..84d41363 100644 --- a/aztk/spark/client/cluster/helpers/copy.py +++ b/aztk/spark/client/cluster/helpers/copy.py @@ -4,15 +4,17 @@ from aztk import error from aztk.utils import helpers -def cluster_copy(core_cluster_operations, - cluster_id: str, - source_path: str, - destination_path: str, - host: bool = False, - internal: bool = False, - timeout: int = None): +def cluster_copy( + core_cluster_operations, + cluster_id: str, + source_path: str, + destination_path: str, + host: bool = False, + internal: bool = False, + timeout: int = None, +): try: - container_name = None if host else 'spark' + container_name = None if host else "spark" return core_cluster_operations.copy( cluster_id, source_path, @@ -20,6 +22,7 @@ def cluster_copy(core_cluster_operations, container_name=container_name, get=False, internal=internal, - timeout=timeout) + timeout=timeout, + ) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/create.py b/aztk/spark/client/cluster/helpers/create.py index e72b0880..27206b38 100644 --- a/aztk/spark/client/cluster/helpers/create.py +++ b/aztk/spark/client/cluster/helpers/create.py @@ -52,9 +52,16 @@ def create_cluster(core_cluster_operations, zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file() start_task = spark_cluster_operations._generate_cluster_start_task( - core_cluster_operations, zip_resource_files, cluster_conf.cluster_id, cluster_conf.gpu_enabled(), - cluster_conf.get_docker_repo(), cluster_conf.get_docker_run_options(), cluster_conf.file_shares, - cluster_conf.plugins, cluster_conf.mixed_mode(), cluster_conf.worker_on_master) + core_cluster_operations, + zip_resource_files, + cluster_conf.cluster_id, + cluster_conf.gpu_enabled(), + cluster_conf.get_docker_repo(), + cluster_conf.get_docker_run_options(), + cluster_conf.file_shares, + cluster_conf.mixed_mode(), + cluster_conf.worker_on_master, + ) software_metadata_key = base_models.Software.spark diff --git a/aztk/spark/client/cluster/helpers/create_user.py b/aztk/spark/client/cluster/helpers/create_user.py index 452bb640..f55b201f 100644 --- a/aztk/spark/client/cluster/helpers/create_user.py +++ b/aztk/spark/client/cluster/helpers/create_user.py @@ -4,12 +4,14 @@ from aztk import error from aztk.utils import helpers -def create_user(core_cluster_operations, - spark_cluster_operations, - cluster_id: str, - username: str, - password: str = None, - ssh_key: str = None) -> str: +def create_user( + 
core_cluster_operations, + spark_cluster_operations, + cluster_id: str, + username: str, + password: str = None, + ssh_key: str = None, +) -> str: try: cluster = spark_cluster_operations.get(cluster_id) master_node_id = cluster.master_node_id diff --git a/aztk/spark/client/cluster/helpers/diagnostics.py b/aztk/spark/client/cluster/helpers/diagnostics.py index 830df7d6..469872f4 100644 --- a/aztk/spark/client/cluster/helpers/diagnostics.py +++ b/aztk/spark/client/cluster/helpers/diagnostics.py @@ -6,18 +6,13 @@ from aztk import error from aztk.utils import helpers -def _write_error(stream, node_output): - stream.write(node_output.error) - - -def _write_output(stream, node_output): - stream.write(node_output.output) - - def _run(spark_cluster_operations, cluster_id, output_directory=None, brief=False): # copy debug program to each node - output = spark_cluster_operations.copy( + copy_output = spark_cluster_operations.copy( cluster_id, os.path.abspath("./aztk/spark/utils/debug.py"), "/tmp/debug.py", host=True) + for node_output in copy_output: + if node_output.error: + raise error.AztkError("Failed to copy diagnostic script to cluster.") ssh_cmd = _build_diagnostic_ssh_command(brief) run_output = spark_cluster_operations.run(cluster_id, ssh_cmd, host=True) remote_path = "/tmp/debug.zip" @@ -27,9 +22,9 @@ def _run(spark_cluster_operations, cluster_id, output_directory=None, brief=Fals result = spark_cluster_operations.download(cluster_id, remote_path, local_path, host=True) # write run output or error to debug/ directory - with open(os.path.join(output_directory, "debug-output.txt"), 'w', encoding="UTF-8") as stream: + with open(os.path.join(output_directory, "debug-output.txt"), "w", encoding="UTF-8") as stream: for node_output in run_output: - _write_error(stream, node_output) if node_output.error else _write_output(stream, node_output) + stream.write(node_output.error) if node_output.error else stream.write(node_output.output) else: result = spark_cluster_operations.download(cluster_id, remote_path, host=True) @@ -37,11 +32,11 @@ def _run(spark_cluster_operations, cluster_id, output_directory=None, brief=Fals def _build_diagnostic_ssh_command(brief): - return "sudo rm -rf /tmp/debug.zip; "\ - "sudo apt-get install -y python3-pip; "\ - "sudo -H pip3 install --upgrade pip; "\ - "sudo -H pip3 install docker; "\ - "sudo python3 /tmp/debug.py {}".format(brief) + return ("sudo rm -rf /tmp/debug.zip; " + "sudo apt-get install -y python3-pip; " + "sudo -H pip3 install --upgrade pip; " + "sudo -H pip3 install docker; " + "sudo python3 /tmp/debug.py {}".format(brief)) def run_cluster_diagnostics(spark_cluster_operations, cluster_id, output_directory=None, brief=False): diff --git a/aztk/spark/client/cluster/helpers/download.py b/aztk/spark/client/cluster/helpers/download.py index a4fca666..2c4367a4 100644 --- a/aztk/spark/client/cluster/helpers/download.py +++ b/aztk/spark/client/cluster/helpers/download.py @@ -4,15 +4,17 @@ from aztk import error from aztk.utils import helpers -def cluster_download(core_cluster_operations, - cluster_id: str, - source_path: str, - destination_path: str = None, - host: bool = False, - internal: bool = False, - timeout: int = None): +def cluster_download( + core_cluster_operations, + cluster_id: str, + source_path: str, + destination_path: str = None, + host: bool = False, + internal: bool = False, + timeout: int = None, +): try: - container_name = None if host else 'spark' + container_name = None if host else "spark" return core_cluster_operations.copy( cluster_id, 
source_path, @@ -20,6 +22,7 @@ def cluster_download(core_cluster_operations, container_name=container_name, get=True, internal=internal, - timeout=timeout) + timeout=timeout, + ) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/get_application_status.py b/aztk/spark/client/cluster/helpers/get_application_status.py index 4dc19106..d8458db5 100644 --- a/aztk/spark/client/cluster/helpers/get_application_status.py +++ b/aztk/spark/client/cluster/helpers/get_application_status.py @@ -7,6 +7,6 @@ from aztk.utils import helpers def get_application_status(core_cluster_operations, cluster_id: str, app_name: str): try: task = core_cluster_operations.batch_client.task.get(cluster_id, app_name) - return task.state._value_ + return task.state.name except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/node_run.py b/aztk/spark/client/cluster/helpers/node_run.py index 623bb57d..c19c8067 100644 --- a/aztk/spark/client/cluster/helpers/node_run.py +++ b/aztk/spark/client/cluster/helpers/node_run.py @@ -4,15 +4,17 @@ from aztk import error from aztk.utils import helpers -def node_run(core_cluster_operations, - cluster_id: str, - node_id: str, - command: str, - host=False, - internal: bool = False, - timeout=None): +def node_run( + core_cluster_operations, + cluster_id: str, + node_id: str, + command: str, + host=False, + internal: bool = False, + timeout=None, +): try: return core_cluster_operations.node_run( - cluster_id, node_id, command, internal, container_name='spark' if not host else None, timeout=timeout) + cluster_id, node_id, command, internal, container_name="spark" if not host else None, timeout=timeout) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/run.py b/aztk/spark/client/cluster/helpers/run.py index 0cdc820c..03d12293 100644 --- a/aztk/spark/client/cluster/helpers/run.py +++ b/aztk/spark/client/cluster/helpers/run.py @@ -12,6 +12,6 @@ def cluster_run(core_cluster_operations, timeout=None): try: return core_cluster_operations.run( - cluster_id, command, internal, container_name='spark' if not host else None, timeout=timeout) + cluster_id, command, internal, container_name="spark" if not host else None, timeout=timeout) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/ssh_into_master.py b/aztk/spark/client/cluster/helpers/ssh_into_master.py index 029bc81d..d57e6c97 100644 --- a/aztk/spark/client/cluster/helpers/ssh_into_master.py +++ b/aztk/spark/client/cluster/helpers/ssh_into_master.py @@ -4,16 +4,19 @@ from aztk import error from aztk.utils import helpers -def cluster_ssh_into_master(spark_cluster_operations, - cluster_id, - node_id, - username, - ssh_key=None, - password=None, - port_forward_list=None, - internal=False): +def ssh_into_master( + spark_cluster_operations, + core_cluster_operations, + cluster_id, + username, + ssh_key=None, + password=None, + port_forward_list=None, + internal=False, +): try: - spark_cluster_operations.ssh_into_node(cluster_id, node_id, username, ssh_key, password, port_forward_list, - internal) + master_node_id = spark_cluster_operations.get(cluster_id).master_node_id + core_cluster_operations.ssh_into_node(cluster_id, master_node_id, username, ssh_key, password, + 
port_forward_list, internal) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/submit.py b/aztk/spark/client/cluster/helpers/submit.py index ea1d3f04..ac1f9144 100644 --- a/aztk/spark/client/cluster/helpers/submit.py +++ b/aztk/spark/client/cluster/helpers/submit.py @@ -42,12 +42,14 @@ def submit_application(core_cluster_operations, job_id=job_id, task_id=task.id, batch_client=core_cluster_operations.batch_client) -def submit(core_cluster_operations, - spark_cluster_operations, - cluster_id: str, - application: models.ApplicationConfiguration, - remote: bool = False, - wait: bool = False): +def submit( + core_cluster_operations, + spark_cluster_operations, + cluster_id: str, + application: models.ApplicationConfiguration, + remote: bool = False, + wait: bool = False, +): try: submit_application(core_cluster_operations, spark_cluster_operations, cluster_id, application, remote, wait) except batch_error.BatchErrorException as e: diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index 37e26c22..7cc2ca64 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -2,9 +2,25 @@ from aztk.client.cluster import CoreClusterOperations from aztk.spark import models from aztk.spark.client.base import SparkBaseOperations -from .helpers import (copy, create, create_user, delete, diagnostics, download, get, get_application_log, - get_application_status, get_configuration, get_remote_login_settings, list, node_run, run, submit, - wait) +from .helpers import ( + copy, + create, + create_user, + delete, + diagnostics, + download, + get, + get_application_log, + get_application_status, + get_configuration, + get_remote_login_settings, + list, + node_run, + run, + ssh_into_master, + submit, + wait, +) class ClusterOperations(SparkBaseOperations): @@ -58,7 +74,8 @@ class ClusterOperations(SparkBaseOperations): """List all clusters. Returns: - :obj:`List[aztk.spark.models.Cluster]`: List of Cluster objects each representing the state and configuration of the cluster. + :obj:`List[aztk.spark.models.Cluster]`: List of Cluster objects each representing the state + and configuration of the cluster. """ return list.list_clusters(self._core_cluster_operations) @@ -71,7 +88,8 @@ class ClusterOperations(SparkBaseOperations): remote (:obj:`bool`): If True, the application file will not be uploaded, it is assumed to be reachable by the cluster already. This is useful when your application is stored in a mounted Azure File Share and not the client. Defaults to False. - wait (:obj:`bool`, optional): If True, this function blocks until the application has completed. Defaults to False. + wait (:obj:`bool`, optional): If True, this function blocks until the application has completed. + Defaults to False. Returns: :obj:`None` @@ -84,7 +102,8 @@ class ClusterOperations(SparkBaseOperations): Args: username (:obj:`str`): name of the user to create. pool_id (:obj:`str`): id of the cluster to create the user on. - ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None. + ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. + Defaults to None. password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None. Returns: @@ -118,7 +137,8 @@ class ClusterOperations(SparkBaseOperations): Defaults to None. 
Returns: - :obj:`List[aztk.spark.models.NodeOutput]`: list of NodeOutput objects containing the output of the run command + :obj:`List[aztk.spark.models.NodeOutput]`: + list of NodeOutput objects containing the output of the run command """ return run.cluster_run(self._core_cluster_operations, id, command, host, internal, timeout) @@ -141,13 +161,15 @@ class ClusterOperations(SparkBaseOperations): """ return node_run.node_run(self._core_cluster_operations, id, node_id, command, host, internal, timeout) - def copy(self, - id: str, - source_path: str, - destination_path: str, - host: bool = False, - internal: bool = False, - timeout: int = None): + def copy( + self, + id: str, + source_path: str, + destination_path: str, + host: bool = False, + internal: bool = False, + timeout: int = None, + ): """Copy a file to every node in a cluster. Args: @@ -162,18 +184,21 @@ class ClusterOperations(SparkBaseOperations): Defaults to None. Returns: - :obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. + :obj:`List[aztk.spark.models.NodeOutput]`: + A list of NodeOutput objects representing the output of the copy command. """ return copy.cluster_copy(self._core_cluster_operations, id, source_path, destination_path, host, internal, timeout) - def download(self, - id: str, - source_path: str, - destination_path: str = None, - host: bool = False, - internal: bool = False, - timeout: int = None): + def download( + self, + id: str, + source_path: str, + destination_path: str = None, + host: bool = False, + internal: bool = False, + timeout: int = None, + ): """Download a file from every node in a cluster. Args: @@ -190,7 +215,8 @@ class ClusterOperations(SparkBaseOperations): Defaults to None. Returns: - :obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. + :obj:`List[aztk.spark.models.NodeOutput]`: + A list of NodeOutput objects representing the output of the copy command. """ return download.cluster_download(self._core_cluster_operations, id, source_path, destination_path, host, internal, timeout) @@ -205,7 +231,8 @@ class ClusterOperations(SparkBaseOperations): written to this path. Defaults to None. Returns: - :obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. + :obj:`List[aztk.spark.models.NodeOutput]`: + A list of NodeOutput objects representing the output of the copy command. """ return diagnostics.run_cluster_diagnostics(self, id, output_directory, brief) @@ -215,10 +242,11 @@ class ClusterOperations(SparkBaseOperations): Args: id (:obj:`str`): the id of the cluster to run the command on. application_name (:obj:`str`): str - tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes. Otherwise, the whole log will be retrieved. - Only use this if streaming the log as it is being written. Defaults to False. - current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes are retrieved. - Only useful is streaming the log as it is being written. Only used if tail is True. + tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes. + Otherwise, the whole log will be retrieved. Only use this if streaming the log as it is being written. + Defaults to False. + current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes are + retrieved. Only useful if streaming the log as it is being written.
Only used if tail is True. Returns: :obj:`aztk.spark.models.ApplicationLog`: a model representing the output of the application. @@ -234,7 +262,8 @@ class ClusterOperations(SparkBaseOperations): node_id (:obj:`str`): the id of the node in the cluster Returns: - :obj:`aztk.spark.models.RemoteLogin`: Object that contains the ip address and port combination to login to a node + :obj:`aztk.spark.models.RemoteLogin`: + Object that contains the ip address and port combination to login to a node """ return get_remote_login_settings.get_remote_login_settings(self._core_cluster_operations, id, node_id) @@ -260,3 +289,21 @@ class ClusterOperations(SparkBaseOperations): :obj:`aztk.spark.models.ClusterConfiguration` """ return get_configuration.get_configuration(self._core_cluster_operations, id) + + def ssh_into_master(self, id, username, ssh_key=None, password=None, port_forward_list=None, internal=False): + """Open an SSH tunnel to the Spark master node and forward the specified ports. + + Args: + id (:obj:`str`): the id of the cluster + username (:obj:`str`): the name of the user to open the ssh session with + ssh_key (:obj:`str`, optional): the ssh_key to authenticate the ssh user with. + Must specify either `ssh_key` or `password`. + password (:obj:`str`, optional): the password to authenticate the ssh user with. + Must specify either `password` or `ssh_key`. + port_forward_list (:obj:`aztk.spark.models.PortForwardingSpecification`, optional): + List of the ports to forward. + internal (:obj:`bool`, optional): if True, this will connect to the node using its internal IP. + Only use this if running within the same VNET as the cluster. Defaults to False. + """ + return ssh_into_master.ssh_into_master(self, self._core_cluster_operations, id, username, ssh_key, password, + port_forward_list, internal) diff --git a/aztk/spark/client/job/helpers/delete.py b/aztk/spark/client/job/helpers/delete.py index e2ad8be2..c133a628 100644 --- a/aztk/spark/client/job/helpers/delete.py +++ b/aztk/spark/client/job/helpers/delete.py @@ -2,7 +2,6 @@ import azure.batch.models as batch_models import azure.batch.models.batch_error as batch_error from aztk import error -from aztk.spark import models from aztk.utils import helpers from .get_recent_job import get_recent_job diff --git a/aztk/spark/client/job/helpers/get_application_log.py b/aztk/spark/client/job/helpers/get_application_log.py index 8c1855d9..a6f7b895 100644 --- a/aztk/spark/client/job/helpers/get_application_log.py +++ b/aztk/spark/client/job/helpers/get_application_log.py @@ -5,7 +5,6 @@ from aztk import error from aztk.spark import models from aztk.utils import helpers -from .list_applications import list_applications from .get_recent_job import get_recent_job @@ -25,8 +24,11 @@ def _get_application_log(core_job_operations, spark_job_operations, job_id, appl raise error.AztkError("The application {0} has not yet been created.".format(application)) raise error.AztkError("The application {0} does not exist".format(application_name)) else: - if task.state in (batch_models.TaskState.active, batch_models.TaskState.running, - batch_models.TaskState.preparing): + if task.state in ( + batch_models.TaskState.active, + batch_models.TaskState.running, + batch_models.TaskState.preparing, + ): raise error.AztkError("The application {0} has not yet finished executing.".format(application_name)) return core_job_operations.get_application_log(job_id, application_name) diff --git a/aztk/spark/client/job/helpers/list_applications.py
b/aztk/spark/client/job/helpers/list_applications.py index 81dab6cc..5ba28d03 100644 --- a/aztk/spark/client/job/helpers/list_applications.py +++ b/aztk/spark/client/job/helpers/list_applications.py @@ -13,7 +13,7 @@ def _list_applications(core_job_operations, job_id): applications = {} for metadata_item in recent_run_job.metadata: if metadata_item.name == "applications": - for app_name in metadata_item.value.split('\n'): + for app_name in metadata_item.value.split("\n"): applications[app_name] = None # get tasks from Batch job diff --git a/aztk/spark/client/job/helpers/stop.py b/aztk/spark/client/job/helpers/stop.py index 8fd7660e..eb6ce919 100644 --- a/aztk/spark/client/job/helpers/stop.py +++ b/aztk/spark/client/job/helpers/stop.py @@ -1,7 +1,6 @@ import azure.batch.models.batch_error as batch_error from aztk import error -from aztk.spark import models from aztk.utils import helpers from .get_recent_job import get_recent_job diff --git a/aztk/spark/client/job/helpers/stop_application.py b/aztk/spark/client/job/helpers/stop_application.py index b2a72fcf..aa41ad79 100644 --- a/aztk/spark/client/job/helpers/stop_application.py +++ b/aztk/spark/client/job/helpers/stop_application.py @@ -1,8 +1,5 @@ import azure.batch.models.batch_error as batch_error -from aztk import error -from aztk.spark import models -from aztk.utils import helpers from .get_recent_job import get_recent_job diff --git a/aztk/spark/client/job/helpers/submit.py b/aztk/spark/client/job/helpers/submit.py index 98e8af67..f79986d1 100644 --- a/aztk/spark/client/job/helpers/submit.py +++ b/aztk/spark/client/job/helpers/submit.py @@ -15,11 +15,12 @@ def __app_cmd(): docker_exec.add_argument("-i") docker_exec.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR") docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID") - docker_exec.add_argument("spark /bin/bash >> output.log 2>&1 -c \"" \ - "source ~/.bashrc; " \ - "export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; " \ - "cd \$AZ_BATCH_TASK_WORKING_DIR; " \ - "\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py\"") + docker_exec.add_argument( + r'spark /bin/bash >> output.log 2>&1 -c "' + r"source ~/.bashrc; " + r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; " + r"cd \$AZ_BATCH_TASK_WORKING_DIR; " + r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py"') return docker_exec.to_str() @@ -28,10 +29,11 @@ def generate_job_manager_task(core_job_operations, job, application_tasks): for application, task in application_tasks: task_definition_resource_file = helpers.upload_text_to_container( container_name=job.id, - application_name=application.name + '.yaml', - file_path=application.name + '.yaml', + application_name=application.name + ".yaml", + file_path=application.name + ".yaml", content=yaml.dump(task), - blob_client=core_job_operations.blob_client) + blob_client=core_job_operations.blob_client, + ) resource_files.append(task_definition_resource_file) task_cmd = __app_cmd() @@ -45,7 +47,8 @@ def generate_job_manager_task(core_job_operations, job, application_tasks): allow_low_priority_node=True, user_identity=batch_models.UserIdentity( auto_user=batch_models.AutoUserSpecification( - scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin))) + scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)), + ) return task @@ -83,24 +86,24 @@ def submit_job(core_job_operations, 
job_configuration.get_docker_repo(), job_configuration.get_docker_run_options(), mixed_mode=job_configuration.mixed_mode(), - worker_on_master=job_configuration.worker_on_master) + worker_on_master=job_configuration.worker_on_master, + ) application_tasks = [] for application in job_configuration.applications: - application_tasks.append((application, - spark_job_operations._generate_application_task( - core_job_operations, job_configuration.id, application))) + application_tasks.append(( + application, + spark_job_operations._generate_application_task(core_job_operations, job_configuration.id, application), + )) job_manager_task = generate_job_manager_task(core_job_operations, job_configuration, application_tasks) software_metadata_key = base_models.Software.spark - vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04') + vm_image = models.VmImage(publisher="Canonical", offer="UbuntuServer", sku="16.04") - autoscale_formula = "$TargetDedicatedNodes = {0}; " \ - "$TargetLowPriorityNodes = {1}".format( - job_configuration.max_dedicated_nodes, - job_configuration.max_low_pri_nodes) + autoscale_formula = "$TargetDedicatedNodes = {0}; " "$TargetLowPriorityNodes = {1}".format( + job_configuration.max_dedicated_nodes, job_configuration.max_low_pri_nodes) job = core_job_operations.submit( job_configuration=job_configuration, @@ -109,7 +112,8 @@ def submit_job(core_job_operations, autoscale_formula=autoscale_formula, software_metadata_key=software_metadata_key, vm_image_model=vm_image, - application_metadata='\n'.join(application.name for application in (job_configuration.applications or []))) + application_metadata="\n".join(application.name for application in (job_configuration.applications or [])), + ) if wait: spark_job_operations.wait(id=job_configuration.id) diff --git a/aztk/spark/client/job/operations.py b/aztk/spark/client/job/operations.py index ea3bd971..9ccd53e2 100644 --- a/aztk/spark/client/job/operations.py +++ b/aztk/spark/client/job/operations.py @@ -2,8 +2,18 @@ from aztk.client.job import CoreJobOperations from aztk.spark import models from aztk.spark.client.base import SparkBaseOperations -from .helpers import (delete, get, get_application, get_application_log, list, list_applications, stop, - stop_application, submit, wait_until_complete) +from .helpers import ( + delete, + get, + get_application, + get_application_log, + list, + list_applications, + stop, + stop_application, + submit, + wait_until_complete, +) class JobOperations(SparkBaseOperations): diff --git a/aztk/spark/helpers/cluster_diagnostic_helper.py b/aztk/spark/helpers/cluster_diagnostic_helper.py index 9dca8c58..2c5707d3 100644 --- a/aztk/spark/helpers/cluster_diagnostic_helper.py +++ b/aztk/spark/helpers/cluster_diagnostic_helper.py @@ -1,8 +1,4 @@ import os -from aztk.utils import ssh -from aztk.utils.command_builder import CommandBuilder -from aztk import models as aztk_models -import azure.batch.models as batch_models def run(spark_client, cluster_id, output_directory=None): @@ -17,8 +13,8 @@ def run(spark_client, cluster_id, output_directory=None): output = spark_client.cluster_download(cluster_id, remote_path, local_path, host=True) # write run output to debug/ directory - with open(os.path.join(os.path.dirname(local_path), "debug-output.txt"), 'w', encoding="UTF-8") as f: - [f.write(line + '\n') for node_output in run_output for line in node_output.output] + with open(os.path.join(os.path.dirname(local_path), "debug-output.txt"), "w", encoding="UTF-8") as f: + [f.write(line + 
"\n") for node_output in run_output for line in node_output.output] else: output = spark_client.cluster_download(cluster_id, remote_path, host=True) @@ -26,8 +22,4 @@ def run(spark_client, cluster_id, output_directory=None): def _build_diagnostic_ssh_command(): - return "sudo rm -rf /tmp/debug.zip; "\ - "sudo apt-get install -y python3-pip; "\ - "sudo -H pip3 install --upgrade pip; "\ - "sudo -H pip3 install docker; "\ - "sudo python3 /tmp/debug.py" + return "sudo rm -rf /tmp/debug.zip; " "sudo apt-get install -y python3-pip; " "sudo -H pip3 install --upgrade pip; " "sudo -H pip3 install docker; " "sudo python3 /tmp/debug.py" diff --git a/aztk/spark/helpers/create_cluster.py b/aztk/spark/helpers/create_cluster.py index ade2a042..282d8499 100644 --- a/aztk/spark/helpers/create_cluster.py +++ b/aztk/spark/helpers/create_cluster.py @@ -1,9 +1,7 @@ from typing import List -from aztk.utils.command_builder import CommandBuilder from aztk.utils import helpers from aztk.utils import constants from aztk import models as aztk_models -from aztk.spark.models import ClusterConfiguration import azure.batch.models as batch_models POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity( @@ -56,14 +54,16 @@ def __get_secrets_env(spark_client): ] -def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, - gpu_enabled: bool, - docker_repo: str = None, - docker_run_options: str = None, - plugins=None, - worker_on_master: bool = True, - file_mounts=None, - mixed_mode: bool = False): +def __cluster_install_cmd( + zip_resource_file: batch_models.ResourceFile, + gpu_enabled: bool, + docker_repo: str = None, + docker_run_options: str = None, + plugins=None, + worker_on_master: bool = True, + file_mounts=None, + mixed_mode: bool = False, +): """ For Docker on ubuntu 16.04 - return the command line to be run on the start task of the pool to setup spark. @@ -77,41 +77,41 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, if file_mounts: for mount in file_mounts: # Create the directory on the node - shares.append('mkdir -p {0}'.format(mount.mount_path)) + shares.append("mkdir -p {0}".format(mount.mount_path)) # Mount the file share - shares.append( - 'mount -t cifs //{0}.file.core.windows.net/{2} {3} -o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp'. 
- format(mount.storage_account_name, mount.storage_account_key, mount.file_share_path, mount.mount_path)) + shares.append("mount -t cifs //{0}.file.core.windows.net/{2} {3} " + "-o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp".format( + mount.storage_account_name, mount.storage_account_key, mount.file_share_path, + mount.mount_path)) setup = [ - 'time('\ - 'apt-get -y update;'\ - 'apt-get -y --no-install-recommends install unzip;'\ - 'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\ - 'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\ - ') 2>&1'.format(zip_resource_file.file_path), + "time(" + "apt-get -y update;" + "apt-get -y --no-install-recommends install unzip;" + "unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};" + "chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;" + ") 2>&1".format(zip_resource_file.file_path), '/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1} "{2}"'.format( - constants.DOCKER_SPARK_CONTAINER_NAME, - docker_repo, - docker_run_options.replace('"', '\\\"') - ) + constants.DOCKER_SPARK_CONTAINER_NAME, docker_repo, docker_run_options.replace('"', '\\"')), ] commands = shares + setup return commands -def generate_cluster_start_task(spark_client, - zip_resource_file: batch_models.ResourceFile, - cluster_id: str, - gpu_enabled: bool, - docker_repo: str = None, - docker_run_options: str = None, - file_shares: List[aztk_models.FileShare] = None, - plugins: List[aztk_models.PluginConfiguration] = None, - mixed_mode: bool = False, - worker_on_master: bool = True): +def generate_cluster_start_task( + spark_client, + zip_resource_file: batch_models.ResourceFile, + cluster_id: str, + gpu_enabled: bool, + docker_repo: str = None, + docker_run_options: str = None, + file_shares: List[aztk_models.FileShare] = None, + plugins: List[aztk_models.PluginConfiguration] = None, + mixed_mode: bool = False, + worker_on_master: bool = True, +): """ This will return the start task object for the pool to be created. 
:param cluster_id str: Id of the cluster(Used for uploading the resource files) @@ -127,22 +127,31 @@ def generate_cluster_start_task(spark_client, spark_submit_logs_file = constants.SPARK_SUBMIT_LOGS_FILE # TODO use certificate - environment_settings = __get_secrets_env(spark_client) + [ + environment_settings = (__get_secrets_env(spark_client) + [ batch_models.EnvironmentSetting(name="SPARK_WEB_UI_PORT", value=spark_web_ui_port), batch_models.EnvironmentSetting(name="SPARK_WORKER_UI_PORT", value=spark_worker_ui_port), batch_models.EnvironmentSetting(name="SPARK_JOB_UI_PORT", value=spark_job_ui_port), batch_models.EnvironmentSetting(name="SPARK_CONTAINER_NAME", value=spark_container_name), batch_models.EnvironmentSetting(name="SPARK_SUBMIT_LOGS_FILE", value=spark_submit_logs_file), batch_models.EnvironmentSetting(name="AZTK_GPU_ENABLED", value=helpers.bool_env(gpu_enabled)), - ] + __get_docker_credentials(spark_client) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode) + ] + __get_docker_credentials(spark_client) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode)) # start task command - command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, docker_run_options, plugins, - worker_on_master, file_shares, mixed_mode) + command = __cluster_install_cmd( + zip_resource_file, + gpu_enabled, + docker_repo, + docker_run_options, + plugins, + worker_on_master, + file_shares, + mixed_mode, + ) return batch_models.StartTask( command_line=helpers.wrap_commands_in_shell(command), resource_files=resource_files, environment_settings=environment_settings, user_identity=POOL_ADMIN_USER_IDENTITY, - wait_for_success=True) + wait_for_success=True, + ) diff --git a/aztk/spark/helpers/get_log.py b/aztk/spark/helpers/get_log.py index 2f786509..ea63a62f 100644 --- a/aztk/spark/helpers/get_log.py +++ b/aztk/spark/helpers/get_log.py @@ -9,8 +9,7 @@ from aztk import models as base_models from aztk.spark import models from aztk.utils import constants, helpers -output_file = constants.TASK_WORKING_DIR + \ - "/" + constants.SPARK_SUBMIT_LOGS_FILE +output_file = constants.TASK_WORKING_DIR + "/" + constants.SPARK_SUBMIT_LOGS_FILE def __check_task_node_exist(batch_client, cluster_id: str, task: batch_models.CloudTask) -> bool: @@ -51,16 +50,17 @@ def __get_output_file_properties(batch_client, cluster_id: str, application_name def get_log_from_storage(blob_client, container_name, application_name, task): try: - blob = blob_client.get_blob_to_text(container_name, application_name + '/' + constants.SPARK_SUBMIT_LOGS_FILE) + blob = blob_client.get_blob_to_text(container_name, application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE) except azure.common.AzureMissingResourceHttpError: raise error.AztkError("Logs not found in your storage account. 
They were either deleted or never existed.") base_model = base_models.ApplicationLog( name=application_name, cluster_id=container_name, - application_state=task.state._value_, + application_state=task.state.name, log=blob.content, total_bytes=blob.properties.content_length, - exit_code=task.execution_info.exit_code) + exit_code=task.execution_info.exit_code, + ) return models.ApplicationLog(base_model) @@ -88,17 +88,19 @@ def get_log(batch_client, blob_client, cluster_id: str, application_name: str, t base_model = base_models.ApplicationLog( name=application_name, cluster_id=cluster_id, - application_state=task.state._value_, + application_state=task.state.name, log=content, total_bytes=target_bytes, - exit_code=task.execution_info.exit_code) + exit_code=task.execution_info.exit_code, + ) return models.ApplicationLog(base_model) else: base_model = base_models.ApplicationLog( name=application_name, cluster_id=cluster_id, - application_state=task.state._value_, - log='', + application_state=task.state.name, + log="", total_bytes=target_bytes, - exit_code=task.execution_info.exit_code) + exit_code=task.execution_info.exit_code, + ) return models.ApplicationLog(base_model) diff --git a/aztk/spark/helpers/job_submission.py b/aztk/spark/helpers/job_submission.py index eb7ac1be..81fc9638 100644 --- a/aztk/spark/helpers/job_submission.py +++ b/aztk/spark/helpers/job_submission.py @@ -1,17 +1,11 @@ -import datetime -import os import time -from typing import List import azure.batch.models as batch_models import yaml import aztk.error as error -from aztk.utils import constants, helpers +from aztk.utils import helpers from aztk.utils.command_builder import CommandBuilder -''' - Job Submission helper methods -''' def __app_cmd(): @@ -19,11 +13,12 @@ def __app_cmd(): docker_exec.add_argument("-i") docker_exec.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR") docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID") - docker_exec.add_argument("spark /bin/bash >> output.log 2>&1 -c \"" \ - "source ~/.bashrc; " \ - "export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; " \ - "cd \$AZ_BATCH_TASK_WORKING_DIR; " \ - "\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py\"") + docker_exec.add_argument( + r'spark /bin/bash >> output.log 2>&1 -c "' + r"source ~/.bashrc; " + r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; " + r"cd \$AZ_BATCH_TASK_WORKING_DIR; " + r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py"') return docker_exec.to_str() @@ -32,10 +27,11 @@ def generate_task(spark_client, job, application_tasks): for application, task in application_tasks: task_definition_resource_file = helpers.upload_text_to_container( container_name=job.id, - application_name=application.name + '.yaml', - file_path=application.name + '.yaml', + application_name=application.name + ".yaml", + file_path=application.name + ".yaml", content=yaml.dump(task), - blob_client=spark_client.blob_client) + blob_client=spark_client.blob_client, + ) resource_files.append(task_definition_resource_file) task_cmd = __app_cmd() @@ -49,7 +45,8 @@ def generate_task(spark_client, job, application_tasks): allow_low_priority_node=True, user_identity=batch_models.UserIdentity( auto_user=batch_models.AutoUserSpecification( - scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin))) + scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)), + ) return 
task @@ -69,7 +66,7 @@ def list_applications(spark_client, job_id): applications = {} for metadata_item in recent_run_job.metadata: if metadata_item.name == "applications": - for app_name in metadata_item.value.split('\n'): + for app_name in metadata_item.value.split("\n"): applications[app_name] = None # get tasks from Batch job @@ -177,8 +174,11 @@ def get_application_log(spark_client, job_id, application_name): raise error.AztkError("The application {0} has not yet been created.".format(application)) raise error.AztkError("The application {0} does not exist".format(application_name)) else: - if task.state in (batch_models.TaskState.active, batch_models.TaskState.running, - batch_models.TaskState.preparing): + if task.state in ( + batch_models.TaskState.active, + batch_models.TaskState.running, + batch_models.TaskState.preparing, + ): raise error.AztkError("The application {0} has not yet finished executing.".format(application_name)) return spark_client.get_application_log(job_id, application_name) diff --git a/aztk/spark/helpers/submit.py b/aztk/spark/helpers/submit.py index 49c5370c..a45b2c8c 100644 --- a/aztk/spark/helpers/submit.py +++ b/aztk/spark/helpers/submit.py @@ -1,14 +1,11 @@ -import datetime import os -from typing import List -import yaml + import azure.batch.models as batch_models +import yaml + from aztk.error import AztkError -from aztk.utils import constants, helpers +from aztk.utils import helpers from aztk.utils.command_builder import CommandBuilder -''' -Submit helper methods -''' def __get_node(spark_client, node_id: str, cluster_id: str) -> batch_models.ComputeNode: @@ -25,12 +22,13 @@ def generate_task(spark_client, container_id, application, remote=False): application_name=application.name, file_path=application.application, blob_client=spark_client.blob_client, - use_full_path=False) + use_full_path=False, + ) # Upload application file resource_files.append(app_resource_file) - application.application = '$AZ_BATCH_TASK_WORKING_DIR/' + os.path.basename(application.application) + application.application = "$AZ_BATCH_TASK_WORKING_DIR/" + os.path.basename(application.application) # Upload dependent JARS jar_resource_file_paths = [] @@ -40,7 +38,8 @@ def generate_task(spark_client, container_id, application, remote=False): application_name=application.name, file_path=jar, blob_client=spark_client.blob_client, - use_full_path=False) + use_full_path=False, + ) jar_resource_file_paths.append(current_jar_resource_file_path) resource_files.append(current_jar_resource_file_path) @@ -52,7 +51,8 @@ def generate_task(spark_client, container_id, application, remote=False): application_name=application.name, file_path=py_file, blob_client=spark_client.blob_client, - use_full_path=False) + use_full_path=False, + ) py_files_resource_file_paths.append(current_py_files_resource_file_path) resource_files.append(current_py_files_resource_file_path) @@ -64,7 +64,8 @@ def generate_task(spark_client, container_id, application, remote=False): application_name=application.name, file_path=file, blob_client=spark_client.blob_client, - use_full_path=False) + use_full_path=False, + ) files_resource_file_paths.append(files_resource_file_path) resource_files.append(files_resource_file_path) @@ -75,21 +76,23 @@ def generate_task(spark_client, container_id, application, remote=False): application_definition_file = helpers.upload_text_to_container( container_name=container_id, application_name=application.name, - file_path='application.yaml', + file_path="application.yaml", 
content=yaml.dump(vars(application)), - blob_client=spark_client.blob_client) + blob_client=spark_client.blob_client, + ) resource_files.append(application_definition_file) # create command to submit task - task_cmd = CommandBuilder('sudo docker exec') - task_cmd.add_argument('-i') - task_cmd.add_option('-e', 'AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR') - task_cmd.add_option('-e', 'STORAGE_LOGS_CONTAINER={0}'.format(container_id)) - task_cmd.add_argument('spark /bin/bash >> output.log 2>&1') - task_cmd.add_argument('-c "source ~/.bashrc; ' \ - 'export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; ' \ - 'cd \$AZ_BATCH_TASK_WORKING_DIR; ' \ - '\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"') + task_cmd = CommandBuilder("sudo docker exec") + task_cmd.add_argument("-i") + task_cmd.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR") + task_cmd.add_option("-e", "STORAGE_LOGS_CONTAINER={0}".format(container_id)) + task_cmd.add_argument("spark /bin/bash >> output.log 2>&1") + task_cmd.add_argument( + r'-c "source ~/.bashrc; ' + r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; " + r"cd \$AZ_BATCH_TASK_WORKING_DIR; " + r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"') # Create task task = batch_models.TaskAddParameter( @@ -99,7 +102,8 @@ def generate_task(spark_client, container_id, application, remote=False): constraints=batch_models.TaskConstraints(max_task_retry_count=application.max_retry_count), user_identity=batch_models.UserIdentity( auto_user=batch_models.AutoUserSpecification( - scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin))) + scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)), + ) return task diff --git a/aztk/spark/models/models.py b/aztk/spark/models/models.py index 9fa7d1f5..60b43352 100644 --- a/aztk/spark/models/models.py +++ b/aztk/spark/models/models.py @@ -12,11 +12,7 @@ from aztk.utils import constants, helpers class SparkToolkit(aztk.models.Toolkit): def __init__(self, version: str, environment: str = None, environment_version: str = None): super().__init__( - software="spark", - version=version, - environment=environment, - environment_version=environment_version, - ) + software="spark", version=version, environment=environment, environment_version=environment_version) class Cluster(aztk.models.Cluster): @@ -74,9 +70,9 @@ class SparkConfiguration(Model): def __generate_ssh_key_pair(self): key = RSA.generate(2048) - priv_key = key.exportKey('PEM') - pub_key = key.publickey().exportKey('OpenSSH') - return {'pub_key': pub_key, 'priv_key': priv_key} + priv_key = key.exportKey("PEM") + pub_key = key.publickey().exportKey("OpenSSH") + return {"pub_key": pub_key, "priv_key": priv_key} class CustomScript(aztk.models.CustomScript): @@ -124,22 +120,24 @@ class VmImage(aztk.models.VmImage): class ApplicationConfiguration: - def __init__(self, - name=None, - application=None, - application_args=None, - main_class=None, - jars=None, - py_files=None, - files=None, - driver_java_options=None, - driver_library_path=None, - driver_class_path=None, - driver_memory=None, - executor_memory=None, - driver_cores=None, - executor_cores=None, - max_retry_count=None): + def __init__( + self, + name=None, + application=None, + application_args=None, + main_class=None, + jars=None, + py_files=None, + files=None, + driver_java_options=None, + driver_library_path=None, + driver_class_path=None, + 
driver_memory=None, + executor_memory=None, + driver_cores=None, + executor_cores=None, + max_retry_count=None, + ): self.name = name self.application = application self.application_args = application_args @@ -162,11 +160,11 @@ class Application: self.name = cloud_task.id self.last_modified = cloud_task.last_modified self.creation_time = cloud_task.creation_time - self.state = cloud_task.state._value_ + self.state = cloud_task.state.name self.state_transition_time = cloud_task.state_transition_time self.exit_code = cloud_task.execution_info.exit_code if cloud_task.previous_state: - self.previous_state = cloud_task.previous_state._value_ + self.previous_state = cloud_task.previous_state.name self.previous_state_transition_time = cloud_task.previous_state_transition_time self._execution_info = cloud_task.execution_info @@ -190,17 +188,19 @@ class Application: class JobConfiguration: - def __init__(self, - id=None, - applications=None, - vm_size=None, - spark_configuration=None, - toolkit=None, - max_dedicated_nodes=0, - max_low_pri_nodes=0, - subnet_id=None, - scheduling_target: SchedulingTarget = None, - worker_on_master=None): + def __init__( + self, + id=None, + applications=None, + vm_size=None, + spark_configuration=None, + toolkit=None, + max_dedicated_nodes=0, + max_low_pri_nodes=0, + subnet_id=None, + scheduling_target: SchedulingTarget = None, + worker_on_master=None, + ): self.id = id self.applications = applications @@ -252,24 +252,23 @@ class JobConfiguration: raise error.AztkError("Please supply an ID for the Job in your configuration.") if self.max_dedicated_nodes == 0 and self.max_low_pri_nodes == 0: - raise error.AztkError( - "Please supply a valid (greater than 0) value for either max_dedicated_nodes or max_low_pri_nodes in your configuration." - ) + raise error.AztkError("Please supply a valid (greater than 0) value for either max_dedicated_nodes " + "or max_low_pri_nodes in your configuration.") if self.vm_size is None: raise error.AztkError("Please supply a vm_size in your configuration.") if self.mixed_mode() and not self.subnet_id: raise error.AztkError( - "You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes) and pass the subnet_id in your configuration.." 
- ) + "You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes) " + "and pass the subnet_id in your configuration..") if self.scheduling_target == SchedulingTarget.Dedicated and self.max_dedicated_nodes == 0: raise error.InvalidModelError("Scheduling target cannot be Dedicated if dedicated vm size is 0") -class JobState(): - complete = 'completed' +class JobState: + complete = "completed" active = "active" completed = "completed" disabled = "disabled" @@ -277,15 +276,17 @@ class JobState(): deleting = "deleting" -class Job(): - def __init__(self, - cloud_job_schedule: batch_models.CloudJobSchedule, - cloud_tasks: List[batch_models.CloudTask] = None, - pool: batch_models.CloudPool = None, - nodes: batch_models.ComputeNodePaged = None): +class Job: + def __init__( + self, + cloud_job_schedule: batch_models.CloudJobSchedule, + cloud_tasks: List[batch_models.CloudTask] = None, + pool: batch_models.CloudPool = None, + nodes: batch_models.ComputeNodePaged = None, + ): self.id = cloud_job_schedule.id self.last_modified = cloud_job_schedule.last_modified - self.state = cloud_job_schedule.state._value_ + self.state = cloud_job_schedule.state.name self.state_transition_time = cloud_job_schedule.state_transition_time self.creation_time = cloud_job_schedule.creation_time self.applications = [Application(task) for task in (cloud_tasks or [])] @@ -297,9 +298,11 @@ class Job(): class ApplicationLog(aztk.models.ApplicationLog): def __init__(self, application_log: aztk.models.ApplicationLog): - self.name = application_log.name - self.cluster_id = application_log.cluster_id # TODO: change to something cluster/job agnostic - self.log = application_log.log - self.total_bytes = application_log.total_bytes - self.application_state = application_log.application_state - self.exit_code = application_log.exit_code + super().__init__( + name=application_log.name, + cluster_id=application_log.cluster_id, # TODO: change to something cluster/job agnostic + log=application_log.log, + total_bytes=application_log.total_bytes, + application_state=application_log.application_state, + exit_code=application_log.exit_code, + ) diff --git a/aztk/spark/models/plugins/hdfs/configuration.py b/aztk/spark/models/plugins/hdfs/configuration.py index 99a73d6f..7eedbf55 100644 --- a/aztk/spark/models/plugins/hdfs/configuration.py +++ b/aztk/spark/models/plugins/hdfs/configuration.py @@ -1,7 +1,6 @@ import os from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile -from aztk.utils import constants dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -11,36 +10,14 @@ class HDFSPlugin(PluginConfiguration): super().__init__( name="hdfs", ports=[ - PluginPort( - name="File system metadata operations", - internal=8020, - ), - PluginPort( - name="File system metadata operations(Backup)", - internal=9000, - ), - PluginPort( - name="Datanode data transfer", - internal=50010, - ), - PluginPort( - name="Datanode IPC metadata operations", - internal=50020, - ), - PluginPort( - name="Namenode", - internal=50070, - public=True, - ), - PluginPort( - name="Datanodes", - internal=50075, - public=True, - ), + PluginPort(name="File system metadata operations", internal=8020), + PluginPort(name="File system metadata operations(Backup)", internal=9000), + PluginPort(name="Datanode data transfer", internal=50010), + PluginPort(name="Datanode IPC metadata operations", internal=50020), + PluginPort(name="Namenode", 
internal=50070, public=True), + PluginPort(name="Datanodes", internal=50075, public=True), ], target_role=PluginTargetRole.All, execute="hdfs.sh", - files=[ - PluginFile("hdfs.sh", os.path.join(dir_path, "hdfs.sh")), - ], + files=[PluginFile("hdfs.sh", os.path.join(dir_path, "hdfs.sh"))], ) diff --git a/aztk/spark/models/plugins/install/apt_get/configuration.py b/aztk/spark/models/plugins/install/apt_get/configuration.py index c5487f9c..4f2f15f5 100644 --- a/aztk/spark/models/plugins/install/apt_get/configuration.py +++ b/aztk/spark/models/plugins/install/apt_get/configuration.py @@ -1,8 +1,5 @@ import os -from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole -from aztk.models.plugins.plugin_file import PluginFile from aztk.spark.models.plugins.install import InstallPlugin -from aztk.utils import constants dir_path = os.path.dirname(os.path.realpath(__file__)) diff --git a/aztk/spark/models/plugins/install/conda/configuration.py b/aztk/spark/models/plugins/install/conda/configuration.py index d39bfbf8..a5ea9e7d 100644 --- a/aztk/spark/models/plugins/install/conda/configuration.py +++ b/aztk/spark/models/plugins/install/conda/configuration.py @@ -1,8 +1,5 @@ import os -from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole -from aztk.models.plugins.plugin_file import PluginFile from aztk.spark.models.plugins.install import InstallPlugin -from aztk.utils import constants dir_path = os.path.dirname(os.path.realpath(__file__)) diff --git a/aztk/spark/models/plugins/install/configuration.py b/aztk/spark/models/plugins/install/configuration.py index 54140a94..2bbb9fbb 100644 --- a/aztk/spark/models/plugins/install/configuration.py +++ b/aztk/spark/models/plugins/install/configuration.py @@ -1,7 +1,6 @@ import os -from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole +from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile -from aztk.utils import constants dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -13,4 +12,5 @@ def InstallPlugin(name, command, packages=None): execute="install.sh", files=[PluginFile("install.sh", os.path.join(dir_path, "install.sh"))], args=packages, - env=dict(COMMAND=command)) + env=dict(COMMAND=command), + ) diff --git a/aztk/spark/models/plugins/install/pip/configuration.py b/aztk/spark/models/plugins/install/pip/configuration.py index dab12e4e..6e850fcd 100644 --- a/aztk/spark/models/plugins/install/pip/configuration.py +++ b/aztk/spark/models/plugins/install/pip/configuration.py @@ -1,8 +1,5 @@ import os -from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole -from aztk.models.plugins.plugin_file import PluginFile from aztk.spark.models.plugins.install import InstallPlugin -from aztk.utils import constants dir_path = os.path.dirname(os.path.realpath(__file__)) diff --git a/aztk/spark/models/plugins/jupyter/configuration.py b/aztk/spark/models/plugins/jupyter/configuration.py index e3df3bc2..1ac09e62 100644 --- a/aztk/spark/models/plugins/jupyter/configuration.py +++ b/aztk/spark/models/plugins/jupyter/configuration.py @@ -8,15 +8,8 @@ dir_path = os.path.dirname(os.path.realpath(__file__)) def JupyterPlugin(): return PluginConfiguration( name="jupyter", - ports=[ - PluginPort( - internal=8888, - public=True, - ), - ], + ports=[PluginPort(internal=8888, public=True)], 
target_role=PluginTargetRole.All, execute="jupyter.sh", - files=[ - PluginFile("jupyter.sh", os.path.join(dir_path, "jupyter.sh")), - ], + files=[PluginFile("jupyter.sh", os.path.join(dir_path, "jupyter.sh"))], ) diff --git a/aztk/spark/models/plugins/jupyter_lab/configuration.py b/aztk/spark/models/plugins/jupyter_lab/configuration.py index a14910be..205926c5 100644 --- a/aztk/spark/models/plugins/jupyter_lab/configuration.py +++ b/aztk/spark/models/plugins/jupyter_lab/configuration.py @@ -1,7 +1,6 @@ import os from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile -from aztk.utils import constants dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -9,15 +8,8 @@ dir_path = os.path.dirname(os.path.realpath(__file__)) def JupyterLabPlugin(): return PluginConfiguration( name="jupyterlab", - ports=[ - PluginPort( - internal=8889, - public=True, - ), - ], + ports=[PluginPort(internal=8889, public=True)], target_role=PluginTargetRole.All, execute="jupyter_lab.sh", - files=[ - PluginFile("jupyter_lab.sh", os.path.join(dir_path, "jupyter_lab.sh")), - ], + files=[PluginFile("jupyter_lab.sh", os.path.join(dir_path, "jupyter_lab.sh"))], ) diff --git a/aztk/spark/models/plugins/nvblas/configuration.py b/aztk/spark/models/plugins/nvblas/configuration.py index 6ec047f7..29b6beba 100644 --- a/aztk/spark/models/plugins/nvblas/configuration.py +++ b/aztk/spark/models/plugins/nvblas/configuration.py @@ -1,7 +1,6 @@ import os -from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole +from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile -from aztk.utils import constants dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -12,6 +11,5 @@ def NvBLASPlugin(): ports=[], target_role=PluginTargetRole.All, execute="nvblas.sh", - files=[ - PluginFile("nvblas.sh", os.path.join(dir_path, "nvblas.sh")), - ]) + files=[PluginFile("nvblas.sh", os.path.join(dir_path, "nvblas.sh"))], + ) diff --git a/aztk/spark/models/plugins/openblas/configuration.py b/aztk/spark/models/plugins/openblas/configuration.py index c83c96be..342abe09 100644 --- a/aztk/spark/models/plugins/openblas/configuration.py +++ b/aztk/spark/models/plugins/openblas/configuration.py @@ -1,7 +1,6 @@ import os -from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole +from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile -from aztk.utils import constants dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -12,7 +11,5 @@ def OpenBLASPlugin(): ports=[], target_role=PluginTargetRole.All, execute="openblas.sh", - files=[ - PluginFile("openblas.sh", os.path.join(dir_path, "openblas.sh")), - ], + files=[PluginFile("openblas.sh", os.path.join(dir_path, "openblas.sh"))], ) diff --git a/aztk/spark/models/plugins/resource_monitor/configuration.py b/aztk/spark/models/plugins/resource_monitor/configuration.py index 09c535da..ab313b29 100644 --- a/aztk/spark/models/plugins/resource_monitor/configuration.py +++ b/aztk/spark/models/plugins/resource_monitor/configuration.py @@ -1,7 +1,6 @@ import os from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTarget, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile -from aztk.utils 
import constants dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -10,12 +9,7 @@ class ResourceMonitorPlugin(PluginConfiguration): def __init__(self): super().__init__( name="resource_monitor", - ports=[ - PluginPort( - internal=8890, - public=True, - ), - ], + ports=[PluginPort(internal=8890, public=True)], target=PluginTarget.Host, target_role=PluginTargetRole.All, execute="start_monitor.sh", @@ -23,4 +17,5 @@ class ResourceMonitorPlugin(PluginConfiguration): PluginFile("start_monitor.sh", os.path.join(dir_path, "start_monitor.sh")), PluginFile("etc/telegraf.conf", os.path.join(dir_path, "telegraf.conf")), PluginFile("docker-compose.yml", os.path.join(dir_path, "docker-compose.yml")), - ]) + ], + ) diff --git a/aztk/spark/models/plugins/rstudio_server/configuration.py b/aztk/spark/models/plugins/rstudio_server/configuration.py index 8d0bef0c..23a555dc 100644 --- a/aztk/spark/models/plugins/rstudio_server/configuration.py +++ b/aztk/spark/models/plugins/rstudio_server/configuration.py @@ -8,16 +8,9 @@ dir_path = os.path.dirname(os.path.realpath(__file__)) def RStudioServerPlugin(version="1.1.383"): return PluginConfiguration( name="rstudio_server", - ports=[ - PluginPort( - internal=8787, - public=True, - ), - ], + ports=[PluginPort(internal=8787, public=True)], target_role=PluginTargetRole.Master, execute="rstudio_server.sh", - files=[ - PluginFile("rstudio_server.sh", os.path.join(dir_path, "rstudio_server.sh")), - ], + files=[PluginFile("rstudio_server.sh", os.path.join(dir_path, "rstudio_server.sh"))], env=dict(RSTUDIO_SERVER_VERSION=version), ) diff --git a/aztk/spark/models/plugins/simple/configuration.py b/aztk/spark/models/plugins/simple/configuration.py index d58e9f98..3c445a84 100644 --- a/aztk/spark/models/plugins/simple/configuration.py +++ b/aztk/spark/models/plugins/simple/configuration.py @@ -1,7 +1,6 @@ import os -from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole, PluginTarget +from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTarget, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile -from aztk.utils import constants dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -13,7 +12,5 @@ class SimplePlugin(PluginConfiguration): target_role=PluginTargetRole.All, target=PluginTarget.Host, execute="simple.sh", - files=[ - PluginFile("simple.sh", os.path.join(dir_path, "simple.sh")), - ], + files=[PluginFile("simple.sh", os.path.join(dir_path, "simple.sh"))], ) diff --git a/aztk/spark/models/plugins/spark_ui_proxy/configuration.py b/aztk/spark/models/plugins/spark_ui_proxy/configuration.py index 623a7bda..1119d81d 100644 --- a/aztk/spark/models/plugins/spark_ui_proxy/configuration.py +++ b/aztk/spark/models/plugins/spark_ui_proxy/configuration.py @@ -1,7 +1,6 @@ import os from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile -from aztk.utils import constants dir_path = os.path.dirname(os.path.realpath(__file__)) diff --git a/aztk/spark/models/plugins/spark_ui_proxy/spark_ui_proxy.py b/aztk/spark/models/plugins/spark_ui_proxy/spark_ui_proxy.py index 332e7e0a..e5fefae4 100644 --- a/aztk/spark/models/plugins/spark_ui_proxy/spark_ui_proxy.py +++ b/aztk/spark/models/plugins/spark_ui_proxy/spark_ui_proxy.py @@ -29,7 +29,7 @@ from http.server import BaseHTTPRequestHandler, HTTPServer BIND_ADDR = os.environ.get("BIND_ADDR", "0.0.0.0") SERVER_PORT = 
int(os.environ.get("SERVER_PORT", "80")) -URL_PREFIX = os.environ.get("URL_PREFIX", "").rstrip('/') + '/' +URL_PREFIX = os.environ.get("URL_PREFIX", "").rstrip("/") + "/" SPARK_MASTER_HOST = "" @@ -44,7 +44,7 @@ class ProxyHandler(BaseHTTPRequestHandler): self.proxyRequest(None) def do_POST(self): - length = int(self.headers.getheader('content-length')) + length = int(self.headers.getheader("content-length")) postData = self.rfile.read(length) self.proxyRequest(postData) @@ -84,17 +84,19 @@ class ProxyHandler(BaseHTTPRequestHandler): def rewriteLinks(self, page, targetHost): target = "{0}proxy:{1}/".format(URL_PREFIX, targetHost).encode() page = page.replace(b'href="/', b'href="' + target) - page = page.replace(b"'
'", - b"'
'") - page = page.replace(b'href="log', b'href="' + target + b'log') - page = page.replace(b'href="http://', b'href="' + URL_PREFIX.encode() + b'proxy:') + page = page.replace( + b"'
'", + b"'
'", + ) + page = page.replace(b'href="log', b'href="' + target + b"log") + page = page.replace(b'href="http://', b'href="' + URL_PREFIX.encode() + b"proxy:") page = page.replace(b'src="/', b'src="' + target) page = page.replace(b'action="', b'action="' + target) - page = page.replace(b'"/api/v1/', b'"' + target + b'api/v1/') + page = page.replace(b'"/api/v1/', b'"' + target + b"api/v1/") return page -if __name__ == '__main__': +if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: []") sys.exit(1) diff --git a/aztk/spark/models/plugins/tensorflow_on_spark/configuration.py b/aztk/spark/models/plugins/tensorflow_on_spark/configuration.py index e5aea89f..6fa33390 100644 --- a/aztk/spark/models/plugins/tensorflow_on_spark/configuration.py +++ b/aztk/spark/models/plugins/tensorflow_on_spark/configuration.py @@ -1,7 +1,6 @@ import os -from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole +from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile -from aztk.utils import constants dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -11,7 +10,5 @@ def TensorflowOnSparkPlugin(): name="tensorflow_on_spark", target_role=PluginTargetRole.Master, execute="tensorflow_on_spark.sh", - files=[ - PluginFile("tensorflow_on_spark.sh", os.path.join(dir_path, "tensorflow_on_spark.sh")), - ], + files=[PluginFile("tensorflow_on_spark.sh", os.path.join(dir_path, "tensorflow_on_spark.sh"))], ) diff --git a/aztk/spark/utils/constants.py b/aztk/spark/utils/constants.py index 831abf84..1e25dc65 100644 --- a/aztk/spark/utils/constants.py +++ b/aztk/spark/utils/constants.py @@ -1,3 +1,3 @@ from aztk.spark import models -SPARK_VM_IMAGE = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04') +SPARK_VM_IMAGE = models.VmImage(publisher="Canonical", offer="UbuntuServer", sku="16.04") diff --git a/aztk/spark/utils/debug.py b/aztk/spark/utils/debug.py index 3dda40aa..988c65c5 100644 --- a/aztk/spark/utils/debug.py +++ b/aztk/spark/utils/debug.py @@ -50,9 +50,7 @@ def cmd_check_output(cmd): try: output = check_output(cmd, shell=True, stderr=STDOUT) except CalledProcessError as e: - return "CMD: {0}\n"\ - "returncode: {1}"\ - "output: {2}".format(e.cmd, e.returncode, e.output) + return "CMD: {0}\n" "returncode: {1}" "output: {2}".format(e.cmd, e.returncode, e.output) else: return output @@ -62,9 +60,9 @@ def get_disk_free(): def get_docker_diagnostics(docker_client): - ''' + """ returns list of tuples (filename, data) to be written in the zip - ''' + """ output = [] output.append(get_docker_images(docker_client)) logs = get_docker_containers(docker_client) @@ -95,7 +93,7 @@ def get_docker_containers(docker_client): # get docker container logs logs.append((container.name + "/docker.log", container.logs())) logs.append(get_docker_process_status(container)) - if container.name == "spark": #TODO: find a more robust way to get specific info off specific containers + if container.name == "spark": # TODO: find a more robust way to get specific info off specific containers logs.extend(get_container_aztk_script(container)) logs.extend(get_spark_logs(container)) logs.extend(get_spark_app_logs(container)) @@ -158,13 +156,13 @@ def filter_members(members): def extract_tar_in_memory(container, data): - data = io.BytesIO(b''.join([item for item in data])) + data = io.BytesIO(b"".join([item for item in data])) tarf = tarfile.open(fileobj=data) logs = [] for member in 
filter_members(tarf): file_bytes = tarf.extractfile(member) if file_bytes is not None: - logs.append((container.name + "/" + member.name, b''.join(file_bytes.readlines()))) + logs.append((container.name + "/" + member.name, b"".join(file_bytes.readlines()))) return logs @@ -174,7 +172,7 @@ def get_brief_diagnostics(): logs = [] for file_name in files: try: - logs.append((file_name, open(batch_dir + file_name, 'rb').read())) + logs.append((file_name, open(batch_dir + file_name, "rb").read())) # print("LOG:", (file_name, open(batch_dir+file_name, 'rb').read())) except FileNotFoundError as e: print("file not found", e) diff --git a/aztk/spark/utils/util.py b/aztk/spark/utils/util.py index 0df0c17d..ec03be92 100644 --- a/aztk/spark/utils/util.py +++ b/aztk/spark/utils/util.py @@ -1,16 +1,11 @@ from __future__ import print_function + import datetime -import io -import os import time -import azure.batch.batch_service_client as batch -import azure.batch.batch_auth as batch_auth + import azure.batch.models as batch_models -import azure.storage.blob as blob -from aztk.version import __version__ + from aztk.utils import constants -from aztk import error -import aztk.models class MasterInvalidStateError(Exception): diff --git a/aztk/utils/__init__.py b/aztk/utils/__init__.py index 295f5c40..eefb69dc 100644 --- a/aztk/utils/__init__.py +++ b/aztk/utils/__init__.py @@ -1,8 +1,3 @@ -from .deprecation import deprecated, deprecate -from . import azure_api -from . import command_builder -from . import constants -from . import helpers -from . import file_utils -from . import get_ssh_key -from . import secure_utils +from . import (azure_api, command_builder, constants, file_utils, get_ssh_key, helpers, secure_utils) +from .deprecation import deprecate, deprecated +from .retry import BackOffPolicy, retry diff --git a/aztk/utils/azure_api.py b/aztk/utils/azure_api.py index 160e47e4..69781805 100644 --- a/aztk/utils/azure_api.py +++ b/aztk/utils/azure_api.py @@ -1,5 +1,4 @@ import re -from typing import Optional import azure.batch.batch_auth as batch_auth import azure.batch.batch_service_client as batch @@ -12,10 +11,10 @@ from azure.storage.common import CloudStorageAccount from aztk import error from aztk.version import __version__ -RESOURCE_ID_PATTERN = re.compile('^/subscriptions/(?P<subscription>[^/]+)' - '/resourceGroups/(?P<resourcegroup>[^/]+)' - '/providers/[^/]+' - '/[^/]+Accounts/(?P<account>[^/]+)$') +RESOURCE_ID_PATTERN = re.compile("^/subscriptions/(?P<subscription>[^/]+)" + "/resourceGroups/(?P<resourcegroup>[^/]+)" + "/providers/[^/]+" + "/[^/]+Accounts/(?P<account>[^/]+)$") def validate_secrets(secrets): @@ -48,23 +47,25 @@ def make_batch_client(secrets): client_id=secrets.service_principal.client_id, secret=secrets.service_principal.credential, tenant=secrets.service_principal.tenant_id, - resource='https://management.core.windows.net/') + resource="https://management.core.windows.net/", + ) m = RESOURCE_ID_PATTERN.match(secrets.service_principal.batch_account_resource_id) - arm_batch_client = BatchManagementClient(arm_credentials, m.group('subscription')) - account = arm_batch_client.batch_account.get(m.group('resourcegroup'), m.group('account')) - base_url = 'https://{0}/'.format(account.account_endpoint) + arm_batch_client = BatchManagementClient(arm_credentials, m.group("subscription")) + account = arm_batch_client.batch_account.get(m.group("resourcegroup"), m.group("account")) + base_url = "https://{0}/".format(account.account_endpoint) credentials = ServicePrincipalCredentials( client_id=secrets.service_principal.client_id,
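Editorial note: with its named groups restored, RESOURCE_ID_PATTERN splits an ARM resource ID into the subscription, resource group, and account segments that make_batch_client and make_blob_client read back via m.group(...). A minimal standalone sketch, using a made-up resource ID (the ID itself is illustrative, not taken from the patch):

    import re

    RESOURCE_ID_PATTERN = re.compile("^/subscriptions/(?P<subscription>[^/]+)"
                                     "/resourceGroups/(?P<resourcegroup>[^/]+)"
                                     "/providers/[^/]+"
                                     "/[^/]+Accounts/(?P<account>[^/]+)$")

    # Hypothetical Batch account resource ID, for illustration only.
    resource_id = ("/subscriptions/00000000-0000-0000-0000-000000000000"
                   "/resourceGroups/my-rg/providers/Microsoft.Batch"
                   "/batchAccounts/mybatchaccount")

    match = RESOURCE_ID_PATTERN.match(resource_id)
    print(match.group("subscription"), match.group("resourcegroup"), match.group("account"))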
secret=secrets.service_principal.credential, tenant=secrets.service_principal.tenant_id, - resource='https://batch.core.windows.net/') + resource="https://batch.core.windows.net/", + ) # Set up Batch Client batch_client = batch.BatchServiceClient(credentials, base_url=base_url) # Set retry policy batch_client.config.retry_policy.retries = 5 - batch_client.config.add_user_agent('aztk/{}'.format(__version__)) + batch_client.config.add_user_agent("aztk/{}".format(__version__)) return batch_client @@ -82,26 +83,29 @@ def make_blob_client(secrets): blob_client = blob.BlockBlobService( account_name=secrets.shared_key.storage_account_name, account_key=secrets.shared_key.storage_account_key, - endpoint_suffix=secrets.shared_key.storage_account_suffix) + endpoint_suffix=secrets.shared_key.storage_account_suffix, + ) else: # Set up ServicePrincipalCredentials arm_credentials = ServicePrincipalCredentials( client_id=secrets.service_principal.client_id, secret=secrets.service_principal.credential, tenant=secrets.service_principal.tenant_id, - resource='https://management.core.windows.net/') + resource="https://management.core.windows.net/", + ) m = RESOURCE_ID_PATTERN.match(secrets.service_principal.storage_account_resource_id) - accountname = m.group('account') - subscription = m.group('subscription') - resourcegroup = m.group('resourcegroup') + accountname = m.group("account") + subscription = m.group("subscription") + resourcegroup = m.group("resourcegroup") mgmt_client = StorageManagementClient(arm_credentials, subscription) - key = retry_function( + key = (retry_function( mgmt_client.storage_accounts.list_keys, 10, 1, Exception, resource_group_name=resourcegroup, - account_name=accountname).keys[0].value + account_name=accountname, + ).keys[0].value) storage_client = CloudStorageAccount(accountname, key) blob_client = storage_client.create_block_blob_service() @@ -110,6 +114,7 @@ def make_blob_client(secrets): def retry_function(function, retry_attempts: int, retry_interval: int, exception: Exception, *args, **kwargs): import time + for i in range(retry_attempts): try: return function(*args, **kwargs) diff --git a/aztk/utils/command_builder.py b/aztk/utils/command_builder.py index a55cf625..b184765b 100644 --- a/aztk/utils/command_builder.py +++ b/aztk/utils/command_builder.py @@ -1,4 +1,4 @@ -class CommandOption(): +class CommandOption: def __init__(self, name: str, value: str): self.name = name self.value = value diff --git a/aztk/utils/constants.py b/aztk/utils/constants.py index ec0ecf19..f4096615 100644 --- a/aztk/utils/constants.py +++ b/aztk/utils/constants.py @@ -18,33 +18,33 @@ DOCKER_SPARK_HOME = "/home/spark-current" """ Root path of this repository """ -ROOT_PATH = os.path.normpath(os.path.join(os.path.dirname(__file__), '..', '..')) +ROOT_PATH = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..")) """ User home directory path """ -HOME_DIRECTORY_PATH = os.path.expanduser('~') +HOME_DIRECTORY_PATH = os.path.expanduser("~") """ Path to the secrets file """ -DEFAULT_SECRETS_PATH = os.path.join(os.getcwd(), '.aztk/secrets.yaml') +DEFAULT_SECRETS_PATH = os.path.join(os.getcwd(), ".aztk/secrets.yaml") """ Paths to the cluster configuration files """ -GLOBAL_CONFIG_PATH = os.path.join(HOME_DIRECTORY_PATH, '.aztk') -DEFAULT_SSH_CONFIG_PATH = os.path.join(os.getcwd(), '.aztk/ssh.yaml') -DEFAULT_CLUSTER_CONFIG_PATH = os.path.join(os.getcwd(), '.aztk/cluster.yaml') -DEFAULT_SPARK_CONF_SOURCE = os.path.join(os.getcwd(), '.aztk') -DEFAULT_SPARK_CONF_DEST = os.path.join(ROOT_PATH, 
'node_scripts', 'conf') -DEFAULT_SPARK_JARS_SOURCE = os.path.join(os.getcwd(), '.aztk', 'jars') -DEFAULT_SPARK_JARS_DEST = os.path.join(ROOT_PATH, 'node_scripts', 'jars') -DEFAULT_SPARK_JOB_CONFIG = os.path.join(os.getcwd(), '.aztk', 'job.yaml') -GLOBAL_SPARK_JOB_CONFIG = os.path.join(HOME_DIRECTORY_PATH, '.aztk', 'job.yaml') +GLOBAL_CONFIG_PATH = os.path.join(HOME_DIRECTORY_PATH, ".aztk") +DEFAULT_SSH_CONFIG_PATH = os.path.join(os.getcwd(), ".aztk/ssh.yaml") +DEFAULT_CLUSTER_CONFIG_PATH = os.path.join(os.getcwd(), ".aztk/cluster.yaml") +DEFAULT_SPARK_CONF_SOURCE = os.path.join(os.getcwd(), ".aztk") +DEFAULT_SPARK_CONF_DEST = os.path.join(ROOT_PATH, "node_scripts", "conf") +DEFAULT_SPARK_JARS_SOURCE = os.path.join(os.getcwd(), ".aztk", "jars") +DEFAULT_SPARK_JARS_DEST = os.path.join(ROOT_PATH, "node_scripts", "jars") +DEFAULT_SPARK_JOB_CONFIG = os.path.join(os.getcwd(), ".aztk", "job.yaml") +GLOBAL_SPARK_JOB_CONFIG = os.path.join(HOME_DIRECTORY_PATH, ".aztk", "job.yaml") """ Source and destination paths for spark init """ -INIT_DIRECTORY_SOURCE = os.path.join(ROOT_PATH, "aztk_cli", 'config') -LOCAL_INIT_DIRECTORY_DEST = os.path.join(os.getcwd(), '.aztk') -GLOBAL_INIT_DIRECTORY_DEST = os.path.join(HOME_DIRECTORY_PATH, '.aztk') +INIT_DIRECTORY_SOURCE = os.path.join(ROOT_PATH, "aztk_cli", "config") +LOCAL_INIT_DIRECTORY_DEST = os.path.join(os.getcwd(), ".aztk") +GLOBAL_INIT_DIRECTORY_DEST = os.path.join(HOME_DIRECTORY_PATH, ".aztk") """ Key of the metadata entry for the pool that is used to store the master node id """ diff --git a/aztk/utils/deprecation.py b/aztk/utils/deprecation.py index 5bb14f05..3cf7cd07 100644 --- a/aztk/utils/deprecation.py +++ b/aztk/utils/deprecation.py @@ -39,9 +39,10 @@ def deprecate(version: str, message: str, advice: str = ""): advice (str): Sentence explaining alternatives to the deprecated functionality. """ - warnings.simplefilter('always', DeprecationWarning) # turn off filter + warnings.simplefilter("always", DeprecationWarning) # turn off filter warnings.warn( "{0} It will be removed in Aztk version {1}. 
{2}".format(message, version, advice), category=DeprecationWarning, - stacklevel=2) - warnings.simplefilter('default', DeprecationWarning) # reset filter + stacklevel=2, + ) + warnings.simplefilter("default", DeprecationWarning) # reset filter diff --git a/aztk/utils/get_ssh_key.py b/aztk/utils/get_ssh_key.py index 4df781fa..73e33661 100644 --- a/aztk/utils/get_ssh_key.py +++ b/aztk/utils/get_ssh_key.py @@ -29,6 +29,6 @@ def __read_ssh_key_from_file(path: str) -> str: """ Read the content of the given file """ - with open(os.path.expanduser(path), 'r', encoding='UTF-8') as content_file: + with open(os.path.expanduser(path), "r", encoding="UTF-8") as content_file: content = content_file.read() return content diff --git a/aztk/utils/helpers.py b/aztk/utils/helpers.py index cd469889..6fb5aa64 100644 --- a/aztk/utils/helpers.py +++ b/aztk/utils/helpers.py @@ -1,27 +1,26 @@ from __future__ import print_function + import datetime import io -import os -import time -import re -import azure.common -import azure.batch.batch_service_client as batch -import azure.batch.batch_auth as batch_auth -import azure.batch.models as batch_models -import azure.storage.blob as blob -from aztk.version import __version__ -from aztk.utils import constants -from aztk import error -import aztk.models -import yaml import logging +import os +import re +import time -_STANDARD_OUT_FILE_NAME = 'stdout.txt' -_STANDARD_ERROR_FILE_NAME = 'stderr.txt' +import azure.batch.models as batch_models +import azure.common +import azure.storage.blob as blob +import yaml + +import aztk.models +from aztk import error + +_STANDARD_OUT_FILE_NAME = "stdout.txt" +_STANDARD_ERROR_FILE_NAME = "stderr.txt" def is_gpu_enabled(vm_size: str): - return bool(re.search('nv|nc', vm_size, flags=re.IGNORECASE)) + return bool(re.search("nv|nc", vm_size, flags=re.IGNORECASE)) def get_cluster(cluster_id, batch_client): @@ -66,7 +65,7 @@ def wait_for_task_to_complete(job_id: str, task_id: str, batch_client): def upload_text_to_container(container_name: str, application_name: str, content: str, file_path: str, blob_client=None) -> batch_models.ResourceFile: blob_name = file_path - blob_path = application_name + '/' + blob_name # + '/' + time_stamp + '/' + blob_name + blob_path = application_name + "/" + blob_name # + '/' + time_stamp + '/' + blob_name blob_client.create_container(container_name, fail_on_exist=False) blob_client.create_blob_from_text(container_name, blob_path, content) @@ -74,7 +73,8 @@ def upload_text_to_container(container_name: str, application_name: str, content container_name, blob_path, permission=blob.BlobPermissions.READ, - expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365)) + expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365), + ) sas_url = blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token) @@ -104,7 +104,7 @@ def upload_file_to_container(container_name, blob_name = file_path.strip("/") else: blob_name = os.path.basename(file_path) - blob_path = application_name + '/' + blob_name + blob_path = application_name + "/" + blob_name if not node_path: node_path = blob_name @@ -117,7 +117,8 @@ def upload_file_to_container(container_name, container_name, blob_path, permission=blob.BlobPermissions.READ, - expiry=datetime.datetime.utcnow() + datetime.timedelta(days=7)) + expiry=datetime.datetime.utcnow() + datetime.timedelta(days=7), + ) sas_url = blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token) @@ -158,11 +159,11 @@ def wait_for_all_nodes_state(pool, node_state, 
batch_client): # refresh pool to ensure that there is no resize error pool = batch_client.pool.get(pool.id) if pool.resize_errors is not None: - raise RuntimeError('resize error encountered for pool {}: {!r}'.format(pool.id, pool.resize_errors)) + raise RuntimeError("resize error encountered for pool {}: {!r}".format(pool.id, pool.resize_errors)) nodes = list(batch_client.compute_node.list(pool.id)) totalNodes = pool.target_dedicated_nodes + pool.target_low_priority_nodes - if (len(nodes) >= totalNodes and all(node.state in node_state for node in nodes)): + if len(nodes) >= totalNodes and all(node.state in node_state for node in nodes): return nodes time.sleep(1) @@ -241,7 +242,8 @@ def upload_blob_and_create_sas(container_name, blob_name, file_name, expiry, blo permission=blob.BlobPermissions.READ, blob_client=None, expiry=expiry, - timeout=timeout) + timeout=timeout, + ) sas_url = blob_client.make_blob_url(container_name, blob_name, sas_token=sas_token) @@ -256,7 +258,7 @@ def wrap_commands_in_shell(commands): :rtype: str :return: a shell wrapping commands """ - return '/bin/bash -c \'set -e; set -o pipefail; {}; wait\''.format(';'.join(commands)) + return "/bin/bash -c 'set -e; set -o pipefail; {}; wait'".format(";".join(commands)) def get_connection_info(pool_id, node_id, batch_client): @@ -290,11 +292,11 @@ def get_cluster_total_current_nodes(pool): def normalize_path(path: str) -> str: """ Convert a path in a path that will work well with blob storage and unix - It will replace \ with / and remove relative . + It will replace backslashes with forwardslashes and return absolute paths. """ path = os.path.abspath(os.path.expanduser(path)) - path = path.replace('\\', '/') - if path.startswith('./'): + path = path.replace("\\", "/") + if path.startswith("./"): return path[2:] else: return path @@ -326,7 +328,7 @@ def read_stream_as_string(stream, encoding="utf-8"): return output.getvalue().decode(encoding) finally: output.close() - raise RuntimeError('could not write data to stream or decode bytes') + raise RuntimeError("could not write data to stream or decode bytes") def format_batch_exception(batch_exception): @@ -336,17 +338,15 @@ def format_batch_exception(batch_exception): """ l = [] l.append("-------------------------------------------") - if batch_exception.error and \ - batch_exception.error.message and \ - batch_exception.error.message.value: + if batch_exception.error and batch_exception.error.message and batch_exception.error.message.value: l.append(batch_exception.error.message.value) if batch_exception.error.values: - l.append('') + l.append("") for mesg in batch_exception.error.values: l.append("{0}:\t{1}".format(mesg.key, mesg.value)) l.append("-------------------------------------------") - return '\n'.join(l) + return "\n".join(l) def save_cluster_config(cluster_config, blob_client): @@ -363,9 +363,9 @@ def read_cluster_config(cluster_id: str, blob_client: blob.BlockBlobService): result = blob_client.get_blob_to_text(cluster_id, blob_path) return yaml.load(result.content) except azure.common.AzureMissingResourceHttpError: - logging.warn("Cluster %s doesn't have cluster configuration in storage", cluster_id) + logging.warning("Cluster %s doesn't have cluster configuration in storage", cluster_id) except yaml.YAMLError: - logging.warn("Cluster %s contains invalid cluster configuration in blob", cluster_id) + logging.warning("Cluster %s contains invalid cluster configuration in blob", cluster_id) def bool_env(value: bool): diff --git a/aztk/utils/retry.py 
b/aztk/utils/retry.py new file mode 100644 index 00000000..9725d552 --- /dev/null +++ b/aztk/utils/retry.py @@ -0,0 +1,29 @@ +import functools +import time +from enum import Enum + + +class BackOffPolicy(Enum): + linear = "linear" + exponential = "exponential" + + +def retry(retry_count=1, retry_interval=0, backoff_policy=BackOffPolicy.linear, exceptions=()): + def decorator(function): + @functools.wraps(function) + def wrapper(*args, **kwargs): + for i in range(retry_count - 1): + try: + return function(*args, **kwargs) + except exceptions: + if backoff_policy == BackOffPolicy.linear: + time.sleep(i * retry_interval) + if backoff_policy == BackOffPolicy.exponential: + print("sleeping:", 2**(i * retry_interval)) + time.sleep(2**(i * retry_interval)) + # do not retry on the last iteration + return function(*args, **kwargs) + + return wrapper + + return decorator diff --git a/aztk/utils/secure_utils.py b/aztk/utils/secure_utils.py index 557396b6..4490370f 100644 --- a/aztk/utils/secure_utils.py +++ b/aztk/utils/secure_utils.py @@ -23,4 +23,4 @@ def encrypt_password(ssh_pub_key, password): def generate_random_string(charset=string.ascii_uppercase + string.ascii_lowercase, length=16): - return ''.join(random.SystemRandom().choice(charset) for _ in range(length)) + return "".join(random.SystemRandom().choice(charset) for _ in range(length)) diff --git a/aztk/utils/ssh.py b/aztk/utils/ssh.py index 84f4cc6a..1986b333 100644 --- a/aztk/utils/ssh.py +++ b/aztk/utils/ssh.py @@ -1,6 +1,6 @@ -''' +""" SSH utils -''' +""" import asyncio import io import logging @@ -24,34 +24,38 @@ class ForwardServer(SocketServer.ThreadingTCPServer): class Handler(SocketServer.BaseRequestHandler): def handle(self): try: - channel = self.ssh_transport.open_channel('direct-tcpip', + channel = self.ssh_transport.open_channel("direct-tcpip", (self.chain_host, self.chain_port), self.request.getpeername()) except Exception as e: - logging.debug('Incoming request to %s:%d failed: %s', self.chain_host, self.chain_port, repr(e)) + logging.debug("Incoming request to %s:%d failed: %s", self.chain_host, self.chain_port, repr(e)) return if channel is None: - logging.debug('Incoming request to %s:%d was rejected by the SSH server.', self.chain_host, self.chain_port) + logging.debug("Incoming request to %s:%d was rejected by the SSH server.", self.chain_host, self.chain_port) return - logging.debug('Connected! Tunnel open %r -> %r -> %r', self.request.getpeername(), channel.getpeername(), - (self.chain_host, self.chain_port)) + logging.debug( + "Connected! 
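Editorial note: a minimal usage sketch for the retry decorator added in aztk/utils/retry.py above. The flaky_call function and the choice of ConnectionError are illustrative assumptions, not part of the patch:

    import random

    from aztk.utils.retry import BackOffPolicy, retry

    @retry(retry_count=3, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ConnectionError,))
    def flaky_call():
        # Hypothetical operation that fails transiently about half the time (illustration only).
        if random.random() < 0.5:
            raise ConnectionError("transient failure")
        return "ok"

    # The call is attempted up to 3 times; with the exponential policy the decorator sleeps
    # 2 ** (i * retry_interval) seconds between attempts, and the final attempt is made outside
    # the try/except so its exception propagates to the caller.
    print(flaky_call())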
Tunnel open %r -> %r -> %r", + self.request.getpeername(), + channel.getpeername(), + (self.chain_host, self.chain_port), + ) while True: - r, w, x = select.select([self.request, channel], [], []) + r, _, _ = select.select([self.request, channel], [], []) if self.request in r: data = self.request.recv(1024) - if len(data) == 0: + if not data: break channel.send(data) if channel in r: data = channel.recv(1024) - if len(data) == 0: + if not data: break self.request.send(data) peername = self.request.getpeername() channel.close() self.request.close() - logging.debug('Tunnel closed from %r', peername) + logging.debug("Tunnel closed from %r", peername) def forward_tunnel(local_port, remote_host, remote_port, transport): @@ -60,7 +64,7 @@ def forward_tunnel(local_port, remote_host, remote_port, transport): chain_port = remote_port ssh_transport = transport - thread = threading.Thread(target=ForwardServer(('', local_port), SubHandler).serve_forever, daemon=True) + thread = threading.Thread(target=ForwardServer(("", local_port), SubHandler).serve_forever, daemon=True) thread.start() return thread @@ -77,7 +81,7 @@ def connect(hostname, port=22, username=None, password=None, pkey=None, timeout= ssh_key = None timeout = timeout or 20 - logging.debug("Connecting to {}@{}:{}, timeout={}".format(username, hostname, port, timeout)) + logging.debug("Connecting to %s@%s:%d, timeout=%d", username, hostname, port, timeout) try: client.connect(hostname, port=port, username=username, password=password, pkey=ssh_key, timeout=timeout) except socket.timeout: @@ -93,8 +97,12 @@ def forward_ports(client, port_forward_list): for port_forwarding_specification in port_forward_list: threads.append( - forward_tunnel(port_forwarding_specification.remote_port, "127.0.0.1", - port_forwarding_specification.local_port, client.get_transport())) + forward_tunnel( + port_forwarding_specification.remote_port, + "127.0.0.1", + port_forwarding_specification.local_port, + client.get_transport(), + )) return threads @@ -111,13 +119,13 @@ def node_exec_command(node_id, client = connect( hostname=hostname, port=port, username=username, password=password, pkey=ssh_key, timeout=timeout) except AztkError as e: - return NodeOutput(node_id, e) + return NodeOutput(node_id, None, e) if container_name: - cmd = 'sudo docker exec 2>&1 -t {0} /bin/bash -c \'set -e; set -o pipefail; {1}; wait\''.format( + cmd = "sudo docker exec 2>&1 -t {0} /bin/bash -c 'set -e; set -o pipefail; {1}; wait'".format( container_name, command) else: - cmd = '/bin/bash 2>&1 -c \'set -e; set -o pipefail; {0}; wait\''.format(command) - stdin, stdout, stderr = client.exec_command(cmd, get_pty=True) + cmd = "/bin/bash 2>&1 -c 'set -e; set -o pipefail; {0}; wait'".format(command) + _, stdout, _ = client.exec_command(cmd, get_pty=True) output = stdout.read().decode("utf-8") client.close() return NodeOutput(node_id, output, None) @@ -132,22 +140,34 @@ async def clus_exec_command(command, container_name=None, timeout=None): return await asyncio.gather(*[ - asyncio.get_event_loop() - .run_in_executor(ThreadPoolExecutor(), node_exec_command, node.id, command, username, node_rls.ip_address, - node_rls.port, ssh_key, password, container_name, timeout) for node, node_rls in nodes + asyncio.get_event_loop().run_in_executor( + ThreadPoolExecutor(), + node_exec_command, + node.id, + command, + username, + node_rls.ip_address, + node_rls.port, + ssh_key, + password, + container_name, + timeout, + ) for node, node_rls in nodes ]) -def copy_from_node(node_id, - source_path, - 
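Editorial note: the reformatted clus_exec_command and clus_copy above keep the existing fan-out pattern: each node's blocking SSH operation is pushed onto a thread pool and all results are awaited together with asyncio.gather. A generic sketch of that pattern (blocking_task and the node IDs are illustrative, not aztk APIs):

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    def blocking_task(node_id):
        # Stand-in for a blocking per-node call such as an SSH exec or SFTP copy.
        return node_id, "ok"

    async def run_on_all(node_ids):
        loop = asyncio.get_event_loop()
        executor = ThreadPoolExecutor()
        return await asyncio.gather(
            *[loop.run_in_executor(executor, blocking_task, node_id) for node_id in node_ids])

    print(asyncio.get_event_loop().run_until_complete(run_on_all(["node-1", "node-2"])))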
destination_path, - username, - hostname, - port, - ssh_key=None, - password=None, - container_name=None, - timeout=None): +def copy_from_node( + node_id, + source_path, + destination_path, + username, + hostname, + port, + ssh_key=None, + password=None, + container_name=None, + timeout=None, +): try: client = connect( hostname=hostname, port=port, username=username, password=password, pkey=ssh_key, timeout=timeout) @@ -159,32 +179,35 @@ def copy_from_node(node_id, destination_path = os.path.join( os.path.dirname(destination_path), node_id, os.path.basename(destination_path)) os.makedirs(os.path.dirname(destination_path), exist_ok=True) - with open(destination_path, 'wb') as f: + with open(destination_path, "wb") as f: sftp_client.getfo(source_path, f) + return NodeOutput(node_id, f, None) else: import tempfile + # create 2mb temporary file f = tempfile.SpooledTemporaryFile(2 * 1024**3) sftp_client.getfo(source_path, f) - - return NodeOutput(node_id, f, None) + return NodeOutput(node_id, f, None) except OSError as e: - return (node_id, None, e) + return NodeOutput(node_id, None, e) finally: sftp_client.close() client.close() -def node_copy(node_id, - source_path, - destination_path, - username, - hostname, - port, - ssh_key=None, - password=None, - container_name=None, - timeout=None): +def node_copy( + node_id, + source_path, + destination_path, + username, + hostname, + port, + ssh_key=None, + password=None, + container_name=None, + timeout=None, +): try: client = connect( hostname=hostname, port=port, username=username, password=password, pkey=ssh_key, timeout=timeout) @@ -194,12 +217,12 @@ def node_copy(node_id, try: if container_name: # put the file in /tmp on the host - tmp_file = '/tmp/' + os.path.basename(source_path) + tmp_file = "/tmp/" + os.path.basename(source_path) sftp_client.put(source_path, tmp_file) # move to correct destination on container - docker_command = 'sudo docker cp {0} {1}:{2}'.format(tmp_file, container_name, destination_path) + docker_command = "sudo docker cp {0} {1}:{2}".format(tmp_file, container_name, destination_path) _, stdout, _ = client.exec_command(docker_command, get_pty=True) - output = stdout.read().decode('utf-8') + output = stdout.read().decode("utf-8") # clean up sftp_client.remove(tmp_file) return NodeOutput(node_id, output, None) @@ -211,23 +234,35 @@ def node_copy(node_id, finally: sftp_client.close() client.close() - #TODO: progress bar + # TODO: progress bar -async def clus_copy(username, - nodes, - source_path, - destination_path, - ssh_key=None, - password=None, - container_name=None, - get=False, - timeout=None): +async def clus_copy( + username, + nodes, + source_path, + destination_path, + ssh_key=None, + password=None, + container_name=None, + get=False, + timeout=None, +): return await asyncio.gather(*[ - asyncio.get_event_loop() - .run_in_executor(ThreadPoolExecutor(), copy_from_node - if get else node_copy, node.id, source_path, destination_path, username, node_rls.ip_address, - node_rls.port, ssh_key, password, container_name, timeout) for node, node_rls in nodes + asyncio.get_event_loop().run_in_executor( + ThreadPoolExecutor(), + copy_from_node if get else node_copy, + node.id, + source_path, + destination_path, + username, + node_rls.ip_address, + node_rls.port, + ssh_key, + password, + container_name, + timeout, + ) for node, node_rls in nodes ]) @@ -235,12 +270,13 @@ def node_ssh(username, hostname, port, ssh_key=None, password=None, port_forward try: client = connect( hostname=hostname, port=port, username=username, 
password=password, pkey=ssh_key, timeout=timeout) - threads = forward_ports(client=client, port_forward_list=port_forward_list) + forward_ports(client=client, port_forward_list=port_forward_list) except AztkError as e: raise e try: import time + while True: time.sleep(1) except KeyboardInterrupt: diff --git a/aztk/version.py b/aztk/version.py index bc1058e5..71792b62 100644 --- a/aztk/version.py +++ b/aztk/version.py @@ -25,11 +25,6 @@ major = 0 minor = 8 patch = 1 -suffix = '' +suffix = "" -__version__ = "{major}.{minor}.{patch}{suffix}".format( - major=major, - minor=minor, - patch=patch, - suffix=suffix, -) +__version__ = "{major}.{minor}.{patch}{suffix}".format(major=major, minor=minor, patch=patch, suffix=suffix) diff --git a/aztk_cli/__init__.py b/aztk_cli/__init__.py index 7f2ec90a..fd33c296 100644 --- a/aztk_cli/__init__.py +++ b/aztk_cli/__init__.py @@ -1,2 +1,3 @@ import aztk_cli.logger + log = aztk_cli.logger.root diff --git a/aztk_cli/config.py b/aztk_cli/config.py index c5f7d1c9..856cd659 100644 --- a/aztk_cli/config.py +++ b/aztk_cli/config.py @@ -14,7 +14,7 @@ def load_aztk_secrets() -> SecretsConfiguration: """ secrets = SecretsConfiguration() # read global ~/secrets.yaml - global_config = _load_config_file(os.path.join(aztk.utils.constants.HOME_DIRECTORY_PATH, '.aztk', 'secrets.yaml')) + global_config = _load_config_file(os.path.join(aztk.utils.constants.HOME_DIRECTORY_PATH, ".aztk", "secrets.yaml")) # read current working directory secrets.yaml local_config = _load_config_file(aztk.utils.constants.DEFAULT_SECRETS_PATH) @@ -35,7 +35,7 @@ def _load_config_file(path: str): if not os.path.isfile(path): return None - with open(path, 'r', encoding='UTF-8') as stream: + with open(path, "r", encoding="UTF-8") as stream: try: return yaml.load(stream) except yaml.YAMLError as err: @@ -57,18 +57,18 @@ def read_cluster_config(path: str = aztk.utils.constants.DEFAULT_CLUSTER_CONFIG_ def cluster_config_from_dict(config: dict): wait = False - if config.get('plugins') not in [[None], None]: + if config.get("plugins") not in [[None], None]: plugins = [] - for plugin in config['plugins']: + for plugin in config["plugins"]: ref = PluginReference.from_dict(plugin) plugins.append(ref.get_plugin()) config["plugins"] = plugins - if config.get('username') is not None: - config['user_configuration'] = dict(username=config.pop('username')) + if config.get("username") is not None: + config["user_configuration"] = dict(username=config.pop("username")) - if config.get('wait') is not None: - wait = config.pop('wait') + if config.get("wait") is not None: + wait = config.pop("wait") return ClusterConfiguration.from_dict(config), wait @@ -82,9 +82,9 @@ class SshConfig: self.internal = False # Set up ports with default values - self.job_ui_port = '4040' - self.job_history_ui_port = '18080' - self.web_ui_port = '8080' + self.job_ui_port = "4040" + self.job_history_ui_port = "18080" + self.web_ui_port = "8080" def _read_config_file(self, path: str = aztk.utils.constants.DEFAULT_SSH_CONFIG_PATH): """ @@ -93,7 +93,7 @@ class SshConfig: if not os.path.isfile(path): return - with open(path, 'r', encoding='UTF-8') as stream: + with open(path, "r", encoding="UTF-8") as stream: try: config = yaml.load(stream) except yaml.YAMLError as err: @@ -105,35 +105,35 @@ class SshConfig: self._merge_dict(config) def _merge_dict(self, config): - if config.get('username') is not None: - self.username = config['username'] + if config.get("username") is not None: + self.username = config["username"] - if config.get('cluster_id') 
is not None: - self.cluster_id = config['cluster_id'] + if config.get("cluster_id") is not None: + self.cluster_id = config["cluster_id"] - if config.get('job_ui_port') is not None: - self.job_ui_port = config['job_ui_port'] + if config.get("job_ui_port") is not None: + self.job_ui_port = config["job_ui_port"] - if config.get('job_history_ui_port') is not None: - self.job_history_ui_port = config['job_history_ui_port'] + if config.get("job_history_ui_port") is not None: + self.job_history_ui_port = config["job_history_ui_port"] - if config.get('web_ui_port') is not None: - self.web_ui_port = config['web_ui_port'] + if config.get("web_ui_port") is not None: + self.web_ui_port = config["web_ui_port"] - if config.get('host') is not None: - self.host = config['host'] + if config.get("host") is not None: + self.host = config["host"] - if config.get('connect') is not None: - self.connect = config['connect'] + if config.get("connect") is not None: + self.connect = config["connect"] - if config.get('internal') is not None: - self.internal = config['internal'] + if config.get("internal") is not None: + self.internal = config["internal"] def merge(self, cluster_id, username, job_ui_port, job_history_ui_port, web_ui_port, host, connect, internal): """ Merges fields with args object """ - self._read_config_file(os.path.join(aztk.utils.constants.HOME_DIRECTORY_PATH, '.aztk', 'ssh.yaml')) + self._read_config_file(os.path.join(aztk.utils.constants.HOME_DIRECTORY_PATH, ".aztk", "ssh.yaml")) self._read_config_file() self._merge_dict( dict( @@ -144,19 +144,28 @@ class SshConfig: web_ui_port=web_ui_port, host=host, connect=connect, - internal=internal)) + internal=internal, + )) if self.cluster_id is None: - raise aztk.error.AztkError( - "Please supply an id for the cluster either in the ssh.yaml configuration file or with a parameter (--id)" - ) + raise aztk.error.AztkError("Please supply an id for the cluster either in the ssh.yaml configuration file " + "or with a parameter (--id)") if self.username is None: raise aztk.error.AztkError( "Please supply a username either in the ssh.yaml configuration file or with a parameter (--username)") -class JobConfig(): +def __convert_to_path(path: str): + if path: + abs_path = os.path.abspath(os.path.expanduser(path)) + if not os.path.exists(abs_path): + raise aztk.error.AztkError("Could not find file: {0}\nCheck your configuration file".format(path)) + return abs_path + return None + + +class JobConfig: def __init__(self): self.id = None self.applications = [] @@ -171,61 +180,56 @@ class JobConfig(): self.subnet_id = None self.worker_on_master = None self.scheduling_target = None + self.jars = [] def _merge_dict(self, config): - config = config.get('job') + config = config.get("job") - if config.get('id') is not None: - self.id = config['id'] + if config.get("id") is not None: + self.id = config["id"] - cluster_configuration = config.get('cluster_configuration') + cluster_configuration = config.get("cluster_configuration") if cluster_configuration: - self.vm_size = cluster_configuration.get('vm_size') - self.toolkit = Toolkit.from_dict(cluster_configuration.get('toolkit')) - if cluster_configuration.get('size') is not None: - self.max_dedicated_nodes = cluster_configuration.get('size') - if cluster_configuration.get('size_low_priority') is not None: - self.max_low_pri_nodes = cluster_configuration.get('size_low_priority') - self.subnet_id = cluster_configuration.get('subnet_id') + self.vm_size = cluster_configuration.get("vm_size") + self.toolkit = 
Toolkit.from_dict(cluster_configuration.get("toolkit")) + if cluster_configuration.get("size") is not None: + self.max_dedicated_nodes = cluster_configuration.get("size") + if cluster_configuration.get("size_low_priority") is not None: + self.max_low_pri_nodes = cluster_configuration.get("size_low_priority") + self.subnet_id = cluster_configuration.get("subnet_id") self.worker_on_master = cluster_configuration.get("worker_on_master") scheduling_target = cluster_configuration.get("scheduling_target") if scheduling_target: self.scheduling_target = SchedulingTarget(scheduling_target) - applications = config.get('applications') + applications = config.get("applications") if applications: self.applications = [] for application in applications: self.applications.append( aztk.spark.models.ApplicationConfiguration( - name=application.get('name'), - application=application.get('application'), - application_args=application.get('application_args'), - main_class=application.get('main_class'), - jars=application.get('jars'), - py_files=application.get('py_files'), - files=application.get('files'), - driver_java_options=application.get('driver_java_options'), - driver_library_path=application.get('driver_library_path'), - driver_class_path=application.get('driver_class_path'), - driver_memory=application.get('driver_memory'), - executor_memory=application.get('executor_memory'), - driver_cores=application.get('driver_cores'), - executor_cores=application.get('executor_cores'))) + name=application.get("name"), + application=application.get("application"), + application_args=application.get("application_args"), + main_class=application.get("main_class"), + jars=application.get("jars"), + py_files=application.get("py_files"), + files=application.get("files"), + driver_java_options=application.get("driver_java_options"), + driver_library_path=application.get("driver_library_path"), + driver_class_path=application.get("driver_class_path"), + driver_memory=application.get("driver_memory"), + executor_memory=application.get("executor_memory"), + driver_cores=application.get("driver_cores"), + executor_cores=application.get("executor_cores"), + )) - spark_configuration = config.get('spark_configuration') + spark_configuration = config.get("spark_configuration") if spark_configuration: - self.spark_defaults_conf = self.__convert_to_path(spark_configuration.get('spark_defaults_conf')) - self.spark_env_sh = self.__convert_to_path(spark_configuration.get('spark_env_sh')) - self.core_site_xml = self.__convert_to_path(spark_configuration.get('core_site_xml')) - self.jars = [self.__convert_to_path(jar) for jar in spark_configuration.get('jars') or []] - - def __convert_to_path(self, str_path): - if str_path: - abs_path = os.path.abspath(os.path.expanduser(str_path)) - if not os.path.exists(abs_path): - raise aztk.error.AztkError("Could not find file: {0}\nCheck your configuration file".format(str_path)) - return abs_path + self.spark_defaults_conf = __convert_to_path(spark_configuration.get("spark_defaults_conf")) + self.spark_env_sh = __convert_to_path(spark_configuration.get("spark_env_sh")) + self.core_site_xml = __convert_to_path(spark_configuration.get("core_site_xml")) + self.jars = [__convert_to_path(jar) for jar in spark_configuration.get("jars") or []] def _read_config_file(self, path: str = aztk.utils.constants.DEFAULT_SPARK_JOB_CONFIG): """ @@ -234,7 +238,7 @@ class JobConfig(): if not path or not os.path.isfile(path): return - with open(path, 'r', encoding='UTF-8') as stream: + with open(path, "r", 
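Editorial note: one caveat with the __convert_to_path refactor above — because the helper keeps a double-underscore name but now lives at module level, bare references to it inside JobConfig methods are still subject to Python's name mangling and resolve to _JobConfig__convert_to_path; a single-underscore name (for example _convert_to_path) avoids this. A minimal reproduction of the pitfall, with illustrative names:

    def __to_path(value):
        return value

    class Config:
        def load(self):
            # Inside the class body this bare name is mangled to _Config__to_path,
            # so the lookup fails at call time.
            return __to_path("job.yaml")

    try:
        Config().load()
    except NameError as err:
        print(err)  # name '_Config__to_path' is not defined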
encoding="UTF-8") as stream: try: config = yaml.load(stream) except yaml.YAMLError as err: @@ -274,10 +278,11 @@ def get_file_if_exists(file): def load_aztk_spark_config(): return aztk.spark.models.SparkConfiguration( - spark_defaults_conf=get_file_if_exists('spark-defaults.conf'), + spark_defaults_conf=get_file_if_exists("spark-defaults.conf"), jars=load_jars(), - spark_env_sh=get_file_if_exists('spark-env.sh'), - core_site_xml=get_file_if_exists('core-site.xml')) + spark_env_sh=get_file_if_exists("spark-env.sh"), + core_site_xml=get_file_if_exists("core-site.xml"), + ) def load_jars(): @@ -285,14 +290,14 @@ def load_jars(): # try load global try: - jars_src = os.path.join(aztk.utils.constants.GLOBAL_CONFIG_PATH, 'jars') + jars_src = os.path.join(aztk.utils.constants.GLOBAL_CONFIG_PATH, "jars") jars = [os.path.join(jars_src, jar) for jar in os.listdir(jars_src)] except FileNotFoundError: pass # try load local, overwrite if found try: - jars_src = os.path.join(aztk.utils.constants.DEFAULT_SPARK_CONF_SOURCE, 'jars') + jars_src = os.path.join(aztk.utils.constants.DEFAULT_SPARK_CONF_SOURCE, "jars") jars = [os.path.join(jars_src, jar) for jar in os.listdir(jars_src)] except FileNotFoundError: pass diff --git a/aztk_cli/constants.py b/aztk_cli/constants.py index 9bca4ba8..b2333d45 100644 --- a/aztk_cli/constants.py +++ b/aztk_cli/constants.py @@ -1,4 +1,4 @@ """ Name of the executable """ -CLI_EXE = 'aztk' +CLI_EXE = "aztk" diff --git a/aztk_cli/entrypoint.py b/aztk_cli/entrypoint.py index 3044a797..3c57c1f9 100644 --- a/aztk_cli/entrypoint.py +++ b/aztk_cli/entrypoint.py @@ -7,10 +7,13 @@ import argparse import warnings from typing import NamedTuple -import azure.batch.models.batch_error as batch_error + +from azure.batch.models import batch_error + import aztk -from aztk_cli import logger, log, utils, constants +from aztk_cli import constants, log, logger, utils from aztk_cli.spark.endpoints import spark + from . 
import plugins, toolkit @@ -46,8 +49,8 @@ def main(): def setup_common_args(parser: argparse.ArgumentParser): - parser.add_argument('--version', action='version', version=aztk.version.__version__) - parser.add_argument("--verbose", action='store_true', help="Enable verbose logging.") + parser.add_argument("--version", action="version", version=aztk.version.__version__) + parser.add_argument("--verbose", action="store_true", help="Enable verbose logging.") def parse_common_args(args: NamedTuple): @@ -69,5 +72,5 @@ def run_software(args: NamedTuple): func(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/aztk_cli/logger.py b/aztk_cli/logger.py index 24c94bf1..982a1485 100644 --- a/aztk_cli/logger.py +++ b/aztk_cli/logger.py @@ -4,21 +4,22 @@ import sys root = logging.getLogger("aztk") -DEFAULT_FORMAT = '%(message)s' -VERBOSE_FORMAT = '[%(asctime)s] [%(filename)s:%(module)s:%(funcName)s:%(lineno)d] %(levelname)s - %(message)s' +DEFAULT_FORMAT = "%(message)s" +VERBOSE_FORMAT = "[%(asctime)s] [%(filename)s:%(module)s:%(funcName)s:%(lineno)d] %(levelname)s - %(message)s" def add_coloring_to_emit_windows(fn): # add methods we need to the class - def _set_color(self, code): + def set_color(self, code): import ctypes + # Constants from the Windows API self.STD_OUTPUT_HANDLE = -11 hdl = ctypes.windll.kernel32.GetStdHandle(self.STD_OUTPUT_HANDLE) ctypes.windll.kernel32.SetConsoleTextAttribute(hdl, code) - setattr(logging.StreamHandler, '_set_color', _set_color) + setattr(logging.StreamHandler, "set_color", set_color) def new(*args): FOREGROUND_BLUE = 0x0001 # text color contains blue. @@ -27,30 +28,15 @@ def add_coloring_to_emit_windows(fn): FOREGROUND_INTENSITY = 0x0008 # text color is intensified. FOREGROUND_WHITE = FOREGROUND_BLUE | FOREGROUND_GREEN | FOREGROUND_RED - # winbase.h - STD_INPUT_HANDLE = -10 - STD_OUTPUT_HANDLE = -11 - STD_ERROR_HANDLE = -12 - # wincon.h - FOREGROUND_BLACK = 0x0000 FOREGROUND_BLUE = 0x0001 FOREGROUND_GREEN = 0x0002 - FOREGROUND_CYAN = 0x0003 FOREGROUND_RED = 0x0004 FOREGROUND_MAGENTA = 0x0005 FOREGROUND_YELLOW = 0x0006 - FOREGROUND_GREY = 0x0007 FOREGROUND_INTENSITY = 0x0008 # foreground color is intensified. - BACKGROUND_BLACK = 0x0000 - BACKGROUND_BLUE = 0x0010 - BACKGROUND_GREEN = 0x0020 - BACKGROUND_CYAN = 0x0030 - BACKGROUND_RED = 0x0040 - BACKGROUND_MAGENTA = 0x0050 BACKGROUND_YELLOW = 0x0060 - BACKGROUND_GREY = 0x0070 BACKGROUND_INTENSITY = 0x0080 # background color is intensified. 
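Editorial note: the logger changes above are quote-style and dead-constant cleanups; the underlying approach of wrapping StreamHandler.emit to colour messages by level is unchanged. A minimal standalone sketch of that approach for ANSI terminals (a reduced colour mapping, not the aztk module itself):

    import logging

    def add_coloring_to_emit_ansi(fn):
        def new(*args):
            record = args[1]
            # Reduced mapping: red for errors and above, yellow for warnings, normal otherwise.
            if record.levelno >= 40:
                color = "\x1b[31m"
            elif record.levelno >= 30:
                color = "\x1b[33m"
            else:
                color = "\x1b[0m"
            record.msg = color + str(record.msg) + "\x1b[0m"
            return fn(*args)

        return new

    logging.StreamHandler.emit = add_coloring_to_emit_ansi(logging.StreamHandler.emit)
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("demo").warning("rendered in yellow on ANSI-capable terminals")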
levelno = args[1].levelno @@ -68,10 +54,10 @@ def add_coloring_to_emit_windows(fn): color = FOREGROUND_MAGENTA else: color = FOREGROUND_WHITE - args[0]._set_color(color) + args[0].set_color(color) ret = fn(*args) - args[0]._set_color(FOREGROUND_WHITE) + args[0].set_color(FOREGROUND_WHITE) # print "after" return ret @@ -83,27 +69,27 @@ def add_coloring_to_emit_ansi(fn): def new(*args): levelno = args[1].levelno if levelno >= 50: - color = '\x1b[31m' # red + color = "\x1b[31m" # red elif levelno >= 40: - color = '\x1b[31m' # red + color = "\x1b[31m" # red elif levelno >= 30: - color = '\x1b[33m' # yellow + color = "\x1b[33m" # yellow elif levelno >= 20: - color = '\x1b[32m' # green + color = "\x1b[32m" # green elif levelno >= 19: - color = '\x1b[0m' # normal + color = "\x1b[0m" # normal elif levelno >= 10: - color = '\x1b[35m' # pink + color = "\x1b[35m" # pink else: - color = '\x1b[0m' # normal - args[1].msg = color + args[1].msg + '\x1b[0m' # normal + color = "\x1b[0m" # normal + args[1].msg = color + args[1].msg + "\x1b[0m" # normal # print "after" return fn(*args) return new -if platform.system() == 'Windows': +if platform.system() == "Windows": # Windows does not support ANSI escapes and we are using API calls to set the console color logging.StreamHandler.emit = add_coloring_to_emit_windows(logging.StreamHandler.emit) else: @@ -114,6 +100,7 @@ logging.PRINT = 19 logging.addLevelName(logging.PRINT, "PRINT") +# pylint: disable=protected-access def print_level(self, message, *args, **kwargs): self._log(logging.PRINT, message, args, **kwargs) diff --git a/aztk_cli/plugins.py b/aztk_cli/plugins.py index 7a66f02a..a57cd6da 100644 --- a/aztk_cli/plugins.py +++ b/aztk_cli/plugins.py @@ -1,10 +1,11 @@ import argparse import typing -from aztk_cli import log + from aztk.models.plugins.internal import plugin_manager +from aztk_cli import log -def setup_parser(parser: argparse.ArgumentParser): +def setup_parser(_: argparse.ArgumentParser): pass diff --git a/aztk_cli/spark/endpoints/cluster/cluster_add_user.py b/aztk_cli/spark/endpoints/cluster/cluster_add_user.py index a3837cc8..673ce1e7 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_add_user.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_add_user.py @@ -6,18 +6,19 @@ from aztk_cli import config, log, utils def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='cluster_id', required=True, help='The unique id of your spark cluster') - parser.add_argument('-u', '--username', help='The username to access your spark cluster\'s head node') + parser.add_argument("--id", dest="cluster_id", required=True, help="The unique id of your spark cluster") + parser.add_argument("-u", "--username", help="The username to access your spark cluster's head node") auth_group = parser.add_mutually_exclusive_group() auth_group.add_argument( - '-p', - '--password', - help="The password to access your spark cluster's master node. If not provided will use ssh public key.") + "-p", + "--password", + help="The password to access your spark cluster's master node. If not provided will use ssh public key.", + ) auth_group.add_argument( - '--ssh-key', - help= - "The ssh public key to access your spark cluster's master node. You can also set the ssh-key in the configuration file." + "--ssh-key", + help="The ssh public key to access your spark cluster's master node. 
" + "You can also set the ssh-key in the configuration file.", ) parser.set_defaults(username="admin") @@ -25,10 +26,10 @@ def setup_parser(parser: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) - log.info('-------------------------------------------') - log.info('spark cluster id: {}'.format(args.cluster_id)) - log.info('username: {}'.format(args.username)) - log.info('-------------------------------------------') + log.info("-------------------------------------------") + log.info("spark cluster id: %s", args.cluster_id) + log.info("username: %s", args.username) + log.info("-------------------------------------------") if args.ssh_key: ssh_key = args.ssh_key @@ -41,8 +42,8 @@ def execute(args: typing.NamedTuple): spark_client.cluster.create_user(id=args.cluster_id, username=args.username, password=password, ssh_key=ssh_key) if password: - log.info('password: %s', '*' * len(password)) + log.info("password: %s", "*" * len(password)) elif ssh_key: - log.info('ssh public key: %s', ssh_key) + log.info("ssh public key: %s", ssh_key) - log.info('-------------------------------------------') + log.info("-------------------------------------------") diff --git a/aztk_cli/spark/endpoints/cluster/cluster_app_logs.py b/aztk_cli/spark/endpoints/cluster/cluster_app_logs.py index 03207633..927a6719 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_app_logs.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_app_logs.py @@ -7,16 +7,17 @@ from aztk_cli import config, utils, log def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='cluster_id', required=True, help='The unique id of your spark cluster') - parser.add_argument('--name', dest='app_name', required=True, help='The unique id of your job name') + parser.add_argument("--id", dest="cluster_id", required=True, help="The unique id of your spark cluster") + parser.add_argument("--name", dest="app_name", required=True, help="The unique id of your job name") output_group = parser.add_mutually_exclusive_group() output_group.add_argument( - '--output', - help='Path to the file you wish to output to. If not \ - specified, output is printed to stdout') - output_group.add_argument('--tail', dest='tail', action='store_true') + "--output", + help="Path to the file you wish to output to. 
If not \ + specified, output is printed to stdout", + ) + output_group.add_argument("--tail", dest="tail", action="store_true") def execute(args: typing.NamedTuple): diff --git a/aztk_cli/spark/endpoints/cluster/cluster_copy.py b/aztk_cli/spark/endpoints/cluster/cluster_copy.py index ba17bbda..6ad8b010 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_copy.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_copy.py @@ -3,21 +3,25 @@ import sys import typing import aztk.spark -from aztk_cli import config, log, utils +from aztk_cli import config, utils def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='cluster_id', required=True, help='The unique id of your spark cluster') + parser.add_argument("--id", dest="cluster_id", required=True, help="The unique id of your spark cluster") - parser.add_argument('--source-path', required=True, help='the local file you wish to copy to the cluster') + parser.add_argument("--source-path", required=True, help="the local file you wish to copy to the cluster") - parser.add_argument('--dest-path', required=True, - help='the path the file will be copied to on each node in the cluster.'\ - 'Note that this must include the file name.') parser.add_argument( - '--internal', - action='store_true', - help='Connect using the local IP of the master node. Only use if using a VPN.') + "--dest-path", + required=True, + help="the path the file will be copied to on each node in the cluster." + "Note that this must include the file name.", + ) + parser.add_argument( + "--internal", + action="store_true", + help="Connect using the local IP of the master node. Only use if using a VPN.", + ) parser.set_defaults(internal=False) @@ -26,5 +30,6 @@ def execute(args: typing.NamedTuple): with utils.Spinner(): copy_output = spark_client.cluster.copy( id=args.cluster_id, source_path=args.source_path, destination_path=args.dest_path, internal=args.internal) - [utils.log_node_copy_output(node_output) for node_output in copy_output] + for node_output in copy_output: + utils.log_node_copy_output(node_output) sys.exit(0 if not any([node_output.error for node_output in copy_output]) else 1) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_create.py b/aztk_cli/spark/endpoints/cluster/cluster_create.py index c738e89d..fffbb93d 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_create.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_create.py @@ -8,25 +8,27 @@ from aztk_cli.config import load_aztk_spark_config def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='cluster_id', help='The unique id of your spark cluster') - parser.add_argument('--size', type=int, help='Number of vms in your cluster') + parser.add_argument("--id", dest="cluster_id", help="The unique id of your spark cluster") + parser.add_argument("--size", type=int, help="Number of vms in your cluster") parser.add_argument( - '--size-low-pri', + "--size-low-pri", type=int, - help='Number of low priority vms in your cluster (Deprecated, use --size-low-priority)') - parser.add_argument('--size-low-priority', type=int, help='Number of low priority vms in your cluster') - parser.add_argument('--vm-size', help='VM size for nodes in your cluster') - parser.add_argument('--username', help='Username to access your cluster (required: --wait flag)') + help="Number of low priority vms in your cluster (Deprecated, use --size-low-priority)", + ) + parser.add_argument("--size-low-priority", type=int, help="Number of low priority vms in your cluster") + 
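Editorial note: the cluster_create parser above relies on the common argparse pattern of paired --wait/--no-wait flags writing to a single destination, with a None default so "flag not given" can be distinguished from either explicit choice and downstream code can fall back to its configuration defaults. A small standalone sketch (parser contents are illustrative):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--no-wait", dest="wait", action="store_false")
    parser.add_argument("--wait", dest="wait", action="store_true")
    parser.set_defaults(wait=None)

    print(parser.parse_args([]).wait)             # None -> defer to configuration defaults
    print(parser.parse_args(["--wait"]).wait)     # True
    print(parser.parse_args(["--no-wait"]).wait)  # False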
parser.add_argument("--vm-size", help="VM size for nodes in your cluster") + parser.add_argument("--username", help="Username to access your cluster (required: --wait flag)") parser.add_argument( - '--password', - help="The password to access your spark cluster's head node. If not provided will use ssh public key.") + "--password", + help="The password to access your spark cluster's head node. If not provided will use ssh public key.", + ) parser.add_argument( - '--docker-repo', help='The location of the public docker image you want to use (/:)') - parser.add_argument('--docker-run-options', help='command line options to pass to `docker run`') - parser.add_argument('--subnet-id', help='The subnet in which to create the cluster.') + "--docker-repo", help="The location of the public docker image you want to use (/:)") + parser.add_argument("--docker-run-options", help="command line options to pass to `docker run`") + parser.add_argument("--subnet-id", help="The subnet in which to create the cluster.") - parser.add_argument('--no-wait', dest='wait', action='store_false') - parser.add_argument('--wait', dest='wait', action='store_true') + parser.add_argument("--no-wait", dest="wait", action="store_false") + parser.add_argument("--wait", dest="wait", action="store_true") parser.set_defaults(wait=None, size=None, size_low_pri=None, size_low_priority=None) @@ -46,10 +48,8 @@ def execute(args: typing.NamedTuple): size_low_priority=args.size_low_priority, vm_size=args.vm_size, subnet_id=args.subnet_id, - user_configuration=UserConfiguration( - username=args.username, - password=args.password, - ))) + user_configuration=UserConfiguration(username=args.username, password=args.password), + )) if cluster_conf.toolkit: if args.docker_repo: @@ -62,9 +62,12 @@ def execute(args: typing.NamedTuple): user_configuration = cluster_conf.user_configuration if user_configuration and user_configuration.username: - ssh_key, password = utils.get_ssh_key_or_prompt(spark_client.secrets_configuration.ssh_pub_key, - user_configuration.username, user_configuration.password, - spark_client.secrets_configuration) + ssh_key, password = utils.get_ssh_key_or_prompt( + spark_client.secrets_configuration.ssh_pub_key, + user_configuration.username, + user_configuration.password, + spark_client.secrets_configuration, + ) cluster_conf.user_configuration = aztk.spark.models.UserConfiguration( username=user_configuration.username, password=password, ssh_key=ssh_key) else: diff --git a/aztk_cli/spark/endpoints/cluster/cluster_debug.py b/aztk_cli/spark/endpoints/cluster/cluster_debug.py index f1a72707..ea0196a2 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_debug.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_debug.py @@ -8,11 +8,11 @@ from aztk_cli import config, utils def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='cluster_id', required=True, help='The unique id of your spark cluster') + parser.add_argument("--id", dest="cluster_id", required=True, help="The unique id of your spark cluster") - parser.add_argument('--output', '-o', required=False, help='the directory for the output folder') + parser.add_argument("--output", "-o", required=False, help="the directory for the output folder") parser.add_argument( - '--brief', '-b', required=False, action='store_true', help='Only gets a small subset of key logs') + "--brief", "-b", required=False, action="store_true", help="Only gets a small subset of key logs") parser.set_defaults(brief=False) diff --git 
a/aztk_cli/spark/endpoints/cluster/cluster_delete.py b/aztk_cli/spark/endpoints/cluster/cluster_delete.py index 82df04f2..5431db33 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_delete.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_delete.py @@ -7,21 +7,23 @@ from aztk_cli import config, log def setup_parser(parser: argparse.ArgumentParser): parser.add_argument( - '--id', dest='cluster_ids', nargs='*', required=True, help='The unique id of your spark cluster') + "--id", dest="cluster_ids", nargs="*", required=True, help="The unique id of your spark cluster") parser.add_argument( - '--force', - '-f', - dest='force', + "--force", + "-f", + dest="force", required=False, - action='store_true', - help='Do not prompt for confirmation, force deletion of cluster.') + action="store_true", + help="Do not prompt for confirmation, force deletion of cluster.", + ) parser.add_argument( - '--keep-logs', - '-k', - dest='keep_logs', - action='store_true', + "--keep-logs", + "-k", + dest="keep_logs", + action="store_true", required=False, - help='Prevent logs in storage from being deleted.') + help="Prevent logs in storage from being deleted.", + ) parser.set_defaults(force=False, keep_logs=False) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_get.py b/aztk_cli/spark/endpoints/cluster/cluster_get.py index 2bdf5ce4..93f52dbc 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_get.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_get.py @@ -6,11 +6,14 @@ from aztk_cli import config, log, utils def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='cluster_id', required=True, help='The unique id of your spark cluster') - parser.add_argument('--show-config', dest='show_config', action='store_true', help='Show the cluster configuration') - parser.add_argument('--internal', action='store_true', - help="Show the local IP of the nodes. "\ - "Only use if using connecting with a VPN.") + parser.add_argument("--id", dest="cluster_id", required=True, help="The unique id of your spark cluster") + parser.add_argument("--show-config", dest="show_config", action="store_true", help="Show the cluster configuration") + parser.add_argument( + "--internal", + action="store_true", + help="Show the local IP of the nodes. 
" + "Only use if using connecting with a VPN.", + ) parser.set_defaults(internal=False) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_list.py b/aztk_cli/spark/endpoints/cluster/cluster_list.py index 9f8dafbd..20d5231d 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_list.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_list.py @@ -7,7 +7,7 @@ from aztk_cli import config, utils def setup_parser(parser: argparse.ArgumentParser): parser.add_argument( - '-q', '--quiet', dest='quiet', required=False, action='store_true', help='The unique id of your spark cluster') + "-q", "--quiet", dest="quiet", required=False, action="store_true", help="The unique id of your spark cluster") parser.set_defaults(quiet=False) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_run.py b/aztk_cli/spark/endpoints/cluster/cluster_run.py index 9db3df84..d3c2caf4 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_run.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_run.py @@ -2,24 +2,25 @@ import argparse import typing import aztk.spark -from aztk_cli import config, log, utils +from aztk_cli import config, utils def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='cluster_id', required=True, help='The unique id of your spark cluster') + parser.add_argument("--id", dest="cluster_id", required=True, help="The unique id of your spark cluster") parser.add_argument( - '--node-id', - '-n', - dest='node_id', + "--node-id", + "-n", + dest="node_id", required=False, - help='The unique id of the node in the cluster to run the command on') - parser.add_argument('command', help='The command to run on your spark cluster') + help="The unique id of the node in the cluster to run the command on", + ) + parser.add_argument("command", help="The command to run on your spark cluster") parser.add_argument( - '--internal', - action='store_true', - help='Connect using the local IP of the master node. Only use if using a VPN') + "--internal", + action="store_true", + help="Connect using the local IP of the master node. 
Only use if using a VPN") parser.add_argument( - '--host', action='store_true', help='Run the command on the host instead of the Spark Docker container') + "--host", action="store_true", help="Run the command on the host instead of the Spark Docker container") parser.set_defaults(internal=False, host=False) @@ -32,5 +33,5 @@ def execute(args: typing.NamedTuple): ] else: results = spark_client.cluster.run(args.cluster_id, args.command, args.host, args.internal) - - [utils.log_node_run_output(node_output) for node_output in results] + for node_output in results: + utils.log_node_run_output(node_output) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_ssh.py b/aztk_cli/spark/endpoints/cluster/cluster_ssh.py index a0b7aa4a..1711c3bd 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_ssh.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_ssh.py @@ -1,37 +1,38 @@ import argparse import typing -import azure.batch.models.batch_error as batch_error +from azure.batch.models import batch_error import aztk from aztk.models import ClusterConfiguration +from aztk.spark.models import PortForwardingSpecification from aztk_cli import config, log, utils from aztk_cli.config import SshConfig -from aztk.spark.models import PortForwardingSpecification - def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest="cluster_id", help='The unique id of your spark cluster') - parser.add_argument('--webui', help='Local port to port spark\'s master UI to') - parser.add_argument('--jobui', help='Local port to port spark\'s job UI to') - parser.add_argument('--jobhistoryui', help='Local port to port spark\'s job history UI to') - parser.add_argument('-u', '--username', help='Username to spark cluster') - parser.add_argument('--password', help='Password for the specified ssh user') - parser.add_argument('--host', dest="host", action='store_true', help='Connect to the host of the Spark container') + parser.add_argument("--id", dest="cluster_id", help="The unique id of your spark cluster") + parser.add_argument("--webui", help="Local port to port spark's master UI to") + parser.add_argument("--jobui", help="Local port to port spark's job UI to") + parser.add_argument("--jobhistoryui", help="Local port to port spark's job history UI to") + parser.add_argument("-u", "--username", help="Username to spark cluster") + parser.add_argument("--password", help="Password for the specified ssh user") + parser.add_argument("--host", dest="host", action="store_true", help="Connect to the host of the Spark container") parser.add_argument( - '--no-connect', + "--no-connect", dest="connect", - action='store_false', - help='Do not create the ssh session. Only print out the command to run.') + action="store_false", + help="Do not create the ssh session. Only print out the command to run.", + ) parser.add_argument( - '--internal', - action='store_true', - help='Connect using the local IP of the master node. Only use if using a VPN.') + "--internal", + action="store_true", + help="Connect using the local IP of the master node. 
Only use if using a VPN.", + ) parser.set_defaults(connect=True, internal=False) -http_prefix = 'http://localhost:' +http_prefix = "http://localhost:" def execute(args: typing.NamedTuple): @@ -48,7 +49,8 @@ def execute(args: typing.NamedTuple): web_ui_port=args.webui, host=args.host, connect=args.connect, - internal=args.internal) + internal=args.internal, + ) log.info("-------------------------------------------") utils.log_property("spark cluster id", ssh_conf.cluster_id) @@ -98,12 +100,11 @@ def native_python_ssh_into_master(spark_client, cluster, cluster_configuration, log.warning("No ssh client found, using pure python connection.") return - configuration = spark_client.cluster.get_configuration(cluster.id) plugin_ports = [] - if configuration and configuration.plugins: + if cluster_configuration and cluster_configuration.plugins: ports = [ PortForwardingSpecification(port.internal, port.public_port) - for plugin in configuration.plugins + for plugin in cluster_configuration.plugins for port in plugin.ports if port.expose_publicly ] @@ -112,7 +113,6 @@ def native_python_ssh_into_master(spark_client, cluster, cluster_configuration, print("Press ctrl+c to exit...") spark_client.cluster.ssh_into_master( cluster.id, - cluster.master_node_id, ssh_conf.username, ssh_key=None, password=password, @@ -121,7 +121,8 @@ def native_python_ssh_into_master(spark_client, cluster, cluster_configuration, PortForwardingSpecification(remote_port=4040, local_port=4040), # job ui PortForwardingSpecification(remote_port=18080, local_port=18080), # job history ui ] + plugin_ports, - internal=ssh_conf.internal) + internal=ssh_conf.internal, + ) def shell_out_ssh(spark_client, cluster_configuration, ssh_conf): @@ -136,7 +137,8 @@ def shell_out_ssh(spark_client, cluster_configuration, ssh_conf): username=ssh_conf.username, host=ssh_conf.host, connect=ssh_conf.connect, - internal=ssh_conf.internal) + internal=ssh_conf.internal, + ) if not ssh_conf.connect: log.info("") diff --git a/aztk_cli/spark/endpoints/cluster/cluster_submit.py b/aztk_cli/spark/endpoints/cluster/cluster_submit.py index 2669b6de..d1c25e98 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_submit.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_submit.py @@ -8,80 +8,89 @@ from aztk_cli import config, log, utils def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='cluster_id', required=True, help='The unique id of your spark cluster') + parser.add_argument("--id", dest="cluster_id", required=True, help="The unique id of your spark cluster") - parser.add_argument('--name', required=True, help='a name for your application') + parser.add_argument("--name", required=True, help="a name for your application") - parser.add_argument('--wait', dest='wait', action='store_true', help='Wait for app to complete') - parser.add_argument('--no-wait', dest='wait', action='store_false', help='Do not wait for app to complete') + parser.add_argument("--wait", dest="wait", action="store_true", help="Wait for app to complete") + parser.add_argument("--no-wait", dest="wait", action="store_false", help="Do not wait for app to complete") parser.set_defaults(wait=True) - parser.add_argument('--class', dest='main_class', help='Your application\'s main class (for Java only).') + parser.add_argument("--class", dest="main_class", help="Your application's main class (for Java only).") parser.add_argument( - '--jars', - help='Comma-separated list of local jars to include \ + "--jars", + help="Comma-separated list of local jars to include 
\ on the driver and executor classpaths. Use \ - absolute path to reference files.') + absolute path to reference files.", + ) parser.add_argument( - '--py-files', - help='Comma-separated list of .zip, .egg, or .py files \ + "--py-files", + help="Comma-separated list of .zip, .egg, or .py files \ to place on the PYTHONPATH for Python apps. Use \ - absolute path to reference files.') + absolute path to reference files.", + ) parser.add_argument( - '--files', - help='Comma-separated list of .zip, .egg, or .py files \ + "--files", + help="Comma-separated list of .zip, .egg, or .py files \ to place on the PYTHONPATH for Python apps. Use \ - absolute path ot reference files.') + absolute path ot reference files.", + ) - parser.add_argument('--driver-java-options', help='Extra Java options to pass to the driver.') + parser.add_argument("--driver-java-options", help="Extra Java options to pass to the driver.") - parser.add_argument('--driver-library-path', help='Extra library path entries to pass to the driver.') + parser.add_argument("--driver-library-path", help="Extra library path entries to pass to the driver.") parser.add_argument( - '--driver-class-path', - help='Extra class path entries to pass to the driver. \ + "--driver-class-path", + help="Extra class path entries to pass to the driver. \ Note that jars added with --jars are automatically \ - included in the classpath.') + included in the classpath.", + ) - parser.add_argument('--driver-memory', help="Memory for driver (e.g. 1000M, 2G) (Default: 1024M).") + parser.add_argument("--driver-memory", help="Memory for driver (e.g. 1000M, 2G) (Default: 1024M).") - parser.add_argument('--executor-memory', help='Memory per executor (e.g. 1000M, 2G) (Default: 1G).') + parser.add_argument("--executor-memory", help="Memory per executor (e.g. 1000M, 2G) (Default: 1G).") - parser.add_argument('--driver-cores', help='Cores for driver (Default: 1).') + parser.add_argument("--driver-cores", help="Cores for driver (Default: 1).") parser.add_argument( - '--executor-cores', - help='Number of cores per executor. (Default: All \ - available cores on the worker)') + "--executor-cores", + help="Number of cores per executor. (Default: All \ + available cores on the worker)", + ) parser.add_argument( - '--max-retry-count', - help='Number of times the Spark job may be retried \ - if there is a failure') + "--max-retry-count", + help="Number of times the Spark job may be retried \ + if there is a failure", + ) parser.add_argument( - '--output', - help='Path to the file you wish to output to. If not \ - specified, output is printed to stdout') + "--output", + help="Path to the file you wish to output to. If not \ + specified, output is printed to stdout", + ) parser.add_argument( - '--remote', - action='store_true', - help='Do not upload the app to the cluster, assume it is \ - already accessible at the given path') + "--remote", + action="store_true", + help="Do not upload the app to the cluster, assume it is \ + already accessible at the given path", + ) parser.add_argument( - 'app', - help='App jar OR python file to execute. A path to a local \ - file is expected, unless used in conjunction with \ - the --remote flag. When the --remote flag is set, a \ - remote path that is accessible from the cluster is \ - expected. Remote paths are not validated up-front.') + "app", + help="App jar OR python file to execute. A path to a local " + "file is expected, unless used in conjunction with " + "the --remote flag. 
When the --remote flag is set, a " + "remote path that is accessible from the cluster is " + "expected. Remote paths are not validated up-front.", + ) - parser.add_argument('app_args', nargs='*', help='Arguments for the application') + parser.add_argument("app_args", nargs="*", help="Arguments for the application") def execute(args: typing.NamedTuple): @@ -94,14 +103,56 @@ def execute(args: typing.NamedTuple): files = [] if args.jars is not None: - jars = args.jars.replace(' ', '').split(',') + jars = args.jars.replace(" ", "").split(",") if args.py_files is not None: - py_files = args.py_files.replace(' ', '').split(',') + py_files = args.py_files.replace(" ", "").split(",") if args.files is not None: - files = args.files.replace(' ', '').split(',') + files = args.files.replace(" ", "").split(",") + log_application(args, jars, py_files, files) + + spark_client.cluster.submit( + id=args.cluster_id, + application=aztk.spark.models.ApplicationConfiguration( + name=args.name, + application=args.app, + application_args=args.app_args, + main_class=args.main_class, + jars=jars, + py_files=py_files, + files=files, + driver_java_options=args.driver_java_options, + driver_library_path=args.driver_library_path, + driver_class_path=args.driver_class_path, + driver_memory=args.driver_memory, + executor_memory=args.executor_memory, + driver_cores=args.driver_cores, + executor_cores=args.executor_cores, + max_retry_count=args.max_retry_count, + ), + remote=args.remote, + wait=False, + ) + + if args.wait: + if not args.output: + exit_code = utils.stream_logs(client=spark_client, cluster_id=args.cluster_id, application_name=args.name) + else: + with utils.Spinner(): + spark_client.cluster.wait( + id=args.cluster_id, application_name=args.name) # TODO: replace wait_until_application_done + application_log = spark_client.cluster.get_application_log( + id=args.cluster_id, application_name=args.name) + with open(os.path.abspath(os.path.expanduser(args.output)), "w", encoding="UTF-8") as f: + f.write(application_log.log) + exit_code = application_log.exit_code + + sys.exit(exit_code) + + +def log_application(args, jars, py_files, files): log.info("-------------------------------------------") log.info("Spark cluster id: %s", args.cluster_id) log.info("Spark app name: %s", args.name) @@ -131,39 +182,3 @@ def execute(args: typing.NamedTuple): log.info("Application: %s", args.app) log.info("Application arguments: %s", args.app_args) log.info("-------------------------------------------") - - spark_client.cluster.submit( - id=args.cluster_id, - application=aztk.spark.models.ApplicationConfiguration( - name=args.name, - application=args.app, - application_args=args.app_args, - main_class=args.main_class, - jars=jars, - py_files=py_files, - files=files, - driver_java_options=args.driver_java_options, - driver_library_path=args.driver_library_path, - driver_class_path=args.driver_class_path, - driver_memory=args.driver_memory, - executor_memory=args.executor_memory, - driver_cores=args.driver_cores, - executor_cores=args.executor_cores, - max_retry_count=args.max_retry_count), - remote=args.remote, - wait=False) - - if args.wait: - if not args.output: - exit_code = utils.stream_logs(client=spark_client, cluster_id=args.cluster_id, application_name=args.name) - else: - with utils.Spinner(): - spark_client.cluster.wait( - id=args.cluster_id, application_name=args.name) # TODO: replace wait_until_application_done - application_log = spark_client.cluster.get_application_log( - id=args.cluster_id, 
application_name=args.name) - with open(os.path.abspath(os.path.expanduser(args.output)), "w", encoding="UTF-8") as f: - f.write(application_log.log) - exit_code = application_log.exit_code - - sys.exit(exit_code) diff --git a/aztk_cli/spark/endpoints/init.py b/aztk_cli/spark/endpoints/init.py index e9fdabc2..9e1897ee 100644 --- a/aztk_cli/spark/endpoints/init.py +++ b/aztk_cli/spark/endpoints/init.py @@ -2,22 +2,24 @@ import argparse import os import typing from distutils.dir_util import copy_tree + +from aztk.utils import constants from aztk_cli import log -import aztk.utils.constants as constants def setup_parser(parser: argparse.ArgumentParser): parser.add_argument( - '--global', - dest='global_flag', - action='store_true', - help="Create a .aztk/ folder in your home directory for global configurations.") + "--global", + dest="global_flag", + action="store_true", + help="Create a .aztk/ folder in your home directory for global configurations.", + ) software_parser = parser.add_mutually_exclusive_group() - software_parser.add_argument('--miniconda', action="store_true", required=False) - software_parser.add_argument('--anaconda', action="store_true", required=False) - software_parser.add_argument('--r', '--R', action="store_true", required=False) - software_parser.add_argument('--java', action="store_true", required=False) - software_parser.add_argument('--scala', action="store_true", required=False) + software_parser.add_argument("--miniconda", action="store_true", required=False) + software_parser.add_argument("--anaconda", action="store_true", required=False) + software_parser.add_argument("--r", "--R", action="store_true", required=False) + software_parser.add_argument("--java", action="store_true", required=False) + software_parser.add_argument("--scala", action="store_true", required=False) def execute(args: typing.NamedTuple): @@ -47,8 +49,8 @@ def create_directory(dest_path: str, environment: str): copy_tree(config_src_path, config_dest_path, update=1) - secrets_template_path = os.path.join(dest_path, 'secrets.yaml.template') - secrets_path = os.path.join(dest_path, 'secrets.yaml') + secrets_template_path = os.path.join(dest_path, "secrets.yaml.template") + secrets_path = os.path.join(dest_path, "secrets.yaml") if os.path.isfile(secrets_path): os.remove(secrets_template_path) @@ -56,11 +58,11 @@ def create_directory(dest_path: str, environment: str): if os.path.isfile(secrets_template_path) and not os.path.isfile(secrets_path): os.rename(secrets_template_path, secrets_path) - cluster_path = os.path.join(dest_path, 'cluster.yaml') + cluster_path = os.path.join(dest_path, "cluster.yaml") if os.path.isfile(cluster_path): - with open(cluster_path, 'r', encoding='UTF-8') as stream: + with open(cluster_path, "r", encoding="UTF-8") as stream: cluster_yaml = stream.read() cluster_yaml = cluster_yaml.replace("{environment}", "{}\n".format(environment)) - with open(cluster_path, 'w', encoding='UTF-8') as file: + with open(cluster_path, "w", encoding="UTF-8") as file: file.write(cluster_yaml) diff --git a/aztk_cli/spark/endpoints/job/delete.py b/aztk_cli/spark/endpoints/job/delete.py index 70b2384d..a02dbead 100644 --- a/aztk_cli/spark/endpoints/job/delete.py +++ b/aztk_cli/spark/endpoints/job/delete.py @@ -6,21 +6,23 @@ from aztk_cli import config, log def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='job_id', required=True, help='The unique id of your AZTK Job') + parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your 
AZTK Job") parser.add_argument( - '--force', - '-f', - dest='force', + "--force", + "-f", + dest="force", required=False, - action='store_true', - help='Do not prompt for confirmation, force deletion of cluster.') + action="store_true", + help="Do not prompt for confirmation, force deletion of cluster.", + ) parser.add_argument( - '--keep-logs', - '-k', - dest='keep_logs', - action='store_true', + "--keep-logs", + "-k", + dest="keep_logs", + action="store_true", required=False, - help='Prevent logs in storage from being deleted.') + help="Prevent logs in storage from being deleted.", + ) parser.set_defaults(force=False, keep_logs=False) diff --git a/aztk_cli/spark/endpoints/job/get.py b/aztk_cli/spark/endpoints/job/get.py index 9e608dd8..74529d2b 100644 --- a/aztk_cli/spark/endpoints/job/get.py +++ b/aztk_cli/spark/endpoints/job/get.py @@ -1,5 +1,4 @@ import argparse -import time import typing import aztk.spark @@ -7,7 +6,7 @@ from aztk_cli import config, utils def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='job_id', required=True, help='The unique id of your AZTK job') + parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your AZTK job") def execute(args: typing.NamedTuple): diff --git a/aztk_cli/spark/endpoints/job/get_app.py b/aztk_cli/spark/endpoints/job/get_app.py index ca276d4a..8af9843b 100644 --- a/aztk_cli/spark/endpoints/job/get_app.py +++ b/aztk_cli/spark/endpoints/job/get_app.py @@ -1,5 +1,4 @@ import argparse -import time import typing import aztk.spark @@ -7,8 +6,8 @@ from aztk_cli import config, utils def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='job_id', required=True, help='The unique id of your AZTK job') - parser.add_argument('--name', dest='app_name', required=True, help='The unique id of your job name') + parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your AZTK job") + parser.add_argument("--name", dest="app_name", required=True, help="The unique id of your job name") def execute(args: typing.NamedTuple): diff --git a/aztk_cli/spark/endpoints/job/get_app_logs.py b/aztk_cli/spark/endpoints/job/get_app_logs.py index c4ae76aa..ec4206a8 100644 --- a/aztk_cli/spark/endpoints/job/get_app_logs.py +++ b/aztk_cli/spark/endpoints/job/get_app_logs.py @@ -7,12 +7,13 @@ from aztk_cli import config, log, utils def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='job_id', required=True, help='The unique id of your AZTK job') - parser.add_argument('--name', dest='app_name', required=True, help='The unique id of your job name') + parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your AZTK job") + parser.add_argument("--name", dest="app_name", required=True, help="The unique id of your job name") parser.add_argument( - '--output', - help='Path to the file you wish to output to. If not \ - specified, output is printed to stdout') + "--output", + help="Path to the file you wish to output to. 
If not \ + specified, output is printed to stdout", + ) def execute(args: typing.NamedTuple): diff --git a/aztk_cli/spark/endpoints/job/list.py b/aztk_cli/spark/endpoints/job/list.py index a1ef757e..d48a2545 100644 --- a/aztk_cli/spark/endpoints/job/list.py +++ b/aztk_cli/spark/endpoints/job/list.py @@ -1,5 +1,4 @@ import argparse -import time import typing import aztk.spark @@ -11,7 +10,7 @@ def setup_parser(_: argparse.ArgumentParser): pass -def execute(args: typing.NamedTuple): +def execute(_: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) utils.print_jobs(spark_client.job.list()) diff --git a/aztk_cli/spark/endpoints/job/list_apps.py b/aztk_cli/spark/endpoints/job/list_apps.py index 343490e5..d916078d 100644 --- a/aztk_cli/spark/endpoints/job/list_apps.py +++ b/aztk_cli/spark/endpoints/job/list_apps.py @@ -6,7 +6,7 @@ from aztk_cli import config, utils def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='job_id', required=True, help='The unique id of your AZTK job') + parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your AZTK job") def execute(args: typing.NamedTuple): diff --git a/aztk_cli/spark/endpoints/job/stop.py b/aztk_cli/spark/endpoints/job/stop.py index 4fd5e9f9..48dc6dfb 100644 --- a/aztk_cli/spark/endpoints/job/stop.py +++ b/aztk_cli/spark/endpoints/job/stop.py @@ -1,13 +1,12 @@ import argparse -import time import typing import aztk.spark -from aztk_cli import config, log, utils +from aztk_cli import config, log def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='job_id', required=True, help='The unique id of your AZTK job') + parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your AZTK job") def execute(args: typing.NamedTuple): diff --git a/aztk_cli/spark/endpoints/job/stop_app.py b/aztk_cli/spark/endpoints/job/stop_app.py index f1599892..1c54456c 100644 --- a/aztk_cli/spark/endpoints/job/stop_app.py +++ b/aztk_cli/spark/endpoints/job/stop_app.py @@ -1,20 +1,19 @@ import argparse -import time import typing import aztk.spark -from aztk_cli import config, log, utils +from aztk_cli import config, log def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('--id', dest='job_id', required=True, help='The unique id of your AZTK job') - parser.add_argument('--name', dest='app_name', required=True, help='The unique id of your job name') + parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your AZTK job") + parser.add_argument("--name", dest="app_name", required=True, help="The unique id of your job name") def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) if spark_client.job.stop_application(args.job_id, args.app_name): - log.info("Stopped app {0}".format(args.app_name)) + log.info("Stopped app %s", args.app_name) else: - log.error("App with name {0} does not exist or was already deleted") + log.error("App with name %s does not exist or was already deleted", args.app_name) diff --git a/aztk_cli/spark/endpoints/job/submit.py b/aztk_cli/spark/endpoints/job/submit.py index 2ba8e2af..8f236a32 100644 --- a/aztk_cli/spark/endpoints/job/submit.py +++ b/aztk_cli/spark/endpoints/job/submit.py @@ -8,16 +8,18 @@ from aztk_cli.config import JobConfig def setup_parser(parser: argparse.ArgumentParser): parser.add_argument( - '--id', - dest='job_id', + "--id", + dest="job_id", required=False, - help='The unique id of your Spark Job. 
Defaults to the id value in .aztk/job.yaml') + help="The unique id of your Spark Job. Defaults to the id value in .aztk/job.yaml", + ) parser.add_argument( - '--configuration', - '-c', - dest='job_conf', + "--configuration", + "-c", + dest="job_conf", required=False, - help='Path to the job.yaml configuration file. Defaults to .aztk/job.yaml') + help="Path to the job.yaml configuration file. Defaults to .aztk/job.yaml", + ) def execute(args: typing.NamedTuple): @@ -49,5 +51,5 @@ def execute(args: typing.NamedTuple): scheduling_target=job_conf.scheduling_target, ) - #TODO: utils.print_job_conf(job_configuration) + # TODO: utils.print_job_conf(job_configuration) spark_client.job.submit(job_configuration) diff --git a/aztk_cli/toolkit.py b/aztk_cli/toolkit.py index 52ce718a..d415335c 100644 --- a/aztk_cli/toolkit.py +++ b/aztk_cli/toolkit.py @@ -6,10 +6,10 @@ from aztk_cli import log def setup_parser(parser: argparse.ArgumentParser): - parser.add_argument('toolkit_software', nargs='?') - parser.add_argument('version', nargs='?') - parser.add_argument('environment', nargs='?') - parser.add_argument('--gpu', action='store_true') + parser.add_argument("toolkit_software", nargs="?") + parser.add_argument("version", nargs="?") + parser.add_argument("environment", nargs="?") + parser.add_argument("--gpu", action="store_true") def execute(args: typing.NamedTuple): @@ -24,14 +24,10 @@ def execute(args: typing.NamedTuple): if not args.environment: print_available_environments(args.toolkit_software) - toolkit = Toolkit( - software=args.toolkit_software, - version=args.version, - environment=args.environment, - ) + toolkit = Toolkit(software=args.toolkit_software, version=args.version, environment=args.environment) toolkit.validate() - log.info('Docker image picked for this toolkit: %s', toolkit.get_docker_repo(args.gpu)) + log.info("Docker image picked for this toolkit: %s", toolkit.get_docker_repo(args.gpu)) return None diff --git a/aztk_cli/utils.py b/aztk_cli/utils.py index 503f6145..6b36e9a2 100644 --- a/aztk_cli/utils.py +++ b/aztk_cli/utils.py @@ -4,7 +4,6 @@ import subprocess import sys import threading import time -from subprocess import call from typing import List import azure.batch.models as batch_models @@ -12,7 +11,7 @@ import azure.batch.models as batch_models from aztk import error, utils from aztk.models import ClusterConfiguration from aztk.spark import models -from aztk.utils import get_ssh_key, helpers +from aztk.utils import get_ssh_key from . import log @@ -35,8 +34,10 @@ def get_ssh_key_or_prompt(ssh_key, username, password, secrets_configuration): break else: raise error.AztkError( - "Failed to get valid password, cannot add user to cluster. It is recommended that you provide a ssh public key in .aztk/secrets.yaml. Or provide an ssh-key or password with command line parameters (--ssh-key or --password). You may also run the 'aztk spark cluster add-user' command to add a user to this cluster." - ) + "Failed to get valid password, cannot add user to cluster. " + "It is recommended that you provide a ssh public key in .aztk/secrets.yaml. " + "Or provide an ssh-key or password with command line parameters (--ssh-key or --password). 
" + "You may also run the 'aztk spark cluster add-user' command to add a user to this cluster.") return ssh_key, password @@ -53,13 +54,13 @@ def print_cluster(client, cluster: models.Cluster, internal: bool = False): log.info("| Low priority: %s", __pretty_low_pri_node_count(cluster)) log.info("") - print_format = '|{:^36}| {:^19} | {:^21}| {:^10} | {:^8} |' - print_format_underline = '|{:-^36}|{:-^21}|{:-^22}|{:-^12}|{:-^10}|' + print_format = "|{:^36}| {:^19} | {:^21}| {:^10} | {:^8} |" + print_format_underline = "|{:-^36}|{:-^21}|{:-^22}|{:-^12}|{:-^10}|" if internal: log.info(print_format.format("Nodes", "State", "IP", "Dedicated", "Master")) else: log.info(print_format.format("Nodes", "State", "IP:Port", "Dedicated", "Master")) - log.info(print_format_underline.format('', '', '', '', '')) + log.info(print_format_underline.format("", "", "", "", "")) if not cluster.nodes: return @@ -68,44 +69,47 @@ def print_cluster(client, cluster: models.Cluster, internal: bool = False): if internal: ip = node.ip_address else: - ip = '{}:{}'.format(remote_login_settings.ip_address, remote_login_settings.port) + ip = "{}:{}".format(remote_login_settings.ip_address, remote_login_settings.port) log.info( - print_format.format(node.id, node.state.value, ip, "*" if node.is_dedicated else '', '*' - if node.id == cluster.master_node_id else '')) - log.info('') + print_format.format( + node.id, + node.state.value, + ip, + "*" if node.is_dedicated else "", + "*" if node.id == cluster.master_node_id else "", + )) + log.info("") def __pretty_node_count(cluster: models.Cluster) -> str: if cluster.pool.allocation_state is batch_models.AllocationState.resizing: - return '{} -> {}'.format(cluster.total_current_nodes, cluster.total_target_nodes) + return "{} -> {}".format(cluster.total_current_nodes, cluster.total_target_nodes) else: - return '{}'.format(cluster.total_current_nodes) + return "{}".format(cluster.total_current_nodes) def __pretty_dedicated_node_count(cluster: models.Cluster) -> str: - if (cluster.pool.allocation_state is batch_models.AllocationState.resizing - or cluster.pool.state is batch_models.PoolState.deleting)\ - and cluster.current_dedicated_nodes != cluster.target_dedicated_nodes: - return '{} -> {}'.format(cluster.current_dedicated_nodes, cluster.target_dedicated_nodes) + if (cluster.pool.allocation_state is batch_models.AllocationState.resizing or cluster.pool.state is + batch_models.PoolState.deleting) and cluster.current_dedicated_nodes != cluster.target_dedicated_nodes: + return "{} -> {}".format(cluster.current_dedicated_nodes, cluster.target_dedicated_nodes) else: - return '{}'.format(cluster.current_dedicated_nodes) + return "{}".format(cluster.current_dedicated_nodes) def __pretty_low_pri_node_count(cluster: models.Cluster) -> str: - if (cluster.pool.allocation_state is batch_models.AllocationState.resizing - or cluster.pool.state is batch_models.PoolState.deleting)\ - and cluster.current_low_pri_nodes != cluster.target_low_pri_nodes: - return '{} -> {}'.format(cluster.current_low_pri_nodes, cluster.target_low_pri_nodes) + if (cluster.pool.allocation_state is batch_models.AllocationState.resizing or cluster.pool.state is + batch_models.PoolState.deleting) and cluster.current_low_pri_nodes != cluster.target_low_pri_nodes: + return "{} -> {}".format(cluster.current_low_pri_nodes, cluster.target_low_pri_nodes) else: - return '{}'.format(cluster.current_low_pri_nodes) + return "{}".format(cluster.current_low_pri_nodes) def print_clusters(clusters: List[models.Cluster]): - print_format = 
'{:<34}| {:<10}| {:<20}| {:<7}' - print_format_underline = '{:-<34}|{:-<11}|{:-<21}|{:-<7}' + print_format = "{:<34}| {:<10}| {:<20}| {:<7}" + print_format_underline = "{:-<34}|{:-<11}|{:-<21}|{:-<7}" - log.info(print_format.format('Cluster', 'State', 'VM Size', 'Nodes')) - log.info(print_format_underline.format('', '', '', '')) + log.info(print_format.format("Cluster", "State", "VM Size", "Nodes")) + log.info(print_format_underline.format("", "", "", "")) for cluster in clusters: node_count = __pretty_node_count(cluster) @@ -113,7 +117,7 @@ def print_clusters(clusters: List[models.Cluster]): def print_clusters_quiet(clusters: List[models.Cluster]): - log.print('\n'.join([str(cluster.id) for cluster in clusters])) + log.print("\n".join([str(cluster.id) for cluster in clusters])) def stream_logs(client, cluster_id, application_name): @@ -122,23 +126,25 @@ def stream_logs(client, cluster_id, application_name): app_logs = client.cluster.get_application_log( id=cluster_id, application_name=application_name, tail=True, current_bytes=current_bytes) log.print(app_logs.log) - if app_logs.application_state == 'completed': + if app_logs.application_state == "completed": return app_logs.exit_code current_bytes = app_logs.total_bytes time.sleep(3) -def ssh_in_master(client, - cluster_id: str, - cluster_configuration: models.ClusterConfiguration, - username: str = None, - webui: str = None, - jobui: str = None, - jobhistoryui: str = None, - ports=None, - host: bool = False, - connect: bool = True, - internal: bool = False): +def ssh_in_master( + client, + cluster_id: str, + cluster_configuration: models.ClusterConfiguration, + username: str = None, + webui: str = None, + jobui: str = None, + jobhistoryui: str = None, + ports=None, + host: bool = False, + connect: bool = True, + internal: bool = False, +): """ SSH into head node of spark-app :param cluster_id: Id of the cluster to ssh in @@ -166,11 +172,10 @@ def ssh_in_master(client, master_node_port = remote_login_settings.port spark_web_ui_port = utils.constants.DOCKER_SPARK_WEB_UI_PORT - spark_worker_ui_port = utils.constants.DOCKER_SPARK_WORKER_UI_PORT spark_job_ui_port = utils.constants.DOCKER_SPARK_JOB_UI_PORT spark_job_history_ui_port = utils.constants.DOCKER_SPARK_JOB_UI_HISTORY_PORT - ssh_command = utils.command_builder.CommandBuilder('ssh') + ssh_command = utils.command_builder.CommandBuilder("ssh") # get ssh private key path if specified ssh_priv_key = client.secrets_configuration.ssh_priv_key @@ -192,21 +197,21 @@ def ssh_in_master(client, if port.expose_publicly: ssh_command.add_option("-L", "{0}:localhost:{1}".format(port.public_port, port.internal)) - user = username if username is not None else '' + user = username if username is not None else "" if internal: ssh_command.add_argument("{0}@{1}".format(user, master_internal_node_ip)) else: ssh_command.add_argument("{0}@{1} -p {2}".format(user, master_node_ip, master_node_port)) if host is False: - ssh_command.add_argument("\'sudo docker exec -it spark /bin/bash\'") + ssh_command.add_argument("'sudo docker exec -it spark /bin/bash'") command = ssh_command.to_str() if connect: - call(command, shell=True) + subprocess.call(command, shell=True) - return '\n\t{}\n'.format(command) + return "\n\t{}\n".format(command) def print_batch_exception(batch_exception): @@ -216,35 +221,28 @@ def print_batch_exception(batch_exception): """ log.error("-------------------------------------------") log.error("Exception encountered:") - if batch_exception.error and \ - batch_exception.error.message and \ - 
batch_exception.error.message.value: + if batch_exception.error and batch_exception.error.message and batch_exception.error.message.value: log.error(batch_exception.error.message.value) if batch_exception.error.values: - log.error('') + log.error("") for mesg in batch_exception.error.values: log.error("%s:\t%s", mesg.key, mesg.value) log.error("-------------------------------------------") -''' - Job submission -''' - - def print_jobs(jobs: List[models.Job]): - print_format = '{:<34}| {:<10}| {:<20}' - print_format_underline = '{:-<34}|{:-<11}|{:-<21}' + print_format = "{:<34}| {:<10}| {:<20}" + print_format_underline = "{:-<34}|{:-<11}|{:-<21}" - log.info(print_format.format('Job', 'State', 'Creation Time')) - log.info(print_format_underline.format('', '', '', '')) + log.info(print_format.format("Job", "State", "Creation Time")) + log.info(print_format_underline.format("", "", "", "")) for job in jobs: log.info(print_format.format(job.id, job.state, utc_to_local(job.creation_time))) def print_job(client, job: models.Job): - print_format = '{:<36}| {:<15}' + print_format = "{:<36}| {:<15}" log.info("") log.info("Job %s", job.id) @@ -256,7 +254,7 @@ def print_job(client, job: models.Job): if job.cluster: print_cluster_summary(job.cluster) else: - if job.state == 'completed': + if job.state == "completed": log.info("Cluster %s", "Job completed, cluster deallocated.") log.info("") else: @@ -272,23 +270,23 @@ def print_job(client, job: models.Job): def node_state_count(cluster: models.Cluster): states = {} - for state in batch_models.ComputeNodeState: - states[state] = 0 for node in cluster.nodes: - states[node.state] += 1 + states[node.state] = states.get(node.state, 0) + 1 return states def print_cluster_summary(cluster: models.Cluster): - print_format = '{:<4} {:<23} {:<15}' - log.info("Cluster %s", cluster.id) log.info("-" * 42) log.info("Nodes %s", __pretty_node_count(cluster)) log.info("| Dedicated: %s", __pretty_dedicated_node_count(cluster)) log.info("| Low priority: %s", __pretty_low_pri_node_count(cluster)) state_count = node_state_count(cluster) - log.info("Master %s", cluster.master_node_id or "Pending") + if state_count: + log.info("| Node States:") + for state in state_count: + log.info("| \t%s: %d", state.name, state_count[state]) + log.info("Master: %s", cluster.master_node_id or "Pending") log.info("") @@ -300,13 +298,13 @@ def application_summary(applications): warn_scheduling = False for application in applications: - if type(application) == str: + if isinstance(application, str): states["scheduling"] += 1 warn_scheduling = True else: states[application.state] += 1 - print_format = '{:<17} {:<14}' + print_format = "{:<17} {:<14}" log.info("Applications") log.info("-" * 42) for state in states: @@ -318,10 +316,10 @@ def application_summary(applications): def print_applications(applications): - print_format = '{:<36}| {:<15}| {:<16} | {:^9} |' - print_format_underline = '{:-<36}|{:-<16}|{:-<18}|{:-<11}|' + print_format = "{:<36}| {:<15}| {:<16} | {:^9} |" + print_format_underline = "{:-<36}|{:-<16}|{:-<18}|{:-<11}|" log.info(print_format.format("Applications", "State", "Transition Time", "Exit Code")) - log.info(print_format_underline.format('', '', '', '')) + log.info(print_format_underline.format("", "", "", "")) warn_scheduling = False for name in applications: @@ -331,15 +329,18 @@ def print_applications(applications): else: application = applications[name] log.info( - print_format.format(application.name, application.state, - 
utc_to_local(application.state_transition_time), application.exit_code - if application.exit_code is not None else "-")) + print_format.format( + application.name, + application.state, + utc_to_local(application.state_transition_time), + application.exit_code if application.exit_code is not None else "-", + )) if warn_scheduling: log.warning("\nNo Spark applications will be scheduled until the master is selected.") def print_application(application: models.Application): - print_format = '{:<30}| {:<15}' + print_format = "{:<30}| {:<15}" log.info("") log.info("Application %s", application.name) @@ -356,12 +357,13 @@ class Spinner: @staticmethod def spinning_cursor(): while 1: - for cursor in '|/-\\': + for cursor in "|/-\\": yield cursor def __init__(self, delay=None): self.spinner_generator = self.spinning_cursor() - if delay and float(delay): self.delay = delay + if delay and float(delay): + self.delay = delay def __enter__(self): return self.start() @@ -374,7 +376,7 @@ class Spinner: sys.stdout.write(next(self.spinner_generator)) sys.stdout.flush() time.sleep(self.delay) - sys.stdout.write('\b') + sys.stdout.write("\b") sys.stdout.flush() def start(self): @@ -410,7 +412,7 @@ def print_cluster_conf(cluster_conf: ClusterConfiguration, wait: bool): if user_configuration: log.info("username: %s", user_configuration.username) if user_configuration.password: - log.info("Password: %s", '*' * len(user_configuration.password)) + log.info("Password: %s", "*" * len(user_configuration.password)) log.info("Plugins:") if not cluster_conf.plugins: log.info(" None Configured") diff --git a/pylintrc b/pylintrc index c1b570b5..dfc5bb91 100644 --- a/pylintrc +++ b/pylintrc @@ -66,7 +66,7 @@ confidence= # no Warning level messages displayed, use"--disable=all --enable=classes # --disable=W" # disable=print-statement,parameter-unpacking,unpacking-in-except,old-raise-syntax,backtick,import-star-module-level,apply-builtin,basestring-builtin,buffer-builtin,cmp-builtin,coerce-builtin,execfile-builtin,file-builtin,long-builtin,raw_input-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,no-absolute-import,old-division,dict-iter-method,dict-view-method,next-method-called,metaclass-assignment,indexing-exception,raising-string,reload-builtin,oct-method,hex-method,nonzero-method,cmp-method,input-builtin,round-builtin,intern-builtin,unichr-builtin,map-builtin-not-iterating,zip-builtin-not-iterating,range-builtin-not-iterating,filter-builtin-not-iterating,using-cmp-argument,long-suffix,old-ne-operator,old-octal-literal,suppressed-message,useless-suppression -disable = C0111,W0401,I0011,C0103,E1101,too-few-public-methods,too-many-instance-attributes,too-many-arguments,too-many-public-methods +disable = C0111,I0011,C0103,E1101,too-few-public-methods,too-many-instance-attributes,too-many-arguments,too-many-public-methods,redefined-builtin,no-else-return [REPORTS] diff --git a/requirements.txt b/requirements.txt index 89fb5ded..2c87df90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ paramiko>=2.4 # Development yapf==0.22.0 -pylint==1.8.4 +pylint==2.1.1 pytest==3.1.3 pytest-xdist==1.22.0 twine==1.11.0 diff --git a/tests/integration_tests/spark/sdk/clean_up_cluster.py b/tests/integration_tests/spark/sdk/clean_up_cluster.py new file mode 100644 index 00000000..275028e0 --- /dev/null +++ b/tests/integration_tests/spark/sdk/clean_up_cluster.py @@ -0,0 +1,26 @@ +import azure.batch.models as batch_models +from 
azure.batch.models import BatchErrorException + +from aztk.error import AztkError + + +def clean_up_cluster(spark_client, id): + try: + cluster = spark_client.cluster.get(id) + nodes = [node for node in cluster.nodes] + if not any([ + node.state in [batch_models.ComputeNodeState.unusable, batch_models.ComputeNodeState.start_task_failed] + for node in nodes + ]): + spark_client.cluster.delete(id=id) + except (BatchErrorException, AztkError) as e: + # pass in the event that the cluster does not exist + print(str(e)) + acceptable_failures = [ + "The specified job has been marked for deletion and is being garbage collected.", + "The specified pool has been marked for deletion and is being reclaimed." + ] + if any(item in str(e) for item in acceptable_failures): + pass + else: + raise e diff --git a/tests/integration_tests/spark/sdk/cluster/test_cluster.py b/tests/integration_tests/spark/sdk/cluster/test_cluster.py index 50807c85..229a1e42 100644 --- a/tests/integration_tests/spark/sdk/cluster/test_cluster.py +++ b/tests/integration_tests/spark/sdk/cluster/test_cluster.py @@ -12,59 +12,18 @@ import aztk.spark from aztk.error import AztkError from aztk.utils import constants from aztk_cli import config -from tests.integration_tests.spark.sdk.get_client import (get_spark_client, get_test_suffix) +from tests.integration_tests.spark.sdk.clean_up_cluster import clean_up_cluster +from tests.integration_tests.spark.sdk.ensure_spark_processes import \ + ensure_spark_processes +from tests.integration_tests.spark.sdk.get_client import (get_spark_client, + get_test_suffix) +from tests.integration_tests.spark.sdk.wait_for_all_nodes import \ + wait_for_all_nodes base_cluster_id = get_test_suffix("cluster") spark_client = get_spark_client() -def clean_up_cluster(cluster_id): - try: - spark_client.cluster.delete(id=cluster_id) - except (BatchErrorException, AztkError): - # pass in the event that the cluster does not exist - pass - - -def ensure_spark_master(cluster_id): - results = spark_client.cluster.run(cluster_id, - "if $AZTK_IS_MASTER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.master.Master 1 ;" \ - " else echo AZTK_IS_MASTER is false ; fi") - for _, result in results: - if isinstance(result, Exception): - raise result - print(result[0]) - assert result[0] in ["org.apache.spark.deploy.master.Master is running.", "AZTK_IS_MASTER is false"] - - -def ensure_spark_worker(cluster_id): - results = spark_client.cluster.run(cluster_id, - "if $AZTK_IS_WORKER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.worker.Worker 1 ;" \ - " else echo AZTK_IS_WORKER is false ; fi") - for _, result in results: - if isinstance(result, Exception): - raise result - assert result[0] in ["org.apache.spark.deploy.worker.Worker is running.", "AZTK_IS_WORKER is false"] - - -def ensure_spark_processes(cluster_id): - ensure_spark_master(cluster_id) - ensure_spark_worker(cluster_id) - - -def wait_for_all_nodes(cluster_id, nodes): - while True: - for node in nodes: - if node.state in [batch_models.ComputeNodeState.unusable, batch_models.ComputeNodeState.start_task_failed]: - raise AztkError("Node {} in failed state.".format(node.id)) - if node.state not in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running]: - break - else: - nodes = spark_client.cluster.get(cluster_id).nodes - continue - break - - def test_create_cluster(): test_id = "test-create-" # TODO: make Cluster Configuration more robust, test each value @@ -89,11 +48,8 @@ def test_create_cluster(): assert 
cluster.master_node_id is not None assert cluster.current_low_pri_nodes == 0 - except (AztkError, BatchErrorException) as e: - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_list_clusters(): @@ -113,11 +69,8 @@ def test_list_clusters(): assert cluster_configuration.cluster_id in [cluster.id for cluster in clusters] - except (AztkError, BatchErrorException): - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_get_remote_login_settings(): @@ -138,13 +91,8 @@ def test_get_remote_login_settings(): assert rls.ip_address is not None assert rls.port is not None - - except (AztkError, BatchErrorException) as e: - raise e - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_submit(): @@ -178,13 +126,8 @@ def test_submit(): spark_client.cluster.submit( id=cluster_configuration.cluster_id, application=application_configuration, wait=True) - assert True - - except (AztkError, BatchErrorException): - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_get_application_log(): @@ -230,11 +173,8 @@ def test_get_application_log(): assert application_log.log is not None assert application_log.total_bytes is not None - except (AztkError, BatchErrorException): - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_create_user_password(): @@ -283,11 +223,8 @@ def test_get_application_status_complete(): assert status == "completed" - except (AztkError, BatchErrorException): - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_delete_cluster(): @@ -308,11 +245,8 @@ def test_delete_cluster(): assert success is True - except (AztkError, BatchErrorException): - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_spark_processes_up(): @@ -326,23 +260,16 @@ def test_spark_processes_up(): file_shares=None, toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), spark_configuration=None) - try: cluster = spark_client.cluster.create(cluster_configuration, wait=True) - wait_for_all_nodes(cluster.id, cluster.nodes) - success = spark_client.cluster.delete(id=cluster_configuration.cluster_id) - - assert success is True - - except (AztkError, BatchErrorException): - assert False - + wait_for_all_nodes(spark_client, cluster.id, cluster.nodes) + ensure_spark_processes(spark_client=spark_client, id=cluster_configuration.cluster_id) finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_debug_tool(): - test_id = "debug-tool-" + test_id = "test-debug-tool-" cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, size=2, @@ -359,7 +286,7 @@ def test_debug_tool(): try: cluster = spark_client.cluster.create(cluster_configuration, wait=True) nodes = [node for node in cluster.nodes] - wait_for_all_nodes(cluster.id, nodes) + wait_for_all_nodes(spark_client, cluster.id, nodes) cluster_output = 
spark_client.cluster.diagnostics(id=cluster.id) for node_output in cluster_output: node_output.output.seek(0) # tempfile requires seek 0 before reading @@ -367,8 +294,5 @@ def test_debug_tool(): assert node_output.id in [node.id for node in nodes] assert node_output.error is None assert any(member in name for name in debug_zip.namelist() for member in expected_members) - except (AztkError, BatchErrorException): - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) diff --git a/tests/integration_tests/spark/sdk/cluster/test_cluster_deprecated.py b/tests/integration_tests/spark/sdk/cluster/test_cluster_deprecated.py index 5304f85b..55d43272 100644 --- a/tests/integration_tests/spark/sdk/cluster/test_cluster_deprecated.py +++ b/tests/integration_tests/spark/sdk/cluster/test_cluster_deprecated.py @@ -12,61 +12,19 @@ import aztk.spark from aztk.error import AztkError from aztk.utils import constants from aztk_cli import config +from tests.integration_tests.spark.sdk.clean_up_cluster import clean_up_cluster +from tests.integration_tests.spark.sdk.ensure_spark_processes import \ + ensure_spark_processes from tests.integration_tests.spark.sdk.get_client import (get_spark_client, get_test_suffix) +from tests.integration_tests.spark.sdk.wait_for_all_nodes import \ + wait_for_all_nodes base_cluster_id = get_test_suffix("cluster") spark_client = get_spark_client() -def clean_up_cluster(cluster_id): - try: - spark_client.delete_cluster(cluster_id=cluster_id) - except (BatchErrorException, AztkError): - # pass in the event that the cluster does not exist - pass - - -def ensure_spark_master(cluster_id): - results = spark_client.cluster_run(cluster_id, - "if $AZTK_IS_MASTER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.master.Master 1 ;" \ - " else echo AZTK_IS_MASTER is false ; fi") - for _, result in results: - if isinstance(result, Exception): - raise result - print(result[0]) - assert result[0] in ["org.apache.spark.deploy.master.Master is running.", "AZTK_IS_MASTER is false"] - - -def ensure_spark_worker(cluster_id): - results = spark_client.cluster_run(cluster_id, - "if $AZTK_IS_WORKER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.worker.Worker 1 ;" \ - " else echo AZTK_IS_WORKER is false ; fi") - for _, result in results: - if isinstance(result, Exception): - raise result - assert result[0] in ["org.apache.spark.deploy.worker.Worker is running.", "AZTK_IS_WORKER is false"] - - -def ensure_spark_processes(cluster_id): - ensure_spark_master(cluster_id) - ensure_spark_worker(cluster_id) - - -def wait_for_all_nodes(cluster_id, nodes): - while True: - for node in nodes: - if node.state in [batch_models.ComputeNodeState.unusable, batch_models.ComputeNodeState.start_task_failed]: - raise AztkError("Node {} in failed state.".format(node.id)) - if node.state not in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running]: - break - else: - nodes = spark_client.cluster.get(cluster_id).nodes - continue - break - - def test_create_cluster(): - test_id = "test-create-" + test_id = "test-create-deprecated-" # TODO: make Cluster Configuration more robust, test each value cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, @@ -90,15 +48,12 @@ def test_create_cluster(): assert cluster.master_node_id is not None assert cluster.current_low_pri_nodes == 0 - except (AztkError, BatchErrorException) as e: - assert 
False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_get_cluster(): - test_id = "test-get-" + test_id = "test-get-deprecated-" cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, size=2, @@ -123,15 +78,12 @@ def test_get_cluster(): assert cluster.master_node_id is not None assert cluster.current_low_pri_nodes == 0 - except (AztkError, BatchErrorException) as e: - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_list_clusters(): - test_id = "test-list-" + test_id = "test-list-deprecated-" cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, size=2, @@ -149,15 +101,12 @@ def test_list_clusters(): assert cluster_configuration.cluster_id in [cluster.id for cluster in clusters] - except (AztkError, BatchErrorException): - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_get_remote_login_settings(): - test_id = "test-get-remote-login-" + test_id = "test-get-remote-login-deprecated-" cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, size=2, @@ -178,16 +127,12 @@ def test_get_remote_login_settings(): assert rls.ip_address is not None assert rls.port is not None - except (AztkError, BatchErrorException) as e: - raise e - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_submit(): - test_id = "test-submit-" + test_id = "test-submit-deprecated-" cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, size=2, @@ -220,17 +165,12 @@ def test_submit(): spark_client.submit( cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) - assert True - - except (AztkError, BatchErrorException): - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_get_application_log(): - test_id = "test-get-app-log-" + test_id = "test-get-app-log-deprecated-" cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, size=2, @@ -275,11 +215,8 @@ def test_get_application_log(): assert application_log.log is not None assert application_log.total_bytes is not None - except (AztkError, BatchErrorException): - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_create_user_password(): @@ -293,7 +230,7 @@ def test_create_user_ssh_key(): def test_get_application_status_complete(): - test_id = "test-app-status-complete-" + test_id = "test-app-status-complete-deprecated-" cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, size=2, @@ -330,15 +267,12 @@ def test_get_application_status_complete(): assert status == "completed" - except (AztkError, BatchErrorException): - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_delete_cluster(): - test_id = "test-delete-" + test_id = "test-delete-deprecated-" cluster_configuration 
= aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, size=2, @@ -356,15 +290,12 @@ def test_delete_cluster(): assert success is True - except (AztkError, BatchErrorException): - assert False - finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_spark_processes_up(): - test_id = "test-spark-processes-up-" + test_id = "test-spark-processes-up-deprecated-" cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, size=2, @@ -378,22 +309,16 @@ def test_spark_processes_up(): try: with pytest.warns(DeprecationWarning): cluster = spark_client.create_cluster(cluster_configuration, wait=True) - wait_for_all_nodes(cluster.id, cluster.nodes) + wait_for_all_nodes(spark_client, cluster.id, cluster.nodes) - with pytest.warns(DeprecationWarning): - success = spark_client.delete_cluster(cluster_id=cluster_configuration.cluster_id) - - assert success is True - - except (AztkError, BatchErrorException): - assert False + ensure_spark_processes(spark_client=spark_client, id=cluster_configuration.cluster_id) finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) def test_debug_tool(): - test_id = "debug-tool-" + test_id = "test-debug-tool-" cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id=test_id + base_cluster_id, size=2, @@ -412,7 +337,7 @@ def test_debug_tool(): cluster = spark_client.create_cluster(cluster_configuration, wait=True) nodes = [node for node in cluster.nodes] - wait_for_all_nodes(cluster.id, nodes) + wait_for_all_nodes(spark_client, cluster.id, nodes) with pytest.warns(DeprecationWarning): cluster_output = spark_client.run_cluster_diagnostics(cluster_id=cluster.id) @@ -423,8 +348,6 @@ def test_debug_tool(): assert node_output.id in [node.id for node in nodes] assert node_output.error is None assert any(member in name for name in debug_zip.namelist() for member in expected_members) - except (AztkError, BatchErrorException): - assert False finally: - clean_up_cluster(cluster_configuration.cluster_id) + clean_up_cluster(spark_client, cluster_configuration.cluster_id) diff --git a/tests/integration_tests/spark/sdk/ensure_spark_processes.py b/tests/integration_tests/spark/sdk/ensure_spark_processes.py new file mode 100644 index 00000000..247fd8ba --- /dev/null +++ b/tests/integration_tests/spark/sdk/ensure_spark_processes.py @@ -0,0 +1,34 @@ +import azure.batch.models as batch_models + +from aztk.error import AztkError + + +def ensure_spark_master(spark_client, id): + results = spark_client.cluster.run( + id, + "if $AZTK_IS_MASTER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.master.Master 1 ;" + " else echo AZTK_IS_MASTER is false ; fi") + for result in results: + if result.error: + raise result.error + assert result.output.rstrip() in [ + "org.apache.spark.deploy.master.Master is running.", "AZTK_IS_MASTER is false" + ] + + +def ensure_spark_worker(spark_client, id): + results = spark_client.cluster.run( + id, + "if $AZTK_IS_WORKER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.worker.Worker 1 ;" + " else echo AZTK_IS_WORKER is false ; fi") + for result in results: + if result.error: + raise result + assert result.output.rstrip() in [ + "org.apache.spark.deploy.worker.Worker is running.", "AZTK_IS_WORKER is false" + ] + + +def ensure_spark_processes(spark_client, id): + 
ensure_spark_master(spark_client, id) + ensure_spark_worker(spark_client, id) diff --git a/tests/integration_tests/spark/sdk/wait_for_all_nodes.py b/tests/integration_tests/spark/sdk/wait_for_all_nodes.py new file mode 100644 index 00000000..db3a8c15 --- /dev/null +++ b/tests/integration_tests/spark/sdk/wait_for_all_nodes.py @@ -0,0 +1,23 @@ +import time + +import azure.batch.models as batch_models + +from aztk.error import AztkError + + +def wait_for_all_nodes(spark_client, id, nodes): + nodes = [node for node in nodes] + start_time = time.time() + while (time.time() - start_time) < 300: + if any([ + node.state in [batch_models.ComputeNodeState.unusable, batch_models.ComputeNodeState.start_task_failed] + for node in nodes + ]): + raise AztkError("A node is unusable or had its start task fail.") + + if not all(node.state in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running] + for node in nodes): + nodes = [node for node in spark_client.cluster.get(id).nodes] + time.sleep(1) + else: + break diff --git a/tests/utils/test_retry.py b/tests/utils/test_retry.py new file mode 100644 index 00000000..ff8f2589 --- /dev/null +++ b/tests/utils/test_retry.py @@ -0,0 +1,81 @@ +import time + +import pytest + +from aztk.utils import BackOffPolicy, retry + + +def test_retry_function_raises_allowed_exception(): + with pytest.raises(ValueError): + + @retry(exceptions=(ValueError)) + def raise_allowed_error(): + raise ValueError + + raise_allowed_error() + + +def test_retry_function_raises_diallowed_exception(): + with pytest.raises(FileExistsError): + + @retry(exceptions=(ValueError)) + def raise_disallowed_error(): + raise FileExistsError + + raise_disallowed_error() + + +def test_retry_function_retry_count(): + # use a mutable type to test number retries + my_list = [] + with pytest.raises(ValueError): + + @retry(retry_count=3, exceptions=(ValueError)) + def my_func(): + my_list.append(0) + raise ValueError + + my_func() + + assert len(my_list) == 3 + + +def test_retry_function_retry_interval(): + with pytest.raises(ValueError): + + @retry(retry_count=2, retry_interval=1, exceptions=(ValueError)) + def my_func(): + raise ValueError + + start = time.time() + my_func() + end = time.time() + assert int(end - start) == 2 + + +def test_retry_function_backoff_policy_linear(): + with pytest.raises(ValueError): + + @retry(retry_count=2, retry_interval=1, exceptions=(ValueError)) + def my_func(): + raise ValueError + + start = time.time() + my_func() + end = time.time() + assert int(end - start) == 2 # 1 + 1 + + +def test_retry_function_backoff_policy_exponential(): + @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ValueError)) + def my_func(): + raise ValueError + + start = time.time() + try: + my_func() + except ValueError: + pass + end = time.time() + print(end - start) + assert int(end - start) == 7 # 2**0 + 2**1 + 2**3
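The new tests in tests/utils/test_retry.py exercise a `retry` decorator and `BackOffPolicy` enum imported from `aztk.utils`; the decorator itself is added elsewhere in this patch and is not reproduced in this section. As a reading aid only, here is a minimal sketch that is consistent with the behaviour these tests assert; the helper name `compute_sleep_time` and the default argument values are illustrative assumptions, not the actual `aztk.utils` code.

    import time
    from enum import Enum
    from functools import wraps


    class BackOffPolicy(Enum):
        linear = "linear"
        exponential = "exponential"


    def compute_sleep_time(attempt, retry_interval, backoff_policy):
        # Assumed policy (not the actual aztk.utils implementation): linear waits a
        # fixed interval between attempts, exponential doubles it each time, giving
        # waits of retry_interval * 2**0, 2**1, 2**2, ...
        if backoff_policy == BackOffPolicy.exponential:
            return retry_interval * (2**attempt)
        return retry_interval


    def retry(retry_count=1, retry_interval=0, backoff_policy=BackOffPolicy.linear, exceptions=()):
        def decorator(function):
            @wraps(function)
            def wrapper(*args, **kwargs):
                # Make retry_count attempts in total; sleep between attempts and
                # re-raise the last error once every attempt has failed with an
                # exception listed in `exceptions`. Other exceptions propagate
                # immediately on the first attempt.
                for attempt in range(retry_count):
                    try:
                        return function(*args, **kwargs)
                    except exceptions:
                        if attempt == retry_count - 1:
                            raise
                        time.sleep(compute_sleep_time(attempt, retry_interval, backoff_policy))

            return wrapper

        return decorator

Under these assumptions, test_retry_function_retry_count sees exactly three calls for retry_count=3, and test_retry_function_backoff_policy_exponential observes roughly 2**0 + 2**1 + 2**2 = 7 seconds of sleeping across the three waits that precede the fourth and final attempt; the timing assertions in the two pytest.raises-based tests sit after the raising call inside the `with` block, so only the exponential case actually pins down the back-off schedule.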