Mirror of https://github.com/Azure/aztk.git
Internal: fix pylint warnings (#651)
* inital, remove unused imports * run yapf * remove unused imports and variables, fix declaration outside init * fix some pylint warnings, add ssh_into_master * remove unused imports * unused variables * string and function normalization * stop using list comprehension for side effects, make method function * stop using protected member * various pylint fixes * formatting * formatting * add retry decorator with tests * start adding retry decorator, retry docker compose download * update pip and tests * logic fix * change no delete if * factor out reused functions * fix wait_for_all_nodes * fix download return type bug * test vsts ci update * temporarily disable integration tests * syntax fix * update vsts build * add back integration tests, remove debug branch * remove parallel unit tests * more verbose clis * update pylint * typo * fix imports * function returns nothing, don't return * make iterator list * change debug value
This commit is contained in:
Parent
0a9ce94104
Commit
828162ef10
15
.vsts-ci.yml
15
.vsts-ci.yml
|
@ -1,7 +1,6 @@
|
|||
trigger:
|
||||
- master
|
||||
|
||||
|
||||
phases:
|
||||
- phase: Test
|
||||
queue: Hosted Linux Preview
|
||||
|
@ -24,16 +23,22 @@ phases:
|
|||
displayName: yapf
|
||||
|
||||
- script: |
|
||||
pylint -j 2 -E aztk aztk_cli
|
||||
pylint -jobs 2 --errors-only aztk aztk_cli
|
||||
condition: succeeded()
|
||||
displayName: pylint
|
||||
displayName: pylint error check
|
||||
|
||||
- script: |
|
||||
pytest -n 20 --ignore=tests/integration_tests
|
||||
pytest --ignore=tests/integration_tests
|
||||
condition: succeeded()
|
||||
displayName: unit tests
|
||||
|
||||
- script: |
|
||||
pytest -n 75
|
||||
pytest --numprocesses=75
|
||||
condition: succeeded()
|
||||
displayName: integration tests
|
||||
|
||||
- script: |
|
||||
pylint -jobs 2 --disable=fixme aztk aztk_cli
|
||||
continueOnError: true
|
||||
condition: succeeded()
|
||||
displayName: pylint report
|
||||
|
|
|
@ -1,10 +1,19 @@
|
|||
from aztk import models
|
||||
from aztk.internal import cluster_data
|
||||
from aztk.utils import ssh as ssh_lib
|
||||
|
||||
from .helpers import (create_user_on_cluster, create_user_on_node, delete_user_on_cluster, delete_user_on_node,
|
||||
generate_user_on_cluster, generate_user_on_node, get_application_log, get_remote_login_settings,
|
||||
node_run, run, ssh_into_node)
|
||||
from .helpers import (
|
||||
create_user_on_cluster,
|
||||
create_user_on_node,
|
||||
delete_user_on_cluster,
|
||||
delete_user_on_node,
|
||||
generate_user_on_cluster,
|
||||
generate_user_on_node,
|
||||
get_application_log,
|
||||
get_remote_login_settings,
|
||||
node_run,
|
||||
run,
|
||||
ssh_into_node,
|
||||
)
|
||||
|
||||
|
||||
class BaseOperations:
|
||||
|
@ -15,14 +24,14 @@ class BaseOperations:
|
|||
Azure Batch service.
|
||||
blob_client (:obj:`azure.storage.blob.BlockBlobService`): Client used to interact with the Azure Storage
|
||||
Blob service.
|
||||
secrets_configuration (:obj:`aztk.models.SecretsConfiguration`): Model that holds AZTK secrets used to authenticate
|
||||
with Azure and the clusters.
|
||||
secrets_configuration (:obj:`aztk.models.SecretsConfiguration`):
|
||||
Model that holds AZTK secrets used to authenticate with Azure and the clusters.
|
||||
"""
|
||||
|
||||
def __init__(self, context):
|
||||
self.batch_client = context['batch_client']
|
||||
self.blob_client = context['blob_client']
|
||||
self.secrets_configuration = context['secrets_configuration']
|
||||
self.batch_client = context["batch_client"]
|
||||
self.blob_client = context["blob_client"]
|
||||
self.secrets_configuration = context["secrets_configuration"]
|
||||
|
||||
def get_cluster_configuration(self, id: str) -> models.ClusterConfiguration:
|
||||
"""Open an ssh tunnel to a node
|
||||
|
@ -62,7 +71,8 @@ class BaseOperations:
|
|||
id (:obj:`str`): the id of the cluster the node is in
|
||||
node_id (:obj:`str`): the id of the node to open the ssh tunnel to
|
||||
username (:obj:`str`): the username to authenticate the ssh session
|
||||
ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None.
|
||||
ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password.
|
||||
Defaults to None.
|
||||
password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None.
|
||||
port_forward_list (:obj:`List[PortForwardingSpecification`, optional): list of PortForwardingSpecifications.
|
||||
The defined ports will be forwarded to the client.
|
||||
|
@ -89,7 +99,7 @@ class BaseOperations:
|
|||
"""
|
||||
return create_user_on_node.create_user_on_node(self, id, node_id, username, ssh_key, password)
|
||||
|
||||
#TODO: remove nodes as param
|
||||
# TODO: remove nodes as param
|
||||
def create_user_on_cluster(self, id, nodes, username, ssh_pub_key=None, password=None):
|
||||
"""Create a user on every node in the cluster
|
||||
|
||||
|
@ -97,7 +107,8 @@ class BaseOperations:
|
|||
username (:obj:`str`): name of the user to create.
|
||||
id (:obj:`str`): id of the cluster to create the user on.
|
||||
nodes (:obj:`List[ComputeNode]`): list of nodes to create the user on
|
||||
ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None.
|
||||
ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password.
|
||||
Defaults to None.
|
||||
password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None.
|
||||
|
||||
Returns:
|
||||
|
@ -117,7 +128,7 @@ class BaseOperations:
|
|||
"""
|
||||
return generate_user_on_node.generate_user_on_node(self, id, node_id)
|
||||
|
||||
#TODO: remove nodes as param
|
||||
# TODO: remove nodes as param
|
||||
def generate_user_on_cluster(self, id, nodes):
|
||||
"""Create a user with an autogenerated username and ssh_key on the cluster
|
||||
|
||||
|
@ -143,7 +154,7 @@ class BaseOperations:
|
|||
"""
|
||||
return delete_user_on_node.delete_user(self, id, node_id, username)
|
||||
|
||||
#TODO: remove nodes as param
|
||||
# TODO: remove nodes as param
|
||||
def delete_user_on_cluster(self, username, id, nodes):
|
||||
"""Delete a user on every node in the cluster
|
||||
|
||||
|
@ -212,10 +223,11 @@ class BaseOperations:
|
|||
Args:
|
||||
id (:obj:`str`): the id of the cluster to run the command on.
|
||||
application_name (:obj:`str`): str
|
||||
tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes. Otherwise, the whole log will be retrieved.
|
||||
Only use this if streaming the log as it is being written. Defaults to False.
|
||||
current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes are retrieved.
|
||||
Only useful if streaming the log as it is being written. Only used if tail is True.
|
||||
tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes.
|
||||
Otherwise, the whole log will be retrieved. Only use this if streaming the log as it is being written.
|
||||
Defaults to False.
|
||||
current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes
|
||||
are retrieved. Only useful if streaming the log as it is being written. Only used if tail is True.
|
||||
|
||||
Returns:
|
||||
:obj:`aztk.models.ApplicationLog`: a model representing the output of the application.
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import concurrent.futures
|
||||
|
||||
|
||||
#TODO: remove nodes param
|
||||
# TODO: remove nodes param
|
||||
def create_user_on_cluster(base_operations, id, nodes, username, ssh_pub_key=None, password=None):
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
futures = {
|
||||
|
|
|
@ -3,7 +3,6 @@ from datetime import datetime, timedelta, timezone
|
|||
import azure.batch.models as batch_models
|
||||
import azure.batch.models.batch_error as batch_error
|
||||
|
||||
from aztk import models
|
||||
from aztk.utils import get_ssh_key
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import concurrent.futures
|
||||
|
||||
|
||||
#TODO: remove nodes param
|
||||
# TODO: remove nodes param
|
||||
def delete_user_on_cluster(base_client, id, nodes, username):
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
futures = [executor.submit(base_client.delete_user_on_node, id, node.id, username) for node in nodes]
|
||||
|
|
|
@ -5,11 +5,11 @@ from Cryptodome.PublicKey import RSA
|
|||
from aztk.utils import secure_utils
|
||||
|
||||
|
||||
#TODO: remove nodes param
|
||||
# TODO: remove nodes param
|
||||
def generate_user_on_cluster(base_operations, id, nodes):
|
||||
generated_username = secure_utils.generate_random_string()
|
||||
ssh_key = RSA.generate(2048)
|
||||
ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8')
|
||||
ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8")
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
futures = {
|
||||
executor.submit(base_operations.create_user_on_node, id, node.id, generated_username, ssh_pub_key): node
|
||||
|
|
|
@ -6,6 +6,6 @@ from aztk.utils import secure_utils
|
|||
def generate_user_on_node(base_client, pool_id, node_id):
|
||||
generated_username = secure_utils.generate_random_string()
|
||||
ssh_key = RSA.generate(2048)
|
||||
ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8')
|
||||
ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8")
|
||||
base_client.create_user_on_node(pool_id, node_id, generated_username, ssh_pub_key)
|
||||
return generated_username, ssh_key
|
||||
|
|
|
@ -4,12 +4,10 @@ import azure
|
|||
import azure.batch.models as batch_models
|
||||
import azure.batch.models.batch_error as batch_error
|
||||
|
||||
from aztk import error
|
||||
from aztk import models
|
||||
from aztk import error, models
|
||||
from aztk.utils import constants, helpers
|
||||
|
||||
output_file = constants.TASK_WORKING_DIR + \
|
||||
"/" + constants.SPARK_SUBMIT_LOGS_FILE
|
||||
output_file = constants.TASK_WORKING_DIR + "/" + constants.SPARK_SUBMIT_LOGS_FILE
|
||||
|
||||
|
||||
def __check_task_node_exist(batch_client, cluster_id: str, task: batch_models.CloudTask) -> bool:
|
||||
|
@ -50,17 +48,18 @@ def __get_output_file_properties(batch_client, cluster_id: str, application_name
|
|||
|
||||
def get_log_from_storage(blob_client, container_name, application_name, task):
|
||||
try:
|
||||
blob = blob_client.get_blob_to_text(container_name, application_name + '/' + constants.SPARK_SUBMIT_LOGS_FILE)
|
||||
blob = blob_client.get_blob_to_text(container_name, application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE)
|
||||
except azure.common.AzureMissingResourceHttpError:
|
||||
raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.")
|
||||
|
||||
return models.ApplicationLog(
|
||||
name=application_name,
|
||||
cluster_id=container_name,
|
||||
application_state=task.state._value_,
|
||||
application_state=task.state.name,
|
||||
log=blob.content,
|
||||
total_bytes=blob.properties.content_length,
|
||||
exit_code=task.execution_info.exit_code)
|
||||
exit_code=task.execution_info.exit_code,
|
||||
)
|
||||
|
||||
|
||||
def get_log(batch_client, blob_client, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0):
|
||||
|
@ -88,18 +87,20 @@ def get_log(batch_client, blob_client, cluster_id: str, application_name: str, t
|
|||
return models.ApplicationLog(
|
||||
name=application_name,
|
||||
cluster_id=cluster_id,
|
||||
application_state=task.state._value_,
|
||||
application_state=task.state.name,
|
||||
log=content,
|
||||
total_bytes=target_bytes,
|
||||
exit_code=task.execution_info.exit_code)
|
||||
exit_code=task.execution_info.exit_code,
|
||||
)
|
||||
else:
|
||||
return models.ApplicationLog(
|
||||
name=application_name,
|
||||
cluster_id=cluster_id,
|
||||
application_state=task.state._value_,
|
||||
log='',
|
||||
application_state=task.state.name,
|
||||
log="",
|
||||
total_bytes=target_bytes,
|
||||
exit_code=task.execution_info.exit_code)
|
||||
exit_code=task.execution_info.exit_code,
|
||||
)
|
||||
|
||||
|
||||
def get_application_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0):
|
||||
|
|
|
@ -22,9 +22,10 @@ def node_run(base_client, cluster_id, node_id, command, internal, container_name
|
|||
generated_username,
|
||||
node_rls.ip_address,
|
||||
node_rls.port,
|
||||
ssh_key=ssh_key.exportKey().decode('utf-8'),
|
||||
ssh_key=ssh_key.exportKey().decode("utf-8"),
|
||||
container_name=container_name,
|
||||
timeout=timeout)
|
||||
timeout=timeout,
|
||||
)
|
||||
return output
|
||||
finally:
|
||||
base_client.delete_user_on_node(cluster_id, node.id, generated_username)
|
||||
|
|
|
@ -26,9 +26,10 @@ def cluster_run(base_operations, cluster_id, command, internal, container_name=N
|
|||
command,
|
||||
generated_username,
|
||||
cluster_nodes,
|
||||
ssh_key=ssh_key.exportKey().decode('utf-8'),
|
||||
ssh_key=ssh_key.exportKey().decode("utf-8"),
|
||||
container_name=container_name,
|
||||
timeout=timeout))
|
||||
timeout=timeout,
|
||||
))
|
||||
return output
|
||||
except OSError as exc:
|
||||
raise exc
|
||||
|
|
|
@ -13,8 +13,6 @@ import aztk.utils.constants as constants
|
|||
import aztk.utils.get_ssh_key as get_ssh_key
|
||||
import aztk.utils.helpers as helpers
|
||||
import aztk.utils.ssh as ssh_lib
|
||||
from aztk.client.cluster import CoreClusterOperations
|
||||
from aztk.client.job import CoreJobOperations
|
||||
from aztk.internal import cluster_data
|
||||
from aztk.utils import deprecated, secure_utils
|
||||
|
||||
|
@ -27,6 +25,11 @@ class CoreClient:
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.secrets_configuration = None
|
||||
self.batch_client = None
|
||||
self.blob_client = None
|
||||
|
||||
def _get_context(self, secrets_configuration: models.SecretsConfiguration):
|
||||
self.secrets_configuration = secrets_configuration
|
||||
|
||||
|
@ -34,9 +37,9 @@ class CoreClient:
|
|||
self.batch_client = azure_api.make_batch_client(secrets_configuration)
|
||||
self.blob_client = azure_api.make_blob_client(secrets_configuration)
|
||||
context = {
|
||||
'batch_client': self.batch_client,
|
||||
'blob_client': self.blob_client,
|
||||
'secrets_configuration': self.secrets_configuration,
|
||||
"batch_client": self.batch_client,
|
||||
"blob_client": self.blob_client,
|
||||
"secrets_configuration": self.secrets_configuration,
|
||||
}
|
||||
return context
|
||||
|
||||
|
@ -52,9 +55,9 @@ class CoreClient:
|
|||
"""
|
||||
return cluster_data.ClusterData(self.blob_client, cluster_id)
|
||||
|
||||
'''
|
||||
"""
|
||||
General Batch Operations
|
||||
'''
|
||||
"""
|
||||
|
||||
@deprecated("0.10.0")
|
||||
def __delete_pool_and_job(self, pool_id: str, keep_logs: bool = False):
|
||||
|
@ -104,9 +107,8 @@ class CoreClient:
|
|||
job_id = cluster_conf.cluster_id
|
||||
|
||||
# Get a verified node agent sku
|
||||
sku_to_use, image_ref_to_use = \
|
||||
helpers.select_latest_verified_vm_image_with_node_agent_sku(
|
||||
VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, self.batch_client)
|
||||
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
|
||||
VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, self.batch_client)
|
||||
|
||||
network_conf = None
|
||||
if cluster_conf.subnet_id is not None:
|
||||
|
@ -130,8 +132,9 @@ class CoreClient:
|
|||
metadata=[
|
||||
batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key),
|
||||
batch_models.MetadataItem(
|
||||
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA)
|
||||
])
|
||||
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA),
|
||||
],
|
||||
)
|
||||
|
||||
# Create the pool + create user for the pool
|
||||
helpers.create_pool_if_not_exist(pool, self.batch_client)
|
||||
|
@ -184,13 +187,16 @@ class CoreClient:
|
|||
"""
|
||||
# Create new ssh user for the given node
|
||||
self.batch_client.compute_node.add_user(
|
||||
pool_id, node_id,
|
||||
pool_id,
|
||||
node_id,
|
||||
batch_models.ComputeNodeUser(
|
||||
name=username,
|
||||
is_admin=True,
|
||||
password=password,
|
||||
ssh_public_key=get_ssh_key.get_user_public_key(ssh_key, self.secrets_configuration),
|
||||
expiry_time=datetime.now(timezone.utc) + timedelta(days=365)))
|
||||
expiry_time=datetime.now(timezone.utc) + timedelta(days=365),
|
||||
),
|
||||
)
|
||||
|
||||
@deprecated("0.10.0")
|
||||
def __delete_user(self, pool_id: str, node_id: str, username: str) -> str:
|
||||
|
@ -229,7 +235,7 @@ class CoreClient:
|
|||
def __generate_user_on_node(self, pool_id, node_id):
|
||||
generated_username = secure_utils.generate_random_string()
|
||||
ssh_key = RSA.generate(2048)
|
||||
ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8')
|
||||
ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8")
|
||||
self.__create_user_on_node(generated_username, pool_id, node_id, ssh_pub_key)
|
||||
return generated_username, ssh_key
|
||||
|
||||
|
@ -237,7 +243,7 @@ class CoreClient:
|
|||
def __generate_user_on_pool(self, pool_id, nodes):
|
||||
generated_username = secure_utils.generate_random_string()
|
||||
ssh_key = RSA.generate(2048)
|
||||
ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8')
|
||||
ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8")
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
futures = {
|
||||
executor.submit(self.__create_user_on_node, generated_username, pool_id, node.id, ssh_pub_key): node
|
||||
|
@ -283,9 +289,10 @@ class CoreClient:
|
|||
generated_username,
|
||||
node_rls.ip_address,
|
||||
node_rls.port,
|
||||
ssh_key=ssh_key.exportKey().decode('utf-8'),
|
||||
ssh_key=ssh_key.exportKey().decode("utf-8"),
|
||||
container_name=container_name,
|
||||
timeout=timeout)
|
||||
timeout=timeout,
|
||||
)
|
||||
return output
|
||||
finally:
|
||||
self.__delete_user(cluster_id, node.id, generated_username)
|
||||
|
@ -306,9 +313,10 @@ class CoreClient:
|
|||
command,
|
||||
generated_username,
|
||||
cluster_nodes,
|
||||
ssh_key=ssh_key.exportKey().decode('utf-8'),
|
||||
ssh_key=ssh_key.exportKey().decode("utf-8"),
|
||||
container_name=container_name,
|
||||
timeout=timeout))
|
||||
timeout=timeout,
|
||||
))
|
||||
return output
|
||||
except OSError as exc:
|
||||
raise exc
|
||||
|
@ -316,14 +324,16 @@ class CoreClient:
|
|||
self.__delete_user_on_pool(generated_username, pool.id, nodes)
|
||||
|
||||
@deprecated("0.10.0")
|
||||
def __cluster_copy(self,
|
||||
cluster_id,
|
||||
source_path,
|
||||
destination_path=None,
|
||||
container_name=None,
|
||||
internal=False,
|
||||
get=False,
|
||||
timeout=None):
|
||||
def __cluster_copy(
|
||||
self,
|
||||
cluster_id,
|
||||
source_path,
|
||||
destination_path=None,
|
||||
container_name=None,
|
||||
internal=False,
|
||||
get=False,
|
||||
timeout=None,
|
||||
):
|
||||
pool, nodes = self.__get_pool_details(cluster_id)
|
||||
nodes = list(nodes)
|
||||
if internal:
|
||||
|
@ -340,9 +350,10 @@ class CoreClient:
|
|||
nodes=cluster_nodes,
|
||||
source_path=source_path,
|
||||
destination_path=destination_path,
|
||||
ssh_key=ssh_key.exportKey().decode('utf-8'),
|
||||
ssh_key=ssh_key.exportKey().decode("utf-8"),
|
||||
get=get,
|
||||
timeout=timeout))
|
||||
timeout=timeout,
|
||||
))
|
||||
return output
|
||||
except (OSError, batch_error.BatchErrorException) as exc:
|
||||
raise exc
|
||||
|
@ -375,8 +386,16 @@ class CoreClient:
|
|||
)
|
||||
|
||||
@deprecated("0.10.0")
|
||||
def __submit_job(self, job_configuration, start_task, job_manager_task, autoscale_formula,
|
||||
software_metadata_key: str, vm_image_model, application_metadata):
|
||||
def __submit_job(
|
||||
self,
|
||||
job_configuration,
|
||||
start_task,
|
||||
job_manager_task,
|
||||
autoscale_formula,
|
||||
software_metadata_key: str,
|
||||
vm_image_model,
|
||||
application_metadata,
|
||||
):
|
||||
"""
|
||||
Job Submission
|
||||
:param job_configuration -> aztk_sdk.spark.models.JobConfiguration
|
||||
|
@ -390,9 +409,8 @@ class CoreClient:
|
|||
self._get_cluster_data(job_configuration.id).save_cluster_config(job_configuration.to_cluster_config())
|
||||
|
||||
# get a verified node agent sku
|
||||
sku_to_use, image_ref_to_use = \
|
||||
helpers.select_latest_verified_vm_image_with_node_agent_sku(
|
||||
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, self.batch_client)
|
||||
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
|
||||
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, self.batch_client)
|
||||
|
||||
# set up subnet if necessary
|
||||
network_conf = None
|
||||
|
@ -419,8 +437,10 @@ class CoreClient:
|
|||
metadata=[
|
||||
batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key),
|
||||
batch_models.MetadataItem(
|
||||
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA)
|
||||
]))
|
||||
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA),
|
||||
],
|
||||
),
|
||||
)
|
||||
|
||||
# define job specification
|
||||
job_spec = batch_models.JobSpecification(
|
||||
|
@ -428,7 +448,8 @@ class CoreClient:
|
|||
display_name=job_configuration.id,
|
||||
on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job,
|
||||
job_manager_task=job_manager_task,
|
||||
metadata=[batch_models.MetadataItem(name='applications', value=application_metadata)])
|
||||
metadata=[batch_models.MetadataItem(name="applications", value=application_metadata)],
|
||||
)
|
||||
|
||||
# define schedule
|
||||
schedule = batch_models.Schedule(
|
||||
|
|
|
@ -8,14 +8,16 @@ from aztk.utils import ssh as ssh_lib
|
|||
from aztk.utils import helpers
|
||||
|
||||
|
||||
def cluster_copy(cluster_operations,
|
||||
cluster_id,
|
||||
source_path,
|
||||
destination_path=None,
|
||||
container_name=None,
|
||||
internal=False,
|
||||
get=False,
|
||||
timeout=None):
|
||||
def cluster_copy(
|
||||
cluster_operations,
|
||||
cluster_id,
|
||||
source_path,
|
||||
destination_path=None,
|
||||
container_name=None,
|
||||
internal=False,
|
||||
get=False,
|
||||
timeout=None,
|
||||
):
|
||||
cluster = cluster_operations.get(cluster_id)
|
||||
pool, nodes = cluster.pool, list(cluster.nodes)
|
||||
if internal:
|
||||
|
@ -36,9 +38,10 @@ def cluster_copy(cluster_operations,
|
|||
nodes=cluster_nodes,
|
||||
source_path=source_path,
|
||||
destination_path=destination_path,
|
||||
ssh_key=ssh_key.exportKey().decode('utf-8'),
|
||||
ssh_key=ssh_key.exportKey().decode("utf-8"),
|
||||
get=get,
|
||||
timeout=timeout))
|
||||
timeout=timeout,
|
||||
))
|
||||
return output
|
||||
except (OSError, batch_error.BatchErrorException) as exc:
|
||||
raise exc
|
||||
|
|
|
@ -5,8 +5,13 @@ from aztk import models
|
|||
from aztk.utils import helpers, constants
|
||||
|
||||
|
||||
def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterConfiguration, software_metadata_key: str,
|
||||
start_task, VmImageModel):
|
||||
def create_pool_and_job(
|
||||
core_cluster_operations,
|
||||
cluster_conf: models.ClusterConfiguration,
|
||||
software_metadata_key: str,
|
||||
start_task,
|
||||
VmImageModel,
|
||||
):
|
||||
"""
|
||||
Create a pool and job
|
||||
:param cluster_conf: the configuration object used to create the cluster
|
||||
|
@ -22,9 +27,8 @@ def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterCon
|
|||
job_id = cluster_conf.cluster_id
|
||||
|
||||
# Get a verified node agent sku
|
||||
sku_to_use, image_ref_to_use = \
|
||||
helpers.select_latest_verified_vm_image_with_node_agent_sku(
|
||||
VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, core_cluster_operations.batch_client)
|
||||
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
|
||||
VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, core_cluster_operations.batch_client)
|
||||
|
||||
network_conf = None
|
||||
if cluster_conf.subnet_id is not None:
|
||||
|
@ -48,8 +52,9 @@ def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterCon
|
|||
metadata=[
|
||||
batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key),
|
||||
batch_models.MetadataItem(
|
||||
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA)
|
||||
])
|
||||
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA),
|
||||
],
|
||||
)
|
||||
|
||||
# Create the pool + create user for the pool
|
||||
helpers.create_pool_if_not_exist(pool, core_cluster_operations.batch_client)
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
import azure.batch.models as batch_models
|
||||
from msrest.exceptions import ClientRequestError
|
||||
|
||||
from aztk.utils import BackOffPolicy, retry
|
||||
|
||||
|
||||
def delete_pool_and_job(core_cluster_operations, pool_id: str, keep_logs: bool = False):
|
||||
|
@ -19,13 +22,18 @@ def delete_pool_and_job(core_cluster_operations, pool_id: str, keep_logs: bool =
|
|||
pool_exists = core_cluster_operations.batch_client.pool.exists(pool_id)
|
||||
|
||||
if job_exists:
|
||||
core_cluster_operations.batch_client.job.delete(job_id)
|
||||
delete_batch_object(core_cluster_operations.batch_client.job.delete, job_id)
|
||||
|
||||
if pool_exists:
|
||||
core_cluster_operations.batch_client.pool.delete(pool_id)
|
||||
delete_batch_object(core_cluster_operations.batch_client.pool.delete, pool_id)
|
||||
|
||||
if not keep_logs:
|
||||
cluster_data = core_cluster_operations.get_cluster_data(pool_id)
|
||||
cluster_data.delete_container(pool_id)
|
||||
|
||||
return job_exists or pool_exists
|
||||
|
||||
|
||||
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
|
||||
def delete_batch_object(function, *args, **kwargs):
|
||||
return function(*args, **kwargs)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#TODO: return Cluster instead of (pool, nodes)
|
||||
# TODO: return Cluster instead of (pool, nodes)
|
||||
from aztk import models
|
||||
|
||||
|
||||
|
|
|
@ -13,7 +13,8 @@ class CoreClusterOperations(BaseOperations):
|
|||
cluster_configuration (:obj:`aztk.models.ClusterConfiguration`): Configuration for the cluster to be created
|
||||
software_metadata_key (:obj:`str`): the key for the primary software that will be run on the cluster
|
||||
start_task (:obj:`azure.batch.models.StartTask`): Batch StartTask definition to configure the Batch Pool
|
||||
vm_image_model (:obj:`azure.batch.models.VirtualMachineConfiguration`): Configuration of the virtual machine image and settings
|
||||
vm_image_model (:obj:`azure.batch.models.VirtualMachineConfiguration`):
|
||||
Configuration of the virtual machine image and settings
|
||||
|
||||
Returns:
|
||||
:obj:`aztk.models.Cluster`: A Cluster object representing the state and configuration of the cluster.
|
||||
|
@ -52,7 +53,8 @@ class CoreClusterOperations(BaseOperations):
|
|||
Defaults to None.
|
||||
|
||||
Returns:
|
||||
:obj:`List[aztk.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command.
|
||||
:obj:`List[aztk.models.NodeOutput]`:
|
||||
A list of NodeOutput objects representing the output of the copy command.
|
||||
"""
|
||||
return copy.cluster_copy(self, id, source_path, destination_path, container_name, internal, get, timeout)
|
||||
|
||||
|
@ -65,7 +67,8 @@ class CoreClusterOperations(BaseOperations):
|
|||
Defaults to False.
|
||||
|
||||
Returns:
|
||||
:obj:`List[aztk.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command.
|
||||
:obj:`List[aztk.models.NodeOutput]`:
|
||||
A list of NodeOutput objects representing the output of the copy command.
|
||||
"""
|
||||
return delete.delete_pool_and_job(self, id, keep_logs)
|
||||
|
||||
|
|
|
@ -1,11 +1,20 @@
|
|||
from datetime import timedelta
|
||||
|
||||
import azure.batch.models as batch_models
|
||||
from aztk.utils import helpers, constants
|
||||
|
||||
from aztk.utils import constants, helpers
|
||||
|
||||
|
||||
def submit_job(job_client, job_configuration, start_task, job_manager_task, autoscale_formula,
|
||||
software_metadata_key: str, vm_image_model, application_metadata):
|
||||
def submit_job(
|
||||
job_client,
|
||||
job_configuration,
|
||||
start_task,
|
||||
job_manager_task,
|
||||
autoscale_formula,
|
||||
software_metadata_key: str,
|
||||
vm_image_model,
|
||||
application_metadata,
|
||||
):
|
||||
"""
|
||||
Job Submission
|
||||
:param job_configuration -> aztk_sdk.spark.models.JobConfiguration
|
||||
|
@ -19,9 +28,8 @@ def submit_job(job_client, job_configuration, start_task, job_manager_task, auto
|
|||
job_client.get_cluster_data(job_configuration.id).save_cluster_config(job_configuration.to_cluster_config())
|
||||
|
||||
# get a verified node agent sku
|
||||
sku_to_use, image_ref_to_use = \
|
||||
helpers.select_latest_verified_vm_image_with_node_agent_sku(
|
||||
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, job_client.batch_client)
|
||||
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
|
||||
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, job_client.batch_client)
|
||||
|
||||
# set up subnet if necessary
|
||||
network_conf = None
|
||||
|
@ -48,8 +56,10 @@ def submit_job(job_client, job_configuration, start_task, job_manager_task, auto
|
|||
metadata=[
|
||||
batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key),
|
||||
batch_models.MetadataItem(
|
||||
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA)
|
||||
]))
|
||||
name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA),
|
||||
],
|
||||
),
|
||||
)
|
||||
|
||||
# define job specification
|
||||
job_spec = batch_models.JobSpecification(
|
||||
|
@ -57,7 +67,8 @@ def submit_job(job_client, job_configuration, start_task, job_manager_task, auto
|
|||
display_name=job_configuration.id,
|
||||
on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job,
|
||||
job_manager_task=job_manager_task,
|
||||
metadata=[batch_models.MetadataItem(name='applications', value=application_metadata)])
|
||||
metadata=[batch_models.MetadataItem(name="applications", value=application_metadata)],
|
||||
)
|
||||
|
||||
# define schedule
|
||||
schedule = batch_models.Schedule(
|
||||
|
|
|
@ -4,8 +4,16 @@ from .helpers import submit
|
|||
|
||||
|
||||
class CoreJobOperations(BaseOperations):
|
||||
def submit(self, job_configuration, start_task, job_manager_task, autoscale_formula, software_metadata_key: str,
|
||||
vm_image_model, application_metadata):
|
||||
def submit(
|
||||
self,
|
||||
job_configuration,
|
||||
start_task,
|
||||
job_manager_task,
|
||||
autoscale_formula,
|
||||
software_metadata_key: str,
|
||||
vm_image_model,
|
||||
application_metadata,
|
||||
):
|
||||
"""Submit a job
|
||||
|
||||
Jobs are a cluster definition and one or many application definitions which run on the cluster. The job's
|
||||
|
@ -26,5 +34,13 @@ class CoreJobOperations(BaseOperations):
|
|||
Returns:
|
||||
:obj:`azure.batch.models.CloudJobSchedule`: Model representing the Azure Batch JobSchedule state.
|
||||
"""
|
||||
return submit.submit_job(self, job_configuration, start_task, job_manager_task, autoscale_formula,
|
||||
software_metadata_key, vm_image_model, application_metadata)
|
||||
return submit.submit_job(
|
||||
self,
|
||||
job_configuration,
|
||||
start_task,
|
||||
job_manager_task,
|
||||
autoscale_formula,
|
||||
software_metadata_key,
|
||||
vm_image_model,
|
||||
application_metadata,
|
||||
)
|
||||
|
|
|
@ -35,8 +35,8 @@ class Field:
|
|||
"""
|
||||
|
||||
def __init__(self, *validators, **kwargs):
|
||||
self.default = kwargs.get('default')
|
||||
self.required = 'default' not in kwargs
|
||||
self.default = kwargs.get("default")
|
||||
self.required = "default" not in kwargs
|
||||
self.validators = []
|
||||
|
||||
if self.required:
|
||||
|
@ -44,7 +44,7 @@ class Field:
|
|||
|
||||
self.validators.extend(validators)
|
||||
|
||||
choices = kwargs.get('choices')
|
||||
choices = kwargs.get("choices")
|
||||
if choices:
|
||||
self.validators.append(aztk_validators.In(choices))
|
||||
|
||||
|
@ -134,11 +134,11 @@ class List(Field):
|
|||
|
||||
def __init__(self, model=None, **kwargs):
|
||||
self.model = model
|
||||
kwargs.setdefault('default', list)
|
||||
self.merge_strategy = kwargs.get('merge_strategy', ListMergeStrategy.Append)
|
||||
self.skip_none = kwargs.get('skip_none', True)
|
||||
kwargs.setdefault("default", list)
|
||||
self.merge_strategy = kwargs.get("merge_strategy", ListMergeStrategy.Append)
|
||||
self.skip_none = kwargs.get("skip_none", True)
|
||||
|
||||
super().__init__(aztk_validators.List(*kwargs.get('inner_validators', [])), **kwargs)
|
||||
super().__init__(aztk_validators.List(*kwargs.get("inner_validators", [])), **kwargs)
|
||||
|
||||
def __set__(self, instance, value):
|
||||
if isinstance(value, collections.MutableSequence):
|
||||
|
@ -175,7 +175,7 @@ class List(Field):
|
|||
output = []
|
||||
if items is not None:
|
||||
for item in items:
|
||||
if hasattr(item, 'to_dict'):
|
||||
if hasattr(item, "to_dict"):
|
||||
output.append(item.to_dict())
|
||||
else:
|
||||
output.append(item)
|
||||
|
@ -196,7 +196,7 @@ class Model(Field):
|
|||
super().__init__(aztk_validators.Model(model), *args, **kwargs)
|
||||
|
||||
self.model = model
|
||||
self.merge_strategy = kwargs.get('merge_strategy', ModelMergeStrategy.Merge)
|
||||
self.merge_strategy = kwargs.get("merge_strategy", ModelMergeStrategy.Merge)
|
||||
|
||||
def __set__(self, instance, value):
|
||||
if isinstance(value, collections.MutableMapping):
|
||||
|
|
|
@ -11,19 +11,19 @@ class ModelMeta(type):
|
|||
"""
|
||||
|
||||
def __new__(mcs, name, bases, attrs):
|
||||
attrs['_fields'] = {}
|
||||
attrs["_fields"] = {}
|
||||
|
||||
for base in bases:
|
||||
if hasattr(base, '_fields'):
|
||||
if hasattr(base, "_fields"):
|
||||
for k, v in base._fields.items():
|
||||
attrs['_fields'][k] = v
|
||||
attrs["_fields"][k] = v
|
||||
for k, v in base.__dict__.items():
|
||||
if isinstance(v, fields.Field):
|
||||
attrs['_fields'][k] = v
|
||||
attrs["_fields"][k] = v
|
||||
|
||||
for k, v in attrs.items():
|
||||
if isinstance(v, fields.Field):
|
||||
attrs['_fields'][k] = v
|
||||
attrs["_fields"][k] = v
|
||||
|
||||
return super().__new__(mcs, name, bases, attrs)
|
||||
|
||||
|
@ -84,7 +84,7 @@ class Model(metaclass=ModelMeta):
|
|||
e.model = self
|
||||
raise e
|
||||
|
||||
if hasattr(self, '__validate__'):
|
||||
if hasattr(self, "__validate__"):
|
||||
self.__validate__()
|
||||
|
||||
def merge(self, other):
|
||||
|
|
|
@ -24,7 +24,7 @@ class Required(Validator):
|
|||
|
||||
def validate(self, value):
|
||||
if value is None:
|
||||
raise InvalidModelFieldError('is required')
|
||||
raise InvalidModelFieldError("is required")
|
||||
|
||||
|
||||
class String(Validator):
|
||||
|
@ -37,7 +37,7 @@ class String(Validator):
|
|||
return
|
||||
|
||||
if not isinstance(value, str):
|
||||
raise InvalidModelFieldError('{0} should be a string'.format(value))
|
||||
raise InvalidModelFieldError("{0} should be a string".format(value))
|
||||
|
||||
|
||||
class Integer(Validator):
|
||||
|
@ -50,7 +50,7 @@ class Integer(Validator):
|
|||
return
|
||||
|
||||
if not isinstance(value, int):
|
||||
raise InvalidModelFieldError('{0} should be an integer'.format(value))
|
||||
raise InvalidModelFieldError("{0} should be an integer".format(value))
|
||||
|
||||
|
||||
class Float(Validator):
|
||||
|
@ -63,7 +63,7 @@ class Float(Validator):
|
|||
return
|
||||
|
||||
if not isinstance(value, float):
|
||||
raise InvalidModelFieldError('{0} should be a float'.format(value))
|
||||
raise InvalidModelFieldError("{0} should be a float".format(value))
|
||||
|
||||
|
||||
class Boolean(Validator):
|
||||
|
@ -74,7 +74,7 @@ class Boolean(Validator):
|
|||
return
|
||||
|
||||
if not isinstance(value, bool):
|
||||
raise InvalidModelFieldError('{0} should be a boolean'.format(value))
|
||||
raise InvalidModelFieldError("{0} should be a boolean".format(value))
|
||||
|
||||
|
||||
class In(Validator):
|
||||
|
@ -90,7 +90,7 @@ class In(Validator):
|
|||
return
|
||||
|
||||
if value not in self.choices:
|
||||
raise InvalidModelFieldError('{0} should be in {1}'.format(value, self.choices))
|
||||
raise InvalidModelFieldError("{0} should be in {1}".format(value, self.choices))
|
||||
|
||||
|
||||
class InstanceOf(Validator):
|
||||
|
@ -140,7 +140,7 @@ class List(Validator):
|
|||
return
|
||||
|
||||
if not isinstance(value, collections.MutableSequence):
|
||||
raise InvalidModelFieldError('should be a list')
|
||||
raise InvalidModelFieldError("should be a list")
|
||||
|
||||
for i in value:
|
||||
for validator in self.validators:
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import azure.batch.models as batch_models
|
||||
import datetime
|
||||
from azure.storage.blob import BlockBlobService, BlobPermissions
|
||||
|
||||
import azure.batch.models as batch_models
|
||||
from azure.storage.blob import BlobPermissions, BlockBlobService
|
||||
|
||||
|
||||
class BlobData:
|
||||
|
@ -19,7 +20,8 @@ class BlobData:
|
|||
self.container,
|
||||
self.blob,
|
||||
permission=BlobPermissions.READ,
|
||||
expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365))
|
||||
expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365),
|
||||
)
|
||||
|
||||
sas_url = self.blob_client.make_blob_url(self.container, self.blob, sas_token=sas_token)
|
||||
|
||||
|
|
|
@ -3,8 +3,10 @@ import logging
|
|||
|
||||
import azure.common
|
||||
import yaml
|
||||
from msrest.exceptions import ClientRequestError
|
||||
|
||||
from aztk.models import ClusterConfiguration
|
||||
from aztk.utils import BackOffPolicy, retry
|
||||
|
||||
from .blob_data import BlobData
|
||||
from .node_data import NodeData
|
||||
|
@ -14,6 +16,7 @@ class ClusterData:
|
|||
"""
|
||||
Class handling the management of data for a cluster
|
||||
"""
|
||||
|
||||
# ALl data related to cluster(config, metadata, etc.) should be under this folder
|
||||
CLUSTER_DIR = "cluster"
|
||||
APPLICATIONS_DIR = "applications"
|
||||
|
@ -24,26 +27,30 @@ class ClusterData:
|
|||
self.cluster_id = cluster_id
|
||||
self._ensure_container()
|
||||
|
||||
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
|
||||
def save_cluster_config(self, cluster_config):
|
||||
blob_path = self.CLUSTER_DIR + "/" + self.CLUSTER_CONFIG_FILE
|
||||
content = yaml.dump(cluster_config)
|
||||
container_name = cluster_config.cluster_id
|
||||
self.blob_client.create_blob_from_text(container_name, blob_path, content)
|
||||
|
||||
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
|
||||
def read_cluster_config(self):
|
||||
blob_path = self.CLUSTER_DIR + "/" + self.CLUSTER_CONFIG_FILE
|
||||
try:
|
||||
result = self.blob_client.get_blob_to_text(self.cluster_id, blob_path)
|
||||
return yaml.load(result.content)
|
||||
except azure.common.AzureMissingResourceHttpError:
|
||||
logging.warn("Cluster %s doesn't have cluster configuration in storage", self.cluster_id)
|
||||
logging.warning("Cluster %s doesn't have cluster configuration in storage", self.cluster_id)
|
||||
except yaml.YAMLError:
|
||||
logging.warn("Cluster %s contains invalid cluster configuration in blob", self.cluster_id)
|
||||
logging.warning("Cluster %s contains invalid cluster configuration in blob", self.cluster_id)
|
||||
|
||||
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
|
||||
def upload_file(self, blob_path: str, local_path: str) -> BlobData:
|
||||
self.blob_client.create_blob_from_path(self.cluster_id, blob_path, local_path)
|
||||
return BlobData(self.blob_client, self.cluster_id, blob_path)
|
||||
|
||||
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
|
||||
def upload_bytes(self, blob_path: str, bytes_io: io.BytesIO) -> BlobData:
|
||||
self.blob_client.create_blob_from_bytes(self.cluster_id, blob_path, bytes_io.getvalue())
|
||||
return BlobData(self.blob_client, self.cluster_id, blob_path)
|
||||
|
@ -61,8 +68,10 @@ class ClusterData:
|
|||
def upload_node_data(self, node_data: NodeData) -> BlobData:
|
||||
return self.upload_cluster_file("node-scripts.zip", node_data.zip_path)
|
||||
|
||||
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
|
||||
def _ensure_container(self):
|
||||
self.blob_client.create_container(self.cluster_id, fail_on_exist=False)
|
||||
|
||||
@retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError))
|
||||
def delete_container(self, container_name: str):
|
||||
self.blob_client.delete_container(container_name)
|
||||
|
|
|
@ -44,11 +44,11 @@ class NodeData:
|
|||
return
|
||||
if isinstance(file, (str, bytes)):
|
||||
full_file_path = Path(file)
|
||||
with io.open(file, 'r', encoding='UTF-8') as f:
|
||||
with io.open(file, "r", encoding="UTF-8") as f:
|
||||
if binary:
|
||||
self.zipf.write(file, os.path.join(zip_dir, full_file_path.name))
|
||||
else:
|
||||
self.zipf.writestr(os.path.join(zip_dir, full_file_path.name), f.read().replace('\r\n', '\n'))
|
||||
self.zipf.writestr(os.path.join(zip_dir, full_file_path.name), f.read().replace("\r\n", "\n"))
|
||||
elif isinstance(file, models.File):
|
||||
self.zipf.writestr(os.path.join(zip_dir, file.name), file.payload.getvalue())
|
||||
|
||||
|
@ -77,36 +77,38 @@ class NodeData:
|
|||
return
|
||||
self.add_files(
|
||||
[
|
||||
spark_configuration.spark_defaults_conf, spark_configuration.spark_env_sh,
|
||||
spark_configuration.core_site_xml
|
||||
spark_configuration.spark_defaults_conf,
|
||||
spark_configuration.spark_env_sh,
|
||||
spark_configuration.core_site_xml,
|
||||
],
|
||||
'conf',
|
||||
binary=False)
|
||||
"conf",
|
||||
binary=False,
|
||||
)
|
||||
|
||||
# add ssh keys for passwordless ssh
|
||||
self.zipf.writestr('id_rsa.pub', spark_configuration.ssh_key_pair['pub_key'])
|
||||
self.zipf.writestr('id_rsa', spark_configuration.ssh_key_pair['priv_key'])
|
||||
self.zipf.writestr("id_rsa.pub", spark_configuration.ssh_key_pair["pub_key"])
|
||||
self.zipf.writestr("id_rsa", spark_configuration.ssh_key_pair["priv_key"])
|
||||
|
||||
if spark_configuration.jars:
|
||||
for jar in spark_configuration.jars:
|
||||
self.add_file(jar, 'jars', binary=True)
|
||||
self.add_file(jar, "jars", binary=True)
|
||||
|
||||
def _add_user_conf(self):
|
||||
user_conf = self.cluster_config.user_configuration
|
||||
if not user_conf:
|
||||
return
|
||||
encrypted_aes_session_key, cipher_aes_nonce, tag, ciphertext = secure_utils.encrypt_password(
|
||||
self.cluster_config.spark_configuration.ssh_key_pair['pub_key'], user_conf.password)
|
||||
self.cluster_config.spark_configuration.ssh_key_pair["pub_key"], user_conf.password)
|
||||
user_conf = yaml.dump({
|
||||
'username': user_conf.username,
|
||||
'password': ciphertext,
|
||||
'ssh-key': user_conf.ssh_key,
|
||||
'aes_session_key': encrypted_aes_session_key,
|
||||
'cipher_aes_nonce': cipher_aes_nonce,
|
||||
'tag': tag,
|
||||
'cluster_id': self.cluster_config.cluster_id
|
||||
"username": user_conf.username,
|
||||
"password": ciphertext,
|
||||
"ssh-key": user_conf.ssh_key,
|
||||
"aes_session_key": encrypted_aes_session_key,
|
||||
"cipher_aes_nonce": cipher_aes_nonce,
|
||||
"tag": tag,
|
||||
"cluster_id": self.cluster_config.cluster_id,
|
||||
})
|
||||
self.zipf.writestr('user.yaml', user_conf)
|
||||
self.zipf.writestr("user.yaml", user_conf)
|
||||
|
||||
def _add_plugins(self):
|
||||
if not self.cluster_config.plugins:
|
||||
|
@ -115,23 +117,22 @@ class NodeData:
|
|||
data = []
|
||||
for plugin in self.cluster_config.plugins:
|
||||
for file in plugin.files:
|
||||
zipf = self.zipf.writestr('plugins/{0}/{1}'.format(plugin.name, file.target), file.content())
|
||||
self.zipf.writestr("plugins/{0}/{1}".format(plugin.name, file.target), file.content())
|
||||
if plugin.execute:
|
||||
data.append(
|
||||
dict(
|
||||
name=plugin.name,
|
||||
execute='{0}/{1}'.format(plugin.name, plugin.execute),
|
||||
execute="{0}/{1}".format(plugin.name, plugin.execute),
|
||||
args=plugin.args,
|
||||
env=plugin.env,
|
||||
target=plugin.target.value,
|
||||
target_role=plugin.target_role.value,
|
||||
))
|
||||
|
||||
self.zipf.writestr(os.path.join('plugins', 'plugins-manifest.yaml'), yaml.dump(data))
|
||||
return zipf
|
||||
self.zipf.writestr(os.path.join("plugins", "plugins-manifest.yaml"), yaml.dump(data))
|
||||
|
||||
def _add_node_scripts(self):
|
||||
self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), NODE_SCRIPT_FOLDER, exclude=['*.pyc*', '*.png'])
|
||||
self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), NODE_SCRIPT_FOLDER, exclude=["*.pyc*", "*.png"])
|
||||
|
||||
def _includeFile(self, filename: str, exclude: List[str]) -> bool:
|
||||
exclude = exclude or []
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import os
|
||||
from aztk.utils.command_builder import CommandBuilder
|
||||
|
||||
|
||||
|
@ -9,30 +8,30 @@ class DockerCmd:
|
|||
|
||||
def __init__(self, name: str, docker_repo: str, docker_run_options: str, cmd: str, gpu_enabled=False):
|
||||
if gpu_enabled:
|
||||
self.cmd = CommandBuilder('nvidia-docker run')
|
||||
self.cmd = CommandBuilder("nvidia-docker run")
|
||||
else:
|
||||
self.cmd = CommandBuilder('docker run')
|
||||
self.cmd.add_option('--net', 'host')
|
||||
self.cmd.add_option('--name', name)
|
||||
self.cmd.add_argument('-d')
|
||||
self.cmd = CommandBuilder("docker run")
|
||||
self.cmd.add_option("--net", "host")
|
||||
self.cmd.add_option("--name", name)
|
||||
self.cmd.add_argument("-d")
|
||||
self.cmd.add_argument(docker_run_options)
|
||||
self.cmd.add_argument(docker_repo)
|
||||
self.cmd.add_argument(cmd)
|
||||
|
||||
def add_env(self, env: str, value: str):
|
||||
self.cmd.add_option('-e', '{0}={1}'.format(env, value))
|
||||
self.cmd.add_option("-e", "{0}={1}".format(env, value))
|
||||
|
||||
def pass_env(self, env: str):
|
||||
"""
|
||||
Give the value of an environment variable in the main process to the docker image
|
||||
"""
|
||||
self.cmd.add_option('-e', '{0}'.format(env))
|
||||
self.cmd.add_option("-e", "{0}".format(env))
|
||||
|
||||
def share_folder(self, folder: str):
|
||||
self.cmd.add_option('-v', '{0}:{0}'.format(folder))
|
||||
self.cmd.add_option("-v", "{0}:{0}".format(folder))
|
||||
|
||||
def open_port(self, port: int):
|
||||
self.cmd.add_option('-p', '{0}:{0}'.format(port)) # Spark Master UI
|
||||
self.cmd.add_option("-p", "{0}:{0}".format(port)) # Spark Master UI
|
||||
|
||||
def to_str(self):
|
||||
return self.cmd.to_str()
|
||||
|
|
|
@ -1,9 +1,16 @@
|
|||
import azure.batch.models as batch_models
|
||||
|
||||
|
||||
class ApplicationLog():
|
||||
def __init__(self, name: str, cluster_id: str, log: str, total_bytes: int,
|
||||
application_state: batch_models.TaskState, exit_code: int):
|
||||
class ApplicationLog:
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
cluster_id: str,
|
||||
log: str,
|
||||
total_bytes: int,
|
||||
application_state: batch_models.TaskState,
|
||||
exit_code: int,
|
||||
):
|
||||
self.name = name
|
||||
self.cluster_id = cluster_id # TODO: change to something cluster/job agnostic
|
||||
self.log = log
|
||||
|
|
|
@ -11,10 +11,8 @@ class Cluster:
|
|||
self.visible_state = pool.allocation_state.value
|
||||
else:
|
||||
self.visible_state = pool.state.value
|
||||
self.total_current_nodes = pool.current_dedicated_nodes + \
|
||||
pool.current_low_priority_nodes
|
||||
self.total_target_nodes = pool.target_dedicated_nodes + \
|
||||
pool.target_low_priority_nodes
|
||||
self.total_current_nodes = pool.current_dedicated_nodes + pool.current_low_priority_nodes
|
||||
self.total_target_nodes = pool.target_dedicated_nodes + pool.target_low_priority_nodes
|
||||
self.current_dedicated_nodes = pool.current_dedicated_nodes
|
||||
self.current_low_pri_nodes = pool.current_low_priority_nodes
|
||||
self.target_dedicated_nodes = pool.target_dedicated_nodes
|
||||
|
|
|
@ -61,8 +61,8 @@ class ClusterConfiguration(Model):
|
|||
def __validate__(self) -> bool:
|
||||
if self.size == 0 and self.size_low_priority == 0:
|
||||
raise error.InvalidModelError(
|
||||
"Please supply a valid (greater than 0) size or size_low_priority value either in the cluster.yaml configuration file or with a parameter (--size or --size-low-priority)"
|
||||
)
|
||||
"Please supply a valid (greater than 0) size or size_low_priority value either "
|
||||
"in the cluster.yaml configuration file or with a parameter (--size or --size-low-priority)")
|
||||
|
||||
if self.vm_size is None:
|
||||
raise error.InvalidModelError(
|
||||
|
@ -70,8 +70,8 @@ class ClusterConfiguration(Model):
|
|||
|
||||
if self.mixed_mode() and not self.subnet_id:
|
||||
raise error.InvalidModelError(
|
||||
"You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes). Set the VNET's subnet_id in your cluster.yaml or with a parameter (--subnet-id)."
|
||||
)
|
||||
"You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes). "
|
||||
"Set the VNET's subnet_id in your cluster.yaml or with a parameter (--subnet-id).")
|
||||
|
||||
if self.scheduling_target == SchedulingTarget.Dedicated and self.size == 0:
|
||||
raise error.InvalidModelError("Scheduling target cannot be Dedicated if dedicated vm size is 0")
|
||||
|
|
|
@ -1,7 +1,4 @@
|
|||
import os
|
||||
import inspect
|
||||
import importlib.util
|
||||
from aztk.utils import constants
|
||||
from aztk.error import InvalidPluginReferenceError
|
||||
from aztk.spark.models import plugins
|
||||
|
||||
|
@ -28,7 +25,8 @@ class PluginManager:
|
|||
nvblas=plugins.NvBLASPlugin,
|
||||
apt_get=plugins.AptGetPlugin,
|
||||
pip_install=plugins.PipPlugin,
|
||||
conda_install=plugins.CondaPlugin)
|
||||
conda_install=plugins.CondaPlugin,
|
||||
)
|
||||
|
||||
def __init__(self):
|
||||
self.loaded = False
|
||||
|
|
|
@ -50,7 +50,5 @@ class PluginReference(Model):
|
|||
execute=script_filename,
|
||||
target=self.target,
|
||||
target_role=self.target_role or PluginConfiguration,
|
||||
files=[
|
||||
PluginFile(script_filename, self.script),
|
||||
],
|
||||
files=[PluginFile(script_filename, self.script)],
|
||||
)
|
||||
|
|
|
@ -9,6 +9,7 @@ class PluginTarget(Enum):
|
|||
"""
|
||||
Where this plugin should run
|
||||
"""
|
||||
|
||||
SparkContainer = "spark-container"
|
||||
Host = "host"
|
||||
|
||||
|
@ -26,6 +27,7 @@ class PluginPort(Model):
|
|||
:param public: [Optional] Port available to the user. If none won't open any port to the user
|
||||
:param name: [Optional] name to differentiate ports if you have multiple
|
||||
"""
|
||||
|
||||
internal = fields.Integer()
|
||||
public = fields.Field(default=None)
|
||||
name = fields.Integer()
|
||||
|
@ -55,6 +57,7 @@ class PluginConfiguration(Model):
|
|||
args: List of arguments to pass to the executing script
|
||||
env: Dict of environment variables to pass to the script
|
||||
"""
|
||||
|
||||
name = fields.String()
|
||||
files = fields.List(PluginFile)
|
||||
execute = fields.String()
|
||||
|
|
|
@ -15,7 +15,7 @@ class PluginFile(Model):
|
|||
super().__init__(target=target, local_path=local_path)
|
||||
|
||||
def content(self):
|
||||
with open(self.local_path, "r", encoding='UTF-8') as f:
|
||||
with open(self.local_path, "r", encoding="UTF-8") as f:
|
||||
return f.read()
|
||||
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ class ServicePrincipalConfiguration(Model):
|
|||
"""
|
||||
Container class for AAD authentication
|
||||
"""
|
||||
|
||||
tenant_id = fields.String()
|
||||
client_id = fields.String()
|
||||
credential = fields.String()
|
||||
|
@ -17,6 +18,7 @@ class SharedKeyConfiguration(Model):
|
|||
"""
|
||||
Container class for shared key authentication
|
||||
"""
|
||||
|
||||
batch_account_name = fields.String()
|
||||
batch_account_key = fields.String()
|
||||
batch_service_url = fields.String()
|
||||
|
@ -34,6 +36,7 @@ class DockerConfiguration(Model):
|
|||
username (str): Docker endpoint username
|
||||
password (str): Docker endpoint password
|
||||
"""
|
||||
|
||||
endpoint = fields.String(default=None)
|
||||
username = fields.String(default=None)
|
||||
password = fields.String(default=None)
|
||||
|
|
|
@ -2,4 +2,5 @@ class Software:
|
|||
"""
|
||||
Enum with list of available softwares
|
||||
"""
|
||||
|
||||
spark = "spark"
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
class SSHLog():
|
||||
class SSHLog:
|
||||
def __init__(self, output, node_id):
|
||||
self.output = output
|
||||
self.node_id = node_id
|
||||
|
|
|
@ -25,8 +25,8 @@ TOOLKIT_MAP = dict(
|
|||
r=ToolkitEnvironmentDefinition(),
|
||||
miniconda=ToolkitEnvironmentDefinition(),
|
||||
anaconda=ToolkitEnvironmentDefinition(),
|
||||
)),
|
||||
)
|
||||
),
|
||||
))
|
||||
|
||||
|
||||
class Toolkit(Model):
|
||||
|
@ -74,12 +74,12 @@ class Toolkit(Model):
|
|||
self.environment, self.environment_version, self.software, env_def.versions))
|
||||
|
||||
if self.docker_run_options:
|
||||
invalid_character = re.search('[^A-Za-z0-9 _./:=\-\"]', self.docker_run_options)
|
||||
invalid_character = re.search(r'[^A-Za-z0-9 _./:=\-"]', self.docker_run_options)
|
||||
if invalid_character:
|
||||
raise InvalidModelError(
|
||||
"Docker run options contains invalid character '{0}'. Only A-Z, a-z, 0-9, space, hyphen (-), "
|
||||
"underscore (_), period (.), forward slash (/), colon (:), equals(=), comma (,), and "
|
||||
"double quote (\") are allowed.".format(invalid_character.group(0)))
|
||||
'double quote (") are allowed.'.format(invalid_character.group(0)))
|
||||
|
||||
def get_docker_repo(self, gpu: bool):
|
||||
if self.docker_repo:
|
||||
|
@ -87,10 +87,7 @@ class Toolkit(Model):
|
|||
|
||||
repo = "aztk/{0}".format(self.software)
|
||||
|
||||
return "{repo}:{tag}".format(
|
||||
repo=repo,
|
||||
tag=self._get_docker_tag(gpu),
|
||||
)
|
||||
return "{repo}:{tag}".format(repo=repo, tag=self._get_docker_tag(gpu))
|
||||
|
||||
def get_docker_run_options(self):
|
||||
return self.docker_run_options
|
||||
|
@ -109,7 +106,7 @@ class Toolkit(Model):
|
|||
|
||||
array.append("gpu" if gpu else "base")
|
||||
|
||||
return '-'.join(array)
|
||||
return "-".join(array)
|
||||
|
||||
def _get_environment_definition(self) -> ToolkitEnvironmentDefinition:
|
||||
toolkit = TOOLKIT_MAP.get(self.software)
|
||||
|
|
|
@ -1,19 +1,20 @@
|
|||
import os
|
||||
import re
|
||||
import logging
|
||||
|
||||
import azure.batch.batch_auth as batchauth
|
||||
import azure.batch.batch_service_client as batch
|
||||
import azure.storage.blob as blob
|
||||
import azure.batch.batch_auth as batchauth
|
||||
from core import log
|
||||
from azure.common.credentials import ServicePrincipalCredentials
|
||||
from azure.mgmt.batch import BatchManagementClient
|
||||
from azure.mgmt.storage import StorageManagementClient
|
||||
from azure.storage.common import CloudStorageAccount
|
||||
|
||||
RESOURCE_ID_PATTERN = re.compile('^/subscriptions/(?P<subscription>[^/]+)'
|
||||
'/resourceGroups/(?P<resourcegroup>[^/]+)'
|
||||
'/providers/[^/]+'
|
||||
'/[^/]+Accounts/(?P<account>[^/]+)$')
|
||||
from core import log
|
||||
|
||||
RESOURCE_ID_PATTERN = re.compile("^/subscriptions/(?P<subscription>[^/]+)"
|
||||
"/resourceGroups/(?P<resourcegroup>[^/]+)"
|
||||
"/providers/[^/]+"
|
||||
"/[^/]+Accounts/(?P<account>[^/]+)$")
|
||||
|
||||
batch_account_name = os.environ.get("AZ_BATCH_ACCOUNT_NAME")
|
||||
batch_account_key = os.environ.get("BATCH_ACCOUNT_KEY")
|
||||
|
@ -44,14 +45,14 @@ def get_blob_client() -> blob.BlockBlobService:
|
|||
account_name=storage_account_name, account_key=storage_account_key, endpoint_suffix=storage_account_suffix)
|
||||
else:
|
||||
credentials = ServicePrincipalCredentials(
|
||||
client_id=client_id, secret=credential, tenant=tenant_id, resource='https://management.core.windows.net/')
|
||||
client_id=client_id, secret=credential, tenant=tenant_id, resource="https://management.core.windows.net/")
|
||||
m = RESOURCE_ID_PATTERN.match(storage_resource_id)
|
||||
accountname = m.group('account')
|
||||
subscription = m.group('subscription')
|
||||
resourcegroup = m.group('resourcegroup')
|
||||
accountname = m.group("account")
|
||||
subscription = m.group("subscription")
|
||||
resourcegroup = m.group("resourcegroup")
|
||||
mgmt_client = StorageManagementClient(credentials, subscription)
|
||||
key = mgmt_client.storage_accounts.list_keys(
|
||||
resource_group_name=resourcegroup, account_name=accountname).keys[0].value
|
||||
key = (mgmt_client.storage_accounts.list_keys(resource_group_name=resourcegroup, account_name=accountname)
|
||||
.keys[0].value)
|
||||
storage_client = CloudStorageAccount(accountname, key)
|
||||
return storage_client.create_block_blob_service()
|
||||
|
||||
|
@ -62,13 +63,13 @@ def get_batch_client() -> batch.BatchServiceClient:
|
|||
credentials = batchauth.SharedKeyCredentials(batch_account_name, batch_account_key)
|
||||
else:
|
||||
credentials = ServicePrincipalCredentials(
|
||||
client_id=client_id, secret=credential, tenant=tenant_id, resource='https://management.core.windows.net/')
|
||||
client_id=client_id, secret=credential, tenant=tenant_id, resource="https://management.core.windows.net/")
|
||||
m = RESOURCE_ID_PATTERN.match(batch_resource_id)
|
||||
batch_client = BatchManagementClient(credentials, m.group('subscription'))
|
||||
account = batch_client.batch_account.get(m.group('resourcegroup'), m.group('account'))
|
||||
base_url = 'https://%s/' % account.account_endpoint
|
||||
batch_client = BatchManagementClient(credentials, m.group("subscription"))
|
||||
account = batch_client.batch_account.get(m.group("resourcegroup"), m.group("account"))
|
||||
base_url = "https://%s/" % account.account_endpoint
|
||||
credentials = ServicePrincipalCredentials(
|
||||
client_id=client_id, secret=credential, tenant=tenant_id, resource='https://batch.core.windows.net/')
|
||||
client_id=client_id, secret=credential, tenant=tenant_id, resource="https://batch.core.windows.net/")
|
||||
|
||||
return batch.BatchServiceClient(credentials, base_url=base_url)
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ import logging
|
|||
|
||||
log = logging.getLogger("aztk.node-agent")
|
||||
|
||||
DEFAULT_FORMAT = '%(message)s'
|
||||
DEFAULT_FORMAT = "%(message)s"
|
||||
|
||||
|
||||
def setup_logging():
|
||||
|
|
|
@ -5,45 +5,47 @@ from Cryptodome.PublicKey import RSA
|
|||
from Cryptodome.Cipher import AES, PKCS1_OAEP
|
||||
from datetime import datetime, timezone, timedelta
|
||||
import yaml
|
||||
'''
|
||||
"""
|
||||
Creates a user if the user configuration file at $AZTK_WORKING_DIR/user.yaml exists
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def create_user(batch_client):
|
||||
path = os.path.join(os.environ['AZTK_WORKING_DIR'], "user.yaml")
|
||||
path = os.path.join(os.environ["AZTK_WORKING_DIR"], "user.yaml")
|
||||
|
||||
if not os.path.isfile(path):
|
||||
print("No user to create.")
|
||||
return
|
||||
|
||||
with open(path, 'r', encoding='UTF-8') as file:
|
||||
with open(path, "r", encoding="UTF-8") as file:
|
||||
user_conf = yaml.load(file.read())
|
||||
|
||||
try:
|
||||
password = None if user_conf['ssh-key'] else decrypt_password(user_conf)
|
||||
password = None if user_conf["ssh-key"] else decrypt_password(user_conf)
|
||||
|
||||
batch_client.compute_node.add_user(
|
||||
pool_id=os.environ['AZ_BATCH_POOL_ID'],
|
||||
node_id=os.environ['AZ_BATCH_NODE_ID'],
|
||||
pool_id=os.environ["AZ_BATCH_POOL_ID"],
|
||||
node_id=os.environ["AZ_BATCH_NODE_ID"],
|
||||
user=batch_models.ComputeNodeUser(
|
||||
name=user_conf['username'],
|
||||
name=user_conf["username"],
|
||||
is_admin=True,
|
||||
password=password,
|
||||
ssh_public_key=str(user_conf['ssh-key']),
|
||||
expiry_time=datetime.now(timezone.utc) + timedelta(days=365)))
|
||||
ssh_public_key=str(user_conf["ssh-key"]),
|
||||
expiry_time=datetime.now(timezone.utc) + timedelta(days=365),
|
||||
),
|
||||
)
|
||||
except batch_error.BatchErrorException as e:
|
||||
print(e)
|
||||
|
||||
|
||||
def decrypt_password(user_conf):
|
||||
cipher_text = user_conf['password']
|
||||
encrypted_aes_session_key = user_conf['aes_session_key']
|
||||
cipher_aes_nonce = user_conf['cipher_aes_nonce']
|
||||
tag = user_conf['tag']
|
||||
cipher_text = user_conf["password"]
|
||||
encrypted_aes_session_key = user_conf["aes_session_key"]
|
||||
cipher_aes_nonce = user_conf["cipher_aes_nonce"]
|
||||
tag = user_conf["tag"]
|
||||
|
||||
# Read private key
|
||||
with open(os.path.join(os.environ['AZTK_WORKING_DIR'], 'id_rsa'), encoding='UTF-8') as f:
|
||||
with open(os.path.join(os.environ["AZTK_WORKING_DIR"], "id_rsa"), encoding="UTF-8") as f:
|
||||
private_key = RSA.import_key(f.read())
|
||||
# Decrypt the session key with the public RSA key
|
||||
cipher_rsa = PKCS1_OAEP.new(private_key)
|
||||
|
|
|
@ -25,7 +25,7 @@ def setup_host(docker_repo: str, docker_run_options: str):
|
|||
client = config.batch_client
|
||||
|
||||
create_user.create_user(batch_client=client)
|
||||
if os.environ['AZ_BATCH_NODE_IS_DEDICATED'] == "true" or os.environ['AZTK_MIXED_MODE'] == "false":
|
||||
if os.environ["AZ_BATCH_NODE_IS_DEDICATED"] == "true" or os.environ["AZTK_MIXED_MODE"] == "false":
|
||||
is_master = pick_master.find_master(client)
|
||||
else:
|
||||
is_master = False
|
||||
|
@ -50,7 +50,7 @@ def setup_host(docker_repo: str, docker_run_options: str):
|
|||
|
||||
setup_node_scheduling(client, cluster_conf, is_master)
|
||||
|
||||
#TODO pass azure file shares
|
||||
# TODO pass azure file shares
|
||||
spark_container.start_spark_container(
|
||||
docker_repo=docker_repo,
|
||||
docker_run_options=docker_run_options,
|
||||
|
@ -82,4 +82,4 @@ def setup_spark_container():
|
|||
|
||||
plugins.setup_plugins(target=PluginTarget.SparkContainer, is_master=is_master, is_worker=is_worker)
|
||||
|
||||
open("/tmp/setup_complete", 'a').close()
|
||||
open("/tmp/setup_complete", "a").close()
|
||||
|
|
|
@ -37,8 +37,8 @@ def try_assign_self_as_master(client: batch.BatchServiceClient, pool: batchmodel
|
|||
client.pool.patch(
|
||||
config.pool_id,
|
||||
batchmodels.PoolPatchParameter(metadata=new_metadata),
|
||||
batchmodels.PoolPatchOptions(if_match=pool.e_tag,
|
||||
))
|
||||
batchmodels.PoolPatchOptions(if_match=pool.e_tag),
|
||||
)
|
||||
return True
|
||||
except (batcherror.BatchErrorException, ClientRequestError):
|
||||
print("Couldn't assign itself as master the pool because the pool was modified since last get.")
|
||||
|
|
|
@ -1,18 +1,19 @@
|
|||
import os
|
||||
import json
|
||||
import yaml
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from aztk.models.plugins import PluginTarget, PluginTargetRole
|
||||
|
||||
log_folder = os.path.join(os.environ['AZTK_WORKING_DIR'], 'logs', 'plugins')
|
||||
log_folder = os.path.join(os.environ["AZTK_WORKING_DIR"], "logs", "plugins")
|
||||
|
||||
|
||||
def _read_manifest_file(path=None):
|
||||
if not os.path.isfile(path):
|
||||
print("Plugins manifest file doesn't exist at {0}".format(path))
|
||||
else:
|
||||
with open(path, 'r', encoding='UTF-8') as stream:
|
||||
with open(path, "r", encoding="UTF-8") as stream:
|
||||
try:
|
||||
return yaml.load(stream)
|
||||
except json.JSONDecodeError as err:
|
||||
|
@ -22,7 +23,7 @@ def _read_manifest_file(path=None):
|
|||
def setup_plugins(target: PluginTarget, is_master: bool = False, is_worker: bool = False):
|
||||
|
||||
plugins_dir = _plugins_dir()
|
||||
plugins_manifest = _read_manifest_file(os.path.join(plugins_dir, 'plugins-manifest.yaml'))
|
||||
plugins_manifest = _read_manifest_file(os.path.join(plugins_dir, "plugins-manifest.yaml"))
|
||||
|
||||
if not os.path.exists(log_folder):
|
||||
os.makedirs(log_folder)
|
||||
|
@ -32,28 +33,41 @@ def setup_plugins(target: PluginTarget, is_master: bool = False, is_worker: bool
|
|||
|
||||
|
||||
def _plugins_dir():
|
||||
return os.path.join(os.environ['AZTK_WORKING_DIR'], 'plugins')
|
||||
return os.path.join(os.environ["AZTK_WORKING_DIR"], "plugins")
|
||||
|
||||
|
||||
def _run_on_this_node(plugin_obj, target: PluginTarget, is_master, is_worker):
|
||||
|
||||
print("Loading plugin {} in {} on {}".format(plugin_obj["execute"], plugin_obj['target'],
|
||||
plugin_obj['target_role']))
|
||||
print("Loading plugin {} in {} on {}".format(plugin_obj["execute"], plugin_obj["target"],
|
||||
plugin_obj["target_role"]))
|
||||
|
||||
if plugin_obj['target'] != target.value:
|
||||
print("Ignoring ", plugin_obj["execute"], "as target is for ", plugin_obj['target'],
|
||||
"but is currently running in ", target.value)
|
||||
if plugin_obj["target"] != target.value:
|
||||
print(
|
||||
"Ignoring ",
|
||||
plugin_obj["execute"],
|
||||
"as target is for ",
|
||||
plugin_obj["target"],
|
||||
"but is currently running in ",
|
||||
target.value,
|
||||
)
|
||||
return False
|
||||
|
||||
if plugin_obj['target_role'] == PluginTargetRole.Master.value and is_master is True:
|
||||
if plugin_obj["target_role"] == PluginTargetRole.Master.value and is_master is True:
|
||||
return True
|
||||
if plugin_obj['target_role'] == PluginTargetRole.Worker.value and is_worker is True:
|
||||
if plugin_obj["target_role"] == PluginTargetRole.Worker.value and is_worker is True:
|
||||
return True
|
||||
if plugin_obj['target_role'] == PluginTargetRole.All.value:
|
||||
if plugin_obj["target_role"] == PluginTargetRole.All.value:
|
||||
return True
|
||||
|
||||
print("Ignoring plugin", plugin_obj["execute"], "as target role is ", plugin_obj['target_role'],
|
||||
"and node is master: ", is_master, is_worker)
|
||||
print(
|
||||
"Ignoring plugin",
|
||||
plugin_obj["execute"],
|
||||
"as target role is ",
|
||||
plugin_obj["target_role"],
|
||||
"and node is master: ",
|
||||
is_master,
|
||||
is_worker,
|
||||
)
|
||||
|
||||
return False
|
||||
|
||||
|
@ -63,8 +77,8 @@ def _setup_plugins(plugins_manifest, target: PluginTarget, is_master, is_worker)
|
|||
|
||||
for plugin in plugins_manifest:
|
||||
if _run_on_this_node(plugin, target, is_master, is_worker):
|
||||
path = os.path.join(plugins_dir, plugin['execute'])
|
||||
_run_script(plugin.get("name"), path, plugin.get('args'), plugin.get('env'))
|
||||
path = os.path.join(plugins_dir, plugin["execute"])
|
||||
_run_script(plugin.get("name"), path, plugin.get("args"), plugin.get("env"))
|
||||
|
||||
|
||||
def _run_script(name: str, script_path: str = None, args: dict = None, env: dict = None):
|
||||
|
@ -84,7 +98,7 @@ def _run_script(name: str, script_path: str = None, args: dict = None, env: dict
|
|||
if args is None:
|
||||
args = []
|
||||
|
||||
out_file = open(os.path.join(log_folder, '{0}.txt'.format(name)), 'w', encoding='UTF-8')
|
||||
out_file = open(os.path.join(log_folder, "{0}.txt".format(name)), "w", encoding="UTF-8")
|
||||
try:
|
||||
subprocess.call([script_path] + args, env=my_env, stdout=out_file, stderr=out_file)
|
||||
print("Finished running")
|
||||
|
|
|
@ -2,13 +2,14 @@
|
|||
Code that handle spark configuration
|
||||
"""
|
||||
import datetime
|
||||
import time
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
from subprocess import call, Popen, check_output
|
||||
import time
|
||||
from subprocess import call
|
||||
from typing import List
|
||||
|
||||
import azure.batch.models as batchmodels
|
||||
|
||||
from core import config
|
||||
from install import pick_master
|
||||
|
||||
|
@ -55,7 +56,7 @@ def setup_connection():
|
|||
master_node = get_node(master_node_id)
|
||||
|
||||
master_config_file = os.path.join(spark_conf_folder, "master")
|
||||
master_file = open(master_config_file, 'w', encoding='UTF-8')
|
||||
master_file = open(master_config_file, "w", encoding="UTF-8")
|
||||
|
||||
print("Adding master node ip {0} to config file '{1}'".format(master_node.ip_address, master_config_file))
|
||||
master_file.write("{0}\n".format(master_node.ip_address))
|
||||
|
@ -127,9 +128,9 @@ def setup_conf():
|
|||
|
||||
|
||||
def setup_ssh_keys():
|
||||
pub_key_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'id_rsa.pub')
|
||||
priv_key_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'id_rsa')
|
||||
ssh_key_dest = '/root/.ssh'
|
||||
pub_key_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "id_rsa.pub")
|
||||
priv_key_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "id_rsa")
|
||||
ssh_key_dest = "/root/.ssh"
|
||||
|
||||
if not os.path.exists(ssh_key_dest):
|
||||
os.mkdir(ssh_key_dest)
|
||||
|
@ -139,27 +140,27 @@ def setup_ssh_keys():
|
|||
|
||||
|
||||
def copy_spark_env():
|
||||
spark_env_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'conf/spark-env.sh')
|
||||
spark_env_path_dest = os.path.join(spark_home, 'conf/spark-env.sh')
|
||||
spark_env_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "conf/spark-env.sh")
|
||||
spark_env_path_dest = os.path.join(spark_home, "conf/spark-env.sh")
|
||||
copyfile(spark_env_path_src, spark_env_path_dest)
|
||||
|
||||
|
||||
def copy_spark_defaults():
|
||||
spark_default_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'conf/spark-defaults.conf')
|
||||
spark_default_path_dest = os.path.join(spark_home, 'conf/spark-defaults.conf')
|
||||
spark_default_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "conf/spark-defaults.conf")
|
||||
spark_default_path_dest = os.path.join(spark_home, "conf/spark-defaults.conf")
|
||||
copyfile(spark_default_path_src, spark_default_path_dest)
|
||||
|
||||
|
||||
def copy_core_site():
|
||||
spark_core_site_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'conf/core-site.xml')
|
||||
spark_core_site_dest = os.path.join(spark_home, 'conf/core-site.xml')
|
||||
spark_core_site_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "conf/core-site.xml")
|
||||
spark_core_site_dest = os.path.join(spark_home, "conf/core-site.xml")
|
||||
copyfile(spark_core_site_src, spark_core_site_dest)
|
||||
|
||||
|
||||
def copy_jars():
|
||||
# Copy jars to $SPARK_HOME/jars
|
||||
spark_default_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'jars')
|
||||
spark_default_path_dest = os.path.join(spark_home, 'jars')
|
||||
spark_default_path_src = os.path.join(os.environ["AZTK_WORKING_DIR"], "jars")
|
||||
spark_default_path_dest = os.path.join(spark_home, "jars")
|
||||
|
||||
try:
|
||||
jar_files = os.listdir(spark_default_path_src)
|
||||
|
@ -175,10 +176,10 @@ def copy_jars():
|
|||
|
||||
def parse_configuration_file(path_to_file: str):
|
||||
try:
|
||||
file = open(path_to_file, 'r', encoding='UTF-8')
|
||||
file = open(path_to_file, "r", encoding="UTF-8")
|
||||
properties = {}
|
||||
for line in file:
|
||||
if (not line.startswith('#') and len(line) > 1):
|
||||
if not line.startswith("#") and len(line) > 1:
|
||||
split = line.split()
|
||||
properties[split[0]] = split[1]
|
||||
return properties
|
||||
|
@ -189,10 +190,10 @@ def parse_configuration_file(path_to_file: str):
|
|||
|
||||
def start_history_server():
|
||||
# configure the history server
|
||||
spark_event_log_enabled_key = 'spark.eventLog.enabled'
|
||||
spark_event_log_directory_key = 'spark.eventLog.dir'
|
||||
spark_history_fs_log_directory = 'spark.history.fs.logDirectory'
|
||||
path_to_spark_defaults_conf = os.path.join(spark_home, 'conf/spark-defaults.conf')
|
||||
spark_event_log_enabled_key = "spark.eventLog.enabled"
|
||||
spark_event_log_directory_key = "spark.eventLog.dir"
|
||||
spark_history_fs_log_directory = "spark.history.fs.logDirectory"
|
||||
path_to_spark_defaults_conf = os.path.join(spark_home, "conf/spark-defaults.conf")
|
||||
properties = parse_configuration_file(path_to_spark_defaults_conf)
|
||||
required_keys = [spark_event_log_enabled_key, spark_event_log_directory_key, spark_history_fs_log_directory]
|
||||
|
||||
|
@ -208,17 +209,17 @@ def start_history_server():
|
|||
def configure_history_server_log_path(path_to_log_file):
|
||||
# Check if the file path starts with a local file extension
|
||||
# If so, create the path on disk otherwise ignore
|
||||
print('Configuring spark history server log directory {}.'.format(path_to_log_file))
|
||||
if path_to_log_file.startswith('file:/'):
|
||||
print("Configuring spark history server log directory {}.".format(path_to_log_file))
|
||||
if path_to_log_file.startswith("file:/"):
|
||||
# create the local path on disk
|
||||
directory = path_to_log_file.replace('file:', '')
|
||||
directory = path_to_log_file.replace("file:", "")
|
||||
if os.path.exists(directory):
|
||||
print('Skipping. Directory {} already exists.'.format(directory))
|
||||
print("Skipping. Directory {} already exists.".format(directory))
|
||||
else:
|
||||
print('Create directory {}.'.format(directory))
|
||||
print("Create directory {}.".format(directory))
|
||||
os.makedirs(directory)
|
||||
|
||||
# Make sure the directory can be accessed by all users
|
||||
os.chmod(directory, mode=0o777)
|
||||
else:
|
||||
print('Skipping. The eventLog directory is not local.')
|
||||
print("Skipping. The eventLog directory is not local.")
|
||||
|
|
|
@ -15,42 +15,43 @@ def start_spark_container(docker_repo: str = None,
|
|||
docker_repo=docker_repo,
|
||||
docker_run_options=docker_run_options,
|
||||
cmd="/bin/bash /mnt/batch/tasks/startup/wd/aztk/node_scripts/docker_main.sh",
|
||||
gpu_enabled=gpu_enabled)
|
||||
gpu_enabled=gpu_enabled,
|
||||
)
|
||||
|
||||
if file_mounts:
|
||||
for mount in file_mounts:
|
||||
cmd.share_folder(mount.mount_path)
|
||||
cmd.share_folder('/mnt')
|
||||
cmd.share_folder("/mnt")
|
||||
|
||||
cmd.pass_env('AZTK_WORKING_DIR')
|
||||
cmd.pass_env('AZ_BATCH_ACCOUNT_NAME')
|
||||
cmd.pass_env('BATCH_ACCOUNT_KEY')
|
||||
cmd.pass_env('BATCH_SERVICE_URL')
|
||||
cmd.pass_env('STORAGE_ACCOUNT_NAME')
|
||||
cmd.pass_env('STORAGE_ACCOUNT_KEY')
|
||||
cmd.pass_env('STORAGE_ACCOUNT_SUFFIX')
|
||||
cmd.pass_env("AZTK_WORKING_DIR")
|
||||
cmd.pass_env("AZ_BATCH_ACCOUNT_NAME")
|
||||
cmd.pass_env("BATCH_ACCOUNT_KEY")
|
||||
cmd.pass_env("BATCH_SERVICE_URL")
|
||||
cmd.pass_env("STORAGE_ACCOUNT_NAME")
|
||||
cmd.pass_env("STORAGE_ACCOUNT_KEY")
|
||||
cmd.pass_env("STORAGE_ACCOUNT_SUFFIX")
|
||||
|
||||
cmd.pass_env('SP_TENANT_ID')
|
||||
cmd.pass_env('SP_CLIENT_ID')
|
||||
cmd.pass_env('SP_CREDENTIAL')
|
||||
cmd.pass_env('SP_BATCH_RESOURCE_ID')
|
||||
cmd.pass_env('SP_STORAGE_RESOURCE_ID')
|
||||
cmd.pass_env("SP_TENANT_ID")
|
||||
cmd.pass_env("SP_CLIENT_ID")
|
||||
cmd.pass_env("SP_CREDENTIAL")
|
||||
cmd.pass_env("SP_BATCH_RESOURCE_ID")
|
||||
cmd.pass_env("SP_STORAGE_RESOURCE_ID")
|
||||
|
||||
cmd.pass_env('AZ_BATCH_POOL_ID')
|
||||
cmd.pass_env('AZ_BATCH_NODE_ID')
|
||||
cmd.pass_env('AZ_BATCH_NODE_IS_DEDICATED')
|
||||
cmd.pass_env("AZ_BATCH_POOL_ID")
|
||||
cmd.pass_env("AZ_BATCH_NODE_ID")
|
||||
cmd.pass_env("AZ_BATCH_NODE_IS_DEDICATED")
|
||||
|
||||
cmd.pass_env('AZTK_WORKER_ON_MASTER')
|
||||
cmd.pass_env('AZTK_MIXED_MODE')
|
||||
cmd.pass_env('AZTK_IS_MASTER')
|
||||
cmd.pass_env('AZTK_IS_WORKER')
|
||||
cmd.pass_env('AZTK_MASTER_IP')
|
||||
cmd.pass_env("AZTK_WORKER_ON_MASTER")
|
||||
cmd.pass_env("AZTK_MIXED_MODE")
|
||||
cmd.pass_env("AZTK_IS_MASTER")
|
||||
cmd.pass_env("AZTK_IS_WORKER")
|
||||
cmd.pass_env("AZTK_MASTER_IP")
|
||||
|
||||
cmd.pass_env('SPARK_WEB_UI_PORT')
|
||||
cmd.pass_env('SPARK_WORKER_UI_PORT')
|
||||
cmd.pass_env('SPARK_CONTAINER_NAME')
|
||||
cmd.pass_env('SPARK_SUBMIT_LOGS_FILE')
|
||||
cmd.pass_env('SPARK_JOB_UI_PORT')
|
||||
cmd.pass_env("SPARK_WEB_UI_PORT")
|
||||
cmd.pass_env("SPARK_WORKER_UI_PORT")
|
||||
cmd.pass_env("SPARK_CONTAINER_NAME")
|
||||
cmd.pass_env("SPARK_SUBMIT_LOGS_FILE")
|
||||
cmd.pass_env("SPARK_JOB_UI_PORT")
|
||||
|
||||
cmd.open_port(8080) # Spark Master UI
|
||||
cmd.open_port(7077) # Spark Master
|
||||
|
@ -69,5 +70,5 @@ def start_spark_container(docker_repo: str = None,
|
|||
print("-" * 60)
|
||||
print(cmd.to_str())
|
||||
print("=" * 60)
|
||||
subprocess.call(['/bin/bash', '-c', 'echo Is master?: $AZTK_IS_MASTER _ $AZTK_IS_WORKER'])
|
||||
subprocess.call(['/bin/bash', '-c', cmd.to_str()])
|
||||
subprocess.call(["/bin/bash", "-c", "echo Is master?: $AZTK_IS_MASTER _ $AZTK_IS_WORKER"])
|
||||
subprocess.call(["/bin/bash", "-c", cmd.to_str()])
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
import datetime
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from typing import List
|
||||
|
||||
import azure.batch.models as batch_models
|
||||
import azure.storage.blob as blob
|
||||
import yaml
|
||||
from aztk.utils.command_builder import CommandBuilder
|
||||
|
||||
from core import config
|
||||
from install.pick_master import get_master_node_id
|
||||
|
||||
|
@ -20,14 +16,13 @@ def affinitize_task_to_master(batch_client, cluster_id, task):
|
|||
|
||||
|
||||
def schedule_tasks(tasks_path):
|
||||
'''
|
||||
"""
|
||||
Handle the request to submit a task
|
||||
'''
|
||||
"""
|
||||
batch_client = config.batch_client
|
||||
blob_client = config.blob_client
|
||||
|
||||
for task_definition in tasks_path:
|
||||
with open(task_definition, 'r', encoding='UTF-8') as stream:
|
||||
with open(task_definition, "r", encoding="UTF-8") as stream:
|
||||
try:
|
||||
task = yaml.load(stream)
|
||||
except yaml.YAMLError as exc:
|
||||
|
@ -36,13 +31,13 @@ def schedule_tasks(tasks_path):
|
|||
# affinitize task to master
|
||||
task = affinitize_task_to_master(batch_client, os.environ["AZ_BATCH_POOL_ID"], task)
|
||||
# schedule the task
|
||||
batch_client.task.add(job_id=os.environ['AZ_BATCH_JOB_ID'], task=task)
|
||||
batch_client.task.add(job_id=os.environ["AZ_BATCH_JOB_ID"], task=task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
tasks_path = []
|
||||
for file in os.listdir(os.environ['AZ_BATCH_TASK_WORKING_DIR']):
|
||||
for file in os.listdir(os.environ["AZ_BATCH_TASK_WORKING_DIR"]):
|
||||
if file.endswith(".yaml"):
|
||||
tasks_path.append(os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], file))
|
||||
tasks_path.append(os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], file))
|
||||
|
||||
schedule_tasks(tasks_path)
|
||||
|
|
|
@ -42,7 +42,9 @@ install_prerequisites () {
|
|||
|
||||
install_docker_compose () {
|
||||
echo "Installing Docker-Compose"
|
||||
sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose
|
||||
for i in {1..5}; do
|
||||
sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose && break || sleep 2;
|
||||
done
|
||||
sudo chmod +x /usr/local/bin/docker-compose
|
||||
echo "Finished installing Docker-Compose"
|
||||
}
|
||||
|
@ -64,9 +66,9 @@ pull_docker_container () {
|
|||
install_python_dependencies () {
|
||||
echo "Installing python dependencies"
|
||||
pipenv install --python /usr/bin/python3.5m
|
||||
pipenv run pip install --upgrade setuptools wheel #TODO: add pip when pipenv is compatible with pip10
|
||||
pipenv run pip install --upgrade pip setuptools wheel
|
||||
pip --version
|
||||
echo "Finished installing python dependencies"
|
||||
|
||||
}
|
||||
|
||||
run_docker_container () {
|
||||
|
|
|
@ -1,20 +1,22 @@
|
|||
import sys
|
||||
import os
|
||||
import logging
|
||||
import yaml
|
||||
import subprocess
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from typing import List
|
||||
import azure.storage.blob as blob
|
||||
|
||||
import azure.batch.models as batch_models
|
||||
import azure.storage.blob as blob
|
||||
import yaml
|
||||
|
||||
from aztk.utils.command_builder import CommandBuilder
|
||||
from core import config
|
||||
|
||||
# limit azure.storage logging
|
||||
logging.getLogger("azure.storage").setLevel(logging.CRITICAL)
|
||||
'''
|
||||
"""
|
||||
Submit helper methods
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def upload_file_to_container(container_name,
|
||||
|
@ -40,7 +42,7 @@ def upload_file_to_container(container_name,
|
|||
blob_name = file_path.strip("/")
|
||||
else:
|
||||
blob_name = os.path.basename(file_path)
|
||||
blob_path = application_name + '/' + blob_name
|
||||
blob_path = application_name + "/" + blob_name
|
||||
|
||||
if not node_path:
|
||||
node_path = blob_name
|
||||
|
@ -53,47 +55,60 @@ def upload_file_to_container(container_name,
|
|||
container_name,
|
||||
blob_path,
|
||||
permission=blob.BlobPermissions.READ,
|
||||
expiry=datetime.datetime.utcnow() + datetime.timedelta(days=7))
|
||||
expiry=datetime.datetime.utcnow() + datetime.timedelta(days=7),
|
||||
)
|
||||
|
||||
sas_url = blob_client.make_blob_url(container_name, blob_path, sas_token=sas_token)
|
||||
|
||||
return batch_models.ResourceFile(file_path=node_path, blob_source=sas_url)
|
||||
|
||||
|
||||
def __app_submit_cmd(name: str, app: str, app_args: List[str], main_class: str, jars: List[str], py_files: List[str],
|
||||
files: List[str], driver_java_options: str, driver_library_path: str, driver_class_path: str,
|
||||
driver_memory: str, executor_memory: str, driver_cores: int, executor_cores: int):
|
||||
cluster_id = os.environ['AZ_BATCH_POOL_ID']
|
||||
spark_home = os.environ['SPARK_HOME']
|
||||
with open(os.path.join(spark_home, 'conf', 'master')) as f:
|
||||
def __app_submit_cmd(
|
||||
name: str,
|
||||
app: str,
|
||||
app_args: List[str],
|
||||
main_class: str,
|
||||
jars: List[str],
|
||||
py_files: List[str],
|
||||
files: List[str],
|
||||
driver_java_options: str,
|
||||
driver_library_path: str,
|
||||
driver_class_path: str,
|
||||
driver_memory: str,
|
||||
executor_memory: str,
|
||||
driver_cores: int,
|
||||
executor_cores: int,
|
||||
):
|
||||
spark_home = os.environ["SPARK_HOME"]
|
||||
with open(os.path.join(spark_home, "conf", "master")) as f:
|
||||
master_ip = f.read().rstrip()
|
||||
|
||||
# set file paths to correct path on container
|
||||
files_path = os.environ['AZ_BATCH_TASK_WORKING_DIR']
|
||||
files_path = os.environ["AZ_BATCH_TASK_WORKING_DIR"]
|
||||
jars = [os.path.join(files_path, os.path.basename(jar)) for jar in jars]
|
||||
py_files = [os.path.join(files_path, os.path.basename(py_file)) for py_file in py_files]
|
||||
files = [os.path.join(files_path, os.path.basename(f)) for f in files]
|
||||
|
||||
# 2>&1 redirect stdout and stderr to be in the same file
|
||||
spark_submit_cmd = CommandBuilder('{0}/bin/spark-submit'.format(spark_home))
|
||||
spark_submit_cmd.add_option('--master', 'spark://{0}:7077'.format(master_ip))
|
||||
spark_submit_cmd.add_option('--name', name)
|
||||
spark_submit_cmd.add_option('--class', main_class)
|
||||
spark_submit_cmd.add_option('--jars', jars and ','.join(jars))
|
||||
spark_submit_cmd.add_option('--py-files', py_files and ','.join(py_files))
|
||||
spark_submit_cmd.add_option('--files', files and ','.join(files))
|
||||
spark_submit_cmd.add_option('--driver-java-options', driver_java_options)
|
||||
spark_submit_cmd.add_option('--driver-library-path', driver_library_path)
|
||||
spark_submit_cmd.add_option('--driver-class-path', driver_class_path)
|
||||
spark_submit_cmd.add_option('--driver-memory', driver_memory)
|
||||
spark_submit_cmd.add_option('--executor-memory', executor_memory)
|
||||
spark_submit_cmd = CommandBuilder("{0}/bin/spark-submit".format(spark_home))
|
||||
spark_submit_cmd.add_option("--master", "spark://{0}:7077".format(master_ip))
|
||||
spark_submit_cmd.add_option("--name", name)
|
||||
spark_submit_cmd.add_option("--class", main_class)
|
||||
spark_submit_cmd.add_option("--jars", jars and ",".join(jars))
|
||||
spark_submit_cmd.add_option("--py-files", py_files and ",".join(py_files))
|
||||
spark_submit_cmd.add_option("--files", files and ",".join(files))
|
||||
spark_submit_cmd.add_option("--driver-java-options", driver_java_options)
|
||||
spark_submit_cmd.add_option("--driver-library-path", driver_library_path)
|
||||
spark_submit_cmd.add_option("--driver-class-path", driver_class_path)
|
||||
spark_submit_cmd.add_option("--driver-memory", driver_memory)
|
||||
spark_submit_cmd.add_option("--executor-memory", executor_memory)
|
||||
if driver_cores:
|
||||
spark_submit_cmd.add_option('--driver-cores', str(driver_cores))
|
||||
spark_submit_cmd.add_option("--driver-cores", str(driver_cores))
|
||||
if executor_cores:
|
||||
spark_submit_cmd.add_option('--executor-cores', str(executor_cores))
|
||||
spark_submit_cmd.add_option("--executor-cores", str(executor_cores))
|
||||
|
||||
spark_submit_cmd.add_argument(
|
||||
os.path.expandvars(app) + ' ' + ' '.join(['\'' + str(app_arg) + '\'' for app_arg in (app_args or [])]))
|
||||
os.path.expandvars(app) + " " + " ".join(["'" + str(app_arg) + "'" for app_arg in (app_args or [])]))
|
||||
|
||||
with open("spark-submit.txt", mode="w", encoding="UTF-8") as stream:
|
||||
stream.write(spark_submit_cmd.to_str())
|
||||
|
@ -102,50 +117,51 @@ def __app_submit_cmd(name: str, app: str, app_args: List[str], main_class: str,
|
|||
|
||||
|
||||
def load_application(application_file_path):
|
||||
'''
|
||||
"""
|
||||
Read and parse the application from file
|
||||
'''
|
||||
with open(application_file_path, encoding='UTF-8') as f:
|
||||
"""
|
||||
with open(application_file_path, encoding="UTF-8") as f:
|
||||
application = yaml.load(f)
|
||||
return application
|
||||
|
||||
|
||||
def upload_log(blob_client, application):
|
||||
'''
|
||||
"""
|
||||
upload output.log to storage account
|
||||
'''
|
||||
log_file = os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], os.environ['SPARK_SUBMIT_LOGS_FILE'])
|
||||
"""
|
||||
log_file = os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], os.environ["SPARK_SUBMIT_LOGS_FILE"])
|
||||
upload_file_to_container(
|
||||
container_name=os.environ['STORAGE_LOGS_CONTAINER'],
|
||||
application_name=application['name'],
|
||||
container_name=os.environ["STORAGE_LOGS_CONTAINER"],
|
||||
application_name=application["name"],
|
||||
file_path=log_file,
|
||||
blob_client=blob_client,
|
||||
use_full_path=False)
|
||||
use_full_path=False,
|
||||
)
|
||||
|
||||
|
||||
def receive_submit_request(application_file_path):
|
||||
'''
|
||||
"""
|
||||
Handle the request to submit a task
|
||||
'''
|
||||
batch_client = config.batch_client
|
||||
"""
|
||||
blob_client = config.blob_client
|
||||
application = load_application(application_file_path)
|
||||
|
||||
cmd = __app_submit_cmd(
|
||||
name=application['name'],
|
||||
app=application['application'],
|
||||
app_args=application['application_args'],
|
||||
main_class=application['main_class'],
|
||||
jars=application['jars'],
|
||||
py_files=application['py_files'],
|
||||
files=application['files'],
|
||||
driver_java_options=application['driver_java_options'],
|
||||
driver_library_path=application['driver_library_path'],
|
||||
driver_class_path=application['driver_class_path'],
|
||||
driver_memory=application['driver_memory'],
|
||||
executor_memory=application['executor_memory'],
|
||||
driver_cores=application['driver_cores'],
|
||||
executor_cores=application['executor_cores'])
|
||||
name=application["name"],
|
||||
app=application["application"],
|
||||
app_args=application["application_args"],
|
||||
main_class=application["main_class"],
|
||||
jars=application["jars"],
|
||||
py_files=application["py_files"],
|
||||
files=application["files"],
|
||||
driver_java_options=application["driver_java_options"],
|
||||
driver_library_path=application["driver_library_path"],
|
||||
driver_class_path=application["driver_class_path"],
|
||||
driver_memory=application["driver_memory"],
|
||||
executor_memory=application["executor_memory"],
|
||||
driver_cores=application["driver_cores"],
|
||||
executor_cores=application["executor_cores"],
|
||||
)
|
||||
|
||||
return_code = subprocess.call(cmd.to_str(), shell=True)
|
||||
upload_log(blob_client, application)
|
||||
|
@ -157,24 +173,25 @@ def upload_error_log(error, application_file_path):
|
|||
blob_client = config.blob_client
|
||||
|
||||
error_log_path = os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "error.log")
|
||||
with open(error_log_path, "w", encoding='UTF-8') as error_log:
|
||||
with open(error_log_path, "w", encoding="UTF-8") as error_log:
|
||||
error_log.write(error)
|
||||
|
||||
upload_file_to_container(
|
||||
container_name=os.environ['STORAGE_LOGS_CONTAINER'],
|
||||
application_name=application['name'],
|
||||
container_name=os.environ["STORAGE_LOGS_CONTAINER"],
|
||||
application_name=application["name"],
|
||||
file_path=os.path.realpath(error_log.name),
|
||||
blob_client=blob_client,
|
||||
use_full_path=False)
|
||||
use_full_path=False,
|
||||
)
|
||||
upload_log(blob_client, application)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
return_code = 1
|
||||
try:
|
||||
return_code = receive_submit_request(os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], 'application.yaml'))
|
||||
return_code = receive_submit_request(os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "application.yaml"))
|
||||
except Exception as e:
|
||||
upload_error_log(str(e), os.path.join(os.environ['AZ_BATCH_TASK_WORKING_DIR'], 'application.yaml'))
|
||||
upload_error_log(str(e), os.path.join(os.environ["AZ_BATCH_TASK_WORKING_DIR"], "application.yaml"))
|
||||
|
||||
# force batch task exit code to match spark exit code
|
||||
sys.exit(return_code)
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import time
|
||||
import os
|
||||
|
||||
while not os.path.exists('/tmp/setup_complete'):
|
||||
while not os.path.exists("/tmp/setup_complete"):
|
||||
time.sleep(1)
|
||||
|
||||
print("SETUP FINISHED")
|
||||
os.remove('/tmp/setup_complete')
|
||||
os.remove("/tmp/setup_complete")
|
||||
|
|
|
@ -17,12 +17,13 @@ def generate_application_task(core_base_operations, container_id, application, r
|
|||
application_name=application.name,
|
||||
file_path=application.application,
|
||||
blob_client=core_base_operations.blob_client,
|
||||
use_full_path=False)
|
||||
use_full_path=False,
|
||||
)
|
||||
|
||||
# Upload application file
|
||||
resource_files.append(app_resource_file)
|
||||
|
||||
application.application = '$AZ_BATCH_TASK_WORKING_DIR/' + os.path.basename(application.application)
|
||||
application.application = "$AZ_BATCH_TASK_WORKING_DIR/" + os.path.basename(application.application)
|
||||
|
||||
# Upload dependent JARS
|
||||
jar_resource_file_paths = []
|
||||
|
@ -32,7 +33,8 @@ def generate_application_task(core_base_operations, container_id, application, r
|
|||
application_name=application.name,
|
||||
file_path=jar,
|
||||
blob_client=core_base_operations.blob_client,
|
||||
use_full_path=False)
|
||||
use_full_path=False,
|
||||
)
|
||||
jar_resource_file_paths.append(current_jar_resource_file_path)
|
||||
resource_files.append(current_jar_resource_file_path)
|
||||
|
||||
|
@ -44,7 +46,8 @@ def generate_application_task(core_base_operations, container_id, application, r
|
|||
application_name=application.name,
|
||||
file_path=py_file,
|
||||
blob_client=core_base_operations.blob_client,
|
||||
use_full_path=False)
|
||||
use_full_path=False,
|
||||
)
|
||||
py_files_resource_file_paths.append(current_py_files_resource_file_path)
|
||||
resource_files.append(current_py_files_resource_file_path)
|
||||
|
||||
|
@ -56,7 +59,8 @@ def generate_application_task(core_base_operations, container_id, application, r
|
|||
application_name=application.name,
|
||||
file_path=file,
|
||||
blob_client=core_base_operations.blob_client,
|
||||
use_full_path=False)
|
||||
use_full_path=False,
|
||||
)
|
||||
files_resource_file_paths.append(files_resource_file_path)
|
||||
resource_files.append(files_resource_file_path)
|
||||
|
||||
|
@ -67,21 +71,23 @@ def generate_application_task(core_base_operations, container_id, application, r
|
|||
application_definition_file = helpers.upload_text_to_container(
|
||||
container_name=container_id,
|
||||
application_name=application.name,
|
||||
file_path='application.yaml',
|
||||
file_path="application.yaml",
|
||||
content=yaml.dump(vars(application)),
|
||||
blob_client=core_base_operations.blob_client)
|
||||
blob_client=core_base_operations.blob_client,
|
||||
)
|
||||
resource_files.append(application_definition_file)
|
||||
|
||||
# create command to submit task
|
||||
task_cmd = CommandBuilder('sudo docker exec')
|
||||
task_cmd.add_argument('-i')
|
||||
task_cmd.add_option('-e', 'AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR')
|
||||
task_cmd.add_option('-e', 'STORAGE_LOGS_CONTAINER={0}'.format(container_id))
|
||||
task_cmd.add_argument('spark /bin/bash >> output.log 2>&1')
|
||||
task_cmd.add_argument('-c "source ~/.bashrc; ' \
|
||||
'export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; ' \
|
||||
'cd \$AZ_BATCH_TASK_WORKING_DIR; ' \
|
||||
'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"')
|
||||
task_cmd = CommandBuilder("sudo docker exec")
|
||||
task_cmd.add_argument("-i")
|
||||
task_cmd.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
|
||||
task_cmd.add_option("-e", "STORAGE_LOGS_CONTAINER={0}".format(container_id))
|
||||
task_cmd.add_argument("spark /bin/bash >> output.log 2>&1")
|
||||
task_cmd.add_argument(
|
||||
r'-c "source ~/.bashrc; '
|
||||
r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
|
||||
r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
|
||||
r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"')
|
||||
|
||||
# Create task
|
||||
task = batch_models.TaskAddParameter(
|
||||
|
@ -91,6 +97,7 @@ def generate_application_task(core_base_operations, container_id, application, r
|
|||
constraints=batch_models.TaskConstraints(max_task_retry_count=application.max_retry_count),
|
||||
user_identity=batch_models.UserIdentity(
|
||||
auto_user=batch_models.AutoUserSpecification(
|
||||
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)))
|
||||
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)),
|
||||
)
|
||||
|
||||
return task
|
||||
|
|
|
@ -1,14 +1,9 @@
|
|||
from typing import List
|
||||
|
||||
import azure.batch.models as batch_models
|
||||
import azure.batch.models.batch_error as batch_error
|
||||
|
||||
from aztk import error
|
||||
from aztk.internal.cluster_data import NodeData
|
||||
from aztk.spark import models
|
||||
from aztk.spark.utils import util
|
||||
from aztk.utils import constants, helpers
|
||||
from aztk.spark import models
|
||||
|
||||
POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity(
|
||||
auto_user=batch_models.AutoUserSpecification(
|
||||
|
@ -60,14 +55,13 @@ def __get_secrets_env(core_base_operations):
|
|||
]
|
||||
|
||||
|
||||
def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
|
||||
gpu_enabled: bool,
|
||||
docker_repo: str = None,
|
||||
docker_run_options: str = None,
|
||||
plugins=None,
|
||||
worker_on_master: bool = True,
|
||||
file_mounts=None,
|
||||
mixed_mode: bool = False):
|
||||
def __cluster_install_cmd(
|
||||
zip_resource_file: batch_models.ResourceFile,
|
||||
gpu_enabled: bool,
|
||||
docker_repo: str = None,
|
||||
docker_run_options: str = None,
|
||||
file_mounts=None,
|
||||
):
|
||||
"""
|
||||
For Docker on ubuntu 16.04 - return the command line
|
||||
to be run on the start task of the pool to setup spark.
|
||||
|
@ -80,41 +74,42 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
|
|||
if file_mounts:
|
||||
for mount in file_mounts:
|
||||
# Create the directory on the node
|
||||
shares.append('mkdir -p {0}'.format(mount.mount_path))
|
||||
shares.append("mkdir -p {0}".format(mount.mount_path))
|
||||
|
||||
# Mount the file share
|
||||
shares.append(
|
||||
'mount -t cifs //{0}.file.core.windows.net/{2} {3} -o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp'.
|
||||
"mount -t cifs //{0}.file.core.windows.net/{2} {3} -o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp".
|
||||
format(mount.storage_account_name, mount.storage_account_key, mount.file_share_path, mount.mount_path))
|
||||
|
||||
setup = [
|
||||
'time('\
|
||||
'apt-get -y update;'\
|
||||
'apt-get -y --no-install-recommends install unzip;'\
|
||||
'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\
|
||||
'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\
|
||||
') 2>&1'.format(zip_resource_file.file_path),
|
||||
"time("
|
||||
"apt-get -y update;"
|
||||
"apt-get -y --no-install-recommends install unzip;"
|
||||
"unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};"
|
||||
"chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;"
|
||||
") 2>&1".format(zip_resource_file.file_path),
|
||||
'/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1} "{2}"'.format(
|
||||
constants.DOCKER_SPARK_CONTAINER_NAME,
|
||||
docker_repo,
|
||||
"" if docker_run_options is None else docker_run_options.replace('"', '\\\"')
|
||||
)
|
||||
"" if docker_run_options is None else docker_run_options.replace('"', '\\"'),
|
||||
),
|
||||
]
|
||||
|
||||
commands = shares + setup
|
||||
return commands
|
||||
|
||||
|
||||
def generate_cluster_start_task(core_base_operations,
|
||||
zip_resource_file: batch_models.ResourceFile,
|
||||
cluster_id: str,
|
||||
gpu_enabled: bool,
|
||||
docker_repo: str = None,
|
||||
docker_run_options: str = None,
|
||||
file_shares: List[models.FileShare] = None,
|
||||
plugins: List[models.PluginConfiguration] = None,
|
||||
mixed_mode: bool = False,
|
||||
worker_on_master: bool = True):
|
||||
def generate_cluster_start_task(
|
||||
core_base_operations,
|
||||
zip_resource_file: batch_models.ResourceFile,
|
||||
cluster_id: str,
|
||||
gpu_enabled: bool,
|
||||
docker_repo: str = None,
|
||||
docker_run_options: str = None,
|
||||
file_shares: List[models.FileShare] = None,
|
||||
mixed_mode: bool = False,
|
||||
worker_on_master: bool = True,
|
||||
):
|
||||
"""
|
||||
This will return the start task object for the pool to be created.
|
||||
:param cluster_id str: Id of the cluster(Used for uploading the resource files)
|
||||
|
@ -130,22 +125,23 @@ def generate_cluster_start_task(core_base_operations,
|
|||
spark_submit_logs_file = constants.SPARK_SUBMIT_LOGS_FILE
|
||||
|
||||
# TODO use certificate
|
||||
environment_settings = __get_secrets_env(core_base_operations) + [
|
||||
environment_settings = (__get_secrets_env(core_base_operations) + [
|
||||
batch_models.EnvironmentSetting(name="SPARK_WEB_UI_PORT", value=spark_web_ui_port),
|
||||
batch_models.EnvironmentSetting(name="SPARK_WORKER_UI_PORT", value=spark_worker_ui_port),
|
||||
batch_models.EnvironmentSetting(name="SPARK_JOB_UI_PORT", value=spark_job_ui_port),
|
||||
batch_models.EnvironmentSetting(name="SPARK_CONTAINER_NAME", value=spark_container_name),
|
||||
batch_models.EnvironmentSetting(name="SPARK_SUBMIT_LOGS_FILE", value=spark_submit_logs_file),
|
||||
batch_models.EnvironmentSetting(name="AZTK_GPU_ENABLED", value=helpers.bool_env(gpu_enabled)),
|
||||
] + __get_docker_credentials(core_base_operations) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode)
|
||||
] + __get_docker_credentials(core_base_operations) + _get_aztk_environment(cluster_id, worker_on_master,
|
||||
mixed_mode))
|
||||
|
||||
# start task command
|
||||
command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, docker_run_options, plugins,
|
||||
worker_on_master, file_shares, mixed_mode)
|
||||
command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, docker_run_options, file_shares)
|
||||
|
||||
return batch_models.StartTask(
|
||||
command_line=helpers.wrap_commands_in_shell(command),
|
||||
resource_files=resource_files,
|
||||
environment_settings=environment_settings,
|
||||
user_identity=POOL_ADMIN_USER_IDENTITY,
|
||||
wait_for_success=True)
|
||||
wait_for_success=True,
|
||||
)
|
||||
|
|
|
@ -2,7 +2,6 @@ from typing import List
|
|||
|
||||
import azure.batch.models as batch_models
|
||||
|
||||
from aztk.client.base import BaseOperations as CoreBaseOperations
|
||||
from aztk.spark import models
|
||||
|
||||
from .helpers import generate_application_task, generate_cluster_start_task
|
||||
|
@ -12,18 +11,19 @@ class SparkBaseOperations:
|
|||
"""Spark Base operations object that all other Spark operations objects inherit from
|
||||
"""
|
||||
|
||||
#TODO: make this private or otherwise not public
|
||||
def _generate_cluster_start_task(self,
|
||||
core_base_operations,
|
||||
zip_resource_file: batch_models.ResourceFile,
|
||||
id: str,
|
||||
gpu_enabled: bool,
|
||||
docker_repo: str = None,
|
||||
docker_run_options: str = None,
|
||||
file_shares: List[models.FileShare] = None,
|
||||
plugins: List[models.PluginConfiguration] = None,
|
||||
mixed_mode: bool = False,
|
||||
worker_on_master: bool = True):
|
||||
# TODO: make this private or otherwise not public
|
||||
def _generate_cluster_start_task(
|
||||
self,
|
||||
core_base_operations,
|
||||
zip_resource_file: batch_models.ResourceFile,
|
||||
id: str,
|
||||
gpu_enabled: bool,
|
||||
docker_repo: str = None,
|
||||
docker_run_options: str = None,
|
||||
file_shares: List[models.FileShare] = None,
|
||||
mixed_mode: bool = False,
|
||||
worker_on_master: bool = True,
|
||||
):
|
||||
"""Generate the Azure Batch Start Task to provision a Spark cluster.
|
||||
|
||||
Args:
|
||||
|
@ -35,10 +35,8 @@ class SparkBaseOperations:
|
|||
If None, the default Docker image will be used. Defaults to None.
|
||||
file_shares (:obj:`aztk.spark.models.FileShare`, optional): a list of FileShares to mount on the cluster.
|
||||
Defaults to None.
|
||||
plugins (:obj:`aztk.spark.models.PluginConfiguration`, optional): a list of plugins to set up on the cluster.
|
||||
Defaults to None.
|
||||
mixed_mode (:obj:`bool`, optional): If True, the cluster is configured to use both dedicated and low priority VMs.
|
||||
Defaults to False.
|
||||
mixed_mode (:obj:`bool`, optional): If True, the cluster is configured to use both dedicated
|
||||
and low priority VMs. Defaults to False.
|
||||
worker_on_master (:obj:`bool`, optional): If True, the cluster is configured to provision a Spark worker
|
||||
on the VM that runs the Spark master. Defaults to True.
|
||||
|
||||
|
@ -46,10 +44,18 @@ class SparkBaseOperations:
|
|||
:obj:`azure.batch.models.StartTask`: the StartTask definition to provision the cluster.
|
||||
"""
|
||||
return generate_cluster_start_task.generate_cluster_start_task(
|
||||
core_base_operations, zip_resource_file, id, gpu_enabled, docker_repo, docker_run_options, file_shares,
|
||||
plugins, mixed_mode, worker_on_master)
|
||||
core_base_operations,
|
||||
zip_resource_file,
|
||||
id,
|
||||
gpu_enabled,
|
||||
docker_repo,
|
||||
docker_run_options,
|
||||
file_shares,
|
||||
mixed_mode,
|
||||
worker_on_master,
|
||||
)
|
||||
|
||||
#TODO: make this private or otherwise not public
|
||||
# TODO: make this private or otherwise not public
|
||||
def _generate_application_task(self, core_base_operations, container_id, application, remote=False):
|
||||
"""Generate the Azure Batch Start Task to provision a Spark cluster.
|
||||
|
||||
|
|
|
@ -2,21 +2,15 @@ from typing import List
|
|||
|
||||
import azure.batch.models.batch_error as batch_error
|
||||
|
||||
import aztk
|
||||
from aztk import error
|
||||
from aztk import models as base_models
|
||||
from aztk.client import CoreClient
|
||||
from aztk.internal.cluster_data import NodeData
|
||||
from aztk.spark import models
|
||||
from aztk.spark.client.cluster import ClusterOperations
|
||||
from aztk.spark.client.job import JobOperations
|
||||
from aztk.spark.helpers import cluster_diagnostic_helper
|
||||
from aztk.spark.helpers import create_cluster as create_cluster_helper
|
||||
from aztk.spark.helpers import get_log as get_log_helper
|
||||
from aztk.spark.helpers import job_submission as job_submit_helper
|
||||
from aztk.spark.helpers import submit as cluster_submit_helper
|
||||
from aztk.spark.utils import util
|
||||
from aztk.utils import azure_api, deprecated, deprecate, helpers
|
||||
from aztk.utils import deprecate, deprecated, helpers
|
||||
|
||||
|
||||
class Client(CoreClient):
|
||||
|
@ -28,13 +22,14 @@ class Client(CoreClient):
|
|||
"""
|
||||
|
||||
def __init__(self, secrets_configuration: models.SecretsConfiguration = None, **kwargs):
|
||||
self.secrets_configuration = None
|
||||
super().__init__()
|
||||
context = None
|
||||
if kwargs.get("secrets_config"):
|
||||
deprecate(
|
||||
version="0.10.0",
|
||||
message="secrets_config key is deprecated in secrets.yaml",
|
||||
advice="Please use secrets_configuration key instead.")
|
||||
advice="Please use secrets_configuration key instead.",
|
||||
)
|
||||
context = self._get_context(kwargs.get("secrets_config"))
|
||||
else:
|
||||
context = self._get_context(secrets_configuration)
|
||||
|
@ -133,36 +128,42 @@ class Client(CoreClient):
|
|||
id=cluster_id, node_id=node_id, command=command, host=host, internal=internal, timeout=timeout)
|
||||
|
||||
@deprecated("0.10.0")
|
||||
def cluster_copy(self,
|
||||
cluster_id: str,
|
||||
source_path: str,
|
||||
destination_path: str,
|
||||
host: bool = False,
|
||||
internal: bool = False,
|
||||
timeout: int = None):
|
||||
def cluster_copy(
|
||||
self,
|
||||
cluster_id: str,
|
||||
source_path: str,
|
||||
destination_path: str,
|
||||
host: bool = False,
|
||||
internal: bool = False,
|
||||
timeout: int = None,
|
||||
):
|
||||
return self.cluster.copy(
|
||||
id=cluster_id,
|
||||
source_path=source_path,
|
||||
destination_path=destination_path,
|
||||
host=host,
|
||||
internal=internal,
|
||||
timeout=timeout)
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
@deprecated("0.10.0")
|
||||
def cluster_download(self,
|
||||
cluster_id: str,
|
||||
source_path: str,
|
||||
destination_path: str = None,
|
||||
host: bool = False,
|
||||
internal: bool = False,
|
||||
timeout: int = None):
|
||||
def cluster_download(
|
||||
self,
|
||||
cluster_id: str,
|
||||
source_path: str,
|
||||
destination_path: str = None,
|
||||
host: bool = False,
|
||||
internal: bool = False,
|
||||
timeout: int = None,
|
||||
):
|
||||
return self.cluster.download(
|
||||
id=cluster_id,
|
||||
source_path=source_path,
|
||||
destination_path=destination_path,
|
||||
host=host,
|
||||
internal=internal,
|
||||
timeout=timeout)
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
@deprecated("0.10.0")
|
||||
def cluster_ssh_into_master(self,
|
||||
|
@ -176,9 +177,9 @@ class Client(CoreClient):
|
|||
return self.cluster._core_cluster_operations.ssh_into_node(cluster_id, node_id, username, ssh_key, password,
|
||||
port_forward_list, internal)
|
||||
|
||||
'''
|
||||
"""
|
||||
job submission
|
||||
'''
|
||||
"""
|
||||
|
||||
@deprecated("0.10.0")
|
||||
def submit_job(self, job_configuration: models.JobConfiguration):
|
||||
|
|
|
@ -4,15 +4,17 @@ from aztk import error
|
|||
from aztk.utils import helpers
|
||||
|
||||
|
||||
def cluster_copy(core_cluster_operations,
|
||||
cluster_id: str,
|
||||
source_path: str,
|
||||
destination_path: str,
|
||||
host: bool = False,
|
||||
internal: bool = False,
|
||||
timeout: int = None):
|
||||
def cluster_copy(
|
||||
core_cluster_operations,
|
||||
cluster_id: str,
|
||||
source_path: str,
|
||||
destination_path: str,
|
||||
host: bool = False,
|
||||
internal: bool = False,
|
||||
timeout: int = None,
|
||||
):
|
||||
try:
|
||||
container_name = None if host else 'spark'
|
||||
container_name = None if host else "spark"
|
||||
return core_cluster_operations.copy(
|
||||
cluster_id,
|
||||
source_path,
|
||||
|
@ -20,6 +22,7 @@ def cluster_copy(core_cluster_operations,
|
|||
container_name=container_name,
|
||||
get=False,
|
||||
internal=internal,
|
||||
timeout=timeout)
|
||||
timeout=timeout,
|
||||
)
|
||||
except batch_error.BatchErrorException as e:
|
||||
raise error.AztkError(helpers.format_batch_exception(e))
|
||||
|
|
|
@ -52,9 +52,16 @@ def create_cluster(core_cluster_operations,
|
|||
zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()
|
||||
|
||||
start_task = spark_cluster_operations._generate_cluster_start_task(
|
||||
core_cluster_operations, zip_resource_files, cluster_conf.cluster_id, cluster_conf.gpu_enabled(),
|
||||
cluster_conf.get_docker_repo(), cluster_conf.get_docker_run_options(), cluster_conf.file_shares,
|
||||
cluster_conf.plugins, cluster_conf.mixed_mode(), cluster_conf.worker_on_master)
|
||||
core_cluster_operations,
|
||||
zip_resource_files,
|
||||
cluster_conf.cluster_id,
|
||||
cluster_conf.gpu_enabled(),
|
||||
cluster_conf.get_docker_repo(),
|
||||
cluster_conf.get_docker_run_options(),
|
||||
cluster_conf.file_shares,
|
||||
cluster_conf.mixed_mode(),
|
||||
cluster_conf.worker_on_master,
|
||||
)
|
||||
|
||||
software_metadata_key = base_models.Software.spark
|
||||
|
||||
|
|
|
@ -4,12 +4,14 @@ from aztk import error
|
|||
from aztk.utils import helpers
|
||||
|
||||
|
||||
def create_user(core_cluster_operations,
|
||||
spark_cluster_operations,
|
||||
cluster_id: str,
|
||||
username: str,
|
||||
password: str = None,
|
||||
ssh_key: str = None) -> str:
|
||||
def create_user(
|
||||
core_cluster_operations,
|
||||
spark_cluster_operations,
|
||||
cluster_id: str,
|
||||
username: str,
|
||||
password: str = None,
|
||||
ssh_key: str = None,
|
||||
) -> str:
|
||||
try:
|
||||
cluster = spark_cluster_operations.get(cluster_id)
|
||||
master_node_id = cluster.master_node_id
|
||||
|
|
|
@ -6,18 +6,13 @@ from aztk import error
|
|||
from aztk.utils import helpers
|
||||
|
||||
|
||||
def _write_error(stream, node_output):
|
||||
stream.write(node_output.error)
|
||||
|
||||
|
||||
def _write_output(stream, node_output):
|
||||
stream.write(node_output.output)
|
||||
|
||||
|
||||
def _run(spark_cluster_operations, cluster_id, output_directory=None, brief=False):
|
||||
# copy debug program to each node
|
||||
output = spark_cluster_operations.copy(
|
||||
copy_output = spark_cluster_operations.copy(
|
||||
cluster_id, os.path.abspath("./aztk/spark/utils/debug.py"), "/tmp/debug.py", host=True)
|
||||
for node_output in copy_output:
|
||||
if node_output.error:
|
||||
raise error.AztkError("Failed to copy diagnostic script to cluster.")
|
||||
ssh_cmd = _build_diagnostic_ssh_command(brief)
|
||||
run_output = spark_cluster_operations.run(cluster_id, ssh_cmd, host=True)
|
||||
remote_path = "/tmp/debug.zip"
|
||||
|
@ -27,9 +22,9 @@ def _run(spark_cluster_operations, cluster_id, output_directory=None, brief=Fals
|
|||
result = spark_cluster_operations.download(cluster_id, remote_path, local_path, host=True)
|
||||
|
||||
# write run output or error to debug/ directory
|
||||
with open(os.path.join(output_directory, "debug-output.txt"), 'w', encoding="UTF-8") as stream:
|
||||
with open(os.path.join(output_directory, "debug-output.txt"), "w", encoding="UTF-8") as stream:
|
||||
for node_output in run_output:
|
||||
_write_error(stream, node_output) if node_output.error else _write_output(stream, node_output)
|
||||
stream.write(node_output.error) if node_output.error else stream.write(node_output.output)
|
||||
else:
|
||||
result = spark_cluster_operations.download(cluster_id, remote_path, host=True)
|
||||
|
||||
|
@ -37,11 +32,11 @@ def _run(spark_cluster_operations, cluster_id, output_directory=None, brief=Fals
|
|||
|
||||
|
||||
def _build_diagnostic_ssh_command(brief):
|
||||
return "sudo rm -rf /tmp/debug.zip; "\
|
||||
"sudo apt-get install -y python3-pip; "\
|
||||
"sudo -H pip3 install --upgrade pip; "\
|
||||
"sudo -H pip3 install docker; "\
|
||||
"sudo python3 /tmp/debug.py {}".format(brief)
|
||||
return ("sudo rm -rf /tmp/debug.zip; "
|
||||
"sudo apt-get install -y python3-pip; "
|
||||
"sudo -H pip3 install --upgrade pip; "
|
||||
"sudo -H pip3 install docker; "
|
||||
"sudo python3 /tmp/debug.py {}".format(brief))
|
||||
|
||||
|
||||
def run_cluster_diagnostics(spark_cluster_operations, cluster_id, output_directory=None, brief=False):
|
||||
|
|
|
@ -4,15 +4,17 @@ from aztk import error
|
|||
from aztk.utils import helpers
|
||||
|
||||
|
||||
def cluster_download(core_cluster_operations,
|
||||
cluster_id: str,
|
||||
source_path: str,
|
||||
destination_path: str = None,
|
||||
host: bool = False,
|
||||
internal: bool = False,
|
||||
timeout: int = None):
|
||||
def cluster_download(
|
||||
core_cluster_operations,
|
||||
cluster_id: str,
|
||||
source_path: str,
|
||||
destination_path: str = None,
|
||||
host: bool = False,
|
||||
internal: bool = False,
|
||||
timeout: int = None,
|
||||
):
|
||||
try:
|
||||
container_name = None if host else 'spark'
|
||||
container_name = None if host else "spark"
|
||||
return core_cluster_operations.copy(
|
||||
cluster_id,
|
||||
source_path,
|
||||
|
@ -20,6 +22,7 @@ def cluster_download(core_cluster_operations,
|
|||
container_name=container_name,
|
||||
get=True,
|
||||
internal=internal,
|
||||
timeout=timeout)
|
||||
timeout=timeout,
|
||||
)
|
||||
except batch_error.BatchErrorException as e:
|
||||
raise error.AztkError(helpers.format_batch_exception(e))
|
||||
|
|
|
@ -7,6 +7,6 @@ from aztk.utils import helpers
|
|||
def get_application_status(core_cluster_operations, cluster_id: str, app_name: str):
|
||||
try:
|
||||
task = core_cluster_operations.batch_client.task.get(cluster_id, app_name)
|
||||
return task.state._value_
|
||||
return task.state.name
|
||||
except batch_error.BatchErrorException as e:
|
||||
raise error.AztkError(helpers.format_batch_exception(e))
|
||||
|
|
|
@ -4,15 +4,17 @@ from aztk import error
|
|||
from aztk.utils import helpers
|
||||
|
||||
|
||||
def node_run(core_cluster_operations,
|
||||
cluster_id: str,
|
||||
node_id: str,
|
||||
command: str,
|
||||
host=False,
|
||||
internal: bool = False,
|
||||
timeout=None):
|
||||
def node_run(
|
||||
core_cluster_operations,
|
||||
cluster_id: str,
|
||||
node_id: str,
|
||||
command: str,
|
||||
host=False,
|
||||
internal: bool = False,
|
||||
timeout=None,
|
||||
):
|
||||
try:
|
||||
return core_cluster_operations.node_run(
|
||||
cluster_id, node_id, command, internal, container_name='spark' if not host else None, timeout=timeout)
|
||||
cluster_id, node_id, command, internal, container_name="spark" if not host else None, timeout=timeout)
|
||||
except batch_error.BatchErrorException as e:
|
||||
raise error.AztkError(helpers.format_batch_exception(e))
|
||||
|
|
|
@ -12,6 +12,6 @@ def cluster_run(core_cluster_operations,
|
|||
timeout=None):
|
||||
try:
|
||||
return core_cluster_operations.run(
|
||||
cluster_id, command, internal, container_name='spark' if not host else None, timeout=timeout)
|
||||
cluster_id, command, internal, container_name="spark" if not host else None, timeout=timeout)
|
||||
except batch_error.BatchErrorException as e:
|
||||
raise error.AztkError(helpers.format_batch_exception(e))
|
||||
|
|
|
@ -4,16 +4,19 @@ from aztk import error
|
|||
from aztk.utils import helpers
|
||||
|
||||
|
||||
def cluster_ssh_into_master(spark_cluster_operations,
|
||||
cluster_id,
|
||||
node_id,
|
||||
username,
|
||||
ssh_key=None,
|
||||
password=None,
|
||||
port_forward_list=None,
|
||||
internal=False):
|
||||
def ssh_into_master(
|
||||
spark_cluster_operations,
|
||||
core_cluster_operations,
|
||||
cluster_id,
|
||||
username,
|
||||
ssh_key=None,
|
||||
password=None,
|
||||
port_forward_list=None,
|
||||
internal=False,
|
||||
):
|
||||
try:
|
||||
spark_cluster_operations.ssh_into_node(cluster_id, node_id, username, ssh_key, password, port_forward_list,
|
||||
internal)
|
||||
master_node_id = spark_cluster_operations.get(cluster_id).master_node_id
|
||||
core_cluster_operations.ssh_into_node(cluster_id, master_node_id, username, ssh_key, password,
|
||||
port_forward_list, internal)
|
||||
except batch_error.BatchErrorException as e:
|
||||
raise error.AztkError(helpers.format_batch_exception(e))
|
||||
|
|
|
@ -42,12 +42,14 @@ def submit_application(core_cluster_operations,
|
|||
job_id=job_id, task_id=task.id, batch_client=core_cluster_operations.batch_client)
|
||||
|
||||
|
||||
def submit(core_cluster_operations,
|
||||
spark_cluster_operations,
|
||||
cluster_id: str,
|
||||
application: models.ApplicationConfiguration,
|
||||
remote: bool = False,
|
||||
wait: bool = False):
|
||||
def submit(
|
||||
core_cluster_operations,
|
||||
spark_cluster_operations,
|
||||
cluster_id: str,
|
||||
application: models.ApplicationConfiguration,
|
||||
remote: bool = False,
|
||||
wait: bool = False,
|
||||
):
|
||||
try:
|
||||
submit_application(core_cluster_operations, spark_cluster_operations, cluster_id, application, remote, wait)
|
||||
except batch_error.BatchErrorException as e:
|
||||
|
|
|
@ -2,9 +2,25 @@ from aztk.client.cluster import CoreClusterOperations
|
|||
from aztk.spark import models
|
||||
from aztk.spark.client.base import SparkBaseOperations
|
||||
|
||||
from .helpers import (copy, create, create_user, delete, diagnostics, download, get, get_application_log,
|
||||
get_application_status, get_configuration, get_remote_login_settings, list, node_run, run, submit,
|
||||
wait)
|
||||
from .helpers import (
|
||||
copy,
|
||||
create,
|
||||
create_user,
|
||||
delete,
|
||||
diagnostics,
|
||||
download,
|
||||
get,
|
||||
get_application_log,
|
||||
get_application_status,
|
||||
get_configuration,
|
||||
get_remote_login_settings,
|
||||
list,
|
||||
node_run,
|
||||
run,
|
||||
ssh_into_master,
|
||||
submit,
|
||||
wait,
|
||||
)
|
||||
|
||||
|
||||
class ClusterOperations(SparkBaseOperations):
|
||||
|
@ -58,7 +74,8 @@ class ClusterOperations(SparkBaseOperations):
|
|||
"""List all clusters.
|
||||
|
||||
Returns:
|
||||
:obj:`List[aztk.spark.models.Cluster]`: List of Cluster objects each representing the state and configuration of the cluster.
|
||||
:obj:`List[aztk.spark.models.Cluster]`: List of Cluster objects each representing the state
|
||||
and configuration of the cluster.
|
||||
"""
|
||||
return list.list_clusters(self._core_cluster_operations)
|
||||
|
||||
|
@ -71,7 +88,8 @@ class ClusterOperations(SparkBaseOperations):
|
|||
remote (:obj:`bool`): If True, the application file will not be uploaded, it is assumed to be reachable
|
||||
by the cluster already. This is useful when your application is stored in a mounted Azure File Share
|
||||
and not the client. Defaults to False.
|
||||
wait (:obj:`bool`, optional): If True, this function blocks until the application has completed. Defaults to False.
|
||||
wait (:obj:`bool`, optional): If True, this function blocks until the application has completed.
|
||||
Defaults to False.
|
||||
|
||||
Returns:
|
||||
:obj:`None`
|
||||
|
@ -84,7 +102,8 @@ class ClusterOperations(SparkBaseOperations):
|
|||
Args:
|
||||
username (:obj:`str`): name of the user to create.
|
||||
pool_id (:obj:`str`): id of the cluster to create the user on.
|
||||
ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None.
|
||||
ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password.
|
||||
Defaults to None.
|
||||
password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None.
|
||||
|
||||
Returns:
|
||||
|
@ -118,7 +137,8 @@ class ClusterOperations(SparkBaseOperations):
|
|||
Defaults to None.
|
||||
|
||||
Returns:
|
||||
:obj:`List[aztk.spark.models.NodeOutput]`: list of NodeOutput objects containing the output of the run command
|
||||
:obj:`List[aztk.spark.models.NodeOutput]`:
|
||||
list of NodeOutput objects containing the output of the run command
|
||||
"""
|
||||
return run.cluster_run(self._core_cluster_operations, id, command, host, internal, timeout)
|
||||
|
||||
|
@ -141,13 +161,15 @@ class ClusterOperations(SparkBaseOperations):
|
|||
"""
|
||||
return node_run.node_run(self._core_cluster_operations, id, node_id, command, host, internal, timeout)
|
||||
|
||||
def copy(self,
|
||||
id: str,
|
||||
source_path: str,
|
||||
destination_path: str,
|
||||
host: bool = False,
|
||||
internal: bool = False,
|
||||
timeout: int = None):
|
||||
def copy(
|
||||
self,
|
||||
id: str,
|
||||
source_path: str,
|
||||
destination_path: str,
|
||||
host: bool = False,
|
||||
internal: bool = False,
|
||||
timeout: int = None,
|
||||
):
|
||||
"""Copy a file to every node in a cluster.
|
||||
|
||||
Args:
|
||||
|
@ -162,18 +184,21 @@ class ClusterOperations(SparkBaseOperations):
|
|||
Defaults to None.
|
||||
|
||||
Returns:
|
||||
:obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command.
|
||||
:obj:`List[aztk.spark.models.NodeOutput]`:
|
||||
A list of NodeOutput objects representing the output of the copy command.
|
||||
"""
|
||||
return copy.cluster_copy(self._core_cluster_operations, id, source_path, destination_path, host, internal,
|
||||
timeout)
|
||||
|
||||
def download(self,
|
||||
id: str,
|
||||
source_path: str,
|
||||
destination_path: str = None,
|
||||
host: bool = False,
|
||||
internal: bool = False,
|
||||
timeout: int = None):
|
||||
def download(
|
||||
self,
|
||||
id: str,
|
||||
source_path: str,
|
||||
destination_path: str = None,
|
||||
host: bool = False,
|
||||
internal: bool = False,
|
||||
timeout: int = None,
|
||||
):
|
||||
"""Download a file from every node in a cluster.
|
||||
|
||||
Args:
|
||||
|
@ -190,7 +215,8 @@ class ClusterOperations(SparkBaseOperations):
|
|||
Defaults to None.
|
||||
|
||||
Returns:
|
||||
:obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command.
|
||||
:obj:`List[aztk.spark.models.NodeOutput]`:
|
||||
A list of NodeOutput objects representing the output of the copy command.
|
||||
"""
|
||||
return download.cluster_download(self._core_cluster_operations, id, source_path, destination_path, host,
|
||||
internal, timeout)
|
||||
|
@ -205,7 +231,8 @@ class ClusterOperations(SparkBaseOperations):
|
|||
written to this path. Defaults to None.
|
||||
|
||||
Returns:
|
||||
:obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command.
|
||||
:obj:`List[aztk.spark.models.NodeOutput]`:
|
||||
A list of NodeOutput objects representing the output of the copy command.
|
||||
"""
|
||||
return diagnostics.run_cluster_diagnostics(self, id, output_directory, brief)
|
||||
|
||||
|
@ -215,10 +242,11 @@ class ClusterOperations(SparkBaseOperations):
|
|||
Args:
|
||||
id (:obj:`str`): the id of the cluster to run the command on.
|
||||
application_name (:obj:`str`): str
|
||||
tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes. Otherwise, the whole log will be retrieved.
|
||||
Only use this if streaming the log as it is being written. Defaults to False.
|
||||
current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes are retrieved.
|
||||
Only useful is streaming the log as it is being written. Only used if tail is True.
|
||||
tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes.
|
||||
Otherwise, the whole log will be retrieved. Only use this if streaming the log as it is being written.
|
||||
Defaults to False.
|
||||
current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes are
|
||||
retrieved. Only useful is streaming the log as it is being written. Only used if tail is True.
|
||||
|
||||
Returns:
|
||||
:obj:`aztk.spark.models.ApplicationLog`: a model representing the output of the application.
|
||||
|
@ -234,7 +262,8 @@ class ClusterOperations(SparkBaseOperations):
|
|||
node_id (:obj:`str`): the id of the node in the cluster
|
||||
|
||||
Returns:
|
||||
:obj:`aztk.spark.models.RemoteLogin`: Object that contains the ip address and port combination to login to a node
|
||||
:obj:`aztk.spark.models.RemoteLogin`:
|
||||
Object that contains the ip address and port combination to login to a node
|
||||
"""
|
||||
return get_remote_login_settings.get_remote_login_settings(self._core_cluster_operations, id, node_id)
|
||||
|
||||
|
@ -260,3 +289,21 @@ class ClusterOperations(SparkBaseOperations):
|
|||
:obj:`aztk.spark.models.ClusterConfiguration`
|
||||
"""
|
||||
return get_configuration.get_configuration(self._core_cluster_operations, id)
|
||||
|
||||
def ssh_into_master(self, id, username, ssh_key=None, password=None, port_forward_list=None, internal=False):
|
||||
"""Open an SSH tunnel to the Spark master node and forward the specified ports
|
||||
|
||||
Args:
|
||||
id (:obj:`str`): the id of the cluster
|
||||
username (:obj:`str`): the name of the user to open the ssh session with
|
||||
ssh_key (:obj:`str`, optional): the ssh_key to authenticate the ssh user with.
|
||||
Must specify either `ssh_key` or `password`.
|
||||
password (:obj:`str`, optional): the password to authenticate the ssh user with.
|
||||
Must specify either `password` or `ssh_key`.
|
||||
port_forward_list (:obj:`aztk.spark.models.PortForwardingSpecification`, optional):
|
||||
List of the ports to forward.
|
||||
internal (:obj:`str`, optional): if True, this will connect to the node using its internal IP.
|
||||
Only use this if running within the same VNET as the cluster. Defaults to False.
|
||||
"""
|
||||
return ssh_into_master.ssh_into_master(self, self._core_cluster_operations, id, username, ssh_key, password,
|
||||
port_forward_list, internal)
|
||||
|
|
|
@ -2,7 +2,6 @@ import azure.batch.models as batch_models
|
|||
import azure.batch.models.batch_error as batch_error
|
||||
|
||||
from aztk import error
|
||||
from aztk.spark import models
|
||||
from aztk.utils import helpers
|
||||
|
||||
from .get_recent_job import get_recent_job
|
||||
|
|
|
@ -5,7 +5,6 @@ from aztk import error
|
|||
from aztk.spark import models
|
||||
from aztk.utils import helpers
|
||||
|
||||
from .list_applications import list_applications
|
||||
from .get_recent_job import get_recent_job
|
||||
|
||||
|
||||
|
@ -25,8 +24,11 @@ def _get_application_log(core_job_operations, spark_job_operations, job_id, appl
|
|||
raise error.AztkError("The application {0} has not yet been created.".format(application))
|
||||
raise error.AztkError("The application {0} does not exist".format(application_name))
|
||||
else:
|
||||
if task.state in (batch_models.TaskState.active, batch_models.TaskState.running,
|
||||
batch_models.TaskState.preparing):
|
||||
if task.state in (
|
||||
batch_models.TaskState.active,
|
||||
batch_models.TaskState.running,
|
||||
batch_models.TaskState.preparing,
|
||||
):
|
||||
raise error.AztkError("The application {0} has not yet finished executing.".format(application_name))
|
||||
|
||||
return core_job_operations.get_application_log(job_id, application_name)
|
||||
|
|
|
@ -13,7 +13,7 @@ def _list_applications(core_job_operations, job_id):
|
|||
applications = {}
|
||||
for metadata_item in recent_run_job.metadata:
|
||||
if metadata_item.name == "applications":
|
||||
for app_name in metadata_item.value.split('\n'):
|
||||
for app_name in metadata_item.value.split("\n"):
|
||||
applications[app_name] = None
|
||||
|
||||
# get tasks from Batch job
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import azure.batch.models.batch_error as batch_error
|
||||
|
||||
from aztk import error
|
||||
from aztk.spark import models
|
||||
from aztk.utils import helpers
|
||||
|
||||
from .get_recent_job import get_recent_job
|
||||
|
|
|
@ -1,8 +1,5 @@
|
|||
import azure.batch.models.batch_error as batch_error
|
||||
|
||||
from aztk import error
|
||||
from aztk.spark import models
|
||||
from aztk.utils import helpers
|
||||
from .get_recent_job import get_recent_job
|
||||
|
||||
|
||||
|
|
|
@ -15,11 +15,12 @@ def __app_cmd():
|
|||
docker_exec.add_argument("-i")
|
||||
docker_exec.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
|
||||
docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID")
|
||||
docker_exec.add_argument("spark /bin/bash >> output.log 2>&1 -c \"" \
|
||||
"source ~/.bashrc; " \
|
||||
"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; " \
|
||||
"cd \$AZ_BATCH_TASK_WORKING_DIR; " \
|
||||
"\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py\"")
|
||||
docker_exec.add_argument(
|
||||
r'spark /bin/bash >> output.log 2>&1 -c "'
|
||||
r"source ~/.bashrc; "
|
||||
r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
|
||||
r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
|
||||
r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py"')
|
||||
return docker_exec.to_str()
|
||||
|
||||
|
||||
|
@ -28,10 +29,11 @@ def generate_job_manager_task(core_job_operations, job, application_tasks):
|
|||
for application, task in application_tasks:
|
||||
task_definition_resource_file = helpers.upload_text_to_container(
|
||||
container_name=job.id,
|
||||
application_name=application.name + '.yaml',
|
||||
file_path=application.name + '.yaml',
|
||||
application_name=application.name + ".yaml",
|
||||
file_path=application.name + ".yaml",
|
||||
content=yaml.dump(task),
|
||||
blob_client=core_job_operations.blob_client)
|
||||
blob_client=core_job_operations.blob_client,
|
||||
)
|
||||
resource_files.append(task_definition_resource_file)
|
||||
|
||||
task_cmd = __app_cmd()
|
||||
|
@ -45,7 +47,8 @@ def generate_job_manager_task(core_job_operations, job, application_tasks):
|
|||
allow_low_priority_node=True,
|
||||
user_identity=batch_models.UserIdentity(
|
||||
auto_user=batch_models.AutoUserSpecification(
|
||||
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)))
|
||||
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)),
|
||||
)
|
||||
|
||||
return task
|
||||
|
||||
|
@ -83,24 +86,24 @@ def submit_job(core_job_operations,
|
|||
job_configuration.get_docker_repo(),
|
||||
job_configuration.get_docker_run_options(),
|
||||
mixed_mode=job_configuration.mixed_mode(),
|
||||
worker_on_master=job_configuration.worker_on_master)
|
||||
worker_on_master=job_configuration.worker_on_master,
|
||||
)
|
||||
|
||||
application_tasks = []
|
||||
for application in job_configuration.applications:
|
||||
application_tasks.append((application,
|
||||
spark_job_operations._generate_application_task(
|
||||
core_job_operations, job_configuration.id, application)))
|
||||
application_tasks.append((
|
||||
application,
|
||||
spark_job_operations._generate_application_task(core_job_operations, job_configuration.id, application),
|
||||
))
|
||||
|
||||
job_manager_task = generate_job_manager_task(core_job_operations, job_configuration, application_tasks)
|
||||
|
||||
software_metadata_key = base_models.Software.spark
|
||||
|
||||
vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04')
|
||||
vm_image = models.VmImage(publisher="Canonical", offer="UbuntuServer", sku="16.04")
|
||||
|
||||
autoscale_formula = "$TargetDedicatedNodes = {0}; " \
|
||||
"$TargetLowPriorityNodes = {1}".format(
|
||||
job_configuration.max_dedicated_nodes,
|
||||
job_configuration.max_low_pri_nodes)
|
||||
autoscale_formula = "$TargetDedicatedNodes = {0}; " "$TargetLowPriorityNodes = {1}".format(
|
||||
job_configuration.max_dedicated_nodes, job_configuration.max_low_pri_nodes)
|
||||
|
||||
job = core_job_operations.submit(
|
||||
job_configuration=job_configuration,
|
||||
|
@ -109,7 +112,8 @@ def submit_job(core_job_operations,
|
|||
autoscale_formula=autoscale_formula,
|
||||
software_metadata_key=software_metadata_key,
|
||||
vm_image_model=vm_image,
|
||||
application_metadata='\n'.join(application.name for application in (job_configuration.applications or [])))
|
||||
application_metadata="\n".join(application.name for application in (job_configuration.applications or [])),
|
||||
)
|
||||
|
||||
if wait:
|
||||
spark_job_operations.wait(id=job_configuration.id)
|
||||
|
|
|
@ -2,8 +2,18 @@ from aztk.client.job import CoreJobOperations
|
|||
from aztk.spark import models
|
||||
from aztk.spark.client.base import SparkBaseOperations
|
||||
|
||||
from .helpers import (delete, get, get_application, get_application_log, list, list_applications, stop,
|
||||
stop_application, submit, wait_until_complete)
|
||||
from .helpers import (
|
||||
delete,
|
||||
get,
|
||||
get_application,
|
||||
get_application_log,
|
||||
list,
|
||||
list_applications,
|
||||
stop,
|
||||
stop_application,
|
||||
submit,
|
||||
wait_until_complete,
|
||||
)
|
||||
|
||||
|
||||
class JobOperations(SparkBaseOperations):
|
||||
|
|
|
@ -1,8 +1,4 @@
|
|||
import os
|
||||
from aztk.utils import ssh
|
||||
from aztk.utils.command_builder import CommandBuilder
|
||||
from aztk import models as aztk_models
|
||||
import azure.batch.models as batch_models
|
||||
|
||||
|
||||
def run(spark_client, cluster_id, output_directory=None):
|
||||
|
@ -17,8 +13,8 @@ def run(spark_client, cluster_id, output_directory=None):
|
|||
output = spark_client.cluster_download(cluster_id, remote_path, local_path, host=True)
|
||||
|
||||
# write run output to debug/ directory
|
||||
with open(os.path.join(os.path.dirname(local_path), "debug-output.txt"), 'w', encoding="UTF-8") as f:
|
||||
[f.write(line + '\n') for node_output in run_output for line in node_output.output]
|
||||
with open(os.path.join(os.path.dirname(local_path), "debug-output.txt"), "w", encoding="UTF-8") as f:
|
||||
[f.write(line + "\n") for node_output in run_output for line in node_output.output]
|
||||
else:
|
||||
output = spark_client.cluster_download(cluster_id, remote_path, host=True)
|
||||
|
||||
|
@ -26,8 +22,4 @@ def run(spark_client, cluster_id, output_directory=None):
|
|||
|
||||
|
||||
def _build_diagnostic_ssh_command():
|
||||
return "sudo rm -rf /tmp/debug.zip; "\
|
||||
"sudo apt-get install -y python3-pip; "\
|
||||
"sudo -H pip3 install --upgrade pip; "\
|
||||
"sudo -H pip3 install docker; "\
|
||||
"sudo python3 /tmp/debug.py"
|
||||
return "sudo rm -rf /tmp/debug.zip; " "sudo apt-get install -y python3-pip; " "sudo -H pip3 install --upgrade pip; " "sudo -H pip3 install docker; " "sudo python3 /tmp/debug.py"
|
||||
|
|
|
@ -1,9 +1,7 @@
|
|||
from typing import List
|
||||
from aztk.utils.command_builder import CommandBuilder
|
||||
from aztk.utils import helpers
|
||||
from aztk.utils import constants
|
||||
from aztk import models as aztk_models
|
||||
from aztk.spark.models import ClusterConfiguration
|
||||
import azure.batch.models as batch_models
|
||||
|
||||
POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity(
|
||||
|
@ -56,14 +54,16 @@ def __get_secrets_env(spark_client):
|
|||
]
|
||||
|
||||
|
||||
def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
|
||||
gpu_enabled: bool,
|
||||
docker_repo: str = None,
|
||||
docker_run_options: str = None,
|
||||
plugins=None,
|
||||
worker_on_master: bool = True,
|
||||
file_mounts=None,
|
||||
mixed_mode: bool = False):
|
||||
def __cluster_install_cmd(
|
||||
zip_resource_file: batch_models.ResourceFile,
|
||||
gpu_enabled: bool,
|
||||
docker_repo: str = None,
|
||||
docker_run_options: str = None,
|
||||
plugins=None,
|
||||
worker_on_master: bool = True,
|
||||
file_mounts=None,
|
||||
mixed_mode: bool = False,
|
||||
):
|
||||
"""
|
||||
For Docker on ubuntu 16.04 - return the command line
|
||||
to be run on the start task of the pool to setup spark.
|
||||
|
@ -77,41 +77,41 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
|
|||
if file_mounts:
|
||||
for mount in file_mounts:
|
||||
# Create the directory on the node
|
||||
shares.append('mkdir -p {0}'.format(mount.mount_path))
|
||||
shares.append("mkdir -p {0}".format(mount.mount_path))
|
||||
|
||||
# Mount the file share
|
||||
shares.append(
|
||||
'mount -t cifs //{0}.file.core.windows.net/{2} {3} -o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp'.
|
||||
format(mount.storage_account_name, mount.storage_account_key, mount.file_share_path, mount.mount_path))
|
||||
shares.append("mount -t cifs //{0}.file.core.windows.net/{2} {3} "
|
||||
"-o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp".format(
|
||||
mount.storage_account_name, mount.storage_account_key, mount.file_share_path,
|
||||
mount.mount_path))
|
||||
|
||||
setup = [
|
||||
'time('\
|
||||
'apt-get -y update;'\
|
||||
'apt-get -y --no-install-recommends install unzip;'\
|
||||
'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\
|
||||
'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\
|
||||
') 2>&1'.format(zip_resource_file.file_path),
|
||||
"time("
|
||||
"apt-get -y update;"
|
||||
"apt-get -y --no-install-recommends install unzip;"
|
||||
"unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};"
|
||||
"chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;"
|
||||
") 2>&1".format(zip_resource_file.file_path),
|
||||
'/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1} "{2}"'.format(
|
||||
constants.DOCKER_SPARK_CONTAINER_NAME,
|
||||
docker_repo,
|
||||
docker_run_options.replace('"', '\\\"')
|
||||
)
|
||||
constants.DOCKER_SPARK_CONTAINER_NAME, docker_repo, docker_run_options.replace('"', '\\"')),
|
||||
]
|
||||
|
||||
commands = shares + setup
|
||||
return commands
|
||||
|
||||
|
||||
def generate_cluster_start_task(spark_client,
|
||||
zip_resource_file: batch_models.ResourceFile,
|
||||
cluster_id: str,
|
||||
gpu_enabled: bool,
|
||||
docker_repo: str = None,
|
||||
docker_run_options: str = None,
|
||||
file_shares: List[aztk_models.FileShare] = None,
|
||||
plugins: List[aztk_models.PluginConfiguration] = None,
|
||||
mixed_mode: bool = False,
|
||||
worker_on_master: bool = True):
|
||||
def generate_cluster_start_task(
|
||||
spark_client,
|
||||
zip_resource_file: batch_models.ResourceFile,
|
||||
cluster_id: str,
|
||||
gpu_enabled: bool,
|
||||
docker_repo: str = None,
|
||||
docker_run_options: str = None,
|
||||
file_shares: List[aztk_models.FileShare] = None,
|
||||
plugins: List[aztk_models.PluginConfiguration] = None,
|
||||
mixed_mode: bool = False,
|
||||
worker_on_master: bool = True,
|
||||
):
|
||||
"""
|
||||
This will return the start task object for the pool to be created.
|
||||
:param cluster_id str: Id of the cluster(Used for uploading the resource files)
|
||||
|
@ -127,22 +127,31 @@ def generate_cluster_start_task(spark_client,
|
|||
spark_submit_logs_file = constants.SPARK_SUBMIT_LOGS_FILE
|
||||
|
||||
# TODO use certificate
|
||||
environment_settings = __get_secrets_env(spark_client) + [
|
||||
environment_settings = (__get_secrets_env(spark_client) + [
|
||||
batch_models.EnvironmentSetting(name="SPARK_WEB_UI_PORT", value=spark_web_ui_port),
|
||||
batch_models.EnvironmentSetting(name="SPARK_WORKER_UI_PORT", value=spark_worker_ui_port),
|
||||
batch_models.EnvironmentSetting(name="SPARK_JOB_UI_PORT", value=spark_job_ui_port),
|
||||
batch_models.EnvironmentSetting(name="SPARK_CONTAINER_NAME", value=spark_container_name),
|
||||
batch_models.EnvironmentSetting(name="SPARK_SUBMIT_LOGS_FILE", value=spark_submit_logs_file),
|
||||
batch_models.EnvironmentSetting(name="AZTK_GPU_ENABLED", value=helpers.bool_env(gpu_enabled)),
|
||||
] + __get_docker_credentials(spark_client) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode)
|
||||
] + __get_docker_credentials(spark_client) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode))
|
||||
|
||||
# start task command
|
||||
command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, docker_run_options, plugins,
|
||||
worker_on_master, file_shares, mixed_mode)
|
||||
command = __cluster_install_cmd(
|
||||
zip_resource_file,
|
||||
gpu_enabled,
|
||||
docker_repo,
|
||||
docker_run_options,
|
||||
plugins,
|
||||
worker_on_master,
|
||||
file_shares,
|
||||
mixed_mode,
|
||||
)
|
||||
|
||||
return batch_models.StartTask(
|
||||
command_line=helpers.wrap_commands_in_shell(command),
|
||||
resource_files=resource_files,
|
||||
environment_settings=environment_settings,
|
||||
user_identity=POOL_ADMIN_USER_IDENTITY,
|
||||
wait_for_success=True)
|
||||
wait_for_success=True,
|
||||
)
|
||||
|
|
|
@ -9,8 +9,7 @@ from aztk import models as base_models
|
|||
from aztk.spark import models
|
||||
from aztk.utils import constants, helpers
|
||||
|
||||
output_file = constants.TASK_WORKING_DIR + \
|
||||
"/" + constants.SPARK_SUBMIT_LOGS_FILE
|
||||
output_file = constants.TASK_WORKING_DIR + "/" + constants.SPARK_SUBMIT_LOGS_FILE
|
||||
|
||||
|
||||
def __check_task_node_exist(batch_client, cluster_id: str, task: batch_models.CloudTask) -> bool:
|
||||
|
@ -51,16 +50,17 @@ def __get_output_file_properties(batch_client, cluster_id: str, application_name
|
|||
|
||||
def get_log_from_storage(blob_client, container_name, application_name, task):
|
||||
try:
|
||||
blob = blob_client.get_blob_to_text(container_name, application_name + '/' + constants.SPARK_SUBMIT_LOGS_FILE)
|
||||
blob = blob_client.get_blob_to_text(container_name, application_name + "/" + constants.SPARK_SUBMIT_LOGS_FILE)
|
||||
except azure.common.AzureMissingResourceHttpError:
|
||||
raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.")
|
||||
base_model = base_models.ApplicationLog(
|
||||
name=application_name,
|
||||
cluster_id=container_name,
|
||||
application_state=task.state._value_,
|
||||
application_state=task.state.name,
|
||||
log=blob.content,
|
||||
total_bytes=blob.properties.content_length,
|
||||
exit_code=task.execution_info.exit_code)
|
||||
exit_code=task.execution_info.exit_code,
|
||||
)
|
||||
return models.ApplicationLog(base_model)
|
||||
|
||||
|
||||
|
@ -88,17 +88,19 @@ def get_log(batch_client, blob_client, cluster_id: str, application_name: str, t
|
|||
base_model = base_models.ApplicationLog(
|
||||
name=application_name,
|
||||
cluster_id=cluster_id,
|
||||
application_state=task.state._value_,
|
||||
application_state=task.state.name,
|
||||
log=content,
|
||||
total_bytes=target_bytes,
|
||||
exit_code=task.execution_info.exit_code)
|
||||
exit_code=task.execution_info.exit_code,
|
||||
)
|
||||
return models.ApplicationLog(base_model)
|
||||
else:
|
||||
base_model = base_models.ApplicationLog(
|
||||
name=application_name,
|
||||
cluster_id=cluster_id,
|
||||
application_state=task.state._value_,
|
||||
log='',
|
||||
application_state=task.state.name,
|
||||
log="",
|
||||
total_bytes=target_bytes,
|
||||
exit_code=task.execution_info.exit_code)
|
||||
exit_code=task.execution_info.exit_code,
|
||||
)
|
||||
return models.ApplicationLog(base_model)
|
||||
|
|
|
@ -1,17 +1,11 @@
|
|||
import datetime
|
||||
import os
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
import azure.batch.models as batch_models
|
||||
import yaml
|
||||
|
||||
import aztk.error as error
|
||||
from aztk.utils import constants, helpers
|
||||
from aztk.utils import helpers
|
||||
from aztk.utils.command_builder import CommandBuilder
|
||||
'''
|
||||
Job Submission helper methods
|
||||
'''
|
||||
|
||||
|
||||
def __app_cmd():
|
||||
|
@ -19,11 +13,12 @@ def __app_cmd():
|
|||
docker_exec.add_argument("-i")
|
||||
docker_exec.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
|
||||
docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID")
|
||||
docker_exec.add_argument("spark /bin/bash >> output.log 2>&1 -c \"" \
|
||||
"source ~/.bashrc; " \
|
||||
"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; " \
|
||||
"cd \$AZ_BATCH_TASK_WORKING_DIR; " \
|
||||
"\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py\"")
|
||||
docker_exec.add_argument(
|
||||
r'spark /bin/bash >> output.log 2>&1 -c "'
|
||||
r"source ~/.bashrc; "
|
||||
r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
|
||||
r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
|
||||
r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py"')
|
||||
return docker_exec.to_str()
|
||||
|
||||
|
||||
|
@ -32,10 +27,11 @@ def generate_task(spark_client, job, application_tasks):
|
|||
for application, task in application_tasks:
|
||||
task_definition_resource_file = helpers.upload_text_to_container(
|
||||
container_name=job.id,
|
||||
application_name=application.name + '.yaml',
|
||||
file_path=application.name + '.yaml',
|
||||
application_name=application.name + ".yaml",
|
||||
file_path=application.name + ".yaml",
|
||||
content=yaml.dump(task),
|
||||
blob_client=spark_client.blob_client)
|
||||
blob_client=spark_client.blob_client,
|
||||
)
|
||||
resource_files.append(task_definition_resource_file)
|
||||
|
||||
task_cmd = __app_cmd()
|
||||
|
@ -49,7 +45,8 @@ def generate_task(spark_client, job, application_tasks):
|
|||
allow_low_priority_node=True,
|
||||
user_identity=batch_models.UserIdentity(
|
||||
auto_user=batch_models.AutoUserSpecification(
|
||||
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)))
|
||||
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)),
|
||||
)
|
||||
|
||||
return task
|
||||
|
||||
|
@ -69,7 +66,7 @@ def list_applications(spark_client, job_id):
|
|||
applications = {}
|
||||
for metadata_item in recent_run_job.metadata:
|
||||
if metadata_item.name == "applications":
|
||||
for app_name in metadata_item.value.split('\n'):
|
||||
for app_name in metadata_item.value.split("\n"):
|
||||
applications[app_name] = None
|
||||
|
||||
# get tasks from Batch job
|
||||
|
@ -177,8 +174,11 @@ def get_application_log(spark_client, job_id, application_name):
|
|||
raise error.AztkError("The application {0} has not yet been created.".format(application))
|
||||
raise error.AztkError("The application {0} does not exist".format(application_name))
|
||||
else:
|
||||
if task.state in (batch_models.TaskState.active, batch_models.TaskState.running,
|
||||
batch_models.TaskState.preparing):
|
||||
if task.state in (
|
||||
batch_models.TaskState.active,
|
||||
batch_models.TaskState.running,
|
||||
batch_models.TaskState.preparing,
|
||||
):
|
||||
raise error.AztkError("The application {0} has not yet finished executing.".format(application_name))
|
||||
|
||||
return spark_client.get_application_log(job_id, application_name)
|
||||
|
|
|
@ -1,14 +1,11 @@
|
|||
import datetime
|
||||
import os
|
||||
from typing import List
|
||||
import yaml
|
||||
|
||||
import azure.batch.models as batch_models
|
||||
import yaml
|
||||
|
||||
from aztk.error import AztkError
|
||||
from aztk.utils import constants, helpers
|
||||
from aztk.utils import helpers
|
||||
from aztk.utils.command_builder import CommandBuilder
|
||||
'''
|
||||
Submit helper methods
|
||||
'''
|
||||
|
||||
|
||||
def __get_node(spark_client, node_id: str, cluster_id: str) -> batch_models.ComputeNode:
|
||||
|
@ -25,12 +22,13 @@ def generate_task(spark_client, container_id, application, remote=False):
|
|||
application_name=application.name,
|
||||
file_path=application.application,
|
||||
blob_client=spark_client.blob_client,
|
||||
use_full_path=False)
|
||||
use_full_path=False,
|
||||
)
|
||||
|
||||
# Upload application file
|
||||
resource_files.append(app_resource_file)
|
||||
|
||||
application.application = '$AZ_BATCH_TASK_WORKING_DIR/' + os.path.basename(application.application)
|
||||
application.application = "$AZ_BATCH_TASK_WORKING_DIR/" + os.path.basename(application.application)
|
||||
|
||||
# Upload dependent JARS
|
||||
jar_resource_file_paths = []
|
||||
|
@ -40,7 +38,8 @@ def generate_task(spark_client, container_id, application, remote=False):
|
|||
application_name=application.name,
|
||||
file_path=jar,
|
||||
blob_client=spark_client.blob_client,
|
||||
use_full_path=False)
|
||||
use_full_path=False,
|
||||
)
|
||||
jar_resource_file_paths.append(current_jar_resource_file_path)
|
||||
resource_files.append(current_jar_resource_file_path)
|
||||
|
||||
|
@ -52,7 +51,8 @@ def generate_task(spark_client, container_id, application, remote=False):
|
|||
application_name=application.name,
|
||||
file_path=py_file,
|
||||
blob_client=spark_client.blob_client,
|
||||
use_full_path=False)
|
||||
use_full_path=False,
|
||||
)
|
||||
py_files_resource_file_paths.append(current_py_files_resource_file_path)
|
||||
resource_files.append(current_py_files_resource_file_path)
|
||||
|
||||
|
@ -64,7 +64,8 @@ def generate_task(spark_client, container_id, application, remote=False):
|
|||
application_name=application.name,
|
||||
file_path=file,
|
||||
blob_client=spark_client.blob_client,
|
||||
use_full_path=False)
|
||||
use_full_path=False,
|
||||
)
|
||||
files_resource_file_paths.append(files_resource_file_path)
|
||||
resource_files.append(files_resource_file_path)
|
||||
|
||||
|
@ -75,21 +76,23 @@ def generate_task(spark_client, container_id, application, remote=False):
|
|||
application_definition_file = helpers.upload_text_to_container(
|
||||
container_name=container_id,
|
||||
application_name=application.name,
|
||||
file_path='application.yaml',
|
||||
file_path="application.yaml",
|
||||
content=yaml.dump(vars(application)),
|
||||
blob_client=spark_client.blob_client)
|
||||
blob_client=spark_client.blob_client,
|
||||
)
|
||||
resource_files.append(application_definition_file)
|
||||
|
||||
# create command to submit task
|
||||
task_cmd = CommandBuilder('sudo docker exec')
|
||||
task_cmd.add_argument('-i')
|
||||
task_cmd.add_option('-e', 'AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR')
|
||||
task_cmd.add_option('-e', 'STORAGE_LOGS_CONTAINER={0}'.format(container_id))
|
||||
task_cmd.add_argument('spark /bin/bash >> output.log 2>&1')
|
||||
task_cmd.add_argument('-c "source ~/.bashrc; ' \
|
||||
'export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; ' \
|
||||
'cd \$AZ_BATCH_TASK_WORKING_DIR; ' \
|
||||
'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"')
|
||||
task_cmd = CommandBuilder("sudo docker exec")
|
||||
task_cmd.add_argument("-i")
|
||||
task_cmd.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
|
||||
task_cmd.add_option("-e", "STORAGE_LOGS_CONTAINER={0}".format(container_id))
|
||||
task_cmd.add_argument("spark /bin/bash >> output.log 2>&1")
|
||||
task_cmd.add_argument(
|
||||
r'-c "source ~/.bashrc; '
|
||||
r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
|
||||
r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
|
||||
r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"')
|
||||
|
||||
# Create task
|
||||
task = batch_models.TaskAddParameter(
|
||||
|
@ -99,7 +102,8 @@ def generate_task(spark_client, container_id, application, remote=False):
|
|||
constraints=batch_models.TaskConstraints(max_task_retry_count=application.max_retry_count),
|
||||
user_identity=batch_models.UserIdentity(
|
||||
auto_user=batch_models.AutoUserSpecification(
|
||||
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)))
|
||||
scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin)),
|
||||
)
|
||||
|
||||
return task
|
||||
|
||||
|
|
|
@ -12,11 +12,7 @@ from aztk.utils import constants, helpers
|
|||
class SparkToolkit(aztk.models.Toolkit):
|
||||
def __init__(self, version: str, environment: str = None, environment_version: str = None):
|
||||
super().__init__(
|
||||
software="spark",
|
||||
version=version,
|
||||
environment=environment,
|
||||
environment_version=environment_version,
|
||||
)
|
||||
software="spark", version=version, environment=environment, environment_version=environment_version)
|
||||
|
||||
|
||||
class Cluster(aztk.models.Cluster):
|
||||
|
@ -74,9 +70,9 @@ class SparkConfiguration(Model):
|
|||
|
||||
def __generate_ssh_key_pair(self):
|
||||
key = RSA.generate(2048)
|
||||
priv_key = key.exportKey('PEM')
|
||||
pub_key = key.publickey().exportKey('OpenSSH')
|
||||
return {'pub_key': pub_key, 'priv_key': priv_key}
|
||||
priv_key = key.exportKey("PEM")
|
||||
pub_key = key.publickey().exportKey("OpenSSH")
|
||||
return {"pub_key": pub_key, "priv_key": priv_key}
|
||||
|
||||
|
||||
class CustomScript(aztk.models.CustomScript):
|
||||
|
@ -124,22 +120,24 @@ class VmImage(aztk.models.VmImage):
|
|||
|
||||
|
||||
class ApplicationConfiguration:
|
||||
def __init__(self,
|
||||
name=None,
|
||||
application=None,
|
||||
application_args=None,
|
||||
main_class=None,
|
||||
jars=None,
|
||||
py_files=None,
|
||||
files=None,
|
||||
driver_java_options=None,
|
||||
driver_library_path=None,
|
||||
driver_class_path=None,
|
||||
driver_memory=None,
|
||||
executor_memory=None,
|
||||
driver_cores=None,
|
||||
executor_cores=None,
|
||||
max_retry_count=None):
|
||||
def __init__(
|
||||
self,
|
||||
name=None,
|
||||
application=None,
|
||||
application_args=None,
|
||||
main_class=None,
|
||||
jars=None,
|
||||
py_files=None,
|
||||
files=None,
|
||||
driver_java_options=None,
|
||||
driver_library_path=None,
|
||||
driver_class_path=None,
|
||||
driver_memory=None,
|
||||
executor_memory=None,
|
||||
driver_cores=None,
|
||||
executor_cores=None,
|
||||
max_retry_count=None,
|
||||
):
|
||||
self.name = name
|
||||
self.application = application
|
||||
self.application_args = application_args
|
||||
|
@ -162,11 +160,11 @@ class Application:
|
|||
self.name = cloud_task.id
|
||||
self.last_modified = cloud_task.last_modified
|
||||
self.creation_time = cloud_task.creation_time
|
||||
self.state = cloud_task.state._value_
|
||||
self.state = cloud_task.state.name
|
||||
self.state_transition_time = cloud_task.state_transition_time
|
||||
self.exit_code = cloud_task.execution_info.exit_code
|
||||
if cloud_task.previous_state:
|
||||
self.previous_state = cloud_task.previous_state._value_
|
||||
self.previous_state = cloud_task.previous_state.name
|
||||
self.previous_state_transition_time = cloud_task.previous_state_transition_time
|
||||
|
||||
self._execution_info = cloud_task.execution_info
|
||||
|
@ -190,17 +188,19 @@ class Application:
|
|||
|
||||
|
||||
class JobConfiguration:
|
||||
def __init__(self,
|
||||
id=None,
|
||||
applications=None,
|
||||
vm_size=None,
|
||||
spark_configuration=None,
|
||||
toolkit=None,
|
||||
max_dedicated_nodes=0,
|
||||
max_low_pri_nodes=0,
|
||||
subnet_id=None,
|
||||
scheduling_target: SchedulingTarget = None,
|
||||
worker_on_master=None):
|
||||
def __init__(
|
||||
self,
|
||||
id=None,
|
||||
applications=None,
|
||||
vm_size=None,
|
||||
spark_configuration=None,
|
||||
toolkit=None,
|
||||
max_dedicated_nodes=0,
|
||||
max_low_pri_nodes=0,
|
||||
subnet_id=None,
|
||||
scheduling_target: SchedulingTarget = None,
|
||||
worker_on_master=None,
|
||||
):
|
||||
|
||||
self.id = id
|
||||
self.applications = applications
|
||||
|
@ -252,24 +252,23 @@ class JobConfiguration:
|
|||
raise error.AztkError("Please supply an ID for the Job in your configuration.")
|
||||
|
||||
if self.max_dedicated_nodes == 0 and self.max_low_pri_nodes == 0:
|
||||
raise error.AztkError(
|
||||
"Please supply a valid (greater than 0) value for either max_dedicated_nodes or max_low_pri_nodes in your configuration."
|
||||
)
|
||||
raise error.AztkError("Please supply a valid (greater than 0) value for either max_dedicated_nodes "
|
||||
"or max_low_pri_nodes in your configuration.")
|
||||
|
||||
if self.vm_size is None:
|
||||
raise error.AztkError("Please supply a vm_size in your configuration.")
|
||||
|
||||
if self.mixed_mode() and not self.subnet_id:
|
||||
raise error.AztkError(
|
||||
"You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes) and pass the subnet_id in your configuration.."
|
||||
)
|
||||
"You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes) "
|
||||
"and pass the subnet_id in your configuration..")
|
||||
|
||||
if self.scheduling_target == SchedulingTarget.Dedicated and self.max_dedicated_nodes == 0:
|
||||
raise error.InvalidModelError("Scheduling target cannot be Dedicated if dedicated vm size is 0")
|
||||
|
||||
|
||||
class JobState():
|
||||
complete = 'completed'
|
||||
class JobState:
|
||||
complete = "completed"
|
||||
active = "active"
|
||||
completed = "completed"
|
||||
disabled = "disabled"
|
||||
|
@ -277,15 +276,17 @@ class JobState():
|
|||
deleting = "deleting"
|
||||
|
||||
|
||||
class Job():
|
||||
def __init__(self,
|
||||
cloud_job_schedule: batch_models.CloudJobSchedule,
|
||||
cloud_tasks: List[batch_models.CloudTask] = None,
|
||||
pool: batch_models.CloudPool = None,
|
||||
nodes: batch_models.ComputeNodePaged = None):
|
||||
class Job:
|
||||
def __init__(
|
||||
self,
|
||||
cloud_job_schedule: batch_models.CloudJobSchedule,
|
||||
cloud_tasks: List[batch_models.CloudTask] = None,
|
||||
pool: batch_models.CloudPool = None,
|
||||
nodes: batch_models.ComputeNodePaged = None,
|
||||
):
|
||||
self.id = cloud_job_schedule.id
|
||||
self.last_modified = cloud_job_schedule.last_modified
|
||||
self.state = cloud_job_schedule.state._value_
|
||||
self.state = cloud_job_schedule.state.name
|
||||
self.state_transition_time = cloud_job_schedule.state_transition_time
|
||||
self.creation_time = cloud_job_schedule.creation_time
|
||||
self.applications = [Application(task) for task in (cloud_tasks or [])]
|
||||
|
@ -297,9 +298,11 @@ class Job():
|
|||
|
||||
class ApplicationLog(aztk.models.ApplicationLog):
|
||||
def __init__(self, application_log: aztk.models.ApplicationLog):
|
||||
self.name = application_log.name
|
||||
self.cluster_id = application_log.cluster_id # TODO: change to something cluster/job agnostic
|
||||
self.log = application_log.log
|
||||
self.total_bytes = application_log.total_bytes
|
||||
self.application_state = application_log.application_state
|
||||
self.exit_code = application_log.exit_code
|
||||
super().__init__(
|
||||
name=application_log.name,
|
||||
cluster_id=application_log.cluster_id, # TODO: change to something cluster/job agnostic
|
||||
log=application_log.log,
|
||||
total_bytes=application_log.total_bytes,
|
||||
application_state=application_log.application_state,
|
||||
exit_code=application_log.exit_code,
|
||||
)
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import os
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_file import PluginFile
|
||||
from aztk.utils import constants
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
@ -11,36 +10,14 @@ class HDFSPlugin(PluginConfiguration):
|
|||
super().__init__(
|
||||
name="hdfs",
|
||||
ports=[
|
||||
PluginPort(
|
||||
name="File system metadata operations",
|
||||
internal=8020,
|
||||
),
|
||||
PluginPort(
|
||||
name="File system metadata operations(Backup)",
|
||||
internal=9000,
|
||||
),
|
||||
PluginPort(
|
||||
name="Datanode data transfer",
|
||||
internal=50010,
|
||||
),
|
||||
PluginPort(
|
||||
name="Datanode IPC metadata operations",
|
||||
internal=50020,
|
||||
),
|
||||
PluginPort(
|
||||
name="Namenode",
|
||||
internal=50070,
|
||||
public=True,
|
||||
),
|
||||
PluginPort(
|
||||
name="Datanodes",
|
||||
internal=50075,
|
||||
public=True,
|
||||
),
|
||||
PluginPort(name="File system metadata operations", internal=8020),
|
||||
PluginPort(name="File system metadata operations(Backup)", internal=9000),
|
||||
PluginPort(name="Datanode data transfer", internal=50010),
|
||||
PluginPort(name="Datanode IPC metadata operations", internal=50020),
|
||||
PluginPort(name="Namenode", internal=50070, public=True),
|
||||
PluginPort(name="Datanodes", internal=50075, public=True),
|
||||
],
|
||||
target_role=PluginTargetRole.All,
|
||||
execute="hdfs.sh",
|
||||
files=[
|
||||
PluginFile("hdfs.sh", os.path.join(dir_path, "hdfs.sh")),
|
||||
],
|
||||
files=[PluginFile("hdfs.sh", os.path.join(dir_path, "hdfs.sh"))],
|
||||
)
|
||||
|
|
|
@ -1,8 +1,5 @@
|
|||
import os
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_file import PluginFile
|
||||
from aztk.spark.models.plugins.install import InstallPlugin
|
||||
from aztk.utils import constants
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
|
|
@ -1,8 +1,5 @@
|
|||
import os
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_file import PluginFile
|
||||
from aztk.spark.models.plugins.install import InstallPlugin
|
||||
from aztk.utils import constants
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import os
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_file import PluginFile
|
||||
from aztk.utils import constants
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
@ -13,4 +12,5 @@ def InstallPlugin(name, command, packages=None):
|
|||
execute="install.sh",
|
||||
files=[PluginFile("install.sh", os.path.join(dir_path, "install.sh"))],
|
||||
args=packages,
|
||||
env=dict(COMMAND=command))
|
||||
env=dict(COMMAND=command),
|
||||
)
|
||||
|
|
|
@ -1,8 +1,5 @@
|
|||
import os
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_file import PluginFile
|
||||
from aztk.spark.models.plugins.install import InstallPlugin
|
||||
from aztk.utils import constants
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
|
|
@ -8,15 +8,8 @@ dir_path = os.path.dirname(os.path.realpath(__file__))
|
|||
def JupyterPlugin():
|
||||
return PluginConfiguration(
|
||||
name="jupyter",
|
||||
ports=[
|
||||
PluginPort(
|
||||
internal=8888,
|
||||
public=True,
|
||||
),
|
||||
],
|
||||
ports=[PluginPort(internal=8888, public=True)],
|
||||
target_role=PluginTargetRole.All,
|
||||
execute="jupyter.sh",
|
||||
files=[
|
||||
PluginFile("jupyter.sh", os.path.join(dir_path, "jupyter.sh")),
|
||||
],
|
||||
files=[PluginFile("jupyter.sh", os.path.join(dir_path, "jupyter.sh"))],
|
||||
)
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import os
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_file import PluginFile
|
||||
from aztk.utils import constants
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
@ -9,15 +8,8 @@ dir_path = os.path.dirname(os.path.realpath(__file__))
|
|||
def JupyterLabPlugin():
|
||||
return PluginConfiguration(
|
||||
name="jupyterlab",
|
||||
ports=[
|
||||
PluginPort(
|
||||
internal=8889,
|
||||
public=True,
|
||||
),
|
||||
],
|
||||
ports=[PluginPort(internal=8889, public=True)],
|
||||
target_role=PluginTargetRole.All,
|
||||
execute="jupyter_lab.sh",
|
||||
files=[
|
||||
PluginFile("jupyter_lab.sh", os.path.join(dir_path, "jupyter_lab.sh")),
|
||||
],
|
||||
files=[PluginFile("jupyter_lab.sh", os.path.join(dir_path, "jupyter_lab.sh"))],
|
||||
)
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import os
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_file import PluginFile
|
||||
from aztk.utils import constants
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
@ -12,6 +11,5 @@ def NvBLASPlugin():
|
|||
ports=[],
|
||||
target_role=PluginTargetRole.All,
|
||||
execute="nvblas.sh",
|
||||
files=[
|
||||
PluginFile("nvblas.sh", os.path.join(dir_path, "nvblas.sh")),
|
||||
])
|
||||
files=[PluginFile("nvblas.sh", os.path.join(dir_path, "nvblas.sh"))],
|
||||
)
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import os
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_file import PluginFile
|
||||
from aztk.utils import constants
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
@ -12,7 +11,5 @@ def OpenBLASPlugin():
|
|||
ports=[],
|
||||
target_role=PluginTargetRole.All,
|
||||
execute="openblas.sh",
|
||||
files=[
|
||||
PluginFile("openblas.sh", os.path.join(dir_path, "openblas.sh")),
|
||||
],
|
||||
files=[PluginFile("openblas.sh", os.path.join(dir_path, "openblas.sh"))],
|
||||
)
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import os
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTarget, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_file import PluginFile
|
||||
from aztk.utils import constants
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
@ -10,12 +9,7 @@ class ResourceMonitorPlugin(PluginConfiguration):
|
|||
def __init__(self):
|
||||
super().__init__(
|
||||
name="resource_monitor",
|
||||
ports=[
|
||||
PluginPort(
|
||||
internal=8890,
|
||||
public=True,
|
||||
),
|
||||
],
|
||||
ports=[PluginPort(internal=8890, public=True)],
|
||||
target=PluginTarget.Host,
|
||||
target_role=PluginTargetRole.All,
|
||||
execute="start_monitor.sh",
|
||||
|
@ -23,4 +17,5 @@ class ResourceMonitorPlugin(PluginConfiguration):
|
|||
PluginFile("start_monitor.sh", os.path.join(dir_path, "start_monitor.sh")),
|
||||
PluginFile("etc/telegraf.conf", os.path.join(dir_path, "telegraf.conf")),
|
||||
PluginFile("docker-compose.yml", os.path.join(dir_path, "docker-compose.yml")),
|
||||
])
|
||||
],
|
||||
)
|
||||
|
|
|
@ -8,16 +8,9 @@ dir_path = os.path.dirname(os.path.realpath(__file__))
|
|||
def RStudioServerPlugin(version="1.1.383"):
|
||||
return PluginConfiguration(
|
||||
name="rstudio_server",
|
||||
ports=[
|
||||
PluginPort(
|
||||
internal=8787,
|
||||
public=True,
|
||||
),
|
||||
],
|
||||
ports=[PluginPort(internal=8787, public=True)],
|
||||
target_role=PluginTargetRole.Master,
|
||||
execute="rstudio_server.sh",
|
||||
files=[
|
||||
PluginFile("rstudio_server.sh", os.path.join(dir_path, "rstudio_server.sh")),
|
||||
],
|
||||
files=[PluginFile("rstudio_server.sh", os.path.join(dir_path, "rstudio_server.sh"))],
|
||||
env=dict(RSTUDIO_SERVER_VERSION=version),
|
||||
)
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import os
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole, PluginTarget
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTarget, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_file import PluginFile
|
||||
from aztk.utils import constants
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
@ -13,7 +12,5 @@ class SimplePlugin(PluginConfiguration):
|
|||
target_role=PluginTargetRole.All,
|
||||
target=PluginTarget.Host,
|
||||
execute="simple.sh",
|
||||
files=[
|
||||
PluginFile("simple.sh", os.path.join(dir_path, "simple.sh")),
|
||||
],
|
||||
files=[PluginFile("simple.sh", os.path.join(dir_path, "simple.sh"))],
|
||||
)
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import os
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_file import PluginFile
|
||||
from aztk.utils import constants
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
|
|
@ -29,7 +29,7 @@ from http.server import BaseHTTPRequestHandler, HTTPServer
|
|||
|
||||
BIND_ADDR = os.environ.get("BIND_ADDR", "0.0.0.0")
|
||||
SERVER_PORT = int(os.environ.get("SERVER_PORT", "80"))
|
||||
URL_PREFIX = os.environ.get("URL_PREFIX", "").rstrip('/') + '/'
|
||||
URL_PREFIX = os.environ.get("URL_PREFIX", "").rstrip("/") + "/"
|
||||
SPARK_MASTER_HOST = ""
|
||||
|
||||
|
||||
|
@ -44,7 +44,7 @@ class ProxyHandler(BaseHTTPRequestHandler):
|
|||
self.proxyRequest(None)
|
||||
|
||||
def do_POST(self):
|
||||
length = int(self.headers.getheader('content-length'))
|
||||
length = int(self.headers.getheader("content-length"))
|
||||
postData = self.rfile.read(length)
|
||||
self.proxyRequest(postData)
|
||||
|
||||
|
@ -84,17 +84,19 @@ class ProxyHandler(BaseHTTPRequestHandler):
|
|||
def rewriteLinks(self, page, targetHost):
|
||||
target = "{0}proxy:{1}/".format(URL_PREFIX, targetHost).encode()
|
||||
page = page.replace(b'href="/', b'href="' + target)
|
||||
page = page.replace(b"'<div><a href=' + logUrl + '>'",
|
||||
b"'<div><a href=' + location.origin + logUrl.replace('http://', '/proxy:') + '>'")
|
||||
page = page.replace(b'href="log', b'href="' + target + b'log')
|
||||
page = page.replace(b'href="http://', b'href="' + URL_PREFIX.encode() + b'proxy:')
|
||||
page = page.replace(
|
||||
b"'<div><a href=' + logUrl + '>'",
|
||||
b"'<div><a href=' + location.origin + logUrl.replace('http://', '/proxy:') + '>'",
|
||||
)
|
||||
page = page.replace(b'href="log', b'href="' + target + b"log")
|
||||
page = page.replace(b'href="http://', b'href="' + URL_PREFIX.encode() + b"proxy:")
|
||||
page = page.replace(b'src="/', b'src="' + target)
|
||||
page = page.replace(b'action="', b'action="' + target)
|
||||
page = page.replace(b'"/api/v1/', b'"' + target + b'api/v1/')
|
||||
page = page.replace(b'"/api/v1/', b'"' + target + b"api/v1/")
|
||||
return page
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: <proxied host:port> [<proxy port>]")
|
||||
sys.exit(1)
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import os
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole
|
||||
from aztk.models.plugins.plugin_file import PluginFile
|
||||
from aztk.utils import constants
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
@ -11,7 +10,5 @@ def TensorflowOnSparkPlugin():
|
|||
name="tensorflow_on_spark",
|
||||
target_role=PluginTargetRole.Master,
|
||||
execute="tensorflow_on_spark.sh",
|
||||
files=[
|
||||
PluginFile("tensorflow_on_spark.sh", os.path.join(dir_path, "tensorflow_on_spark.sh")),
|
||||
],
|
||||
files=[PluginFile("tensorflow_on_spark.sh", os.path.join(dir_path, "tensorflow_on_spark.sh"))],
|
||||
)
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
from aztk.spark import models
|
||||
|
||||
SPARK_VM_IMAGE = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04')
|
||||
SPARK_VM_IMAGE = models.VmImage(publisher="Canonical", offer="UbuntuServer", sku="16.04")
|
||||
|
|
|
@ -50,9 +50,7 @@ def cmd_check_output(cmd):
|
|||
try:
|
||||
output = check_output(cmd, shell=True, stderr=STDOUT)
|
||||
except CalledProcessError as e:
|
||||
return "CMD: {0}\n"\
|
||||
"returncode: {1}"\
|
||||
"output: {2}".format(e.cmd, e.returncode, e.output)
|
||||
return "CMD: {0}\n" "returncode: {1}" "output: {2}".format(e.cmd, e.returncode, e.output)
|
||||
else:
|
||||
return output
|
||||
|
||||
|
@ -62,9 +60,9 @@ def get_disk_free():
|
|||
|
||||
|
||||
def get_docker_diagnostics(docker_client):
|
||||
'''
|
||||
"""
|
||||
returns list of tuples (filename, data) to be written in the zip
|
||||
'''
|
||||
"""
|
||||
output = []
|
||||
output.append(get_docker_images(docker_client))
|
||||
logs = get_docker_containers(docker_client)
|
||||
|
@ -95,7 +93,7 @@ def get_docker_containers(docker_client):
|
|||
# get docker container logs
|
||||
logs.append((container.name + "/docker.log", container.logs()))
|
||||
logs.append(get_docker_process_status(container))
|
||||
if container.name == "spark": #TODO: find a more robust way to get specific info off specific containers
|
||||
if container.name == "spark": # TODO: find a more robust way to get specific info off specific containers
|
||||
logs.extend(get_container_aztk_script(container))
|
||||
logs.extend(get_spark_logs(container))
|
||||
logs.extend(get_spark_app_logs(container))
|
||||
|
@ -158,13 +156,13 @@ def filter_members(members):
|
|||
|
||||
|
||||
def extract_tar_in_memory(container, data):
|
||||
data = io.BytesIO(b''.join([item for item in data]))
|
||||
data = io.BytesIO(b"".join([item for item in data]))
|
||||
tarf = tarfile.open(fileobj=data)
|
||||
logs = []
|
||||
for member in filter_members(tarf):
|
||||
file_bytes = tarf.extractfile(member)
|
||||
if file_bytes is not None:
|
||||
logs.append((container.name + "/" + member.name, b''.join(file_bytes.readlines())))
|
||||
logs.append((container.name + "/" + member.name, b"".join(file_bytes.readlines())))
|
||||
return logs
|
||||
|
||||
|
||||
|
@ -174,7 +172,7 @@ def get_brief_diagnostics():
|
|||
logs = []
|
||||
for file_name in files:
|
||||
try:
|
||||
logs.append((file_name, open(batch_dir + file_name, 'rb').read()))
|
||||
logs.append((file_name, open(batch_dir + file_name, "rb").read()))
|
||||
# print("LOG:", (file_name, open(batch_dir+file_name, 'rb').read()))
|
||||
except FileNotFoundError as e:
|
||||
print("file not found", e)
|
||||
|
|
|
@ -1,16 +1,11 @@
|
|||
from __future__ import print_function
|
||||
|
||||
import datetime
|
||||
import io
|
||||
import os
|
||||
import time
|
||||
import azure.batch.batch_service_client as batch
|
||||
import azure.batch.batch_auth as batch_auth
|
||||
|
||||
import azure.batch.models as batch_models
|
||||
import azure.storage.blob as blob
|
||||
from aztk.version import __version__
|
||||
|
||||
from aztk.utils import constants
|
||||
from aztk import error
|
||||
import aztk.models
|
||||
|
||||
|
||||
class MasterInvalidStateError(Exception):
|
||||
|
|
|
@ -1,8 +1,3 @@
|
|||
from .deprecation import deprecated, deprecate
|
||||
from . import azure_api
|
||||
from . import command_builder
|
||||
from . import constants
|
||||
from . import helpers
|
||||
from . import file_utils
|
||||
from . import get_ssh_key
|
||||
from . import secure_utils
|
||||
from . import (azure_api, command_builder, constants, file_utils, get_ssh_key, helpers, secure_utils)
|
||||
from .deprecation import deprecate, deprecated
|
||||
from .retry import BackOffPolicy, retry
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
import re
|
||||
from typing import Optional
|
||||
|
||||
import azure.batch.batch_auth as batch_auth
|
||||
import azure.batch.batch_service_client as batch
|
||||
|
@ -12,10 +11,10 @@ from azure.storage.common import CloudStorageAccount
|
|||
from aztk import error
|
||||
from aztk.version import __version__
|
||||
|
||||
RESOURCE_ID_PATTERN = re.compile('^/subscriptions/(?P<subscription>[^/]+)'
|
||||
'/resourceGroups/(?P<resourcegroup>[^/]+)'
|
||||
'/providers/[^/]+'
|
||||
'/[^/]+Accounts/(?P<account>[^/]+)$')
|
||||
RESOURCE_ID_PATTERN = re.compile("^/subscriptions/(?P<subscription>[^/]+)"
|
||||
"/resourceGroups/(?P<resourcegroup>[^/]+)"
|
||||
"/providers/[^/]+"
|
||||
"/[^/]+Accounts/(?P<account>[^/]+)$")
|
||||
|
||||
|
||||
def validate_secrets(secrets):
|
||||
|
@ -48,23 +47,25 @@ def make_batch_client(secrets):
|
|||
client_id=secrets.service_principal.client_id,
|
||||
secret=secrets.service_principal.credential,
|
||||
tenant=secrets.service_principal.tenant_id,
|
||||
resource='https://management.core.windows.net/')
|
||||
resource="https://management.core.windows.net/",
|
||||
)
|
||||
m = RESOURCE_ID_PATTERN.match(secrets.service_principal.batch_account_resource_id)
|
||||
arm_batch_client = BatchManagementClient(arm_credentials, m.group('subscription'))
|
||||
account = arm_batch_client.batch_account.get(m.group('resourcegroup'), m.group('account'))
|
||||
base_url = 'https://{0}/'.format(account.account_endpoint)
|
||||
arm_batch_client = BatchManagementClient(arm_credentials, m.group("subscription"))
|
||||
account = arm_batch_client.batch_account.get(m.group("resourcegroup"), m.group("account"))
|
||||
base_url = "https://{0}/".format(account.account_endpoint)
|
||||
credentials = ServicePrincipalCredentials(
|
||||
client_id=secrets.service_principal.client_id,
|
||||
secret=secrets.service_principal.credential,
|
||||
tenant=secrets.service_principal.tenant_id,
|
||||
resource='https://batch.core.windows.net/')
|
||||
resource="https://batch.core.windows.net/",
|
||||
)
|
||||
|
||||
# Set up Batch Client
|
||||
batch_client = batch.BatchServiceClient(credentials, base_url=base_url)
|
||||
|
||||
# Set retry policy
|
||||
batch_client.config.retry_policy.retries = 5
|
||||
batch_client.config.add_user_agent('aztk/{}'.format(__version__))
|
||||
batch_client.config.add_user_agent("aztk/{}".format(__version__))
|
||||
|
||||
return batch_client
|
||||
|
||||
|
@ -82,26 +83,29 @@ def make_blob_client(secrets):
|
|||
blob_client = blob.BlockBlobService(
|
||||
account_name=secrets.shared_key.storage_account_name,
|
||||
account_key=secrets.shared_key.storage_account_key,
|
||||
endpoint_suffix=secrets.shared_key.storage_account_suffix)
|
||||
endpoint_suffix=secrets.shared_key.storage_account_suffix,
|
||||
)
|
||||
else:
|
||||
# Set up ServicePrincipalCredentials
|
||||
arm_credentials = ServicePrincipalCredentials(
|
||||
client_id=secrets.service_principal.client_id,
|
||||
secret=secrets.service_principal.credential,
|
||||
tenant=secrets.service_principal.tenant_id,
|
||||
resource='https://management.core.windows.net/')
|
||||
resource="https://management.core.windows.net/",
|
||||
)
|
||||
m = RESOURCE_ID_PATTERN.match(secrets.service_principal.storage_account_resource_id)
|
||||
accountname = m.group('account')
|
||||
subscription = m.group('subscription')
|
||||
resourcegroup = m.group('resourcegroup')
|
||||
accountname = m.group("account")
|
||||
subscription = m.group("subscription")
|
||||
resourcegroup = m.group("resourcegroup")
|
||||
mgmt_client = StorageManagementClient(arm_credentials, subscription)
|
||||
key = retry_function(
|
||||
key = (retry_function(
|
||||
mgmt_client.storage_accounts.list_keys,
|
||||
10,
|
||||
1,
|
||||
Exception,
|
||||
resource_group_name=resourcegroup,
|
||||
account_name=accountname).keys[0].value
|
||||
account_name=accountname,
|
||||
).keys[0].value)
|
||||
storage_client = CloudStorageAccount(accountname, key)
|
||||
blob_client = storage_client.create_block_blob_service()
|
||||
|
||||
|
@ -110,6 +114,7 @@ def make_blob_client(secrets):
|
|||
|
||||
def retry_function(function, retry_attempts: int, retry_interval: int, exception: Exception, *args, **kwargs):
|
||||
import time
|
||||
|
||||
for i in range(retry_attempts):
|
||||
try:
|
||||
return function(*args, **kwargs)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
class CommandOption():
|
||||
class CommandOption:
|
||||
def __init__(self, name: str, value: str):
|
||||
self.name = name
|
||||
self.value = value
|
||||
|
|
|
@ -18,33 +18,33 @@ DOCKER_SPARK_HOME = "/home/spark-current"
|
|||
"""
|
||||
Root path of this repository
|
||||
"""
|
||||
ROOT_PATH = os.path.normpath(os.path.join(os.path.dirname(__file__), '..', '..'))
|
||||
ROOT_PATH = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
"""
|
||||
User home directory path
|
||||
"""
|
||||
HOME_DIRECTORY_PATH = os.path.expanduser('~')
|
||||
HOME_DIRECTORY_PATH = os.path.expanduser("~")
|
||||
"""
|
||||
Path to the secrets file
|
||||
"""
|
||||
DEFAULT_SECRETS_PATH = os.path.join(os.getcwd(), '.aztk/secrets.yaml')
|
||||
DEFAULT_SECRETS_PATH = os.path.join(os.getcwd(), ".aztk/secrets.yaml")
|
||||
"""
|
||||
Paths to the cluster configuration files
|
||||
"""
|
||||
GLOBAL_CONFIG_PATH = os.path.join(HOME_DIRECTORY_PATH, '.aztk')
|
||||
DEFAULT_SSH_CONFIG_PATH = os.path.join(os.getcwd(), '.aztk/ssh.yaml')
|
||||
DEFAULT_CLUSTER_CONFIG_PATH = os.path.join(os.getcwd(), '.aztk/cluster.yaml')
|
||||
DEFAULT_SPARK_CONF_SOURCE = os.path.join(os.getcwd(), '.aztk')
|
||||
DEFAULT_SPARK_CONF_DEST = os.path.join(ROOT_PATH, 'node_scripts', 'conf')
|
||||
DEFAULT_SPARK_JARS_SOURCE = os.path.join(os.getcwd(), '.aztk', 'jars')
|
||||
DEFAULT_SPARK_JARS_DEST = os.path.join(ROOT_PATH, 'node_scripts', 'jars')
|
||||
DEFAULT_SPARK_JOB_CONFIG = os.path.join(os.getcwd(), '.aztk', 'job.yaml')
|
||||
GLOBAL_SPARK_JOB_CONFIG = os.path.join(HOME_DIRECTORY_PATH, '.aztk', 'job.yaml')
|
||||
GLOBAL_CONFIG_PATH = os.path.join(HOME_DIRECTORY_PATH, ".aztk")
|
||||
DEFAULT_SSH_CONFIG_PATH = os.path.join(os.getcwd(), ".aztk/ssh.yaml")
|
||||
DEFAULT_CLUSTER_CONFIG_PATH = os.path.join(os.getcwd(), ".aztk/cluster.yaml")
|
||||
DEFAULT_SPARK_CONF_SOURCE = os.path.join(os.getcwd(), ".aztk")
|
||||
DEFAULT_SPARK_CONF_DEST = os.path.join(ROOT_PATH, "node_scripts", "conf")
|
||||
DEFAULT_SPARK_JARS_SOURCE = os.path.join(os.getcwd(), ".aztk", "jars")
|
||||
DEFAULT_SPARK_JARS_DEST = os.path.join(ROOT_PATH, "node_scripts", "jars")
|
||||
DEFAULT_SPARK_JOB_CONFIG = os.path.join(os.getcwd(), ".aztk", "job.yaml")
|
||||
GLOBAL_SPARK_JOB_CONFIG = os.path.join(HOME_DIRECTORY_PATH, ".aztk", "job.yaml")
|
||||
"""
|
||||
Source and destination paths for spark init
|
||||
"""
|
||||
INIT_DIRECTORY_SOURCE = os.path.join(ROOT_PATH, "aztk_cli", 'config')
|
||||
LOCAL_INIT_DIRECTORY_DEST = os.path.join(os.getcwd(), '.aztk')
|
||||
GLOBAL_INIT_DIRECTORY_DEST = os.path.join(HOME_DIRECTORY_PATH, '.aztk')
|
||||
INIT_DIRECTORY_SOURCE = os.path.join(ROOT_PATH, "aztk_cli", "config")
|
||||
LOCAL_INIT_DIRECTORY_DEST = os.path.join(os.getcwd(), ".aztk")
|
||||
GLOBAL_INIT_DIRECTORY_DEST = os.path.join(HOME_DIRECTORY_PATH, ".aztk")
|
||||
"""
|
||||
Key of the metadata entry for the pool that is used to store the master node id
|
||||
"""
|
||||
|
|
|
@ -39,9 +39,10 @@ def deprecate(version: str, message: str, advice: str = ""):
|
|||
advice (str): Sentence explaining alternatives to the deprecated functionality.
|
||||
"""
|
||||
|
||||
warnings.simplefilter('always', DeprecationWarning) # turn off filter
|
||||
warnings.simplefilter("always", DeprecationWarning) # turn off filter
|
||||
warnings.warn(
|
||||
"{0} It will be removed in Aztk version {1}. {2}".format(message, version, advice),
|
||||
category=DeprecationWarning,
|
||||
stacklevel=2)
|
||||
warnings.simplefilter('default', DeprecationWarning) # reset filter
|
||||
stacklevel=2,
|
||||
)
|
||||
warnings.simplefilter("default", DeprecationWarning) # reset filter
|
||||
|
|
|
@ -29,6 +29,6 @@ def __read_ssh_key_from_file(path: str) -> str:
|
|||
"""
|
||||
Read the content of the given file
|
||||
"""
|
||||
with open(os.path.expanduser(path), 'r', encoding='UTF-8') as content_file:
|
||||
with open(os.path.expanduser(path), "r", encoding="UTF-8") as content_file:
|
||||
content = content_file.read()
|
||||
return content
|
||||
|
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче