зеркало из https://github.com/Azure/aztk.git
This commit is contained in:
Родитель
222821cdb2
Коммит
e9c7f74e79
|
@ -23,7 +23,8 @@ def cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, custom_scr
|
||||||
'sed -i -e "s/Defaults requiretty.*/ #Defaults requiretty/g" /etc/sudoers',
|
'sed -i -e "s/Defaults requiretty.*/ #Defaults requiretty/g" /etc/sudoers',
|
||||||
'unzip $AZ_BATCH_TASK_WORKING_DIR/%s' % zip_resource_file.file_path,
|
'unzip $AZ_BATCH_TASK_WORKING_DIR/%s' % zip_resource_file.file_path,
|
||||||
'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/main.sh',
|
'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/main.sh',
|
||||||
'dos2unix $AZ_BATCH_TASK_WORKING_DIR/main.sh', # Convert windows line ending to unix if applicable
|
# Convert windows line ending to unix if applicable
|
||||||
|
'dos2unix $AZ_BATCH_TASK_WORKING_DIR/main.sh',
|
||||||
'$AZ_BATCH_TASK_WORKING_DIR/main.sh'
|
'$AZ_BATCH_TASK_WORKING_DIR/main.sh'
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -194,7 +195,8 @@ def create_cluster(
|
||||||
vm_size=vm_size,
|
vm_size=vm_size,
|
||||||
target_dedicated_nodes=vm_count,
|
target_dedicated_nodes=vm_count,
|
||||||
target_low_priority_nodes=vm_low_pri_count,
|
target_low_priority_nodes=vm_low_pri_count,
|
||||||
start_task=generate_cluster_start_task(pool_id, zip_resource_file, custom_script),
|
start_task=generate_cluster_start_task(
|
||||||
|
pool_id, zip_resource_file, custom_script),
|
||||||
enable_inter_node_communication=True,
|
enable_inter_node_communication=True,
|
||||||
max_tasks_per_node=1)
|
max_tasks_per_node=1)
|
||||||
|
|
||||||
|
@ -203,51 +205,17 @@ def create_cluster(
|
||||||
pool,
|
pool,
|
||||||
wait)
|
wait)
|
||||||
|
|
||||||
return # TODO
|
|
||||||
|
|
||||||
# Create job
|
# Create job
|
||||||
job = batch_models.JobAddParameter(
|
job = batch_models.JobAddParameter(
|
||||||
id=job_id,
|
id=job_id,
|
||||||
pool_info=batch_models.PoolInformation(pool_id=pool_id))
|
pool_info=batch_models.PoolInformation(pool_id=pool_id))
|
||||||
|
|
||||||
# Add job to batch
|
# Add job to batch
|
||||||
batch_client.job.add(job)
|
# batch_client.job.add(job) # TODO
|
||||||
|
|
||||||
# create application/coordination commands
|
|
||||||
coordination_cmd = cluster_connect_cmd()
|
|
||||||
application_cmd = cluster_start_cmd(
|
|
||||||
constants._WEBUI_PORT, constants._JUPYTER_PORT)
|
|
||||||
|
|
||||||
# reuse pool_id as multi-instance task id
|
|
||||||
task_id = pool_id
|
|
||||||
|
|
||||||
# Create multi-instance task
|
|
||||||
task = batch_models.TaskAddParameter(
|
|
||||||
id=task_id,
|
|
||||||
command_line=util.wrap_commands_in_shell(application_cmd),
|
|
||||||
resource_files=[],
|
|
||||||
multi_instance_settings=batch_models.MultiInstanceSettings(
|
|
||||||
number_of_instances=vm_count + vm_low_pri_count,
|
|
||||||
coordination_command_line=util.wrap_commands_in_shell(
|
|
||||||
coordination_cmd),
|
|
||||||
common_resource_files=[]))
|
|
||||||
|
|
||||||
# Add task to batch job (which has the same name as pool_id)
|
|
||||||
try:
|
|
||||||
batch_client.task.add(job_id=job_id, task=task)
|
|
||||||
except batch_models.batch_error.BatchErrorException as err:
|
|
||||||
util.print_batch_exception(err)
|
|
||||||
if err.error.code != 'JobExists':
|
|
||||||
raise
|
|
||||||
else:
|
|
||||||
print('Job {!r} already exists'.format(job_id))
|
|
||||||
|
|
||||||
# Wait for the app to finish
|
# Wait for the app to finish
|
||||||
if wait == True:
|
if wait == True:
|
||||||
|
util.wait_for_master_to_be_ready(pool_id)
|
||||||
util.wait_for_tasks_to_complete(
|
|
||||||
job_id,
|
|
||||||
timedelta(minutes=60))
|
|
||||||
|
|
||||||
if username is not None and password is not None:
|
if username is not None and password is not None:
|
||||||
create_user(pool_id, username, password)
|
create_user(pool_id, username, password)
|
||||||
|
|
22
dtde/util.py
22
dtde/util.py
|
@ -40,6 +40,24 @@ def wait_for_tasks_to_complete(job_id, timeout):
|
||||||
raise TimeoutError("Timed out waiting for tasks to complete")
|
raise TimeoutError("Timed out waiting for tasks to complete")
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_master_to_be_ready(cluster_id: str):
    """
    Block until the master node of the given cluster is idle or running.

    The master node id is looked up with get_master_node_id(); while no id
    is available yet the lookup is retried every 5 seconds. Once an id is
    known, the node is fetched from the batch service until its state is
    either idle or running.

    :param cluster_id: id of the cluster, also used as the pool id
    """
    client = azure_api.get_batch_client()
    ready_states = (
        batch_models.ComputeNodeState.idle,
        batch_models.ComputeNodeState.running,
    )

    master_id = None
    while True:
        # The id is resolved once and then reused for every later poll
        if not master_id:
            master_id = get_master_node_id(cluster_id)
        if not master_id:
            time.sleep(5)
            continue

        node = client.compute_node.get(cluster_id, master_id)
        if node.state in ready_states:
            return

        # NOTE(review): nesting reconstructed from a flattened diff; the
        # original appears to apply both delays back to back - confirm
        time.sleep(10)
        time.sleep(5)
|
||||||
|
|
||||||
|
|
||||||
def upload_file_to_container(container_name, file_path, use_full_path) -> batch_models.ResourceFile:
|
def upload_file_to_container(container_name, file_path, use_full_path) -> batch_models.ResourceFile:
|
||||||
"""
|
"""
|
||||||
Uploads a local file to an Azure Blob storage container.
|
Uploads a local file to an Azure Blob storage container.
|
||||||
|
@ -92,6 +110,7 @@ def print_configuration(config):
|
||||||
print("\nConfiguration is:")
|
print("\nConfiguration is:")
|
||||||
print(configuration_dict)
|
print(configuration_dict)
|
||||||
|
|
||||||
|
|
||||||
def get_master_node_id_from_pool(pool: batch_models.CloudPool):
|
def get_master_node_id_from_pool(pool: batch_models.CloudPool):
|
||||||
"""
|
"""
|
||||||
:returns: the id of the node that is the assigned master of this pool
|
:returns: the id of the node that is the assigned master of this pool
|
||||||
|
@ -105,6 +124,7 @@ def get_master_node_id_from_pool(pool: batch_models.CloudPool):
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_master_node_id(pool_id):
|
def get_master_node_id(pool_id):
|
||||||
batch_client = azure_api.get_batch_client()
|
batch_client = azure_api.get_batch_client()
|
||||||
return get_master_node_id_from_pool(batch_client.pool.get(pool_id))
|
return get_master_node_id_from_pool(batch_client.pool.get(pool_id))
|
||||||
|
@ -124,7 +144,7 @@ def create_pool_if_not_exist(pool, wait=True):
|
||||||
try:
|
try:
|
||||||
batch_client.pool.add(pool)
|
batch_client.pool.add(pool)
|
||||||
if wait:
|
if wait:
|
||||||
wait_for_all_nodes_state(batch_client, pool, frozenset(
|
wait_for_all_nodes_state(pool, frozenset(
|
||||||
(batch_models.ComputeNodeState.start_task_failed,
|
(batch_models.ComputeNodeState.start_task_failed,
|
||||||
batch_models.ComputeNodeState.unusable,
|
batch_models.ComputeNodeState.unusable,
|
||||||
batch_models.ComputeNodeState.idle)
|
batch_models.ComputeNodeState.idle)
|
||||||
|
|
|
@ -13,11 +13,11 @@ def setup_node():
|
||||||
|
|
||||||
def setup_as_master():
|
def setup_as_master():
|
||||||
print("Setting up as master.")
|
print("Setting up as master.")
|
||||||
spark.setup_connection()
|
spark.start_spark_master()
|
||||||
spark.start_spark()
|
# spark.start_spark_worker()
|
||||||
|
|
||||||
|
|
||||||
def setup_as_worker():
|
def setup_as_worker():
|
||||||
print("Setting up as worker.")
|
print("Setting up as worker.")
|
||||||
spark.setup_connection()
|
|
||||||
# spark.start_spark()
|
spark.start_spark_worker()
|
||||||
|
|
|
@ -61,14 +61,18 @@ def find_master(client: batch.BatchServiceClient) -> bool:
|
||||||
master = get_master_node_id(pool)
|
master = get_master_node_id(pool)
|
||||||
|
|
||||||
if master:
|
if master:
|
||||||
print("Pool already has a master '%s'. This node will be a worker" % master)
|
if master == config.node_id:
|
||||||
return False
|
print("Node is already the master '%s'" % master)
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print("Pool already has a master '%s'. This node will be a worker" % master)
|
||||||
|
return False
|
||||||
else:
|
else:
|
||||||
print("Pool has no master. Fighting for the throne! (%i/5)" % (i + 1))
|
print("Pool has no master. Fighting for the throne! (%i/5)" % (i + 1))
|
||||||
result = try_assign_self_as_master(client, pool)
|
result = try_assign_self_as_master(client, pool)
|
||||||
|
|
||||||
if result:
|
if result:
|
||||||
print("The battle has been won! Node %s is the new master.", config.node_id)
|
print("The battle has been won! Node %s is the new master." % config.node_id)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
raise CannotAllocateMasterError("Unable to assign node as a master in 5 tries")
|
raise CannotAllocateMasterError("Unable to assign node as a master in 5 tries")
|
||||||
|
|
|
@ -20,6 +20,10 @@ def get_pool() -> batchmodels.CloudPool:
|
||||||
return batch_client.pool.get(config.pool_id)
|
return batch_client.pool.get(config.pool_id)
|
||||||
|
|
||||||
|
|
||||||
|
def get_node(node_id: str) -> batchmodels.ComputeNode:
    """
    Fetch one compute node of the configured pool.

    :param node_id: id of the node to fetch
    :returns: the node as returned by the batch client for config.pool_id
    """
    pool_id = config.pool_id
    node = batch_client.compute_node.get(pool_id, node_id)
    return node
|
||||||
|
|
||||||
|
|
||||||
def list_nodes() -> List[batchmodels.ComputeNode]:
|
def list_nodes() -> List[batchmodels.ComputeNode]:
|
||||||
"""
|
"""
|
||||||
List all the nodes in the pool.
|
List all the nodes in the pool.
|
||||||
|
@ -49,7 +53,8 @@ def setup_connection():
|
||||||
"""
|
"""
|
||||||
wait_for_pool_ready()
|
wait_for_pool_ready()
|
||||||
print("Pool is now steady. Setting up master")
|
print("Pool is now steady. Setting up master")
|
||||||
master_node_ip = pick_master.get_master_node_id(batch_client.pool.get(config.pool_id))
|
master_node_ip = pick_master.get_master_node_id(
|
||||||
|
batch_client.pool.get(config.pool_id))
|
||||||
|
|
||||||
nodes = list_nodes()
|
nodes = list_nodes()
|
||||||
|
|
||||||
|
@ -58,10 +63,12 @@ def setup_connection():
|
||||||
|
|
||||||
for node in nodes:
|
for node in nodes:
|
||||||
if node.id == master_node_ip:
|
if node.id == master_node_ip:
|
||||||
print("Adding node %s as a master" % node.id)
|
print("Adding node %s as a master with ip %s" %
|
||||||
|
(node.id, node.ip_address))
|
||||||
master_file.write("%s\n" % node.ip_address)
|
master_file.write("%s\n" % node.ip_address)
|
||||||
else:
|
else:
|
||||||
print("Adding node %s as a slave" % node.id)
|
print("Adding node %s as a slave with ip %s" %
|
||||||
|
(node.id, node.ip_address))
|
||||||
slaves_file.write("%s\n" % node.ip_address)
|
slaves_file.write("%s\n" % node.ip_address)
|
||||||
|
|
||||||
master_file.close()
|
master_file.close()
|
||||||
|
@ -69,7 +76,8 @@ def setup_connection():
|
||||||
|
|
||||||
|
|
||||||
def generate_jupyter_config():
|
def generate_jupyter_config():
|
||||||
master_node_ip = pick_master.get_master_node_id(batch_client.pool.get(config.pool_id))
|
master_node_ip = pick_master.get_master_node_id(
|
||||||
|
batch_client.pool.get(config.pool_id))
|
||||||
return dict(
|
return dict(
|
||||||
display_name="PySpark",
|
display_name="PySpark",
|
||||||
language="python",
|
language="python",
|
||||||
|
@ -91,34 +99,67 @@ def generate_jupyter_config():
|
||||||
def setup_jupyter():
|
def setup_jupyter():
|
||||||
print("Setting up jupyter.")
|
print("Setting up jupyter.")
|
||||||
call(["/anaconda/envs/py35/bin/jupyter", "notebook", "--generate-config"])
|
call(["/anaconda/envs/py35/bin/jupyter", "notebook", "--generate-config"])
|
||||||
with open("test.txt", "a") as config_file:
|
|
||||||
|
jupyter_config_file = os.path.join(os.path.expanduser("~"), ".jupyter/jupyter_notebook_config.py")
|
||||||
|
|
||||||
|
with open(jupyter_config_file, "a") as config_file:
|
||||||
config_file.write('\n')
|
config_file.write('\n')
|
||||||
config_file.write('c.NotebookApp.token=""\n')
|
config_file.write('c.NotebookApp.token=""\n')
|
||||||
config_file.write('c.NotebookApp.password=""\n')
|
config_file.write('c.NotebookApp.password=""\n')
|
||||||
shutil.rmtree('/usr/local/share/jupyter/kernels')
|
shutil.rmtree('/usr/local/share/jupyter/kernels')
|
||||||
os.makedirs('/usr/local/share/jupyter/kernels/pyspark', exist_ok=True)
|
os.makedirs('/usr/local/share/jupyter/kernels/pyspark', exist_ok = True)
|
||||||
|
|
||||||
with open('/usr/local/share/jupyter/kernels/pyspark/kernel.json', 'w') as outfile:
|
with open('/usr/local/share/jupyter/kernels/pyspark/kernel.json', 'w') as outfile:
|
||||||
data = generate_jupyter_config()
|
data=generate_jupyter_config()
|
||||||
json.dump(data, outfile)
|
json.dump(data, outfile)
|
||||||
|
|
||||||
|
|
||||||
def start_jupyter():
|
def start_jupyter():
|
||||||
jupyter_port = config.jupyter_port
|
jupyter_port=config.jupyter_port
|
||||||
|
|
||||||
my_env = os.environ.copy()
|
my_env=os.environ.copy()
|
||||||
my_env["PYSPARK_DRIVER_PYTHON"] = "/anaconda/envs/py35/bin/jupyter"
|
my_env["PYSPARK_DRIVER_PYTHON"]="/anaconda/envs/py35/bin/jupyter"
|
||||||
my_env["PYSPARK_DRIVER_PYTHON_OPTS"] = "notebook --no-browser --port='%s'" % jupyter_port
|
my_env["PYSPARK_DRIVER_PYTHON_OPTS"]="notebook --no-browser --port='%s'" % jupyter_port
|
||||||
|
|
||||||
# call("pyspark", "&", env=my_env)
|
Popen(["pyspark"], close_fds = True)
|
||||||
Popen(["pyspark"], close_fds=True)
|
|
||||||
|
|
||||||
|
|
||||||
def start_spark():
|
def wait_for_master():
|
||||||
webui_port = config.webui_port
|
print("Waiting for master to be ready.")
|
||||||
|
master_node_ip=pick_master.get_master_node_id(
|
||||||
|
batch_client.pool.get(config.pool_id))
|
||||||
|
while True:
|
||||||
|
master_node=get_node(master_node_ip)
|
||||||
|
|
||||||
exe = os.path.join(spark_home, "sbin", "start-all.sh")
|
if master_node.state == batchmodels.ComputeNodeState.idle or master_node.state == batchmodels.ComputeNodeState.running:
|
||||||
call([exe, "--webui-port", str(webui_port), "&"])
|
break
|
||||||
|
else:
|
||||||
|
print("Still waiting on master")
|
||||||
|
time.sleep(10)
|
||||||
|
|
||||||
|
|
||||||
|
def start_spark_master():
|
||||||
|
webui_port=config.webui_port
|
||||||
|
master_ip=get_node(config.node_id).ip_address
|
||||||
|
exe=os.path.join(spark_home, "sbin", "start-master.sh")
|
||||||
|
cmd=[exe, "-h", master_ip]
|
||||||
|
print("Starting master with '%s'" % " ".join(cmd))
|
||||||
|
call(cmd)
|
||||||
|
|
||||||
setup_jupyter()
|
setup_jupyter()
|
||||||
start_jupyter()
|
start_jupyter()
|
||||||
|
|
||||||
|
|
||||||
|
def start_spark_worker():
    """
    Start a Spark worker on this node and connect it to the pool's master.

    Waits for the master through wait_for_master(), resolves the master
    node's ip address and then runs Spark's sbin/start-slave.sh against
    spark://<master ip>:7077.
    """
    wait_for_master()
    exe = os.path.join(spark_home, "sbin", "start-slave.sh")
    master_node_id = pick_master.get_master_node_id(
        batch_client.pool.get(config.pool_id))
    master_node = get_node(master_node_id)

    # Copy the current environment and add the master address for the script
    my_env = os.environ.copy()
    my_env["SPARK_MASTER_IP"] = master_node.ip_address

    cmd = [exe, "spark://%s:7077" % master_node.ip_address]
    print("Connecting to master with '%s'" % " ".join(cmd))
    # Pass the prepared environment explicitly; without env= the
    # SPARK_MASTER_IP value set above never reaches the launched script
    call(cmd, env=my_env)
|
||||||
|
|
Загрузка…
Ссылка в новой задаче