Mirror of https://github.com/Azure/aztk.git
Feature: spark submit scheduling internal (#674)
* add internal support for scheduling_target cluster submit
* add internal support for scheduling target job submission
* add cli flag
This commit is contained in:
Parent: 18b74e47d5
Commit: 8c2bf0c1a6
@@ -73,7 +73,7 @@ def schedule_with_target(scheduling_target, task_sas_urls):
             format(task_working_dir, aztk_cluster_id, task_sas_url, constants.SPARK_SUBMIT_LOGS_FILE))
         node_id = select_scheduling_target_node(config.spark_client.cluster, config.pool_id, scheduling_target)
         node_run_output = config.spark_client.cluster.node_run(
-            config.pool_id, node_id, task_cmd, timeout=120, block=False)
+            config.pool_id, node_id, task_cmd, timeout=120, block=False, internal=True)
         # block job_manager_task until scheduling_target task completion
         wait_until_tasks_complete(aztk_cluster_id)

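Note on the hunk above: this is the node-side scheduler, where the job manager task itself runs on a cluster node, so the hop to the target node can always use the pool's private network; that is why internal=True is hard-coded here instead of being threaded through as in the client-side path below. As a rough sketch of what an internal/public address switch usually looks like in an SSH-based node_run helper (names are illustrative assumptions, not code from this repo):

    # Illustrative sketch only, not part of this diff.
    def pick_connection_target(node, remote_login_settings, internal: bool):
        if internal:
            # Private IP: reachable only from inside the cluster's VNET
            return node.ip_address, 22
        # Public endpoint exposed by Azure Batch (NAT'ed SSH port)
        return remote_login_settings.ip_address, remote_login_settings.port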
@@ -40,7 +40,15 @@ def select_scheduling_target_node(spark_cluster_operations, cluster_id, scheduling_target):
     return cluster.master_node_id


-def schedule_with_target(core_cluster_operations, spark_cluster_operations, cluster_id, scheduling_target, task, wait):
+def schedule_with_target(
+        core_cluster_operations,
+        spark_cluster_operations,
+        cluster_id,
+        scheduling_target,
+        task,
+        wait,
+        internal,
+):
     # upload "real" task definition to storage
     serialized_task_resource_file = upload_serialized_task_to_storage(core_cluster_operations.blob_client, cluster_id,
                                                                       task)
@@ -65,7 +73,8 @@ def schedule_with_target(core_cluster_operations, spark_cluster_operations, cluster_id, scheduling_target, task, wait):
             format(task_working_dir, cluster_id, serialized_task_resource_file.blob_source,
                    constants.SPARK_SUBMIT_LOGS_FILE))
     node_id = select_scheduling_target_node(spark_cluster_operations, cluster_id, scheduling_target)
-    node_run_output = spark_cluster_operations.node_run(cluster_id, node_id, task_cmd, timeout=120, block=wait)
+    node_run_output = spark_cluster_operations.node_run(
+        cluster_id, node_id, task_cmd, timeout=120, block=wait, internal=internal)


 def get_cluster_scheduling_target(core_cluster_operations, cluster_id):
@@ -80,6 +89,7 @@ def submit_application(
         application,
         remote: bool = False,
         wait: bool = False,
+        internal: bool = False,
 ):
     """
     Submit a spark app
@@ -90,7 +100,7 @@ def submit_application(
     scheduling_target = get_cluster_scheduling_target(core_cluster_operations, cluster_id)
     if scheduling_target is not models.SchedulingTarget.Any:
         schedule_with_target(core_cluster_operations, spark_cluster_operations, cluster_id, scheduling_target, task,
-                             wait)
+                             wait, internal)
     else:
         # Add task to batch job (which has the same name as cluster_id)
         core_cluster_operations.batch_client.task.add(job_id=cluster_id, task=task)
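The branch above is the core of scheduling-target support: any value other than SchedulingTarget.Any bypasses the Azure Batch scheduler and runs spark-submit directly on a node chosen by select_scheduling_target_node, while Any keeps the default path of adding a Batch task to the cluster's job. A minimal standalone sketch of that dispatch, assuming the Any/Master/Dedicated members of aztk.spark.models.SchedulingTarget:

    # Standalone sketch; this SchedulingTarget mirrors the one assumed to be
    # in aztk.spark.models. The callables stand in for schedule_with_target
    # and batch_client.task.add.
    from enum import Enum

    class SchedulingTarget(Enum):
        Any = "any"
        Master = "master"
        Dedicated = "dedicated"

    def dispatch(target, run_on_node, add_to_batch_job):
        if target is not SchedulingTarget.Any:
            run_on_node()       # place the task ourselves on master/dedicated
        else:
            add_to_batch_job()  # let the Azure Batch scheduler place it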
@@ -107,9 +117,10 @@ def submit(
         application: models.ApplicationConfiguration,
         remote: bool = False,
         wait: bool = False,
-        scheduling_target: str = None,
+        internal: bool = False,
 ):
     try:
-        submit_application(core_cluster_operations, spark_cluster_operations, cluster_id, application, remote, wait)
+        submit_application(core_cluster_operations, spark_cluster_operations, cluster_id, application, remote, wait,
+                           internal)
     except BatchErrorException as e:
         raise error.AztkError(helpers.format_batch_exception(e))
@@ -63,7 +63,14 @@ class ClusterOperations(SparkBaseOperations):
         """
         return list.list_clusters(self._core_cluster_operations)

-    def submit(self, id: str, application: models.ApplicationConfiguration, remote: bool = False, wait: bool = False):
+    def submit(
+            self,
+            id: str,
+            application: models.ApplicationConfiguration,
+            remote: bool = False,
+            wait: bool = False,
+            internal: bool = False,
+    ):
         """Submit an application to a cluster.

         Args:
@@ -72,13 +79,16 @@ class ClusterOperations(SparkBaseOperations):
             remote (:obj:`bool`): If True, the application file will not be uploaded, it is assumed to be reachable
                 by the cluster already. This is useful when your application is stored in a mounted Azure File Share
                 and not the client. Defaults to False.
+            internal (:obj:`bool`): if True, this will connect to the node using its internal IP.
+                Only use this if running within the same VNET as the cluster. This only applies if the cluster's
+                SchedulingTarget is not set to SchedulingTarget.Any. Defaults to False.
             wait (:obj:`bool`, optional): If True, this function blocks until the application has completed.
                 Defaults to False.

         Returns:
             :obj:`None`
         """
-        return submit.submit(self._core_cluster_operations, self, id, application, remote, wait)
+        return submit.submit(self._core_cluster_operations, self, id, application, remote, wait, internal)

     def create_user(self, id: str, username: str, password: str = None, ssh_key: str = None):
         """Create a user on every node in the cluster
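Read together with the docstring above, an SDK call that uses the new parameter would look roughly like this (a sketch: the client construction and the ApplicationConfiguration fields are abbreviated assumptions; only internal comes from this diff):

    import aztk.spark

    # secrets_configuration is assumed to be loaded elsewhere
    client = aztk.spark.Client(secrets_configuration)

    app = aztk.spark.models.ApplicationConfiguration(
        name="my-app",
        application="./my-app.py",
    )

    # internal=True requires running inside the cluster's VNET and only takes
    # effect when the cluster's SchedulingTarget is not SchedulingTarget.Any.
    client.cluster.submit(id="my-cluster", application=app, internal=True)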
@@ -81,6 +81,12 @@ def setup_parser(parser: argparse.ArgumentParser):
             already accessible at the given path",
     )

+    parser.add_argument(
+        "--internal",
+        action="store_true",
+        help="Connect using the local IP of the master node. Only use if using a VPN.",
+    )
+
     parser.add_argument(
         "app",
         help="App jar OR python file to execute. A path to a local "
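With the flag registered, a submission from a machine on the cluster's VNET (for example over VPN) would be invoked along these lines; the surrounding command shape is assumed from the existing cluster submit parser rather than shown in this diff:

    aztk spark cluster submit --id my-cluster --name my-app --internal ./my-app.py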
@@ -133,6 +139,7 @@ def execute(args: typing.NamedTuple):
             max_retry_count=args.max_retry_count,
         ),
         remote=args.remote,
+        internal=args.internal,
         wait=False,
     )
