diff --git a/.editorconfig b/.editorconfig index e93c4ff4..cb003c72 100644 --- a/.editorconfig +++ b/.editorconfig @@ -3,6 +3,7 @@ indent_style = space indent_size = 4 insert_final_newline = true trim_trailing_whitespace = true +end_of_line = lf [*.{json,yml,yaml}] indent_size = 2 diff --git a/.gitignore b/.gitignore index 409db3be..4f27897a 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,7 @@ tmp/ # PyTest .cache/ + + +# Built docs +docs/_build/ diff --git a/.vscode/settings.json b/.vscode/settings.json index 790874c0..1ff1c625 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -15,6 +15,5 @@ "python.formatting.yapfArgs": [ "--style=.style.yapf" ], - "python.venvPath": "${workspaceFolder}/ENV", "python.pythonPath": "${workspaceFolder}\\ENV\\Scripts\\python.exe" } diff --git a/aztk/client.py b/aztk/client.py index 584a66a5..e094f28e 100644 --- a/aztk/client.py +++ b/aztk/client.py @@ -61,7 +61,7 @@ class Client: if pool_exists: self.batch_client.pool.delete(pool_id) - + if not keep_logs: cluster_data = self._get_cluster_data(pool_id) cluster_data.delete_container(pool_id) diff --git a/aztk/error.py b/aztk/error.py index 6cb6a6c8..88b2f0c3 100644 --- a/aztk/error.py +++ b/aztk/error.py @@ -4,10 +4,8 @@ All error should inherit from `AztkError` """ - class AztkError(Exception): - def __init__(self, message: str=None): - super().__init__(message) + pass class ClusterNotReadyError(AztkError): pass diff --git a/aztk/internal/cluster_data/__init__.py b/aztk/internal/cluster_data/__init__.py index bebc623f..b0896dd6 100644 --- a/aztk/internal/cluster_data/__init__.py +++ b/aztk/internal/cluster_data/__init__.py @@ -1,3 +1,3 @@ -from .blob_data import * -from .node_data import * -from .cluster_data import * +from .blob_data import BlobData +from .node_data import NodeData +from .cluster_data import ClusterData diff --git a/aztk/internal/cluster_data/cluster_data.py b/aztk/internal/cluster_data/cluster_data.py index c2355989..86659781 100644 --- a/aztk/internal/cluster_data/cluster_data.py +++ b/aztk/internal/cluster_data/cluster_data.py @@ -1,7 +1,6 @@ -import yaml import logging +import yaml import azure.common -from azure.storage.blob import BlockBlobService from .node_data import NodeData from .blob_data import BlobData @@ -15,7 +14,7 @@ class ClusterData: APPLICATIONS_DIR = "applications" CLUSTER_CONFIG_FILE = "config.yaml" - def __init__(self, blob_client: BlockBlobService, cluster_id: str): + def __init__(self, blob_client, cluster_id: str): self.blob_client = blob_client self.cluster_id = cluster_id self._ensure_container() diff --git a/aztk/internal/cluster_data/node_data.py b/aztk/internal/cluster_data/node_data.py index 47641ab3..64250e53 100644 --- a/aztk/internal/cluster_data/node_data.py +++ b/aztk/internal/cluster_data/node_data.py @@ -5,7 +5,7 @@ import zipfile from pathlib import Path from typing import List import yaml -from aztk.spark import models +from aztk import models from aztk.utils import constants, file_utils, secure_utils from aztk.error import InvalidCustomScriptError @@ -61,10 +61,12 @@ class NodeData: for file in file_paths: self.add_file(file, zip_dir, binary) - def add_dir(self, path: str, dest: str = None, exclude: List[str] = []): + def add_dir(self, path: str, dest: str = None, exclude: List[str] = None): """ Zip all the files in the given directory into the zip file handler """ + exclude = exclude or [] + for base, _, files in os.walk(path): relative_folder = os.path.relpath(base, path) for file in files: @@ -156,7 +158,8 @@ class NodeData: 
 def _add_node_scripts(self): self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), NODE_SCRIPT_FOLDER, exclude=['*.pyc*']) - def _includeFile(self, filename: str, exclude: List[str] = []) -> bool: + def _includeFile(self, filename: str, exclude: List[str]) -> bool: + exclude = exclude or [] for pattern in exclude: if fnmatch.fnmatch(filename, pattern): return False diff --git a/aztk/spark/__init__.py b/aztk/spark/__init__.py index a4807b9f..3ff722bf 100644 --- a/aztk/spark/__init__.py +++ b/aztk/spark/__init__.py @@ -1,2 +1 @@ -from .models import * from .client import Client diff --git a/aztk/spark/client.py b/aztk/spark/client.py index d706f1ea..3ff9f9b1 100644 --- a/aztk/spark/client.py +++ b/aztk/spark/client.py @@ -15,13 +15,27 @@ from aztk.internal.cluster_data import NodeData class Client(BaseClient): + """ + Aztk Spark Client + This is the main entry point for using aztk for spark + + Args: + secrets_config(aztk.spark.models.models.SecretsConfiguration): Configuration with all the needed credentials + """ def __init__(self, secrets_config): super().__init__(secrets_config) - ''' - Spark client public interface - ''' def create_cluster(self, cluster_conf: models.ClusterConfiguration, wait: bool = False): + """ + Create a new aztk spark cluster + + Args: + cluster_conf(aztk.spark.models.models.ClusterConfiguration): Configuration for the cluster to be created + wait(bool): Whether to wait for the cluster to be ready before returning + + Returns: + aztk.spark.models.Cluster + """ cluster_conf.validate() cluster_data = self._get_cluster_data(cluster_conf.cluster_id) try: diff --git a/aztk/spark/models/__init__.py b/aztk/spark/models/__init__.py index cf4f59d6..aed4fa32 100644 --- a/aztk/spark/models/__init__.py +++ b/aztk/spark/models/__init__.py @@ -1 +1 @@ -from .models import * \ No newline at end of file +from .models import * diff --git a/aztk/spark/models/plugins/__init__.py b/aztk/spark/models/plugins/__init__.py index 73fe3f3b..ea603f51 100644 --- a/aztk/spark/models/plugins/__init__.py +++ b/aztk/spark/models/plugins/__init__.py @@ -1,6 +1,6 @@ -from .hdfs import * -from .jupyter import * -from .jupyter_lab import * -from .rstudio_server import * -from .simple import * -from .spark_ui_proxy import * +from .hdfs import HDFSPlugin +from .jupyter import JupyterPlugin +from .jupyter_lab import JupyterLabPlugin +from .rstudio_server import RStudioServerPlugin +from .simple import SimplePlugin +from .spark_ui_proxy import SparkUIProxyPlugin diff --git a/docs/00-getting-started.md b/docs/00-getting-started.md index 2ec9a352..147e519a 100644 --- a/docs/00-getting-started.md +++ b/docs/00-getting-started.md @@ -1,14 +1,11 @@ -# Azure Distributed Data Engineering Toolkit -The Azure Distributed Data Engineering Toolkit is a project that allows Sparks users to easily spin up a Spark cluster in Azure. - -## Getting Started +# Getting Started The minimum requirements to get started with this package are: - Python 3.5+, pip 9.0.1+ - An Azure account - An Azure Batch account - An Azure Storage account -### Cloning and installing the project +## Cloning and installing the project 1. Clone the repo 2. Make sure you are running python 3.5 or greater. _If the default version on your machine is python 2 make sure to run the following commands with **pip3** instead of **pip**._ @@ -40,12 +37,12 @@ The minimum requirements to get started with this package are: This will put default configuration files in your home directory, *~/*.
Please note that configuration files in your current working directory will take precident over global configuration files in your home directory. -### Setting up your accounts +## Setting up your accounts -#### Using the account setup script -A script to create and configure the Azure resources required to use `aztk` is provided. For more more information and usage, see [Getting Started Script](./01-getting-started-script.md). +### Using the account setup script +A script to create and configure the Azure resources required to use `aztk` is provided. For more more information and usage, see [Getting Started Script](01-getting-started-script.html) -#### Manual resource creation +### Manual resource creation To finish setting up, you need to fill out your Azure Batch and Azure Storage secrets in *.aztk/secrets.yaml*. We'd also recommend that you enter SSH key info in this file too. Please note that if you use ssh keys and a have a non-standard ssh key file name or path, you will need to specify the location of your ssh public and private keys. To do so, set them as shown below: @@ -64,7 +61,7 @@ If you do not already have an Azure account, go to [https://azure.microsoft.com/ Once you have one, simply log in and go to the [Azure Portal](https://portal.azure.com) to start creating your Azure Batch account and Azure Storage account. -##### Using AAD +#### Using AAD To get the required keys for your Azure Active Directory (AAD) Service Principal, Azure Batch Account and Azure Storage Account, please follow these instructions. Note that this is the recommended path for use with AZTK, as some features require AAD and are disabled if using Shared Key authentication. 1. Register an Azure Active Directory (AAD) Application @@ -135,7 +132,7 @@ service_principal: storage_account_resource_id: ``` -#### Using Shared Keys +### Using Shared Keys _Please note that using Shared Keys prevents the use of certain AZTK features including Mixed Mode clusters and support for VNETs._ To get the required keys for Azure Batch and Azure Storage, please follow the below instructions: @@ -167,13 +164,13 @@ To get the required keys for Azure Batch and Azure Storage, please follow the be - Go to the accounts in the Azure portal and copy pase the account names, keys and other information needed into the secrets file. -#### Storage account +### Storage account For the Storage account, copy the name and one of the two keys: ![](./misc/Storage_secrets.png) -#### Batch account +### Batch account For the Batch account, copy the name, the url and one of the two keys: @@ -181,5 +178,5 @@ For the Batch account, copy the name, the url and one of the two keys: ## Next Steps -- [Create a cluster](./10-clusters.md) -- [Run a Spark job](./20-spark-submit.md) +- [Create a cluster](10-clusters.html) +- [Run a Spark job](./20-spark-submit.html) diff --git a/docs/01-getting-started-script.md b/docs/01-getting-started-script.md index 944415f8..915cb792 100644 --- a/docs/01-getting-started-script.md +++ b/docs/01-getting-started-script.md @@ -9,7 +9,7 @@ The script will create and configure the following resources: - Azure Active Directory application and service principal -The script outputs all of the necessary information to use `aztk`, just copy the output into the `.aztk/secrets.yaml` file created when running `aztk spark init`. +The script outputs all of the necessary information to use `aztk`, just copy the output into the `.aztk/secrets.yaml` file created when running `aztk spark init`. 
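For SDK users, the credentials the script prints (the same values that go into `.aztk/secrets.yaml`) can also be supplied in code. A minimal sketch, mirroring the patch's `docs/sdk-examples.md`; every value below is a placeholder to be replaced with the script's output:

```python
import aztk.spark

# All values are placeholders: substitute the output of the account setup script.
secrets = aztk.spark.models.SecretsConfiguration(
    service_principal=aztk.spark.models.ServicePrincipalConfiguration(
        tenant_id="<tenant>.onmicrosoft.com",
        client_id="<application-id>",
        credential="<application-credential>",
        batch_account_resource_id="<batch-account-resource-id>",
        storage_account_resource_id="<storage-account-resource-id>",
    ),
    ssh_pub_key="<ssh-public-key>",  # optional
)

# The client is the entry point for all SDK operations.
client = aztk.spark.Client(secrets)
```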
## Usage Copy and paste the following into an [Azure Cloud Shell](https://shell.azure.com): @@ -41,4 +41,4 @@ service_principal: Copy the entire `service_principal` section in your `.aztk/secrets.yaml`. If you do not have a `secrets.yaml` file, you can create one in your current working directory by running `aztk spark init`. -Now you are ready to create your first `aztk` cluster. See [Creating a Cluster](./10-clusters.md#creating-a-cluster). +Now you are ready to create your first `aztk` cluster. See [Creating a Cluster](./10-clusters.html#creating-a-cluster). diff --git a/docs/10-clusters.md b/docs/10-clusters.md index df0a0e98..36ab6d42 100644 --- a/docs/10-clusters.md +++ b/docs/10-clusters.md @@ -4,7 +4,7 @@ In the Azure Distributed Data Engineering Toolkit, a cluster is primarily design ## Creating a Cluster Creating a Spark cluster only takes a few simple steps after which you will be able to SSH into the master node of the cluster and interact with Spark. You will be able to view the Spark Web UI, Spark Jobs UI, submit Spark jobs (with *spark-submit*), and even interact with Spark in a Jupyter notebook. -For the advanced user, please note that the default cluster settings are preconfigured in the *.aztk/cluster.yaml* file that is generated when you run `aztk spark init`. More information on cluster config [here.](./13-configuration.md) +For the advanced user, please note that the default cluster settings are preconfigured in the *.aztk/cluster.yaml* file that is generated when you run `aztk spark init`. More information on cluster config [here.](./13-configuration.html) ### Commands Create a Spark cluster: @@ -33,7 +33,7 @@ You can create clusters with a mixed of [low-priority](https://docs.microsoft.co Please note, to use Mixed Mode clusters, you need to authenticate using Azure Active Directory (AAD) by configuring the Service Principal in `.aztk/secrets.yaml`. You also need to create a [Virtual Network \(VNET\)](https://azure.microsoft.com/en-us/services/virtual-network/), and provide the resource ID to a Subnet within the VNET in your ./aztk/cluster.yaml` configuration file. #### Setting your Spark and/or Python versions -By default, the Azure Distributed Data Engineering Toolkit will use **Spark v2.2.0** and **Python v3.5.4**. However, you can set your Spark and/or Python versions by [configuring the base Docker image used by this package](./12-docker-image.md). +By default, the Azure Distributed Data Engineering Toolkit will use **Spark v2.2.0** and **Python v3.5.4**. However, you can set your Spark and/or Python versions by [configuring the base Docker image used by this package](./12-docker-image.html). ### Listing clusters You can list all clusters currently running in your account by running @@ -161,9 +161,9 @@ __Please be careful sharing the output of the `debug` command as secrets and app ### Interact with your Spark cluster -By default, the `aztk spark cluster ssh` command port forwards the Spark Web UI to *localhost:8080*, Spark Jobs UI to *localhost:4040*, and Spark History Server to your *locahost:18080*. This can be [configured in *.aztk/ssh.yaml*](../docs/13-configuration.md##sshyaml). +By default, the `aztk spark cluster ssh` command port forwards the Spark Web UI to *localhost:8080*, Spark Jobs UI to *localhost:4040*, and Spark History Server to your *locahost:18080*. This can be [configured in *.aztk/ssh.yaml*](../docs/13-configuration.html#sshyaml). 
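The cluster commands above also have SDK counterparts. A rough sketch, not part of this patch, assuming `secrets` is a `SecretsConfiguration` built as in the SDK samples; the cluster id, node count, and VM size are illustrative:

```python
import aztk.spark

# Assumes `secrets` is an aztk.spark.models.SecretsConfiguration (see the SDK samples).
client = aztk.spark.Client(secrets)

# Roughly what `aztk spark cluster create` does.
cluster_config = aztk.spark.models.ClusterConfiguration(
    cluster_id="spark",       # illustrative cluster id
    vm_count=2,               # dedicated nodes
    vm_size="standard_f2",    # illustrative VM size
)
cluster = client.create_cluster(cluster_config, wait=True)

# Run a command on, or copy a file to, every node in the cluster.
client.cluster_run(cluster.id, "echo hello from a node")
client.cluster_copy(cluster.id, "data/input.csv", "/mnt/input.csv")  # placeholder paths

# Delete the cluster; keep_logs=True preserves the application logs in storage.
client.delete_cluster(cluster.id, keep_logs=True)
```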
## Next Steps -- [Run a Spark job](./20-spark-submit.md) -- [Configure the Spark cluster using custom commands](./11-custom-scripts.md) -- [Bring your own Docker image or choose between a variety of our supported base images to manage your Spark and Python versions](./12-docker-image.md) +- [Run a Spark job](20-spark-submit.html) +- [Configure the Spark cluster using custom commands](11-custom-scripts.html) +- [Bring your own Docker image or choose between a variety of our supported base images to manage your Spark and Python versions](12-docker-image.html) diff --git a/docs/13-configuration.md b/docs/13-configuration.md index 1f22afcd..9bb8bdce 100644 --- a/docs/13-configuration.md +++ b/docs/13-configuration.md @@ -22,7 +22,7 @@ size: 2 # username: (optional) username: spark -# docker_repo: +# docker_repo: docker_repo: aztk/base:spark2.2.0 # custom_script: (optional) @@ -102,14 +102,14 @@ spark.eventLog.dir spark.history.fs.logDirectory ``` -Please note that the path for `spark.eventLog.dir` and `spark.history.fs.logDirectory` should most likely match so that the history server reads the logs that each Spark job writes. Also note that while the paths can be local (`file:/`), it is recommended that the paths be accessible by every node in the cluster so that the history server, which runs on the Spark master node, has access to all application logs. HDFS, WASB, ADL, or any other Hadoop API compliant storage system may be used. +Please note that the path for `spark.eventLog.dir` and `spark.history.fs.logDirectory` should most likely match so that the history server reads the logs that each Spark job writes. Also note that while the paths can be local (`file:/`), it is recommended that the paths be accessible by every node in the cluster so that the history server, which runs on the Spark master node, has access to all application logs. HDFS, WASB, ADL, or any other Hadoop API compliant storage system may be used. -If using WASB, ADL or other cloud storage services, be sure to set your keys in `.aztk/core-site.xml`. For more information, see the [Cloud Storage](./30-cloud-storage.md) documentation. +If using WASB, ADL or other cloud storage services, be sure to set your keys in `.aztk/core-site.xml`. For more information, see the [Cloud Storage](./30-cloud-storage.html) documentation. ## Configuring Spark Storage -The Spark cluster can be configured to use different cloud supported storage offerrings (such as Azure Storage Blobs, Azure Data Lake Storage, or any other supported Spark file system). More information can be found in the [Cloud Storage](./30-cloud-storage.md) documentation. +The Spark cluster can be configured to use different cloud supported storage offerrings (such as Azure Storage Blobs, Azure Data Lake Storage, or any other supported Spark file system). More information can be found in the [Cloud Storage](./30-cloud-storage.html) documentation. 
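When defining a cluster through the SDK rather than the CLI, the same configuration files described in this section (`spark-defaults.conf`, `spark-env.sh`, `core-site.xml`, and extra JARs) can be attached to the cluster definition. A sketch, assuming the files sit in a local `.aztk/` directory; the paths are illustrative:

```python
import os
import aztk.spark

conf_dir = os.path.join(os.path.expanduser("~"), ".aztk")  # illustrative location of the config files
jars_dir = os.path.join(conf_dir, "jars")

spark_conf = aztk.spark.models.SparkConfiguration(
    spark_defaults_conf=os.path.join(conf_dir, "spark-defaults.conf"),
    spark_env_sh=os.path.join(conf_dir, "spark-env.sh"),
    core_site_xml=os.path.join(conf_dir, "core-site.xml"),
    # Upload every JAR found in .aztk/jars, if that folder exists.
    jars=[os.path.join(jars_dir, jar) for jar in os.listdir(jars_dir)] if os.path.isdir(jars_dir) else [],
)

cluster_config = aztk.spark.models.ClusterConfiguration(
    cluster_id="spark",
    vm_count=2,
    vm_size="standard_f2",
    spark_configuration=spark_conf,
)
```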
## Placing JARS @@ -129,5 +129,5 @@ Note: _This tool automatically registers several JARS for default cloud storage ## Next Steps -- [Add plugins](./15-plugins.md) -- [Set up your Cloud Storage](./30-cloud-storage.md) +- [Add plugins](./15-plugins.html) +- [Set up your Cloud Storage](./30-cloud-storage.html) diff --git a/docs/20-spark-submit.md b/docs/20-spark-submit.md index b67f8055..faf2ebc7 100644 --- a/docs/20-spark-submit.md +++ b/docs/20-spark-submit.md @@ -17,7 +17,7 @@ aztk spark cluster submit --id spark --name pipy examples/src/main/python/pi.py NOTE: The job name (--name) must be atleast 3 characters long, can only contain alphanumeric characters including hyphens but excluding underscores, and cannot contain uppercase letters. Each job you submit **must** have a unique name. ## Monitoring job -If you have set up a [SSH tunnel](./10-clusters.md#ssh-and-port-forwarding) with port fowarding, you can naviate to http://localhost:8080 and http://localhost:4040 to view the progess of the job using the Spark UI +If you have set up a [SSH tunnel](./10-clusters.html#ssh-and-port-forwarding) with port fowarding, you can naviate to http://localhost:8080 and http://localhost:4040 to view the progess of the job using the Spark UI ## Getting output logs diff --git a/docs/50-sdk.md b/docs/50-sdk.md deleted file mode 100644 index dfed5686..00000000 --- a/docs/50-sdk.md +++ /dev/null @@ -1,752 +0,0 @@ -# SDK - - -Operationalize AZTK with the provided Python SDK. - -Find some samples and getting stated tutorial in the `examples/sdk/` directory of the repository. - -## Public Interface - -### Client - -- `cluster_copy(self, cluster_id: str, source_path: str, destination_path: str, internal: bool = False)` - - Copy a file to every node in the given cluster - - Parameters: - - cluster_id: str - - the id of the cluster - - source_path: str - - the local path to the file to be copied - - destination_path: str - - the path (including the file name) that the file should be placed on each node. - - internal: bool - - if True, connects to the cluster using the local IP. Only set to True if the node's internal IP address is resolvable by the client. - -- `cluster_run(self, cluster_id: str, command: str, internal: bool = False)` - - Run a command on every node in the given cluster - - Parameters: - - cluster_id: str - - the id of the cluster - - command: str - - the command to run on each node - - internal: bool - - if True, connects to the cluster using the local IP. Only set to True if the node's internal IP address is resolvable by the client. - -- `create_cluster(self, cluster_conf: aztk.spark.models.ClusterConfiguration, wait=False)` - - Create an AZTK cluster with the given cluster configuration - - Parameters: - - - cluster_conf: models.ClusterConfiguration - - the definition of the cluster to create - - wait: bool = False - - If true, block until the cluster is running, else return immediately - - Returns: - - - aztk.spark.models.Cluster - -- `create_clusters_in_parallel(self, cluster_confs: List[aztk.models.ClusterConfiguration])` - - Create an AZTK clusters with the given list of cluster configurations - - Parameters: - - - cluster_confs: List[aztk.models.ClusterConfiguration] - - Returns: - - - None - -- `delete_cluster(self, cluster_id: str, keep_logs: bool = False)` - - Delete an AZTK cluster with the given ID - - Parameters: - - - cluster_id: str - - The ID of the cluster to delete - - keep_logs: bool - - If true, the logs associated with this cluster will not be deleted. 
- - Returns: - - - None - -- `get_cluster(self, cluster_id: str)` - - Retrieve detailed information about the cluster with the given ID - - Parameters: - - - cluster_id - - the ID of the cluster to get - - Returns: - - - aztk.models.Cluster() - - -- `list_clusters(self)` - Retrieve a list of existing AZTK clusters. - - Returns: - - - List[aztk.models.Cluster] - -- `get_remote_login_settings(self, cluster_id: str, node_id: str)` - - Return the settings required to login to a node - - Parameters: - - - cluster_id: str - The cluster to login to - - node_id: str - The node to login to - Returns: - - - aztk.spark.models.RemoteLogin - -- `submit(self, cluster_id: str, application: aztk.spark.models.Application)` - - Parameters: - - - cluster_id: str - The cluster that the application is submitted to - - application: aztk.spark.models.Application - The application to submit - - Returns: - - - None - -- `submit_all_applications(self, cluster_id: str, applications: List[aztk.spark.models.Application])` - - Submit a list of applications to be exected on a cluster - - Parameters: - - - cluster_id: str - The cluster that the applications are submitted to - - applications: List[aztk.spark.models.Application] - List of applications to submit - Returns: - - - None - -- `wait_until_application_done(self, cluster_id: str, task_id: str)` - - Block until the given application has completed on the given cluster - - Parameters: - - - cluster_id: str - The cluster on which the application is running - - task_id - The application to wait for - Returns: - - - None - -- `wait_until_applications_done(self, cluster_id: str)` - - Block until all applications on the given cluster are completed - - Parameters: - - - cluster_id: str - The cluster on which the application is running - - Returns: - - - None - -- `wait_until_cluster_is_ready(self, cluster_id: str)` - - - Block until the given cluster is running - - Parameters: - - - cluster_id: str - The ID of the cluster to wait for - - Returns: - - - aztk.spark.models.Cluster - - -- `wait_until_all_clusters_are_ready(self, clusters: List[str])` - - Wait until all clusters in the given list are ready - - Parameters: - - - clusters: List[str] - A list of the IDs of all the clusters to wait for - - Returns: - - - None - - - `create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None)` - - Create a user on the given cluster - - Parameters: - - - cluster_id: List[str] - The cluster on which to create the user - - - password: str - The password to create the user with (mutually exclusive with ssh_key) - - - ssh_key: str - The ssh_key to create the user with (mutually exclusive with password) - - Returns: - - - None - - -- `get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0)` - - Get the logs of a completed or currently running application - - Parameters: - - - cluster_id: str - The id of the cluster on which the application ran or is running. - - - application_name: str - The name of the application to retrieve logs for - - - tail: bool - Set to true if you want to get only the newly added data after current_bytes. - - - current_bytes: int - The amount of bytes already retrieved. To get the entire log, leave this at 0. If you are streaming, set this to the current number of bytes you have already retrieved, so you only retrieve the newly added bytes. 
- - Returns: - - - aztk.spark.models.ApplicationLog - -- `get_application_status(self, cluster_id: str, app_name: str)` - - Get the status of an application - - Parameters: - - cluster_id: str - The id of the cluster to which the app was submitted - - - app_name - the name of the application in question - - Returns: - - - str - - -- `submit_job(self, job_configuration)` - - Submit an AZTK Spark Job - - Parameters: - - - job_configuration: aztk.spark.models.JobConfiguration - The configuration of the job to be submitted - - Returns: - - - aztk.spark.models.Job - -- `list_jobs(self)` - - List all created AZTK Spark Jobs - - Parameters: - - - job_configuration: aztk.spark.models.JobConfiguration - The configuration of the job to be submitted - - Returns: - - - List[aztk.spark.models.Job] - -- `list_applicaitons(self, job_id)` - - List all applications created on the AZTK Spark Job with id job_id - - Parameters: - - - job_id: str - The id of the Job - - Returns: - - - Dict{str: aztk.spark.models.Application or None} - - the key is the name of the application - - the value is None if the application has not yet been scheduled or an Application model if it has been scheduled - -- `get_job(self, job_id)` - - Get information about the AZTK Spark Job with id job_id - - Parameters: - - - job_id: str - The id of the Job - - Returns: - - - List[aztk.spark.models.Job] - -- `stop_job(self, job_id)` - - Stop the AZTK Spark Job with id job_id - - Parameters: - - - job_id: str - The id of the Job - - Returns: - - - None - -- `delete_job(self, job_id, keep_logs: bool = False)` - - Delete the AZTK Spark Job with id job_id - - Parameters: - - - job_id: str - The id of the Job - - keep_logs: bool - - If true, the logs associated with this Job will not be deleted. - - Returns: - - - bool - -- `get_application(self, job_id, application_name)` - - Get information about an AZTK Spark Job's application - - Parameters: - - - job_id: str - The id of the Job - - application_name: str - The name of the Application - - Returns: - - - aztk.spark.models.Application - -- `get_job_application_log(self, job_id, application_name)` - - Get the log of an AZTK Spark Job's application - - - Parameters: - - - job_id: str - The id of the Job - - application_name: str - The name of the Application - - Returns: - - - aztk.spark.models.ApplicationLog - - -- `stop_job_app(self, job_id, application_name)` - - Stop an Application running on an AZTK Spark Job - - Parameters: - - - job_id: str - The id of the Job - - application_name: str - The name of the Application - - Returns: - - - None - - -- `wait_until_job_finished(self, job_id)` - - Wait until the AZTK Spark Job with id job_id is complete - - Parameters: - - - job_id: str - The id of the Job - - application_name: str - The name of the Application - - Returns: - - - None - - -- `wait_until_all_jobs_finished(self, jobs)` - - Wait until all of the given AZTK Spark Jobs are complete - - Parameters: - - - jobs: List[str] - The ids of the Jobs to wait for - - Returns: - - - None - - - -### Models - - -- `Application` - - The definition of an AZTK Spark Application as it exists in the cloud. Please note that this object is not used to configure Applications, only to read information about existing Applications. Please see ApplicationConfiguration if you are trying to create an Application. 
- - Fields: - - - name: str - - last_modified: datetime - - creation_time: datetime - - state: str - - state_transition_time: datetime - - previous_state: str - - previous_state_transition_time: datetime - - exit_code: int - - - - -- `ApplicationConfiguration` - - Define a Spark application to run on a cluster. - - Fields: - - - name: str - Unique identifier for the application. - - - application: str - Path to the application that will be executed. Can be jar or python file. - - - application_args: [str] - List of arguments for the application - - - main_class: str - The application's main class. (Only applies to Java/Scala) - - - jars: [str] - Additional jars to supply for the application. - - - py_files: [str] - Additional Python files to supply for the application. Can be .zip, .egg, or .py files. - - files: [str] - Additional files to supply for the application. - - - driver_java_options: str - Extra Java options to pass to the driver. - - - driver_library_path: str - Extra library path entries to pass to the driver. - - - driver_class_path: str - Extra class path entries to pass to the driver. Note that jars added with --jars are automatically included in the classpath. - - - driver_memory: str - Memory for driver (e.g. 1000M, 2G) (Default: 1024M). - - - executor_memory: str - Memory per executor (e.g. 1000M, 2G) (Default: 1G). - - - driver_cores: str - Cores for driver (Default: 1). - - - executor_cores: str - Number of cores per executor. (Default: All available cores on the worker) - - - max_retry_count: int - Number of times the Spark job may be retried if there is a failure - -- `ApplicationLog` - - Holds the logged data from a spark application and metadata about the application and log. - - Fields: - - - name: str - - cluster_id: str - - log: str - - total_bytes: int - - application_state: str - - exit_code: str - - -- `Cluster` - - An AZTK cluster. Note that this model is not used to create a cluster, for that see `ClusterConfiguration`. - - Fields: - - - id: str - - The unique id of the cluster - - - pool: azure.batch.models.CloudPool - - A pool in the Azure Batch service. - - - nodes: azure.batch.models.ComputeNodePaged - - A paging container for iterating over a list of ComputeNode objects - - - vm_size: str - - The size of virtual machines in the cluster. All virtual machines in a cluster are the same size. For information about available sizes of virtual machines, see Sizes for Virtual Machines (Linux) (https://azure.microsoft.com/documentation/articles/virtual-machines-linux-sizes/). AZTK supports all Azure VM sizes except STANDARD_A0 and those with premium storage (STANDARD_GS, STANDARD_DS, and STANDARD_DSV2 series). - - - visible_state - - The current state of the cluster. Possible values are: - resizing = 'resizing' - steady = 'steady' - stopping = 'stopping' - active = 'active' - deleting = 'deleting' - upgrading = 'upgrading' - - - total_current_nodes - The total number of nodes currently allocated to the cluster. - - - total_target_nodes - The desired number of nodes in the cluster. Sum of target_dedicated_nodes and target_low_pri_nodes. - - - current_dedicated_nodes - The number of dedicated nodes currently in the cluster. - - - current_low_pri_nodes - The number of low-priority nodes currently in the cluster. Low-priority nodes which have been preempted are included in this count. - - - target_dedicated_nodes - The desired number of dedicated nodes in the cluster. - - - target_low_pri_nodes - The desired number of low-priority nodes in the cluster. 
- - - - `ClusterConfiguration` - - Define a Spark cluster to be created. - - Fields: - - - custom_scripts: [CustomScript] - A list of custom scripts to execute in the Spark Docker container. - - - cluster_id: str - A unique ID of the cluster to be created. The ID can contain any combination of alphanumeric characters including hyphens and underscores, and cannot contain more than 64 characters. The ID is case-preserving and case-insensitive (that is, you may not have two IDs within an account that differ only by case). - - - vm_count: int - The number of dedicated VMs (nodes) to be allocated to the cluster. Mutually exclusive with vm_low_pri_count. - - - vm_size: str - The size of virtual machines in the cluster. All virtual machines in a cluster are the same size. For information about available sizes of virtual machines, see Sizes for Virtual Machines (Linux) (https://azure.microsoft.com/documentation/articles/virtual-machines-linux-sizes/). AZTK supports all Azure VM sizes except STANDARD_A0 and those with premium storage (STANDARD_GS, STANDARD_DS, and STANDARD_DSV2 series). - - - vm_low_pri_count: int - The number of VMs (nodes) to be allocated to the cluster. Mutually exclusive with vm_count. - - - docker_repo: str - The docker repository and image to use. For more information, see [Docker Image](./12-docker-image.md). - - - spark_configuration: aztk.spark.models.SparkConfiguration - Configuration object for spark-specific values. - - - - `Custom Script` - - A script that executed in the Docker container of specified nodes in the cluster. - - Fields: - - - name: str - A unique name for the script - - script: str or aztk.spark.models.File - Path to the script to be run or File object - - run_on: str - Set which nodes the script should execute on. Possible values: - - all-nodes - master - worker - - Please note that by default, the Master node is also a worker node. - - -- `File` - - A File definition for programmatically defined configuration files. - - Fields: - - name: str - - payload: io.StringIO - - -- `JobConfiguration` - - Define an AZTK Job. - - Methods: - - - `__init__( - self, - id, - applications=None, - custom_scripts=None, - spark_configuration=None, - vm_size=None, - docker_repo=None, - max_dedicated_nodes=None, - subnet_id=None)` - - - Fields: - - - id: str - - applications: List[aztk.spark.models.ApplicationConfiguration] - - custom_scripts: str - - spark_configuration: aztk.spark.models.SparkConfiguration - - vm_size: int - - gpu_enabled: str - - docker_repo: str - - max_dedicated_nodes: str - - subnet_id: str - - -- `Job` - - Methods: - - `__init__(self, cloud_job_schedule: batch_models.CloudJobSchedule, cloud_tasks: List[batch_models.CloudTask] = None)` - - Fields: - - - id: str - - last_modified: datetime - - state: datetime - - state_transition_time: datetime - - applications: datetime - - - - - -- `SecretsConfiguration` - - The Batch, Storage, Docker and SSH secrets used to create AZTK clusters. For more help with setting these values see [Getting Started](./00-getting-started.md). - - Exactly one of `service_principal` and `shared_key` must be provided to this object. If both or none validation will fail. - - Fields: - service_principal: ServicePrincipalConfiguration - shared_key: SharedKeyConfiguration - docker: DockerConfiguration - - ssh_pub_key: str - ssh_priv_key: str - -- `ServicePrincipalConfiguration` - - Configuration needed to use aad auth. 
- - Fields: - tenant_id: str - client_id: str - credential: str - batch_account_resource_id: str - storage_account_resource_id: str - -- `SharedKeyConfiguration` - - Configuration needed to use shared key auth. - - Fields: - batch_account_name: str - batch_account_key: str - batch_service_url: str - storage_account_name: str - storage_account_key: str - storage_account_suffix: str - -- `DockerConfiguration` - - Configuration needed to use custom docker. - - Fields: - endpoint: str - username: str - password: str - -- `SparkConfiguration` - - Define cluster-wide Spark specific parameters. - - Fields: - - - spark_defaults_conf: str or aztk.spark.models.File - Path or File object defining spark_defaults.conf configuration file to be used. - - - spark_env_sh: str or aztk.spark.models.File - Path or File object defining spark_env.sh configuration file to be used. - - - core_site_xml: str or aztk.spark.models.File - Path or File object defining the core-site.xml configuration file to be used. - - - jars: [str or aztk.spark.models.File] - Paths to or File objects defining Additional Jars to be uploaded diff --git a/docs/60-gpu.md b/docs/60-gpu.md index 2c8dcf2e..ae7c2b3b 100644 --- a/docs/60-gpu.md +++ b/docs/60-gpu.md @@ -1,6 +1,6 @@ # GPU -Use GPUs to accelerate your Spark applications. When using a [GPU enabled Azure VM](https://azure.microsoft.com/en-us/pricing/details/batch/), your docker image will contain CUDA-8.0 and cuDnn-6.0 by default. See [Docker Image](./12-docker-image.md) for more information about the AZTK Docker images. +Use GPUs to accelerate your Spark applications. When using a [GPU enabled Azure VM](https://azure.microsoft.com/en-us/pricing/details/batch/), your docker image will contain CUDA-8.0 and cuDnn-6.0 by default. See [Docker Image](./12-docker-image.html) for more information about the AZTK Docker images. [NOTE: Azure does not have GPU enabled VMs in all regions. Please use this [link](https://azure.microsoft.com/en-us/pricing/details/batch/) to make sure that your Batch account is in a region that has GPU enabled VMs] diff --git a/docs/70-jobs.md b/docs/70-jobs.md index a98c2174..00f07ff6 100644 --- a/docs/70-jobs.md +++ b/docs/70-jobs.md @@ -13,25 +13,25 @@ Creating a Job starts with defining the necessary properties in your `.aztk/job. Each Job has one or more applications given as a List in Job.yaml. Applications are defined using the following properties: ```yaml applications: - - name: - application: - application_args: - - - main_class: - jars: - - - py_files: - - + - name: + application: + application_args: + - + main_class: + jars: + - + py_files: + - files: - - - driver_java_options: - - - driver_library_path: - driver_class_path: - driver_memory: - executor_memory: - driver_cores: - executor_cores: + - + driver_java_options: + - + driver_library_path: + driver_class_path: + driver_memory: + executor_memory: + driver_cores: + executor_cores: ``` _Please note: the only required fields are name and application. All other fields may be removed or left blank._ @@ -44,7 +44,7 @@ Jobs also require a definition of the cluster on which the Applications will run size: docker_repo: subnet_id: - custom_scripts: + custom_scripts: - List - of - paths @@ -52,7 +52,7 @@ Jobs also require a definition of the cluster on which the Applications will run - custom - scripts ``` -_Please Note: For more information about Azure VM sizes, see [Azure Batch Pricing](https://azure.microsoft.com/en-us/pricing/details/batch/). 
And for more information about Docker repositories see [Docker](./12-docker-iamge.md)._ +_Please Note: For more information about Azure VM sizes, see [Azure Batch Pricing](https://azure.microsoft.com/en-us/pricing/details/batch/). And for more information about Docker repositories see [Docker](./12-docker-iamge.html)._ _The only required fields are vm_size and either size or size_low_pri, all other fields can be left blank or removed._ @@ -75,15 +75,15 @@ job: cluster_configuration: vm_size: standard_f2 size: 3 - + applications: - name: pipy100 application: /path/to/pi.py - application_args: + application_args: - 100 - name: pipy200 application: /path/to/pi.py - application_args: + application_args: - 200 ``` Once submitted, this Job will run two applications, pipy100 and pipy200, on an automatically provisioned Cluster with 3 dedicated Standard_f2 size Azure VMs. Immediately after both pipy100 and pipy200 have completed the Cluster will be destroyed. Application logs will be persisted and available. diff --git a/docs/80-tests.md b/docs/80-tests.md index 87a1f82d..908813e5 100644 --- a/docs/80-tests.md +++ b/docs/80-tests.md @@ -1,8 +1,8 @@ # Tests -AZTK comes with a testing library that can be used for verification, and debugging. Please note that some tests will provision and test real resources in Azure, and as a result, will cost money to run. See [Integration Tests](#IntegrationTests) for more details. +AZTK comes with a testing library that can be used for verification, and debugging. Please note that some tests will provision and test real resources in Azure, and as a result, will cost money to run. See [Integration Tests](#integration-tests) for more details. -## Integration Tests +## Integration Tests Integration tests use the credentials given in your `.aztk/secrets.yaml` file to spin up real Clusters and Jobs to verify the functionality of the library. Please note that these tests __will__ cost money to run. All created Clusters nad Jobs will be deleted when the test completes. diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..6cd7e42d --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = aztk +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 6ec1682a..00000000 --- a/docs/README.md +++ /dev/null @@ -1,17 +0,0 @@ -## [Getting Started](./00-getting-started.md) -Set up your azure account and resources. - -## [Clusters](./10-clusters.md) -Create, manage and interact with a cluster. Also learn about connecting to the master node, using Jupyter and viewing the Spark UI. - -## [Custom Scripts](./11-custom-scripts.md) -Add custom configuraiton scripts to your cluster. - -## [Docker Image](./12-docker-image.md) -Learn about setting a custom docker image or how you can use Docker to manage versioning. - -## [Spark Submit](./20-spark-submit.md) -Submit a Spark job to the cluster. 
- -## [Cloud Storage](./30-cloud-storage.md) -Using cloud storage to load and save data to and from persisted data stores. diff --git a/docs/aztk.models.rst b/docs/aztk.models.rst new file mode 100644 index 00000000..a1d2a14e --- /dev/null +++ b/docs/aztk.models.rst @@ -0,0 +1,10 @@ +aztk.models package +=================== + +aztk.models.models module +------------------------- + +.. automodule:: aztk.models.models + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/aztk.rst b/docs/aztk.rst new file mode 100644 index 00000000..b9c5c3e4 --- /dev/null +++ b/docs/aztk.rst @@ -0,0 +1,23 @@ +aztk package +============ + +.. toctree:: + + aztk.models + aztk.spark + +aztk.client module +------------------ + +.. autoclass:: aztk.client.Client + :members: + :undoc-members: + :show-inheritance: + +aztk.error module +----------------- + +.. automodule:: aztk.error + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/aztk.spark.models.plugins.rst b/docs/aztk.spark.models.plugins.rst new file mode 100644 index 00000000..62bcc45a --- /dev/null +++ b/docs/aztk.spark.models.plugins.rst @@ -0,0 +1,8 @@ +aztk.spark.models.plugins package +================================= + +.. automodule:: aztk.spark.models.plugins + :members: + :undoc-members: + :show-inheritance: + :imported-members: diff --git a/docs/aztk.spark.models.rst b/docs/aztk.spark.models.rst new file mode 100644 index 00000000..35a3c95d --- /dev/null +++ b/docs/aztk.spark.models.rst @@ -0,0 +1,12 @@ +aztk.spark.models package +========================= + +.. toctree:: + + aztk.spark.models.plugins + +.. automodule:: aztk.spark.models + :members: + :undoc-members: + :show-inheritance: + :imported-members: diff --git a/docs/aztk.spark.rst b/docs/aztk.spark.rst new file mode 100644 index 00000000..706cbc62 --- /dev/null +++ b/docs/aztk.spark.rst @@ -0,0 +1,19 @@ +aztk.spark package +================== + +.. toctree:: + + aztk.spark.models + +aztk.spark.client module +------------------------ + +.. automodule:: aztk.spark.client + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: aztk.spark + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..fa8284d7 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,194 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/stable/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +import os +import sys +# from recommonmark.transform import AutoStructify +from recommonmark.parser import CommonMarkParser + +basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__)))) +sys.path.insert(0, basedir) + +# -- Project information ----------------------------------------------------- + +project = 'aztk' +#pylint: disable=W0622 +copyright = '2018, Microsoft' +author = 'Microsoft' + +# This gets set automatically by readthedocs +release = version = '' + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', + 'sphinx.ext.intersphinx', +] + +intersphinx_mapping = { + 'python': ('https://docs.python.org/3.6', None), +} + + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] + +source_parsers = { + '.md': CommonMarkParser, +} + +source_suffix = ['.rst', '.md'] + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +html_theme_options = { + 'collapse_navigation': True, + 'sticky_navigation': True, +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. 
+htmlhelp_basename = 'aztkdoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'aztk.tex', 'aztk Documentation', + 'Microsoft', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'aztk', 'aztk Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'aztk', 'aztk Documentation', + author, 'aztk', 'One line description of project.', + 'Miscellaneous'), +] + + +# Napoleon configuration +napoleon_google_docstring = True +napoleon_use_param = True +napoleon_use_rtype = True +napoleon_use_keyword = True +napoleon_include_special_with_doc = True + +# Autodoc configuration +autodoc_member_order = 'bysource' + + + +# def setup(app): +# app.add_config_value('recommonmark_config', { +# 'enable_auto_doc_ref': False, +# }, True) +# app.add_transform(AutoStructify) diff --git a/docs/docs.md b/docs/docs.md new file mode 100644 index 00000000..0ef51ebc --- /dev/null +++ b/docs/docs.md @@ -0,0 +1,14 @@ +# Writing docs + +Docs are located in the docs folder. We are using `sphinx` to generate the docs and then hosting them on `readthedocs`. + +## Start docs autobuild to test locally +```bash +sphinx-autobuild docs docs/_build/html --watch aztk +``` +Open `docs/_build/html/index.html` + +## Publish the docs + +Docs are published automatically to Read the Docs under the `latest` tag as soon as you push to master. +When you create a git tag, Read the Docs can also build the docs for that tag. diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..4a52ee97 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,43 @@ +Azure Distributed Data Engineering Toolkit +========================================== +Azure Distributed Data Engineering Toolkit (AZTK) is a python CLI application for provisioning on-demand Spark on Docker clusters in Azure. It's a cheap and easy way to get up and running with a Spark cluster, and a great tool for Spark users who want to experiment and start testing at scale. + +This toolkit is built on top of Azure Batch but does not require any Azure Batch knowledge to use. + +.. _user-docs: + +.. toctree:: + :maxdepth: 2 + :caption: User documentation: + + 00-getting-started + 01-getting-started-script + 10-clusters + 11-custom-scripts + 12-docker-image + 13-configuration + 14-azure-files + 15-plugins + 20-spark-submit + 30-cloud-storage + 60-gpu + 70-jobs + +.. _sdk-docs: +.. toctree:: + :maxdepth: 2 + :caption: SDK documentation: + + sdk-examples + 51-define-plugin + aztk + + +.. _dev-docs: + +.. 
toctree:: + :maxdepth: 2 + :caption: Developper documentation: + + docs + 80-tests diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..823714f0 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build +set SPHINXPROJ=aztk + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/sdk-examples.md b/docs/sdk-examples.md new file mode 100644 index 00000000..fe949e24 --- /dev/null +++ b/docs/sdk-examples.md @@ -0,0 +1,144 @@ +# SDK samples + +## Create the Spark client + +You can get the values for this by either running the [Getting Started script](getting-started) or using [Batch Labs](https://github.com/Azure/BatchLabs) + +```python + import sys, os, time + import aztk.spark + from aztk.error import AztkError + + # set your secrets + secrets_confg = aztk.spark.models.SecretsConfiguration( + service_principal=aztk.spark.models.ServicePrincipalConfiguration( + tenant_id=".onmicrosoft.com", + client_id="", + credential="", + batch_account_resource_id="", + storage_account_resource_id="", + ), + ssh_pub_key="" + ) + + + # create a client + client = aztk.spark.Client(secrets_confg) +``` + + +## List available clusters + +```python +# list available clusters +clusters = client.list_clusters() +``` + +## Create a new cluster + +```python +# define a custom script +plugins = [ + aztk.spark.models.plugins.JupyterPlugin(), +] + +# define spark configuration +spark_conf = aztk.spark.models.SparkConfiguration( + spark_defaults_conf=os.path.join(ROOT_PATH, 'config', 'spark-defaults.conf'), + spark_env_sh=os.path.join(ROOT_PATH, 'config', 'spark-env.sh'), + core_site_xml=os.path.join(ROOT_PATH, 'config', 'core-site.xml'), + jars=[os.path.join(ROOT_PATH, 'config', 'jars', jar) for jar in os.listdir(os.path.join(ROOT_PATH, 'config', 'jars'))] +) + +# configure my cluster +cluster_config = aztk.spark.models.ClusterConfiguration( + cluster_id="sdk-test", + vm_low_pri_count=2, + vm_size="standard_f2", + plugins=plugins, + spark_configuration=spark_conf +) + +# create a cluster, and wait until it is ready +try: + cluster = client.create_cluster(cluster_config) + cluster = client.wait_until_cluster_is_ready(cluster.id) +except AztkError as e: + print(e.message) + sys.exit() + +``` + +## Get an exiting cluster +```python + cluster = client.get_cluster(cluster_config.cluster_id) +``` + + +## Run an application on the cluster +```python + +# create some apps to run +app1 = aztk.spark.models.ApplicationConfiguration( + name="pipy1", + application=os.path.join(ROOT_PATH, 'examples', 'src', 'main', 'python', 'pi.py'), + application_args="10" +) + +app2 = aztk.spark.models.ApplicationConfiguration( + name="pipy2", + application=os.path.join(ROOT_PATH, 'examples', 'src', 'main', 'python', 'pi.py'), + application_args="20" +) + +app3 = 
aztk.spark.models.ApplicationConfiguration( + name="pipy3", + application=os.path.join(ROOT_PATH, 'examples', 'src', 'main', 'python', 'pi.py'), + application_args="30" +) + +# submit an app and wait until it is finished running +client.submit(cluster.id, app1) +client.wait_until_application_done(cluster.id, app1.name) + +# submit some other apps to the cluster in parallel +client.submit_all_applications(cluster.id, [app2, app3]) +``` + + +## Get the logs of an application + +```python +# get logs for app, print to console +app1_logs = client.get_application_log(cluster_id=cluster_config.cluster_id, application_name=app1.name) +print(app1_logs.log) +``` + +## Get status of app + +```python +status = client.get_application_status(cluster_config.cluster_id, app2.name) +``` + +## Stream logs of app, print to console as it runs +```python + +current_bytes = 0 +while True: + app2_logs = client.get_application_log( + cluster_id=cluster_config.cluster_id, + application_name=app2.name, + tail=True, + current_bytes=current_bytes) + + print(app2_logs.log, end="") + + if app2_logs.application_state == 'completed': + break + current_bytes = app2_logs.total_bytes + time.sleep(1) + +# wait until all jobs finish, then delete the cluster +client.wait_until_applications_done(cluster.id) +client.delete_cluster(cluster.id) +``` diff --git a/requirements.txt b/requirements.txt index dd817200..e8774016 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,14 @@ pylint==1.8.2 pytest==3.1.3 pytest-xdist==1.22.0 twine==1.9.1 +docker==3.2.1 + +# Docs +sphinx==1.7.2 +sphinx-autobuild==0.7.1 +recommonmark==0.4.0 +sphinx_rtd_theme==0.3.0 +docutils==0.12 # There is a breaking change is 3.0.0 of this package and the depenency chain of other package is not defined correcly SecretStorage==2.3.1
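The SDK samples added above stop at clusters and applications; the Job workflow can be sketched the same way, based on the `JobConfiguration` and `submit_job` calls described in the removed `docs/50-sdk.md`. A hedged example, assuming a `client` created as in the samples; ids, paths, and sizes are placeholders:

```python
import aztk.spark

# Assumes `client` is an aztk.spark.Client built from a SecretsConfiguration, as above.
app = aztk.spark.models.ApplicationConfiguration(
    name="pipy100",
    application="/path/to/pi.py",   # placeholder path to the Spark application
    application_args=["100"],
)

job_config = aztk.spark.models.JobConfiguration(
    id="sdk-job-sample",            # placeholder Job id
    applications=[app],
    vm_size="standard_f2",
    max_dedicated_nodes=3,
)

# Submit the Job, wait for every application to finish, then read an application log.
client.submit_job(job_config)
client.wait_until_job_finished(job_config.id)
app_log = client.get_job_application_log(job_config.id, app.name)
print(app_log.log)
```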