This commit is contained in:
Timothee Guerin 2018-04-26 14:03:45 -07:00 committed by GitHub
Parent a00dbb7d6c
Commit e361c3b0b3
No key was found matching this signature
GPG key ID: 4AEE18F83AFDEB23
34 changed files with 626 additions and 850 deletions

View file

@ -3,6 +3,7 @@ indent_style = space
indent_size = 4
insert_final_newline = true
trim_trailing_whitespace = true
end_of_line = lf
[*.{json,yml,yaml}]
indent_size = 2

4
.gitignore vendored
View file

@ -39,3 +39,7 @@ tmp/
# PyTest
.cache/
# Built docs
docs/_build/

1
.vscode/settings.json vendored
View file

@ -15,6 +15,5 @@
"python.formatting.yapfArgs": [
"--style=.style.yapf"
],
"python.venvPath": "${workspaceFolder}/ENV",
"python.pythonPath": "${workspaceFolder}\\ENV\\Scripts\\python.exe"
}

View file

@ -4,10 +4,8 @@ All errors should inherit from `AztkError`
"""
class AztkError(Exception):
def __init__(self, message: str=None):
super().__init__(message)
pass
class ClusterNotReadyError(AztkError):
pass
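A minimal usage sketch of the hierarchy above: because every AZTK-specific exception derives from `AztkError`, a single handler on the base class also catches subclasses such as `ClusterNotReadyError`.
```python
from aztk.error import AztkError, ClusterNotReadyError

try:
    # any AZTK-specific failure derives from AztkError, so one handler is enough
    raise ClusterNotReadyError("cluster is still provisioning")
except AztkError as e:
    print(type(e).__name__, e)
```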

View file

@ -1,3 +1,3 @@
from .blob_data import *
from .node_data import *
from .cluster_data import *
from .blob_data import BlobData
from .node_data import NodeData
from .cluster_data import ClusterData

View file

@ -1,7 +1,6 @@
import yaml
import logging
import yaml
import azure.common
from azure.storage.blob import BlockBlobService
from .node_data import NodeData
from .blob_data import BlobData
@ -15,7 +14,7 @@ class ClusterData:
APPLICATIONS_DIR = "applications"
CLUSTER_CONFIG_FILE = "config.yaml"
def __init__(self, blob_client: BlockBlobService, cluster_id: str):
def __init__(self, blob_client, cluster_id: str):
self.blob_client = blob_client
self.cluster_id = cluster_id
self._ensure_container()

View file

@ -5,7 +5,7 @@ import zipfile
from pathlib import Path
from typing import List
import yaml
from aztk.spark import models
from aztk import models
from aztk.utils import constants, file_utils, secure_utils
from aztk.error import InvalidCustomScriptError
@ -61,10 +61,12 @@ class NodeData:
for file in file_paths:
self.add_file(file, zip_dir, binary)
def add_dir(self, path: str, dest: str = None, exclude: List[str] = []):
def add_dir(self, path: str, dest: str = None, exclude: List[str] = None):
"""
Zip all the files in the given directory into the zip file handler
"""
exclude = exclude or []
for base, _, files in os.walk(path):
relative_folder = os.path.relpath(base, path)
for file in files:
@ -156,7 +158,8 @@ class NodeData:
def _add_node_scripts(self):
self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), NODE_SCRIPT_FOLDER, exclude=['*.pyc*'])
def _includeFile(self, filename: str, exclude: List[str] = []) -> bool:
def _includeFile(self, filename: str, exclude: List[str]) -> bool:
exclude = exclude or []
for pattern in exclude:
if fnmatch.fnmatch(filename, pattern):
return False
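The change above replaces mutable default arguments (`exclude: List[str] = []`) with a `None` default plus `exclude = exclude or []`. A standalone sketch of the same pattern, using a hypothetical `include_file` helper for illustration:
```python
import fnmatch
from typing import List

def include_file(filename: str, exclude: List[str] = None) -> bool:
    # A literal [] default would be created once and shared across calls;
    # defaulting to None and normalising here gives each call a fresh list.
    exclude = exclude or []
    return not any(fnmatch.fnmatch(filename, pattern) for pattern in exclude)

print(include_file("module.pyc", ["*.pyc*"]))  # False: matches the exclude pattern
print(include_file("module.py"))               # True: no exclude list supplied
```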

View file

@ -1,2 +1 @@
from .models import *
from .client import Client

View file

@ -15,13 +15,27 @@ from aztk.internal.cluster_data import NodeData
class Client(BaseClient):
"""
Aztk Spark Client
This is the main entry point for using aztk for spark
Args:
secrets_config(aztk.spark.models.models.SecretsConfiguration): Configuration with all the needed credentials
"""
def __init__(self, secrets_config):
super().__init__(secrets_config)
'''
Spark client public interface
'''
def create_cluster(self, cluster_conf: models.ClusterConfiguration, wait: bool = False):
"""
Create a new aztk spark cluster
Args:
cluster_conf(aztk.spark.models.models.ClusterConfiguration): Configuration for the cluster to be created
wait(bool): If True, wait for the cluster to be ready before returning
Returns:
aztk.spark.models.Cluster
"""
cluster_conf.validate()
cluster_data = self._get_cluster_data(cluster_conf.cluster_id)
try:
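A short calling sketch based on the docstring above; the secret and cluster values are placeholders for illustration, and real Azure credentials are required for the call to succeed:
```python
import aztk.spark

secrets_config = aztk.spark.models.SecretsConfiguration(
    service_principal=aztk.spark.models.ServicePrincipalConfiguration(
        tenant_id="<tenant>", client_id="<client>", credential="<secret>",
        batch_account_resource_id="<batch-resource-id>",
        storage_account_resource_id="<storage-resource-id>",
    ),
    ssh_pub_key="<public-key>",
)
client = aztk.spark.Client(secrets_config)

cluster_conf = aztk.spark.models.ClusterConfiguration(
    cluster_id="docstring-demo", vm_size="standard_f2", vm_low_pri_count=2)
cluster = client.create_cluster(cluster_conf, wait=True)  # returns aztk.spark.models.Cluster
```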

View file

@ -1,6 +1,6 @@
from .hdfs import *
from .jupyter import *
from .jupyter_lab import *
from .rstudio_server import *
from .simple import *
from .spark_ui_proxy import *
from .hdfs import HDFSPlugin
from .jupyter import JupyterPlugin
from .jupyter_lab import JupyterLabPlugin
from .rstudio_server import RStudioServerPlugin
from .simple import SimplePlugin
from .spark_ui_proxy import SparkUIProxyPlugin
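With the explicit exports above, individual plugins can be imported directly and passed to a cluster configuration. A sketch reusing names from this import list and the `plugins=` parameter shown in the SDK samples added later in this commit:
```python
from aztk.spark.models.plugins import JupyterPlugin
import aztk.spark

cluster_config = aztk.spark.models.ClusterConfiguration(
    cluster_id="plugin-demo",
    vm_size="standard_f2",
    vm_low_pri_count=2,
    plugins=[JupyterPlugin()],  # installs the Jupyter plugin on every node
)
```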

View file

@ -1,14 +1,11 @@
# Azure Distributed Data Engineering Toolkit
The Azure Distributed Data Engineering Toolkit is a project that allows Spark users to easily spin up a Spark cluster in Azure.
## Getting Started
# Getting Started
The minimum requirements to get started with this package are:
- Python 3.5+, pip 9.0.1+
- An Azure account
- An Azure Batch account
- An Azure Storage account
### Cloning and installing the project
## Cloning and installing the project
1. Clone the repo
2. Make sure you are running python 3.5 or greater.
_If the default version on your machine is python 2, make sure to run the following commands with **pip3** instead of **pip**._
@ -40,12 +37,12 @@ The minimum requirements to get started with this package are:
This will put default configuration files in your home directory, *~/*. Please note that configuration files in your current working directory will take precedence over global configuration files in your home directory.
### Setting up your accounts
## Setting up your accounts
#### Using the account setup script
A script to create and configure the Azure resources required to use `aztk` is provided. For more information and usage, see [Getting Started Script](./01-getting-started-script.md).
### Using the account setup script
A script to create and configure the Azure resources required to use `aztk` is provided. For more information and usage, see [Getting Started Script](01-getting-started-script.html).
#### Manual resource creation
### Manual resource creation
To finish setting up, you need to fill out your Azure Batch and Azure Storage secrets in *.aztk/secrets.yaml*. We also recommend that you enter SSH key info in this file.
Please note that if you use ssh keys and have a non-standard ssh key file name or path, you will need to specify the location of your ssh public and private keys. To do so, set them as shown below:
@ -64,7 +61,7 @@ If you do not already have an Azure account, go to [https://azure.microsoft.com/
Once you have one, simply log in and go to the [Azure Portal](https://portal.azure.com) to start creating your Azure Batch account and Azure Storage account.
##### Using AAD
#### Using AAD
To get the required keys for your Azure Active Directory (AAD) Service Principal, Azure Batch Account and Azure Storage Account, please follow these instructions. Note that this is the recommended path for use with AZTK, as some features require AAD and are disabled if using Shared Key authentication.
1. Register an Azure Active Directory (AAD) Application
@ -135,7 +132,7 @@ service_principal:
storage_account_resource_id: </storage/account/resource/id>
```
#### Using Shared Keys
### Using Shared Keys
_Please note that using Shared Keys prevents the use of certain AZTK features including Mixed Mode clusters and support for VNETs._
To get the required keys for Azure Batch and Azure Storage, please follow the below instructions:
@ -167,13 +164,13 @@ To get the required keys for Azure Batch and Azure Storage, please follow the be
- Go to the accounts in the Azure portal and copy paste the account names, keys and other information needed into the
secrets file.
#### Storage account
### Storage account
For the Storage account, copy the name and one of the two keys:
![](./misc/Storage_secrets.png)
#### Batch account
### Batch account
For the Batch account, copy the name, the url and one of the two keys:
@ -181,5 +178,5 @@ For the Batch account, copy the name, the url and one of the two keys:
## Next Steps
- [Create a cluster](./10-clusters.md)
- [Run a Spark job](./20-spark-submit.md)
- [Create a cluster](10-clusters.html)
- [Run a Spark job](./20-spark-submit.html)

View file

@ -41,4 +41,4 @@ service_principal:
Copy the entire `service_principal` section in your `.aztk/secrets.yaml`. If you do not have a `secrets.yaml` file, you can create one in your current working directory by running `aztk spark init`.
Now you are ready to create your first `aztk` cluster. See [Creating a Cluster](./10-clusters.md#creating-a-cluster).
Now you are ready to create your first `aztk` cluster. See [Creating a Cluster](./10-clusters.html#creating-a-cluster).

View file

@ -4,7 +4,7 @@ In the Azure Distributed Data Engineering Toolkit, a cluster is primarily design
## Creating a Cluster
Creating a Spark cluster only takes a few simple steps after which you will be able to SSH into the master node of the cluster and interact with Spark. You will be able to view the Spark Web UI, Spark Jobs UI, submit Spark jobs (with *spark-submit*), and even interact with Spark in a Jupyter notebook.
For the advanced user, please note that the default cluster settings are preconfigured in the *.aztk/cluster.yaml* file that is generated when you run `aztk spark init`. More information on cluster config [here.](./13-configuration.md)
For the advanced user, please note that the default cluster settings are preconfigured in the *.aztk/cluster.yaml* file that is generated when you run `aztk spark init`. More information on cluster config [here.](./13-configuration.html)
### Commands
Create a Spark cluster:
@ -33,7 +33,7 @@ You can create clusters with a mix of [low-priority](https://docs.microsoft.co
Please note, to use Mixed Mode clusters, you need to authenticate using Azure Active Directory (AAD) by configuring the Service Principal in `.aztk/secrets.yaml`. You also need to create a [Virtual Network \(VNET\)](https://azure.microsoft.com/en-us/services/virtual-network/), and provide the resource ID of a Subnet within the VNET in your `.aztk/cluster.yaml` configuration file.
#### Setting your Spark and/or Python versions
By default, the Azure Distributed Data Engineering Toolkit will use **Spark v2.2.0** and **Python v3.5.4**. However, you can set your Spark and/or Python versions by [configuring the base Docker image used by this package](./12-docker-image.md).
By default, the Azure Distributed Data Engineering Toolkit will use **Spark v2.2.0** and **Python v3.5.4**. However, you can set your Spark and/or Python versions by [configuring the base Docker image used by this package](./12-docker-image.html).
### Listing clusters
You can list all clusters currently running in your account by running
@ -161,9 +161,9 @@ __Please be careful sharing the output of the `debug` command as secrets and app
### Interact with your Spark cluster
By default, the `aztk spark cluster ssh` command port forwards the Spark Web UI to *localhost:8080*, Spark Jobs UI to *localhost:4040*, and Spark History Server to *localhost:18080*. This can be [configured in *.aztk/ssh.yaml*](../docs/13-configuration.md#sshyaml).
By default, the `aztk spark cluster ssh` command port forwards the Spark Web UI to *localhost:8080*, Spark Jobs UI to *localhost:4040*, and Spark History Server to *localhost:18080*. This can be [configured in *.aztk/ssh.yaml*](../docs/13-configuration.html#sshyaml).
## Next Steps
- [Run a Spark job](./20-spark-submit.md)
- [Configure the Spark cluster using custom commands](./11-custom-scripts.md)
- [Bring your own Docker image or choose between a variety of our supported base images to manage your Spark and Python versions](./12-docker-image.md)
- [Run a Spark job](20-spark-submit.html)
- [Configure the Spark cluster using custom commands](11-custom-scripts.html)
- [Bring your own Docker image or choose between a variety of our supported base images to manage your Spark and Python versions](12-docker-image.html)

View file

@ -22,7 +22,7 @@ size: 2
# username: <username for the linux user to be created> (optional)
username: spark
# docker_repo: <name of docker image repo (for more information, see https://github.com/Azure/aztk/blob/master/docs/12-docker-image.md)>
# docker_repo: <name of docker image repo (for more information, see https://github.com/Azure/aztk/blob/master/docs/12-docker-image.html)>
docker_repo: aztk/base:spark2.2.0
# custom_script: <path to custom script to run on each node> (optional)
@ -104,12 +104,12 @@ spark.history.fs.logDirectory <path>
Please note that the path for `spark.eventLog.dir` and `spark.history.fs.logDirectory` should most likely match so that the history server reads the logs that each Spark job writes. Also note that while the paths can be local (`file:/`), it is recommended that the paths be accessible by every node in the cluster so that the history server, which runs on the Spark master node, has access to all application logs. HDFS, WASB, ADL, or any other Hadoop API compliant storage system may be used.
If using WASB, ADL or other cloud storage services, be sure to set your keys in `.aztk/core-site.xml`. For more information, see the [Cloud Storage](./30-cloud-storage.md) documentation.
If using WASB, ADL or other cloud storage services, be sure to set your keys in `.aztk/core-site.xml`. For more information, see the [Cloud Storage](./30-cloud-storage.html) documentation.
## Configuring Spark Storage
The Spark cluster can be configured to use different cloud supported storage offerings (such as Azure Storage Blobs, Azure Data Lake Storage, or any other supported Spark file system). More information can be found in the [Cloud Storage](./30-cloud-storage.md) documentation.
The Spark cluster can be configured to use different cloud supported storage offerings (such as Azure Storage Blobs, Azure Data Lake Storage, or any other supported Spark file system). More information can be found in the [Cloud Storage](./30-cloud-storage.html) documentation.
## Placing JARS
@ -129,5 +129,5 @@ Note: _This tool automatically registers several JARS for default cloud storage
## Next Steps
- [Add plugins](./15-plugins.md)
- [Set up your Cloud Storage](./30-cloud-storage.md)
- [Add plugins](./15-plugins.html)
- [Set up your Cloud Storage](./30-cloud-storage.html)

View file

@ -17,7 +17,7 @@ aztk spark cluster submit --id spark --name pipy examples/src/main/python/pi.py
NOTE: The job name (--name) must be at least 3 characters long, can only contain alphanumeric characters including hyphens but excluding underscores, and cannot contain uppercase letters. Each job you submit **must** have a unique name.
## Monitoring job
If you have set up an [SSH tunnel](./10-clusters.md#ssh-and-port-forwarding) with port forwarding, you can navigate to http://localhost:8080 and http://localhost:4040 to view the progress of the job using the Spark UI
If you have set up an [SSH tunnel](./10-clusters.html#ssh-and-port-forwarding) with port forwarding, you can navigate to http://localhost:8080 and http://localhost:4040 to view the progress of the job using the Spark UI
## Getting output logs

View file

@ -1,752 +0,0 @@
# SDK
Operationalize AZTK with the provided Python SDK.
Find samples and a getting started tutorial in the `examples/sdk/` directory of the repository.
## Public Interface
### Client
- `cluster_copy(self, cluster_id: str, source_path: str, destination_path: str, internal: bool = False)`
Copy a file to every node in the given cluster
Parameters:
- cluster_id: str
- the id of the cluster
- source_path: str
- the local path to the file to be copied
- destination_path: str
- the path (including the file name) that the file should be placed on each node.
- internal: bool
- if True, connects to the cluster using the local IP. Only set to True if the node's internal IP address is resolvable by the client.
- `cluster_run(self, cluster_id: str, command: str, internal: bool = False)`
Run a command on every node in the given cluster
Parameters:
- cluster_id: str
- the id of the cluster
- command: str
- the command to run on each node
- internal: bool
- if True, connects to the cluster using the local IP. Only set to True if the node's internal IP address is resolvable by the client.
- `create_cluster(self, cluster_conf: aztk.spark.models.ClusterConfiguration, wait=False)`
Create an AZTK cluster with the given cluster configuration
Parameters:
- cluster_conf: models.ClusterConfiguration
- the definition of the cluster to create
- wait: bool = False
- If true, block until the cluster is running, else return immediately
Returns:
- aztk.spark.models.Cluster
- `create_clusters_in_parallel(self, cluster_confs: List[aztk.models.ClusterConfiguration])`
Create AZTK clusters with the given list of cluster configurations
Parameters:
- cluster_confs: List[aztk.models.ClusterConfiguration]
Returns:
- None
- `delete_cluster(self, cluster_id: str, keep_logs: bool = False)`
Delete an AZTK cluster with the given ID
Parameters:
- cluster_id: str
- The ID of the cluster to delete
- keep_logs: bool
- If true, the logs associated with this cluster will not be deleted.
Returns:
- None
- `get_cluster(self, cluster_id: str)`
Retrieve detailed information about the cluster with the given ID
Parameters:
- cluster_id
- the ID of the cluster to get
Returns:
- aztk.models.Cluster()
- `list_clusters(self)`
Retrieve a list of existing AZTK clusters.
Returns:
- List[aztk.models.Cluster]
- `get_remote_login_settings(self, cluster_id: str, node_id: str)`
Return the settings required to login to a node
Parameters:
- cluster_id: str
The cluster to login to
- node_id: str
The node to login to
Returns:
- aztk.spark.models.RemoteLogin
- `submit(self, cluster_id: str, application: aztk.spark.models.Application)`
Parameters:
- cluster_id: str
The cluster that the application is submitted to
- application: aztk.spark.models.Application
The application to submit
Returns:
- None
- `submit_all_applications(self, cluster_id: str, applications: List[aztk.spark.models.Application])`
Submit a list of applications to be executed on a cluster
Parameters:
- cluster_id: str
The cluster that the applications are submitted to
- applications: List[aztk.spark.models.Application]
List of applications to submit
Returns:
- None
- `wait_until_application_done(self, cluster_id: str, task_id: str)`
Block until the given application has completed on the given cluster
Parameters:
- cluster_id: str
The cluster on which the application is running
- task_id
The application to wait for
Returns:
- None
- `wait_until_applications_done(self, cluster_id: str)`
Block until all applications on the given cluster are completed
Parameters:
- cluster_id: str
The cluster on which the application is running
Returns:
- None
- `wait_until_cluster_is_ready(self, cluster_id: str)`
Block until the given cluster is running
Parameters:
- cluster_id: str
The ID of the cluster to wait for
Returns:
- aztk.spark.models.Cluster
- `wait_until_all_clusters_are_ready(self, clusters: List[str])`
Wait until all clusters in the given list are ready
Parameters:
- clusters: List[str]
A list of the IDs of all the clusters to wait for
Returns:
- None
- `create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None)`
Create a user on the given cluster
Parameters:
- cluster_id: List[str]
The cluster on which to create the user
- password: str
The password to create the user with (mutually exclusive with ssh_key)
- ssh_key: str
The ssh_key to create the user with (mutually exclusive with password)
Returns:
- None
- `get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0)`
Get the logs of a completed or currently running application
Parameters:
- cluster_id: str
The id of the cluster on which the application ran or is running.
- application_name: str
The name of the application to retrieve logs for
- tail: bool
Set to true if you want to get only the newly added data after current_bytes.
- current_bytes: int
The amount of bytes already retrieved. To get the entire log, leave this at 0. If you are streaming, set this to the current number of bytes you have already retrieved, so you only retrieve the newly added bytes.
Returns:
- aztk.spark.models.ApplicationLog
- `get_application_status(self, cluster_id: str, app_name: str)`
Get the status of an application
Parameters:
- cluster_id: str
The id of the cluster to which the app was submitted
- app_name
the name of the application in question
Returns:
- str
- `submit_job(self, job_configuration)`
Submit an AZTK Spark Job
Parameters:
- job_configuration: aztk.spark.models.JobConfiguration
The configuration of the job to be submitted
Returns:
- aztk.spark.models.Job
- `list_jobs(self)`
List all created AZTK Spark Jobs
Parameters:
- job_configuration: aztk.spark.models.JobConfiguration
The configuration of the job to be submitted
Returns:
- List[aztk.spark.models.Job]
- `list_applicaitons(self, job_id)`
List all applications created on the AZTK Spark Job with id job_id
Parameters:
- job_id: str
The id of the Job
Returns:
- Dict{str: aztk.spark.models.Application or None}
- the key is the name of the application
- the value is None if the application has not yet been scheduled or an Application model if it has been scheduled
- `get_job(self, job_id)`
Get information about the AZTK Spark Job with id job_id
Parameters:
- job_id: str
The id of the Job
Returns:
- List[aztk.spark.models.Job]
- `stop_job(self, job_id)`
Stop the AZTK Spark Job with id job_id
Parameters:
- job_id: str
The id of the Job
Returns:
- None
- `delete_job(self, job_id, keep_logs: bool = False)`
Delete the AZTK Spark Job with id job_id
Parameters:
- job_id: str
The id of the Job
- keep_logs: bool
- If true, the logs associated with this Job will not be deleted.
Returns:
- bool
- `get_application(self, job_id, application_name)`
Get information about an AZTK Spark Job's application
Parameters:
- job_id: str
The id of the Job
- application_name: str
The name of the Application
Returns:
- aztk.spark.models.Application
- `get_job_application_log(self, job_id, application_name)`
Get the log of an AZTK Spark Job's application
Parameters:
- job_id: str
The id of the Job
- application_name: str
The name of the Application
Returns:
- aztk.spark.models.ApplicationLog
- `stop_job_app(self, job_id, application_name)`
Stop an Application running on an AZTK Spark Job
Parameters:
- job_id: str
The id of the Job
- application_name: str
The name of the Application
Returns:
- None
- `wait_until_job_finished(self, job_id)`
Wait until the AZTK Spark Job with id job_id is complete
Parameters:
- job_id: str
The id of the Job
- application_name: str
The name of the Application
Returns:
- None
- `wait_until_all_jobs_finished(self, jobs)`
Wait until all of the given AZTK Spark Jobs are complete
Parameters:
- jobs: List[str]
The ids of the Jobs to wait for
Returns:
- None
### Models
- `Application`
The definition of an AZTK Spark Application as it exists in the cloud. Please note that this object is not used to configure Applications, only to read information about existing Applications. Please see ApplicationConfiguration if you are trying to create an Application.
Fields:
- name: str
- last_modified: datetime
- creation_time: datetime
- state: str
- state_transition_time: datetime
- previous_state: str
- previous_state_transition_time: datetime
- exit_code: int
<!---
- _execution_info: azure.batch.models.TaskExecutionInformation
- _node_info
- _stats
- _multi_instance_settings
- _display_name
- _exit_conditions
- _command_line
- _resource_files
- _output_files
- _environment_settings
- _affinity_info
- _constraints
- _user_identity
- _depends_on
- _application_package_references
- _authentication_token_settings
- _url
- _e_tag
-->
- `ApplicationConfiguration`
Define a Spark application to run on a cluster.
Fields:
- name: str
Unique identifier for the application.
- application: str
Path to the application that will be executed. Can be jar or python file.
- application_args: [str]
List of arguments for the application
- main_class: str
The application's main class. (Only applies to Java/Scala)
- jars: [str]
Additional jars to supply for the application.
- py_files: [str]
Additional Python files to supply for the application. Can be .zip, .egg, or .py files.
- files: [str]
Additional files to supply for the application.
- driver_java_options: str
Extra Java options to pass to the driver.
- driver_library_path: str
Extra library path entries to pass to the driver.
- driver_class_path: str
Extra class path entries to pass to the driver. Note that jars added with --jars are automatically included in the classpath.
- driver_memory: str
Memory for driver (e.g. 1000M, 2G) (Default: 1024M).
- executor_memory: str
Memory per executor (e.g. 1000M, 2G) (Default: 1G).
- driver_cores: str
Cores for driver (Default: 1).
- executor_cores: str
Number of cores per executor. (Default: All available cores on the worker)
- max_retry_count: int
Number of times the Spark job may be retried if there is a failure
- `ApplicationLog`
Holds the logged data from a spark application and metadata about the application and log.
Fields:
- name: str
- cluster_id: str
- log: str
- total_bytes: int
- application_state: str
- exit_code: str
- `Cluster`
An AZTK cluster. Note that this model is not used to create a cluster, for that see `ClusterConfiguration`.
Fields:
- id: str
The unique id of the cluster
- pool: azure.batch.models.CloudPool
A pool in the Azure Batch service.
- nodes: azure.batch.models.ComputeNodePaged
A paging container for iterating over a list of ComputeNode objects
- vm_size: str
The size of virtual machines in the cluster. All virtual machines in a cluster are the same size. For information about available sizes of virtual machines, see Sizes for Virtual Machines (Linux) (https://azure.microsoft.com/documentation/articles/virtual-machines-linux-sizes/). AZTK supports all Azure VM sizes except STANDARD_A0 and those with premium storage (STANDARD_GS, STANDARD_DS, and STANDARD_DSV2 series).
- visible_state
The current state of the cluster. Possible values are:
resizing = 'resizing'
steady = 'steady'
stopping = 'stopping'
active = 'active'
deleting = 'deleting'
upgrading = 'upgrading'
- total_current_nodes
The total number of nodes currently allocated to the cluster.
- total_target_nodes
The desired number of nodes in the cluster. Sum of target_dedicated_nodes and target_low_pri_nodes.
- current_dedicated_nodes
The number of dedicated nodes currently in the cluster.
- current_low_pri_nodes
The number of low-priority nodes currently in the cluster. Low-priority nodes which have been preempted are included in this count.
- target_dedicated_nodes
The desired number of dedicated nodes in the cluster.
- target_low_pri_nodes
The desired number of low-priority nodes in the cluster.
- `ClusterConfiguration`
Define a Spark cluster to be created.
Fields:
- custom_scripts: [CustomScript]
A list of custom scripts to execute in the Spark Docker container.
- cluster_id: str
A unique ID of the cluster to be created. The ID can contain any combination of alphanumeric characters including hyphens and underscores, and cannot contain more than 64 characters. The ID is case-preserving and case-insensitive (that is, you may not have two IDs within an account that differ only by case).
- vm_count: int
The number of dedicated VMs (nodes) to be allocated to the cluster. Mutually exclusive with vm_low_pri_count.
- vm_size: str
The size of virtual machines in the cluster. All virtual machines in a cluster are the same size. For information about available sizes of virtual machines, see Sizes for Virtual Machines (Linux) (https://azure.microsoft.com/documentation/articles/virtual-machines-linux-sizes/). AZTK supports all Azure VM sizes except STANDARD_A0 and those with premium storage (STANDARD_GS, STANDARD_DS, and STANDARD_DSV2 series).
- vm_low_pri_count: int
The number of VMs (nodes) to be allocated to the cluster. Mutually exclusive with vm_count.
- docker_repo: str
The docker repository and image to use. For more information, see [Docker Image](./12-docker-image.md).
- spark_configuration: aztk.spark.models.SparkConfiguration
Configuration object for spark-specific values.
- `Custom Script`
A script that is executed in the Docker container of specified nodes in the cluster.
Fields:
- name: str
A unique name for the script
- script: str or aztk.spark.models.File
Path to the script to be run or File object
- run_on: str
Set which nodes the script should execute on. Possible values:
all-nodes
master
worker
Please note that by default, the Master node is also a worker node.
- `File`
A File definition for programmatically defined configuration files.
Fields:
- name: str
- payload: io.StringIO
- `JobConfiguration`
Define an AZTK Job.
Methods:
- `__init__(
self,
id,
applications=None,
custom_scripts=None,
spark_configuration=None,
vm_size=None,
docker_repo=None,
max_dedicated_nodes=None,
subnet_id=None)`
Fields:
- id: str
- applications: List[aztk.spark.models.ApplicationConfiguration]
- custom_scripts: str
- spark_configuration: aztk.spark.models.SparkConfiguration
- vm_size: int
- gpu_enabled: str
- docker_repo: str
- max_dedicated_nodes: str
- subnet_id: str
- `Job`
Methods:
`__init__(self, cloud_job_schedule: batch_models.CloudJobSchedule, cloud_tasks: List[batch_models.CloudTask] = None)`
Fields:
- id: str
- last_modified: datetime
- state: datetime
- state_transition_time: datetime
- applications: datetime
<!--
- creation_time: datetime
- schedule: datetime
- exection_info: datetime
- recent_run_id: datetime
-->
<!--
- `JobState`
complete = 'completed'
active = "active"
completed = "completed"
disabled = "disabled"
terminating = "terminating"
deleting = "deleting"
-->
- `SecretsConfiguration`
The Batch, Storage, Docker and SSH secrets used to create AZTK clusters. For more help with setting these values see [Getting Started](./00-getting-started.md).
Exactly one of `service_principal` and `shared_key` must be provided to this object. If both or neither are provided, validation will fail.
Fields:
service_principal: ServicePrincipalConfiguration
shared_key: SharedKeyConfiguration
docker: DockerConfiguration
ssh_pub_key: str
ssh_priv_key: str
- `ServicePrincipalConfiguration`
Configuration needed to use aad auth.
Fields:
tenant_id: str
client_id: str
credential: str
batch_account_resource_id: str
storage_account_resource_id: str
- `SharedKeyConfiguration`
Configuration needed to use shared key auth.
Fields:
batch_account_name: str
batch_account_key: str
batch_service_url: str
storage_account_name: str
storage_account_key: str
storage_account_suffix: str
- `DockerConfiguration`
Configuration needed to use custom docker.
Fields:
endpoint: str
username: str
password: str
- `SparkConfiguration`
Define cluster-wide Spark specific parameters.
Fields:
- spark_defaults_conf: str or aztk.spark.models.File
Path or File object defining spark_defaults.conf configuration file to be used.
- spark_env_sh: str or aztk.spark.models.File
Path or File object defining spark_env.sh configuration file to be used.
- core_site_xml: str or aztk.spark.models.File
Path or File object defining the core-site.xml configuration file to be used.
- jars: [str or aztk.spark.models.File]
Paths to or File objects defining Additional Jars to be uploaded

View file

@ -1,6 +1,6 @@
# GPU
Use GPUs to accelerate your Spark applications. When using a [GPU enabled Azure VM](https://azure.microsoft.com/en-us/pricing/details/batch/), your docker image will contain CUDA-8.0 and cuDnn-6.0 by default. See [Docker Image](./12-docker-image.md) for more information about the AZTK Docker images.
Use GPUs to accelerate your Spark applications. When using a [GPU enabled Azure VM](https://azure.microsoft.com/en-us/pricing/details/batch/), your docker image will contain CUDA-8.0 and cuDnn-6.0 by default. See [Docker Image](./12-docker-image.html) for more information about the AZTK Docker images.
[NOTE: Azure does not have GPU enabled VMs in all regions. Please use this [link](https://azure.microsoft.com/en-us/pricing/details/batch/) to make sure that your Batch account is in a region that has GPU enabled VMs]

View file

@ -52,7 +52,7 @@ Jobs also require a definition of the cluster on which the Applications will run
- custom
- scripts
```
_Please Note: For more information about Azure VM sizes, see [Azure Batch Pricing](https://azure.microsoft.com/en-us/pricing/details/batch/). And for more information about Docker repositories see [Docker](./12-docker-image.md)._
_Please Note: For more information about Azure VM sizes, see [Azure Batch Pricing](https://azure.microsoft.com/en-us/pricing/details/batch/). And for more information about Docker repositories see [Docker](./12-docker-image.html)._
_The only required fields are vm_size and either size or size_low_pri, all other fields can be left blank or removed._

View file

@ -1,8 +1,8 @@
# Tests
AZTK comes with a testing library that can be used for verification and debugging. Please note that some tests will provision and test real resources in Azure, and as a result, will cost money to run. See [Integration Tests](#IntegrationTests) for more details.
AZTK comes with a testing library that can be used for verification and debugging. Please note that some tests will provision and test real resources in Azure, and as a result, will cost money to run. See [Integration Tests](#integration-tests) for more details.
## <a name="IntegrationTests"></a> Integration Tests
## Integration Tests
Integration tests use the credentials given in your `.aztk/secrets.yaml` file to spin up real Clusters and Jobs to verify the functionality of the library. Please note that these tests __will__ cost money to run. All created Clusters and Jobs will be deleted when the tests complete.

20
docs/Makefile Normal file
View file

@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = aztk
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

View file

@ -1,17 +0,0 @@
## [Getting Started](./00-getting-started.md)
Set up your azure account and resources.
## [Clusters](./10-clusters.md)
Create, manage and interact with a cluster. Also learn about connecting to the master node, using Jupyter and viewing the Spark UI.
## [Custom Scripts](./11-custom-scripts.md)
Add custom configuration scripts to your cluster.
## [Docker Image](./12-docker-image.md)
Learn about setting a custom docker image or how you can use Docker to manage versioning.
## [Spark Submit](./20-spark-submit.md)
Submit a Spark job to the cluster.
## [Cloud Storage](./30-cloud-storage.md)
Using cloud storage to load and save data to and from persisted data stores.

10
docs/aztk.models.rst Normal file
View file

@ -0,0 +1,10 @@
aztk.models package
===================
aztk.models.models module
-------------------------
.. automodule:: aztk.models.models
:members:
:undoc-members:
:show-inheritance:

23
docs/aztk.rst Normal file
View file

@ -0,0 +1,23 @@
aztk package
============
.. toctree::
aztk.models
aztk.spark
aztk.client module
------------------
.. autoclass:: aztk.client.Client
:members:
:undoc-members:
:show-inheritance:
aztk.error module
-----------------
.. automodule:: aztk.error
:members:
:undoc-members:
:show-inheritance:

View file

@ -0,0 +1,8 @@
aztk.spark.models.plugins package
=================================
.. automodule:: aztk.spark.models.plugins
:members:
:undoc-members:
:show-inheritance:
:imported-members:

View file

@ -0,0 +1,12 @@
aztk.spark.models package
=========================
.. toctree::
aztk.spark.models.plugins
.. automodule:: aztk.spark.models
:members:
:undoc-members:
:show-inheritance:
:imported-members:

19
docs/aztk.spark.rst Normal file
View file

@ -0,0 +1,19 @@
aztk.spark package
==================
.. toctree::
aztk.spark.models
aztk.spark.client module
------------------------
.. automodule:: aztk.spark.client
:members:
:undoc-members:
:show-inheritance:
.. automodule:: aztk.spark
:members:
:undoc-members:
:show-inheritance:

194
docs/conf.py Normal file
View file

@ -0,0 +1,194 @@
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/stable/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
# from recommonmark.transform import AutoStructify
from recommonmark.parser import CommonMarkParser
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__))))
sys.path.insert(0, basedir)
# -- Project information -----------------------------------------------------
project = 'aztk'
#pylint: disable=W0622
copyright = '2018, Microsoft'
author = 'Microsoft'
# This gets set automatically by readthedocs
release = version = ''
# -- General configuration ---------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'sphinx.ext.viewcode',
'sphinx.ext.napoleon',
'sphinx.ext.intersphinx',
]
intersphinx_mapping = {
'python': ('https://docs.python.org/3.6', None),
}
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_parsers = {
'.md': CommonMarkParser,
}
source_suffix = ['.rst', '.md']
# The master toctree document.
master_doc = 'index'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path .
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
html_theme_options = {
'collapse_navigation': True,
'sticky_navigation': True,
}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'aztkdoc'
# -- Options for LaTeX output ------------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'aztk.tex', 'aztk Documentation',
'Microsoft', 'manual'),
]
# -- Options for manual page output ------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'aztk', 'aztk Documentation',
[author], 1)
]
# -- Options for Texinfo output ----------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'aztk', 'aztk Documentation',
author, 'aztk', 'One line description of project.',
'Miscellaneous'),
]
# Napoleon configuration
napoleon_google_docstring = True
napoleon_use_param = True
napoleon_use_rtype = True
napoleon_use_keyword = True
napoleon_include_special_with_doc = True
# Autodoc configuration
autodoc_member_order = 'bysource'
# def setup(app):
# app.add_config_value('recommonmark_config', {
# 'enable_auto_doc_ref': False,
# }, True)
# app.add_transform(AutoStructify)

14
docs/docs.md Normal file
View file

@ -0,0 +1,14 @@
# Writing docs
Docs are located in the docs folder. We use `sphinx` to generate the docs and host them on `readthedocs`.
## Start docs autobuild to test locally
```bash
sphinx-autobuild docs docs/_build/html --watch aztk
```
Open `docs/_build/html/index.html`
## Publish the docs
Docs are published automatically to readthedocs under the `latest` tag as soon as you push to master.
When you create a git tag, readthedocs can also build the docs for that tag.

43
docs/index.rst Normal file
View file

@ -0,0 +1,43 @@
Azure Distributed Data Engineering Toolkit
==========================================
Azure Distributed Data Engineering Toolkit (AZTK) is a python CLI application for provisioning on-demand Spark on Docker clusters in Azure. It's a cheap and easy way to get up and running with a Spark cluster, and a great tool for Spark users who want to experiment and start testing at scale.
This toolkit is built on top of Azure Batch but does not require any Azure Batch knowledge to use.
.. _user-docs:
.. toctree::
:maxdepth: 2
:caption: User documentation:
00-getting-started
01-getting-started-script
10-clusters
11-custom-scripts
12-docker-image
13-configuration
14-azure-files
15-plugins
20-spark-submit
30-cloud-storage
60-gpu
70-jobs
.. _sdk-docs:
.. toctree::
:maxdepth: 2
:caption: SDK documentation:
sdk-examples
51-define-plugin
aztk
.. _dev-docs:
.. toctree::
:maxdepth: 2
:caption: Developer documentation:
docs
80-tests

36
docs/make.bat Normal file
View file

@ -0,0 +1,36 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=aztk
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd

144
docs/sdk-examples.md Normal file
View file

@ -0,0 +1,144 @@
# SDK samples
## Create the Spark client
You can get the values for this by either running the [Getting Started script](getting-started) or using [Batch Labs](https://github.com/Azure/BatchLabs)
```python
import sys, os, time
import aztk.spark
from aztk.error import AztkError
# set your secrets
secrets_confg = aztk.spark.models.SecretsConfiguration(
service_principal=aztk.spark.models.ServicePrincipalConfiguration(
tenant_id="<org>.onmicrosoft.com",
client_id="",
credential="",
batch_account_resource_id="",
storage_account_resource_id="",
),
ssh_pub_key=""
)
# create a client
client = aztk.spark.Client(secrets_confg)
```
## List available clusters
```python
# list available clusters
clusters = client.list_clusters()
```
## Create a new cluster
```python
# define the plugins to install on the cluster
plugins = [
aztk.spark.models.plugins.JupyterPlugin(),
]
# define spark configuration
spark_conf = aztk.spark.models.SparkConfiguration(
spark_defaults_conf=os.path.join(ROOT_PATH, 'config', 'spark-defaults.conf'),
spark_env_sh=os.path.join(ROOT_PATH, 'config', 'spark-env.sh'),
core_site_xml=os.path.join(ROOT_PATH, 'config', 'core-site.xml'),
jars=[os.path.join(ROOT_PATH, 'config', 'jars', jar) for jar in os.listdir(os.path.join(ROOT_PATH, 'config', 'jars'))]
)
# configure my cluster
cluster_config = aztk.spark.models.ClusterConfiguration(
cluster_id="sdk-test",
vm_low_pri_count=2,
vm_size="standard_f2",
plugins=plugins,
spark_configuration=spark_conf
)
# create a cluster, and wait until it is ready
try:
cluster = client.create_cluster(cluster_config)
cluster = client.wait_until_cluster_is_ready(cluster.id)
except AztkError as e:
print(e.message)
sys.exit()
```
## Get an existing cluster
```python
cluster = client.get_cluster(cluster_config.cluster_id)
```
## Run an application on the cluster
```python
# create some apps to run
app1 = aztk.spark.models.ApplicationConfiguration(
name="pipy1",
application=os.path.join(ROOT_PATH, 'examples', 'src', 'main', 'python', 'pi.py'),
application_args="10"
)
app2 = aztk.spark.models.ApplicationConfiguration(
name="pipy2",
application=os.path.join(ROOT_PATH, 'examples', 'src', 'main', 'python', 'pi.py'),
application_args="20"
)
app3 = aztk.spark.models.ApplicationConfiguration(
name="pipy3",
application=os.path.join(ROOT_PATH, 'examples', 'src', 'main', 'python', 'pi.py'),
application_args="30"
)
# submit an app and wait until it is finished running
client.submit(cluster.id, app1)
client.wait_until_application_done(cluster.id, app1.name)
# submit some other apps to the cluster in parallel
client.submit_all_applications(cluster.id, [app2, app3])
```
## Get the logs of an application
```python
# get logs for app, print to console
app1_logs = client.get_application_log(cluster_id=cluster_config.cluster_id, application_name=app1.name)
print(app1_logs.log)
```
## Get status of app
```python
status = client.get_application_status(cluster_config.cluster_id, app2.name)
```
## Stream logs of app, print to console as it runs
```python
current_bytes = 0
while True:
app2_logs = client.get_application_log(
cluster_id=cluster_config.cluster_id,
application_name=app2.name,
tail=True,
current_bytes=current_bytes)
print(app2_logs.log, end="")
if app2_logs.application_state == 'completed':
break
current_bytes = app2_logs.total_bytes
time.sleep(1)
# wait until all jobs finish, then delete the cluster
client.wait_until_applications_done(cluster.id)
client.delete_cluster(cluster.id)
```

View file

@ -13,6 +13,14 @@ pylint==1.8.2
pytest==3.1.3
pytest-xdist==1.22.0
twine==1.9.1
docker==3.2.1
# Docs
sphinx==1.7.2
sphinx-autobuild==0.7.1
recommonmark==0.4.0
sphinx_rtd_theme==0.3.0
docutils==0.12
# There is a breaking change in 3.0.0 of this package and the dependency chain of other packages is not defined correctly
SecretStorage==2.3.1