diff --git a/aztk/internal/configuration_base.py b/aztk/internal/configuration_base.py
index 1fdd6ab6..8e3424fe 100644
--- a/aztk/internal/configuration_base.py
+++ b/aztk/internal/configuration_base.py
@@ -14,12 +14,17 @@ class ConfigurationBase:
         The dict is cleaned from null values and passed expanded to the constructor
         """
         try:
-            clean = dict((k, v) for k, v in args.items() if v)
-            return cls(**clean)
-        except TypeError as e:
+            return cls._from_dict(args)
+        except (ValueError, TypeError) as e:
             pretty_args = yaml.dump(args, default_flow_style=False)
             raise AztkError("{0} {1}\n{2}".format(cls.__name__, str(e), pretty_args))
 
+    @classmethod
+    def _from_dict(cls, args: dict):
+        clean = dict((k, v) for k, v in args.items() if v)
+        return cls(**clean)
+
     def validate(self):
         raise NotImplementedError("Validate not implemented")
diff --git a/aztk/models/models.py b/aztk/models/models.py
index 50552958..ebdbe11c 100644
--- a/aztk/models/models.py
+++ b/aztk/models/models.py
@@ -134,6 +134,9 @@ class ClusterConfiguration(ConfigurationBase):
                 "You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes). Set the VNET's subnet_id in your cluster.yaml."
             )
 
+        if self.custom_scripts:
+            logging.warning("Custom scripts are DEPRECATED and will be removed in 0.8.0. Use plugins instead. See https://aztk.readthedocs.io/en/latest/15-plugins.html")
+
 
 class RemoteLogin:
     def __init__(self, ip_address, port):
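A minimal sketch (not part of the diff) of how the reworked `from_dict`/`_from_dict` split behaves for any `ConfigurationBase` subclass; `DemoConfig` is a hypothetical model:

```python
# DemoConfig is hypothetical; only from_dict/_from_dict come from the diff above.
from aztk.error import AztkError
from aztk.internal import ConfigurationBase


class DemoConfig(ConfigurationBase):
    def __init__(self, username: str = None, port: int = 22):
        self.username = username
        self.port = port

    def validate(self):
        pass


try:
    # "prt" is a deliberate typo: _from_dict forwards it to __init__, the
    # resulting TypeError is caught, and from_dict re-raises it as a single
    # AztkError carrying the class name and a YAML dump of the offending args.
    DemoConfig.from_dict({"username": "spark", "prt": 8022})
except AztkError as e:
    print(e)
```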
diff --git a/aztk/models/plugins/internal/plugin_reference.py b/aztk/models/plugins/internal/plugin_reference.py
index 92930bc9..149f934a 100644
--- a/aztk/models/plugins/internal/plugin_reference.py
+++ b/aztk/models/plugins/internal/plugin_reference.py
@@ -1,21 +1,71 @@
-from aztk.error import InvalidPluginConfigurationError, InvalidModelError
+import os
+
+from aztk.error import InvalidModelError
 from aztk.internal import ConfigurationBase
 from aztk.models import PluginConfiguration
+from aztk.models.plugins import PluginFile, PluginTarget, PluginTargetRole
+
 from .plugin_manager import plugin_manager
+
 
 class PluginReference(ConfigurationBase):
     """
     Contains the configuration to use a plugin
+
+    Args:
+        name (str): Name of the plugin (must be the name of one of the provided plugins if no script is given)
+        script (str): Path to a custom script to run as the plugin
+        target (PluginTarget): Target for the plugin. Defaults to SparkContainer.
+            This can only be used if providing a script
+        target_role (PluginTargetRole): Target role for the plugin. Defaults to Master.
+            This can only be used if providing a script
+        args (dict): If using name, these are the arguments to pass to the plugin
     """
-    def __init__(self, name, args: dict = None):
+    def __init__(self,
+                 name: str = None,
+                 script: str = None,
+                 target: PluginTarget = None,
+                 target_role: PluginTargetRole = None,
+                 args: dict = None):
         super().__init__()
         self.name = name
+        self.script = script
+        self.target = target
+        self.target_role = target_role
         self.args = args or dict()
 
+    @classmethod
+    def _from_dict(cls, args: dict):
+        if "target" in args:
+            args["target"] = PluginTarget(args["target"])
+        if "target_role" in args:
+            args["target_role"] = PluginTargetRole(args["target_role"])
+
+        return super()._from_dict(args)
+
     def get_plugin(self) -> PluginConfiguration:
+        self.validate()
+
+        if self.script:
+            return self._plugin_from_script()
+
         return plugin_manager.get_plugin(self.name, self.args)
 
     def validate(self) -> bool:
-        if not self.name:
-            raise InvalidModelError("Plugin is missing a name")
+        if not self.name and not self.script:
+            raise InvalidModelError("Plugin must either specify the name of an existing plugin or the path to a script.")
+
+        if self.script and not os.path.isfile(self.script):
+            raise InvalidModelError("Plugin script file doesn't exist: '{0}'".format(self.script))
+
+    def _plugin_from_script(self):
+        script_filename = os.path.basename(self.script)
+        name = self.name or os.path.splitext(script_filename)[0]
+        return PluginConfiguration(
+            name=name,
+            execute=script_filename,
+            target=self.target,
+            target_role=self.target_role,
+            files=[
+                PluginFile(script_filename, self.script),
+            ],
+        )
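A hedged sketch of the new script-plugin flow (it mirrors the tests added at the end of this diff); the script path is illustrative, and the file must exist on disk since `validate()` calls `os.path.isfile` before the `PluginConfiguration` is built:

```python
from aztk.models.plugins import PluginTarget, PluginTargetRole
from aztk.models.plugins.internal import PluginReference

ref = PluginReference.from_dict({
    "script": "./scripts/setup.sh",  # hypothetical local script
    "target": "host",                # coerced to PluginTarget.Host by _from_dict
    "target_role": "all-nodes",      # coerced to PluginTargetRole.All
})

plugin = ref.get_plugin()  # validates, then wraps the script in a PluginConfiguration
print(plugin.name)         # "setup" -- derived from the script file name
print(plugin.execute)      # "setup.sh"
```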
diff --git a/aztk/models/plugins/plugin_configuration.py b/aztk/models/plugins/plugin_configuration.py
index b5f35049..b1673808 100644
--- a/aztk/models/plugins/plugin_configuration.py
+++ b/aztk/models/plugins/plugin_configuration.py
@@ -8,8 +8,8 @@ class PluginTarget(Enum):
     """
     Where this plugin should run
     """
-    SparkContainer = "spark-container",
-    Host = "host",
+    SparkContainer = "spark-container"
+    Host = "host"
 
 
 class PluginTargetRole(Enum):
@@ -18,7 +18,6 @@ class PluginTargetRole(Enum):
     All = "all-nodes"
 
 
-
 class PluginPort:
     """
     Definition for a port that should be opened on node
@@ -54,17 +53,17 @@ class PluginConfiguration(ConfigurationBase):
 
     def __init__(self,
                  name: str,
-                 ports: List[PluginPort]=None,
-                 files: List[PluginFile]=None,
-                 execute: str=None,
+                 ports: List[PluginPort] = None,
+                 files: List[PluginFile] = None,
+                 execute: str = None,
                  args=None,
                  env=None,
-                 target_role: PluginTargetRole=PluginTargetRole.Master,
-                 target: PluginTarget=PluginTarget.SparkContainer):
+                 target_role: PluginTargetRole = None,
+                 target: PluginTarget = None):
         self.name = name
         # self.docker_image = docker_image
-        self.target = target
-        self.target_role = target_role
+        self.target = target or PluginTarget.SparkContainer
+        self.target_role = target_role or PluginTargetRole.Master
         self.ports = ports or []
         self.files = files or []
         self.args = args or []
diff --git a/aztk/spark/models/plugins/jupyter/configuration.py b/aztk/spark/models/plugins/jupyter/configuration.py
index ef53f78e..43d042fe 100644
--- a/aztk/spark/models/plugins/jupyter/configuration.py
+++ b/aztk/spark/models/plugins/jupyter/configuration.py
@@ -1,7 +1,6 @@
 import os
 from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
 from aztk.models.plugins.plugin_file import PluginFile
-from aztk.utils import constants
 
 dir_path = os.path.dirname(os.path.realpath(__file__))
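Why the trailing commas mattered (a self-contained illustration, not part of the diff): in an `Enum`, a trailing comma turns the member value into a 1-tuple, which breaks value lookups such as the `PluginTarget(args["target"])` call in `_from_dict`:

```python
from enum import Enum


class Broken(Enum):
    Host = "host",   # trailing comma: value is the tuple ("host",)


class Fixed(Enum):
    Host = "host"    # value is the string "host"


assert Broken.Host.value == ("host",)
assert Fixed.Host.value == "host"
assert Fixed("host") is Fixed.Host   # lookup by value now works as expected
```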
diff --git a/aztk_cli/config.py b/aztk_cli/config.py
index b81dad26..69650c58 100644
--- a/aztk_cli/config.py
+++ b/aztk_cli/config.py
@@ -9,7 +9,6 @@ from aztk.spark.models import (
     DockerConfiguration,
     ClusterConfiguration,
     UserConfiguration,
-    PluginConfiguration,
 )
 
 from aztk.models.plugins.internal import PluginReference
@@ -127,7 +126,7 @@ def read_cluster_config(
     Reads the config file in the .aztk/ directory (.aztk/cluster.yaml)
     """
     if not os.path.isfile(path):
-        return
+        return None
 
     with open(path, 'r', encoding='UTF-8') as stream:
         try:
@@ -137,7 +136,7 @@
                 "Error in cluster.yaml: {0}".format(err))
 
     if config_dict is None:
-        return
+        return None
 
     return cluster_config_from_dict(config_dict)
diff --git a/aztk_cli/spark/endpoints/cluster/cluster_create.py b/aztk_cli/spark/endpoints/cluster/cluster_create.py
index 628a8834..061a25ab 100644
--- a/aztk_cli/spark/endpoints/cluster/cluster_create.py
+++ b/aztk_cli/spark/endpoints/cluster/cluster_create.py
@@ -1,5 +1,4 @@
 import argparse
-import os
 import typing
 
 import aztk.spark
diff --git a/docs/11-custom-scripts.md b/docs/11-custom-scripts.md
index 3c571647..9e40559b 100644
--- a/docs/11-custom-scripts.md
+++ b/docs/11-custom-scripts.md
@@ -1,4 +1,7 @@
 # Custom scripts
+
+**Custom scripts are _DEPRECATED_. Use [plugins](15-plugins.html) instead.**
+
 Custom scripts allow for additional cluster setup steps when the cluster is being provisioned. This is useful if you want to install additional software, and if you need to modify the default cluster configuration for things such as modifying spark.conf, adding jars or downloading any files you need in the cluster.
 
@@ -18,7 +21,7 @@ custom_scripts:
 
 The first script, simple.sh, will run on all nodes and will be executed first. The next script, master-only.sh will run only on nodes that are Spark masters and after simple.sh. The next script, worker-only.sh, will run last and only on nodes that are Spark workers.
 
-Directories may also be provided in the custom_scripts section of `.aztk/cluster.yaml`. 
+Directories may also be provided in the custom_scripts section of `.aztk/cluster.yaml`.
 
 ```yaml
 custom_scripts:
@@ -50,11 +53,11 @@ A custom-script to install HDFS (2.8.2) is provided at `custom-scripts/hdfs.sh`
 To enable HDFS, add this snippet to the custom_scripts section of your `.aztk/cluster.yaml` configuration file:
 
 ```yaml
-custom_scripts: 
+custom_scripts:
   - script: ./custom-scripts/hdfs.sh
     runOn: all-nodes
 ```
 
-When SSHing into the cluster, you will have access to the Namenode UI at the default port 50070. This port can be changed in the ssh.yaml file in your `.aztk/` directory, or by passing the `--namenodeui` flag to the `aztk spark cluster ssh` command. 
+When SSHing into the cluster, you will have access to the Namenode UI at the default port 50070. This port can be changed in the ssh.yaml file in your `.aztk/` directory, or by passing the `--namenodeui` flag to the `aztk spark cluster ssh` command.
 
-When enabled on the cluster, HDFS can be used to read or write data locally during program execution.
\ No newline at end of file
+When enabled on the cluster, HDFS can be used to read or write data locally during program execution.
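For readers migrating off custom scripts, an entry like the HDFS one above maps directly onto a script plugin. A hedged sketch via the SDK (paths are illustrative, other `ClusterConfiguration` settings omitted, and the keyword names are assumed from this PR's models):

```python
from aztk.models.plugins import PluginTarget, PluginTargetRole
from aztk.models.plugins.internal import PluginReference
from aztk.spark.models import ClusterConfiguration

cluster_config = ClusterConfiguration(
    plugins=[
        PluginReference(
            script="./custom-scripts/hdfs.sh",
            target=PluginTarget.Host,
            target_role=PluginTargetRole.All,  # equivalent of runOn: all-nodes
        ),
    ],
)
```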
diff --git a/docs/15-plugins.md b/docs/15-plugins.md
index 04610584..51f669a0 100644
--- a/docs/15-plugins.md
+++ b/docs/15-plugins.md
@@ -1,5 +1,9 @@
 # Plugins
 
+Plugins are a successor to [custom scripts](11-custom-scripts.html) and are the recommended way of running custom code on the cluster.
+
+Plugins can either be one of the AZTK [supported plugins](#supported-plugins) or the path to a [local file](#custom-script-plugin).
+
 ## Supported Plugins
 
 AZTK ships with a library of default plugins that enable auxillary services to use with your Spark cluster.
@@ -22,7 +26,8 @@ plugins:
   - name: hdfs
   - name: spark_ui_proxy
   - name: rsutio_server
-    version: "1.1.383"
+    args:
+      version: "1.1.383"
 ```
 
 ### Enable a plugin using the SDK
@@ -38,3 +43,26 @@ cluster_config = ClusterConfiguration(
     ]
 )
 ```
+
+
+## Custom script plugin
+
+This allows you to run custom code on the cluster.
+### Run a custom script plugin with the CLI
+
+#### Example
+```yaml
+plugins:
+  - script: path/to/my/script.sh
+  - name: friendly-name
+    script: path/to/my-other/script.sh
+    target: host
+    target_role: all-nodes
+```
+
+#### Options
+
+* `script`: **Required** Path to the script you want to run
+* `name`: **Optional** Friendly name. Defaults to the name of the script file
+* `target`: **Optional** Where to run the plugin (default: `spark-container`). Can be `spark-container` or `host`
+* `target_role`: **Optional** The role of the nodes the script runs on (default: `master`). Can be `master`, `worker` or `all-nodes`
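The same two script plugins can be declared through the SDK; a hedged sketch mirroring the YAML example above (paths illustrative, other cluster settings omitted):

```python
from aztk.models.plugins import PluginTarget, PluginTargetRole
from aztk.models.plugins.internal import PluginReference
from aztk.spark.models import ClusterConfiguration

cluster_config = ClusterConfiguration(
    plugins=[
        PluginReference(script="path/to/my/script.sh"),
        PluginReference(
            name="friendly-name",
            script="path/to/my-other/script.sh",
            target=PluginTarget.Host,
            target_role=PluginTargetRole.All,
        ),
    ],
)
```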
diff --git a/docs/51-define-plugin.md b/docs/51-define-plugin.md
index 29b87c5a..d889101c 100644
--- a/docs/51-define-plugin.md
+++ b/docs/51-define-plugin.md
@@ -37,34 +37,52 @@ cluster_config = ClusterConfiguration(
 ## Parameters
 
 ### `PluginConfiguration`
-| Name | Required? | Type | Description |
-|--------------|-----------|---------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `name` | required | string | Name of your plugin(This will be used for creating folder, it is recommended to have a simple letter, dash, underscore only name) |
-| `files` | required | List[PluginFile|PluginTextFile] | List of files to upload |
-| `execute` | required | str | Script to execute. This script must be defined in the files above and must match its remote path |
-| `args` | optional | List[str] | List of arguments to be passed to your execute scripts |
-| `env` | optional | dict | List of environment variables to access in the script(This can be used to pass arguments to your script instead of args) |
-| `ports` | optional | List[PluginPort] | List of ports to open if the script is running in a container. A port can also be specific public and it will then be accessible when ssh into the master node. |
-| `target` | optional | PluginTarget | Define where the execute script should be running. Potential values are `PluginTarget.SparkContainer(Default)` and `PluginTarget.Host` |
-| `taget_role` | optional | PluginTargetRole | If the plugin should be run only on the master worker or all. You can use environment variables(See below to have different master/worker config) | |
+
+#### `name` | `required` | `str`
+Name of your plugin (this is used to create a folder; a simple name with only letters, dashes and underscores is recommended)
+
+#### `files` | `required` | `List[PluginFile|PluginTextFile]`
+List of files to upload
+
+#### `execute` | `required` | `str`
+Script to execute. This script must be defined in the files above and must match its remote path
+
+#### `args` | `optional` | `List[str]`
+List of arguments to be passed to your execute scripts
+
+#### `env` | `optional` | `dict`
+List of environment variables to access in the script (this can be used to pass arguments to your script instead of args)
+
+#### `ports` | `optional` | `List[PluginPort]`
+List of ports to open if the script is running in a container. A port can also be marked public, and it will then be accessible when you SSH into the master node.
+
+#### `target` | `optional` | `PluginTarget`
+Defines where the execute script should run. Potential values are `PluginTarget.SparkContainer` (default) and `PluginTarget.Host`
+
+#### `target_role` | `optional` | `PluginTargetRole`
+Whether the plugin should run only on the master, only on workers, or on all nodes. You can use environment variables (see below) to have different master/worker config
 
 ### `PluginFile`
-| Name | Required? | Type | Description |
-|--------------|-----------|------|------------------------------------------------------------------------------|
-| `target` | required | str | Where the file should be dropped relative to the plugin working directory |
-| `local_path` | required | str | Path to the local file you want to upload(Could form the plugins parameters) |
+
+#### `target` | `required` | `str`
+Where the file should be dropped relative to the plugin working directory
+
+#### `local_path` | `required` | `str`
+Path to the local file you want to upload (this could come from the plugin's parameters)
 
 ### `TextPluginFile`
-| Name | Required? | Type | Description |
-|-----------|-----------|-------------------|------------------------------------------------------------------------------|
-| `target` | required | str | Where the file should be dropped relative to the plugin working directory |
-| `content` | required | str | io.StringIO | Path to the local file you want to upload(Could form the plugins parameters) |
+
+#### `target` | `required` | `str`
+Where the file should be dropped relative to the plugin working directory
+
+#### `content` | `required` | `str` | `io.StringIO`
+Content of the file to upload (this could come from the plugin's parameters)
 
 ### `PluginPort`
-| Name | Required? | Type | Description |
-|------------|-----------|------|-------------------------------------------------------|
-| `internal` | required | int | Internal port to open on the docker container |
-| `public` | optional | bool | If the port should be open publicly(Default: `False`) |
+
+#### `internal` | `required` | `int`
+Internal port to open on the docker container
+
+#### `public` | `optional` | `bool`
+If the port should be open publicly (default: `False`)
 
 ## Environment variables availables in the plugin
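Put together, a custom plugin module typically wraps these parameters in a factory function, in the style of the built-in plugins. A sketch with illustrative names, script, and port (the `PluginPort` keyword names are assumed from the parameter docs above):

```python
import os

from aztk.models.plugins.plugin_configuration import (PluginConfiguration, PluginPort,
                                                      PluginTarget, PluginTargetRole)
from aztk.models.plugins.plugin_file import PluginFile

dir_path = os.path.dirname(os.path.realpath(__file__))


def MyServicePlugin():
    return PluginConfiguration(
        name="my_service",
        ports=[PluginPort(internal=8080, public=True)],
        target=PluginTarget.SparkContainer,
        target_role=PluginTargetRole.Master,
        execute="start.sh",  # must match the target of a file below
        files=[PluginFile("start.sh", os.path.join(dir_path, "start.sh"))],
        env=dict(MY_SERVICE_PORT="8080"),
    )
```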
diff --git a/docs/conf.py b/docs/conf.py
index fa8284d7..36e53e91 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -21,6 +21,8 @@ basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__))))
 sys.path.insert(0, basedir)
 
+from aztk.version import __version__
+
 # -- Project information -----------------------------------------------------
 
 project = 'aztk'
@@ -28,8 +30,7 @@ project = 'aztk'
 copyright = '2018, Microsoft'
 author = 'Microsoft'
 
-# This gets set automatically by readthedocs
-release = version = ''
+release = version = __version__
 
 # -- General configuration ---------------------------------------------------
@@ -54,7 +55,7 @@ intersphinx_mapping = {
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+# templates_path = ['_templates']
 
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
@@ -75,7 +76,7 @@ master_doc = 'index'
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+# language = None
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -88,24 +89,25 @@ pygments_style = 'sphinx'
 
 # -- Options for HTML output -------------------------------------------------
 
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-html_theme = 'sphinx_rtd_theme'
+on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
+if not on_rtd:  # only import and set the theme if we're building docs locally
+    import sphinx_rtd_theme
+    html_theme = 'sphinx_rtd_theme'
+    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
 
-# Theme options are theme-specific and customize the look and feel of a theme
-# further.  For a list of options available for each theme, see the
-# documentation.
-#
-html_theme_options = {
-    'collapse_navigation': True,
-    'sticky_navigation': True,
-}
+    # Theme options are theme-specific and customize the look and feel of a theme
+    # further. For a list of options available for each theme, see the
+    # documentation.
+    #
+    html_theme_options = {
+        'collapse_navigation': True,
+        'sticky_navigation': True,
+    }
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+# html_static_path = ['_static']
 
 # Custom sidebar templates, must be a dictionary that maps document names
 # to template names.
diff --git a/docs/docs.md b/docs/dev/docs.md
similarity index 100%
rename from docs/docs.md
rename to docs/dev/docs.md
diff --git a/docs/80-tests.md b/docs/dev/tests.md
similarity index 100%
rename from docs/80-tests.md
rename to docs/dev/tests.md
diff --git a/docs/dev/writing-models.md b/docs/dev/writing-models.md
new file mode 100644
index 00000000..553121bd
--- /dev/null
+++ b/docs/dev/writing-models.md
@@ -0,0 +1,62 @@
+# Writing a model
+
+
+## Getting started
+In `aztk/models` create a new file with the name of your model, e.g. `my_model.py`
+
+In `aztk/models/__init__.py` add `from .my_model import MyModel`
+
+Create a new class `MyModel` that inherits from `ConfigurationBase`
+```python
+from aztk.internal import ConfigurationBase
+
+class MyModel(ConfigurationBase):
+    """
+    MyModel is a sample model
+
+    Args:
+        input1 (str): This is the first input
+    """
+    def __init__(self, input1: str):
+        self.input1 = input1
+
+    def validate(self):
+        pass
+
+```
+
+## Add validation
+
+In `validate`, perform any checks you need and raise an `InvalidModelError` if there is a problem with the values
+
+### Validate required
+To validate required attributes, call the parent `_validate_required` method. This method takes a list of attribute names that must not be None
+
+```python
+def validate(self) -> bool:
+    self._validate_required(["input1"])
+```
+
+### Custom validation
+```python
+def validate(self) -> bool:
+    if "foo" in self.input1:
+        raise InvalidModelError("foo cannot be in input1")
+```
+
+## Convert dict to model
+
+When inheriting from `ConfigurationBase`, your model comes with a `from_dict` class method which converts a dict into an instance of the class.
+It works well for simple cases where values are plain types (str, int, etc.). If you need to process the values first, you can override the `_from_dict` method.
+
+**Important: do not override the `from_dict` method, as it handles errors and displays them nicely.**
+
+```python
+@classmethod
+def _from_dict(cls, args: dict):
+    if "input1" in args:
+        args["input1"] = MyInput1Model.from_dict(args["input1"])
+
+    return super()._from_dict(args)
+```
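An end-to-end sketch combining the snippets from the guide above (`MyInput1Model` and the attribute names are hypothetical):

```python
from aztk.error import InvalidModelError
from aztk.internal import ConfigurationBase


class MyInput1Model(ConfigurationBase):
    def __init__(self, value: str = None):
        self.value = value

    def validate(self):
        self._validate_required(["value"])


class MyModel(ConfigurationBase):
    def __init__(self, input1: MyInput1Model = None):
        self.input1 = input1

    @classmethod
    def _from_dict(cls, args: dict):
        # Convert the nested dict into a model before __init__ runs
        if "input1" in args:
            args["input1"] = MyInput1Model.from_dict(args["input1"])
        return super()._from_dict(args)

    def validate(self):
        self._validate_required(["input1"])
        self.input1.validate()


model = MyModel.from_dict({"input1": {"value": "foo"}})
model.validate()
assert model.input1.value == "foo"
```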
diff --git a/docs/index.rst b/docs/index.rst
index 4a52ee97..7e82ee2b 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -39,5 +39,15 @@ This toolkit is built on top of Azure Batch but does not require any Azure Batch
    :maxdepth: 2
    :caption: Developper documentation:
 
-   docs
-   80-tests
+   dev/docs
+   dev/writing-models
+   dev/tests
+
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/tests/models/internal/test_plugin_reference.py b/tests/models/internal/test_plugin_reference.py
new file mode 100644
index 00000000..6ef3e9aa
--- /dev/null
+++ b/tests/models/internal/test_plugin_reference.py
@@ -0,0 +1,39 @@
+import pytest
+
+from aztk.error import AztkError
+from aztk.models.plugins.internal import PluginReference, PluginTarget, PluginTargetRole
+
+
+def test_from_dict():
+    ref = PluginReference.from_dict(dict(
+        name="my-test-script",
+        script="path/to/script.sh",
+        target="host",
+        target_role="worker",
+    ))
+
+    assert ref.name == "my-test-script"
+    assert ref.script == "path/to/script.sh"
+    assert ref.target == PluginTarget.Host
+    assert ref.target_role == PluginTargetRole.Worker
+
+
+def test_from_dict_invalid_param():
+    with pytest.raises(AztkError):
+        PluginReference.from_dict(dict(
+            name2="invalid",
+        ))
+
+
+def test_from_dict_invalid_target():
+    with pytest.raises(AztkError):
+        PluginReference.from_dict(dict(
+            script="path/to/script.sh",
+            target="host-invalid",
+        ))
+
+
+def test_from_dict_invalid_target_role():
+    with pytest.raises(AztkError):
+        PluginReference.from_dict(dict(
+            script="path/to/script.sh",
+            target_role="worker-invalid",
+        ))
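A natural follow-up test, not included in this diff, would resolve a script reference into a `PluginConfiguration`; a hedged sketch using pytest's `tmp_path` fixture so the `os.path.isfile` check in `validate()` passes:

```python
from aztk.models.plugins import PluginTarget, PluginTargetRole
from aztk.models.plugins.internal import PluginReference


def test_get_plugin_from_script(tmp_path):
    script = tmp_path / "my-script.sh"
    script.write_text("echo hello")

    ref = PluginReference(script=str(script), target=PluginTarget.Host)
    plugin = ref.get_plugin()

    assert plugin.name == "my-script"       # derived from the file name
    assert plugin.execute == "my-script.sh"
    assert plugin.target == PluginTarget.Host
    assert plugin.target_role == PluginTargetRole.Master  # PluginConfiguration default
```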