diff --git a/aztk/internal/cluster_data/node_data.py b/aztk/internal/cluster_data/node_data.py
index 47641ab3..3a55fd77 100644
--- a/aztk/internal/cluster_data/node_data.py
+++ b/aztk/internal/cluster_data/node_data.py
@@ -46,7 +46,7 @@ class NodeData:
             return
         if isinstance(file, (str, bytes)):
             full_file_path = Path(file)
-            with io.open(file, 'r', encoding='UTF-8') as f:
+            with io.open(file, 'r', encoding='ISO-8859-1') as f:
                 if binary:
                     self.zipf.write(file, os.path.join(zip_dir, full_file_path.name))
                 else:
@@ -69,7 +69,7 @@ class NodeData:
             relative_folder = os.path.relpath(base, path)
             for file in files:
                 if self._includeFile(file, exclude):
-                    with io.open(os.path.join(base, file), 'r', encoding='UTF-8') as f:
+                    with io.open(os.path.join(base, file), 'r', encoding='ISO-8859-1') as f:
                         self.zipf.writestr(os.path.join(dest, relative_folder, file), f.read().replace('\r\n', '\n'))

     def _add_custom_scripts(self):
@@ -82,7 +82,7 @@ class NodeData:
             new_file_name = str(index) + '_' + os.path.basename(custom_script.script)
             data.append(dict(script=new_file_name, runOn=str(custom_script.run_on)))
             try:
-                with io.open(custom_script.script, 'r', encoding='UTF-8') as f:
+                with io.open(custom_script.script, 'r', encoding='ISO-8859-1') as f:
                     self.zipf.writestr(
                         os.path.join(CUSTOM_SCRIPT_FOLDER, new_file_name),
                         f.read().replace('\r\n', '\n'))
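As a side note on the UTF-8 to ISO-8859-1 switch above: ISO-8859-1 assigns a character to every byte value 0x00-0xFF, so a read can never raise `UnicodeDecodeError`, whereas strict UTF-8 decoding fails on invalid byte sequences. A minimal standalone sketch (the file name is illustrative, not from the patch):

```python
import io

# Illustrative file name; write every possible byte value, which includes
# sequences that are invalid UTF-8 (e.g. a lone 0x80 continuation byte).
with io.open('sample.bin', 'wb') as f:
    f.write(bytes(range(256)))

# Strict UTF-8 decoding rejects the invalid sequences.
try:
    with io.open('sample.bin', 'r', encoding='UTF-8') as f:
        f.read()
except UnicodeDecodeError as err:
    print('UTF-8 decoding failed:', err)

# ISO-8859-1 maps each byte straight to the code point with the same value,
# so this read always succeeds and yields one character per byte.
with io.open('sample.bin', 'r', encoding='ISO-8859-1') as f:
    print(len(f.read()))  # 256
```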
diff --git a/aztk/spark/models/plugins/resource_monitor/images/Influx_measurements.png b/aztk/spark/models/plugins/resource_monitor/images/Influx_measurements.png
new file mode 100644
index 00000000..7287a047
Binary files /dev/null and b/aztk/spark/models/plugins/resource_monitor/images/Influx_measurements.png differ
diff --git a/aztk/spark/models/plugins/resource_monitor/images/datasource_setup.png b/aztk/spark/models/plugins/resource_monitor/images/datasource_setup.png
new file mode 100644
index 00000000..08593544
Binary files /dev/null and b/aztk/spark/models/plugins/resource_monitor/images/datasource_setup.png differ
diff --git a/aztk/spark/models/plugins/resource_monitor/images/default_dashboard.png b/aztk/spark/models/plugins/resource_monitor/images/default_dashboard.png
new file mode 100644
index 00000000..caa71d39
Binary files /dev/null and b/aztk/spark/models/plugins/resource_monitor/images/default_dashboard.png differ
diff --git a/aztk/spark/models/plugins/resource_monitor/images/grafana_login.png b/aztk/spark/models/plugins/resource_monitor/images/grafana_login.png
new file mode 100644
index 00000000..c44855f7
Binary files /dev/null and b/aztk/spark/models/plugins/resource_monitor/images/grafana_login.png differ
diff --git a/aztk/spark/models/plugins/resource_monitor/images/import_dashboard.png b/aztk/spark/models/plugins/resource_monitor/images/import_dashboard.png
new file mode 100644
index 00000000..a08d9996
Binary files /dev/null and b/aztk/spark/models/plugins/resource_monitor/images/import_dashboard.png differ
diff --git a/aztk/spark/models/plugins/resource_monitor/images/influx_query.png b/aztk/spark/models/plugins/resource_monitor/images/influx_query.png
new file mode 100644
index 00000000..418e3611
Binary files /dev/null and b/aztk/spark/models/plugins/resource_monitor/images/influx_query.png differ
diff --git a/aztk/spark/models/plugins/resource_monitor/nodestats.py b/aztk/spark/models/plugins/resource_monitor/nodestats.py
index 9bc1fcce..ffbb4168 100644
--- a/aztk/spark/models/plugins/resource_monitor/nodestats.py
+++ b/aztk/spark/models/plugins/resource_monitor/nodestats.py
@@ -232,8 +232,6 @@ class NodeStatsCollector:
         series = []
         for cpu_n in range(0, stats.cpu_count):
-            # client.track_metric("Cpu usage",
-            #     stats.cpu_percent[cpu_n], properties={"Cpu #": cpu_n})
             series.append(self._fill_template(
                 now,
                 "Cpu usage",
                 stats.cpu_percent[cpu_n],
@@ -332,7 +330,9 @@ def main():
     node_id = os.environ.get('AZ_BATCH_NODE_ID', '_test-node-1')

     if is_master is None:
-        is_master = os.environ.get('AZTK_IS_MASTER', "0")
+        is_master = os.environ.get('AZTK_IS_MASTER', False)
+    else:
+        is_master = True

     logger.info('setting host to {}'.format(host))
diff --git a/aztk/spark/models/plugins/resource_monitor/readme.md b/aztk/spark/models/plugins/resource_monitor/readme.md
index 23287ff4..bc7455e2 100644
--- a/aztk/spark/models/plugins/resource_monitor/readme.md
+++ b/aztk/spark/models/plugins/resource_monitor/readme.md
@@ -1,11 +1,110 @@
-# Using the Resrouce Monitor Plugin
+# Using the Resource Monitor Plugin
+The resource monitor plugin is useful for tracking performance counters on the cluster, such as per-core CPU usage, disk read, disk write, network in, network out, and several others. Simply enabling the plugin in your cluster.yaml will deploy all the necessary components to start tracking metrics.
+
 ## Setup
+Update your cluster.yaml file to include the plugin as follows:
+
+```yaml
+...
+
+plugins:
+  - name: resource_monitor
+
+...
+```
+
+Once the cluster is created, simply run the cluster ssh command and all of the ports will automatically be forwarded.
+
+```sh
+aztk spark cluster ssh --id <cluster_id>
+```
+
+### Configuration and passwords
+The default environment is configured in the plugin's .env file. We highly recommend updating the user names and passwords before deploying your cluster.
+
+```sh
+# Example .env file; please modify and DO NOT USE AS IS
+INFLUXDB_USER=admin
+INFLUXDB_USER_PASSWORD=password
+INFLUXDB_ADMIN_ENABLED=true
+INFLUXDB_DATA_LOCATION=/mnt/batch/tasks/shared/influxdb
+GF_SECURITY_ADMIN_PASSWORD=password
+GRAFANA_DATA_LOCATION=/mnt/batch/tasks/shared/grafana
+```
+
+### Ports
+URL | Description
+--- | ---
+http://localhost:8083 | InfluxDB Query UI
+http://localhost:8086 | InfluxDB API endpoint
+http://localhost:3000 | Grafana UI
+
+## Querying the database
+This plugin uses an on-disk [InfluxDB database](https://www.influxdata.com/) on the master node to track all of the metrics. The database is available while the cluster is up and running and is destroyed when the cluster is deleted.
+
+After running the **cluster ssh** command, simply navigate to http://localhost:8083.
+
+![InfluxDB query UI](./images/influx_query.png)
+
+All of the performance counter metrics are stored in a database called **data**. In the top right corner of the web page, change the default database to **data**.
+
+## Data
+The metrics currently pushed are listed below.
+
+Measurement | Description
+--- | ---
+Cpu usage | Percentage of CPU used per core
+Disk read | Bytes read
+Disk write | Bytes written
+Memory available | Bytes available
+Memory used | Bytes used
+Network read | Bytes read
+Network write | Bytes written
+
+To see which measurements are available, simply run the SHOW MEASUREMENTS command in the query bar.
+```sql
+/* Show all measurements */
+SHOW MEASUREMENTS
+
+/* Show tag keys for a specific measurement */
+SHOW TAG KEYS FROM "Cpu usage"
+
+/* Show all distinct values for a specific tag of a measurement */
+SHOW TAG VALUES FROM "Cpu usage" WITH KEY = "Cpu #"
+```
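+
+As a quick sketch of programmatic access, the example below assumes the `influxdb` Python package (not installed by the plugin) plus the default credentials and **data** database from the example .env above:
+
+```python
+from influxdb import InfluxDBClient
+
+# Connection details mirror the example .env file and the forwarded API port;
+# change them if you changed the .env defaults.
+client = InfluxDBClient(host='localhost', port=8086,
+                        username='admin', password='password',
+                        database='data')
+
+# Mean per-core CPU usage over the last five minutes, grouped by core.
+result = client.query('SELECT MEAN("value") FROM "Cpu usage" '
+                      'WHERE time > now() - 5m GROUP BY "Cpu #"')
+for key, points in result.items():
+    print(key, list(points))
+```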
+
 ## Visualize data in Grafana
+[Grafana](https://grafana.com/) is a visualization tool that can pull data from InfluxDB. The UI is available while the cluster is up and running and is destroyed when the cluster is deleted.
+
+After running the **cluster ssh** command, simply navigate to http://localhost:3000.
+
+### Log in
+
+To log in, use the username and password defined in the .env file. By default these are _username_: admin and _password_: password.
+
+![Grafana login](./images/grafana_login.png)
+
+### Configure a data source
+
+After logging in, you will need to configure a data source as shown below.
+
+![Grafana data source](./images/datasource_setup.png)
+
 ### Importing the default dashboard
-## Querying the database
+The default dashboard included in this plugin gives an overview of cluster health and is useful for seeing what the cluster is currently doing.
-## Data schema
+
+To import the dashboard, click the '+' button on the left-hand side and select 'import'.
+
+![Grafana dashboard](./images/import_dashboard.png)
+
+The sample configuration file can be found [here](./resource_monitor_dashboard.json).
+
+Once you have imported the dashboard, you can navigate to the Perf Counters dashboard to view the cluster's data.
+
+![Grafana dashboard](./images/default_dashboard.png)
diff --git a/aztk/spark/models/plugins/resource_monitor/resource_monitor_dashboard.json b/aztk/spark/models/plugins/resource_monitor/resource_monitor_dashboard.json
index 561f87c0..83f78fc5 100644
--- a/aztk/spark/models/plugins/resource_monitor/resource_monitor_dashboard.json
+++ b/aztk/spark/models/plugins/resource_monitor/resource_monitor_dashboard.json
@@ -15,7 +15,7 @@
   "editable": true,
   "gnetId": null,
   "graphTooltip": 0,
-  "id": 1,
+  "id": 2,
   "links": [],
   "panels": [
     {
@@ -23,7 +23,7 @@
       "bars": false,
       "dashLength": 10,
       "dashes": false,
-      "datasource": null,
+      "datasource": "perf",
       "fill": 1,
       "gridPos": {
         "h": 9,
@@ -56,6 +56,7 @@
       "steppedLine": false,
       "targets": [
         {
+          "$$hashKey": "object:126",
           "groupBy": [
             {
               "params": [
                 "$__interval"
               ],
               "type": "time"
             },
             {
               "params": [
                 "null"
               ],
               "type": "fill"
             }
           ],
           "orderByTime": "ASC",
           "policy": "default",
           "refId": "A",
           "resultFormat": "time_series",
           "select": [
             [
               {
                 "params": [
                   "value"
                 ],
                 "type": "field"
               },
               {
                 "params": [],
                 "type": "mean"
               }
             ]
           ],
           "tags": []
@@ -80,6 +81,42 @@
+        },
+        {
+          "$$hashKey": "object:133",
+          "groupBy": [
+            {
+              "params": [
+                "$__interval"
+              ],
+              "type": "time"
+            },
+            {
+              "params": [
+                "null"
+              ],
+              "type": "fill"
+            }
+          ],
+          "orderByTime": "ASC",
+          "policy": "default",
+          "refId": "B",
+          "resultFormat": "time_series",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "field"
+              },
+              {
+                "params": [],
+                "type": "mean"
+              }
+            ]
+          ],
+          "tags": []
         }
       ],
       "thresholds": [],
@@ -236,7 +273,7 @@
       "bars": false,
       "dashLength": 10,
       "dashes": false,
-      "datasource": null,
+      "datasource": "perf",
       "fill": 1,
       "gridPos": {
         "h": 9,
@@ -269,7 +306,7 @@
       "steppedLine": false,
       "targets": [
         {
-          "$$hashKey": "object:284",
+          "$$hashKey": "object:229",
           "groupBy": [
             {
               "params": [