add docs for resource_monitor plugin

Pablo 2018-04-18 15:32:24 -07:00
Parent ade9bf9a3a
Commit b13cffcfee
10 changed files: 148 additions and 12 deletions

View file

@@ -46,7 +46,7 @@ class NodeData:
             return
         if isinstance(file, (str, bytes)):
             full_file_path = Path(file)
-            with io.open(file, 'r', encoding='UTF-8') as f:
+            with io.open(file, 'r', encoding='ISO-8859-1') as f:
                 if binary:
                     self.zipf.write(file, os.path.join(zip_dir, full_file_path.name))
                 else:
@@ -69,7 +69,7 @@ class NodeData:
             relative_folder = os.path.relpath(base, path)
             for file in files:
                 if self._includeFile(file, exclude):
-                    with io.open(os.path.join(base, file), 'r', encoding='UTF-8') as f:
+                    with io.open(os.path.join(base, file), 'r', encoding='ISO-8859-1') as f:
                         self.zipf.writestr(os.path.join(dest, relative_folder, file), f.read().replace('\r\n', '\n'))

     def _add_custom_scripts(self):
@@ -82,7 +82,7 @@ class NodeData:
             new_file_name = str(index) + '_' + os.path.basename(custom_script.script)
             data.append(dict(script=new_file_name, runOn=str(custom_script.run_on)))
             try:
-                with io.open(custom_script.script, 'r', encoding='UTF-8') as f:
+                with io.open(custom_script.script, 'r', encoding='ISO-8859-1') as f:
                     self.zipf.writestr(
                         os.path.join(CUSTOM_SCRIPT_FOLDER, new_file_name),
                         f.read().replace('\r\n', '\n'))

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary data
aztk/spark/models/plugins/resource_monitor/images/influx_query.png Normal file

Binary file not shown.

View file

@@ -232,8 +232,6 @@ class NodeStatsCollector:
         series = []
         for cpu_n in range(0, stats.cpu_count):
-            # client.track_metric("Cpu usage",
-            #                     stats.cpu_percent[cpu_n], properties={"Cpu #": cpu_n})
             series.append(self._fill_template(now,
                                               "Cpu usage",
                                               stats.cpu_percent[cpu_n],
@@ -332,7 +330,9 @@ def main():
     node_id = os.environ.get('AZ_BATCH_NODE_ID', '_test-node-1')
     if is_master is None:
-        is_master = os.environ.get('AZTK_IS_MASTER', "0")
+        is_master = os.environ.get('AZTK_IS_MASTER', False)
     else:
         is_master = True
     logger.info('setting host to {}'.format(host))

View file

@@ -1,11 +1,110 @@
# Using the Resource Monitor Plugin
The resource monitor plugin is useful for tracking performance counters on the cluster. These include counters such as percent CPU used per core, disk read, disk write, network in, network out, and several others. Simply enabling the plugin in your cluster.yaml will deploy all the necessary components to start tracking metrics.
## Setup
Update your cluster.yaml file to include the plugin as follows:
```yaml
...
plugins:
- name: resource_monitor
...
```
Once the cluster is created, simply run the cluster ssh command and all of the ports will automatically be forwarded.
```sh
aztk spark cluster ssh --id <my_cluster>
```
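If you prefer to set up the tunnels by hand (for example, from a machine without the aztk CLI), the forwarding is ordinary SSH local port forwarding. The sketch below assumes you already know the master node's address, SSH port, and username; it is not an aztk command, just the plain-SSH equivalent of what the cluster ssh command does for these ports:
```sh
# Manual alternative to the aztk port forwarding (sketch).
# <username>, <master-node-ip> and <ssh-port> are placeholders taken from your own cluster.
ssh -L 8083:localhost:8083 \
    -L 8086:localhost:8086 \
    -L 3000:localhost:3000 \
    -p <ssh-port> <username>@<master-node-ip>
```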
### Configuration and passwords
The default environment is configured in the plugin's .env file. We highly recommend updating the usernames and passwords before deploying your cluster.
```sh
# Example .env file. Please modify it and DO NOT USE AS IS.
INFLUXDB_USER=admin
INFLUXDB_USER_PASSWORD=password
INFLUXDB_ADMIN_ENABLED=true
INFLUXDB_DATA_LOCATION=/mnt/batch/tasks/shared/influxdb
GF_SECURITY_ADMIN_PASSWORD=password
GRAFANA_DATA_LOCATION=/mnt/batch/tasks/shared/grafana
```
### Ports
URL | Description
--- | ---
http://localhost:8083 | InfluxDB Query UI
http://localhost:8086 | InfluxDB API endpoint
http://localhost:3000 | Grafana UI
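With the tunnel up, a quick way to confirm the InfluxDB API endpoint is reachable is to query it directly. This is a minimal sketch that assumes the credentials from the example .env file above and InfluxDB's 1.x HTTP query API:
```sh
# Sanity check of the forwarded InfluxDB API endpoint (assumes the example .env credentials).
curl -G 'http://localhost:8086/query' \
     -u admin:password \
     --data-urlencode 'q=SHOW DATABASES'
```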
## Querying the database
This plugin uses an on-disk [InfluxDB database](https://www.influxdata.com/) on the master node to track all of the metrics. The database is available while the cluster is up and running and is destroyed when the cluster is deleted.
After running the **cluster ssh** command, simply navigate to http://localhost:8083.
![InfluxDB query UI](./images/influx_query.png)
All of the performance counter metrics are stored in a database called **data**. In the top right corner of the web page, change the default database to use **data**.
## Data
The metrics currently pushed are listed below.
Measurement | Description
--- | ---
Cpu usage | Percentage of CPU used per core
Disk read | Bytes read
Disk write | Bytes written
Memory available | Bytes available
Memory used | Bytes used
Network read | Bytes read
Network write | Bytes written
To view what measurements are available, simply run the SHOW MEASUREMENTS command in the query bar.
```sql
/* Show all measurements */
SHOW MEASUREMENTS
/* Show tag keys for a specific measurement */
SHOW TAG KEYS FROM "Cpu usage"
/* Show all distinct values for a specific tag of a measurement */
SHOW TAG VALUES FROM "Cpu usage" WITH KEY = "Cpu #"
```
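Once you know the measurement and tag names, a typical query looks like the sketch below. The field name `value` matches what the bundled dashboard selects, but treat the exact field names as assumptions and verify them with `SHOW FIELD KEYS` if in doubt:
```sql
/* Mean CPU usage per core over the last 5 minutes, bucketed into 10 second intervals */
/* Assumes the "value" field (as used by the bundled dashboard) and the "Cpu #" tag */
SELECT mean("value")
FROM "Cpu usage"
WHERE time > now() - 5m
GROUP BY time(10s), "Cpu #"
```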
## Visualize data in Grafana
[Grafana](https://grafana.com/) is a visualization tool that can pull data from InfluxDB. The UI is available while the cluster is up and running and is destroyed when the cluster is deleted.
After running the **cluster ssh** command, simply navigate to http://localhost:3000.
### Log in
To log in, use the username and password defined in the .env file. By default these are _username_: admin and _password_: password.
![Grafana login](./images/grafana_login.png)
### Configure a data source
After logging in, you will need to configure an InfluxDB data source as shown below.
![Grafana data source](./images/datasource_setup.png)
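In summary, the data source should point Grafana at the InfluxDB instance described above. The data source name **perf** is what the bundled dashboard JSON references; the URL below is an assumption that Grafana can reach InfluxDB locally on the master node, so adjust it if your setup differs.

Setting | Value
--- | ---
Type | InfluxDB
Name | perf
URL | http://localhost:8086
Database | data
User / Password | the InfluxDB credentials from your .env file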
### Importing the default dashboard
The default dashboard included in this plugin gives an overview of cluster health and is useful for seeing what the cluster is currently doing.
To import the dashboard, click the '+' button on the left-hand side and select 'Import'.
![Grafana dashboard](./images/import_dashboard.png)
The sample configuration file can be found [here](./resource_monitor_dashboard.json).
Once you have imported the dashboard, you can navigate to the Perf Counters dashboard to view the cluster's data.
![Grafana dashboard](./images/default_dashboard.png)

View file

@@ -15,7 +15,7 @@
   "editable": true,
   "gnetId": null,
   "graphTooltip": 0,
-  "id": 1,
+  "id": 2,
   "links": [],
   "panels": [
     {
@@ -23,7 +23,7 @@
       "bars": false,
       "dashLength": 10,
       "dashes": false,
-      "datasource": null,
+      "datasource": "perf",
       "fill": 1,
       "gridPos": {
         "h": 9,
@@ -56,6 +56,7 @@
       "steppedLine": false,
       "targets": [
         {
+          "$$hashKey": "object:126",
           "groupBy": [
             {
               "params": [
@@ -80,6 +81,42 @@
             ]
           ],
           "tags": []
+        },
+        {
+          "$$hashKey": "object:133",
+          "groupBy": [
+            {
+              "params": [
+                "$__interval"
+              ],
+              "type": "time"
+            },
+            {
+              "params": [
+                "null"
+              ],
+              "type": "fill"
+            }
+          ],
+          "orderByTime": "ASC",
+          "policy": "default",
+          "refId": "B",
+          "resultFormat": "time_series",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "field"
+              },
+              {
+                "params": [],
+                "type": "mean"
+              }
+            ]
+          ],
+          "tags": []
         }
       ],
       "thresholds": [],
@@ -236,7 +273,7 @@
       "bars": false,
       "dashLength": 10,
       "dashes": false,
-      "datasource": null,
+      "datasource": "perf",
       "fill": 1,
       "gridPos": {
         "h": 9,
@@ -269,7 +306,7 @@
       "steppedLine": false,
       "targets": [
         {
-          "$$hashKey": "object:284",
+          "$$hashKey": "object:229",
           "groupBy": [
             {
               "params": [