add docs for resource_monitor plugin

Pablo 2018-04-18 15:32:24 -07:00
Parent ade9bf9a3a
Commit b13cffcfee
10 changed files: 148 additions and 12 deletions

View file

@@ -46,7 +46,7 @@ class NodeData:
             return
         if isinstance(file, (str, bytes)):
             full_file_path = Path(file)
-            with io.open(file, 'r', encoding='UTF-8') as f:
+            with io.open(file, 'r', encoding='ISO-8859-1') as f:
                 if binary:
                     self.zipf.write(file, os.path.join(zip_dir, full_file_path.name))
                 else:
@@ -69,7 +69,7 @@ class NodeData:
             relative_folder = os.path.relpath(base, path)
             for file in files:
                 if self._includeFile(file, exclude):
-                    with io.open(os.path.join(base, file), 'r', encoding='UTF-8') as f:
+                    with io.open(os.path.join(base, file), 'r', encoding='ISO-8859-1') as f:
                         self.zipf.writestr(os.path.join(dest, relative_folder, file), f.read().replace('\r\n', '\n'))

     def _add_custom_scripts(self):
@@ -82,7 +82,7 @@ class NodeData:
             new_file_name = str(index) + '_' + os.path.basename(custom_script.script)
             data.append(dict(script=new_file_name, runOn=str(custom_script.run_on)))
             try:
-                with io.open(custom_script.script, 'r', encoding='UTF-8') as f:
+                with io.open(custom_script.script, 'r', encoding='ISO-8859-1') as f:
                     self.zipf.writestr(
                         os.path.join(CUSTOM_SCRIPT_FOLDER, new_file_name),
                         f.read().replace('\r\n', '\n'))

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary data
aztk/spark/models/plugins/resource_monitor/images/influx_query.png Normal file

Binary file not shown.

View file

@@ -232,8 +232,6 @@ class NodeStatsCollector:
         series = []
         for cpu_n in range(0, stats.cpu_count):
-            # client.track_metric("Cpu usage",
-            #                     stats.cpu_percent[cpu_n], properties={"Cpu #": cpu_n})
             series.append(self._fill_template(now,
                                               "Cpu usage",
                                               stats.cpu_percent[cpu_n],
@@ -332,7 +330,9 @@ def main():
     node_id = os.environ.get('AZ_BATCH_NODE_ID', '_test-node-1')
     if is_master is None:
-        is_master = os.environ.get('AZTK_IS_MASTER', "0")
+        is_master = os.environ.get('AZTK_IS_MASTER', False)
     else:
         is_master = True
     logger.info('setting host to {}'.format(host))

View file

@@ -1,11 +1,110 @@
# Using the Resource Monitor Plugin
The resource monitor plugin is useful for tracking performance counters on the cluster. These include counters such as percent CPU used per core, disk read, disk write, network in, network out, and several others. Simply enabling the plugin in your cluster.yaml will deploy all the necessary components to start tracking metrics.
## Setup
Update your cluster.yaml file to include the plugin as follows:
```yaml
...
plugins:
- name: resource_monitor
...
```
Once the cluster is created, simply run the cluster ssh command and all of the ports will automatically be forwarded.
```sh
aztk spark cluster ssh --id <my_cluster>
```
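If you prefer to set up the tunnels by hand (for example, from a machine without the aztk CLI), the forwarding is ordinary SSH local port forwarding. The sketch below assumes you already know the master node's address, SSH port, and username; it is not an aztk command, just the plain-SSH equivalent of what the cluster ssh command does for these ports:
```sh
# Manual alternative to the aztk port forwarding (sketch).
# <username>, <master-node-ip> and <ssh-port> are placeholders taken from your own cluster.
ssh -L 8083:localhost:8083 \
    -L 8086:localhost:8086 \
    -L 3000:localhost:3000 \
    -p <ssh-port> <username>@<master-node-ip>
```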
### Configuration and passwords
The default environment is configured in the plugin's .env file. We highly recommend updating the usernames and passwords before deploying your cluster.
```sh
# Example .env file. Please modify it and DO NOT USE AS IS.
INFLUXDB_USER=admin
INFLUXDB_USER_PASSWORD=password
INFLUXDB_ADMIN_ENABLED=true
INFLUXDB_DATA_LOCATION=/mnt/batch/tasks/shared/influxdb
GF_SECURITY_ADMIN_PASSWORD=password
GRAFANA_DATA_LOCATION=/mnt/batch/tasks/shared/grafana
```
### Ports
URL | Description
--- | ---
http://localhost:8083 | InfluxDB Query UI
http://localhost:8086 | InfluxDB API endpoint
http://localhost:3000 | Grafana UI
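With the tunnel up, a quick way to confirm the InfluxDB API endpoint is reachable is to query it directly. This is a minimal sketch that assumes the credentials from the example .env file above and InfluxDB's 1.x HTTP query API:
```sh
# Sanity check of the forwarded InfluxDB API endpoint (assumes the example .env credentials).
curl -G 'http://localhost:8086/query' \
     -u admin:password \
     --data-urlencode 'q=SHOW DATABASES'
```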
## Querying the database
This plugin uses an on-disk [InfluxDB database](https://www.influxdata.com/) on the master node to track all of the metrics. The database is available while the cluster is up and running and is destroyed when the cluster is deleted.
After running the **cluster ssh** command, simply navigate to http://localhost:8083.
![InfluxDB query UI](./images/influx_query.png)
All of the performance counter metrics are stored in a database called **data**. In the top right corner of the web page, change the default database to use **data**.
## Data
The metrics currently pushed are listed below.
Measurement | Description
--- | ---
Cpu usage | Percentage of CPU used per core
Disk read | Bytes read
Disk write | Bytes written
Memory available | Bytes available
Memory used | Bytes used
Network read | Bytes read
Network write | Bytes written
To view what measurements are available, simply run the SHOW MEASUREMENTS command in the query bar.
```sql
/* Show all measurements */
SHOW MEASUREMENTS
/* Show tag keys for a specific measurement */
SHOW TAG KEYS FROM "Cpu usage"
/* Show all distinct values for a specific tag of a measurement */
SHOW TAG VALUES FROM "Cpu usage" WITH KEY = "Cpu #"
```
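Once you know the measurement and tag names, a typical query looks like the sketch below. The field name `value` matches what the bundled dashboard selects, but treat the exact field names as assumptions and verify them with `SHOW FIELD KEYS` if in doubt:
```sql
/* Mean CPU usage per core over the last 5 minutes, bucketed into 10 second intervals */
/* Assumes the "value" field (as used by the bundled dashboard) and the "Cpu #" tag */
SELECT mean("value")
FROM "Cpu usage"
WHERE time > now() - 5m
GROUP BY time(10s), "Cpu #"
```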
## Visualize data in Grafana
[Grafana](https://grafana.com/) is a visualization tool that can pull data from InfluxDB. The UI is available while the cluster is up and running and is destroyed when the cluster is deleted.
After running the **cluster ssh** command, simply navigate to http://localhost:3000.
### Log in
To log in, use the username and password defined in the .env file. By default these are _username_: admin and _password_: password.
![Grafana login](./images/grafana_login.png)
### Configure a data source
After logging in, you will need to configure an InfluxDB data source as shown below.
![Grafana data source](./images/datasource_setup.png)
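In summary, the data source should point Grafana at the InfluxDB instance described above. The data source name **perf** is what the bundled dashboard JSON references; the URL below is an assumption that Grafana can reach InfluxDB locally on the master node, so adjust it if your setup differs.

Setting | Value
--- | ---
Type | InfluxDB
Name | perf
URL | http://localhost:8086
Database | data
User / Password | the InfluxDB credentials from your .env file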
### Importing the default dashboard
The default dashboard included in this plugin gives an overview of cluster health and is useful for seeing what the cluster is currently doing.
To import the dashboard, click the '+' button on the left-hand side and select 'Import'.
![Grafana dashboard](./images/import_dashboard.png)
The sample configuration file can be found [here](./resource_monitor_dashboard.json).
Once you have imported the dashboard, you can navigate to the Perf Counters dashboard to view the cluster's data.
![Grafana dashboard](./images/default_dashboard.png)

View file

@@ -15,7 +15,7 @@
   "editable": true,
   "gnetId": null,
   "graphTooltip": 0,
-  "id": 1,
+  "id": 2,
   "links": [],
   "panels": [
     {
@@ -23,7 +23,7 @@
       "bars": false,
       "dashLength": 10,
       "dashes": false,
-      "datasource": null,
+      "datasource": "perf",
       "fill": 1,
       "gridPos": {
         "h": 9,
@@ -56,6 +56,7 @@
       "steppedLine": false,
       "targets": [
         {
+          "$$hashKey": "object:126",
           "groupBy": [
             {
               "params": [
@@ -80,6 +81,42 @@
             ]
           ],
           "tags": []
+        },
+        {
+          "$$hashKey": "object:133",
+          "groupBy": [
+            {
+              "params": [
+                "$__interval"
+              ],
+              "type": "time"
+            },
+            {
+              "params": [
+                "null"
+              ],
+              "type": "fill"
+            }
+          ],
+          "orderByTime": "ASC",
+          "policy": "default",
+          "refId": "B",
+          "resultFormat": "time_series",
+          "select": [
+            [
+              {
+                "params": [
+                  "value"
+                ],
+                "type": "field"
+              },
+              {
+                "params": [],
+                "type": "mean"
+              }
+            ]
+          ],
+          "tags": []
         }
       ],
       "thresholds": [],
@@ -236,7 +273,7 @@
       "bars": false,
       "dashLength": 10,
       "dashes": false,
-      "datasource": null,
+      "datasource": "perf",
       "fill": 1,
       "gridPos": {
         "h": 9,
@@ -269,7 +306,7 @@
       "steppedLine": false,
       "targets": [
         {
-          "$$hashKey": "object:284",
+          "$$hashKey": "object:229",
           "groupBy": [
             {
               "params": [