Grafana template
Published:
Import the JSON below into Grafana (Dashboards → New → Import).
main dashboard
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Cluster Overview — CPU, Memory, Disk, Network, Node health, and GPU status via node_exporter + DCGM",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 1,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 100,
"panels": [],
"title": "Node Health",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"0": {
"color": "red",
"index": 0,
"text": "DOWN"
},
"1": {
"color": "green",
"index": 1,
"text": "UP"
}
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 1
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 24,
"x": 0,
"y": 1
},
"id": 1,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"repeat": "node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "up{job=\"node_exporter\", instance=~\"$node\"}",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "Node Status — $node",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 9
},
"id": 101,
"panels": [],
"title": "CPU",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 10
},
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"repeat": "node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"$node\"}[2m])) * 100)",
"legendFormat": "",
"refId": "A"
}
],
"title": "CPU Usage — $node",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 20
},
"id": 3,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"$node\"}[2m])) * 100)",
"legendFormat": "",
"refId": "A"
}
],
"title": "CPU Usage Over Time",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 28
},
"id": 102,
"panels": [],
"title": "Memory",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 29
},
"id": 4,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"repeat": "node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 * (1 - (node_memory_MemAvailable_bytes{instance=~\"$node\"} / node_memory_MemTotal_bytes{instance=~\"$node\"}))",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "Memory Usage — $node",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 39
},
"id": 5,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 * (1 - (node_memory_MemAvailable_bytes{instance=~\"$node\"} / node_memory_MemTotal_bytes{instance=~\"$node\"}))",
"legendFormat": "",
"refId": "A"
}
],
"title": "Memory Usage Over Time",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 47
},
"id": 103,
"panels": [],
"title": "Disk",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 48
},
"id": 6,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"repeat": "node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 - (node_filesystem_avail_bytes{instance=~\"$node\", mountpoint=\"/\", fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{instance=~\"$node\", mountpoint=\"/\", fstype!~\"tmpfs|overlay\"} * 100)",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "Disk Usage (/) — $node",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 58
},
"id": 7,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(node_disk_read_bytes_total{instance=~\"$node\", device!~\"dm-.*\"}[2m])",
"legendFormat": " read ",
"refId": "A"
}
],
"title": "Disk Read Throughput",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 58
},
"id": 8,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(node_disk_written_bytes_total{instance=~\"$node\", device!~\"dm-.*\"}[2m])",
"legendFormat": " write ",
"refId": "A"
}
],
"title": "Disk Write Throughput",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 66
},
"id": 104,
"panels": [],
"title": "Network",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 67
},
"id": 9,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(node_network_receive_bytes_total{instance=~\"$node\", device!~\"lo|veth.*|docker.*|br.*\"}[2m]) * 8",
"legendFormat": " rx ",
"refId": "A"
}
],
"title": "Network Receive (bps)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 67
},
"id": 10,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(node_network_transmit_bytes_total{instance=~\"$node\", device!~\"lo|veth.*|docker.*|br.*\"}[2m]) * 8",
"legendFormat": " tx ",
"refId": "A"
}
],
"title": "Network Transmit (bps)",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 75
},
"id": 105,
"panels": [],
"title": "System Load & Uptime",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 76
},
"id": 11,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "node_load1{instance=~\"$node\"}",
"legendFormat": " 1m",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "node_load5{instance=~\"$node\"}",
"legendFormat": " 5m",
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "node_load15{instance=~\"$node\"}",
"legendFormat": " 15m",
"refId": "C"
}
],
"title": "System Load Average",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "blue",
"value": 0
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 84
},
"id": 12,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"repeat": "node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "time() - node_boot_time_seconds{instance=~\"$node\"}",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "Uptime — $node",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 94
},
"id": 106,
"panels": [],
"title": "GPU — DCGM (GPU nodes only)",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"description": "Only shows nodes where DCGM metrics are present. Nodes without GPUs will not appear.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 60
},
{
"color": "red",
"value": 85
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 95
},
"id": 20,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"repeat": "gpu_node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "avg by(instance) (DCGM_FI_DEV_GPU_UTIL{instance=~\"$gpu_node\", gpu=~\"$gpu\"})",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "GPU Utilization — $gpu_node",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 100
},
"id": 21,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"repeat": "gpu_node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "avg by(instance) (DCGM_FI_DEV_FB_USED{instance=~\"$gpu_node\", gpu=~\"$gpu\"} / (DCGM_FI_DEV_FB_USED{instance=~\"$gpu_node\", gpu=~\"$gpu\"} + DCGM_FI_DEV_FB_FREE{instance=~\"$gpu_node\", gpu=~\"$gpu\"}) * 100)",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "GPU Memory Usage — $gpu_node",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 75
},
{
"color": "red",
"value": 85
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 105
},
"id": 22,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"repeat": "gpu_node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "avg by(instance) (DCGM_FI_DEV_GPU_TEMP{instance=~\"$gpu_node\", gpu=~\"$gpu\"})",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "GPU Temperature — $gpu_node",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 60
},
{
"color": "red",
"value": 85
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 110
},
"id": 23,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": " GPU",
"refId": "A"
}
],
"title": "GPU Utilization Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 110
},
"id": 24,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"$gpu_node\", gpu=~\"$gpu\"} / (DCGM_FI_DEV_FB_USED{instance=~\"$gpu_node\", gpu=~\"$gpu\"} + DCGM_FI_DEV_FB_FREE{instance=~\"$gpu_node\", gpu=~\"$gpu\"}) * 100",
"legendFormat": " GPU",
"refId": "A"
}
],
"title": "GPU Memory Usage Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 118
},
"id": 25,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": " GPU",
"refId": "A"
}
],
"title": "GPU Temperature Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 118
},
"id": 26,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": " GPU",
"refId": "A"
}
],
"title": "GPU Power Usage (W)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 126
},
"id": 27,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(DCGM_FI_PROF_PCIE_TX_BYTES{instance=~\"$gpu_node\", gpu=~\"$gpu\"}[2m])",
"legendFormat": " GPU TX",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(DCGM_FI_PROF_PCIE_RX_BYTES{instance=~\"$gpu_node\", gpu=~\"$gpu\"}[2m])",
"legendFormat": " GPU RX",
"refId": "B"
}
],
"title": "GPU PCIe Bandwidth",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 126
},
"id": 28,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": " GPU SM",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_MEM_CLOCK{instance=~\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": " GPU Mem",
"refId": "B"
}
],
"title": "GPU Clock Speeds (MHz)",
"type": "timeseries"
}
],
"preload": false,
"refresh": "30s",
"schemaVersion": 42,
"tags": [
"cluster",
"node_exporter",
"infrastructure",
"gpu",
"dcgm"
],
"templating": {
"list": [
{
"current": {
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"definition": "label_values(up{job=\"node_exporter\"}, instance)",
"includeAll": true,
"label": "Node",
"multi": true,
"name": "node",
"options": [],
"query": {
"query": "label_values(up{job=\"node_exporter\"}, instance)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"sort": 1,
"type": "query"
},
{
"current": {
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"definition": "label_values(DCGM_FI_DEV_GPU_UTIL, instance)",
"includeAll": true,
"label": "GPU Node",
"multi": true,
"name": "gpu_node",
"options": [],
"query": {
"query": "label_values(DCGM_FI_DEV_GPU_UTIL, instance)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"sort": 1,
"type": "query"
},
{
"current": {
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"definition": "label_values(DCGM_FI_DEV_GPU_UTIL{instance=~\"$gpu_node\"}, gpu)",
"includeAll": true,
"label": "GPU Index",
"multi": true,
"name": "gpu",
"options": [],
"query": {
"query": "label_values(DCGM_FI_DEV_GPU_UTIL{instance=~\"$gpu_node\"}, gpu)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Cluster Overview",
"uid": "cluster-overview-v1",
"version": 1
}
main dashboard v2
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Cluster Overview — CPU, Memory, Disk, Network, Node health, and GPU status via node_exporter + DCGM",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 8,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 100,
"panels": [],
"title": "Node Health",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"0": {
"color": "red",
"index": 0,
"text": "DOWN"
},
"1": {
"color": "green",
"index": 1,
"text": "UP"
}
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 1
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 24,
"x": 0,
"y": 1
},
"id": 1,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"repeat": "node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "up{job=\"node_exporter\"} * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"}",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "Node Status — $node",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 9
},
"id": 101,
"panels": [],
"title": "CPU",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 10
},
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"repeat": "node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 - (avg by(instance, nodename) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m]) * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"}) * 100)",
"legendFormat": "",
"refId": "A"
}
],
"title": "CPU Usage — $node",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 20
},
"id": 3,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 - (avg by(instance, nodename) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m]) * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"}) * 100)",
"legendFormat": "{{nodename}}",
"refId": "A"
}
],
"title": "CPU Usage Over Time",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 28
},
"id": 102,
"panels": [],
"title": "Memory",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 29
},
"id": 4,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"repeat": "node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 * (1 - ((node_memory_MemAvailable_bytes * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"}) / (node_memory_MemTotal_bytes * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"})))",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "Memory Usage — $node",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 39
},
"id": 5,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 * (1 - ((node_memory_MemAvailable_bytes * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"}) / (node_memory_MemTotal_bytes * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"})))",
"legendFormat": "{{nodename}}",
"refId": "A"
}
],
"title": "Memory Usage Over Time",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 47
},
"id": 103,
"panels": [],
"title": "Disk",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 48
},
"id": 6,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"repeat": "node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 - ((node_filesystem_avail_bytes{mountpoint=\"/\", fstype!~\"tmpfs|overlay\"} * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"}) / (node_filesystem_size_bytes{mountpoint=\"/\", fstype!~\"tmpfs|overlay\"} * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"}) * 100)",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "Disk Usage (/) — $node",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 53
},
"id": 7,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(node_disk_read_bytes_total{device!~\"dm-.*\"}[2m]) * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"}",
"legendFormat": "{{nodename}} read {{device}}",
"refId": "A"
}
],
"title": "Disk Read Throughput",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 53
},
"id": 8,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(node_disk_written_bytes_total{device!~\"dm-.*\"}[2m]) * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"}",
"legendFormat": "{{nodename}} write {{device}}",
"refId": "A"
}
],
"title": "Disk Write Throughput",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 61
},
"id": 104,
"panels": [],
"title": "Network",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 62
},
"id": 9,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(node_network_receive_bytes_total{device!~\"lo|veth.*|docker.*|br.*\"}[2m]) * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"} * 8",
"legendFormat": "{{nodename}} rx {{device}}",
"refId": "A"
}
],
"title": "Network Receive (bps)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 62
},
"id": 10,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|docker.*|br.*\"}[2m]) * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"} * 8",
"legendFormat": "{{nodename}} tx {{device}}",
"refId": "A"
}
],
"title": "Network Transmit (bps)",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 70
},
"id": 105,
"panels": [],
"title": "System Load & Uptime",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 71
},
"id": 11,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "node_load1 * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"}",
"legendFormat": "{{nodename}} 1m",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "node_load5 * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"}",
"legendFormat": "{{nodename}} 5m",
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "node_load15 * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"}",
"legendFormat": "{{nodename}} 15m",
"refId": "C"
}
],
"title": "System Load Average",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "blue",
"value": 0
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 79
},
"id": 12,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"repeat": "node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "time() - (node_boot_time_seconds * on(instance) group_left(nodename) node_uname_info{nodename=~\"^($node)$\"})",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "Uptime — $node",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 84
},
"id": 106,
"panels": [],
"title": "GPU — DCGM (GPU nodes only)",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"description": "Only shows nodes where DCGM metrics are present. Nodes without GPUs will not appear.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "yellow",
"value": 60
},
{
"color": "red",
"value": 85
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 85
},
"id": 20,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.0.0",
"repeat": "gpu_node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "avg by(instance) (DCGM_FI_DEV_GPU_UTIL{instance=~\"$gpu_node\", gpu=~\"$gpu\"})",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "GPU Utilization — $gpu_node",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 90
},
"id": 21,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.0.0",
"repeat": "gpu_node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "avg by(instance) (DCGM_FI_DEV_FB_USED{instance=~\"$gpu_node\", gpu=~\"$gpu\"} / (DCGM_FI_DEV_FB_USED{instance=~\"$gpu_node\", gpu=~\"$gpu\"} + DCGM_FI_DEV_FB_FREE{instance=~\"$gpu_node\", gpu=~\"$gpu\"}) * 100)",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "GPU Memory Usage — $gpu_node",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "yellow",
"value": 75
},
{
"color": "red",
"value": 85
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 95
},
"id": 22,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.0.0",
"repeat": "gpu_node",
"repeatDirection": "h",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "avg by(instance) (DCGM_FI_DEV_GPU_TEMP{instance=~\"$gpu_node\", gpu=~\"$gpu\"})",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "GPU Temperature — $gpu_node",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"drawStyle": "line",
"fillOpacity": 15,
"lineInterpolation": "smooth",
"lineWidth": 2,
"showPoints": "never",
"spanNulls": false
},
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "yellow",
"value": 60
},
{
"color": "red",
"value": 85
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 100
},
"id": 23,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": "{{instance}} GPU{{gpu}}",
"refId": "A"
}
],
"title": "GPU Utilization Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"drawStyle": "line",
"fillOpacity": 15,
"lineInterpolation": "smooth",
"lineWidth": 2,
"showPoints": "never",
"spanNulls": false
},
"max": 100,
"min": 0,
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 100
},
"id": 24,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"$gpu_node\", gpu=~\"$gpu\"} / (DCGM_FI_DEV_FB_USED{instance=~\"$gpu_node\", gpu=~\"$gpu\"} + DCGM_FI_DEV_FB_FREE{instance=~\"$gpu_node\", gpu=~\"$gpu\"}) * 100",
"legendFormat": "{{instance}} GPU{{gpu}}",
"refId": "A"
}
],
"title": "GPU Memory Usage Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"drawStyle": "line",
"fillOpacity": 10,
"lineInterpolation": "smooth",
"lineWidth": 2,
"showPoints": "never",
"spanNulls": false
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 108
},
"id": 25,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": "{{instance}} GPU{{gpu}}",
"refId": "A"
}
],
"title": "GPU Temperature Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"drawStyle": "line",
"fillOpacity": 10,
"lineInterpolation": "smooth",
"lineWidth": 2,
"showPoints": "never",
"spanNulls": false
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 108
},
"id": 26,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": "{{instance}} GPU{{gpu}}",
"refId": "A"
}
],
"title": "GPU Power Usage (W)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"drawStyle": "line",
"fillOpacity": 10,
"lineInterpolation": "smooth",
"lineWidth": 2,
"showPoints": "never",
"spanNulls": false
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 116
},
"id": 27,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(DCGM_FI_PROF_PCIE_TX_BYTES{instance=~\"$gpu_node\", gpu=~\"$gpu\"}[2m])",
"legendFormat": "{{instance}} GPU{{gpu}} TX",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(DCGM_FI_PROF_PCIE_RX_BYTES{instance=~\"$gpu_node\", gpu=~\"$gpu\"}[2m])",
"legendFormat": "{{instance}} GPU{{gpu}} RX",
"refId": "B"
}
],
"title": "GPU PCIe Bandwidth",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"drawStyle": "line",
"fillOpacity": 10,
"lineInterpolation": "smooth",
"lineWidth": 2,
"showPoints": "never",
"spanNulls": false
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 116
},
"id": 28,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": "{{instance}} GPU{{gpu}} SM",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_MEM_CLOCK{instance=~\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": "{{instance}} GPU{{gpu}} Mem",
"refId": "B"
}
],
"title": "GPU Clock Speeds (MHz)",
"type": "timeseries"
}
],
"preload": false,
"refresh": "30s",
"schemaVersion": 42,
"tags": [
"cluster",
"node_exporter",
"infrastructure",
"gpu",
"dcgm"
],
"templating": {
"list": [
{
"current": {
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"definition": "label_values(node_uname_info, nodename)",
"includeAll": true,
"label": "Node",
"multi": true,
"name": "node",
"options": [],
"query": {
"query": "label_values(node_uname_info, nodename)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"sort": 1,
"type": "query"
},
{
"current": {
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"definition": "label_values(DCGM_FI_DEV_GPU_UTIL, instance)",
"includeAll": true,
"label": "GPU Node",
"multi": true,
"name": "gpu_node",
"options": [],
"query": {
"query": "label_values(DCGM_FI_DEV_GPU_UTIL, instance)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"sort": 1,
"type": "query"
},
{
"current": {
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"definition": "label_values(DCGM_FI_DEV_GPU_UTIL{instance=~\"$gpu_node\"}, gpu)",
"includeAll": true,
"label": "GPU Index",
"multi": true,
"name": "gpu",
"options": [],
"query": {
"query": "label_values(DCGM_FI_DEV_GPU_UTIL{instance=~\"$gpu_node\"}, gpu)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Cluster Overview v2",
"uid": "cluster-overview-v2",
"version": 1
}
Per-node dashboard
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Per-Node Detail — CPU, Memory, Disk, Network, Load, and GPU for a single selected node",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 4,
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [
"cluster"
],
"title": "Back to Cluster Overview",
"tooltip": "",
"type": "dashboards",
"url": ""
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 100,
"panels": [],
"title": "Node Health",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"0": {
"color": "red",
"index": 0,
"text": "DOWN"
},
"1": {
"color": "green",
"index": 1,
"text": "UP"
}
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 1
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 0,
"y": 1
},
"id": 1,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "up{job=\"node_exporter\", instance=\"$node\"}",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "Node Status",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "blue",
"value": 0
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 4,
"y": 1
},
"id": 12,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "time() - node_boot_time_seconds{instance=\"$node\"}",
"instant": true,
"legendFormat": "Uptime",
"refId": "A"
}
],
"title": "Uptime",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 5
},
"id": 101,
"panels": [],
"title": "CPU",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 0,
"y": 6
},
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=\"$node\"}[2m])) * 100)",
"legendFormat": "CPU Usage",
"refId": "A"
}
],
"title": "CPU Usage (current)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 18,
"x": 6,
"y": 6
},
"id": 3,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=\"$node\"}[2m])) * 100)",
"legendFormat": "CPU Usage",
"refId": "A"
}
],
"title": "CPU Usage Over Time",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 14
},
"id": 102,
"panels": [],
"title": "Memory",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 0,
"y": 15
},
"id": 4,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 * (1 - (node_memory_MemAvailable_bytes{instance=\"$node\"} / node_memory_MemTotal_bytes{instance=\"$node\"}))",
"legendFormat": "Memory Usage",
"range": true,
"refId": "A"
}
],
"title": "Memory Usage (current)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 18,
"x": 6,
"y": 15
},
"id": 5,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 * (1 - (node_memory_MemAvailable_bytes{instance=\"$node\"} / node_memory_MemTotal_bytes{instance=\"$node\"}))",
"legendFormat": "Memory Usage",
"refId": "A"
}
],
"title": "Memory Usage Over Time",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 23
},
"id": 103,
"panels": [],
"title": "Disk",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 0,
"y": 24
},
"id": 6,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "100 - (node_filesystem_avail_bytes{instance=\"$node\", mountpoint=\"/\", fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{instance=\"$node\", mountpoint=\"/\", fstype!~\"tmpfs|overlay\"} * 100)",
"legendFormat": "Disk Usage (/)",
"range": true,
"refId": "A"
}
],
"title": "Disk Usage / (current)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 9,
"x": 6,
"y": 24
},
"id": 7,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(node_disk_read_bytes_total{instance=\"$node\", device!~\"dm-.*\"}[2m])",
"legendFormat": "read {{device}}",
"refId": "A"
}
],
"title": "Disk Read Throughput",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 9,
"x": 15,
"y": 24
},
"id": 8,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(node_disk_written_bytes_total{instance=\"$node\", device!~\"dm-.*\"}[2m])",
"legendFormat": "write {{device}}",
"refId": "A"
}
],
"title": "Disk Write Throughput",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 32
},
"id": 104,
"panels": [],
"title": "Network",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 33
},
"id": 9,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(node_network_receive_bytes_total{instance=\"$node\", device!~\"lo|veth.*|docker.*|br.*\"}[2m]) * 8",
"legendFormat": "rx {{device}}",
"refId": "A"
}
],
"title": "Network Receive (bps)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 33
},
"id": 10,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(node_network_transmit_bytes_total{instance=\"$node\", device!~\"lo|veth.*|docker.*|br.*\"}[2m]) * 8",
"legendFormat": "tx {{device}}",
"refId": "A"
}
],
"title": "Network Transmit (bps)",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 41
},
"id": 105,
"panels": [],
"title": "System Load",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 42
},
"id": 11,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "node_load1{instance=\"$node\"}",
"legendFormat": "1m",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "node_load5{instance=\"$node\"}",
"legendFormat": "5m",
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "node_load15{instance=\"$node\"}",
"legendFormat": "15m",
"refId": "C"
}
],
"title": "System Load Average (1m / 5m / 15m)",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 50
},
"id": 106,
"panels": [],
"title": "GPU — DCGM (GPU nodes only)",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"description": "Only shows data when DCGM metrics are present for the selected GPU node.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 60
},
{
"color": "red",
"value": 85
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 51
},
"id": 20,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "avg by(instance) (DCGM_FI_DEV_GPU_UTIL{instance=\"$gpu_node\", gpu=~\"$gpu\"})",
"legendFormat": "GPU Util",
"range": true,
"refId": "A"
}
],
"title": "GPU Utilization (avg)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 51
},
"id": 21,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "avg by(instance) (DCGM_FI_DEV_FB_USED{instance=\"$gpu_node\", gpu=~\"$gpu\"} / (DCGM_FI_DEV_FB_USED{instance=\"$gpu_node\", gpu=~\"$gpu\"} + DCGM_FI_DEV_FB_FREE{instance=\"$gpu_node\", gpu=~\"$gpu\"}) * 100)",
"legendFormat": "GPU Mem",
"range": true,
"refId": "A"
}
],
"title": "GPU Memory Usage (avg)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 75
},
{
"color": "red",
"value": 85
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 51
},
"id": 22,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "avg by(instance) (DCGM_FI_DEV_GPU_TEMP{instance=\"$gpu_node\", gpu=~\"$gpu\"})",
"legendFormat": "GPU Temp",
"range": true,
"refId": "A"
}
],
"title": "GPU Temperature (avg)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 60
},
{
"color": "red",
"value": 85
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 55
},
"id": 23,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
}
],
"title": "GPU Utilization Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 55
},
"id": 24,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_FB_USED{instance=\"$gpu_node\", gpu=~\"$gpu\"} / (DCGM_FI_DEV_FB_USED{instance=\"$gpu_node\", gpu=~\"$gpu\"} + DCGM_FI_DEV_FB_FREE{instance=\"$gpu_node\", gpu=~\"$gpu\"}) * 100",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
}
],
"title": "GPU Memory Usage Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 63
},
"id": 25,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
}
],
"title": "GPU Temperature Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 63
},
"id": 26,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
}
],
"title": "GPU Power Usage (W)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 71
},
"id": 27,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(DCGM_FI_PROF_PCIE_TX_BYTES{instance=\"$gpu_node\", gpu=~\"$gpu\"}[2m])",
"legendFormat": "GPU {{gpu}} TX",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "rate(DCGM_FI_PROF_PCIE_RX_BYTES{instance=\"$gpu_node\", gpu=~\"$gpu\"}[2m])",
"legendFormat": "GPU {{gpu}} RX",
"refId": "B"
}
],
"title": "GPU PCIe Bandwidth",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 71
},
"id": 28,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_SM_CLOCK{instance=\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": "GPU {{gpu}} SM",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"expr": "DCGM_FI_DEV_MEM_CLOCK{instance=\"$gpu_node\", gpu=~\"$gpu\"}",
"legendFormat": "GPU {{gpu}} Mem",
"refId": "B"
}
],
"title": "GPU Clock Speeds (MHz)",
"type": "timeseries"
}
],
"preload": false,
"refresh": "30s",
"schemaVersion": 42,
"tags": [
"node",
"node_exporter",
"per-node",
"gpu",
"dcgm"
],
"templating": {
"list": [
{
"current": {
"text": "192.168.2.10:9100",
"value": "192.168.2.10:9100"
},
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"definition": "label_values(up{job=\"node_exporter\"}, instance)",
"includeAll": false,
"label": "Node",
"name": "node",
"options": [],
"query": {
"query": "label_values(up{job=\"node_exporter\"}, instance)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"sort": 1,
"type": "query"
},
{
"current": {
"text": "192.168.2.10:9400",
"value": "192.168.2.10:9400"
},
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"definition": "label_values(DCGM_FI_DEV_GPU_UTIL, instance)",
"includeAll": false,
"label": "GPU Node",
"name": "gpu_node",
"options": [],
"query": {
"query": "label_values(DCGM_FI_DEV_GPU_UTIL, instance)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"sort": 1,
"type": "query"
},
{
"current": {
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"definition": "label_values(DCGM_FI_DEV_GPU_UTIL{instance=~\"$gpu_node\"}, gpu)",
"includeAll": true,
"label": "GPU Index",
"multi": true,
"name": "gpu",
"options": [],
"query": {
"query": "label_values(DCGM_FI_DEV_GPU_UTIL{instance=~\"$gpu_node\"}, gpu)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Node Detail",
"uid": "per-node-detail-v1",
"version": 1
}
mobile phone dashboard cpu
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 5,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 1,
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"value": 0
},
{
"color": "#FF9830",
"value": 70
},
{
"color": "#F2495C",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 8,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"colorMode": "background_solid",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "inverted",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": true,
"textMode": "value_and_name",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"expr": "100 - (avg by(instance)(rate(node_cpu_seconds_total{mode=\"idle\"}[1m])) * 100) * on(instance) group_left(nodename) node_uname_info",
"legendFormat": "{{nodename}}",
"refId": "A"
}
],
"title": "CPU 使用率",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"value": 0
},
{
"color": "#FF9830",
"value": 2
},
{
"color": "#F2495C",
"value": 4
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 8,
"x": 8,
"y": 0
},
"id": 2,
"options": {
"colorMode": "background_solid",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "inverted",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": true,
"textMode": "value_and_name",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"editorMode": "code",
"expr": "node_load1 * on(instance) group_left(nodename) node_uname_info",
"legendFormat": "{{nodename}}",
"range": true,
"refId": "A"
}
],
"title": "系统负载 (1m)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 1,
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"value": 0
},
{
"color": "#FF9830",
"value": 70
},
{
"color": "#F2495C",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 8,
"x": 16,
"y": 0
},
"id": 3,
"options": {
"colorMode": "background_solid",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "inverted",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": true,
"textMode": "value_and_name",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 * on(instance) group_left(nodename) node_uname_info",
"legendFormat": "{{nodename}}",
"refId": "A"
}
],
"title": "内存使用率",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"decimals": 1,
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 6
},
"id": 4,
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"expr": "100 - (avg by(instance)(rate(node_cpu_seconds_total{mode=\"idle\"}[1m])) * 100) * on(instance) group_left(nodename) node_uname_info",
"legendFormat": "CPU % - ",
"refId": "A"
}
],
"title": "CPU 使用率趋势",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"decimals": 1,
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 6
},
"id": 5,
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 * on(instance) group_left(nodename) node_uname_info",
"legendFormat": "内存 % - ",
"refId": "A"
}
],
"title": "内存使用率趋势",
"type": "timeseries"
}
],
"preload": false,
"refresh": "10s",
"schemaVersion": 42,
"tags": [
"mobile",
"system"
],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Mobile CPU Nodes",
"uid": "42b1d24b-5f30-4f59-b23f-6dbce9fc7666",
"version": 9
}
Mobile phone dashboard (GPU)
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 6,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 1,
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"value": 0
},
{
"color": "#FF9830",
"value": 70
},
{
"color": "#F2495C",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 6,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"colorMode": "background_solid",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "inverted",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": true,
"textMode": "value_and_name",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"expr": "DCGM_FI_DEV_GPU_UTIL",
"legendFormat": " · GPU",
"refId": "A"
}
],
"title": "GPU 使用率",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 1,
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"value": 0
},
{
"color": "#FF9830",
"value": 70
},
{
"color": "#F2495C",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 6,
"x": 6,
"y": 0
},
"id": 2,
"options": {
"colorMode": "background_solid",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "inverted",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": true,
"textMode": "value_and_name",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"expr": "DCGM_FI_DEV_FB_USED / (DCGM_FI_DEV_FB_USED + DCGM_FI_DEV_FB_FREE) * 100",
"legendFormat": " · GPU",
"refId": "A"
}
],
"title": "显存使用率",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 0,
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"value": 0
},
{
"color": "#FF9830",
"value": 70
},
{
"color": "#F2495C",
"value": 85
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 6,
"x": 12,
"y": 0
},
"id": 3,
"options": {
"colorMode": "background_solid",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "inverted",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "value_and_name",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"expr": "DCGM_FI_DEV_GPU_TEMP",
"legendFormat": " · GPU",
"refId": "A"
}
],
"title": "GPU 温度",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 0,
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"value": 0
},
{
"color": "#FF9830",
"value": 200
},
{
"color": "#F2495C",
"value": 350
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 6,
"x": 18,
"y": 0
},
"id": 4,
"options": {
"colorMode": "background_solid",
"graphMode": "area",
"justifyMode": "center",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "value_and_name",
"wideLayout": true
},
"pluginVersion": "12.3.3",
"targets": [
{
"expr": "DCGM_FI_DEV_POWER_USAGE",
"legendFormat": " · GPU",
"refId": "A"
}
],
"title": "GPU 功耗",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"decimals": 1,
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 6
},
"id": 5,
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"expr": "DCGM_FI_DEV_GPU_UTIL",
"legendFormat": " · GPU",
"refId": "A"
}
],
"title": "GPU 使用率趋势",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "ffe6byo4s8g74a"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"decimals": 1,
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 6
},
"id": 6,
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.3.3",
"targets": [
{
"expr": "DCGM_FI_DEV_FB_USED / (DCGM_FI_DEV_FB_USED + DCGM_FI_DEV_FB_FREE) * 100",
"legendFormat": " · GPU",
"refId": "A"
}
],
"title": "显存使用率趋势",
"type": "timeseries"
}
],
"preload": false,
"refresh": "10s",
"schemaVersion": 42,
"tags": [
"mobile",
"gpu"
],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Mobile GPU Nodes",
"uid": "mobile-gpu",
"version": 1
}