infra/charts/victoria-metrics-k8s-stack/files/dashboards/generated/victoriametrics-vmalert.yaml
Konstantin Averkiev c45fd1a6ac added vm stack
2025-07-08 17:29:32 +03:00

3205 lines
78 KiB
YAML

{{- /* Values source: `.helm.Values` when the context has it, otherwise `.Values`. */ -}}
{{- $Values := (.helm).Values | default .Values }}
{{- /* Label name from `global.clusterLabel`; "cluster" when unset. Not referenced in the visible part of this file - presumably used further down (TODO confirm). */ -}}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- /* `grafana.sidecar.dashboards.multicluster` flag; false when unset. Also not referenced in the visible part of this file. */ -}}
{{- $multicluster := ((($Values.grafana).sidecar).dashboards).multicluster | default false }}
{{- /* Datasource type written into the panels and targets below. Starts as "prometheus" and is overwritten by the `type` of each `grafana.sidecar.datasources.victoriametrics` entry whose `isDefault` and `type` are both truthy, so the last such entry wins. */ -}}
{{- $defaultDatasource := "prometheus" -}}
{{- range (((($Values.grafana).sidecar).datasources).victoriametrics | default list) }}
{{- if and .isDefault .type }}{{ $defaultDatasource = .type }}{{- end }}
{{- end }}
# Dashboard-level annotation queries. The two metric-backed annotations use
# the same templated datasource type as the panels, so a non-Prometheus
# default datasource is referenced consistently across the dashboard.
annotations:
  list:
  - builtIn: 1
    datasource:
      type: datasource
      uid: grafana
    enable: true
    hide: true
    iconColor: rgba(0, 211, 255, 1)
    name: Annotations & Alerts
    target:
      limit: 100
      matchAny: false
      tags: []
      type: dashboard
    type: dashboard
  # Fires when a short_version series exists now but did not one interval ago.
  - datasource:
      type: {{ $defaultDatasource }}
      uid: $ds
    enable: true
    expr: sum(vm_app_version{job=~"$job", instance=~"$instance"}) by(short_version) unless (sum(vm_app_version{job=~"$job", instance=~"$instance"} offset $__interval) by(short_version))
    hide: true
    iconColor: dark-blue
    name: version
    textFormat: '{{`{{`}}short_version{{`}}`}}'
    titleFormat: Version change
  # Fires when vm_app_start_timestamp changed within the interval.
  - datasource:
      type: {{ $defaultDatasource }}
      uid: $ds
    enable: true
    expr: sum(changes(vm_app_start_timestamp{job=~"$job", instance=~"$instance"}[$__interval])) by(job, instance)
    hide: false
    iconColor: dark-yellow
    name: restarts
    textFormat: '{{`{{`}}job{{`}}`}}:{{`{{`}}instance{{`}}`}} restarted'
condition: {{ ($Values.vmalert).enabled }}
description: Overview for VictoriaMetrics vmalert v1.117.0 or higher
editable: false
fiscalYearStartMonth: 0
graphTooltip: 1
id: 3
# Header links of the dashboard; each opens in a new browser tab.
# URLs are written without surrounding whitespace so they are valid as-is.
links:
- asDropdown: false
  icon: external link
  includeVars: false
  keepTime: false
  tags: []
  targetBlank: true
  title: vmalert docs
  tooltip: ''
  type: link
  url: https://docs.victoriametrics.com/victoriametrics/vmalert/
- asDropdown: false
  icon: external link
  includeVars: false
  keepTime: false
  tags: []
  targetBlank: true
  title: Found a bug?
  tooltip: ''
  type: link
  url: https://github.com/VictoriaMetrics/VictoriaMetrics/issues
- asDropdown: false
  icon: external link
  includeVars: false
  keepTime: false
  tags: []
  targetBlank: true
  title: New releases
  tooltip: ''
  type: link
  url: https://github.com/VictoriaMetrics/VictoriaMetrics/releases
panels:
- collapsed: false
gridPos:
h: 1
w: 24
x: 0
'y': 0
id: 47
panels: []
title: Stats
type: row
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows if the last configuration update was successful. "Not Ok" means there was an unsuccessful attempt to update the configuration due to some error. Check the log for details.
fieldConfig:
defaults:
mappings:
- options:
match: 'null'
result:
color: green
index: 0
text: Ok
type: special
- options:
from: 1
result:
color: red
index: 1
text: Not Ok
to: 999999
type: range
thresholds:
mode: absolute
steps:
- color: green
overrides: []
gridPos:
h: 3
w: 4
x: 0
'y': 1
id: 6
options:
colorMode: value
graphMode: area
justifyMode: auto
orientation: auto
percentChangeColorMode: standard
reduceOptions:
calcs:
- last
fields: ''
values: false
showPercentChange: false
text: {}
textMode: auto
wideLayout: true
pluginVersion: 12.0.2
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: count(vmalert_config_last_reload_successful{job=~"$job", instance=~"$instance"} < 1 )
interval: ''
legendFormat: ''
range: true
refId: A
title: Config update
type: stat
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows the total number of loaded alerting rules across selected instances and groups.
fieldConfig:
defaults:
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
overrides: []
gridPos:
h: 3
w: 5
x: 4
'y': 1
id: 9
options:
colorMode: value
graphMode: area
justifyMode: auto
orientation: auto
percentChangeColorMode: standard
reduceOptions:
calcs:
- last
fields: ''
values: false
showPercentChange: false
text: {}
textMode: auto
wideLayout: true
pluginVersion: 12.0.2
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
exemplar: false
expr: count(vmalert_alerting_rules_last_evaluation_samples{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"})
interval: ''
legendFormat: ''
refId: A
title: Alerting rules
type: stat
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows the total number of loaded recording rules across selected instances and groups.
fieldConfig:
defaults:
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
overrides: []
gridPos:
h: 3
w: 5
x: 9
'y': 1
id: 7
options:
colorMode: value
graphMode: area
justifyMode: auto
orientation: auto
percentChangeColorMode: standard
reduceOptions:
calcs:
- last
fields: ''
values: false
showPercentChange: false
text: {}
textMode: auto
wideLayout: true
pluginVersion: 12.0.2
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
exemplar: false
expr: count(vmalert_recording_rules_last_evaluation_samples{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"})
interval: ''
legendFormat: ''
refId: A
title: Recording rules
type: stat
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows the total number of errors generated by recording/alerting rules for selected instances and groups.
fieldConfig:
defaults:
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 1
overrides: []
gridPos:
h: 3
w: 5
x: 14
'y': 1
id: 8
options:
colorMode: value
graphMode: area
justifyMode: auto
orientation: auto
percentChangeColorMode: standard
reduceOptions:
calcs:
- last
fields: ''
values: false
showPercentChange: false
text: {}
textMode: auto
wideLayout: true
pluginVersion: 12.0.2
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval]))) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])))"
interval: ''
legendFormat: ''
range: true
refId: A
title: Errors
type: stat
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: "Shows number of Recording Rules which produce no data.\n\n Usually it means that such rules are misconfigured, since they give no output during the evaluation.\nPlease check if rule's expression is correct and it is working as expected."
fieldConfig:
defaults:
mappings:
- options:
match: 'null'
result:
index: 1
text: '0'
type: special
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 1
overrides: []
gridPos:
h: 3
w: 5
x: 19
'y': 1
id: 48
options:
colorMode: value
graphMode: area
justifyMode: auto
orientation: auto
percentChangeColorMode: standard
reduceOptions:
calcs:
- last
fields: ''
values: false
showPercentChange: false
text: {}
textMode: auto
wideLayout: true
pluginVersion: 12.0.2
# Counts recording rules whose last evaluation returned fewer than one sample.
# The group/file selectors match the "Recording rules" stat, so both counters
# describe the same set of rules.
targets:
- datasource:
    type: {{ $defaultDatasource }}
    uid: $ds
  editorMode: code
  exemplar: false
  expr: count(vmalert_recording_rules_last_evaluation_samples{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"} < 1)
  interval: ''
  legendFormat: ''
  range: true
  refId: A
title: No data errors
type: stat
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
fieldConfig:
defaults:
color:
mode: thresholds
custom:
align: auto
cellOptions:
type: auto
inspect: false
minWidth: 50
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
overrides:
- matcher:
id: byName
options: Time
properties:
- id: custom.hidden
value: true
- matcher:
id: byName
options: Value
properties:
- id: displayName
value: Count
gridPos:
h: 4
w: 9
x: 0
'y': 4
id: 45
options:
cellHeight: sm
footer:
countRows: false
fields: ''
reducer:
- sum
show: false
showHeader: true
pluginVersion: 12.0.2
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: sum(vm_app_version{job=~"$job", instance=~"$instance"}) by(job, short_version)
format: table
instant: true
range: false
refId: A
title: ''
type: table
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: stepAfter
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
decimals: 0
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 4
w: 15
x: 9
'y': 4
id: 4
options:
legend:
calcs:
- lastNotNull
displayMode: table
placement: right
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
hideZeros: false
mode: multi
sort: asc
pluginVersion: 12.0.2
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: sum(min_over_time(up{job=~"$job", instance=~"$instance"}[$__rate_interval])) by (job)
format: time_series
instant: false
interval: ''
legendFormat: '{{`{{`}}job{{`}}`}}'
refId: A
title: Uptime
type: timeseries
- collapsed: false
gridPos:
h: 1
w: 24
x: 0
'y': 8
id: 11
panels: []
title: Overview ($instance)
type: row
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows the number of fired alerts by job.
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 9
id: 15
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 12.0.2
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: sum(increase(vmalert_alerts_fired_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job)
interval: ''
legendFormat: '{{`{{`}}job{{`}}`}}'
range: true
refId: A
title: Alerts fired total ($instance)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Top $topk groups by evaluation duration. Shows groups that take the most of time during the evaluation across all instances.
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: s
overrides: []
gridPos:
h: 8
w: 12
x: 12
'y': 9
id: 23
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 12.0.2
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: "topk($topk, max(sum(\n rate(vmalert_iteration_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])\n/\n rate(vmalert_iteration_duration_seconds_count{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])\n) by(job, instance, group, file)) \nby(job, group, file))"
interval: ''
legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}({{`{{`}}file{{`}}`}})
range: true
refId: A
title: Top $topk groups avg evaluation duration ($group)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows how many requests (executions) per second vmalert sends to the configured datasource.
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 17
id: 24
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 12.0.2
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: sum(rate(vmalert_execution_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by (job)
interval: ''
legendFormat: '{{`{{`}}job{{`}}`}}'
range: true
refId: A
title: Rules execution rate ($instance)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows the error rate while executing configured rules. Non-zero value means there are some issues with existing rules. Check the logs to get more details.
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 10
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 12
'y': 17
id: 25
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
tooltip:
hideZeros: false
mode: multi
sort: none
pluginVersion: 12.0.2
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: sum(rate(vmalert_execution_errors_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job) > 0
interval: ''
legendFormat: __auto
range: true
refId: A
title: Rules execution errors ($instance)
type: timeseries
- collapsed: true
gridPos:
h: 1
w: 24
x: 0
'y': 25
id: 43
panels:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: 'The percentage of used RSS memory
If you think that usage is abnormal or unexpected, please file an issue and attach memory profile if possible.'
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: percentunit
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 162
id: 37
# Panel link to the profiling guide of vmalert, the component this dashboard covers.
links:
- targetBlank: true
  title: Profiling
  url: https://docs.victoriametrics.com/victoriametrics/vmalert/#profiling
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 11.5.0
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: |-
max(
max_over_time(process_resident_memory_bytes{job=~"$job", instance=~"$instance"}[$__rate_interval])
/
vm_available_memory_bytes{job=~"$job", instance=~"$instance"}
) by(job)
interval: ''
legendFormat: __auto
range: true
refId: A
title: Memory usage % ($instance)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: "Shows the CPU usage percentage per vmalert instance. \nIf you think that usage is abnormal or unexpected pls file an issue and attach CPU profile if possible."
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: percentunit
overrides: []
gridPos:
h: 8
w: 12
x: 12
'y': 162
id: 35
# Panel link to the profiling guide of vmalert, the component this dashboard covers.
links:
- targetBlank: true
  title: Profiling
  url: https://docs.victoriametrics.com/victoriametrics/vmalert/#profiling
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 11.5.0
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: "max(\n rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]) \n / \n process_cpu_cores_available{job=~\"$job\", instance=~\"$instance\"}\n) by(job)"
format: time_series
interval: ''
intervalFactor: 1
legendFormat: '{{`{{`}}job{{`}}`}}'
range: true
refId: A
title: CPU usage %($instance)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: 'Share for memory allocated by the process itself. When memory usage reaches 100% it will be likely OOM-killed.
Safe memory usage % considered to be below 80%'
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: percentunit
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 170
id: 65
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 11.5.0
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: |-
max(
max_over_time(process_resident_memory_anon_bytes{job=~"$job", instance=~"$instance"}[$__rate_interval])
/
vm_available_memory_bytes{job=~"$job", instance=~"$instance"}
) by(instance)
interval: ''
legendFormat: __auto
range: true
refId: A
title: RSS anonymous memory % usage
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows the max number of CPU cores used by a `job` and the corresponding limit.
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 12
'y': 170
id: 56
# Panel link to the profiling guide of vmalert, the component this dashboard covers.
links:
- targetBlank: true
  title: Profiling
  url: https://docs.victoriametrics.com/victoriametrics/vmalert/#profiling
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 11.5.0
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: max(rate(process_cpu_seconds_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job)
format: time_series
interval: ''
intervalFactor: 1
legendFormat: '{{`{{`}}job{{`}}`}}'
range: true
refId: A
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: min(process_cpu_cores_available{job=~"$job", instance=~"$instance"}) by(job)
format: time_series
hide: false
interval: ''
intervalFactor: 1
legendFormat: limit ({{`{{`}}job{{`}}`}})
range: true
refId: B
title: CPU usage ($instance)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: 'Amount of used RSS memory
If you think that usage is abnormal or unexpected, please file an issue and attach memory profile if possible.'
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: bytes
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 178
id: 57
# Panel link to the profiling guide of vmalert, the component this dashboard covers.
links:
- targetBlank: true
  title: Profiling
  url: https://docs.victoriametrics.com/victoriametrics/vmalert/#profiling
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 11.5.0
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: |-
max(
max_over_time(process_resident_memory_bytes{job=~"$job", instance=~"$instance"}[$__rate_interval])
) by(job)
interval: ''
legendFormat: '{{`{{`}}job{{`}}`}}'
range: true
refId: A
title: Memory usage ($instance)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: 'Shows CPU pressure based on [Pressure Stall Information](https://docs.kernel.org/accounting/psi.html).
The lower the better.'
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: line
decimals: 0
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
unit: s
overrides: []
gridPos:
h: 8
w: 12
x: 12
'y': 178
id: 66
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 11.5.0
# Per-instance CPU pressure rates (waiting and stalled).
# Both queries honor the instance selector, like the other resource panels.
targets:
- datasource:
    type: {{ $defaultDatasource }}
    uid: $ds
  editorMode: code
  expr: sum(rate(process_pressure_cpu_waiting_seconds_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by (job, instance)
  format: time_series
  interval: ''
  intervalFactor: 2
  legendFormat: '{{`{{`}}instance{{`}}`}} - waiting'
  range: true
  refId: A
- datasource:
    type: {{ $defaultDatasource }}
    uid: $ds
  editorMode: code
  expr: sum(rate(process_pressure_cpu_stalled_seconds_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by (job, instance)
  format: time_series
  hide: false
  interval: ''
  intervalFactor: 2
  legendFormat: '{{`{{`}}instance{{`}}`}} - stalled'
  range: true
  refId: B
title: CPU pressure
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: 'Shows memory pressure based on [Pressure Stall Information](https://docs.kernel.org/accounting/psi.html).
The lower the better.'
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: line
decimals: 0
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
unit: s
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 186
id: 67
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 11.5.0
# Per-instance memory pressure rates (waiting and stalled).
# Both queries honor the instance selector, like the other resource panels.
targets:
- datasource:
    type: {{ $defaultDatasource }}
    uid: $ds
  editorMode: code
  expr: sum(rate(process_pressure_memory_waiting_seconds_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by (job, instance)
  format: time_series
  interval: ''
  intervalFactor: 2
  legendFormat: '{{`{{`}}instance{{`}}`}} - waiting'
  range: true
  refId: A
- datasource:
    type: {{ $defaultDatasource }}
    uid: $ds
  editorMode: code
  expr: sum(rate(process_pressure_memory_stalled_seconds_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by (job, instance)
  format: time_series
  hide: false
  interval: ''
  intervalFactor: 2
  legendFormat: '{{`{{`}}instance{{`}}`}} - stalled'
  range: true
  refId: B
title: Memory pressure
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
decimals: 0
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 12
'y': 186
id: 41
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
tooltip:
hideZeros: false
mode: multi
sort: none
pluginVersion: 11.5.0
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
expr: sum(go_goroutines{job=~"$job", instance=~"$instance"}) by(job)
format: time_series
interval: ''
intervalFactor: 2
legendFormat: __auto
range: true
refId: A
title: Goroutines ($instance)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: 'Panel shows the percentage of open file descriptors in the OS.
Reaching the limit of open files can cause various issues and must be prevented.
See how to change limits here https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a'
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
decimals: 3
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: percentunit
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 194
id: 39
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 11.5.0
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: |-
max(
max_over_time(process_open_fds{job=~"$job", instance=~"$instance"}[$__rate_interval])
/
process_max_fds{job=~"$job", instance=~"$instance"}
) by(job)
format: time_series
interval: ''
intervalFactor: 2
legendFormat: __auto
range: true
refId: A
title: Open FDs usage % ($instance)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: "Shows the time goroutines have spent in runnable state before actually running. The lower is better.\n\nHigh values or values exceeding the threshold is usually a sign of insufficient CPU resources or CPU throttling. \n\nVerify that service has enough CPU resources. Otherwise, the service could work unreliably with delays in processing."
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: line
decimals: 0
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 0.1
unit: s
overrides: []
gridPos:
h: 8
w: 12
x: 12
'y': 194
id: 61
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 11.5.0
# 0.99 quantile of Go scheduler latency, computed per instance and then
# reduced to the max per job. The instance selector is applied so the panel
# follows the dashboard's instance filter like the other resource panels.
targets:
- datasource:
    type: {{ $defaultDatasource }}
    uid: $ds
  editorMode: code
  expr: max(histogram_quantile(0.99, sum(rate(go_sched_latencies_seconds_bucket{job=~"$job", instance=~"$instance"}[$__rate_interval])) by (job, instance, le))) by(job)
  format: time_series
  interval: ''
  intervalFactor: 2
  legendFormat: __auto
  range: true
  refId: A
title: Go scheduling latency
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: 'Shows the percent of CPU spent on garbage collection.
If % is high, then CPU usage can be decreased by changing GOGC to higher values. Increasing GOGC value will increase memory usage, and decrease CPU usage.
Try searching for keyword `GOGC` at https://docs.victoriametrics.com/victoriametrics/troubleshooting/ '
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
decimals: 0
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: percentunit
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 202
id: 59
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 11.5.0
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
expr: "max(\n rate(go_gc_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]) \n / rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n ) by(job)"
format: time_series
interval: ''
intervalFactor: 2
legendFormat: __auto
range: true
refId: A
title: CPU spent on GC ($instance)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows the rate of allocations in memory. Sudden increase in allocations would mean increased pressure on Go Garbage Collector and can saturate CPU resources of the application.
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
barWidthFactor: 0.6
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: line
decimals: 0
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
unit: bytes
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 210
id: 64
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
tooltip:
hideZeros: false
mode: multi
sort: desc
pluginVersion: 11.5.0
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
# Scoped by both $job and $instance so the instance picker also applies to this panel.
expr: sum(rate(go_memstats_alloc_bytes_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by (job, instance)
format: time_series
interval: ''
intervalFactor: 2
legendFormat: __auto
range: true
refId: A
title: Memory allocations rate
type: timeseries
title: Resource usage
type: row
- collapsed: true
gridPos:
h: 1
w: 24
x: 0
'y': 26
id: 62
panels:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
fieldConfig:
defaults:
color:
mode: thresholds
custom:
align: auto
cellOptions:
type: auto
inspect: false
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
overrides:
- matcher:
id: byName
options: Value
properties:
- id: custom.hidden
value: true
- matcher:
id: byName
options: Time
properties:
- id: custom.hidden
value: true
gridPos:
h: 7
w: 12
x: 0
'y': 345
id: 50
options:
cellHeight: sm
footer:
countRows: false
fields: ''
reducer:
- sum
show: false
showHeader: true
sortBy:
- desc: true
displayName: job
pluginVersion: 10.4.2
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: sum(flag{is_set="true", job=~"$job", instance=~"$instance"}) by(job, instance, name, value)
format: table
instant: true
legendFormat: __auto
range: false
refId: A
title: Non-default flags
transformations:
- id: groupBy
options:
fields:
instance:
aggregations:
- uniqueValues
operation: aggregate
job:
aggregations: []
operation: groupby
name:
aggregations: []
operation: groupby
value:
aggregations: []
operation: groupby
type: table
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: "Missed evaluation means that group evaluation time takes longer than the configured evaluation interval. \nThis may result in missed alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency for such groups. See https://docs.victoriametrics.com/victoriametrics/vmalert/#groups\n\nIf rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/victoriametrics/troubleshooting/#slow-queries."
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
drawStyle: bars
fillOpacity: 10
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 7
w: 12
x: 12
'y': 345
id: 58
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
tooltip:
mode: multi
sort: none
pluginVersion: 9.2.6
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: sum(increase(vmalert_iteration_missed_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job, group, file) > 0
interval: 1m
legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}({{`{{`}}file{{`}}`}})
range: true
refId: A
title: Missed evaluations ($instance)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: 'Shows the number of restarts per job. The chart can be useful to identify periodic process restarts and correlate them with potential issues or anomalies. Normally, processes shouldn''t restart unless the restart was initiated by the user. The reason for the restarts should be figured out by checking the logs of each specific service. '
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
axisSoftMin: 0
barAlignment: 0
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: stepAfter
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
decimals: 0
links: []
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: none
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 352
id: 63
options:
legend:
calcs:
- lastNotNull
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
mode: multi
sort: desc
pluginVersion: 9.1.0
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
expr: sum(changes(vm_app_start_timestamp{job=~"$job", instance=~"$instance"}[$__rate_interval]) > 0) by(job)
format: time_series
instant: false
legendFormat: '{{`{{`}}job{{`}}`}}'
refId: A
title: Restarts ($job)
type: timeseries
title: Troubleshooting
type: row
- collapsed: true
gridPos:
h: 1
w: 24
x: 0
'y': 27
id: 17
panels:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows top $topk current active (firing) alerting rules.
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 370
id: 14
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
mode: multi
sort: desc
pluginVersion: 9.2.6
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: topk($topk, sum(vmalert_alerts_firing{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"}) by(job, group, file, alertname) > 0)
interval: ''
legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}.{{`{{`}}alertname{{`}}`}}({{`{{`}}file{{`}}`}})
range: true
refId: A
title: Top $topk active alerts ($group)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows the events when rule execution resulted in an error. Check the logs for more details.
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 12
'y': 370
id: 13
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
mode: multi
sort: desc
pluginVersion: 9.2.6
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: sum(increase(vmalert_alerting_rules_errors_total{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"}[$__rate_interval])) by(job, group, file, alertname) > 0
interval: ''
legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}.{{`{{`}}alertname{{`}}`}}({{`{{`}}file{{`}}`}})
range: true
refId: A
title: Errors ($group)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: 'Shows the current pending alerting rules per group.
Pending means the rule has been active for less than the configured `for` duration.'
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 378
id: 20
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
sortBy: Mean
sortDesc: true
tooltip:
mode: multi
sort: desc
pluginVersion: 9.2.6
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: sum(vmalert_alerts_pending{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"}) by(job, group, file, alertname) > 0
interval: ''
legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}.{{`{{`}}alertname{{`}}`}}({{`{{`}}file{{`}}`}})
range: true
refId: A
title: Pending ($group)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows the error rate for the attempts to send alerts to Alertmanager. If it is not zero, there were issues sending notifications to Alertmanager and some alerts may not have been delivered properly. Check the logs for more details.
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 12
'y': 378
id: 32
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
tooltip:
mode: multi
sort: desc
pluginVersion: 9.2.6
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
exemplar: false
expr: sum(rate(vmalert_alerts_send_errors_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(instance, addr) > 0
interval: ''
legendFormat: '{{`{{`}}instance{{`}}`}} => {{`{{`}}addr{{`}}`}}'
refId: A
title: Errors rate to Alertmanager
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows how many alerts are sent to Alertmanager per second. Only active alerts are sent.
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 386
id: 26
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
tooltip:
mode: multi
sort: desc
pluginVersion: 9.2.6
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: sum(rate(vmalert_alerts_sent_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job, addr) > 0
interval: ''
legendFormat: '{{`{{`}}job{{`}}`}} => {{`{{`}}addr{{`}}`}}'
range: true
refId: A
# The query of this panel is scoped by $job and $instance only; it has no group matcher.
title: Requests rate to Alertmanager by job ($instance)
type: timeseries
title: Alerting rules ($instance)
type: row
- collapsed: true
gridPos:
h: 1
w: 24
x: 0
'y': 28
id: 28
panels:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows the top $topk recording rules which generate the most [samples](https://docs.victoriametrics.com/victoriametrics/keyconcepts/#raw-samples). Each generated sample is basically a time series which is then ingested into the configured remote storage. Rules with high numbers may cause the most pressure on the remote database and become a source of too high cardinality.
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 385
id: 31
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
mode: multi
sort: desc
pluginVersion: 9.2.6
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: "topk($topk, \n max(\n sum(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, instance, group, file, recording) > 0\n ) by(job, group, file, recording)\n)"
interval: ''
legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}.{{`{{`}}recording{{`}}`}}({{`{{`}}file{{`}}`}})
range: true
refId: A
title: Top $topk rules by produced samples ($group)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: 'Shows the rules which do not produce any [samples](https://docs.victoriametrics.com/victoriametrics/keyconcepts/#raw-samples) during the evaluation. Usually it means that such rules are misconfigured, since they give no output during the evaluation.
Please check if rule''s expression is correct and it is working as expected.'
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: true
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 12
'y': 385
id: 33
options:
legend:
calcs:
- lastNotNull
- max
- mean
displayMode: table
placement: bottom
showLegend: true
sortBy: Last *
sortDesc: true
tooltip:
mode: multi
sort: desc
pluginVersion: 8.0.3
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: count(vmalert_recording_rules_last_evaluation_samples{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"} < 1) by(job, group, file, recording)
interval: ''
legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}.{{`{{`}}recording{{`}}`}}({{`{{`}}file{{`}}`}})
range: true
refId: A
title: Rules with 0 produced samples ($group)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 393
id: 30
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
tooltip:
mode: multi
sort: none
pluginVersion: 9.2.6
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
exemplar: false
expr: sum(increase(vmalert_recording_rules_errors_total{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"}[$__rate_interval])) by(job, group, file, recording) > 0
interval: ''
legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}.{{`{{`}}recording{{`}}`}}({{`{{`}}file{{`}}`}})
range: true
refId: A
title: Errors ($group)
type: timeseries
title: Recording rules ($instance)
type: row
- collapsed: true
gridPos:
h: 1
w: 24
x: 0
'y': 29
id: 55
panels:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: auto
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 351
id: 52
options:
legend:
calcs: []
displayMode: list
placement: bottom
showLegend: true
tooltip:
mode: single
sort: desc
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
expr: sum(rate(vmalert_remotewrite_sent_rows_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job)
legendFormat: __auto
range: true
refId: A
title: Datapoints send rate ($instance)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows the number of datapoints dropped by vmalert while sending to the configured remote write URL. vmalert performs up to 5 retries before dropping the data. Check vmalert's error logs for the specific error message.
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: auto
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
mappings: []
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
overrides: []
gridPos:
h: 8
w: 12
x: 12
'y': 351
id: 53
options:
legend:
calcs: []
displayMode: list
placement: bottom
showLegend: true
tooltip:
mode: single
sort: desc
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
expr: sum(rate(vmalert_remotewrite_dropped_rows_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job) > 0
legendFormat: __auto
range: true
refId: A
title: Datapoints drop rate ($instance)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: 'Shows current number of established connections to remote write endpoints.
'
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: short
overrides: []
gridPos:
h: 8
w: 12
x: 0
'y': 378
id: 54
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
tooltip:
mode: multi
sort: desc
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
# Exemplar queries stay off, as in the alerting- and recording-rule targets of this dashboard.
exemplar: false
expr: sum(max_over_time(vmalert_remotewrite_conns{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job)
interval: ''
legendFormat: __auto
range: true
refId: A
title: Connections ($instance)
type: timeseries
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
description: Shows the global rate for number of written bytes via remote write connections.
fieldConfig:
defaults:
color:
mode: palette-classic
custom:
axisBorderShow: false
axisCenteredZero: false
axisColorMode: text
axisLabel: ''
axisPlacement: auto
barAlignment: 0
drawStyle: line
fillOpacity: 0
gradientMode: none
hideFrom:
legend: false
tooltip: false
viz: false
insertNulls: false
lineInterpolation: linear
lineWidth: 1
pointSize: 5
scaleDistribution:
type: linear
showPoints: never
spanNulls: false
stacking:
group: A
mode: none
thresholdsStyle:
mode: 'off'
links: []
mappings: []
min: 0
thresholds:
mode: absolute
steps:
- color: green
- color: red
value: 80
unit: decbytes
overrides: []
gridPos:
h: 8
w: 12
x: 12
'y': 378
id: 60
options:
legend:
calcs:
- mean
- lastNotNull
- max
displayMode: table
placement: bottom
showLegend: true
tooltip:
mode: multi
sort: desc
targets:
- datasource:
type: {{ $defaultDatasource }}
uid: $ds
editorMode: code
# Exemplar queries stay off, as in the alerting- and recording-rule targets of this dashboard.
exemplar: false
expr: sum(rate(vmalert_remotewrite_conn_bytes_written_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job) > 0
interval: ''
legendFormat: __auto
range: true
refId: A
title: Bytes write rate ($instance)
type: timeseries
title: Remote write
type: row
preload: false
refresh: ''
schemaVersion: 41
tags:
- victoriametrics
- vm-k8s-stack
# Dashboard variables. Every query/adhoc variable reads from the datasource
# selected in `ds`, so its datasource type follows the chart's default
# datasource type, the same way the panel targets do.
templating:
  list:
  # Datasource picker, limited to datasources of the chart's default type.
  # NOTE(review): `current` carries a datasource name/uid captured at export
  # time - presumably Grafana falls back to an available datasource when that
  # uid does not exist; confirm.
  - current:
      text: VictoriaMetrics - cluster
      value: PAF93674D0B4E9963
    includeAll: false
    name: ds
    options: []
    query: {{ $defaultDatasource }}
    refresh: 1
    regex: ''
    type: datasource
  # Jobs whose vm_app_version `version` label starts with "vmalert".
  # No `allValue` is set here, unlike the variables below.
  - current: {}
    datasource:
      type: {{ $defaultDatasource }}
      uid: $ds
    definition: label_values(vm_app_version{version=~"^vmalert.*"}, job)
    includeAll: true
    multi: true
    name: job
    options: []
    query:
      query: label_values(vm_app_version{version=~"^vmalert.*"}, job)
      refId: StandardVariableQuery
    refresh: 1
    regex: ''
    type: query
  # Instances of the selected job(s).
  - allValue: .*
    current: {}
    datasource:
      type: {{ $defaultDatasource }}
      uid: $ds
    definition: label_values(vm_app_version{job=~"$job"}, instance)
    includeAll: true
    multi: true
    name: instance
    options: []
    query:
      query: label_values(vm_app_version{job=~"$job"}, instance)
      refId: StandardVariableQuery
    refresh: 1
    regex: ''
    type: query
  # Values of the `file` label on vmalert_iteration_total for the selected
  # job(s) and instance(s).
  - allValue: .*
    current: {}
    datasource:
      type: {{ $defaultDatasource }}
      uid: $ds
    definition: label_values(vmalert_iteration_total{job=~"$job", instance=~"$instance"},file)
    includeAll: true
    multi: true
    name: file
    options: []
    query:
      query: label_values(vmalert_iteration_total{job=~"$job", instance=~"$instance"},file)
      refId: PrometheusVariableQueryEditor-VariableQuery
    refresh: 1
    regex: ''
    type: query
  # Values of the `group` label on vmalert_iteration_total for the selected
  # job(s) and instance(s). The query is not narrowed by $file.
  - allValue: .*
    current: {}
    datasource:
      type: {{ $defaultDatasource }}
      uid: $ds
    definition: label_values(vmalert_iteration_total{job=~"$job", instance=~"$instance"}, group)
    includeAll: true
    multi: true
    name: group
    options: []
    query:
      query: label_values(vmalert_iteration_total{job=~"$job", instance=~"$instance"}, group)
      refId: StandardVariableQuery
    refresh: 1
    regex: ''
    type: query
  # Size of the "Top $topk ..." panels; 5 is preselected.
  - current:
      text: '5'
      value: '5'
    includeAll: false
    name: topk
    options:
    - selected: true
      text: '5'
      value: '5'
    - selected: false
      text: '10'
      value: '10'
    - selected: false
      text: '20'
      value: '20'
    - selected: false
      text: '30'
      value: '30'
    - selected: false
      text: '40'
      value: '40'
    - selected: false
      text: '50'
      value: '50'
    query: 5, 10, 20, 30, 40, 50
    type: custom
  # Ad hoc label filters, bound to the datasource selected in `ds`.
  - baseFilters: []
    datasource:
      type: {{ $defaultDatasource }}
      uid: $ds
    filters: []
    name: filter
    type: adhoc
# Default time range: the last three hours up to now.
time:
from: now-3h
to: now
timepicker: {}
# Uses defaultDashboards.defaultTimezone from the chart values; "utc" when that value is empty or unset.
timezone: {{ default "utc" ($Values.defaultDashboards).defaultTimezone }}
title: VictoriaMetrics - vmalert
uid: LzldHAVnz
version: 1