{{- /* Grafana dashboard for vmalert, rendered by Helm. $defaultDatasource starts as "prometheus" and is replaced by the .type of any grafana.sidecar.datasources.victoriametrics entry that has both isDefault and type set; it is emitted as the datasource type of the query annotations and panels. */ -}} {{- $Values := (.helm).Values | default .Values }} {{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }} {{- $multicluster := ((($Values.grafana).sidecar).dashboards).multicluster | default false }} {{- $defaultDatasource := "prometheus" -}} {{- range (((($Values.grafana).sidecar).datasources).victoriametrics | default list) }} {{- if and .isDefault .type }}{{ $defaultDatasource = .type }}{{- end }} {{- end }} annotations: list: - builtIn: 1 datasource: type: datasource uid: grafana enable: true hide: true iconColor: rgba(0, 211, 255, 1) name: Annotations & Alerts target: limit: 100 matchAny: false tags: [] type: dashboard type: dashboard - datasource: type: {{ $defaultDatasource }} uid: $ds enable: true expr: sum(vm_app_version{job=~"$job", instance=~"$instance"}) by(short_version) unless (sum(vm_app_version{job=~"$job", instance=~"$instance"} offset $__interval) by(short_version)) hide: true iconColor: dark-blue name: version textFormat: '{{`{{`}}short_version{{`}}`}}' titleFormat: Version change - datasource: type: {{ $defaultDatasource }} uid: $ds enable: true expr: sum(changes(vm_app_start_timestamp{job=~"$job", instance=~"$instance"}[$__interval])) by(job, instance) hide: false iconColor: dark-yellow name: restarts textFormat: '{{`{{`}}job{{`}}`}}:{{`{{`}}instance{{`}}`}} restarted' condition: {{ ($Values.vmalert).enabled }} description: Overview for VictoriaMetrics vmalert v1.117.0 or higher editable: false fiscalYearStartMonth: 0 graphTooltip: 1 id: 3 links: - asDropdown: false icon: external link includeVars: false keepTime: false tags: [] targetBlank: true title: vmalert docs tooltip: '' type: link url: https://docs.victoriametrics.com/victoriametrics/vmalert/ - asDropdown: false icon: external link includeVars: false keepTime: false tags: [] targetBlank: true title: Found a bug?
tooltip: '' type: link url: https://github.com/VictoriaMetrics/VictoriaMetrics/issues - asDropdown: false icon: external link includeVars: false keepTime: false tags: [] targetBlank: true title: New releases tooltip: '' type: link url: https://github.com/VictoriaMetrics/VictoriaMetrics/releases panels: - collapsed: false gridPos: h: 1 w: 24 x: 0 'y': 0 id: 47 panels: [] title: Stats type: row - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows if the last configuration update was successful. "Not Ok" means there was an unsuccessful attempt to update the configuration due to some error. Check the log for details. fieldConfig: defaults: mappings: - options: match: 'null' result: color: green index: 0 text: Ok type: special - options: from: 1 result: color: red index: 1 text: Not Ok to: 999999 type: range thresholds: mode: absolute steps: - color: green overrides: [] gridPos: h: 3 w: 4 x: 0 'y': 1 id: 6 options: colorMode: value graphMode: area justifyMode: auto orientation: auto percentChangeColorMode: standard reduceOptions: calcs: - last fields: '' values: false showPercentChange: false text: {} textMode: auto wideLayout: true pluginVersion: 12.0.2 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: count(vmalert_config_last_reload_successful{job=~"$job", instance=~"$instance"} < 1 ) interval: '' legendFormat: '' range: true refId: A title: Config update type: stat - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows the total number of loaded alerting rules across selected instances and groups.
fieldConfig: defaults: mappings: [] min: 0 thresholds: mode: absolute steps: - color: green overrides: [] gridPos: h: 3 w: 5 x: 4 'y': 1 id: 9 options: colorMode: value graphMode: area justifyMode: auto orientation: auto percentChangeColorMode: standard reduceOptions: calcs: - last fields: '' values: false showPercentChange: false text: {} textMode: auto wideLayout: true pluginVersion: 12.0.2 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds exemplar: false expr: count(vmalert_alerting_rules_last_evaluation_samples{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"}) interval: '' legendFormat: '' refId: A title: Alerting rules type: stat - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows the total number of loaded recording rules across selected instances and groups. fieldConfig: defaults: mappings: [] min: 0 thresholds: mode: absolute steps: - color: green overrides: [] gridPos: h: 3 w: 5 x: 9 'y': 1 id: 7 options: colorMode: value graphMode: area justifyMode: auto orientation: auto percentChangeColorMode: standard reduceOptions: calcs: - last fields: '' values: false showPercentChange: false text: {} textMode: auto wideLayout: true pluginVersion: 12.0.2 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds exemplar: false expr: count(vmalert_recording_rules_last_evaluation_samples{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"}) interval: '' legendFormat: '' refId: A title: Recording rules type: stat - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows the total number of errors generated by recording/alerting rules for selected instances and groups. 
fieldConfig: defaults: mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 1 overrides: [] gridPos: h: 3 w: 5 x: 14 'y': 1 id: 8 options: colorMode: value graphMode: area justifyMode: auto orientation: auto percentChangeColorMode: standard reduceOptions: calcs: - last fields: '' values: false showPercentChange: false text: {} textMode: auto wideLayout: true pluginVersion: 12.0.2 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval]))) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])))" interval: '' legendFormat: '' range: true refId: A title: Errors type: stat - datasource: type: {{ $defaultDatasource }} uid: $ds description: "Shows number of Recording Rules which produce no data.\n\n Usually it means that such rules are misconfigured, since they give no output during the evaluation.\nPlease check if rule's expression is correct and it is working as expected." 
fieldConfig: defaults: mappings: - options: match: 'null' result: index: 1 text: '0' type: special min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 1 overrides: [] gridPos: h: 3 w: 5 x: 19 'y': 1 id: 48 options: colorMode: value graphMode: area justifyMode: auto orientation: auto percentChangeColorMode: standard reduceOptions: calcs: - last fields: '' values: false showPercentChange: false text: {} textMode: auto wideLayout: true pluginVersion: 12.0.2 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: count(vmalert_recording_rules_last_evaluation_samples{job=~"$job", instance=~"$instance"} < 1) interval: '' legendFormat: '' range: true refId: A title: No data errors type: stat - datasource: type: {{ $defaultDatasource }} uid: $ds fieldConfig: defaults: color: mode: thresholds custom: align: auto cellOptions: type: auto inspect: false minWidth: 50 mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 overrides: - matcher: id: byName options: Time properties: - id: custom.hidden value: true - matcher: id: byName options: Value properties: - id: displayName value: Count gridPos: h: 4 w: 9 x: 0 'y': 4 id: 45 options: cellHeight: sm footer: countRows: false fields: '' reducer: - sum show: false showHeader: true pluginVersion: 12.0.2 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: sum(vm_app_version{job=~"$job", instance=~"$instance"}) by(job, short_version) format: table instant: true range: false refId: A title: '' type: table - datasource: type: {{ $defaultDatasource }} uid: $ds fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: 
stepAfter lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' decimals: 0 links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 4 w: 15 x: 9 'y': 4 id: 4 options: legend: calcs: - lastNotNull displayMode: table placement: right showLegend: true sortBy: Last * sortDesc: true tooltip: hideZeros: false mode: multi sort: asc pluginVersion: 12.0.2 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: sum(min_over_time(up{job=~"$job", instance=~"$instance"}[$__rate_interval])) by (job) format: time_series instant: false interval: '' legendFormat: '{{`{{`}}job{{`}}`}}' refId: A title: Uptime type: timeseries - collapsed: false gridPos: h: 1 w: 24 x: 0 'y': 8 id: 11 panels: [] title: Overview ($instance) type: row - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows the number of fired alerts by job. 
fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 9 id: 15 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 12.0.2 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: sum(increase(vmalert_alerts_fired_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job) interval: '' legendFormat: '{{`{{`}}job{{`}}`}}' range: true refId: A title: Alerts fired total ($instance) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: Top $topk groups by evaluation duration. Shows groups that take the most of time during the evaluation across all instances. 
fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: s overrides: [] gridPos: h: 8 w: 12 x: 12 'y': 9 id: 23 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 12.0.2 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: "topk($topk, max(sum(\n rate(vmalert_iteration_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])\n/\n rate(vmalert_iteration_duration_seconds_count{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])\n) by(job, instance, group, file)) \nby(job, group, file))" interval: '' legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}({{`{{`}}file{{`}}`}}) range: true refId: A title: Top $topk groups avg evaluation duration ($group) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows how many requests (executions) per second vmalert sends to the configured datasource. 
fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 17 id: 24 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 12.0.2 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: sum(rate(vmalert_execution_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by (job) interval: '' legendFormat: '{{`{{`}}job{{`}}`}}' range: true refId: A title: Rules execution rate ($instance) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows the error rate while executing configured rules. Non-zero value means there are some issues with existing rules. Check the logs to get more details. 
fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 10 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 12 'y': 17 id: 25 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true tooltip: hideZeros: false mode: multi sort: none pluginVersion: 12.0.2 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: sum(rate(vmalert_execution_errors_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job) > 0 interval: '' legendFormat: __auto range: true refId: A title: Rules execution errors ($instance) type: timeseries - collapsed: true gridPos: h: 1 w: 24 x: 0 'y': 25 id: 43 panels: - datasource: type: {{ $defaultDatasource }} uid: $ds description: 'The percentage of used RSS memory If you think that usage is abnormal or unexpected, please file an issue and attach memory profile if possible.' 
fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: percentunit overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 162 id: 37 links: - targetBlank: true title: Profiling url: https://docs.victoriametrics.com/victoriametrics/vmalert/#profiling options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 11.5.0 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: |- max( max_over_time(process_resident_memory_bytes{job=~"$job", instance=~"$instance"}[$__rate_interval]) / vm_available_memory_bytes{job=~"$job", instance=~"$instance"} ) by(job) interval: '' legendFormat: __auto range: true refId: A title: Memory usage % ($instance) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: "Shows the CPU usage percentage per vmalert instance. \nIf you think that usage is abnormal or unexpected pls file an issue and attach CPU profile if possible."
fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: percentunit overrides: [] gridPos: h: 8 w: 12 x: 12 'y': 162 id: 35 links: - targetBlank: true title: Profiling url: https://docs.victoriametrics.com/victoriametrics/vmalert/#profiling options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 11.5.0 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: "max(\n rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]) \n / \n process_cpu_cores_available{job=~\"$job\", instance=~\"$instance\"}\n) by(job)" format: time_series interval: '' intervalFactor: 1 legendFormat: '{{`{{`}}job{{`}}`}}' range: true refId: A title: CPU usage %($instance) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: 'Share for memory allocated by the process itself. When memory usage reaches 100% it will be likely OOM-killed.
Safe memory usage % considered to be below 80%' fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: percentunit overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 170 id: 65 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 11.5.0 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: |- max( max_over_time(process_resident_memory_anon_bytes{job=~"$job", instance=~"$instance"}[$__rate_interval]) / vm_available_memory_bytes{job=~"$job", instance=~"$instance"} ) by(instance) interval: '' legendFormat: __auto range: true refId: A title: RSS anonymous memory % usage type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows the max number of CPU cores used by a `job` and the corresponding limit. 
fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 12 'y': 170 id: 56 links: - targetBlank: true title: Profiling url: https://docs.victoriametrics.com/victoriametrics/vmalert/#profiling options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 11.5.0 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: max(rate(process_cpu_seconds_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job) format: time_series interval: '' intervalFactor: 1 legendFormat: '{{`{{`}}job{{`}}`}}' range: true refId: A - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: min(process_cpu_cores_available{job=~"$job", instance=~"$instance"}) by(job) format: time_series hide: false interval: '' intervalFactor: 1 legendFormat: limit ({{`{{`}}job{{`}}`}}) range: true refId: B title: CPU usage ($instance) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: 'Amount of used RSS memory If you think that usage is abnormal or unexpected, please file an issue and attach memory profile if possible.'
fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: bytes overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 178 id: 57 links: - targetBlank: true title: Profiling url: https://docs.victoriametrics.com/victoriametrics/vmalert/#profiling options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 11.5.0 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: |- max( max_over_time(process_resident_memory_bytes{job=~"$job", instance=~"$instance"}[$__rate_interval]) ) by(job) interval: '' legendFormat: '{{`{{`}}job{{`}}`}}' range: true refId: A title: Memory usage ($instance) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: 'Shows CPU pressure based on [Pressure Stall Information](https://docs.kernel.org/accounting/psi.html). The lower the better.'
fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: line decimals: 0 links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green unit: s overrides: [] gridPos: h: 8 w: 12 x: 12 'y': 178 id: 66 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 11.5.0 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code expr: sum(rate(process_pressure_cpu_waiting_seconds_total{job=~"$job"}[$__rate_interval])) by (job, instance) format: time_series interval: '' intervalFactor: 2 legendFormat: '{{`{{`}}instance{{`}}`}} - waiting' range: true refId: A - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code expr: sum(rate(process_pressure_cpu_stalled_seconds_total{job=~"$job"}[$__rate_interval])) by (job, instance) format: time_series hide: false interval: '' intervalFactor: 2 legendFormat: '{{`{{`}}instance{{`}}`}} - stalled' range: true refId: B title: CPU pressure type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: 'Shows memory pressure based on [Pressure Stall Information](https://docs.kernel.org/accounting/psi.html). The lower the better.' 
fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: line decimals: 0 links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green unit: s overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 186 id: 67 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 11.5.0 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code expr: sum(rate(process_pressure_memory_waiting_seconds_total{job=~"$job"}[$__rate_interval])) by (job, instance) format: time_series interval: '' intervalFactor: 2 legendFormat: '{{`{{`}}instance{{`}}`}} - waiting' range: true refId: A - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code expr: sum(rate(process_pressure_memory_stalled_seconds_total{job=~"$job"}[$__rate_interval])) by (job, instance) format: time_series hide: false interval: '' intervalFactor: 2 legendFormat: '{{`{{`}}instance{{`}}`}} - stalled' range: true refId: B title: Memory pressure type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never 
spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' decimals: 0 links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 12 'y': 186 id: 41 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true tooltip: hideZeros: false mode: multi sort: none pluginVersion: 11.5.0 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code expr: sum(go_goroutines{job=~"$job", instance=~"$instance"}) by(job) format: time_series interval: '' intervalFactor: 2 legendFormat: __auto range: true refId: A title: Goroutines ($instance) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: 'Panel shows the percentage of open file descriptors in the OS. Reaching the limit of open files can cause various issues and must be prevented. See how to change limits here https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a' fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' decimals: 3 links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: percentunit overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 194 id: 39 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 11.5.0 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code 
exemplar: false expr: |- max( max_over_time(process_open_fds{job=~"$job", instance=~"$instance"}[$__rate_interval]) / process_max_fds{job=~"$job", instance=~"$instance"} ) by(job) format: time_series interval: '' intervalFactor: 2 legendFormat: __auto range: true refId: A title: Open FDs usage % ($instance) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: "Shows the time goroutines have spent in runnable state before actually running. The lower is better.\n\nHigh values or values exceeding the threshold is usually a sign of insufficient CPU resources or CPU throttling. \n\nVerify that service has enough CPU resources. Otherwise, the service could work unreliably with delays in processing." fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: line decimals: 0 links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 0.1 unit: s overrides: [] gridPos: h: 8 w: 12 x: 12 'y': 194 id: 61 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 11.5.0 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code expr: max(histogram_quantile(0.99, sum(rate(go_sched_latencies_seconds_bucket{job=~"$job"}[$__rate_interval])) by (job, instance, le))) by(job) format: time_series interval: '' intervalFactor: 2 legendFormat: __auto range: true refId: A title: Go scheduling latency type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds 
description: 'Shows the percent of CPU spent on garbage collection. If % is high, then CPU usage can be decreased by changing GOGC to higher values. Increasing GOGC value will increase memory usage, and decrease CPU usage. Try searching for keyword `GOGC` at https://docs.victoriametrics.com/victoriametrics/troubleshooting/ ' fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' decimals: 0 links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: percentunit overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 202 id: 59 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 11.5.0 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code expr: "max(\n rate(go_gc_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]) \n / rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n ) by(job)" format: time_series interval: '' intervalFactor: 2 legendFormat: __auto range: true refId: A title: CPU spent on GC ($instance) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows the rate of allocations in memory. Sudden increase in allocations would mean increased pressure on Go Garbage Collector and can saturate CPU resources of the application. 
fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 barWidthFactor: 0.6 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: line decimals: 0 links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green unit: bytes overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 210 id: 64 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true tooltip: hideZeros: false mode: multi sort: desc pluginVersion: 11.5.0 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code expr: sum(rate(go_memstats_alloc_bytes_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by (job, instance) format: time_series interval: '' intervalFactor: 2 legendFormat: __auto range: true refId: A title: Memory allocations rate type: timeseries title: Resource usage type: row - collapsed: true gridPos: h: 1 w: 24 x: 0 'y': 26 id: 62 panels: - datasource: type: {{ $defaultDatasource }} uid: $ds fieldConfig: defaults: color: mode: thresholds custom: align: auto cellOptions: type: auto inspect: false mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 overrides: - matcher: id: byName options: Value properties: - id: custom.hidden value: true - matcher: id: byName options: Time properties: - id: custom.hidden value: true gridPos: h: 7 w: 12 x: 0 'y': 345 id: 50 options: cellHeight: sm footer: countRows: false fields: '' reducer: - sum show: false showHeader: true sortBy: - desc: true displayName: job pluginVersion: 10.4.2 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: sum(flag{is_set="true", 
job=~"$job", instance=~"$instance"}) by(job, instance, name, value) format: table instant: true legendFormat: __auto range: false refId: A title: Non-default flags transformations: - id: groupBy options: fields: instance: aggregations: - uniqueValues operation: aggregate job: aggregations: [] operation: groupby name: aggregations: [] operation: groupby value: aggregations: [] operation: groupby type: table - datasource: type: {{ $defaultDatasource }} uid: $ds description: "Missed evaluation means that group evaluation time takes longer than the configured evaluation interval. \nThis may result in missed alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency for such groups. See https://docs.victoriametrics.com/victoriametrics/vmalert/#groups\n\nIf rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/victoriametrics/troubleshooting/#slow-queries." fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 drawStyle: bars fillOpacity: 10 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 7 w: 12 x: 12 'y': 345 id: 58 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true tooltip: mode: multi sort: none pluginVersion: 9.2.6 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: sum(increase(vmalert_iteration_missed_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job, group, file) > 0 interval: 1m legendFormat: 
({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}({{`{{`}}file{{`}}`}}) range: true refId: A title: Missed evaluations ($instance) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: 'Shows the number of restarts per job. The chart can be useful to identify periodic process restarts and correlate them with potential issues or anomalies. Normally, processes shouldn''t restart unless the restart was initiated by the user. The reason for restarts should be figured out by checking the logs of each specific service. ' fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto axisSoftMin: 0 barAlignment: 0 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: stepAfter lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' decimals: 0 links: [] mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: none overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 352 id: 63 options: legend: calcs: - lastNotNull displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: mode: multi sort: desc pluginVersion: 9.1.0 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code expr: sum(changes(vm_app_start_timestamp{job=~"$job", instance=~"$instance"}[$__rate_interval]) > 0) by(job) format: time_series instant: false legendFormat: '{{`{{`}}job{{`}}`}}' refId: A title: Restarts ($job) type: timeseries title: Troubleshooting type: row - collapsed: true gridPos: h: 1 w: 24 x: 0 'y': 27 id: 17 panels: - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows top $topk current active (firing) alerting rules. 
fieldConfig: defaults: color: mode: palette-classic custom: axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 370 id: 14 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: mode: multi sort: desc pluginVersion: 9.2.6 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: topk($topk, sum(vmalert_alerts_firing{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"}) by(job, group, file, alertname) > 0) interval: '' legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}.{{`{{`}}alertname{{`}}`}}({{`{{`}}file{{`}}`}}) range: true refId: A title: Top $topk active alerts ($group) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows the events when rule execution resulted into an error. Check the logs for more details. 
fieldConfig: defaults: color: mode: palette-classic custom: axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 12 'y': 370 id: 13 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: mode: multi sort: desc pluginVersion: 9.2.6 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: sum(increase(vmalert_alerting_rules_errors_total{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"}[$__rate_interval])) by(job, group, file, alertname) > 0 interval: '' legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}.{{`{{`}}alertname{{`}}`}}({{`{{`}}file{{`}}`}}) range: true refId: A title: Errors ($group) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: 'Shows the current pending alerting rules per group. A pending rule is one that has been active for less than its configured `for` duration.' 
fieldConfig: defaults: color: mode: palette-classic custom: axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 378 id: 20 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true sortBy: Mean sortDesc: true tooltip: mode: multi sort: desc pluginVersion: 9.2.6 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: sum(vmalert_alerts_pending{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"}) by(job, group, file, alertname) > 0 interval: '' legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}.{{`{{`}}alertname{{`}}`}}({{`{{`}}file{{`}}`}}) range: true refId: A title: Pending ($group) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows the error rate for the attempts to send alerts to Alertmanager. If not zero, it means there are issues with sending notifications to Alertmanager and some alerts may not be delivered properly. Check the logs for more details. 
fieldConfig: defaults: color: mode: palette-classic custom: axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 12 'y': 378 id: 32 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true tooltip: mode: multi sort: desc pluginVersion: 9.2.6 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds exemplar: false expr: sum(rate(vmalert_alerts_send_errors_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(instance, addr) > 0 interval: '' legendFormat: '{{`{{`}}instance{{`}}`}} => {{`{{`}}addr{{`}}`}}' refId: A title: Errors rate to Alertmanager type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows how many alerts are sent to Alertmanager per second. Only active alerts are sent. 
fieldConfig: defaults: color: mode: palette-classic custom: axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 386 id: 26 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true tooltip: mode: multi sort: desc pluginVersion: 9.2.6 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: sum(rate(vmalert_alerts_sent_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job, addr) > 0 interval: '' legendFormat: '{{`{{`}}job{{`}}`}} => {{`{{`}}addr{{`}}`}}' range: true refId: A title: Requests rate to Alertmanager by job ($group) type: timeseries title: Alerting rules ($instance) type: row - collapsed: true gridPos: h: 1 w: 24 x: 0 'y': 28 id: 28 panels: - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows the top $topk recording rules which generate the most [samples](https://docs.victoriametrics.com/victoriametrics/keyconcepts/#raw-samples). Each generated sample is basically a time series which is then ingested into the configured remote storage. Rules with high numbers may cause the most pressure on the remote database and become a source of too high cardinality. 
fieldConfig: defaults: color: mode: palette-classic custom: axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 385 id: 31 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: mode: multi sort: desc pluginVersion: 9.2.6 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: "topk($topk, \n max(\n sum(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, instance, group, file, recording) > 0\n ) by(job, group, file, recording)\n)" interval: '' legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}.{{`{{`}}recording{{`}}`}}({{`{{`}}file{{`}}`}}) range: true refId: A title: Top $topk rules by produced samples ($group) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: 'Shows the rules which do not produce any [samples](https://docs.victoriametrics.com/victoriametrics/keyconcepts/#raw-samples) during the evaluation. Usually it means that such rules are misconfigured, since they give no output during the evaluation. Please check if rule''s expression is correct and it is working as expected.' 
fieldConfig: defaults: color: mode: palette-classic custom: axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: true stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 12 'y': 385 id: 33 options: legend: calcs: - lastNotNull - max - mean displayMode: table placement: bottom showLegend: true sortBy: Last * sortDesc: true tooltip: mode: multi sort: desc pluginVersion: 8.0.3 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: count(vmalert_recording_rules_last_evaluation_samples{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"} < 1) by(job, group, file, recording) interval: '' legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}.{{`{{`}}recording{{`}}`}}({{`{{`}}file{{`}}`}}) range: true refId: A title: Rules with 0 produced samples ($group) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds fieldConfig: defaults: color: mode: palette-classic custom: axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 393 id: 30 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true tooltip: mode: multi 
sort: none pluginVersion: 9.2.6 targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: false expr: sum(increase(vmalert_recording_rules_errors_total{job=~"$job", instance=~"$instance", group=~"$group", file=~"$file"}[$__rate_interval])) by(job, group, file, recording) > 0 interval: '' legendFormat: ({{`{{`}}job{{`}}`}}) {{`{{`}}group{{`}}`}}.{{`{{`}}recording{{`}}`}}({{`{{`}}file{{`}}`}}) range: true refId: A title: Errors ($group) type: timeseries title: Recording rules ($instance) type: row - collapsed: true gridPos: h: 1 w: 24 x: 0 'y': 29 id: 55 panels: - datasource: type: {{ $defaultDatasource }} uid: $ds fieldConfig: defaults: color: mode: palette-classic custom: axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: auto spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 351 id: 52 options: legend: calcs: [] displayMode: list placement: bottom showLegend: true tooltip: mode: single sort: desc targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code expr: sum(rate(vmalert_remotewrite_sent_rows_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job) legendFormat: __auto range: true refId: A title: Datapoints send rate ($instance) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows the number of datapoints dropped by vmalert while sending to the configured remote write URL. vmalert performs up to 5 retries before dropping the data. Check vmalert's error logs for the specific error message. 
fieldConfig: defaults: color: mode: palette-classic custom: axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: auto spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' mappings: [] thresholds: mode: absolute steps: - color: green - color: red value: 80 overrides: [] gridPos: h: 8 w: 12 x: 12 'y': 351 id: 53 options: legend: calcs: [] displayMode: list placement: bottom showLegend: true tooltip: mode: single sort: desc targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code expr: sum(rate(vmalert_remotewrite_dropped_rows_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job) > 0 legendFormat: __auto range: true refId: A title: Datapoints drop rate ($instance) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: 'Shows current number of established connections to remote write endpoints. 
' fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: short overrides: [] gridPos: h: 8 w: 12 x: 0 'y': 378 id: 54 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true tooltip: mode: multi sort: desc targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: true expr: sum(max_over_time(vmalert_remotewrite_conns{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job) interval: '' legendFormat: __auto range: true refId: A title: Connections ($instance) type: timeseries - datasource: type: {{ $defaultDatasource }} uid: $ds description: Shows the global rate for number of written bytes via remote write connections. 
fieldConfig: defaults: color: mode: palette-classic custom: axisBorderShow: false axisCenteredZero: false axisColorMode: text axisLabel: '' axisPlacement: auto barAlignment: 0 drawStyle: line fillOpacity: 0 gradientMode: none hideFrom: legend: false tooltip: false viz: false insertNulls: false lineInterpolation: linear lineWidth: 1 pointSize: 5 scaleDistribution: type: linear showPoints: never spanNulls: false stacking: group: A mode: none thresholdsStyle: mode: 'off' links: [] mappings: [] min: 0 thresholds: mode: absolute steps: - color: green - color: red value: 80 unit: decbytes overrides: [] gridPos: h: 8 w: 12 x: 12 'y': 378 id: 60 options: legend: calcs: - mean - lastNotNull - max displayMode: table placement: bottom showLegend: true tooltip: mode: multi sort: desc targets: - datasource: type: {{ $defaultDatasource }} uid: $ds editorMode: code exemplar: true expr: sum(rate(vmalert_remotewrite_conn_bytes_written_total{job=~"$job", instance=~"$instance"}[$__rate_interval])) by(job) > 0 interval: '' legendFormat: __auto range: true refId: A title: Bytes write rate ($instance) type: timeseries title: Remote write type: row preload: false refresh: '' schemaVersion: 41 tags: - victoriametrics - vm-k8s-stack templating: list: - current: text: VictoriaMetrics - cluster value: PAF93674D0B4E9963 includeAll: false name: ds options: [] query: {{ $defaultDatasource }} refresh: 1 regex: '' type: datasource - current: {} datasource: type: prometheus uid: $ds definition: label_values(vm_app_version{version=~"^vmalert.*"}, job) includeAll: true multi: true name: job options: [] query: query: label_values(vm_app_version{version=~"^vmalert.*"}, job) refId: StandardVariableQuery refresh: 1 regex: '' type: query - allValue: .* current: {} datasource: type: prometheus uid: $ds definition: label_values(vm_app_version{job=~"$job"}, instance) includeAll: true multi: true name: instance options: [] query: query: label_values(vm_app_version{job=~"$job"}, instance) refId: 
StandardVariableQuery refresh: 1 regex: '' type: query - allValue: .* current: {} datasource: type: prometheus uid: $ds definition: label_values(vmalert_iteration_total{job=~"$job", instance=~"$instance"},file) includeAll: true multi: true name: file options: [] query: query: label_values(vmalert_iteration_total{job=~"$job", instance=~"$instance"},file) refId: PrometheusVariableQueryEditor-VariableQuery refresh: 1 regex: '' type: query - allValue: .* current: {} datasource: type: prometheus uid: $ds definition: label_values(vmalert_iteration_total{job=~"$job", instance=~"$instance"}, group) includeAll: true multi: true name: group options: [] query: query: label_values(vmalert_iteration_total{job=~"$job", instance=~"$instance"}, group) refId: StandardVariableQuery refresh: 1 regex: '' type: query - current: text: '5' value: '5' includeAll: false name: topk options: - selected: true text: '5' value: '5' - selected: false text: '10' value: '10' - selected: false text: '20' value: '20' - selected: false text: '30' value: '30' - selected: false text: '40' value: '40' - selected: false text: '50' value: '50' query: 5, 10, 20, 30, 40, 50 type: custom - baseFilters: [] datasource: type: prometheus uid: ${ds} filters: [] name: filter type: adhoc time: from: now-3h to: now timepicker: {} timezone: {{ default "utc" ($Values.defaultDashboards).defaultTimezone }} title: VictoriaMetrics - vmalert uid: LzldHAVnz version: 1