# infra/charts/victoria-metrics-k8s-stack/files/rules/generated/vm-health.yaml

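{{- /*
Shared helpers resolved from chart values: the runbook base URL, the cluster label,
the extra group-by labels appended to aggregations below, and the Grafana host
(taken from the bundled Grafana ingress when it is enabled, otherwise from the
external Grafana configuration).
*/ -}}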
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $additionalGroupByLabels := append $Values.defaultRules.additionalGroupByLabels $clusterLabel }}
{{- $groupLabels := join "," $additionalGroupByLabels }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
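# Group-level keys read by the chart's rule templates: `condition` gates whether the
# group is rendered, `name` becomes the rule group name, and `rules` lists the alerts.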
condition: '{{ true }}'
name: vm-health
rules:
- alert: TooManyRestarts
  annotations:
    description: 'Job {{`{{`}} $labels.job {{`}}`}} (instance {{`{{`}} $labels.instance {{`}}`}}) has restarted more than twice in the last 15 minutes. It might be crashlooping.'
    summary: '{{`{{`}} $labels.job {{`}}`}} too many restarts (instance {{`{{`}} $labels.instance {{`}}`}})'
  condition: '{{ true }}'
  expr: changes(process_start_time_seconds{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth|victorialogs|vlstorage|vlselect|vlinsert).*"}[15m]) > 2
  labels:
    severity: critical
- alert: ServiceDown
  annotations:
    description: '{{`{{`}} $labels.instance {{`}}`}} of job {{`{{`}} $labels.job {{`}}`}} has been down for more than 2 minutes.'
    summary: 'Service {{`{{`}} $labels.job {{`}}`}} is down on {{`{{`}} $labels.instance {{`}}`}}'
  condition: '{{ true }}'
  expr: up{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth|victorialogs|vlstorage|vlselect|vlinsert).*"} == 0
  for: 2m
  labels:
    severity: critical
- alert: ProcessNearFDLimits
  annotations:
    description: 'Exhausting the OS file descriptors limit can cause severe degradation of the process. Consider increasing the limit as soon as possible.'
    summary: 'Number of free file descriptors is less than 100 for "{{`{{`}} $labels.job {{`}}`}}"("{{`{{`}} $labels.instance {{`}}`}}") for the last 5m'
  condition: '{{ true }}'
  expr: (process_max_fds - process_open_fds) < 100
  for: 5m
  labels:
    severity: critical
- alert: TooHighMemoryUsage
  annotations:
    description: 'Too high memory usage may result in multiple issues such as OOMs or degraded performance. Consider either increasing available memory or decreasing the load on the process.'
    summary: 'More than 80% of memory is used by "{{`{{`}} $labels.job {{`}}`}}"("{{`{{`}} $labels.instance {{`}}`}}")'
  condition: '{{ true }}'
  expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8
  for: 5m
  labels:
    severity: critical
- alert: TooHighCPUUsage
  annotations:
    description: 'Too high CPU usage may be a sign of insufficient resources and can make the process unstable. Consider either increasing available CPU resources or decreasing the load on the process.'
    summary: 'More than 90% of CPU is used by "{{`{{`}} $labels.job {{`}}`}}"("{{`{{`}} $labels.instance {{`}}`}}") during the last 5m'
  condition: '{{ true }}'
  expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9
  for: 5m
  labels:
    severity: critical
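# The two resource alerts above usually call for resizing the affected component.
# A minimal sketch of chart values that raise CPU/memory for a single-node setup; the
# `vmsingle` key and `spec.resources` layout are assumptions based on the
# victoria-metrics-k8s-stack values schema, so verify against your chart version:
#
#   vmsingle:
#     spec:
#       resources:
#         requests:
#           cpu: "1"
#           memory: 2Gi
#         limits:
#           cpu: "2"
#           memory: 4Gi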
- alert: TooHighGoroutineSchedulingLatency
  annotations:
    description: 'The Go runtime is unable to schedule goroutine execution in acceptable time. This is usually a sign of insufficient CPU resources or CPU throttling. Verify that the service has enough CPU resources. Otherwise, the service could work unreliably, with delays in processing.'
    summary: '"{{`{{`}} $labels.job {{`}}`}}"("{{`{{`}} $labels.instance {{`}}`}}") has insufficient CPU resources for >15m'
  condition: '{{ true }}'
  expr: histogram_quantile(0.99, sum(rate(go_sched_latencies_seconds_bucket[5m])) by (le,job,instance,{{ $groupLabels }})) > 0.1
  for: 15m
  labels:
    severity: critical
- alert: TooManyLogs
  annotations:
    description: 'Logging rate for job "{{`{{`}} $labels.job {{`}}`}}" ({{`{{`}} $labels.instance {{`}}`}}) is {{`{{`}} $value {{`}}`}} for the last 15m. It is worth checking the logs for specific error messages.'
    summary: 'Too many logs printed for job "{{`{{`}} $labels.job {{`}}`}}" ({{`{{`}} $labels.instance {{`}}`}})'
  condition: '{{ true }}'
  expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0
  for: 15m
  labels:
    severity: warning
- alert: TooManyTSIDMisses
  annotations:
    description: 'The rate of TSID misses during query lookups is too high for "{{`{{`}} $labels.job {{`}}`}}" ({{`{{`}} $labels.instance {{`}}`}}). Make sure you''re running VictoriaMetrics v1.85.3 or higher. Related issue: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502'
    summary: 'Too many TSID misses for job "{{`{{`}} $labels.job {{`}}`}}" ({{`{{`}} $labels.instance {{`}}`}})'
  condition: '{{ true }}'
  expr: rate(vm_missing_tsids_for_metric_id_total[5m]) > 0
  for: 10m
  labels:
    severity: critical
- alert: ConcurrentInsertsHitTheLimit
  annotations:
    description: "The limit of concurrent inserts on instance {{`{{`}} $labels.instance {{`}}`}} depends on the number of CPUs.\nUsually, when a component constantly hits the limit, it is likely overloaded and requires more CPU.\nIn some cases, for components like vmagent or vminsert, the alert might trigger if there are too many clients\nmaking write attempts. If vmagent's or vminsert's CPU usage and network saturation are at a normal level,\nthen it might be worth adjusting the `-maxConcurrentInserts` cmd-line flag.\n"
    summary: '{{`{{`}} $labels.job {{`}}`}} on instance {{`{{`}} $labels.instance {{`}}`}} is constantly hitting concurrent inserts limit'
  condition: '{{ true }}'
  expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity
  for: 15m
  labels:
    severity: warning
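# If CPU and network look healthy but the alert above keeps firing, the insert
# concurrency limit can be raised through the operator CRs. A minimal sketch, assuming
# the chart exposes a `vmagent.spec.extraArgs` map (flag names without the leading dash);
# verify the exact values path for your chart version:
#
#   vmagent:
#     spec:
#       extraArgs:
#         maxConcurrentInserts: "32"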
- alert: IndexDBRecordsDrop
  annotations:
    description: "VictoriaMetrics could skip registering new time series during ingestion if they fail the validation process.\nFor example, `reason=too_long_item` means that a time series cannot exceed 64KB. Please reduce the number\nof labels or label values for such series, or enforce these limits via the `-maxLabelsPerTimeseries` and\n`-maxLabelValueLen` command-line flags.\n"
    summary: 'IndexDB skipped registering items during data ingestion with reason={{`{{`}} $labels.reason {{`}}`}}.'
  condition: '{{ true }}'
  expr: increase(vm_indexdb_items_dropped_total[5m]) > 0
  labels:
    severity: critical
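# The label limits mentioned above are plain command-line flags on the ingestion
# component. A minimal sketch, assuming a `vmsingle` deployment with an `extraArgs`
# map in the chart values (the limit values are illustrative, not defaults); for
# cluster setups the flags would go on vminsert instead:
#
#   vmsingle:
#     spec:
#       extraArgs:
#         maxLabelsPerTimeseries: "40"
#         maxLabelValueLen: "4096"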
- alert: RowsRejectedOnIngestion
  annotations:
    description: 'Ingested rows on instance "{{`{{`}} $labels.instance {{`}}`}}" are rejected due to the following reason: "{{`{{`}} $labels.reason {{`}}`}}"'
    summary: 'Some rows are rejected on "{{`{{`}} $labels.instance {{`}}`}}" on ingestion attempt'
  condition: '{{ true }}'
  expr: rate(vm_rows_ignored_total[5m]) > 0
  for: 15m
  labels:
    severity: warning
- alert: TooHighQueryLoad
  annotations:
    description: 'Instance {{`{{`}} $labels.instance {{`}}`}} ({{`{{`}} $labels.job {{`}}`}}) is failing to serve read queries during the last 15m.
      The concurrency limit `-search.maxConcurrentRequests` was reached on this instance and extra queries were
      put into the queue for the `-search.maxQueueDuration` interval. But even after waiting in the queue these queries weren''t served.
      This happens if the instance is overloaded with the current workload, or the datasource is too slow to respond.
      Possible solutions are the following:
      * reduce the query load;
      * increase compute resources or the number of replicas;
      * adjust the limits `-search.maxConcurrentRequests` and `-search.maxQueueDuration`.
      See more at https://docs.victoriametrics.com/victoriametrics/troubleshooting/#slow-queries.'
    summary: 'Read queries fail with timeout for {{`{{`}} $labels.job {{`}}`}} on instance {{`{{`}} $labels.instance {{`}}`}}'
  condition: '{{ true }}'
  expr: increase(vm_concurrent_select_limit_timeout_total[5m]) > 0
  for: 15m
  labels:
    severity: warning
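# The read-path limits referenced above are flags on the query-serving component
# (vmselect in cluster mode, vmsingle otherwise). A minimal sketch, assuming the chart
# exposes `vmcluster.spec.vmselect.extraArgs` (flag names without the leading dash, and
# the values shown are illustrative); verify the values path for your chart version:
#
#   vmcluster:
#     spec:
#       vmselect:
#         extraArgs:
#           search.maxConcurrentRequests: "16"
#           search.maxQueueDuration: "30s"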