# infra/charts/victoria-metrics-k8s-stack/files/rules/generated/vm-health.yaml

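{{- /*
Shared helpers resolved from chart values: the runbook base URL, the cluster label,
the extra group-by labels appended to aggregations below, and the Grafana host
(taken from the bundled Grafana ingress when it is enabled, otherwise from the
external Grafana configuration).
*/ -}}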
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $additionalGroupByLabels := append $Values.defaultRules.additionalGroupByLabels $clusterLabel }}
{{- $groupLabels := join "," $additionalGroupByLabels }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
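# Group-level keys read by the chart's rule templates: `condition` gates whether the
# group is rendered, `name` becomes the rule group name, and `rules` lists the alerts.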
condition: '{{ true }}'
name: vm-health
rules:
- alert: TooManyRestarts
  annotations:
    description: 'Job {{`{{`}} $labels.job {{`}}`}} (instance {{`{{`}} $labels.instance {{`}}`}}) has restarted more than twice in the last 15 minutes. It might be crashlooping.'
    summary: '{{`{{`}} $labels.job {{`}}`}} too many restarts (instance {{`{{`}} $labels.instance {{`}}`}})'
  condition: '{{ true }}'
  expr: changes(process_start_time_seconds{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth|victorialogs|vlstorage|vlselect|vlinsert).*"}[15m]) > 2
  labels:
    severity: critical
- alert: ServiceDown
  annotations:
    description: '{{`{{`}} $labels.instance {{`}}`}} of job {{`{{`}} $labels.job {{`}}`}} has been down for more than 2 minutes.'
    summary: 'Service {{`{{`}} $labels.job {{`}}`}} is down on {{`{{`}} $labels.instance {{`}}`}}'
  condition: '{{ true }}'
  expr: up{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth|victorialogs|vlstorage|vlselect|vlinsert).*"} == 0
  for: 2m
  labels:
    severity: critical
- alert: ProcessNearFDLimits
  annotations:
    description: 'Exhausting the OS file descriptors limit can cause severe degradation of the process. Consider increasing the limit as soon as possible.'
    summary: 'Number of free file descriptors is less than 100 for "{{`{{`}} $labels.job {{`}}`}}"("{{`{{`}} $labels.instance {{`}}`}}") for the last 5m'
  condition: '{{ true }}'
  expr: (process_max_fds - process_open_fds) < 100
  for: 5m
  labels:
    severity: critical
- alert: TooHighMemoryUsage
  annotations:
    description: 'Too high memory usage may result in multiple issues such as OOMs or degraded performance. Consider either increasing available memory or decreasing the load on the process.'
    summary: 'More than 80% of memory is used by "{{`{{`}} $labels.job {{`}}`}}"("{{`{{`}} $labels.instance {{`}}`}}")'
  condition: '{{ true }}'
  expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8
  for: 5m
  labels:
    severity: critical
- alert: TooHighCPUUsage
  annotations:
    description: 'Too high CPU usage may be a sign of insufficient resources and can make the process unstable. Consider either increasing available CPU resources or decreasing the load on the process.'
    summary: 'More than 90% of CPU is used by "{{`{{`}} $labels.job {{`}}`}}"("{{`{{`}} $labels.instance {{`}}`}}") during the last 5m'
  condition: '{{ true }}'
  expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9
  for: 5m
  labels:
    severity: critical
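# The two resource alerts above usually call for resizing the affected component.
# A minimal sketch of chart values that raise CPU/memory for a single-node setup; the
# `vmsingle` key and `spec.resources` layout are assumptions based on the
# victoria-metrics-k8s-stack values schema, so verify against your chart version:
#
#   vmsingle:
#     spec:
#       resources:
#         requests:
#           cpu: "1"
#           memory: 2Gi
#         limits:
#           cpu: "2"
#           memory: 4Gi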
- alert: TooHighGoroutineSchedulingLatency
  annotations:
    description: 'The Go runtime is unable to schedule goroutine execution in acceptable time. This is usually a sign of insufficient CPU resources or CPU throttling. Verify that the service has enough CPU resources. Otherwise, the service could work unreliably, with delays in processing.'
    summary: '"{{`{{`}} $labels.job {{`}}`}}"("{{`{{`}} $labels.instance {{`}}`}}") has insufficient CPU resources for >15m'
  condition: '{{ true }}'
  expr: histogram_quantile(0.99, sum(rate(go_sched_latencies_seconds_bucket[5m])) by (le,job,instance,{{ $groupLabels }})) > 0.1
  for: 15m
  labels:
    severity: critical
- alert: TooManyLogs
  annotations:
    description: 'Logging rate for job "{{`{{`}} $labels.job {{`}}`}}" ({{`{{`}} $labels.instance {{`}}`}}) is {{`{{`}} $value {{`}}`}} for the last 15m. It is worth checking the logs for specific error messages.'
    summary: 'Too many logs printed for job "{{`{{`}} $labels.job {{`}}`}}" ({{`{{`}} $labels.instance {{`}}`}})'
  condition: '{{ true }}'
  expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0
  for: 15m
  labels:
    severity: warning
- alert: TooManyTSIDMisses
  annotations:
    description: 'The rate of TSID misses during query lookups is too high for "{{`{{`}} $labels.job {{`}}`}}" ({{`{{`}} $labels.instance {{`}}`}}). Make sure you''re running VictoriaMetrics v1.85.3 or higher. Related issue: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502'
    summary: 'Too many TSID misses for job "{{`{{`}} $labels.job {{`}}`}}" ({{`{{`}} $labels.instance {{`}}`}})'
  condition: '{{ true }}'
  expr: rate(vm_missing_tsids_for_metric_id_total[5m]) > 0
  for: 10m
  labels:
    severity: critical
- alert: ConcurrentInsertsHitTheLimit
  annotations:
    description: "The limit of concurrent inserts on instance {{`{{`}} $labels.instance {{`}}`}} depends on the number of CPUs.\nUsually, when a component constantly hits the limit, it is likely overloaded and requires more CPU.\nIn some cases, for components like vmagent or vminsert, the alert might trigger if there are too many clients\nmaking write attempts. If vmagent's or vminsert's CPU usage and network saturation are at a normal level,\nthen it might be worth adjusting the `-maxConcurrentInserts` cmd-line flag.\n"
    summary: '{{`{{`}} $labels.job {{`}}`}} on instance {{`{{`}} $labels.instance {{`}}`}} is constantly hitting concurrent inserts limit'
  condition: '{{ true }}'
  expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity
  for: 15m
  labels:
    severity: warning
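# If CPU and network look healthy but the alert above keeps firing, the insert
# concurrency limit can be raised through the operator CRs. A minimal sketch, assuming
# the chart exposes a `vmagent.spec.extraArgs` map (flag names without the leading dash);
# verify the exact values path for your chart version:
#
#   vmagent:
#     spec:
#       extraArgs:
#         maxConcurrentInserts: "32"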
- alert: IndexDBRecordsDrop
  annotations:
    description: "VictoriaMetrics could skip registering new time series during ingestion if they fail the validation process.\nFor example, `reason=too_long_item` means that a time series cannot exceed 64KB. Please reduce the number\nof labels or label values for such series, or enforce these limits via the `-maxLabelsPerTimeseries` and\n`-maxLabelValueLen` command-line flags.\n"
    summary: 'IndexDB skipped registering items during data ingestion with reason={{`{{`}} $labels.reason {{`}}`}}.'
  condition: '{{ true }}'
  expr: increase(vm_indexdb_items_dropped_total[5m]) > 0
  labels:
    severity: critical
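# The label limits mentioned above are plain command-line flags on the ingestion
# component. A minimal sketch, assuming a `vmsingle` deployment with an `extraArgs`
# map in the chart values (the limit values are illustrative, not defaults); for
# cluster setups the flags would go on vminsert instead:
#
#   vmsingle:
#     spec:
#       extraArgs:
#         maxLabelsPerTimeseries: "40"
#         maxLabelValueLen: "4096"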
- alert: RowsRejectedOnIngestion
  annotations:
    description: 'Ingested rows on instance "{{`{{`}} $labels.instance {{`}}`}}" are rejected due to the following reason: "{{`{{`}} $labels.reason {{`}}`}}"'
    summary: 'Some rows are rejected on "{{`{{`}} $labels.instance {{`}}`}}" on ingestion attempt'
  condition: '{{ true }}'
  expr: rate(vm_rows_ignored_total[5m]) > 0
  for: 15m
  labels:
    severity: warning
- alert: TooHighQueryLoad
  annotations:
    description: 'Instance {{`{{`}} $labels.instance {{`}}`}} ({{`{{`}} $labels.job {{`}}`}}) is failing to serve read queries during the last 15m.
      The concurrency limit `-search.maxConcurrentRequests` was reached on this instance and extra queries were
      put into the queue for the `-search.maxQueueDuration` interval. But even after waiting in the queue these queries weren''t served.
      This happens if the instance is overloaded with the current workload, or the datasource is too slow to respond.
      Possible solutions are the following:
      * reduce the query load;
      * increase compute resources or the number of replicas;
      * adjust the limits `-search.maxConcurrentRequests` and `-search.maxQueueDuration`.
      See more at https://docs.victoriametrics.com/victoriametrics/troubleshooting/#slow-queries.'
    summary: 'Read queries fail with timeout for {{`{{`}} $labels.job {{`}}`}} on instance {{`{{`}} $labels.instance {{`}}`}}'
  condition: '{{ true }}'
  expr: increase(vm_concurrent_select_limit_timeout_total[5m]) > 0
  for: 15m
  labels:
    severity: warning
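# The read-path limits referenced above are flags on the query-serving component
# (vmselect in cluster mode, vmsingle otherwise). A minimal sketch, assuming the chart
# exposes `vmcluster.spec.vmselect.extraArgs` (flag names without the leading dash, and
# the values shown are illustrative); verify the values path for your chart version:
#
#   vmcluster:
#     spec:
#       vmselect:
#         extraArgs:
#           search.maxConcurrentRequests: "16"
#           search.maxQueueDuration: "30s"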