infra/charts/victoria-metrics-k8s-stack/files/rules/generated/vmoperator.yaml
Konstantin Averkiev c45fd1a6ac added vm stack
2025-07-08 17:29:32 +03:00

63 lines
2.6 KiB
YAML

{{- /*
  Prelude for the vmoperator rule group.

  $Values                  chart values: .helm.Values when present, otherwise .Values.
  $runbookUrl              runbook base URL with a public default (not referenced by the rules in this file).
  $clusterLabel            label name used to tell clusters apart, "cluster" by default.
  $additionalGroupByLabels user supplied group-by labels plus $clusterLabel.
  $groupLabels             the list above joined with commas, used in every "by (...)" clause below.
  $grafanaHost             first grafana ingress host when grafana is enabled, otherwise the
                           external grafana host (not referenced by the rules in this file).

  Every lookup is nil-safe: a missing defaultRules or grafana section, an unset
  additionalGroupByLabels list, or an empty ingress host list must not abort rendering.
  "ternary" evaluates both branches eagerly, so the host lookup uses "first" on a
  defaulted list instead of "index ... 0", which fails on nil or empty lists.
*/}}
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $additionalGroupByLabels := append (($Values.defaultRules).additionalGroupByLabels | default list) $clusterLabel }}
{{- $groupLabels := join "," $additionalGroupByLabels }}
{{- $grafanaHost := ternary (first ((($Values.grafana).ingress).hosts | default list)) (($Values.external).grafana).host (($Values.grafana).enabled | default false) }}
# Group-level switch; the template expression renders to the string 'true'.
# NOTE(review): presumably read by the chart to decide whether the group is emitted - confirm against the consuming template.
condition: '{{ true }}'
# Name of the rule group.
name: vmoperator
# Alerting rules that belong to this group.
rules:
# LogErrors: fires when the operator logs error-level messages.
# The expression sums the 5m rate of operator_log_messages_total with level="error"
# per group-by label set and must stay above zero for 15 minutes.
- alert: LogErrors
  annotations:
    dashboard: '{{`{{`}} $externalURL {{`}}`}}/d/1H179hunk/victoriametrics-operator?ds={{`{{`}} $labels.dc {{`}}`}}&orgId=1&viewPanel=16'
    description: 'Operator has too many errors at logs: {{`{{`}} $value{{`}}`}}, check operator logs'
    summary: 'Too many errors at logs of operator: {{`{{`}} $value{{`}}`}}'
  condition: '{{ true }}'
  # The body of the block scalar must be indented deeper than the expr key,
  # otherwise the scalar is empty and the query lines are parsed as YAML.
  expr: |-
    sum(
      rate(
        operator_log_messages_total{
          level="error",job=~".*((victoria.*)|vm)-?operator"
        }[5m]
      )
    ) by ({{ $groupLabels }}) > 0
  for: 15m
  labels:
    severity: warning
    show_at: dashboard
# ReconcileErrors: fires when the operator's reconcile loop keeps failing.
# The expression sums the 5m rate of controller_runtime_reconcile_errors_total
# per group-by label set and must stay above zero for 10 minutes.
- alert: ReconcileErrors
  annotations:
    dashboard: '{{`{{`}} $externalURL {{`}}`}}/d/1H179hunk/victoriametrics-operator?ds={{`{{`}} $labels.dc {{`}}`}}&orgId=1&viewPanel=10'
    description: 'Operator cannot parse response from k8s api server, possible bug: {{`{{`}} $value {{`}}`}}, check operator logs'
    summary: 'Too many errors at reconcile loop of operator: {{`{{`}} $value{{`}}`}}'
  condition: '{{ true }}'
  # The body of the block scalar must be indented deeper than the expr key,
  # otherwise the scalar is empty and the query lines are parsed as YAML.
  expr: |-
    sum(
      rate(
        controller_runtime_reconcile_errors_total{
          job=~".*((victoria.*)|vm)-?operator"
        }[5m]
      )
    ) by ({{ $groupLabels }}) > 0
  for: 10m
  labels:
    severity: warning
    show_at: dashboard
# HighQueueDepth: fires when a controller work queue stays backed up.
# The expression sums workqueue_depth per queue name and group-by label set,
# restricted to the listed controller names, and must stay above 10 for 15 minutes.
- alert: HighQueueDepth
  annotations:
    dashboard: '{{`{{`}} $externalURL {{`}}`}}/d/1H179hunk/victoriametrics-operator?ds={{`{{`}} $labels.dc {{`}}`}}&orgId=1&viewPanel=20'
    description: 'Operator cannot handle reconciliation load for controller: `{{`{{`}}- $labels.name {{`}}`}}`, current depth: {{`{{`}} $value {{`}}`}}'
    summary: 'Too many `{{`{{`}}- $labels.name {{`}}`}}` in queue: {{`{{`}} $value {{`}}`}}'
  condition: '{{ true }}'
  # The body of the block scalar must be indented deeper than the expr key,
  # otherwise the scalar is empty and the query lines are parsed as YAML.
  expr: |-
    sum(
      workqueue_depth{
        job=~".*((victoria.*)|vm)-?operator",
        name=~"(vmagent|vmalert|vmalertmanager|vmauth|vmcluster|vmnodescrape|vmpodscrape|vmprobe|vmrule|vmservicescrape|vmsingle|vmstaticscrape)"
      }
    ) by (name,{{ $groupLabels }}) > 10
  for: 15m
  labels:
    severity: warning
    show_at: dashboard