63 lines
2.6 KiB
YAML
63 lines
2.6 KiB
YAML
{{- /* Chart values: use .helm.Values when it is set, otherwise fall back to .Values. */ -}}
{{- $Values := (.helm).Values | default .Values }}
{{- /* Runbook base URL; defaultRules.runbookUrl overrides the built-in default. */ -}}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- /* Name of the cluster label; global.clusterLabel overrides the default "cluster". */ -}}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- /*
  Labels the rules aggregate by: defaultRules.additionalGroupByLabels plus the
  cluster label. The lookup is nil-safe and falls back to an empty list because
  append cannot operate on a nil list (defaultRules or the list left unset).
*/ -}}
{{- $additionalGroupByLabels := append (($Values.defaultRules).additionalGroupByLabels | default (list)) $clusterLabel }}
{{- $groupLabels := join "," $additionalGroupByLabels }}
{{- /*
  Grafana host: the first grafana.ingress host when grafana.enabled is true,
  otherwise external.grafana.host. An if-guard is used instead of ternary:
  ternary evaluates both arguments, so the index call would also run (and fail
  on missing or empty hosts) while Grafana is disabled.
*/ -}}
{{- $grafanaHost := (($Values.external).grafana).host }}
{{- if ($Values.grafana).enabled }}
  {{- $grafanaHost = index (($Values.grafana).ingress).hosts 0 }}
{{- end }}
# Rule group holding the operator alerts listed under `rules`.
name: vmoperator
# NOTE(review): Helm renders this to the string 'true'; presumably the chart
# reads it to decide whether the group is emitted - confirm with the consumer.
condition: '{{ true }}'
rules:
# LogErrors: the per-second rate of operator_log_messages_total with
# level="error" (5m window), summed per $groupLabels, stays above 0 for 15m.
- alert: LogErrors
  expr: |-
    sum(
      rate(
        operator_log_messages_total{
          level="error",job=~".*((victoria.*)|vm)-?operator"
        }[5m]
      )
    ) by ({{ $groupLabels }}) > 0
  for: 15m
  labels:
    severity: warning
    show_at: dashboard
  # The doubled, backtick-quoted braces make Helm print literal template
  # delimiters, so $externalURL, $labels and $value are left for the alerting
  # engine to expand rather than being resolved at chart render time.
  annotations:
    dashboard: '{{`{{`}} $externalURL {{`}}`}}/d/1H179hunk/victoriametrics-operator?ds={{`{{`}} $labels.dc {{`}}`}}&orgId=1&viewPanel=16'
    description: 'Operator has too many errors at logs: {{`{{`}} $value{{`}}`}}, check operator logs'
    summary: 'Too many errors at logs of operator: {{`{{`}} $value{{`}}`}}'
  # NOTE(review): renders to the string 'true'; presumably a per-rule enable
  # switch read by the chart - confirm with the consumer.
  condition: '{{ true }}'
# ReconcileErrors: the per-second rate of
# controller_runtime_reconcile_errors_total (5m window), summed per
# $groupLabels, stays above 0 for 10m.
- alert: ReconcileErrors
  expr: |-
    sum(
      rate(
        controller_runtime_reconcile_errors_total{
          job=~".*((victoria.*)|vm)-?operator"
        }[5m]
      )
    ) by ({{ $groupLabels }}) > 0
  for: 10m
  labels:
    severity: warning
    show_at: dashboard
  # The doubled, backtick-quoted braces make Helm print literal template
  # delimiters, so $externalURL, $labels and $value are left for the alerting
  # engine to expand rather than being resolved at chart render time.
  annotations:
    dashboard: '{{`{{`}} $externalURL {{`}}`}}/d/1H179hunk/victoriametrics-operator?ds={{`{{`}} $labels.dc {{`}}`}}&orgId=1&viewPanel=10'
    description: 'Operator cannot parse response from k8s api server, possible bug: {{`{{`}} $value {{`}}`}}, check operator logs'
    summary: 'Too many errors at reconcile loop of operator: {{`{{`}} $value{{`}}`}}'
  # NOTE(review): renders to the string 'true'; presumably a per-rule enable
  # switch read by the chart - confirm with the consumer.
  condition: '{{ true }}'
# HighQueueDepth: workqueue_depth for the listed controller queue names,
# summed per queue name and $groupLabels, stays above 10 for 15m.
- alert: HighQueueDepth
  expr: |-
    sum(
      workqueue_depth{
        job=~".*((victoria.*)|vm)-?operator",
        name=~"(vmagent|vmalert|vmalertmanager|vmauth|vmcluster|vmnodescrape|vmpodscrape|vmprobe|vmrule|vmservicescrape|vmsingle|vmstaticscrape)"
      }
    ) by (name,{{ $groupLabels }}) > 10
  for: 15m
  labels:
    severity: warning
    show_at: dashboard
  # The doubled, backtick-quoted braces make Helm print literal template
  # delimiters, so $externalURL, $labels and $value are left for the alerting
  # engine to expand rather than being resolved at chart render time.
  annotations:
    dashboard: '{{`{{`}} $externalURL {{`}}`}}/d/1H179hunk/victoriametrics-operator?ds={{`{{`}} $labels.dc {{`}}`}}&orgId=1&viewPanel=20'
    description: 'Operator cannot handle reconciliation load for controller: `{{`{{`}}- $labels.name {{`}}`}}`, current depth: {{`{{`}} $value {{`}}`}}'
    summary: 'Too many `{{`{{`}}- $labels.name {{`}}`}}` in queue: {{`{{`}} $value {{`}}`}}'
  # NOTE(review): renders to the string 'true'; presumably a per-rule enable
  # switch read by the chart - confirm with the consumer.
  condition: '{{ true }}'