{{- /*
Prometheus rule group "general.rules" (TargetDown, Watchdog, InfoInhibitor).
Template variables:
  $runbookUrl  - base URL for runbook links (defaultRules.runbookUrl, with upstream default)
  $clusterLabel / $groupLabels - extra labels appended to every `by (...)` clause so
    expressions aggregate per-cluster (and per any defaultRules.additionalGroupByLabels)
  $grafanaHost - Grafana host from ingress when grafana.enabled, else external.grafana.host
    NOTE(review): $grafanaHost is unused in this visible portion - presumably consumed
    later in the file or kept for parity with sibling rule files; confirm before removing.
Alertmanager-template expressions inside annotations are escaped with {{`{{`}} / {{`}}`}}
so Helm passes them through verbatim.
*/ -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $additionalGroupByLabels := append $Values.defaultRules.additionalGroupByLabels $clusterLabel }}
{{- $groupLabels := join "," $additionalGroupByLabels }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: general.rules
rules:
  - alert: TargetDown
    annotations:
      description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
      runbook_url: '{{ $runbookUrl }}/general/targetdown'
      summary: 'One or more targets are unreachable.'
    condition: '{{ true }}'
    expr: 100 * (count(up == 0) BY (job,namespace,service,{{ $groupLabels }}) / count(up) BY (job,namespace,service,{{ $groupLabels }})) > 10
    for: 10m
    labels:
      severity: warning
  - alert: Watchdog
    annotations:
      description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.

        This alert is always firing, therefore it should always be firing in Alertmanager

        and always fire against a receiver. There are integrations with various notification

        mechanisms that send a notification when this alert is not firing. For example the

        "DeadMansSnitch" integration in PagerDuty.

        '
      runbook_url: '{{ $runbookUrl }}/general/watchdog'
      summary: 'An alert that should always be firing to certify that Alertmanager is working properly.'
    condition: '{{ true }}'
    expr: vector(1)
    labels:
      severity: none
  - alert: InfoInhibitor
    annotations:
      description: 'This is an alert that is used to inhibit info alerts.

        By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with

        other alerts.

        This alert fires whenever there''s a severity="info" alert, and stops firing when another alert with a

        severity of ''warning'' or ''critical'' starts firing on the same namespace.

        This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".

        '
      runbook_url: '{{ $runbookUrl }}/general/infoinhibitor'
      summary: 'Info-level alert inhibition.'
    condition: '{{ true }}'
    expr: ALERTS{severity = "info"} == 1 unless on (namespace,{{ $groupLabels }}) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
    labels:
      severity: none