280 lines
15 KiB
YAML
280 lines
15 KiB
YAML
{{- /* Resolve chart values: prefer the wrapped context (.helm) when present, otherwise use the root context. */}}
{{- $Values := (.helm).Values | default .Values }}
{{- /* Base URL that every rule's runbook_url annotation is appended to. */}}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- /* Name of the label that identifies the cluster; used in descriptions and in aggregation clauses. */}}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- /* Nil-safe lookup with an empty-list fallback: append fails on a nil list, and a plain field access fails when defaultRules is unset. */}}
{{- $additionalGroupByLabels := append (($Values.defaultRules).additionalGroupByLabels | default list) $clusterLabel }}
{{- /* Comma-separated label list spliced into the "by (...)" / "on (...)" clauses of the expressions below. */}}
{{- $groupLabels := join "," $additionalGroupByLabels }}
{{- /* NOTE(review): $grafanaHost is not referenced by any rule in this group. Both ternary branches are evaluated eagerly, so index fails if grafana.ingress.hosts is missing or empty, even when grafana is disabled. Confirm the chart always supplies it. */}}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
# Rule group "kubernetes-apps": workload-level alerts built on kube-state-metrics series.
# NOTE(review): "condition" is presumably read by the chart to decide whether the
# group is rendered; confirm against the chart templates.
condition: '{{ true }}'
name: kubernetes-apps
rules:
# Warns when a container reported the CrashLoopBackOff waiting reason at any
# point in a 5m window (max_over_time >= 1) and that stays true for 15m.
- alert: KubePodCrashLooping
  condition: '{{ true }}'
  expr: |-
    max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m]) >= 1
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Pod is crash looping.'
    description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff") on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubepodcrashlooping'
# Warns when a pod whose owner is not a Job stays in the Pending, Unknown or
# Failed phase for 15m. The phase series is joined to kube_pod_owner on
# namespace, pod and the group-by labels; topk(1, ...) keeps one owner per pod.
- alert: KubePodNotReady
  condition: '{{ true }}'
  expr: |-
    sum by (namespace,pod,{{ $groupLabels }}) (
      max by (namespace,pod,{{ $groupLabels }}) (
        kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}", phase=~"Pending|Unknown|Failed"}
      ) * on (namespace,pod,{{ $groupLabels }}) group_left(owner_kind) topk by (namespace,pod,{{ $groupLabels }}) (
        1, max by (namespace,pod,owner_kind,{{ $groupLabels }}) (kube_pod_owner{owner_kind!="Job"})
      )
    ) > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Pod has been in a non-ready state for more than 15 minutes.'
    description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubepodnotready'
# Warns when a Deployment's observed generation differs from its metadata
# generation for 15m.
- alert: KubeDeploymentGenerationMismatch
  condition: '{{ true }}'
  expr: |-
    kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      !=
    kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Deployment generation mismatch due to possible roll-back'
    description: 'Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch'
# Warns when a Deployment's spec replicas exceed its available replicas while
# the updated-replica count has not changed in the last 10m, held for 15m.
- alert: KubeDeploymentReplicasMismatch
  condition: '{{ true }}'
  expr: |-
    (
      kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
        >
      kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    ) and (
      changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
        ==
      0
    )
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Deployment has not matched the expected number of replicas.'
    description: 'Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedeploymentreplicasmismatch'
# Warns when the series for a Deployment's Progressing condition with
# status="false" is non-zero for 15m.
- alert: KubeDeploymentRolloutStuck
  condition: '{{ true }}'
  expr: |-
    kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    != 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Deployment rollout is not progressing.'
    description: 'Rollout of deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} is not progressing for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedeploymentrolloutstuck'
# Warns when a StatefulSet's ready replicas differ from its replica count while
# the updated-replica count has not changed in the last 10m, held for 15m.
- alert: KubeStatefulSetReplicasMismatch
  condition: '{{ true }}'
  expr: |-
    (
      kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
        !=
      kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    ) and (
      changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
        ==
      0
    )
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'StatefulSet has not matched the expected number of replicas.'
    description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubestatefulsetreplicasmismatch'
# Warns when a StatefulSet's observed generation differs from its metadata
# generation for 15m.
- alert: KubeStatefulSetGenerationMismatch
  condition: '{{ true }}'
  expr: |-
    kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      !=
    kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'StatefulSet generation mismatch due to possible roll-back'
    description: 'StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch'
# Warns when a StatefulSet has a current-revision series with no matching
# update-revision series ("unless"), its replica count differs from the
# updated-replica count, and the updated-replica count has not changed in the
# last 5m; held for 15m.
- alert: KubeStatefulSetUpdateNotRolledOut
  condition: '{{ true }}'
  expr: |-
    (
      max by (namespace,statefulset,job,{{ $groupLabels }}) (
        kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
          unless
        kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      )
        *
      (
        kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
          !=
        kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      )
    ) and (
      changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m])
        ==
      0
    )
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'StatefulSet update has not been rolled out.'
    description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubestatefulsetupdatenotrolledout'
# Warns when any of the following holds for a DaemonSet and its
# updated-number-scheduled value has not changed in the last 5m, for 15m:
#   - current scheduled != desired scheduled
#   - misscheduled != 0
#   - updated scheduled != desired scheduled
#   - available != desired scheduled
- alert: KubeDaemonSetRolloutStuck
  condition: '{{ true }}'
  expr: |-
    (
      (
        kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
          !=
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      ) or (
        kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
          !=
        0
      ) or (
        kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
          !=
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      ) or (
        kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
          !=
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      )
    ) and (
      changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m])
        ==
      0
    )
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'DaemonSet rollout is stuck.'
    description: 'DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15m on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedaemonsetrolloutstuck'
# Warns when a container has been waiting for any reason other than
# CrashLoopBackOff for 1h.
- alert: KubeContainerWaiting
  annotations:
    # The reason parenthetical and the cluster clause belong to one sentence, so
    # no full stop follows "1 hour"; label references are spaced uniformly.
    description: 'pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container {{`}}`}} has been in waiting state for longer than 1 hour (reason: "{{`{{`}} $labels.reason {{`}}`}}") on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubecontainerwaiting'
    summary: 'Pod container waiting longer than 1 hour'
  condition: '{{ true }}'
  expr: kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
  for: 1h
  labels:
    severity: warning
# Warns when a DaemonSet's desired scheduled count minus its current scheduled
# count is above zero for 10m.
- alert: KubeDaemonSetNotScheduled
  condition: '{{ true }}'
  expr: |-
    kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      -
    kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: 'DaemonSet pods are not scheduled.'
    description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedaemonsetnotscheduled'
# Warns when a DaemonSet reports a misscheduled count above zero for 15m.
- alert: KubeDaemonSetMisScheduled
  condition: '{{ true }}'
  expr: |-
    kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'DaemonSet pods are misscheduled.'
    description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedaemonsetmisscheduled'
# Warns when a Job that still has active pods started more than 43200 seconds
# ago. This rule has no "for" clause; the description renders the same 43200
# through humanizeDuration.
- alert: KubeJobNotCompleted
  condition: '{{ true }}'
  expr: |-
    time() - max by (namespace,job_name,{{ $groupLabels }}) (kube_job_status_start_time{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      and
    kube_job_status_active{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0) > 43200
  labels:
    severity: warning
  annotations:
    summary: 'Job did not complete in time'
    description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubejobnotcompleted'
# Warns when kube_job_failed is above zero for a Job for 15m.
- alert: KubeJobFailed
  condition: '{{ true }}'
  expr: |-
    kube_job_failed{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Job failed to complete.'
    description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubejobfailed'
# Warns when, for 15m, an HPA's desired replicas differ from its current
# replicas, the current replicas lie strictly between the configured min and
# max, and the current replica count has not changed in the last 15m.
- alert: KubeHpaReplicasMismatch
  condition: '{{ true }}'
  expr: |-
    (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      !=
    kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
      and
    (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      >
    kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
      and
    (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      <
    kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
      and
    changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[15m]) == 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'HPA has not matched desired number of replicas.'
    description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubehpareplicasmismatch'
# Warns when an HPA's current replicas equal its configured max replicas for 15m.
- alert: KubeHpaMaxedOut
  condition: '{{ true }}'
  expr: |-
    kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      ==
    kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'HPA is running at max replicas'
    description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubehpamaxedout'
# Warns when a PodDisruptionBudget's desired healthy count minus its current
# healthy count is above zero for 15m; the difference is the alert value shown
# in the description.
- alert: KubePdbNotEnoughHealthyPods
  condition: '{{ true }}'
  expr: |-
    (
      kube_poddisruptionbudget_status_desired_healthy{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      -
      kube_poddisruptionbudget_status_current_healthy{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    )
    > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'PDB does not have enough healthy pods.'
    description: 'PDB {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.poddisruptionbudget {{`}}`}} expects {{`{{`}} $value {{`}}`}} more healthy pods. The desired number of healthy pods has not been met for at least 15m.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubepdbnotenoughhealthypods'