{{- /*
  Alerting rule group "kubernetes-apps".

  Template inputs read by this file:
    - .helm.Values when present, otherwise .Values          -> $Values
    - $Values.defaultRules.runbookUrl                        -> $runbookUrl
      (falls back to https://runbooks.prometheus-operator.dev/runbooks)
    - $Values.global.clusterLabel                            -> $clusterLabel
      (falls back to "cluster")
    - $Values.defaultRules.additionalGroupByLabels           -> $additionalGroupByLabels
      (the cluster label is appended; a missing or null value is treated as
      an empty list so that rendering does not abort)
    - .targetNamespace is inserted as the regex of every namespace matcher.

  $groupLabels is the comma-joined $additionalGroupByLabels and is spliced into
  the "by (...)" / "on (...)" clauses of the aggregating expressions.

  Alert annotations must reach the alerting engine with their own template
  delimiters intact, so they are emitted through backtick-quoted raw strings.

  NOTE(review): $grafanaHost is not referenced by any rule visible in this
  group; it is left untouched in case it is used further down or expected by
  tooling. Both branches of its ternary are evaluated eagerly, so the
  "index ... 0" lookup presumably requires grafana.ingress.hosts to be
  non-empty even when grafana is disabled - confirm against the chart values.
*/}}
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $additionalGroupByLabels := append (($Values.defaultRules).additionalGroupByLabels | default (list)) $clusterLabel }}
{{- $groupLabels := join "," $additionalGroupByLabels }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: kubernetes-apps
rules:
# A container reported waiting reason CrashLoopBackOff at some point within the
# last 5 minutes; must hold for 15m.
- alert: KubePodCrashLooping
  annotations:
    description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff") on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubepodcrashlooping'
    summary: 'Pod is crash looping.'
  condition: '{{ true }}'
  expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m]) >= 1
  for: 15m
  labels:
    severity: warning
# Pods in phase Pending, Unknown or Failed, joined with their owner series;
# owners of kind Job are excluded by the kube_pod_owner selector.
- alert: KubePodNotReady
  annotations:
    description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubepodnotready'
    summary: 'Pod has been in a non-ready state for more than 15 minutes.'
  condition: '{{ true }}'
  expr: |-
    sum by (namespace,pod,{{ $groupLabels }}) (
      max by (namespace,pod,{{ $groupLabels }}) (
        kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}", phase=~"Pending|Unknown|Failed"}
      ) * on (namespace,pod,{{ $groupLabels }}) group_left(owner_kind) topk by (namespace,pod,{{ $groupLabels }}) (
        1, max by (namespace,pod,owner_kind,{{ $groupLabels }}) (kube_pod_owner{owner_kind!="Job"})
      )
    ) > 0
  for: 15m
  labels:
    severity: warning
# Observed generation differs from metadata generation for 15m.
- alert: KubeDeploymentGenerationMismatch
  annotations:
    description: 'Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch'
    summary: 'Deployment generation mismatch due to possible roll-back'
  condition: '{{ true }}'
  expr: |-
    kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      !=
    kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
  for: 15m
  labels:
    severity: warning
# Spec replicas exceed available replicas while the updated-replica count has
# not changed during the last 10 minutes.
- alert: KubeDeploymentReplicasMismatch
  annotations:
    description: 'Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedeploymentreplicasmismatch'
    summary: 'Deployment has not matched the expected number of replicas.'
  condition: '{{ true }}'
  expr: |-
    (
      kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
        >
      kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    ) and (
      changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
        ==
      0
    )
  for: 15m
  labels:
    severity: warning
# The series for condition "Progressing" with status "false" is non-zero.
- alert: KubeDeploymentRolloutStuck
  annotations:
    description: 'Rollout of deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} is not progressing for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedeploymentrolloutstuck'
    summary: 'Deployment rollout is not progressing.'
  condition: '{{ true }}'
  expr: |-
    kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    != 0
  for: 15m
  labels:
    severity: warning
# Ready replicas differ from desired replicas while the updated-replica count
# has not changed during the last 10 minutes.
- alert: KubeStatefulSetReplicasMismatch
  annotations:
    description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubestatefulsetreplicasmismatch'
    summary: 'StatefulSet has not matched the expected number of replicas.'
  condition: '{{ true }}'
  expr: |-
    (
      kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
        !=
      kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    ) and (
      changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
        ==
      0
    )
  for: 15m
  labels:
    severity: warning
# Observed generation differs from metadata generation for 15m.
- alert: KubeStatefulSetGenerationMismatch
  annotations:
    description: 'StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch'
    summary: 'StatefulSet generation mismatch due to possible roll-back'
  condition: '{{ true }}'
  expr: |-
    kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      !=
    kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
  for: 15m
  labels:
    severity: warning
# A current-revision series has no matching update-revision series, desired
# replicas differ from updated replicas, and the updated-replica count has not
# changed during the last 5 minutes.
- alert: KubeStatefulSetUpdateNotRolledOut
  annotations:
    description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubestatefulsetupdatenotrolledout'
    summary: 'StatefulSet update has not been rolled out.'
  condition: '{{ true }}'
  expr: |-
    (
      max by (namespace,statefulset,job,{{ $groupLabels }}) (
        kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
          unless
        kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      )
        *
      (
        kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
          !=
        kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      )
    ) and (
      changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m])
        ==
      0
    )
  for: 15m
  labels:
    severity: warning
# Any of: current != desired scheduled, misscheduled != 0, updated != desired,
# available != desired - combined with no change of the updated-scheduled count
# during the last 5 minutes.
- alert: KubeDaemonSetRolloutStuck
  annotations:
    description: 'DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15m on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedaemonsetrolloutstuck'
    summary: 'DaemonSet rollout is stuck.'
  condition: '{{ true }}'
  expr: |-
    (
      (
        kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
          !=
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      ) or (
        kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
          !=
        0
      ) or (
        kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
          !=
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      ) or (
        kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
          !=
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      )
    ) and (
      changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m])
        ==
      0
    )
  for: 15m
  labels:
    severity: warning
# A container reports a waiting reason other than CrashLoopBackOff for 1h.
- alert: KubeContainerWaiting
  annotations:
    description: 'pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour. (reason: "{{`{{`}} $labels.reason {{`}}`}}") on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubecontainerwaiting'
    summary: 'Pod container waiting longer than 1 hour'
  condition: '{{ true }}'
  expr: kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
  for: 1h
  labels:
    severity: warning
# Desired minus currently scheduled pods is positive for 10m.
- alert: KubeDaemonSetNotScheduled
  annotations:
    description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedaemonsetnotscheduled'
    summary: 'DaemonSet pods are not scheduled.'
  condition: '{{ true }}'
  expr: |-
    kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      -
    kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
  for: 10m
  labels:
    severity: warning
# The misscheduled-pod count is positive for 15m.
- alert: KubeDaemonSetMisScheduled
  annotations:
    description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedaemonsetmisscheduled'
    summary: 'DaemonSet pods are misscheduled.'
  condition: '{{ true }}'
  expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
  for: 15m
  labels:
    severity: warning
# A job with active pods started more than 43200 seconds ago. This rule has no
# "for" clause, so it fires as soon as the expression matches.
- alert: KubeJobNotCompleted
  annotations:
    description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubejobnotcompleted'
    summary: 'Job did not complete in time'
  condition: '{{ true }}'
  expr: |-
    time() - max by (namespace,job_name,{{ $groupLabels }}) (kube_job_status_start_time{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      and
    kube_job_status_active{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0) > 43200
  labels:
    severity: warning
# kube_job_failed is positive for 15m.
- alert: KubeJobFailed
  annotations:
    description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubejobfailed'
    summary: 'Job failed to complete.'
  condition: '{{ true }}'
  expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
  for: 15m
  labels:
    severity: warning
# Desired != current replicas while current is strictly between the min and
# max bounds and has not changed during the last 15 minutes.
- alert: KubeHpaReplicasMismatch
  annotations:
    description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubehpareplicasmismatch'
    summary: 'HPA has not matched desired number of replicas.'
  condition: '{{ true }}'
  expr: |-
    (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      !=
    kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
      and
    (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      >
    kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
      and
    (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      <
    kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
      and
    changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[15m]) == 0
  for: 15m
  labels:
    severity: warning
# Current replicas equal the configured maximum for 15m.
- alert: KubeHpaMaxedOut
  annotations:
    description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubehpamaxedout'
    summary: 'HPA is running at max replicas'
  condition: '{{ true }}'
  expr: |-
    kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      ==
    kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
  for: 15m
  labels:
    severity: warning
# Desired healthy minus current healthy pods is positive for 15m; the alert
# value is that difference.
- alert: KubePdbNotEnoughHealthyPods
  annotations:
    description: 'PDB {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.poddisruptionbudget {{`}}`}} expects {{`{{`}} $value {{`}}`}} more healthy pods. The desired number of healthy pods has not been met for at least 15m.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubepdbnotenoughhealthypods'
    summary: 'PDB does not have enough healthy pods.'
  condition: '{{ true }}'
  expr: |-
    (
      kube_poddisruptionbudget_status_desired_healthy{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      -
      kube_poddisruptionbudget_status_current_healthy{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    )
    > 0
  for: 15m
  labels:
    severity: warning