infra/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-apps.yaml
Konstantin Averkiev c45fd1a6ac added vm stack
2025-07-08 17:29:32 +03:00

280 lines
15 KiB
YAML

{{- /*
  Rule group "kubernetes-apps": template locals shared by every rule below.

    $Values                  chart values, taken from .helm.Values when that is set, otherwise from .Values
    $runbookUrl              base URL for runbook links (defaultRules.runbookUrl, with a built-in fallback)
    $clusterLabel            name of the label identifying the cluster (global.clusterLabel, default "cluster")
    $additionalGroupByLabels defaultRules.additionalGroupByLabels (an empty list when unset) plus $clusterLabel
    $groupLabels             the list above joined with "," for use inside by (...) / on (...) clauses

  defaultRules is read through the nil-safe parenthesised form, and a missing
  additionalGroupByLabels falls back to an empty list so that "append" never
  receives nil.

  The former $grafanaHost local is intentionally not declared here: nothing in
  this file reads it, and because "ternary" evaluates both branches, its
  "index ... hosts 0" argument could abort rendering when no Grafana ingress
  host is configured.
*/ -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $additionalGroupByLabels := append (($Values.defaultRules).additionalGroupByLabels | default list) $clusterLabel }}
{{- $groupLabels := join "," $additionalGroupByLabels }}
condition: '{{ true }}'
name: kubernetes-apps
rules:
# KubePodCrashLooping (warning): a container reported the waiting reason
# CrashLoopBackOff at least once in the trailing 5m window, and that has held
# for 15m. Only namespaces matching the targetNamespace regex are considered.
# NOTE(review): "condition" is presumably consumed by the chart to toggle the rule - confirm.
- alert: KubePodCrashLooping
  condition: '{{ true }}'
  expr: |-
    max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m]) >= 1
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Pod is crash looping.'
    description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff") on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubepodcrashlooping'
# KubePodNotReady (warning): a pod whose owner kind is not Job has been in phase
# Pending, Unknown or Failed for 15m. The phase series is joined to
# kube_pod_owner on namespace, pod and the group-by labels; topk(1, ...) keeps a
# single owner row per pod so the group_left join stays many-to-one.
- alert: KubePodNotReady
  condition: '{{ true }}'
  expr: |-
    sum by (namespace,pod,{{ $groupLabels }}) (
    max by (namespace,pod,{{ $groupLabels }}) (
    kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}", phase=~"Pending|Unknown|Failed"}
    ) * on (namespace,pod,{{ $groupLabels }}) group_left(owner_kind) topk by (namespace,pod,{{ $groupLabels }}) (
    1, max by (namespace,pod,owner_kind,{{ $groupLabels }}) (kube_pod_owner{owner_kind!="Job"})
    )
    ) > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Pod has been in a non-ready state for more than 15 minutes.'
    description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubepodnotready'
# KubeDeploymentGenerationMismatch (warning): the observed generation of a
# Deployment has differed from its metadata generation for 15m.
- alert: KubeDeploymentGenerationMismatch
  condition: '{{ true }}'
  expr: |-
    kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    !=
    kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Deployment generation mismatch due to possible roll-back'
    description: 'Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch'
# KubeDeploymentReplicasMismatch (warning): the desired replica count exceeds
# the available replica count while the updated-replica count has not changed
# in the last 10m; the whole condition must hold for 15m.
- alert: KubeDeploymentReplicasMismatch
  condition: '{{ true }}'
  expr: |-
    (
    kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    >
    kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    ) and (
    changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
    ==
    0
    )
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Deployment has not matched the expected number of replicas.'
    description: 'Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedeploymentreplicasmismatch'
# KubeDeploymentRolloutStuck (warning): the series for condition "Progressing"
# with status "false" has been non-zero for 15m.
- alert: KubeDeploymentRolloutStuck
  condition: '{{ true }}'
  expr: |-
    kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    != 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Deployment rollout is not progressing.'
    description: 'Rollout of deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} is not progressing for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedeploymentrolloutstuck'
# KubeStatefulSetReplicasMismatch (warning): the ready replica count differs
# from the StatefulSet replica count while the updated-replica count has not
# changed in the last 10m; the whole condition must hold for 15m.
- alert: KubeStatefulSetReplicasMismatch
  condition: '{{ true }}'
  expr: |-
    (
    kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    !=
    kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    ) and (
    changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
    ==
    0
    )
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'StatefulSet has not matched the expected number of replicas.'
    description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubestatefulsetreplicasmismatch'
# KubeStatefulSetGenerationMismatch (warning): the observed generation of a
# StatefulSet has differed from its metadata generation for 15m.
- alert: KubeStatefulSetGenerationMismatch
  condition: '{{ true }}'
  expr: |-
    kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    !=
    kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'StatefulSet generation mismatch due to possible roll-back'
    description: 'StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch'
# KubeStatefulSetUpdateNotRolledOut (warning): the current revision differs from
# the update revision, the replica count differs from the updated-replica
# count, and the updated-replica count has not changed in the last 5m; the
# whole condition must hold for 15m.
#
# Vector matching: the first operand is aggregated down to namespace,
# statefulset, job and the group-by labels. The "*" and "and" operators are
# therefore pinned to exactly that label set with on (...). Without it, the
# default one-to-one matching requires identical label sets on both sides, so
# any additional label carried by the raw series (for example scrape-target
# labels such as instance - confirm for your setup) prevents every match and
# the alert can never fire.
# NOTE(review): on (...) is one-to-one, so it assumes a single series per
# StatefulSet on the raw side (no duplicate kube-state-metrics targets) - confirm.
- alert: KubeStatefulSetUpdateNotRolledOut
  condition: '{{ true }}'
  expr: |-
    (
      max by (namespace,statefulset,job,{{ $groupLabels }}) (
        kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
        unless
        kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      )
      * on (namespace,statefulset,job,{{ $groupLabels }})
      (
        kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
        !=
        kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
      )
    ) and on (namespace,statefulset,job,{{ $groupLabels }}) (
      changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m])
      ==
      0
    )
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'StatefulSet update has not been rolled out.'
    description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubestatefulsetupdatenotrolledout'
# KubeDaemonSetRolloutStuck (warning): at least one of the following holds
#   - current scheduled count differs from the desired scheduled count
#   - misscheduled count is non-zero
#   - updated scheduled count differs from the desired scheduled count
#   - available count differs from the desired scheduled count
# and the updated scheduled count has not changed in the last 5m; the whole
# condition must hold for 15m.
- alert: KubeDaemonSetRolloutStuck
  condition: '{{ true }}'
  expr: |-
    (
    (
    kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    !=
    kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    ) or (
    kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    !=
    0
    ) or (
    kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    !=
    kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    ) or (
    kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    !=
    kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    )
    ) and (
    changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m])
    ==
    0
    )
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'DaemonSet rollout is stuck.'
    description: 'DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15m on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedaemonsetrolloutstuck'
# KubeContainerWaiting (warning): a container has reported a waiting reason
# other than CrashLoopBackOff for 1h. The waiting reason is surfaced in the
# description through the "reason" label.
- alert: KubeContainerWaiting
  condition: '{{ true }}'
  expr: |-
    kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: 'Pod container waiting longer than 1 hour'
    description: 'pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour. (reason: "{{`{{`}} $labels.reason {{`}}`}}") on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubecontainerwaiting'
# KubeDaemonSetNotScheduled (warning): desired scheduled count minus current
# scheduled count has been greater than zero for 10m. The difference is the
# alert value shown in the description.
- alert: KubeDaemonSetNotScheduled
  condition: '{{ true }}'
  expr: |-
    kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    -
    kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: 'DaemonSet pods are not scheduled.'
    description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedaemonsetnotscheduled'
# KubeDaemonSetMisScheduled (warning): the misscheduled count of a DaemonSet has
# been greater than zero for 15m. The count is the alert value shown in the
# description.
- alert: KubeDaemonSetMisScheduled
  condition: '{{ true }}'
  expr: |-
    kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'DaemonSet pods are misscheduled.'
    description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubedaemonsetmisscheduled'
# KubeJobNotCompleted (warning): a Job that still has active pods started more
# than 43200 seconds (12 hours) ago. There is no "for" clause, so the rule has
# no additional hold time once the expression matches.
- alert: KubeJobNotCompleted
  condition: '{{ true }}'
  expr: |-
    time() - max by (namespace,job_name,{{ $groupLabels }}) (kube_job_status_start_time{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    and
    kube_job_status_active{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0) > 43200
  labels:
    severity: warning
  annotations:
    summary: 'Job did not complete in time'
    description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubejobnotcompleted'
# KubeJobFailed (warning): kube_job_failed has been greater than zero for a Job
# for 15m.
- alert: KubeJobFailed
  condition: '{{ true }}'
  expr: |-
    kube_job_failed{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Job failed to complete.'
    description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubejobfailed'
# KubeHpaReplicasMismatch (warning): all of the following hold for 15m
#   - desired replicas differ from current replicas
#   - current replicas are strictly above the configured minimum
#   - current replicas are strictly below the configured maximum
#   - current replicas have not changed in the last 15m
- alert: KubeHpaReplicasMismatch
  condition: '{{ true }}'
  expr: |-
    (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    !=
    kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
    and
    (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    >
    kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
    and
    (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    <
    kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
    and
    changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[15m]) == 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'HPA has not matched desired number of replicas.'
    description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubehpareplicasmismatch'
# KubeHpaMaxedOut (warning): the current replica count of an HPA has been equal
# to its configured maximum for 15m.
- alert: KubeHpaMaxedOut
  condition: '{{ true }}'
  expr: |-
    kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    ==
    kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'HPA is running at max replicas'
    description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubehpamaxedout'
# KubePdbNotEnoughHealthyPods (warning): desired healthy pods minus current
# healthy pods of a PodDisruptionBudget has been greater than zero for 15m. The
# difference is the alert value shown in the description.
- alert: KubePdbNotEnoughHealthyPods
  condition: '{{ true }}'
  expr: |-
    (
    kube_poddisruptionbudget_status_desired_healthy{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    -
    kube_poddisruptionbudget_status_current_healthy{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
    )
    > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'PDB does not have enough healthy pods.'
    description: 'PDB {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.poddisruptionbudget {{`}}`}} expects {{`{{`}} $value {{`}}`}} more healthy pods. The desired number of healthy pods has not been met for at least 15m.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubepdbnotenoughhealthypods'