{{- /*
  Alerting rule group "kubernetes-resources" (Helm / Go-template wrapped YAML).

  Template variables defined below:
    $Values                  - .helm.Values when present, otherwise .Values.
    $runbookUrl              - base URL for runbook links; falls back to the
                               public prometheus-operator runbooks site.
    $clusterLabel            - label name identifying the cluster
                               (default "cluster").
    $additionalGroupByLabels - user-supplied group-by labels plus $clusterLabel.
                               The lookup is nil-safe and falls back to an empty
                               list so a missing defaultRules section or an unset
                               additionalGroupByLabels cannot abort rendering.
    $groupLabels             - the list above joined with commas, for use inside
                               PromQL by (...) and on (...) clauses.

  The backtick-quoted brace pairs inside annotations emit literal template
  delimiters, so the inner expressions ($labels, $value, ...) survive Helm
  rendering; presumably they are expanded later by the alerting engine.

  NOTE(review): the condition keys render to the string 'true'; how they are
  consumed is not visible in this file.
*/ -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $additionalGroupByLabels := append (($Values.defaultRules).additionalGroupByLabels | default (list)) $clusterLabel }}
{{- $groupLabels := join "," $additionalGroupByLabels }}
condition: '{{ true }}'
name: kubernetes-resources
rules:
{{- /* CPU requests exceed allocatable CPU minus the single largest allocatable value (presumably the biggest node), while that remaining capacity is positive. */}}
- alert: KubeCPUOvercommit
  annotations:
    description: 'Cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} printf "%.2f" $value {{`}}`}} CPU shares and cannot tolerate node failure.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubecpuovercommit'
    summary: 'Cluster has overcommitted CPU resource requests.'
  condition: '{{ true }}'
  expr: |-
    sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by ({{ $groupLabels }}) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ $groupLabels }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ $groupLabels }})) > 0
    and
    (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ $groupLabels }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ $groupLabels }})) > 0
  for: 10m
  labels:
    severity: warning
{{- /* Same shape as KubeCPUOvercommit, applied to memory requests and allocatable memory. */}}
- alert: KubeMemoryOvercommit
  annotations:
    description: 'Cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}} has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubememoryovercommit'
    summary: 'Cluster has overcommitted memory resource requests.'
  condition: '{{ true }}'
  expr: |-
    sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by ({{ $groupLabels }}) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }})) > 0
    and
    (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }})) > 0
  for: 10m
  labels:
    severity: warning
{{- /* Sum of hard CPU quotas (minimum across the cpu / requests.cpu variants) is more than 1.5 times the allocatable CPU. */}}
- alert: KubeCPUQuotaOvercommit
  annotations:
    description: 'Cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}} has overcommitted CPU resource requests for Namespaces.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubecpuquotaovercommit'
    summary: 'Cluster has overcommitted CPU resource requests.'
  condition: '{{ true }}'
  expr: |-
    sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by ({{ $groupLabels }})
      /
    sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by ({{ $groupLabels }})
      > 1.5
  for: 5m
  labels:
    severity: warning
{{- /* Sum of hard memory quotas (minimum across the memory / requests.memory variants) is more than 1.5 times the allocatable memory. */}}
- alert: KubeMemoryQuotaOvercommit
  annotations:
    description: 'Cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}} has overcommitted memory resource requests for Namespaces.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubememoryquotaovercommit'
    summary: 'Cluster has overcommitted memory resource requests.'
  condition: '{{ true }}'
  expr: |-
    sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by ({{ $groupLabels }})
      /
    sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }})
      > 1.5
  for: 5m
  labels:
    severity: warning
{{- /* Used / hard quota ratio is above 0.9 and below 1; quotas with a hard value of 0 are excluded by the "> 0" filter. */}}
- alert: KubeQuotaAlmostFull
  annotations:
    description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubequotaalmostfull'
    summary: 'Namespace quota is going to be full.'
  condition: '{{ true }}'
  expr: |-
    kube_resourcequota{job="kube-state-metrics", type="used"}
      / ignoring(instance, job, type)
    (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
      > 0.9 < 1
  for: 15m
  labels:
    severity: info
{{- /* Used / hard quota ratio is exactly 1. */}}
- alert: KubeQuotaFullyUsed
  annotations:
    description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubequotafullyused'
    summary: 'Namespace quota is fully used.'
  condition: '{{ true }}'
  expr: |-
    kube_resourcequota{job="kube-state-metrics", type="used"}
      / ignoring(instance, job, type)
    (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
      == 1
  for: 15m
  labels:
    severity: info
{{- /* Used / hard quota ratio is above 1. */}}
- alert: KubeQuotaExceeded
  annotations:
    description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubequotaexceeded'
    summary: 'Namespace quota has exceeded the limits.'
  condition: '{{ true }}'
  expr: |-
    kube_resourcequota{job="kube-state-metrics", type="used"}
      / ignoring(instance, job, type)
    (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
      > 1
  for: 15m
  labels:
    severity: warning
{{- /* Increase of throttled CFS periods divided by increase of all CFS periods over 5m is above 25 / 100. */}}
- alert: CPUThrottlingHigh
  annotations:
    description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}} on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/cputhrottlinghigh'
    summary: 'Processes experience elevated CPU throttling.'
  condition: '{{ true }}'
  expr: |-
    sum(increase(container_cpu_cfs_throttled_periods_total{container!="", job="kubelet", metrics_path="/metrics/cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node)
      / on (namespace,pod,container,instance,{{ $groupLabels }}) group_left
    sum(increase(container_cpu_cfs_periods_total{job="kubelet", metrics_path="/metrics/cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node)
      > ( 25 / 100 )
  for: 15m
  labels:
    severity: info