infra/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-resources.yaml
Konstantin Averkiev c45fd1a6ac added vm stack
2025-07-08 17:29:32 +03:00

120 lines
6.7 KiB
YAML

{{- /*
  Rule group "kubernetes-resources".

  Template locals used by the rules below:
    $Values                  - chart values, taken from .helm.Values when present, else .Values
    $runbookUrl              - base URL prepended to every runbook_url annotation
    $clusterLabel            - name of the label that identifies a cluster (default "cluster")
    $additionalGroupByLabels - user-supplied group-by labels plus $clusterLabel
    $groupLabels             - the above joined with "," for use inside by (...) / on (...)

  The additionalGroupByLabels lookup is nil-safe (same parenthesised style as the
  other lookups) and falls back to an empty list, because append cannot push onto nil.
  No Grafana host is computed here: none of the rules in this group reference it, and
  indexing the first ingress host fails at render time when no host is configured.
*/ -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $additionalGroupByLabels := append (($Values.defaultRules).additionalGroupByLabels | default list) $clusterLabel }}
{{- $groupLabels := join "," $additionalGroupByLabels }}
condition: '{{ true }}'
name: kubernetes-resources
rules:
# KubeCPUOvercommit (warning, held for 10m).
# Per group: total CPU requests minus (summed allocatable CPU minus the single
# largest allocatable value) must be positive, and that remaining capacity
# (sum minus max) must itself be positive.
- alert: KubeCPUOvercommit
  condition: '{{ true }}'
  expr: |-
    sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by ({{ $groupLabels }}) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ $groupLabels }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ $groupLabels }})) > 0
    and
    (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ $groupLabels }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ $groupLabels }})) > 0
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: 'Cluster has overcommitted CPU resource requests.'
    description: 'Cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} printf "%.2f" $value {{`}}`}} CPU shares and cannot tolerate node failure.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubecpuovercommit'
# KubeMemoryOvercommit (warning, held for 10m).
# Memory counterpart of the CPU overcommit check: total memory requests exceed
# summed allocatable memory minus the single largest allocatable value, and
# that remainder (sum minus max) is positive.
- alert: KubeMemoryOvercommit
  condition: '{{ true }}'
  expr: |-
    sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by ({{ $groupLabels }}) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }})) > 0
    and
    (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }})) > 0
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: 'Cluster has overcommitted memory resource requests.'
    description: 'Cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}} has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubememoryovercommit'
# KubeCPUQuotaOvercommit (warning, held for 5m).
# Ratio of summed hard CPU quotas (cpu / requests.cpu, collapsed with
# min without(resource)) to summed allocatable CPU is above 1.5 per group.
- alert: KubeCPUQuotaOvercommit
  condition: '{{ true }}'
  expr: |-
    sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by ({{ $groupLabels }})
    /
    sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by ({{ $groupLabels }})
    > 1.5
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Cluster has overcommitted CPU resource requests.'
    description: 'Cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}} has overcommitted CPU resource requests for Namespaces.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubecpuquotaovercommit'
# KubeMemoryQuotaOvercommit (warning, held for 5m).
# Ratio of summed hard memory quotas (memory / requests.memory, collapsed with
# min without(resource)) to summed allocatable memory is above 1.5 per group.
- alert: KubeMemoryQuotaOvercommit
  condition: '{{ true }}'
  expr: |-
    sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by ({{ $groupLabels }})
    /
    sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }})
    > 1.5
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Cluster has overcommitted memory resource requests.'
    description: 'Cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}} has overcommitted memory resource requests for Namespaces.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubememoryquotaovercommit'
# KubeQuotaAlmostFull (info, held for 15m).
# used / hard quota ratio is strictly between 0.9 and 1. Hard quotas that are
# not greater than 0 are filtered out of the denominator.
- alert: KubeQuotaAlmostFull
  condition: '{{ true }}'
  expr: |-
    kube_resourcequota{job="kube-state-metrics", type="used"}
    / ignoring(instance, job, type)
    (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
    > 0.9 < 1
  for: 15m
  labels:
    severity: info
  annotations:
    summary: 'Namespace quota is going to be full.'
    description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubequotaalmostfull'
# KubeQuotaFullyUsed (info, held for 15m).
# used / hard quota ratio equals exactly 1. Hard quotas that are not greater
# than 0 are filtered out of the denominator.
- alert: KubeQuotaFullyUsed
  condition: '{{ true }}'
  expr: |-
    kube_resourcequota{job="kube-state-metrics", type="used"}
    / ignoring(instance, job, type)
    (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
    == 1
  for: 15m
  labels:
    severity: info
  annotations:
    summary: 'Namespace quota is fully used.'
    description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubequotafullyused'
# KubeQuotaExceeded (warning, held for 15m).
# used / hard quota ratio is above 1. Hard quotas that are not greater than 0
# are filtered out of the denominator.
- alert: KubeQuotaExceeded
  condition: '{{ true }}'
  expr: |-
    kube_resourcequota{job="kube-state-metrics", type="used"}
    / ignoring(instance, job, type)
    (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
    > 1
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Namespace quota has exceeded the limits.'
    description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubequotaexceeded'
# CPUThrottlingHigh (info, held for 15m).
# 5m increase of throttled CFS periods divided by the 5m increase of all CFS
# periods exceeds 25 / 100. Both sides are summed without the listed cAdvisor
# labels and matched on namespace, pod, container, instance and the group labels.
- alert: CPUThrottlingHigh
  condition: '{{ true }}'
  expr: |-
    sum(increase(container_cpu_cfs_throttled_periods_total{container!="", job="kubelet", metrics_path="/metrics/cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node)
    / on (namespace,pod,container,instance,{{ $groupLabels }}) group_left
    sum(increase(container_cpu_cfs_periods_total{job="kubelet", metrics_path="/metrics/cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node)
    > ( 25 / 100 )
  for: 15m
  labels:
    severity: info
  annotations:
    summary: 'Processes experience elevated CPU throttling.'
    description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}} on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/cputhrottlinghigh'