120 lines
6.7 KiB
YAML
120 lines
6.7 KiB
YAML
{{- /* Use `.helm.Values` when the context carries it, otherwise fall back to `.Values`. */ -}}
{{- $Values := (.helm).Values | default .Values }}
{{- /* Base URL that every rule's runbook_url annotation is built on. */ -}}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- /* Name of the label used to tell clusters apart in expressions and descriptions. */ -}}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- /*
  Grouping labels for the aggregations below: the user-supplied extra labels
  followed by the cluster label. The lookup is nil-safe and falls back to an
  empty list, because `append` fails on an unset value.
*/ -}}
{{- $additionalGroupByLabels := append (($Values.defaultRules).additionalGroupByLabels | default list) $clusterLabel }}
{{- $groupLabels := join "," $additionalGroupByLabels }}
{{- /*
  Grafana host: first ingress host when the bundled Grafana is enabled,
  otherwise the externally configured host. `ternary` evaluates both branches
  and requires a boolean, so the host list defaults to an empty list (`first`
  then yields nil instead of aborting the render) and `enabled` defaults to
  false. Not referenced by the rules visible in this file.
*/ -}}
{{- $grafanaHost := ternary (first ((($Values.grafana).ingress).hosts | default list)) (($Values.external).grafana).host (($Values.grafana).enabled | default false) }}
# Rule group "kubernetes-resources". The group-level condition is a template
# expression that renders to true.
# NOTE(review): presumably the chart reads `condition` to decide whether the
# group is emitted - confirm against the consuming template.
name: kubernetes-resources
condition: '{{ true }}'
rules:
# KubeCPUOvercommit: per grouping-label set, summed pod CPU requests exceed
# the allocatable CPU that remains after subtracting the largest single node
# (sum - max). The second clause requires that remainder to be positive.
- alert: KubeCPUOvercommit
  condition: '{{ true }}'
  expr: |-
    sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by ({{ $groupLabels }}) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ $groupLabels }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ $groupLabels }})) > 0
    and
    (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ $groupLabels }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ $groupLabels }})) > 0
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: 'Cluster has overcommitted CPU resource requests.'
    description: 'Cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} printf "%.2f" $value {{`}}`}} CPU shares and cannot tolerate node failure.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubecpuovercommit'
# KubeMemoryOvercommit: per grouping-label set, summed pod memory requests
# exceed the allocatable memory that remains after subtracting the largest
# single node (sum - max). The second clause requires that remainder to be
# positive.
- alert: KubeMemoryOvercommit
  condition: '{{ true }}'
  expr: |-
    sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by ({{ $groupLabels }}) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }})) > 0
    and
    (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }})) > 0
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: 'Cluster has overcommitted memory resource requests.'
    description: 'Cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}} has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubememoryovercommit'
# KubeCPUQuotaOvercommit: per grouping-label set, the sum of hard CPU quotas
# (resource "cpu" or "requests.cpu", minimum taken across the resource label)
# divided by the summed allocatable CPU is above 1.5.
- alert: KubeCPUQuotaOvercommit
  condition: '{{ true }}'
  expr: |-
    sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by ({{ $groupLabels }})
    /
    sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by ({{ $groupLabels }})
    > 1.5
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Cluster has overcommitted CPU resource requests.'
    description: 'Cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}} has overcommitted CPU resource requests for Namespaces.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubecpuquotaovercommit'
# KubeMemoryQuotaOvercommit: per grouping-label set, the sum of hard memory
# quotas (resource "memory" or "requests.memory", minimum taken across the
# resource label) divided by the summed allocatable memory is above 1.5.
- alert: KubeMemoryQuotaOvercommit
  condition: '{{ true }}'
  expr: |-
    sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by ({{ $groupLabels }})
    /
    sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ $groupLabels }})
    > 1.5
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Cluster has overcommitted memory resource requests.'
    description: 'Cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}} has overcommitted memory resource requests for Namespaces.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubememoryquotaovercommit'
# KubeQuotaAlmostFull: used quota divided by the (non-zero) hard quota,
# matched while ignoring the instance, job and type labels, is above 0.9 and
# below 1.
- alert: KubeQuotaAlmostFull
  condition: '{{ true }}'
  expr: |-
    kube_resourcequota{job="kube-state-metrics", type="used"}
    / ignoring(instance, job, type)
    (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
    > 0.9 < 1
  for: 15m
  labels:
    severity: info
  annotations:
    summary: 'Namespace quota is going to be full.'
    description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubequotaalmostfull'
# KubeQuotaFullyUsed: used quota divided by the (non-zero) hard quota,
# matched while ignoring the instance, job and type labels, equals exactly 1.
- alert: KubeQuotaFullyUsed
  condition: '{{ true }}'
  expr: |-
    kube_resourcequota{job="kube-state-metrics", type="used"}
    / ignoring(instance, job, type)
    (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
    == 1
  for: 15m
  labels:
    severity: info
  annotations:
    summary: 'Namespace quota is fully used.'
    description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubequotafullyused'
# KubeQuotaExceeded: used quota divided by the (non-zero) hard quota, matched
# while ignoring the instance, job and type labels, is above 1.
- alert: KubeQuotaExceeded
  condition: '{{ true }}'
  expr: |-
    kube_resourcequota{job="kube-state-metrics", type="used"}
    / ignoring(instance, job, type)
    (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
    > 1
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Namespace quota has exceeded the limits.'
    description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/kubequotaexceeded'
# CPUThrottlingHigh: the 5-minute increase in throttled CFS periods divided by
# the 5-minute increase in all CFS periods is above 25 / 100. The two sides are
# matched on namespace, pod, container, instance and the grouping labels.
- alert: CPUThrottlingHigh
  condition: '{{ true }}'
  expr: |-
    sum(increase(container_cpu_cfs_throttled_periods_total{container!="", job="kubelet", metrics_path="/metrics/cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node)
    / on (namespace,pod,container,instance,{{ $groupLabels }}) group_left
    sum(increase(container_cpu_cfs_periods_total{job="kubelet", metrics_path="/metrics/cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node)
    > ( 25 / 100 )
  for: 15m
  labels:
    severity: info
  annotations:
    summary: 'Processes experience elevated CPU throttling.'
    description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}} on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
    runbook_url: '{{ $runbookUrl }}/kubernetes/cputhrottlinghigh'