diff --git a/WIP-CHANGELOG.md b/WIP-CHANGELOG.md
index e69de29bb..775a26615 100644
--- a/WIP-CHANGELOG.md
+++ b/WIP-CHANGELOG.md
@@ -0,0 +1,13 @@
+### Release notes
+
+### Added
+
+### Changed
+
+- Changed the alert `KubeContainerOOMKilled` to require at least 2 container restarts within 30 minutes (previously 1 restart within 10 minutes).
+
+### Fixed
+
+### Updated
+
+### Removed
diff --git a/helmfile/charts/prometheus-alerts/templates/alerts/kubernetes-apps.yaml b/helmfile/charts/prometheus-alerts/templates/alerts/kubernetes-apps.yaml
index 1cca7adfd..6752fd397 100644
--- a/helmfile/charts/prometheus-alerts/templates/alerts/kubernetes-apps.yaml
+++ b/helmfile/charts/prometheus-alerts/templates/alerts/kubernetes-apps.yaml
@@ -53,7 +53,7 @@ spec:
       annotations:
-        description: Container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} from {{`{{`}} $labels.cluster {{`}}`}} has been OOMKilled {{`{{`}} $value {{`}}`}} times in the last 10 minutes.
+        description: Container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} from {{`{{`}} $labels.cluster {{`}}`}} has been OOMKilled {{`{{`}} $value {{`}}`}} times in the last 30 minutes.
         summary: Kubernetes container OOMKilled.
-      expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
+      expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 30m >= 2) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[30m]) == 1
       for: 0m
       labels:
         severity: warning