Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: cpu resource limits removed during boost #59

Merged
merged 1 commit into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Note: this is not an officially supported Google product.
* [[Boost resources] fixed target](#boost-resources-fixed-target)
* [[Boost duration] fixed time](#boost-duration-fixed-time)
* [[Boost duration] POD condition](#boost-duration-pod-condition)
* [Configuration](#configuration)
* [License](#license)

## Description
Expand All @@ -35,7 +36,9 @@ The Kube Startup CPU Boost leverages [In-place Resource Resize for Kubernetes Po
feature introduced in Kubernetes 1.27. It allows reverting the workload's CPU resource requests and limits
back to their original values without the need to recreate the Pods.

The increase of resources is achieved by Mutating Admission Webhook.
The increase of resources is achieved by Mutating Admission Webhook. By default, the webhook also
removes CPU resource limits if present. The original resource values are restored by the operator after a given
period of time or when the POD condition is met.

## Installation

Expand Down Expand Up @@ -203,6 +206,23 @@ Define the POD condition, the resource boost effect will last until the conditio
status: "True"
```

## Configuration

Kube Startup CPU Boost operator can be configured with environment variables.

| Variable | Type | Default | Description |
| --- | --- | --- | --- |
| `POD_NAMESPACE` | `string` | `kube-startup-cpu-boost-system` | Kube Startup CPU Boost operator namespace |
| `MGR_CHECK_INTERVAL` | `int` | `5` | Duration in seconds between boost manager checks for time based boost duration policy |
| `LEADER_ELECTION` | `bool` | `false` | Enables leader election for controller manager |
| `METRICS_PROBE_BIND_ADDR` | `string` | `:8080` | Address the metrics endpoint binds to |
| `HEALTH_PROBE_BIND_ADDR` | `string` | `:8081` | Address the health probe endpoint binds to |
| `SECURE_METRICS` | `bool` | `false` | Determines if the metrics endpoint is served securely |
| `ZAP_LOG_LEVEL` | `int` | `0` | Log level for ZAP logger |
| `ZAP_DEVELOPMENT` | `bool` | `false` | Enables development mode for ZAP logger |
| `HTTP2` | `bool` | `false` | Determines if the HTTP/2 protocol is used for webhook and metrics servers |
| `REMOVE_LIMITS` | `bool` | `true` | Enables operator to remove container CPU limits during the boost time |

## License

[Apache License 2.0](LICENSE)
23 changes: 3 additions & 20 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,6 @@ func main() {
HealthProbeBindAddress: cfg.HealthProbeBindAddr,
LeaderElection: cfg.LeaderElection,
LeaderElectionID: leaderElectionID,
// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
// when the Manager ends. This requires the binary to immediately end when the
// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly
// speeds up voluntary leader transitions as the new leader don't have to wait
// LeaseDuration time first.
//
// In the default scaffold provided, the program ends immediately after
// the manager stops, so would be fine to enable this option. However,
// if you are doing or is intended to do any operation such as perform cleanups
// after the manager stops then its usage might be unsafe.
// LeaderElectionReleaseOnCancel: true,
})
if err != nil {
setupLog.Error(err, "unable to start manager")
Expand All @@ -111,7 +100,7 @@ func main() {
}

boostMgr := boost.NewManager(mgr.GetClient())
go setupControllers(mgr, boostMgr, certsReady)
go setupControllers(mgr, boostMgr, cfg, certsReady)

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
Expand All @@ -131,7 +120,7 @@ func main() {
}
}

func setupControllers(mgr ctrl.Manager, boostMgr boost.Manager, certsReady chan struct{}) {
func setupControllers(mgr ctrl.Manager, boostMgr boost.Manager, cfg *config.Config, certsReady chan struct{}) {
setupLog.Info("Waiting for certificate generation to complete")
<-certsReady
setupLog.Info("Certificate generation has completed")
Expand All @@ -140,7 +129,7 @@ func setupControllers(mgr ctrl.Manager, boostMgr boost.Manager, certsReady chan
setupLog.Error(err, "Unable to create webhook", "webhook", failedWebhook)
os.Exit(1)
}
cpuBoostWebHook := boostWebhook.NewPodCPUBoostWebHook(boostMgr, scheme)
cpuBoostWebHook := boostWebhook.NewPodCPUBoostWebHook(boostMgr, scheme, cfg.RemoveLimits)
mgr.GetWebhookServer().Register("/mutate-v1-pod", cpuBoostWebHook)
boostCtrl := &controller.StartupCPUBoostReconciler{
Client: mgr.GetClient(),
Expand All @@ -153,11 +142,5 @@ func setupControllers(mgr ctrl.Manager, boostMgr boost.Manager, certsReady chan
setupLog.Error(err, "unable to create controller", "controller", "StartupCPUBoost")
os.Exit(1)
}
/*
if err = (&autoscalingv1alpha1.StartupCPUBoost{}).SetupWebhookWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create webhook", "webhook", "StartupCPUBoost")
os.Exit(1)
}
*/
//+kubebuilder:scaffold:builder
}
8 changes: 6 additions & 2 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ const (
ZapLogLevelDefault = 0 // zapcore.InfoLevel
ZapDevelopmentDefault = false
HTTP2Default = false
RemoveLimitsDefault = true
)

// ConfigProvider provides the Kube Startup CPU Boost configuration
Expand All @@ -42,9 +43,9 @@ type Config struct {
// LeaderElection enables leader election for controller manager
// Enabling this will ensure there is only one active controller manager
LeaderElection bool
// MetricsProbeBindAddr is the address the metric endpoint binds to
// MetricsProbeBindAddr is the address the metrics endpoint binds to
MetricsProbeBindAddr string
// HeathProbeBindAddr is the address the probe endpoint binds to
// HealthProbeBindAddr is the address the health probe endpoint binds to
HealthProbeBindAddr string
// SecureMetrics determines if the metrics endpoint is served securely
SecureMetrics bool
Expand All @@ -54,6 +55,8 @@ type Config struct {
ZapDevelopment bool
// HTTP2 determines if the HTTP/2 protocol is used for webhook and metrics servers
HTTP2 bool
// RemoveLimits determines if CPU resource limits should be removed during boost
RemoveLimits bool
}

// LoadDefaults loads the default configuration values
Expand All @@ -67,4 +70,5 @@ func (c *Config) LoadDefaults() {
c.ZapLogLevel = ZapLogLevelDefault
c.ZapDevelopment = ZapDevelopmentDefault
c.HTTP2 = HTTP2Default
c.RemoveLimits = RemoveLimitsDefault
}
7 changes: 5 additions & 2 deletions internal/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,14 @@ var _ = Describe("Config", func() {
It("has valid ZAP log level", func() {
Expect(cfg.ZapLogLevel).To(Equal(config.ZapLogLevelDefault))
})
It("has valid ZAP development ", func() {
It("has valid ZAP development", func() {
Expect(cfg.ZapDevelopment).To(Equal(config.ZapDevelopmentDefault))
})
It("has valid HTTP2 ", func() {
It("has valid HTTP2", func() {
Expect(cfg.HTTP2).To(Equal(config.HTTP2Default))
})
It("has valid RemoveLimits", func() {
Expect(cfg.RemoveLimits).To(Equal(config.RemoveLimitsDefault))
})
})
})
15 changes: 14 additions & 1 deletion internal/config/env_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ const (
ZapLogLevelEnvVar = "ZAP_LOG_LEVEL"
ZapDevelopmentEnvVar = "ZAP_DEVELOPMENT"
HTTP2EnvVar = "HTTP2"
RemoveLimitsEnvVar = "REMOVE_LIMITS"
)

type LookupEnvFunc func(key string) (string, bool)
Expand Down Expand Up @@ -57,6 +58,7 @@ func (p *EnvConfigProvider) LoadConfig() (*Config, error) {
errs = p.loadZapLogLevel(&config, errs)
errs = p.loadZapDevelopment(&config, errs)
errs = p.loadHTTP2(&config, errs)
errs = p.loadRemoveLimits(&config, errs)
var err error
if len(errs) > 0 {
err = errors.Join(errs...)
Expand Down Expand Up @@ -142,7 +144,18 @@ func (p *EnvConfigProvider) loadHTTP2(config *Config, curErrs []error) (errs []e
boolVal, err := strconv.ParseBool(v)
config.HTTP2 = boolVal
if err != nil {
errs = append(curErrs, fmt.Errorf("%s value is not a bool: %s", LeaderElectionEnvVar, err))
errs = append(curErrs, fmt.Errorf("%s value is not a bool: %s", HTTP2EnvVar, err))
}
}
return
}

// loadRemoveLimits reads the REMOVE_LIMITS environment variable, when set,
// into config.RemoveLimits. A value that strconv.ParseBool cannot parse is
// reported by appending an error to the returned slice.
// NOTE(review): on the success path (or when the variable is unset) the named
// return `errs` stays nil instead of carrying curErrs forward; LoadConfig
// reassigns its error slice from this return (`errs = p.loadRemoveLimits(...)`),
// so earlier accumulated errors could be dropped — confirm this matches the
// accumulation pattern used by the sibling load* helpers.
func (p *EnvConfigProvider) loadRemoveLimits(config *Config, curErrs []error) (errs []error) {
	if v, ok := p.lookupFunc(RemoveLimitsEnvVar); ok {
		boolVal, err := strconv.ParseBool(v)
		// Assigned unconditionally: per strconv docs, boolVal is false when
		// parsing fails, so RemoveLimits holds the zero value on error.
		config.RemoveLimits = boolVal
		if err != nil {
			errs = append(curErrs, fmt.Errorf("%s value is not a bool: %s", RemoveLimitsEnvVar, err))
		}
	}
	return
Expand Down
8 changes: 8 additions & 0 deletions internal/config/env_provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,5 +131,13 @@ var _ = Describe("EnvProvider", func() {
Expect(cfg.HTTP2).To(BeTrue())
})
})
When("removeLimits variable is set", func() {
BeforeEach(func() {
lookupFuncMap[config.RemoveLimitsEnvVar] = "false"
})
It("has valid remove limits", func() {
Expect(cfg.RemoveLimits).To(BeFalse())
})
})
})
})
15 changes: 10 additions & 5 deletions internal/webhook/podcpuboost_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,17 @@ import (
// +kubebuilder:webhook:path=/mutate-v1-pod,mutating=true,failurePolicy=ignore,sideEffects=None,timeoutSeconds=2,groups="",resources=pods,verbs=create,versions=v1,name=cpuboost.autoscaling.x-k8s.io,admissionReviewVersions=v1

type podCPUBoostHandler struct {
decoder admission.Decoder
manager boost.Manager
decoder admission.Decoder
manager boost.Manager
removeLimits bool
}

func NewPodCPUBoostWebHook(mgr boost.Manager, scheme *runtime.Scheme) *webhook.Admission {
func NewPodCPUBoostWebHook(mgr boost.Manager, scheme *runtime.Scheme, removeLimits bool) *webhook.Admission {
return &webhook.Admission{
Handler: &podCPUBoostHandler{
manager: mgr,
decoder: admission.NewDecoder(scheme),
manager: mgr,
decoder: admission.NewDecoder(scheme),
removeLimits: removeLimits,
},
}
}
Expand Down Expand Up @@ -89,6 +91,9 @@ func (h *podCPUBoostHandler) boostContainerResources(ctx context.Context, b boos
"newCpuRequests", resources.Requests.Cpu().String(),
"newCpuLimits", resources.Limits.Cpu().String(),
)
if h.removeLimits {
delete(resources.Limits, corev1.ResourceCPU)
}
pod.Spec.Containers[i].Resources = *resources
log.Info("pod resources increased")
}
Expand Down
50 changes: 42 additions & 8 deletions internal/webhook/podcpuboost_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,12 @@ import (
var _ = Describe("Pod CPU Boost Webhook", func() {
Describe("Handles admission requests", func() {
var (
mockCtrl *gomock.Controller
manager *mock.MockManager
managerCall *gomock.Call
pod *corev1.Pod
response webhook.AdmissionResponse
mockCtrl *gomock.Controller
manager *mock.MockManager
managerCall *gomock.Call
pod *corev1.Pod
response webhook.AdmissionResponse
removeLimits bool
)
BeforeEach(func() {
pod = podTemplate.DeepCopy()
Expand All @@ -72,7 +73,7 @@ var _ = Describe("Pod CPU Boost Webhook", func() {
},
},
}
hook := bwebhook.NewPodCPUBoostWebHook(manager, scheme.Scheme)
hook := bwebhook.NewPodCPUBoostWebHook(manager, scheme.Scheme, removeLimits)
response = hook.Handle(context.TODO(), admissionReq)
})
When("there is no matching Startup CPU Boost", func() {
Expand Down Expand Up @@ -130,6 +131,7 @@ var _ = Describe("Pod CPU Boost Webhook", func() {
resPolicyCallOne = boost.EXPECT().ResourcePolicy(gomock.Eq(containerOneName)).Return(resPolicy, true)
resPolicyCallTwo = boost.EXPECT().ResourcePolicy(gomock.Eq(containerTwoName)).Return(nil, false)
managerCall.Return(boost, true)
removeLimits = true
})
It("retrieves resource policy for containers", func() {
resPolicyCallOne.Times(1)
Expand Down Expand Up @@ -162,10 +164,28 @@ var _ = Describe("Pod CPU Boost Webhook", func() {
patch := containerResourcePatch(pod, resPolicy, "requests", 0)
Expect(response.Patches).To(ContainElement(patch))
})
It("returns admission with container-one limits patch", func() {
patch := containerResourcePatch(pod, resPolicy, "limits", 0)
It("returns admission with container-one remove limits patch", func() {
patch := containerRemoveRequirementPatch("limits", 0)
Expect(response.Patches).To(ContainElement(patch))
})
When("container has memory limits set", func() {
BeforeEach(func() {
pod.Spec.Containers[0].Resources.Limits[corev1.ResourceMemory] = apiResource.MustParse("100Mi")
})
It("returns admission with container-one remove CPU limits patch", func() {
patch := containerRemoveCPURequirementPatch("limits", 0)
Expect(response.Patches).To(ContainElement(patch))
})
})
When("removeLimits is not set", func() {
BeforeEach(func() {
removeLimits = false
})
It("returns admission with container-one limits patch", func() {
patch := containerResourcePatch(pod, resPolicy, "limits", 0)
Expect(response.Patches).To(ContainElement(patch))
})
})
When("container has no request and no limits set", func() {
BeforeEach(func() {
pod.Spec.Containers[0].Resources.Requests = nil
Expand Down Expand Up @@ -294,3 +314,17 @@ func containerResourcePatch(pod *corev1.Pod, policy resource.ContainerPolicy, re
Value: newQuantity.String(),
}
}

// containerRemoveCPURequirementPatch returns the JSON patch operation that
// removes only the "cpu" entry from the given resource requirement section
// ("requests" or "limits") of the container at containerIdx.
func containerRemoveCPURequirementPatch(requirement string, containerIdx int) jsonpatch.Operation {
	cpuPath := fmt.Sprintf("/spec/containers/%d/resources/%s/cpu", containerIdx, requirement)
	return jsonpatch.Operation{Operation: "remove", Path: cpuPath}
}

// containerRemoveRequirementPatch returns the JSON patch operation that
// removes the entire resource requirement section ("requests" or "limits")
// of the container at containerIdx.
func containerRemoveRequirementPatch(requirement string, containerIdx int) jsonpatch.Operation {
	sectionPath := fmt.Sprintf("/spec/containers/%d/resources/%s", containerIdx, requirement)
	return jsonpatch.Operation{Operation: "remove", Path: sectionPath}
}