Skip to content

Commit

Permalink
Properly handle graceful shutdown of the kubelet plugin
Browse files Browse the repository at this point in the history
Signed-off-by: Kevin Klues <[email protected]>
  • Loading branch information
klueska committed Oct 29, 2024
1 parent 737b4c5 commit 9e00150
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 14 deletions.
5 changes: 4 additions & 1 deletion cmd/nvidia-dra-plugin/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,10 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) {
 	return driver, nil
 }

-func (d *driver) Shutdown(ctx context.Context) error {
+func (d *driver) Shutdown() error {
+	if d == nil {
+		return nil
+	}
 	d.plugin.Stop()
 	return nil
 }
Expand Down
25 changes: 15 additions & 10 deletions cmd/nvidia-dra-plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,19 +183,24 @@ func StartPlugin(ctx context.Context, config *Config) error {
 		return fmt.Errorf("path for cdi file generation is not a directory: '%v'", config.flags.cdiRoot)
 	}

-	driver, err := NewDriver(ctx, config)
-	if err != nil {
-		return err
-	}
-
-	sigc := make(chan os.Signal, 1)
-	signal.Notify(sigc, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
-	<-sigc
+	sigs := make(chan os.Signal, 1)
+	signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

+	var driver *driver
+	ctx, cancel := context.WithCancel(ctx)
+	defer func() {
+		cancel()
+		if err := driver.Shutdown(); err != nil {
+			klog.Errorf("Unable to cleanly shutdown driver: %v", err)
+		}
+	}()
+
-	err = driver.Shutdown(ctx)
+	driver, err = NewDriver(ctx, config)
 	if err != nil {
-		klog.Errorf("Unable to cleanly shutdown driver: %v", err)
+		return fmt.Errorf("error creating driver: %w", err)
 	}

+	<-sigs
+
 	return nil
 }
4 changes: 1 addition & 3 deletions deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ spec:
 command: ["bash", "-c"]
 args:
 - |-
-trap 'exit 0' TERM
 # TODO: Masking of the params file is done below to allow nvkind to
 # selectively exclude certain GPUs from being visible to the driver.
 # At present, this is only feasible with a host-mounted driver where
Expand All @@ -69,8 +68,7 @@ spec:
 sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params
 mount --bind root/gpu-params /proc/driver/nvidia/params
 fi
-nvidia-dra-plugin &
-wait
+nvidia-dra-plugin
 resources:
 {{- toYaml .Values.kubeletPlugin.containers.plugin.resources | nindent 10 }}
 env:
Expand Down

0 comments on commit 9e00150

Please sign in to comment.