From 1a339042050e936d5132c198ad0fdda9d470b98f Mon Sep 17 00:00:00 2001 From: Mark Laing Date: Thu, 14 Nov 2024 18:16:57 +0000 Subject: [PATCH] DEBUG: Add recovery handling and instance put logging Signed-off-by: Mark Laing --- Makefile | 2 +- lxd/daemon.go | 38 +++++++++++++++++++++++++++--- lxd/devlxd.go | 4 ++-- lxd/instance/drivers/driver_lxc.go | 11 +++++++++ lxd/instance_put.go | 3 +++ lxd/main.go | 2 +- lxd/main_daemon.go | 13 +++++++--- lxd/operations/operations.go | 35 +++++++++++++++++++++++++++ lxd/recovery/recovery.go | 12 ++++++++++ test/godeps/lxd-agent.list | 1 + 10 files changed, 111 insertions(+), 10 deletions(-) create mode 100644 lxd/recovery/recovery.go diff --git a/Makefile b/Makefile index 64e5fba05083..49bef16b325e 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ ifeq "$(TAG_SQLITE3)" "" endif ifeq "$(GOCOVERDIR)" "" - CC="$(CC)" CGO_LDFLAGS_ALLOW="$(CGO_LDFLAGS_ALLOW)" go install -v -tags "$(TAG_SQLITE3)" -trimpath $(DEBUG) ./lxd ./lxc-to-lxd + CC="$(CC)" CGO_LDFLAGS_ALLOW="$(CGO_LDFLAGS_ALLOW)" go install -v -tags "$(TAG_SQLITE3)" -trimpath $(DEBUG) -gcflags="all=-N -l" ./lxd ./lxc-to-lxd CGO_ENABLED=0 go install -v -tags netgo -trimpath $(DEBUG) ./lxd-user ./lxd-benchmark else CC="$(CC)" CGO_LDFLAGS_ALLOW="$(CGO_LDFLAGS_ALLOW)" go install -v -tags "$(TAG_SQLITE3)" -trimpath -cover $(DEBUG) ./lxd ./lxc-to-lxd diff --git a/lxd/daemon.go b/lxd/daemon.go index 6ba14f70a644..6325169230ba 100644 --- a/lxd/daemon.go +++ b/lxd/daemon.go @@ -15,6 +15,7 @@ import ( "os/exec" "os/user" "path/filepath" + "runtime" "strings" "sync" "time" @@ -57,6 +58,7 @@ import ( "github.com/canonical/lxd/lxd/metrics" networkZone "github.com/canonical/lxd/lxd/network/zone" "github.com/canonical/lxd/lxd/node" + "github.com/canonical/lxd/lxd/recovery" "github.com/canonical/lxd/lxd/request" "github.com/canonical/lxd/lxd/response" "github.com/canonical/lxd/lxd/rsync" @@ -168,6 +170,8 @@ type Daemon struct { // Ubuntu Pro settings ubuntuPro *ubuntupro.Client + + panics chan [2]any } // DaemonConfig holds configuration values for Daemon. @@ -199,6 +203,7 @@ func newDaemon(config *DaemonConfig, os *sys.OS) *Daemon { shutdownCtx: shutdownCtx, shutdownCancel: shutdownCancel, shutdownDoneCh: make(chan error), + panics: make(chan [2]any), } d.serverCert = func() *shared.CertInfo { return d.serverCertInt } @@ -646,7 +651,7 @@ func (d *Daemon) UnixSocket() string { return filepath.Join(d.os.VarDir, "unix.socket") } -// createCmd creates API handlers for the provided endpoint including some useful behavior, +// createCmd creates API devLxdHandlers for the provided endpoint including some useful behavior, // such as appropriate authentication, authorization and checking server availability. // // The created handler also keeps track of handled requests for the API metrics @@ -661,7 +666,7 @@ func (d *Daemon) createCmd(restAPI *mux.Router, version string, c APIEndpoint) { uri = fmt.Sprintf("/%s", c.Path) } - route := restAPI.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) { + handler := func(w http.ResponseWriter, r *http.Request) { // Only endpoints from the main API (version 1.0) should be counted for the metrics. // This prevents internal endpoints from being included as well. if version == "1.0" { @@ -861,7 +866,9 @@ func (d *Daemon) createCmd(restAPI *mux.Router, version string, c APIEndpoint) { logger.Warn("Failed writing error for HTTP response", logger.Ctx{"url": uri, "err": err, "writeErr": writeErr}) } } - }) + } + + route := restAPI.Handle(uri, recoveryWrapper(handler)) // If the endpoint has a canonical name then record it so it can be used to build URLS // and accessed in the context of the request by the handler function. @@ -870,6 +877,31 @@ func (d *Daemon) createCmd(restAPI *mux.Router, version string, c APIEndpoint) { } } +func recoveryWrapper(h http.HandlerFunc) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + defer func() { + rec := recover() + if rec == nil { + return + } + + // If we panic, return an error to the user, then panic again to propagate the error back to the main + // go routine. + _ = response.InternalError(errors.New("Unexpected error occurred")).Render(w, r) + + const size = 64 << 10 + buf := make([]byte, size) + buf = buf[:runtime.Stack(buf, false)] + recovery.Panic <- recovery.PanicResult{ + Err: fmt.Errorf("%v", rec), + Stacktrace: buf, + } + }() + + h.ServeHTTP(w, r) + } +} + // have we setup shared mounts? var sharedMountsLock sync.Mutex diff --git a/lxd/devlxd.go b/lxd/devlxd.go index efdb7df9a28a..b0ab4d13121a 100644 --- a/lxd/devlxd.go +++ b/lxd/devlxd.go @@ -337,7 +337,7 @@ func devlxdUbuntuProTokenPostHandler(d *Daemon, c instance.Instance, w http.Resp return response.DevLxdResponse(http.StatusOK, tokenJSON, "json", c.Type() == instancetype.VM) } -var handlers = []devLxdHandler{ +var devLxdHandlers = []devLxdHandler{ { path: "/", handlerFunc: func(d *Daemon, c instance.Instance, w http.ResponseWriter, r *http.Request) response.Response { @@ -398,7 +398,7 @@ func devLxdAPI(d *Daemon, f hoistFunc) http.Handler { m := mux.NewRouter() m.UseEncodedPath() // Allow encoded values in path segments. - for _, handler := range handlers { + for _, handler := range devLxdHandlers { m.HandleFunc(handler.path, f(handler.handlerFunc, d)) } diff --git a/lxd/instance/drivers/driver_lxc.go b/lxd/instance/drivers/driver_lxc.go index 8678034264e4..63e694f87347 100644 --- a/lxd/instance/drivers/driver_lxc.go +++ b/lxd/instance/drivers/driver_lxc.go @@ -4070,6 +4070,7 @@ func (d *lxc) Update(args db.InstanceArgs, userRequested bool) error { reverter := revert.New() defer reverter.Fail() + logger.Info("Reached instance update backup file lock") unlock, err := d.updateBackupFileLock(context.Background()) if err != nil { return err @@ -4122,6 +4123,7 @@ func (d *lxc) Update(args db.InstanceArgs, userRequested bool) error { var profiles []string + logger.Info("Reached instance update get profile names") err = d.state.DB.Cluster.Transaction(context.TODO(), func(ctx context.Context, tx *db.ClusterTx) error { // Validate the new profiles profiles, err = tx.GetProfileNames(ctx, args.Project) @@ -4132,6 +4134,7 @@ func (d *lxc) Update(args db.InstanceArgs, userRequested bool) error { return fmt.Errorf("Failed to get profiles: %w", err) } + logger.Info("Reached instance update check profiles") checkedProfiles := []string{} for _, profile := range args.Profiles { if !shared.ValueInSlice(profile.Name, profiles) { @@ -4153,6 +4156,7 @@ func (d *lxc) Update(args db.InstanceArgs, userRequested bool) error { } } + logger.Info("Reached instance update copy old config") // Get a copy of the old configuration oldDescription := d.Description() oldArchitecture := 0 @@ -4254,6 +4258,7 @@ func (d *lxc) Update(args db.InstanceArgs, userRequested bool) error { } } + logger.Info("Reached instance update device diff") // Diff the devices removeDevices, addDevices, updateDevices, allUpdatedKeys := oldExpandedDevices.Update(d.expandedDevices, func(oldDevice deviceConfig.Device, newDevice deviceConfig.Device) []string { // This function needs to return a list of fields that are excluded from differences @@ -4301,6 +4306,7 @@ func (d *lxc) Update(args db.InstanceArgs, userRequested bool) error { } if userRequested { + logger.Info("Reached instance update config validation") // Look for deleted idmap keys. protectedKeys := []string{ "volatile.idmap.base", @@ -4448,6 +4454,7 @@ func (d *lxc) Update(args db.InstanceArgs, userRequested bool) error { // Apply the live changes if isRunning { + logger.Info("Reached instance update apply live changes") cc, err := d.initLXC(false) if err != nil { return err @@ -4460,6 +4467,7 @@ func (d *lxc) Update(args db.InstanceArgs, userRequested bool) error { // Live update the container config for _, key := range changedConfig { + logger.Info("Reached instance update apply config key live", logger.Ctx{"key": key}) value := d.expandedConfig[key] if key == "raw.apparmor" || key == "security.nesting" { @@ -4785,6 +4793,7 @@ func (d *lxc) Update(args db.InstanceArgs, userRequested bool) error { } } + logger.Info("Reached instance update apply database changes") // Finally, apply the changes to the database err = d.state.DB.Cluster.Transaction(context.TODO(), func(ctx context.Context, tx *db.ClusterTx) error { // Snapshots should update only their descriptions and expiry date. @@ -4833,6 +4842,7 @@ func (d *lxc) Update(args db.InstanceArgs, userRequested bool) error { return fmt.Errorf("Failed to update database: %w", err) } + logger.Info("Reached instance update apply backup file changes") err = d.UpdateBackupFile() if err != nil && !os.IsNotExist(err) { return fmt.Errorf("Failed to write backup file: %w", err) @@ -4840,6 +4850,7 @@ func (d *lxc) Update(args db.InstanceArgs, userRequested bool) error { // Send devlxd notifications if isRunning { + logger.Info("Reached instance update send devlxd notifications") // Config changes (only for user.* keys for _, key := range changedConfig { if !strings.HasPrefix(key, "user.") { diff --git a/lxd/instance_put.go b/lxd/instance_put.go index 563a3aed28f9..d306efcce9c3 100644 --- a/lxd/instance_put.go +++ b/lxd/instance_put.go @@ -24,6 +24,7 @@ import ( "github.com/canonical/lxd/lxd/util" "github.com/canonical/lxd/shared" "github.com/canonical/lxd/shared/api" + "github.com/canonical/lxd/shared/logger" "github.com/canonical/lxd/shared/osarch" "github.com/canonical/lxd/shared/revert" "github.com/canonical/lxd/shared/version" @@ -61,6 +62,7 @@ import ( // "500": // $ref: "#/responses/InternalServerError" func instancePut(d *Daemon, r *http.Request) response.Response { + logger.Info("Reached instance put") // Don't mess with instance while in setup mode. <-d.waitReady.Done() @@ -166,6 +168,7 @@ func instancePut(d *Daemon, r *http.Request) response.Response { // Update container configuration do = func(op *operations.Operation) error { + logger.Info("Reached instance put operation") defer unlock() args := db.InstanceArgs{ diff --git a/lxd/main.go b/lxd/main.go index ce7f79d01cad..425f050ca93a 100644 --- a/lxd/main.go +++ b/lxd/main.go @@ -85,7 +85,7 @@ func (c *cmdGlobal) rawArgs(cmd *cobra.Command) []string { func main() { // daemon command (main) daemonCmd := cmdDaemon{} - app := daemonCmd.Command() + app := daemonCmd.command() app.SilenceUsage = true app.CompletionOptions = cobra.CompletionOptions{DisableDefaultCmd: true} diff --git a/lxd/main_daemon.go b/lxd/main_daemon.go index a135222ff5c7..7c54cf4940f7 100644 --- a/lxd/main_daemon.go +++ b/lxd/main_daemon.go @@ -2,6 +2,7 @@ package main import ( "context" + "errors" "fmt" "os" "os/exec" @@ -10,6 +11,7 @@ import ( "github.com/spf13/cobra" "golang.org/x/sys/unix" + "github.com/canonical/lxd/lxd/recovery" "github.com/canonical/lxd/lxd/sys" "github.com/canonical/lxd/shared/logger" ) @@ -21,7 +23,7 @@ type cmdDaemon struct { flagGroup string } -func (c *cmdDaemon) Command() *cobra.Command { +func (c *cmdDaemon) command() *cobra.Command { cmd := &cobra.Command{} cmd.Use = "lxd" cmd.Short = "The LXD container manager (daemon)" @@ -34,13 +36,13 @@ func (c *cmdDaemon) Command() *cobra.Command { There are however a number of subcommands that let you interact directly with the local LXD daemon and which may not be performed through the REST API alone. ` - cmd.RunE = c.Run + cmd.RunE = c.run cmd.Flags().StringVar(&c.flagGroup, "group", "", "The group of users that will be allowed to talk to LXD"+"``") return cmd } -func (c *cmdDaemon) Run(cmd *cobra.Command, args []string) error { +func (c *cmdDaemon) run(cmd *cobra.Command, args []string) error { if len(args) > 1 || (len(args) == 1 && args[0] != "daemon" && args[0] != "") { return fmt.Errorf("unknown command \"%s\" for \"%s\"", args[0], cmd.CommandPath()) } @@ -93,6 +95,11 @@ func (c *cmdDaemon) Run(cmd *cobra.Command, args []string) error { case err = <-d.shutdownDoneCh: return err + + case p := <-recovery.Panic: + _, _ = os.Stderr.Write([]byte(fmt.Sprintf("panic: %v\n", p.Err))) + _, _ = os.Stderr.Write(p.Stacktrace) + return errors.New("Unexpected error occurred") } } } diff --git a/lxd/operations/operations.go b/lxd/operations/operations.go index f7138a4d02f3..2a3b7b88b358 100644 --- a/lxd/operations/operations.go +++ b/lxd/operations/operations.go @@ -2,8 +2,10 @@ package operations import ( "context" + "errors" "fmt" "net/http" + "runtime" "sync" "time" @@ -12,6 +14,7 @@ import ( "github.com/canonical/lxd/lxd/auth" "github.com/canonical/lxd/lxd/db/operationtype" "github.com/canonical/lxd/lxd/events" + "github.com/canonical/lxd/lxd/recovery" "github.com/canonical/lxd/lxd/request" "github.com/canonical/lxd/lxd/response" "github.com/canonical/lxd/lxd/state" @@ -295,6 +298,38 @@ func (op *Operation) Start() error { if op.onRun != nil { go func(op *Operation) { + defer func() { + r := recover() + if r == nil { + return + } + + if op != nil { + op.lock.Lock() + op.status = api.Failure + op.err = errors.New("Unexpected error occurred") + op.lock.Unlock() + op.done() + + op.logger.Debug("Failure for operation", logger.Ctx{"err": op.err}) + _, md, _ := op.Render() + + op.lock.Lock() + op.sendEvent(md) + op.lock.Unlock() + time.Sleep(500 * time.Millisecond) + } + + const size = 64 << 10 + buf := make([]byte, size) + buf = buf[:runtime.Stack(buf, false)] + + recovery.Panic <- recovery.PanicResult{ + Err: fmt.Errorf("%v", r), + Stacktrace: buf, + } + }() + err := op.onRun(op) if err != nil { op.lock.Lock() diff --git a/lxd/recovery/recovery.go b/lxd/recovery/recovery.go new file mode 100644 index 000000000000..5a1810a6979c --- /dev/null +++ b/lxd/recovery/recovery.go @@ -0,0 +1,12 @@ +package recovery + +// Panic is a channel that a single PanicResult can be sent on. This channel is listened to in the `lxd` daemon command +// and if received, the daemon exits. We are doing this rather than panicking directly in case any libraries implement +// their own panic recovery. For example, we need to bypass the recovery in the standard net/http library (see https://github.com/golang/go/issues/25245). +var Panic = make(chan PanicResult, 1) + +// PanicResult contains the panic cause and a stacktrace. +type PanicResult struct { + Err error + Stacktrace []byte +} diff --git a/test/godeps/lxd-agent.list b/test/godeps/lxd-agent.list index b42e34f8f304..22c2d2436502 100644 --- a/test/godeps/lxd-agent.list +++ b/test/godeps/lxd-agent.list @@ -11,6 +11,7 @@ github.com/canonical/lxd/lxd/instance/instancetype github.com/canonical/lxd/lxd/ip github.com/canonical/lxd/lxd/metrics github.com/canonical/lxd/lxd/operations +github.com/canonical/lxd/lxd/recovery github.com/canonical/lxd/lxd/request github.com/canonical/lxd/lxd/response github.com/canonical/lxd/lxd/state