From 6ed55672980bd5cc17318b8583a3e50723243dc1 Mon Sep 17 00:00:00 2001 From: Anis Eleuch Date: Tue, 1 Oct 2024 16:51:52 +0100 Subject: [PATCH] heal: More UX changes *) Do not show healing ETA anymore, there are some scenarios where the ETA can be wrong. Show when healing is started and the progress of the healing drive usage comparing to non healing drives in the same erasure set. *) --verbose prints only offline/faulty and healing drives *) --all-drives to show information about all drives *) Add yes/no prompt when a user wants to heal the whole namespace --- cmd/admin-heal.go | 119 +++++++++++++++++++++++++++++----------------- 1 file changed, 76 insertions(+), 43 deletions(-) diff --git a/cmd/admin-heal.go b/cmd/admin-heal.go index b09a181f9f..488aad329e 100644 --- a/cmd/admin-heal.go +++ b/cmd/admin-heal.go @@ -18,8 +18,11 @@ package cmd import ( + "bufio" "fmt" + "math" "net/url" + "os" "path/filepath" "sort" "strings" @@ -95,6 +98,10 @@ var adminHealFlags = []cli.Flag{ Name: "verbose, v", Usage: "show verbose information", }, + cli.BoolFlag{ + Name: "all-drives, a", + Usage: "select all drives for verbose printing", + }, } var adminHealCmd = cli.Command{ @@ -162,8 +169,11 @@ type poolInfo struct { } type setInfo struct { - maxUsedSpace uint64 - totalDisks int + totalDisks int + + readyDisksCount int // disks online and not in healing state + readyDisksUsedSpace uint64 // the total used space of ready disks + incapableDisks int } @@ -218,11 +228,11 @@ func generateSetsStatus(disks []madmin.Disk) map[setIndex]setInfo { setSt = setInfo{} } setSt.totalDisks++ - if d.UsedSpace > setSt.maxUsedSpace { - setSt.maxUsedSpace = d.UsedSpace - } if d.State != "ok" || d.Healing { setSt.incapableDisks++ + } else { + setSt.readyDisksCount++ + setSt.readyDisksUsedSpace += d.UsedSpace } m[idx] = setSt } @@ -327,6 +337,8 @@ type verboseBackgroundHealStatusMessage struct { Status string `json:"status"` HealInfo madmin.BgHealState + allDrives bool + // Specify storage class to show servers/disks tolerance ToleranceForSC string `json:"-"` } @@ -374,14 +386,16 @@ func (s verboseBackgroundHealStatusMessage) String() string { fmt.Fprintf(&msg, " %s: %s\n", endpoint, stateText) continue } + var serverHeader strings.Builder + var serverHeaderPrinted bool serverStatus := serversStatus[endpoint] switch { case showTolerance: - serverHeader := " %s: (Tolerance: %d server(s))\n" - fmt.Fprintf(&msg, serverHeader, endpoint, poolsInfo[serverStatus.pool].tolerance) + hdr := " %s: (Tolerance: %d server(s))\n" + fmt.Fprintf(&serverHeader, hdr, endpoint, poolsInfo[serverStatus.pool].tolerance) default: - serverHeader := " %s:\n" - fmt.Fprintf(&msg, serverHeader, endpoint) + hdr := " %s:\n" + fmt.Fprintf(&serverHeader, hdr, endpoint) } for _, d := range serverStatus.disks { @@ -393,29 +407,49 @@ func (s verboseBackgroundHealStatusMessage) String() string { case d.State == "ok" && d.Healing: stateText = console.Colorize("DiskHealing", "HEALING") case d.State == "ok": + if !s.allDrives { + continue + } stateText = console.Colorize("DiskOK", "OK") default: stateText = console.Colorize("DiskFailed", d.State) } - fmt.Fprintf(&msg, " + %s : %s\n", d.DrivePath, stateText) + if !serverHeaderPrinted { + serverHeaderPrinted = true + fmt.Fprint(&msg, serverHeader.String()) + } + drivePath := d.DrivePath + if drivePath == "" { + if u, e := url.Parse(d.Endpoint); e == nil { + drivePath = u.Path + } + } + fmt.Fprintf(&msg, " + %s : %s\n", drivePath, stateText) + + thisSet := setsStatus[setIndex{d.PoolIndex, d.SetIndex}] if d.Healing && d.HealInfo != nil && !d.HealInfo.Finished { - if d.HealInfo.RetryAttempts == 0 { - now := time.Now().UTC() - scanSpeed := float64(d.UsedSpace) / float64(now.Sub(d.HealInfo.Started)) - remainingTime := time.Duration(float64(setsStatus[setIndex{d.PoolIndex, d.SetIndex}].maxUsedSpace-d.UsedSpace) / scanSpeed) - estimationText := humanize.RelTime(now, now.Add(remainingTime), "", "") - fmt.Fprintf(&msg, " |__ Estimated: %s\n", estimationText) - } else { - fmt.Fprintf(&msg, " |__ Retry attempts: %d\n", d.HealInfo.RetryAttempts) + refUsedSpace := uint64(math.MaxUint64) + if thisSet.readyDisksCount > 0 { // to avoid crashing + refUsedSpace = thisSet.readyDisksUsedSpace / uint64(thisSet.readyDisksCount) + } + if refUsedSpace < d.UsedSpace { // normalize + refUsedSpace = d.UsedSpace + } + fmt.Fprintf(&msg, " |__ Progress: %d%%\n", 100*d.UsedSpace/refUsedSpace) + fmt.Fprintf(&msg, " |__ Started: %s\n", humanize.Time(d.HealInfo.Started)) + if d.HealInfo.RetryAttempts > 0 { + fmt.Fprintf(&msg, " |__ Retries: %d\n", d.HealInfo.RetryAttempts) } } - fmt.Fprintf(&msg, " |__ Capacity: %s/%s\n", humanize.IBytes(d.UsedSpace), humanize.IBytes(d.TotalSpace)) + fmt.Fprintf(&msg, " |__ Capacity: %s/%s\n", humanize.IBytes(d.UsedSpace), humanize.IBytes(d.TotalSpace)) if showTolerance { - fmt.Fprintf(&msg, " |__ Tolerance: %d drive(s)\n", parity-setsStatus[setIndex{d.PoolIndex, d.SetIndex}].incapableDisks) + fmt.Fprintf(&msg, " |__ Tolerance: %d drive(s)\n", parity-thisSet.incapableDisks) } } - fmt.Fprintf(&msg, "\n") + if serverHeaderPrinted { + fmt.Fprintf(&msg, "\n") + } } } @@ -476,9 +510,6 @@ func (s shortBackgroundHealStatusMessage) String() string { // The addition of Elapsed time of each parallel healing operation // this is needed to calculate the rate of healing accumulatedElapsedTime time.Duration - - // ETA of healing - it is the latest ETA of all drives currently healing - healingRemaining time.Duration ) var problematicDisks int @@ -502,9 +533,16 @@ func (s shortBackgroundHealStatusMessage) String() string { if disk.HealInfo != nil && !disk.HealInfo.Finished { missingInSet++ - diskSet := setIndex{pool: disk.PoolIndex, set: disk.SetIndex} - if maxUsedSpace := setsStatus[diskSet].maxUsedSpace; maxUsedSpace > 0 { - if pct := float64(disk.UsedSpace) / float64(maxUsedSpace); pct < leastPct { + thisSet := setsStatus[setIndex{pool: disk.PoolIndex, set: disk.SetIndex}] + refUsedSpace := uint64(math.MaxUint64) + if thisSet.readyDisksCount > 0 { + refUsedSpace = thisSet.readyDisksUsedSpace / uint64(thisSet.readyDisksCount) + } + if refUsedSpace < disk.UsedSpace { + refUsedSpace = disk.UsedSpace + } + if refUsedSpace > 0 { + if pct := float64(disk.UsedSpace) / float64(refUsedSpace); pct < leastPct { leastPct = pct } } else { @@ -512,14 +550,6 @@ func (s shortBackgroundHealStatusMessage) String() string { leastPct = 0 } - if disk.HealInfo.RetryAttempts == 0 { - scanSpeed := float64(disk.UsedSpace) / float64(time.Since(disk.HealInfo.Started)) - remainingTime := time.Duration(float64(setsStatus[diskSet].maxUsedSpace-disk.UsedSpace) / scanSpeed) - if remainingTime > healingRemaining { - healingRemaining = remainingTime - } - } - disk := disk if furthestHealingDisk == nil { furthestHealingDisk = &disk @@ -582,14 +612,6 @@ func (s shortBackgroundHealStatusMessage) String() string { healPrettyMsg += fmt.Sprintf("Heal rate: %d obj/s, %s/s\n", int64(itemsHealedPerSec), humanize.IBytes(uint64(bytesHealedPerSec))) } - // Estimation completion - eta := "" - if healingRemaining > 0 { - now := time.Now() - eta = humanize.RelTime(now, now.Add(healingRemaining), "", "") - } - healPrettyMsg += fmt.Sprintf("Estimated Completion: %s\n", eta) - if problematicDisks > 0 { healPrettyMsg += "\n" healPrettyMsg += fmt.Sprintf("%d offline disk(s) found.", problematicDisks) @@ -669,6 +691,7 @@ func mainAdminHeal(ctx *cli.Context) error { printMsg(verboseBackgroundHealStatusMessage{ Status: "success", HealInfo: bgHealStatus, + allDrives: ctx.Bool("all-drives"), ToleranceForSC: strings.ToUpper(ctx.String("storage-class")), }) } else { @@ -715,6 +738,16 @@ func mainAdminHeal(ctx *cli.Context) error { return nil } + if prefix == "" && opts.Recursive && opts.Pool == nil && opts.Set == nil && isTerminal() { + fmt.Printf("You are about to scan and heal the whole namespace in all pools and sets, please confirm [y/N]: ") + answer, e := bufio.NewReader(os.Stdin).ReadString('\n') + fatalIf(probe.NewError(e), "Unable to parse user input.") + if answer = strings.TrimSpace(strings.ToLower(answer)); answer != "y" && answer != "yes" { + fmt.Println("Heal aborted!") + return nil + } + } + healStart, _, e := adminClnt.Heal(globalContext, bucket, prefix, opts, "", forceStart, false) fatalIf(probe.NewError(e), "Unable to start healing.")