Skip to content

Commit

Permalink
fix(nvidia/xid): do not error log when no xid happened yet (#138)
Browse files (browse the repository at this point in the history)
Fix the following spurious error log, which was emitted when no xid event had been collected yet:

> {"level":"error","ts":"2024-10-26T11:43:35Z","caller":"log/log.go:60","msg":"failed to invoke component state","operation":"GetStates","component":"accelerator-nvidia-error-xid","error":"no data collected yet in the poller","stacktrace":"github.com/leptonai/gpud/log.(*LeptonLogger).Errorw\n\t/root/leptonai/gpud/log/log.go:60\ngithub.com/leptonai/gpud/internal/session.(*Session).getStates\n\t/root/leptonai/gpud/internal/session/serve.go:250\ngithub.com/leptonai/gpud/internal/session.(*Session).serve\n\t/root/leptonai/gpud/internal/session/serve.go:67"}

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho authored Oct 27, 2024
1 parent 6f30ce1 commit 64b3b2e
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 26 deletions.
56 changes: 40 additions & 16 deletions components/accelerator/nvidia/error/xid/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,26 +92,50 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
if err != nil {
return nil, err
}
if last == nil || last.Output == nil { // no data
log.Logger.Debugw("no xid data -- this is normal when nvml has not received any registered xid events yet")
} else {
ev, ok := last.Output.(*nvidia_query_nvml.XidEvent)
if !ok {
return nil, fmt.Errorf("invalid output type: %T, expected nvidia_query_nvml.XidEvent", last.Output)
if last == nil && err != nil && err != query.ErrNoData { // no data
log.Logger.Debugw("nothing found in last state (no data collected yet)", "component", Name)
return []components.State{
{
Name: Name,
Healthy: false,
Error: query.ErrNoData.Error(),
Reason: query.ErrNoData.Error(),
},
}, nil
}
if last.Error != nil {
return []components.State{
{
Healthy: false,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
}
if last.Output == nil {
return []components.State{
{
Healthy: false,
Reason: "no output",
},
}, nil
}

ev, ok := last.Output.(*nvidia_query_nvml.XidEvent)
if !ok {
return nil, fmt.Errorf("invalid output type: %T, expected nvidia_query_nvml.XidEvent", last.Output)
}
if ev != nil {
if ev.Xid > 0 {
o.NVMLXidEvent = ev
}
if ev != nil {
if ev.Xid > 0 {
o.NVMLXidEvent = ev
}
if ev.Detail != nil && ev.Detail.SuggestedActions != nil && len(ev.Detail.SuggestedActions.RepairActions) > 0 {
if o.SuggestedActions == nil {
o.SuggestedActions = &common.SuggestedActions{}
}
o.SuggestedActions.Add(ev.Detail.SuggestedActions)
if ev.Detail != nil && ev.Detail.SuggestedActions != nil && len(ev.Detail.SuggestedActions.RepairActions) > 0 {
if o.SuggestedActions == nil {
o.SuggestedActions = &common.SuggestedActions{}
}
o.SuggestedActions.Add(ev.Detail.SuggestedActions)
}
}

return o.States()
}

Expand Down
44 changes: 34 additions & 10 deletions components/accelerator/nvidia/gpm/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,42 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
if err != nil {
return nil, err
}
if last == nil || last.Output == nil { // no data
log.Logger.Debugw("no gpm event data -- this is normal when nvml has not received any registered gpm event events yet")
} else {
gpmEvent, ok := last.Output.(*nvidia_query_nvml.GPMEvent)
if !ok {
return nil, fmt.Errorf("invalid output type: %T, expected nvidia_query_nvml.GPMEvent", last.Output)
}
if gpmEvent != nil && len(gpmEvent.Metrics) > 0 {
o.NVMLGPMEvent = gpmEvent
}
if last == nil && err != nil && err != query.ErrNoData { // no data
log.Logger.Debugw("nothing found in last state (no data collected yet)", "component", Name)
return []components.State{
{
Name: Name,
Healthy: false,
Error: query.ErrNoData.Error(),
Reason: query.ErrNoData.Error(),
},
}, nil
}
if last.Error != nil {
return []components.State{
{
Healthy: false,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
}
if last.Output == nil {
return []components.State{
{
Healthy: false,
Reason: "no output",
},
}, nil
}

gpmEvent, ok := last.Output.(*nvidia_query_nvml.GPMEvent)
if !ok {
return nil, fmt.Errorf("invalid output type: %T, expected nvidia_query_nvml.GPMEvent", last.Output)
}
if gpmEvent != nil && len(gpmEvent.Metrics) > 0 {
o.NVMLGPMEvent = gpmEvent
}
return o.States()
}

Expand Down

0 comments on commit 64b3b2e

Please sign in to comment.