
Commit 71c6dc4

fix: stop disconnecting from coderd early and record disconnect correctly (#21250)
fixes coder/internal#1196

The above issue exposes two different bugs in Coder. In the agent, there is a race where, if the agent is closed while networking is starting up, it erroneously disconnects from Coderd, which delays or breaks writing its final status and logs. In Coderd, there is a bug where we don't properly record the latest agent disconnection time if the agent had previously disconnected. This causes us to report the agent status as "Connected" even after it has disconnected, until the inactivity timeout fires.

This PR fixes both issues. It also slightly reworks when we send workspace updates on connection and disconnection. Previously, in certain circumstances we would send two updates when the agent connected, even though the status was the same in both (only the times changed). Now we always send one update on connect and another on disconnect.
1 parent 2c94564 commit 71c6dc4
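
The agent-side fix hinges on a sentinel error (ErrAgentClosing) that is wrapped with %w and later detected with xerrors.Is, so shutdown-related errors are swallowed rather than tearing down the API connection. Below is a minimal, self-contained sketch of that pattern under simplified names; errClosing and shouldPropagate are illustrative stand-ins, not the actual agent code, and the real helper also handles io.EOF and logging as shown in the diff further down.

package main

import (
	"context"
	"fmt"

	"golang.org/x/xerrors"
)

// Sentinel error, analogous to ErrAgentClosing introduced in agent/agent.go.
var errClosing = xerrors.New("agent is closing")

// shouldPropagate mirrors the shape of shouldPropagateError: errors that only
// mean "we are shutting down" are swallowed so they don't stop other routines.
func shouldPropagate(ctx context.Context, err error) error {
	if xerrors.Is(err, context.Canceled) && ctx.Err() != nil {
		return nil // our own context was canceled during shutdown
	}
	if xerrors.Is(err, errClosing) {
		return nil // only generated while closing; never propagate
	}
	return err
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	// Wrapping with %w preserves the sentinel, so xerrors.Is still matches.
	wrapped := xerrors.Errorf("agent closed while creating tailnet: %w", errClosing)
	fmt.Println(shouldPropagate(ctx, wrapped))              // <nil>
	fmt.Println(shouldPropagate(ctx, xerrors.New("boom"))) // boom
}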

File tree

3 files changed: +180 −128 lines changed


agent/agent.go

Lines changed: 35 additions & 27 deletions
@@ -71,6 +71,8 @@ const (
 	EnvProcOOMScore = "CODER_PROC_OOM_SCORE"
 )
 
+var ErrAgentClosing = xerrors.New("agent is closing")
+
 type Options struct {
 	Filesystem afero.Fs
 	LogDir     string
@@ -401,6 +403,7 @@ func (a *agent) runLoop() {
 	// need to keep retrying up to the hardCtx so that we can send graceful shutdown-related
 	// messages.
 	ctx := a.hardCtx
+	defer a.logger.Info(ctx, "agent main loop exited")
 	for retrier := retry.New(100*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
 		a.logger.Info(ctx, "connecting to coderd")
 		err := a.run()
@@ -1348,7 +1351,7 @@ func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(co
 			a.closeMutex.Unlock()
 			if closing {
 				_ = network.Close()
-				return xerrors.New("agent is closing")
+				return xerrors.Errorf("agent closed while creating tailnet: %w", ErrAgentClosing)
 			}
 		} else {
 			// Update the wireguard IPs if the agent ID changed.
@@ -1471,7 +1474,7 @@ func (a *agent) trackGoroutine(fn func()) error {
 	a.closeMutex.Lock()
 	defer a.closeMutex.Unlock()
 	if a.closing {
-		return xerrors.New("track conn goroutine: agent is closing")
+		return xerrors.Errorf("track conn goroutine: %w", ErrAgentClosing)
 	}
 	a.closeWaitGroup.Add(1)
 	go func() {
@@ -2152,16 +2155,7 @@ func (a *apiConnRoutineManager) startAgentAPI(
 	a.eg.Go(func() error {
 		logger.Debug(ctx, "starting agent routine")
 		err := f(ctx, a.aAPI)
-		if xerrors.Is(err, context.Canceled) && ctx.Err() != nil {
-			logger.Debug(ctx, "swallowing context canceled")
-			// Don't propagate context canceled errors to the error group, because we don't want the
-			// graceful context being canceled to halt the work of routines with
-			// gracefulShutdownBehaviorRemain. Note that we check both that the error is
-			// context.Canceled and that *our* context is currently canceled, because when Coderd
-			// unilaterally closes the API connection (for example if the build is outdated), it can
-			// sometimes show up as context.Canceled in our RPC calls.
-			return nil
-		}
+		err = shouldPropagateError(ctx, logger, err)
 		logger.Debug(ctx, "routine exited", slog.Error(err))
 		if err != nil {
 			return xerrors.Errorf("error in routine %s: %w", name, err)
@@ -2189,21 +2183,7 @@ func (a *apiConnRoutineManager) startTailnetAPI(
 	a.eg.Go(func() error {
 		logger.Debug(ctx, "starting tailnet routine")
 		err := f(ctx, a.tAPI)
-		if (xerrors.Is(err, context.Canceled) ||
-			xerrors.Is(err, io.EOF)) &&
-			ctx.Err() != nil {
-			logger.Debug(ctx, "swallowing error because context is canceled", slog.Error(err))
-			// Don't propagate context canceled errors to the error group, because we don't want the
-			// graceful context being canceled to halt the work of routines with
-			// gracefulShutdownBehaviorRemain. Unfortunately, the dRPC library closes the stream
-			// when context is canceled on an RPC, so canceling the context can also show up as
-			// io.EOF. Also, when Coderd unilaterally closes the API connection (for example if the
-			// build is outdated), it can sometimes show up as context.Canceled in our RPC calls.
-			// We can't reliably distinguish between a context cancelation and a legit EOF, so we
-			// also check that *our* context is currently canceled. If it is, we can safely ignore
-			// the error.
-			return nil
-		}
+		err = shouldPropagateError(ctx, logger, err)
 		logger.Debug(ctx, "routine exited", slog.Error(err))
 		if err != nil {
 			return xerrors.Errorf("error in routine %s: %w", name, err)
@@ -2212,6 +2192,34 @@ func (a *apiConnRoutineManager) startTailnetAPI(
 	})
 }
 
+// shouldPropagateError decides whether an error from an API connection routine should be propagated to the
+// apiConnRoutineManager. Its purpose is to prevent errors related to shutting down from propagating to the manager's
+// error group, which will tear down the API connection and potentially stop graceful shutdown from succeeding.
+func shouldPropagateError(ctx context.Context, logger slog.Logger, err error) error {
+	if (xerrors.Is(err, context.Canceled) ||
+		xerrors.Is(err, io.EOF)) &&
+		ctx.Err() != nil {
+		logger.Debug(ctx, "swallowing error because context is canceled", slog.Error(err))
+		// Don't propagate context canceled errors to the error group, because we don't want the
+		// graceful context being canceled to halt the work of routines with
+		// gracefulShutdownBehaviorRemain. Unfortunately, the dRPC library closes the stream
+		// when context is canceled on an RPC, so canceling the context can also show up as
+		// io.EOF. Also, when Coderd unilaterally closes the API connection (for example if the
+		// build is outdated), it can sometimes show up as context.Canceled in our RPC calls.
+		// We can't reliably distinguish between a context cancelation and a legit EOF, so we
+		// also check that *our* context is currently canceled. If it is, we can safely ignore
+		// the error.
+		return nil
+	}
+	if xerrors.Is(err, ErrAgentClosing) {
+		logger.Debug(ctx, "swallowing error because agent is closing")
+		// This can only be generated when the agent is closing, so we never want it to propagate to other routines.
+		// (They are signaled to exit via canceled contexts.)
+		return nil
+	}
+	return err
+}
+
 func (a *apiConnRoutineManager) wait() error {
 	return a.eg.Wait()
 }

coderd/workspaceagentsrpc.go

Lines changed: 13 additions & 22 deletions
@@ -359,7 +359,16 @@ func (m *agentConnectionMonitor) start(ctx context.Context) {
 }
 
 func (m *agentConnectionMonitor) monitor(ctx context.Context) {
+	reason := "disconnect"
 	defer func() {
+		m.logger.Debug(ctx, "agent connection monitor is closing connection",
+			slog.F("reason", reason))
+		_ = m.conn.Close(websocket.StatusGoingAway, reason)
+		m.disconnectedAt = sql.NullTime{
+			Time:  dbtime.Now(),
+			Valid: true,
+		}
+
 		// If connection closed then context will be canceled, try to
 		// ensure our final update is sent. By waiting at most the agent
 		// inactive disconnect timeout we ensure that we don't block but
@@ -372,13 +381,6 @@ func (m *agentConnectionMonitor) monitor(ctx context.Context) {
 		finalCtx, cancel := context.WithTimeout(dbauthz.AsSystemRestricted(m.apiCtx), m.disconnectTimeout)
 		defer cancel()
 
-		// Only update timestamp if the disconnect is new.
-		if !m.disconnectedAt.Valid {
-			m.disconnectedAt = sql.NullTime{
-				Time:  dbtime.Now(),
-				Valid: true,
-			}
-		}
 		err := m.updateConnectionTimes(finalCtx)
 		if err != nil {
 			// This is a bug with unit tests that cancel the app context and
@@ -398,12 +400,6 @@ func (m *agentConnectionMonitor) monitor(ctx context.Context) {
 			AgentID:     &m.workspaceAgent.ID,
 		})
 	}()
-	reason := "disconnect"
-	defer func() {
-		m.logger.Debug(ctx, "agent connection monitor is closing connection",
-			slog.F("reason", reason))
-		_ = m.conn.Close(websocket.StatusGoingAway, reason)
-	}()
 
 	err := m.updateConnectionTimes(ctx)
 	if err != nil {
@@ -432,8 +428,7 @@ func (m *agentConnectionMonitor) monitor(ctx context.Context) {
 			m.logger.Warn(ctx, "connection to agent timed out")
 			return
 		}
-		connectionStatusChanged := m.disconnectedAt.Valid
-		m.disconnectedAt = sql.NullTime{}
+
 		m.lastConnectedAt = sql.NullTime{
 			Time:  dbtime.Now(),
 			Valid: true,
@@ -447,13 +442,9 @@ func (m *agentConnectionMonitor) monitor(ctx context.Context) {
 			}
 			return
 		}
-		if connectionStatusChanged {
-			m.updater.publishWorkspaceUpdate(ctx, m.workspace.OwnerID, wspubsub.WorkspaceEvent{
-				Kind:        wspubsub.WorkspaceEventKindAgentConnectionUpdate,
-				WorkspaceID: m.workspaceBuild.WorkspaceID,
-				AgentID:     &m.workspaceAgent.ID,
-			})
-		}
+		// we don't need to publish a workspace update here because we published an update when the workspace first
+		// connected. Since all we've done is updated lastConnectedAt, the workspace is still connected and hasn't
+		// changed status. We don't expect to get updates just for the times changing.
 
 		ctx, err := dbauthz.WithWorkspaceRBAC(ctx, m.workspace.RBACObject())
 		if err != nil {
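
On the Coderd side, the key change is that disconnectedAt is now stamped unconditionally in the monitor's deferred cleanup, so a reconnect followed by a later disconnect always records a fresh time, and time-only updates while connected no longer publish extra workspace events. The following is a rough, simplified sketch of that shape; the monitor type and run loop here are illustrative stand-ins, not the real agentConnectionMonitor.

package main

import (
	"database/sql"
	"fmt"
	"time"
)

// monitor is a simplified stand-in for agentConnectionMonitor: the disconnect
// timestamp is set unconditionally when the monitor exits.
type monitor struct {
	disconnectedAt  sql.NullTime
	lastConnectedAt sql.NullTime
}

func (m *monitor) run(pings <-chan struct{}) {
	defer func() {
		// Always record the moment this monitor exits as the disconnect time,
		// even if the agent had disconnected before.
		m.disconnectedAt = sql.NullTime{Time: time.Now(), Valid: true}
	}()
	for range pings {
		// While connected, only the connected timestamp moves; no workspace
		// update is published for time-only changes.
		m.lastConnectedAt = sql.NullTime{Time: time.Now(), Valid: true}
	}
}

func main() {
	m := &monitor{}
	pings := make(chan struct{}, 1)
	pings <- struct{}{}
	close(pings)
	m.run(pings)
	fmt.Println(m.lastConnectedAt.Valid, m.disconnectedAt.Valid) // true true
}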
