@@ -71,6 +71,8 @@ const (
7171 EnvProcOOMScore = "CODER_PROC_OOM_SCORE"
7272)
7373
74+ var ErrAgentClosing = xerrors .New ("agent is closing" )
75+
7476type Options struct {
7577 Filesystem afero.Fs
7678 LogDir string
@@ -401,6 +403,7 @@ func (a *agent) runLoop() {
401403 // need to keep retrying up to the hardCtx so that we can send graceful shutdown-related
402404 // messages.
403405 ctx := a .hardCtx
406+ defer a .logger .Info (ctx , "agent main loop exited" )
404407 for retrier := retry .New (100 * time .Millisecond , 10 * time .Second ); retrier .Wait (ctx ); {
405408 a .logger .Info (ctx , "connecting to coderd" )
406409 err := a .run ()
@@ -1348,7 +1351,7 @@ func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(co
13481351 a .closeMutex .Unlock ()
13491352 if closing {
13501353 _ = network .Close ()
1351- return xerrors .New ("agent is closing" )
1354+ return xerrors .Errorf ("agent closed while creating tailnet: %w" , ErrAgentClosing )
13521355 }
13531356 } else {
13541357 // Update the wireguard IPs if the agent ID changed.
@@ -1471,7 +1474,7 @@ func (a *agent) trackGoroutine(fn func()) error {
14711474 a .closeMutex .Lock ()
14721475 defer a .closeMutex .Unlock ()
14731476 if a .closing {
1474- return xerrors .New ("track conn goroutine: agent is closing" )
1477+ return xerrors .Errorf ("track conn goroutine: %w" , ErrAgentClosing )
14751478 }
14761479 a .closeWaitGroup .Add (1 )
14771480 go func () {
@@ -2152,16 +2155,7 @@ func (a *apiConnRoutineManager) startAgentAPI(
21522155 a .eg .Go (func () error {
21532156 logger .Debug (ctx , "starting agent routine" )
21542157 err := f (ctx , a .aAPI )
2155- if xerrors .Is (err , context .Canceled ) && ctx .Err () != nil {
2156- logger .Debug (ctx , "swallowing context canceled" )
2157- // Don't propagate context canceled errors to the error group, because we don't want the
2158- // graceful context being canceled to halt the work of routines with
2159- // gracefulShutdownBehaviorRemain. Note that we check both that the error is
2160- // context.Canceled and that *our* context is currently canceled, because when Coderd
2161- // unilaterally closes the API connection (for example if the build is outdated), it can
2162- // sometimes show up as context.Canceled in our RPC calls.
2163- return nil
2164- }
2158+ err = shouldPropagateError (ctx , logger , err )
21652159 logger .Debug (ctx , "routine exited" , slog .Error (err ))
21662160 if err != nil {
21672161 return xerrors .Errorf ("error in routine %s: %w" , name , err )
@@ -2189,21 +2183,7 @@ func (a *apiConnRoutineManager) startTailnetAPI(
21892183 a .eg .Go (func () error {
21902184 logger .Debug (ctx , "starting tailnet routine" )
21912185 err := f (ctx , a .tAPI )
2192- if (xerrors .Is (err , context .Canceled ) ||
2193- xerrors .Is (err , io .EOF )) &&
2194- ctx .Err () != nil {
2195- logger .Debug (ctx , "swallowing error because context is canceled" , slog .Error (err ))
2196- // Don't propagate context canceled errors to the error group, because we don't want the
2197- // graceful context being canceled to halt the work of routines with
2198- // gracefulShutdownBehaviorRemain. Unfortunately, the dRPC library closes the stream
2199- // when context is canceled on an RPC, so canceling the context can also show up as
2200- // io.EOF. Also, when Coderd unilaterally closes the API connection (for example if the
2201- // build is outdated), it can sometimes show up as context.Canceled in our RPC calls.
2202- // We can't reliably distinguish between a context cancelation and a legit EOF, so we
2203- // also check that *our* context is currently canceled. If it is, we can safely ignore
2204- // the error.
2205- return nil
2206- }
2186+ err = shouldPropagateError (ctx , logger , err )
22072187 logger .Debug (ctx , "routine exited" , slog .Error (err ))
22082188 if err != nil {
22092189 return xerrors .Errorf ("error in routine %s: %w" , name , err )
@@ -2212,6 +2192,34 @@ func (a *apiConnRoutineManager) startTailnetAPI(
22122192 })
22132193}
22142194
2195+ // shouldPropagateError decides whether an error from an API connection routine should be propagated to the
2196+ // apiConnRoutineManager. Its purpose is to prevent errors related to shutting down from propagating to the manager's
2197+ // error group, which will tear down the API connection and potentially stop graceful shutdown from succeeding.
2198+ func shouldPropagateError (ctx context.Context , logger slog.Logger , err error ) error {
2199+ if (xerrors .Is (err , context .Canceled ) ||
2200+ xerrors .Is (err , io .EOF )) &&
2201+ ctx .Err () != nil {
2202+ logger .Debug (ctx , "swallowing error because context is canceled" , slog .Error (err ))
2203+ // Don't propagate context canceled errors to the error group, because we don't want the
2204+ // graceful context being canceled to halt the work of routines with
2205+ // gracefulShutdownBehaviorRemain. Unfortunately, the dRPC library closes the stream
2206+ // when context is canceled on an RPC, so canceling the context can also show up as
2207+ // io.EOF. Also, when Coderd unilaterally closes the API connection (for example if the
2208+ // build is outdated), it can sometimes show up as context.Canceled in our RPC calls.
2209+ // We can't reliably distinguish between a context cancelation and a legit EOF, so we
2210+ // also check that *our* context is currently canceled. If it is, we can safely ignore
2211+ // the error.
2212+ return nil
2213+ }
2214+ if xerrors .Is (err , ErrAgentClosing ) {
2215+ logger .Debug (ctx , "swallowing error because agent is closing" )
2216+ // This can only be generated when the agent is closing, so we never want it to propagate to other routines.
2217+ // (They are signaled to exit via canceled contexts.)
2218+ return nil
2219+ }
2220+ return err
2221+ }
2222+
22152223func (a * apiConnRoutineManager ) wait () error {
22162224 return a .eg .Wait ()
22172225}
0 commit comments