24 changes: 22 additions & 2 deletions cmd/containerd-shim-runhcs-v1/exec_hcs.go
@@ -515,13 +515,33 @@ func (he *hcsExec) waitForContainerExit() {
defer span.End()
span.AddAttributes(
trace.StringAttribute("tid", he.tid),
trace.StringAttribute("eid", he.id))
trace.StringAttribute("eid", he.id),
// container-reboot-v2 Stage 1 placeholder; Stage 4 flips this when a Reboot notification is observed.
trace.BoolAttribute("reboot.pending", false))

// wait for container or process to exit and clean up resources
select {
case <-he.c.WaitChannel():
// Container exited first. We need to force the process into the exited
// state and cleanup any resources
// state and cleanup any resources.

// container-reboot-v2 Stage 2 observability: surface the ExitType that
// *hcs.System.waitBackground parsed out of SystemExitStatus JSON. When
// the HCS ExposeRebootNotification + PassExitStatusJson guards are on
// and the container ran `shutdown /r`, this logs "Reboot" — which is the
// signal Stage 4's handleReboot will key off instead of running the
// teardown branch below. At Stage 2 we only observe; the teardown runs
// unchanged so the container still dies. Harmless for non-Exited paths
// (empty string) and non-Argon paths (*gcs.Container / *JobContainer
// ExitType() return "" per cow.Container contract).
if exitType := he.c.ExitType(); exitType != "" {
log.G(ctx).
WithField("tid", he.tid).
WithField("eid", he.id).
WithField("reboot.exit_type", exitType).
Info("reboot-v2: container exited with ExitType (no action; Stage 2)")
}

he.sl.Lock()
switch he.state {
case shimExecStateCreated:
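For reference, this is roughly the parsing step the Stage 2 comment above attributes to *hcs.System.waitBackground: pull the ExitType string out of the SystemExitStatus JSON, or fall back to "". The struct shape and field name below are assumptions for illustration only; the real HCS schema type is not part of this diff.

import "encoding/json"

// systemExitStatus is a hypothetical shape for the SystemExitStatus payload
// referenced above; only the field this workstream cares about is shown.
type systemExitStatus struct {
	ExitType string `json:"ExitType"` // e.g. "Reboot", "GracefulExit", "UnexpectedExit"
}

// parseExitType returns the ExitType string, or "" when the payload is
// missing or not parseable, matching the empty-string convention that
// cow.Container.ExitType() documents later in this diff.
func parseExitType(raw []byte) string {
	var s systemExitStatus
	if len(raw) == 0 || json.Unmarshal(raw, &s) != nil {
		return ""
	}
	return s.ExitType
}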
226 changes: 226 additions & 0 deletions cmd/containerd-shim-runhcs-v1/task_hcs.go
@@ -28,6 +28,7 @@ import (
"github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/stats"
"github.com/Microsoft/hcsshim/internal/cmd"
"github.com/Microsoft/hcsshim/internal/cow"
"github.com/Microsoft/hcsshim/internal/devguard"
"github.com/Microsoft/hcsshim/internal/guestpath"
"github.com/Microsoft/hcsshim/internal/hcs"
"github.com/Microsoft/hcsshim/internal/hcs/resourcepaths"
@@ -238,6 +239,10 @@ func newHcsTask(
closed: make(chan struct{}),
taskSpec: s,
ioRetryTimeout: ioRetryTimeout,
reqStdin: req.Stdin,
reqStdout: req.Stdout,
reqStderr: req.Stderr,
reqTerminal: req.Terminal,
}
ht.init = newHcsExec(
ctx,
@@ -344,6 +349,16 @@ type hcsTask struct {

// ioRetryTimeout is the time for how long to try reconnecting to stdio pipes from containerd.
ioRetryTimeout time.Duration

// container-reboot-v2 Stage 4 B3c: stash the original CreateTask stdio
// paths so doHandleReboot can call NewUpstreamIO to obtain fresh pipe
// connections for the new init process. oldExec.io's underlying pipes
// are closed by the original init exit path before we get to restart,
// so reusing them makes the new init blind and deadlocks follow-up ops.
reqStdin string
reqStdout string
reqStderr string
reqTerminal bool
}

func (ht *hcsTask) ID() string {
@@ -626,10 +641,221 @@ func (ht *hcsTask) waitInitExit() {
// Wait for it to exit on its own
ht.init.Wait()

// container-reboot-v2 Stage 4 Sub-step A: detect Reboot at the single
// task-scoped intercept point, independent of the processDone/WaitChannel
// race in hcsExec::waitForContainerExit. Gated by EnableShimRebootHandler
// so Stage 2/3 traces still pass even with the shim deployed; when the
// guard is OFF we just fall through to close() / teardown unchanged.
//
// Timing subtlety: ht.init.Wait() returns when the init PROCESS exits,
// but *hcs.System.waitBackground (which parses SystemExitStatus JSON into
// ExitType) runs on the system-level exit notification, a separate HCS
// callback. The two goroutines can complete in either order. Per
// cow.Container.ExitType() contract, the value is only defined AFTER
// WaitChannel() closes — so we must block on it before reading ExitType
// or risk a false negative. Empirical proof: Stage 4 initial run on
// 2026-04-23 read ExitType right after init.Wait() and got empty string
// 100% of the time, despite waitBackground setting "Reboot" ~22ms later.
//
// Stage 4 Sub-step B will replace this log with actual handleReboot logic
// (suppress close/teardown, drive a new CreateComputeSystem). Sub-step A
// is observation-only so we have a stable hook to extend.
rebootPending := false
if ht.c != nil && devguard.IsEnabled(devguard.EnableShimRebootHandler) {
select {
case <-ht.c.WaitChannel():
// waitBackground has finished; ExitType is now reliable.
case <-time.After(5 * time.Second):
log.G(ctx).
WithField("tid", ht.id).
Warn("reboot-v2 Stage 4: timed out waiting for compute system WaitChannel; ExitType may be empty")
}
if exitType := ht.c.ExitType(); exitType == "Reboot" {
rebootPending = true
log.G(ctx).
WithField("tid", ht.id).
WithField("reboot.exit_type", exitType).
Info("reboot-v2: reboot observed; attempting transparent restart")
if err := ht.doHandleReboot(ctx); err != nil {
log.G(ctx).WithError(err).
WithField("tid", ht.id).
Warn("reboot-v2: handleReboot failed; falling through to teardown")
rebootPending = false // restart failed; normal exit semantics apply
} else {
span.AddAttributes(trace.BoolAttribute("reboot.pending", true))
log.G(ctx).
WithField("tid", ht.id).
Info("reboot-v2: transparent restart completed; suppressing teardown")
// IMPORTANT: return WITHOUT calling ht.close(ctx). The task
// continues to live under the new System; closeHost() is not
// invoked so no /tasks/exit event is published. doHandleReboot
// respawned a fresh waitInitExit goroutine before returning,
// so the next in-container reboot is also handled.
return
}
}
}
span.AddAttributes(trace.BoolAttribute("reboot.pending", rebootPending))

// Close the host and event the exit
ht.close(ctx)
}

// doHandleReboot performs the transparent in-place container restart.
// Called from waitInitExit when the container exits with ExitType=Reboot
// (HCS told us via the V1 SystemExitStatus JSON). Old silo is gone but
// its compute-system ID slot is free. Overlay layer and HNS endpoint
// both persist. Steps:
//
// 1. Close the old *hcs.System handle (silo already destructed kernel-side).
// 2. Call hcs.CreateComputeSystem with the cached create document on the
// same ID. Overlay path and namespace GUID in the doc bind to
// persisted kernel-side state automatically.
// 3. Call newSys.Start().
// 4. Open fresh upstream IO pipes via NewUpstreamIO with the cached
// containerd pipe paths. Fall back to headless (nil stdio) if the
// pipes are gone — containerd typically tears them down when the
// shim's client disconnects during the original exit path, and a
// proper reattach protocol needs a containerd-side change.
// 5. Spawn the original init process via cmd.Cmd with ht.taskSpec.Process
// and the (fresh or nil) stdio.
// 6. Reset ht.init's hcsExec state in-place under its sl lock:
// - Point c + p + io at the new System, cmd, and fresh IO
// - Reset state=Running, pid=newPid, exitStatus=255, exitedAt=zero
// - Allocate fresh processDone / exited channels + sync.Once values
// 7. Point ht.c at newSys so task-level operations target it.
// 8. Respawn waitForExit so the new init process's lifecycle is tracked.
// 9. Respawn waitInitExit so a subsequent in-container reboot is also
// handled transparently (reboot loop).
//
// Known gaps (non-blocking for end-to-end demo):
// - Stdio is not visible to containerd after the first restart; requires
// a containerd-side pipe-republish protocol or TaskRestart event type.
// - docker inspect reports the original PID because containerd caches
// it from the TaskCreate event; needs /tasks/start republish or new
// event topic.
// - If CreateComputeSystem or Start fails mid-way, resources are partially
// cleaned up; caller treats error as "fall through to teardown".
func (ht *hcsTask) doHandleReboot(ctx context.Context) error {
oldSys, ok := ht.c.(*hcs.System)
if !ok {
return fmt.Errorf("ht.c is %T, not *hcs.System — cannot recreate", ht.c)
}
doc := oldSys.CreateDocument()
if len(doc) == 0 {
return fmt.Errorf("no cached create document; System not created via CreateComputeSystem")
}
oldExec, ok := ht.init.(*hcsExec)
if !ok {
return fmt.Errorf("ht.init is %T, not *hcsExec — cannot reset", ht.init)
}

log.G(ctx).
WithField("tid", ht.id).
WithField("doc_bytes", len(doc)).
Info("reboot-v2: closing old system handle")
if err := oldSys.Close(); err != nil {
log.G(ctx).WithError(err).Warn("reboot-v2: old system Close failed (proceeding anyway)")
}

newSys, err := hcs.CreateComputeSystem(ctx, ht.id, doc)
if err != nil {
return fmt.Errorf("CreateComputeSystem on same ID failed: %w", err)
}
log.G(ctx).WithField("tid", ht.id).Info("reboot-v2: new System created on same ID")

if err := newSys.Start(ctx); err != nil {
_ = newSys.Terminate(ctx)
_ = newSys.Wait()
_ = newSys.Close()
return fmt.Errorf("newSys.Start failed: %w", err)
}
log.G(ctx).WithField("tid", ht.id).Info("reboot-v2: new System started")

// B3c: try to open fresh upstream IO pipes for the new init. The old
// exec's UpstreamIO was closed by the original init exit path, which
// causes containerd to tear down its server-side pipes too — so
// NewUpstreamIO typically fails with "system cannot find the file
// specified". In that case, fall back to nil stdio and run the new
// init headless. The process still runs and docker sees the container
// as Up; just no stdout/stderr visibility until a proper reattach
// mechanism lands (future work — likely needs a containerd API change
// or a shim-side pipe-republish protocol).
newCmd := &cmd.Cmd{
Host: newSys,
Log: log.G(ctx).WithFields(logrus.Fields{"tid": ht.id, "eid": ht.id, "reboot-v2": "b3b-init"}),
CopyAfterExitTimeout: time.Second,
}
var freshIO cmd.UpstreamIO
if fio, ioErr := cmd.NewUpstreamIO(ctx, ht.id, ht.reqStdout, ht.reqStderr, ht.reqStdin, ht.reqTerminal, ht.ioRetryTimeout); ioErr == nil {
freshIO = fio
newCmd.Stdin = fio.Stdin()
newCmd.Stdout = fio.Stdout()
newCmd.Stderr = fio.Stderr()
log.G(ctx).WithField("tid", ht.id).Info("reboot-v2: fresh upstream IO opened for new init")
} else {
log.G(ctx).
WithField("tid", ht.id).
WithError(ioErr).
Warn("reboot-v2: could not open fresh IO pipes; new init will run headless")
}
if oldExec.isWCOW {
newCmd.Spec = ht.taskSpec.Process
}
if err := newCmd.Start(); err != nil {
if freshIO != nil {
freshIO.Close(ctx)
}
_ = newSys.Terminate(ctx)
_ = newSys.Wait()
_ = newSys.Close()
return fmt.Errorf("new init cmd.Start failed: %w", err)
}
newPid := newCmd.Process.Pid()
log.G(ctx).
WithField("tid", ht.id).
WithField("new.pid", newPid).
Info("reboot-v2: new init process spawned")

// Swap state into the existing hcsExec under its lock, including the
// fresh upstream IO if we got one (nil = headless).
oldExec.sl.Lock()
oldExec.c = newSys
oldExec.p = newCmd
if freshIO != nil {
oldExec.io = freshIO
}
oldExec.pid = newPid
oldExec.state = shimExecStateRunning
oldExec.exitStatus = 255
oldExec.exitedAt = time.Time{}
oldExec.processDone = make(chan struct{})
oldExec.processDoneOnce = sync.Once{}
oldExec.exited = make(chan struct{})
oldExec.exitedOnce = sync.Once{}
oldExec.sl.Unlock()

// Swap task-level container reference.
ht.c = newSys

// Respawn waitForExit so we track the new init process and publish
// TaskExit correctly when it ends. This is what startInternal does
// at the end of a normal Start() — we're replicating that step.
go oldExec.waitForExit()

// B3c reboot loop: respawn waitInitExit so a SECOND in-container
// reboot is also handled transparently. Each successful handleReboot
// spawns a fresh waiter for the next cycle. Normal (non-reboot) exits
// flow through close(ctx) as before.
go ht.waitInitExit()

log.G(ctx).
WithField("tid", ht.id).
WithField("new.pid", newPid).
Info("reboot-v2: task state swapped; container logically still Running; waiting for next exit")
return nil
}

// waitForHostExit waits for the host virtual machine to exit. Once exited
// forcibly exits all additional exec's in this task.
//
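The timing note in waitInitExit is the central correctness rule of this change: ExitType is only defined after WaitChannel() closes. A minimal sketch of that gate as a standalone helper, assuming the cow.Container interface extended below; the helper name and timeout parameter are illustrative, and the PR inlines this logic rather than factoring it out.

import (
	"context"
	"time"

	"github.com/Microsoft/hcsshim/internal/cow"
	"github.com/Microsoft/hcsshim/internal/log"
)

// exitTypeAfterWait blocks until the compute system's exit notification has
// been processed (WaitChannel closes) and only then reads ExitType, avoiding
// the false-negative race described in waitInitExit.
func exitTypeAfterWait(ctx context.Context, c cow.Container, timeout time.Duration) string {
	select {
	case <-c.WaitChannel():
		// system-level exit status has been parsed; ExitType is now reliable
	case <-time.After(timeout):
		log.G(ctx).Warn("timed out waiting for WaitChannel; ExitType may be empty")
	}
	return c.ExitType()
}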
9 changes: 9 additions & 0 deletions internal/cow/cow.go
@@ -96,4 +96,13 @@ type Container interface {
WaitError() error
// Modify sends a request to modify container resources
Modify(ctx context.Context, config interface{}) error
// ExitType returns the parsed SystemExitStatus.ExitType string reported by HCS
// at compute-system exit — "Reboot", "GracefulExit", "UnexpectedExit", etc.
// Empty string before the container has exited (before WaitChannel() closes) or
// when HCS did not send a parseable SystemExitStatus JSON payload.
//
// Container implementations that don't observe HCS exit notifications (fakes,
// UVM wrappers, test containers) return "". Callers should treat empty string
// as "unknown/not a reboot" and fall back to the previous exit-handling logic.
ExitType() string
}
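A minimal sketch of the contract's fallback clause for implementations that never observe HCS exit notifications; the fake type is hypothetical and only illustrates how a test double would satisfy the new method.

// fakeContainer is a hypothetical test double. It never sees HCS exit
// notifications, so per the contract above it always reports "unknown / not
// a reboot", and callers fall back to the pre-existing exit handling.
type fakeContainer struct {
	cow.Container // embedded so a test only overrides the methods it cares about
}

func (f *fakeContainer) ExitType() string { return "" }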
39 changes: 39 additions & 0 deletions internal/devguard/devguard.go
@@ -0,0 +1,39 @@
//go:build windows

// Package devguard reads HKLM\Software\Microsoft\HCS\Dev\Reboot\<Name> DWORDs
// at runtime for the container-reboot-v2 workstream dev matrix.
//
// Behavior: every call opens the registry key, reads the value, closes.
// No caching. On any error, returns false (absent == disabled).
package devguard

import (
"golang.org/x/sys/windows/registry"
)

const guardRoot = `Software\Microsoft\HCS\Dev\Reboot`

// Guard names mirror the HcsDev::Reboot::* accessors on the HCS C++ side.
const (
ForceStopForRestart = "ForceStopForRestart"
ExposeRebootNotification = "ExposeRebootNotification"
PassExitStatusJson = "PassExitStatusJson"
SkipInternalRebootStart = "SkipInternalRebootStart"
EnableShimRebootHandler = "EnableShimRebootHandler"
)

// IsEnabled returns true iff HKLM\guardRoot\<name> exists as a non-zero DWORD.
// Missing key, missing value, wrong type, or access-denied all return false.
func IsEnabled(name string) bool {
k, err := registry.OpenKey(registry.LOCAL_MACHINE, guardRoot, registry.QUERY_VALUE)
if err != nil {
return false
}
defer k.Close()

v, _, err := k.GetIntegerValue(name)
if err != nil {
return false
}
return v != 0
}
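Because the guards are plain HKLM DWORDs, a dev box can flip one with the same registry package devguard already imports. The helper below is illustrative only, is not part of devguard, and must run elevated to write under HKLM.

// enableGuard sets HKLM\Software\Microsoft\HCS\Dev\Reboot\<name> = 1 so the
// next devguard.IsEnabled(name) call returns true. Hypothetical helper for a
// dev setup script, not shipped with this change.
func enableGuard(name string) error {
	k, _, err := registry.CreateKey(registry.LOCAL_MACHINE, `Software\Microsoft\HCS\Dev\Reboot`, registry.SET_VALUE)
	if err != nil {
		return err
	}
	defer k.Close()
	return k.SetDWordValue(name, 1)
}

For example, call enableGuard(devguard.EnableShimRebootHandler) before exercising the Stage 4 path, then delete the value to fall back to the unchanged teardown behavior.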