24 changes: 22 additions & 2 deletions cmd/containerd-shim-runhcs-v1/exec_hcs.go
@@ -515,13 +515,33 @@ func (he *hcsExec) waitForContainerExit() {
defer span.End()
span.AddAttributes(
trace.StringAttribute("tid", he.tid),
trace.StringAttribute("eid", he.id))
trace.StringAttribute("eid", he.id),
// container-reboot-v2 Stage 1 placeholder; Stage 4 flips this when a Reboot notification is observed.
trace.BoolAttribute("reboot.pending", false))

// wait for container or process to exit and clean up resources
select {
case <-he.c.WaitChannel():
// Container exited first. We need to force the process into the exited
// state and cleanup any resources
// state and cleanup any resources.

// container-reboot-v2 Stage 2 observability: surface the ExitType that
// *hcs.System.waitBackground parsed out of SystemExitStatus JSON. When
// the HCS ExposeRebootNotification + PassExitStatusJson guards are on
// and the container ran `shutdown /r`, this logs "Reboot" — which is the
// signal Stage 4's handleReboot will key off instead of running the
// teardown branch below. At Stage 2 we only observe; the teardown runs
// unchanged so the container still dies. Harmless for non-Exited paths
// (empty string) and non-Argon paths (*gcs.Container / *JobContainer
// ExitType() return "" per cow.Container contract).
if exitType := he.c.ExitType(); exitType != "" {
log.G(ctx).
WithField("tid", he.tid).
WithField("eid", he.id).
WithField("reboot.exit_type", exitType).
Info("reboot-v2: container exited with ExitType (no action; Stage 2)")
}

he.sl.Lock()
switch he.state {
case shimExecStateCreated:
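For reference, this is roughly the parsing step the Stage 2 comment above attributes to *hcs.System.waitBackground: pull the ExitType string out of the SystemExitStatus JSON, or fall back to "". The struct shape and field name below are assumptions for illustration only; the real HCS schema type is not part of this diff.

import "encoding/json"

// systemExitStatus is a hypothetical shape for the SystemExitStatus payload
// referenced above; only the field this workstream cares about is shown.
type systemExitStatus struct {
	ExitType string `json:"ExitType"` // e.g. "Reboot", "GracefulExit", "UnexpectedExit"
}

// parseExitType returns the ExitType string, or "" when the payload is
// missing or not parseable, matching the empty-string convention that
// cow.Container.ExitType() documents later in this diff.
func parseExitType(raw []byte) string {
	var s systemExitStatus
	if len(raw) == 0 || json.Unmarshal(raw, &s) != nil {
		return ""
	}
	return s.ExitType
}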
226 changes: 226 additions & 0 deletions cmd/containerd-shim-runhcs-v1/task_hcs.go
@@ -28,6 +28,7 @@ import (
"github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/stats"
"github.com/Microsoft/hcsshim/internal/cmd"
"github.com/Microsoft/hcsshim/internal/cow"
"github.com/Microsoft/hcsshim/internal/devguard"
"github.com/Microsoft/hcsshim/internal/guestpath"
"github.com/Microsoft/hcsshim/internal/hcs"
"github.com/Microsoft/hcsshim/internal/hcs/resourcepaths"
@@ -238,6 +239,10 @@ func newHcsTask(
closed: make(chan struct{}),
taskSpec: s,
ioRetryTimeout: ioRetryTimeout,
reqStdin: req.Stdin,
reqStdout: req.Stdout,
reqStderr: req.Stderr,
reqTerminal: req.Terminal,
}
ht.init = newHcsExec(
ctx,
@@ -344,6 +349,16 @@ type hcsTask struct {

// ioRetryTimeout is the time for how long to try reconnecting to stdio pipes from containerd.
ioRetryTimeout time.Duration

// container-reboot-v2 Stage 4 B3c: stash the original CreateTask stdio
// paths so doHandleReboot can call NewUpstreamIO to obtain fresh pipe
// connections for the new init process. oldExec.io's underlying pipes
// are closed by the original init exit path before we get to restart,
// so reusing them makes the new init blind and deadlocks follow-up ops.
reqStdin string
reqStdout string
reqStderr string
reqTerminal bool
}

func (ht *hcsTask) ID() string {
@@ -626,10 +641,221 @@ func (ht *hcsTask) waitInitExit() {
// Wait for it to exit on its own
ht.init.Wait()

// container-reboot-v2 Stage 4 Sub-step A: detect Reboot at the single
// task-scoped intercept point, independent of the processDone/WaitChannel
// race in hcsExec::waitForContainerExit. Gated by EnableShimRebootHandler
// so Stage 2/3 traces still pass even with the shim deployed; when the
// guard is OFF we just fall through to close() / teardown unchanged.
//
// Timing subtlety: ht.init.Wait() returns when the init PROCESS exits,
// but *hcs.System.waitBackground (which parses SystemExitStatus JSON into
// ExitType) runs on the system-level exit notification, a separate HCS
// callback. The two goroutines can complete in either order. Per
// cow.Container.ExitType() contract, the value is only defined AFTER
// WaitChannel() closes — so we must block on it before reading ExitType
// or risk a false negative. Empirical proof: Stage 4 initial run on
// 2026-04-23 read ExitType right after init.Wait() and got empty string
// 100% of the time, despite waitBackground setting "Reboot" ~22ms later.
//
// Stage 4 Sub-step B will replace this log with actual handleReboot logic
// (suppress close/teardown, drive a new CreateComputeSystem). Sub-step A
// is observation-only so we have a stable hook to extend.
rebootPending := false
if ht.c != nil && devguard.IsEnabled(devguard.EnableShimRebootHandler) {
select {
case <-ht.c.WaitChannel():
// waitBackground has finished; ExitType is now reliable.
case <-time.After(5 * time.Second):
log.G(ctx).
WithField("tid", ht.id).
Warn("reboot-v2 Stage 4: timed out waiting for compute system WaitChannel; ExitType may be empty")
}
if exitType := ht.c.ExitType(); exitType == "Reboot" {
rebootPending = true
log.G(ctx).
WithField("tid", ht.id).
WithField("reboot.exit_type", exitType).
Info("reboot-v2: reboot observed; attempting transparent restart")
if err := ht.doHandleReboot(ctx); err != nil {
log.G(ctx).WithError(err).
WithField("tid", ht.id).
Warn("reboot-v2: handleReboot failed; falling through to teardown")
rebootPending = false // restart failed; normal exit semantics apply
} else {
span.AddAttributes(trace.BoolAttribute("reboot.pending", true))
log.G(ctx).
WithField("tid", ht.id).
Info("reboot-v2: transparent restart completed; suppressing teardown")
// IMPORTANT: return WITHOUT calling ht.close(ctx). The task
// continues to live under the new System; closeHost() is not
// invoked so no /tasks/exit event is published. doHandleReboot
// respawned a fresh waitInitExit goroutine before returning,
// so the next in-container reboot is also handled.
return
}
}
}
span.AddAttributes(trace.BoolAttribute("reboot.pending", rebootPending))

// Close the host and event the exit
ht.close(ctx)
}

// doHandleReboot performs the transparent in-place container restart.
// Called from waitInitExit when the container exits with ExitType=Reboot
// (HCS told us via the V1 SystemExitStatus JSON). Old silo is gone but
// its compute-system ID slot is free. Overlay layer and HNS endpoint
// both persist. Steps:
//
// 1. Close the old *hcs.System handle (silo already destructed kernel-side).
// 2. Call hcs.CreateComputeSystem with the cached create document on the
// same ID. Overlay path and namespace GUID in the doc bind to
// persisted kernel-side state automatically.
// 3. Call newSys.Start().
// 4. Open fresh upstream IO pipes via NewUpstreamIO with the cached
// containerd pipe paths. Fall back to headless (nil stdio) if the
// pipes are gone — containerd typically tears them down when the
// shim's client disconnects during the original exit path, and a
// proper reattach protocol needs a containerd-side change.
// 5. Spawn the original init process via cmd.Cmd with ht.taskSpec.Process
// and the (fresh or nil) stdio.
// 6. Reset ht.init's hcsExec state in-place under its sl lock:
// - Point c + p + io at the new System, cmd, and fresh IO
// - Reset state=Running, pid=newPid, exitStatus=255, exitedAt=zero
// - Allocate fresh processDone / exited channels + sync.Once values
// 7. Point ht.c at newSys so task-level operations target it.
// 8. Respawn waitForExit so the new init process's lifecycle is tracked.
// 9. Respawn waitInitExit so a subsequent in-container reboot is also
// handled transparently (reboot loop).
//
// Known gaps (non-blocking for end-to-end demo):
// - Stdio is not visible to containerd after the first restart; requires
// a containerd-side pipe-republish protocol or TaskRestart event type.
// - docker inspect reports the original PID because containerd caches
// it from the TaskCreate event; needs /tasks/start republish or new
// event topic.
// - If CreateComputeSystem or Start fails mid-way, resources are partially
// cleaned up; caller treats error as "fall through to teardown".
func (ht *hcsTask) doHandleReboot(ctx context.Context) error {
oldSys, ok := ht.c.(*hcs.System)
if !ok {
return fmt.Errorf("ht.c is %T, not *hcs.System — cannot recreate", ht.c)
}
doc := oldSys.CreateDocument()
if len(doc) == 0 {
return fmt.Errorf("no cached create document; System not created via CreateComputeSystem")
}
oldExec, ok := ht.init.(*hcsExec)
if !ok {
return fmt.Errorf("ht.init is %T, not *hcsExec — cannot reset", ht.init)
}

log.G(ctx).
WithField("tid", ht.id).
WithField("doc_bytes", len(doc)).
Info("reboot-v2: closing old system handle")
if err := oldSys.Close(); err != nil {
log.G(ctx).WithError(err).Warn("reboot-v2: old system Close failed (proceeding anyway)")
}

newSys, err := hcs.CreateComputeSystem(ctx, ht.id, doc)
if err != nil {
return fmt.Errorf("CreateComputeSystem on same ID failed: %w", err)
}
log.G(ctx).WithField("tid", ht.id).Info("reboot-v2: new System created on same ID")

if err := newSys.Start(ctx); err != nil {
_ = newSys.Terminate(ctx)
_ = newSys.Wait()
_ = newSys.Close()
return fmt.Errorf("newSys.Start failed: %w", err)
}
log.G(ctx).WithField("tid", ht.id).Info("reboot-v2: new System started")

// B3c: try to open fresh upstream IO pipes for the new init. The old
// exec's UpstreamIO was closed by the original init exit path, which
// causes containerd to tear down its server-side pipes too — so
// NewUpstreamIO typically fails with "system cannot find the file
// specified". In that case, fall back to nil stdio and run the new
// init headless. The process still runs and docker sees the container
// as Up; just no stdout/stderr visibility until a proper reattach
// mechanism lands (future work — likely needs a containerd API change
// or a shim-side pipe-republish protocol).
newCmd := &cmd.Cmd{
Host: newSys,
Log: log.G(ctx).WithFields(logrus.Fields{"tid": ht.id, "eid": ht.id, "reboot-v2": "b3b-init"}),
CopyAfterExitTimeout: time.Second,
}
var freshIO cmd.UpstreamIO
if fio, ioErr := cmd.NewUpstreamIO(ctx, ht.id, ht.reqStdout, ht.reqStderr, ht.reqStdin, ht.reqTerminal, ht.ioRetryTimeout); ioErr == nil {
freshIO = fio
newCmd.Stdin = fio.Stdin()
newCmd.Stdout = fio.Stdout()
newCmd.Stderr = fio.Stderr()
log.G(ctx).WithField("tid", ht.id).Info("reboot-v2: fresh upstream IO opened for new init")
} else {
log.G(ctx).
WithField("tid", ht.id).
WithError(ioErr).
Warn("reboot-v2: could not open fresh IO pipes; new init will run headless")
}
if oldExec.isWCOW {
newCmd.Spec = ht.taskSpec.Process
}
if err := newCmd.Start(); err != nil {
if freshIO != nil {
freshIO.Close(ctx)
}
_ = newSys.Terminate(ctx)
_ = newSys.Wait()
_ = newSys.Close()
return fmt.Errorf("new init cmd.Start failed: %w", err)
}
newPid := newCmd.Process.Pid()
log.G(ctx).
WithField("tid", ht.id).
WithField("new.pid", newPid).
Info("reboot-v2: new init process spawned")

// Swap state into the existing hcsExec under its lock, including the
// fresh upstream IO if we got one (nil = headless).
oldExec.sl.Lock()
oldExec.c = newSys
oldExec.p = newCmd
if freshIO != nil {
oldExec.io = freshIO
}
oldExec.pid = newPid
oldExec.state = shimExecStateRunning
oldExec.exitStatus = 255
oldExec.exitedAt = time.Time{}
oldExec.processDone = make(chan struct{})
oldExec.processDoneOnce = sync.Once{}
oldExec.exited = make(chan struct{})
oldExec.exitedOnce = sync.Once{}
oldExec.sl.Unlock()

// Swap task-level container reference.
ht.c = newSys

// Respawn waitForExit so we track the new init process and publish
// TaskExit correctly when it ends. This is what startInternal does
// at the end of a normal Start() — we're replicating that step.
go oldExec.waitForExit()

// B3c reboot loop: respawn waitInitExit so a SECOND in-container
// reboot is also handled transparently. Each successful handleReboot
// spawns a fresh waiter for the next cycle. Normal (non-reboot) exits
// flow through close(ctx) as before.
go ht.waitInitExit()

log.G(ctx).
WithField("tid", ht.id).
WithField("new.pid", newPid).
Info("reboot-v2: task state swapped; container logically still Running; waiting for next exit")
return nil
}

// waitForHostExit waits for the host virtual machine to exit. Once exited
// forcibly exits all additional exec's in this task.
//
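The timing note in waitInitExit is the central correctness rule of this change: ExitType is only defined after WaitChannel() closes. A minimal sketch of that gate as a standalone helper, assuming the cow.Container interface extended below; the helper name and timeout parameter are illustrative, and the PR inlines this logic rather than factoring it out.

import (
	"context"
	"time"

	"github.com/Microsoft/hcsshim/internal/cow"
	"github.com/Microsoft/hcsshim/internal/log"
)

// exitTypeAfterWait blocks until the compute system's exit notification has
// been processed (WaitChannel closes) and only then reads ExitType, avoiding
// the false-negative race described in waitInitExit.
func exitTypeAfterWait(ctx context.Context, c cow.Container, timeout time.Duration) string {
	select {
	case <-c.WaitChannel():
		// system-level exit status has been parsed; ExitType is now reliable
	case <-time.After(timeout):
		log.G(ctx).Warn("timed out waiting for WaitChannel; ExitType may be empty")
	}
	return c.ExitType()
}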
9 changes: 9 additions & 0 deletions internal/cow/cow.go
@@ -96,4 +96,13 @@ type Container interface {
WaitError() error
// Modify sends a request to modify container resources
Modify(ctx context.Context, config interface{}) error
// ExitType returns the parsed SystemExitStatus.ExitType string reported by HCS
// at compute-system exit — "Reboot", "GracefulExit", "UnexpectedExit", etc.
// Empty string before the container has exited (before WaitChannel() closes) or
// when HCS did not send a parseable SystemExitStatus JSON payload.
//
// Container implementations that don't observe HCS exit notifications (fakes,
// UVM wrappers, test containers) return "". Callers should treat empty string
// as "unknown/not a reboot" and fall back to the previous exit-handling logic.
ExitType() string
}
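A minimal sketch of the contract's fallback clause for implementations that never observe HCS exit notifications; the fake type is hypothetical and only illustrates how a test double would satisfy the new method.

// fakeContainer is a hypothetical test double. It never sees HCS exit
// notifications, so per the contract above it always reports "unknown / not
// a reboot", and callers fall back to the pre-existing exit handling.
type fakeContainer struct {
	cow.Container // embedded so a test only overrides the methods it cares about
}

func (f *fakeContainer) ExitType() string { return "" }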
39 changes: 39 additions & 0 deletions internal/devguard/devguard.go
@@ -0,0 +1,39 @@
//go:build windows

// Package devguard reads HKLM\Software\Microsoft\HCS\Dev\Reboot\<Name> DWORDs
// at runtime for the container-reboot-v2 workstream dev matrix.
//
// Behavior: every call opens the registry key, reads the value, closes.
// No caching. On any error, returns false (absent == disabled).
package devguard

import (
"golang.org/x/sys/windows/registry"
)

const guardRoot = `Software\Microsoft\HCS\Dev\Reboot`

// Guard names mirror the HcsDev::Reboot::* accessors on the HCS C++ side.
const (
ForceStopForRestart = "ForceStopForRestart"
ExposeRebootNotification = "ExposeRebootNotification"
PassExitStatusJson = "PassExitStatusJson"
SkipInternalRebootStart = "SkipInternalRebootStart"
EnableShimRebootHandler = "EnableShimRebootHandler"
)

// IsEnabled returns true iff HKLM\guardRoot\<name> exists as a non-zero DWORD.
// Missing key, missing value, wrong type, or access-denied all return false.
func IsEnabled(name string) bool {
k, err := registry.OpenKey(registry.LOCAL_MACHINE, guardRoot, registry.QUERY_VALUE)
if err != nil {
return false
}
defer k.Close()

v, _, err := k.GetIntegerValue(name)
if err != nil {
return false
}
return v != 0
}
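Because the guards are plain HKLM DWORDs, a dev box can flip one with the same registry package devguard already imports. The helper below is illustrative only, is not part of devguard, and must run elevated to write under HKLM.

// enableGuard sets HKLM\Software\Microsoft\HCS\Dev\Reboot\<name> = 1 so the
// next devguard.IsEnabled(name) call returns true. Hypothetical helper for a
// dev setup script, not shipped with this change.
func enableGuard(name string) error {
	k, _, err := registry.CreateKey(registry.LOCAL_MACHINE, `Software\Microsoft\HCS\Dev\Reboot`, registry.SET_VALUE)
	if err != nil {
		return err
	}
	defer k.Close()
	return k.SetDWordValue(name, 1)
}

For example, call enableGuard(devguard.EnableShimRebootHandler) before exercising the Stage 4 path, then delete the value to fall back to the unchanged teardown behavior.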