diff --git a/cmd/containerd-shim-runhcs-v1/exec_hcs.go b/cmd/containerd-shim-runhcs-v1/exec_hcs.go
index 034e554802..9ff7b2875a 100644
--- a/cmd/containerd-shim-runhcs-v1/exec_hcs.go
+++ b/cmd/containerd-shim-runhcs-v1/exec_hcs.go
@@ -515,13 +515,33 @@ func (he *hcsExec) waitForContainerExit() {
 	defer span.End()
 	span.AddAttributes(
 		trace.StringAttribute("tid", he.tid),
-		trace.StringAttribute("eid", he.id))
+		trace.StringAttribute("eid", he.id),
+		// container-reboot-v2 Stage 1 placeholder; Stage 4 flips this when the Reboot notification is observed.
+		trace.BoolAttribute("reboot.pending", false))
 
 	// wait for container or process to exit and clean up resources
 	select {
 	case <-he.c.WaitChannel():
 		// Container exited first. We need to force the process into the exited
-		// state and cleanup any resources
+		// state and cleanup any resources.
+
+		// container-reboot-v2 Stage 2 observability: surface the ExitType that
+		// *hcs.System.waitBackground parsed out of the SystemExitStatus JSON. When
+		// the HCS ExposeRebootNotification + PassExitStatusJson guards are on
+		// and the container ran `shutdown /r`, this logs "Reboot" — which is the
+		// signal Stage 4's handleReboot will key off instead of running the
+		// teardown branch below. At Stage 2 we only observe; the teardown runs
+		// unchanged so the container still dies. Harmless for non-Exited paths
+		// (empty string) and non-Argon paths (*gcs.Container / *JobContainer
+		// ExitType() return "" per the cow.Container contract).
+		if exitType := he.c.ExitType(); exitType != "" {
+			log.G(ctx).
+				WithField("tid", he.tid).
+				WithField("eid", he.id).
+				WithField("reboot.exit_type", exitType).
+				Info("reboot-v2: container exited with ExitType (no action; Stage 2)")
+		}
+
 		he.sl.Lock()
 		switch he.state {
 		case shimExecStateCreated:
diff --git a/cmd/containerd-shim-runhcs-v1/task_hcs.go b/cmd/containerd-shim-runhcs-v1/task_hcs.go
index 9fbb1faf35..84008384c8 100644
--- a/cmd/containerd-shim-runhcs-v1/task_hcs.go
+++ b/cmd/containerd-shim-runhcs-v1/task_hcs.go
@@ -28,6 +28,7 @@ import (
 	"github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/stats"
 	"github.com/Microsoft/hcsshim/internal/cmd"
 	"github.com/Microsoft/hcsshim/internal/cow"
+	"github.com/Microsoft/hcsshim/internal/devguard"
 	"github.com/Microsoft/hcsshim/internal/guestpath"
 	"github.com/Microsoft/hcsshim/internal/hcs"
 	"github.com/Microsoft/hcsshim/internal/hcs/resourcepaths"
@@ -238,6 +239,10 @@ func newHcsTask(
 		closed:         make(chan struct{}),
 		taskSpec:       s,
 		ioRetryTimeout: ioRetryTimeout,
+		reqStdin:       req.Stdin,
+		reqStdout:      req.Stdout,
+		reqStderr:      req.Stderr,
+		reqTerminal:    req.Terminal,
 	}
 	ht.init = newHcsExec(
 		ctx,
@@ -344,6 +349,16 @@ type hcsTask struct {
 	// ioRetryTimeout is the time for how long to try reconnecting to stdio pipes from containerd.
 	ioRetryTimeout time.Duration
+
+	// container-reboot-v2 Stage 4 B3c: stash the original CreateTask stdio
+	// paths so doHandleReboot can call NewUpstreamIO to obtain fresh pipe
+	// connections for the new init process. oldExec.io's underlying pipes
+	// are closed by the original init exit path before we get to restart,
+	// so reusing them makes the new init blind and deadlocks follow-up ops.
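+	// (These are the verbatim CreateTaskRequest values, typically named-pipe
+	// paths of the form `\\.\pipe\...` on Windows; they are only re-opened if
+	// a reboot restart actually happens, and the normal exit path never reads
+	// them.)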
+ reqStdin string + reqStdout string + reqStderr string + reqTerminal bool } func (ht *hcsTask) ID() string { @@ -626,10 +641,221 @@ func (ht *hcsTask) waitInitExit() { // Wait for it to exit on its own ht.init.Wait() + // container-reboot-v2 Stage 4 Sub-step A: detect Reboot at the single + // task-scoped intercept point, independent of the processDone/WaitChannel + // race in hcsExec::waitForContainerExit. Gated by EnableShimRebootHandler + // so Stage 2/3 traces still pass even with the shim deployed; when the + // guard is OFF we just fall through to close() / teardown unchanged. + // + // Timing subtlety: ht.init.Wait() returns when the init PROCESS exits, + // but *hcs.System.waitBackground (which parses SystemExitStatus JSON into + // ExitType) runs on the system-level exit notification, a separate HCS + // callback. The two goroutines can complete in either order. Per + // cow.Container.ExitType() contract, the value is only defined AFTER + // WaitChannel() closes — so we must block on it before reading ExitType + // or risk a false negative. Empirical proof: Stage 4 initial run on + // 2026-04-23 read ExitType right after init.Wait() and got empty string + // 100% of the time, despite waitBackground setting "Reboot" ~22ms later. + // + // Stage 4 Sub-step B will replace this log with actual handleReboot logic + // (suppress close/teardown, drive a new CreateComputeSystem). Sub-step A + // is observation-only so we have a stable hook to extend. + rebootPending := false + if ht.c != nil && devguard.IsEnabled(devguard.EnableShimRebootHandler) { + select { + case <-ht.c.WaitChannel(): + // waitBackground has finished; ExitType is now reliable. + case <-time.After(5 * time.Second): + log.G(ctx). + WithField("tid", ht.id). + Warn("reboot-v2 Stage 4: timed out waiting for compute system WaitChannel; ExitType may be empty") + } + if exitType := ht.c.ExitType(); exitType == "Reboot" { + rebootPending = true + log.G(ctx). + WithField("tid", ht.id). + WithField("reboot.exit_type", exitType). + Info("reboot-v2: reboot observed; attempting transparent restart") + if err := ht.doHandleReboot(ctx); err != nil { + log.G(ctx).WithError(err). + WithField("tid", ht.id). + Warn("reboot-v2: handleReboot failed; falling through to teardown") + rebootPending = false // restart failed; normal exit semantics apply + } else { + span.AddAttributes(trace.BoolAttribute("reboot.pending", true)) + log.G(ctx). + WithField("tid", ht.id). + Info("reboot-v2: transparent restart completed; suppressing teardown") + // IMPORTANT: return WITHOUT calling ht.close(ctx). The task + // continues to live under the new System; closeHost() is not + // invoked so no /tasks/exit event is published. doHandleReboot + // respawned a fresh waitInitExit goroutine before returning, + // so the next in-container reboot is also handled. + return + } + } + } + span.AddAttributes(trace.BoolAttribute("reboot.pending", rebootPending)) + // Close the host and event the exit ht.close(ctx) } +// doHandleReboot performs the transparent in-place container restart. +// Called from waitInitExit when the container exits with ExitType=Reboot +// (HCS told us via the V1 SystemExitStatus JSON). Old silo is gone but +// its compute-system ID slot is free. Overlay layer and HNS endpoint +// both persist. Steps: +// +// 1. Close the old *hcs.System handle (silo already destructed kernel-side). +// 2. Call hcs.CreateComputeSystem with the cached create document on the +// same ID. 
Overlay path and namespace GUID in the doc bind to +// persisted kernel-side state automatically. +// 3. Call newSys.Start(). +// 4. Open fresh upstream IO pipes via NewUpstreamIO with the cached +// containerd pipe paths. Fall back to headless (nil stdio) if the +// pipes are gone — containerd typically tears them down when the +// shim's client disconnects during the original exit path, and a +// proper reattach protocol needs a containerd-side change. +// 5. Spawn the original init process via cmd.Cmd with ht.taskSpec.Process +// and the (fresh or nil) stdio. +// 6. Reset ht.init's hcsExec state in-place under its sl lock: +// - Point c + p + io at the new System, cmd, and fresh IO +// - Reset state=Running, pid=newPid, exitStatus=255, exitedAt=zero +// - Allocate fresh processDone / exited channels + sync.Once values +// 7. Point ht.c at newSys so task-level operations target it. +// 8. Respawn waitForExit so the new init process's lifecycle is tracked. +// 9. Respawn waitInitExit so a subsequent in-container reboot is also +// handled transparently (reboot loop). +// +// Known gaps (non-blocking for end-to-end demo): +// - Stdio is not visible to containerd after the first restart; requires +// a containerd-side pipe-republish protocol or TaskRestart event type. +// - docker inspect reports the original PID because containerd caches +// it from the TaskCreate event; needs /tasks/start republish or new +// event topic. +// - If CreateComputeSystem or Start fails mid-way, resources are partially +// cleaned up; caller treats error as "fall through to teardown". +func (ht *hcsTask) doHandleReboot(ctx context.Context) error { + oldSys, ok := ht.c.(*hcs.System) + if !ok { + return fmt.Errorf("ht.c is %T, not *hcs.System — cannot recreate", ht.c) + } + doc := oldSys.CreateDocument() + if len(doc) == 0 { + return fmt.Errorf("no cached create document; System not created via CreateComputeSystem") + } + oldExec, ok := ht.init.(*hcsExec) + if !ok { + return fmt.Errorf("ht.init is %T, not *hcsExec — cannot reset", ht.init) + } + + log.G(ctx). + WithField("tid", ht.id). + WithField("doc_bytes", len(doc)). + Info("reboot-v2: closing old system handle") + if err := oldSys.Close(); err != nil { + log.G(ctx).WithError(err).Warn("reboot-v2: old system Close failed (proceeding anyway)") + } + + newSys, err := hcs.CreateComputeSystem(ctx, ht.id, doc) + if err != nil { + return fmt.Errorf("CreateComputeSystem on same ID failed: %w", err) + } + log.G(ctx).WithField("tid", ht.id).Info("reboot-v2: new System created on same ID") + + if err := newSys.Start(ctx); err != nil { + _ = newSys.Terminate(ctx) + _ = newSys.Wait() + _ = newSys.Close() + return fmt.Errorf("newSys.Start failed: %w", err) + } + log.G(ctx).WithField("tid", ht.id).Info("reboot-v2: new System started") + + // B3c: try to open fresh upstream IO pipes for the new init. The old + // exec's UpstreamIO was closed by the original init exit path, which + // causes containerd to tear down its server-side pipes too — so + // NewUpstreamIO typically fails with "system cannot find the file + // specified". In that case, fall back to nil stdio and run the new + // init headless. The process still runs and docker sees the container + // as Up; just no stdout/stderr visibility until a proper reattach + // mechanism lands (future work — likely needs a containerd API change + // or a shim-side pipe-republish protocol). 
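+	// Argument-order note for the NewUpstreamIO call below: the helper takes
+	// stdout and stderr ahead of stdin, so the stashed req* fields are passed
+	// as (stdout, stderr, stdin, terminal) rather than in CreateTaskRequest
+	// field order.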
+ newCmd := &cmd.Cmd{ + Host: newSys, + Log: log.G(ctx).WithFields(logrus.Fields{"tid": ht.id, "eid": ht.id, "reboot-v2": "b3b-init"}), + CopyAfterExitTimeout: time.Second, + } + var freshIO cmd.UpstreamIO + if fio, ioErr := cmd.NewUpstreamIO(ctx, ht.id, ht.reqStdout, ht.reqStderr, ht.reqStdin, ht.reqTerminal, ht.ioRetryTimeout); ioErr == nil { + freshIO = fio + newCmd.Stdin = fio.Stdin() + newCmd.Stdout = fio.Stdout() + newCmd.Stderr = fio.Stderr() + log.G(ctx).WithField("tid", ht.id).Info("reboot-v2: fresh upstream IO opened for new init") + } else { + log.G(ctx). + WithField("tid", ht.id). + WithError(ioErr). + Warn("reboot-v2: could not open fresh IO pipes; new init will run headless") + } + if oldExec.isWCOW { + newCmd.Spec = ht.taskSpec.Process + } + if err := newCmd.Start(); err != nil { + if freshIO != nil { + freshIO.Close(ctx) + } + _ = newSys.Terminate(ctx) + _ = newSys.Wait() + _ = newSys.Close() + return fmt.Errorf("new init cmd.Start failed: %w", err) + } + newPid := newCmd.Process.Pid() + log.G(ctx). + WithField("tid", ht.id). + WithField("new.pid", newPid). + Info("reboot-v2: new init process spawned") + + // Swap state into the existing hcsExec under its lock, including the + // fresh upstream IO if we got one (nil = headless). + oldExec.sl.Lock() + oldExec.c = newSys + oldExec.p = newCmd + if freshIO != nil { + oldExec.io = freshIO + } + oldExec.pid = newPid + oldExec.state = shimExecStateRunning + oldExec.exitStatus = 255 + oldExec.exitedAt = time.Time{} + oldExec.processDone = make(chan struct{}) + oldExec.processDoneOnce = sync.Once{} + oldExec.exited = make(chan struct{}) + oldExec.exitedOnce = sync.Once{} + oldExec.sl.Unlock() + + // Swap task-level container reference. + ht.c = newSys + + // Respawn waitForExit so we track the new init process and publish + // TaskExit correctly when it ends. This is what startInternal does + // at the end of a normal Start() — we're replicating that step. + go oldExec.waitForExit() + + // B3c reboot loop: respawn waitInitExit so a SECOND in-container + // reboot is also handled transparently. Each successful handleReboot + // spawns a fresh waiter for the next cycle. Normal (non-reboot) exits + // flow through close(ctx) as before. + go ht.waitInitExit() + + log.G(ctx). + WithField("tid", ht.id). + WithField("new.pid", newPid). + Info("reboot-v2: task state swapped; container logically still Running; waiting for next exit") + return nil +} + // waitForHostExit waits for the host virtual machine to exit. Once exited // forcibly exits all additional exec's in this task. // diff --git a/internal/cow/cow.go b/internal/cow/cow.go index b60cd383b6..8aaf416db0 100644 --- a/internal/cow/cow.go +++ b/internal/cow/cow.go @@ -96,4 +96,13 @@ type Container interface { WaitError() error // Modify sends a request to modify container resources Modify(ctx context.Context, config interface{}) error + // ExitType returns the parsed SystemExitStatus.ExitType string reported by HCS + // at compute-system exit — "Reboot", "GracefulExit", "UnexpectedExit", etc. + // Empty string before the container has exited (before WaitChannel() closes) or + // when HCS did not send a parseable SystemExitStatus JSON payload. + // + // Container implementations that don't observe HCS exit notifications (fakes, + // UVM wrappers, test containers) return "". Callers should treat empty string + // as "unknown/not a reboot" and fall back to the previous exit-handling logic. 
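	//
	// Intended caller pattern (a sketch of what waitInitExit does):
	//
	//	<-c.WaitChannel()            // ExitType is undefined before this returns
	//	if c.ExitType() == "Reboot" {
	//		// reroute to reboot handling instead of normal teardown
	//	}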
+ ExitType() string } diff --git a/internal/devguard/devguard.go b/internal/devguard/devguard.go new file mode 100644 index 0000000000..905d8ea181 --- /dev/null +++ b/internal/devguard/devguard.go @@ -0,0 +1,39 @@ +//go:build windows + +// Package devguard reads HKLM\Software\Microsoft\HCS\Dev\Reboot\ DWORDs +// at runtime for the container-reboot-v2 workstream dev matrix. +// +// Behavior: every call opens the registry key, reads the value, closes. +// No caching. On any error, returns false (absent == disabled). +package devguard + +import ( + "golang.org/x/sys/windows/registry" +) + +const guardRoot = `Software\Microsoft\HCS\Dev\Reboot` + +// Guard names mirror the HcsDev::Reboot::* accessors on the HCS C++ side. +const ( + ForceStopForRestart = "ForceStopForRestart" + ExposeRebootNotification = "ExposeRebootNotification" + PassExitStatusJson = "PassExitStatusJson" + SkipInternalRebootStart = "SkipInternalRebootStart" + EnableShimRebootHandler = "EnableShimRebootHandler" +) + +// IsEnabled returns true iff HKLM\guardRoot\ exists as a non-zero DWORD. +// Missing key, missing value, wrong type, or access-denied all return false. +func IsEnabled(name string) bool { + k, err := registry.OpenKey(registry.LOCAL_MACHINE, guardRoot, registry.QUERY_VALUE) + if err != nil { + return false + } + defer k.Close() + + v, _, err := k.GetIntegerValue(name) + if err != nil { + return false + } + return v != 0 +} diff --git a/internal/devguard/devguard_test.go b/internal/devguard/devguard_test.go new file mode 100644 index 0000000000..dc88ab4b0f --- /dev/null +++ b/internal/devguard/devguard_test.go @@ -0,0 +1,56 @@ +//go:build windows + +package devguard + +import ( + "testing" + + "golang.org/x/sys/windows/registry" +) + +func setGuard(t *testing.T, name string, value uint32) { + t.Helper() + k, _, err := registry.CreateKey(registry.LOCAL_MACHINE, + `Software\Microsoft\HCS\Dev\Reboot`, registry.WRITE) + if err != nil { + t.Fatalf("CreateKey: %v", err) + } + defer k.Close() + if err := k.SetDWordValue(name, value); err != nil { + t.Fatalf("SetDWordValue: %v", err) + } +} + +func clearGuard(t *testing.T, name string) { + t.Helper() + k, err := registry.OpenKey(registry.LOCAL_MACHINE, + `Software\Microsoft\HCS\Dev\Reboot`, registry.WRITE) + if err != nil { + return + } + defer k.Close() + _ = k.DeleteValue(name) +} + +func TestIsEnabled_MissingKey_ReturnsFalse(t *testing.T) { + clearGuard(t, "TestGuardA") + if IsEnabled("TestGuardA") { + t.Fatal("expected false for missing key") + } +} + +func TestIsEnabled_ZeroValue_ReturnsFalse(t *testing.T) { + setGuard(t, "TestGuardB", 0) + defer clearGuard(t, "TestGuardB") + if IsEnabled("TestGuardB") { + t.Fatal("expected false for value=0") + } +} + +func TestIsEnabled_NonZeroValue_ReturnsTrue(t *testing.T) { + setGuard(t, "TestGuardC", 1) + defer clearGuard(t, "TestGuardC") + if !IsEnabled("TestGuardC") { + t.Fatal("expected true for value=1") + } +} diff --git a/internal/gcs/container.go b/internal/gcs/container.go index 549abd35a2..008e5106a5 100644 --- a/internal/gcs/container.go +++ b/internal/gcs/container.go @@ -241,6 +241,15 @@ func (c *Container) WaitError() error { return c.waitError } +// ExitType returns "" — the guest connection path doesn't observe HCS +// SystemExitStatus notifications (it talks to the LCOW guest directly), so the +// cow.Container.ExitType contract of "empty string means unknown/fallback" is +// the correct behavior here. 
container-reboot-v2 is Argon-only (process-isolated +// Windows Server containers) which go through *hcs.System, not *gcs.Container. +func (c *Container) ExitType() string { + return "" +} + // Wait waits for the container to terminate (or Close to be called, or the // guest connection to terminate). func (c *Container) Wait() error { diff --git a/internal/hcs/callback.go b/internal/hcs/callback.go index 7b27173c3a..ef1c73ac6f 100644 --- a/internal/hcs/callback.go +++ b/internal/hcs/callback.go @@ -6,6 +6,8 @@ import ( "fmt" "sync" "syscall" + "unicode/utf16" + "unsafe" "github.com/Microsoft/hcsshim/internal/interop" "github.com/Microsoft/hcsshim/internal/logfields" @@ -87,7 +89,18 @@ func (hn hcsNotification) String() string { } } -type notificationChannel chan error +// notificationPayload carries both the error code and the raw EventData +// string that accompanied the HCS notification. Prior to container-reboot-v2 +// the channel was just `chan error`, which silently discarded the +// notificationData pointer — so hcsshim couldn't observe the +// SystemExitStatus JSON (and therefore couldn't see ExitType=Reboot). +// Callers that only care about err can ignore data. +type notificationPayload struct { + err error + data string +} + +type notificationChannel chan notificationPayload type notificationWatcherContext struct { channels notificationChannels @@ -133,9 +146,12 @@ func closeChannels(channels notificationChannels) { } func notificationWatcher(notificationType hcsNotification, callbackNumber uintptr, notificationStatus uintptr, notificationData *uint16) uintptr { - var result error + var payload notificationPayload if int32(notificationStatus) < 0 { - result = interop.Win32FromHresult(notificationStatus) + payload.err = interop.Win32FromHresult(notificationStatus) + } + if notificationData != nil { + payload.data = utf16PtrToString(notificationData) } callbackMapLock.RLock() @@ -156,8 +172,27 @@ func notificationWatcher(notificationType hcsNotification, callbackNumber uintpt log.Debug("HCS notification") if channel, ok := context.channels[notificationType]; ok { - channel <- result + channel <- payload } return 0 } + +// utf16PtrToString materializes a null-terminated UTF-16 pointer (as the +// Win32 HCS callback gives us) into a Go string. Returns "" on nil input. +// Walks the pointer two bytes at a time until it hits NUL; the caller owns +// neither the pointer nor its backing memory so we must copy immediately. 
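+// (For reference: golang.org/x/sys/windows provides an equivalent
+// UTF16PtrToString helper; the local copy below simply keeps this change
+// self-contained.)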
+func utf16PtrToString(p *uint16) string { + if p == nil { + return "" + } + var units []uint16 + for addr := uintptr(unsafe.Pointer(p)); ; addr += 2 { + c := *(*uint16)(unsafe.Pointer(addr)) + if c == 0 { + break + } + units = append(units, c) + } + return string(utf16.Decode(units)) +} diff --git a/internal/hcs/callback_test.go b/internal/hcs/callback_test.go new file mode 100644 index 0000000000..88c6c8a3a4 --- /dev/null +++ b/internal/hcs/callback_test.go @@ -0,0 +1,143 @@ +//go:build windows + +package hcs + +import ( + "strings" + "syscall" + "testing" + "unsafe" +) + +func TestParseExitType_Reboot(t *testing.T) { + et, err := parseExitType(`{"Status":0,"ExitType":"Reboot"}`) + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if et != "Reboot" { + t.Fatalf("got %q want Reboot", et) + } +} + +func TestParseExitType_GracefulExit(t *testing.T) { + et, err := parseExitType(`{"Status":0,"ExitType":"GracefulExit"}`) + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if et != "GracefulExit" { + t.Fatalf("got %q want GracefulExit", et) + } +} + +func TestParseExitType_Empty(t *testing.T) { + et, err := parseExitType("") + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if et != "" { + t.Fatalf("got %q want empty", et) + } +} + +func TestParseExitType_Malformed(t *testing.T) { + _, err := parseExitType(`{not json`) + if err == nil { + t.Fatal("expected error on malformed JSON") + } + if !strings.Contains(err.Error(), "invalid") && !strings.Contains(err.Error(), "json") { + t.Logf("non-canonical error (still OK): %v", err) + } +} + +func TestParseExitType_NoExitTypeField(t *testing.T) { + // Older HCS builds may send SystemExitStatus without the ExitType field. + // The parse shouldn't fail, just return "". + et, err := parseExitType(`{"Status":0}`) + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if et != "" { + t.Fatalf("got %q want empty", et) + } +} + +// TestNotificationWatcher_DeliversDataAndError verifies that notificationWatcher +// routes both the error and the raw notificationData payload to the channel that +// the watcher goroutine reads. This is the plumbing that lets hcsshim observe the +// SystemExitStatus JSON carried by an HcsEventSystemExited notification — it's +// how ExitType=Reboot reaches the shim-side reboot handler in container-reboot-v2. 
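+//
+// Note: the synchronous notificationWatcher call plus the select/default read
+// below only works because the channels newSystemChannels allocates are
+// buffered; if that ever changes, these tests will need a reader goroutine.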
+func TestNotificationWatcher_DeliversDataAndError(t *testing.T) {
+	const callbackNumber uintptr = 0xdeadbeef
+	ctx := &notificationWatcherContext{
+		channels: newSystemChannels(),
+		systemID: "TestNotificationWatcher_DeliversDataAndError",
+	}
+	callbackMapLock.Lock()
+	callbackMap[callbackNumber] = ctx
+	callbackMapLock.Unlock()
+	t.Cleanup(func() {
+		callbackMapLock.Lock()
+		delete(callbackMap, callbackNumber)
+		callbackMapLock.Unlock()
+	})
+
+	wantData := `{"Status":0,"ExitType":"Reboot"}`
+	u16, err := syscall.UTF16FromString(wantData)
+	if err != nil {
+		t.Fatalf("UTF16FromString: %v", err)
+	}
+	ptr := (*uint16)(unsafe.Pointer(&u16[0]))
+
+	notificationWatcher(hcsNotificationSystemExited, callbackNumber, 0, ptr)
+
+	select {
+	case p, ok := <-ctx.channels[hcsNotificationSystemExited]:
+		if !ok {
+			t.Fatal("channel closed before payload delivered")
+		}
+		if p.err != nil {
+			t.Fatalf("unexpected err: %v", p.err)
+		}
+		if p.data != wantData {
+			t.Fatalf("payload data = %q, want %q", p.data, wantData)
+		}
+	default:
+		t.Fatal("no payload delivered on channel")
+	}
+}
+
+// TestNotificationWatcher_NilDataYieldsEmptyString covers the common case of a
+// notification without event data (anything other than HcsEventSystemExited).
+// The watcher must tolerate notificationData==nil and deliver payload.data == "".
+func TestNotificationWatcher_NilDataYieldsEmptyString(t *testing.T) {
+	const callbackNumber uintptr = 0xdeadbef0
+	ctx := &notificationWatcherContext{
+		channels: newSystemChannels(),
+		systemID: "TestNotificationWatcher_NilDataYieldsEmptyString",
+	}
+	callbackMapLock.Lock()
+	callbackMap[callbackNumber] = ctx
+	callbackMapLock.Unlock()
+	t.Cleanup(func() {
+		callbackMapLock.Lock()
+		delete(callbackMap, callbackNumber)
+		callbackMapLock.Unlock()
+	})
+
+	notificationWatcher(hcsNotificationSystemStartCompleted, callbackNumber, 0, nil)
+
+	select {
+	case p, ok := <-ctx.channels[hcsNotificationSystemStartCompleted]:
+		if !ok {
+			t.Fatal("channel closed before payload delivered")
+		}
+		if p.err != nil {
+			t.Fatalf("unexpected err: %v", p.err)
+		}
+		if p.data != "" {
+			t.Fatalf("payload data = %q, want empty", p.data)
+		}
+	default:
+		t.Fatal("no payload delivered on channel")
+	}
+}
diff --git a/internal/hcs/exitstatus.go b/internal/hcs/exitstatus.go
new file mode 100644
index 0000000000..aae5e68b69
--- /dev/null
+++ b/internal/hcs/exitstatus.go
@@ -0,0 +1,34 @@
+//go:build windows
+
+package hcs
+
+import (
+	"encoding/json"
+)
+
+// systemExitStatus mirrors the HCS external schema for
+// HcsEventSystemExited's EventData payload. The server (vmcompute.exe) serializes
+// Schema::Responses::System::SystemExitStatus into JSON; the shim parses it back
+// here. We care about Status (HRESULT) and the new ExitType added in schema 2.18
+// (string rendering of the NotificationType enum: "Reboot", "GracefulExit", ...).
+// Other fields on the wire (e.g. Attribution) are ignored intentionally.
+type systemExitStatus struct {
+	Status   int32  `json:"Status"`
+	ExitType string `json:"ExitType,omitempty"`
+}
+
+// parseExitType reads a SystemExitStatus JSON document and returns the ExitType
+// string. Empty input returns ("", nil) so non-exited notifications that carry
+// no payload are benign. Malformed JSON returns ("", err). A well-formed document
+// without the ExitType field returns ("", nil) — that's how older HCS builds
+// serialize the struct.
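+//
+// Examples (mirroring the TestParseExitType_* cases in callback_test.go):
+//
+//	parseExitType(`{"Status":0,"ExitType":"Reboot"}`) // "Reboot", nil
+//	parseExitType(`{"Status":0}`)                      // "", nil
+//	parseExitType(`{not json`)                         // "", non-nil error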
+func parseExitType(s string) (string, error) { + if s == "" { + return "", nil + } + var st systemExitStatus + if err := json.Unmarshal([]byte(s), &st); err != nil { + return "", err + } + return st.ExitType, nil +} diff --git a/internal/hcs/system.go b/internal/hcs/system.go index 823e27b0b7..956ca6d73b 100644 --- a/internal/hcs/system.go +++ b/internal/hcs/system.go @@ -38,6 +38,22 @@ type System struct { os, typ, owner string startTime time.Time stopTime time.Time + + // container-reboot-v2: parsed SystemExitStatus.ExitType populated by + // waitBackground when hcsNotificationSystemExited fires with JSON payload. + // Read via ExitType(). Empty until waitBlock is closed; "Reboot" when + // the HCS server sent SystemExited with the new 2.18 Reboot enum value. + exitTypeMu sync.RWMutex + exitType string + + // container-reboot-v2 Stage 4 Sub-step B1: cached copy of the document + // this System was created with, so Sub-step B's handleReboot can reissue + // an identical CreateComputeSystem on the same ID after the silo + // destructs. Stored as raw JSON (not the pre-marshal interface{}) because + // the original document is a build-time tree of Go structs that's + // painful to round-trip; the bytes are what HCS actually sees. Exposed + // via CreateDocument(). + createDocument json.RawMessage } var _ cow.Container = &System{} @@ -75,6 +91,12 @@ func CreateComputeSystem(ctx context.Context, id string, hcsDocumentInterface in hcsDocument := string(hcsDocumentB) + // container-reboot-v2 Stage 4 Sub-step B1: cache the exact bytes HCS sees, + // before any errors, so a later handleReboot can reissue the same document. + // Copy into a fresh slice since the caller-owned hcsDocumentB may alias a + // larger buffer or be reused by GC. + computeSystem.createDocument = append(json.RawMessage(nil), hcsDocumentB...) + var ( identity syscall.Handle resultJSON string @@ -281,7 +303,53 @@ func (computeSystem *System) waitBackground() { defer span.End() span.AddAttributes(trace.StringAttribute("cid", computeSystem.id)) - err := waitForNotification(ctx, computeSystem.callbackNumber, hcsNotificationSystemExited, nil) + // container-reboot-v2 Stage 2: peek the hcsNotificationSystemExited channel + // BEFORE calling waitForNotification so we observe the payload.data (the + // SystemExitStatus JSON). waitForNotification consumes the same channel but + // discards data — it only returns payload.err. Running the recv ourselves + // here lets us extract ExitType; then we synthesize the err-only wait by + // returning the recv's error directly without going through waitForNotification. + // + // Safe because System.waitBackground is the sole reader of this channel for + // the compute system's lifetime (other waiters go through waitForNotification + // for *other* notification types). If that invariant changes, this split + // must move into waitForNotification itself. + callbackMapLock.RLock() + cbCtx, cbOK := callbackMap[computeSystem.callbackNumber] + callbackMapLock.RUnlock() + + var err error + var exitData string + if cbOK { + payload, ok := <-cbCtx.channels[hcsNotificationSystemExited] + if !ok { + err = ErrHandleClose + } else { + err = payload.err + exitData = payload.data + } + } else { + // Fall back to the old path if the callback context disappeared. 
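+		// (This can happen if the callback has already been unregistered, e.g.
+		// during Close; ExitType then simply stays empty and callers fall back
+		// to the normal teardown path.)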
+ err = waitForNotification(ctx, computeSystem.callbackNumber, hcsNotificationSystemExited, nil) + } + + if exitData != "" { + if et, parseErr := parseExitType(exitData); parseErr == nil && et != "" { + computeSystem.exitTypeMu.Lock() + computeSystem.exitType = et + computeSystem.exitTypeMu.Unlock() + } else if parseErr != nil { + log.G(ctx).WithError(parseErr).WithField("system-id", computeSystem.id).Debug("failed to parse SystemExitStatus JSON") + } + } + + // container-reboot-v2 Stage 1 checkpoint #7 span attrs, populated with real + // values once Stage 2's PassExitStatusJson guard makes the JSON survive to here. + span.AddAttributes( + trace.StringAttribute("reboot.exit_type", computeSystem.ExitType()), + trace.Int64Attribute("reboot.notification_data_bytes", int64(len(exitData))), + ) + if err == nil { log.G(ctx).Debug("system exited") } else if errors.Is(err, ErrVmcomputeUnexpectedExit) { @@ -303,6 +371,28 @@ func (computeSystem *System) WaitChannel() <-chan struct{} { return computeSystem.waitBlock } +// ExitType returns the parsed SystemExitStatus.ExitType string reported by HCS +// at compute-system exit — "Reboot", "GracefulExit", "UnexpectedExit", etc. Empty +// string before the system has exited (before WaitChannel() unblocks) or when +// HCS did not send a parseable SystemExitStatus JSON payload. Populated by +// waitBackground exactly once per compute-system lifetime. +// +// container-reboot-v2 Stage 4 uses this to detect when a container exit was a +// reboot request and reroute to handleReboot instead of teardown. +func (computeSystem *System) ExitType() string { + computeSystem.exitTypeMu.RLock() + defer computeSystem.exitTypeMu.RUnlock() + return computeSystem.exitType +} + +// CreateDocument returns the JSON body this System was originally created with. +// Used by container-reboot-v2 Stage 4's handleReboot to reissue +// HcsCreateComputeSystem with identical configuration after a silo reboot. +// Returns nil for Systems created outside CreateComputeSystem (e.g. OpenComputeSystem). +func (computeSystem *System) CreateDocument() json.RawMessage { + return computeSystem.createDocument +} + func (computeSystem *System) WaitError() error { return computeSystem.waitError } diff --git a/internal/hcs/waithelper.go b/internal/hcs/waithelper.go index 3a51ed1955..f5d1bc2a8a 100644 --- a/internal/hcs/waithelper.go +++ b/internal/hcs/waithelper.go @@ -54,19 +54,19 @@ func waitForNotification( } select { - case err, ok := <-expectedChannel: + case payload, ok := <-expectedChannel: if !ok { return ErrHandleClose } - return err - case err, ok := <-channels[hcsNotificationSystemExited]: + return payload.err + case payload, ok := <-channels[hcsNotificationSystemExited]: if !ok { return ErrHandleClose } // If the expected notification is hcsNotificationSystemExited which of the two selects // chosen is random. 
Return the raw error if hcsNotificationSystemExited is expected if channels[hcsNotificationSystemExited] == expectedChannel { - return err + return payload.err } return ErrUnexpectedContainerExit case _, ok := <-channels[hcsNotificationServiceDisconnect]: diff --git a/internal/jobcontainers/jobcontainer.go b/internal/jobcontainers/jobcontainer.go index 63cd709564..8079bf9947 100644 --- a/internal/jobcontainers/jobcontainer.go +++ b/internal/jobcontainers/jobcontainer.go @@ -617,6 +617,13 @@ func (c *JobContainer) WaitError() error { return c.waitError } +// ExitType returns "" — job containers don't wrap an HCS compute system and +// therefore never observe a SystemExitStatus.ExitType notification. Callers +// treating empty string as "unknown" get the right fallback behavior. +func (c *JobContainer) ExitType() string { + return "" +} + // Wait synchronously waits for the container to shutdown or terminate. If // the container has already exited returns the previous error (if any). func (c *JobContainer) Wait() error {