From 23579e18623a9caee8630bec8a0274e369763dfd Mon Sep 17 00:00:00 2001
From: Tingmao Wang <tingmaowang@microsoft.com>
Date: Tue, 11 Nov 2025 14:58:51 +0000
Subject: [PATCH 1/8] Merged PR 12878148: bridge: Force sequential message
 handling for confidential containers

[cherry-picked from f81b450894206a79fff4d63182ff034ba503ebdb]

This PR contains 2 commits. The first one is the fix:

**bridge: Force sequential message handling for confidential containers**

This fixes a vulnerability (and reduces the surface for other similar potential
vulnerabilities) in confidential containers where if the host sends a
mount/unmount modification request concurrently with an ongoing CreateContainer
request, the host could re-order or skip image layers for the container to be
started.

While this could be fixed by adding mutex lock/unlock around the individual
modifyMappedVirtualDisk/modifyCombinedLayers/CreateContainer functions, we
decided that in order to prevent any more of this class of issues, the UVM, when
running in confidential mode, should just not allow concurrent requests (with
exception for any actually long-running requests, which for now is just
waitProcess).

The second one adds a log entry for when the processing thread blocks.  This
will make it easier to debug should the gcs "hung" on a request.

This PR is created on ADO targeting the conf branch as this security
vulnerability is not public yet. This fix should be backported to main once
deployed.

Related work items: #33357501, #34327300
Signed-off-by: Tingmao Wang <tingmaowang@microsoft.com>
---
 cmd/gcs/main.go                 |  4 ++
 internal/guest/bridge/bridge.go | 94 +++++++++++++++++++++++++--------
 2 files changed, 75 insertions(+), 23 deletions(-)

diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go
index 0162e7936d..72fc5ed13b 100644
--- a/cmd/gcs/main.go
+++ b/cmd/gcs/main.go
@@ -30,6 +30,7 @@ import (
 	"github.com/Microsoft/hcsshim/internal/log"
 	"github.com/Microsoft/hcsshim/internal/oc"
 	"github.com/Microsoft/hcsshim/internal/version"
+	"github.com/Microsoft/hcsshim/pkg/amdsevsnp"
 	"github.com/Microsoft/hcsshim/pkg/securitypolicy"
 	oci "github.com/opencontainers/runtime-spec/specs-go"
 )
@@ -414,6 +415,9 @@ func main() {
 
 	mux := bridge.NewBridgeMux()
 	b := bridge.New(mux, *v4)
+	// For confidential containers, we protect ourselves against attacks caused
+	// by concurrent modifications, by processing one request at a time.
+	b.ForceSequential = amdsevsnp.IsSNP()
 	b.AssignHandlers(mux, h)
 
 	// Reconnect loop: dial the host, serve until the connection drops, then
diff --git a/internal/guest/bridge/bridge.go b/internal/guest/bridge/bridge.go
index d8d57e786d..2a029773ed 100644
--- a/internal/guest/bridge/bridge.go
+++ b/internal/guest/bridge/bridge.go
@@ -177,6 +177,10 @@ type Bridge struct {
 	Handler Handler
 	// EnableV4 enables the v4+ bridge and the schema v2+ interfaces.
 	EnableV4 bool
+	// Setting ForceSequential to true will force the bridge to only process one
+	// request at a time, except for certain long-running operations (as defined
+	// in asyncMessages).
+	ForceSequential bool
 
 	// responseChan is the response channel used for both request/response
 	// and publish notification workflows.
@@ -256,6 +260,14 @@ func (b *Bridge) ShutdownRequested() bool {
 	return b.hasQuitPending.Load()
 }
 
+// Messages that will be processed asynchronously even in sequential mode.  Note
+// that in sequential mode, these messages will still wait for any in-progress
+// non-async messages to be handled before they are processed, but once they are
+// "acknowledged", the rest will be done asynchronously.
+var alwaysAsyncMessages map[prot.MessageIdentifier]bool = map[prot.MessageIdentifier]bool{
+	prot.ComputeSystemWaitForProcessV1: true,
+}
+
 // AssignHandlers creates and assigns the appropriate bridge
 // events to be listen for and intercepted on `mux` before forwarding
 // to `gcs` for handling.
@@ -307,6 +319,10 @@ func (b *Bridge) ListenAndServe(bridgeIn io.ReadCloser, bridgeOut io.WriteCloser
 	defer b.disconnectNotifications()
 	defer close(b.quitChan)
 
+	if b.ForceSequential {
+		log.G(context.Background()).Info("bridge: ForceSequential enabled")
+	}
+
 	// Receive bridge requests and schedule them to be processed.
 	go func() {
 		var recverr error
@@ -409,30 +425,36 @@ func (b *Bridge) ListenAndServe(bridgeIn io.ReadCloser, bridgeOut io.WriteCloser
 	}()
 	// Process each bridge request async and create the response writer.
 	go func() {
-		for req := range requestChan {
-			go func(r *Request) {
-				br := bridgeResponse{
-					ctx: r.Context,
-					header: &prot.MessageHeader{
-						Type: prot.GetResponseIdentifier(r.Header.Type),
-						ID:   r.Header.ID,
-					},
-				}
-				resp, err := b.Handler.ServeMsg(r)
-				if resp == nil {
-					resp = &prot.MessageResponseBase{}
-				}
-				resp.Base().ActivityID = r.ActivityID
-				if err != nil {
-					span := trace.FromContext(r.Context)
-					if span != nil {
-						oc.SetSpanStatus(span, err)
-					}
-					setErrorForResponseBase(resp.Base(), err, "gcs" /* moduleName */)
+		doOneRequest := func(r *Request) {
+			br := bridgeResponse{
+				ctx: r.Context,
+				header: &prot.MessageHeader{
+					Type: prot.GetResponseIdentifier(r.Header.Type),
+					ID:   r.Header.ID,
+				},
+			}
+			resp, err := b.Handler.ServeMsg(r)
+			if resp == nil {
+				resp = &prot.MessageResponseBase{}
+			}
+			resp.Base().ActivityID = r.ActivityID
+			if err != nil {
+				span := trace.FromContext(r.Context)
+				if span != nil {
+					oc.SetSpanStatus(span, err)
 				}
-				br.response = resp
-				b.responseChan <- br
-			}(req)
+				setErrorForResponseBase(resp.Base(), err, "gcs" /* moduleName */)
+			}
+			br.response = resp
+			b.responseChan <- br
+		}
+
+		for req := range requestChan {
+			if b.ForceSequential && !alwaysAsyncMessages[req.Header.Type] {
+				runSequentialRequestHandler(req, doOneRequest)
+			} else {
+				go doOneRequest(req)
+			}
 		}
 	}()
 	// Process each bridge response sync. This channel is for request/response and publish workflows.
@@ -495,6 +517,32 @@ func (b *Bridge) ListenAndServe(bridgeIn io.ReadCloser, bridgeOut io.WriteCloser
 	}
 }
 
+// Do handleFn(r), but prints a warning if handleFn does not, or takes too long
+// to return.
+func runSequentialRequestHandler(r *Request, handleFn func(*Request)) {
+	// Note that this is only a context used for triggering the blockage
+	// warning, the request processing still uses r.Context.  We don't want to
+	// cancel the request handling itself when we reach the 5s timeout.
+	timeoutCtx, cancel := context.WithTimeout(r.Context, 5*time.Second)
+	go func() {
+		<-timeoutCtx.Done()
+		if errors.Is(timeoutCtx.Err(), context.DeadlineExceeded) {
+			log.G(timeoutCtx).WithFields(logrus.Fields{
+				// We want to log those even though we're providing r.Context, since if
+				// the request never finishes the span end log will never get written,
+				// and we may therefore not be able to find out about the following info
+				// otherwise:
+				"message-type": r.Header.Type.String(),
+				"message-id":   r.Header.ID,
+				"activity-id":  r.ActivityID,
+				"container-id": r.ContainerID,
+			}).Warnf("bridge: request processing thread in sequential mode blocked on the current request for more than 5 seconds")
+		}
+	}()
+	defer cancel()
+	handleFn(r)
+}
+
 // PublishNotification writes a specific notification to the bridge.
 func (b *Bridge) PublishNotification(n *prot.ContainerNotification) {
 	ctx, span := oc.StartSpan(context.Background(),

From 9ceb3e0f61e3cb3b93a449d7a6d516201f3d681c Mon Sep 17 00:00:00 2001
From: Tingmao Wang <tingmaowang@microsoft.com>
Date: Fri, 19 Dec 2025 11:48:20 +0000
Subject: [PATCH 2/8] Fix usage of IsSNP() to handle the new error return value

Signed-off-by: Tingmao Wang <tingmaowang@microsoft.com>
---
 cmd/gcs/main.go | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go
index 72fc5ed13b..b03013acdb 100644
--- a/cmd/gcs/main.go
+++ b/cmd/gcs/main.go
@@ -417,7 +417,14 @@ func main() {
 	b := bridge.New(mux, *v4)
 	// For confidential containers, we protect ourselves against attacks caused
 	// by concurrent modifications, by processing one request at a time.
-	b.ForceSequential = amdsevsnp.IsSNP()
+	forceSequential, err := amdsevsnp.IsSNP()
+	if err != nil {
+		// IsSNP cannot fail on LCOW
+		logrus.Errorf("Got unexpected error from IsSNP(): %v", err)
+		// If it fails, we proceed with forceSequential enabled to be safe
+		forceSequential = true
+	}
+	b.ForceSequential = forceSequential
 	b.AssignHandlers(mux, h)
 
 	// Reconnect loop: dial the host, serve until the connection drops, then

From b8b0146a8653e6731bea51b21d29f7f98daf8d83 Mon Sep 17 00:00:00 2001
From: Tingmao Wang <tingmaowang@microsoft.com>
Date: Mon, 20 Oct 2025 16:52:21 +0000
Subject: [PATCH 3/8] Merged PR 13618357: guest/network: Restrict hostname to
 valid characters

[cherry-picked from 055ee5eb4a802cb407575fb6cc1e9b07069d3319]

guest/network: Restrict hostname to valid characters

Because we write this hostname to /etc/hosts, without proper validation the host
can trick us into writing arbitrary data to /etc/hosts, which can, for example,
redirect things like ip6-localhost (but likely not localhost itself) to an
attacker-controlled IP address.

We implement a check here that the host-provided DNS name in the OCI spec is
valid.

ACI actually restricts this to 5-63 characters of a-zA-Z0-9 and '-', where the
first and last characters can not be '-'.  This aligns with the Kubernetes
restriction.  c.f. IsValidDnsLabel in Compute-ACI.  However, there is no
consistent official agreement on what a valid hostname can contain.  RFC 952
says that "Domain name" can be up to 24 characters of A-Z0-9 '.' and '-', RFC
1123 expands this to 255 characters, but RFC 1035 claims that domain names can
in fact contain anything if quoted (as long as the length is within 255
characters), and this is confirmed again in RFC 2181.  In practice we see names
with underscopes, most commonly \_dmarc.example.com.  curl allows 0-9a-zA-Z and
-.\_|~ and any other codepoints from \u0001-\u001f and above \u007f:
https://github.com/curl/curl/blob/master/lib/urlapi.c#L527-L545

With the above in mind, this commit allows up to 255 characters of a-zA-Z0-9 and
'_', '-' and '.'.  This change is applied to all LCOW use cases.

This fix can be tested with the below code to bypass any host-side checks:

	+++ b/internal/hcsoci/hcsdoc_lcow.go
	@@ -52,6 +52,10 @@ func createLCOWSpec(ctx context.Context, coi *createOptionsInternal) (*specs.Spe
			spec.Linux.Seccomp = nil
		}

	+	if spec.Annotations[annotations.KubernetesContainerType] == "sandbox" {
	+		spec.Hostname = "invalid-hostname\n1.1.1.1 ip6-localhost ip6-loopback localhost"
	+	}
	+
		return spec, nil
	}

Output:

	time="2025-10-01T15:13:41Z" level=fatal msg="run pod sandbox: rpc error: code = Unknown desc = failed to create containerd task: failed to create shim task: failed to create container f2209bb2960d5162fc9937d3362e1e2cf1724c56d1296ba2551ce510cb2bcd43: guest RPC failure: hostname \"invalid-hostname\\n1.1.1.1 ip6-localhost ip6-loopback localhost\" invalid: must match ^[a-zA-Z0-9_\\-\\.]{0,999}$: unknown"

Related work items: #34370598
Closes: https://msazure.visualstudio.com/One/_workitems/edit/34370598
Signed-off-by: Tingmao Wang <tingmaowang@microsoft.com>
---
 internal/guest/network/network.go             | 13 +++++++
 internal/guest/network/network_test.go        | 35 +++++++++++++++++++
 .../guest/runtime/hcsv2/sandbox_container.go  |  3 ++
 .../runtime/hcsv2/standalone_container.go     |  3 ++
 4 files changed, 54 insertions(+)

diff --git a/internal/guest/network/network.go b/internal/guest/network/network.go
index cc06398e41..f68f8afa39 100644
--- a/internal/guest/network/network.go
+++ b/internal/guest/network/network.go
@@ -9,6 +9,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"regexp"
 	"strings"
 	"time"
 
@@ -32,6 +33,18 @@ var (
 // maxDNSSearches is limited to 6 in `man 5 resolv.conf`
 const maxDNSSearches = 6
 
+var validHostnameRegex = regexp.MustCompile(`^[a-zA-Z0-9_\-\.]{0,255}$`)
+
+// Check that the hostname is safe. This function is less strict than
+// technically allowed, but ensures that when the hostname is inserted to
+// /etc/hosts, it cannot lead to injection attacks.
+func ValidateHostname(hostname string) error {
+	if !validHostnameRegex.MatchString(hostname) {
+		return errors.Errorf("hostname %q invalid: must match %s", hostname, validHostnameRegex.String())
+	}
+	return nil
+}
+
 // GenerateEtcHostsContent generates a /etc/hosts file based on `hostname`.
 func GenerateEtcHostsContent(ctx context.Context, hostname string) string {
 	_, span := oc.StartSpan(ctx, "network::GenerateEtcHostsContent")
diff --git a/internal/guest/network/network_test.go b/internal/guest/network/network_test.go
index 9a718aed66..cc0ec1be3a 100644
--- a/internal/guest/network/network_test.go
+++ b/internal/guest/network/network_test.go
@@ -7,6 +7,7 @@ import (
 	"context"
 	"os"
 	"path/filepath"
+	"strings"
 	"testing"
 	"time"
 )
@@ -70,6 +71,40 @@ func Test_GenerateResolvConfContent(t *testing.T) {
 	}
 }
 
+func Test_ValidateHostname(t *testing.T) {
+	validNames := []string{
+		"localhost",
+		"my-hostname",
+		"my.hostname",
+		"my-host-name123",
+		"_underscores.are.allowed.too",
+		"", // Allow not specifying a hostname
+	}
+
+	invalidNames := []string{
+		"localhost\n13.104.0.1 ip6-localhost ip6-loopback localhost",
+		"localhost\n2603:1000::1 ip6-localhost ip6-loopback",
+		"hello@microsoft.com",
+		"has space",
+		"has,comma",
+		"\x00",
+		"a\nb",
+		strings.Repeat("a", 1000),
+	}
+
+	for _, n := range validNames {
+		if err := ValidateHostname(n); err != nil {
+			t.Fatalf("expected %q to be valid, got: %v", n, err)
+		}
+	}
+
+	for _, n := range invalidNames {
+		if err := ValidateHostname(n); err == nil {
+			t.Fatalf("expected %q to be invalid, but got nil error", n)
+		}
+	}
+}
+
 func Test_GenerateEtcHostsContent(t *testing.T) {
 	type testcase struct {
 		name string
diff --git a/internal/guest/runtime/hcsv2/sandbox_container.go b/internal/guest/runtime/hcsv2/sandbox_container.go
index 5f3d80356b..c6623d52f1 100644
--- a/internal/guest/runtime/hcsv2/sandbox_container.go
+++ b/internal/guest/runtime/hcsv2/sandbox_container.go
@@ -50,6 +50,9 @@ func setupSandboxContainerSpec(ctx context.Context, id, sandboxRoot string, spec
 
 	// Write the hostname
 	hostname := spec.Hostname
+	if err = network.ValidateHostname(hostname); err != nil {
+		return err
+	}
 	if hostname == "" {
 		var err error
 		hostname, err = os.Hostname()
diff --git a/internal/guest/runtime/hcsv2/standalone_container.go b/internal/guest/runtime/hcsv2/standalone_container.go
index 4cf8cc72ec..27b6f8471b 100644
--- a/internal/guest/runtime/hcsv2/standalone_container.go
+++ b/internal/guest/runtime/hcsv2/standalone_container.go
@@ -46,6 +46,9 @@ func setupStandaloneContainerSpec(ctx context.Context, id, rootDir string, spec
 	}()
 
 	hostname := spec.Hostname
+	if err = network.ValidateHostname(hostname); err != nil {
+		return err
+	}
 	if hostname == "" {
 		var err error
 		hostname, err = os.Hostname()

From 8c33ab55e4283cff8f9a7806b118efb1a152b809 Mon Sep 17 00:00:00 2001
From: Tingmao Wang <tingmaowang@microsoft.com>
Date: Tue, 11 Nov 2025 16:48:20 +0000
Subject: [PATCH 4/8] Merged PR 12878779: Enhance uvm_state::hostMounts to
 track in-use mounts, and prevent unmounting or deleting in-use things

[cherry-picked from d0334883cd43eecbb401a6ded3e0317179a3e54b]

This set of changes adds some checks (when running with a confidential policy)
to prevent the host from trying to clean up mounts, overlays, or the container
states dir when the container is running (or when the overlay has not been
unmounted yet).  This is through enhancing the existing `hostMounts` utility, as
well as adding a `terminated` flag to the Container struct.

The correct order of operations should always be:

- mount read-only layers and scratch (in any order, and individual containers
(not the sandbox) might not have their own scratch) - mount the overlay - start
the container - container terminates - unmount overlay - unmount read-only
layers and scratch

The starting up order is implied, and we now explicitly deny e.g. unmounting
layer/scratch before unmounting overlay, or unmounting the overlay while
container has not terminated.

We also deny deleteContainerState requests when the container is running or when
the overlay is mounted.  Doing so when a container is running can result in
unexpectedly deleting its files, which breaks it in unpredictable ways and is
bad.

Signed-off-by: Tingmao Wang <tingmaowang@microsoft.com>
---
 internal/guest/bridge/bridge_v2.go            |   8 +-
 internal/guest/runtime/hcsv2/container.go     |   3 +
 internal/guest/runtime/hcsv2/process.go       |   1 +
 internal/guest/runtime/hcsv2/uvm.go           | 246 +++++++++---
 internal/guest/runtime/hcsv2/uvm_state.go     | 359 +++++++++++++++---
 .../guest/runtime/hcsv2/uvm_state_test.go     | 219 ++++++++++-
 6 files changed, 738 insertions(+), 98 deletions(-)

diff --git a/internal/guest/bridge/bridge_v2.go b/internal/guest/bridge/bridge_v2.go
index dc13ec6428..748e9dcd9b 100644
--- a/internal/guest/bridge/bridge_v2.go
+++ b/internal/guest/bridge/bridge_v2.go
@@ -467,16 +467,10 @@ func (b *Bridge) deleteContainerStateV2(r *Request) (_ RequestResponse, err erro
 		return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message)
 	}
 
-	c, err := b.hostState.GetCreatedContainer(request.ContainerID)
+	err = b.hostState.DeleteContainerState(ctx, request.ContainerID)
 	if err != nil {
 		return nil, err
 	}
-	// remove container state regardless of delete's success
-	defer b.hostState.RemoveContainer(request.ContainerID)
-
-	if err := c.Delete(ctx); err != nil {
-		return nil, err
-	}
 
 	return &prot.MessageResponseBase{}, nil
 }
diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go
index 13c0231801..15d416ae96 100644
--- a/internal/guest/runtime/hcsv2/container.go
+++ b/internal/guest/runtime/hcsv2/container.go
@@ -77,6 +77,9 @@ type Container struct {
 	// and deal with the extra pointer dereferencing overhead.
 	status atomic.Uint32
 
+	// Set to true when the init process for the container has exited
+	terminated atomic.Bool
+
 	// scratchDirPath represents the path inside the UVM where the scratch directory
 	// of this container is located. Usually, this is either `/run/gcs/c/<containerID>` or
 	// `/run/gcs/c/<UVMID>/container_<containerID>` if scratch is shared with UVM scratch.
diff --git a/internal/guest/runtime/hcsv2/process.go b/internal/guest/runtime/hcsv2/process.go
index e94c9792f6..96564cfab0 100644
--- a/internal/guest/runtime/hcsv2/process.go
+++ b/internal/guest/runtime/hcsv2/process.go
@@ -99,6 +99,7 @@ func newProcess(c *Container, spec *oci.Process, process runtime.Process, pid ui
 			log.G(ctx).WithError(err).Error("failed to wait for runc process")
 		}
 		p.exitCode = exitCode
+		c.terminated.Store(true)
 		log.G(ctx).WithField("exitCode", p.exitCode).Debug("process exited")
 
 		// Free any process waiters
diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go
index 0cd6448cae..f6878dccd2 100644
--- a/internal/guest/runtime/hcsv2/uvm.go
+++ b/internal/guest/runtime/hcsv2/uvm.go
@@ -17,6 +17,7 @@ import (
 	"regexp"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"syscall"
 	"time"
 
@@ -25,6 +26,7 @@ import (
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
+	"go.opencensus.io/trace"
 	"golang.org/x/sys/unix"
 
 	"github.com/Microsoft/hcsshim/internal/bridgeutils/gcserr"
@@ -44,6 +46,7 @@ import (
 	"github.com/Microsoft/hcsshim/internal/guestpath"
 	"github.com/Microsoft/hcsshim/internal/log"
 	"github.com/Microsoft/hcsshim/internal/logfields"
+	"github.com/Microsoft/hcsshim/internal/oc"
 	"github.com/Microsoft/hcsshim/internal/oci"
 	"github.com/Microsoft/hcsshim/internal/protocol/guestrequest"
 	"github.com/Microsoft/hcsshim/internal/protocol/guestresource"
@@ -114,7 +117,9 @@ type Host struct {
 	securityOptions *securitypolicy.SecurityOptions
 
 	// hostMounts keeps the state of currently mounted devices and file systems,
-	// which is used for GCS hardening.
+	// which is used for GCS hardening.  It is only used for confidential
+	// containers, and is initialized in SetConfidentialUVMOptions.  If this is
+	// nil, we do not do add any special restrictions on mounts / unmounts.
 	hostMounts *hostMounts
 	// uvmError contains a permanent flag to indicate that further mounts,
 	// unmounts and container creation / deletion should not be allowed.  This
@@ -171,7 +176,7 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s
 		rtime:             rtime,
 		vsock:             vsock,
 		devNullTransport:  &transport.DevNullTransport{},
-		hostMounts:        newHostMounts(),
+		hostMounts:        nil,
 		securityOptions:   securityPolicyOptions,
 		uvmError:          uvmConsistencyError{},
 	}
@@ -500,6 +505,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
 		isSandbox:      criType == "sandbox",
 		exitType:       prot.NtUnexpectedExit,
 		processes:      make(map[uint32]*containerProcess),
+		terminated:     atomic.Bool{},
 		scratchDirPath: settings.ScratchDirPath,
 	}
 	c.setStatus(containerCreating)
@@ -802,6 +808,25 @@ func writeSpecToFile(ctx context.Context, configFile string, spec *specs.Spec) e
 	return nil
 }
 
+// Returns whether there is a running container that is currently using the
+// given overlay (as its rootfs).
+func (h *Host) IsOverlayInUse(overlayPath string) bool {
+	h.containersMutex.Lock()
+	defer h.containersMutex.Unlock()
+
+	for _, c := range h.containers {
+		if c.terminated.Load() {
+			continue
+		}
+
+		if c.spec.Root.Path == overlayPath {
+			return true
+		}
+	}
+
+	return false
+}
+
 func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req *guestrequest.ModificationRequest) (retErr error) {
 	if h.HasSecurityPolicy() {
 		if err := checkValidContainerID(containerID, "container"); err != nil {
@@ -825,35 +850,6 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req *
 			return err
 		}
 		mvd.Controller = cNum
-		// first we try to update the internal state for read-write attachments.
-		if !mvd.ReadOnly {
-			localCtx, cancel := context.WithTimeout(ctx, time.Second*5)
-			defer cancel()
-			source, err := scsi.GetDevicePath(localCtx, mvd.Controller, mvd.Lun, mvd.Partition)
-			if err != nil {
-				return err
-			}
-			switch req.RequestType {
-			case guestrequest.RequestTypeAdd:
-				if err := h.hostMounts.AddRWDevice(mvd.MountPath, source, mvd.Encrypted); err != nil {
-					return err
-				}
-				defer func() {
-					if retErr != nil {
-						_ = h.hostMounts.RemoveRWDevice(mvd.MountPath, source)
-					}
-				}()
-			case guestrequest.RequestTypeRemove:
-				if err := h.hostMounts.RemoveRWDevice(mvd.MountPath, source); err != nil {
-					return err
-				}
-				defer func() {
-					if retErr != nil {
-						_ = h.hostMounts.AddRWDevice(mvd.MountPath, source, mvd.Encrypted)
-					}
-				}()
-			}
-		}
 		return h.modifyMappedVirtualDisk(ctx, req.RequestType, mvd)
 	case guestresource.ResourceTypeMappedDirectory:
 		if err := h.checkState(); err != nil {
@@ -872,12 +868,7 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req *
 			return err
 		}
 
-		cl := req.Settings.(*guestresource.LCOWCombinedLayers)
-		// when cl.ScratchPath == "", we mount overlay as read-only, in which case
-		// we don't really care about scratch encryption, since the host already
-		// knows about the layers and the overlayfs.
-		encryptedScratch := cl.ScratchPath != "" && h.hostMounts.IsEncrypted(cl.ScratchPath)
-		return h.modifyCombinedLayers(ctx, req.RequestType, req.Settings.(*guestresource.LCOWCombinedLayers), encryptedScratch)
+		return h.modifyCombinedLayers(ctx, req.RequestType, req.Settings.(*guestresource.LCOWCombinedLayers))
 	case guestresource.ResourceTypeNetwork:
 		return modifyNetwork(ctx, req.RequestType, req.Settings.(*guestresource.LCOWNetworkAdapter))
 	case guestresource.ResourceTypeVPCIDevice:
@@ -893,10 +884,22 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req *
 		if !ok {
 			return errors.New("the request's settings are not of type ConfidentialOptions")
 		}
-		return h.securityOptions.SetConfidentialOptions(ctx,
+		err := h.securityOptions.SetConfidentialOptions(ctx,
 			r.EnforcerType,
 			r.EncodedSecurityPolicy,
 			r.EncodedUVMReference)
+		if err != nil {
+			return err
+		}
+
+		// Start tracking mounts and restricting unmounts on confidential containers.
+		// As long as we started off with the ClosedDoorSecurityPolicyEnforcer, no
+		// mounts should have been allowed until this point.
+		if h.HasSecurityPolicy() {
+			log.G(ctx).Debug("hostMounts initialized")
+			h.hostMounts = newHostMounts()
+		}
+		return nil
 	case guestresource.ResourceTypePolicyFragment:
 		r, ok := req.Settings.(*guestresource.SecurityPolicyFragment)
 		if !ok {
@@ -1270,18 +1273,33 @@ func (h *Host) modifyMappedVirtualDisk(
 	rt guestrequest.RequestType,
 	mvd *guestresource.LCOWMappedVirtualDisk,
 ) (err error) {
+	ctx, span := oc.StartSpan(ctx, "gcs::Host::modifyMappedVirtualDisk")
+	defer span.End()
+	defer func() { oc.SetSpanStatus(span, err) }()
+	span.AddAttributes(
+		trace.StringAttribute("requestType", string(rt)),
+		trace.BoolAttribute("hasHostMounts", h.hostMounts != nil),
+		trace.Int64Attribute("controller", int64(mvd.Controller)),
+		trace.Int64Attribute("lun", int64(mvd.Lun)),
+		trace.Int64Attribute("partition", int64(mvd.Partition)),
+		trace.BoolAttribute("readOnly", mvd.ReadOnly),
+		trace.StringAttribute("mountPath", mvd.MountPath),
+	)
+
 	var verityInfo *guestresource.DeviceVerityInfo
 	securityPolicy := h.securityOptions.PolicyEnforcer
+	devPath, err := scsi.GetDevicePath(ctx, mvd.Controller, mvd.Lun, mvd.Partition)
+	if err != nil {
+		return err
+	}
+	span.AddAttributes(trace.StringAttribute("devicePath", devPath))
+
 	if mvd.ReadOnly {
 		// The only time the policy is empty, and we want it to be empty
 		// is when no policy is provided, and we default to open door
 		// policy. In any other case, e.g. explicit open door or any
 		// other rego policy we would like to mount layers with verity.
 		if h.HasSecurityPolicy() {
-			devPath, err := scsi.GetDevicePath(ctx, mvd.Controller, mvd.Lun, mvd.Partition)
-			if err != nil {
-				return err
-			}
 			verityInfo, err = verity.ReadVeritySuperBlock(ctx, devPath)
 			if err != nil {
 				return err
@@ -1311,11 +1329,42 @@ func (h *Host) modifyMappedVirtualDisk(
 					if err != nil {
 						return errors.Wrapf(err, "mounting scsi device controller %d lun %d onto %s denied by policy", mvd.Controller, mvd.Lun, mvd.MountPath)
 					}
+					if h.hostMounts != nil {
+						h.hostMounts.Lock()
+						defer h.hostMounts.Unlock()
+
+						err = h.hostMounts.AddRODevice(mvd.MountPath, devPath)
+						if err != nil {
+							return err
+						}
+						// Note: "When a function returns, its deferred calls are
+						// executed in last-in-first-out order." - so we are safe to
+						// call RemoveRODevice in this defer.
+						defer func() {
+							if err != nil {
+								_ = h.hostMounts.RemoveRODevice(mvd.MountPath, devPath)
+							}
+						}()
+					}
 				} else {
 					err = securityPolicy.EnforceRWDeviceMountPolicy(ctx, mvd.MountPath, mvd.Encrypted, mvd.EnsureFilesystem, mvd.Filesystem)
 					if err != nil {
 						return errors.Wrapf(err, "mounting scsi device controller %d lun %d onto %s denied by policy", mvd.Controller, mvd.Lun, mvd.MountPath)
 					}
+					if h.hostMounts != nil {
+						h.hostMounts.Lock()
+						defer h.hostMounts.Unlock()
+
+						err = h.hostMounts.AddRWDevice(mvd.MountPath, devPath, mvd.Encrypted)
+						if err != nil {
+							return err
+						}
+						defer func() {
+							if err != nil {
+								_ = h.hostMounts.RemoveRWDevice(mvd.MountPath, devPath, mvd.Encrypted)
+							}
+						}()
+					}
 				}
 				config := &scsi.Config{
 					Encrypted:        mvd.Encrypted,
@@ -1340,10 +1389,36 @@ func (h *Host) modifyMappedVirtualDisk(
 					if err = securityPolicy.EnforceDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil {
 						return fmt.Errorf("unmounting scsi device at %s denied by policy: %w", mvd.MountPath, err)
 					}
+					if h.hostMounts != nil {
+						h.hostMounts.Lock()
+						defer h.hostMounts.Unlock()
+
+						if err = h.hostMounts.RemoveRODevice(mvd.MountPath, devPath); err != nil {
+							return err
+						}
+						defer func() {
+							if err != nil {
+								_ = h.hostMounts.AddRODevice(mvd.MountPath, devPath)
+							}
+						}()
+					}
 				} else {
 					if err = securityPolicy.EnforceRWDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil {
 						return fmt.Errorf("unmounting scsi device at %s denied by policy: %w", mvd.MountPath, err)
 					}
+					if h.hostMounts != nil {
+						h.hostMounts.Lock()
+						defer h.hostMounts.Unlock()
+
+						if err = h.hostMounts.RemoveRWDevice(mvd.MountPath, devPath, mvd.Encrypted); err != nil {
+							return err
+						}
+						defer func() {
+							if err != nil {
+								_ = h.hostMounts.AddRWDevice(mvd.MountPath, devPath, mvd.Encrypted)
+							}
+						}()
+					}
 				}
 				// Check that the directory actually exists first, and if it
 				// does not then we just refuse to do anything, without closing
@@ -1513,8 +1588,17 @@ func (h *Host) modifyCombinedLayers(
 	ctx context.Context,
 	rt guestrequest.RequestType,
 	cl *guestresource.LCOWCombinedLayers,
-	scratchEncrypted bool,
 ) (err error) {
+	ctx, span := oc.StartSpan(ctx, "gcs::Host::modifyCombinedLayers")
+	defer span.End()
+	defer func() { oc.SetSpanStatus(span, err) }()
+	span.AddAttributes(
+		trace.StringAttribute("requestType", string(rt)),
+		trace.BoolAttribute("hasHostMounts", h.hostMounts != nil),
+		trace.StringAttribute("containerRootPath", cl.ContainerRootPath),
+		trace.StringAttribute("scratchPath", cl.ScratchPath),
+	)
+
 	securityPolicy := h.securityOptions.PolicyEnforcer
 	containerID := cl.ContainerID
 
@@ -1523,6 +1607,12 @@ func (h *Host) modifyCombinedLayers(
 	// we've actually called Unmount and it fails we permanently block further
 	// device operations.
 	return securityPolicy.WithMetadataRollback(func() error {
+		if h.hostMounts != nil {
+			// We will need this in multiple places, let's take the lock once here.
+			h.hostMounts.Lock()
+			defer h.hostMounts.Unlock()
+		}
+
 		switch rt {
 		case guestrequest.RequestTypeAdd:
 			if h.HasSecurityPolicy() {
@@ -1570,15 +1660,29 @@ func (h *Host) modifyCombinedLayers(
 			} else {
 				upperdirPath = filepath.Join(cl.ScratchPath, "upper")
 				workdirPath = filepath.Join(cl.ScratchPath, "work")
+				scratchEncrypted := false
+				if h.hostMounts != nil {
+					scratchEncrypted = h.hostMounts.IsEncrypted(cl.ScratchPath)
+				}
 
 				if err := securityPolicy.EnforceScratchMountPolicy(ctx, cl.ScratchPath, scratchEncrypted); err != nil {
 					return fmt.Errorf("scratch mounting denied by policy: %w", err)
 				}
 			}
 
-			if err := securityPolicy.EnforceOverlayMountPolicy(ctx, containerID, layerPaths, cl.ContainerRootPath); err != nil {
+			if err = securityPolicy.EnforceOverlayMountPolicy(ctx, containerID, layerPaths, cl.ContainerRootPath); err != nil {
 				return fmt.Errorf("overlay creation denied by policy: %w", err)
 			}
+			if h.hostMounts != nil {
+				if err = h.hostMounts.AddOverlay(cl.ContainerRootPath, layerPaths, cl.ScratchPath); err != nil {
+					return err
+				}
+				defer func() {
+					if err != nil {
+						_, _ = h.hostMounts.RemoveOverlay(cl.ContainerRootPath)
+					}
+				}()
+			}
 
 			// Correctness for policy transaction rollback:
 			// MountLayer does two things - mkdir, then mount. On mount failure, the
@@ -1592,6 +1696,23 @@ func (h *Host) modifyCombinedLayers(
 				return errors.Wrap(err, "overlay removal denied by policy")
 			}
 
+			// Check that no running container is using this overlay as its rootfs.
+			if h.HasSecurityPolicy() && h.IsOverlayInUse(cl.ContainerRootPath) {
+				return fmt.Errorf("overlay %q is in use by a running container", cl.ContainerRootPath)
+			}
+
+			if h.hostMounts != nil {
+				var undoRemoveOverlay func()
+				if undoRemoveOverlay, err = h.hostMounts.RemoveOverlay(cl.ContainerRootPath); err != nil {
+					return err
+				}
+				defer func() {
+					if err != nil && undoRemoveOverlay != nil {
+						undoRemoveOverlay()
+					}
+				}()
+			}
+
 			// Note: storage.UnmountPath is a no-op if the path does not exist.
 			err = storage.UnmountPath(ctx, cl.ContainerRootPath, true)
 			if err != nil {
@@ -1729,3 +1850,40 @@ func (h *Host) createContainerInPod(sandboxID string, containerID string) error
 
 	return nil
 }
+
+func (h *Host) DeleteContainerState(ctx context.Context, containerID string) error {
+	if h.HasSecurityPolicy() {
+		if err := checkValidContainerID(containerID, "container"); err != nil {
+			return err
+		}
+	}
+
+	if err := h.checkState(); err != nil {
+		return err
+	}
+
+	c, err := h.GetCreatedContainer(containerID)
+	if err != nil {
+		return err
+	}
+	if h.HasSecurityPolicy() {
+		if !c.terminated.Load() {
+			return errors.Errorf("Denied deleting state of a running container %q", containerID)
+		}
+		overlay := c.spec.Root.Path
+		h.hostMounts.Lock()
+		defer h.hostMounts.Unlock()
+		if h.hostMounts.HasOverlayMountedAt(overlay) {
+			return errors.Errorf("Denied deleting state of a container with a overlay mount still active")
+		}
+	}
+
+	// remove container state regardless of delete's success
+	defer h.RemoveContainer(containerID)
+
+	if err = c.Delete(ctx); err != nil {
+		return err
+	}
+
+	return nil
+}
diff --git a/internal/guest/runtime/hcsv2/uvm_state.go b/internal/guest/runtime/hcsv2/uvm_state.go
index dd1ff521f0..96e64371a2 100644
--- a/internal/guest/runtime/hcsv2/uvm_state.go
+++ b/internal/guest/runtime/hcsv2/uvm_state.go
@@ -4,91 +4,360 @@
 package hcsv2
 
 import (
+	"context"
+	"errors"
 	"fmt"
 	"path/filepath"
 	"strings"
 	"sync"
+
+	"github.com/Microsoft/hcsshim/internal/gcs"
+	"github.com/Microsoft/hcsshim/internal/log"
+	"github.com/sirupsen/logrus"
+)
+
+type deviceType int
+
+const (
+	DeviceTypeRW deviceType = iota
+	DeviceTypeRO
+	DeviceTypeOverlay
 )
 
-type rwDevice struct {
+func (d deviceType) String() string {
+	switch d {
+	case DeviceTypeRW:
+		return "RW"
+	case DeviceTypeRO:
+		return "RO"
+	case DeviceTypeOverlay:
+		return "Overlay"
+	default:
+		return fmt.Sprintf("Unknown(%d)", d)
+	}
+}
+
+type device struct {
+	// fields common to all
 	mountPath  string
+	ty         deviceType
+	usage      int
 	sourcePath string
-	encrypted  bool
+
+	// rw devices
+	encrypted bool
+
+	// overlay devices
+	referencedDevices []*device
 }
 
+// hostMounts tracks the state of fs/overlay mounts and their usage
+// relationship. Users of this struct must call hm.Lock() before calling any
+// other methods and call hm.Unlock() when done.
+//
+// Since mount/unmount operations can fail, the expected way to use this struct
+// is to first lock it, call the method to add/remove the device, then, with the
+// lock still held, perform the actual operation.  If the operation fails, the
+// caller must undo the operation by calling the appropriate remove/add method
+// or the returned undo function, before unlocking.
 type hostMounts struct {
-	stateMutex sync.Mutex
+	stateMutex       sync.Mutex
+	stateMutexLocked bool
 
-	// Holds information about read-write devices, which can be encrypted and
-	// contain overlay fs upper/work directory mounts.
-	readWriteMounts map[string]*rwDevice
+	// Map from mountPath to device struct
+	devices map[string]*device
 }
 
 func newHostMounts() *hostMounts {
 	return &hostMounts{
-		readWriteMounts: map[string]*rwDevice{},
+		devices: make(map[string]*device),
 	}
 }
 
-// AddRWDevice adds read-write device metadata for device mounted at `mountPath`.
-// Returns an error if there's an existing device mounted at `mountPath` location.
-func (hm *hostMounts) AddRWDevice(mountPath string, sourcePath string, encrypted bool) error {
+func (hm *hostMounts) expectLocked() {
+	if !hm.stateMutexLocked {
+		gcs.UnrecoverableError(errors.New("hostMounts: expected stateMutex to be locked, but it was not"))
+	}
+}
+
+// Locks the state mutex. This is not re-entrant, calling it twice in the same
+// thread will deadlock/panic.
+func (hm *hostMounts) Lock() {
 	hm.stateMutex.Lock()
-	defer hm.stateMutex.Unlock()
+	// Since we just acquired the lock, either it was not locked before, or
+	// somebody just unlocked it. Either case, hm.stateMutexLocked should be
+	// false.
+	if hm.stateMutexLocked {
+		gcs.UnrecoverableError(errors.New("hostMounts: stateMutexLocked already true when locking stateMutex"))
+	}
+	hm.stateMutexLocked = true
+}
+
+// Unlocks the state mutex
+func (hm *hostMounts) Unlock() {
+	hm.expectLocked()
+	hm.stateMutexLocked = false
+	hm.stateMutex.Unlock()
+}
 
-	mountTarget := filepath.Clean(mountPath)
-	if source, ok := hm.readWriteMounts[mountTarget]; ok {
-		return fmt.Errorf("read-write with source %q and mount target %q already exists", source.sourcePath, mountPath)
+func (hm *hostMounts) findDeviceAtPath(mountPath string) *device {
+	hm.expectLocked()
+
+	if dev, ok := hm.devices[mountPath]; ok {
+		return dev
 	}
-	hm.readWriteMounts[mountTarget] = &rwDevice{
-		mountPath:  mountTarget,
-		sourcePath: sourcePath,
-		encrypted:  encrypted,
+	return nil
+}
+
+func (hm *hostMounts) addDeviceToMapChecked(dev *device) error {
+	hm.expectLocked()
+
+	if _, ok := hm.devices[dev.mountPath]; ok {
+		return fmt.Errorf("device at mount path %q already exists", dev.mountPath)
 	}
+	hm.devices[dev.mountPath] = dev
 	return nil
 }
 
-// RemoveRWDevice removes the read-write device metadata for device mounted at
-// `mountPath`.
-func (hm *hostMounts) RemoveRWDevice(mountPath string, sourcePath string) error {
-	hm.stateMutex.Lock()
-	defer hm.stateMutex.Unlock()
+func (hm *hostMounts) findDeviceContainingPath(path string) *device {
+	hm.expectLocked()
+
+	// TODO: can we refactor this function by walking each component of the path
+	// from leaf to root, each time checking if the current component is a mount
+	// point?  (i.e. why do we have to use filepath.Rel?)
+
+	var foundDev *device
+	cleanPath := filepath.Clean(path)
+	for devPath, dev := range hm.devices {
+		relPath, err := filepath.Rel(devPath, cleanPath)
+		// skip further checks if an error is returned or the relative path
+		// contains "..", meaning that the `path` isn't directly nested under
+		// `rwPath`.
+		if err != nil || strings.HasPrefix(relPath, "..") {
+			continue
+		}
+		if foundDev == nil {
+			foundDev = dev
+		} else if len(dev.mountPath) > len(foundDev.mountPath) {
+			// The current device is mounted on top of a previously found device.
+			foundDev = dev
+		}
+	}
+	return foundDev
+}
+
+func (hm *hostMounts) usePath(path string) (*device, error) {
+	hm.expectLocked()
+
+	// Find the device at the given path and increment its usage count.
+	dev := hm.findDeviceContainingPath(path)
+	if dev == nil {
+		return nil, nil
+	}
+	dev.usage++
+	return dev, nil
+}
+
+func (hm *hostMounts) releaseDeviceUsage(dev *device) {
+	hm.expectLocked()
+
+	if dev.usage <= 0 {
+		log.G(context.Background()).WithFields(logrus.Fields{
+			"device":       dev.mountPath,
+			"deviceSource": dev.sourcePath,
+			"deviceType":   dev.ty,
+			"usage":        dev.usage,
+		}).Error("hostMounts::releaseDeviceUsage: unexpected zero usage count")
+		return
+	}
+	dev.usage--
+}
+
+// User should carefully handle side-effects of adding a device if the device
+// fails to be added.
+func (hm *hostMounts) doAddDevice(mountPath string, ty deviceType, sourcePath string) (*device, error) {
+	hm.expectLocked()
+
+	dev := &device{
+		mountPath:  filepath.Clean(mountPath),
+		ty:         ty,
+		usage:      0,
+		sourcePath: sourcePath,
+	}
+
+	if err := hm.addDeviceToMapChecked(dev); err != nil {
+		return nil, err
+	}
+	return dev, nil
+}
+
+// Once checks is called, unless it returns an error, this function will always
+// succeed
+func (hm *hostMounts) doRemoveDevice(mountPath string, ty deviceType, sourcePath string, checks func(*device) error) error {
+	hm.expectLocked()
 
 	unmountTarget := filepath.Clean(mountPath)
-	device, ok := hm.readWriteMounts[unmountTarget]
-	if !ok {
+	device := hm.findDeviceAtPath(unmountTarget)
+	if device == nil {
 		// already removed or didn't exist
 		return nil
 	}
 	if device.sourcePath != sourcePath {
-		return fmt.Errorf("wrong sourcePath %s", sourcePath)
+		return fmt.Errorf("wrong sourcePath %s, expected %s", sourcePath, device.sourcePath)
+	}
+	if device.ty != ty {
+		return fmt.Errorf("wrong device type %s, expected %s", ty, device.ty)
+	}
+	if device.usage > 0 {
+		log.G(context.Background()).WithFields(logrus.Fields{
+			"device":       device.mountPath,
+			"deviceSource": device.sourcePath,
+			"deviceType":   device.ty,
+			"usage":        device.usage,
+		}).Error("hostMounts::doRemoveDevice: device still in use, refusing unmount")
+		return fmt.Errorf("device at %q is still in use, can't unmount", unmountTarget)
+	}
+	if checks != nil {
+		if err := checks(device); err != nil {
+			return err
+		}
 	}
 
-	delete(hm.readWriteMounts, unmountTarget)
+	delete(hm.devices, unmountTarget)
 	return nil
 }
 
+func (hm *hostMounts) AddRODevice(mountPath string, sourcePath string) error {
+	hm.expectLocked()
+
+	_, err := hm.doAddDevice(mountPath, DeviceTypeRO, sourcePath)
+	return err
+}
+
+// AddRWDevice adds read-write device metadata for device mounted at `mountPath`.
+// Returns an error if there's an existing device mounted at `mountPath` location.
+func (hm *hostMounts) AddRWDevice(mountPath string, sourcePath string, encrypted bool) error {
+	hm.expectLocked()
+
+	dev, err := hm.doAddDevice(mountPath, DeviceTypeRW, sourcePath)
+	if err != nil {
+		return err
+	}
+	dev.encrypted = encrypted
+	return nil
+}
+
+func (hm *hostMounts) AddOverlay(mountPath string, layers []string, scratchDir string) (err error) {
+	hm.expectLocked()
+
+	dev, err := hm.doAddDevice(mountPath, DeviceTypeOverlay, mountPath)
+	if err != nil {
+		return err
+	}
+	dev.referencedDevices = make([]*device, 0, len(layers)+1)
+	defer func() {
+		if err != nil {
+			// If we failed to use any of the paths, we need to release the ones
+			// that we did use.
+			for _, d := range dev.referencedDevices {
+				hm.releaseDeviceUsage(d)
+			}
+			delete(hm.devices, mountPath)
+		}
+	}()
+
+	for _, layer := range layers {
+		refDev, err := hm.usePath(layer)
+		if err != nil {
+			return err
+		}
+		if refDev != nil {
+			dev.referencedDevices = append(dev.referencedDevices, refDev)
+		}
+	}
+	refDev, err := hm.usePath(scratchDir)
+	if err != nil {
+		return err
+	}
+	if refDev != nil {
+		dev.referencedDevices = append(dev.referencedDevices, refDev)
+	}
+
+	return nil
+}
+
+func (hm *hostMounts) RemoveRODevice(mountPath string, sourcePath string) error {
+	hm.expectLocked()
+
+	return hm.doRemoveDevice(mountPath, DeviceTypeRO, sourcePath, nil)
+}
+
+// RemoveRWDevice removes the read-write device metadata for device mounted at
+// `mountPath`.
+func (hm *hostMounts) RemoveRWDevice(mountPath string, sourcePath string, encrypted bool) error {
+	hm.expectLocked()
+
+	return hm.doRemoveDevice(mountPath, DeviceTypeRW, sourcePath, func(dev *device) error {
+		if dev.encrypted != encrypted {
+			return fmt.Errorf("encrypted flag wrong, provided %v, expected %v", encrypted, dev.encrypted)
+		}
+		return nil
+	})
+}
+
+func (hm *hostMounts) RemoveOverlay(mountPath string) (undo func(), err error) {
+	hm.expectLocked()
+
+	var dev *device
+	err = hm.doRemoveDevice(mountPath, DeviceTypeOverlay, mountPath, func(_dev *device) error {
+		dev = _dev
+		for _, refDev := range dev.referencedDevices {
+			hm.releaseDeviceUsage(refDev)
+		}
+		return nil
+	})
+	if err != nil {
+		// If we get an error from doRemoveDevice, we have not released anything
+		// yet.
+		return nil, err
+	}
+	undo = func() {
+		hm.expectLocked()
+
+		for _, refDev := range dev.referencedDevices {
+			refDev.usage++
+		}
+
+		if _, ok := hm.devices[mountPath]; ok {
+			log.G(context.Background()).WithField("mountPath", mountPath).Error(
+				"hostMounts::RemoveOverlay: failed to undo remove: device that was removed exists in map",
+			)
+			return
+		}
+
+		hm.devices[mountPath] = dev
+	}
+	return undo, nil
+}
+
 // IsEncrypted checks if the given path is a sub-path of an encrypted read-write
 // device.
 func (hm *hostMounts) IsEncrypted(path string) bool {
-	hm.stateMutex.Lock()
-	defer hm.stateMutex.Unlock()
+	hm.expectLocked()
 
-	parentPath := ""
-	encrypted := false
-	cleanPath := filepath.Clean(path)
-	for rwPath, rwDev := range hm.readWriteMounts {
-		relPath, err := filepath.Rel(rwPath, cleanPath)
-		// skip further checks if an error is returned or the relative path
-		// contains "..", meaning that the `path` isn't directly nested under
-		// `rwPath`.
-		if err != nil || strings.HasPrefix(relPath, "..") {
-			continue
-		}
-		if len(rwDev.mountPath) > len(parentPath) {
-			parentPath = rwDev.mountPath
-			encrypted = rwDev.encrypted
-		}
+	dev := hm.findDeviceContainingPath(path)
+	if dev == nil {
+		return false
+	}
+	return dev.encrypted
+}
+
+func (hm *hostMounts) HasOverlayMountedAt(path string) bool {
+	hm.expectLocked()
+
+	dev := hm.findDeviceAtPath(filepath.Clean(path))
+	if dev == nil {
+		return false
 	}
-	return encrypted
+	return dev.ty == DeviceTypeOverlay
 }
diff --git a/internal/guest/runtime/hcsv2/uvm_state_test.go b/internal/guest/runtime/hcsv2/uvm_state_test.go
index b708caaeba..e87a207308 100644
--- a/internal/guest/runtime/hcsv2/uvm_state_test.go
+++ b/internal/guest/runtime/hcsv2/uvm_state_test.go
@@ -12,10 +12,13 @@ func Test_Add_Remove_RWDevice(t *testing.T) {
 	mountPath := "/run/gcs/c/abcd"
 	sourcePath := "/dev/sda"
 
+	hm.Lock()
+	defer hm.Unlock()
+
 	if err := hm.AddRWDevice(mountPath, sourcePath, false); err != nil {
 		t.Fatalf("unexpected error adding RW device: %s", err)
 	}
-	if err := hm.RemoveRWDevice(mountPath, sourcePath); err != nil {
+	if err := hm.RemoveRWDevice(mountPath, sourcePath, false); err != nil {
 		t.Fatalf("unexpected error removing RW device: %s", err)
 	}
 }
@@ -25,29 +28,55 @@ func Test_Cannot_AddRWDevice_Twice(t *testing.T) {
 	mountPath := "/run/gcs/c/abc"
 	sourcePath := "/dev/sda"
 
+	hm.Lock()
 	if err := hm.AddRWDevice(mountPath, sourcePath, false); err != nil {
 		t.Fatalf("unexpected error: %s", err)
 	}
+	hm.Unlock()
+
+	hm.Lock()
 	if err := hm.AddRWDevice(mountPath, sourcePath, false); err == nil {
 		t.Fatalf("expected error adding %q for the second time", mountPath)
 	}
+	hm.Unlock()
 }
 
 func Test_Cannot_RemoveRWDevice_Wrong_Source(t *testing.T) {
 	hm := newHostMounts()
+	hm.Lock()
+	defer hm.Unlock()
+
 	mountPath := "/run/gcs/c/abcd"
 	sourcePath := "/dev/sda"
 	wrongSource := "/dev/sdb"
 	if err := hm.AddRWDevice(mountPath, sourcePath, false); err != nil {
 		t.Fatalf("unexpected error: %s", err)
 	}
-	if err := hm.RemoveRWDevice(mountPath, wrongSource); err == nil {
+	if err := hm.RemoveRWDevice(mountPath, wrongSource, false); err == nil {
 		t.Fatalf("expected error removing wrong source %s", wrongSource)
 	}
 }
 
+func Test_Cannot_RemoveRWDevice_Wrong_Encrypted(t *testing.T) {
+	hm := newHostMounts()
+	hm.Lock()
+	defer hm.Unlock()
+
+	mountPath := "/run/gcs/c/abcd"
+	sourcePath := "/dev/sda"
+	if err := hm.AddRWDevice(mountPath, sourcePath, false); err != nil {
+		t.Fatalf("unexpected error: %s", err)
+	}
+	if err := hm.RemoveRWDevice(mountPath, sourcePath, true); err == nil {
+		t.Fatalf("expected error removing RW device with wrong encrypted flag")
+	}
+}
+
 func Test_HostMounts_IsEncrypted(t *testing.T) {
 	hm := newHostMounts()
+	hm.Lock()
+	defer hm.Unlock()
+
 	encryptedPath := "/run/gcs/c/encrypted"
 	encryptedSource := "/dev/sda"
 	if err := hm.AddRWDevice(encryptedPath, encryptedSource, true); err != nil {
@@ -108,3 +137,189 @@ func Test_HostMounts_IsEncrypted(t *testing.T) {
 		})
 	}
 }
+
+func Test_HostMounts_AddRemoveRODevice(t *testing.T) {
+	hm := newHostMounts()
+	hm.Lock()
+	defer hm.Unlock()
+
+	mountPath := "/run/gcs/c/abcd"
+	sourcePath := "/dev/sda"
+
+	if err := hm.AddRODevice(mountPath, sourcePath); err != nil {
+		t.Fatalf("unexpected error adding RO device: %s", err)
+	}
+
+	if err := hm.RemoveRODevice(mountPath, sourcePath); err != nil {
+		t.Fatalf("unexpected error removing RO device: %s", err)
+	}
+}
+
+func Test_HostMounts_Cannot_AddRODevice_Twice(t *testing.T) {
+	hm := newHostMounts()
+	hm.Lock()
+	defer hm.Unlock()
+
+	mountPath := "/run/gcs/c/abc"
+	sourcePath := "/dev/sda"
+
+	if err := hm.AddRODevice(mountPath, sourcePath); err != nil {
+		t.Fatalf("unexpected error: %s", err)
+	}
+	if err := hm.AddRODevice(mountPath, sourcePath); err == nil {
+		t.Fatalf("expected error adding %q for the second time", mountPath)
+	}
+}
+
+func Test_HostMounts_AddRemoveOverlay(t *testing.T) {
+	hm := newHostMounts()
+	hm.Lock()
+	defer hm.Unlock()
+
+	mountPath := "/run/gcs/c/aaaa/rootfs"
+	layers := []string{
+		"/run/mounts/scsi/m1",
+		"/run/mounts/scsi/m2",
+		"/run/mounts/scsi/m3",
+	}
+	for _, layer := range layers {
+		if err := hm.AddRODevice(layer, layer); err != nil {
+			t.Fatalf("unexpected error adding RO device: %s", err)
+		}
+	}
+	scratchDir := "/run/gcs/c/aaaa/scratch"
+	if err := hm.AddRWDevice(scratchDir, scratchDir, true); err != nil {
+		t.Fatalf("unexpected error adding RW device: %s", err)
+	}
+	if err := hm.AddOverlay(mountPath, layers, scratchDir); err != nil {
+		t.Fatalf("unexpected error adding overlay: %s", err)
+	}
+	undo, err := hm.RemoveOverlay(mountPath)
+	if err != nil {
+		t.Fatalf("unexpected error removing overlay: %s", err)
+	}
+	if undo == nil {
+		t.Fatalf("expected undo function to be non-nil")
+	}
+	undo()
+	if _, err = hm.RemoveOverlay(mountPath); err != nil {
+		t.Fatalf("unexpected error removing overlay again: %s", err)
+	}
+}
+
+func Test_HostMounts_Cannot_RemoveInUseDeviceByOverlay(t *testing.T) {
+	hm := newHostMounts()
+	hm.Lock()
+	defer hm.Unlock()
+
+	mountPath := "/run/gcs/c/aaaa/rootfs"
+	layers := []string{
+		"/run/mounts/scsi/m1",
+		"/run/mounts/scsi/m2",
+		"/run/mounts/scsi/m3",
+	}
+	for _, layer := range layers {
+		if err := hm.AddRODevice(layer, layer); err != nil {
+			t.Fatalf("unexpected error adding RO device: %s", err)
+		}
+	}
+	scratchDir := "/run/gcs/c/aaaa/scratch"
+	if err := hm.AddRWDevice(scratchDir, scratchDir, true); err != nil {
+		t.Fatalf("unexpected error adding RW device: %s", err)
+	}
+	if err := hm.AddOverlay(mountPath, layers, scratchDir); err != nil {
+		t.Fatalf("unexpected error adding overlay: %s", err)
+	}
+
+	for _, layer := range layers {
+		if err := hm.RemoveRODevice(layer, layer); err == nil {
+			t.Fatalf("expected error removing RO device %s while in use by overlay", layer)
+		}
+	}
+	if err := hm.RemoveRWDevice(scratchDir, scratchDir, true); err == nil {
+		t.Fatalf("expected error removing RW device %s while in use by overlay", scratchDir)
+	}
+
+	if _, err := hm.RemoveOverlay(mountPath); err != nil {
+		t.Fatalf("unexpected error removing overlay: %s", err)
+	}
+
+	// now we can remove
+	for _, layer := range layers {
+		if err := hm.RemoveRODevice(layer, layer); err != nil {
+			t.Fatalf("unexpected error removing RO device %s: %s", layer, err)
+		}
+	}
+	if err := hm.RemoveRWDevice(scratchDir, scratchDir, true); err != nil {
+		t.Fatalf("unexpected error removing RW device %s: %s", scratchDir, err)
+	}
+}
+
+func Test_HostMounts_Cannot_RemoveInUseDeviceByOverlay_MultipleUsers(t *testing.T) {
+	hm := newHostMounts()
+	hm.Lock()
+	defer hm.Unlock()
+
+	overlay1 := "/run/gcs/c/aaaa/rootfs"
+	overlay2 := "/run/gcs/c/bbbb/rootfs"
+	layers := []string{
+		"/run/mounts/scsi/m1",
+		"/run/mounts/scsi/m2",
+		"/run/mounts/scsi/m3",
+	}
+	for _, layer := range layers {
+		if err := hm.AddRODevice(layer, layer); err != nil {
+			t.Fatalf("unexpected error adding RO device: %s", err)
+		}
+	}
+	sharedScratchMount := "/run/gcs/c/sandbox"
+	scratch1 := sharedScratchMount + "/scratch/aaaa"
+	scratch2 := sharedScratchMount + "/scratch/bbbb"
+	if err := hm.AddRWDevice(sharedScratchMount, sharedScratchMount, true); err != nil {
+		t.Fatalf("unexpected error adding RW device: %s", err)
+	}
+	if err := hm.AddOverlay(overlay1, layers, scratch1); err != nil {
+		t.Fatalf("unexpected error adding overlay1: %s", err)
+	}
+
+	if err := hm.AddOverlay(overlay2, layers[0:2], scratch2); err != nil {
+		t.Fatalf("unexpected error adding overlay2: %s", err)
+	}
+
+	for _, layer := range layers {
+		if err := hm.RemoveRODevice(layer, layer); err == nil {
+			t.Fatalf("expected error removing RO device %s while in use by overlay", layer)
+		}
+	}
+	if err := hm.RemoveRWDevice(sharedScratchMount, sharedScratchMount, true); err == nil {
+		t.Fatalf("expected error removing RW device %s while in use by overlay", sharedScratchMount)
+	}
+
+	if _, err := hm.RemoveOverlay(overlay1); err != nil {
+		t.Fatalf("unexpected error removing overlay 1: %s", err)
+	}
+
+	for _, layer := range layers[0:2] {
+		if err := hm.RemoveRODevice(layer, layer); err == nil {
+			t.Fatalf("expected error removing RO device %s (still in use by overlay 2)", layer)
+		}
+	}
+	if err := hm.RemoveRODevice(layers[2], layers[2]); err != nil {
+		t.Fatalf("unexpected error removing layers[2] which is not being used by overlay 2: %s", err)
+	}
+	if err := hm.RemoveRWDevice(sharedScratchMount, sharedScratchMount, true); err == nil {
+		t.Fatalf("expected error removing RW device %s while in use by overlay 2", scratch2)
+	}
+
+	if _, err := hm.RemoveOverlay(overlay2); err != nil {
+		t.Fatalf("unexpected error removing overlay 2: %s", err)
+	}
+	for _, layer := range layers[0:2] {
+		if err := hm.RemoveRODevice(layer, layer); err != nil {
+			t.Fatalf("unexpected error removing RO device %s: %s", layer, err)
+		}
+	}
+	if err := hm.RemoveRWDevice(sharedScratchMount, sharedScratchMount, true); err != nil {
+		t.Fatalf("unexpected error removing RW device %s: %s", sharedScratchMount, err)
+	}
+}

From d9bec7ac74854142bc08409e97fe864d9ed173b7 Mon Sep 17 00:00:00 2001
From: Tingmao Wang <tingmaowang@microsoft.com>
Date: Thu, 11 Jun 2026 15:33:42 +0000
Subject: [PATCH 5/8] Remove usage of UnrecoverableError

Signed-off-by: Tingmao Wang <tingmaowang@microsoft.com>
---
 internal/guest/runtime/hcsv2/uvm_state.go | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/internal/guest/runtime/hcsv2/uvm_state.go b/internal/guest/runtime/hcsv2/uvm_state.go
index 96e64371a2..f7d347cbf4 100644
--- a/internal/guest/runtime/hcsv2/uvm_state.go
+++ b/internal/guest/runtime/hcsv2/uvm_state.go
@@ -5,13 +5,11 @@ package hcsv2
 
 import (
 	"context"
-	"errors"
 	"fmt"
 	"path/filepath"
 	"strings"
 	"sync"
 
-	"github.com/Microsoft/hcsshim/internal/gcs"
 	"github.com/Microsoft/hcsshim/internal/log"
 	"github.com/sirupsen/logrus"
 )
@@ -76,7 +74,7 @@ func newHostMounts() *hostMounts {
 
 func (hm *hostMounts) expectLocked() {
 	if !hm.stateMutexLocked {
-		gcs.UnrecoverableError(errors.New("hostMounts: expected stateMutex to be locked, but it was not"))
+		panic("hostMounts: expected stateMutex to be locked, but it was not")
 	}
 }
 
@@ -88,7 +86,7 @@ func (hm *hostMounts) Lock() {
 	// somebody just unlocked it. Either case, hm.stateMutexLocked should be
 	// false.
 	if hm.stateMutexLocked {
-		gcs.UnrecoverableError(errors.New("hostMounts: stateMutexLocked already true when locking stateMutex"))
+		panic("hostMounts: stateMutexLocked already true when locking stateMutex")
 	}
 	hm.stateMutexLocked = true
 }

From 91d4805549334b54a2400834a2c9adbb599b3f8a Mon Sep 17 00:00:00 2001
From: Tingmao Wang <tingmaowang@microsoft.com>
Date: Tue, 11 Nov 2025 16:55:12 +0000
Subject: [PATCH 6/8] Merged PR 13627088: guest: Don't allow host to set mount
 options

[cherry-picked from 1dd0b7ea0b0f91d3698f6008fb0bd5b0de777da6]

Blocks mount option passing for 9p (which is accidental) and SCSI disks.

- guest: Restrict plan9 share names to digits only on Confidential mode
- hcsv2/uvm: Restrict SCSI mount options in confidential mode
   (The only one we allow is `ro`)

Related work items: #34370380
Signed-off-by: Tingmao Wang <tingmaowang@microsoft.com>
---
 internal/guest/runtime/hcsv2/uvm.go   | 24 ++++++++++++++++++++++++
 internal/guest/storage/plan9/plan9.go | 14 ++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go
index f6878dccd2..46603c07ea 100644
--- a/internal/guest/runtime/hcsv2/uvm.go
+++ b/internal/guest/runtime/hcsv2/uvm.go
@@ -1320,6 +1320,24 @@ func (h *Host) modifyMappedVirtualDisk(
 			mountCtx, cancel := context.WithTimeout(ctx, time.Second*5)
 			defer cancel()
 			if mvd.MountPath != "" {
+				if h.HasSecurityPolicy() {
+					// The only option we allow if there is policy enforcement is
+					// "ro", and it must match the readonly field in the request.
+					mountOptionHasRo := false
+					for _, opt := range mvd.Options {
+						if opt == "ro" {
+							mountOptionHasRo = true
+							continue
+						}
+						return errors.Errorf("mounting scsi device controller %d lun %d onto %s: mount option %q denied by policy", mvd.Controller, mvd.Lun, mvd.MountPath, opt)
+					}
+					if mvd.ReadOnly != mountOptionHasRo {
+						return errors.Errorf(
+							"mounting scsi device controller %d lun %d onto %s with mount option %q failed due to mount option mismatch: mvd.ReadOnly=%t but mountOptionHasRo=%t",
+							mvd.Controller, mvd.Lun, mvd.MountPath, strings.Join(mvd.Options, ","), mvd.ReadOnly, mountOptionHasRo,
+						)
+					}
+				}
 				if mvd.ReadOnly {
 					var deviceHash string
 					if verityInfo != nil {
@@ -1481,6 +1499,12 @@ func (h *Host) modifyMappedDirectory(
 				return errors.Wrapf(err, "mounting plan9 device at %s denied by policy", md.MountPath)
 			}
 
+			if h.HasSecurityPolicy() {
+				if err = plan9.ValidateShareName(md.ShareName); err != nil {
+					return err
+				}
+			}
+
 			// Similar to the reasoning in modifyMappedVirtualDisk, since we're
 			// rolling back the policy metadata, plan9.Mount here must clean up
 			// everything if it fails, which it does do.
diff --git a/internal/guest/storage/plan9/plan9.go b/internal/guest/storage/plan9/plan9.go
index 5c1f1d74f4..44ac0f4e4e 100644
--- a/internal/guest/storage/plan9/plan9.go
+++ b/internal/guest/storage/plan9/plan9.go
@@ -7,6 +7,7 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"regexp"
 	"syscall"
 
 	"github.com/Microsoft/hcsshim/internal/guest/transport"
@@ -25,6 +26,19 @@ var (
 	unixMount   = unix.Mount
 )
 
+// c.f. v9fs_parse_options in linux/fs/9p/v9fs.c - technically anything other
+// than ',' is ok (quoting is not handled), however, this name is generated from
+// a counter in AddPlan9 (internal/uvm/plan9.go), and therefore we expect only
+// digits from a normal hcsshim host.
+var validShareNameRegex = regexp.MustCompile(`^[0-9]+$`)
+
+func ValidateShareName(name string) error {
+	if !validShareNameRegex.MatchString(name) {
+		return fmt.Errorf("invalid plan9 share name %q: must match regex %q", name, validShareNameRegex.String())
+	}
+	return nil
+}
+
 // Mount dials a connection from `vsock` and mounts a Plan9 share to `target`.
 //
 // `target` will be created. On mount failure the created `target` will be

From 4866c03e43db7d7d919369442242c3ea1dad7f23 Mon Sep 17 00:00:00 2001
From: Tingmao Wang <tingmaowang@microsoft.com>
Date: Wed, 4 Mar 2026 14:32:21 +0000
Subject: [PATCH 7/8] Merged PR 13694574: hcsv2/uvm, rego: Enforce OCI spec
 does not contain any LinuxDevices

[cherry-picked from 9f69e49f3..72b338a99]
[forward-ported from original PR 13653011 (e631aa42c6039ab0d228b746b280c26809433f00)]

See https://msazure.visualstudio.com/ContainerPlatform/_git/Microsoft.hcsshim/pullrequest/13653011

Signed-off-by: Tingmao Wang <tingmaowang@microsoft.com>
---
 internal/guest/runtime/hcsv2/uvm.go           | 34 ++++++++++++++-----
 pkg/securitypolicy/framework.rego             | 16 +++++++++
 pkg/securitypolicy/regopolicy_linux_test.go   | 33 ++++++++++++++++++
 pkg/securitypolicy/securitypolicyenforcer.go  |  1 +
 .../securitypolicyenforcer_rego.go            | 18 ++++++++++
 5 files changed, 94 insertions(+), 8 deletions(-)

diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go
index 46603c07ea..d2ca404f65 100644
--- a/internal/guest/runtime/hcsv2/uvm.go
+++ b/internal/guest/runtime/hcsv2/uvm.go
@@ -15,6 +15,7 @@ import (
 	"path"
 	"path/filepath"
 	"regexp"
+	"slices"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -530,6 +531,17 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
 			h.RemoveContainer(id)
 		}
 	}()
+	// Take a backup of the devices array before we populate it with any devices
+	// found by GCS, in order to pass to the policy enforcer later.
+	//
+	// In specGuest.ApplyAnnotationsToSpec, if this is a privileged container,
+	// we will add devices found in the GCS namespace's /dev. Regardless of
+	// privileged or not, we also always include /dev/sev-guest.  Since the
+	// policy already lets the user enforce whether the container should be
+	// privileged or not, and the sev-guest device is always added for a
+	// confidential container, we do not need the policy enforcer to check these
+	// devices we dynamically add again.
+	extraLinuxDevices := slices.Clone(settings.OCISpecification.Linux.Devices)
 
 	var namespaceID string
 	if isCRI {
@@ -661,21 +673,27 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
 		return nil, err
 	}
 
-	envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicy(
+	privileged := isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification)
+	noNewPrivileges := settings.OCISpecification.Process.NoNewPrivileges
+	opts := &securitypolicy.CreateContainerOptions{
+		SandboxID:            sandboxID,
+		Privileged:           &privileged,
+		NoNewPrivileges:      &noNewPrivileges,
+		Groups:               groups,
+		Umask:                umask,
+		Capabilities:         settings.OCISpecification.Process.Capabilities,
+		SeccompProfileSHA256: seccomp,
+		LinuxDevices:         extraLinuxDevices,
+	}
+	envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicyV2(
 		ctx,
-		sandboxID,
 		id,
 		settings.OCISpecification.Process.Args,
 		settings.OCISpecification.Process.Env,
 		settings.OCISpecification.Process.Cwd,
 		settings.OCISpecification.Mounts,
-		isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification),
-		settings.OCISpecification.Process.NoNewPrivileges,
 		user,
-		groups,
-		umask,
-		settings.OCISpecification.Process.Capabilities,
-		seccomp,
+		opts,
 	)
 	if err != nil {
 		return nil, errors.Wrapf(err, "container creation denied due to policy")
diff --git a/pkg/securitypolicy/framework.rego b/pkg/securitypolicy/framework.rego
index daa0fe864e..09aa685532 100644
--- a/pkg/securitypolicy/framework.rego
+++ b/pkg/securitypolicy/framework.rego
@@ -388,6 +388,14 @@ seccomp_ok(seccomp_profile_sha256) {
     is_windows
 }
 
+devices_ok(expected_devices, actual_devices) {
+    # Allow out of order but not duplicates
+    set_expected := {dev | dev := expected_devices[_]}
+    set_actual := {dev | dev := actual_devices[_]}
+    set_expected == set_actual
+    count(set_actual) == count(actual_devices)
+}
+
 default container_started := false
 
 container_started {
@@ -599,6 +607,8 @@ create_container := {"metadata": [updateMatches, addStarted],
         command_ok(container.command)
         mountList_ok(container.mounts, container.allow_elevated)
         seccomp_ok(container.seccomp_profile_sha256)
+        # We do not support adding device nodes to the policy yet
+        devices_ok([], input.devices)
     ]
 
     count(possible_after_initial_containers) > 0
@@ -2090,6 +2100,12 @@ errors["capabilities don't match"] {
     count(possible_after_caps_containers) == 0
 }
 
+errors["devices not supported"] {
+    is_linux
+    input.rule == "create_container"
+    not devices_ok([], input.devices)
+}
+
 # covers exec_in_container as well. it shouldn't be possible to ever get
 # an exec_in_container as it "inherits" capabilities rules from create_container
 errors["containers only distinguishable by capabilties"] {
diff --git a/pkg/securitypolicy/regopolicy_linux_test.go b/pkg/securitypolicy/regopolicy_linux_test.go
index 236e64d4bd..6f01169b5a 100644
--- a/pkg/securitypolicy/regopolicy_linux_test.go
+++ b/pkg/securitypolicy/regopolicy_linux_test.go
@@ -2151,6 +2151,39 @@ func Test_Rego_EnforceCreateContainer_Capabilities_Drop_NoMatches(t *testing.T)
 	}
 }
 
+func Test_Regi_EnforceCreateContainer_RequireNoDevices(t *testing.T) {
+	f := func(p *generatedConstraints) bool {
+		tc, err := setupSimpleRegoCreateContainerTest(p)
+		if err != nil {
+			t.Error(err)
+			return false
+		}
+
+		privileged := false
+
+		_, _, _, err = tc.policy.EnforceCreateContainerPolicyV2(p.ctx, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, tc.user, &CreateContainerOptions{
+			SandboxID:            tc.sandboxID,
+			Privileged:           &privileged,
+			NoNewPrivileges:      &tc.noNewPrivileges,
+			Groups:               tc.groups,
+			Umask:                tc.umask,
+			Capabilities:         tc.capabilities,
+			SeccompProfileSHA256: tc.seccomp,
+			LinuxDevices: []oci.LinuxDevice{
+				{
+					Path: "/test",
+				},
+			},
+		})
+
+		return assertDecisionJSONContains(t, err, "devices not supported")
+	}
+
+	if err := quick.Check(f, &quick.Config{MaxCount: 50, Rand: testRand}); err != nil {
+		t.Errorf("Test_Regi_EnforceCreateContainer_RequireNoDevices: %v", err)
+	}
+}
+
 func Test_Rego_ExtendDefaultMounts(t *testing.T) {
 	f := func(p *generatedConstraints) bool {
 		tc, err := setupSimpleRegoCreateContainerTest(p)
diff --git a/pkg/securitypolicy/securitypolicyenforcer.go b/pkg/securitypolicy/securitypolicyenforcer.go
index 3f94bae0f5..c617016a1a 100644
--- a/pkg/securitypolicy/securitypolicyenforcer.go
+++ b/pkg/securitypolicy/securitypolicyenforcer.go
@@ -29,6 +29,7 @@ type CreateContainerOptions struct {
 	Umask                string
 	Capabilities         *oci.LinuxCapabilities
 	SeccompProfileSHA256 string
+	LinuxDevices         []oci.LinuxDevice
 }
 type SignalContainerOptions struct {
 	IsInitProcess bool
diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go
index 6cd911b5ec..0c4f168e6c 100644
--- a/pkg/securitypolicy/securitypolicyenforcer_rego.go
+++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go
@@ -713,6 +713,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicy(
 		Umask:                umask,
 		Capabilities:         capabilities,
 		SeccompProfileSHA256: seccompProfileSHA256,
+		LinuxDevices:         []oci.LinuxDevice{},
 	}
 	return policy.EnforceCreateContainerPolicyV2(ctx, containerID, argList, envList, workingDir, mounts, user, opts)
 }
@@ -750,6 +751,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicyV2(
 			"sandboxDir":           SandboxMountsDir(opts.SandboxID),
 			"hugePagesDir":         HugePagesMountsDir(opts.SandboxID),
 			"mounts":               appendMountData([]interface{}{}, mounts),
+			"devices":              appendDeviceData([]interface{}{}, opts.LinuxDevices),
 			"privileged":           opts.Privileged,
 			"noNewPrivileges":      opts.NoNewPrivileges,
 			"user":                 user.toInput(),
@@ -838,6 +840,22 @@ func appendMountData(mountData []interface{}, mounts []oci.Mount) []interface{}
 	return mountData
 }
 
+func appendDeviceData(deviceData []interface{}, devices []oci.LinuxDevice) []interface{} {
+	for _, device := range devices {
+		deviceData = append(deviceData, inputData{
+			"path":     device.Path,
+			"type":     device.Type,
+			"major":    device.Major,
+			"minor":    device.Minor,
+			"fileMode": device.FileMode,
+			"uid":      device.UID,
+			"gid":      device.GID,
+		})
+	}
+
+	return deviceData
+}
+
 func (policy *regoEnforcer) ExtendDefaultMounts(mounts []oci.Mount) error {
 	policy.defaultMounts = append(policy.defaultMounts, mounts...)
 	defaultMounts := appendMountData([]interface{}{}, policy.defaultMounts)

From ac490f938cd55c0cb6c42f1c096bc5247e4092e4 Mon Sep 17 00:00:00 2001
From: Tingmao Wang <tingmaowang@microsoft.com>
Date: Thu, 4 Jun 2026 00:36:25 +0000
Subject: [PATCH 8/8] Fix copilot review comments from blockdev mounts PR

Addresses the following comments:
https://github.com/microsoft/hcsshim/pull/2762#discussion_r3333091150
https://github.com/microsoft/hcsshim/pull/2762#discussion_r3333091204
https://github.com/microsoft/hcsshim/pull/2762#discussion_r3333091229
https://github.com/microsoft/hcsshim/pull/2762#discussion_r3333091293
https://github.com/microsoft/hcsshim/pull/2762#discussion_r3333091326
https://github.com/microsoft/hcsshim/pull/2762#discussion_r3333091352

Assisted-by: GitHub Copilot:claude-opus-4.7 copilot-review
Signed-off-by: Tingmao Wang <tingmaowang@microsoft.com>
---
 pkg/securitypolicy/regopolicy_linux_test.go       |  4 ++--
 pkg/securitypolicy/securitypolicyenforcer_rego.go | 11 +++++++++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/pkg/securitypolicy/regopolicy_linux_test.go b/pkg/securitypolicy/regopolicy_linux_test.go
index 6f01169b5a..f64ab10f6d 100644
--- a/pkg/securitypolicy/regopolicy_linux_test.go
+++ b/pkg/securitypolicy/regopolicy_linux_test.go
@@ -2151,7 +2151,7 @@ func Test_Rego_EnforceCreateContainer_Capabilities_Drop_NoMatches(t *testing.T)
 	}
 }
 
-func Test_Regi_EnforceCreateContainer_RequireNoDevices(t *testing.T) {
+func Test_Rego_EnforceCreateContainer_RequireNoDevices(t *testing.T) {
 	f := func(p *generatedConstraints) bool {
 		tc, err := setupSimpleRegoCreateContainerTest(p)
 		if err != nil {
@@ -2180,7 +2180,7 @@ func Test_Regi_EnforceCreateContainer_RequireNoDevices(t *testing.T) {
 	}
 
 	if err := quick.Check(f, &quick.Config{MaxCount: 50, Rand: testRand}); err != nil {
-		t.Errorf("Test_Regi_EnforceCreateContainer_RequireNoDevices: %v", err)
+		t.Errorf("Test_Rego_EnforceCreateContainer_RequireNoDevices: %v", err)
 	}
 }
 
diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go
index 0c4f168e6c..6727e65464 100644
--- a/pkg/securitypolicy/securitypolicyenforcer_rego.go
+++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go
@@ -840,6 +840,13 @@ func appendMountData(mountData []interface{}, mounts []oci.Mount) []interface{}
 	return mountData
 }
 
+func uint32ptrtoany(i *uint32) interface{} {
+	if i == nil {
+		return nil
+	}
+	return *i
+}
+
 func appendDeviceData(deviceData []interface{}, devices []oci.LinuxDevice) []interface{} {
 	for _, device := range devices {
 		deviceData = append(deviceData, inputData{
@@ -848,8 +855,8 @@ func appendDeviceData(deviceData []interface{}, devices []oci.LinuxDevice) []int
 			"major":    device.Major,
 			"minor":    device.Minor,
 			"fileMode": device.FileMode,
-			"uid":      device.UID,
-			"gid":      device.GID,
+			"uid":      uint32ptrtoany(device.UID),
+			"gid":      uint32ptrtoany(device.GID),
 		})
 	}