diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index 0162e7936d..b03013acdb 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -30,6 +30,7 @@ import ( "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/oc" "github.com/Microsoft/hcsshim/internal/version" + "github.com/Microsoft/hcsshim/pkg/amdsevsnp" "github.com/Microsoft/hcsshim/pkg/securitypolicy" oci "github.com/opencontainers/runtime-spec/specs-go" ) @@ -414,6 +415,16 @@ func main() { mux := bridge.NewBridgeMux() b := bridge.New(mux, *v4) + // For confidential containers, we protect ourselves against attacks caused + // by concurrent modifications, by processing one request at a time. + forceSequential, err := amdsevsnp.IsSNP() + if err != nil { + // IsSNP cannot fail on LCOW + logrus.Errorf("Got unexpected error from IsSNP(): %v", err) + // If it fails, we proceed with forceSequential enabled to be safe + forceSequential = true + } + b.ForceSequential = forceSequential b.AssignHandlers(mux, h) // Reconnect loop: dial the host, serve until the connection drops, then diff --git a/internal/guest/bridge/bridge.go b/internal/guest/bridge/bridge.go index d8d57e786d..2a029773ed 100644 --- a/internal/guest/bridge/bridge.go +++ b/internal/guest/bridge/bridge.go @@ -177,6 +177,10 @@ type Bridge struct { Handler Handler // EnableV4 enables the v4+ bridge and the schema v2+ interfaces. EnableV4 bool + // Setting ForceSequential to true will force the bridge to only process one + // request at a time, except for certain long-running operations (as defined + // in asyncMessages). + ForceSequential bool // responseChan is the response channel used for both request/response // and publish notification workflows. @@ -256,6 +260,14 @@ func (b *Bridge) ShutdownRequested() bool { return b.hasQuitPending.Load() } +// Messages that will be processed asynchronously even in sequential mode. Note +// that in sequential mode, these messages will still wait for any in-progress +// non-async messages to be handled before they are processed, but once they are +// "acknowledged", the rest will be done asynchronously. +var alwaysAsyncMessages map[prot.MessageIdentifier]bool = map[prot.MessageIdentifier]bool{ + prot.ComputeSystemWaitForProcessV1: true, +} + // AssignHandlers creates and assigns the appropriate bridge // events to be listen for and intercepted on `mux` before forwarding // to `gcs` for handling. @@ -307,6 +319,10 @@ func (b *Bridge) ListenAndServe(bridgeIn io.ReadCloser, bridgeOut io.WriteCloser defer b.disconnectNotifications() defer close(b.quitChan) + if b.ForceSequential { + log.G(context.Background()).Info("bridge: ForceSequential enabled") + } + // Receive bridge requests and schedule them to be processed. go func() { var recverr error @@ -409,30 +425,36 @@ func (b *Bridge) ListenAndServe(bridgeIn io.ReadCloser, bridgeOut io.WriteCloser }() // Process each bridge request async and create the response writer. go func() { - for req := range requestChan { - go func(r *Request) { - br := bridgeResponse{ - ctx: r.Context, - header: &prot.MessageHeader{ - Type: prot.GetResponseIdentifier(r.Header.Type), - ID: r.Header.ID, - }, - } - resp, err := b.Handler.ServeMsg(r) - if resp == nil { - resp = &prot.MessageResponseBase{} - } - resp.Base().ActivityID = r.ActivityID - if err != nil { - span := trace.FromContext(r.Context) - if span != nil { - oc.SetSpanStatus(span, err) - } - setErrorForResponseBase(resp.Base(), err, "gcs" /* moduleName */) + doOneRequest := func(r *Request) { + br := bridgeResponse{ + ctx: r.Context, + header: &prot.MessageHeader{ + Type: prot.GetResponseIdentifier(r.Header.Type), + ID: r.Header.ID, + }, + } + resp, err := b.Handler.ServeMsg(r) + if resp == nil { + resp = &prot.MessageResponseBase{} + } + resp.Base().ActivityID = r.ActivityID + if err != nil { + span := trace.FromContext(r.Context) + if span != nil { + oc.SetSpanStatus(span, err) } - br.response = resp - b.responseChan <- br - }(req) + setErrorForResponseBase(resp.Base(), err, "gcs" /* moduleName */) + } + br.response = resp + b.responseChan <- br + } + + for req := range requestChan { + if b.ForceSequential && !alwaysAsyncMessages[req.Header.Type] { + runSequentialRequestHandler(req, doOneRequest) + } else { + go doOneRequest(req) + } } }() // Process each bridge response sync. This channel is for request/response and publish workflows. @@ -495,6 +517,32 @@ func (b *Bridge) ListenAndServe(bridgeIn io.ReadCloser, bridgeOut io.WriteCloser } } +// Do handleFn(r), but prints a warning if handleFn does not, or takes too long +// to return. +func runSequentialRequestHandler(r *Request, handleFn func(*Request)) { + // Note that this is only a context used for triggering the blockage + // warning, the request processing still uses r.Context. We don't want to + // cancel the request handling itself when we reach the 5s timeout. + timeoutCtx, cancel := context.WithTimeout(r.Context, 5*time.Second) + go func() { + <-timeoutCtx.Done() + if errors.Is(timeoutCtx.Err(), context.DeadlineExceeded) { + log.G(timeoutCtx).WithFields(logrus.Fields{ + // We want to log those even though we're providing r.Context, since if + // the request never finishes the span end log will never get written, + // and we may therefore not be able to find out about the following info + // otherwise: + "message-type": r.Header.Type.String(), + "message-id": r.Header.ID, + "activity-id": r.ActivityID, + "container-id": r.ContainerID, + }).Warnf("bridge: request processing thread in sequential mode blocked on the current request for more than 5 seconds") + } + }() + defer cancel() + handleFn(r) +} + // PublishNotification writes a specific notification to the bridge. func (b *Bridge) PublishNotification(n *prot.ContainerNotification) { ctx, span := oc.StartSpan(context.Background(), diff --git a/internal/guest/bridge/bridge_v2.go b/internal/guest/bridge/bridge_v2.go index dc13ec6428..748e9dcd9b 100644 --- a/internal/guest/bridge/bridge_v2.go +++ b/internal/guest/bridge/bridge_v2.go @@ -467,16 +467,10 @@ func (b *Bridge) deleteContainerStateV2(r *Request) (_ RequestResponse, err erro return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message) } - c, err := b.hostState.GetCreatedContainer(request.ContainerID) + err = b.hostState.DeleteContainerState(ctx, request.ContainerID) if err != nil { return nil, err } - // remove container state regardless of delete's success - defer b.hostState.RemoveContainer(request.ContainerID) - - if err := c.Delete(ctx); err != nil { - return nil, err - } return &prot.MessageResponseBase{}, nil } diff --git a/internal/guest/network/network.go b/internal/guest/network/network.go index cc06398e41..f68f8afa39 100644 --- a/internal/guest/network/network.go +++ b/internal/guest/network/network.go @@ -9,6 +9,7 @@ import ( "fmt" "os" "path/filepath" + "regexp" "strings" "time" @@ -32,6 +33,18 @@ var ( // maxDNSSearches is limited to 6 in `man 5 resolv.conf` const maxDNSSearches = 6 +var validHostnameRegex = regexp.MustCompile(`^[a-zA-Z0-9_\-\.]{0,255}$`) + +// Check that the hostname is safe. This function is less strict than +// technically allowed, but ensures that when the hostname is inserted to +// /etc/hosts, it cannot lead to injection attacks. +func ValidateHostname(hostname string) error { + if !validHostnameRegex.MatchString(hostname) { + return errors.Errorf("hostname %q invalid: must match %s", hostname, validHostnameRegex.String()) + } + return nil +} + // GenerateEtcHostsContent generates a /etc/hosts file based on `hostname`. func GenerateEtcHostsContent(ctx context.Context, hostname string) string { _, span := oc.StartSpan(ctx, "network::GenerateEtcHostsContent") diff --git a/internal/guest/network/network_test.go b/internal/guest/network/network_test.go index 9a718aed66..cc0ec1be3a 100644 --- a/internal/guest/network/network_test.go +++ b/internal/guest/network/network_test.go @@ -7,6 +7,7 @@ import ( "context" "os" "path/filepath" + "strings" "testing" "time" ) @@ -70,6 +71,40 @@ func Test_GenerateResolvConfContent(t *testing.T) { } } +func Test_ValidateHostname(t *testing.T) { + validNames := []string{ + "localhost", + "my-hostname", + "my.hostname", + "my-host-name123", + "_underscores.are.allowed.too", + "", // Allow not specifying a hostname + } + + invalidNames := []string{ + "localhost\n13.104.0.1 ip6-localhost ip6-loopback localhost", + "localhost\n2603:1000::1 ip6-localhost ip6-loopback", + "hello@microsoft.com", + "has space", + "has,comma", + "\x00", + "a\nb", + strings.Repeat("a", 1000), + } + + for _, n := range validNames { + if err := ValidateHostname(n); err != nil { + t.Fatalf("expected %q to be valid, got: %v", n, err) + } + } + + for _, n := range invalidNames { + if err := ValidateHostname(n); err == nil { + t.Fatalf("expected %q to be invalid, but got nil error", n) + } + } +} + func Test_GenerateEtcHostsContent(t *testing.T) { type testcase struct { name string diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go index 13c0231801..15d416ae96 100644 --- a/internal/guest/runtime/hcsv2/container.go +++ b/internal/guest/runtime/hcsv2/container.go @@ -77,6 +77,9 @@ type Container struct { // and deal with the extra pointer dereferencing overhead. status atomic.Uint32 + // Set to true when the init process for the container has exited + terminated atomic.Bool + // scratchDirPath represents the path inside the UVM where the scratch directory // of this container is located. Usually, this is either `/run/gcs/c/` or // `/run/gcs/c//container_` if scratch is shared with UVM scratch. diff --git a/internal/guest/runtime/hcsv2/process.go b/internal/guest/runtime/hcsv2/process.go index e94c9792f6..96564cfab0 100644 --- a/internal/guest/runtime/hcsv2/process.go +++ b/internal/guest/runtime/hcsv2/process.go @@ -99,6 +99,7 @@ func newProcess(c *Container, spec *oci.Process, process runtime.Process, pid ui log.G(ctx).WithError(err).Error("failed to wait for runc process") } p.exitCode = exitCode + c.terminated.Store(true) log.G(ctx).WithField("exitCode", p.exitCode).Debug("process exited") // Free any process waiters diff --git a/internal/guest/runtime/hcsv2/sandbox_container.go b/internal/guest/runtime/hcsv2/sandbox_container.go index 5f3d80356b..c6623d52f1 100644 --- a/internal/guest/runtime/hcsv2/sandbox_container.go +++ b/internal/guest/runtime/hcsv2/sandbox_container.go @@ -50,6 +50,9 @@ func setupSandboxContainerSpec(ctx context.Context, id, sandboxRoot string, spec // Write the hostname hostname := spec.Hostname + if err = network.ValidateHostname(hostname); err != nil { + return err + } if hostname == "" { var err error hostname, err = os.Hostname() diff --git a/internal/guest/runtime/hcsv2/standalone_container.go b/internal/guest/runtime/hcsv2/standalone_container.go index 4cf8cc72ec..27b6f8471b 100644 --- a/internal/guest/runtime/hcsv2/standalone_container.go +++ b/internal/guest/runtime/hcsv2/standalone_container.go @@ -46,6 +46,9 @@ func setupStandaloneContainerSpec(ctx context.Context, id, rootDir string, spec }() hostname := spec.Hostname + if err = network.ValidateHostname(hostname); err != nil { + return err + } if hostname == "" { var err error hostname, err = os.Hostname() diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 0cd6448cae..d2ca404f65 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -15,8 +15,10 @@ import ( "path" "path/filepath" "regexp" + "slices" "strings" "sync" + "sync/atomic" "syscall" "time" @@ -25,6 +27,7 @@ import ( "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" + "go.opencensus.io/trace" "golang.org/x/sys/unix" "github.com/Microsoft/hcsshim/internal/bridgeutils/gcserr" @@ -44,6 +47,7 @@ import ( "github.com/Microsoft/hcsshim/internal/guestpath" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/logfields" + "github.com/Microsoft/hcsshim/internal/oc" "github.com/Microsoft/hcsshim/internal/oci" "github.com/Microsoft/hcsshim/internal/protocol/guestrequest" "github.com/Microsoft/hcsshim/internal/protocol/guestresource" @@ -114,7 +118,9 @@ type Host struct { securityOptions *securitypolicy.SecurityOptions // hostMounts keeps the state of currently mounted devices and file systems, - // which is used for GCS hardening. + // which is used for GCS hardening. It is only used for confidential + // containers, and is initialized in SetConfidentialUVMOptions. If this is + // nil, we do not do add any special restrictions on mounts / unmounts. hostMounts *hostMounts // uvmError contains a permanent flag to indicate that further mounts, // unmounts and container creation / deletion should not be allowed. This @@ -171,7 +177,7 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s rtime: rtime, vsock: vsock, devNullTransport: &transport.DevNullTransport{}, - hostMounts: newHostMounts(), + hostMounts: nil, securityOptions: securityPolicyOptions, uvmError: uvmConsistencyError{}, } @@ -500,6 +506,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM isSandbox: criType == "sandbox", exitType: prot.NtUnexpectedExit, processes: make(map[uint32]*containerProcess), + terminated: atomic.Bool{}, scratchDirPath: settings.ScratchDirPath, } c.setStatus(containerCreating) @@ -524,6 +531,17 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM h.RemoveContainer(id) } }() + // Take a backup of the devices array before we populate it with any devices + // found by GCS, in order to pass to the policy enforcer later. + // + // In specGuest.ApplyAnnotationsToSpec, if this is a privileged container, + // we will add devices found in the GCS namespace's /dev. Regardless of + // privileged or not, we also always include /dev/sev-guest. Since the + // policy already lets the user enforce whether the container should be + // privileged or not, and the sev-guest device is always added for a + // confidential container, we do not need the policy enforcer to check these + // devices we dynamically add again. + extraLinuxDevices := slices.Clone(settings.OCISpecification.Linux.Devices) var namespaceID string if isCRI { @@ -655,21 +673,27 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM return nil, err } - envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicy( + privileged := isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification) + noNewPrivileges := settings.OCISpecification.Process.NoNewPrivileges + opts := &securitypolicy.CreateContainerOptions{ + SandboxID: sandboxID, + Privileged: &privileged, + NoNewPrivileges: &noNewPrivileges, + Groups: groups, + Umask: umask, + Capabilities: settings.OCISpecification.Process.Capabilities, + SeccompProfileSHA256: seccomp, + LinuxDevices: extraLinuxDevices, + } + envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicyV2( ctx, - sandboxID, id, settings.OCISpecification.Process.Args, settings.OCISpecification.Process.Env, settings.OCISpecification.Process.Cwd, settings.OCISpecification.Mounts, - isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification), - settings.OCISpecification.Process.NoNewPrivileges, user, - groups, - umask, - settings.OCISpecification.Process.Capabilities, - seccomp, + opts, ) if err != nil { return nil, errors.Wrapf(err, "container creation denied due to policy") @@ -802,6 +826,25 @@ func writeSpecToFile(ctx context.Context, configFile string, spec *specs.Spec) e return nil } +// Returns whether there is a running container that is currently using the +// given overlay (as its rootfs). +func (h *Host) IsOverlayInUse(overlayPath string) bool { + h.containersMutex.Lock() + defer h.containersMutex.Unlock() + + for _, c := range h.containers { + if c.terminated.Load() { + continue + } + + if c.spec.Root.Path == overlayPath { + return true + } + } + + return false +} + func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req *guestrequest.ModificationRequest) (retErr error) { if h.HasSecurityPolicy() { if err := checkValidContainerID(containerID, "container"); err != nil { @@ -825,35 +868,6 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * return err } mvd.Controller = cNum - // first we try to update the internal state for read-write attachments. - if !mvd.ReadOnly { - localCtx, cancel := context.WithTimeout(ctx, time.Second*5) - defer cancel() - source, err := scsi.GetDevicePath(localCtx, mvd.Controller, mvd.Lun, mvd.Partition) - if err != nil { - return err - } - switch req.RequestType { - case guestrequest.RequestTypeAdd: - if err := h.hostMounts.AddRWDevice(mvd.MountPath, source, mvd.Encrypted); err != nil { - return err - } - defer func() { - if retErr != nil { - _ = h.hostMounts.RemoveRWDevice(mvd.MountPath, source) - } - }() - case guestrequest.RequestTypeRemove: - if err := h.hostMounts.RemoveRWDevice(mvd.MountPath, source); err != nil { - return err - } - defer func() { - if retErr != nil { - _ = h.hostMounts.AddRWDevice(mvd.MountPath, source, mvd.Encrypted) - } - }() - } - } return h.modifyMappedVirtualDisk(ctx, req.RequestType, mvd) case guestresource.ResourceTypeMappedDirectory: if err := h.checkState(); err != nil { @@ -872,12 +886,7 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * return err } - cl := req.Settings.(*guestresource.LCOWCombinedLayers) - // when cl.ScratchPath == "", we mount overlay as read-only, in which case - // we don't really care about scratch encryption, since the host already - // knows about the layers and the overlayfs. - encryptedScratch := cl.ScratchPath != "" && h.hostMounts.IsEncrypted(cl.ScratchPath) - return h.modifyCombinedLayers(ctx, req.RequestType, req.Settings.(*guestresource.LCOWCombinedLayers), encryptedScratch) + return h.modifyCombinedLayers(ctx, req.RequestType, req.Settings.(*guestresource.LCOWCombinedLayers)) case guestresource.ResourceTypeNetwork: return modifyNetwork(ctx, req.RequestType, req.Settings.(*guestresource.LCOWNetworkAdapter)) case guestresource.ResourceTypeVPCIDevice: @@ -893,10 +902,22 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * if !ok { return errors.New("the request's settings are not of type ConfidentialOptions") } - return h.securityOptions.SetConfidentialOptions(ctx, + err := h.securityOptions.SetConfidentialOptions(ctx, r.EnforcerType, r.EncodedSecurityPolicy, r.EncodedUVMReference) + if err != nil { + return err + } + + // Start tracking mounts and restricting unmounts on confidential containers. + // As long as we started off with the ClosedDoorSecurityPolicyEnforcer, no + // mounts should have been allowed until this point. + if h.HasSecurityPolicy() { + log.G(ctx).Debug("hostMounts initialized") + h.hostMounts = newHostMounts() + } + return nil case guestresource.ResourceTypePolicyFragment: r, ok := req.Settings.(*guestresource.SecurityPolicyFragment) if !ok { @@ -1270,18 +1291,33 @@ func (h *Host) modifyMappedVirtualDisk( rt guestrequest.RequestType, mvd *guestresource.LCOWMappedVirtualDisk, ) (err error) { + ctx, span := oc.StartSpan(ctx, "gcs::Host::modifyMappedVirtualDisk") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes( + trace.StringAttribute("requestType", string(rt)), + trace.BoolAttribute("hasHostMounts", h.hostMounts != nil), + trace.Int64Attribute("controller", int64(mvd.Controller)), + trace.Int64Attribute("lun", int64(mvd.Lun)), + trace.Int64Attribute("partition", int64(mvd.Partition)), + trace.BoolAttribute("readOnly", mvd.ReadOnly), + trace.StringAttribute("mountPath", mvd.MountPath), + ) + var verityInfo *guestresource.DeviceVerityInfo securityPolicy := h.securityOptions.PolicyEnforcer + devPath, err := scsi.GetDevicePath(ctx, mvd.Controller, mvd.Lun, mvd.Partition) + if err != nil { + return err + } + span.AddAttributes(trace.StringAttribute("devicePath", devPath)) + if mvd.ReadOnly { // The only time the policy is empty, and we want it to be empty // is when no policy is provided, and we default to open door // policy. In any other case, e.g. explicit open door or any // other rego policy we would like to mount layers with verity. if h.HasSecurityPolicy() { - devPath, err := scsi.GetDevicePath(ctx, mvd.Controller, mvd.Lun, mvd.Partition) - if err != nil { - return err - } verityInfo, err = verity.ReadVeritySuperBlock(ctx, devPath) if err != nil { return err @@ -1302,6 +1338,24 @@ func (h *Host) modifyMappedVirtualDisk( mountCtx, cancel := context.WithTimeout(ctx, time.Second*5) defer cancel() if mvd.MountPath != "" { + if h.HasSecurityPolicy() { + // The only option we allow if there is policy enforcement is + // "ro", and it must match the readonly field in the request. + mountOptionHasRo := false + for _, opt := range mvd.Options { + if opt == "ro" { + mountOptionHasRo = true + continue + } + return errors.Errorf("mounting scsi device controller %d lun %d onto %s: mount option %q denied by policy", mvd.Controller, mvd.Lun, mvd.MountPath, opt) + } + if mvd.ReadOnly != mountOptionHasRo { + return errors.Errorf( + "mounting scsi device controller %d lun %d onto %s with mount option %q failed due to mount option mismatch: mvd.ReadOnly=%t but mountOptionHasRo=%t", + mvd.Controller, mvd.Lun, mvd.MountPath, strings.Join(mvd.Options, ","), mvd.ReadOnly, mountOptionHasRo, + ) + } + } if mvd.ReadOnly { var deviceHash string if verityInfo != nil { @@ -1311,11 +1365,42 @@ func (h *Host) modifyMappedVirtualDisk( if err != nil { return errors.Wrapf(err, "mounting scsi device controller %d lun %d onto %s denied by policy", mvd.Controller, mvd.Lun, mvd.MountPath) } + if h.hostMounts != nil { + h.hostMounts.Lock() + defer h.hostMounts.Unlock() + + err = h.hostMounts.AddRODevice(mvd.MountPath, devPath) + if err != nil { + return err + } + // Note: "When a function returns, its deferred calls are + // executed in last-in-first-out order." - so we are safe to + // call RemoveRODevice in this defer. + defer func() { + if err != nil { + _ = h.hostMounts.RemoveRODevice(mvd.MountPath, devPath) + } + }() + } } else { err = securityPolicy.EnforceRWDeviceMountPolicy(ctx, mvd.MountPath, mvd.Encrypted, mvd.EnsureFilesystem, mvd.Filesystem) if err != nil { return errors.Wrapf(err, "mounting scsi device controller %d lun %d onto %s denied by policy", mvd.Controller, mvd.Lun, mvd.MountPath) } + if h.hostMounts != nil { + h.hostMounts.Lock() + defer h.hostMounts.Unlock() + + err = h.hostMounts.AddRWDevice(mvd.MountPath, devPath, mvd.Encrypted) + if err != nil { + return err + } + defer func() { + if err != nil { + _ = h.hostMounts.RemoveRWDevice(mvd.MountPath, devPath, mvd.Encrypted) + } + }() + } } config := &scsi.Config{ Encrypted: mvd.Encrypted, @@ -1340,10 +1425,36 @@ func (h *Host) modifyMappedVirtualDisk( if err = securityPolicy.EnforceDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil { return fmt.Errorf("unmounting scsi device at %s denied by policy: %w", mvd.MountPath, err) } + if h.hostMounts != nil { + h.hostMounts.Lock() + defer h.hostMounts.Unlock() + + if err = h.hostMounts.RemoveRODevice(mvd.MountPath, devPath); err != nil { + return err + } + defer func() { + if err != nil { + _ = h.hostMounts.AddRODevice(mvd.MountPath, devPath) + } + }() + } } else { if err = securityPolicy.EnforceRWDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil { return fmt.Errorf("unmounting scsi device at %s denied by policy: %w", mvd.MountPath, err) } + if h.hostMounts != nil { + h.hostMounts.Lock() + defer h.hostMounts.Unlock() + + if err = h.hostMounts.RemoveRWDevice(mvd.MountPath, devPath, mvd.Encrypted); err != nil { + return err + } + defer func() { + if err != nil { + _ = h.hostMounts.AddRWDevice(mvd.MountPath, devPath, mvd.Encrypted) + } + }() + } } // Check that the directory actually exists first, and if it // does not then we just refuse to do anything, without closing @@ -1406,6 +1517,12 @@ func (h *Host) modifyMappedDirectory( return errors.Wrapf(err, "mounting plan9 device at %s denied by policy", md.MountPath) } + if h.HasSecurityPolicy() { + if err = plan9.ValidateShareName(md.ShareName); err != nil { + return err + } + } + // Similar to the reasoning in modifyMappedVirtualDisk, since we're // rolling back the policy metadata, plan9.Mount here must clean up // everything if it fails, which it does do. @@ -1513,8 +1630,17 @@ func (h *Host) modifyCombinedLayers( ctx context.Context, rt guestrequest.RequestType, cl *guestresource.LCOWCombinedLayers, - scratchEncrypted bool, ) (err error) { + ctx, span := oc.StartSpan(ctx, "gcs::Host::modifyCombinedLayers") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes( + trace.StringAttribute("requestType", string(rt)), + trace.BoolAttribute("hasHostMounts", h.hostMounts != nil), + trace.StringAttribute("containerRootPath", cl.ContainerRootPath), + trace.StringAttribute("scratchPath", cl.ScratchPath), + ) + securityPolicy := h.securityOptions.PolicyEnforcer containerID := cl.ContainerID @@ -1523,6 +1649,12 @@ func (h *Host) modifyCombinedLayers( // we've actually called Unmount and it fails we permanently block further // device operations. return securityPolicy.WithMetadataRollback(func() error { + if h.hostMounts != nil { + // We will need this in multiple places, let's take the lock once here. + h.hostMounts.Lock() + defer h.hostMounts.Unlock() + } + switch rt { case guestrequest.RequestTypeAdd: if h.HasSecurityPolicy() { @@ -1570,15 +1702,29 @@ func (h *Host) modifyCombinedLayers( } else { upperdirPath = filepath.Join(cl.ScratchPath, "upper") workdirPath = filepath.Join(cl.ScratchPath, "work") + scratchEncrypted := false + if h.hostMounts != nil { + scratchEncrypted = h.hostMounts.IsEncrypted(cl.ScratchPath) + } if err := securityPolicy.EnforceScratchMountPolicy(ctx, cl.ScratchPath, scratchEncrypted); err != nil { return fmt.Errorf("scratch mounting denied by policy: %w", err) } } - if err := securityPolicy.EnforceOverlayMountPolicy(ctx, containerID, layerPaths, cl.ContainerRootPath); err != nil { + if err = securityPolicy.EnforceOverlayMountPolicy(ctx, containerID, layerPaths, cl.ContainerRootPath); err != nil { return fmt.Errorf("overlay creation denied by policy: %w", err) } + if h.hostMounts != nil { + if err = h.hostMounts.AddOverlay(cl.ContainerRootPath, layerPaths, cl.ScratchPath); err != nil { + return err + } + defer func() { + if err != nil { + _, _ = h.hostMounts.RemoveOverlay(cl.ContainerRootPath) + } + }() + } // Correctness for policy transaction rollback: // MountLayer does two things - mkdir, then mount. On mount failure, the @@ -1592,6 +1738,23 @@ func (h *Host) modifyCombinedLayers( return errors.Wrap(err, "overlay removal denied by policy") } + // Check that no running container is using this overlay as its rootfs. + if h.HasSecurityPolicy() && h.IsOverlayInUse(cl.ContainerRootPath) { + return fmt.Errorf("overlay %q is in use by a running container", cl.ContainerRootPath) + } + + if h.hostMounts != nil { + var undoRemoveOverlay func() + if undoRemoveOverlay, err = h.hostMounts.RemoveOverlay(cl.ContainerRootPath); err != nil { + return err + } + defer func() { + if err != nil && undoRemoveOverlay != nil { + undoRemoveOverlay() + } + }() + } + // Note: storage.UnmountPath is a no-op if the path does not exist. err = storage.UnmountPath(ctx, cl.ContainerRootPath, true) if err != nil { @@ -1729,3 +1892,40 @@ func (h *Host) createContainerInPod(sandboxID string, containerID string) error return nil } + +func (h *Host) DeleteContainerState(ctx context.Context, containerID string) error { + if h.HasSecurityPolicy() { + if err := checkValidContainerID(containerID, "container"); err != nil { + return err + } + } + + if err := h.checkState(); err != nil { + return err + } + + c, err := h.GetCreatedContainer(containerID) + if err != nil { + return err + } + if h.HasSecurityPolicy() { + if !c.terminated.Load() { + return errors.Errorf("Denied deleting state of a running container %q", containerID) + } + overlay := c.spec.Root.Path + h.hostMounts.Lock() + defer h.hostMounts.Unlock() + if h.hostMounts.HasOverlayMountedAt(overlay) { + return errors.Errorf("Denied deleting state of a container with a overlay mount still active") + } + } + + // remove container state regardless of delete's success + defer h.RemoveContainer(containerID) + + if err = c.Delete(ctx); err != nil { + return err + } + + return nil +} diff --git a/internal/guest/runtime/hcsv2/uvm_state.go b/internal/guest/runtime/hcsv2/uvm_state.go index dd1ff521f0..f7d347cbf4 100644 --- a/internal/guest/runtime/hcsv2/uvm_state.go +++ b/internal/guest/runtime/hcsv2/uvm_state.go @@ -4,91 +4,358 @@ package hcsv2 import ( + "context" "fmt" "path/filepath" "strings" "sync" + + "github.com/Microsoft/hcsshim/internal/log" + "github.com/sirupsen/logrus" +) + +type deviceType int + +const ( + DeviceTypeRW deviceType = iota + DeviceTypeRO + DeviceTypeOverlay ) -type rwDevice struct { +func (d deviceType) String() string { + switch d { + case DeviceTypeRW: + return "RW" + case DeviceTypeRO: + return "RO" + case DeviceTypeOverlay: + return "Overlay" + default: + return fmt.Sprintf("Unknown(%d)", d) + } +} + +type device struct { + // fields common to all mountPath string + ty deviceType + usage int sourcePath string - encrypted bool + + // rw devices + encrypted bool + + // overlay devices + referencedDevices []*device } +// hostMounts tracks the state of fs/overlay mounts and their usage +// relationship. Users of this struct must call hm.Lock() before calling any +// other methods and call hm.Unlock() when done. +// +// Since mount/unmount operations can fail, the expected way to use this struct +// is to first lock it, call the method to add/remove the device, then, with the +// lock still held, perform the actual operation. If the operation fails, the +// caller must undo the operation by calling the appropriate remove/add method +// or the returned undo function, before unlocking. type hostMounts struct { - stateMutex sync.Mutex + stateMutex sync.Mutex + stateMutexLocked bool - // Holds information about read-write devices, which can be encrypted and - // contain overlay fs upper/work directory mounts. - readWriteMounts map[string]*rwDevice + // Map from mountPath to device struct + devices map[string]*device } func newHostMounts() *hostMounts { return &hostMounts{ - readWriteMounts: map[string]*rwDevice{}, + devices: make(map[string]*device), } } -// AddRWDevice adds read-write device metadata for device mounted at `mountPath`. -// Returns an error if there's an existing device mounted at `mountPath` location. -func (hm *hostMounts) AddRWDevice(mountPath string, sourcePath string, encrypted bool) error { +func (hm *hostMounts) expectLocked() { + if !hm.stateMutexLocked { + panic("hostMounts: expected stateMutex to be locked, but it was not") + } +} + +// Locks the state mutex. This is not re-entrant, calling it twice in the same +// thread will deadlock/panic. +func (hm *hostMounts) Lock() { hm.stateMutex.Lock() - defer hm.stateMutex.Unlock() + // Since we just acquired the lock, either it was not locked before, or + // somebody just unlocked it. Either case, hm.stateMutexLocked should be + // false. + if hm.stateMutexLocked { + panic("hostMounts: stateMutexLocked already true when locking stateMutex") + } + hm.stateMutexLocked = true +} + +// Unlocks the state mutex +func (hm *hostMounts) Unlock() { + hm.expectLocked() + hm.stateMutexLocked = false + hm.stateMutex.Unlock() +} - mountTarget := filepath.Clean(mountPath) - if source, ok := hm.readWriteMounts[mountTarget]; ok { - return fmt.Errorf("read-write with source %q and mount target %q already exists", source.sourcePath, mountPath) +func (hm *hostMounts) findDeviceAtPath(mountPath string) *device { + hm.expectLocked() + + if dev, ok := hm.devices[mountPath]; ok { + return dev } - hm.readWriteMounts[mountTarget] = &rwDevice{ - mountPath: mountTarget, - sourcePath: sourcePath, - encrypted: encrypted, + return nil +} + +func (hm *hostMounts) addDeviceToMapChecked(dev *device) error { + hm.expectLocked() + + if _, ok := hm.devices[dev.mountPath]; ok { + return fmt.Errorf("device at mount path %q already exists", dev.mountPath) } + hm.devices[dev.mountPath] = dev return nil } -// RemoveRWDevice removes the read-write device metadata for device mounted at -// `mountPath`. -func (hm *hostMounts) RemoveRWDevice(mountPath string, sourcePath string) error { - hm.stateMutex.Lock() - defer hm.stateMutex.Unlock() +func (hm *hostMounts) findDeviceContainingPath(path string) *device { + hm.expectLocked() + + // TODO: can we refactor this function by walking each component of the path + // from leaf to root, each time checking if the current component is a mount + // point? (i.e. why do we have to use filepath.Rel?) + + var foundDev *device + cleanPath := filepath.Clean(path) + for devPath, dev := range hm.devices { + relPath, err := filepath.Rel(devPath, cleanPath) + // skip further checks if an error is returned or the relative path + // contains "..", meaning that the `path` isn't directly nested under + // `rwPath`. + if err != nil || strings.HasPrefix(relPath, "..") { + continue + } + if foundDev == nil { + foundDev = dev + } else if len(dev.mountPath) > len(foundDev.mountPath) { + // The current device is mounted on top of a previously found device. + foundDev = dev + } + } + return foundDev +} + +func (hm *hostMounts) usePath(path string) (*device, error) { + hm.expectLocked() + + // Find the device at the given path and increment its usage count. + dev := hm.findDeviceContainingPath(path) + if dev == nil { + return nil, nil + } + dev.usage++ + return dev, nil +} + +func (hm *hostMounts) releaseDeviceUsage(dev *device) { + hm.expectLocked() + + if dev.usage <= 0 { + log.G(context.Background()).WithFields(logrus.Fields{ + "device": dev.mountPath, + "deviceSource": dev.sourcePath, + "deviceType": dev.ty, + "usage": dev.usage, + }).Error("hostMounts::releaseDeviceUsage: unexpected zero usage count") + return + } + dev.usage-- +} + +// User should carefully handle side-effects of adding a device if the device +// fails to be added. +func (hm *hostMounts) doAddDevice(mountPath string, ty deviceType, sourcePath string) (*device, error) { + hm.expectLocked() + + dev := &device{ + mountPath: filepath.Clean(mountPath), + ty: ty, + usage: 0, + sourcePath: sourcePath, + } + + if err := hm.addDeviceToMapChecked(dev); err != nil { + return nil, err + } + return dev, nil +} + +// Once checks is called, unless it returns an error, this function will always +// succeed +func (hm *hostMounts) doRemoveDevice(mountPath string, ty deviceType, sourcePath string, checks func(*device) error) error { + hm.expectLocked() unmountTarget := filepath.Clean(mountPath) - device, ok := hm.readWriteMounts[unmountTarget] - if !ok { + device := hm.findDeviceAtPath(unmountTarget) + if device == nil { // already removed or didn't exist return nil } if device.sourcePath != sourcePath { - return fmt.Errorf("wrong sourcePath %s", sourcePath) + return fmt.Errorf("wrong sourcePath %s, expected %s", sourcePath, device.sourcePath) + } + if device.ty != ty { + return fmt.Errorf("wrong device type %s, expected %s", ty, device.ty) + } + if device.usage > 0 { + log.G(context.Background()).WithFields(logrus.Fields{ + "device": device.mountPath, + "deviceSource": device.sourcePath, + "deviceType": device.ty, + "usage": device.usage, + }).Error("hostMounts::doRemoveDevice: device still in use, refusing unmount") + return fmt.Errorf("device at %q is still in use, can't unmount", unmountTarget) + } + if checks != nil { + if err := checks(device); err != nil { + return err + } } - delete(hm.readWriteMounts, unmountTarget) + delete(hm.devices, unmountTarget) return nil } +func (hm *hostMounts) AddRODevice(mountPath string, sourcePath string) error { + hm.expectLocked() + + _, err := hm.doAddDevice(mountPath, DeviceTypeRO, sourcePath) + return err +} + +// AddRWDevice adds read-write device metadata for device mounted at `mountPath`. +// Returns an error if there's an existing device mounted at `mountPath` location. +func (hm *hostMounts) AddRWDevice(mountPath string, sourcePath string, encrypted bool) error { + hm.expectLocked() + + dev, err := hm.doAddDevice(mountPath, DeviceTypeRW, sourcePath) + if err != nil { + return err + } + dev.encrypted = encrypted + return nil +} + +func (hm *hostMounts) AddOverlay(mountPath string, layers []string, scratchDir string) (err error) { + hm.expectLocked() + + dev, err := hm.doAddDevice(mountPath, DeviceTypeOverlay, mountPath) + if err != nil { + return err + } + dev.referencedDevices = make([]*device, 0, len(layers)+1) + defer func() { + if err != nil { + // If we failed to use any of the paths, we need to release the ones + // that we did use. + for _, d := range dev.referencedDevices { + hm.releaseDeviceUsage(d) + } + delete(hm.devices, mountPath) + } + }() + + for _, layer := range layers { + refDev, err := hm.usePath(layer) + if err != nil { + return err + } + if refDev != nil { + dev.referencedDevices = append(dev.referencedDevices, refDev) + } + } + refDev, err := hm.usePath(scratchDir) + if err != nil { + return err + } + if refDev != nil { + dev.referencedDevices = append(dev.referencedDevices, refDev) + } + + return nil +} + +func (hm *hostMounts) RemoveRODevice(mountPath string, sourcePath string) error { + hm.expectLocked() + + return hm.doRemoveDevice(mountPath, DeviceTypeRO, sourcePath, nil) +} + +// RemoveRWDevice removes the read-write device metadata for device mounted at +// `mountPath`. +func (hm *hostMounts) RemoveRWDevice(mountPath string, sourcePath string, encrypted bool) error { + hm.expectLocked() + + return hm.doRemoveDevice(mountPath, DeviceTypeRW, sourcePath, func(dev *device) error { + if dev.encrypted != encrypted { + return fmt.Errorf("encrypted flag wrong, provided %v, expected %v", encrypted, dev.encrypted) + } + return nil + }) +} + +func (hm *hostMounts) RemoveOverlay(mountPath string) (undo func(), err error) { + hm.expectLocked() + + var dev *device + err = hm.doRemoveDevice(mountPath, DeviceTypeOverlay, mountPath, func(_dev *device) error { + dev = _dev + for _, refDev := range dev.referencedDevices { + hm.releaseDeviceUsage(refDev) + } + return nil + }) + if err != nil { + // If we get an error from doRemoveDevice, we have not released anything + // yet. + return nil, err + } + undo = func() { + hm.expectLocked() + + for _, refDev := range dev.referencedDevices { + refDev.usage++ + } + + if _, ok := hm.devices[mountPath]; ok { + log.G(context.Background()).WithField("mountPath", mountPath).Error( + "hostMounts::RemoveOverlay: failed to undo remove: device that was removed exists in map", + ) + return + } + + hm.devices[mountPath] = dev + } + return undo, nil +} + // IsEncrypted checks if the given path is a sub-path of an encrypted read-write // device. func (hm *hostMounts) IsEncrypted(path string) bool { - hm.stateMutex.Lock() - defer hm.stateMutex.Unlock() + hm.expectLocked() - parentPath := "" - encrypted := false - cleanPath := filepath.Clean(path) - for rwPath, rwDev := range hm.readWriteMounts { - relPath, err := filepath.Rel(rwPath, cleanPath) - // skip further checks if an error is returned or the relative path - // contains "..", meaning that the `path` isn't directly nested under - // `rwPath`. - if err != nil || strings.HasPrefix(relPath, "..") { - continue - } - if len(rwDev.mountPath) > len(parentPath) { - parentPath = rwDev.mountPath - encrypted = rwDev.encrypted - } + dev := hm.findDeviceContainingPath(path) + if dev == nil { + return false + } + return dev.encrypted +} + +func (hm *hostMounts) HasOverlayMountedAt(path string) bool { + hm.expectLocked() + + dev := hm.findDeviceAtPath(filepath.Clean(path)) + if dev == nil { + return false } - return encrypted + return dev.ty == DeviceTypeOverlay } diff --git a/internal/guest/runtime/hcsv2/uvm_state_test.go b/internal/guest/runtime/hcsv2/uvm_state_test.go index b708caaeba..e87a207308 100644 --- a/internal/guest/runtime/hcsv2/uvm_state_test.go +++ b/internal/guest/runtime/hcsv2/uvm_state_test.go @@ -12,10 +12,13 @@ func Test_Add_Remove_RWDevice(t *testing.T) { mountPath := "/run/gcs/c/abcd" sourcePath := "/dev/sda" + hm.Lock() + defer hm.Unlock() + if err := hm.AddRWDevice(mountPath, sourcePath, false); err != nil { t.Fatalf("unexpected error adding RW device: %s", err) } - if err := hm.RemoveRWDevice(mountPath, sourcePath); err != nil { + if err := hm.RemoveRWDevice(mountPath, sourcePath, false); err != nil { t.Fatalf("unexpected error removing RW device: %s", err) } } @@ -25,29 +28,55 @@ func Test_Cannot_AddRWDevice_Twice(t *testing.T) { mountPath := "/run/gcs/c/abc" sourcePath := "/dev/sda" + hm.Lock() if err := hm.AddRWDevice(mountPath, sourcePath, false); err != nil { t.Fatalf("unexpected error: %s", err) } + hm.Unlock() + + hm.Lock() if err := hm.AddRWDevice(mountPath, sourcePath, false); err == nil { t.Fatalf("expected error adding %q for the second time", mountPath) } + hm.Unlock() } func Test_Cannot_RemoveRWDevice_Wrong_Source(t *testing.T) { hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + mountPath := "/run/gcs/c/abcd" sourcePath := "/dev/sda" wrongSource := "/dev/sdb" if err := hm.AddRWDevice(mountPath, sourcePath, false); err != nil { t.Fatalf("unexpected error: %s", err) } - if err := hm.RemoveRWDevice(mountPath, wrongSource); err == nil { + if err := hm.RemoveRWDevice(mountPath, wrongSource, false); err == nil { t.Fatalf("expected error removing wrong source %s", wrongSource) } } +func Test_Cannot_RemoveRWDevice_Wrong_Encrypted(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + mountPath := "/run/gcs/c/abcd" + sourcePath := "/dev/sda" + if err := hm.AddRWDevice(mountPath, sourcePath, false); err != nil { + t.Fatalf("unexpected error: %s", err) + } + if err := hm.RemoveRWDevice(mountPath, sourcePath, true); err == nil { + t.Fatalf("expected error removing RW device with wrong encrypted flag") + } +} + func Test_HostMounts_IsEncrypted(t *testing.T) { hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + encryptedPath := "/run/gcs/c/encrypted" encryptedSource := "/dev/sda" if err := hm.AddRWDevice(encryptedPath, encryptedSource, true); err != nil { @@ -108,3 +137,189 @@ func Test_HostMounts_IsEncrypted(t *testing.T) { }) } } + +func Test_HostMounts_AddRemoveRODevice(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + mountPath := "/run/gcs/c/abcd" + sourcePath := "/dev/sda" + + if err := hm.AddRODevice(mountPath, sourcePath); err != nil { + t.Fatalf("unexpected error adding RO device: %s", err) + } + + if err := hm.RemoveRODevice(mountPath, sourcePath); err != nil { + t.Fatalf("unexpected error removing RO device: %s", err) + } +} + +func Test_HostMounts_Cannot_AddRODevice_Twice(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + mountPath := "/run/gcs/c/abc" + sourcePath := "/dev/sda" + + if err := hm.AddRODevice(mountPath, sourcePath); err != nil { + t.Fatalf("unexpected error: %s", err) + } + if err := hm.AddRODevice(mountPath, sourcePath); err == nil { + t.Fatalf("expected error adding %q for the second time", mountPath) + } +} + +func Test_HostMounts_AddRemoveOverlay(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + mountPath := "/run/gcs/c/aaaa/rootfs" + layers := []string{ + "/run/mounts/scsi/m1", + "/run/mounts/scsi/m2", + "/run/mounts/scsi/m3", + } + for _, layer := range layers { + if err := hm.AddRODevice(layer, layer); err != nil { + t.Fatalf("unexpected error adding RO device: %s", err) + } + } + scratchDir := "/run/gcs/c/aaaa/scratch" + if err := hm.AddRWDevice(scratchDir, scratchDir, true); err != nil { + t.Fatalf("unexpected error adding RW device: %s", err) + } + if err := hm.AddOverlay(mountPath, layers, scratchDir); err != nil { + t.Fatalf("unexpected error adding overlay: %s", err) + } + undo, err := hm.RemoveOverlay(mountPath) + if err != nil { + t.Fatalf("unexpected error removing overlay: %s", err) + } + if undo == nil { + t.Fatalf("expected undo function to be non-nil") + } + undo() + if _, err = hm.RemoveOverlay(mountPath); err != nil { + t.Fatalf("unexpected error removing overlay again: %s", err) + } +} + +func Test_HostMounts_Cannot_RemoveInUseDeviceByOverlay(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + mountPath := "/run/gcs/c/aaaa/rootfs" + layers := []string{ + "/run/mounts/scsi/m1", + "/run/mounts/scsi/m2", + "/run/mounts/scsi/m3", + } + for _, layer := range layers { + if err := hm.AddRODevice(layer, layer); err != nil { + t.Fatalf("unexpected error adding RO device: %s", err) + } + } + scratchDir := "/run/gcs/c/aaaa/scratch" + if err := hm.AddRWDevice(scratchDir, scratchDir, true); err != nil { + t.Fatalf("unexpected error adding RW device: %s", err) + } + if err := hm.AddOverlay(mountPath, layers, scratchDir); err != nil { + t.Fatalf("unexpected error adding overlay: %s", err) + } + + for _, layer := range layers { + if err := hm.RemoveRODevice(layer, layer); err == nil { + t.Fatalf("expected error removing RO device %s while in use by overlay", layer) + } + } + if err := hm.RemoveRWDevice(scratchDir, scratchDir, true); err == nil { + t.Fatalf("expected error removing RW device %s while in use by overlay", scratchDir) + } + + if _, err := hm.RemoveOverlay(mountPath); err != nil { + t.Fatalf("unexpected error removing overlay: %s", err) + } + + // now we can remove + for _, layer := range layers { + if err := hm.RemoveRODevice(layer, layer); err != nil { + t.Fatalf("unexpected error removing RO device %s: %s", layer, err) + } + } + if err := hm.RemoveRWDevice(scratchDir, scratchDir, true); err != nil { + t.Fatalf("unexpected error removing RW device %s: %s", scratchDir, err) + } +} + +func Test_HostMounts_Cannot_RemoveInUseDeviceByOverlay_MultipleUsers(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + overlay1 := "/run/gcs/c/aaaa/rootfs" + overlay2 := "/run/gcs/c/bbbb/rootfs" + layers := []string{ + "/run/mounts/scsi/m1", + "/run/mounts/scsi/m2", + "/run/mounts/scsi/m3", + } + for _, layer := range layers { + if err := hm.AddRODevice(layer, layer); err != nil { + t.Fatalf("unexpected error adding RO device: %s", err) + } + } + sharedScratchMount := "/run/gcs/c/sandbox" + scratch1 := sharedScratchMount + "/scratch/aaaa" + scratch2 := sharedScratchMount + "/scratch/bbbb" + if err := hm.AddRWDevice(sharedScratchMount, sharedScratchMount, true); err != nil { + t.Fatalf("unexpected error adding RW device: %s", err) + } + if err := hm.AddOverlay(overlay1, layers, scratch1); err != nil { + t.Fatalf("unexpected error adding overlay1: %s", err) + } + + if err := hm.AddOverlay(overlay2, layers[0:2], scratch2); err != nil { + t.Fatalf("unexpected error adding overlay2: %s", err) + } + + for _, layer := range layers { + if err := hm.RemoveRODevice(layer, layer); err == nil { + t.Fatalf("expected error removing RO device %s while in use by overlay", layer) + } + } + if err := hm.RemoveRWDevice(sharedScratchMount, sharedScratchMount, true); err == nil { + t.Fatalf("expected error removing RW device %s while in use by overlay", sharedScratchMount) + } + + if _, err := hm.RemoveOverlay(overlay1); err != nil { + t.Fatalf("unexpected error removing overlay 1: %s", err) + } + + for _, layer := range layers[0:2] { + if err := hm.RemoveRODevice(layer, layer); err == nil { + t.Fatalf("expected error removing RO device %s (still in use by overlay 2)", layer) + } + } + if err := hm.RemoveRODevice(layers[2], layers[2]); err != nil { + t.Fatalf("unexpected error removing layers[2] which is not being used by overlay 2: %s", err) + } + if err := hm.RemoveRWDevice(sharedScratchMount, sharedScratchMount, true); err == nil { + t.Fatalf("expected error removing RW device %s while in use by overlay 2", scratch2) + } + + if _, err := hm.RemoveOverlay(overlay2); err != nil { + t.Fatalf("unexpected error removing overlay 2: %s", err) + } + for _, layer := range layers[0:2] { + if err := hm.RemoveRODevice(layer, layer); err != nil { + t.Fatalf("unexpected error removing RO device %s: %s", layer, err) + } + } + if err := hm.RemoveRWDevice(sharedScratchMount, sharedScratchMount, true); err != nil { + t.Fatalf("unexpected error removing RW device %s: %s", sharedScratchMount, err) + } +} diff --git a/internal/guest/storage/plan9/plan9.go b/internal/guest/storage/plan9/plan9.go index 5c1f1d74f4..44ac0f4e4e 100644 --- a/internal/guest/storage/plan9/plan9.go +++ b/internal/guest/storage/plan9/plan9.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "os" + "regexp" "syscall" "github.com/Microsoft/hcsshim/internal/guest/transport" @@ -25,6 +26,19 @@ var ( unixMount = unix.Mount ) +// c.f. v9fs_parse_options in linux/fs/9p/v9fs.c - technically anything other +// than ',' is ok (quoting is not handled), however, this name is generated from +// a counter in AddPlan9 (internal/uvm/plan9.go), and therefore we expect only +// digits from a normal hcsshim host. +var validShareNameRegex = regexp.MustCompile(`^[0-9]+$`) + +func ValidateShareName(name string) error { + if !validShareNameRegex.MatchString(name) { + return fmt.Errorf("invalid plan9 share name %q: must match regex %q", name, validShareNameRegex.String()) + } + return nil +} + // Mount dials a connection from `vsock` and mounts a Plan9 share to `target`. // // `target` will be created. On mount failure the created `target` will be diff --git a/pkg/securitypolicy/framework.rego b/pkg/securitypolicy/framework.rego index daa0fe864e..09aa685532 100644 --- a/pkg/securitypolicy/framework.rego +++ b/pkg/securitypolicy/framework.rego @@ -388,6 +388,14 @@ seccomp_ok(seccomp_profile_sha256) { is_windows } +devices_ok(expected_devices, actual_devices) { + # Allow out of order but not duplicates + set_expected := {dev | dev := expected_devices[_]} + set_actual := {dev | dev := actual_devices[_]} + set_expected == set_actual + count(set_actual) == count(actual_devices) +} + default container_started := false container_started { @@ -599,6 +607,8 @@ create_container := {"metadata": [updateMatches, addStarted], command_ok(container.command) mountList_ok(container.mounts, container.allow_elevated) seccomp_ok(container.seccomp_profile_sha256) + # We do not support adding device nodes to the policy yet + devices_ok([], input.devices) ] count(possible_after_initial_containers) > 0 @@ -2090,6 +2100,12 @@ errors["capabilities don't match"] { count(possible_after_caps_containers) == 0 } +errors["devices not supported"] { + is_linux + input.rule == "create_container" + not devices_ok([], input.devices) +} + # covers exec_in_container as well. it shouldn't be possible to ever get # an exec_in_container as it "inherits" capabilities rules from create_container errors["containers only distinguishable by capabilties"] { diff --git a/pkg/securitypolicy/regopolicy_linux_test.go b/pkg/securitypolicy/regopolicy_linux_test.go index 236e64d4bd..f64ab10f6d 100644 --- a/pkg/securitypolicy/regopolicy_linux_test.go +++ b/pkg/securitypolicy/regopolicy_linux_test.go @@ -2151,6 +2151,39 @@ func Test_Rego_EnforceCreateContainer_Capabilities_Drop_NoMatches(t *testing.T) } } +func Test_Rego_EnforceCreateContainer_RequireNoDevices(t *testing.T) { + f := func(p *generatedConstraints) bool { + tc, err := setupSimpleRegoCreateContainerTest(p) + if err != nil { + t.Error(err) + return false + } + + privileged := false + + _, _, _, err = tc.policy.EnforceCreateContainerPolicyV2(p.ctx, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, tc.user, &CreateContainerOptions{ + SandboxID: tc.sandboxID, + Privileged: &privileged, + NoNewPrivileges: &tc.noNewPrivileges, + Groups: tc.groups, + Umask: tc.umask, + Capabilities: tc.capabilities, + SeccompProfileSHA256: tc.seccomp, + LinuxDevices: []oci.LinuxDevice{ + { + Path: "/test", + }, + }, + }) + + return assertDecisionJSONContains(t, err, "devices not supported") + } + + if err := quick.Check(f, &quick.Config{MaxCount: 50, Rand: testRand}); err != nil { + t.Errorf("Test_Rego_EnforceCreateContainer_RequireNoDevices: %v", err) + } +} + func Test_Rego_ExtendDefaultMounts(t *testing.T) { f := func(p *generatedConstraints) bool { tc, err := setupSimpleRegoCreateContainerTest(p) diff --git a/pkg/securitypolicy/securitypolicyenforcer.go b/pkg/securitypolicy/securitypolicyenforcer.go index 3f94bae0f5..c617016a1a 100644 --- a/pkg/securitypolicy/securitypolicyenforcer.go +++ b/pkg/securitypolicy/securitypolicyenforcer.go @@ -29,6 +29,7 @@ type CreateContainerOptions struct { Umask string Capabilities *oci.LinuxCapabilities SeccompProfileSHA256 string + LinuxDevices []oci.LinuxDevice } type SignalContainerOptions struct { IsInitProcess bool diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go index 6cd911b5ec..6727e65464 100644 --- a/pkg/securitypolicy/securitypolicyenforcer_rego.go +++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go @@ -713,6 +713,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicy( Umask: umask, Capabilities: capabilities, SeccompProfileSHA256: seccompProfileSHA256, + LinuxDevices: []oci.LinuxDevice{}, } return policy.EnforceCreateContainerPolicyV2(ctx, containerID, argList, envList, workingDir, mounts, user, opts) } @@ -750,6 +751,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicyV2( "sandboxDir": SandboxMountsDir(opts.SandboxID), "hugePagesDir": HugePagesMountsDir(opts.SandboxID), "mounts": appendMountData([]interface{}{}, mounts), + "devices": appendDeviceData([]interface{}{}, opts.LinuxDevices), "privileged": opts.Privileged, "noNewPrivileges": opts.NoNewPrivileges, "user": user.toInput(), @@ -838,6 +840,29 @@ func appendMountData(mountData []interface{}, mounts []oci.Mount) []interface{} return mountData } +func uint32ptrtoany(i *uint32) interface{} { + if i == nil { + return nil + } + return *i +} + +func appendDeviceData(deviceData []interface{}, devices []oci.LinuxDevice) []interface{} { + for _, device := range devices { + deviceData = append(deviceData, inputData{ + "path": device.Path, + "type": device.Type, + "major": device.Major, + "minor": device.Minor, + "fileMode": device.FileMode, + "uid": uint32ptrtoany(device.UID), + "gid": uint32ptrtoany(device.GID), + }) + } + + return deviceData +} + func (policy *regoEnforcer) ExtendDefaultMounts(mounts []oci.Mount) error { policy.defaultMounts = append(policy.defaultMounts, mounts...) defaultMounts := appendMountData([]interface{}{}, policy.defaultMounts)