From bdde27651a339f80f9d60d28f27ea5c3985cf9cb Mon Sep 17 00:00:00 2001 From: Max042004 Date: Thu, 2 Jul 2026 18:19:23 +0800 Subject: [PATCH] Reject invalid PROT on read-only MAP_SHARED fds A MAP_SHARED, PROT_READ mapping of a file opened O_RDONLY is common -- the JVM maps its ~135 MiB lib/modules image exactly this way. That case is already fixed on main by 520568c ("Harden runtime around foot"), which routes non-writable-fd MAP_SHARED requests through the pread-snapshot fallback in sys_mmap's non-fixed path instead of installing a live overlay. That fallback fires for any MAP_SHARED request the overlay's overlay_fd_writable() gate would reject, without checking the guest's requested prot. Two Linux-visible corners were left open as a result: - mmap(MAP_SHARED, PROT_WRITE) of an O_RDONLY fd silently succeeded via the fallback instead of failing EACCES. Fixed by checking overlay_fd_writable() before falling through to pread, rolling back the allocation and returning EACCES when the guest asked for PROT_WRITE against a non-writable backing fd. - Once a read-only MAP_SHARED mapping succeeded, nothing stopped a follow-up mprotect(PROT_READ | PROT_WRITE) from upgrading it. Linux tracks max_prot per VMA from the fd's open mode and rejects that upgrade with EACCES; sys_mprotect only consulted prot_to_perms() and happily granted it, so a subsequent guest write landed in guest-local memory with no error ever surfaced to the caller. Fixed by adding guest_region_t.backing_ro, set on a MAP_SHARED region whenever its backing_fd lacks write access (the same overlay_fd_writable() check), threaded through regions_mergeable (so two regions with different backing_ro never silently coalesce), region_snapshot_t capture/restore, and all three sys_mremap region-recreation sites. sys_mprotect now rejects a PROT_WRITE request over any MAP_SHARED region with backing_ro set, before doing any PTE work. 
test-mmap-shared-ro covers the O_RDONLY read path (already fixed by 520568c), a second concurrent read-only mapping, the O_RDONLY mmap(PROT_WRITE) rejection, the mprotect(PROT_WRITE) upgrade rejection, and the read-only-mapping-on-O_RDWR-fd branch. NPAGES is bumped from 64 (256 KiB, fits in one 2 MiB HVF segment) to 768 (3 MiB, crosses a segment boundary) so the cases actually exercise hvf_segment_split's multi-block path. --- src/core/guest.c | 17 +++ src/core/guest.h | 15 +++ src/syscall/mem.c | 71 ++++++++++ tests/manifest.txt | 3 + tests/test-mmap-shared-ro.c | 249 ++++++++++++++++++++++++++++++++++++ 5 files changed, 355 insertions(+) create mode 100644 tests/test-mmap-shared-ro.c diff --git a/src/core/guest.c b/src/core/guest.c index d4e455e..4373d2a 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -1769,6 +1769,8 @@ static bool regions_mergeable(const guest_region_t *a, const guest_region_t *b) */ if (a->noreserve != b->noreserve) return false; + if (a->backing_ro != b->backing_ro) + return false; if (a->overlay_active || b->overlay_active) return false; if (strcmp(a->name, b->name) != 0) @@ -1952,6 +1954,7 @@ int guest_region_add_ex_owned_gpa(guest_t *g, r->backing_fd = owned_backing_fd; r->shared = (flags & 0x01) != 0; /* LINUX_MAP_SHARED = 0x01 */ r->noreserve = (flags & 0x4000) != 0; /* LINUX_MAP_NORESERVE = 0x4000 */ + r->backing_ro = false; guest_region_clear_overlay(r); if (name) { str_copy_trunc(r->name, name, sizeof(r->name)); @@ -2162,6 +2165,20 @@ bool guest_region_range_has_noreserve(const guest_t *g, return false; } +bool guest_region_range_has_ro_shared_backing(const guest_t *g, + uint64_t start, + uint64_t end) +{ + for (int i = guest_region_first_end_above(g, start); i < g->nregions; i++) { + const guest_region_t *r = &g->regions[i]; + if (r->start >= end) + break; + if (r->shared && r->backing_ro) + return true; + } + return false; +} + void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot) { /* Walk regions 
overlapping [start, end), split at boundaries, update prot. diff --git a/src/core/guest.h b/src/core/guest.h index 58ca8b1..b5f6882 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -235,6 +235,12 @@ typedef struct { int backing_fd; /* Duplicated host fd for file-backed mappings, or -1 */ bool shared; /* MAP_SHARED (writes should propagate) */ bool noreserve; /* MAP_NORESERVE: PTEs deferred until fault */ + bool backing_ro; /* MAP_SHARED region whose backing_fd was opened + * without write access, so its Linux max_prot is + * capped to PROT_READ. sys_mprotect must reject any + * later PROT_WRITE request against it with EACCES, + * matching a real kernel's VMA max_prot tracking. + */ bool overlay_active; /* Region has a live host MAP_FIXED|MAP_SHARED overlay * of backing_fd at host_base+start. The kernel's page * cache keeps it coherent with the file and with peer @@ -1231,6 +1237,15 @@ bool guest_region_range_has_noreserve(const guest_t *g, uint64_t start, uint64_t end); +/* True if any tracked region overlapping [start, end) is MAP_SHARED with a + * backing_fd that lacks write access (backing_ro), i.e. its Linux max_prot is + * capped to PROT_READ. sys_mprotect uses this to reject PROT_WRITE upgrades + * with EACCES. + */ +bool guest_region_range_has_ro_shared_backing(const guest_t *g, + uint64_t start, + uint64_t end); + /* Try to materialize a lazy (MAP_NORESERVE) page at the given offset. Called * from the data/instruction abort handler when the faulting address falls * within a noreserve region. 
Creates page table entries for one 2MiB block diff --git a/src/syscall/mem.c b/src/syscall/mem.c index 00f93b0..7db9e79 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -81,6 +81,7 @@ typedef struct { bool overlay_active; uint64_t overlay_start; uint64_t overlay_end; + bool backing_ro; char name[sizeof(((guest_region_t *) 0)->name)]; } region_snapshot_t; @@ -205,6 +206,21 @@ static void mark_overlay_metadata_range(guest_t *g, } } +/* Mark the region spanning exactly [start, end) as backed by an fd that lacks + * write access, so sys_mprotect rejects a later PROT_WRITE upgrade. Exact + * match (not overlap) because callers use this right after installing a + * single freshly-added region; silently a no-op when no region matches. + */ +static void mark_region_backing_ro(guest_t *g, uint64_t start, uint64_t end) +{ + for (int i = 0; i < g->nregions; i++) { + if (g->regions[i].start == start && g->regions[i].end == end) { + g->regions[i].backing_ro = true; + break; + } + } +} + static void region_clip_overlay(guest_region_t *r) { if (!region_has_live_overlay(r) || r->end <= r->start) { @@ -1099,6 +1115,7 @@ static int capture_region_snapshots(guest_t *g, snap->overlay_active = r->overlay_active; snap->overlay_start = r->overlay_start; snap->overlay_end = r->overlay_end; + snap->backing_ro = r->backing_ro; str_copy_trunc(snap->name, r->name, sizeof(snap->name)); } @@ -1211,6 +1228,8 @@ static int restore_region_snapshots(guest_t *g, region_snapshot_t *snaps, int n) return -LINUX_ENOMEM; } snap->backing_fd = -1; + if (snap->backing_ro) + mark_region_backing_ro(g, snap->start, snap->end); } for (int i = 0; i < n; i++) { @@ -2426,6 +2445,26 @@ int64_t sys_mmap(guest_t *g, * never reachable by the guest because the gap-finder advances the hint * to the next host-page boundary after each allocation. */ + /* MAP_SHARED | PROT_WRITE against a backing fd opened without write + * access must fail EACCES, matching Linux. 
The alignment-mismatch and + * read-only-fd cases below both fall through to the pread snapshot + * path, which always succeeds -- without this check a writable shared + * mapping request on a read-only fd would be silently downgraded to a + * private snapshot instead of being rejected. + */ + if ((flags & LINUX_MAP_SHARED) && (prot & LINUX_PROT_WRITE) && + !overlay_fd_writable(host_backing_fd)) { + int rollback_err = rollback_fresh_mmap_allocation( + g, result_off, length, false, 0, 0, saved_mmap_next, + saved_mmap_end, saved_mmap_rx_next, saved_mmap_rx_end, + saved_rw_gap_hint, saved_rx_gap_hint); + if (track_backing_fd >= 0) + close(track_backing_fd); + host_fd_ref_close(&backing_ref); + if (rollback_err < 0) + return rollback_err; + return -LINUX_EACCES; + } /* overlay_fd_writable rejects read-only backing fds inside * hvf_apply_file_overlay; mirror the check here so a read-only mmap * takes the snapshot pread path directly, skipping the thread_quiesce / @@ -2554,6 +2593,15 @@ int64_t sys_mmap(guest_t *g, } } + /* A MAP_SHARED mapping whose backing fd cannot be written to has Linux + * max_prot capped to PROT_READ, whether or not the pread snapshot path + * above actually installed a live overlay. sys_mprotect consults this to + * reject a later PROT_WRITE upgrade with EACCES. + */ + if (!is_anon && fd >= 0 && !is_prot_none && (flags & LINUX_MAP_SHARED) && + !overlay_fd_writable(host_backing_fd)) + mark_region_backing_ro(g, result_off, result_off + length); + host_fd_ref_close(&backing_ref); dispose_region_snapshots(&replaced_snaps, &replaced_nsnaps); @@ -2697,6 +2745,7 @@ int64_t sys_mremap(guest_t *g, if (old_reg && old_reg->backing_fd >= 0 && track_backing_fd < 0) return -LINUX_ENOMEM; bool source_overlay = old_reg && region_has_live_overlay(old_reg); + bool source_backing_ro = old_reg && old_reg->backing_ro; uint64_t source_file_off = old_reg ? 
old_reg->offset + (old_off - old_reg->start) : 0; char track_name[sizeof(old_reg->name)] = {0}; @@ -2867,6 +2916,8 @@ int64_t sys_mremap(guest_t *g, dispose_region_snapshots(&dest_snaps, &dest_nsnaps); return -LINUX_ENOMEM; } + if (source_backing_ro) + mark_region_backing_ro(g, new_off, new_off + new_size); dispose_region_snapshots(&source_snaps, &source_nsnaps); dispose_region_snapshots(&dest_snaps, &dest_nsnaps); return (int64_t) guest_ipa(g, new_off); @@ -2919,6 +2970,7 @@ int64_t sys_mremap(guest_t *g, old_overlay ? old_reg->overlay_start : 0; uint64_t old_overlay_end = old_overlay ? old_reg->overlay_end : 0; + bool old_backing_ro = old_reg && old_reg->backing_ro; if (old_reg && old_reg->backing_fd >= 0 && track_backing_fd < 0) return -LINUX_ENOMEM; char track_name[sizeof(old_reg->name)] = {0}; @@ -2945,6 +2997,8 @@ int64_t sys_mremap(guest_t *g, mark_overlay_metadata_range(g, old_off, old_off + old_size, old_overlay_start, old_overlay_end); + if (old_backing_ro) + mark_region_backing_ro(g, old_off, old_off + new_size); /* Update high-water marks */ uint64_t hwm = old_off + new_size; @@ -2978,6 +3032,7 @@ int64_t sys_mremap(guest_t *g, uint64_t source_overlay_start = source_overlay ? old_reg->overlay_start : 0; uint64_t source_overlay_end = source_overlay ? old_reg->overlay_end : 0; + bool source_backing_ro = old_reg && old_reg->backing_ro; uint64_t source_file_off = old_reg ? old_reg->offset + (old_off - old_reg->start) : 0; uint64_t source_overlay_file_off = @@ -3092,6 +3147,8 @@ int64_t sys_mremap(guest_t *g, g, new_off, new_off + new_size, prot, track_flags, track_offset, track_name[0] ? 
track_name : NULL, track_backing_fd) < 0) return -LINUX_ENOMEM; + if (source_backing_ro) + mark_region_backing_ro(g, new_off, new_off + new_size); /* Update high-water marks */ uint64_t hwm = new_off + new_size; @@ -3457,6 +3514,14 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot) (prot & LINUX_PROT_EXEC)) return -LINUX_EINVAL; + /* A MAP_SHARED region whose backing fd cannot be written to has + * Linux max_prot capped to PROT_READ; reject an upgrade the same + * way a real kernel's VMA max_prot check would. + */ + if ((prot & LINUX_PROT_WRITE) && + guest_region_range_has_ro_shared_backing(g, addr, mprot_end)) + return -LINUX_EACCES; + /* Fast path: if the tracker already records this prot for every * overlapping region and none are MAP_NORESERVE, page tables are * already in sync and no PTE work is required. The tracker update @@ -3498,6 +3563,12 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot) if (guest_range_hits_infra(g, mprot_off, mprot_end)) return -LINUX_EINVAL; + /* Same max_prot check as the high-VA branch above. */ + if ((prot & LINUX_PROT_WRITE) && + guest_region_range_has_ro_shared_backing(g, mprot_off, + mprot_end)) + return -LINUX_EACCES; + /* Same fast path / ordering / staleness gate as above. 
*/ if (mprotect_same_prot_fast_path_safe(prot) && !g->regions_tracker_stale && diff --git a/tests/manifest.txt b/tests/manifest.txt index 265c7c5..c93e1da 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -130,6 +130,9 @@ test-shim-cred-race [section] msync MAP_SHARED tests test-msync +[section] Read-only MAP_SHARED file overlay tests +test-mmap-shared-ro + [section] Cross-fork MAP_SHARED coherence tests test-cross-fork-mapshared # diff=skip diff --git a/tests/test-mmap-shared-ro.c b/tests/test-mmap-shared-ro.c new file mode 100644 index 0000000..84b2c96 --- /dev/null +++ b/tests/test-mmap-shared-ro.c @@ -0,0 +1,249 @@ +/* Read-only MAP_SHARED file overlay tests + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Regression lock-in for the file-overlay path in src/syscall/mem.c. + * + * A MAP_SHARED, PROT_READ mapping of a file opened O_RDONLY is extremely + * common -- the JVM maps its ~135 MiB lib/modules image this way, and so do + * loaders that map read-only data segments. The original overlay code always + * mmap'd the host page PROT_READ|PROT_WRITE and mapped the HVF segment RWX, + * which fails twice for a read-only fd: the host mmap returns EACCES (writable + * mapping of an O_RDONLY fd) and, even forced to PROT_READ, hv_vm_map then + * fails because a MAP_SHARED-of-O_RDONLY region has macOS max_protection=READ. + * + * Syscalls exercised: openat, pwrite64, mmap, mprotect, munmap + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <string.h> +#include <sys/mman.h> +#include <unistd.h> + +#include "test-harness.h" + +int passes = 0, fails = 0; + +/* Several guest pages so the overlay spans more than one host page and the + * containing 2 MiB segment is split and remapped over a realistic range. + * 768 pages (3 MiB) crosses a 2 MiB boundary so hvf_segment_split's + * multi-block path is exercised, matching how JVM's ~135 MiB lib/modules + * image crosses many segments. 
*/ +#define NPAGES 768 +#define PGSZ ((size_t) 4096) +#define FILE_LEN (NPAGES * PGSZ) + +/* Nonzero per-page marker; prime period (251) exposes block-sized shifts. */ +static unsigned char page_marker(int page) +{ + return (unsigned char) (1 + (page % 251)); +} + +/* Create a file seeded with a per-page marker pattern, then close it. Returns + * the path in `out` (caller-sized buffer). Returns 0 on success, -1 on error. + */ +static int make_seed_file(char *out, size_t out_sz) +{ + snprintf(out, out_sz, "/tmp/elfuse-mmap-ro-%ld", (long) getpid()); + int fd = open(out, O_CREAT | O_TRUNC | O_RDWR, 0600); + if (fd < 0) + return -1; + for (int p = 0; p < NPAGES; p++) { + unsigned char buf[PGSZ]; + memset(buf, page_marker(p), sizeof(buf)); + off_t foff = (off_t) p * (off_t) PGSZ; + if (pwrite(fd, buf, sizeof(buf), foff) != (ssize_t) sizeof(buf)) { + close(fd); + unlink(out); + return -1; + } + } + close(fd); + return 0; +} + +/* The headline case: O_RDONLY fd + MAP_SHARED + PROT_READ must map and expose + * the full file contents. This is exactly the JVM lib/modules pattern. */ +static void test_rdonly_shared_read(const char *path) +{ + TEST("MAP_SHARED PROT_READ on O_RDONLY fd maps"); + + int fd = open(path, O_RDONLY); + if (fd < 0) { + FAIL("open O_RDONLY failed"); + return; + } + + unsigned char *p = mmap(NULL, FILE_LEN, PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + FAIL("mmap MAP_SHARED PROT_READ failed"); + close(fd); + return; + } + + bool ok = true; + for (int pg = 0; pg < NPAGES && ok; pg++) { + unsigned char want = page_marker(pg); + for (size_t off = 0; off < PGSZ; off += 512) { + if (p[pg * PGSZ + off] != want) { + ok = false; + break; + } + } + } + if (ok) + PASS(); + else + FAIL("mapped contents did not match file across pages"); + + munmap(p, FILE_LEN); + close(fd); +} + +/* The same content must be readable back-to-back through a fresh mapping, and + * a second concurrent read-only mapping of the same fd must also work. 
*/ +static void test_rdonly_shared_second_mapping(const char *path) +{ + TEST("second MAP_SHARED PROT_READ mapping maps"); + + int fd = open(path, O_RDONLY); + if (fd < 0) { + FAIL("open O_RDONLY failed"); + return; + } + + unsigned char *a = mmap(NULL, FILE_LEN, PROT_READ, MAP_SHARED, fd, 0); + unsigned char *b = mmap(NULL, FILE_LEN, PROT_READ, MAP_SHARED, fd, 0); + if (a == MAP_FAILED || b == MAP_FAILED) { + FAIL("one of two MAP_SHARED PROT_READ mappings failed"); + if (a != MAP_FAILED) + munmap(a, FILE_LEN); + if (b != MAP_FAILED) + munmap(b, FILE_LEN); + close(fd); + return; + } + + if (a[0] == page_marker(0) && b[0] == page_marker(0) && + a[(NPAGES - 1) * PGSZ] == page_marker(NPAGES - 1) && + b[(NPAGES - 1) * PGSZ] == page_marker(NPAGES - 1)) + PASS(); + else + FAIL("two concurrent read-only mappings disagree with file"); + + munmap(a, FILE_LEN); + munmap(b, FILE_LEN); + close(fd); +} + +/* A read-only mapping must stay read-only: requesting PROT_WRITE | MAP_SHARED + * on an O_RDONLY fd is EACCES on Linux, and elfuse must surface the same errno + * rather than silently succeeding with a private snapshot of the file. + */ +static void test_rdonly_shared_write_rejected(const char *path) +{ + TEST("MAP_SHARED PROT_WRITE on O_RDONLY fd is EACCES"); + + int fd = open(path, O_RDONLY); + if (fd < 0) { + FAIL("open O_RDONLY failed"); + return; + } + + void *p = mmap(NULL, FILE_LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED && errno == EACCES) { + PASS(); + } else { + FAIL("writable shared mapping of O_RDONLY fd was not rejected"); + if (p != MAP_FAILED) + munmap(p, FILE_LEN); + } + close(fd); +} + +/* A read-only mapping taken from a writable (O_RDWR) fd must also work; this + * exercises the MAP_SHARED-PROT_READ-on-writable-fd branch (max_protection RWX + * so the segment maps without dropping to MAP_PRIVATE). 
*/ +static void test_rdwr_fd_readonly_mapping(const char *path) +{ + TEST("MAP_SHARED PROT_READ on O_RDWR fd maps"); + + int fd = open(path, O_RDWR); + if (fd < 0) { + FAIL("open O_RDWR failed"); + return; + } + + unsigned char *p = mmap(NULL, FILE_LEN, PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + FAIL("mmap MAP_SHARED PROT_READ on O_RDWR fd failed"); + close(fd); + return; + } + + if (p[0] == page_marker(0) && + p[(NPAGES - 1) * PGSZ] == page_marker(NPAGES - 1)) + PASS(); + else + FAIL("read-only mapping of O_RDWR fd did not match file"); + + munmap(p, FILE_LEN); + close(fd); +} + +/* A read-only mapping's max_protection must stay READ: mprotect must not be + * able to upgrade it to PROT_WRITE after the fact. Linux remembers max_prot + * from the O_RDONLY fd at mmap time and rejects the upgrade with EACCES. */ +static void test_rdonly_mprotect_write_rejected(const char *path) +{ + TEST("mprotect PROT_WRITE on read-only MAP_SHARED mapping is EACCES"); + + int fd = open(path, O_RDONLY); + if (fd < 0) { + FAIL("open O_RDONLY failed"); + return; + } + + unsigned char *p = mmap(NULL, FILE_LEN, PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + FAIL("mmap MAP_SHARED PROT_READ failed"); + close(fd); + return; + } + + int rc = mprotect(p, FILE_LEN, PROT_READ | PROT_WRITE); + if (rc == -1 && errno == EACCES) + PASS(); + else + FAIL("mprotect PROT_WRITE upgrade was not rejected"); + + munmap(p, FILE_LEN); + close(fd); +} + +int main(void) +{ + printf("test-mmap-shared-ro: read-only MAP_SHARED file overlay tests\n\n"); + + char path[64]; + if (make_seed_file(path, sizeof(path)) != 0) { + printf(" %-30s FAIL: could not create seed file (errno=%d)\n", "setup", + errno); + return 1; + } + + test_rdonly_shared_read(path); + test_rdonly_shared_second_mapping(path); + test_rdonly_shared_write_rejected(path); + test_rdwr_fd_readonly_mapping(path); + test_rdonly_mprotect_write_rejected(path); + + unlink(path); + + SUMMARY("test-mmap-shared-ro"); + return fails ? 
1 : 0; +}