Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions src/core/guest.c
Original file line number Diff line number Diff line change
Expand Up @@ -1769,6 +1769,8 @@ static bool regions_mergeable(const guest_region_t *a, const guest_region_t *b)
*/
if (a->noreserve != b->noreserve)
return false;
if (a->backing_ro != b->backing_ro)
return false;
if (a->overlay_active || b->overlay_active)
return false;
if (strcmp(a->name, b->name) != 0)
Expand Down Expand Up @@ -1952,6 +1954,7 @@ int guest_region_add_ex_owned_gpa(guest_t *g,
r->backing_fd = owned_backing_fd;
r->shared = (flags & 0x01) != 0; /* LINUX_MAP_SHARED = 0x01 */
r->noreserve = (flags & 0x4000) != 0; /* LINUX_MAP_NORESERVE = 0x4000 */
r->backing_ro = false;
guest_region_clear_overlay(r);
if (name) {
str_copy_trunc(r->name, name, sizeof(r->name));
Expand Down Expand Up @@ -2162,6 +2165,20 @@ bool guest_region_range_has_noreserve(const guest_t *g,
return false;
}

/* Report whether the half-open range [start, end) touches at least one
 * tracked region that is both shared and flagged backing_ro.
 *
 * The scan begins at the index returned by guest_region_first_end_above(g,
 * start) and stops at the first region whose start is at or beyond end
 * (presumably the region array is ordered by address -- confirm against the
 * tracker's insertion code).
 *
 * Returns true on the first matching region, false if none is found.
 */
bool guest_region_range_has_ro_shared_backing(const guest_t *g,
                                              uint64_t start,
                                              uint64_t end)
{
    int idx = guest_region_first_end_above(g, start);

    while (idx < g->nregions && g->regions[idx].start < end) {
        const guest_region_t *reg = &g->regions[idx];

        if (reg->shared && reg->backing_ro)
            return true;
        idx++;
    }
    return false;
}

void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
{
/* Walk regions overlapping [start, end), split at boundaries, update prot.
Expand Down
15 changes: 15 additions & 0 deletions src/core/guest.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,12 @@ typedef struct {
int backing_fd; /* Duplicated host fd for file-backed mappings, or -1 */
bool shared; /* MAP_SHARED (writes should propagate) */
bool noreserve; /* MAP_NORESERVE: PTEs deferred until fault */
bool backing_ro; /* MAP_SHARED region whose backing_fd was opened
* without write access, so its Linux max_prot is
* capped to PROT_READ. sys_mprotect must reject any
* later PROT_WRITE request against it with EACCES,
* matching a real kernel's VMA max_prot tracking.
*/
bool overlay_active; /* Region has a live host MAP_FIXED|MAP_SHARED overlay
* of backing_fd at host_base+start. The kernel's page
* cache keeps it coherent with the file and with peer
Expand Down Expand Up @@ -1231,6 +1237,15 @@ bool guest_region_range_has_noreserve(const guest_t *g,
uint64_t start,
uint64_t end);

/* True if any tracked region overlapping [start, end) is MAP_SHARED with a
 * backing_fd that is not writable (backing_ro), i.e. its Linux max_prot is
 * capped to PROT_READ. sys_mprotect uses this to reject PROT_WRITE upgrades
 * with EACCES.
 *
 * g:     guest whose region tracker is consulted; it is not modified.
 * start: inclusive lower bound of the range to test.
 * end:   exclusive upper bound of the range to test.
 *
 * Returns false when no overlapping region has both flags set, including the
 * case where no tracked region overlaps the range at all.
 */
bool guest_region_range_has_ro_shared_backing(const guest_t *g,
uint64_t start,
uint64_t end);

/* Try to materialize a lazy (MAP_NORESERVE) page at the given offset. Called
* from the data/instruction abort handler when the faulting address falls
* within a noreserve region. Creates page table entries for one 2MiB block
Expand Down
71 changes: 71 additions & 0 deletions src/syscall/mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ typedef struct {
bool overlay_active;
uint64_t overlay_start;
uint64_t overlay_end;
bool backing_ro;
char name[sizeof(((guest_region_t *) 0)->name)];
} region_snapshot_t;

Expand Down Expand Up @@ -205,6 +206,21 @@ static void mark_overlay_metadata_range(guest_t *g,
}
}

/* Flag the tracked region whose bounds are exactly [start, end) as
 * backing_ro, recording that its backing fd is not writable so that
 * sys_mprotect refuses a later PROT_WRITE upgrade.
 *
 * The match is on exact bounds rather than overlap: callers invoke this
 * immediately after installing one freshly-added region and pass that
 * region's own bounds. Only the first region with matching bounds is
 * updated.
 *
 * NOTE(review): when no region has exactly these bounds the call is a
 * silent no-op. Confirm that the region just added cannot have been merged
 * with a neighbour or split before this runs.
 */
static void mark_region_backing_ro(guest_t *g, uint64_t start, uint64_t end)
{
    for (int idx = 0; idx < g->nregions; idx++) {
        guest_region_t *reg = &g->regions[idx];

        if (reg->start != start || reg->end != end)
            continue;

        reg->backing_ro = true;
        return;
    }
}

static void region_clip_overlay(guest_region_t *r)
{
if (!region_has_live_overlay(r) || r->end <= r->start) {
Expand Down Expand Up @@ -1099,6 +1115,7 @@ static int capture_region_snapshots(guest_t *g,
snap->overlay_active = r->overlay_active;
snap->overlay_start = r->overlay_start;
snap->overlay_end = r->overlay_end;
snap->backing_ro = r->backing_ro;
str_copy_trunc(snap->name, r->name, sizeof(snap->name));
}

Expand Down Expand Up @@ -1211,6 +1228,8 @@ static int restore_region_snapshots(guest_t *g, region_snapshot_t *snaps, int n)
return -LINUX_ENOMEM;
}
snap->backing_fd = -1;
if (snap->backing_ro)
mark_region_backing_ro(g, snap->start, snap->end);
}

for (int i = 0; i < n; i++) {
Expand Down Expand Up @@ -2426,6 +2445,26 @@ int64_t sys_mmap(guest_t *g,
* never reachable by the guest because the gap-finder advances the hint
* to the next host-page boundary after each allocation.
*/
/* MAP_SHARED | PROT_WRITE against a backing fd opened without write
* access must fail EACCES, matching Linux. The alignment-mismatch and
* read-only-fd cases below both fall through to the pread snapshot
* path, which always succeeds -- without this check a writable shared
* mapping request on a read-only fd would be silently downgraded to a
* private snapshot instead of being rejected.
*/
if ((flags & LINUX_MAP_SHARED) && (prot & LINUX_PROT_WRITE) &&
!overlay_fd_writable(host_backing_fd)) {
int rollback_err = rollback_fresh_mmap_allocation(
g, result_off, length, false, 0, 0, saved_mmap_next,
saved_mmap_end, saved_mmap_rx_next, saved_mmap_rx_end,
saved_rw_gap_hint, saved_rx_gap_hint);
if (track_backing_fd >= 0)
close(track_backing_fd);
host_fd_ref_close(&backing_ref);
if (rollback_err < 0)
return rollback_err;
return -LINUX_EACCES;
}
/* overlay_fd_writable rejects read-only backing fds inside
* hvf_apply_file_overlay; mirror the check here so a read-only mmap
* takes the snapshot pread path directly, skipping the thread_quiesce /
Expand Down Expand Up @@ -2554,6 +2593,15 @@ int64_t sys_mmap(guest_t *g,
}
}

/* A MAP_SHARED mapping whose backing fd cannot be written to has Linux
* max_prot capped to PROT_READ, whether or not the pread snapshot path
* above actually installed a live overlay. sys_mprotect consults this to
* reject a later PROT_WRITE upgrade with EACCES.
*/
if (!is_anon && fd >= 0 && !is_prot_none && (flags & LINUX_MAP_SHARED) &&
!overlay_fd_writable(host_backing_fd))
mark_region_backing_ro(g, result_off, result_off + length);

host_fd_ref_close(&backing_ref);
dispose_region_snapshots(&replaced_snaps, &replaced_nsnaps);

Expand Down Expand Up @@ -2697,6 +2745,7 @@ int64_t sys_mremap(guest_t *g,
if (old_reg && old_reg->backing_fd >= 0 && track_backing_fd < 0)
return -LINUX_ENOMEM;
bool source_overlay = old_reg && region_has_live_overlay(old_reg);
bool source_backing_ro = old_reg && old_reg->backing_ro;
uint64_t source_file_off =
old_reg ? old_reg->offset + (old_off - old_reg->start) : 0;
char track_name[sizeof(old_reg->name)] = {0};
Expand Down Expand Up @@ -2867,6 +2916,8 @@ int64_t sys_mremap(guest_t *g,
dispose_region_snapshots(&dest_snaps, &dest_nsnaps);
return -LINUX_ENOMEM;
}
if (source_backing_ro)
mark_region_backing_ro(g, new_off, new_off + new_size);
dispose_region_snapshots(&source_snaps, &source_nsnaps);
dispose_region_snapshots(&dest_snaps, &dest_nsnaps);
return (int64_t) guest_ipa(g, new_off);
Expand Down Expand Up @@ -2919,6 +2970,7 @@ int64_t sys_mremap(guest_t *g,
old_overlay ? old_reg->overlay_start : 0;
uint64_t old_overlay_end =
old_overlay ? old_reg->overlay_end : 0;
bool old_backing_ro = old_reg && old_reg->backing_ro;
if (old_reg && old_reg->backing_fd >= 0 && track_backing_fd < 0)
return -LINUX_ENOMEM;
char track_name[sizeof(old_reg->name)] = {0};
Expand All @@ -2945,6 +2997,8 @@ int64_t sys_mremap(guest_t *g,
mark_overlay_metadata_range(g, old_off, old_off + old_size,
old_overlay_start,
old_overlay_end);
if (old_backing_ro)
mark_region_backing_ro(g, old_off, old_off + new_size);

/* Update high-water marks */
uint64_t hwm = old_off + new_size;
Expand Down Expand Up @@ -2978,6 +3032,7 @@ int64_t sys_mremap(guest_t *g,
uint64_t source_overlay_start =
source_overlay ? old_reg->overlay_start : 0;
uint64_t source_overlay_end = source_overlay ? old_reg->overlay_end : 0;
bool source_backing_ro = old_reg && old_reg->backing_ro;
uint64_t source_file_off =
old_reg ? old_reg->offset + (old_off - old_reg->start) : 0;
uint64_t source_overlay_file_off =
Expand Down Expand Up @@ -3092,6 +3147,8 @@ int64_t sys_mremap(guest_t *g,
g, new_off, new_off + new_size, prot, track_flags, track_offset,
track_name[0] ? track_name : NULL, track_backing_fd) < 0)
return -LINUX_ENOMEM;
if (source_backing_ro)
mark_region_backing_ro(g, new_off, new_off + new_size);

/* Update high-water marks */
uint64_t hwm = new_off + new_size;
Expand Down Expand Up @@ -3457,6 +3514,14 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot)
(prot & LINUX_PROT_EXEC))
return -LINUX_EINVAL;

/* A MAP_SHARED region whose backing fd cannot be written to has
* Linux max_prot capped to PROT_READ; reject an upgrade the same
* way a real kernel's VMA max_prot check would.
*/
if ((prot & LINUX_PROT_WRITE) &&
guest_region_range_has_ro_shared_backing(g, addr, mprot_end))
return -LINUX_EACCES;

/* Fast path: if the tracker already records this prot for every
* overlapping region and none are MAP_NORESERVE, page tables are
* already in sync and no PTE work is required. The tracker update
Expand Down Expand Up @@ -3498,6 +3563,12 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot)
if (guest_range_hits_infra(g, mprot_off, mprot_end))
return -LINUX_EINVAL;

/* Same max_prot check as the high-VA branch above. */
if ((prot & LINUX_PROT_WRITE) &&
guest_region_range_has_ro_shared_backing(g, mprot_off,
mprot_end))
return -LINUX_EACCES;

/* Same fast path / ordering / staleness gate as above. */
if (mprotect_same_prot_fast_path_safe(prot) &&
!g->regions_tracker_stale &&
Expand Down
3 changes: 3 additions & 0 deletions tests/manifest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ test-shim-cred-race
[section] msync MAP_SHARED tests
test-msync

[section] Read-only MAP_SHARED file overlay tests
test-mmap-shared-ro

[section] Cross-fork MAP_SHARED coherence tests
test-cross-fork-mapshared # diff=skip

Expand Down
Loading
Loading