Skip to main content

starnix_core/mm/
memory_manager.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mm::barrier::{BarrierType, system_barrier};
6use crate::mm::mapping::MappingBackingMemory;
7use crate::mm::memory::MemoryObject;
8use crate::mm::memory_accessor::{MemoryAccessor, TaskMemoryAccessor};
9use crate::mm::private_anonymous_memory_manager::PrivateAnonymousMemoryManager;
10use crate::mm::{
11    FaultRegisterMode, FutexTable, InflightVmsplicedPayloads, MapInfoCache, Mapping,
12    MappingBacking, MappingFlags, MappingName, MappingNameRef, MlockPinFlavor, PrivateFutexKey,
13    ProtectionFlags, UserFault, VMEX_RESOURCE, VmsplicePayload, VmsplicePayloadSegment,
14    read_to_array,
15};
16use crate::security;
17use crate::signals::{SignalDetail, SignalInfo};
18use crate::task::{CurrentTask, ExceptionResult, PageFaultExceptionReport, Task};
19use crate::vfs::aio::AioContext;
20use crate::vfs::pseudo::dynamic_file::{
21    DynamicFile, DynamicFileBuf, DynamicFileSource, SequenceFileSource,
22};
23use crate::vfs::{FsString, NamespaceNode};
24use anyhow::{Error, anyhow};
25use bitflags::bitflags;
26use flyweights::FlyByteStr;
27use linux_uapi::BUS_ADRERR;
28use memory_pinning::PinnedMapping;
29use range_map::RangeMap;
30use starnix_ext::map_ext::EntryExt;
31use starnix_lifecycle::DropNotifier;
32use starnix_logging::{
33    CATEGORY_STARNIX_MM, impossible_error, log_warn, trace_duration, track_stub,
34};
35use starnix_sync::{
36    LockBefore, Locked, MmDumpable, OrderedMutex, RwLock, RwLockWriteGuard, ThreadGroupLimits,
37    Unlocked, UserFaultInner,
38};
39use starnix_types::arch::ArchWidth;
40use starnix_types::futex_address::FutexAddress;
41use starnix_types::math::{round_down_to_system_page_size, round_up_to_system_page_size};
42use starnix_types::ownership::{TempRef, WeakRef};
43use starnix_types::user_buffer::{UserBuffer, UserBuffers};
44use starnix_uapi::auth::CAP_IPC_LOCK;
45use starnix_uapi::errors::Errno;
46use starnix_uapi::file_mode::Access;
47use starnix_uapi::range_ext::RangeExt;
48use starnix_uapi::resource_limits::Resource;
49use starnix_uapi::restricted_aspace::{
50    RESTRICTED_ASPACE_BASE, RESTRICTED_ASPACE_HIGHEST_ADDRESS, RESTRICTED_ASPACE_RANGE,
51    RESTRICTED_ASPACE_SIZE,
52};
53use starnix_uapi::signals::{SIGBUS, SIGSEGV};
54use starnix_uapi::user_address::{ArchSpecific, UserAddress};
55use starnix_uapi::{
56    MADV_COLD, MADV_COLLAPSE, MADV_DODUMP, MADV_DOFORK, MADV_DONTDUMP, MADV_DONTFORK,
57    MADV_DONTNEED, MADV_DONTNEED_LOCKED, MADV_FREE, MADV_HUGEPAGE, MADV_HWPOISON, MADV_KEEPONFORK,
58    MADV_MERGEABLE, MADV_NOHUGEPAGE, MADV_NORMAL, MADV_PAGEOUT, MADV_POPULATE_READ, MADV_RANDOM,
59    MADV_REMOVE, MADV_SEQUENTIAL, MADV_SOFT_OFFLINE, MADV_UNMERGEABLE, MADV_WILLNEED,
60    MADV_WIPEONFORK, MREMAP_DONTUNMAP, MREMAP_FIXED, MREMAP_MAYMOVE, SI_KERNEL, errno, error,
61};
62use std::collections::HashMap;
63use std::mem::MaybeUninit;
64use std::ops::{Deref, DerefMut, Range, RangeBounds};
65use std::sync::{Arc, LazyLock, Weak};
66use syncio::zxio::zxio_default_maybe_faultable_copy;
67use zerocopy::IntoBytes;
68use zx::{HandleBased, Rights, VmarInfo, VmoChildOptions};
69
/// `zx::VmarFlagsExtended::SPECIFIC_OVERWRITE` re-expressed as a `zx::VmarFlags`
/// value so it can be bitwise-combined with the other VMAR flags used below.
pub const ZX_VM_SPECIFIC_OVERWRITE: zx::VmarFlags =
    zx::VmarFlags::from_bits_retain(zx::VmarFlagsExtended::SPECIFIC_OVERWRITE.bits());
72
// We do not create shared processes in unit tests, so unified address spaces are
// only enabled outside `cfg(test)` builds.
pub(crate) const UNIFIED_ASPACES_ENABLED: bool = cfg!(not(test));
75
76/// Initializes the usercopy utilities.
77///
78/// It is useful to explicitly call this so that the usercopy is initialized
79/// at a known instant. For example, Starnix may want to make sure the usercopy
80/// thread created to support user copying is associated to the Starnix process
81/// and not a restricted-mode process.
82pub fn init_usercopy() {
83    // This call lazily initializes the `Usercopy` instance.
84    let _ = usercopy();
85}
86
/// Number of pages reserved below a GROWSDOWN mapping as its guard region, giving
/// the mapping room to grow downwards.
pub const GUARD_PAGE_COUNT_FOR_GROWSDOWN_MAPPINGS: usize = 256;

// Bits of ASLR entropy applied to 64-bit processes; architecture-specific.
#[cfg(target_arch = "x86_64")]
const ASLR_RANDOM_BITS: usize = 27;

#[cfg(target_arch = "aarch64")]
const ASLR_RANDOM_BITS: usize = 28;

#[cfg(target_arch = "riscv64")]
const ASLR_RANDOM_BITS: usize = 18;

/// Number of bits of entropy for processes running in 32 bits mode.
const ASLR_32_RANDOM_BITS: usize = 8;

// The biggest we expect stack to be; increase as needed
// TODO(https://fxbug.dev/322874791): Once setting RLIMIT_STACK is implemented, we should use it.
const MAX_STACK_SIZE: usize = 512 * 1024 * 1024;

// Value to report temporarily as the VM RSS HWM.
// TODO(https://fxbug.dev/396221597): Need support from the kernel to track the committed bytes high
// water mark.
const STUB_VM_RSS_HWM: usize = 2 * 1024 * 1024;
109
110fn usercopy() -> Option<&'static usercopy::Usercopy> {
111    static USERCOPY: LazyLock<Option<usercopy::Usercopy>> = LazyLock::new(|| {
112        // We do not create shared processes in unit tests.
113        if UNIFIED_ASPACES_ENABLED {
114            // ASUMPTION: All Starnix managed Linux processes have the same
115            // restricted mode address range.
116            Some(usercopy::Usercopy::new(RESTRICTED_ASPACE_RANGE).unwrap())
117        } else {
118            None
119        }
120    });
121
122    LazyLock::force(&USERCOPY).as_ref()
123}
124
125/// Provides an implementation for zxio's `zxio_maybe_faultable_copy` that supports
126/// catching faults.
127///
128/// See zxio's `zxio_maybe_faultable_copy` documentation for more details.
129///
130/// # Safety
131///
132/// Only one of `src`/`dest` may be an address to a buffer owned by user/restricted-mode
133/// (`ret_dest` indicates whether the user-owned buffer is `dest` when `true`).
134/// The other must be a valid Starnix/normal-mode buffer that will never cause a fault
135/// when the first `count` bytes are read/written.
136#[unsafe(no_mangle)]
137pub unsafe fn zxio_maybe_faultable_copy_impl(
138    dest: *mut u8,
139    src: *const u8,
140    count: usize,
141    ret_dest: bool,
142) -> bool {
143    if let Some(usercopy) = usercopy() {
144        #[allow(clippy::undocumented_unsafe_blocks, reason = "2024 edition migration")]
145        let ret = unsafe { usercopy.raw_hermetic_copy(dest, src, count, ret_dest) };
146        ret == count
147    } else {
148        #[allow(clippy::undocumented_unsafe_blocks, reason = "2024 edition migration")]
149        unsafe {
150            zxio_default_maybe_faultable_copy(dest, src, count, ret_dest)
151        }
152    }
153}
154
/// The system page size in bytes, fetched once from Zircon and cached.
pub static PAGE_SIZE: LazyLock<u64> = LazyLock::new(|| zx::system_get_page_size() as u64);
156
// Per-mapping option flags, largely mirroring the mmap() MAP_* options (SHARED,
// ANONYMOUS, GROWSDOWN, POPULATE, ...) plus Starnix-internal bookkeeping bits.
bitflags! {
    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct MappingOptions: u16 {
      const SHARED      = 1 << 0;
      const ANONYMOUS   = 1 << 1;
      const LOWER_32BIT = 1 << 2;
      const GROWSDOWN   = 1 << 3;
      const ELF_BINARY  = 1 << 4;
      const DONTFORK    = 1 << 5;
      const WIPEONFORK  = 1 << 6;
      const DONT_SPLIT  = 1 << 7;
      const DONT_EXPAND = 1 << 8;
      const POPULATE    = 1 << 9;
    }
}
172
// Flag bits accepted by mremap(2) (the MREMAP_* constants).
bitflags! {
    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct MremapFlags: u32 {
        const MAYMOVE = MREMAP_MAYMOVE;
        const FIXED = MREMAP_FIXED;
        const DONTUNMAP = MREMAP_DONTUNMAP;
    }
}
181
// Flag bits accepted by msync(2) (the MS_* constants).
bitflags! {
    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct MsyncFlags: u32 {
        const ASYNC = starnix_uapi::MS_ASYNC;
        const INVALIDATE = starnix_uapi::MS_INVALIDATE;
        const SYNC = starnix_uapi::MS_SYNC;
    }
}
190
/// Maximum distance, in bytes, that the program break may be moved above its base.
const PROGRAM_BREAK_LIMIT: u64 = 64 * 1024 * 1024;
192
/// Tracks the data-segment state backing the brk/sbrk syscalls.
#[derive(Debug, Clone, Eq, PartialEq)]
struct ProgramBreak {
    // The base address at which the data segment is mapped.
    base: UserAddress,

    // The current program break.
    //
    // The addresses from [base, current.round_up(*PAGE_SIZE)) are mapped into the
    // client address space from the underlying |memory|.
    current: UserAddress,
}
204
/// The policy about whether the address space can be dumped.
///
/// Variants correspond to the Linux `SUID_DUMP_*` values noted on each variant.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum DumpPolicy {
    /// The address space cannot be dumped.
    ///
    /// Corresponds to SUID_DUMP_DISABLE.
    Disable,

    /// The address space can be dumped.
    ///
    /// Corresponds to SUID_DUMP_USER.
    User,
}
218
// Supported types of membarriers.
pub enum MembarrierType {
    // Plain memory barrier (MEMBARRIER_CMD_GLOBAL, etc).
    Memory,
    // Core-serializing barrier (MEMBARRIER_CMD_..._SYNC_CORE).
    SyncCore,
}
224
// Tracks the types of membarriers this address space is registered to receive.
#[derive(Default, Clone)]
struct MembarrierRegistrations {
    // Registered for plain memory membarriers.
    memory: bool,
    // Registered for SYNC_CORE membarriers.
    sync_core: bool,
}
231
/// The mutable, lock-protected portion of a task's memory manager.
pub struct MemoryManagerState {
    /// The VMAR in which userspace mappings occur.
    ///
    /// We map userspace memory in this child VMAR so that we can destroy the
    /// entire VMAR during exec.
    /// For 32-bit tasks, we limit the user_vmar to correspond to the available memory.
    ///
    /// This field is set to `ZX_HANDLE_INVALID` when the address-space has been destroyed (e.g. on
    /// `exec()`), allowing the value to be pro-actively checked for, or the `ZX_ERR_BAD_HANDLE`
    /// status return from Zircon operations handled, to suit the call-site.
    user_vmar: zx::Vmar,

    /// Cached VmarInfo for user_vmar.
    user_vmar_info: zx::VmarInfo,

    /// The memory mappings currently used by this address space.
    ///
    /// The mappings record which object backs each address.
    mappings: RangeMap<UserAddress, Mapping>,

    /// Memory object backing private, anonymous memory allocations in this address space.
    private_anonymous: PrivateAnonymousMemoryManager,

    /// UserFaults registered with this memory manager.
    userfaultfds: Vec<Weak<UserFault>>,

    /// Shadow mappings for mlock()'d pages.
    ///
    /// Used for MlockPinFlavor::ShadowProcess to keep track of when we need to unmap
    /// memory from the shadow process.
    shadow_mappings_for_mlock: RangeMap<UserAddress, Arc<PinnedMapping>>,

    /// The portion of this state that can be cloned for a forked address space; its
    /// fields are also exposed directly via the `Deref`/`DerefMut` impls below.
    forkable_state: MemoryManagerForkableState,
}
266
// 64 KiB below the 4 GiB boundary (0xffff_0000 = 4 GiB - 64 KiB).
const LOWER_4GB_LIMIT: UserAddress = UserAddress::const_from(0xffff_0000);
269
/// The part of `MemoryManagerState` that can be cloned for a forked address space.
/// Accessible through `Deref`/`DerefMut` on `MemoryManagerState`.
#[derive(Default, Clone)]
pub struct MemoryManagerForkableState {
    /// State for the brk and sbrk syscalls.
    brk: Option<ProgramBreak>,

    /// The namespace node that represents the executable associated with this task.
    executable_node: Option<NamespaceNode>,

    // Recorded layout of the initial stack region.
    pub stack_size: usize,
    pub stack_start: UserAddress,
    // Recorded bounds of the auxv, argv, and environ regions.
    pub auxv_start: UserAddress,
    pub auxv_end: UserAddress,
    pub argv_start: UserAddress,
    pub argv_end: UserAddress,
    pub environ_start: UserAddress,
    pub environ_end: UserAddress,

    /// vDSO location
    pub vdso_base: UserAddress,

    /// Randomized regions:
    pub mmap_top: UserAddress,
    pub stack_origin: UserAddress,
    pub brk_origin: UserAddress,

    // Membarrier registrations
    membarrier_registrations: MembarrierRegistrations,
}
298
// Expose the forkable sub-state's fields directly on `MemoryManagerState`.
impl Deref for MemoryManagerState {
    type Target = MemoryManagerForkableState;
    fn deref(&self) -> &Self::Target {
        &self.forkable_state
    }
}
305
// Mutable counterpart of the `Deref` impl for `MemoryManagerState`.
impl DerefMut for MemoryManagerState {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.forkable_state
    }
}
311
/// Collects mappings (and mlock pins) removed from the address space so they can be
/// dropped only after the memory-manager lock is released (see `finalize`).
#[derive(Debug, Default)]
struct ReleasedMappings {
    // Mappings awaiting release.
    doomed: Vec<Mapping>,
    // Pinned (mlocked) mappings awaiting release.
    doomed_pins: Vec<Arc<PinnedMapping>>,
}
317
impl ReleasedMappings {
    /// Queues mappings to be dropped by `finalize`.
    fn extend(&mut self, mappings: impl IntoIterator<Item = Mapping>) {
        self.doomed.extend(mappings);
    }

    /// Queues mlock pins to be dropped by `finalize`.
    fn extend_pins(&mut self, mappings: impl IntoIterator<Item = Arc<PinnedMapping>>) {
        self.doomed_pins.extend(mappings);
    }

    /// Returns true when nothing is queued for release.
    fn is_empty(&self) -> bool {
        self.doomed.is_empty() && self.doomed_pins.is_empty()
    }

    #[cfg(test)]
    fn len(&self) -> usize {
        self.doomed.len() + self.doomed_pins.len()
    }

    /// Drops all queued mappings, releasing the memory-manager lock first.
    ///
    /// Must be called before this collection is dropped (enforced by the `Drop` impl).
    fn finalize(&mut self, mm_state: RwLockWriteGuard<'_, MemoryManagerState>) {
        // Drop the state before the unmapped mappings, since dropping a mapping may acquire a lock
        // in `DirEntry`'s `drop`.
        std::mem::drop(mm_state);
        std::mem::take(&mut self.doomed);
        std::mem::take(&mut self.doomed_pins);
    }
}
344
345impl Drop for ReleasedMappings {
346    fn drop(&mut self) {
347        assert!(self.is_empty(), "ReleasedMappings::finalize() must be called before drop");
348    }
349}
350
351fn map_in_vmar(
352    vmar: &zx::Vmar,
353    vmar_info: &zx::VmarInfo,
354    addr: SelectedAddress,
355    memory: &MemoryObject,
356    memory_offset: u64,
357    length: usize,
358    flags: MappingFlags,
359    populate: bool,
360) -> Result<UserAddress, Errno> {
361    let vmar_offset = addr.addr().checked_sub(vmar_info.base).ok_or_else(|| errno!(ENOMEM))?;
362    let vmar_extra_flags = match addr {
363        SelectedAddress::Fixed(_) => zx::VmarFlags::SPECIFIC,
364        SelectedAddress::FixedOverwrite(_) => ZX_VM_SPECIFIC_OVERWRITE,
365    };
366
367    if populate {
368        let op = if flags.contains(MappingFlags::WRITE) {
369            // Requires ZX_RIGHT_WRITEABLE which we should expect when the mapping is writeable.
370            zx::VmoOp::COMMIT
371        } else {
372            // When we don't expect to have ZX_RIGHT_WRITEABLE, fall back to a VMO op that doesn't
373            // need it.
374            zx::VmoOp::PREFETCH
375        };
376        trace_duration!(CATEGORY_STARNIX_MM, "MmapCommitPages");
377        let _ = memory.op_range(op, memory_offset, length as u64);
378        // "The mmap() call doesn't fail if the mapping cannot be populated."
379    }
380
381    let vmar_maybe_map_range = if populate && !vmar_extra_flags.contains(ZX_VM_SPECIFIC_OVERWRITE) {
382        zx::VmarFlags::MAP_RANGE
383    } else {
384        zx::VmarFlags::empty()
385    };
386    let vmar_flags = flags.access_flags().to_vmar_flags()
387        | zx::VmarFlags::ALLOW_FAULTS
388        | vmar_extra_flags
389        | vmar_maybe_map_range;
390
391    let map_result = memory.map_in_vmar(vmar, vmar_offset.ptr(), memory_offset, length, vmar_flags);
392    let mapped_addr = map_result.map_err(MemoryManager::get_errno_for_map_err)?;
393
394    Ok(UserAddress::from_ptr(mapped_addr))
395}
396
397impl MemoryManagerState {
398    /// Returns occupied address ranges that intersect with the given range.
399    ///
400    /// An address range is "occupied" if (a) there is already a mapping in that range or (b) there
401    /// is a GROWSDOWN mapping <= 256 pages above that range. The 256 pages below a GROWSDOWN
402    /// mapping is the "guard region." The memory manager avoids mapping memory in the guard region
403    /// in some circumstances to preserve space for the GROWSDOWN mapping to grow down.
404    fn get_occupied_address_ranges<'a>(
405        &'a self,
406        subrange: &'a Range<UserAddress>,
407    ) -> impl Iterator<Item = Range<UserAddress>> + 'a {
408        let query_range = subrange.start
409            ..(subrange
410                .end
411                .saturating_add(*PAGE_SIZE as usize * GUARD_PAGE_COUNT_FOR_GROWSDOWN_MAPPINGS));
412        self.mappings.range(query_range).filter_map(|(range, mapping)| {
413            let occupied_range = mapping.inflate_to_include_guard_pages(range);
414            if occupied_range.start < subrange.end && subrange.start < occupied_range.end {
415                Some(occupied_range)
416            } else {
417                None
418            }
419        })
420    }
421
    /// Counts how many page-aligned start addresses within `subrange` could hold an
    /// allocation of `length` bytes without intersecting an occupied range.
    ///
    /// Returns `None` on address-arithmetic overflow.
    fn count_possible_placements(
        &self,
        length: usize,
        subrange: &Range<UserAddress>,
    ) -> Option<usize> {
        let mut occupied_ranges = self.get_occupied_address_ranges(subrange);
        let mut possible_placements = 0;
        // If the allocation is placed at the first available address, every page that is left
        // before the next mapping or the end of subrange is +1 potential placement.
        let mut first_fill_end = subrange.start.checked_add(length)?;
        while first_fill_end <= subrange.end {
            let Some(mapping) = occupied_ranges.next() else {
                // No more occupied ranges: every remaining page-aligned start fits.
                possible_placements += (subrange.end - first_fill_end) / (*PAGE_SIZE as usize) + 1;
                break;
            };
            if mapping.start >= first_fill_end {
                // Count the placements that fit between the current position and this mapping.
                possible_placements += (mapping.start - first_fill_end) / (*PAGE_SIZE as usize) + 1;
            }
            first_fill_end = mapping.end.checked_add(length)?;
        }
        Some(possible_placements)
    }
444
    /// Returns the start address of the `chosen_placement_idx`-th possible placement
    /// for a `length`-byte allocation in `subrange`, enumerating placements in the
    /// same order as `count_possible_placements`.
    ///
    /// Returns `None` on address-arithmetic overflow.
    fn pick_placement(
        &self,
        length: usize,
        mut chosen_placement_idx: usize,
        subrange: &Range<UserAddress>,
    ) -> Option<UserAddress> {
        let mut candidate =
            Range { start: subrange.start, end: subrange.start.checked_add(length)? };
        let mut occupied_ranges = self.get_occupied_address_ranges(subrange);
        loop {
            let Some(mapping) = occupied_ranges.next() else {
                // No more mappings: treat the rest of the index as an offset.
                let res =
                    candidate.start.checked_add(chosen_placement_idx * *PAGE_SIZE as usize)?;
                debug_assert!(res.checked_add(length)? <= subrange.end);
                return Some(res);
            };
            if mapping.start < candidate.end {
                // doesn't fit, skip
                candidate = Range { start: mapping.end, end: mapping.end.checked_add(length)? };
                continue;
            }
            // Number of distinct placements that fit before this occupied range.
            let unused_space =
                (mapping.start.ptr() - candidate.end.ptr()) / (*PAGE_SIZE as usize) + 1;
            if unused_space > chosen_placement_idx {
                // Chosen placement is within the range; treat the rest of the index as an offset.
                let res =
                    candidate.start.checked_add(chosen_placement_idx * *PAGE_SIZE as usize)?;
                return Some(res);
            }

            // chosen address is further up, skip
            chosen_placement_idx -= unused_space;
            candidate = Range { start: mapping.end, end: mapping.end.checked_add(length)? };
        }
    }
481
482    fn find_random_unused_range(
483        &self,
484        length: usize,
485        subrange: &Range<UserAddress>,
486    ) -> Option<UserAddress> {
487        let possible_placements = self.count_possible_placements(length, subrange)?;
488        if possible_placements == 0 {
489            return None;
490        }
491        let chosen_placement_idx = rand::random_range(0..possible_placements);
492        self.pick_placement(length, chosen_placement_idx, subrange)
493    }
494
    // Find the first unused range of addresses that fits a mapping of `length` bytes, searching
    // from `mmap_top` downwards.
    pub fn find_next_unused_range(&self, length: usize) -> Option<UserAddress> {
        let gap_size = length as u64;
        let mut upper_bound = self.mmap_top;

        loop {
            // Locate the end of a gap of at least `gap_size` bytes below `upper_bound`.
            let gap_end = self.mappings.find_gap_end(gap_size, &upper_bound);
            // `None` on underflow: no room left below the bound.
            let candidate = gap_end.checked_sub(length)?;

            // Is there a next mapping? If not, the candidate is already good.
            let Some((occupied_range, mapping)) = self.mappings.get(gap_end) else {
                return Some(candidate);
            };
            // Account for the guard region of a GROWSDOWN mapping at `gap_end`.
            let occupied_range = mapping.inflate_to_include_guard_pages(occupied_range);
            // If it doesn't overlap, the gap is big enough to fit.
            if occupied_range.start >= gap_end {
                return Some(candidate);
            }
            // If there was a mapping in the way, use the start of that range as the upper bound.
            upper_bound = occupied_range.start;
        }
    }
518
519    // Accept the hint if the range is unused and within the range available for mapping.
520    fn is_hint_acceptable(&self, hint_addr: UserAddress, length: usize) -> bool {
521        let Some(hint_end) = hint_addr.checked_add(length) else {
522            return false;
523        };
524        if !RESTRICTED_ASPACE_RANGE.contains(&hint_addr.ptr())
525            || !RESTRICTED_ASPACE_RANGE.contains(&hint_end.ptr())
526        {
527            return false;
528        };
529        self.get_occupied_address_ranges(&(hint_addr..hint_end)).next().is_none()
530    }
531
532    fn select_address(
533        &self,
534        addr: DesiredAddress,
535        length: usize,
536        flags: MappingFlags,
537    ) -> Result<SelectedAddress, Errno> {
538        let adjusted_length = round_up_to_system_page_size(length).or_else(|_| error!(ENOMEM))?;
539
540        let find_address = || -> Result<SelectedAddress, Errno> {
541            let new_addr = if flags.contains(MappingFlags::LOWER_32BIT) {
542                // MAP_32BIT specifies that the memory allocated will
543                // be within the first 2 GB of the process address space.
544                self.find_random_unused_range(
545                    adjusted_length,
546                    &(UserAddress::from_ptr(RESTRICTED_ASPACE_BASE)
547                        ..UserAddress::from_ptr(0x80000000)),
548                )
549                .ok_or_else(|| errno!(ENOMEM))?
550            } else {
551                self.find_next_unused_range(adjusted_length).ok_or_else(|| errno!(ENOMEM))?
552            };
553
554            Ok(SelectedAddress::Fixed(new_addr))
555        };
556
557        Ok(match addr {
558            DesiredAddress::Any => find_address()?,
559            DesiredAddress::Hint(hint_addr) => {
560                // Round down to page size
561                let hint_addr =
562                    UserAddress::from_ptr(hint_addr.ptr() - hint_addr.ptr() % *PAGE_SIZE as usize);
563                if self.is_hint_acceptable(hint_addr, adjusted_length) {
564                    SelectedAddress::Fixed(hint_addr)
565                } else {
566                    find_address()?
567                }
568            }
569            DesiredAddress::Fixed(addr) => SelectedAddress::Fixed(addr),
570            DesiredAddress::FixedOverwrite(addr) => SelectedAddress::FixedOverwrite(addr),
571        })
572    }
573
    // Map the memory without updating `self.mappings`.
    //
    // Forwards to the free function `map_in_vmar` with this state's user VMAR and
    // its cached `VmarInfo`; the caller is responsible for the bookkeeping.
    fn map_in_user_vmar(
        &self,
        addr: SelectedAddress,
        memory: &MemoryObject,
        memory_offset: u64,
        length: usize,
        flags: MappingFlags,
        populate: bool,
    ) -> Result<UserAddress, Errno> {
        map_in_vmar(
            &self.user_vmar,
            &self.user_vmar_info,
            addr,
            memory,
            memory_offset,
            length,
            flags,
            populate,
        )
    }
595
596    fn validate_addr(&self, addr: DesiredAddress, length: usize) -> Result<(), Errno> {
597        if let DesiredAddress::FixedOverwrite(addr) = addr {
598            if self.check_has_unauthorized_splits(addr, length) {
599                return error!(ENOMEM);
600            }
601        }
602        Ok(())
603    }
604
    /// Maps `length` bytes of `memory`, starting at `memory_offset`, at an address
    /// chosen according to `addr`, and records the result in `self.mappings`.
    ///
    /// For `DesiredAddress::FixedOverwrite`, pre-existing mappings in the target
    /// range are replaced; the displaced entries are queued on `released_mappings`
    /// so they are dropped only after the state lock is released.
    ///
    /// Returns the address at which the memory was actually mapped.
    fn map_memory(
        &mut self,
        mm: &Arc<MemoryManager>,
        addr: DesiredAddress,
        memory: Arc<MemoryObject>,
        memory_offset: u64,
        length: usize,
        flags: MappingFlags,
        max_access: Access,
        populate: bool,
        name: MappingName,
        released_mappings: &mut ReleasedMappings,
    ) -> Result<UserAddress, Errno> {
        self.validate_addr(addr, length)?;

        let selected_address = self.select_address(addr, length, flags)?;
        let mapped_addr = self.map_in_user_vmar(
            selected_address,
            &memory,
            memory_offset,
            length,
            flags,
            populate,
        )?;

        // The mapped extent is tracked in whole pages.
        let end = (mapped_addr + length)?.round_up(*PAGE_SIZE)?;

        if let DesiredAddress::FixedOverwrite(addr) = addr {
            // The VMAR overwrite already replaced the old pages; update our own
            // bookkeeping to match.
            assert_eq!(addr, mapped_addr);
            self.update_after_unmap(mm, addr, end - addr, released_mappings)?;
        }

        let mapping = Mapping::with_name(
            self.create_memory_backing(mapped_addr, memory, memory_offset),
            flags,
            max_access,
            name,
        );
        // Inserting may displace overlapping entries; queue them for deferred release.
        released_mappings.extend(self.mappings.insert(mapped_addr..end, mapping));

        Ok(mapped_addr)
    }
647
    /// Maps `length` bytes of private anonymous memory at an address chosen
    /// according to `addr` and records the mapping in `self.mappings`.
    ///
    /// Mappings displaced by a `FixedOverwrite` request are queued on
    /// `released_mappings` for deferred release.
    fn map_private_anonymous(
        &mut self,
        mm: &Arc<MemoryManager>,
        addr: DesiredAddress,
        length: usize,
        prot_flags: ProtectionFlags,
        options: MappingOptions,
        populate: bool,
        name: MappingName,
        released_mappings: &mut ReleasedMappings,
    ) -> Result<UserAddress, Errno> {
        self.validate_addr(addr, length)?;

        let flags = MappingFlags::from_access_flags_and_options(prot_flags, options);
        let selected_addr = self.select_address(addr, length, flags)?;
        // The private anonymous backing object is indexed by user address: the offset
        // into it is the selected address itself.
        let backing_memory_offset = selected_addr.addr().ptr();

        let mapped_addr = self.map_in_user_vmar(
            selected_addr,
            &self.private_anonymous.backing,
            backing_memory_offset as u64,
            length,
            flags,
            populate,
        )?;

        // The mapped extent is tracked in whole pages.
        let end = (mapped_addr + length)?.round_up(*PAGE_SIZE)?;
        if let DesiredAddress::FixedOverwrite(addr) = addr {
            assert_eq!(addr, mapped_addr);
            self.update_after_unmap(mm, addr, end - addr, released_mappings)?;
        }

        let mapping = Mapping::new_private_anonymous(flags, name);
        released_mappings.extend(self.mappings.insert(mapped_addr..end, mapping));

        Ok(mapped_addr)
    }
685
    /// Maps anonymous memory.
    ///
    /// Private (non-SHARED) requests are delegated to `map_private_anonymous`;
    /// SHARED requests are backed by a freshly-created anonymous memory object.
    fn map_anonymous(
        &mut self,
        mm: &Arc<MemoryManager>,
        addr: DesiredAddress,
        length: usize,
        prot_flags: ProtectionFlags,
        options: MappingOptions,
        name: MappingName,
        released_mappings: &mut ReleasedMappings,
    ) -> Result<UserAddress, Errno> {
        if !options.contains(MappingOptions::SHARED) {
            return self.map_private_anonymous(
                mm,
                addr,
                length,
                prot_flags,
                options,
                options.contains(MappingOptions::POPULATE),
                name,
                released_mappings,
            );
        }
        let memory = create_anonymous_mapping_memory(length as u64)?;
        let flags = MappingFlags::from_access_flags_and_options(prot_flags, options);
        self.map_memory(
            mm,
            addr,
            memory,
            0,
            length,
            flags,
            Access::rwx(),
            options.contains(MappingOptions::POPULATE),
            name,
            released_mappings,
        )
    }
723
    /// Implements the core of mremap(2): validates the flag combination, attempts an
    /// in-place resize where allowed, and otherwise moves the mapping when
    /// `MREMAP_MAYMOVE` permits.
    fn remap(
        &mut self,
        _current_task: &CurrentTask,
        mm: &Arc<MemoryManager>,
        old_addr: UserAddress,
        old_length: usize,
        new_length: usize,
        flags: MremapFlags,
        new_addr: UserAddress,
        released_mappings: &mut ReleasedMappings,
    ) -> Result<UserAddress, Errno> {
        // MREMAP_FIXED moves a mapping, which requires MREMAP_MAYMOVE.
        if flags.contains(MremapFlags::FIXED) && !flags.contains(MremapFlags::MAYMOVE) {
            return error!(EINVAL);
        }

        // MREMAP_DONTUNMAP is always a move, so it requires MREMAP_MAYMOVE.
        // There is no resizing allowed either.
        if flags.contains(MremapFlags::DONTUNMAP)
            && (!flags.contains(MremapFlags::MAYMOVE) || old_length != new_length)
        {
            return error!(EINVAL);
        }

        // In-place copies are invalid.
        if !flags.contains(MremapFlags::MAYMOVE) && old_length == 0 {
            return error!(ENOMEM);
        }

        if new_length == 0 {
            return error!(EINVAL);
        }

        // Make sure old_addr is page-aligned.
        if !old_addr.is_aligned(*PAGE_SIZE) {
            return error!(EINVAL);
        }

        let old_length = round_up_to_system_page_size(old_length)?;
        let new_length = round_up_to_system_page_size(new_length)?;

        if self.check_has_unauthorized_splits(old_addr, old_length) {
            return error!(EINVAL);
        }

        if self.check_has_unauthorized_splits(new_addr, new_length) {
            return error!(EINVAL);
        }

        if !flags.contains(MremapFlags::DONTUNMAP)
            && !flags.contains(MremapFlags::FIXED)
            && old_length != 0
        {
            // We are not requested to remap to a specific address, so first we see if we can remap
            // in-place. In-place copies (old_length == 0) are not allowed.
            if let Some(new_addr) =
                self.try_remap_in_place(mm, old_addr, old_length, new_length, released_mappings)?
            {
                return Ok(new_addr);
            }
        }

        // There is no space to grow in place, or there is an explicit request to move.
        if flags.contains(MremapFlags::MAYMOVE) {
            let dst_address =
                if flags.contains(MremapFlags::FIXED) { Some(new_addr) } else { None };
            self.remap_move(
                mm,
                old_addr,
                old_length,
                dst_address,
                new_length,
                flags.contains(MremapFlags::DONTUNMAP),
                released_mappings,
            )
        } else {
            error!(ENOMEM)
        }
    }
803
    /// Attempts to grow or shrink the mapping in-place. Returns `Ok(Some(addr))` if the remap was
    /// successful. Returns `Ok(None)` if there was no space to grow.
    ///
    /// Shrinking always succeeds (it is just an unmap of the tail). Growing succeeds only when
    /// the pages immediately after `old_range` are unmapped and the old range lies within a
    /// single contiguous mapping.
    ///
    /// # Errors
    /// - `EINVAL` if an address computation overflows or `old_addr` is not mapped.
    /// - `EFAULT` if the old range extends past the end of its containing mapping.
    ///
    /// Mappings displaced by this operation are collected in `released_mappings`.
    fn try_remap_in_place(
        &mut self,
        mm: &Arc<MemoryManager>,
        old_addr: UserAddress,
        old_length: usize,
        new_length: usize,
        released_mappings: &mut ReleasedMappings,
    ) -> Result<Option<UserAddress>, Errno> {
        let old_range = old_addr..old_addr.checked_add(old_length).ok_or_else(|| errno!(EINVAL))?;
        let new_range_in_place =
            old_addr..old_addr.checked_add(new_length).ok_or_else(|| errno!(EINVAL))?;

        if new_length <= old_length {
            // Shrink the mapping in-place, which should always succeed.
            // This is done by unmapping the extraneous region.
            if new_length != old_length {
                self.unmap(mm, new_range_in_place.end, old_length - new_length, released_mappings)?;
            }
            return Ok(Some(old_addr));
        }

        if self.mappings.range(old_range.end..new_range_in_place.end).next().is_some() {
            // There is some mapping in the growth range preventing an in-place growth.
            return Ok(None);
        }

        // There is space to grow in-place. The old range must be one contiguous mapping.
        let (original_range, mapping) =
            self.mappings.get(old_addr).ok_or_else(|| errno!(EINVAL))?;

        if old_range.end > original_range.end {
            return error!(EFAULT);
        }
        // Clone out of `self.mappings` so we can mutate it below.
        let original_range = original_range.clone();
        let original_mapping = mapping.clone();

        // Compute the new length of the entire mapping once it has grown.
        let final_length = (original_range.end - original_range.start) + (new_length - old_length);

        match self.get_mapping_backing(&original_mapping) {
            MappingBacking::Memory(backing) => {
                // Re-map the original range, which may include pages before the requested range.
                Ok(Some(self.map_memory(
                    mm,
                    DesiredAddress::FixedOverwrite(original_range.start),
                    backing.memory().clone(),
                    backing.address_to_offset(original_range.start),
                    final_length,
                    original_mapping.flags(),
                    original_mapping.max_access(),
                    false,
                    original_mapping.name().to_owned(),
                    released_mappings,
                )?))
            }
            MappingBacking::PrivateAnonymous => {
                let growth_start = original_range.end;
                let growth_length = new_length - old_length;
                let final_end = (original_range.start + final_length)?;
                // Map new pages to back the growth. Note that the offset passed into the
                // private-anonymous backing is the virtual address itself (`growth_start.ptr()`).
                self.map_in_user_vmar(
                    SelectedAddress::FixedOverwrite(growth_start),
                    &self.private_anonymous.backing,
                    growth_start.ptr() as u64,
                    growth_length,
                    original_mapping.flags(),
                    false,
                )?;
                // Overwrite the mapping entry with the new larger size.
                released_mappings.extend(
                    self.mappings.insert(original_range.start..final_end, original_mapping.clone()),
                );
                Ok(Some(original_range.start))
            }
        }
    }
882
    /// Grows or shrinks the mapping while moving it to a new destination.
    ///
    /// `dst_addr` of `None` lets the implementation pick the new address (plain `MREMAP_MAYMOVE`);
    /// `Some(addr)` corresponds to `MREMAP_FIXED`. `keep_source` corresponds to
    /// `MREMAP_DONTUNMAP` (leave the source range mapped after the move).
    ///
    /// Note that some side effects (shrinking the source, unmapping the destination) are applied
    /// eagerly and remain visible even if a later step fails; this mirrors Linux behavior.
    fn remap_move(
        &mut self,
        mm: &Arc<MemoryManager>,
        src_addr: UserAddress,
        src_length: usize,
        dst_addr: Option<UserAddress>,
        dst_length: usize,
        keep_source: bool,
        released_mappings: &mut ReleasedMappings,
    ) -> Result<UserAddress, Errno> {
        let src_range = src_addr..src_addr.checked_add(src_length).ok_or_else(|| errno!(EINVAL))?;
        let (original_range, src_mapping) =
            self.mappings.get(src_addr).ok_or_else(|| errno!(EINVAL))?;
        // Clone out of `self.mappings` so we can mutate it below.
        let original_range = original_range.clone();
        let src_mapping = src_mapping.clone();

        if src_length == 0 && !src_mapping.flags().contains(MappingFlags::SHARED) {
            // src_length == 0 means that the mapping is to be copied. This behavior is only valid
            // with MAP_SHARED mappings.
            return error!(EINVAL);
        }

        // If the destination range is smaller than the source range, we must first shrink
        // the source range in place. This must be done now and visible to processes, even if
        // a later failure causes the remap operation to fail.
        if src_length != 0 && src_length > dst_length {
            self.unmap(mm, (src_addr + dst_length)?, src_length - dst_length, released_mappings)?;
        }

        let dst_addr_for_map = match dst_addr {
            None => DesiredAddress::Any,
            Some(dst_addr) => {
                // The mapping is being moved to a specific address.
                let dst_range =
                    dst_addr..(dst_addr.checked_add(dst_length).ok_or_else(|| errno!(EINVAL))?);
                // Source and destination ranges must not overlap.
                if !src_range.intersect(&dst_range).is_empty() {
                    return error!(EINVAL);
                }

                // The destination range must be unmapped. This must be done now and visible to
                // processes, even if a later failure causes the remap operation to fail.
                self.unmap(mm, dst_addr, dst_length, released_mappings)?;

                DesiredAddress::Fixed(dst_addr)
            }
        };

        // According to gVisor's aio_test, Linux checks for DONT_EXPAND after unmapping the dst
        // range.
        if dst_length > src_length && src_mapping.flags().contains(MappingFlags::DONT_EXPAND) {
            return error!(EFAULT);
        }

        if src_range.end > original_range.end {
            // The source range is not one contiguous mapping. This check must be done only after
            // the source range is shrunk and the destination unmapped.
            return error!(EFAULT);
        }

        match self.get_mapping_backing(&src_mapping) {
            MappingBacking::PrivateAnonymous => {
                let dst_addr =
                    self.select_address(dst_addr_for_map, dst_length, src_mapping.flags())?.addr();
                let dst_end = (dst_addr + dst_length)?;

                let length_to_move = std::cmp::min(dst_length, src_length) as u64;
                let growth_start_addr = (dst_addr + length_to_move)?;

                if dst_addr != src_addr {
                    let src_move_end = (src_range.start + length_to_move)?;
                    let range_to_move = src_range.start..src_move_end;
                    // Move the previously mapped pages into their new location.
                    self.private_anonymous.move_pages(&range_to_move, dst_addr)?;
                }

                // Userfault registration is not preserved by remap
                let new_flags =
                    src_mapping.flags().difference(MappingFlags::UFFD | MappingFlags::UFFD_MISSING);
                self.map_in_user_vmar(
                    SelectedAddress::FixedOverwrite(dst_addr),
                    &self.private_anonymous.backing,
                    dst_addr.ptr() as u64,
                    dst_length,
                    new_flags,
                    false,
                )?;

                if dst_length > src_length {
                    // The mapping has grown, map new pages in to cover the growth.
                    let growth_length = dst_length - src_length;

                    self.map_private_anonymous(
                        mm,
                        DesiredAddress::FixedOverwrite(growth_start_addr),
                        growth_length,
                        new_flags.access_flags(),
                        new_flags.options(),
                        false,
                        src_mapping.name().to_owned(),
                        released_mappings,
                    )?;
                }

                // Record the destination mapping; any entries it displaces are released.
                released_mappings.extend(self.mappings.insert(
                    dst_addr..dst_end,
                    Mapping::new_private_anonymous(new_flags, src_mapping.name().to_owned()),
                ));

                if dst_addr != src_addr && src_length != 0 && !keep_source {
                    self.unmap(mm, src_addr, src_length, released_mappings)?;
                }

                return Ok(dst_addr);
            }
            MappingBacking::Memory(backing) => {
                // This mapping is backed by an FD or is a shared anonymous mapping. Just map the
                // range of the memory object covering the moved pages. If the memory object already
                // had COW semantics, this preserves them.
                let (dst_memory_offset, memory) =
                    (backing.address_to_offset(src_addr), backing.memory().clone());

                let new_address = self.map_memory(
                    mm,
                    dst_addr_for_map,
                    memory,
                    dst_memory_offset,
                    dst_length,
                    src_mapping.flags(),
                    src_mapping.max_access(),
                    false,
                    src_mapping.name().to_owned(),
                    released_mappings,
                )?;

                if src_length != 0 && !keep_source {
                    // Only unmap the source range if this is not a copy and if there was not a specific
                    // request to not unmap. It was checked earlier that in case of src_length == 0
                    // this mapping is MAP_SHARED.
                    self.unmap(mm, src_addr, src_length, released_mappings)?;
                }

                return Ok(new_address);
            }
        };
    }
1029
1030    // Checks if an operation may be performed over the target mapping that may
1031    // result in a split mapping.
1032    //
1033    // An operation may be forbidden if the target mapping only partially covers
1034    // an existing mapping with the `MappingOptions::DONT_SPLIT` flag set.
1035    fn check_has_unauthorized_splits(&self, addr: UserAddress, length: usize) -> bool {
1036        let query_range = addr..addr.saturating_add(length);
1037        let mut intersection = self.mappings.range(query_range.clone());
1038
1039        // A mapping is not OK if it disallows splitting and the target range
1040        // does not fully cover the mapping range.
1041        let check_if_mapping_has_unauthorized_split =
1042            |mapping: Option<(&Range<UserAddress>, &Mapping)>| {
1043                mapping.is_some_and(|(mapping_range, mapping)| {
1044                    mapping.flags().contains(MappingFlags::DONT_SPLIT)
1045                        && (mapping_range.start < query_range.start
1046                            || query_range.end < mapping_range.end)
1047                })
1048            };
1049
1050        // We only check the first and last mappings in the range because naturally,
1051        // the mappings in the middle are fully covered by the target mapping and
1052        // won't be split.
1053        check_if_mapping_has_unauthorized_split(intersection.next())
1054            || check_if_mapping_has_unauthorized_split(intersection.next_back())
1055    }
1056
1057    /// Unmaps the specified range. Unmapped mappings are placed in `released_mappings`.
1058    fn unmap(
1059        &mut self,
1060        mm: &Arc<MemoryManager>,
1061        addr: UserAddress,
1062        length: usize,
1063        released_mappings: &mut ReleasedMappings,
1064    ) -> Result<(), Errno> {
1065        if !addr.is_aligned(*PAGE_SIZE) {
1066            return error!(EINVAL);
1067        }
1068        let length = round_up_to_system_page_size(length)?;
1069        if length == 0 {
1070            return error!(EINVAL);
1071        }
1072
1073        if self.check_has_unauthorized_splits(addr, length) {
1074            return error!(EINVAL);
1075        }
1076
1077        // Unmap the range, including the the tail of any range that would have been split. This
1078        // operation is safe because we're operating on another process.
1079        #[allow(
1080            clippy::undocumented_unsafe_blocks,
1081            reason = "Force documented unsafe blocks in Starnix"
1082        )]
1083        match unsafe { self.user_vmar.unmap(addr.ptr(), length) } {
1084            Ok(_) => (),
1085            Err(zx::Status::NOT_FOUND) => (),
1086            Err(zx::Status::INVALID_ARGS) => return error!(EINVAL),
1087            Err(status) => {
1088                impossible_error(status);
1089            }
1090        };
1091
1092        self.update_after_unmap(mm, addr, length, released_mappings)?;
1093
1094        Ok(())
1095    }
1096
    // Updates `self.mappings` after the specified range was unmapped.
    //
    // The range to unmap can span multiple mappings, and can split mappings if
    // the range start or end falls in the middle of a mapping.
    //
    // Private anonymous memory is contained in the same memory object; The pages of that object
    // that are no longer reachable should be released.
    //
    // File-backed mappings don't need to have their memory object modified.
    //
    // Unmapped mappings are placed in `released_mappings`.
    fn update_after_unmap(
        &mut self,
        mm: &Arc<MemoryManager>,
        addr: UserAddress,
        length: usize,
        released_mappings: &mut ReleasedMappings,
    ) -> Result<(), Errno> {
        let end_addr = addr.checked_add(length).ok_or_else(|| errno!(EINVAL))?;
        let unmap_range = addr..end_addr;

        // Remove any shadow mappings for mlock()'d pages that are now unmapped.
        released_mappings.extend_pins(self.shadow_mappings_for_mlock.remove(unmap_range.clone()));

        for (range, mapping) in self.mappings.range(unmap_range.clone()) {
            // Deallocate any pages in the private, anonymous backing that are now unreachable.
            if let MappingBacking::PrivateAnonymous = self.get_mapping_backing(mapping) {
                // Only the overlap between this mapping and the unmapped range is affected.
                let unmapped_range = &unmap_range.intersect(range);

                // Give any in-flight vmspliced payloads a chance to copy out pages that are
                // about to be released.
                mm.inflight_vmspliced_payloads
                    .handle_unmapping(&self.private_anonymous.backing, unmapped_range)?;

                // Release the now-unreachable pages by zeroing them in the backing object.
                self.private_anonymous
                    .zero(unmapped_range.start, unmapped_range.end - unmapped_range.start)?;
            }
        }
        // Drop the bookkeeping entries; callers release their resources via
        // `released_mappings` once it is safe to do so.
        released_mappings.extend(self.mappings.remove(unmap_range));
        return Ok(());
    }
1136
1137    fn protect_vmar_range(
1138        &self,
1139        addr: UserAddress,
1140        length: usize,
1141        prot_flags: ProtectionFlags,
1142    ) -> Result<(), Errno> {
1143        let vmar_flags = prot_flags.to_vmar_flags();
1144        // SAFETY: Modifying user vmar
1145        unsafe { self.user_vmar.protect(addr.ptr(), length, vmar_flags) }.map_err(|s| match s {
1146            zx::Status::INVALID_ARGS => errno!(EINVAL),
1147            zx::Status::NOT_FOUND => errno!(ENOMEM),
1148            zx::Status::ACCESS_DENIED => errno!(EACCES),
1149            _ => impossible_error(s),
1150        })
1151    }
1152
    /// Changes the protection flags on all mappings overlapping `[addr, addr + end)`
    /// (the `mprotect` syscall path).
    ///
    /// When `PROT_GROWSDOWN` is set, the protected range is extended downward over
    /// the contiguous run of `GROWSDOWN` mappings that share the starting mapping's
    /// access flags.
    ///
    /// Mapping entries whose flags change are re-inserted; any entries they displace
    /// are collected in `released_mappings`.
    fn protect(
        &mut self,
        current_task: &CurrentTask,
        addr: UserAddress,
        length: usize,
        prot_flags: ProtectionFlags,
        released_mappings: &mut ReleasedMappings,
    ) -> Result<(), Errno> {
        let vmar_flags = prot_flags.to_vmar_flags();
        let page_size = *PAGE_SIZE;
        let end = addr.checked_add(length).ok_or_else(|| errno!(EINVAL))?.round_up(page_size)?;

        // Refuse the operation if it would partially cover a DONT_SPLIT mapping.
        if self.check_has_unauthorized_splits(addr, length) {
            return error!(EINVAL);
        }

        let prot_range = if prot_flags.contains(ProtectionFlags::GROWSDOWN) {
            let mut start = addr;
            let Some((range, mapping)) = self.mappings.get(start) else {
                return error!(EINVAL);
            };
            // Ensure that the mapping has GROWSDOWN if PROT_GROWSDOWN was specified.
            if !mapping.flags().contains(MappingFlags::GROWSDOWN) {
                return error!(EINVAL);
            }
            let access_flags = mapping.flags().access_flags();
            // From <https://man7.org/linux/man-pages/man2/mprotect.2.html>:
            //
            //   PROT_GROWSDOWN
            //     Apply the protection mode down to the beginning of a
            //     mapping that grows downward (which should be a stack
            //     segment or a segment mapped with the MAP_GROWSDOWN flag
            //     set).
            start = range.start;
            // Walk downward one page at a time, absorbing adjacent GROWSDOWN
            // mappings with identical access flags.
            while let Some((range, mapping)) =
                self.mappings.get(start.saturating_sub(page_size as usize))
            {
                if !mapping.flags().contains(MappingFlags::GROWSDOWN)
                    || mapping.flags().access_flags() != access_flags
                {
                    break;
                }
                start = range.start;
            }
            start..end
        } else {
            addr..end
        };

        let addr = prot_range.start;
        let length = prot_range.end - prot_range.start;

        // TODO: We should check the max_access flags on all the mappings in this range.
        //       There are cases where max_access is more restrictive than the Zircon rights
        //       we hold on the underlying VMOs.

        // TODO(https://fxbug.dev/411617451): `mprotect` should apply the protection flags
        // until it encounters a mapping that doesn't allow it, rather than not apply the protection
        // flags at all if a single mapping doesn't allow it.
        for (range, mapping) in self.mappings.range(prot_range.clone()) {
            security::file_mprotect(current_task, range, mapping, prot_flags)?;
        }

        // Make one call to mprotect to update all the zircon protections.
        // SAFETY: This is safe because the vmar belongs to a different process.
        unsafe { self.user_vmar.protect(addr.ptr(), length, vmar_flags) }.map_err(|s| match s {
            zx::Status::INVALID_ARGS => errno!(EINVAL),
            zx::Status::NOT_FOUND => {
                track_stub!(
                    TODO("https://fxbug.dev/322875024"),
                    "mprotect: succeed and update prot after NOT_FOUND"
                );
                errno!(EINVAL)
            }
            zx::Status::ACCESS_DENIED => errno!(EACCES),
            _ => impossible_error(s),
        })?;

        // Update the flags on each mapping in the range.
        let mut updates = vec![];
        for (range, mapping) in self.mappings.range(prot_range.clone()) {
            if mapping.flags().contains(MappingFlags::UFFD) {
                track_stub!(
                    TODO("https://fxbug.dev/297375964"),
                    "mprotect on uffd-registered range should not alter protections"
                );
                return error!(EINVAL);
            }
            // Only the overlapping portion of the mapping changes protection.
            let range = range.intersect(&prot_range);
            let mut mapping = mapping.clone();
            mapping.set_flags(mapping.flags().with_access_flags(prot_flags));
            updates.push((range, mapping));
        }
        // Use a separate loop to avoid mutating the mappings structure while iterating over it.
        for (range, mapping) in updates {
            released_mappings.extend(self.mappings.insert(range, mapping));
        }
        Ok(())
    }
1252
    /// Implements `madvise(2)` for `[addr, addr + length)`.
    ///
    /// Flag-style advice (the fork/dump/merge hint family) is handled by updating
    /// each overlapping mapping's flags. The remaining advice values are translated
    /// into a `zx::VmoOp` applied to the backing memory object, or rejected with
    /// `EINVAL` where support is stubbed out.
    fn madvise(
        &mut self,
        _current_task: &CurrentTask,
        addr: UserAddress,
        length: usize,
        advice: u32,
        released_mappings: &mut ReleasedMappings,
    ) -> Result<(), Errno> {
        // The start address must be page-aligned; the end is rounded up to a page.
        if !addr.is_aligned(*PAGE_SIZE) {
            return error!(EINVAL);
        }

        let end_addr =
            addr.checked_add(length).ok_or_else(|| errno!(EINVAL))?.round_up(*PAGE_SIZE)?;
        if end_addr > self.max_address() {
            return error!(EFAULT);
        }

        if advice == MADV_NORMAL {
            track_stub!(TODO("https://fxbug.dev/322874202"), "madvise undo hints for MADV_NORMAL");
            return Ok(());
        }

        let mut updates = vec![];
        let range_for_op = addr..end_addr;
        for (range, mapping) in self.mappings.range(range_for_op.clone()) {
            // NOTE: despite the name, `range_to_zero` is the sub-range the advice
            // applies to; it is only literally zeroed for the DONTNEED/FREE ops.
            let range_to_zero = range.intersect(&range_for_op);
            if range_to_zero.is_empty() {
                continue;
            }
            let start_offset = mapping.address_to_offset(range_to_zero.start);
            let end_offset = mapping.address_to_offset(range_to_zero.end);
            if advice == MADV_DONTFORK
                || advice == MADV_DOFORK
                || advice == MADV_WIPEONFORK
                || advice == MADV_KEEPONFORK
                || advice == MADV_DONTDUMP
                || advice == MADV_DODUMP
                || advice == MADV_MERGEABLE
                || advice == MADV_UNMERGEABLE
            {
                // WIPEONFORK is only supported on private anonymous mappings per madvise(2).
                // KEEPONFORK can be specified on ranges that cover other sorts of mappings. It should
                // have no effect on mappings that are not private and anonymous as such mappings cannot
                // have the WIPEONFORK option set.
                if advice == MADV_WIPEONFORK && !mapping.private_anonymous() {
                    return error!(EINVAL);
                }
                let new_flags = match advice {
                    MADV_DONTFORK => mapping.flags() | MappingFlags::DONTFORK,
                    MADV_DOFORK => mapping.flags() & MappingFlags::DONTFORK.complement(),
                    MADV_WIPEONFORK => mapping.flags() | MappingFlags::WIPEONFORK,
                    MADV_KEEPONFORK => mapping.flags() & MappingFlags::WIPEONFORK.complement(),
                    MADV_DONTDUMP => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_DONTDUMP");
                        mapping.flags()
                    }
                    MADV_DODUMP => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_DODUMP");
                        mapping.flags()
                    }
                    MADV_MERGEABLE => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_MERGEABLE");
                        mapping.flags()
                    }
                    MADV_UNMERGEABLE => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_UNMERGEABLE");
                        mapping.flags()
                    }
                    // Only the variants in this match should be reachable given the condition for
                    // the containing branch.
                    unknown_advice => unreachable!("unknown advice {unknown_advice}"),
                };
                let mut new_mapping = mapping.clone();
                new_mapping.set_flags(new_flags);
                updates.push((range_to_zero, new_mapping));
            } else {
                // Operational advice: shared mappings are skipped entirely.
                if mapping.flags().contains(MappingFlags::SHARED) {
                    continue;
                }
                let op = match advice {
                    MADV_DONTNEED if !mapping.flags().contains(MappingFlags::ANONYMOUS) => {
                        // Note, we cannot simply implement MADV_DONTNEED with
                        // zx::VmoOp::DONT_NEED because they have different
                        // semantics.
                        track_stub!(
                            TODO("https://fxbug.dev/322874496"),
                            "MADV_DONTNEED with file-backed mapping"
                        );
                        return error!(EINVAL);
                    }
                    MADV_DONTNEED if mapping.flags().contains(MappingFlags::LOCKED) => {
                        return error!(EINVAL);
                    }
                    MADV_DONTNEED => zx::VmoOp::ZERO,
                    MADV_DONTNEED_LOCKED => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_DONTNEED_LOCKED");
                        return error!(EINVAL);
                    }
                    MADV_WILLNEED => {
                        if mapping.flags().contains(MappingFlags::WRITE) {
                            zx::VmoOp::COMMIT
                        } else {
                            zx::VmoOp::PREFETCH
                        }
                    }
                    MADV_COLD => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_COLD");
                        return error!(EINVAL);
                    }
                    MADV_PAGEOUT => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_PAGEOUT");
                        return error!(EINVAL);
                    }
                    MADV_POPULATE_READ => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_POPULATE_READ");
                        return error!(EINVAL);
                    }
                    MADV_RANDOM => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_RANDOM");
                        return error!(EINVAL);
                    }
                    MADV_SEQUENTIAL => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_SEQUENTIAL");
                        return error!(EINVAL);
                    }
                    MADV_FREE if !mapping.flags().contains(MappingFlags::ANONYMOUS) => {
                        track_stub!(
                            TODO("https://fxbug.dev/411748419"),
                            "MADV_FREE with file-backed mapping"
                        );
                        return error!(EINVAL);
                    }
                    MADV_FREE if mapping.flags().contains(MappingFlags::LOCKED) => {
                        return error!(EINVAL);
                    }
                    MADV_FREE => {
                        track_stub!(TODO("https://fxbug.dev/411748419"), "MADV_FREE");
                        // TODO(https://fxbug.dev/411748419) For now, treat MADV_FREE like
                        // MADV_DONTNEED as a stopgap until we have proper support.
                        zx::VmoOp::ZERO
                    }
                    MADV_REMOVE => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_REMOVE");
                        return error!(EINVAL);
                    }
                    MADV_HWPOISON => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_HWPOISON");
                        return error!(EINVAL);
                    }
                    MADV_SOFT_OFFLINE => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_SOFT_OFFLINE");
                        return error!(EINVAL);
                    }
                    MADV_HUGEPAGE => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_HUGEPAGE");
                        return error!(EINVAL);
                    }
                    MADV_COLLAPSE => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "MADV_COLLAPSE");
                        return error!(EINVAL);
                    }
                    MADV_NOHUGEPAGE => return Ok(()),
                    advice => {
                        track_stub!(TODO("https://fxbug.dev/322874202"), "madvise", advice);
                        return error!(EINVAL);
                    }
                };

                // Apply the op to the mapping's backing memory object.
                let memory = match self.get_mapping_backing(mapping) {
                    MappingBacking::Memory(backing) => backing.memory(),
                    MappingBacking::PrivateAnonymous => &self.private_anonymous.backing,
                };
                memory.op_range(op, start_offset, end_offset - start_offset).map_err(
                    |s| match s {
                        zx::Status::OUT_OF_RANGE => errno!(EINVAL),
                        zx::Status::NO_MEMORY => errno!(ENOMEM),
                        zx::Status::INVALID_ARGS => errno!(EINVAL),
                        zx::Status::ACCESS_DENIED => errno!(EACCES),
                        _ => impossible_error(s),
                    },
                )?;
            }
        }
        // Use a separate loop to avoid mutating the mappings structure while iterating over it.
        for (range, mapping) in updates {
            released_mappings.extend(self.mappings.insert(range, mapping));
        }
        Ok(())
    }
1443
1444    fn mlock<L>(
1445        &mut self,
1446        current_task: &CurrentTask,
1447        locked: &mut Locked<L>,
1448        desired_addr: UserAddress,
1449        desired_length: usize,
1450        on_fault: bool,
1451        released_mappings: &mut ReleasedMappings,
1452    ) -> Result<(), Errno>
1453    where
1454        L: LockBefore<ThreadGroupLimits>,
1455    {
1456        let desired_end_addr =
1457            desired_addr.checked_add(desired_length).ok_or_else(|| errno!(EINVAL))?;
1458        let start_addr = round_down_to_system_page_size(desired_addr)?;
1459        let end_addr = round_up_to_system_page_size(desired_end_addr)?;
1460
1461        let mut updates = vec![];
1462        let mut bytes_mapped_in_range = 0;
1463        let mut num_new_locked_bytes = 0;
1464        let mut failed_to_lock = false;
1465        for (range, mapping) in self.mappings.range(start_addr..end_addr) {
1466            let mut range = range.clone();
1467            let mut mapping = mapping.clone();
1468
1469            // Handle mappings that start before the region to be locked.
1470            range.start = std::cmp::max(range.start, start_addr);
1471            // Handle mappings that extend past the region to be locked.
1472            range.end = std::cmp::min(range.end, end_addr);
1473
1474            bytes_mapped_in_range += (range.end - range.start) as u64;
1475
1476            // PROT_NONE mappings generate ENOMEM but are left locked.
1477            if !mapping
1478                .flags()
1479                .intersects(MappingFlags::READ | MappingFlags::WRITE | MappingFlags::EXEC)
1480            {
1481                failed_to_lock = true;
1482            }
1483
1484            if !mapping.flags().contains(MappingFlags::LOCKED) {
1485                num_new_locked_bytes += (range.end - range.start) as u64;
1486                let shadow_mapping = match current_task.kernel().features.mlock_pin_flavor {
1487                    // Pin the memory by mapping the backing memory into the high priority vmar.
1488                    MlockPinFlavor::ShadowProcess => {
1489                        let shadow_process =
1490                            current_task.kernel().expando.get_or_try_init(|| {
1491                                memory_pinning::ShadowProcess::new(zx::Name::new_lossy(
1492                                    "starnix_mlock_pins",
1493                                ))
1494                                .map(MlockShadowProcess)
1495                                .map_err(|_| errno!(EPERM))
1496                            })?;
1497
1498                        let (vmo, offset) = match self.get_mapping_backing(&mapping) {
1499                            MappingBacking::Memory(m) => (
1500                                m.memory().as_vmo().ok_or_else(|| errno!(ENOMEM))?,
1501                                m.address_to_offset(range.start),
1502                            ),
1503                            MappingBacking::PrivateAnonymous => (
1504                                self.private_anonymous
1505                                    .backing
1506                                    .as_vmo()
1507                                    .ok_or_else(|| errno!(ENOMEM))?,
1508                                range.start.ptr() as u64,
1509                            ),
1510                        };
1511                        Some(shadow_process.0.pin_pages(vmo, offset, range.end - range.start)?)
1512                    }
1513
1514                    // Relying on VMAR-level operations means just flags are set per-mapping.
1515                    MlockPinFlavor::Noop | MlockPinFlavor::VmarAlwaysNeed => None,
1516                };
1517                mapping.set_mlock();
1518                updates.push((range, mapping, shadow_mapping));
1519            }
1520        }
1521
1522        if bytes_mapped_in_range as usize != end_addr - start_addr {
1523            return error!(ENOMEM);
1524        }
1525
1526        let memlock_rlimit = current_task.thread_group().get_rlimit(locked, Resource::MEMLOCK);
1527        if self.total_locked_bytes() + num_new_locked_bytes > memlock_rlimit {
1528            if crate::security::check_task_capable(current_task, CAP_IPC_LOCK).is_err() {
1529                let code = if memlock_rlimit > 0 { errno!(ENOMEM) } else { errno!(EPERM) };
1530                return Err(code);
1531            }
1532        }
1533
1534        let op_range_status_to_errno = |e| match e {
1535            zx::Status::BAD_STATE | zx::Status::NOT_SUPPORTED => errno!(ENOMEM),
1536            zx::Status::INVALID_ARGS | zx::Status::OUT_OF_RANGE => errno!(EINVAL),
1537            zx::Status::ACCESS_DENIED => {
1538                unreachable!("user vmar should always have needed rights")
1539            }
1540            zx::Status::BAD_HANDLE => {
1541                unreachable!("user vmar should always be a valid handle")
1542            }
1543            zx::Status::WRONG_TYPE => unreachable!("user vmar handle should be a vmar"),
1544            _ => unreachable!("unknown error from op_range on user vmar for mlock: {e}"),
1545        };
1546
1547        if !on_fault && !current_task.kernel().features.mlock_always_onfault {
1548            self.user_vmar
1549                .op_range(zx::VmarOp::PREFETCH, start_addr.ptr(), end_addr - start_addr)
1550                .map_err(op_range_status_to_errno)?;
1551        }
1552
1553        match current_task.kernel().features.mlock_pin_flavor {
1554            MlockPinFlavor::VmarAlwaysNeed => {
1555                self.user_vmar
1556                    .op_range(zx::VmarOp::ALWAYS_NEED, start_addr.ptr(), end_addr - start_addr)
1557                    .map_err(op_range_status_to_errno)?;
1558            }
1559            // The shadow process doesn't use any vmar-level operations to pin memory.
1560            MlockPinFlavor::Noop | MlockPinFlavor::ShadowProcess => (),
1561        }
1562
1563        for (range, mapping, shadow_mapping) in updates {
1564            if let Some(shadow_mapping) = shadow_mapping {
1565                released_mappings.extend_pins(
1566                    self.shadow_mappings_for_mlock.insert(range.clone(), shadow_mapping),
1567                );
1568            }
1569            released_mappings.extend(self.mappings.insert(range, mapping));
1570        }
1571
1572        if failed_to_lock { error!(ENOMEM) } else { Ok(()) }
1573    }
1574
    /// Clears the `LOCKED` flag on every mapping intersecting the page-aligned
    /// range derived from `desired_addr`/`desired_length`.
    ///
    /// Returns `ENOMEM` unless the rounded range is fully covered by mappings,
    /// mirroring `mlock`'s coverage check. For ranges pinned via the shadow
    /// process flavor, the pins are released through `released_mappings`; pages
    /// pinned with ZX_VMAR_OP_ALWAYS_NEED cannot currently be unpinned.
    fn munlock(
        &mut self,
        _current_task: &CurrentTask,
        desired_addr: UserAddress,
        desired_length: usize,
        released_mappings: &mut ReleasedMappings,
    ) -> Result<(), Errno> {
        // Overflow in the requested range is EINVAL, as in mlock.
        let desired_end_addr =
            desired_addr.checked_add(desired_length).ok_or_else(|| errno!(EINVAL))?;
        let start_addr = round_down_to_system_page_size(desired_addr)?;
        let end_addr = round_up_to_system_page_size(desired_end_addr)?;

        let mut updates = vec![];
        let mut bytes_mapped_in_range = 0;
        for (range, mapping) in self.mappings.range(start_addr..end_addr) {
            let mut range = range.clone();
            let mut mapping = mapping.clone();

            // Handle mappings that start before the region to be unlocked.
            range.start = std::cmp::max(range.start, start_addr);
            // Handle mappings that extend past the region to be unlocked.
            range.end = std::cmp::min(range.end, end_addr);

            bytes_mapped_in_range += (range.end - range.start) as u64;

            if mapping.flags().contains(MappingFlags::LOCKED) {
                // This clears the locking for the shadow process pin flavor. It's not currently
                // possible to actually unlock pages that were locked with the
                // ZX_VMAR_OP_ALWAYS_NEED pin flavor.
                mapping.clear_mlock();
                updates.push((range, mapping));
            }
        }

        // Any gap in the rounded range fails the whole request.
        if bytes_mapped_in_range as usize != end_addr - start_addr {
            return error!(ENOMEM);
        }

        for (range, mapping) in updates {
            released_mappings.extend(self.mappings.insert(range.clone(), mapping));
            // Dropping the shadow mapping (if any) releases the pin for this range.
            released_mappings.extend_pins(self.shadow_mappings_for_mlock.remove(range));
        }

        Ok(())
    }
1620
1621    pub fn total_locked_bytes(&self) -> u64 {
1622        self.num_locked_bytes(
1623            UserAddress::from(self.user_vmar_info.base as u64)
1624                ..UserAddress::from((self.user_vmar_info.base + self.user_vmar_info.len) as u64),
1625        )
1626    }
1627
1628    pub fn num_locked_bytes(&self, range: impl RangeBounds<UserAddress>) -> u64 {
1629        self.mappings
1630            .range(range)
1631            .filter(|(_, mapping)| mapping.flags().contains(MappingFlags::LOCKED))
1632            .map(|(range, _)| (range.end - range.start) as u64)
1633            .sum()
1634    }
1635
1636    fn max_address(&self) -> UserAddress {
1637        UserAddress::from_ptr(self.user_vmar_info.base + self.user_vmar_info.len)
1638    }
1639
1640    fn get_mappings_for_vmsplice(
1641        &self,
1642        mm: &Arc<MemoryManager>,
1643        buffers: &UserBuffers,
1644    ) -> Result<Vec<Arc<VmsplicePayload>>, Errno> {
1645        let mut vmsplice_mappings = Vec::new();
1646
1647        for UserBuffer { mut address, length } in buffers.iter().copied() {
1648            let mappings = self.get_contiguous_mappings_at(address, length)?;
1649            for (mapping, length) in mappings {
1650                let vmsplice_payload = match self.get_mapping_backing(mapping) {
1651                    MappingBacking::Memory(m) => VmsplicePayloadSegment {
1652                        addr_offset: address,
1653                        length,
1654                        memory: m.memory().clone(),
1655                        memory_offset: m.address_to_offset(address),
1656                    },
1657                    MappingBacking::PrivateAnonymous => VmsplicePayloadSegment {
1658                        addr_offset: address,
1659                        length,
1660                        memory: self.private_anonymous.backing.clone(),
1661                        memory_offset: address.ptr() as u64,
1662                    },
1663                };
1664                vmsplice_mappings.push(VmsplicePayload::new(Arc::downgrade(mm), vmsplice_payload));
1665
1666                address = (address + length)?;
1667            }
1668        }
1669
1670        Ok(vmsplice_mappings)
1671    }
1672
1673    /// Returns all the mappings starting at `addr`, and continuing until either `length` bytes have
1674    /// been covered or an unmapped page is reached.
1675    ///
1676    /// Mappings are returned in ascending order along with the number of bytes that intersect the
1677    /// requested range. The returned mappings are guaranteed to be contiguous and the total length
1678    /// corresponds to the number of contiguous mapped bytes starting from `addr`, i.e.:
1679    /// - 0 (empty iterator) if `addr` is not mapped.
1680    /// - exactly `length` if the requested range is fully mapped.
1681    /// - the offset of the first unmapped page (between 0 and `length`) if the requested range is
1682    ///   only partially mapped.
1683    ///
1684    /// Returns EFAULT if the requested range overflows or extends past the end of the vmar.
    fn get_contiguous_mappings_at(
        &self,
        addr: UserAddress,
        length: usize,
    ) -> Result<impl Iterator<Item = (&Mapping, usize)>, Errno> {
        // Reject requests that overflow or run past the end of the user VMAR.
        let end_addr = addr.checked_add(length).ok_or_else(|| errno!(EFAULT))?;
        if end_addr > self.max_address() {
            return error!(EFAULT);
        }

        // Iterate over all contiguous mappings intersecting the requested range.
        let mut mappings = self.mappings.range(addr..end_addr);
        // `prev_range_end` is the end of the last yielded mapping (used to
        // detect gaps); `offset` is how many bytes have been yielded so far.
        let mut prev_range_end = None;
        let mut offset = 0;
        let result = std::iter::from_fn(move || {
            // Stop once `length` bytes are covered.
            if offset != length {
                if let Some((range, mapping)) = mappings.next() {
                    return match prev_range_end {
                        // If this is the first mapping that we are considering, it may not actually
                        // contain `addr` at all.
                        None if range.start > addr => None,

                        // Subsequent mappings may not be contiguous.
                        Some(prev_range_end) if range.start != prev_range_end => None,

                        // This mapping can be returned.
                        _ => {
                            // Clamp to the requested range and subtract the bytes
                            // already attributed to earlier mappings.
                            let mapping_length = std::cmp::min(length, range.end - addr) - offset;
                            offset += mapping_length;
                            prev_range_end = Some(range.end);
                            Some((mapping, mapping_length))
                        }
                    };
                }
            }

            None
        });

        Ok(result)
    }
1726
1727    /// Determines whether a fault at the given address could be covered by extending a growsdown
1728    /// mapping.
1729    ///
1730    /// If the address already belongs to a mapping, this function returns `None`. If the next
1731    /// mapping above the given address has the `MappingFlags::GROWSDOWN` flag, this function
1732    /// returns the address at which that mapping starts and the mapping itself. Otherwise, this
1733    /// function returns `None`.
1734    fn find_growsdown_mapping(&self, addr: UserAddress) -> Option<(UserAddress, &Mapping)> {
1735        match self.mappings.range(addr..).next() {
1736            Some((range, mapping)) => {
1737                if range.contains(&addr) {
1738                    // |addr| is already contained within a mapping, nothing to grow.
1739                    return None;
1740                } else if !mapping.flags().contains(MappingFlags::GROWSDOWN) {
1741                    // The next mapping above the given address does not have the
1742                    // `MappingFlags::GROWSDOWN` flag.
1743                    None
1744                } else {
1745                    Some((range.start, mapping))
1746                }
1747            }
1748            None => None,
1749        }
1750    }
1751
1752    /// Determines if an access at a given address could be covered by extending a growsdown mapping
1753    /// and extends it if possible. Returns true if the given address is covered by a mapping.
1754    fn extend_growsdown_mapping_to_address(
1755        &mut self,
1756        mm: &Arc<MemoryManager>,
1757        addr: UserAddress,
1758        is_write: bool,
1759    ) -> Result<bool, Error> {
1760        let Some((mapping_low_addr, mapping_to_grow)) = self.find_growsdown_mapping(addr) else {
1761            return Ok(false);
1762        };
1763        if is_write && !mapping_to_grow.can_write() {
1764            // Don't grow a read-only GROWSDOWN mapping for a write fault, it won't work.
1765            return Ok(false);
1766        }
1767        if !mapping_to_grow.flags().contains(MappingFlags::ANONYMOUS) {
1768            // Currently, we only grow anonymous mappings.
1769            return Ok(false);
1770        }
1771        let low_addr = (addr - (addr.ptr() as u64 % *PAGE_SIZE))?;
1772        let high_addr = mapping_low_addr;
1773
1774        let length = high_addr
1775            .ptr()
1776            .checked_sub(low_addr.ptr())
1777            .ok_or_else(|| anyhow!("Invalid growth range"))?;
1778
1779        let mut released_mappings = ReleasedMappings::default();
1780        self.map_anonymous(
1781            mm,
1782            DesiredAddress::FixedOverwrite(low_addr),
1783            length,
1784            mapping_to_grow.flags().access_flags(),
1785            mapping_to_grow.flags().options(),
1786            mapping_to_grow.name().to_owned(),
1787            &mut released_mappings,
1788        )?;
1789        // We can't have any released mappings because `find_growsdown_mapping` will return None if
1790        // the mapping already exists in this range.
1791        assert!(
1792            released_mappings.is_empty(),
1793            "expected to not remove mappings by inserting, got {released_mappings:#?}"
1794        );
1795        Ok(true)
1796    }
1797
1798    /// Reads exactly `bytes.len()` bytes of memory.
1799    ///
1800    /// # Parameters
1801    /// - `addr`: The address to read data from.
1802    /// - `bytes`: The byte array to read into.
1803    fn read_memory<'a>(
1804        &self,
1805        addr: UserAddress,
1806        bytes: &'a mut [MaybeUninit<u8>],
1807    ) -> Result<&'a mut [u8], Errno> {
1808        let mut bytes_read = 0;
1809        for (mapping, len) in self.get_contiguous_mappings_at(addr, bytes.len())? {
1810            let next_offset = bytes_read + len;
1811            self.read_mapping_memory(
1812                (addr + bytes_read)?,
1813                mapping,
1814                &mut bytes[bytes_read..next_offset],
1815            )?;
1816            bytes_read = next_offset;
1817        }
1818
1819        if bytes_read != bytes.len() {
1820            error!(EFAULT)
1821        } else {
1822            // SAFETY: The created slice is properly aligned/sized since it
1823            // is a subset of the `bytes` slice. Note that `MaybeUninit<T>` has
1824            // the same layout as `T`. Also note that `bytes_read` bytes have
1825            // been properly initialized.
1826            let bytes = unsafe {
1827                std::slice::from_raw_parts_mut(bytes.as_mut_ptr() as *mut u8, bytes_read)
1828            };
1829            Ok(bytes)
1830        }
1831    }
1832
1833    /// Reads exactly `bytes.len()` bytes of memory from `addr`.
1834    ///
1835    /// # Parameters
1836    /// - `addr`: The address to read data from.
1837    /// - `bytes`: The byte array to read into.
1838    fn read_mapping_memory<'a>(
1839        &self,
1840        addr: UserAddress,
1841        mapping: &Mapping,
1842        bytes: &'a mut [MaybeUninit<u8>],
1843    ) -> Result<&'a mut [u8], Errno> {
1844        if !mapping.can_read() {
1845            return error!(EFAULT, "read_mapping_memory called on unreadable mapping");
1846        }
1847        match self.get_mapping_backing(mapping) {
1848            MappingBacking::Memory(backing) => backing.read_memory(addr, bytes),
1849            MappingBacking::PrivateAnonymous => self.private_anonymous.read_memory(addr, bytes),
1850        }
1851    }
1852
1853    /// Reads bytes starting at `addr`, continuing until either `bytes.len()` bytes have been read
1854    /// or no more bytes can be read.
1855    ///
1856    /// This is used, for example, to read null-terminated strings where the exact length is not
1857    /// known, only the maximum length is.
1858    ///
1859    /// # Parameters
1860    /// - `addr`: The address to read data from.
1861    /// - `bytes`: The byte array to read into.
1862    fn read_memory_partial<'a>(
1863        &self,
1864        addr: UserAddress,
1865        bytes: &'a mut [MaybeUninit<u8>],
1866    ) -> Result<&'a mut [u8], Errno> {
1867        let mut bytes_read = 0;
1868        for (mapping, len) in self.get_contiguous_mappings_at(addr, bytes.len())? {
1869            let next_offset = bytes_read + len;
1870            if self
1871                .read_mapping_memory(
1872                    (addr + bytes_read)?,
1873                    mapping,
1874                    &mut bytes[bytes_read..next_offset],
1875                )
1876                .is_err()
1877            {
1878                break;
1879            }
1880            bytes_read = next_offset;
1881        }
1882
1883        // If at least one byte was requested but we got none, it means that `addr` was invalid.
1884        if !bytes.is_empty() && bytes_read == 0 {
1885            error!(EFAULT)
1886        } else {
1887            // SAFETY: The created slice is properly aligned/sized since it
1888            // is a subset of the `bytes` slice. Note that `MaybeUninit<T>` has
1889            // the same layout as `T`. Also note that `bytes_read` bytes have
1890            // been properly initialized.
1891            let bytes = unsafe {
1892                std::slice::from_raw_parts_mut(bytes.as_mut_ptr() as *mut u8, bytes_read)
1893            };
1894            Ok(bytes)
1895        }
1896    }
1897
1898    /// Like `read_memory_partial` but only returns the bytes up to and including
1899    /// a null (zero) byte.
1900    fn read_memory_partial_until_null_byte<'a>(
1901        &self,
1902        addr: UserAddress,
1903        bytes: &'a mut [MaybeUninit<u8>],
1904    ) -> Result<&'a mut [u8], Errno> {
1905        let read_bytes = self.read_memory_partial(addr, bytes)?;
1906        let max_len = memchr::memchr(b'\0', read_bytes)
1907            .map_or_else(|| read_bytes.len(), |null_index| null_index + 1);
1908        Ok(&mut read_bytes[..max_len])
1909    }
1910
1911    /// Writes the provided bytes.
1912    ///
1913    /// In case of success, the number of bytes written will always be `bytes.len()`.
1914    ///
1915    /// # Parameters
1916    /// - `addr`: The address to write to.
1917    /// - `bytes`: The bytes to write.
1918    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
1919        let mut bytes_written = 0;
1920        for (mapping, len) in self.get_contiguous_mappings_at(addr, bytes.len())? {
1921            let next_offset = bytes_written + len;
1922            self.write_mapping_memory(
1923                (addr + bytes_written)?,
1924                mapping,
1925                &bytes[bytes_written..next_offset],
1926            )?;
1927            bytes_written = next_offset;
1928        }
1929
1930        if bytes_written != bytes.len() { error!(EFAULT) } else { Ok(bytes.len()) }
1931    }
1932
1933    /// Writes the provided bytes to `addr`.
1934    ///
1935    /// # Parameters
1936    /// - `addr`: The address to write to.
1937    /// - `bytes`: The bytes to write to the memory object.
1938    fn write_mapping_memory(
1939        &self,
1940        addr: UserAddress,
1941        mapping: &Mapping,
1942        bytes: &[u8],
1943    ) -> Result<(), Errno> {
1944        if !mapping.can_write() {
1945            return error!(EFAULT, "write_mapping_memory called on unwritable memory");
1946        }
1947        match self.get_mapping_backing(mapping) {
1948            MappingBacking::Memory(backing) => backing.write_memory(addr, bytes),
1949            MappingBacking::PrivateAnonymous => self.private_anonymous.write_memory(addr, bytes),
1950        }
1951    }
1952
1953    /// Writes bytes starting at `addr`, continuing until either `bytes.len()` bytes have been
1954    /// written or no more bytes can be written.
1955    ///
1956    /// # Parameters
1957    /// - `addr`: The address to read data from.
1958    /// - `bytes`: The byte array to write from.
1959    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
1960        let mut bytes_written = 0;
1961        for (mapping, len) in self.get_contiguous_mappings_at(addr, bytes.len())? {
1962            let next_offset = bytes_written + len;
1963            if self
1964                .write_mapping_memory(
1965                    (addr + bytes_written)?,
1966                    mapping,
1967                    &bytes[bytes_written..next_offset],
1968                )
1969                .is_err()
1970            {
1971                break;
1972            }
1973            bytes_written = next_offset;
1974        }
1975
1976        if !bytes.is_empty() && bytes_written == 0 { error!(EFAULT) } else { Ok(bytes.len()) }
1977    }
1978
1979    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
1980        let mut bytes_written = 0;
1981        for (mapping, len) in self.get_contiguous_mappings_at(addr, length)? {
1982            let next_offset = bytes_written + len;
1983            if self.zero_mapping((addr + bytes_written)?, mapping, len).is_err() {
1984                break;
1985            }
1986            bytes_written = next_offset;
1987        }
1988
1989        if length != bytes_written { error!(EFAULT) } else { Ok(length) }
1990    }
1991
1992    fn zero_mapping(
1993        &self,
1994        addr: UserAddress,
1995        mapping: &Mapping,
1996        length: usize,
1997    ) -> Result<usize, Errno> {
1998        if !mapping.can_write() {
1999            return error!(EFAULT);
2000        }
2001
2002        match self.get_mapping_backing(mapping) {
2003            MappingBacking::Memory(backing) => backing.zero(addr, length),
2004            MappingBacking::PrivateAnonymous => self.private_anonymous.zero(addr, length),
2005        }
2006    }
2007
2008    pub fn create_memory_backing(
2009        &self,
2010        base: UserAddress,
2011        memory: Arc<MemoryObject>,
2012        memory_offset: u64,
2013    ) -> MappingBacking {
2014        MappingBacking::Memory(Box::new(MappingBackingMemory::new(base, memory, memory_offset)))
2015    }
2016
    /// Returns the backing store for `mapping`. Note the returned reference
    /// borrows from `mapping`, not from `self`.
    pub fn get_mapping_backing<'a>(&self, mapping: &'a Mapping) -> &'a MappingBacking {
        mapping.get_backing_internal()
    }
2020
2021    fn get_aio_context(&self, addr: UserAddress) -> Option<(Range<UserAddress>, Arc<AioContext>)> {
2022        let Some((range, mapping)) = self.mappings.get(addr) else {
2023            return None;
2024        };
2025        let MappingNameRef::AioContext(ref aio_context) = mapping.name() else {
2026            return None;
2027        };
2028        if !mapping.can_read() {
2029            return None;
2030        }
2031        Some((range.clone(), Arc::clone(aio_context)))
2032    }
2033
2034    fn find_uffd<L>(&self, locked: &mut Locked<L>, addr: UserAddress) -> Option<Arc<UserFault>>
2035    where
2036        L: LockBefore<UserFaultInner>,
2037    {
2038        for userfault in self.userfaultfds.iter() {
2039            if let Some(userfault) = userfault.upgrade() {
2040                if userfault.contains_addr(locked, addr) {
2041                    return Some(userfault);
2042                }
2043            }
2044        }
2045        None
2046    }
2047
2048    pub fn mrelease(&self) -> Result<(), Errno> {
2049        self.private_anonymous
2050            .zero(UserAddress::from_ptr(self.user_vmar_info.base), self.user_vmar_info.len)?;
2051        return Ok(());
2052    }
2053
2054    fn cache_flush(&self, range: Range<UserAddress>) -> Result<(), Errno> {
2055        let mut addr = range.start;
2056        let size = range.end - range.start;
2057        for (mapping, len) in self.get_contiguous_mappings_at(addr, size)? {
2058            if !mapping.can_read() {
2059                return error!(EFAULT);
2060            }
2061            // SAFETY: This is operating on a readable restricted mode mapping and will not fault.
2062            zx::Status::ok(unsafe {
2063                zx::sys::zx_cache_flush(
2064                    addr.ptr() as *const u8,
2065                    len,
2066                    zx::sys::ZX_CACHE_FLUSH_DATA | zx::sys::ZX_CACHE_FLUSH_INSN,
2067                )
2068            })
2069            .map_err(impossible_error)?;
2070
2071            addr = (addr + len).unwrap(); // unwrap since we're iterating within the address space.
2072        }
2073        // Did we flush the entire range?
2074        if addr != range.end { error!(EFAULT) } else { Ok(()) }
2075    }
2076
2077    // Returns details of mappings in the `user_vmar`, or an empty vector if the `user_vmar` has
2078    // been destroyed.
2079    fn with_zx_mappings<R>(
2080        &self,
2081        current_task: &CurrentTask,
2082        op: impl FnOnce(&[zx::MapInfo]) -> R,
2083    ) -> R {
2084        if self.user_vmar.is_invalid() {
2085            return op(&[]);
2086        };
2087
2088        MapInfoCache::get_or_init(current_task)
2089            .expect("must be able to retrieve map info cache")
2090            .with_map_infos(&self.user_vmar, |infos| {
2091                // No other https://fuchsia.dev/reference/syscalls/object_get_info?hl=en#errors
2092                // are possible, because we created the VMAR and the `zx` crate ensures that the
2093                // info query is well-formed.
2094                op(infos.expect("must be able to query mappings for private user VMAR"))
2095            })
2096    }
2097
2098    /// Register the address space managed by this memory manager for interest in
2099    /// receiving private expedited memory barriers of the given kind.
2100    pub fn register_membarrier_private_expedited(
2101        &mut self,
2102        mtype: MembarrierType,
2103    ) -> Result<(), Errno> {
2104        let registrations = &mut self.forkable_state.membarrier_registrations;
2105        match mtype {
2106            MembarrierType::Memory => {
2107                registrations.memory = true;
2108            }
2109            MembarrierType::SyncCore => {
2110                registrations.sync_core = true;
2111            }
2112        }
2113        Ok(())
2114    }
2115
2116    /// Checks if the address space managed by this memory manager is registered
2117    /// for interest in private expedited barriers of the given kind.
2118    pub fn membarrier_private_expedited_registered(&self, mtype: MembarrierType) -> bool {
2119        let registrations = &self.forkable_state.membarrier_registrations;
2120        match mtype {
2121            MembarrierType::Memory => registrations.memory,
2122            MembarrierType::SyncCore => registrations.sync_core,
2123        }
2124    }
2125
    /// Forcibly writes `bytes` at `addr`, working around the mapping's write
    /// protection where possible (e.g. for ptrace-style pokes).
    ///
    /// Behavior by mapping kind:
    /// - The write must fit within a single mapping, otherwise EFAULT.
    /// - Shared mappings go through the regular write path and require write
    ///   permission (EIO if unwritable, matching Linux).
    /// - Private anonymous mappings are written directly through Starnix's
    ///   writable handle to the backing memory.
    /// - Other private mappings first attempt a direct VMO write; if the VMO
    ///   handle lacks write rights, the mapping is replaced with a writable
    ///   copy-on-write child containing the modification.
    fn force_write_memory(
        &mut self,
        addr: UserAddress,
        bytes: &[u8],
        released_mappings: &mut ReleasedMappings,
    ) -> Result<(), Errno> {
        let (range, mapping) = self.mappings.get(addr).ok_or_else(|| errno!(EFAULT))?;
        if range.end < addr.saturating_add(bytes.len()) {
            track_stub!(
                TODO("https://fxbug.dev/445790710"),
                "ptrace poke across multiple mappings"
            );
            return error!(EFAULT);
        }

        // Don't create CoW copy of shared memory, go through regular syscall writing.
        if mapping.flags().contains(MappingFlags::SHARED) {
            if !mapping.can_write() {
                // Linux returns EIO here instead of EFAULT.
                return error!(EIO);
            }
            return self.write_mapping_memory(addr, mapping, &bytes);
        }

        let backing = match self.get_mapping_backing(mapping) {
            MappingBacking::PrivateAnonymous => {
                // Starnix has a writable handle to private anonymous memory.
                return self.private_anonymous.write_memory(addr, &bytes);
            }
            MappingBacking::Memory(backing) => backing,
        };

        let vmo = backing.memory().as_vmo().ok_or_else(|| errno!(EFAULT))?;
        let addr_offset = backing.address_to_offset(addr);
        // Remember whether the original VMO was executable so the replacement
        // (if any) can be reminted with the same capability.
        let can_exec =
            vmo.basic_info().expect("get VMO handle info").rights.contains(Rights::EXECUTE);

        // Attempt to write to existing VMO
        match vmo.write(&bytes, addr_offset) {
            Ok(()) => {
                if can_exec {
                    // Issue a barrier to avoid executing stale instructions.
                    system_barrier(BarrierType::InstructionStream);
                }
                return Ok(());
            }

            // The handle lacks write rights; fall back to the CoW path below.
            Err(zx::Status::ACCESS_DENIED) => { /* Fall through */ }

            Err(status) => {
                return Err(MemoryManager::get_errno_for_vmo_err(status));
            }
        }

        // Create a CoW child of the entire VMO and swap with the backing.
        let mapping_offset = backing.address_to_offset(range.start);
        let len = range.end - range.start;

        // 1. Obtain a writable child of the VMO.
        let size = vmo.get_size().map_err(MemoryManager::get_errno_for_vmo_err)?;
        let child_vmo = vmo
            .create_child(VmoChildOptions::SNAPSHOT_AT_LEAST_ON_WRITE, 0, size)
            .map_err(MemoryManager::get_errno_for_vmo_err)?;

        // 2. Modify the memory.
        child_vmo.write(&bytes, addr_offset).map_err(MemoryManager::get_errno_for_vmo_err)?;

        // 3. If needed, remint the VMO as executable. Zircon flushes instruction caches when
        // mapping executable memory below, so a barrier isn't necessary here.
        let child_vmo = if can_exec {
            child_vmo
                .replace_as_executable(&VMEX_RESOURCE)
                .map_err(MemoryManager::get_errno_for_vmo_err)?
        } else {
            child_vmo
        };

        // 4. Map the new VMO into user VMAR
        let memory = Arc::new(MemoryObject::from(child_vmo));
        let mapped_addr = self.map_in_user_vmar(
            SelectedAddress::FixedOverwrite(range.start),
            &memory,
            mapping_offset,
            len,
            mapping.flags(),
            false,
        )?;
        assert_eq!(mapped_addr, range.start);

        // 5. Update mappings
        let new_backing = MappingBackingMemory::new(range.start, memory, mapping_offset);

        let mut new_mapping = mapping.clone();
        new_mapping.set_backing_internal(MappingBacking::Memory(Box::new(new_backing)));

        let range = range.clone();
        released_mappings.extend(self.mappings.insert(range, new_mapping));

        Ok(())
    }
2226
2227    fn set_brk<L>(
2228        &mut self,
2229        locked: &mut Locked<L>,
2230        current_task: &CurrentTask,
2231        mm: &Arc<MemoryManager>,
2232        addr: UserAddress,
2233        released_mappings: &mut ReleasedMappings,
2234    ) -> Result<UserAddress, Errno>
2235    where
2236        L: LockBefore<ThreadGroupLimits>,
2237    {
2238        let rlimit_data = std::cmp::min(
2239            PROGRAM_BREAK_LIMIT,
2240            current_task.thread_group().get_rlimit(locked, Resource::DATA),
2241        );
2242
2243        let brk = match self.brk.clone() {
2244            None => {
2245                let brk = ProgramBreak { base: self.brk_origin, current: self.brk_origin };
2246                self.brk = Some(brk.clone());
2247                brk
2248            }
2249            Some(brk) => brk,
2250        };
2251
2252        let Ok(last_address) = brk.base + rlimit_data else {
2253            // The requested program break is out-of-range. We're supposed to simply
2254            // return the current program break.
2255            return Ok(brk.current);
2256        };
2257
2258        if addr < brk.base || addr > last_address {
2259            // The requested program break is out-of-range. We're supposed to simply
2260            // return the current program break.
2261            return Ok(brk.current);
2262        }
2263
2264        let old_end = brk.current.round_up(*PAGE_SIZE).unwrap();
2265        let new_end = addr.round_up(*PAGE_SIZE).unwrap();
2266
2267        match new_end.cmp(&old_end) {
2268            std::cmp::Ordering::Less => {
2269                // Shrinking the program break removes any mapped pages in the
2270                // affected range, regardless of whether they were actually program
2271                // break pages, or other mappings.
2272                let delta = old_end - new_end;
2273
2274                if self.unmap(mm, new_end, delta, released_mappings).is_err() {
2275                    return Ok(brk.current);
2276                }
2277            }
2278            std::cmp::Ordering::Greater => {
2279                let range = old_end..new_end;
2280                let delta = new_end - old_end;
2281
2282                // Check for mappings over the program break region.
2283                if self.mappings.range(range).next().is_some() {
2284                    return Ok(brk.current);
2285                }
2286
2287                if self
2288                    .map_anonymous(
2289                        mm,
2290                        DesiredAddress::FixedOverwrite(old_end),
2291                        delta,
2292                        ProtectionFlags::READ | ProtectionFlags::WRITE,
2293                        MappingOptions::ANONYMOUS,
2294                        MappingName::Heap,
2295                        released_mappings,
2296                    )
2297                    .is_err()
2298                {
2299                    return Ok(brk.current);
2300                }
2301            }
2302            _ => {}
2303        };
2304
2305        // Any required updates to the program break succeeded, so update internal state.
2306        let mut new_brk = brk;
2307        new_brk.current = addr;
2308        self.brk = Some(new_brk);
2309
2310        Ok(addr)
2311    }
2312
2313    fn register_with_uffd<L>(
2314        &mut self,
2315        locked: &mut Locked<L>,
2316        addr: UserAddress,
2317        length: usize,
2318        userfault: &Arc<UserFault>,
2319        mode: FaultRegisterMode,
2320        released_mappings: &mut ReleasedMappings,
2321    ) -> Result<(), Errno>
2322    where
2323        L: LockBefore<UserFaultInner>,
2324    {
2325        let end_addr = addr.checked_add(length).ok_or_else(|| errno!(EINVAL))?;
2326        let range_for_op = addr..end_addr;
2327        let mut updates = vec![];
2328
2329        for (range, mapping) in self.mappings.range(range_for_op.clone()) {
2330            if !mapping.private_anonymous() {
2331                track_stub!(TODO("https://fxbug.dev/391599171"), "uffd for shmem and hugetlbfs");
2332                return error!(EINVAL);
2333            }
2334            if mapping.flags().contains(MappingFlags::UFFD) {
2335                return error!(EBUSY);
2336            }
2337            let range = range.intersect(&range_for_op);
2338            let mut mapping = mapping.clone();
2339            mapping.set_uffd(mode);
2340            updates.push((range, mapping));
2341        }
2342        if updates.is_empty() {
2343            return error!(EINVAL);
2344        }
2345
2346        self.protect_vmar_range(addr, length, ProtectionFlags::empty())
2347            .expect("Failed to remove protections on uffd-registered range");
2348
2349        // Use a separate loop to avoid mutating the mappings structure while iterating over it.
2350        for (range, mapping) in updates {
2351            released_mappings.extend(self.mappings.insert(range, mapping));
2352        }
2353
2354        userfault.insert_pages(locked, range_for_op, false);
2355
2356        Ok(())
2357    }
2358
2359    fn unregister_range_from_uffd<L>(
2360        &mut self,
2361        locked: &mut Locked<L>,
2362        userfault: &Arc<UserFault>,
2363        addr: UserAddress,
2364        length: usize,
2365        released_mappings: &mut ReleasedMappings,
2366    ) -> Result<(), Errno>
2367    where
2368        L: LockBefore<UserFaultInner>,
2369    {
2370        let end_addr = addr.checked_add(length).ok_or_else(|| errno!(EINVAL))?;
2371        let range_for_op = addr..end_addr;
2372        let mut updates = vec![];
2373
2374        for (range, mapping) in self.mappings.range(range_for_op.clone()) {
2375            if !mapping.private_anonymous() {
2376                track_stub!(TODO("https://fxbug.dev/391599171"), "uffd for shmem and hugetlbfs");
2377                return error!(EINVAL);
2378            }
2379            if mapping.flags().contains(MappingFlags::UFFD) {
2380                let range = range.intersect(&range_for_op);
2381                if userfault.remove_pages(locked, range.clone()) {
2382                    let mut mapping = mapping.clone();
2383                    mapping.clear_uffd();
2384                    updates.push((range, mapping));
2385                }
2386            }
2387        }
2388        for (range, mapping) in updates {
2389            let length = range.end - range.start;
2390            let restored_flags = mapping.flags().access_flags();
2391
2392            released_mappings.extend(self.mappings.insert(range.clone(), mapping));
2393
2394            self.protect_vmar_range(range.start, length, restored_flags)
2395                .expect("Failed to restore original protection bits on uffd-registered range");
2396        }
2397        Ok(())
2398    }
2399
2400    fn unregister_uffd<L>(
2401        &mut self,
2402        locked: &mut Locked<L>,
2403        userfault: &Arc<UserFault>,
2404        released_mappings: &mut ReleasedMappings,
2405    ) where
2406        L: LockBefore<UserFaultInner>,
2407    {
2408        let mut updates = vec![];
2409
2410        for (range, mapping) in self.mappings.iter() {
2411            if mapping.flags().contains(MappingFlags::UFFD) {
2412                for range in userfault.get_registered_pages_overlapping_range(locked, range.clone())
2413                {
2414                    let mut mapping = mapping.clone();
2415                    mapping.clear_uffd();
2416                    updates.push((range.clone(), mapping));
2417                }
2418            }
2419        }
2420        // Use a separate loop to avoid mutating the mappings structure while iterating over it.
2421        for (range, mapping) in updates {
2422            let length = range.end - range.start;
2423            let restored_flags = mapping.flags().access_flags();
2424            released_mappings.extend(self.mappings.insert(range.clone(), mapping));
2425            // We can't recover from an error here as this is run during the cleanup.
2426            self.protect_vmar_range(range.start, length, restored_flags)
2427                .expect("Failed to restore original protection bits on uffd-registered range");
2428        }
2429
2430        userfault.remove_pages(
2431            locked,
2432            UserAddress::from_ptr(RESTRICTED_ASPACE_BASE)
2433                ..UserAddress::from_ptr(RESTRICTED_ASPACE_HIGHEST_ADDRESS),
2434        );
2435
2436        let weak_userfault = Arc::downgrade(userfault);
2437        self.userfaultfds.retain(|uf| !Weak::ptr_eq(uf, &weak_userfault));
2438    }
2439
    /// Implements `prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME)`: sets (or clears, when `name` is
    /// `None`) the name of the anonymous mappings covering `addr..addr + length`.
    ///
    /// Errors with `EINVAL` for an unaligned `addr`, an overflowing length, or a range with no
    /// mappings; `ENOMEM` when the range is not covered by one contiguous run of mappings; and
    /// `EBADF` when any mapping in the range is file-backed.
    fn set_mapping_name(
        &mut self,
        addr: UserAddress,
        length: usize,
        name: Option<FsString>,
        released_mappings: &mut ReleasedMappings,
    ) -> Result<(), Errno> {
        // The start of the range must be page-aligned.
        if addr.ptr() % *PAGE_SIZE as usize != 0 {
            return error!(EINVAL);
        }
        let end = match addr.checked_add(length) {
            Some(addr) => addr.round_up(*PAGE_SIZE).map_err(|_| errno!(ENOMEM))?,
            None => return error!(EINVAL),
        };

        let mappings_in_range =
            self.mappings.range(addr..end).map(|(r, m)| (r.clone(), m.clone())).collect::<Vec<_>>();

        if mappings_in_range.is_empty() {
            return error!(EINVAL);
        }
        // The first mapping must actually cover `addr`; otherwise the range begins unmapped.
        if !mappings_in_range.first().unwrap().0.contains(&addr) {
            return error!(ENOMEM);
        }

        // Tracks the (page-rounded) end of the previous mapping so gaps can be detected.
        let mut last_range_end = None;
        // There's no get_mut on RangeMap, because it would be hard to implement correctly in
        // combination with merging of adjacent mappings. Instead, make a copy, change the copy,
        // and insert the copy.
        for (mut range, mut mapping) in mappings_in_range {
            if mapping.name().is_file() {
                // It's invalid to assign a name to a file-backed mapping.
                return error!(EBADF);
            }
            // Handle mappings that start before the region to be named.
            range.start = std::cmp::max(range.start, addr);
            // Handle mappings that extend past the region to be named.
            range.end = std::cmp::min(range.end, end);

            if let Some(last_range_end) = last_range_end {
                if last_range_end != range.start {
                    // The name must apply to a contiguous range of mapped pages.
                    return error!(ENOMEM);
                }
            }
            last_range_end = Some(range.end.round_up(*PAGE_SIZE)?);
            // TODO(b/310255065): We have no place to store names in a way visible to programs outside of Starnix
            // such as memory analysis tools.
            if let MappingBacking::Memory(backing) = self.get_mapping_backing(&mapping) {
                match &name {
                    Some(memory_name) => {
                        backing.memory().set_zx_name(memory_name);
                    }
                    None => {
                        // Clearing the name resets the Zircon object name to empty.
                        backing.memory().set_zx_name(b"");
                    }
                }
            }
            mapping.set_name(match &name {
                Some(name) => MappingName::Vma(FlyByteStr::new(name.as_bytes())),
                None => MappingName::None,
            });
            released_mappings.extend(self.mappings.insert(range, mapping));
        }
        if let Some(last_range_end) = last_range_end {
            if last_range_end < end {
                // The name must apply to a contiguous range of mapped pages.
                return error!(ENOMEM);
            }
        }
        Ok(())
    }
2512}
2513
/// The memory pinning shadow process used for mlock().
///
/// Uses its own distinct shadow process so that it doesn't interfere with other uses of memory
/// pinning.
///
/// Newtype wrapper around [`memory_pinning::ShadowProcess`].
pub struct MlockShadowProcess(memory_pinning::ShadowProcess);
2519
2520fn create_user_vmar(root_vmar: &zx::Vmar, arch_width: ArchWidth) -> Result<zx::Vmar, zx::Status> {
2521    let mut vmar_info = root_vmar.info()?;
2522    if arch_width.is_arch32() {
2523        vmar_info.len = (LOWER_4GB_LIMIT.ptr() - vmar_info.base) as usize;
2524    } else {
2525        assert_eq!(vmar_info.len, RESTRICTED_ASPACE_HIGHEST_ADDRESS - vmar_info.base);
2526    }
2527    let (vmar, ptr) = root_vmar.allocate(
2528        0,
2529        vmar_info.len,
2530        zx::VmarFlags::SPECIFIC
2531            | zx::VmarFlags::CAN_MAP_SPECIFIC
2532            | zx::VmarFlags::CAN_MAP_READ
2533            | zx::VmarFlags::CAN_MAP_WRITE
2534            | zx::VmarFlags::CAN_MAP_EXECUTE,
2535    )?;
2536    assert_eq!(ptr, vmar_info.base);
2537    Ok(vmar)
2538}
2539
/// A memory manager for another thread.
///
/// When accessing memory through this object, we use less efficient codepaths that work across
/// address spaces.
pub struct RemoteMemoryManager {
    // The memory manager whose address space is being accessed remotely.
    mm: Arc<MemoryManager>,
}
2547
2548impl RemoteMemoryManager {
2549    fn new(mm: Arc<MemoryManager>) -> Self {
2550        Self { mm }
2551    }
2552}
2553
// If we just have a MemoryManager, we cannot assume that its address space is current, which means
// we need to use the slower "syscall" mechanism to access its memory.
impl MemoryAccessor for RemoteMemoryManager {
    /// Reads `bytes.len()` bytes from `addr` in the target address space.
    fn read_memory<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.mm.syscall_read_memory(addr, bytes)
    }

    /// Reads from `addr`, stopping early at a NUL byte.
    fn read_memory_partial_until_null_byte<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.mm.syscall_read_memory_partial_until_null_byte(addr, bytes)
    }

    /// Reads as many bytes as possible starting at `addr`.
    fn read_memory_partial<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.mm.syscall_read_memory_partial(addr, bytes)
    }

    /// Writes all of `bytes` at `addr`.
    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
        self.mm.syscall_write_memory(addr, bytes)
    }

    /// Writes as many of `bytes` as possible at `addr`.
    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
        self.mm.syscall_write_memory_partial(addr, bytes)
    }

    /// Zeroes `length` bytes starting at `addr`.
    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
        self.mm.syscall_zero(addr, length)
    }
}
2593
impl TaskMemoryAccessor for RemoteMemoryManager {
    /// The highest address that can be valid in the target address space.
    fn maximum_valid_address(&self) -> Option<UserAddress> {
        Some(self.mm.maximum_valid_user_address)
    }
}
2599
impl MemoryManager {
    /// Folds every current mapping into `summary`.
    pub fn summarize(&self, summary: &mut crate::mm::MappingSummary) {
        let state = self.state.read();
        for (_, mapping) in state.mappings.iter() {
            summary.add(&state, mapping);
        }
    }

    /// Builds the vmsplice payloads that cover `buffers`.
    pub fn get_mappings_for_vmsplice(
        self: &Arc<MemoryManager>,
        buffers: &UserBuffers,
    ) -> Result<Vec<Arc<VmsplicePayload>>, Errno> {
        self.state.read().get_mappings_for_vmsplice(self, buffers)
    }

    /// Returns true when `other` shares this manager's root VMAR, i.e. both
    /// manage the same address space.
    pub fn has_same_address_space(&self, other: &Self) -> bool {
        self.root_vmar == other.root_vmar
    }

    /// Reads `bytes.len()` bytes from `addr`, using the usercopy fast path when
    /// available and the VMO-based "syscall" path otherwise.
    ///
    /// Only valid when `current_task` shares this address space (debug-asserted).
    /// Fails with `EFAULT` unless the entire read succeeds.
    pub fn unified_read_memory<'a>(
        &self,
        current_task: &CurrentTask,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        debug_assert!(self.has_same_address_space(&current_task.mm().unwrap()));

        if let Some(usercopy) = usercopy() {
            let (read_bytes, unread_bytes) = usercopy.copyin(addr.ptr(), bytes);
            if unread_bytes.is_empty() { Ok(read_bytes) } else { error!(EFAULT) }
        } else {
            self.syscall_read_memory(addr, bytes)
        }
    }

    /// Full read through the mapping state; works across address spaces.
    pub fn syscall_read_memory<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.state.read().read_memory(addr, bytes)
    }

    /// Like [`Self::unified_read_memory`] but stops at the first NUL byte, and
    /// only fails with `EFAULT` when nothing at all could be read.
    pub fn unified_read_memory_partial_until_null_byte<'a>(
        &self,
        current_task: &CurrentTask,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        debug_assert!(self.has_same_address_space(&current_task.mm().unwrap()));

        if let Some(usercopy) = usercopy() {
            let (read_bytes, unread_bytes) = usercopy.copyin_until_null_byte(addr.ptr(), bytes);
            if read_bytes.is_empty() && !unread_bytes.is_empty() {
                error!(EFAULT)
            } else {
                Ok(read_bytes)
            }
        } else {
            self.syscall_read_memory_partial_until_null_byte(addr, bytes)
        }
    }

    /// Cross-address-space variant of the NUL-terminated partial read.
    pub fn syscall_read_memory_partial_until_null_byte<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.state.read().read_memory_partial_until_null_byte(addr, bytes)
    }

    /// Reads as many bytes as possible from `addr`; `EFAULT` only if nothing
    /// could be read.
    pub fn unified_read_memory_partial<'a>(
        &self,
        current_task: &CurrentTask,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        debug_assert!(self.has_same_address_space(&current_task.mm().unwrap()));

        if let Some(usercopy) = usercopy() {
            let (read_bytes, unread_bytes) = usercopy.copyin(addr.ptr(), bytes);
            if read_bytes.is_empty() && !unread_bytes.is_empty() {
                error!(EFAULT)
            } else {
                Ok(read_bytes)
            }
        } else {
            self.syscall_read_memory_partial(addr, bytes)
        }
    }

    /// Cross-address-space partial read.
    pub fn syscall_read_memory_partial<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.state.read().read_memory_partial(addr, bytes)
    }

    /// Writes all of `bytes` at `addr`; fails with `EFAULT` (including a
    /// diagnostic byte count) unless every byte was written.
    pub fn unified_write_memory(
        &self,
        current_task: &CurrentTask,
        addr: UserAddress,
        bytes: &[u8],
    ) -> Result<usize, Errno> {
        debug_assert!(self.has_same_address_space(&current_task.mm().unwrap()));

        if let Some(usercopy) = usercopy() {
            let num_copied = usercopy.copyout(bytes, addr.ptr());
            if num_copied != bytes.len() {
                error!(
                    EFAULT,
                    format!("expected {:?} bytes, copied {:?} bytes", bytes.len(), num_copied)
                )
            } else {
                Ok(num_copied)
            }
        } else {
            self.syscall_write_memory(addr, bytes)
        }
    }

    /// Write `bytes` to memory address `addr`, making a copy-on-write child of the VMO backing and
    /// replacing the mapping if necessary.
    ///
    /// NOTE: this bypasses userspace's memory protection configuration and should only be called
    /// by codepaths like ptrace which bypass memory protection.
    pub fn force_write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<(), Errno> {
        let mut state = self.state.write();
        let mut released_mappings = ReleasedMappings::default();
        let result = state.force_write_memory(addr, bytes, &mut released_mappings);
        released_mappings.finalize(state);
        result
    }

    /// Cross-address-space full write.
    pub fn syscall_write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
        self.state.read().write_memory(addr, bytes)
    }

    /// Writes as many bytes as possible at `addr`; `EFAULT` only if nothing was
    /// written and `bytes` was non-empty.
    pub fn unified_write_memory_partial(
        &self,
        current_task: &CurrentTask,
        addr: UserAddress,
        bytes: &[u8],
    ) -> Result<usize, Errno> {
        debug_assert!(self.has_same_address_space(&current_task.mm().unwrap()));

        if let Some(usercopy) = usercopy() {
            let num_copied = usercopy.copyout(bytes, addr.ptr());
            if num_copied == 0 && !bytes.is_empty() { error!(EFAULT) } else { Ok(num_copied) }
        } else {
            self.syscall_write_memory_partial(addr, bytes)
        }
    }

    /// Cross-address-space partial write.
    pub fn syscall_write_memory_partial(
        &self,
        addr: UserAddress,
        bytes: &[u8],
    ) -> Result<usize, Errno> {
        self.state.read().write_memory_partial(addr, bytes)
    }

    /// Zeroes `length` bytes at `addr`.
    ///
    /// When the span covers at least one whole page, the VMO-based path is used
    /// so Zircon can substitute the zero page instead of memsetting; otherwise
    /// usercopy (when available) zeroes the bytes directly.
    pub fn unified_zero(
        &self,
        current_task: &CurrentTask,
        addr: UserAddress,
        length: usize,
    ) -> Result<usize, Errno> {
        debug_assert!(self.has_same_address_space(&current_task.mm().unwrap()));

        {
            let page_size = *PAGE_SIZE as usize;
            // Get the page boundary immediately following `addr` if `addr` is
            // not page aligned.
            let next_page_boundary = round_up_to_system_page_size(addr.ptr())?;
            // The number of bytes needed to zero at least a full page (not just
            // a pages worth of bytes) starting at `addr`.
            let length_with_atleast_one_full_page = page_size + (next_page_boundary - addr.ptr());
            // If at least one full page is being zeroed, go through the memory object since Zircon
            // can swap the mapped pages with the zero page which should be cheaper than zeroing
            // out a pages worth of bytes manually.
            //
            // If we are not zeroing out a full page, then go through usercopy
            // if unified aspaces is enabled.
            if length >= length_with_atleast_one_full_page {
                return self.syscall_zero(addr, length);
            }
        }

        if let Some(usercopy) = usercopy() {
            if usercopy.zero(addr.ptr(), length) == length { Ok(length) } else { error!(EFAULT) }
        } else {
            self.syscall_zero(addr, length)
        }
    }

    /// Cross-address-space zeroing.
    pub fn syscall_zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
        self.state.read().zero(addr, length)
    }

    /// Obtain a reference to this memory manager that can be used from another thread.
    pub fn as_remote(self: &Arc<Self>) -> RemoteMemoryManager {
        RemoteMemoryManager::new(self.clone())
    }

    /// Performs a data and instruction cache flush over the given address range.
    pub fn cache_flush(&self, range: Range<UserAddress>) -> Result<(), Errno> {
        self.state.read().cache_flush(range)
    }

    /// Register the address space managed by this memory manager for interest in
    /// receiving private expedited memory barriers of the given type.
    pub fn register_membarrier_private_expedited(
        &self,
        mtype: MembarrierType,
    ) -> Result<(), Errno> {
        self.state.write().register_membarrier_private_expedited(mtype)
    }

    /// Checks if the address space managed by this memory manager is registered
    /// for interest in private expedited barriers of the given kind.
    pub fn membarrier_private_expedited_registered(&self, mtype: MembarrierType) -> bool {
        self.state.read().membarrier_private_expedited_registered(mtype)
    }
}
2826
/// Manager for one task's user address space: owns the root VMAR, the futex
/// table, and all mutable mapping state.
pub struct MemoryManager {
    /// The root VMAR for the child process.
    ///
    /// Instead of mapping memory directly in this VMAR, we map the memory in
    /// `state.user_vmar`.
    root_vmar: zx::Vmar,

    /// The base address of the root_vmar.
    pub base_addr: UserAddress,

    /// The futexes in this address space.
    pub futex: Arc<FutexTable<PrivateFutexKey>>,

    /// Mutable state for the memory manager.
    pub state: RwLock<MemoryManagerState>,

    /// Whether this address space is dumpable.
    pub dumpable: OrderedMutex<DumpPolicy, MmDumpable>,

    /// Maximum valid user address for this vmar.
    pub maximum_valid_user_address: UserAddress,

    /// In-flight payloads enqueued to a pipe as a consequence of a `vmsplice(2)`
    /// operation.
    ///
    /// For details on why we need to keep track of in-flight vmspliced payloads,
    /// see [`VmsplicePayload`].
    ///
    /// For details on why this isn't under the `RwLock` protected `MemoryManagerState`,
    /// See [`InflightVmsplicedPayloads::payloads`].
    pub inflight_vmspliced_payloads: InflightVmsplicedPayloads,

    /// A mechanism to be notified when this `MemoryManager` is destroyed.
    pub drop_notifier: DropNotifier,
}
2862
2863impl MemoryManager {
    /// Returns a new `MemoryManager` whose user mappings live in a fresh
    /// sub-VMAR of `root_vmar`, sized according to `arch_width`.
    pub fn new(root_vmar: zx::Vmar, arch_width: ArchWidth) -> Result<Self, zx::Status> {
        let user_vmar = create_user_vmar(&root_vmar, arch_width)?;
        let user_vmar_info = user_vmar.info()?;
        Ok(Self::from_vmar(root_vmar, user_vmar, user_vmar_info))
    }
2870
2871    fn from_vmar(root_vmar: zx::Vmar, user_vmar: zx::Vmar, user_vmar_info: zx::VmarInfo) -> Self {
2872        // Ensure that the `user_vmar_info` matches assumptions for 32- or 64-bit layout.
2873        debug_assert_eq!(RESTRICTED_ASPACE_BASE, user_vmar_info.base);
2874        debug_assert!(
2875            user_vmar_info.len == RESTRICTED_ASPACE_SIZE
2876                || user_vmar_info.len == LOWER_4GB_LIMIT.ptr() - user_vmar_info.base
2877        );
2878
2879        // The private anonymous backing memory object extend from the user address 0 up to the
2880        // highest mappable address. The pages below `user_vmar_info.base` are never mapped, but
2881        // including them in the memory object makes the math for mapping address to memory object
2882        // offsets simpler.
2883        let backing_size = (user_vmar_info.base + user_vmar_info.len) as u64;
2884
2885        MemoryManager {
2886            root_vmar,
2887            base_addr: UserAddress::from_ptr(user_vmar_info.base),
2888            futex: Arc::<FutexTable<PrivateFutexKey>>::default(),
2889            state: RwLock::new(MemoryManagerState {
2890                user_vmar: user_vmar,
2891                user_vmar_info,
2892                mappings: Default::default(),
2893                private_anonymous: PrivateAnonymousMemoryManager::new(backing_size),
2894                userfaultfds: Default::default(),
2895                shadow_mappings_for_mlock: Default::default(),
2896                forkable_state: Default::default(),
2897            }),
2898            // TODO(security): Reset to DISABLE, or the value in the fs.suid_dumpable sysctl, under
2899            // certain conditions as specified in the prctl(2) man page.
2900            dumpable: OrderedMutex::new(DumpPolicy::User),
2901            maximum_valid_user_address: UserAddress::from_ptr(
2902                user_vmar_info.base + user_vmar_info.len,
2903            ),
2904            inflight_vmspliced_payloads: Default::default(),
2905            drop_notifier: DropNotifier::default(),
2906        }
2907    }
2908
2909    pub fn set_brk<L>(
2910        self: &Arc<Self>,
2911        locked: &mut Locked<L>,
2912        current_task: &CurrentTask,
2913        addr: UserAddress,
2914    ) -> Result<UserAddress, Errno>
2915    where
2916        L: LockBefore<ThreadGroupLimits>,
2917    {
2918        let mut state = self.state.write();
2919        let mut released_mappings = ReleasedMappings::default();
2920        let result = state.set_brk(locked, current_task, self, addr, &mut released_mappings);
2921        released_mappings.finalize(state);
2922        result
2923    }
2924
2925    pub fn register_uffd(&self, userfault: &Arc<UserFault>) {
2926        let mut state = self.state.write();
2927        state.userfaultfds.push(Arc::downgrade(userfault));
2928    }
2929
2930    /// Register a given memory range with a userfault object.
2931    pub fn register_with_uffd<L>(
2932        self: &Arc<Self>,
2933        locked: &mut Locked<L>,
2934        addr: UserAddress,
2935        length: usize,
2936        userfault: &Arc<UserFault>,
2937        mode: FaultRegisterMode,
2938    ) -> Result<(), Errno>
2939    where
2940        L: LockBefore<UserFaultInner>,
2941    {
2942        let mut state = self.state.write();
2943        let mut released_mappings = ReleasedMappings::default();
2944        let result =
2945            state.register_with_uffd(locked, addr, length, userfault, mode, &mut released_mappings);
2946        released_mappings.finalize(state);
2947        result
2948    }
2949
2950    /// Unregister a given range from any userfault objects associated with it.
2951    pub fn unregister_range_from_uffd<L>(
2952        &self,
2953        locked: &mut Locked<L>,
2954        userfault: &Arc<UserFault>,
2955        addr: UserAddress,
2956        length: usize,
2957    ) -> Result<(), Errno>
2958    where
2959        L: LockBefore<UserFaultInner>,
2960    {
2961        let mut state = self.state.write();
2962        let mut released_mappings = ReleasedMappings::default();
2963        let result = state.unregister_range_from_uffd(
2964            locked,
2965            userfault,
2966            addr,
2967            length,
2968            &mut released_mappings,
2969        );
2970        released_mappings.finalize(state);
2971        result
2972    }
2973
2974    /// Unregister any mappings registered with a given userfault object. Used when closing the last
2975    /// file descriptor associated to it.
2976    pub fn unregister_uffd<L>(&self, locked: &mut Locked<L>, userfault: &Arc<UserFault>)
2977    where
2978        L: LockBefore<UserFaultInner>,
2979    {
2980        let mut state = self.state.write();
2981        let mut released_mappings = ReleasedMappings::default();
2982        state.unregister_uffd(locked, userfault, &mut released_mappings);
2983        released_mappings.finalize(state);
2984    }
2985
2986    /// Populate a range of pages registered with an userfaulfd according to a `populate` function.
2987    /// This will fail if the pages were not registered with userfaultfd, or if the page at `addr`
2988    /// was already populated. If any page other than the first one was populated, the `length`
2989    /// is adjusted to only include the first N unpopulated pages, and this adjusted length
2990    /// is then passed to `populate`. On success, returns the number of populated bytes.
2991    pub fn populate_from_uffd<F, L>(
2992        &self,
2993        locked: &mut Locked<L>,
2994        addr: UserAddress,
2995        length: usize,
2996        userfault: &Arc<UserFault>,
2997        populate: F,
2998    ) -> Result<usize, Errno>
2999    where
3000        F: FnOnce(&MemoryManagerState, usize) -> Result<usize, Errno>,
3001        L: LockBefore<UserFaultInner>,
3002    {
3003        let state = self.state.read();
3004
3005        // Check that the addr..length range is a contiguous range of mappings which are all
3006        // registered with an userfault object.
3007        let mut bytes_registered_with_uffd = 0;
3008        for (mapping, len) in state.get_contiguous_mappings_at(addr, length)? {
3009            if mapping.flags().contains(MappingFlags::UFFD) {
3010                // Check that the mapping is registered with the same uffd. This is not required,
3011                // but we don't support cross-uffd operations yet.
3012                if !userfault.contains_addr(locked, addr) {
3013                    track_stub!(
3014                        TODO("https://fxbug.dev/391599171"),
3015                        "operations across different uffds"
3016                    );
3017                    return error!(ENOTSUP);
3018                };
3019            } else {
3020                return error!(ENOENT);
3021            }
3022            bytes_registered_with_uffd += len;
3023        }
3024        if bytes_registered_with_uffd != length {
3025            return error!(ENOENT);
3026        }
3027
3028        let end_addr = addr.checked_add(length).ok_or_else(|| errno!(EINVAL))?;
3029
3030        // Determine how many pages in the requested range are already populated
3031        let first_populated =
3032            userfault.get_first_populated_page_after(locked, addr).ok_or_else(|| errno!(ENOENT))?;
3033        // If the very first page is already populated, uffd operations should just return EEXIST
3034        if first_populated == addr {
3035            return error!(EEXIST);
3036        }
3037        // Otherwise it is possible to do an incomplete operation by only populating pages until
3038        // the first populated one.
3039        let trimmed_end = std::cmp::min(first_populated, end_addr);
3040        let effective_length = trimmed_end - addr;
3041
3042        populate(&state, effective_length)?;
3043        userfault.insert_pages(locked, addr..trimmed_end, true);
3044
3045        // Since we used protection bits to force pagefaults, we now need to reverse this change by
3046        // restoring the protections on the underlying Zircon mappings to the "real" protection bits
3047        // that were kept in the Starnix mappings. This will prevent new pagefaults from being
3048        // generated. Only do this on the pages that were populated by this operation.
3049        for (range, mapping) in state.mappings.range(addr..trimmed_end) {
3050            let range_to_protect = range.intersect(&(addr..trimmed_end));
3051            let restored_flags = mapping.flags().access_flags();
3052            let length = range_to_protect.end - range_to_protect.start;
3053            state
3054                .protect_vmar_range(range_to_protect.start, length, restored_flags)
3055                .expect("Failed to restore original protection bits on uffd-registered range");
3056        }
3057        // Return the number of effectively populated bytes, which might be smaller than the
3058        // requested number.
3059        Ok(effective_length)
3060    }
3061
3062    pub fn zero_from_uffd<L>(
3063        &self,
3064        locked: &mut Locked<L>,
3065        addr: UserAddress,
3066        length: usize,
3067        userfault: &Arc<UserFault>,
3068    ) -> Result<usize, Errno>
3069    where
3070        L: LockBefore<UserFaultInner>,
3071    {
3072        self.populate_from_uffd(locked, addr, length, userfault, |state, effective_length| {
3073            state.zero(addr, effective_length)
3074        })
3075    }
3076
3077    pub fn fill_from_uffd<L>(
3078        &self,
3079        locked: &mut Locked<L>,
3080        addr: UserAddress,
3081        buf: &[u8],
3082        length: usize,
3083        userfault: &Arc<UserFault>,
3084    ) -> Result<usize, Errno>
3085    where
3086        L: LockBefore<UserFaultInner>,
3087    {
3088        self.populate_from_uffd(locked, addr, length, userfault, |state, effective_length| {
3089            state.write_memory(addr, &buf[..effective_length])
3090        })
3091    }
3092
3093    pub fn copy_from_uffd<L>(
3094        &self,
3095        locked: &mut Locked<L>,
3096        source_addr: UserAddress,
3097        dst_addr: UserAddress,
3098        length: usize,
3099        userfault: &Arc<UserFault>,
3100    ) -> Result<usize, Errno>
3101    where
3102        L: LockBefore<UserFaultInner>,
3103    {
3104        self.populate_from_uffd(locked, dst_addr, length, userfault, |state, effective_length| {
3105            let mut buf = vec![std::mem::MaybeUninit::uninit(); effective_length];
3106            let buf = state.read_memory(source_addr, &mut buf)?;
3107            state.write_memory(dst_addr, &buf[..effective_length])
3108        })
3109    }
3110
    /// Create a snapshot of the memory mapping from `self` into `target`. All
    /// memory mappings are copied entry-for-entry, and the copies end up at
    /// exactly the same addresses.
    ///
    /// Per-mapping behavior: `DONTFORK` mappings are skipped, `WIPEONFORK`
    /// ranges come out backed by fresh/zeroed memory, `SHARED` (and vvar)
    /// mappings alias the same memory object, and `LOCKED` is dropped since
    /// locking is not inherited.
    pub fn snapshot_to<L>(
        &self,
        locked: &mut Locked<L>,
        target: &Arc<MemoryManager>,
    ) -> Result<(), Errno>
    where
        L: LockBefore<MmDumpable>,
    {
        // Hold the lock throughout the operation to uphold memory manager's invariants.
        // See mm/README.md.
        let state: &mut MemoryManagerState = &mut self.state.write();
        let mut target_state = target.state.write();
        debug_assert_eq!(state.user_vmar_info, target_state.user_vmar_info);

        // Cache of cloned memory objects keyed by source koid, so mappings that
        // share one memory object also share a single clone in the target.
        let mut clone_cache = HashMap::<zx::Koid, Arc<MemoryObject>>::new();

        let backing_size = (state.user_vmar_info.base + state.user_vmar_info.len) as u64;
        target_state.private_anonymous = state.private_anonymous.snapshot(backing_size)?;

        for (range, mapping) in state.mappings.iter() {
            if mapping.flags().contains(MappingFlags::DONTFORK) {
                continue;
            }
            // Locking is not inherited when forking.
            let target_mapping_flags = mapping.flags().difference(MappingFlags::LOCKED);
            match state.get_mapping_backing(mapping) {
                MappingBacking::Memory(backing) => {
                    let memory_offset = backing.address_to_offset(range.start);
                    let length = range.end - range.start;

                    let target_memory = if mapping.flags().contains(MappingFlags::SHARED)
                        || mapping.name().is_vvar()
                    {
                        // Note that the Vvar is a special mapping that behaves like a shared mapping but
                        // is private to each process.
                        backing.memory().clone()
                    } else if mapping.flags().contains(MappingFlags::WIPEONFORK) {
                        create_anonymous_mapping_memory(length as u64)?
                    } else {
                        // Private mapping: clone the backing memory object, at
                        // most once per koid via `clone_cache`.
                        let basic_info = backing.memory().basic_info();
                        let memory =
                            clone_cache.entry(basic_info.koid).or_insert_with_fallible(|| {
                                backing.memory().clone_memory(basic_info.rights)
                            })?;
                        memory.clone()
                    };

                    let mut released_mappings = ReleasedMappings::default();
                    target_state.map_memory(
                        target,
                        DesiredAddress::Fixed(range.start),
                        target_memory,
                        memory_offset,
                        length,
                        target_mapping_flags,
                        mapping.max_access(),
                        false,
                        mapping.name().to_owned(),
                        &mut released_mappings,
                    )?;
                    // The target address space starts empty, so these fixed
                    // mappings must never displace anything.
                    assert!(
                        released_mappings.is_empty(),
                        "target mm must be empty when cloning, got {released_mappings:#?}"
                    );
                }
                MappingBacking::PrivateAnonymous => {
                    let length = range.end - range.start;
                    if mapping.flags().contains(MappingFlags::WIPEONFORK) {
                        // WIPEONFORK: ensure the target's copy of this range is
                        // zeroed rather than inheriting the snapshot contents.
                        target_state
                            .private_anonymous
                            .zero(range.start, length)
                            .map_err(|_| errno!(ENOMEM))?;
                    }

                    // Private anonymous memory maps 1:1 between user address and
                    // backing offset.
                    let target_memory_offset = range.start.ptr() as u64;
                    target_state.map_in_user_vmar(
                        SelectedAddress::FixedOverwrite(range.start),
                        &target_state.private_anonymous.backing,
                        target_memory_offset,
                        length,
                        target_mapping_flags,
                        false,
                    )?;
                    let removed_mappings = target_state.mappings.insert(
                        range.clone(),
                        Mapping::new_private_anonymous(
                            target_mapping_flags,
                            mapping.name().to_owned(),
                        ),
                    );
                    assert!(
                        removed_mappings.is_empty(),
                        "target mm must be empty when cloning, got {removed_mappings:#?}"
                    );
                }
            };
        }

        // Copy the remaining fork-inheritable state wholesale.
        target_state.forkable_state = state.forkable_state.clone();

        let self_dumpable = *self.dumpable.lock(locked);
        *target.dumpable.lock(locked) = self_dumpable;

        Ok(())
    }
3219
    /// Returns the replacement `MemoryManager` to be used by the `exec()`ing task.
    ///
    /// POSIX requires that "a call to any exec function from a process with more than one thread
    /// shall result in all threads being terminated and the new executable being loaded and
    /// executed. No destructor functions or cleanup handlers shall be called".
    /// The caller is responsible for having ensured that this is the only `Task` in the
    /// `ThreadGroup`, and thereby the `zx::process`, such that it is safe to tear-down the Zircon
    /// userspace VMAR for the current address-space.
    ///
    /// Note: the old user VMAR is destroyed before the new one is created, so
    /// on a late failure the old address-space is already gone.
    pub fn exec(
        &self,
        exe_node: NamespaceNode,
        arch_width: ArchWidth,
    ) -> Result<Arc<Self>, zx::Status> {
        // To safeguard against concurrent accesses by other tasks through this `MemoryManager`, the
        // following steps are performed while holding the write lock on this instance:
        //
        // 1. All `mappings` are removed, so that remote `MemoryAccessor` calls will fail.
        // 2. The `user_vmar` is `destroy()`ed to free-up the user address-space.
        // 3. The new `user_vmar` is created, to re-reserve the user address-space.
        //
        // Once these steps are complete the lock must first be dropped, after which it is safe for
        // the old mappings to be dropped.
        let (_old_mappings, user_vmar) = {
            let mut state = self.state.write();

            // SAFETY: This operation is safe because this is the only `Task` active in the address-
            // space, and accesses by remote tasks will use syscalls on the `root_vmar`.
            unsafe { state.user_vmar.destroy()? }
            // Replace the destroyed VMAR handle with an invalid one so the old
            // state can no longer be used to map.
            state.user_vmar = zx::NullableHandle::invalid().into();

            // Create the new userspace VMAR, to ensure that the address range is (re-)reserved.
            let user_vmar = create_user_vmar(&self.root_vmar, arch_width)?;

            // The old mappings are moved out and dropped only after the state
            // lock is released (see above).
            (std::mem::replace(&mut state.mappings, Default::default()), user_vmar)
        };

        // Wrap the new user address-space VMAR into a new `MemoryManager`.
        let root_vmar = self.root_vmar.duplicate_handle(zx::Rights::SAME_RIGHTS)?;
        let user_vmar_info = user_vmar.info()?;
        let new_mm = Self::from_vmar(root_vmar, user_vmar, user_vmar_info);

        // Initialize the new `MemoryManager` state.
        new_mm.state.write().executable_node = Some(exe_node);

        // Initialize the appropriate address-space layout for the `arch_width`.
        new_mm.initialize_mmap_layout(arch_width)?;

        Ok(Arc::new(new_mm))
    }
3269
3270    pub fn initialize_mmap_layout(&self, arch_width: ArchWidth) -> Result<(), Errno> {
3271        let mut state = self.state.write();
3272
3273        // Place the stack at the end of the address space, subject to ASLR adjustment.
3274        state.stack_origin = UserAddress::from_ptr(
3275            state.user_vmar_info.base + state.user_vmar_info.len
3276                - MAX_STACK_SIZE
3277                - generate_random_offset_for_aslr(arch_width),
3278        )
3279        .round_up(*PAGE_SIZE)?;
3280
3281        // Set the highest address that `mmap` will assign to the allocations that don't ask for a
3282        // specific address, subject to ASLR adjustment.
3283        state.mmap_top = state
3284            .stack_origin
3285            .checked_sub(generate_random_offset_for_aslr(arch_width))
3286            .ok_or_else(|| errno!(EINVAL))?;
3287        Ok(())
3288    }
3289
3290    // Test tasks are not initialized by exec; simulate its behavior by initializing memory layout
3291    // as if a zero-size executable was loaded.
3292    pub fn initialize_mmap_layout_for_test(self: &Arc<Self>, arch_width: ArchWidth) {
3293        self.initialize_mmap_layout(arch_width).unwrap();
3294        let fake_executable_addr = self.get_random_base_for_executable(arch_width, 0).unwrap();
3295        self.initialize_brk_origin(arch_width, fake_executable_addr).unwrap();
3296    }
3297
3298    pub fn initialize_brk_origin(
3299        self: &Arc<Self>,
3300        arch_width: ArchWidth,
3301        executable_end: UserAddress,
3302    ) -> Result<(), Errno> {
3303        self.state.write().brk_origin = executable_end
3304            .checked_add(generate_random_offset_for_aslr(arch_width))
3305            .ok_or_else(|| errno!(EINVAL))?;
3306        Ok(())
3307    }
3308
3309    // Get a randomised address for loading a position-independent executable.
3310
3311    pub fn get_random_base_for_executable(
3312        &self,
3313        arch_width: ArchWidth,
3314        length: usize,
3315    ) -> Result<UserAddress, Errno> {
3316        let state = self.state.read();
3317
3318        // Place it at approx. 2/3 of the available mmap space, subject to ASLR adjustment.
3319        let base = round_up_to_system_page_size(2 * state.mmap_top.ptr() / 3).unwrap()
3320            + generate_random_offset_for_aslr(arch_width);
3321        if base.checked_add(length).ok_or_else(|| errno!(EINVAL))? <= state.mmap_top.ptr() {
3322            Ok(UserAddress::from_ptr(base))
3323        } else {
3324            error!(EINVAL)
3325        }
3326    }
3327    pub fn executable_node(&self) -> Option<NamespaceNode> {
3328        self.state.read().executable_node.clone()
3329    }
3330
3331    #[track_caller]
3332    pub fn get_errno_for_map_err(status: zx::Status) -> Errno {
3333        match status {
3334            zx::Status::INVALID_ARGS => errno!(EINVAL),
3335            zx::Status::ACCESS_DENIED => errno!(EPERM),
3336            zx::Status::NOT_SUPPORTED => errno!(ENODEV),
3337            zx::Status::NO_MEMORY => errno!(ENOMEM),
3338            zx::Status::NO_RESOURCES => errno!(ENOMEM),
3339            zx::Status::OUT_OF_RANGE => errno!(ENOMEM),
3340            zx::Status::ALREADY_EXISTS => errno!(EEXIST),
3341            zx::Status::BAD_STATE => errno!(EINVAL),
3342            _ => impossible_error(status),
3343        }
3344    }
3345
3346    #[track_caller]
3347    pub fn get_errno_for_vmo_err(status: zx::Status) -> Errno {
3348        match status {
3349            zx::Status::NO_MEMORY => errno!(ENOMEM),
3350            zx::Status::ACCESS_DENIED => errno!(EPERM),
3351            zx::Status::NOT_SUPPORTED => errno!(EIO),
3352            zx::Status::BAD_STATE => errno!(EIO),
3353            _ => return impossible_error(status),
3354        }
3355    }
3356
3357    pub fn map_memory(
3358        self: &Arc<Self>,
3359        addr: DesiredAddress,
3360        memory: Arc<MemoryObject>,
3361        memory_offset: u64,
3362        length: usize,
3363        prot_flags: ProtectionFlags,
3364        max_access: Access,
3365        options: MappingOptions,
3366        name: MappingName,
3367    ) -> Result<UserAddress, Errno> {
3368        let flags = MappingFlags::from_access_flags_and_options(prot_flags, options);
3369
3370        // Unmapped mappings must be released after the state is unlocked.
3371        let mut released_mappings = ReleasedMappings::default();
3372        // Hold the lock throughout the operation to uphold memory manager's invariants.
3373        // See mm/README.md.
3374        let mut state = self.state.write();
3375        let result = state.map_memory(
3376            self,
3377            addr,
3378            memory,
3379            memory_offset,
3380            length,
3381            flags,
3382            max_access,
3383            options.contains(MappingOptions::POPULATE),
3384            name,
3385            &mut released_mappings,
3386        );
3387
3388        // Drop the state before the unmapped mappings, since dropping a mapping may acquire a lock
3389        // in `DirEntry`'s `drop`.
3390        released_mappings.finalize(state);
3391
3392        result
3393    }
3394
3395    pub fn map_anonymous(
3396        self: &Arc<Self>,
3397        addr: DesiredAddress,
3398        length: usize,
3399        prot_flags: ProtectionFlags,
3400        options: MappingOptions,
3401        name: MappingName,
3402    ) -> Result<UserAddress, Errno> {
3403        let mut released_mappings = ReleasedMappings::default();
3404        // Hold the lock throughout the operation to uphold memory manager's invariants.
3405        // See mm/README.md.
3406        let mut state = self.state.write();
3407        let result = state.map_anonymous(
3408            self,
3409            addr,
3410            length,
3411            prot_flags,
3412            options,
3413            name,
3414            &mut released_mappings,
3415        );
3416
3417        released_mappings.finalize(state);
3418
3419        result
3420    }
3421
3422    /// Map the stack into a pre-selected address region
3423    pub fn map_stack(
3424        self: &Arc<Self>,
3425        length: usize,
3426        prot_flags: ProtectionFlags,
3427    ) -> Result<UserAddress, Errno> {
3428        assert!(length <= MAX_STACK_SIZE);
3429        let addr = self.state.read().stack_origin;
3430        // The address range containing stack_origin should normally be available: it's above the
3431        // mmap_top, and this method is called early enough in the process lifetime that only the
3432        // main ELF and the interpreter are already loaded. However, in the rare case that the
3433        // static position-independent executable is overlapping the chosen address, mapping as Hint
3434        // will make mmap choose a new place for it.
3435        // TODO(https://fxbug.dev/370027241): Consider a more robust approach
3436        let stack_addr = self.map_anonymous(
3437            DesiredAddress::Hint(addr),
3438            length,
3439            prot_flags,
3440            MappingOptions::ANONYMOUS | MappingOptions::GROWSDOWN,
3441            MappingName::Stack,
3442        )?;
3443        if stack_addr != addr {
3444            log_warn!(
3445                "An address designated for stack ({}) was unavailable, mapping at {} instead.",
3446                addr,
3447                stack_addr
3448            );
3449        }
3450        Ok(stack_addr)
3451    }
3452
3453    pub fn remap(
3454        self: &Arc<Self>,
3455        current_task: &CurrentTask,
3456        addr: UserAddress,
3457        old_length: usize,
3458        new_length: usize,
3459        flags: MremapFlags,
3460        new_addr: UserAddress,
3461    ) -> Result<UserAddress, Errno> {
3462        let mut released_mappings = ReleasedMappings::default();
3463        // Hold the lock throughout the operation to uphold memory manager's invariants.
3464        // See mm/README.md.
3465        let mut state = self.state.write();
3466        let result = state.remap(
3467            current_task,
3468            self,
3469            addr,
3470            old_length,
3471            new_length,
3472            flags,
3473            new_addr,
3474            &mut released_mappings,
3475        );
3476
3477        released_mappings.finalize(state);
3478
3479        result
3480    }
3481
3482    pub fn unmap(self: &Arc<Self>, addr: UserAddress, length: usize) -> Result<(), Errno> {
3483        let mut released_mappings = ReleasedMappings::default();
3484        // Hold the lock throughout the operation to uphold memory manager's invariants.
3485        // See mm/README.md.
3486        let mut state = self.state.write();
3487        let result = state.unmap(self, addr, length, &mut released_mappings);
3488
3489        released_mappings.finalize(state);
3490
3491        result
3492    }
3493
3494    pub fn protect(
3495        &self,
3496        current_task: &CurrentTask,
3497        addr: UserAddress,
3498        length: usize,
3499        prot_flags: ProtectionFlags,
3500    ) -> Result<(), Errno> {
3501        // Hold the lock throughout the operation to uphold memory manager's invariants.
3502        // See mm/README.md.
3503        let mut state = self.state.write();
3504        let mut released_mappings = ReleasedMappings::default();
3505        let result = state.protect(current_task, addr, length, prot_flags, &mut released_mappings);
3506        released_mappings.finalize(state);
3507        result
3508    }
3509
    /// Implements msync(2) for `addr..addr + length`.
    ///
    /// Validates the flags and alignment, verifies the whole range is mapped,
    /// then syncs the backing node of each `MS_SYNC` file mapping. The sync
    /// calls happen after the state lock is released.
    pub fn msync(
        &self,
        _locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
        addr: UserAddress,
        length: usize,
        flags: MsyncFlags,
    ) -> Result<(), Errno> {
        // According to POSIX, either MS_SYNC or MS_ASYNC must be specified in flags,
        // and indeed failure to include one of these flags will cause msync() to fail
        // on some systems.  However, Linux permits a call to msync() that specifies
        // neither of these flags, with semantics that are (currently) equivalent to
        // specifying MS_ASYNC.

        // Both MS_SYNC and MS_ASYNC are set in flags
        if flags.contains(MsyncFlags::ASYNC) && flags.contains(MsyncFlags::SYNC) {
            return error!(EINVAL);
        }

        // The start address must be page-aligned (the length need not be).
        if !addr.is_aligned(*PAGE_SIZE) {
            return error!(EINVAL);
        }

        // We collect the nodes to sync first, release the memory manager lock, and then sync them.
        // This avoids holding the lock during blocking I/O operations (sync), which prevents
        // stalling other memory operations and avoids potential deadlocks.
        // It also allows us to deduplicate nodes, avoiding redundant sync calls for the same file.
        let mut nodes_to_sync = {
            let mm_state = self.state.read();

            let length_rounded = round_up_to_system_page_size(length)?;
            let end_addr = addr.checked_add(length_rounded).ok_or_else(|| errno!(EINVAL))?;

            let mut last_end = addr;
            let mut nodes = vec![];
            for (range, mapping) in mm_state.mappings.range(addr..end_addr) {
                // Check if there is a gap between the last mapped address and the current mapping.
                // msync requires the entire range to be mapped, so any gap results in ENOMEM.
                if range.start > last_end {
                    return error!(ENOMEM);
                }
                last_end = range.end;

                // MS_INVALIDATE on a locked mapping fails with EBUSY.
                if flags.contains(MsyncFlags::INVALIDATE)
                    && mapping.flags().contains(MappingFlags::LOCKED)
                {
                    return error!(EBUSY);
                }

                // Only MS_SYNC triggers actual file syncs, and only for
                // file-backed mappings.
                if flags.contains(MsyncFlags::SYNC) {
                    if let MappingNameRef::File(file_mapping) = mapping.name() {
                        nodes.push(file_mapping.name.entry.node.clone());
                    }
                }
            }
            // The range must be fully covered up to its (rounded) end.
            if last_end < end_addr {
                return error!(ENOMEM);
            }
            nodes
        };

        // Deduplicate nodes to avoid redundant sync calls.
        nodes_to_sync.sort_by_key(|n| Arc::as_ptr(n) as usize);
        nodes_to_sync.dedup_by(|a, b| Arc::ptr_eq(a, b));

        for node in nodes_to_sync {
            // Range-based sync is non-trivial for Fxfs to support due to its complicated
            // reservation system (b/322874588#comment5). Naive range-based sync could exhaust
            // space reservations if called page-by-page, as transaction costs are based on the
            // number of dirty pages rather than file ranges. We use whole-file sync for now
            // to ensure data durability without adding excessive complexity.
            node.ops().sync(&node, current_task)?;
        }
        Ok(())
    }
3585
3586    pub fn madvise(
3587        &self,
3588        current_task: &CurrentTask,
3589        addr: UserAddress,
3590        length: usize,
3591        advice: u32,
3592    ) -> Result<(), Errno> {
3593        let mut state = self.state.write();
3594        let mut released_mappings = ReleasedMappings::default();
3595        let result = state.madvise(current_task, addr, length, advice, &mut released_mappings);
3596        released_mappings.finalize(state);
3597        result
3598    }
3599
3600    pub fn mlock<L>(
3601        &self,
3602        current_task: &CurrentTask,
3603        locked: &mut Locked<L>,
3604        desired_addr: UserAddress,
3605        desired_length: usize,
3606        on_fault: bool,
3607    ) -> Result<(), Errno>
3608    where
3609        L: LockBefore<ThreadGroupLimits>,
3610    {
3611        let mut state = self.state.write();
3612        let mut released_mappings = ReleasedMappings::default();
3613        let result = state.mlock(
3614            current_task,
3615            locked,
3616            desired_addr,
3617            desired_length,
3618            on_fault,
3619            &mut released_mappings,
3620        );
3621        released_mappings.finalize(state);
3622        result
3623    }
3624
3625    pub fn munlock(
3626        &self,
3627        current_task: &CurrentTask,
3628        desired_addr: UserAddress,
3629        desired_length: usize,
3630    ) -> Result<(), Errno> {
3631        let mut state = self.state.write();
3632        let mut released_mappings = ReleasedMappings::default();
3633        let result =
3634            state.munlock(current_task, desired_addr, desired_length, &mut released_mappings);
3635        released_mappings.finalize(state);
3636        result
3637    }
3638
3639    pub fn log_memory_map(&self, task: &Task, fault_address: UserAddress) {
3640        let state = self.state.read();
3641        log_warn!("Memory map for pid={}:", task.thread_group.leader);
3642        let mut last_end = UserAddress::from_ptr(0);
3643        for (range, map) in state.mappings.iter() {
3644            if fault_address >= last_end && fault_address < range.start {
3645                log_warn!("{:08x} <= FAULT", fault_address.ptr());
3646            }
3647
3648            let perms = format!(
3649                "{}{}{}{}",
3650                if map.can_read() { 'r' } else { '-' },
3651                if map.can_write() { 'w' } else { '-' },
3652                if map.can_exec() { 'x' } else { '-' },
3653                if map.flags().contains(MappingFlags::SHARED) { 's' } else { 'p' }
3654            );
3655
3656            let backing = match state.get_mapping_backing(map) {
3657                MappingBacking::Memory(backing) => backing.address_to_offset(range.start),
3658                MappingBacking::PrivateAnonymous => 0,
3659            };
3660
3661            let name_str = match &map.name() {
3662                MappingNameRef::File(file) => {
3663                    String::from_utf8_lossy(&file.name.path(task)).into_owned()
3664                }
3665                MappingNameRef::None | MappingNameRef::AioContext(_) => {
3666                    if map.flags().contains(MappingFlags::SHARED)
3667                        && map.flags().contains(MappingFlags::ANONYMOUS)
3668                    {
3669                        "/dev/zero (deleted)".to_string()
3670                    } else {
3671                        "".to_string()
3672                    }
3673                }
3674                MappingNameRef::Stack => "[stack]".to_string(),
3675                MappingNameRef::Heap => "[heap]".to_string(),
3676                MappingNameRef::Vdso => "[vdso]".to_string(),
3677                MappingNameRef::Vvar => "[vvar]".to_string(),
3678                _ => format!("{:?}", map.name()),
3679            };
3680
3681            let fault_marker = if range.contains(&fault_address) { " <= FAULT" } else { "" };
3682
3683            log_warn!(
3684                "{:08x}-{:08x} {} {:08x} {}{}",
3685                range.start.ptr(),
3686                range.end.ptr(),
3687                perms,
3688                backing,
3689                name_str,
3690                fault_marker
3691            );
3692            last_end = range.end;
3693        }
3694
3695        if fault_address >= last_end {
3696            log_warn!("{:08x} <= FAULT", fault_address.ptr());
3697        }
3698    }
3699
    /// Handles a page fault exception reported against this address space.
    ///
    /// Resolution proceeds in order:
    ///   1. On an access-denied fault inside a userfaultfd-registered mapping with a live
    ///      uffd, deliver `SIGBUS` (per the uffd SIGBUS feature, the only mode currently
    ///      supported here).
    ///   2. If Starnix's own bookkeeping says the access should have been allowed, treat
    ///      the fault as a spurious reject from a racing uffd unregistration and report
    ///      it handled.
    ///   3. On a not-present fault, try to extend a `MAP_GROWSDOWN` mapping to cover the
    ///      faulting address.
    ///   4. Otherwise translate the fault into a `SIGSEGV` or `SIGBUS` signal for the
    ///      faulting task.
    pub fn handle_page_fault(
        self: &Arc<Self>,
        locked: &mut Locked<Unlocked>,
        decoded: PageFaultExceptionReport,
        error_code: zx::Status,
    ) -> ExceptionResult {
        let addr = UserAddress::from(decoded.faulting_address);
        // On uffd-registered range, handle according to the uffd rules
        if error_code == zx::Status::ACCESS_DENIED {
            let state = self.state.write();
            if let Some((_, mapping)) = state.mappings.get(addr) {
                if mapping.flags().contains(MappingFlags::UFFD) {
                    // TODO(https://fxbug.dev/391599171): Support other modes
                    assert!(mapping.flags().contains(MappingFlags::UFFD_MISSING));

                    if let Some(_uffd) = state.find_uffd(locked, addr) {
                        // If the SIGBUS feature was set, no event will be sent to the file.
                        // Instead, SIGBUS is delivered to the process that triggered the fault.
                        // TODO(https://fxbug.dev/391599171): For now we only support this feature,
                        // so we assume it is set.
                        // Check for the SIGBUS feature when we start supporting running without it.
                        return ExceptionResult::Signal(SignalInfo::with_detail(
                            SIGBUS,
                            BUS_ADRERR as i32,
                            SignalDetail::SigFault { addr: decoded.faulting_address },
                        ));
                    };
                }
                // Determine whether the faulting access actually violates the mapping's
                // protection bits as Starnix records them.
                let exec_denied = decoded.is_execute && !mapping.can_exec();
                let write_denied = decoded.is_write && !mapping.can_write();
                let read_denied = (!decoded.is_execute && !decoded.is_write) && !mapping.can_read();
                // There is a data race resulting from uffd unregistration and page fault happening
                // at the same time. To detect it, we check if the access was meant to be rejected
                // according to Starnix own information about the mapping.
                let false_reject = !exec_denied && !write_denied && !read_denied;
                if false_reject {
                    track_stub!(
                        TODO("https://fxbug.dev/435171399"),
                        "Inconsistent permission fault"
                    );
                    return ExceptionResult::Handled;
                }
            }
            std::mem::drop(state);
        }

        if decoded.not_present {
            // A page fault may be resolved by extending a growsdown mapping to cover the faulting
            // address. Mark the exception handled if so. Otherwise let the regular handling proceed.

            // We should only attempt growth on a not-present fault and we should only extend if the
            // access type matches the protection on the GROWSDOWN mapping.
            match self.extend_growsdown_mapping_to_address(
                UserAddress::from(decoded.faulting_address),
                decoded.is_write,
            ) {
                Ok(true) => {
                    return ExceptionResult::Handled;
                }
                Err(e) => {
                    log_warn!("Error handling page fault: {e}")
                }
                _ => {}
            }
        }
        // For this exception type, the synth_code field in the exception report's context is the
        // error generated by the page fault handler. For us this is used to distinguish between a
        // segmentation violation and a bus error. Unfortunately this detail is not documented in
        // Zircon's public documentation and is only described in the architecture-specific
        // exception definitions such as:
        // zircon/kernel/arch/x86/include/arch/x86.h
        // zircon/kernel/arch/arm64/include/arch/arm64.h
        let signo = match error_code {
            zx::Status::OUT_OF_RANGE => SIGBUS,
            _ => SIGSEGV,
        };
        ExceptionResult::Signal(SignalInfo::with_detail(
            signo,
            SI_KERNEL as i32,
            SignalDetail::SigFault { addr: decoded.faulting_address },
        ))
    }
3782
3783    pub fn set_mapping_name(
3784        &self,
3785        addr: UserAddress,
3786        length: usize,
3787        name: Option<FsString>,
3788    ) -> Result<(), Errno> {
3789        let mut state = self.state.write();
3790        let mut released_mappings = ReleasedMappings::default();
3791        let result = state.set_mapping_name(addr, length, name, &mut released_mappings);
3792        released_mappings.finalize(state);
3793        result
3794    }
3795
3796    /// Returns [`Ok`] if the entire range specified by `addr..(addr+length)` contains valid
3797    /// mappings.
3798    ///
3799    /// # Errors
3800    ///
3801    /// Returns [`Err(errno)`] where `errno` is:
3802    ///
3803    ///   - `EINVAL`: `addr` is not page-aligned, or the range is too large,
3804    ///   - `ENOMEM`: one or more pages in the range are not mapped.
3805    pub fn ensure_mapped(&self, addr: UserAddress, length: usize) -> Result<(), Errno> {
3806        if !addr.is_aligned(*PAGE_SIZE) {
3807            return error!(EINVAL);
3808        }
3809
3810        let length = round_up_to_system_page_size(length)?;
3811        let end_addr = addr.checked_add(length).ok_or_else(|| errno!(EINVAL))?;
3812        let state = self.state.read();
3813        let mut last_end = addr;
3814        for (range, _) in state.mappings.range(addr..end_addr) {
3815            if range.start > last_end {
3816                // This mapping does not start immediately after the last.
3817                return error!(ENOMEM);
3818            }
3819            last_end = range.end;
3820        }
3821        if last_end < end_addr {
3822            // There is a gap of no mappings at the end of the range.
3823            error!(ENOMEM)
3824        } else {
3825            Ok(())
3826        }
3827    }
3828
3829    /// Returns the memory object mapped at the address and the offset into the memory object of
3830    /// the address. Intended for implementing futexes.
3831    pub fn get_mapping_memory(
3832        &self,
3833        addr: UserAddress,
3834        perms: ProtectionFlags,
3835    ) -> Result<(Arc<MemoryObject>, u64), Errno> {
3836        let state = self.state.read();
3837        let (_, mapping) = state.mappings.get(addr).ok_or_else(|| errno!(EFAULT))?;
3838        if !mapping.flags().access_flags().contains(perms) {
3839            return error!(EACCES);
3840        }
3841        match state.get_mapping_backing(mapping) {
3842            MappingBacking::Memory(backing) => {
3843                Ok((Arc::clone(backing.memory()), mapping.address_to_offset(addr)))
3844            }
3845            MappingBacking::PrivateAnonymous => {
3846                Ok((Arc::clone(&state.private_anonymous.backing), addr.ptr() as u64))
3847            }
3848        }
3849    }
3850
3851    /// Does a rough check that the given address is plausibly in the address space of the
3852    /// application. This does not mean the pointer is valid for any particular purpose or that
3853    /// it will remain so!
3854    ///
3855    /// In some syscalls, Linux seems to do some initial validation of the pointer up front to
3856    /// tell the caller early if it's invalid. For example, in epoll_wait() it's returning a vector
3857    /// of events. If the caller passes an invalid pointer, it wants to fail without dropping any
3858    /// events. Failing later when actually copying the required events to userspace would mean
3859    /// those events will be lost. But holding a lock on the memory manager for an asynchronous
3860    /// wait is not desirable.
3861    ///
3862    /// Testing shows that Linux seems to do some initial plausibility checking of the pointer to
3863    /// be able to report common usage errors before doing any (possibly unreversable) work. This
3864    /// checking is easy to get around if you try, so this function is also not required to
3865    /// be particularly robust. Certainly the more advanced cases of races (the memory could be
3866    /// unmapped after this call but before it's used) are not handled.
3867    ///
3868    /// The buffer_size variable is the size of the data structure that needs to fit
3869    /// in the given memory.
3870    ///
3871    /// Returns the error EFAULT if invalid.
3872    pub fn check_plausible(&self, addr: UserAddress, buffer_size: usize) -> Result<(), Errno> {
3873        let state = self.state.read();
3874
3875        if let Some(range) = state.mappings.last_range() {
3876            if (range.end - buffer_size)? >= addr {
3877                return Ok(());
3878            }
3879        }
3880        error!(EFAULT)
3881    }
3882
3883    pub fn get_aio_context(&self, addr: UserAddress) -> Option<Arc<AioContext>> {
3884        let state = self.state.read();
3885        state.get_aio_context(addr).map(|(_, aio_context)| aio_context)
3886    }
3887
    /// Tears down the AIO context mapped at `addr` and unmaps its backing region.
    ///
    /// Returns the removed [`AioContext`] on success, or `EINVAL` when no AIO context
    /// lives at `addr`.
    pub fn destroy_aio_context(
        self: &Arc<Self>,
        addr: UserAddress,
    ) -> Result<Arc<AioContext>, Errno> {
        let mut released_mappings = ReleasedMappings::default();

        // Hold the lock throughout the operation to uphold memory manager's invariants.
        // See mm/README.md.
        let mut state = self.state.write();

        // Validate that this address actually has an AioContext. We need to hold the state lock
        // until we actually remove the mappings to ensure that another thread does not manipulate
        // the mappings after we've validated that they contain an AioContext.
        let Some((range, aio_context)) = state.get_aio_context(addr) else {
            return error!(EINVAL);
        };

        let length = range.end - range.start;
        let result = state.unmap(self, range.start, length, &mut released_mappings);

        // `finalize` consumes the state write guard along with the released mappings.
        released_mappings.finalize(state);

        result.map(|_| aio_context)
    }
3912
3913    #[cfg(test)]
3914    pub fn get_mapping_name(
3915        &self,
3916        addr: UserAddress,
3917    ) -> Result<Option<flyweights::FlyByteStr>, Errno> {
3918        let state = self.state.read();
3919        let (_, mapping) = state.mappings.get(addr).ok_or_else(|| errno!(EFAULT))?;
3920        if let MappingNameRef::Vma(name) = mapping.name() {
3921            Ok(Some(name.clone()))
3922        } else {
3923            Ok(None)
3924        }
3925    }
3926
3927    #[cfg(test)]
3928    pub fn get_mapping_count(&self) -> usize {
3929        let state = self.state.read();
3930        state.mappings.iter().count()
3931    }
3932
3933    pub fn extend_growsdown_mapping_to_address(
3934        self: &Arc<Self>,
3935        addr: UserAddress,
3936        is_write: bool,
3937    ) -> Result<bool, Error> {
3938        self.state.write().extend_growsdown_mapping_to_address(self, addr, is_write)
3939    }
3940
    /// Computes memory-usage statistics for this address space by combining Starnix's
    /// mapping bookkeeping with Zircon's per-mapping commit information.
    ///
    /// Debug-asserts that both sides agree on which VMO backs every mapped range.
    pub fn get_stats(&self, current_task: &CurrentTask) -> MemoryStats {
        // Grab our state lock before reading zircon mappings so that the two are consistent.
        // Other Starnix threads should not make any changes to the Zircon mappings while we hold
        // a read lock to the memory manager state.
        let state = self.state.read();

        let mut stats = MemoryStats::default();
        stats.vm_stack = state.stack_size;

        state.with_zx_mappings(current_task, |zx_mappings| {
            for zx_mapping in zx_mappings {
                // We only care about map info for actual mappings.
                let zx_details = zx_mapping.details();
                let Some(zx_details) = zx_details.as_mapping() else { continue };
                let user_address = UserAddress::from(zx_mapping.base as u64);
                let (_, mm_mapping) = state
                    .mappings
                    .get(user_address)
                    .unwrap_or_else(|| panic!("mapping bookkeeping must be consistent with zircon's: not found: {user_address:?}"));
                debug_assert_eq!(
                    match state.get_mapping_backing(mm_mapping) {
                        MappingBacking::Memory(m)=>m.memory().get_koid(),
                        MappingBacking::PrivateAnonymous=>state.private_anonymous.backing.get_koid(),
                    },
                    zx_details.vmo_koid,
                    "MemoryManager and Zircon must agree on which VMO is mapped in this range",
                );

                stats.vm_size += zx_mapping.size;

                stats.vm_rss += zx_details.committed_bytes;
                // Pages that are populated but not committed are accounted as swap.
                stats.vm_swap += zx_details.populated_bytes - zx_details.committed_bytes;

                if mm_mapping.flags().contains(MappingFlags::SHARED) {
                    stats.rss_shared += zx_details.committed_bytes;
                } else if mm_mapping.flags().contains(MappingFlags::ANONYMOUS) {
                    stats.rss_anonymous += zx_details.committed_bytes;
                } else if mm_mapping.name().is_file() {
                    stats.rss_file += zx_details.committed_bytes;
                }

                if mm_mapping.flags().contains(MappingFlags::LOCKED) {
                    stats.vm_lck += zx_details.committed_bytes;
                }

                // Writable segments of the ELF binary are counted as data...
                if mm_mapping.flags().contains(MappingFlags::ELF_BINARY)
                    && mm_mapping.flags().contains(MappingFlags::WRITE)
                {
                    stats.vm_data += zx_mapping.size;
                }

                // ...and executable segments as text.
                if mm_mapping.flags().contains(MappingFlags::ELF_BINARY)
                    && mm_mapping.flags().contains(MappingFlags::EXEC)
                {
                    stats.vm_exe += zx_mapping.size;
                }
            }
        });

        // TODO(https://fxbug.dev/396221597): Placeholder for now. We need kernel support to track
        // the committed bytes high water mark.
        stats.vm_rss_hwm = STUB_VM_RSS_HWM;
        stats
    }
4005
4006    pub fn atomic_load_u32_acquire(&self, futex_addr: FutexAddress) -> Result<u32, Errno> {
4007        if let Some(usercopy) = usercopy() {
4008            usercopy.atomic_load_u32_acquire(futex_addr.ptr()).map_err(|_| errno!(EFAULT))
4009        } else {
4010            unreachable!("can only control memory ordering of atomics with usercopy");
4011        }
4012    }
4013
    /// Atomically loads a `u32` from user memory with relaxed ordering.
    ///
    /// When usercopy is unavailable, falls back to a plain memory read via the memory
    /// manager (relaxed ordering imposes no inter-thread ordering constraints, unlike
    /// the acquire variant which requires usercopy).
    pub fn atomic_load_u32_relaxed(&self, futex_addr: FutexAddress) -> Result<u32, Errno> {
        if let Some(usercopy) = usercopy() {
            usercopy.atomic_load_u32_relaxed(futex_addr.ptr()).map_err(|_| errno!(EFAULT))
        } else {
            // SAFETY: `self.state.read().read_memory` only returns `Ok` if all
            // bytes were read to.
            let buf = unsafe {
                read_to_array(|buf| {
                    self.state.read().read_memory(futex_addr.into(), buf).map(|bytes_read| {
                        debug_assert_eq!(bytes_read.len(), std::mem::size_of::<u32>())
                    })
                })
            }?;
            Ok(u32::from_ne_bytes(buf))
        }
    }
4030
4031    pub fn atomic_store_u32_relaxed(
4032        &self,
4033        futex_addr: FutexAddress,
4034        value: u32,
4035    ) -> Result<(), Errno> {
4036        if let Some(usercopy) = usercopy() {
4037            usercopy.atomic_store_u32_relaxed(futex_addr.ptr(), value).map_err(|_| errno!(EFAULT))
4038        } else {
4039            self.state.read().write_memory(futex_addr.into(), value.as_bytes())?;
4040            Ok(())
4041        }
4042    }
4043
4044    pub fn atomic_compare_exchange_u32_acq_rel(
4045        &self,
4046        futex_addr: FutexAddress,
4047        current: u32,
4048        new: u32,
4049    ) -> CompareExchangeResult<u32> {
4050        let Some(usercopy) = usercopy() else {
4051            unreachable!("Atomic compare/exchange requires usercopy.");
4052        };
4053        CompareExchangeResult::from_usercopy(usercopy.atomic_compare_exchange_u32_acq_rel(
4054            futex_addr.ptr(),
4055            current,
4056            new,
4057        ))
4058    }
4059
4060    pub fn atomic_compare_exchange_weak_u32_acq_rel(
4061        &self,
4062        futex_addr: FutexAddress,
4063        current: u32,
4064        new: u32,
4065    ) -> CompareExchangeResult<u32> {
4066        let Some(usercopy) = usercopy() else {
4067            unreachable!("Atomic compare/exchange requires usercopy.");
4068        };
4069        CompareExchangeResult::from_usercopy(usercopy.atomic_compare_exchange_weak_u32_acq_rel(
4070            futex_addr.ptr(),
4071            current,
4072            new,
4073        ))
4074    }
4075
4076    pub fn get_restricted_vmar_info(&self) -> Option<VmarInfo> {
4077        if self.root_vmar.is_invalid() {
4078            return None;
4079        }
4080        Some(VmarInfo { base: RESTRICTED_ASPACE_BASE, len: RESTRICTED_ASPACE_SIZE })
4081    }
4082}
4083
/// The result of an atomic compare/exchange operation on user memory.
#[derive(Debug, Clone)]
pub enum CompareExchangeResult<T> {
    /// The current value provided matched the one observed in memory and the new value provided
    /// was written.
    Success,
    /// The provided current value did not match the current value in memory.
    /// `observed` carries the value that was actually read.
    Stale { observed: T },
    /// There was a general error while accessing the requested memory.
    Error(Errno),
}
4095
4096impl<T> CompareExchangeResult<T> {
4097    fn from_usercopy(usercopy_res: Result<Result<T, T>, ()>) -> Self {
4098        match usercopy_res {
4099            Ok(Ok(_)) => Self::Success,
4100            Ok(Err(observed)) => Self::Stale { observed },
4101            Err(()) => Self::Error(errno!(EFAULT)),
4102        }
4103    }
4104}
4105
impl<T> From<Errno> for CompareExchangeResult<T> {
    /// Wraps a memory-access error in the [`CompareExchangeResult::Error`] variant.
    fn from(e: Errno) -> Self {
        Self::Error(e)
    }
}
4111
/// The user-space address at which a mapping should be placed. Used by [`MemoryManager::map`].
///
/// `Fixed` and `FixedOverwrite` differ only in how collisions with existing mappings are
/// resolved: `Fixed` fails while `FixedOverwrite` unmaps the colliding range.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DesiredAddress {
    /// Map at any address chosen by the kernel.
    Any,
    /// The address is a hint. If the address overlaps an existing mapping a different address may
    /// be chosen.
    Hint(UserAddress),
    /// The address is a requirement. If the address overlaps an existing mapping (and cannot
    /// overwrite it), mapping fails.
    Fixed(UserAddress),
    /// The address is a requirement. If the address overlaps an existing mapping (and cannot
    /// overwrite it), they should be unmapped.
    FixedOverwrite(UserAddress),
}
4127
/// The user-space address at which a mapping should be placed. Used by [`map_in_vmar`].
///
/// Unlike [`DesiredAddress`], the flexible variants (`Any`, `Hint`) are not representable
/// here: a `SelectedAddress` is always a concrete address.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SelectedAddress {
    /// See DesiredAddress::Fixed.
    Fixed(UserAddress),
    /// See DesiredAddress::FixedOverwrite.
    FixedOverwrite(UserAddress),
}
4136
4137impl SelectedAddress {
4138    fn addr(&self) -> UserAddress {
4139        match self {
4140            SelectedAddress::Fixed(addr) => *addr,
4141            SelectedAddress::FixedOverwrite(addr) => *addr,
4142        }
4143    }
4144}
4145
/// Write one line of the memory map intended for adding to `/proc/self/maps`.
///
/// Layout: `start-end perms offset dev inode <name>`, with the device hardcoded to
/// `00:00` and the name column padded to start at column 74 when the prefix is shorter.
fn write_map(
    task: &Task,
    sink: &mut DynamicFileBuf,
    state: &MemoryManagerState,
    range: &Range<UserAddress>,
    map: &Mapping,
) -> Result<(), Errno> {
    let line_length = write!(
        sink,
        "{:08x}-{:08x} {}{}{}{} {:08x} 00:00 {} ",
        range.start.ptr(),
        range.end.ptr(),
        if map.can_read() { 'r' } else { '-' },
        if map.can_write() { 'w' } else { '-' },
        if map.can_exec() { 'x' } else { '-' },
        if map.flags().contains(MappingFlags::SHARED) { 's' } else { 'p' },
        match state.get_mapping_backing(map) {
            MappingBacking::Memory(backing) => backing.address_to_offset(range.start),
            MappingBacking::PrivateAnonymous => 0,
        },
        // Only file-backed mappings carry a meaningful inode number.
        if let MappingNameRef::File(file) = &map.name() { file.name.entry.node.ino } else { 0 }
    )?;
    let fill_to_name = |sink: &mut DynamicFileBuf| {
        // The filename goes at >= the 74th column (73rd when zero indexed)
        for _ in line_length..73 {
            sink.write(b" ");
        }
    };
    match &map.name() {
        MappingNameRef::None | MappingNameRef::AioContext(_) => {
            if map.flags().contains(MappingFlags::SHARED)
                && map.flags().contains(MappingFlags::ANONYMOUS)
            {
                // See proc(5), "/proc/[pid]/map_files/"
                fill_to_name(sink);
                sink.write(b"/dev/zero (deleted)");
            }
        }
        MappingNameRef::Stack => {
            fill_to_name(sink);
            sink.write(b"[stack]");
        }
        MappingNameRef::Heap => {
            fill_to_name(sink);
            sink.write(b"[heap]");
        }
        MappingNameRef::Vdso => {
            fill_to_name(sink);
            sink.write(b"[vdso]");
        }
        MappingNameRef::Vvar => {
            fill_to_name(sink);
            sink.write(b"[vvar]");
        }
        MappingNameRef::File(file) => {
            fill_to_name(sink);
            // File names can have newlines that need to be escaped before printing.
            // According to https://man7.org/linux/man-pages/man5/proc.5.html the only
            // escaping applied to paths is replacing newlines with an octal sequence.
            let path = file.name.path(task);
            sink.write_iter(
                path.iter()
                    .flat_map(|b| if *b == b'\n' { b"\\012" } else { std::slice::from_ref(b) })
                    .copied(),
            );
        }
        MappingNameRef::Vma(name) => {
            fill_to_name(sink);
            sink.write(b"[anon:");
            sink.write(name.as_bytes());
            sink.write(b"]");
        }
        MappingNameRef::Ashmem(name) => {
            fill_to_name(sink);
            sink.write(b"/dev/ashmem/");
            sink.write(name.as_bytes());
        }
    }
    sink.write(b"\n");
    Ok(())
}
4228
/// Aggregated memory-usage counters for an address space, as computed by
/// [`MemoryManager::get_stats`]. All values are in bytes.
#[derive(Default)]
pub struct MemoryStats {
    /// Total size of all mappings.
    pub vm_size: usize,
    /// Committed (resident) bytes across all mappings.
    pub vm_rss: usize,
    /// High-water mark of the resident set. Currently a stub value
    /// (see https://fxbug.dev/396221597).
    pub vm_rss_hwm: usize,
    /// Committed bytes in anonymous (non-shared) mappings.
    pub rss_anonymous: usize,
    /// Committed bytes in file-backed mappings.
    pub rss_file: usize,
    /// Committed bytes in shared mappings.
    pub rss_shared: usize,
    /// Size of writable segments of the ELF binary.
    pub vm_data: usize,
    /// Size of the stack.
    pub vm_stack: usize,
    /// Size of executable segments of the ELF binary.
    pub vm_exe: usize,
    /// Bytes that are populated but not committed (reported as swap).
    pub vm_swap: usize,
    /// Committed bytes in mappings carrying the LOCKED flag.
    pub vm_lck: usize,
}
4243
/// Implements `/proc/self/maps`.
#[derive(Clone)]
pub struct ProcMapsFile {
    // Weak so the proc file does not keep the address space alive; empty for kthreads,
    // which have no memory manager.
    mm: Weak<MemoryManager>,
    // The task whose mappings are rendered; resolved on each read.
    task: WeakRef<Task>,
}
4250impl ProcMapsFile {
4251    pub fn new(task: TempRef<'_, Task>) -> DynamicFile<Self> {
4252        // "maps" is empty for kthreads, rather than inaccessible.
4253        let mm = task.mm().map_or_else(|_| Weak::default(), |mm| Arc::downgrade(&mm));
4254        let task = task.into();
4255        DynamicFile::new(Self { mm, task })
4256    }
4257}
4258
4259impl SequenceFileSource for ProcMapsFile {
4260    type Cursor = UserAddress;
4261
4262    fn next(
4263        &self,
4264        _current_task: &CurrentTask,
4265        cursor: UserAddress,
4266        sink: &mut DynamicFileBuf,
4267    ) -> Result<Option<UserAddress>, Errno> {
4268        let task = Task::from_weak(&self.task)?;
4269        // /proc/<pid>/maps is empty for kthreads and tasks whose memory manager has changed.
4270        let Some(mm) = self.mm.upgrade() else {
4271            return Ok(None);
4272        };
4273        let state = mm.state.read();
4274        if let Some((range, map)) = state.mappings.find_at_or_after(cursor) {
4275            write_map(&task, sink, &state, range, map)?;
4276            return Ok(Some(range.end));
4277        }
4278        Ok(None)
4279    }
4280}
4281
/// Implements `/proc/self/smaps`.
#[derive(Clone)]
pub struct ProcSmapsFile {
    // Weak so the proc file does not keep the address space alive; empty for kthreads,
    // which have no memory manager.
    mm: Weak<MemoryManager>,
    // The task whose mappings are rendered; resolved on each read.
    task: WeakRef<Task>,
}
4287impl ProcSmapsFile {
4288    pub fn new(task: TempRef<'_, Task>) -> DynamicFile<Self> {
4289        // "smaps" is empty for kthreads, rather than inaccessible.
4290        let mm = task.mm().map_or_else(|_| Weak::default(), |mm| Arc::downgrade(&mm));
4291        DynamicFile::new(Self { mm, task: task.into() })
4292    }
4293}
4294
impl DynamicFileSource for ProcSmapsFile {
    /// Renders `/proc/<pid>/smaps`: for each Starnix mapping, a `/proc/self/maps`-style
    /// header line followed by per-mapping statistics (Size, Rss, Pss, ...).
    fn generate(&self, current_task: &CurrentTask, sink: &mut DynamicFileBuf) -> Result<(), Errno> {
        let page_size_kb = *PAGE_SIZE / 1024;
        let task = Task::from_weak(&self.task)?;
        // /proc/<pid>/smaps is empty for kthreads and tasks whose memory manager has changed.
        let Some(mm) = self.mm.upgrade() else {
            return Ok(());
        };
        let state = mm.state.read();
        // First pass: compute the committed (resident) byte count for each Starnix
        // mapping from Zircon's mapping info.
        let committed_bytes_vec = state.with_zx_mappings(current_task, |zx_mappings| {
            let mut zx_memory_info = RangeMap::<UserAddress, usize>::default();
            for idx in 0..zx_mappings.len() {
                let zx_mapping = zx_mappings[idx];
                // RangeMap uses #[must_use] for its default usecase but this drop is trivial.
                let _ = zx_memory_info.insert(
                    UserAddress::from_ptr(zx_mapping.base)
                        ..UserAddress::from_ptr(zx_mapping.base + zx_mapping.size),
                    idx,
                );
            }

            let mut committed_bytes_vec = Vec::new();
            for (mm_range, mm_mapping) in state.mappings.iter() {
                let mut committed_bytes = 0;

                for (zx_range, zx_mapping_idx) in zx_memory_info.range(mm_range.clone()) {
                    let intersect_range = zx_range.intersect(mm_range);
                    let zx_mapping = zx_mappings[*zx_mapping_idx];
                    let zx_details = zx_mapping.details();
                    let Some(zx_details) = zx_details.as_mapping() else { continue };
                    let zx_committed_bytes = zx_details.committed_bytes;

                    // TODO(https://fxbug.dev/419882465): It can happen that the same Zircon mapping
                    // is covered by more than one Starnix mapping. In this case we don't have
                    // enough granularity to answer the question of how many committed bytes belong
                    // to one mapping or another. Make a best-effort approximation by dividing the
                    // committed bytes of a Zircon mapping proportionally.
                    committed_bytes += if intersect_range != *zx_range {
                        let intersection_size =
                            intersect_range.end.ptr() - intersect_range.start.ptr();
                        let part = intersection_size as f32 / zx_mapping.size as f32;
                        let prorated_committed_bytes: f32 = part * zx_committed_bytes as f32;
                        prorated_committed_bytes as u64
                    } else {
                        zx_committed_bytes as u64
                    };
                    assert_eq!(
                        match state.get_mapping_backing(mm_mapping) {
                            MappingBacking::Memory(m) => m.memory().get_koid(),
                            MappingBacking::PrivateAnonymous =>
                                state.private_anonymous.backing.get_koid(),
                        },
                        zx_details.vmo_koid,
                        "MemoryManager and Zircon must agree on which VMO is mapped in this range",
                    );
                }
                committed_bytes_vec.push(committed_bytes);
            }
            Ok(committed_bytes_vec)
        })?;

        // Second pass: emit one stanza per mapping, pairing each mapping with the
        // committed byte count computed above (same iteration order).
        for ((mm_range, mm_mapping), committed_bytes) in
            state.mappings.iter().zip(committed_bytes_vec.into_iter())
        {
            write_map(&task, sink, &state, mm_range, mm_mapping)?;

            let size_kb = (mm_range.end.ptr() - mm_range.start.ptr()) / 1024;
            writeln!(sink, "Size:           {size_kb:>8} kB",)?;
            let share_count = match state.get_mapping_backing(mm_mapping) {
                MappingBacking::Memory(backing) => {
                    let memory = backing.memory();
                    if memory.is_clock() {
                        // Clock memory mappings are not shared in a meaningful way.
                        1
                    } else {
                        let memory_info = backing.memory().info()?;
                        memory_info.share_count as u64
                    }
                }
                MappingBacking::PrivateAnonymous => {
                    1 // Private mapping
                }
            };

            let rss_kb = committed_bytes / 1024;
            writeln!(sink, "Rss:            {rss_kb:>8} kB")?;

            // Pss divides a shared mapping's resident bytes by its share count.
            let pss_kb = if mm_mapping.flags().contains(MappingFlags::SHARED) {
                rss_kb / share_count
            } else {
                rss_kb
            };
            writeln!(sink, "Pss:            {pss_kb:>8} kB")?;

            track_stub!(TODO("https://fxbug.dev/322874967"), "smaps dirty pages");
            let (shared_dirty_kb, private_dirty_kb) = (0, 0);

            let is_shared = share_count > 1;
            let shared_clean_kb = if is_shared { rss_kb } else { 0 };
            writeln!(sink, "Shared_Clean:   {shared_clean_kb:>8} kB")?;
            writeln!(sink, "Shared_Dirty:   {shared_dirty_kb:>8} kB")?;

            let private_clean_kb = if is_shared { 0 } else { rss_kb };
            writeln!(sink, "Private_Clean:  {private_clean_kb:>8} kB")?;
            writeln!(sink, "Private_Dirty:  {private_dirty_kb:>8} kB")?;

            let anonymous_kb = if mm_mapping.private_anonymous() { rss_kb } else { 0 };
            writeln!(sink, "Anonymous:      {anonymous_kb:>8} kB")?;
            writeln!(sink, "KernelPageSize: {page_size_kb:>8} kB")?;
            writeln!(sink, "MMUPageSize:    {page_size_kb:>8} kB")?;

            let locked_kb =
                if mm_mapping.flags().contains(MappingFlags::LOCKED) { rss_kb } else { 0 };
            writeln!(sink, "Locked:         {locked_kb:>8} kB")?;
            writeln!(sink, "VmFlags: {}", mm_mapping.vm_flags())?;

            track_stub!(TODO("https://fxbug.dev/297444691"), "optional smaps fields");
        }

        Ok(())
    }
}
4417
4418/// Creates a memory object that can be used in an anonymous mapping for the `mmap` syscall.
4419pub fn create_anonymous_mapping_memory(size: u64) -> Result<Arc<MemoryObject>, Errno> {
4420    // mremap can grow memory regions, so make sure the memory object is resizable.
4421    let mut memory = MemoryObject::from(
4422        zx::Vmo::create_with_opts(zx::VmoOptions::RESIZABLE, size).map_err(|s| match s {
4423            zx::Status::NO_MEMORY => errno!(ENOMEM),
4424            zx::Status::OUT_OF_RANGE => errno!(ENOMEM),
4425            _ => impossible_error(s),
4426        })?,
4427    )
4428    .with_zx_name(b"starnix:memory_manager");
4429
4430    memory.set_zx_name(b"starnix-anon");
4431
4432    // TODO(https://fxbug.dev/42056890): Audit replace_as_executable usage
4433    memory = memory.replace_as_executable(&VMEX_RESOURCE).map_err(impossible_error)?;
4434    Ok(Arc::new(memory))
4435}
4436
4437fn generate_random_offset_for_aslr(arch_width: ArchWidth) -> usize {
4438    // Generate a number with ASLR_RANDOM_BITS.
4439    let randomness = {
4440        let random_bits =
4441            if arch_width.is_arch32() { ASLR_32_RANDOM_BITS } else { ASLR_RANDOM_BITS };
4442        let mask = (1 << random_bits) - 1;
4443        let mut bytes = [0; std::mem::size_of::<usize>()];
4444        zx::cprng_draw(&mut bytes);
4445        usize::from_le_bytes(bytes) & mask
4446    };
4447
4448    // Transform it into a page-aligned offset.
4449    randomness * (*PAGE_SIZE as usize)
4450}
4451
4452#[cfg(test)]
4453mod tests {
4454    use super::*;
4455    use crate::mm::memory_accessor::MemoryAccessorExt;
4456    use crate::mm::syscalls::do_mmap;
4457    use crate::task::syscalls::sys_prctl;
4458    use crate::testing::*;
4459    use crate::vfs::FdNumber;
4460    use assert_matches::assert_matches;
4461    use itertools::assert_equal;
4462    use starnix_sync::{FileOpsCore, LockEqualOrBefore};
4463    use starnix_uapi::user_address::{UserCString, UserRef};
4464    use starnix_uapi::{
4465        MAP_ANONYMOUS, MAP_FIXED, MAP_GROWSDOWN, MAP_PRIVATE, MAP_SHARED, PR_SET_VMA,
4466        PR_SET_VMA_ANON_NAME, PROT_NONE, PROT_READ,
4467    };
4468    use std::ffi::CString;
4469    use zerocopy::{FromBytes, Immutable, KnownLayout};
4470
4471    #[::fuchsia::test]
4472    fn test_mapping_flags() {
4473        let options = MappingOptions::ANONYMOUS;
4474        let access_flags = ProtectionFlags::READ | ProtectionFlags::WRITE;
4475        let mapping_flags = MappingFlags::from_access_flags_and_options(access_flags, options);
4476        assert_eq!(mapping_flags.access_flags(), access_flags);
4477        assert_eq!(mapping_flags.options(), options);
4478
4479        let new_access_flags = ProtectionFlags::READ | ProtectionFlags::EXEC;
4480        let adusted_mapping_flags = mapping_flags.with_access_flags(new_access_flags);
4481        assert_eq!(adusted_mapping_flags.access_flags(), new_access_flags);
4482        assert_eq!(adusted_mapping_flags.options(), options);
4483    }
4484
    // End-to-end coverage of `set_brk`: initializing the break, growing it by a byte
    // (which maps a whole page), growing within an already-mapped page, large growth,
    // shrinking, and finally returning to the base (which unmaps everything).
    #[::fuchsia::test]
    async fn test_brk() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mm = current_task.mm().unwrap();

            // Look up the given addr in the mappings table.
            let get_range = |addr: UserAddress| {
                let state = mm.state.read();
                state.mappings.get(addr).map(|(range, mapping)| (range.clone(), mapping.clone()))
            };

            // Initialize the program break.
            let base_addr = mm
                .set_brk(locked, &current_task, UserAddress::default())
                .expect("failed to set initial program break");
            assert!(base_addr > UserAddress::default());

            // Page containing the program break address should not be mapped.
            assert_eq!(get_range(base_addr), None);

            // Growing it by a single byte results in that page becoming mapped.
            let addr0 = mm
                .set_brk(locked, &current_task, (base_addr + 1u64).unwrap())
                .expect("failed to grow brk");
            assert!(addr0 > base_addr);
            let (range0, _) = get_range(base_addr).expect("base_addr should be mapped");
            assert_eq!(range0.start, base_addr);
            assert_eq!(range0.end, (base_addr + *PAGE_SIZE).unwrap());

            // Grow the program break by another byte, which won't be enough to cause additional pages to be mapped.
            let addr1 = mm
                .set_brk(locked, &current_task, (base_addr + 2u64).unwrap())
                .expect("failed to grow brk");
            assert_eq!(addr1, (base_addr + 2u64).unwrap());
            let (range1, _) = get_range(base_addr).expect("base_addr should be mapped");
            assert_eq!(range1.start, range0.start);
            assert_eq!(range1.end, range0.end);

            // Grow the program break by a non-trival amount and observe the larger mapping.
            let addr2 = mm
                .set_brk(locked, &current_task, (base_addr + 24893u64).unwrap())
                .expect("failed to grow brk");
            assert_eq!(addr2, (base_addr + 24893u64).unwrap());
            let (range2, _) = get_range(base_addr).expect("base_addr should be mapped");
            assert_eq!(range2.start, base_addr);
            assert_eq!(range2.end, addr2.round_up(*PAGE_SIZE).unwrap());

            // Shrink the program break and observe the smaller mapping.
            let addr3 = mm
                .set_brk(locked, &current_task, (base_addr + 14832u64).unwrap())
                .expect("failed to shrink brk");
            assert_eq!(addr3, (base_addr + 14832u64).unwrap());
            let (range3, _) = get_range(base_addr).expect("base_addr should be mapped");
            assert_eq!(range3.start, base_addr);
            assert_eq!(range3.end, addr3.round_up(*PAGE_SIZE).unwrap());

            // Shrink the program break close to zero and observe the smaller mapping.
            let addr4 = mm
                .set_brk(locked, &current_task, (base_addr + 3u64).unwrap())
                .expect("failed to drastically shrink brk");
            assert_eq!(addr4, (base_addr + 3u64).unwrap());
            let (range4, _) = get_range(base_addr).expect("base_addr should be mapped");
            assert_eq!(range4.start, base_addr);
            assert_eq!(range4.end, addr4.round_up(*PAGE_SIZE).unwrap());

            // Shrink the program break to zero and observe that the mapping is entirely gone.
            let addr5 = mm
                .set_brk(locked, &current_task, base_addr)
                .expect("failed to drastically shrink brk to zero");
            assert_eq!(addr5, base_addr);
            assert_eq!(get_range(base_addr), None);
        })
        .await;
    }
4559
    // Verifies that `exec` replaces the memory manager: the old brk and mmap regions
    // are gone afterwards, and their former addresses are free for new mappings.
    #[::fuchsia::test]
    async fn test_mm_exec() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mm = current_task.mm().unwrap();

            // Returns whether the address is present in the current mappings table.
            let has = |addr: UserAddress| -> bool {
                let state = mm.state.read();
                state.mappings.get(addr).is_some()
            };

            let brk_addr = mm
                .set_brk(locked, &current_task, UserAddress::default())
                .expect("failed to set initial program break");
            assert!(brk_addr > UserAddress::default());

            // Allocate a single page of BRK space, so that the break base address is mapped.
            let _ = mm
                .set_brk(locked, &current_task, (brk_addr + 1u64).unwrap())
                .expect("failed to grow program break");
            assert!(has(brk_addr));

            let mapped_addr = map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);
            assert!(mapped_addr > UserAddress::default());
            assert!(has(mapped_addr));

            let node = current_task.lookup_path_from_root(locked, "/".into()).unwrap();
            let new_mm = mm.exec(node, ArchWidth::Arch64).expect("failed to exec memory manager");
            current_task.mm.update(Some(new_mm));

            assert!(!has(brk_addr));
            assert!(!has(mapped_addr));

            // Check that the old addresses are actually available for mapping.
            let brk_addr2 = map_memory(locked, &current_task, brk_addr, *PAGE_SIZE);
            assert_eq!(brk_addr, brk_addr2);
            let mapped_addr2 = map_memory(locked, &current_task, mapped_addr, *PAGE_SIZE);
            assert_eq!(mapped_addr, mapped_addr2);
        })
        .await;
    }
4600
    // Exercises `get_contiguous_mappings_at`: empty results for unmapped addresses and
    // zero-length requests, EFAULT on out-of-range arguments, partial-page spans,
    // spans crossing merged mappings, and truncation at holes and at the last mapping.
    #[::fuchsia::test]
    async fn test_get_contiguous_mappings_at() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mm = current_task.mm().unwrap();

            // Create four one-page mappings with a hole between the third one and the fourth one.
            let page_size = *PAGE_SIZE as usize;
            let addr_a = (mm.base_addr + 10 * page_size).unwrap();
            let addr_b = (mm.base_addr + 11 * page_size).unwrap();
            let addr_c = (mm.base_addr + 12 * page_size).unwrap();
            let addr_d = (mm.base_addr + 14 * page_size).unwrap();
            assert_eq!(map_memory(locked, &current_task, addr_a, *PAGE_SIZE), addr_a);
            assert_eq!(map_memory(locked, &current_task, addr_b, *PAGE_SIZE), addr_b);
            assert_eq!(map_memory(locked, &current_task, addr_c, *PAGE_SIZE), addr_c);
            assert_eq!(map_memory(locked, &current_task, addr_d, *PAGE_SIZE), addr_d);

            {
                let mm_state = mm.state.read();
                // Verify that requesting an unmapped address returns an empty iterator.
                assert_equal(
                    mm_state.get_contiguous_mappings_at((addr_a - 100u64).unwrap(), 50).unwrap(),
                    vec![],
                );
                assert_equal(
                    mm_state.get_contiguous_mappings_at((addr_a - 100u64).unwrap(), 200).unwrap(),
                    vec![],
                );

                // Verify that requesting zero bytes returns an empty iterator.
                assert_equal(mm_state.get_contiguous_mappings_at(addr_a, 0).unwrap(), vec![]);

                // Verify errors.
                assert_eq!(
                    mm_state
                        .get_contiguous_mappings_at(UserAddress::from(100), usize::MAX)
                        .err()
                        .unwrap(),
                    errno!(EFAULT)
                );
                assert_eq!(
                    mm_state
                        .get_contiguous_mappings_at((mm_state.max_address() + 1u64).unwrap(), 0)
                        .err()
                        .unwrap(),
                    errno!(EFAULT)
                );
            }

            // The three adjacent pages (a, b, c) merge into one mapping; d stays separate.
            assert_eq!(mm.get_mapping_count(), 2);
            let mm_state = mm.state.read();
            let (map_a, map_b) = {
                let mut it = mm_state.mappings.iter();
                (it.next().unwrap().1, it.next().unwrap().1)
            };

            assert_equal(
                mm_state.get_contiguous_mappings_at(addr_a, page_size).unwrap(),
                vec![(map_a, page_size)],
            );

            assert_equal(
                mm_state.get_contiguous_mappings_at(addr_a, page_size / 2).unwrap(),
                vec![(map_a, page_size / 2)],
            );

            assert_equal(
                mm_state.get_contiguous_mappings_at(addr_a, page_size * 3).unwrap(),
                vec![(map_a, page_size * 3)],
            );

            assert_equal(
                mm_state.get_contiguous_mappings_at(addr_b, page_size).unwrap(),
                vec![(map_a, page_size)],
            );

            assert_equal(
                mm_state.get_contiguous_mappings_at(addr_d, page_size).unwrap(),
                vec![(map_b, page_size)],
            );

            // Verify that results stop if there is a hole.
            assert_equal(
                mm_state
                    .get_contiguous_mappings_at((addr_a + page_size / 2).unwrap(), page_size * 10)
                    .unwrap(),
                vec![(map_a, page_size * 2 + page_size / 2)],
            );

            // Verify that results stop at the last mapped page.
            assert_equal(
                mm_state.get_contiguous_mappings_at(addr_d, page_size * 10).unwrap(),
                vec![(map_b, page_size)],
            );
        })
        .await;
    }
4697
4698    #[::fuchsia::test]
4699    async fn test_read_write_crossing_mappings() {
4700        spawn_kernel_and_run(async |locked, current_task| {
4701            let mm = current_task.mm().unwrap();
4702            let ma = current_task.deref();
4703
4704            // Map two contiguous pages at fixed addresses, but backed by distinct mappings.
4705            let page_size = *PAGE_SIZE;
4706            let addr = (mm.base_addr + 10 * page_size).unwrap();
4707            assert_eq!(map_memory(locked, &current_task, addr, page_size), addr);
4708            assert_eq!(
4709                map_memory(locked, &current_task, (addr + page_size).unwrap(), page_size),
4710                (addr + page_size).unwrap()
4711            );
4712            // Mappings get merged since they are baked by the same memory object
4713            assert_eq!(mm.get_mapping_count(), 1);
4714
4715            // Write a pattern crossing our two mappings.
4716            let test_addr = (addr + page_size / 2).unwrap();
4717            let data: Vec<u8> = (0..page_size).map(|i| (i % 256) as u8).collect();
4718            ma.write_memory(test_addr, &data).expect("failed to write test data");
4719
4720            // Read it back.
4721            let data_readback =
4722                ma.read_memory_to_vec(test_addr, data.len()).expect("failed to read test data");
4723            assert_eq!(&data, &data_readback);
4724        })
4725        .await;
4726    }
4727
4728    #[::fuchsia::test]
4729    async fn test_read_write_errors() {
4730        spawn_kernel_and_run(async |locked, current_task| {
4731            let ma = current_task.deref();
4732
4733            let page_size = *PAGE_SIZE;
4734            let addr = map_memory(locked, &current_task, UserAddress::default(), page_size);
4735            let buf = vec![0u8; page_size as usize];
4736
4737            // Verify that accessing data that is only partially mapped is an error.
4738            let partial_addr_before = (addr - page_size / 2).unwrap();
4739            assert_eq!(ma.write_memory(partial_addr_before, &buf), error!(EFAULT));
4740            assert_eq!(ma.read_memory_to_vec(partial_addr_before, buf.len()), error!(EFAULT));
4741            let partial_addr_after = (addr + page_size / 2).unwrap();
4742            assert_eq!(ma.write_memory(partial_addr_after, &buf), error!(EFAULT));
4743            assert_eq!(ma.read_memory_to_vec(partial_addr_after, buf.len()), error!(EFAULT));
4744
4745            // Verify that accessing unmapped memory is an error.
4746            let unmapped_addr = (addr - 10 * page_size).unwrap();
4747            assert_eq!(ma.write_memory(unmapped_addr, &buf), error!(EFAULT));
4748            assert_eq!(ma.read_memory_to_vec(unmapped_addr, buf.len()), error!(EFAULT));
4749
4750            // However, accessing zero bytes in unmapped memory is not an error.
4751            ma.write_memory(unmapped_addr, &[]).expect("failed to write no data");
4752            ma.read_memory_to_vec(unmapped_addr, 0).expect("failed to read no data");
4753        })
4754        .await;
4755    }
4756
4757    #[::fuchsia::test]
4758    async fn test_read_c_string_to_vec_large() {
4759        spawn_kernel_and_run(async |locked, current_task| {
4760            let mm = current_task.mm().unwrap();
4761            let ma = current_task.deref();
4762
4763            let page_size = *PAGE_SIZE;
4764            let max_size = 4 * page_size as usize;
4765            let addr = (mm.base_addr + 10 * page_size).unwrap();
4766
4767            assert_eq!(map_memory(locked, &current_task, addr, max_size as u64), addr);
4768
4769            let mut random_data = vec![0; max_size];
4770            zx::cprng_draw(&mut random_data);
4771            // Remove all NUL bytes.
4772            for i in 0..random_data.len() {
4773                if random_data[i] == 0 {
4774                    random_data[i] = 1;
4775                }
4776            }
4777            random_data[max_size - 1] = 0;
4778
4779            ma.write_memory(addr, &random_data).expect("failed to write test string");
4780            // We should read the same value minus the last byte (NUL char).
4781            assert_eq!(
4782                ma.read_c_string_to_vec(UserCString::new(current_task, addr), max_size).unwrap(),
4783                random_data[..max_size - 1]
4784            );
4785        })
4786        .await;
4787    }
4788
    // Exercises `read_c_string_to_vec`: ENAMETOOLONG for unterminated or over-long
    // strings, success for terminated strings (including one spanning two mmap calls),
    // and EFAULT for a null user pointer.
    #[::fuchsia::test]
    async fn test_read_c_string_to_vec() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mm = current_task.mm().unwrap();
            let ma = current_task.deref();

            let page_size = *PAGE_SIZE;
            let max_size = 2 * page_size as usize;
            let addr = (mm.base_addr + 10 * page_size).unwrap();

            // Map a page at a fixed address and write an unterminated string at the end of it.
            assert_eq!(map_memory(locked, &current_task, addr, page_size), addr);
            let test_str = b"foo!";
            let test_addr =
                addr.checked_add(page_size as usize).unwrap().checked_sub(test_str.len()).unwrap();
            ma.write_memory(test_addr, test_str).expect("failed to write test string");

            // Expect error if the string is not terminated.
            assert_eq!(
                ma.read_c_string_to_vec(UserCString::new(current_task, test_addr), max_size),
                error!(ENAMETOOLONG)
            );

            // Expect success if the string is terminated.
            ma.write_memory((addr + (page_size - 1)).unwrap(), b"\0").expect("failed to write nul");
            assert_eq!(
                ma.read_c_string_to_vec(UserCString::new(current_task, test_addr), max_size)
                    .unwrap(),
                "foo"
            );

            // Expect success if the string spans over two mappings.
            assert_eq!(
                map_memory(locked, &current_task, (addr + page_size).unwrap(), page_size),
                (addr + page_size).unwrap()
            );
            // TODO: Adjacent private anonymous mappings are collapsed. To test this case this test needs to
            // provide a backing for the second mapping.
            // assert_eq!(mm.get_mapping_count(), 2);
            ma.write_memory((addr + (page_size - 1)).unwrap(), b"bar\0")
                .expect("failed to write extra chars");
            assert_eq!(
                ma.read_c_string_to_vec(UserCString::new(current_task, test_addr), max_size)
                    .unwrap(),
                "foobar",
            );

            // Expect error if the string exceeds max limit
            assert_eq!(
                ma.read_c_string_to_vec(UserCString::new(current_task, test_addr), 2),
                error!(ENAMETOOLONG)
            );

            // Expect error if the address is invalid.
            assert_eq!(
                ma.read_c_string_to_vec(UserCString::null(current_task), max_size),
                error!(EFAULT)
            );
        })
        .await;
    }
4850
    // Verifies `read_nul_delimited_c_string_list` parses one and then several
    // NUL-terminated strings written consecutively into a single region, as used
    // for argv/envp-style data.
    #[::fuchsia::test]
    async fn can_read_argv_like_regions() {
        spawn_kernel_and_run(async |locked, current_task| {
            let ma = current_task.deref();

            // Map a page.
            let page_size = *PAGE_SIZE;
            let addr = map_memory_anywhere(locked, &current_task, page_size);
            assert!(!addr.is_null());

            // Write an unterminated string.
            let mut payload = "first".as_bytes().to_vec();
            let mut expected_parses = vec![];
            ma.write_memory(addr, &payload).unwrap();

            // Expect success if the string is terminated.
            expected_parses.push(payload.clone());
            payload.push(0);
            ma.write_memory(addr, &payload).unwrap();
            assert_eq!(
                ma.read_nul_delimited_c_string_list(addr, payload.len()).unwrap(),
                expected_parses,
            );

            // Make sure we can parse multiple strings from the same region.
            let second = b"second";
            payload.extend(second);
            payload.push(0);
            expected_parses.push(second.to_vec());

            let third = b"third";
            payload.extend(third);
            payload.push(0);
            expected_parses.push(third.to_vec());

            ma.write_memory(addr, &payload).unwrap();
            assert_eq!(
                ma.read_nul_delimited_c_string_list(addr, payload.len()).unwrap(),
                expected_parses,
            );
        })
        .await;
    }
4894
4895    #[::fuchsia::test]
4896    async fn truncate_argv_like_regions() {
4897        spawn_kernel_and_run(async |locked, current_task| {
4898            let ma = current_task.deref();
4899
4900            // Map a page.
4901            let page_size = *PAGE_SIZE;
4902            let addr = map_memory_anywhere(locked, &current_task, page_size);
4903            assert!(!addr.is_null());
4904
4905            let payload = b"first\0second\0third\0";
4906            ma.write_memory(addr, payload).unwrap();
4907            assert_eq!(
4908                ma.read_nul_delimited_c_string_list(addr, payload.len() - 3).unwrap(),
4909                vec![b"first".to_vec(), b"second".to_vec(), b"thi".to_vec()],
4910                "Skipping last three bytes of payload should skip last two bytes of 3rd string"
4911            );
4912        })
4913        .await;
4914    }
4915
    // Exercises the buffer-based `read_c_string`: ENAMETOOLONG for unterminated
    // strings or undersized buffers, success for terminated strings (including one
    // crossing a page boundary), and EFAULT for a null user pointer.
    #[::fuchsia::test]
    async fn test_read_c_string() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mm = current_task.mm().unwrap();
            let ma = current_task.deref();

            let page_size = *PAGE_SIZE;
            let buf_cap = 2 * page_size as usize;
            let mut buf = Vec::with_capacity(buf_cap);
            // We can't just use `spare_capacity_mut` because `Vec::with_capacity`
            // returns a `Vec` with _at least_ the requested capacity.
            let buf = &mut buf.spare_capacity_mut()[..buf_cap];
            let addr = (mm.base_addr + 10 * page_size).unwrap();

            // Map a page at a fixed address and write an unterminated string at the end of it..
            assert_eq!(map_memory(locked, &current_task, addr, page_size), addr);
            let test_str = b"foo!";
            let test_addr = (addr + (page_size - test_str.len() as u64)).unwrap();
            ma.write_memory(test_addr, test_str).expect("failed to write test string");

            // Expect error if the string is not terminated.
            assert_eq!(
                ma.read_c_string(UserCString::new(current_task, test_addr), buf),
                error!(ENAMETOOLONG)
            );

            // Expect success if the string is terminated.
            ma.write_memory((addr + (page_size - 1)).unwrap(), b"\0").expect("failed to write nul");
            assert_eq!(
                ma.read_c_string(UserCString::new(current_task, test_addr), buf).unwrap(),
                "foo"
            );

            // Expect success if the string spans over two mappings.
            assert_eq!(
                map_memory(locked, &current_task, (addr + page_size).unwrap(), page_size),
                (addr + page_size).unwrap()
            );
            // TODO: To be multiple mappings we need to provide a file backing for the next page or the
            // mappings will be collapsed.
            //assert_eq!(mm.get_mapping_count(), 2);
            ma.write_memory((addr + (page_size - 1)).unwrap(), b"bar\0")
                .expect("failed to write extra chars");
            assert_eq!(
                ma.read_c_string(UserCString::new(current_task, test_addr), buf).unwrap(),
                "foobar"
            );

            // Expect error if the string does not fit in the provided buffer.
            assert_eq!(
                ma.read_c_string(
                    UserCString::new(current_task, test_addr),
                    &mut [MaybeUninit::uninit(); 2]
                ),
                error!(ENAMETOOLONG)
            );

            // Expect error if the address is invalid.
            assert_eq!(ma.read_c_string(UserCString::null(current_task), buf), error!(EFAULT));
        })
        .await;
    }
4978
    // Checks that `find_next_unused_range` searches downward from the top of the
    // mmap area, skips over existing mappings, honors gaps only when large enough,
    // and returns None when the request cannot fit.
    #[::fuchsia::test]
    async fn test_find_next_unused_range() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mm = current_task.mm().unwrap();

            let mmap_top = mm.state.read().find_next_unused_range(0).unwrap().ptr();
            let page_size = *PAGE_SIZE as usize;
            assert!(mmap_top <= RESTRICTED_ASPACE_HIGHEST_ADDRESS);

            // No mappings - top address minus requested size is available
            assert_eq!(
                mm.state.read().find_next_unused_range(page_size).unwrap(),
                UserAddress::from_ptr(mmap_top - page_size)
            );

            // Fill it.
            let addr = UserAddress::from_ptr(mmap_top - page_size);
            assert_eq!(map_memory(locked, &current_task, addr, *PAGE_SIZE), addr);

            // The next available range is right before the new mapping.
            assert_eq!(
                mm.state.read().find_next_unused_range(page_size).unwrap(),
                UserAddress::from_ptr(addr.ptr() - page_size)
            );

            // Allocate an extra page before a one-page gap.
            let addr2 = UserAddress::from_ptr(addr.ptr() - 2 * page_size);
            assert_eq!(map_memory(locked, &current_task, addr2, *PAGE_SIZE), addr2);

            // Searching for one-page range still gives the same result
            assert_eq!(
                mm.state.read().find_next_unused_range(page_size).unwrap(),
                UserAddress::from_ptr(addr.ptr() - page_size)
            );

            // Searching for a bigger range results in the area before the second mapping
            assert_eq!(
                mm.state.read().find_next_unused_range(2 * page_size).unwrap(),
                UserAddress::from_ptr(addr2.ptr() - 2 * page_size)
            );

            // Searching for more memory than available should fail.
            assert_eq!(mm.state.read().find_next_unused_range(mmap_top), None);
        })
        .await;
    }
5025
    // Checks `count_possible_placements` arithmetic over a ten-page window: the
    // number of valid page-aligned placements for a given size, both in an empty
    // window and after a mapping splits the window into two gaps.
    #[::fuchsia::test]
    async fn test_count_placements() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mm = current_task.mm().unwrap();

            // ten-page range
            let page_size = *PAGE_SIZE as usize;
            let subrange_ten = UserAddress::from_ptr(RESTRICTED_ASPACE_BASE)
                ..UserAddress::from_ptr(RESTRICTED_ASPACE_BASE + 10 * page_size);

            assert_eq!(
                mm.state.read().count_possible_placements(11 * page_size, &subrange_ten),
                Some(0)
            );
            assert_eq!(
                mm.state.read().count_possible_placements(10 * page_size, &subrange_ten),
                Some(1)
            );
            assert_eq!(
                mm.state.read().count_possible_placements(9 * page_size, &subrange_ten),
                Some(2)
            );
            assert_eq!(
                mm.state.read().count_possible_placements(page_size, &subrange_ten),
                Some(10)
            );

            // map 6th page
            let addr = UserAddress::from_ptr(RESTRICTED_ASPACE_BASE + 5 * page_size);
            assert_eq!(map_memory(locked, &current_task, addr, *PAGE_SIZE), addr);

            // The window is now split into a 5-page gap and a 4-page gap.
            assert_eq!(
                mm.state.read().count_possible_placements(10 * page_size, &subrange_ten),
                Some(0)
            );
            assert_eq!(
                mm.state.read().count_possible_placements(5 * page_size, &subrange_ten),
                Some(1)
            );
            assert_eq!(
                mm.state.read().count_possible_placements(4 * page_size, &subrange_ten),
                Some(3)
            );
            assert_eq!(
                mm.state.read().count_possible_placements(page_size, &subrange_ten),
                Some(9)
            );
        })
        .await;
    }
5076
    // Checks that `pick_placement` maps each placement index to the expected
    // address, including jumping over an existing mapping in the middle of the window.
    #[::fuchsia::test]
    async fn test_pick_placement() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mm = current_task.mm().unwrap();

            let page_size = *PAGE_SIZE as usize;
            let subrange_ten = UserAddress::from_ptr(RESTRICTED_ASPACE_BASE)
                ..UserAddress::from_ptr(RESTRICTED_ASPACE_BASE + 10 * page_size);

            // Occupy the 6th page so a 4-page request has exactly three placements.
            let addr = UserAddress::from_ptr(RESTRICTED_ASPACE_BASE + 5 * page_size);
            assert_eq!(map_memory(locked, &current_task, addr, *PAGE_SIZE), addr);
            assert_eq!(
                mm.state.read().count_possible_placements(4 * page_size, &subrange_ten),
                Some(3)
            );

            assert_eq!(
                mm.state.read().pick_placement(4 * page_size, 0, &subrange_ten),
                Some(UserAddress::from_ptr(RESTRICTED_ASPACE_BASE))
            );
            assert_eq!(
                mm.state.read().pick_placement(4 * page_size, 1, &subrange_ten),
                Some(UserAddress::from_ptr(RESTRICTED_ASPACE_BASE + page_size))
            );
            // Index 2 is the only placement after the occupied page.
            assert_eq!(
                mm.state.read().pick_placement(4 * page_size, 2, &subrange_ten),
                Some(UserAddress::from_ptr(RESTRICTED_ASPACE_BASE + 6 * page_size))
            );
        })
        .await;
    }
5108
5109    #[::fuchsia::test]
5110    async fn test_find_random_unused_range() {
5111        spawn_kernel_and_run(async |locked, current_task| {
5112            let mm = current_task.mm().unwrap();
5113
5114            // ten-page range
5115            let page_size = *PAGE_SIZE as usize;
5116            let subrange_ten = UserAddress::from_ptr(RESTRICTED_ASPACE_BASE)
5117                ..UserAddress::from_ptr(RESTRICTED_ASPACE_BASE + 10 * page_size);
5118
5119            for _ in 0..10 {
5120                let addr = mm.state.read().find_random_unused_range(page_size, &subrange_ten);
5121                assert!(addr.is_some());
5122                assert_eq!(
5123                    map_memory(locked, &current_task, addr.unwrap(), *PAGE_SIZE),
5124                    addr.unwrap()
5125                );
5126            }
5127            assert_eq!(mm.state.read().find_random_unused_range(page_size, &subrange_ten), None);
5128        })
5129        .await;
5130    }
5131
    // Verifies that a MAP_GROWSDOWN mapping near the bottom of the address space
    // reserves the region below it: no random placement can be found in the gap
    // between the address-space base and the mapping.
    #[::fuchsia::test]
    async fn test_grows_down_near_aspace_base() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mm = current_task.mm().unwrap();

            let page_count = 10;

            let page_size = *PAGE_SIZE as usize;
            let addr =
                (UserAddress::from_ptr(RESTRICTED_ASPACE_BASE) + page_count * page_size).unwrap();
            assert_eq!(
                map_memory_with_flags(
                    locked,
                    &current_task,
                    addr,
                    page_size as u64,
                    MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN
                ),
                addr
            );

            // The ten pages below the mapping must be treated as unavailable.
            let subrange_ten = UserAddress::from_ptr(RESTRICTED_ASPACE_BASE)..addr;
            assert_eq!(mm.state.read().find_random_unused_range(page_size, &subrange_ten), None);
        })
        .await;
    }
5158
    #[::fuchsia::test]
    async fn test_unmap_returned_mappings() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mm = current_task.mm().unwrap();

            // One two-page mapping; we will unmap only its first page.
            let addr = map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE * 2);

            let mut released_mappings = ReleasedMappings::default();
            let mut mm_state = mm.state.write();
            let unmap_result =
                mm_state.unmap(&mm, addr, *PAGE_SIZE as usize, &mut released_mappings);
            assert!(unmap_result.is_ok());
            // Truncating a single mapping hands back exactly one released mapping.
            assert_eq!(released_mappings.len(), 1);
            // `finalize` consumes the state write guard; NOTE(review): presumably
            // so the released mappings are dropped without the lock held — confirm
            // against `ReleasedMappings`.
            released_mappings.finalize(mm_state);
        })
        .await;
    }
5176
    #[::fuchsia::test]
    async fn test_unmap_returns_multiple_mappings() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mm = current_task.mm().unwrap();

            // Build two one-page mappings with an unmapped hole between them:
            // [page 0: mapped][page 1: hole][page 2: mapped].
            let addr = mm.state.read().find_next_unused_range(3 * *PAGE_SIZE as usize).unwrap();
            let addr = map_memory(locked, &current_task, addr, *PAGE_SIZE);
            let _ = map_memory(locked, &current_task, (addr + 2 * *PAGE_SIZE).unwrap(), *PAGE_SIZE);

            let mut released_mappings = ReleasedMappings::default();
            let mut mm_state = mm.state.write();
            // Unmapping the whole three-page span should release both distinct
            // mappings, skipping the hole.
            let unmap_result =
                mm_state.unmap(&mm, addr, (*PAGE_SIZE * 3) as usize, &mut released_mappings);
            assert!(unmap_result.is_ok());
            assert_eq!(released_mappings.len(), 2);
            // `finalize` consumes the state write guard; NOTE(review): presumably
            // so the released mappings are dropped without the lock held.
            released_mappings.finalize(mm_state);
        })
        .await;
    }
5196
    /// Maps two pages in separate mappings next to each other, then unmaps the first page.
    /// The second page should not be modified.
    #[::fuchsia::test]
    async fn test_map_two_unmap_one() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mm = current_task.mm().unwrap();

            // reserve memory for both pages
            let addr_reserve =
                map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE * 2);
            // First page: MAP_FIXED over the start of the reservation.
            let addr1 = do_mmap(
                locked,
                &current_task,
                addr_reserve,
                *PAGE_SIZE as usize,
                PROT_READ, // Map read-only to avoid merging of the two mappings
                MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED,
                FdNumber::from_raw(-1),
                0,
            )
            .expect("failed to mmap");
            // Second page: MAP_FIXED over the second half of the reservation.
            let addr2 = map_memory_with_flags(
                locked,
                &current_task,
                (addr_reserve + *PAGE_SIZE).unwrap(),
                *PAGE_SIZE,
                MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED,
            );
            // Both mappings should be exactly one page, back to back.
            let state = mm.state.read();
            let (range1, _) = state.mappings.get(addr1).expect("mapping");
            assert_eq!(range1.start, addr1);
            assert_eq!(range1.end, (addr1 + *PAGE_SIZE).unwrap());
            let (range2, mapping2) = state.mappings.get(addr2).expect("mapping");
            assert_eq!(range2.start, addr2);
            assert_eq!(range2.end, (addr2 + *PAGE_SIZE).unwrap());
            // Remember the memory object backing the second mapping so we can
            // verify after the unmap that it is still the same object.
            let original_memory2 = {
                match state.get_mapping_backing(mapping2) {
                    MappingBacking::Memory(backing) => {
                        assert_eq!(backing.memory().get_size(), *PAGE_SIZE);
                        backing.memory().clone()
                    }
                    MappingBacking::PrivateAnonymous => {
                        panic!("Unexpected private anonymous mapping")
                    }
                }
            };
            // Release the read guard before `unmap`, which takes the lock itself.
            std::mem::drop(state);

            assert_eq!(mm.unmap(addr1, *PAGE_SIZE as usize), Ok(()));

            let state = mm.state.read();

            // The first page should be unmapped.
            assert!(state.mappings.get(addr1).is_none());

            // The second page should remain unchanged.
            let (range2, mapping2) = state.mappings.get(addr2).expect("second page");
            assert_eq!(range2.start, addr2);
            assert_eq!(range2.end, (addr2 + *PAGE_SIZE).unwrap());
            match state.get_mapping_backing(mapping2) {
                MappingBacking::Memory(backing) => {
                    assert_eq!(backing.memory().get_size(), *PAGE_SIZE);
                    // Matching koids mean the mapping is still backed by the
                    // exact same memory object as before the unmap.
                    assert_eq!(original_memory2.get_koid(), backing.memory().get_koid());
                }
                MappingBacking::PrivateAnonymous => panic!("Unexpected private anonymous mapping"),
            }
        })
        .await;
    }
5266
5267    #[::fuchsia::test]
5268    async fn test_read_write_objects() {
5269        spawn_kernel_and_run(async |locked, current_task| {
5270            let ma = current_task.deref();
5271            let addr = map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);
5272            let items_ref = UserRef::<i32>::new(addr);
5273
5274            let items_written = vec![0, 2, 3, 7, 1];
5275            ma.write_objects(items_ref, &items_written).expect("Failed to write object array.");
5276
5277            let items_read = ma
5278                .read_objects_to_vec(items_ref, items_written.len())
5279                .expect("Failed to read object array.");
5280
5281            assert_eq!(items_written, items_read);
5282        })
5283        .await;
5284    }
5285
5286    #[::fuchsia::test]
5287    async fn test_read_write_objects_null() {
5288        spawn_kernel_and_run(async |_, current_task| {
5289            let ma = current_task.deref();
5290            let items_ref = UserRef::<i32>::new(UserAddress::default());
5291
5292            let items_written = vec![];
5293            ma.write_objects(items_ref, &items_written)
5294                .expect("Failed to write empty object array.");
5295
5296            let items_read = ma
5297                .read_objects_to_vec(items_ref, items_written.len())
5298                .expect("Failed to read empty object array.");
5299
5300            assert_eq!(items_written, items_read);
5301        })
5302        .await;
5303    }
5304
    #[::fuchsia::test]
    async fn test_read_object_partial() {
        // A 16-byte struct, read below both in full and in partial chunks.
        #[derive(Debug, Default, Copy, Clone, KnownLayout, FromBytes, Immutable, PartialEq)]
        struct Items {
            val: [i32; 4],
        }

        spawn_kernel_and_run(async |locked, current_task| {
            let ma = current_task.deref();
            let addr = map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);
            let items_array_ref = UserRef::<i32>::new(addr);

            // Populate some values.
            let items_written = vec![75, 23, 51, 98];
            ma.write_objects(items_array_ref, &items_written)
                .expect("Failed to write object array.");

            // Full read of all 4 values.
            let items_ref = UserRef::<Items>::new(addr);
            let items_read = ma
                .read_object_partial(items_ref, std::mem::size_of::<Items>())
                .expect("Failed to read object");
            assert_eq!(items_written, items_read.val);

            // Partial read of the first two. The unread tail comes back zeroed.
            let items_read = ma.read_object_partial(items_ref, 8).expect("Failed to read object");
            assert_eq!(vec![75, 23, 0, 0], items_read.val);

            // The API currently allows reading 0 bytes (this could be re-evaluated) so test that does
            // the right thing.
            let items_read = ma.read_object_partial(items_ref, 0).expect("Failed to read object");
            assert_eq!(vec![0, 0, 0, 0], items_read.val);

            // Size bigger than the object.
            assert_eq!(
                ma.read_object_partial(items_ref, std::mem::size_of::<Items>() + 8),
                error!(EINVAL)
            );

            // Bad pointer.
            assert_eq!(
                ma.read_object_partial(UserRef::<Items>::new(UserAddress::from(1)), 16),
                error!(EFAULT)
            );
        })
        .await;
    }
5352
    #[::fuchsia::test]
    async fn test_partial_read() {
        spawn_kernel_and_run(async |locked, current_task| {
            let mm = current_task.mm().unwrap();
            let ma = current_task.deref();

            // Two adjacent one-page mappings, both initially writable.
            let addr = mm.state.read().find_next_unused_range(2 * *PAGE_SIZE as usize).unwrap();
            let addr = map_memory(locked, &current_task, addr, *PAGE_SIZE);
            let second_map =
                map_memory(locked, &current_task, (addr + *PAGE_SIZE).unwrap(), *PAGE_SIZE);

            let bytes = vec![0xf; (*PAGE_SIZE * 2) as usize];
            assert!(ma.write_memory(addr, &bytes).is_ok());
            // Revoke all access to the second page.
            let mut state = mm.state.write();
            let mut released_mappings = ReleasedMappings::default();
            state
                .protect(
                    ma,
                    second_map,
                    *PAGE_SIZE as usize,
                    ProtectionFlags::empty(),
                    &mut released_mappings,
                )
                .unwrap();
            // `finalize` consumes the write guard, releasing the state lock.
            released_mappings.finalize(state);
            // A partial read spanning both pages succeeds but stops at the
            // now-inaccessible second page: exactly one page is returned.
            assert_eq!(
                ma.read_memory_partial_to_vec(addr, bytes.len()).unwrap().len(),
                *PAGE_SIZE as usize,
            );
        })
        .await;
    }
5385
5386    fn map_memory_growsdown<L>(
5387        locked: &mut Locked<L>,
5388        current_task: &CurrentTask,
5389        length: u64,
5390    ) -> UserAddress
5391    where
5392        L: LockEqualOrBefore<FileOpsCore>,
5393    {
5394        map_memory_with_flags(
5395            locked,
5396            current_task,
5397            UserAddress::default(),
5398            length,
5399            MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN,
5400        )
5401    }
5402
5403    #[::fuchsia::test]
5404    async fn test_grow_mapping_empty_mm() {
5405        spawn_kernel_and_run(async |_, current_task| {
5406            let mm = current_task.mm().unwrap();
5407
5408            let addr = UserAddress::from(0x100000);
5409
5410            assert_matches!(mm.extend_growsdown_mapping_to_address(addr, false), Ok(false));
5411        })
5412        .await;
5413    }
5414
5415    #[::fuchsia::test]
5416    async fn test_grow_inside_mapping() {
5417        spawn_kernel_and_run(async |locked, current_task| {
5418            let mm = current_task.mm().unwrap();
5419
5420            let addr = map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);
5421
5422            assert_matches!(mm.extend_growsdown_mapping_to_address(addr, false), Ok(false));
5423        })
5424        .await;
5425    }
5426
5427    #[::fuchsia::test]
5428    async fn test_grow_write_fault_inside_read_only_mapping() {
5429        spawn_kernel_and_run(async |locked, current_task| {
5430            let mm = current_task.mm().unwrap();
5431
5432            let addr = do_mmap(
5433                locked,
5434                &current_task,
5435                UserAddress::default(),
5436                *PAGE_SIZE as usize,
5437                PROT_READ,
5438                MAP_ANONYMOUS | MAP_PRIVATE,
5439                FdNumber::from_raw(-1),
5440                0,
5441            )
5442            .expect("Could not map memory");
5443
5444            assert_matches!(mm.extend_growsdown_mapping_to_address(addr, false), Ok(false));
5445            assert_matches!(mm.extend_growsdown_mapping_to_address(addr, true), Ok(false));
5446        })
5447        .await;
5448    }
5449
5450    #[::fuchsia::test]
5451    async fn test_grow_fault_inside_prot_none_mapping() {
5452        spawn_kernel_and_run(async |locked, current_task| {
5453            let mm = current_task.mm().unwrap();
5454
5455            let addr = do_mmap(
5456                locked,
5457                &current_task,
5458                UserAddress::default(),
5459                *PAGE_SIZE as usize,
5460                PROT_NONE,
5461                MAP_ANONYMOUS | MAP_PRIVATE,
5462                FdNumber::from_raw(-1),
5463                0,
5464            )
5465            .expect("Could not map memory");
5466
5467            assert_matches!(mm.extend_growsdown_mapping_to_address(addr, false), Ok(false));
5468            assert_matches!(mm.extend_growsdown_mapping_to_address(addr, true), Ok(false));
5469        })
5470        .await;
5471    }
5472
5473    #[::fuchsia::test]
5474    async fn test_grow_below_mapping() {
5475        spawn_kernel_and_run(async |locked, current_task| {
5476            let mm = current_task.mm().unwrap();
5477
5478            let addr = map_memory_growsdown(locked, &current_task, *PAGE_SIZE) - *PAGE_SIZE;
5479
5480            assert_matches!(mm.extend_growsdown_mapping_to_address(addr.unwrap(), false), Ok(true));
5481        })
5482        .await;
5483    }
5484
5485    #[::fuchsia::test]
5486    async fn test_grow_above_mapping() {
5487        spawn_kernel_and_run(async |locked, current_task| {
5488            let mm = current_task.mm().unwrap();
5489
5490            let addr = map_memory_growsdown(locked, &current_task, *PAGE_SIZE) + *PAGE_SIZE;
5491
5492            assert_matches!(
5493                mm.extend_growsdown_mapping_to_address(addr.unwrap(), false),
5494                Ok(false)
5495            );
5496        })
5497        .await;
5498    }
5499
5500    #[::fuchsia::test]
5501    async fn test_grow_write_fault_below_read_only_mapping() {
5502        spawn_kernel_and_run(async |locked, current_task| {
5503            let mm = current_task.mm().unwrap();
5504
5505            let mapped_addr = map_memory_growsdown(locked, &current_task, *PAGE_SIZE);
5506
5507            mm.protect(&current_task, mapped_addr, *PAGE_SIZE as usize, ProtectionFlags::READ)
5508                .unwrap();
5509
5510            assert_matches!(
5511                mm.extend_growsdown_mapping_to_address((mapped_addr - *PAGE_SIZE).unwrap(), true),
5512                Ok(false)
5513            );
5514
5515            assert_eq!(mm.get_mapping_count(), 1);
5516        })
5517        .await;
5518    }
5519
    #[::fuchsia::test]
    async fn test_snapshot_paged_memory() {
        use zx::sys::zx_page_request_command_t::ZX_PAGER_VMO_READ;

        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let mm = current_task.mm().unwrap();
            let ma = current_task.deref();

            // Set up a pager-backed VMO so the mapping below is demand-paged
            // rather than plain anonymous memory.
            let port = Arc::new(zx::Port::create());
            let port_clone = port.clone();
            let pager =
                Arc::new(zx::Pager::create(zx::PagerOptions::empty()).expect("create failed"));
            let pager_clone = pager.clone();

            const VMO_SIZE: u64 = 128 * 1024;
            let vmo = Arc::new(
                pager
                    .create_vmo(zx::VmoOptions::RESIZABLE, &port, 1, VMO_SIZE)
                    .expect("create_vmo failed"),
            );
            let vmo_clone = vmo.clone();

            // Create a thread to service the port where we will receive pager requests.
            // It satisfies every read request with pages from a freshly created
            // (zero-filled) VMO and exits when a user packet arrives on the port
            // (queued at the end of the test).
            let thread = std::thread::spawn(move || {
                loop {
                    let packet =
                        port_clone.wait(zx::MonotonicInstant::INFINITE).expect("wait failed");
                    match packet.contents() {
                        zx::PacketContents::Pager(contents) => {
                            if contents.command() == ZX_PAGER_VMO_READ {
                                let range = contents.range();
                                let source_vmo = zx::Vmo::create(range.end - range.start)
                                    .expect("create failed");
                                pager_clone
                                    .supply_pages(&vmo_clone, range, &source_vmo, 0)
                                    .expect("supply_pages failed");
                            }
                        }
                        zx::PacketContents::User(_) => break,
                        _ => {}
                    }
                }
            });

            // Map a copy-on-write child of the pager-backed VMO.
            let child_vmo = vmo
                .create_child(zx::VmoChildOptions::SNAPSHOT_AT_LEAST_ON_WRITE, 0, VMO_SIZE)
                .unwrap();

            // Write something to the source VMO.
            vmo.write(b"foo", 0).expect("write failed");

            let prot_flags = ProtectionFlags::READ | ProtectionFlags::WRITE;
            let addr = mm
                .map_memory(
                    DesiredAddress::Any,
                    Arc::new(MemoryObject::from(child_vmo)),
                    0,
                    VMO_SIZE as usize,
                    prot_flags,
                    Access::rwx(),
                    MappingOptions::empty(),
                    MappingName::None,
                )
                .expect("map failed");

            // Snapshot this address space into a freshly created task.
            let target = create_task(locked, &kernel, "another-task");
            mm.snapshot_to(locked, &target.mm().unwrap()).expect("snapshot_to failed");

            // Make sure it has what we wrote.
            let buf = target.read_memory_to_vec(addr, 3).expect("read_memory failed");
            assert_eq!(buf, b"foo");

            // Write something to both source and target and make sure they are forked:
            // a write on one side must not be visible on the other.
            ma.write_memory(addr, b"bar").expect("write_memory failed");

            let buf = target.read_memory_to_vec(addr, 3).expect("read_memory failed");
            assert_eq!(buf, b"foo");

            target.write_memory(addr, b"baz").expect("write_memory failed");
            let buf = ma.read_memory_to_vec(addr, 3).expect("read_memory failed");
            assert_eq!(buf, b"bar");

            let buf = target.read_memory_to_vec(addr, 3).expect("read_memory failed");
            assert_eq!(buf, b"baz");

            // Queue a user packet to shut the pager thread down, then join it.
            port.queue(&zx::Packet::from_user_packet(0, 0, zx::UserPacket::from_u8_array([0; 32])))
                .unwrap();
            thread.join().unwrap();
        })
        .await;
    }
5612
5613    #[::fuchsia::test]
5614    async fn test_set_vma_name() {
5615        spawn_kernel_and_run(async |locked, mut current_task| {
5616            let name_addr = map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);
5617
5618            let vma_name = "vma name";
5619            current_task.write_memory(name_addr, vma_name.as_bytes()).unwrap();
5620
5621            let mapping_addr =
5622                map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);
5623
5624            sys_prctl(
5625                locked,
5626                &mut current_task,
5627                PR_SET_VMA,
5628                PR_SET_VMA_ANON_NAME as u64,
5629                mapping_addr.ptr() as u64,
5630                *PAGE_SIZE,
5631                name_addr.ptr() as u64,
5632            )
5633            .unwrap();
5634
5635            assert_eq!(
5636                *current_task.mm().unwrap().get_mapping_name(mapping_addr).unwrap().unwrap(),
5637                vma_name
5638            );
5639        })
5640        .await;
5641    }
5642
    #[::fuchsia::test]
    async fn test_set_vma_name_adjacent_mappings() {
        spawn_kernel_and_run(async |locked, mut current_task| {
            // Stage the name string (nul-terminated) in user memory for prctl.
            let name_addr = map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);
            current_task
                .write_memory(name_addr, CString::new("foo").unwrap().as_bytes_with_nul())
                .unwrap();

            // Map two pages, then MAP_FIXED a fresh one-page mapping over the
            // second page, yielding two adjacent but distinct mappings.
            let first_mapping_addr =
                map_memory(locked, &current_task, UserAddress::default(), 2 * *PAGE_SIZE);
            let second_mapping_addr = map_memory_with_flags(
                locked,
                &current_task,
                (first_mapping_addr + *PAGE_SIZE).unwrap(),
                *PAGE_SIZE,
                MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
            );

            assert_eq!((first_mapping_addr + *PAGE_SIZE).unwrap(), second_mapping_addr);

            // Name the full two-page range in a single prctl call.
            sys_prctl(
                locked,
                &mut current_task,
                PR_SET_VMA,
                PR_SET_VMA_ANON_NAME as u64,
                first_mapping_addr.ptr() as u64,
                2 * *PAGE_SIZE,
                name_addr.ptr() as u64,
            )
            .unwrap();

            {
                let mm = current_task.mm().unwrap();
                let state = mm.state.read();

                // The name should apply to both mappings.
                let (_, mapping) = state.mappings.get(first_mapping_addr).unwrap();
                assert_eq!(mapping.name(), MappingName::Vma("foo".into()));

                let (_, mapping) = state.mappings.get(second_mapping_addr).unwrap();
                assert_eq!(mapping.name(), MappingName::Vma("foo".into()));
            }
        })
        .await;
    }
5688
    #[::fuchsia::test]
    async fn test_set_vma_name_beyond_end() {
        spawn_kernel_and_run(async |locked, mut current_task| {
            // Stage the name string (nul-terminated) in user memory for prctl.
            let name_addr = map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);
            current_task
                .write_memory(name_addr, CString::new("foo").unwrap().as_bytes_with_nul())
                .unwrap();

            // Two-page mapping with the second page punched out, so a two-page
            // naming range runs off the end into unmapped memory.
            let mapping_addr =
                map_memory(locked, &current_task, UserAddress::default(), 2 * *PAGE_SIZE);

            let second_page = (mapping_addr + *PAGE_SIZE).unwrap();
            current_task.mm().unwrap().unmap(second_page, *PAGE_SIZE as usize).unwrap();

            // This should fail with ENOMEM since it extends past the end of the mapping into unmapped memory.
            assert_eq!(
                sys_prctl(
                    locked,
                    &mut current_task,
                    PR_SET_VMA,
                    PR_SET_VMA_ANON_NAME as u64,
                    mapping_addr.ptr() as u64,
                    2 * *PAGE_SIZE,
                    name_addr.ptr() as u64,
                ),
                error!(ENOMEM)
            );

            // Despite returning an error, the prctl should still assign a name to the region at the start of the region.
            {
                let mm = current_task.mm().unwrap();
                let state = mm.state.read();

                let (_, mapping) = state.mappings.get(mapping_addr).unwrap();
                assert_eq!(mapping.name(), MappingName::Vma("foo".into()));
            }
        })
        .await;
    }
5728
    #[::fuchsia::test]
    async fn test_set_vma_name_before_start() {
        spawn_kernel_and_run(async |locked, mut current_task| {
            // Stage the name string (nul-terminated) in user memory for prctl.
            let name_addr = map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);
            current_task
                .write_memory(name_addr, CString::new("foo").unwrap().as_bytes_with_nul())
                .unwrap();

            // Two-page mapping with the *first* page punched out, so the naming
            // range starts in unmapped memory but covers the still-mapped
            // second page.
            let mapping_addr =
                map_memory(locked, &current_task, UserAddress::default(), 2 * *PAGE_SIZE);

            let second_page = (mapping_addr + *PAGE_SIZE).unwrap();
            current_task.mm().unwrap().unmap(mapping_addr, *PAGE_SIZE as usize).unwrap();

            // This should fail with ENOMEM since the start of the range is in unmapped memory.
            assert_eq!(
                sys_prctl(
                    locked,
                    &mut current_task,
                    PR_SET_VMA,
                    PR_SET_VMA_ANON_NAME as u64,
                    mapping_addr.ptr() as u64,
                    2 * *PAGE_SIZE,
                    name_addr.ptr() as u64,
                ),
                error!(ENOMEM)
            );

            // Unlike a range which starts within a mapping and extends past the end, this should not assign
            // a name to any mappings.
            {
                let mm = current_task.mm().unwrap();
                let state = mm.state.read();

                let (_, mapping) = state.mappings.get(second_page).unwrap();
                assert_eq!(mapping.name(), MappingName::None);
            }
        })
        .await;
    }
5769
    #[::fuchsia::test]
    async fn test_set_vma_name_partial() {
        spawn_kernel_and_run(async |locked, mut current_task| {
            // Stage the name string (nul-terminated) in user memory for prctl.
            let name_addr = map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);
            current_task
                .write_memory(name_addr, CString::new("foo").unwrap().as_bytes_with_nul())
                .unwrap();

            // A three-page mapping; only the middle page will be named.
            let mapping_addr =
                map_memory(locked, &current_task, UserAddress::default(), 3 * *PAGE_SIZE);

            assert_eq!(
                sys_prctl(
                    locked,
                    &mut current_task,
                    PR_SET_VMA,
                    PR_SET_VMA_ANON_NAME as u64,
                    (mapping_addr + *PAGE_SIZE).unwrap().ptr() as u64,
                    *PAGE_SIZE,
                    name_addr.ptr() as u64,
                ),
                Ok(starnix_syscalls::SUCCESS)
            );

            // This should split the mapping into 3 pieces with the second piece having the name "foo"
            {
                let mm = current_task.mm().unwrap();
                let state = mm.state.read();

                let (_, mapping) = state.mappings.get(mapping_addr).unwrap();
                assert_eq!(mapping.name(), MappingName::None);

                let (_, mapping) =
                    state.mappings.get((mapping_addr + *PAGE_SIZE).unwrap()).unwrap();
                assert_eq!(mapping.name(), MappingName::Vma("foo".into()));

                let (_, mapping) =
                    state.mappings.get((mapping_addr + (2 * *PAGE_SIZE)).unwrap()).unwrap();
                assert_eq!(mapping.name(), MappingName::None);
            }
        })
        .await;
    }
5813
    #[::fuchsia::test]
    async fn test_preserve_name_snapshot() {
        spawn_kernel_and_run(async |locked, mut current_task| {
            let kernel = current_task.kernel().clone();
            // Stage the name string (nul-terminated) in user memory for prctl.
            let name_addr = map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);
            current_task
                .write_memory(name_addr, CString::new("foo").unwrap().as_bytes_with_nul())
                .unwrap();

            // Name a one-page mapping in the source address space.
            let mapping_addr =
                map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);

            assert_eq!(
                sys_prctl(
                    locked,
                    &mut current_task,
                    PR_SET_VMA,
                    PR_SET_VMA_ANON_NAME as u64,
                    mapping_addr.ptr() as u64,
                    *PAGE_SIZE,
                    name_addr.ptr() as u64,
                ),
                Ok(starnix_syscalls::SUCCESS)
            );

            // Snapshot into a fresh task's memory manager.
            let target = create_task(locked, &kernel, "another-task");
            current_task
                .mm()
                .unwrap()
                .snapshot_to(locked, &target.mm().unwrap())
                .expect("snapshot_to failed");

            // The VMA name must survive the snapshot into the target.
            {
                let mm = target.mm().unwrap();
                let state = mm.state.read();

                let (_, mapping) = state.mappings.get(mapping_addr).unwrap();
                assert_eq!(mapping.name(), MappingName::Vma("foo".into()));
            }
        })
        .await;
    }
5856}