Skip to main content

starnix_core/vfs/
syscalls.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mm::{IOVecPtr, MemoryAccessor, MemoryAccessorExt, PAGE_SIZE};
6use crate::security;
7use crate::syscalls::time::{ITimerSpecPtr, TimeSpecPtr, TimeValPtr};
8use crate::task::{CurrentTask, EventHandler, ProcessEntryRef, ReadyItem, ReadyItemKey, Waiter};
9use crate::time::{Timeline, TimerWakeup};
10use crate::vfs::aio::AioContext;
11use crate::vfs::buffers::{UserBuffersInputBuffer, UserBuffersOutputBuffer};
12use crate::vfs::eventfd::{EventFdType, new_eventfd};
13use crate::vfs::fs_args::MountParams;
14use crate::vfs::inotify::InotifyFileObject;
15use crate::vfs::pidfd::new_pidfd;
16use crate::vfs::pipe::{PipeFileObject, new_pipe};
17use crate::vfs::timer::TimerFile;
18use crate::vfs::{
19    CheckAccessReason, DirentSink64, EpollFileObject, FallocMode, FdFlags, FdNumber,
20    FileAsyncOwner, FileHandle, FileSystemOptions, FlockOperation, FsStr, FsString, LookupContext,
21    Mount, NamespaceNode, PathWithReachability, RecordLockCommand, RenameFlags, SeekTarget,
22    StatxFlags, SymlinkMode, SymlinkTarget, TargetFdNumber, TimeUpdateType, UnlinkKind,
23    ValueOrSize, WdNumber, WhatToMount, XattrOp, checked_add_offset_and_length, new_memfd,
24    new_zombie_pidfd, splice,
25};
26use starnix_logging::{log_trace, track_stub};
27use starnix_sync::{
28    EventHandlerReadyQueueLock, FileOpsCore, LockDepMutex, LockEqualOrBefore, Locked, Unlocked,
29};
30use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult};
31use starnix_types::time::{
32    duration_from_poll_timeout, duration_from_timespec, time_from_timespec, timespec_from_duration,
33};
34use starnix_types::user_buffer::UserBuffer;
35use starnix_uapi::auth::{
36    CAP_BLOCK_SUSPEND, CAP_DAC_READ_SEARCH, CAP_LEASE, CAP_SYS_ADMIN, CAP_WAKE_ALARM, Capabilities,
37    Credentials, PTRACE_MODE_ATTACH_REALCREDS,
38};
39use starnix_uapi::device_id::DeviceId;
40use starnix_uapi::errors::{
41    EFAULT, EINTR, ENAMETOOLONG, ENOTSUP, ETIMEDOUT, Errno, ErrnoResultExt,
42};
43use starnix_uapi::file_lease::FileLeaseType;
44use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
45use starnix_uapi::inotify_mask::InotifyMask;
46use starnix_uapi::mount_flags::MountFlags;
47use starnix_uapi::open_flags::OpenFlags;
48use starnix_uapi::personality::PersonalityFlags;
49use starnix_uapi::resource_limits::Resource;
50use starnix_uapi::seal_flags::SealFlags;
51use starnix_uapi::signals::SigSet;
52use starnix_uapi::unmount_flags::UnmountFlags;
53use starnix_uapi::user_address::{MultiArchUserRef, UserAddress, UserCString, UserRef};
54use starnix_uapi::user_value::UserValue;
55use starnix_uapi::vfs::{EpollEvent, FdEvents, ResolveFlags};
56use starnix_uapi::{
57    __kernel_fd_set, AT_EACCESS, AT_EMPTY_PATH, AT_NO_AUTOMOUNT, AT_REMOVEDIR, AT_SYMLINK_FOLLOW,
58    AT_SYMLINK_NOFOLLOW, CLOCK_BOOTTIME, CLOCK_BOOTTIME_ALARM, CLOCK_MONOTONIC, CLOCK_REALTIME,
59    CLOCK_REALTIME_ALARM, CLOSE_RANGE_CLOEXEC, CLOSE_RANGE_UNSHARE, EFD_CLOEXEC, EFD_NONBLOCK,
60    EFD_SEMAPHORE, EPOLL_CLOEXEC, EPOLL_CTL_ADD, EPOLL_CTL_DEL, EPOLL_CTL_MOD, F_ADD_SEALS,
61    F_DUPFD, F_DUPFD_CLOEXEC, F_GET_SEALS, F_GETFD, F_GETFL, F_GETLEASE, F_GETLK, F_GETLK64,
62    F_GETOWN, F_GETOWN_EX, F_OFD_GETLK, F_OFD_SETLK, F_OFD_SETLKW, F_OWNER_PGRP, F_OWNER_PID,
63    F_OWNER_TID, F_SETFD, F_SETFL, F_SETLEASE, F_SETLK, F_SETLK64, F_SETLKW, F_SETLKW64, F_SETOWN,
64    F_SETOWN_EX, F_SETSIG, FIOCLEX, FIONCLEX, IN_CLOEXEC, IN_NONBLOCK, MFD_ALLOW_SEALING,
65    MFD_CLOEXEC, MFD_EXEC, MFD_HUGE_MASK, MFD_HUGE_SHIFT, MFD_HUGETLB, MFD_NOEXEC_SEAL, NAME_MAX,
66    O_CLOEXEC, O_CREAT, O_NOFOLLOW, O_PATH, O_TMPFILE, PIDFD_NONBLOCK, POLLERR, POLLHUP, POLLIN,
67    POLLOUT, POLLPRI, POLLRDBAND, POLLRDNORM, POLLWRBAND, POLLWRNORM, POSIX_FADV_DONTNEED,
68    POSIX_FADV_NOREUSE, POSIX_FADV_NORMAL, POSIX_FADV_RANDOM, POSIX_FADV_SEQUENTIAL,
69    POSIX_FADV_WILLNEED, RWF_SUPPORTED, TFD_CLOEXEC, TFD_NONBLOCK, TFD_TIMER_ABSTIME,
70    TFD_TIMER_CANCEL_ON_SET, XATTR_CREATE, XATTR_NAME_MAX, XATTR_REPLACE, aio_context_t, errno,
71    error, f_owner_ex, io_event, iocb, off_t, pid_t, pollfd, pselect6_sigmask, sigset_t, statx,
72    timespec, uapi, uid_t,
73};
74use std::cmp::Ordering;
75use std::collections::VecDeque;
76use std::marker::PhantomData;
77use std::sync::Arc;
78use std::usize;
79use zerocopy::{Immutable, IntoBytes};
80
// Layout checks for the uapi structs this file exchanges with userspace.
//
// NOTE(review): judging by the macro's name, each entry asserts that the listed fields of the
// struct are laid out identically on every supported architecture, which would justify using the
// same type for both halves of a `MultiArchUserRef` (as `IocbPtr` does) — confirm against the
// definition of `check_arch_independent_layout!`.
81uapi::check_arch_independent_layout! {
82    pollfd {
83        fd,
84        events,
85        revents,
86    }
87
88    io_event {
89        data,
90        obj,
91        res,
92        res2,
93    }
94
95    iocb {
96        aio_data,
97        aio_key,
98        aio_rw_flags,
99        aio_lio_opcode,
100        aio_reqprio,
101        aio_fildes,
102        aio_buf,
103        aio_nbytes,
104        aio_offset,
105        aio_reserved2,
106        aio_flags,
107        aio_resfd,
108    }
109
110    statx_timestamp {
111        tv_sec,
112        tv_nsec,
113    }
114
115    statx {
116        stx_mask,
117        stx_blksize,
118        stx_attributes,
119        stx_nlink,
120        stx_uid,
121        stx_gid,
122        stx_mode,
123        stx_ino,
124        stx_size,
125        stx_blocks,
126        stx_attributes_mask,
127        stx_atime,
128        stx_btime,
129        stx_ctime,
130        stx_mtime,
131        stx_rdev_major,
132        stx_rdev_minor,
133        stx_dev_major,
134        stx_dev_minor,
135        stx_mnt_id,
136        stx_dio_mem_align,
137        stx_dio_offset_align,
138        stx_subvol,
139        stx_atomic_write_unit_min,
140        stx_atomic_write_unit_max,
141        stx_atomic_write_segments_max,
142    }
143
144    io_sqring_offsets {
145        head,
146        tail,
147        ring_mask,
148        ring_entries,
149        flags,
150        dropped,
151        array,
152        resv1,
153        user_addr,
154    }
155
156    io_cqring_offsets {
157        head,
158        tail,
159        ring_mask,
160        ring_entries,
161        overflow,
162        cqes,
163        flags,
164        resv1,
165        user_addr,
166    }
167
168    io_uring_params {
169        sq_entries,
170        cq_entries,
171        flags,
172        sq_thread_cpu,
173        sq_thread_idle,
174        features,
175        wq_fd,
176        resv,
177        sq_off,
178        cq_off,
179    }
180
181    io_uring_rsrc_update {
182        offset,
183        resv,
184        data,
185    }
186
187    io_uring_buf_reg {
188        ring_addr,
189        ring_entries,
190        bgid,
191        flags,
192        resv,
193    }
194}
195
196// Constants from bionic/libc/include/sys/stat.h
// NOTE(review): presumably the special `tv_nsec` values accepted by `utimensat(2)` ("set to the
// current time" / "leave unchanged"); their users are outside this view — confirm.
197const UTIME_NOW: i64 = 0x3fffffff;
198const UTIME_OMIT: i64 = 0x3ffffffe;
199
// User-space pointer to an `off_t`, with separate 64-bit and arch32 pointee types.
200pub type OffsetPtr = MultiArchUserRef<uapi::off_t, uapi::arch32::off_t>;
// User-space pointer to an `iocb`; the same pointee type is used for both architectures.
201pub type IocbPtr = MultiArchUserRef<iocb, iocb>;
// User-space pointer to an `IocbPtr` (i.e. a pointer to a pointer to an `iocb`).
202pub type IocbPtrPtr = MultiArchUserRef<IocbPtr, IocbPtr>;
203
204pub fn sys_read(
205    locked: &mut Locked<Unlocked>,
206    current_task: &CurrentTask,
207    fd: FdNumber,
208    address: UserAddress,
209    length: usize,
210) -> Result<usize, Errno> {
211    let file = current_task.get_file(fd)?;
212    file.read(
213        locked,
214        current_task,
215        &mut UserBuffersOutputBuffer::unified_new_at(current_task, address, length)?,
216    )
217    .map_eintr(|| errno!(ERESTARTSYS))
218}
219
220pub fn sys_write(
221    locked: &mut Locked<Unlocked>,
222    current_task: &CurrentTask,
223    fd: FdNumber,
224    address: UserAddress,
225    length: usize,
226) -> Result<usize, Errno> {
227    let file = current_task.get_file(fd)?;
228    file.write(
229        locked,
230        current_task,
231        &mut UserBuffersInputBuffer::unified_new_at(current_task, address, length)?,
232    )
233    .map_eintr(|| errno!(ERESTARTSYS))
234}
235
236pub fn sys_close(
237    _locked: &mut Locked<Unlocked>,
238    current_task: &CurrentTask,
239    fd: FdNumber,
240) -> Result<(), Errno> {
241    current_task.running_state().files.close(fd)?;
242    Ok(())
243}
244
245pub fn sys_close_range(
246    locked: &mut Locked<Unlocked>,
247    current_task: &CurrentTask,
248    first: u32,
249    last: u32,
250    flags: u32,
251) -> Result<(), Errno> {
252    if first > last || flags & !(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC) != 0 {
253        return error!(EINVAL);
254    }
255    let running_state = current_task.running_state();
256    if flags & CLOSE_RANGE_UNSHARE != 0 {
257        running_state.files.unshare();
258    }
259    let in_range = |fd: FdNumber| fd.raw() as u32 >= first && fd.raw() as u32 <= last;
260    if flags & CLOSE_RANGE_CLOEXEC != 0 {
261        running_state.files.retain(locked, current_task, |fd, flags| {
262            if in_range(fd) {
263                *flags |= FdFlags::CLOEXEC;
264            }
265            true
266        });
267    } else {
268        running_state.files.retain(locked, current_task, |fd, _| !in_range(fd));
269    }
270    Ok(())
271}
272
273pub fn sys_lseek(
274    locked: &mut Locked<Unlocked>,
275    current_task: &CurrentTask,
276    fd: FdNumber,
277    offset: off_t,
278    whence: u32,
279) -> Result<off_t, Errno> {
280    let file = current_task.get_file(fd)?;
281    file.seek(locked, current_task, SeekTarget::from_raw(whence, offset)?)
282}
283
284pub fn sys_fcntl(
285    locked: &mut Locked<Unlocked>,
286    current_task: &CurrentTask,
287    fd: FdNumber,
288    cmd: u32,
289    arg: u64,
290) -> Result<SyscallResult, Errno> {
291    let file = match cmd {
292        F_DUPFD | F_DUPFD_CLOEXEC | F_GETFD | F_SETFD | F_GETFL => {
293            current_task.get_file_allowing_opath(fd)?
294        }
295        _ => current_task.get_file(fd)?,
296    };
297
298    security::check_file_fcntl_access(current_task, &file, cmd, arg)?;
299
300    match cmd {
301        F_DUPFD | F_DUPFD_CLOEXEC => {
302            let fd_number = arg as i32;
303            let flags = if cmd == F_DUPFD_CLOEXEC { FdFlags::CLOEXEC } else { FdFlags::empty() };
304            let newfd = current_task.running_state().files.duplicate(
305                locked,
306                current_task,
307                fd,
308                TargetFdNumber::Minimum(FdNumber::from_raw(fd_number)),
309                flags,
310            )?;
311            Ok(newfd.into())
312        }
313        F_GETOWN => match file.get_async_owner() {
314            FileAsyncOwner::Unowned => Ok(0.into()),
315            FileAsyncOwner::Thread(tid) => Ok(tid.into()),
316            FileAsyncOwner::Process(pid) => Ok(pid.into()),
317            FileAsyncOwner::ProcessGroup(pgid) => Ok((-pgid).into()),
318        },
319        F_GETOWN_EX => {
320            let owner = match file.get_async_owner() {
321                FileAsyncOwner::Unowned => uapi::f_owner_ex { type_: F_OWNER_TID as i32, pid: 0 },
322                FileAsyncOwner::Thread(tid) => {
323                    uapi::f_owner_ex { type_: F_OWNER_TID as i32, pid: tid }
324                }
325                FileAsyncOwner::Process(pid) => uapi::f_owner_ex { type_: F_OWNER_PID as i32, pid },
326                FileAsyncOwner::ProcessGroup(pgid) => {
327                    uapi::f_owner_ex { type_: F_OWNER_PGRP as i32, pid: pgid }
328                }
329            };
330            let user_owner: UserRef<f_owner_ex> =
331                UserRef::<uapi::f_owner_ex>::new(UserAddress::from(arg));
332            current_task.write_object(user_owner, &owner)?;
333            Ok(SUCCESS)
334        }
335        F_SETOWN => {
336            let pid = (arg as u32) as i32;
337            let owner = match pid.cmp(&0) {
338                Ordering::Equal => FileAsyncOwner::Unowned,
339                Ordering::Greater => FileAsyncOwner::Process(pid),
340                Ordering::Less => {
341                    FileAsyncOwner::ProcessGroup(pid.checked_neg().ok_or_else(|| errno!(EINVAL))?)
342                }
343            };
344            owner.validate(current_task)?;
345            // TODO: https://fxbug.dev/364569860 - Integrate with LSM file_setfowner hook.
346            file.set_async_owner(owner);
347            Ok(SUCCESS)
348        }
349        F_SETOWN_EX => {
350            let user_owner = UserRef::<uapi::f_owner_ex>::new(UserAddress::from(arg));
351            let requested_owner = current_task.read_object(user_owner)?;
352            let owner = match requested_owner.type_ as u32 {
353                F_OWNER_TID => FileAsyncOwner::Thread(requested_owner.pid),
354                F_OWNER_PID => FileAsyncOwner::Process(requested_owner.pid),
355                F_OWNER_PGRP => FileAsyncOwner::ProcessGroup(requested_owner.pid),
356                _ => return error!(EINVAL),
357            };
358            owner.validate(current_task)?;
359            file.set_async_owner(owner);
360            Ok(SUCCESS)
361        }
362        F_GETFD => Ok(current_task.running_state().files.get_fd_flags_allowing_opath(fd)?.into()),
363        F_SETFD => {
364            current_task
365                .running_state()
366                .files
367                .set_fd_flags_allowing_opath(fd, FdFlags::from_bits_truncate(arg as u32))?;
368            Ok(SUCCESS)
369        }
370        F_GETFL => {
371            // O_PATH allowed for:
372            //
373            //   Retrieving open file status flags using the fcntl(2)
374            //   F_GETFL operation: the returned flags will include the
375            //   bit O_PATH.
376            //
377            // See https://man7.org/linux/man-pages/man2/open.2.html
378            Ok(file.flags().into())
379        }
380        F_SETFL => {
381            let settable_flags = OpenFlags::APPEND
382                | OpenFlags::DIRECT
383                | OpenFlags::NOATIME
384                | OpenFlags::NONBLOCK
385                | OpenFlags::ASYNC;
386            let requested_flags =
387                OpenFlags::from_bits_truncate((arg as u32) & settable_flags.bits());
388
389            // If `NOATIME` flag is being set then check that it's allowed.
390            if requested_flags.contains(OpenFlags::NOATIME)
391                && !file.flags().contains(OpenFlags::NOATIME)
392            {
393                file.name.check_o_noatime_allowed(current_task)?;
394            }
395
396            file.update_file_flags(requested_flags, settable_flags);
397            Ok(SUCCESS)
398        }
399        F_SETLK | F_SETLKW | F_GETLK => {
400            let flock_ref =
401                MultiArchUserRef::<uapi::flock, uapi::arch32::flock>::new(current_task, arg);
402            let flock = current_task.read_multi_arch_object(flock_ref)?;
403            let cmd = RecordLockCommand::from_raw(cmd).ok_or_else(|| errno!(EINVAL))?;
404            if let Some(flock) = file.record_lock(locked, current_task, cmd, flock)? {
405                current_task.write_multi_arch_object(flock_ref, flock)?;
406            }
407            Ok(SUCCESS)
408        }
409        F_SETLK64 | F_SETLKW64 | F_GETLK64 | F_OFD_GETLK | F_OFD_SETLK | F_OFD_SETLKW => {
410            let flock_ref =
411                MultiArchUserRef::<uapi::flock, uapi::arch32::flock64>::new(current_task, arg);
412            let flock = current_task.read_multi_arch_object(flock_ref)?;
413            let cmd = RecordLockCommand::from_raw(cmd).ok_or_else(|| errno!(EINVAL))?;
414            if let Some(flock) = file.record_lock(locked, current_task, cmd, flock)? {
415                current_task.write_multi_arch_object(flock_ref, flock)?;
416            }
417            Ok(SUCCESS)
418        }
419        F_ADD_SEALS => {
420            if !file.can_write() {
421                // Cannot add seals if the file is not writable
422                return error!(EPERM);
423            }
424            let mut state = file.name.entry.node.write_guard_state.lock();
425            let flags = SealFlags::from_bits_truncate(arg as u32);
426            state.try_add_seal(flags)?;
427            Ok(SUCCESS)
428        }
429        F_GET_SEALS => {
430            let state = file.name.entry.node.write_guard_state.lock();
431            Ok(state.get_seals()?.into())
432        }
433        F_SETLEASE => {
434            let fsuid = current_task.current_creds().fsuid;
435            if fsuid != file.node().info().uid {
436                security::check_task_capable(current_task, CAP_LEASE)?;
437            }
438            let lease = FileLeaseType::from_bits(arg as u32)?;
439            file.set_lease(current_task, lease)?;
440            Ok(SUCCESS)
441        }
442        F_GETLEASE => Ok(file.get_lease(current_task).into()),
443        F_SETSIG => {
444            track_stub!(TODO("https://fxbug.dev/437972675"), "F_SETSIG");
445            return error!(EINVAL);
446        }
447        _ => file.fcntl(current_task, cmd, arg),
448    }
449}
450
451pub fn sys_pread64(
452    locked: &mut Locked<Unlocked>,
453    current_task: &CurrentTask,
454    fd: FdNumber,
455    address: UserAddress,
456    length: usize,
457    offset: off_t,
458) -> Result<usize, Errno> {
459    let file = current_task.get_file(fd)?;
460    let offset = offset.try_into().map_err(|_| errno!(EINVAL))?;
461    file.read_at(
462        locked,
463        current_task,
464        offset,
465        &mut UserBuffersOutputBuffer::unified_new_at(current_task, address, length)?,
466    )
467}
468
469pub fn sys_pwrite64(
470    locked: &mut Locked<Unlocked>,
471    current_task: &CurrentTask,
472    fd: FdNumber,
473    address: UserAddress,
474    length: usize,
475    offset: off_t,
476) -> Result<usize, Errno> {
477    let file = current_task.get_file(fd)?;
478    let offset = offset.try_into().map_err(|_| errno!(EINVAL))?;
479    file.write_at(
480        locked,
481        current_task,
482        offset,
483        &mut UserBuffersInputBuffer::unified_new_at(current_task, address, length)?,
484    )
485}
486
487fn do_readv(
488    locked: &mut Locked<Unlocked>,
489    current_task: &CurrentTask,
490    fd: FdNumber,
491    iovec_addr: IOVecPtr,
492    iovec_count: UserValue<i32>,
493    offset: Option<off_t>,
494    flags: u32,
495) -> Result<usize, Errno> {
496    if flags & !RWF_SUPPORTED != 0 {
497        return error!(EOPNOTSUPP);
498    }
499    if flags != 0 {
500        track_stub!(TODO("https://fxbug.dev/322875072"), "preadv2 flags", flags);
501    }
502    let file = current_task.get_file(fd)?;
503    let iovec = current_task.read_iovec(iovec_addr, iovec_count)?;
504    let mut data = UserBuffersOutputBuffer::unified_new(current_task, iovec)?;
505    if let Some(offset) = offset {
506        file.read_at(
507            locked,
508            current_task,
509            offset.try_into().map_err(|_| errno!(EINVAL))?,
510            &mut data,
511        )
512    } else {
513        file.read(locked, current_task, &mut data)
514    }
515}
516
517pub fn sys_readv(
518    locked: &mut Locked<Unlocked>,
519    current_task: &CurrentTask,
520    fd: FdNumber,
521    iovec_addr: IOVecPtr,
522    iovec_count: UserValue<i32>,
523) -> Result<usize, Errno> {
524    do_readv(locked, current_task, fd, iovec_addr, iovec_count, None, 0)
525}
526
527pub fn sys_preadv(
528    locked: &mut Locked<Unlocked>,
529    current_task: &CurrentTask,
530    fd: FdNumber,
531    iovec_addr: IOVecPtr,
532    iovec_count: UserValue<i32>,
533    offset: off_t,
534) -> Result<usize, Errno> {
535    do_readv(locked, current_task, fd, iovec_addr, iovec_count, Some(offset), 0)
536}
537
538pub fn sys_preadv2(
539    locked: &mut Locked<Unlocked>,
540    current_task: &CurrentTask,
541    fd: FdNumber,
542    iovec_addr: IOVecPtr,
543    iovec_count: UserValue<i32>,
544    offset: off_t,
545    _unused: SyscallArg, // On 32-bit systems, holds the upper 32 bits of offset.
546    flags: u32,
547) -> Result<usize, Errno> {
548    let offset = if offset == -1 { None } else { Some(offset) };
549    do_readv(locked, current_task, fd, iovec_addr, iovec_count, offset, flags)
550}
551
552fn do_writev(
553    locked: &mut Locked<Unlocked>,
554    current_task: &CurrentTask,
555    fd: FdNumber,
556    iovec_addr: IOVecPtr,
557    iovec_count: UserValue<i32>,
558    offset: Option<off_t>,
559    flags: u32,
560) -> Result<usize, Errno> {
561    if flags & !RWF_SUPPORTED != 0 {
562        return error!(EOPNOTSUPP);
563    }
564    if flags != 0 {
565        track_stub!(TODO("https://fxbug.dev/322874523"), "pwritev2 flags", flags);
566    }
567
568    let file = current_task.get_file(fd)?;
569    let iovec = current_task.read_iovec(iovec_addr, iovec_count)?;
570    let mut data = UserBuffersInputBuffer::unified_new(current_task, iovec)?;
571    let res = if let Some(offset) = offset {
572        file.write_at(
573            locked,
574            current_task,
575            offset.try_into().map_err(|_| errno!(EINVAL))?,
576            &mut data,
577        )
578    } else {
579        file.write(locked, current_task, &mut data)
580    };
581
582    match &res {
583        Err(e) if e.code == EFAULT => {
584            track_stub!(TODO("https://fxbug.dev/297370529"), "allow partial writes")
585        }
586        _ => (),
587    }
588
589    res
590}
591
592pub fn sys_writev(
593    locked: &mut Locked<Unlocked>,
594    current_task: &CurrentTask,
595    fd: FdNumber,
596    iovec_addr: IOVecPtr,
597    iovec_count: UserValue<i32>,
598) -> Result<usize, Errno> {
599    do_writev(locked, current_task, fd, iovec_addr, iovec_count, None, 0)
600}
601
602pub fn sys_pwritev(
603    locked: &mut Locked<Unlocked>,
604    current_task: &CurrentTask,
605    fd: FdNumber,
606    iovec_addr: IOVecPtr,
607    iovec_count: UserValue<i32>,
608    offset: off_t,
609) -> Result<usize, Errno> {
610    do_writev(locked, current_task, fd, iovec_addr, iovec_count, Some(offset), 0)
611}
612
613pub fn sys_pwritev2(
614    locked: &mut Locked<Unlocked>,
615    current_task: &CurrentTask,
616    fd: FdNumber,
617    iovec_addr: IOVecPtr,
618    iovec_count: UserValue<i32>,
619    offset: off_t,
620    _unused: SyscallArg, // On 32-bit systems, holds the upper 32 bits of offset.
621    flags: u32,
622) -> Result<usize, Errno> {
623    let offset = if offset == -1 { None } else { Some(offset) };
624    do_writev(locked, current_task, fd, iovec_addr, iovec_count, offset, flags)
625}
626
// User-space pointer to a `statfs` buffer, with separate 64-bit and arch32 pointee types.
627type StatFsPtr = MultiArchUserRef<uapi::statfs, uapi::arch32::statfs>;
628
629pub fn fstatfs<T32: IntoBytes + Immutable + TryFrom<uapi::statfs>>(
630    locked: &mut Locked<Unlocked>,
631    current_task: &CurrentTask,
632    fd: FdNumber,
633    user_buf: MultiArchUserRef<uapi::statfs, T32>,
634) -> Result<(), Errno> {
635    // O_PATH allowed for:
636    //
637    //   fstatfs(2) (since Linux 3.12).
638    //
639    // See https://man7.org/linux/man-pages/man2/open.2.html
640    let file = current_task.get_file_allowing_opath(fd)?;
641    let mut stat = file.fs.statfs(locked, current_task)?;
642    stat.f_flags |= file.name.mount.flags().bits() as i64;
643    current_task.write_multi_arch_object(user_buf, stat)?;
644    Ok(())
645}
646
647pub fn sys_fstatfs(
648    locked: &mut Locked<Unlocked>,
649    current_task: &CurrentTask,
650    fd: FdNumber,
651    user_buf: StatFsPtr,
652) -> Result<(), Errno> {
653    fstatfs(locked, current_task, fd, user_buf)
654}
655
656fn statfs<T32: IntoBytes + Immutable + TryFrom<uapi::statfs>>(
657    locked: &mut Locked<Unlocked>,
658    current_task: &CurrentTask,
659    user_path: UserCString,
660    user_buf: MultiArchUserRef<uapi::statfs, T32>,
661) -> Result<(), Errno> {
662    let name =
663        lookup_at(locked, current_task, FdNumber::AT_FDCWD, user_path, LookupFlags::default())?;
664    let fs = name.entry.node.fs();
665    let mut stat = fs.statfs(locked, current_task)?;
666    stat.f_flags |= name.mount.flags().bits() as i64;
667    current_task.write_multi_arch_object(user_buf, stat)?;
668    Ok(())
669}
670
671pub fn sys_statfs(
672    locked: &mut Locked<Unlocked>,
673    current_task: &CurrentTask,
674    user_path: UserCString,
675    user_buf: StatFsPtr,
676) -> Result<(), Errno> {
677    statfs(locked, current_task, user_path, user_buf)
678}
679
680pub fn sys_sendfile(
681    locked: &mut Locked<Unlocked>,
682    current_task: &CurrentTask,
683    out_fd: FdNumber,
684    in_fd: FdNumber,
685    user_offset: OffsetPtr,
686    count: i32,
687) -> Result<usize, Errno> {
688    splice::sendfile(locked, current_task, out_fd, in_fd, user_offset, count)
689}
690
691/// A convenient wrapper for Task::open_file_at.
692///
693/// Reads user_path from user memory and then calls through to Task::open_file_at.
694fn open_file_at(
695    locked: &mut Locked<Unlocked>,
696    current_task: &CurrentTask,
697    dir_fd: FdNumber,
698    user_path: UserCString,
699    flags: u32,
700    mode: FileMode,
701    resolve_flags: ResolveFlags,
702) -> Result<FileHandle, Errno> {
703    let path = current_task.read_path(user_path)?;
704    log_trace!(dir_fd:%, path:%; "open_file_at");
705    current_task.open_file_at(
706        locked,
707        dir_fd,
708        path.as_ref(),
709        OpenFlags::from_bits_truncate(flags),
710        mode,
711        resolve_flags,
712        AccessCheck::default(),
713    )
714}
715
716fn lookup_parent_at<T, F>(
717    locked: &mut Locked<Unlocked>,
718    current_task: &CurrentTask,
719    dir_fd: FdNumber,
720    user_path: UserCString,
721    callback: F,
722) -> Result<T, Errno>
723where
724    F: Fn(&mut Locked<Unlocked>, LookupContext, NamespaceNode, &FsStr) -> Result<T, Errno>,
725{
726    let path = current_task.read_path(user_path)?;
727    log_trace!(dir_fd:%, path:%; "lookup_parent_at");
728    if path.is_empty() {
729        return error!(ENOENT);
730    }
731    let mut context = LookupContext::default();
732    let (parent, basename) =
733        current_task.lookup_parent_at(locked, &mut context, dir_fd, path.as_ref())?;
734    callback(locked, context, parent, basename)
735}
736
737/// Options for lookup_at.
738#[derive(Debug, Default, Copy, Clone)]
739pub struct LookupFlags {
740    /// Whether AT_EMPTY_PATH was supplied.
741    allow_empty_path: bool,
742
743    /// Used to implement AT_SYMLINK_NOFOLLOW.
744    symlink_mode: SymlinkMode,
745
746    /// Automount directories on the path.
747    // TODO(https://fxbug.dev/297370602): Support the `AT_NO_AUTOMOUNT` flag.
748    #[allow(dead_code)]
749    automount: bool,
750}
751
752impl LookupFlags {
753    fn no_follow() -> Self {
754        Self { symlink_mode: SymlinkMode::NoFollow, ..Default::default() }
755    }
756
757    fn from_bits(flags: u32, allowed_flags: u32) -> Result<Self, Errno> {
758        if flags & !allowed_flags != 0 {
759            return error!(EINVAL);
760        }
761        let follow_symlinks = if allowed_flags & AT_SYMLINK_FOLLOW != 0 {
762            flags & AT_SYMLINK_FOLLOW != 0
763        } else {
764            flags & AT_SYMLINK_NOFOLLOW == 0
765        };
766        let automount =
767            if allowed_flags & AT_NO_AUTOMOUNT != 0 { flags & AT_NO_AUTOMOUNT == 0 } else { false };
768        if automount {
769            track_stub!(TODO("https://fxbug.dev/297370602"), "LookupFlags::automount");
770        }
771        Ok(LookupFlags {
772            allow_empty_path: (flags & AT_EMPTY_PATH != 0)
773                || (flags & O_PATH != 0 && flags & O_NOFOLLOW != 0),
774            symlink_mode: if follow_symlinks { SymlinkMode::Follow } else { SymlinkMode::NoFollow },
775            automount,
776        })
777    }
778}
779
780impl From<StatxFlags> for LookupFlags {
781    fn from(flags: StatxFlags) -> Self {
782        let lookup_flags = StatxFlags::AT_SYMLINK_NOFOLLOW
783            | StatxFlags::AT_EMPTY_PATH
784            | StatxFlags::AT_NO_AUTOMOUNT;
785        Self::from_bits((flags & lookup_flags).bits(), lookup_flags.bits()).unwrap()
786    }
787}
788
789pub fn lookup_at<L>(
790    locked: &mut Locked<L>,
791    current_task: &CurrentTask,
792    dir_fd: FdNumber,
793    user_path: UserCString,
794    options: LookupFlags,
795) -> Result<NamespaceNode, Errno>
796where
797    L: LockEqualOrBefore<FileOpsCore>,
798{
799    let path = current_task.read_path(user_path)?;
800    log_trace!(dir_fd:%, path:%; "lookup_at");
801    if path.is_empty() {
802        if options.allow_empty_path {
803            let (node, _) = current_task.resolve_dir_fd(
804                locked,
805                dir_fd,
806                path.as_ref(),
807                ResolveFlags::empty(),
808            )?;
809            return Ok(node);
810        }
811        return error!(ENOENT);
812    }
813
814    let mut parent_context = LookupContext::default();
815    let (parent, basename) =
816        current_task.lookup_parent_at(locked, &mut parent_context, dir_fd, path.as_ref())?;
817
818    let mut child_context = if parent_context.must_be_directory {
819        // The child must resolve to a directory. This is because a trailing slash
820        // was found in the path. If the child is a symlink, we should follow it.
821        // See https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap03.html#tag_21_03_00_75
822        parent_context.with(SymlinkMode::Follow)
823    } else {
824        parent_context.with(options.symlink_mode)
825    };
826
827    parent.lookup_child(locked, current_task, &mut child_context, basename)
828}
829
830fn do_openat(
831    locked: &mut Locked<Unlocked>,
832    current_task: &CurrentTask,
833    dir_fd: FdNumber,
834    user_path: UserCString,
835    flags: u32,
836    mode: FileMode,
837    resolve_flags: ResolveFlags,
838) -> Result<FdNumber, Errno> {
839    let file = open_file_at(locked, current_task, dir_fd, user_path, flags, mode, resolve_flags)?;
840    let fd_flags = get_fd_flags(flags);
841    current_task.add_file(locked, file, fd_flags)
842}
843
844pub fn sys_openat(
845    locked: &mut Locked<Unlocked>,
846    current_task: &CurrentTask,
847    dir_fd: FdNumber,
848    user_path: UserCString,
849    flags: u32,
850    mode: FileMode,
851) -> Result<FdNumber, Errno> {
852    do_openat(locked, current_task, dir_fd, user_path, flags, mode, ResolveFlags::empty())
853}
854
855pub fn sys_openat2(
856    locked: &mut Locked<Unlocked>,
857    current_task: &CurrentTask,
858    dir_fd: FdNumber,
859    user_path: UserCString,
860    how_ref: UserRef<uapi::open_how>,
861    size: usize,
862) -> Result<FdNumber, Errno> {
863    const EXPECTED_SIZE: usize = std::mem::size_of::<uapi::open_how>();
864    if size < EXPECTED_SIZE {
865        return error!(EINVAL);
866    }
867
868    let how = current_task.read_object(how_ref)?;
869
870    // If the `size` is greater than expected, then we need to check that any extra bytes after
871    // `open_how` are set to 0. This is needed to properly handle the case when `open_how` is
872    // extended with new fields in the future. There is no upper limit on the buffer size, so we
873    // limit size of each read to one page.
874    let mut pos = EXPECTED_SIZE;
875    while pos < size {
876        let length = std::cmp::min(size - pos, *PAGE_SIZE as usize);
877        let extra_bytes =
878            current_task.read_buffer(&UserBuffer { address: (how_ref.addr() + pos)?, length })?;
879        for b in extra_bytes {
880            if b != 0 {
881                return error!(E2BIG);
882            }
883        }
884        pos += length;
885    }
886
887    let flags: u32 = how.flags.try_into().map_err(|_| errno!(EINVAL))?;
888
889    // `mode` can be specified only with `O_CREAT` or `O_TMPFILE`.
890    let allowed_mode_flags = if (flags & (O_CREAT | O_TMPFILE)) > 0 { 0o7777 } else { 0 };
891    if (how.mode & !allowed_mode_flags) != 0 {
892        return error!(EINVAL);
893    }
894
895    let mode = FileMode::from_bits(how.mode.try_into().map_err(|_| errno!(EINVAL))?);
896    let resolve_flags =
897        ResolveFlags::from_bits(how.resolve.try_into().map_err(|_| errno!(EINVAL))?)
898            .ok_or_else(|| errno!(EINVAL))?;
899
900    if resolve_flags.contains(ResolveFlags::CACHED) {
901        track_stub!(TODO("https://fxbug.dev/326474574"), "openat2: RESOLVE_CACHED");
902        return error!(EAGAIN);
903    }
904
905    do_openat(locked, current_task, dir_fd, user_path, flags, mode, resolve_flags)
906}
907
908pub fn sys_faccessat(
909    locked: &mut Locked<Unlocked>,
910    current_task: &CurrentTask,
911    dir_fd: FdNumber,
912    user_path: UserCString,
913    mode: u32,
914) -> Result<(), Errno> {
915    sys_faccessat2(locked, current_task, dir_fd, user_path, mode, 0)
916}
917
918pub fn sys_faccessat2(
919    locked: &mut Locked<Unlocked>,
920    current_task: &CurrentTask,
921    dir_fd: FdNumber,
922    user_path: UserCString,
923    mode: u32,
924    flags: u32,
925) -> Result<(), Errno> {
926    let mut access_check = || {
927        let mode = Access::try_from(mode)?;
928        let lookup_flags = LookupFlags::from_bits(flags, AT_SYMLINK_NOFOLLOW | AT_EACCESS)?;
929        let name = lookup_at(locked, current_task, dir_fd, user_path, lookup_flags)?;
930        name.check_access(locked, current_task, mode, CheckAccessReason::Access)
931    };
932    // Unless `AT_ACCESS` is set, perform lookup & access-checking using real UID & GID.
933    if flags & AT_EACCESS == 0 {
934        let mut temporary_creds = Credentials::clone(&current_task.current_creds());
935        temporary_creds.fsuid = temporary_creds.uid;
936        temporary_creds.fsgid = temporary_creds.gid;
937
938        // access() for root users should use permitted capabilities instead of effective capabilities.
939        // access() for non-root users should use an empty set of capabilities.
940        if temporary_creds.uid == 0 {
941            temporary_creds.cap_effective = temporary_creds.cap_permitted;
942        } else {
943            temporary_creds.cap_effective = Capabilities::empty();
944        }
945
946        current_task.override_creds(temporary_creds.into(), access_check)
947    } else {
948        access_check()
949    }
950}
951
952pub fn sys_getdents64(
953    locked: &mut Locked<Unlocked>,
954    current_task: &CurrentTask,
955    fd: FdNumber,
956    user_buffer: UserAddress,
957    user_capacity: usize,
958) -> Result<usize, Errno> {
959    let file = current_task.get_file(fd)?;
960    let mut offset = file.offset.copy();
961    let mut sink = DirentSink64::new(current_task, &mut *offset, user_buffer, user_capacity);
962    let result = file.readdir(locked, current_task, &mut sink);
963    let ret = sink.map_result_with_actual(result);
964    offset.update();
965    ret
966}
967
968pub fn sys_chroot(
969    locked: &mut Locked<Unlocked>,
970    current_task: &CurrentTask,
971    user_path: UserCString,
972) -> Result<(), Errno> {
973    let name =
974        lookup_at(locked, current_task, FdNumber::AT_FDCWD, user_path, LookupFlags::default())?;
975    if !name.entry.node.is_dir() {
976        return error!(ENOTDIR);
977    }
978
979    current_task.fs().chroot(locked, current_task, name)?;
980    Ok(())
981}
982
983pub fn sys_chdir(
984    locked: &mut Locked<Unlocked>,
985    current_task: &CurrentTask,
986    user_path: UserCString,
987) -> Result<(), Errno> {
988    let name =
989        lookup_at(locked, current_task, FdNumber::AT_FDCWD, user_path, LookupFlags::default())?;
990    if !name.entry.node.is_dir() {
991        return error!(ENOTDIR);
992    }
993    current_task.fs().chdir(locked, current_task, name)
994}
995
996pub fn sys_fchdir(
997    locked: &mut Locked<Unlocked>,
998    current_task: &CurrentTask,
999    fd: FdNumber,
1000) -> Result<(), Errno> {
1001    // O_PATH allowed for:
1002    //
1003    //   fchdir(2), if the file descriptor refers to a directory
1004    //   (since Linux 3.5).
1005    //
1006    // See https://man7.org/linux/man-pages/man2/open.2.html
1007    let file = current_task.get_file_allowing_opath(fd)?;
1008    if !file.name.entry.node.is_dir() {
1009        return error!(ENOTDIR);
1010    }
1011    current_task.fs().chdir(locked, current_task, file.name.to_passive())
1012}
1013
1014pub fn sys_fstat(
1015    locked: &mut Locked<Unlocked>,
1016    current_task: &CurrentTask,
1017    fd: FdNumber,
1018    buffer: UserRef<uapi::stat>,
1019) -> Result<(), Errno> {
1020    // O_PATH allowed for:
1021    //
1022    //   fstat(2) (since Linux 3.6).
1023    //
1024    // See https://man7.org/linux/man-pages/man2/open.2.html
1025    let file = current_task.get_file_allowing_opath(fd)?;
1026    let result = file.node().stat(locked, current_task)?;
1027    current_task.write_object(buffer, &result)?;
1028    Ok(())
1029}
1030
1031type StatPtr = MultiArchUserRef<uapi::stat, uapi::arch32::stat64>;
1032
1033// TODO(https://fxbug.dev/485370648) remove when unnecessary
1034fn get_fake_ion_stat() -> uapi::stat {
1035    uapi::stat {
1036        st_mode: uapi::S_IFCHR | 0o666,
1037        st_rdev: DeviceId::new(10, 59).bits(),
1038        st_nlink: 1,
1039        st_blksize: 4096,
1040        ..Default::default()
1041    }
1042}
1043
1044// TODO(https://fxbug.dev/485370648) remove when unnecessary
1045fn get_fake_ion_statx() -> statx {
1046    statx {
1047        stx_mask: uapi::STATX_BASIC_STATS,
1048        stx_mode: (uapi::S_IFCHR | 0o666) as u16,
1049        stx_rdev_major: 10,
1050        stx_rdev_minor: 59,
1051        stx_nlink: 1,
1052        stx_blksize: 4096,
1053        ..Default::default()
1054    }
1055}
1056
1057pub fn sys_fstatat64(
1058    locked: &mut Locked<Unlocked>,
1059    current_task: &CurrentTask,
1060    dir_fd: FdNumber,
1061    user_path: UserCString,
1062    buffer: StatPtr,
1063    flags: u32,
1064) -> Result<(), Errno> {
1065    let lookup_flags =
1066        LookupFlags::from_bits(flags, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)?;
1067    let result = match lookup_at(locked, current_task, dir_fd, user_path, lookup_flags) {
1068        Ok(name) => name.entry.node.stat(locked, current_task)?,
1069        // TODO(https://fxbug.dev/485370648) remove when unnecessary
1070        Err(e) if e == errno!(ENOENT) && current_task.kernel().features.fake_ion => {
1071            let path = current_task.read_path(user_path)?;
1072            if path == b"/dev/ion" {
1073                get_fake_ion_stat()
1074            } else {
1075                return Err(e);
1076            }
1077        }
1078        Err(e) => return Err(e),
1079    };
1080    current_task.write_multi_arch_object(buffer, result)?;
1081    Ok(())
1082}
1083
1084pub use sys_fstatat64 as sys_newfstatat;
1085
1086pub fn sys_statx(
1087    locked: &mut Locked<Unlocked>,
1088    current_task: &CurrentTask,
1089    dir_fd: FdNumber,
1090    user_path: UserCString,
1091    flags: u32,
1092    mask: u32,
1093    statxbuf: UserRef<statx>,
1094) -> Result<(), Errno> {
1095    let statx_flags = StatxFlags::from_bits(flags).ok_or_else(|| errno!(EINVAL))?;
1096    if statx_flags & (StatxFlags::AT_STATX_FORCE_SYNC | StatxFlags::AT_STATX_DONT_SYNC)
1097        == (StatxFlags::AT_STATX_FORCE_SYNC | StatxFlags::AT_STATX_DONT_SYNC)
1098    {
1099        return error!(EINVAL);
1100    }
1101
1102    let result =
1103        match lookup_at(locked, current_task, dir_fd, user_path, LookupFlags::from(statx_flags)) {
1104            Ok(name) => name.entry.node.statx(locked, current_task, statx_flags, mask)?,
1105            // TODO(https://fxbug.dev/485370648) remove when unnecessary
1106            Err(e) if e == errno!(ENOENT) && current_task.kernel().features.fake_ion => {
1107                let path = current_task.read_path(user_path)?;
1108                if path == b"/dev/ion" {
1109                    get_fake_ion_statx()
1110                } else {
1111                    return Err(e);
1112                }
1113            }
1114            Err(e) => return Err(e),
1115        };
1116    current_task.write_object(statxbuf, &result)?;
1117    Ok(())
1118}
1119
1120pub fn sys_readlinkat(
1121    locked: &mut Locked<Unlocked>,
1122    current_task: &CurrentTask,
1123    dir_fd: FdNumber,
1124    user_path: UserCString,
1125    buffer: UserAddress,
1126    buffer_size: usize,
1127) -> Result<usize, Errno> {
1128    let path = current_task.read_path(user_path)?;
1129    let lookup_flags = if path.is_empty() {
1130        if dir_fd == FdNumber::AT_FDCWD {
1131            return error!(ENOENT);
1132        }
1133        LookupFlags {
1134            allow_empty_path: true,
1135            symlink_mode: SymlinkMode::NoFollow,
1136            ..Default::default()
1137        }
1138    } else {
1139        LookupFlags::no_follow()
1140    };
1141    let name = lookup_at(locked, current_task, dir_fd, user_path, lookup_flags)?;
1142
1143    let target = match name.readlink(locked, current_task)? {
1144        SymlinkTarget::Path(path) => path,
1145        SymlinkTarget::Node(node) => node.path(&current_task.fs()),
1146    };
1147
1148    if buffer_size == 0 {
1149        return error!(EINVAL);
1150    }
1151    // Cap the returned length at buffer_size.
1152    let length = std::cmp::min(buffer_size, target.len());
1153    current_task.write_memory(buffer, &target[..length])?;
1154    Ok(length)
1155}
1156
1157pub fn sys_truncate(
1158    locked: &mut Locked<Unlocked>,
1159    current_task: &CurrentTask,
1160    user_path: UserCString,
1161    length: off_t,
1162) -> Result<(), Errno> {
1163    let length = length.try_into().map_err(|_| errno!(EINVAL))?;
1164    let name =
1165        lookup_at(locked, current_task, FdNumber::AT_FDCWD, user_path, LookupFlags::default())?;
1166    name.truncate(locked, current_task, length)?;
1167    Ok(())
1168}
1169
1170pub fn sys_ftruncate(
1171    locked: &mut Locked<Unlocked>,
1172    current_task: &CurrentTask,
1173    fd: FdNumber,
1174    length: off_t,
1175) -> Result<(), Errno> {
1176    let length = length.try_into().map_err(|_| errno!(EINVAL))?;
1177    let file = current_task.get_file(fd)?;
1178    file.ftruncate(locked, current_task, length)?;
1179    Ok(())
1180}
1181
1182pub fn sys_mkdirat(
1183    locked: &mut Locked<Unlocked>,
1184    current_task: &CurrentTask,
1185    dir_fd: FdNumber,
1186    user_path: UserCString,
1187    mode: FileMode,
1188) -> Result<(), Errno> {
1189    let path = current_task.read_path(user_path)?;
1190
1191    if path.is_empty() {
1192        return error!(ENOENT);
1193    }
1194    let (parent, basename) = current_task.lookup_parent_at(
1195        locked,
1196        &mut LookupContext::default(),
1197        dir_fd,
1198        path.as_ref(),
1199    )?;
1200    parent.create_node(
1201        locked,
1202        current_task,
1203        basename,
1204        mode.with_type(FileMode::IFDIR),
1205        DeviceId::NONE,
1206    )?;
1207    Ok(())
1208}
1209
1210pub fn sys_mknodat(
1211    locked: &mut Locked<Unlocked>,
1212    current_task: &CurrentTask,
1213    dir_fd: FdNumber,
1214    user_path: UserCString,
1215    mode: FileMode,
1216    dev: DeviceId,
1217) -> Result<(), Errno> {
1218    let file_type = match mode.fmt() {
1219        FileMode::IFREG
1220        | FileMode::IFCHR
1221        | FileMode::IFBLK
1222        | FileMode::IFIFO
1223        | FileMode::IFSOCK => mode.fmt(),
1224        FileMode::EMPTY => FileMode::IFREG,
1225        _ => return error!(EINVAL),
1226    };
1227    lookup_parent_at(locked, current_task, dir_fd, user_path, |locked, _, parent, basename| {
1228        parent.create_node(locked, current_task, basename, mode.with_type(file_type), dev)
1229    })?;
1230    Ok(())
1231}
1232
1233pub fn sys_linkat(
1234    locked: &mut Locked<Unlocked>,
1235    current_task: &CurrentTask,
1236    old_dir_fd: FdNumber,
1237    old_user_path: UserCString,
1238    new_dir_fd: FdNumber,
1239    new_user_path: UserCString,
1240    flags: u32,
1241) -> Result<(), Errno> {
1242    if flags & !(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH) != 0 {
1243        track_stub!(TODO("https://fxbug.dev/322875706"), "linkat unknown flags", flags);
1244        return error!(EINVAL);
1245    }
1246
1247    if flags & AT_EMPTY_PATH != 0 {
1248        security::check_task_capable(current_task, CAP_DAC_READ_SEARCH)
1249            .map_err(|_| errno!(ENOENT))?;
1250    }
1251
1252    let flags = LookupFlags::from_bits(flags, AT_EMPTY_PATH | AT_SYMLINK_FOLLOW)?;
1253    let target = lookup_at(locked, current_task, old_dir_fd, old_user_path, flags)?;
1254    lookup_parent_at(
1255        locked,
1256        current_task,
1257        new_dir_fd,
1258        new_user_path,
1259        |locked, context, parent, basename| {
1260            // The path to a new link cannot end in `/`. That would imply that we are dereferencing
1261            // the link to a directory.
1262            if context.must_be_directory {
1263                return error!(ENOENT);
1264            }
1265            if target.mount != parent.mount {
1266                return error!(EXDEV);
1267            }
1268            parent.link(locked, current_task, basename, &target.entry.node)
1269        },
1270    )?;
1271
1272    Ok(())
1273}
1274
1275pub fn sys_unlinkat(
1276    locked: &mut Locked<Unlocked>,
1277    current_task: &CurrentTask,
1278    dir_fd: FdNumber,
1279    user_path: UserCString,
1280    flags: u32,
1281) -> Result<(), Errno> {
1282    if flags & !AT_REMOVEDIR != 0 {
1283        return error!(EINVAL);
1284    }
1285    let kind =
1286        if flags & AT_REMOVEDIR != 0 { UnlinkKind::Directory } else { UnlinkKind::NonDirectory };
1287    lookup_parent_at(
1288        locked,
1289        current_task,
1290        dir_fd,
1291        user_path,
1292        |locked, context, parent, basename| {
1293            parent.unlink(locked, current_task, basename, kind, context.must_be_directory)
1294        },
1295    )?;
1296    Ok(())
1297}
1298
1299pub fn sys_renameat2(
1300    locked: &mut Locked<Unlocked>,
1301    current_task: &CurrentTask,
1302    old_dir_fd: FdNumber,
1303    old_user_path: UserCString,
1304    new_dir_fd: FdNumber,
1305    new_user_path: UserCString,
1306    flags: u32,
1307) -> Result<(), Errno> {
1308    let flags = RenameFlags::from_bits(flags).ok_or_else(|| errno!(EINVAL))?;
1309    if flags.intersects(RenameFlags::INTERNAL) {
1310        return error!(EINVAL);
1311    };
1312
1313    // RENAME_EXCHANGE cannot be combined with the other flags.
1314    if flags.contains(RenameFlags::EXCHANGE)
1315        && flags.intersects(RenameFlags::NOREPLACE | RenameFlags::WHITEOUT)
1316    {
1317        return error!(EINVAL);
1318    }
1319
1320    // RENAME_WHITEOUT is not supported.
1321    if flags.contains(RenameFlags::WHITEOUT) {
1322        track_stub!(TODO("https://fxbug.dev/322875416"), "RENAME_WHITEOUT");
1323        return error!(ENOSYS);
1324    };
1325
1326    let mut lookup = |dir_fd, user_path| {
1327        lookup_parent_at(locked, current_task, dir_fd, user_path, |_, _, parent, basename| {
1328            Ok((parent, basename.to_owned()))
1329        })
1330    };
1331
1332    let (old_parent, old_basename) = lookup(old_dir_fd, old_user_path)?;
1333    let (new_parent, new_basename) = lookup(new_dir_fd, new_user_path)?;
1334
1335    if new_basename.len() > NAME_MAX as usize {
1336        return error!(ENAMETOOLONG);
1337    }
1338
1339    NamespaceNode::rename(
1340        locked,
1341        current_task,
1342        &old_parent,
1343        old_basename.as_ref(),
1344        &new_parent,
1345        new_basename.as_ref(),
1346        flags,
1347    )
1348}
1349
1350pub fn sys_fchmod(
1351    locked: &mut Locked<Unlocked>,
1352    current_task: &CurrentTask,
1353    fd: FdNumber,
1354    mode: FileMode,
1355) -> Result<(), Errno> {
1356    // Remove the filetype from the mode.
1357    let mode = mode & FileMode::PERMISSIONS;
1358    let file = current_task.get_file(fd)?;
1359    file.name.entry.node.chmod(locked, current_task, &file.name.mount, mode)?;
1360    file.name.entry.notify_ignoring_excl_unlink(InotifyMask::ATTRIB);
1361    Ok(())
1362}
1363
1364pub fn sys_fchmodat(
1365    locked: &mut Locked<Unlocked>,
1366    current_task: &CurrentTask,
1367    dir_fd: FdNumber,
1368    user_path: UserCString,
1369    mode: FileMode,
1370) -> Result<(), Errno> {
1371    // Remove the filetype from the mode.
1372    let mode = mode & FileMode::PERMISSIONS;
1373    let name = lookup_at(locked, current_task, dir_fd, user_path, LookupFlags::default())?;
1374    name.entry.node.chmod(locked, current_task, &name.mount, mode)?;
1375    name.entry.notify_ignoring_excl_unlink(InotifyMask::ATTRIB);
1376    Ok(())
1377}
1378
1379fn maybe_uid(id: u32) -> Option<uid_t> {
1380    if id == u32::MAX { None } else { Some(id) }
1381}
1382
1383pub fn sys_fchown(
1384    locked: &mut Locked<Unlocked>,
1385    current_task: &CurrentTask,
1386    fd: FdNumber,
1387    owner: u32,
1388    group: u32,
1389) -> Result<(), Errno> {
1390    let file = current_task.get_file(fd)?;
1391    file.name.entry.node.chown(
1392        locked,
1393        current_task,
1394        &file.name.mount,
1395        maybe_uid(owner),
1396        maybe_uid(group),
1397    )?;
1398    file.name.entry.notify_ignoring_excl_unlink(InotifyMask::ATTRIB);
1399    Ok(())
1400}
1401
1402pub fn sys_fchownat(
1403    locked: &mut Locked<Unlocked>,
1404    current_task: &CurrentTask,
1405    dir_fd: FdNumber,
1406    user_path: UserCString,
1407    owner: u32,
1408    group: u32,
1409    flags: u32,
1410) -> Result<(), Errno> {
1411    let flags = LookupFlags::from_bits(flags, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW)?;
1412    let name = lookup_at(locked, current_task, dir_fd, user_path, flags)?;
1413    name.entry.node.chown(locked, current_task, &name.mount, maybe_uid(owner), maybe_uid(group))?;
1414    name.entry.notify_ignoring_excl_unlink(InotifyMask::ATTRIB);
1415    Ok(())
1416}
1417
1418fn read_xattr_name(current_task: &CurrentTask, name_addr: UserCString) -> Result<FsString, Errno> {
1419    let name = current_task
1420        .read_c_string_to_vec(name_addr, XATTR_NAME_MAX as usize + 1)
1421        .map_err(|e| if e == ENAMETOOLONG { errno!(ERANGE) } else { e })?;
1422    if name.is_empty() {
1423        return error!(ERANGE);
1424    }
1425    let dot_index = memchr::memchr(b'.', &name).ok_or_else(|| errno!(ENOTSUP))?;
1426    if name[dot_index + 1..].is_empty() {
1427        return error!(EINVAL);
1428    }
1429    match &name[..dot_index] {
1430        b"user" | b"security" | b"trusted" | b"system" => {}
1431        _ => return error!(ENOTSUP),
1432    }
1433    Ok(name)
1434}
1435
1436fn do_getxattr(
1437    locked: &mut Locked<Unlocked>,
1438    current_task: &CurrentTask,
1439    node: &NamespaceNode,
1440    name_addr: UserCString,
1441    value_addr: UserAddress,
1442    size: usize,
1443) -> Result<usize, Errno> {
1444    let name = read_xattr_name(current_task, name_addr)?;
1445    let value =
1446        match node.entry.node.get_xattr(locked, current_task, &node.mount, name.as_ref(), size)? {
1447            ValueOrSize::Size(s) => return Ok(s),
1448            ValueOrSize::Value(v) => v,
1449        };
1450    if size == 0 {
1451        return Ok(value.len());
1452    }
1453    if size < value.len() {
1454        return error!(ERANGE);
1455    }
1456    current_task.write_memory(value_addr, &value)
1457}
1458
1459pub fn sys_getxattr(
1460    locked: &mut Locked<Unlocked>,
1461    current_task: &CurrentTask,
1462    path_addr: UserCString,
1463    name_addr: UserCString,
1464    value_addr: UserAddress,
1465    size: usize,
1466) -> Result<usize, Errno> {
1467    let node =
1468        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::default())?;
1469    do_getxattr(locked, current_task, &node, name_addr, value_addr, size)
1470}
1471
1472pub fn sys_fgetxattr(
1473    locked: &mut Locked<Unlocked>,
1474    current_task: &CurrentTask,
1475    fd: FdNumber,
1476    name_addr: UserCString,
1477    value_addr: UserAddress,
1478    size: usize,
1479) -> Result<usize, Errno> {
1480    let file = current_task.get_file(fd)?;
1481    do_getxattr(locked, current_task, &file.name, name_addr, value_addr, size)
1482}
1483
1484pub fn sys_lgetxattr(
1485    locked: &mut Locked<Unlocked>,
1486    current_task: &CurrentTask,
1487    path_addr: UserCString,
1488    name_addr: UserCString,
1489    value_addr: UserAddress,
1490    size: usize,
1491) -> Result<usize, Errno> {
1492    let node =
1493        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::no_follow())?;
1494    do_getxattr(locked, current_task, &node, name_addr, value_addr, size)
1495}
1496
1497fn do_setxattr(
1498    locked: &mut Locked<Unlocked>,
1499    current_task: &CurrentTask,
1500    node: &NamespaceNode,
1501    name_addr: UserCString,
1502    value_addr: UserAddress,
1503    size: usize,
1504    flags: u32,
1505) -> Result<(), Errno> {
1506    if size > XATTR_NAME_MAX as usize {
1507        return error!(E2BIG);
1508    }
1509
1510    let op = match flags {
1511        0 => XattrOp::Set,
1512        XATTR_CREATE => XattrOp::Create,
1513        XATTR_REPLACE => XattrOp::Replace,
1514        _ => return error!(EINVAL),
1515    };
1516    let name = read_xattr_name(current_task, name_addr)?;
1517    let value = FsString::from(current_task.read_memory_to_vec(value_addr, size)?);
1518    node.entry.node.set_xattr(locked, current_task, &node.mount, name.as_ref(), value.as_ref(), op)
1519}
1520
1521pub fn sys_fsetxattr(
1522    locked: &mut Locked<Unlocked>,
1523    current_task: &CurrentTask,
1524    fd: FdNumber,
1525    name_addr: UserCString,
1526    value_addr: UserAddress,
1527    size: usize,
1528    flags: u32,
1529) -> Result<(), Errno> {
1530    let file = current_task.get_file(fd)?;
1531    do_setxattr(locked, current_task, &file.name, name_addr, value_addr, size, flags)
1532}
1533
1534pub fn sys_lsetxattr(
1535    locked: &mut Locked<Unlocked>,
1536    current_task: &CurrentTask,
1537    path_addr: UserCString,
1538    name_addr: UserCString,
1539    value_addr: UserAddress,
1540    size: usize,
1541    flags: u32,
1542) -> Result<(), Errno> {
1543    let node =
1544        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::no_follow())?;
1545    do_setxattr(locked, current_task, &node, name_addr, value_addr, size, flags)
1546}
1547
1548pub fn sys_setxattr(
1549    locked: &mut Locked<Unlocked>,
1550    current_task: &CurrentTask,
1551    path_addr: UserCString,
1552    name_addr: UserCString,
1553    value_addr: UserAddress,
1554    size: usize,
1555    flags: u32,
1556) -> Result<(), Errno> {
1557    let node =
1558        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::default())?;
1559    do_setxattr(locked, current_task, &node, name_addr, value_addr, size, flags)
1560}
1561
1562fn do_removexattr(
1563    locked: &mut Locked<Unlocked>,
1564    current_task: &CurrentTask,
1565    node: &NamespaceNode,
1566    name_addr: UserCString,
1567) -> Result<(), Errno> {
1568    let mode = node.entry.node.info().mode;
1569    if mode.is_chr() || mode.is_fifo() {
1570        return error!(EPERM);
1571    }
1572    let name = read_xattr_name(current_task, name_addr)?;
1573    node.entry.node.remove_xattr(locked, current_task, &node.mount, name.as_ref())
1574}
1575
1576pub fn sys_removexattr(
1577    locked: &mut Locked<Unlocked>,
1578    current_task: &CurrentTask,
1579    path_addr: UserCString,
1580    name_addr: UserCString,
1581) -> Result<(), Errno> {
1582    let node =
1583        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::default())?;
1584    do_removexattr(locked, current_task, &node, name_addr)
1585}
1586
1587pub fn sys_lremovexattr(
1588    locked: &mut Locked<Unlocked>,
1589    current_task: &CurrentTask,
1590    path_addr: UserCString,
1591    name_addr: UserCString,
1592) -> Result<(), Errno> {
1593    let node =
1594        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::no_follow())?;
1595    do_removexattr(locked, current_task, &node, name_addr)
1596}
1597
1598pub fn sys_fremovexattr(
1599    locked: &mut Locked<Unlocked>,
1600    current_task: &CurrentTask,
1601    fd: FdNumber,
1602    name_addr: UserCString,
1603) -> Result<(), Errno> {
1604    let file = current_task.get_file(fd)?;
1605    do_removexattr(locked, current_task, &file.name, name_addr)
1606}
1607
1608fn do_listxattr(
1609    locked: &mut Locked<Unlocked>,
1610    current_task: &CurrentTask,
1611    node: &NamespaceNode,
1612    list_addr: UserAddress,
1613    size: usize,
1614) -> Result<usize, Errno> {
1615    let security_xattr = security::fs_node_listsecurity(current_task, &node.entry.node);
1616    let xattrs = match node.entry.node.list_xattrs(locked, current_task, size) {
1617        Ok(ValueOrSize::Size(s)) => return Ok(s + security_xattr.map_or(0, |s| s.len() + 1)),
1618        Ok(ValueOrSize::Value(mut v)) => {
1619            if let Some(security_value) = security_xattr {
1620                if !v.contains(&security_value) {
1621                    v.push(security_value);
1622                }
1623            }
1624            v
1625        }
1626        Err(e) => {
1627            if e.code != ENOTSUP || security_xattr.is_none() {
1628                return Err(e);
1629            }
1630            vec![security_xattr.unwrap()]
1631        }
1632    };
1633
1634    let mut list = vec![];
1635    for name in xattrs.iter() {
1636        list.extend_from_slice(name);
1637        list.push(b'\0');
1638    }
1639    if size == 0 {
1640        return Ok(list.len());
1641    }
1642    if size < list.len() {
1643        return error!(ERANGE);
1644    }
1645    current_task.write_memory(list_addr, &list)
1646}
1647
1648pub fn sys_listxattr(
1649    locked: &mut Locked<Unlocked>,
1650    current_task: &CurrentTask,
1651    path_addr: UserCString,
1652    list_addr: UserAddress,
1653    size: usize,
1654) -> Result<usize, Errno> {
1655    let node =
1656        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::default())?;
1657    do_listxattr(locked, current_task, &node, list_addr, size)
1658}
1659
1660pub fn sys_llistxattr(
1661    locked: &mut Locked<Unlocked>,
1662    current_task: &CurrentTask,
1663    path_addr: UserCString,
1664    list_addr: UserAddress,
1665    size: usize,
1666) -> Result<usize, Errno> {
1667    let node =
1668        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::no_follow())?;
1669    do_listxattr(locked, current_task, &node, list_addr, size)
1670}
1671
1672pub fn sys_flistxattr(
1673    locked: &mut Locked<Unlocked>,
1674    current_task: &CurrentTask,
1675    fd: FdNumber,
1676    list_addr: UserAddress,
1677    size: usize,
1678) -> Result<usize, Errno> {
1679    let file = current_task.get_file(fd)?;
1680    do_listxattr(locked, current_task, &file.name, list_addr, size)
1681}
1682
1683pub fn sys_getcwd(
1684    _locked: &mut Locked<Unlocked>,
1685    current_task: &CurrentTask,
1686    buf: UserAddress,
1687    size: usize,
1688) -> Result<usize, Errno> {
1689    let root = current_task.fs().root();
1690    let cwd = current_task.fs().cwd();
1691    let mut user_cwd = match cwd.path_from_root(Some(&root)) {
1692        PathWithReachability::Reachable(path) => path,
1693        PathWithReachability::Unreachable(mut path) => {
1694            let mut combined = vec![];
1695            combined.extend_from_slice(b"(unreachable)");
1696            combined.append(&mut path);
1697            combined.into()
1698        }
1699    };
1700    user_cwd.push(b'\0');
1701    if user_cwd.len() > size {
1702        return error!(ERANGE);
1703    }
1704    current_task.write_memory(buf, &user_cwd)?;
1705    Ok(user_cwd.len())
1706}
1707
1708pub fn sys_umask(
1709    _locked: &mut Locked<Unlocked>,
1710    current_task: &CurrentTask,
1711    umask: FileMode,
1712) -> Result<FileMode, Errno> {
1713    Ok(current_task.fs().set_umask(umask))
1714}
1715
1716fn get_fd_flags(flags: u32) -> FdFlags {
1717    if flags & O_CLOEXEC != 0 { FdFlags::CLOEXEC } else { FdFlags::empty() }
1718}
1719
1720pub fn sys_pipe2(
1721    locked: &mut Locked<Unlocked>,
1722    current_task: &CurrentTask,
1723    user_pipe: UserRef<FdNumber>,
1724    flags: u32,
1725) -> Result<(), Errno> {
1726    let supported_file_flags = OpenFlags::NONBLOCK | OpenFlags::DIRECT;
1727    if flags & !(O_CLOEXEC | supported_file_flags.bits()) != 0 {
1728        return error!(EINVAL);
1729    }
1730    let (read, write) = new_pipe(locked, current_task)?;
1731
1732    let file_flags = OpenFlags::from_bits_truncate(flags & supported_file_flags.bits());
1733    read.update_file_flags(file_flags, supported_file_flags);
1734    write.update_file_flags(file_flags, supported_file_flags);
1735
1736    let fd_flags = get_fd_flags(flags);
1737    let fd_read = current_task.add_file(locked, read, fd_flags)?;
1738    let fd_write = current_task.add_file(locked, write, fd_flags)?;
1739    log_trace!("pipe2 -> [{:#x}, {:#x}]", fd_read.raw(), fd_write.raw());
1740
1741    current_task.write_object(user_pipe, &fd_read)?;
1742    let user_pipe = user_pipe.next()?;
1743    current_task.write_object(user_pipe, &fd_write)?;
1744
1745    Ok(())
1746}
1747
1748pub fn sys_ioctl(
1749    locked: &mut Locked<Unlocked>,
1750    current_task: &CurrentTask,
1751    fd: FdNumber,
1752    request: u32,
1753    arg: SyscallArg,
1754) -> Result<SyscallResult, Errno> {
1755    match request {
1756        FIOCLEX | FIONCLEX => {
1757            current_task.running_state().files.ioctl_fd_flags(current_task, fd, request)?;
1758            Ok(SUCCESS)
1759        }
1760        _ => {
1761            let file = current_task.get_file(fd)?;
1762            file.ioctl(locked, current_task, request, arg)
1763        }
1764    }
1765}
1766
1767pub fn sys_symlinkat(
1768    locked: &mut Locked<Unlocked>,
1769    current_task: &CurrentTask,
1770    user_target: UserCString,
1771    new_dir_fd: FdNumber,
1772    user_path: UserCString,
1773) -> Result<(), Errno> {
1774    let target = current_task.read_path(user_target)?;
1775    if target.is_empty() {
1776        return error!(ENOENT);
1777    }
1778
1779    let path = current_task.read_path(user_path)?;
1780    // TODO: This check could probably be moved into parent.symlink(..).
1781    if path.is_empty() {
1782        return error!(ENOENT);
1783    }
1784
1785    let res = lookup_parent_at(
1786        locked,
1787        current_task,
1788        new_dir_fd,
1789        user_path,
1790        |locked, context, parent, basename| {
1791            // The path to a new symlink cannot end in `/`. That would imply that we are dereferencing
1792            // the symlink to a directory.
1793            //
1794            // See https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap03.html#tag_21_03_00_75
1795            if context.must_be_directory {
1796                return error!(ENOENT);
1797            }
1798            parent.create_symlink(locked, current_task, basename, target.as_ref())
1799        },
1800    );
1801    res?;
1802    Ok(())
1803}
1804
1805pub fn sys_dup(
1806    locked: &mut Locked<Unlocked>,
1807    current_task: &CurrentTask,
1808    oldfd: FdNumber,
1809) -> Result<FdNumber, Errno> {
1810    current_task.running_state().files.duplicate(
1811        locked,
1812        current_task,
1813        oldfd,
1814        TargetFdNumber::Default,
1815        FdFlags::empty(),
1816    )
1817}
1818
1819pub fn sys_dup3(
1820    locked: &mut Locked<Unlocked>,
1821    current_task: &CurrentTask,
1822    oldfd: FdNumber,
1823    newfd: FdNumber,
1824    flags: u32,
1825) -> Result<FdNumber, Errno> {
1826    if oldfd == newfd {
1827        return error!(EINVAL);
1828    }
1829    if flags & !O_CLOEXEC != 0 {
1830        return error!(EINVAL);
1831    }
1832    let fd_flags = get_fd_flags(flags);
1833    current_task.running_state().files.duplicate(
1834        locked,
1835        current_task,
1836        oldfd,
1837        TargetFdNumber::Specific(newfd),
1838        fd_flags,
1839    )?;
1840    Ok(newfd)
1841}
1842
1843/// A memfd file descriptor cannot have a name longer than 250 bytes, including
1844/// the null terminator.
1845///
1846/// See Errors section of https://man7.org/linux/man-pages/man2/memfd_create.2.html
1847const MEMFD_NAME_MAX_LEN: usize = 250;
1848
1849pub fn sys_memfd_create(
1850    locked: &mut Locked<Unlocked>,
1851    current_task: &CurrentTask,
1852    user_name: UserCString,
1853    flags: u32,
1854) -> Result<FdNumber, Errno> {
1855    const HUGE_SHIFTED_MASK: u32 = MFD_HUGE_MASK << MFD_HUGE_SHIFT;
1856
1857    if flags
1858        & !(MFD_CLOEXEC
1859            | MFD_ALLOW_SEALING
1860            | MFD_HUGETLB
1861            | HUGE_SHIFTED_MASK
1862            | MFD_NOEXEC_SEAL
1863            | MFD_EXEC)
1864        != 0
1865    {
1866        track_stub!(TODO("https://fxbug.dev/322875665"), "memfd_create unknown flags", flags);
1867        return error!(EINVAL);
1868    }
1869
1870    let _huge_page_size = if flags & MFD_HUGETLB != 0 {
1871        Some(flags & HUGE_SHIFTED_MASK)
1872    } else {
1873        if flags & HUGE_SHIFTED_MASK != 0 {
1874            return error!(EINVAL);
1875        }
1876        None
1877    };
1878
1879    let name = current_task
1880        .read_c_string_to_vec(user_name, MEMFD_NAME_MAX_LEN)
1881        .map_err(|e| if e == ENAMETOOLONG { errno!(EINVAL) } else { e })?;
1882
1883    // This behavior matches MEMFD_NOEXEC_SCOPE_EXEC, which states:
1884    //   > memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like MFD_EXEC was set.
1885    //
1886    // This behavior can be changed on Linux via sysctl vm.memfd_noexec, which is pid namespaced.
1887    // We do not currently support changing this behavior.
1888    let seals = if flags & MFD_NOEXEC_SEAL != 0 {
1889        SealFlags::NO_EXEC
1890    } else if flags & MFD_ALLOW_SEALING != 0 {
1891        SealFlags::empty()
1892    } else {
1893        // Forbid sealing, by sealing the seal operation.
1894        SealFlags::SEAL
1895    };
1896
1897    let file = new_memfd(locked, current_task, name, seals, OpenFlags::RDWR)?;
1898
1899    let mut fd_flags = FdFlags::empty();
1900    if flags & MFD_CLOEXEC != 0 {
1901        fd_flags |= FdFlags::CLOEXEC;
1902    }
1903    let fd = current_task.add_file(locked, file, fd_flags)?;
1904    Ok(fd)
1905}
1906
1907pub fn sys_mount(
1908    locked: &mut Locked<Unlocked>,
1909    current_task: &CurrentTask,
1910    source_addr: UserCString,
1911    target_addr: UserCString,
1912    filesystemtype_addr: UserCString,
1913    flags: u32,
1914    data_addr: UserCString,
1915) -> Result<(), Errno> {
1916    security::check_task_capable(current_task, CAP_SYS_ADMIN)?;
1917
1918    let flags = MountFlags::from_bits(flags).ok_or_else(|| {
1919        track_stub!(
1920            TODO("https://fxbug.dev/322875327"),
1921            "mount unknown flags",
1922            flags & !MountFlags::from_bits_truncate(flags).bits()
1923        );
1924        errno!(EINVAL)
1925    })?;
1926
1927    let target =
1928        lookup_at(locked, current_task, FdNumber::AT_FDCWD, target_addr, LookupFlags::default())?;
1929
1930    security::sb_mount(current_task, &target, flags)?;
1931
1932    if flags.contains(MountFlags::REMOUNT) {
1933        do_mount_remount(current_task, target, flags, data_addr)
1934    } else if flags.contains(MountFlags::BIND) {
1935        do_mount_bind(locked, current_task, source_addr, target, flags)
1936    } else if flags.intersects(MountFlags::SHARED | MountFlags::PRIVATE | MountFlags::DOWNSTREAM) {
1937        do_mount_change_propagation_type(current_task, target, flags)
1938    } else if flags.contains(MountFlags::MOVE) {
1939        do_mount_move(locked, current_task, source_addr, target)
1940    } else {
1941        do_mount_create(
1942            locked,
1943            current_task,
1944            source_addr,
1945            target,
1946            filesystemtype_addr,
1947            data_addr,
1948            flags,
1949        )
1950    }
1951}
1952
1953fn do_mount_remount(
1954    current_task: &CurrentTask,
1955    target: NamespaceNode,
1956    flags: MountFlags,
1957    data_addr: UserCString,
1958) -> Result<(), Errno> {
1959    if !data_addr.is_null() {
1960        track_stub!(TODO("https://fxbug.dev/322875506"), "MS_REMOUNT: Updating data");
1961    }
1962    let mount = target.mount_if_root()?;
1963
1964    let data = current_task.read_path_if_non_null(data_addr)?;
1965    let mount_options =
1966        security::sb_eat_lsm_opts(current_task.kernel(), &mut MountParams::parse(data.as_ref())?)?;
1967
1968    // From <https://man7.org/linux/man-pages/man2/mount.2.html>
1969    //
1970    //   Since Linux 2.6.26, the MS_REMOUNT flag can be used with MS_BIND
1971    //   to modify only the per-mount-point flags.  This is particularly
1972    //   useful for setting or clearing the "read-only" flag on a mount
1973    //   without changing the underlying filesystem.
1974    if !flags.contains(MountFlags::BIND) {
1975        security::sb_remount(current_task, &mount, mount_options)?;
1976        mount.reconfigure_fs(current_task, flags.file_system_flags())?;
1977    }
1978
1979    let updated_flags = flags & MountFlags::CHANGEABLE_WITH_REMOUNT;
1980    mount.update_flags(updated_flags.mountpoint_flags());
1981
1982    Ok(())
1983}
1984
1985fn do_mount_bind(
1986    locked: &mut Locked<Unlocked>,
1987    current_task: &CurrentTask,
1988    source_addr: UserCString,
1989    target: NamespaceNode,
1990    flags: MountFlags,
1991) -> Result<(), Errno> {
1992    let source =
1993        lookup_at(locked, current_task, FdNumber::AT_FDCWD, source_addr, LookupFlags::default())?;
1994    log_trace!(
1995        source:% = source.path(&current_task.fs()),
1996        target:% = target.path(&current_task.fs()),
1997        flags:?;
1998        "do_mount_bind",
1999    );
2000    target.mount(WhatToMount::Bind(source), flags.mountpoint_flags())
2001}
2002
2003fn do_mount_change_propagation_type(
2004    current_task: &CurrentTask,
2005    target: NamespaceNode,
2006    flags: MountFlags,
2007) -> Result<(), Errno> {
2008    log_trace!(
2009        target:% = target.path(&current_task.fs()),
2010        flags:?;
2011        "do_mount_change_propagation_type",
2012    );
2013
2014    // Flag validation. Of the three propagation type flags, exactly one must be passed. The only
2015    // valid flags other than propagation type are MS_SILENT and MS_REC.
2016    //
2017    // Use if statements to find the first propagation type flag, then check for valid flags using
2018    // only the first propagation flag and MS_REC / MS_SILENT as valid flags.
2019    let propagation_flag = if flags.contains(MountFlags::SHARED) {
2020        MountFlags::SHARED
2021    } else if flags.contains(MountFlags::PRIVATE) {
2022        MountFlags::PRIVATE
2023    } else if flags.contains(MountFlags::DOWNSTREAM) {
2024        MountFlags::DOWNSTREAM
2025    } else {
2026        return error!(EINVAL);
2027    };
2028    if flags.intersects(!(propagation_flag | MountFlags::REC | MountFlags::SILENT)) {
2029        return error!(EINVAL);
2030    }
2031
2032    let mount = target.mount_if_root()?;
2033    mount.change_propagation(propagation_flag, flags.contains(MountFlags::REC));
2034    Ok(())
2035}
2036
2037fn do_mount_move(
2038    locked: &mut Locked<Unlocked>,
2039    current_task: &CurrentTask,
2040    source_addr: UserCString,
2041    target: NamespaceNode,
2042) -> Result<(), Errno> {
2043    let source =
2044        lookup_at(locked, current_task, FdNumber::AT_FDCWD, source_addr, LookupFlags::default())?;
2045    let source_mount = source.mount_if_root()?;
2046    Mount::move_mount(source_mount, target.mount.as_ref().expect(""), &target.entry)
2047}
2048
2049fn do_mount_create(
2050    locked: &mut Locked<Unlocked>,
2051    current_task: &CurrentTask,
2052    source_addr: UserCString,
2053    target: NamespaceNode,
2054    filesystemtype_addr: UserCString,
2055    data_addr: UserCString,
2056    flags: MountFlags,
2057) -> Result<(), Errno> {
2058    let source = current_task.read_path_if_non_null(source_addr)?;
2059    let fs_type = current_task.read_path(filesystemtype_addr)?;
2060    let data = current_task.read_path_if_non_null(data_addr)?;
2061    log_trace!(
2062        source:%,
2063        target:% = target.path(&current_task.fs()),
2064        fs_type:%,
2065        data:%;
2066        "do_mount_create",
2067    );
2068
2069    let options = FileSystemOptions {
2070        source: source.into(),
2071        flags: flags.file_system_flags().into(),
2072        params: MountParams::parse(data.as_ref())?,
2073    };
2074
2075    let fs = current_task.create_filesystem(locked, fs_type.as_ref(), options)?;
2076
2077    security::sb_kern_mount(current_task, &fs)?;
2078    target.mount(WhatToMount::Fs(fs), flags.mountpoint_flags())
2079}
2080
2081pub fn sys_umount2(
2082    locked: &mut Locked<Unlocked>,
2083    current_task: &CurrentTask,
2084    target_addr: UserCString,
2085    flags: u32,
2086) -> Result<(), Errno> {
2087    security::check_task_capable(current_task, CAP_SYS_ADMIN)?;
2088
2089    let unmount_flags = UnmountFlags::from_bits(flags).ok_or_else(|| {
2090        track_stub!(
2091            TODO("https://fxbug.dev/322875327"),
2092            "unmount unknown flags",
2093            flags & !UnmountFlags::from_bits_truncate(flags).bits()
2094        );
2095        errno!(EINVAL)
2096    })?;
2097
2098    if unmount_flags.contains(UnmountFlags::EXPIRE)
2099        && (unmount_flags.contains(UnmountFlags::FORCE)
2100            || unmount_flags.contains(UnmountFlags::DETACH))
2101    {
2102        return error!(EINVAL);
2103    }
2104
2105    let lookup_flags = if unmount_flags.contains(UnmountFlags::NOFOLLOW) {
2106        LookupFlags::no_follow()
2107    } else {
2108        LookupFlags::default()
2109    };
2110    let target = lookup_at(locked, current_task, FdNumber::AT_FDCWD, target_addr, lookup_flags)?;
2111
2112    security::sb_umount(current_task, &target, unmount_flags)?;
2113
2114    target.unmount(unmount_flags)
2115}
2116
2117pub fn sys_eventfd2(
2118    locked: &mut Locked<Unlocked>,
2119    current_task: &CurrentTask,
2120    value: u32,
2121    flags: u32,
2122) -> Result<FdNumber, Errno> {
2123    if flags & !(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE) != 0 {
2124        return error!(EINVAL);
2125    }
2126    let blocking = (flags & EFD_NONBLOCK) == 0;
2127    let eventfd_type =
2128        if (flags & EFD_SEMAPHORE) == 0 { EventFdType::Counter } else { EventFdType::Semaphore };
2129    let file = new_eventfd(locked, current_task, value, eventfd_type, blocking);
2130    let fd_flags = if flags & EFD_CLOEXEC != 0 { FdFlags::CLOEXEC } else { FdFlags::empty() };
2131    let fd = current_task.add_file(locked, file, fd_flags)?;
2132    Ok(fd)
2133}
2134
2135pub fn sys_pidfd_open(
2136    locked: &mut Locked<Unlocked>,
2137    current_task: &CurrentTask,
2138    pid: pid_t,
2139    flags: u32,
2140) -> Result<FdNumber, Errno> {
2141    if flags & !PIDFD_NONBLOCK != 0 {
2142        return error!(EINVAL);
2143    }
2144    if pid <= 0 {
2145        return error!(EINVAL);
2146    }
2147
2148    let file = {
2149        let pid_table = current_task.kernel().pids.read();
2150
2151        let blocking = (flags & PIDFD_NONBLOCK) == 0;
2152        let open_flags = if blocking { OpenFlags::empty() } else { OpenFlags::NONBLOCK };
2153
2154        // Validate that a process (and not just a task) entry exists for the PID.
2155        let task = pid_table.get_task(pid).ok();
2156        let file = match (pid_table.get_process(pid), task) {
2157            (Some(ProcessEntryRef::Process(proc)), Some(task)) => {
2158                new_pidfd(locked, current_task, &proc, &*task.mm()?, open_flags)
2159            }
2160            (Some(ProcessEntryRef::Zombie(_)), _) => {
2161                new_zombie_pidfd(locked, current_task, open_flags)
2162            }
2163            (None, Some(_)) => return error!(EINVAL),
2164            _ => return error!(ESRCH),
2165        };
2166        file
2167    };
2168
2169    current_task.add_file(locked, file, FdFlags::CLOEXEC)
2170}
2171
2172pub fn sys_pidfd_getfd(
2173    locked: &mut Locked<Unlocked>,
2174    current_task: &CurrentTask,
2175    pidfd: FdNumber,
2176    targetfd: FdNumber,
2177    flags: u32,
2178) -> Result<FdNumber, Errno> {
2179    if flags != 0 {
2180        return error!(EINVAL);
2181    }
2182
2183    let file = current_task.get_file(pidfd)?;
2184    let tg = file.as_thread_group_key()?;
2185    let tg = tg.upgrade().ok_or_else(|| errno!(ESRCH))?;
2186    let task = tg.read().get_running_task()?;
2187
2188    current_task.check_ptrace_access_mode(locked, PTRACE_MODE_ATTACH_REALCREDS, &task)?;
2189
2190    let target_file = task.running_state()?.files.get(targetfd)?;
2191    current_task.add_file(locked, target_file, FdFlags::CLOEXEC)
2192}
2193
2194pub fn sys_timerfd_create(
2195    locked: &mut Locked<Unlocked>,
2196    current_task: &CurrentTask,
2197    clock_id: u32,
2198    flags: u32,
2199) -> Result<FdNumber, Errno> {
2200    let timeline = match clock_id {
2201        CLOCK_MONOTONIC => Timeline::Monotonic,
2202        CLOCK_BOOTTIME | CLOCK_BOOTTIME_ALARM => Timeline::BootInstant,
2203        CLOCK_REALTIME | CLOCK_REALTIME_ALARM => Timeline::RealTime,
2204        _ => return error!(EINVAL),
2205    };
2206    let timer_type = match clock_id {
2207        CLOCK_MONOTONIC | CLOCK_BOOTTIME | CLOCK_REALTIME => TimerWakeup::Regular,
2208        CLOCK_BOOTTIME_ALARM | CLOCK_REALTIME_ALARM => {
2209            security::check_task_capable(current_task, CAP_WAKE_ALARM)?;
2210            TimerWakeup::Alarm
2211        }
2212        _ => return error!(EINVAL),
2213    };
2214    if flags & !(TFD_NONBLOCK | TFD_CLOEXEC) != 0 {
2215        track_stub!(TODO("https://fxbug.dev/322875488"), "timerfd_create unknown flags", flags);
2216        return error!(EINVAL);
2217    }
2218    log_trace!("timerfd_create(clock_id={:?}, flags={:#x})", clock_id, flags);
2219
2220    let mut open_flags = OpenFlags::RDWR;
2221    if flags & TFD_NONBLOCK != 0 {
2222        open_flags |= OpenFlags::NONBLOCK;
2223    }
2224
2225    let mut fd_flags = FdFlags::empty();
2226    if flags & TFD_CLOEXEC != 0 {
2227        fd_flags |= FdFlags::CLOEXEC;
2228    };
2229
2230    let timer = TimerFile::new_file(locked, current_task, timer_type, timeline, open_flags)?;
2231    let fd = current_task.add_file(locked, timer, fd_flags)?;
2232    Ok(fd)
2233}
2234
2235pub fn sys_timerfd_gettime(
2236    _locked: &mut Locked<Unlocked>,
2237    current_task: &CurrentTask,
2238    fd: FdNumber,
2239    user_current_value: ITimerSpecPtr,
2240) -> Result<(), Errno> {
2241    let file = current_task.get_file(fd)?;
2242    let timer_file = file.downcast_file::<TimerFile>().ok_or_else(|| errno!(EINVAL))?;
2243    let timer_info = timer_file.current_timer_spec();
2244    log_trace!("timerfd_gettime(fd={:?}, current_value={:?})", fd, timer_info);
2245    current_task.write_multi_arch_object(user_current_value, timer_info)?;
2246    Ok(())
2247}
2248
2249pub fn sys_timerfd_settime(
2250    _locked: &mut Locked<Unlocked>,
2251    current_task: &CurrentTask,
2252    fd: FdNumber,
2253    flags: u32,
2254    user_new_value: ITimerSpecPtr,
2255    user_old_value: ITimerSpecPtr,
2256) -> Result<(), Errno> {
2257    if flags & !(TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET) != 0 {
2258        track_stub!(TODO("https://fxbug.dev/322874722"), "timerfd_settime unknown flags", flags);
2259        return error!(EINVAL);
2260    }
2261
2262    let file = current_task.get_file(fd)?;
2263    let timer_file = file.downcast_file::<TimerFile>().ok_or_else(|| errno!(EINVAL))?;
2264
2265    if timer_file.wakeup_type() == TimerWakeup::Alarm {
2266        security::check_task_capable(current_task, CAP_WAKE_ALARM)?;
2267    }
2268
2269    let new_timer_spec = current_task.read_multi_arch_object(user_new_value)?;
2270    let old_timer_spec = timer_file.set_timer_spec(current_task, &file, new_timer_spec, flags)?;
2271    log_trace!(
2272        "timerfd_settime(fd={:?}, flags={:#x}, new_value={:?}, current_value={:?})",
2273        fd,
2274        flags,
2275        new_timer_spec,
2276        old_timer_spec
2277    );
2278    if !user_old_value.is_null() {
2279        current_task.write_multi_arch_object(user_old_value, old_timer_spec)?;
2280    }
2281    Ok(())
2282}
2283
2284fn deadline_after_timespec(
2285    current_task: &CurrentTask,
2286    user_timespec: TimeSpecPtr,
2287) -> Result<zx::MonotonicInstant, Errno> {
2288    if user_timespec.is_null() {
2289        Ok(zx::MonotonicInstant::INFINITE)
2290    } else {
2291        let timespec = current_task.read_multi_arch_object(user_timespec)?;
2292        Ok(zx::MonotonicInstant::after(duration_from_timespec(timespec)?))
2293    }
2294}
2295
2296static_assertions::assert_eq_size!(uapi::__kernel_fd_set, uapi::arch32::__kernel_fd_set);
2297
2298fn select(
2299    locked: &mut Locked<Unlocked>,
2300    current_task: &mut CurrentTask,
2301    nfds: u32,
2302    readfds_addr: UserRef<__kernel_fd_set>,
2303    writefds_addr: UserRef<__kernel_fd_set>,
2304    exceptfds_addr: UserRef<__kernel_fd_set>,
2305    deadline: zx::MonotonicInstant,
2306    sigmask_addr: UserRef<pselect6_sigmask>,
2307) -> Result<i32, Errno> {
2308    const BITS_PER_BYTE: usize = 8;
2309
2310    fn sizeof<T>(_: &T) -> usize {
2311        BITS_PER_BYTE * std::mem::size_of::<T>()
2312    }
2313    fn is_fd_set(set: &__kernel_fd_set, fd: usize) -> bool {
2314        let index = fd / sizeof(&set.fds_bits[0]);
2315        let remainder = fd % sizeof(&set.fds_bits[0]);
2316        set.fds_bits[index] & (1 << remainder) > 0
2317    }
2318    fn add_fd_to_set(set: &mut __kernel_fd_set, fd: usize) {
2319        let index = fd / sizeof(&set.fds_bits[0]);
2320        let remainder = fd % sizeof(&set.fds_bits[0]);
2321
2322        set.fds_bits[index] |= 1 << remainder;
2323    }
2324    let read_fd_set = |addr: UserRef<__kernel_fd_set>| {
2325        if addr.is_null() { Ok(Default::default()) } else { current_task.read_object(addr) }
2326    };
2327
2328    if nfds as usize > BITS_PER_BYTE * std::mem::size_of::<__kernel_fd_set>() {
2329        return error!(EINVAL);
2330    }
2331
2332    let read_events =
2333        FdEvents::from_bits_truncate(POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR);
2334    let write_events = FdEvents::from_bits_truncate(POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR);
2335    let except_events = FdEvents::from_bits_truncate(POLLPRI);
2336
2337    let readfds = read_fd_set(readfds_addr)?;
2338    let writefds = read_fd_set(writefds_addr)?;
2339    let exceptfds = read_fd_set(exceptfds_addr)?;
2340
2341    let sets = &[(read_events, &readfds), (write_events, &writefds), (except_events, &exceptfds)];
2342    let waiter = FileWaiter::<FdNumber>::default();
2343
2344    for fd in 0..nfds {
2345        let mut aggregated_events = FdEvents::empty();
2346        for (events, fds) in sets.iter() {
2347            if is_fd_set(fds, fd as usize) {
2348                aggregated_events |= *events;
2349            }
2350        }
2351        if !aggregated_events.is_empty() {
2352            let fd = FdNumber::from_raw(fd as i32);
2353            let file = current_task.get_file(fd)?;
2354            waiter.add(locked, current_task, fd, Some(&file), aggregated_events)?;
2355        }
2356    }
2357
2358    let mask = if !sigmask_addr.is_null() {
2359        let sigmask = current_task.read_object(sigmask_addr)?;
2360        let mask = if sigmask.ss.is_null() {
2361            current_task.read().signal_mask()
2362        } else {
2363            if sigmask.ss_len < std::mem::size_of::<sigset_t>() {
2364                return error!(EINVAL);
2365            }
2366            current_task.read_object(sigmask.ss.into())?
2367        };
2368        Some(mask)
2369    } else {
2370        None
2371    };
2372
2373    waiter.wait(locked, current_task, mask, deadline)?;
2374
2375    let mut num_fds = 0;
2376    let mut readfds_out: __kernel_fd_set = Default::default();
2377    let mut writefds_out: __kernel_fd_set = Default::default();
2378    let mut exceptfds_out: __kernel_fd_set = Default::default();
2379    let mut sets = [
2380        (read_events, &readfds, &mut readfds_out),
2381        (write_events, &writefds, &mut writefds_out),
2382        (except_events, &exceptfds, &mut exceptfds_out),
2383    ];
2384    let mut ready_items = waiter.ready_items.lock();
2385    for ReadyItem { key: ready_key, events: ready_events } in ready_items.drain(..) {
2386        let ready_key = assert_matches::assert_matches!(
2387            ready_key,
2388            ReadyItemKey::FdNumber(v) => v
2389        );
2390
2391        sets.iter_mut().for_each(|(events, fds, fds_out)| {
2392            let fd = ready_key.raw() as usize;
2393            if events.intersects(ready_events) && is_fd_set(fds, fd) {
2394                add_fd_to_set(fds_out, fd);
2395                num_fds += 1;
2396            }
2397        });
2398    }
2399
2400    let write_fd_set =
2401        |addr: UserRef<__kernel_fd_set>, value: __kernel_fd_set| -> Result<(), Errno> {
2402            if !addr.is_null() {
2403                current_task.write_object(addr, &value)?;
2404            }
2405            Ok(())
2406        };
2407    write_fd_set(readfds_addr, readfds_out)?;
2408    write_fd_set(writefds_addr, writefds_out)?;
2409    write_fd_set(exceptfds_addr, exceptfds_out)?;
2410    Ok(num_fds)
2411}
2412
2413pub fn sys_pselect6(
2414    locked: &mut Locked<Unlocked>,
2415    current_task: &mut CurrentTask,
2416    nfds: u32,
2417    readfds_addr: UserRef<__kernel_fd_set>,
2418    writefds_addr: UserRef<__kernel_fd_set>,
2419    exceptfds_addr: UserRef<__kernel_fd_set>,
2420    timeout_addr: TimeSpecPtr,
2421    sigmask_addr: UserRef<pselect6_sigmask>,
2422) -> Result<i32, Errno> {
2423    let deadline = deadline_after_timespec(current_task, timeout_addr)?;
2424
2425    let num_fds = select(
2426        locked,
2427        current_task,
2428        nfds,
2429        readfds_addr,
2430        writefds_addr,
2431        exceptfds_addr,
2432        deadline,
2433        sigmask_addr,
2434    )?;
2435
2436    if !timeout_addr.is_null()
2437        && !current_task
2438            .thread_group()
2439            .read()
2440            .personality
2441            .contains(PersonalityFlags::STICKY_TIMEOUTS)
2442    {
2443        let now = zx::MonotonicInstant::get();
2444        let remaining = std::cmp::max(deadline - now, zx::MonotonicDuration::from_seconds(0));
2445        current_task.write_multi_arch_object(timeout_addr, timespec_from_duration(remaining))?;
2446    }
2447
2448    Ok(num_fds)
2449}
2450
2451pub fn sys_select(
2452    locked: &mut Locked<Unlocked>,
2453    current_task: &mut CurrentTask,
2454    nfds: u32,
2455    readfds_addr: UserRef<__kernel_fd_set>,
2456    writefds_addr: UserRef<__kernel_fd_set>,
2457    exceptfds_addr: UserRef<__kernel_fd_set>,
2458    timeout_addr: TimeValPtr,
2459) -> Result<i32, Errno> {
2460    let start_time = zx::MonotonicInstant::get();
2461
2462    let deadline = if timeout_addr.is_null() {
2463        zx::MonotonicInstant::INFINITE
2464    } else {
2465        let timeval = current_task.read_multi_arch_object(timeout_addr)?;
2466        start_time + starnix_types::time::duration_from_timeval(timeval)?
2467    };
2468
2469    let num_fds = select(
2470        locked,
2471        current_task,
2472        nfds,
2473        readfds_addr,
2474        writefds_addr,
2475        exceptfds_addr,
2476        deadline,
2477        UserRef::<pselect6_sigmask>::default(),
2478    )?;
2479
2480    if !timeout_addr.is_null()
2481        && !current_task
2482            .thread_group()
2483            .read()
2484            .personality
2485            .contains(PersonalityFlags::STICKY_TIMEOUTS)
2486    {
2487        let now = zx::MonotonicInstant::get();
2488        let remaining = std::cmp::max(deadline - now, zx::MonotonicDuration::from_seconds(0));
2489        current_task.write_multi_arch_object(
2490            timeout_addr,
2491            starnix_types::time::timeval_from_duration(remaining),
2492        )?;
2493    }
2494
2495    Ok(num_fds)
2496}
2497
2498pub fn sys_epoll_create1(
2499    locked: &mut Locked<Unlocked>,
2500    current_task: &CurrentTask,
2501    flags: u32,
2502) -> Result<FdNumber, Errno> {
2503    if flags & !EPOLL_CLOEXEC != 0 {
2504        return error!(EINVAL);
2505    }
2506    let ep_file = EpollFileObject::new_file(locked, current_task);
2507    let fd_flags = if flags & EPOLL_CLOEXEC != 0 { FdFlags::CLOEXEC } else { FdFlags::empty() };
2508    let fd = current_task.add_file(locked, ep_file, fd_flags)?;
2509    Ok(fd)
2510}
2511
2512pub fn sys_epoll_ctl(
2513    locked: &mut Locked<Unlocked>,
2514    current_task: &CurrentTask,
2515    epfd: FdNumber,
2516    op: u32,
2517    fd: FdNumber,
2518    event: UserRef<EpollEvent>,
2519) -> Result<(), Errno> {
2520    let file = current_task.get_file(epfd)?;
2521    let epoll_file = file.downcast_file::<EpollFileObject>().ok_or_else(|| errno!(EINVAL))?;
2522    let operand_file = current_task.get_file(fd)?;
2523
2524    if Arc::ptr_eq(&file, &operand_file) {
2525        return error!(EINVAL);
2526    }
2527
2528    let epoll_event = match current_task.read_object(event) {
2529        Ok(mut epoll_event) => {
2530            // If EPOLLWAKEUP is specified in flags, but the caller does not have the CAP_BLOCK_SUSPEND
2531            // capability, then the EPOLLWAKEUP flag is silently ignored.
2532            // See https://man7.org/linux/man-pages/man2/epoll_ctl.2.html
2533            if epoll_event.events().contains(FdEvents::EPOLLWAKEUP) {
2534                if !security::is_task_capable_noaudit(current_task, CAP_BLOCK_SUSPEND) {
2535                    epoll_event.ignore(FdEvents::EPOLLWAKEUP);
2536                }
2537            }
2538            Ok(epoll_event)
2539        }
2540        result => result,
2541    };
2542
2543    match op {
2544        EPOLL_CTL_ADD => {
2545            epoll_file.add(locked, current_task, &operand_file, &file, epoll_event?)?;
2546            operand_file.register_epfd(&file);
2547        }
2548        EPOLL_CTL_MOD => {
2549            epoll_file.modify(locked, current_task, &operand_file, epoll_event?)?;
2550        }
2551        EPOLL_CTL_DEL => {
2552            epoll_file.delete(current_task, &operand_file)?;
2553            operand_file.unregister_epfd(&file);
2554        }
2555        _ => return error!(EINVAL),
2556    }
2557    Ok(())
2558}
2559
2560// Backend for sys_epoll_pwait and sys_epoll_pwait2 that takes an already-decoded deadline.
2561fn do_epoll_pwait(
2562    locked: &mut Locked<Unlocked>,
2563    current_task: &mut CurrentTask,
2564    epfd: FdNumber,
2565    events: UserRef<EpollEvent>,
2566    unvalidated_max_events: i32,
2567    deadline: zx::MonotonicInstant,
2568    user_sigmask: UserRef<SigSet>,
2569) -> Result<usize, Errno> {
2570    let file = current_task.get_file(epfd)?;
2571    let epoll_file = file.downcast_file::<EpollFileObject>().ok_or_else(|| errno!(EINVAL))?;
2572
2573    // Max_events must be greater than 0.
2574    let max_events: usize = unvalidated_max_events.try_into().map_err(|_| errno!(EINVAL))?;
2575    if max_events == 0 {
2576        return error!(EINVAL);
2577    }
2578
2579    // Return early if the user passes an obviously invalid pointer. This avoids dropping events
2580    // for common pointer errors. When we catch bad pointers after the wait is complete when the
2581    // memory is actually written, the events will be lost. This check is not a guarantee.
2582    current_task
2583        .mm()?
2584        .check_plausible(events.addr(), max_events * std::mem::size_of::<EpollEvent>())?;
2585
2586    let active_events = if !user_sigmask.is_null() {
2587        let signal_mask = current_task.read_object(user_sigmask)?;
2588        current_task.wait_with_temporary_mask(locked, signal_mask, |locked, current_task| {
2589            epoll_file.wait(locked, current_task, max_events, deadline)
2590        })?
2591    } else {
2592        epoll_file.wait(locked, current_task, max_events, deadline)?
2593    };
2594
2595    current_task.write_objects(events, &active_events)?;
2596    Ok(active_events.len())
2597}
2598
2599pub fn sys_epoll_pwait(
2600    locked: &mut Locked<Unlocked>,
2601    current_task: &mut CurrentTask,
2602    epfd: FdNumber,
2603    events: UserRef<EpollEvent>,
2604    max_events: i32,
2605    timeout: i32,
2606    user_sigmask: UserRef<SigSet>,
2607) -> Result<usize, Errno> {
2608    let deadline = zx::MonotonicInstant::after(duration_from_poll_timeout(timeout)?);
2609    do_epoll_pwait(locked, current_task, epfd, events, max_events, deadline, user_sigmask)
2610}
2611
2612pub fn sys_epoll_pwait2(
2613    locked: &mut Locked<Unlocked>,
2614    current_task: &mut CurrentTask,
2615    epfd: FdNumber,
2616    events: UserRef<EpollEvent>,
2617    max_events: i32,
2618    user_timespec: TimeSpecPtr,
2619    user_sigmask: UserRef<SigSet>,
2620) -> Result<usize, Errno> {
2621    let deadline = deadline_after_timespec(current_task, user_timespec)?;
2622    do_epoll_pwait(locked, current_task, epfd, events, max_events, deadline, user_sigmask)
2623}
2624
2625struct FileWaiter<Key: Into<ReadyItemKey>> {
2626    waiter: Waiter,
2627    ready_items: Arc<LockDepMutex<VecDeque<ReadyItem>, EventHandlerReadyQueueLock>>,
2628    _marker: PhantomData<Key>,
2629}
2630
2631impl<Key: Into<ReadyItemKey>> Default for FileWaiter<Key> {
2632    fn default() -> Self {
2633        Self { waiter: Waiter::new(), ready_items: Default::default(), _marker: PhantomData }
2634    }
2635}
2636
2637impl<Key: Into<ReadyItemKey>> FileWaiter<Key> {
2638    fn add<L>(
2639        &self,
2640        locked: &mut Locked<L>,
2641        current_task: &CurrentTask,
2642        key: Key,
2643        file: Option<&FileHandle>,
2644        requested_events: FdEvents,
2645    ) -> Result<(), Errno>
2646    where
2647        L: LockEqualOrBefore<FileOpsCore>,
2648    {
2649        let key = key.into();
2650
2651        if let Some(file) = file {
2652            let sought_events = requested_events | FdEvents::POLLERR | FdEvents::POLLHUP;
2653
2654            let handler =
2655                EventHandler::Enqueue { key, queue: self.ready_items.clone(), sought_events };
2656            file.wait_async(locked, current_task, &self.waiter, sought_events, handler);
2657            let current_events = file.query_events(locked, current_task)? & sought_events;
2658            if !current_events.is_empty() {
2659                self.ready_items.lock().push_back(ReadyItem { key, events: current_events });
2660            }
2661        } else {
2662            self.ready_items.lock().push_back(ReadyItem { key, events: FdEvents::POLLNVAL });
2663        }
2664        Ok(())
2665    }
2666
2667    fn wait<L>(
2668        &self,
2669        locked: &mut Locked<L>,
2670        current_task: &mut CurrentTask,
2671        signal_mask: Option<SigSet>,
2672        deadline: zx::MonotonicInstant,
2673    ) -> Result<(), Errno>
2674    where
2675        L: LockEqualOrBefore<FileOpsCore>,
2676    {
2677        if self.ready_items.lock().is_empty() {
2678            // When wait_until() returns Ok() it means there was a wake up; however there may not
2679            // be a ready item, for example if waiting on a sync file with multiple sync points.
2680            // Keep waiting until there's at least one ready item.
2681            let signal_mask = signal_mask.unwrap_or_else(|| current_task.read().signal_mask());
2682            let mut result = current_task.wait_with_temporary_mask(
2683                locked,
2684                signal_mask,
2685                |locked, current_task| self.waiter.wait_until(locked, current_task, deadline),
2686            );
2687            loop {
2688                match result {
2689                    Err(err) if err == ETIMEDOUT => return Ok(()),
2690                    Ok(()) => {
2691                        if !self.ready_items.lock().is_empty() {
2692                            break;
2693                        }
2694                    }
2695                    result => result?,
2696                };
2697                result = self.waiter.wait_until(locked, current_task, deadline);
2698            }
2699        }
2700        Ok(())
2701    }
2702}
2703
2704pub fn poll(
2705    locked: &mut Locked<Unlocked>,
2706    current_task: &mut CurrentTask,
2707    user_pollfds: UserRef<pollfd>,
2708    num_fds: i32,
2709    mask: Option<SigSet>,
2710    deadline: zx::MonotonicInstant,
2711) -> Result<usize, Errno> {
2712    if num_fds < 0
2713        || num_fds as u64 > current_task.thread_group().get_rlimit(locked, Resource::NOFILE)
2714    {
2715        return error!(EINVAL);
2716    }
2717
2718    let mut pollfds = vec![pollfd::default(); num_fds as usize];
2719    let waiter = FileWaiter::<usize>::default();
2720
2721    for (index, poll_descriptor) in pollfds.iter_mut().enumerate() {
2722        *poll_descriptor = current_task.read_object(user_pollfds.at(index)?)?;
2723        poll_descriptor.revents = 0;
2724        if poll_descriptor.fd < 0 {
2725            continue;
2726        }
2727        let file = current_task.get_file(FdNumber::from_raw(poll_descriptor.fd)).ok();
2728        waiter.add(
2729            locked,
2730            current_task,
2731            index,
2732            file.as_ref(),
2733            FdEvents::from_bits_truncate(poll_descriptor.events as u32),
2734        )?;
2735    }
2736
2737    waiter.wait(locked, current_task, mask, deadline)?;
2738
2739    let mut ready_items = waiter.ready_items.lock();
2740    let mut unique_ready_items =
2741        bit_vec::BitVec::from_elem(usize::try_from(num_fds).unwrap(), false);
2742    for ReadyItem { key: ready_key, events: ready_events } in ready_items.drain(..) {
2743        let ready_key = assert_matches::assert_matches!(
2744            ready_key,
2745            ReadyItemKey::Usize(v) => v
2746        );
2747        let interested_events = FdEvents::from_bits_truncate(pollfds[ready_key].events as u32)
2748            | FdEvents::POLLERR
2749            | FdEvents::POLLHUP
2750            | FdEvents::POLLNVAL;
2751        let return_events = (interested_events & ready_events).bits();
2752        pollfds[ready_key].revents = return_events as i16;
2753        unique_ready_items.set(ready_key, true);
2754    }
2755
2756    for (index, poll_descriptor) in pollfds.iter().enumerate() {
2757        current_task.write_object(user_pollfds.at(index)?, poll_descriptor)?;
2758    }
2759
2760    Ok(unique_ready_items.into_iter().filter(Clone::clone).count())
2761}
2762
2763pub fn sys_ppoll(
2764    locked: &mut Locked<Unlocked>,
2765    current_task: &mut CurrentTask,
2766    user_fds: UserRef<pollfd>,
2767    num_fds: i32,
2768    user_timespec: TimeSpecPtr,
2769    user_mask: UserRef<SigSet>,
2770    sigset_size: usize,
2771) -> Result<usize, Errno> {
2772    let start_time = zx::MonotonicInstant::get();
2773
2774    let timeout = if user_timespec.is_null() {
2775        // Passing -1 to poll is equivalent to an infinite timeout.
2776        -1
2777    } else {
2778        let ts = current_task.read_multi_arch_object(user_timespec)?;
2779        duration_from_timespec::<zx::MonotonicTimeline>(ts)?.into_millis() as i32
2780    };
2781
2782    let deadline = start_time + duration_from_poll_timeout(timeout)?;
2783
2784    let mask = if !user_mask.is_null() {
2785        if sigset_size != std::mem::size_of::<SigSet>() {
2786            return error!(EINVAL);
2787        }
2788        let mask = current_task.read_object(user_mask)?;
2789        Some(mask)
2790    } else {
2791        None
2792    };
2793
2794    let poll_result = poll(locked, current_task, user_fds, num_fds, mask, deadline);
2795
2796    if user_timespec.is_null() {
2797        return poll_result;
2798    }
2799
2800    let now = zx::MonotonicInstant::get();
2801    let remaining = std::cmp::max(deadline - now, zx::MonotonicDuration::from_seconds(0));
2802    let remaining_timespec = timespec_from_duration(remaining);
2803
2804    // From gVisor: "ppoll is normally restartable if interrupted by something other than a signal
2805    // handled by the application (i.e. returns ERESTARTNOHAND). However, if
2806    // [copy out] failed, then the restarted ppoll would use the wrong timeout, so the
2807    // error should be left as EINTR."
2808    match (current_task.write_multi_arch_object(user_timespec, remaining_timespec), poll_result) {
2809        // If write was ok, and poll was ok, return poll result.
2810        (Ok(_), Ok(num_events)) => Ok(num_events),
2811        (Ok(_), Err(e)) if e == EINTR => {
2812            error!(ERESTARTNOHAND)
2813        }
2814        (Ok(_), poll_result) => poll_result,
2815        // If write was a failure, return the poll result unchanged.
2816        (Err(_), poll_result) => poll_result,
2817    }
2818}
2819
2820pub fn sys_flock(
2821    locked: &mut Locked<Unlocked>,
2822    current_task: &CurrentTask,
2823    fd: FdNumber,
2824    operation: u32,
2825) -> Result<(), Errno> {
2826    let file = current_task.get_file(fd)?;
2827    let operation = FlockOperation::from_flags(operation)?;
2828    file.flock(locked, current_task, operation)
2829}
2830
2831pub fn sys_sync(locked: &mut Locked<Unlocked>, current_task: &CurrentTask) -> Result<(), Errno> {
2832    current_task.kernel().mounts.sync_all(locked, current_task)
2833}
2834
2835pub fn sys_syncfs(
2836    locked: &mut Locked<Unlocked>,
2837    current_task: &CurrentTask,
2838    fd: FdNumber,
2839) -> Result<(), Errno> {
2840    let file = current_task.get_file(fd)?;
2841    file.fs.sync(locked, current_task)
2842}
2843
2844pub fn sys_fsync(
2845    _locked: &mut Locked<Unlocked>,
2846    current_task: &CurrentTask,
2847    fd: FdNumber,
2848) -> Result<(), Errno> {
2849    let file = current_task.get_file(fd)?;
2850    file.sync(current_task)
2851}
2852
2853pub fn sys_fdatasync(
2854    _locked: &mut Locked<Unlocked>,
2855    current_task: &CurrentTask,
2856    fd: FdNumber,
2857) -> Result<(), Errno> {
2858    let file = current_task.get_file(fd)?;
2859    file.data_sync(current_task)
2860}
2861
2862pub fn sys_sync_file_range(
2863    _locked: &mut Locked<Unlocked>,
2864    current_task: &CurrentTask,
2865    fd: FdNumber,
2866    offset: off_t,
2867    length: off_t,
2868    flags: u32,
2869) -> Result<(), Errno> {
2870    const KNOWN_FLAGS: u32 = uapi::SYNC_FILE_RANGE_WAIT_BEFORE
2871        | uapi::SYNC_FILE_RANGE_WRITE
2872        | uapi::SYNC_FILE_RANGE_WAIT_AFTER;
2873    if flags & !KNOWN_FLAGS != 0 {
2874        return error!(EINVAL);
2875    }
2876
2877    let file = current_task.get_file(fd)?;
2878
2879    if offset < 0 || length < 0 {
2880        return error!(EINVAL);
2881    }
2882
2883    checked_add_offset_and_length(offset as usize, length as usize)?;
2884
2885    // From <https://linux.die.net/man/2/sync_file_range>:
2886    //
2887    //   fd refers to something other than a regular file, a block device, a directory, or a symbolic link.
2888    let mode = file.node().info().mode;
2889    if !mode.is_reg() && !mode.is_blk() && !mode.is_dir() && !mode.is_lnk() {
2890        return error!(ESPIPE);
2891    }
2892
2893    if flags == 0 {
2894        return Ok(());
2895    }
2896
2897    // Syncing the whole file is much more than we need for sync_file_range, which only needs to
2898    // sync the specified data range.
2899    file.data_sync(current_task)
2900}
2901
2902pub fn sys_fadvise64(
2903    _locked: &mut Locked<Unlocked>,
2904    current_task: &CurrentTask,
2905    fd: FdNumber,
2906    offset: off_t,
2907    len: off_t,
2908    advice: u32,
2909) -> Result<(), Errno> {
2910    match advice {
2911        POSIX_FADV_NORMAL => track_stub!(TODO("https://fxbug.dev/297434181"), "POSIX_FADV_NORMAL"),
2912        POSIX_FADV_RANDOM => track_stub!(TODO("https://fxbug.dev/297434181"), "POSIX_FADV_RANDOM"),
2913        POSIX_FADV_SEQUENTIAL => {
2914            track_stub!(TODO("https://fxbug.dev/297434181"), "POSIX_FADV_SEQUENTIAL")
2915        }
2916        POSIX_FADV_WILLNEED => {
2917            track_stub!(TODO("https://fxbug.dev/297434181"), "POSIX_FADV_WILLNEED")
2918        }
2919        POSIX_FADV_DONTNEED => {
2920            track_stub!(TODO("https://fxbug.dev/297434181"), "POSIX_FADV_DONTNEED")
2921        }
2922        POSIX_FADV_NOREUSE => {
2923            track_stub!(TODO("https://fxbug.dev/297434181"), "POSIX_FADV_NOREUSE")
2924        }
2925        _ => {
2926            track_stub!(TODO("https://fxbug.dev/322875684"), "fadvise64 unknown advice", advice);
2927            return error!(EINVAL);
2928        }
2929    }
2930
2931    if offset < 0 || len < 0 {
2932        return error!(EINVAL);
2933    }
2934
2935    let file = current_task.get_file(fd)?;
2936    // fadvise does not work on pipes.
2937    if file.downcast_file::<PipeFileObject>().is_some() {
2938        return error!(ESPIPE);
2939    }
2940
2941    // fadvise does not work on paths.
2942    if file.flags().contains(OpenFlags::PATH) {
2943        return error!(EBADF);
2944    }
2945
2946    Ok(())
2947}
2948
2949pub fn sys_fallocate(
2950    locked: &mut Locked<Unlocked>,
2951    current_task: &CurrentTask,
2952    fd: FdNumber,
2953    mode: u32,
2954    offset: off_t,
2955    len: off_t,
2956) -> Result<(), Errno> {
2957    let file = current_task.get_file(fd)?;
2958
2959    // Offset must not be less than 0.
2960    // Length must not be less than or equal to 0.
2961    // See https://man7.org/linux/man-pages/man2/fallocate.2.html#ERRORS
2962    if offset < 0 || len <= 0 {
2963        return error!(EINVAL);
2964    }
2965
2966    let mode = FallocMode::from_bits(mode).ok_or_else(|| errno!(EINVAL))?;
2967    file.fallocate(locked, current_task, mode, offset as u64, len as u64)?;
2968
2969    Ok(())
2970}
2971
2972pub fn sys_inotify_init1(
2973    locked: &mut Locked<Unlocked>,
2974    current_task: &CurrentTask,
2975    flags: u32,
2976) -> Result<FdNumber, Errno> {
2977    if flags & !(IN_NONBLOCK | IN_CLOEXEC) != 0 {
2978        return error!(EINVAL);
2979    }
2980    let non_blocking = flags & IN_NONBLOCK != 0;
2981    let close_on_exec = flags & IN_CLOEXEC != 0;
2982    let inotify_file = InotifyFileObject::new_file(locked, current_task, non_blocking);
2983    let fd_flags = if close_on_exec { FdFlags::CLOEXEC } else { FdFlags::empty() };
2984    current_task.add_file(locked, inotify_file, fd_flags)
2985}
2986
2987pub fn sys_inotify_add_watch(
2988    locked: &mut Locked<Unlocked>,
2989    current_task: &CurrentTask,
2990    fd: FdNumber,
2991    user_path: UserCString,
2992    mask: u32,
2993) -> Result<WdNumber, Errno> {
2994    let mask = InotifyMask::from_bits(mask).ok_or_else(|| errno!(EINVAL))?;
2995    if !mask.intersects(InotifyMask::ALL_EVENTS) {
2996        // Mask must include at least 1 event.
2997        return error!(EINVAL);
2998    }
2999    let file = current_task.get_file(fd)?;
3000    let inotify_file = file.downcast_file::<InotifyFileObject>().ok_or_else(|| errno!(EINVAL))?;
3001    let options = if mask.contains(InotifyMask::DONT_FOLLOW) {
3002        LookupFlags::no_follow()
3003    } else {
3004        LookupFlags::default()
3005    };
3006    let watched_node = lookup_at(locked, current_task, FdNumber::AT_FDCWD, user_path, options)?;
3007    if mask.contains(InotifyMask::ONLYDIR) && !watched_node.entry.node.is_dir() {
3008        return error!(ENOTDIR);
3009    }
3010    inotify_file.add_watch(watched_node.entry, mask, &file)
3011}
3012
3013pub fn sys_inotify_rm_watch(
3014    _locked: &mut Locked<Unlocked>,
3015    current_task: &CurrentTask,
3016    fd: FdNumber,
3017    watch_id: WdNumber,
3018) -> Result<(), Errno> {
3019    let file = current_task.get_file(fd)?;
3020    let inotify_file = file.downcast_file::<InotifyFileObject>().ok_or_else(|| errno!(EINVAL))?;
3021    inotify_file.remove_watch(watch_id, &file)
3022}
3023
3024pub fn sys_utimensat(
3025    locked: &mut Locked<Unlocked>,
3026    current_task: &CurrentTask,
3027    dir_fd: FdNumber,
3028    user_path: UserCString,
3029    user_times: TimeSpecPtr,
3030    flags: u32,
3031) -> Result<(), Errno> {
3032    let (atime, mtime) = if user_times.addr().is_null() {
3033        // If user_times is null, the timestamps are updated to the current time.
3034        (TimeUpdateType::Now, TimeUpdateType::Now)
3035    } else {
3036        let ts = current_task.read_multi_arch_objects_to_vec(user_times, 2)?;
3037        let atime = ts[0];
3038        let mtime = ts[1];
3039        let parse_timespec = |spec: timespec| match spec.tv_nsec {
3040            UTIME_NOW => Ok(TimeUpdateType::Now),
3041            UTIME_OMIT => Ok(TimeUpdateType::Omit),
3042            _ => time_from_timespec(spec).map(TimeUpdateType::Time),
3043        };
3044        (parse_timespec(atime)?, parse_timespec(mtime)?)
3045    };
3046
3047    if let (TimeUpdateType::Omit, TimeUpdateType::Omit) = (atime, mtime) {
3048        return Ok(());
3049    };
3050
3051    // Non-standard feature: if user_path is null, the timestamps are updated on the file referred
3052    // to by dir_fd.
3053    // See https://man7.org/linux/man-pages/man2/utimensat.2.html
3054    let name = if user_path.addr().is_null() {
3055        if dir_fd == FdNumber::AT_FDCWD {
3056            return error!(EFAULT);
3057        }
3058        let (node, _) = current_task.resolve_dir_fd(
3059            locked,
3060            dir_fd,
3061            Default::default(),
3062            ResolveFlags::empty(),
3063        )?;
3064        node
3065    } else {
3066        let lookup_flags = LookupFlags::from_bits(flags, AT_SYMLINK_NOFOLLOW)?;
3067        lookup_at(locked, current_task, dir_fd, user_path, lookup_flags)?
3068    };
3069    name.entry.node.update_atime_mtime(locked, current_task, &name.mount, atime, mtime)?;
3070    let event_mask = match (atime, mtime) {
3071        (_, TimeUpdateType::Omit) => InotifyMask::ACCESS,
3072        (TimeUpdateType::Omit, _) => InotifyMask::MODIFY,
3073        (_, _) => InotifyMask::ATTRIB,
3074    };
3075    name.entry.notify_ignoring_excl_unlink(event_mask);
3076    Ok(())
3077}
3078
3079pub fn sys_splice(
3080    locked: &mut Locked<Unlocked>,
3081    current_task: &CurrentTask,
3082    fd_in: FdNumber,
3083    off_in: OffsetPtr,
3084    fd_out: FdNumber,
3085    off_out: OffsetPtr,
3086    len: usize,
3087    flags: u32,
3088) -> Result<usize, Errno> {
3089    splice::splice(locked, current_task, fd_in, off_in, fd_out, off_out, len, flags)
3090}
3091
3092pub fn sys_vmsplice(
3093    locked: &mut Locked<Unlocked>,
3094    current_task: &CurrentTask,
3095    fd: FdNumber,
3096    iovec_addr: IOVecPtr,
3097    iovec_count: UserValue<i32>,
3098    flags: u32,
3099) -> Result<usize, Errno> {
3100    splice::vmsplice(locked, current_task, fd, iovec_addr, iovec_count, flags)
3101}
3102
3103pub fn sys_copy_file_range(
3104    locked: &mut Locked<Unlocked>,
3105    current_task: &CurrentTask,
3106    fd_in: FdNumber,
3107    off_in: OffsetPtr,
3108    fd_out: FdNumber,
3109    off_out: OffsetPtr,
3110    len: usize,
3111    flags: u32,
3112) -> Result<usize, Errno> {
3113    splice::copy_file_range(locked, current_task, fd_in, off_in, fd_out, off_out, len, flags)
3114}
3115
3116pub fn sys_tee(
3117    locked: &mut Locked<Unlocked>,
3118    current_task: &CurrentTask,
3119    fd_in: FdNumber,
3120    fd_out: FdNumber,
3121    len: usize,
3122    flags: u32,
3123) -> Result<usize, Errno> {
3124    splice::tee(locked, current_task, fd_in, fd_out, len, flags)
3125}
3126
3127pub fn sys_readahead(
3128    _locked: &mut Locked<Unlocked>,
3129    current_task: &CurrentTask,
3130    fd: FdNumber,
3131    offset: off_t,
3132    length: usize,
3133) -> Result<(), Errno> {
3134    let file = current_task.get_file(fd)?;
3135    // Allow only non-negative values of `offset`. Some versions of Linux allow it to be negative,
3136    // but GVisor tests require `readahead()` to fail in this case.
3137    let offset: usize = offset.try_into().map_err(|_| errno!(EINVAL))?;
3138    file.readahead(current_task, offset, length)
3139}
3140
3141pub fn sys_io_setup(
3142    _locked: &mut Locked<Unlocked>,
3143    current_task: &CurrentTask,
3144    user_nr_events: UserValue<u32>,
3145    user_ctx_idp: MultiArchUserRef<uapi::aio_context_t, uapi::arch32::aio_context_t>,
3146) -> Result<(), Errno> {
3147    // From https://man7.org/linux/man-pages/man2/io_setup.2.html:
3148    //
3149    //   EINVAL ctx_idp is not initialized, or the specified nr_events
3150    //   exceeds internal limits.  nr_events should be greater than
3151    //   0.
3152    //
3153    // TODO: Determine what "internal limits" means.
3154    let max_operations =
3155        user_nr_events.validate(0..(i32::MAX as u32)).ok_or_else(|| errno!(EINVAL))? as usize;
3156    if current_task.read_multi_arch_object(user_ctx_idp)? != 0 {
3157        return error!(EINVAL);
3158    }
3159    let ctx_id = AioContext::create(current_task, max_operations)?;
3160    current_task.write_multi_arch_object(user_ctx_idp, ctx_id).map_err(|e| {
3161        let _ = current_task
3162            .mm()
3163            .expect("previous sys_io_setup code verified mm exists")
3164            .destroy_aio_context(ctx_id.into());
3165        e
3166    })?;
3167    Ok(())
3168}
3169
3170pub fn sys_io_submit(
3171    _locked: &mut Locked<Unlocked>,
3172    current_task: &CurrentTask,
3173    ctx_id: aio_context_t,
3174    user_nr: UserValue<i32>,
3175    mut iocb_addrs: IocbPtrPtr,
3176) -> Result<i32, Errno> {
3177    let nr = user_nr.validate(0..i32::MAX).ok_or_else(|| errno!(EINVAL))?;
3178    if nr == 0 {
3179        return Ok(0);
3180    }
3181    let ctx = current_task.mm()?.get_aio_context(ctx_id.into()).ok_or_else(|| errno!(EINVAL))?;
3182
3183    // `iocbpp` is an array of addresses to iocb's.
3184    let mut num_submitted: i32 = 0;
3185    loop {
3186        let iocb_ref = current_task.read_multi_arch_ptr(iocb_addrs)?;
3187        let control_block = current_task.read_multi_arch_object(iocb_ref)?;
3188
3189        match (num_submitted, ctx.submit(current_task, control_block, iocb_ref)) {
3190            (0, Err(e)) => return Err(e),
3191            (_, Err(_)) => break,
3192            (_, Ok(())) => {
3193                num_submitted += 1;
3194                if num_submitted == nr {
3195                    break;
3196                }
3197            }
3198        };
3199
3200        iocb_addrs = iocb_addrs.next()?;
3201    }
3202
3203    Ok(num_submitted)
3204}
3205
3206pub fn sys_io_getevents(
3207    _locked: &mut Locked<Unlocked>,
3208    current_task: &CurrentTask,
3209    ctx_id: aio_context_t,
3210    min_nr: i64,
3211    nr: i64,
3212    events_ref: UserRef<io_event>,
3213    user_timeout: TimeSpecPtr,
3214) -> Result<i32, Errno> {
3215    if min_nr < 0 || min_nr > nr || nr < 0 {
3216        return error!(EINVAL);
3217    }
3218    let min_results = min_nr as usize;
3219    let max_results = nr as usize;
3220    let deadline = deadline_after_timespec(current_task, user_timeout)?;
3221
3222    let ctx = current_task.mm()?.get_aio_context(ctx_id.into()).ok_or_else(|| errno!(EINVAL))?;
3223    let events = ctx.get_events(current_task, min_results, max_results, deadline)?;
3224    current_task.write_objects(events_ref, &events)?;
3225
3226    Ok(events.len() as i32)
3227}
3228
3229pub fn sys_io_cancel(
3230    _locked: &mut Locked<Unlocked>,
3231    current_task: &CurrentTask,
3232    ctx_id: aio_context_t,
3233    user_iocb: IocbPtr,
3234    _result: UserRef<io_event>,
3235) -> Result<(), Errno> {
3236    let iocb = current_task.read_multi_arch_object(user_iocb)?;
3237    let ctx = current_task.mm()?.get_aio_context(ctx_id.into()).ok_or_else(|| errno!(EINVAL))?;
3238
3239    ctx.cancel(current_task, iocb, user_iocb)?;
3240    // TODO: Correctly handle return. If the operation is successfully canceled, the event should be copied into the memory pointed to by result without being placed into the completion queue.
3241    track_stub!(TODO("https://fxbug.dev/297433877"), "io_cancel");
3242    Ok(())
3243}
3244
3245pub fn sys_io_destroy(
3246    _locked: &mut Locked<Unlocked>,
3247    current_task: &CurrentTask,
3248    ctx_id: aio_context_t,
3249) -> Result<(), Errno> {
3250    let aio_context = current_task.mm()?.destroy_aio_context(ctx_id.into())?;
3251    std::mem::drop(aio_context);
3252    Ok(())
3253}
3254
3255// Syscalls for arch32 usage
3256#[cfg(target_arch = "aarch64")]
3257mod arch32 {
3258    use crate::mm::MemoryAccessorExt;
3259    use crate::task::CurrentTask;
3260    use crate::vfs::syscalls::{
3261        LookupFlags, OpenFlags, lookup_at, sys_dup3, sys_faccessat, sys_fallocate, sys_lseek,
3262        sys_mkdirat, sys_openat, sys_readlinkat, sys_unlinkat,
3263    };
3264    use crate::vfs::{FdNumber, FsNode};
3265    use linux_uapi::off_t;
3266    use starnix_sync::{Locked, Unlocked};
3267    use starnix_syscalls::SyscallArg;
3268    use starnix_types::time::duration_from_poll_timeout;
3269    use starnix_uapi::errors::Errno;
3270    use starnix_uapi::file_mode::FileMode;
3271    use starnix_uapi::signals::SigSet;
3272    use starnix_uapi::user_address::{MultiArchUserRef, UserAddress, UserCString, UserRef};
3273    use starnix_uapi::vfs::EpollEvent;
3274    use starnix_uapi::{AT_REMOVEDIR, errno, error, uapi};
3275
3276    type StatFs64Ptr = MultiArchUserRef<uapi::statfs, uapi::arch32::statfs64>;
3277
3278    fn merge_low_and_high(low: u32, high: u32) -> off_t {
3279        ((high as off_t) << 32) | (low as off_t)
3280    }
3281
3282    pub fn sys_arch32_open(
3283        locked: &mut Locked<Unlocked>,
3284        current_task: &CurrentTask,
3285        user_path: UserCString,
3286        flags: u32,
3287        mode: FileMode,
3288    ) -> Result<FdNumber, Errno> {
3289        sys_openat(locked, current_task, FdNumber::AT_FDCWD, user_path, flags, mode)
3290    }
3291
3292    pub fn sys_arch32_access(
3293        locked: &mut Locked<Unlocked>,
3294        current_task: &CurrentTask,
3295        user_path: UserCString,
3296        mode: u32,
3297    ) -> Result<(), Errno> {
3298        sys_faccessat(locked, current_task, FdNumber::AT_FDCWD, user_path, mode)
3299    }
3300    pub fn stat64(
3301        locked: &mut Locked<Unlocked>,
3302        current_task: &CurrentTask,
3303        node: &FsNode,
3304        arch32_stat_buf: UserRef<uapi::arch32::stat64>,
3305    ) -> Result<(), Errno> {
3306        let stat_buffer = node.stat(locked, current_task)?;
3307        let result: uapi::arch32::stat64 = stat_buffer.try_into().map_err(|_| errno!(EINVAL))?;
3308        // Now we copy to the arch32 version and write.
3309        current_task.write_object(arch32_stat_buf, &result)?;
3310        Ok(())
3311    }
3312
3313    pub fn sys_arch32_fstat64(
3314        locked: &mut Locked<Unlocked>,
3315        current_task: &CurrentTask,
3316        fd: FdNumber,
3317        arch32_stat_buf: UserRef<uapi::arch32::stat64>,
3318    ) -> Result<(), Errno> {
3319        let file = current_task.get_file_allowing_opath(fd)?;
3320        stat64(locked, current_task, file.node(), arch32_stat_buf)
3321    }
3322
3323    pub fn sys_arch32_fallocate(
3324        locked: &mut Locked<Unlocked>,
3325        current_task: &CurrentTask,
3326        fd: FdNumber,
3327        mode: u32,
3328        offset_low: u32,
3329        offset_high: u32,
3330        len_low: u32,
3331        len_high: u32,
3332    ) -> Result<(), Errno> {
3333        let offset = merge_low_and_high(offset_low, offset_high);
3334        let len = merge_low_and_high(len_low, len_high);
3335        sys_fallocate(locked, current_task, fd, mode, offset, len)
3336    }
3337
3338    pub fn sys_arch32_stat64(
3339        locked: &mut Locked<Unlocked>,
3340        current_task: &CurrentTask,
3341        user_path: UserCString,
3342        arch32_stat_buf: UserRef<uapi::arch32::stat64>,
3343    ) -> Result<(), Errno> {
3344        let name =
3345            lookup_at(locked, current_task, FdNumber::AT_FDCWD, user_path, LookupFlags::default())?;
3346        stat64(locked, current_task, &name.entry.node, arch32_stat_buf)
3347    }
3348
3349    pub fn sys_arch32_readlink(
3350        locked: &mut Locked<Unlocked>,
3351        current_task: &CurrentTask,
3352        user_path: UserCString,
3353        buffer: UserAddress,
3354        buffer_size: usize,
3355    ) -> Result<usize, Errno> {
3356        sys_readlinkat(locked, current_task, FdNumber::AT_FDCWD, user_path, buffer, buffer_size)
3357    }
3358
3359    pub fn sys_arch32_mkdir(
3360        locked: &mut Locked<Unlocked>,
3361        current_task: &CurrentTask,
3362        user_path: UserCString,
3363        mode: FileMode,
3364    ) -> Result<(), Errno> {
3365        sys_mkdirat(locked, current_task, FdNumber::AT_FDCWD, user_path, mode)
3366    }
3367
3368    pub fn sys_arch32_rmdir(
3369        locked: &mut Locked<Unlocked>,
3370        current_task: &CurrentTask,
3371        user_path: UserCString,
3372    ) -> Result<(), Errno> {
3373        sys_unlinkat(locked, current_task, FdNumber::AT_FDCWD, user_path, AT_REMOVEDIR)
3374    }
3375
3376    #[allow(non_snake_case)]
3377    pub fn sys_arch32__llseek(
3378        locked: &mut Locked<Unlocked>,
3379        current_task: &CurrentTask,
3380        fd: FdNumber,
3381        offset_high: u32,
3382        offset_low: u32,
3383        result: UserRef<off_t>,
3384        whence: u32,
3385    ) -> Result<(), Errno> {
3386        let offset = merge_low_and_high(offset_low, offset_high);
3387        let result_value = sys_lseek(locked, current_task, fd, offset, whence)?;
3388        current_task.write_object(result, &result_value).map(|_| ())
3389    }
3390
3391    pub fn sys_arch32_dup2(
3392        locked: &mut Locked<Unlocked>,
3393        current_task: &CurrentTask,
3394        oldfd: FdNumber,
3395        newfd: FdNumber,
3396    ) -> Result<FdNumber, Errno> {
3397        if oldfd == newfd {
3398            // O_PATH allowed for:
3399            //
3400            //  Duplicating the file descriptor (dup(2), fcntl(2)
3401            //  F_DUPFD, etc.).
3402            //
3403            // See https://man7.org/linux/man-pages/man2/open.2.html
3404            current_task.get_file_allowing_opath(oldfd)?;
3405            return Ok(newfd);
3406        }
3407        sys_dup3(locked, current_task, oldfd, newfd, 0)
3408    }
3409
3410    pub fn sys_arch32_unlink(
3411        locked: &mut Locked<Unlocked>,
3412        current_task: &CurrentTask,
3413        user_path: UserCString,
3414    ) -> Result<(), Errno> {
3415        sys_unlinkat(locked, current_task, FdNumber::AT_FDCWD, user_path, 0)
3416    }
3417
3418    pub fn sys_arch32_pread64(
3419        locked: &mut Locked<Unlocked>,
3420        current_task: &CurrentTask,
3421        fd: FdNumber,
3422        address: UserAddress,
3423        length: usize,
3424        _: SyscallArg,
3425        offset_low: u32,
3426        offset_high: u32,
3427    ) -> Result<usize, Errno> {
3428        super::sys_pread64(
3429            locked,
3430            current_task,
3431            fd,
3432            address,
3433            length,
3434            merge_low_and_high(offset_low, offset_high),
3435        )
3436    }
3437
3438    pub fn sys_arch32_pwrite64(
3439        locked: &mut Locked<Unlocked>,
3440        current_task: &CurrentTask,
3441        fd: FdNumber,
3442        address: UserAddress,
3443        length: usize,
3444        _: SyscallArg,
3445        offset_low: u32,
3446        offset_high: u32,
3447    ) -> Result<usize, Errno> {
3448        super::sys_pwrite64(
3449            locked,
3450            current_task,
3451            fd,
3452            address,
3453            length,
3454            merge_low_and_high(offset_low, offset_high),
3455        )
3456    }
3457
3458    pub fn sys_arch32_truncate64(
3459        locked: &mut Locked<Unlocked>,
3460        current_task: &CurrentTask,
3461        user_path: UserCString,
3462        _unused: SyscallArg,
3463        length_low: u32,
3464        length_high: u32,
3465    ) -> Result<(), Errno> {
3466        super::sys_truncate(
3467            locked,
3468            current_task,
3469            user_path,
3470            merge_low_and_high(length_low, length_high),
3471        )
3472    }
3473
3474    pub fn sys_arch32_ftruncate64(
3475        locked: &mut Locked<Unlocked>,
3476        current_task: &CurrentTask,
3477        fd: FdNumber,
3478        _: SyscallArg,
3479        length_low: u32,
3480        length_high: u32,
3481    ) -> Result<(), Errno> {
3482        super::sys_ftruncate(locked, current_task, fd, merge_low_and_high(length_low, length_high))
3483    }
3484
3485    pub fn sys_arch32_chmod(
3486        locked: &mut Locked<Unlocked>,
3487        current_task: &CurrentTask,
3488        user_path: UserCString,
3489        mode: FileMode,
3490    ) -> Result<(), Errno> {
3491        super::sys_fchmodat(locked, current_task, FdNumber::AT_FDCWD, user_path, mode)
3492    }
3493
3494    pub fn sys_arch32_chown32(
3495        locked: &mut Locked<Unlocked>,
3496        current_task: &CurrentTask,
3497        user_path: UserCString,
3498        owner: uapi::arch32::__kernel_uid32_t,
3499        group: uapi::arch32::__kernel_uid32_t,
3500    ) -> Result<(), Errno> {
3501        super::sys_fchownat(locked, current_task, FdNumber::AT_FDCWD, user_path, owner, group, 0)
3502    }
3503
3504    pub fn sys_arch32_poll(
3505        locked: &mut Locked<Unlocked>,
3506        current_task: &mut CurrentTask,
3507        user_fds: UserRef<uapi::pollfd>,
3508        num_fds: i32,
3509        timeout: i32,
3510    ) -> Result<usize, Errno> {
3511        let deadline = zx::MonotonicInstant::after(duration_from_poll_timeout(timeout)?);
3512        super::poll(locked, current_task, user_fds, num_fds, None, deadline)
3513    }
3514
3515    pub fn sys_arch32_epoll_create(
3516        locked: &mut Locked<Unlocked>,
3517        current_task: &CurrentTask,
3518        size: i32,
3519    ) -> Result<FdNumber, Errno> {
3520        if size < 1 {
3521            // The man page for epoll_create says the size was used in a previous implementation as
3522            // a hint but no longer does anything. But it's still required to be >= 1 to ensure
3523            // programs are backwards-compatible.
3524            return error!(EINVAL);
3525        }
3526        super::sys_epoll_create1(locked, current_task, 0)
3527    }
3528
3529    pub fn sys_arch32_epoll_wait(
3530        locked: &mut Locked<Unlocked>,
3531        current_task: &mut CurrentTask,
3532        epfd: FdNumber,
3533        events: UserRef<EpollEvent>,
3534        max_events: i32,
3535        timeout: i32,
3536    ) -> Result<usize, Errno> {
3537        super::sys_epoll_pwait(
3538            locked,
3539            current_task,
3540            epfd,
3541            events,
3542            max_events,
3543            timeout,
3544            UserRef::<SigSet>::default(),
3545        )
3546    }
3547
3548    pub fn sys_arch32_rename(
3549        locked: &mut Locked<Unlocked>,
3550        current_task: &CurrentTask,
3551        old_user_path: UserCString,
3552        new_user_path: UserCString,
3553    ) -> Result<(), Errno> {
3554        super::sys_renameat2(
3555            locked,
3556            current_task,
3557            FdNumber::AT_FDCWD,
3558            old_user_path,
3559            FdNumber::AT_FDCWD,
3560            new_user_path,
3561            0,
3562        )
3563    }
3564
3565    pub fn sys_arch32_creat(
3566        locked: &mut Locked<Unlocked>,
3567        current_task: &CurrentTask,
3568        user_path: UserCString,
3569        mode: FileMode,
3570    ) -> Result<FdNumber, Errno> {
3571        super::sys_openat(
3572            locked,
3573            current_task,
3574            FdNumber::AT_FDCWD,
3575            user_path,
3576            (OpenFlags::WRONLY | OpenFlags::CREAT | OpenFlags::TRUNC).bits(),
3577            mode,
3578        )
3579    }
3580
3581    pub fn sys_arch32_symlink(
3582        locked: &mut Locked<Unlocked>,
3583        current_task: &CurrentTask,
3584        user_target: UserCString,
3585        user_path: UserCString,
3586    ) -> Result<(), Errno> {
3587        super::sys_symlinkat(locked, current_task, user_target, FdNumber::AT_FDCWD, user_path)
3588    }
3589
3590    pub fn sys_arch32_eventfd(
3591        locked: &mut Locked<Unlocked>,
3592        current_task: &CurrentTask,
3593        value: u32,
3594    ) -> Result<FdNumber, Errno> {
3595        super::sys_eventfd2(locked, current_task, value, 0)
3596    }
3597
3598    pub fn sys_arch32_inotify_init(
3599        locked: &mut Locked<Unlocked>,
3600        current_task: &CurrentTask,
3601    ) -> Result<FdNumber, Errno> {
3602        super::sys_inotify_init1(locked, current_task, 0)
3603    }
3604
3605    pub fn sys_arch32_link(
3606        locked: &mut Locked<Unlocked>,
3607        current_task: &CurrentTask,
3608        old_user_path: UserCString,
3609        new_user_path: UserCString,
3610    ) -> Result<(), Errno> {
3611        super::sys_linkat(
3612            locked,
3613            current_task,
3614            FdNumber::AT_FDCWD,
3615            old_user_path,
3616            FdNumber::AT_FDCWD,
3617            new_user_path,
3618            0,
3619        )
3620    }
3621
3622    pub fn sys_arch32_fstatfs64(
3623        locked: &mut Locked<Unlocked>,
3624        current_task: &CurrentTask,
3625        fd: FdNumber,
3626        user_buf_len: u32,
3627        user_buf: StatFs64Ptr,
3628    ) -> Result<(), Errno> {
3629        if (user_buf_len as usize) < std::mem::size_of::<uapi::arch32::statfs64>() {
3630            return error!(EINVAL);
3631        }
3632        super::fstatfs(locked, current_task, fd, user_buf)
3633    }
3634
3635    pub fn sys_arch32_statfs64(
3636        locked: &mut Locked<Unlocked>,
3637        current_task: &CurrentTask,
3638        user_path: UserCString,
3639        user_buf_len: u32,
3640        user_buf: StatFs64Ptr,
3641    ) -> Result<(), Errno> {
3642        if (user_buf_len as usize) < std::mem::size_of::<uapi::arch32::statfs64>() {
3643            return error!(EINVAL);
3644        }
3645        super::statfs(locked, current_task, user_path, user_buf)
3646    }
3647
3648    pub fn sys_arch32_arm_fadvise64_64(
3649        locked: &mut Locked<Unlocked>,
3650        current_task: &CurrentTask,
3651        fd: FdNumber,
3652        advice: u32,
3653        offset_low: u32,
3654        offset_high: u32,
3655        len_low: u32,
3656        len_high: u32,
3657    ) -> Result<(), Errno> {
3658        let offset = merge_low_and_high(offset_low, offset_high);
3659        let len = merge_low_and_high(len_low, len_high);
3660        super::sys_fadvise64(locked, current_task, fd, offset, len, advice)
3661    }
3662
3663    pub fn sys_arch32_sendfile64(
3664        locked: &mut Locked<Unlocked>,
3665        current_task: &CurrentTask,
3666        out_fd: FdNumber,
3667        in_fd: FdNumber,
3668        user_offset: UserRef<uapi::off_t>,
3669        count: i32,
3670    ) -> Result<usize, Errno> {
3671        super::sys_sendfile(locked, current_task, out_fd, in_fd, user_offset.into(), count)
3672    }
3673
3674    pub use super::{
3675        sys_chdir as sys_arch32_chdir, sys_chroot as sys_arch32_chroot,
3676        sys_copy_file_range as sys_arch32_copy_file_range, sys_dup3 as sys_arch32_dup3,
3677        sys_epoll_create1 as sys_arch32_epoll_create1, sys_epoll_ctl as sys_arch32_epoll_ctl,
3678        sys_epoll_pwait as sys_arch32_epoll_pwait, sys_epoll_pwait2 as sys_arch32_epoll_pwait2,
3679        sys_eventfd2 as sys_arch32_eventfd2, sys_fchmod as sys_arch32_fchmod,
3680        sys_fchmodat as sys_arch32_fchmodat, sys_fchown as sys_arch32_fchown32,
3681        sys_fchown as sys_arch32_fchown, sys_fchownat as sys_arch32_fchownat,
3682        sys_fdatasync as sys_arch32_fdatasync, sys_flock as sys_arch32_flock,
3683        sys_fsetxattr as sys_arch32_fsetxattr, sys_fstatat64 as sys_arch32_fstatat64,
3684        sys_fstatfs as sys_arch32_fstatfs, sys_fsync as sys_arch32_fsync,
3685        sys_ftruncate as sys_arch32_ftruncate,
3686        sys_inotify_add_watch as sys_arch32_inotify_add_watch,
3687        sys_inotify_init1 as sys_arch32_inotify_init1,
3688        sys_inotify_rm_watch as sys_arch32_inotify_rm_watch, sys_io_cancel as sys_arch32_io_cancel,
3689        sys_io_destroy as sys_arch32_io_destroy, sys_io_getevents as sys_arch32_io_getevents,
3690        sys_io_setup as sys_arch32_io_setup, sys_io_submit as sys_arch32_io_submit,
3691        sys_lgetxattr as sys_arch32_lgetxattr, sys_linkat as sys_arch32_linkat,
3692        sys_listxattr as sys_arch32_listxattr, sys_llistxattr as sys_arch32_llistxattr,
3693        sys_lsetxattr as sys_arch32_lsetxattr, sys_mkdirat as sys_arch32_mkdirat,
3694        sys_mknodat as sys_arch32_mknodat, sys_pidfd_getfd as sys_arch32_pidfd_getfd,
3695        sys_pidfd_open as sys_arch32_pidfd_open, sys_ppoll as sys_arch32_ppoll,
3696        sys_preadv as sys_arch32_preadv, sys_pselect6 as sys_arch32_pselect6,
3697        sys_readv as sys_arch32_readv, sys_removexattr as sys_arch32_removexattr,
3698        sys_renameat2 as sys_arch32_renameat2, sys_select as sys_arch32__newselect,
3699        sys_sendfile as sys_arch32_sendfile, sys_setxattr as sys_arch32_setxattr,
3700        sys_splice as sys_arch32_splice, sys_statfs as sys_arch32_statfs,
3701        sys_statx as sys_arch32_statx, sys_symlinkat as sys_arch32_symlinkat,
3702        sys_sync as sys_arch32_sync, sys_syncfs as sys_arch32_syncfs, sys_tee as sys_arch32_tee,
3703        sys_timerfd_create as sys_arch32_timerfd_create,
3704        sys_timerfd_gettime as sys_arch32_timerfd_gettime,
3705        sys_timerfd_settime as sys_arch32_timerfd_settime, sys_truncate as sys_arch32_truncate,
3706        sys_umask as sys_arch32_umask, sys_utimensat as sys_arch32_utimensat,
3707        sys_vmsplice as sys_arch32_vmsplice,
3708    };
3709}
3710
3711#[cfg(target_arch = "aarch64")]
3712pub use arch32::*;
3713
3714#[cfg(test)]
3715mod tests {
3716    use super::*;
3717    use crate::task::KernelFeatures;
3718    use crate::testing::*;
3719    use starnix_types::vfs::default_statfs;
3720    use starnix_uapi::{O_RDONLY, SEEK_CUR, SEEK_END, SEEK_SET};
3721    use zerocopy::IntoBytes;
3722
3723    #[::fuchsia::test]
3724    async fn test_sys_lseek() -> Result<(), Errno> {
3725        spawn_kernel_and_run_with_pkgfs(async |locked, current_task| {
3726            let fd = FdNumber::from_raw(10);
3727            let file_handle =
3728                current_task.open_file(locked, "data/testfile.txt".into(), OpenFlags::RDONLY)?;
3729            let file_size = file_handle.node().stat(locked, current_task).unwrap().st_size;
3730            current_task
3731                .running_state()
3732                .files
3733                .insert(locked, current_task, fd, file_handle)
3734                .unwrap();
3735
3736            assert_eq!(sys_lseek(locked, current_task, fd, 0, SEEK_CUR)?, 0);
3737            assert_eq!(sys_lseek(locked, current_task, fd, 1, SEEK_CUR)?, 1);
3738            assert_eq!(sys_lseek(locked, current_task, fd, 3, SEEK_SET)?, 3);
3739            assert_eq!(sys_lseek(locked, current_task, fd, -3, SEEK_CUR)?, 0);
3740            assert_eq!(sys_lseek(locked, current_task, fd, 0, SEEK_END)?, file_size);
3741            assert_eq!(sys_lseek(locked, current_task, fd, -5, SEEK_SET), error!(EINVAL));
3742
3743            // Make sure that the failed call above did not change the offset.
3744            assert_eq!(sys_lseek(locked, current_task, fd, 0, SEEK_CUR)?, file_size);
3745
3746            // Prepare for an overflow.
3747            assert_eq!(sys_lseek(locked, current_task, fd, 3, SEEK_SET)?, 3);
3748
3749            // Check for overflow.
3750            assert_eq!(sys_lseek(locked, current_task, fd, i64::MAX, SEEK_CUR), error!(EINVAL));
3751
3752            Ok(())
3753        })
3754        .await
3755    }
3756
3757    #[::fuchsia::test]
3758    async fn test_sys_dup() -> Result<(), Errno> {
3759        spawn_kernel_and_run_with_pkgfs(async |locked, current_task| {
3760            let file_handle =
3761                current_task.open_file(locked, "data/testfile.txt".into(), OpenFlags::RDONLY)?;
3762            let oldfd = current_task.add_file(locked, file_handle, FdFlags::empty())?;
3763            let newfd = sys_dup(locked, current_task, oldfd)?;
3764
3765            assert_ne!(oldfd, newfd);
3766            let files = &current_task.running_state().files;
3767            assert!(Arc::ptr_eq(&files.get(oldfd).unwrap(), &files.get(newfd).unwrap()));
3768
3769            assert_eq!(sys_dup(locked, current_task, FdNumber::from_raw(3)), error!(EBADF));
3770
3771            Ok(())
3772        })
3773        .await
3774    }
3775
3776    #[::fuchsia::test]
3777    async fn test_sys_dup3() -> Result<(), Errno> {
3778        spawn_kernel_and_run_with_pkgfs(async |locked, current_task| {
3779            let file_handle =
3780                current_task.open_file(locked, "data/testfile.txt".into(), OpenFlags::RDONLY)?;
3781            let oldfd = current_task.add_file(locked, file_handle, FdFlags::empty())?;
3782            let newfd = FdNumber::from_raw(2);
3783            sys_dup3(locked, current_task, oldfd, newfd, O_CLOEXEC)?;
3784
3785            assert_ne!(oldfd, newfd);
3786            let files = &current_task.running_state().files;
3787            assert!(Arc::ptr_eq(&files.get(oldfd).unwrap(), &files.get(newfd).unwrap()));
3788            assert_eq!(files.get_fd_flags_allowing_opath(oldfd).unwrap(), FdFlags::empty());
3789            assert_eq!(files.get_fd_flags_allowing_opath(newfd).unwrap(), FdFlags::CLOEXEC);
3790
3791            assert_eq!(sys_dup3(locked, current_task, oldfd, oldfd, O_CLOEXEC), error!(EINVAL));
3792
3793            // Pass invalid flags.
3794            let invalid_flags = 1234;
3795            assert_eq!(sys_dup3(locked, current_task, oldfd, newfd, invalid_flags), error!(EINVAL));
3796
3797            // Makes sure that dup closes the old file handle before the fd points
3798            // to the new file handle.
3799            let second_file_handle =
3800                current_task.open_file(locked, "data/testfile.txt".into(), OpenFlags::RDONLY)?;
3801            let different_file_fd =
3802                current_task.add_file(locked, second_file_handle, FdFlags::empty())?;
3803            assert!(!Arc::ptr_eq(
3804                &files.get(oldfd).unwrap(),
3805                &files.get(different_file_fd).unwrap()
3806            ));
3807            sys_dup3(locked, current_task, oldfd, different_file_fd, O_CLOEXEC)?;
3808            assert!(Arc::ptr_eq(
3809                &files.get(oldfd).unwrap(),
3810                &files.get(different_file_fd).unwrap()
3811            ));
3812
3813            Ok(())
3814        })
3815        .await
3816    }
3817
3818    #[::fuchsia::test]
3819    async fn test_sys_open_cloexec() -> Result<(), Errno> {
3820        spawn_kernel_and_run_with_pkgfs(async |locked, current_task| {
3821            let path_addr = map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
3822            let path = b"data/testfile.txt\0";
3823            current_task.write_memory(path_addr, path)?;
3824            let fd = sys_openat(
3825                locked,
3826                &current_task,
3827                FdNumber::AT_FDCWD,
3828                UserCString::new(current_task, path_addr),
3829                O_RDONLY | O_CLOEXEC,
3830                FileMode::default(),
3831            )?;
3832            assert!(
3833                current_task
3834                    .running_state()
3835                    .files
3836                    .get_fd_flags_allowing_opath(fd)?
3837                    .contains(FdFlags::CLOEXEC)
3838            );
3839            Ok(())
3840        })
3841        .await
3842    }
3843
3844    #[::fuchsia::test]
3845    async fn test_sys_epoll() -> Result<(), Errno> {
3846        spawn_kernel_and_run_with_pkgfs(async |locked, current_task| {
3847            let epoll_fd =
3848                sys_epoll_create1(locked, current_task, 0).expect("sys_epoll_create1 failed");
3849            sys_close(locked, current_task, epoll_fd).expect("sys_close failed");
3850
3851            Ok(())
3852        })
3853        .await
3854    }
3855
3856    #[::fuchsia::test]
3857    async fn test_fstat_tmp_file() {
3858        spawn_kernel_and_run(async |locked, current_task| {
3859            // Create the file that will be used to stat.
3860            let file_path = "testfile.txt";
3861            let _file_handle = current_task
3862                .open_file_at(
3863                    locked,
3864                    FdNumber::AT_FDCWD,
3865                    file_path.into(),
3866                    OpenFlags::RDWR | OpenFlags::CREAT,
3867                    FileMode::ALLOW_ALL,
3868                    ResolveFlags::empty(),
3869                    AccessCheck::default(),
3870                )
3871                .unwrap();
3872
3873            // Write the path to user memory.
3874            let path_addr = map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
3875            current_task
3876                .write_memory(path_addr, file_path.as_bytes())
3877                .expect("failed to clear struct");
3878
3879            let memory_len = (path_addr + file_path.len()).expect("OOB memory allocation!");
3880            let user_stat = UserRef::new(memory_len);
3881            current_task
3882                .write_object(user_stat, &default_statfs(0))
3883                .expect("failed to clear struct");
3884
3885            let user_path = UserCString::new(current_task, path_addr);
3886
3887            assert_eq!(sys_statfs(locked, current_task, user_path, user_stat.into()), Ok(()));
3888
3889            let returned_stat = current_task.read_object(user_stat).expect("failed to read struct");
3890            let expected_stat = starnix_uapi::statfs {
3891                f_blocks: 0x100000000,
3892                f_bavail: 0x100000000,
3893                f_bfree: 0x100000000,
3894                f_flags: starnix_uapi::MS_RELATIME as i64,
3895                ..default_statfs(starnix_uapi::TMPFS_MAGIC)
3896            };
3897            assert!(
3898                returned_stat.as_bytes() == expected_stat.as_bytes(),
3899                "Expected {:?}, got {:?}",
3900                expected_stat,
3901                returned_stat
3902            );
3903        })
3904        .await;
3905    }
3906
3907    #[::fuchsia::test]
3908    async fn test_unlinkat_dir() {
3909        spawn_kernel_and_run(async |locked, current_task| {
3910            // Create the dir that we will attempt to unlink later.
3911            let no_slash_path = b"testdir";
3912            let no_slash_path_addr =
3913                map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);
3914            current_task
3915                .write_memory(no_slash_path_addr, no_slash_path)
3916                .expect("failed to write path");
3917            let no_slash_user_path = UserCString::new(current_task, no_slash_path_addr);
3918            sys_mkdirat(
3919                locked,
3920                &current_task,
3921                FdNumber::AT_FDCWD,
3922                no_slash_user_path,
3923                FileMode::ALLOW_ALL.with_type(FileMode::IFDIR),
3924            )
3925            .unwrap();
3926
3927            let slash_path = b"testdir/";
3928            let slash_path_addr =
3929                map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
3930            current_task.write_memory(slash_path_addr, slash_path).expect("failed to write path");
3931            let slash_user_path = UserCString::new(current_task, slash_path_addr);
3932
3933            // Try to remove a directory without specifying AT_REMOVEDIR.
3934            // This should fail with EISDIR, irrespective of the terminating slash.
3935            let error = sys_unlinkat(locked, current_task, FdNumber::AT_FDCWD, slash_user_path, 0)
3936                .unwrap_err();
3937            assert_eq!(error, errno!(EISDIR));
3938            let error =
3939                sys_unlinkat(locked, current_task, FdNumber::AT_FDCWD, no_slash_user_path, 0)
3940                    .unwrap_err();
3941            assert_eq!(error, errno!(EISDIR));
3942
3943            // Success with AT_REMOVEDIR.
3944            sys_unlinkat(locked, current_task, FdNumber::AT_FDCWD, slash_user_path, AT_REMOVEDIR)
3945                .unwrap();
3946        })
3947        .await;
3948    }
3949
3950    #[::fuchsia::test]
3951    async fn test_rename_noreplace() {
3952        spawn_kernel_and_run(async |locked, current_task| {
3953            // Create the file that will be renamed.
3954            let old_user_path = "testfile.txt";
3955            let _old_file_handle = current_task
3956                .open_file_at(
3957                    locked,
3958                    FdNumber::AT_FDCWD,
3959                    old_user_path.into(),
3960                    OpenFlags::RDWR | OpenFlags::CREAT,
3961                    FileMode::ALLOW_ALL,
3962                    ResolveFlags::empty(),
3963                    AccessCheck::default(),
3964                )
3965                .unwrap();
3966
3967            // Write the path to user memory.
3968            let old_path_addr =
3969                map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
3970            current_task
3971                .write_memory(old_path_addr, old_user_path.as_bytes())
3972                .expect("failed to clear struct");
3973
3974            // Create a second file that we will attempt to rename to.
3975            let new_user_path = "testfile2.txt";
3976            let _new_file_handle = current_task
3977                .open_file_at(
3978                    locked,
3979                    FdNumber::AT_FDCWD,
3980                    new_user_path.into(),
3981                    OpenFlags::RDWR | OpenFlags::CREAT,
3982                    FileMode::ALLOW_ALL,
3983                    ResolveFlags::empty(),
3984                    AccessCheck::default(),
3985                )
3986                .unwrap();
3987
3988            // Write the path to user memory.
3989            let new_path_addr =
3990                map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
3991            current_task
3992                .write_memory(new_path_addr, new_user_path.as_bytes())
3993                .expect("failed to clear struct");
3994
3995            // Try to rename first file to second file's name with RENAME_NOREPLACE flag.
3996            // This should fail with EEXIST.
3997            let error = sys_renameat2(
3998                locked,
3999                &current_task,
4000                FdNumber::AT_FDCWD,
4001                UserCString::new(current_task, old_path_addr),
4002                FdNumber::AT_FDCWD,
4003                UserCString::new(current_task, new_path_addr),
4004                RenameFlags::NOREPLACE.bits(),
4005            )
4006            .unwrap_err();
4007            assert_eq!(error, errno!(EEXIST));
4008        })
4009        .await;
4010    }
4011
4012    #[::fuchsia::test]
4013    async fn test_sys_sync() -> Result<(), Errno> {
4014        spawn_kernel_and_run(async |locked, current_task| {
4015            sys_sync(locked, current_task)?;
4016            Ok(())
4017        })
4018        .await
4019    }
4020
4021    #[::fuchsia::test]
4022    async fn test_sys_syncfs() -> Result<(), Errno> {
4023        spawn_kernel_and_run(async |locked, current_task| {
4024            let file_handle = current_task.open_file(locked, ".".into(), OpenFlags::RDONLY)?;
4025            let fd = current_task.add_file(locked, file_handle, FdFlags::empty())?;
4026            sys_syncfs(locked, current_task, fd)?;
4027            Ok(())
4028        })
4029        .await
4030    }
4031
4032    // TODO(https://fxbug.dev/485370648) remove when unnecessary
4033    #[::fuchsia::test]
4034    async fn test_fake_ion_stat() {
4035        // Test with fake_ion disabled (default).
4036        spawn_kernel_and_run(async |locked, current_task| {
4037            let ion_path = b"/dev/ion\0";
4038            let path_addr = map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
4039            current_task.write_memory(path_addr, ion_path).expect("failed to write path");
4040            let user_path = UserCString::new(current_task, path_addr);
4041
4042            let stat_addr = map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
4043            let stat_ptr = StatPtr::new(current_task, stat_addr);
4044
4045            let error =
4046                sys_fstatat64(locked, current_task, FdNumber::AT_FDCWD, user_path, stat_ptr, 0)
4047                    .unwrap_err();
4048            assert_eq!(error, errno!(ENOENT));
4049        })
4050        .await;
4051
4052        // Test with fake_ion enabled.
4053        let mut features = KernelFeatures::default();
4054        features.fake_ion = true;
4055        spawn_kernel_with_features_and_run(
4056            async |locked, current_task| {
4057                let ion_path = b"/dev/ion\0";
4058                let path_addr =
4059                    map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
4060                current_task.write_memory(path_addr, ion_path).expect("failed to write path");
4061                let user_path = UserCString::new(current_task, path_addr);
4062
4063                let stat_addr =
4064                    map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
4065                let stat_ptr = StatPtr::new(current_task, stat_addr);
4066
4067                sys_fstatat64(locked, current_task, FdNumber::AT_FDCWD, user_path, stat_ptr, 0)
4068                    .expect("sys_fstatat64 should succeed with fake_ion");
4069
4070                let stat_result: uapi::stat =
4071                    current_task.read_object(stat_addr.into()).expect("failed to read stat");
4072                assert_eq!(stat_result.st_mode, uapi::S_IFCHR | 0o666);
4073                assert_eq!(stat_result.st_rdev, DeviceId::new(10, 59).bits());
4074
4075                // Test statx as well.
4076                let statx_addr =
4077                    map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
4078                let statx_ptr = UserRef::new(statx_addr);
4079                sys_statx(
4080                    locked,
4081                    current_task,
4082                    FdNumber::AT_FDCWD,
4083                    user_path,
4084                    0,
4085                    uapi::STATX_BASIC_STATS,
4086                    statx_ptr,
4087                )
4088                .expect("sys_statx should succeed with fake_ion");
4089
4090                let statx_result: statx =
4091                    current_task.read_object(statx_ptr).expect("failed to read statx");
4092                assert_eq!(statx_result.stx_mode, (uapi::S_IFCHR | 0o666) as u16);
4093                assert_eq!(statx_result.stx_rdev_major, 10);
4094                assert_eq!(statx_result.stx_rdev_minor, 59);
4095            },
4096            features,
4097        )
4098        .await;
4099    }
4100}