starnix_core/vfs/
syscalls.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mm::{IOVecPtr, MemoryAccessor, MemoryAccessorExt, PAGE_SIZE};
6use crate::security;
7use crate::syscalls::time::{ITimerSpecPtr, TimeSpecPtr, TimeValPtr};
8use crate::task::{
9    CurrentTask, EventHandler, ProcessEntryRef, ReadyItem, ReadyItemKey, Timeline, TimerWakeup,
10    Waiter,
11};
12use crate::vfs::aio::AioContext;
13use crate::vfs::buffers::{UserBuffersInputBuffer, UserBuffersOutputBuffer};
14use crate::vfs::eventfd::{EventFdType, new_eventfd};
15use crate::vfs::fs_args::MountParams;
16use crate::vfs::inotify::InotifyFileObject;
17use crate::vfs::io_uring::{IORING_MAX_ENTRIES, IoUringFileObject};
18use crate::vfs::pidfd::new_pidfd;
19use crate::vfs::pipe::{PipeFileObject, new_pipe};
20use crate::vfs::timer::TimerFile;
21use crate::vfs::{
22    CheckAccessReason, DirentSink64, EpollFileObject, FallocMode, FdFlags, FdNumber,
23    FileAsyncOwner, FileHandle, FileSystemOptions, FlockOperation, FsStr, FsString, LookupContext,
24    NamespaceNode, PathWithReachability, RecordLockCommand, RenameFlags, SeekTarget, StatxFlags,
25    SymlinkMode, SymlinkTarget, TargetFdNumber, TimeUpdateType, UnlinkKind, ValueOrSize, WdNumber,
26    WhatToMount, XattrOp, checked_add_offset_and_length, new_memfd, new_zombie_pidfd, splice,
27};
28use starnix_logging::{log_trace, track_stub};
29use starnix_sync::{FileOpsCore, LockEqualOrBefore, Locked, Mutex, Unlocked};
30use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult};
31use starnix_types::ownership::TempRef;
32use starnix_types::time::{
33    duration_from_poll_timeout, duration_from_timespec, time_from_timespec, timespec_from_duration,
34};
35use starnix_types::user_buffer::UserBuffer;
36use starnix_uapi::auth::{
37    CAP_BLOCK_SUSPEND, CAP_DAC_READ_SEARCH, CAP_LEASE, CAP_SYS_ADMIN, CAP_WAKE_ALARM,
38    PTRACE_MODE_ATTACH_REALCREDS,
39};
40use starnix_uapi::device_type::DeviceType;
41use starnix_uapi::errors::{
42    EFAULT, EINTR, ENAMETOOLONG, ENOTSUP, ETIMEDOUT, Errno, ErrnoResultExt,
43};
44use starnix_uapi::file_lease::FileLeaseType;
45use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
46use starnix_uapi::inotify_mask::InotifyMask;
47use starnix_uapi::mount_flags::MountFlags;
48use starnix_uapi::open_flags::OpenFlags;
49use starnix_uapi::personality::PersonalityFlags;
50use starnix_uapi::resource_limits::Resource;
51use starnix_uapi::seal_flags::SealFlags;
52use starnix_uapi::signals::SigSet;
53use starnix_uapi::unmount_flags::UnmountFlags;
54use starnix_uapi::user_address::{MultiArchUserRef, UserAddress, UserCString, UserRef};
55use starnix_uapi::user_value::UserValue;
56use starnix_uapi::vfs::{EpollEvent, FdEvents, ResolveFlags};
57use starnix_uapi::{
58    __kernel_fd_set, AT_EACCESS, AT_EMPTY_PATH, AT_NO_AUTOMOUNT, AT_REMOVEDIR, AT_SYMLINK_FOLLOW,
59    AT_SYMLINK_NOFOLLOW, CLOCK_BOOTTIME, CLOCK_BOOTTIME_ALARM, CLOCK_MONOTONIC, CLOCK_REALTIME,
60    CLOCK_REALTIME_ALARM, CLOSE_RANGE_CLOEXEC, CLOSE_RANGE_UNSHARE, EFD_CLOEXEC, EFD_NONBLOCK,
61    EFD_SEMAPHORE, EPOLL_CLOEXEC, EPOLL_CTL_ADD, EPOLL_CTL_DEL, EPOLL_CTL_MOD, F_ADD_SEALS,
62    F_DUPFD, F_DUPFD_CLOEXEC, F_GET_SEALS, F_GETFD, F_GETFL, F_GETLEASE, F_GETLK, F_GETLK64,
63    F_GETOWN, F_GETOWN_EX, F_OFD_GETLK, F_OFD_SETLK, F_OFD_SETLKW, F_OWNER_PGRP, F_OWNER_PID,
64    F_OWNER_TID, F_SETFD, F_SETFL, F_SETLEASE, F_SETLK, F_SETLK64, F_SETLKW, F_SETLKW64, F_SETOWN,
65    F_SETOWN_EX, F_SETSIG, FIOCLEX, FIONCLEX, IN_CLOEXEC, IN_NONBLOCK, MFD_ALLOW_SEALING,
66    MFD_CLOEXEC, MFD_EXEC, MFD_HUGE_MASK, MFD_HUGE_SHIFT, MFD_HUGETLB, MFD_NOEXEC_SEAL, NAME_MAX,
67    O_CLOEXEC, O_CREAT, O_NOFOLLOW, O_PATH, O_TMPFILE, PIDFD_NONBLOCK, POLLERR, POLLHUP, POLLIN,
68    POLLOUT, POLLPRI, POLLRDBAND, POLLRDNORM, POLLWRBAND, POLLWRNORM, POSIX_FADV_DONTNEED,
69    POSIX_FADV_NOREUSE, POSIX_FADV_NORMAL, POSIX_FADV_RANDOM, POSIX_FADV_SEQUENTIAL,
70    POSIX_FADV_WILLNEED, RWF_SUPPORTED, TFD_CLOEXEC, TFD_NONBLOCK, TFD_TIMER_ABSTIME,
71    TFD_TIMER_CANCEL_ON_SET, XATTR_CREATE, XATTR_NAME_MAX, XATTR_REPLACE, aio_context_t, errno,
72    error, f_owner_ex, io_event, io_uring_params,
73    io_uring_register_op_IORING_REGISTER_BUFFERS as IORING_REGISTER_BUFFERS,
74    io_uring_register_op_IORING_REGISTER_IOWQ_MAX_WORKERS as IORING_REGISTER_IOWQ_MAX_WORKERS,
75    io_uring_register_op_IORING_REGISTER_PBUF_RING as IORING_REGISTER_PBUF_RING,
76    io_uring_register_op_IORING_REGISTER_PBUF_STATUS as IORING_REGISTER_PBUF_STATUS,
77    io_uring_register_op_IORING_REGISTER_RING_FDS as IORING_REGISTER_RING_FDS,
78    io_uring_register_op_IORING_UNREGISTER_BUFFERS as IORING_UNREGISTER_BUFFERS,
79    io_uring_register_op_IORING_UNREGISTER_PBUF_RING as IORING_UNREGISTER_PBUF_RING,
80    io_uring_register_op_IORING_UNREGISTER_RING_FDS as IORING_UNREGISTER_RING_FDS, iocb, off_t,
81    pid_t, pollfd, pselect6_sigmask, sigset_t, statx, timespec, uapi, uid_t,
82};
83use std::cmp::Ordering;
84use std::collections::VecDeque;
85use std::marker::PhantomData;
86use std::sync::{Arc, atomic};
87use std::usize;
88use zerocopy::{Immutable, IntoBytes};
89
// Layout checks for the uapi structures, and the fields of each, that are listed below.
// NOTE(review): judging by the macro name, this presumably asserts that every listed field has
// the same layout on all supported architectures — confirm against the macro definition.
uapi::check_arch_independent_layout! {
    pollfd {
        fd,
        events,
        revents,
    }

    io_event {
        data,
        obj,
        res,
        res2,
    }

    iocb {
        aio_data,
        aio_key,
        aio_rw_flags,
        aio_lio_opcode,
        aio_reqprio,
        aio_fildes,
        aio_buf,
        aio_nbytes,
        aio_offset,
        aio_reserved2,
        aio_flags,
        aio_resfd,
    }

    statx_timestamp {
        tv_sec,
        tv_nsec,
    }

    statx {
        stx_mask,
        stx_blksize,
        stx_attributes,
        stx_nlink,
        stx_uid,
        stx_gid,
        stx_mode,
        stx_ino,
        stx_size,
        stx_blocks,
        stx_attributes_mask,
        stx_atime,
        stx_btime,
        stx_ctime,
        stx_mtime,
        stx_rdev_major,
        stx_rdev_minor,
        stx_dev_major,
        stx_dev_minor,
        stx_mnt_id,
        stx_dio_mem_align,
        stx_dio_offset_align,
        stx_subvol,
        stx_atomic_write_unit_min,
        stx_atomic_write_unit_max,
        stx_atomic_write_segments_max,
    }

    io_sqring_offsets {
        head,
        tail,
        ring_mask,
        ring_entries,
        flags,
        dropped,
        array,
        resv1,
        user_addr,
    }

    io_cqring_offsets {
        head,
        tail,
        ring_mask,
        ring_entries,
        overflow,
        cqes,
        flags,
        resv1,
        user_addr,
    }

    io_uring_params {
        sq_entries,
        cq_entries,
        flags,
        sq_thread_cpu,
        sq_thread_idle,
        features,
        wq_fd,
        resv,
        sq_off,
        cq_off,
    }

    io_uring_rsrc_update {
        offset,
        resv,
        data,
    }

    io_uring_buf_reg {
        ring_addr,
        ring_entries,
        bgid,
        flags,
        resv,
    }
}
204
// Constants from bionic/libc/include/sys/stat.h, which spells them as
// `((1L << 30) - 1L)` and `((1L << 30) - 2L)` respectively.
const UTIME_NOW: i64 = (1 << 30) - 1;
const UTIME_OMIT: i64 = (1 << 30) - 2;
208
/// `MultiArchUserRef` to an `off_t`, pairing `uapi::off_t` with `uapi::arch32::off_t`.
pub type OffsetPtr = MultiArchUserRef<uapi::off_t, uapi::arch32::off_t>;
/// `MultiArchUserRef` to an `iocb`; the same `iocb` type is used for both type parameters.
pub type IocbPtr = MultiArchUserRef<iocb, iocb>;
/// `MultiArchUserRef` to an `IocbPtr`, i.e. a reference to a reference to an `iocb`.
pub type IocbPtrPtr = MultiArchUserRef<IocbPtr, IocbPtr>;
212
213pub fn sys_read(
214    locked: &mut Locked<Unlocked>,
215    current_task: &CurrentTask,
216    fd: FdNumber,
217    address: UserAddress,
218    length: usize,
219) -> Result<usize, Errno> {
220    let file = current_task.files.get(fd)?;
221    file.read(
222        locked,
223        current_task,
224        &mut UserBuffersOutputBuffer::unified_new_at(current_task, address, length)?,
225    )
226    .map_eintr(|| errno!(ERESTARTSYS))
227}
228
229pub fn sys_write(
230    locked: &mut Locked<Unlocked>,
231    current_task: &CurrentTask,
232    fd: FdNumber,
233    address: UserAddress,
234    length: usize,
235) -> Result<usize, Errno> {
236    let file = current_task.files.get(fd)?;
237    file.write(
238        locked,
239        current_task,
240        &mut UserBuffersInputBuffer::unified_new_at(current_task, address, length)?,
241    )
242    .map_eintr(|| errno!(ERESTARTSYS))
243}
244
245pub fn sys_close(
246    _locked: &mut Locked<Unlocked>,
247    current_task: &CurrentTask,
248    fd: FdNumber,
249) -> Result<(), Errno> {
250    current_task.files.close(fd)?;
251    Ok(())
252}
253
254pub fn sys_close_range(
255    _locked: &mut Locked<Unlocked>,
256    current_task: &CurrentTask,
257    first: u32,
258    last: u32,
259    flags: u32,
260) -> Result<(), Errno> {
261    if first > last || flags & !(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC) != 0 {
262        return error!(EINVAL);
263    }
264    if flags & CLOSE_RANGE_UNSHARE != 0 {
265        current_task.files.unshare();
266    }
267    let in_range = |fd: FdNumber| fd.raw() as u32 >= first && fd.raw() as u32 <= last;
268    if flags & CLOSE_RANGE_CLOEXEC != 0 {
269        current_task.files.retain(|fd, flags| {
270            if in_range(fd) {
271                *flags |= FdFlags::CLOEXEC;
272            }
273            true
274        });
275    } else {
276        current_task.files.retain(|fd, _| !in_range(fd));
277    }
278    Ok(())
279}
280
281pub fn sys_lseek(
282    locked: &mut Locked<Unlocked>,
283    current_task: &CurrentTask,
284    fd: FdNumber,
285    offset: off_t,
286    whence: u32,
287) -> Result<off_t, Errno> {
288    let file = current_task.files.get(fd)?;
289    file.seek(locked, current_task, SeekTarget::from_raw(whence, offset)?)
290}
291
292pub fn sys_fcntl(
293    locked: &mut Locked<Unlocked>,
294    current_task: &CurrentTask,
295    fd: FdNumber,
296    cmd: u32,
297    arg: u64,
298) -> Result<SyscallResult, Errno> {
299    let file = match cmd {
300        F_DUPFD | F_DUPFD_CLOEXEC | F_GETFD | F_SETFD | F_GETFL => {
301            current_task.files.get_allowing_opath(fd)?
302        }
303        _ => current_task.files.get(fd)?,
304    };
305
306    match cmd {
307        // For the following values of cmd we need to perform more checks before running the
308        // `check_file_fcntl_access` LSM hook.
309        F_SETOWN | F_SETOWN_EX | F_ADD_SEALS | F_SETLEASE => {}
310        _ => {
311            security::check_file_fcntl_access(current_task, &file, cmd, arg)?;
312        }
313    };
314
315    match cmd {
316        F_DUPFD | F_DUPFD_CLOEXEC => {
317            let fd_number = arg as i32;
318            let flags = if cmd == F_DUPFD_CLOEXEC { FdFlags::CLOEXEC } else { FdFlags::empty() };
319            let newfd = current_task.files.duplicate(
320                locked,
321                current_task,
322                fd,
323                TargetFdNumber::Minimum(FdNumber::from_raw(fd_number)),
324                flags,
325            )?;
326            Ok(newfd.into())
327        }
328        F_GETOWN => match file.get_async_owner() {
329            FileAsyncOwner::Unowned => Ok(0.into()),
330            FileAsyncOwner::Thread(tid) => Ok(tid.into()),
331            FileAsyncOwner::Process(pid) => Ok(pid.into()),
332            FileAsyncOwner::ProcessGroup(pgid) => Ok((-pgid).into()),
333        },
334        F_GETOWN_EX => {
335            let maybe_owner = match file.get_async_owner() {
336                FileAsyncOwner::Unowned => None,
337                FileAsyncOwner::Thread(tid) => {
338                    Some(uapi::f_owner_ex { type_: F_OWNER_TID as i32, pid: tid })
339                }
340                FileAsyncOwner::Process(pid) => {
341                    Some(uapi::f_owner_ex { type_: F_OWNER_PID as i32, pid })
342                }
343                FileAsyncOwner::ProcessGroup(pgid) => {
344                    Some(uapi::f_owner_ex { type_: F_OWNER_PGRP as i32, pid: pgid })
345                }
346            };
347            if let Some(owner) = maybe_owner {
348                let user_owner: UserRef<f_owner_ex> =
349                    UserRef::<uapi::f_owner_ex>::new(UserAddress::from(arg));
350                current_task.write_object(user_owner, &owner)?;
351            }
352            Ok(SUCCESS)
353        }
354        F_SETOWN => {
355            let pid = (arg as u32) as i32;
356            let owner = match pid.cmp(&0) {
357                Ordering::Equal => FileAsyncOwner::Unowned,
358                Ordering::Greater => FileAsyncOwner::Process(pid),
359                Ordering::Less => {
360                    FileAsyncOwner::ProcessGroup(pid.checked_neg().ok_or_else(|| errno!(EINVAL))?)
361                }
362            };
363            owner.validate(current_task)?;
364            security::check_file_fcntl_access(current_task, &file, cmd, arg)?;
365            file.set_async_owner(owner);
366            Ok(SUCCESS)
367        }
368        F_SETOWN_EX => {
369            let user_owner = UserRef::<uapi::f_owner_ex>::new(UserAddress::from(arg));
370            let requested_owner = current_task.read_object(user_owner)?;
371            let mut owner = match requested_owner.type_ as u32 {
372                F_OWNER_TID => FileAsyncOwner::Thread(requested_owner.pid),
373                F_OWNER_PID => FileAsyncOwner::Process(requested_owner.pid),
374                F_OWNER_PGRP => FileAsyncOwner::ProcessGroup(requested_owner.pid),
375                _ => return error!(EINVAL),
376            };
377            if requested_owner.pid == 0 {
378                owner = FileAsyncOwner::Unowned;
379            }
380            owner.validate(current_task)?;
381            security::check_file_fcntl_access(current_task, &file, cmd, arg)?;
382            file.set_async_owner(owner);
383            Ok(SUCCESS)
384        }
385        F_GETFD => Ok(current_task.files.get_fd_flags_allowing_opath(fd)?.into()),
386        F_SETFD => {
387            current_task
388                .files
389                .set_fd_flags_allowing_opath(fd, FdFlags::from_bits_truncate(arg as u32))?;
390            Ok(SUCCESS)
391        }
392        F_GETFL => {
393            // O_PATH allowed for:
394            //
395            //   Retrieving open file status flags using the fcntl(2)
396            //   F_GETFL operation: the returned flags will include the
397            //   bit O_PATH.
398            //
399            // See https://man7.org/linux/man-pages/man2/open.2.html
400            Ok(file.flags().into())
401        }
402        F_SETFL => {
403            let settable_flags = OpenFlags::APPEND
404                | OpenFlags::DIRECT
405                | OpenFlags::NOATIME
406                | OpenFlags::NONBLOCK
407                | OpenFlags::ASYNC;
408            let requested_flags =
409                OpenFlags::from_bits_truncate((arg as u32) & settable_flags.bits());
410
411            // If `NOATIME` flag is being set then check that it's allowed.
412            if requested_flags.contains(OpenFlags::NOATIME)
413                && !file.flags().contains(OpenFlags::NOATIME)
414            {
415                file.name.check_o_noatime_allowed(current_task)?;
416            }
417
418            file.update_file_flags(requested_flags, settable_flags);
419            Ok(SUCCESS)
420        }
421        F_SETLK | F_SETLKW | F_GETLK => {
422            let flock_ref =
423                MultiArchUserRef::<uapi::flock, uapi::arch32::flock>::new(current_task, arg);
424            let flock = current_task.read_multi_arch_object(flock_ref)?;
425            let cmd = RecordLockCommand::from_raw(cmd).ok_or_else(|| errno!(EINVAL))?;
426            if let Some(flock) = file.record_lock(locked, current_task, cmd, flock)? {
427                current_task.write_multi_arch_object(flock_ref, flock)?;
428            }
429            Ok(SUCCESS)
430        }
431        F_SETLK64 | F_SETLKW64 | F_GETLK64 | F_OFD_GETLK | F_OFD_SETLK | F_OFD_SETLKW => {
432            let flock_ref =
433                MultiArchUserRef::<uapi::flock, uapi::arch32::flock64>::new(current_task, arg);
434            let flock = current_task.read_multi_arch_object(flock_ref)?;
435            let cmd = RecordLockCommand::from_raw(cmd).ok_or_else(|| errno!(EINVAL))?;
436            if let Some(flock) = file.record_lock(locked, current_task, cmd, flock)? {
437                current_task.write_multi_arch_object(flock_ref, flock)?;
438            }
439            Ok(SUCCESS)
440        }
441        F_ADD_SEALS => {
442            if !file.can_write() {
443                // Cannot add seals if the file is not writable
444                return error!(EPERM);
445            }
446            security::check_file_fcntl_access(current_task, &file, cmd, arg)?;
447            let mut state = file.name.entry.node.write_guard_state.lock();
448            let flags = SealFlags::from_bits_truncate(arg as u32);
449            state.try_add_seal(flags)?;
450            Ok(SUCCESS)
451        }
452        F_GET_SEALS => {
453            let state = file.name.entry.node.write_guard_state.lock();
454            Ok(state.get_seals()?.into())
455        }
456        F_SETLEASE => {
457            let fsuid = current_task.with_current_creds(|creds| creds.fsuid);
458            if fsuid != file.node().info().uid {
459                security::check_task_capable(current_task, CAP_LEASE)?;
460            }
461            let lease = FileLeaseType::from_bits(arg as u32)?;
462            security::check_file_fcntl_access(current_task, &file, cmd, arg)?;
463            file.set_lease(current_task, lease)?;
464            Ok(SUCCESS)
465        }
466        F_GETLEASE => Ok(file.get_lease(current_task).into()),
467        F_SETSIG => {
468            track_stub!(TODO("https://fxbug.dev/437972675"), "F_SETSIG");
469            return error!(EOPNOTSUPP);
470        }
471        _ => file.fcntl(current_task, cmd, arg),
472    }
473}
474
475pub fn sys_pread64(
476    locked: &mut Locked<Unlocked>,
477    current_task: &CurrentTask,
478    fd: FdNumber,
479    address: UserAddress,
480    length: usize,
481    offset: off_t,
482) -> Result<usize, Errno> {
483    let file = current_task.files.get(fd)?;
484    let offset = offset.try_into().map_err(|_| errno!(EINVAL))?;
485    file.read_at(
486        locked,
487        current_task,
488        offset,
489        &mut UserBuffersOutputBuffer::unified_new_at(current_task, address, length)?,
490    )
491}
492
493pub fn sys_pwrite64(
494    locked: &mut Locked<Unlocked>,
495    current_task: &CurrentTask,
496    fd: FdNumber,
497    address: UserAddress,
498    length: usize,
499    offset: off_t,
500) -> Result<usize, Errno> {
501    let file = current_task.files.get(fd)?;
502    let offset = offset.try_into().map_err(|_| errno!(EINVAL))?;
503    file.write_at(
504        locked,
505        current_task,
506        offset,
507        &mut UserBuffersInputBuffer::unified_new_at(current_task, address, length)?,
508    )
509}
510
511fn do_readv(
512    locked: &mut Locked<Unlocked>,
513    current_task: &CurrentTask,
514    fd: FdNumber,
515    iovec_addr: IOVecPtr,
516    iovec_count: UserValue<i32>,
517    offset: Option<off_t>,
518    flags: u32,
519) -> Result<usize, Errno> {
520    if flags & !RWF_SUPPORTED != 0 {
521        return error!(EOPNOTSUPP);
522    }
523    if flags != 0 {
524        track_stub!(TODO("https://fxbug.dev/322875072"), "preadv2 flags", flags);
525    }
526    let file = current_task.files.get(fd)?;
527    let iovec = current_task.read_iovec(iovec_addr, iovec_count)?;
528    let mut data = UserBuffersOutputBuffer::unified_new(current_task, iovec)?;
529    if let Some(offset) = offset {
530        file.read_at(
531            locked,
532            current_task,
533            offset.try_into().map_err(|_| errno!(EINVAL))?,
534            &mut data,
535        )
536    } else {
537        file.read(locked, current_task, &mut data)
538    }
539}
540
541pub fn sys_readv(
542    locked: &mut Locked<Unlocked>,
543    current_task: &CurrentTask,
544    fd: FdNumber,
545    iovec_addr: IOVecPtr,
546    iovec_count: UserValue<i32>,
547) -> Result<usize, Errno> {
548    do_readv(locked, current_task, fd, iovec_addr, iovec_count, None, 0)
549}
550
551pub fn sys_preadv(
552    locked: &mut Locked<Unlocked>,
553    current_task: &CurrentTask,
554    fd: FdNumber,
555    iovec_addr: IOVecPtr,
556    iovec_count: UserValue<i32>,
557    offset: off_t,
558) -> Result<usize, Errno> {
559    do_readv(locked, current_task, fd, iovec_addr, iovec_count, Some(offset), 0)
560}
561
562pub fn sys_preadv2(
563    locked: &mut Locked<Unlocked>,
564    current_task: &CurrentTask,
565    fd: FdNumber,
566    iovec_addr: IOVecPtr,
567    iovec_count: UserValue<i32>,
568    offset: off_t,
569    _unused: SyscallArg, // On 32-bit systems, holds the upper 32 bits of offset.
570    flags: u32,
571) -> Result<usize, Errno> {
572    let offset = if offset == -1 { None } else { Some(offset) };
573    do_readv(locked, current_task, fd, iovec_addr, iovec_count, offset, flags)
574}
575
576fn do_writev(
577    locked: &mut Locked<Unlocked>,
578    current_task: &CurrentTask,
579    fd: FdNumber,
580    iovec_addr: IOVecPtr,
581    iovec_count: UserValue<i32>,
582    offset: Option<off_t>,
583    flags: u32,
584) -> Result<usize, Errno> {
585    if flags & !RWF_SUPPORTED != 0 {
586        return error!(EOPNOTSUPP);
587    }
588    if flags != 0 {
589        track_stub!(TODO("https://fxbug.dev/322874523"), "pwritev2 flags", flags);
590    }
591
592    let file = current_task.files.get(fd)?;
593    let iovec = current_task.read_iovec(iovec_addr, iovec_count)?;
594    let mut data = UserBuffersInputBuffer::unified_new(current_task, iovec)?;
595    let res = if let Some(offset) = offset {
596        file.write_at(
597            locked,
598            current_task,
599            offset.try_into().map_err(|_| errno!(EINVAL))?,
600            &mut data,
601        )
602    } else {
603        file.write(locked, current_task, &mut data)
604    };
605
606    match &res {
607        Err(e) if e.code == EFAULT => {
608            track_stub!(TODO("https://fxbug.dev/297370529"), "allow partial writes")
609        }
610        _ => (),
611    }
612
613    res
614}
615
616pub fn sys_writev(
617    locked: &mut Locked<Unlocked>,
618    current_task: &CurrentTask,
619    fd: FdNumber,
620    iovec_addr: IOVecPtr,
621    iovec_count: UserValue<i32>,
622) -> Result<usize, Errno> {
623    do_writev(locked, current_task, fd, iovec_addr, iovec_count, None, 0)
624}
625
626pub fn sys_pwritev(
627    locked: &mut Locked<Unlocked>,
628    current_task: &CurrentTask,
629    fd: FdNumber,
630    iovec_addr: IOVecPtr,
631    iovec_count: UserValue<i32>,
632    offset: off_t,
633) -> Result<usize, Errno> {
634    do_writev(locked, current_task, fd, iovec_addr, iovec_count, Some(offset), 0)
635}
636
637pub fn sys_pwritev2(
638    locked: &mut Locked<Unlocked>,
639    current_task: &CurrentTask,
640    fd: FdNumber,
641    iovec_addr: IOVecPtr,
642    iovec_count: UserValue<i32>,
643    offset: off_t,
644    _unused: SyscallArg, // On 32-bit systems, holds the upper 32 bits of offset.
645    flags: u32,
646) -> Result<usize, Errno> {
647    let offset = if offset == -1 { None } else { Some(offset) };
648    do_writev(locked, current_task, fd, iovec_addr, iovec_count, offset, flags)
649}
650
/// `MultiArchUserRef` to a `statfs` structure, pairing `uapi::statfs` with
/// `uapi::arch32::statfs`.
type StatFsPtr = MultiArchUserRef<uapi::statfs, uapi::arch32::statfs>;
652
653pub fn fstatfs<T32: IntoBytes + Immutable + TryFrom<uapi::statfs>>(
654    locked: &mut Locked<Unlocked>,
655    current_task: &CurrentTask,
656    fd: FdNumber,
657    user_buf: MultiArchUserRef<uapi::statfs, T32>,
658) -> Result<(), Errno> {
659    // O_PATH allowed for:
660    //
661    //   fstatfs(2) (since Linux 3.12).
662    //
663    // See https://man7.org/linux/man-pages/man2/open.2.html
664    let file = current_task.files.get_allowing_opath(fd)?;
665    let mut stat = file.fs.statfs(locked, current_task)?;
666    stat.f_flags |= file.name.mount.flags().bits() as i64;
667    current_task.write_multi_arch_object(user_buf, stat)?;
668    Ok(())
669}
670
671pub fn sys_fstatfs(
672    locked: &mut Locked<Unlocked>,
673    current_task: &CurrentTask,
674    fd: FdNumber,
675    user_buf: StatFsPtr,
676) -> Result<(), Errno> {
677    fstatfs(locked, current_task, fd, user_buf)
678}
679
680fn statfs<T32: IntoBytes + Immutable + TryFrom<uapi::statfs>>(
681    locked: &mut Locked<Unlocked>,
682    current_task: &CurrentTask,
683    user_path: UserCString,
684    user_buf: MultiArchUserRef<uapi::statfs, T32>,
685) -> Result<(), Errno> {
686    let name =
687        lookup_at(locked, current_task, FdNumber::AT_FDCWD, user_path, LookupFlags::default())?;
688    let fs = name.entry.node.fs();
689    let mut stat = fs.statfs(locked, current_task)?;
690    stat.f_flags |= name.mount.flags().bits() as i64;
691    current_task.write_multi_arch_object(user_buf, stat)?;
692    Ok(())
693}
694
695pub fn sys_statfs(
696    locked: &mut Locked<Unlocked>,
697    current_task: &CurrentTask,
698    user_path: UserCString,
699    user_buf: StatFsPtr,
700) -> Result<(), Errno> {
701    statfs(locked, current_task, user_path, user_buf)
702}
703
704pub fn sys_sendfile(
705    locked: &mut Locked<Unlocked>,
706    current_task: &CurrentTask,
707    out_fd: FdNumber,
708    in_fd: FdNumber,
709    user_offset: OffsetPtr,
710    count: i32,
711) -> Result<usize, Errno> {
712    splice::sendfile(locked, current_task, out_fd, in_fd, user_offset, count)
713}
714
715/// A convenient wrapper for Task::open_file_at.
716///
717/// Reads user_path from user memory and then calls through to Task::open_file_at.
718fn open_file_at(
719    locked: &mut Locked<Unlocked>,
720    current_task: &CurrentTask,
721    dir_fd: FdNumber,
722    user_path: UserCString,
723    flags: u32,
724    mode: FileMode,
725    resolve_flags: ResolveFlags,
726) -> Result<FileHandle, Errno> {
727    let path = current_task.read_path(user_path)?;
728    log_trace!(dir_fd:%, path:%; "open_file_at");
729    current_task.open_file_at(
730        locked,
731        dir_fd,
732        path.as_ref(),
733        OpenFlags::from_bits_truncate(flags),
734        mode,
735        resolve_flags,
736        AccessCheck::default(),
737    )
738}
739
740fn lookup_parent_at<T, F>(
741    locked: &mut Locked<Unlocked>,
742    current_task: &CurrentTask,
743    dir_fd: FdNumber,
744    user_path: UserCString,
745    callback: F,
746) -> Result<T, Errno>
747where
748    F: Fn(&mut Locked<Unlocked>, LookupContext, NamespaceNode, &FsStr) -> Result<T, Errno>,
749{
750    let path = current_task.read_path(user_path)?;
751    log_trace!(dir_fd:%, path:%; "lookup_parent_at");
752    if path.is_empty() {
753        return error!(ENOENT);
754    }
755    let mut context = LookupContext::default();
756    let (parent, basename) =
757        current_task.lookup_parent_at(locked, &mut context, dir_fd, path.as_ref())?;
758    callback(locked, context, parent, basename)
759}
760
/// Options for lookup_at.
///
/// The derived `Default` leaves `allow_empty_path` and `automount` as `false` and uses the
/// default `SymlinkMode`.
#[derive(Debug, Default, Copy, Clone)]
pub struct LookupFlags {
    /// Whether AT_EMPTY_PATH was supplied.
    ///
    /// When set, `lookup_at` resolves an empty path to the node behind `dir_fd` instead of
    /// failing with `ENOENT`.
    allow_empty_path: bool,

    /// Used to implement AT_SYMLINK_NOFOLLOW.
    ///
    /// `lookup_at` applies this mode to the final path component, unless the parent lookup
    /// reports that the result must be a directory.
    symlink_mode: SymlinkMode,

    /// Automount directories on the path.
    // TODO(https://fxbug.dev/297370602): Support the `AT_NO_AUTOMOUNT` flag.
    #[allow(dead_code)]
    automount: bool,
}
775
776impl LookupFlags {
777    fn no_follow() -> Self {
778        Self { symlink_mode: SymlinkMode::NoFollow, ..Default::default() }
779    }
780
781    fn from_bits(flags: u32, allowed_flags: u32) -> Result<Self, Errno> {
782        if flags & !allowed_flags != 0 {
783            return error!(EINVAL);
784        }
785        let follow_symlinks = if allowed_flags & AT_SYMLINK_FOLLOW != 0 {
786            flags & AT_SYMLINK_FOLLOW != 0
787        } else {
788            flags & AT_SYMLINK_NOFOLLOW == 0
789        };
790        let automount =
791            if allowed_flags & AT_NO_AUTOMOUNT != 0 { flags & AT_NO_AUTOMOUNT == 0 } else { false };
792        if automount {
793            track_stub!(TODO("https://fxbug.dev/297370602"), "LookupFlags::automount");
794        }
795        Ok(LookupFlags {
796            allow_empty_path: (flags & AT_EMPTY_PATH != 0)
797                || (flags & O_PATH != 0 && flags & O_NOFOLLOW != 0),
798            symlink_mode: if follow_symlinks { SymlinkMode::Follow } else { SymlinkMode::NoFollow },
799            automount,
800        })
801    }
802}
803
804impl From<StatxFlags> for LookupFlags {
805    fn from(flags: StatxFlags) -> Self {
806        let lookup_flags = StatxFlags::AT_SYMLINK_NOFOLLOW
807            | StatxFlags::AT_EMPTY_PATH
808            | StatxFlags::AT_NO_AUTOMOUNT;
809        Self::from_bits((flags & lookup_flags).bits(), lookup_flags.bits()).unwrap()
810    }
811}
812
813pub fn lookup_at<L>(
814    locked: &mut Locked<L>,
815    current_task: &CurrentTask,
816    dir_fd: FdNumber,
817    user_path: UserCString,
818    options: LookupFlags,
819) -> Result<NamespaceNode, Errno>
820where
821    L: LockEqualOrBefore<FileOpsCore>,
822{
823    let path = current_task.read_path(user_path)?;
824    log_trace!(dir_fd:%, path:%; "lookup_at");
825    if path.is_empty() {
826        if options.allow_empty_path {
827            let (node, _) = current_task.resolve_dir_fd(
828                locked,
829                dir_fd,
830                path.as_ref(),
831                ResolveFlags::empty(),
832            )?;
833            return Ok(node);
834        }
835        return error!(ENOENT);
836    }
837
838    let mut parent_context = LookupContext::default();
839    let (parent, basename) =
840        current_task.lookup_parent_at(locked, &mut parent_context, dir_fd, path.as_ref())?;
841
842    let mut child_context = if parent_context.must_be_directory {
843        // The child must resolve to a directory. This is because a trailing slash
844        // was found in the path. If the child is a symlink, we should follow it.
845        // See https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap03.html#tag_21_03_00_75
846        parent_context.with(SymlinkMode::Follow)
847    } else {
848        parent_context.with(options.symlink_mode)
849    };
850
851    parent.lookup_child(locked, current_task, &mut child_context, basename)
852}
853
854fn do_openat(
855    locked: &mut Locked<Unlocked>,
856    current_task: &CurrentTask,
857    dir_fd: FdNumber,
858    user_path: UserCString,
859    flags: u32,
860    mode: FileMode,
861    resolve_flags: ResolveFlags,
862) -> Result<FdNumber, Errno> {
863    let file = open_file_at(locked, current_task, dir_fd, user_path, flags, mode, resolve_flags)?;
864    let fd_flags = get_fd_flags(flags);
865    current_task.add_file(locked, file, fd_flags)
866}
867
868pub fn sys_openat(
869    locked: &mut Locked<Unlocked>,
870    current_task: &CurrentTask,
871    dir_fd: FdNumber,
872    user_path: UserCString,
873    flags: u32,
874    mode: FileMode,
875) -> Result<FdNumber, Errno> {
876    do_openat(locked, current_task, dir_fd, user_path, flags, mode, ResolveFlags::empty())
877}
878
879pub fn sys_openat2(
880    locked: &mut Locked<Unlocked>,
881    current_task: &CurrentTask,
882    dir_fd: FdNumber,
883    user_path: UserCString,
884    how_ref: UserRef<uapi::open_how>,
885    size: usize,
886) -> Result<FdNumber, Errno> {
887    const EXPECTED_SIZE: usize = std::mem::size_of::<uapi::open_how>();
888    if size < EXPECTED_SIZE {
889        return error!(EINVAL);
890    }
891
892    let how = current_task.read_object(how_ref)?;
893
894    // If the `size` is greater than expected, then we need to check that any extra bytes after
895    // `open_how` are set to 0. This is needed to properly handle the case when `open_how` is
896    // extended with new fields in the future. There is no upper limit on the buffer size, so we
897    // limit size of each read to one page.
898    let mut pos = EXPECTED_SIZE;
899    while pos < size {
900        let length = std::cmp::min(size - pos, *PAGE_SIZE as usize);
901        let extra_bytes =
902            current_task.read_buffer(&UserBuffer { address: (how_ref.addr() + pos)?, length })?;
903        for b in extra_bytes {
904            if b != 0 {
905                return error!(E2BIG);
906            }
907        }
908        pos += length;
909    }
910
911    let flags: u32 = how.flags.try_into().map_err(|_| errno!(EINVAL))?;
912
913    // `mode` can be specified only with `O_CREAT` or `O_TMPFILE`.
914    let allowed_mode_flags = if (flags & (O_CREAT | O_TMPFILE)) > 0 { 0o7777 } else { 0 };
915    if (how.mode & !allowed_mode_flags) != 0 {
916        return error!(EINVAL);
917    }
918
919    let mode = FileMode::from_bits(how.mode.try_into().map_err(|_| errno!(EINVAL))?);
920    let resolve_flags =
921        ResolveFlags::from_bits(how.resolve.try_into().map_err(|_| errno!(EINVAL))?)
922            .ok_or_else(|| errno!(EINVAL))?;
923
924    if resolve_flags.contains(ResolveFlags::CACHED) {
925        track_stub!(TODO("https://fxbug.dev/326474574"), "openat2: RESOLVE_CACHED");
926        return error!(EAGAIN);
927    }
928
929    do_openat(locked, current_task, dir_fd, user_path, flags, mode, resolve_flags)
930}
931
932pub fn sys_faccessat(
933    locked: &mut Locked<Unlocked>,
934    current_task: &CurrentTask,
935    dir_fd: FdNumber,
936    user_path: UserCString,
937    mode: u32,
938) -> Result<(), Errno> {
939    sys_faccessat2(locked, current_task, dir_fd, user_path, mode, 0)
940}
941
942pub fn sys_faccessat2(
943    locked: &mut Locked<Unlocked>,
944    current_task: &CurrentTask,
945    dir_fd: FdNumber,
946    user_path: UserCString,
947    mode: u32,
948    flags: u32,
949) -> Result<(), Errno> {
950    current_task.override_creds(
951        |creds| {
952            // Unless `AT_ACCESS` is set, perform lookup & access-checking using real UID & GID.
953            if flags & AT_EACCESS == 0 {
954                creds.creds.fsuid = creds.creds.uid;
955                creds.creds.fsgid = creds.creds.gid;
956            }
957        },
958        || {
959            let mode = Access::try_from(mode)?;
960            let lookup_flags = LookupFlags::from_bits(flags, AT_SYMLINK_NOFOLLOW | AT_EACCESS)?;
961            let name = lookup_at(locked, current_task, dir_fd, user_path, lookup_flags)?;
962            name.check_access(locked, current_task, mode, CheckAccessReason::Access)
963        },
964    )
965}
966
967pub fn sys_getdents64(
968    locked: &mut Locked<Unlocked>,
969    current_task: &CurrentTask,
970    fd: FdNumber,
971    user_buffer: UserAddress,
972    user_capacity: usize,
973) -> Result<usize, Errno> {
974    let file = current_task.files.get(fd)?;
975    let mut offset = file.offset.lock();
976    let mut sink = DirentSink64::new(current_task, &mut offset, user_buffer, user_capacity);
977    let result = file.readdir(locked, current_task, &mut sink);
978    sink.map_result_with_actual(result)
979}
980
981pub fn sys_chroot(
982    locked: &mut Locked<Unlocked>,
983    current_task: &CurrentTask,
984    user_path: UserCString,
985) -> Result<(), Errno> {
986    let name =
987        lookup_at(locked, current_task, FdNumber::AT_FDCWD, user_path, LookupFlags::default())?;
988    if !name.entry.node.is_dir() {
989        return error!(ENOTDIR);
990    }
991
992    current_task.fs().chroot(locked, current_task, name)?;
993    Ok(())
994}
995
996pub fn sys_chdir(
997    locked: &mut Locked<Unlocked>,
998    current_task: &CurrentTask,
999    user_path: UserCString,
1000) -> Result<(), Errno> {
1001    let name =
1002        lookup_at(locked, current_task, FdNumber::AT_FDCWD, user_path, LookupFlags::default())?;
1003    if !name.entry.node.is_dir() {
1004        return error!(ENOTDIR);
1005    }
1006    current_task.fs().chdir(locked, current_task, name)
1007}
1008
1009pub fn sys_fchdir(
1010    locked: &mut Locked<Unlocked>,
1011    current_task: &CurrentTask,
1012    fd: FdNumber,
1013) -> Result<(), Errno> {
1014    // O_PATH allowed for:
1015    //
1016    //   fchdir(2), if the file descriptor refers to a directory
1017    //   (since Linux 3.5).
1018    //
1019    // See https://man7.org/linux/man-pages/man2/open.2.html
1020    let file = current_task.files.get_allowing_opath(fd)?;
1021    if !file.name.entry.node.is_dir() {
1022        return error!(ENOTDIR);
1023    }
1024    current_task.fs().chdir(locked, current_task, file.name.to_passive())
1025}
1026
1027pub fn sys_fstat(
1028    locked: &mut Locked<Unlocked>,
1029    current_task: &CurrentTask,
1030    fd: FdNumber,
1031    buffer: UserRef<uapi::stat>,
1032) -> Result<(), Errno> {
1033    // O_PATH allowed for:
1034    //
1035    //   fstat(2) (since Linux 3.6).
1036    //
1037    // See https://man7.org/linux/man-pages/man2/open.2.html
1038    let file = current_task.files.get_allowing_opath(fd)?;
1039    let result = file.node().stat(locked, current_task)?;
1040    current_task.write_object(buffer, &result)?;
1041    Ok(())
1042}
1043
1044type StatPtr = MultiArchUserRef<uapi::stat, uapi::arch32::stat64>;
1045
1046pub fn sys_fstatat64(
1047    locked: &mut Locked<Unlocked>,
1048    current_task: &CurrentTask,
1049    dir_fd: FdNumber,
1050    user_path: UserCString,
1051    buffer: StatPtr,
1052    flags: u32,
1053) -> Result<(), Errno> {
1054    let flags =
1055        LookupFlags::from_bits(flags, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)?;
1056    let name = lookup_at(locked, current_task, dir_fd, user_path, flags)?;
1057    let result = name.entry.node.stat(locked, current_task)?;
1058    current_task.write_multi_arch_object(buffer, result)?;
1059    Ok(())
1060}
1061
1062pub use sys_fstatat64 as sys_newfstatat;
1063
1064pub fn sys_statx(
1065    locked: &mut Locked<Unlocked>,
1066    current_task: &CurrentTask,
1067    dir_fd: FdNumber,
1068    user_path: UserCString,
1069    flags: u32,
1070    mask: u32,
1071    statxbuf: UserRef<statx>,
1072) -> Result<(), Errno> {
1073    let flags = StatxFlags::from_bits(flags).ok_or_else(|| errno!(EINVAL))?;
1074    if flags & (StatxFlags::AT_STATX_FORCE_SYNC | StatxFlags::AT_STATX_DONT_SYNC)
1075        == (StatxFlags::AT_STATX_FORCE_SYNC | StatxFlags::AT_STATX_DONT_SYNC)
1076    {
1077        return error!(EINVAL);
1078    }
1079
1080    let name = lookup_at(locked, current_task, dir_fd, user_path, LookupFlags::from(flags))?;
1081    let result = name.entry.node.statx(locked, current_task, flags, mask)?;
1082    current_task.write_object(statxbuf, &result)?;
1083    Ok(())
1084}
1085
1086pub fn sys_readlinkat(
1087    locked: &mut Locked<Unlocked>,
1088    current_task: &CurrentTask,
1089    dir_fd: FdNumber,
1090    user_path: UserCString,
1091    buffer: UserAddress,
1092    buffer_size: usize,
1093) -> Result<usize, Errno> {
1094    let path = current_task.read_path(user_path)?;
1095    let lookup_flags = if path.is_empty() {
1096        if dir_fd == FdNumber::AT_FDCWD {
1097            return error!(ENOENT);
1098        }
1099        LookupFlags {
1100            allow_empty_path: true,
1101            symlink_mode: SymlinkMode::NoFollow,
1102            ..Default::default()
1103        }
1104    } else {
1105        LookupFlags::no_follow()
1106    };
1107    let name = lookup_at(locked, current_task, dir_fd, user_path, lookup_flags)?;
1108
1109    let target = match name.readlink(locked, current_task)? {
1110        SymlinkTarget::Path(path) => path,
1111        SymlinkTarget::Node(node) => node.path(current_task),
1112    };
1113
1114    if buffer_size == 0 {
1115        return error!(EINVAL);
1116    }
1117    // Cap the returned length at buffer_size.
1118    let length = std::cmp::min(buffer_size, target.len());
1119    current_task.write_memory(buffer, &target[..length])?;
1120    Ok(length)
1121}
1122
1123pub fn sys_truncate(
1124    locked: &mut Locked<Unlocked>,
1125    current_task: &CurrentTask,
1126    user_path: UserCString,
1127    length: off_t,
1128) -> Result<(), Errno> {
1129    let length = length.try_into().map_err(|_| errno!(EINVAL))?;
1130    let name =
1131        lookup_at(locked, current_task, FdNumber::AT_FDCWD, user_path, LookupFlags::default())?;
1132    name.truncate(locked, current_task, length)?;
1133    Ok(())
1134}
1135
1136pub fn sys_ftruncate(
1137    locked: &mut Locked<Unlocked>,
1138    current_task: &CurrentTask,
1139    fd: FdNumber,
1140    length: off_t,
1141) -> Result<(), Errno> {
1142    let length = length.try_into().map_err(|_| errno!(EINVAL))?;
1143    let file = current_task.files.get(fd)?;
1144    file.ftruncate(locked, current_task, length)?;
1145    Ok(())
1146}
1147
1148pub fn sys_mkdirat(
1149    locked: &mut Locked<Unlocked>,
1150    current_task: &CurrentTask,
1151    dir_fd: FdNumber,
1152    user_path: UserCString,
1153    mode: FileMode,
1154) -> Result<(), Errno> {
1155    let path = current_task.read_path(user_path)?;
1156
1157    if path.is_empty() {
1158        return error!(ENOENT);
1159    }
1160    let (parent, basename) = current_task.lookup_parent_at(
1161        locked,
1162        &mut LookupContext::default(),
1163        dir_fd,
1164        path.as_ref(),
1165    )?;
1166    parent.create_node(
1167        locked,
1168        current_task,
1169        basename,
1170        mode.with_type(FileMode::IFDIR),
1171        DeviceType::NONE,
1172    )?;
1173    Ok(())
1174}
1175
1176pub fn sys_mknodat(
1177    locked: &mut Locked<Unlocked>,
1178    current_task: &CurrentTask,
1179    dir_fd: FdNumber,
1180    user_path: UserCString,
1181    mode: FileMode,
1182    dev: DeviceType,
1183) -> Result<(), Errno> {
1184    let file_type = match mode.fmt() {
1185        FileMode::IFREG
1186        | FileMode::IFCHR
1187        | FileMode::IFBLK
1188        | FileMode::IFIFO
1189        | FileMode::IFSOCK => mode.fmt(),
1190        FileMode::EMPTY => FileMode::IFREG,
1191        _ => return error!(EINVAL),
1192    };
1193    lookup_parent_at(locked, current_task, dir_fd, user_path, |locked, _, parent, basename| {
1194        parent.create_node(locked, current_task, basename, mode.with_type(file_type), dev)
1195    })?;
1196    Ok(())
1197}
1198
1199pub fn sys_linkat(
1200    locked: &mut Locked<Unlocked>,
1201    current_task: &CurrentTask,
1202    old_dir_fd: FdNumber,
1203    old_user_path: UserCString,
1204    new_dir_fd: FdNumber,
1205    new_user_path: UserCString,
1206    flags: u32,
1207) -> Result<(), Errno> {
1208    if flags & !(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH) != 0 {
1209        track_stub!(TODO("https://fxbug.dev/322875706"), "linkat unknown flags", flags);
1210        return error!(EINVAL);
1211    }
1212
1213    if flags & AT_EMPTY_PATH != 0 {
1214        security::check_task_capable(current_task, CAP_DAC_READ_SEARCH)
1215            .map_err(|_| errno!(ENOENT))?;
1216    }
1217
1218    let flags = LookupFlags::from_bits(flags, AT_EMPTY_PATH | AT_SYMLINK_FOLLOW)?;
1219    let target = lookup_at(locked, current_task, old_dir_fd, old_user_path, flags)?;
1220    lookup_parent_at(
1221        locked,
1222        current_task,
1223        new_dir_fd,
1224        new_user_path,
1225        |locked, context, parent, basename| {
1226            // The path to a new link cannot end in `/`. That would imply that we are dereferencing
1227            // the link to a directory.
1228            if context.must_be_directory {
1229                return error!(ENOENT);
1230            }
1231            if target.mount != parent.mount {
1232                return error!(EXDEV);
1233            }
1234            parent.link(locked, current_task, basename, &target.entry.node)
1235        },
1236    )?;
1237
1238    Ok(())
1239}
1240
1241pub fn sys_unlinkat(
1242    locked: &mut Locked<Unlocked>,
1243    current_task: &CurrentTask,
1244    dir_fd: FdNumber,
1245    user_path: UserCString,
1246    flags: u32,
1247) -> Result<(), Errno> {
1248    if flags & !AT_REMOVEDIR != 0 {
1249        return error!(EINVAL);
1250    }
1251    let kind =
1252        if flags & AT_REMOVEDIR != 0 { UnlinkKind::Directory } else { UnlinkKind::NonDirectory };
1253    lookup_parent_at(
1254        locked,
1255        current_task,
1256        dir_fd,
1257        user_path,
1258        |locked, context, parent, basename| {
1259            parent.unlink(locked, current_task, basename, kind, context.must_be_directory)
1260        },
1261    )?;
1262    Ok(())
1263}
1264
1265pub fn sys_renameat2(
1266    locked: &mut Locked<Unlocked>,
1267    current_task: &CurrentTask,
1268    old_dir_fd: FdNumber,
1269    old_user_path: UserCString,
1270    new_dir_fd: FdNumber,
1271    new_user_path: UserCString,
1272    flags: u32,
1273) -> Result<(), Errno> {
1274    let flags = RenameFlags::from_bits(flags).ok_or_else(|| errno!(EINVAL))?;
1275    if flags.intersects(RenameFlags::INTERNAL) {
1276        return error!(EINVAL);
1277    };
1278
1279    // RENAME_EXCHANGE cannot be combined with the other flags.
1280    if flags.contains(RenameFlags::EXCHANGE)
1281        && flags.intersects(RenameFlags::NOREPLACE | RenameFlags::WHITEOUT)
1282    {
1283        return error!(EINVAL);
1284    }
1285
1286    // RENAME_WHITEOUT is not supported.
1287    if flags.contains(RenameFlags::WHITEOUT) {
1288        track_stub!(TODO("https://fxbug.dev/322875416"), "RENAME_WHITEOUT");
1289        return error!(ENOSYS);
1290    };
1291
1292    let mut lookup = |dir_fd, user_path| {
1293        lookup_parent_at(locked, current_task, dir_fd, user_path, |_, _, parent, basename| {
1294            Ok((parent, basename.to_owned()))
1295        })
1296    };
1297
1298    let (old_parent, old_basename) = lookup(old_dir_fd, old_user_path)?;
1299    let (new_parent, new_basename) = lookup(new_dir_fd, new_user_path)?;
1300
1301    if new_basename.len() > NAME_MAX as usize {
1302        return error!(ENAMETOOLONG);
1303    }
1304
1305    NamespaceNode::rename(
1306        locked,
1307        current_task,
1308        &old_parent,
1309        old_basename.as_ref(),
1310        &new_parent,
1311        new_basename.as_ref(),
1312        flags,
1313    )
1314}
1315
1316pub fn sys_fchmod(
1317    locked: &mut Locked<Unlocked>,
1318    current_task: &CurrentTask,
1319    fd: FdNumber,
1320    mode: FileMode,
1321) -> Result<(), Errno> {
1322    // Remove the filetype from the mode.
1323    let mode = mode & FileMode::PERMISSIONS;
1324    let file = current_task.files.get(fd)?;
1325    file.name.entry.node.chmod(locked, current_task, &file.name.mount, mode)?;
1326    file.name.entry.notify_ignoring_excl_unlink(InotifyMask::ATTRIB);
1327    Ok(())
1328}
1329
1330pub fn sys_fchmodat(
1331    locked: &mut Locked<Unlocked>,
1332    current_task: &CurrentTask,
1333    dir_fd: FdNumber,
1334    user_path: UserCString,
1335    mode: FileMode,
1336) -> Result<(), Errno> {
1337    // Remove the filetype from the mode.
1338    let mode = mode & FileMode::PERMISSIONS;
1339    let name = lookup_at(locked, current_task, dir_fd, user_path, LookupFlags::default())?;
1340    name.entry.node.chmod(locked, current_task, &name.mount, mode)?;
1341    name.entry.notify_ignoring_excl_unlink(InotifyMask::ATTRIB);
1342    Ok(())
1343}
1344
1345fn maybe_uid(id: u32) -> Option<uid_t> {
1346    if id == u32::MAX { None } else { Some(id) }
1347}
1348
1349pub fn sys_fchown(
1350    locked: &mut Locked<Unlocked>,
1351    current_task: &CurrentTask,
1352    fd: FdNumber,
1353    owner: u32,
1354    group: u32,
1355) -> Result<(), Errno> {
1356    let file = current_task.files.get(fd)?;
1357    file.name.entry.node.chown(
1358        locked,
1359        current_task,
1360        &file.name.mount,
1361        maybe_uid(owner),
1362        maybe_uid(group),
1363    )?;
1364    file.name.entry.notify_ignoring_excl_unlink(InotifyMask::ATTRIB);
1365    Ok(())
1366}
1367
1368pub fn sys_fchownat(
1369    locked: &mut Locked<Unlocked>,
1370    current_task: &CurrentTask,
1371    dir_fd: FdNumber,
1372    user_path: UserCString,
1373    owner: u32,
1374    group: u32,
1375    flags: u32,
1376) -> Result<(), Errno> {
1377    let flags = LookupFlags::from_bits(flags, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW)?;
1378    let name = lookup_at(locked, current_task, dir_fd, user_path, flags)?;
1379    name.entry.node.chown(locked, current_task, &name.mount, maybe_uid(owner), maybe_uid(group))?;
1380    name.entry.notify_ignoring_excl_unlink(InotifyMask::ATTRIB);
1381    Ok(())
1382}
1383
1384fn read_xattr_name(current_task: &CurrentTask, name_addr: UserCString) -> Result<FsString, Errno> {
1385    let name = current_task
1386        .read_c_string_to_vec(name_addr, XATTR_NAME_MAX as usize + 1)
1387        .map_err(|e| if e == ENAMETOOLONG { errno!(ERANGE) } else { e })?;
1388    if name.is_empty() {
1389        return error!(ERANGE);
1390    }
1391    let dot_index = memchr::memchr(b'.', &name).ok_or_else(|| errno!(ENOTSUP))?;
1392    if name[dot_index + 1..].is_empty() {
1393        return error!(EINVAL);
1394    }
1395    match &name[..dot_index] {
1396        b"user" | b"security" | b"trusted" | b"system" => {}
1397        _ => return error!(ENOTSUP),
1398    }
1399    Ok(name)
1400}
1401
1402fn do_getxattr(
1403    locked: &mut Locked<Unlocked>,
1404    current_task: &CurrentTask,
1405    node: &NamespaceNode,
1406    name_addr: UserCString,
1407    value_addr: UserAddress,
1408    size: usize,
1409) -> Result<usize, Errno> {
1410    let name = read_xattr_name(current_task, name_addr)?;
1411    let value =
1412        match node.entry.node.get_xattr(locked, current_task, &node.mount, name.as_ref(), size)? {
1413            ValueOrSize::Size(s) => return Ok(s),
1414            ValueOrSize::Value(v) => v,
1415        };
1416    if size == 0 {
1417        return Ok(value.len());
1418    }
1419    if size < value.len() {
1420        return error!(ERANGE);
1421    }
1422    current_task.write_memory(value_addr, &value)
1423}
1424
1425pub fn sys_getxattr(
1426    locked: &mut Locked<Unlocked>,
1427    current_task: &CurrentTask,
1428    path_addr: UserCString,
1429    name_addr: UserCString,
1430    value_addr: UserAddress,
1431    size: usize,
1432) -> Result<usize, Errno> {
1433    let node =
1434        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::default())?;
1435    do_getxattr(locked, current_task, &node, name_addr, value_addr, size)
1436}
1437
1438pub fn sys_fgetxattr(
1439    locked: &mut Locked<Unlocked>,
1440    current_task: &CurrentTask,
1441    fd: FdNumber,
1442    name_addr: UserCString,
1443    value_addr: UserAddress,
1444    size: usize,
1445) -> Result<usize, Errno> {
1446    let file = current_task.files.get(fd)?;
1447    do_getxattr(locked, current_task, &file.name, name_addr, value_addr, size)
1448}
1449
1450pub fn sys_lgetxattr(
1451    locked: &mut Locked<Unlocked>,
1452    current_task: &CurrentTask,
1453    path_addr: UserCString,
1454    name_addr: UserCString,
1455    value_addr: UserAddress,
1456    size: usize,
1457) -> Result<usize, Errno> {
1458    let node =
1459        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::no_follow())?;
1460    do_getxattr(locked, current_task, &node, name_addr, value_addr, size)
1461}
1462
1463fn do_setxattr(
1464    locked: &mut Locked<Unlocked>,
1465    current_task: &CurrentTask,
1466    node: &NamespaceNode,
1467    name_addr: UserCString,
1468    value_addr: UserAddress,
1469    size: usize,
1470    flags: u32,
1471) -> Result<(), Errno> {
1472    if size > XATTR_NAME_MAX as usize {
1473        return error!(E2BIG);
1474    }
1475
1476    let op = match flags {
1477        0 => XattrOp::Set,
1478        XATTR_CREATE => XattrOp::Create,
1479        XATTR_REPLACE => XattrOp::Replace,
1480        _ => return error!(EINVAL),
1481    };
1482    let name = read_xattr_name(current_task, name_addr)?;
1483    let value = FsString::from(current_task.read_memory_to_vec(value_addr, size)?);
1484    node.entry.node.set_xattr(locked, current_task, &node.mount, name.as_ref(), value.as_ref(), op)
1485}
1486
1487pub fn sys_fsetxattr(
1488    locked: &mut Locked<Unlocked>,
1489    current_task: &CurrentTask,
1490    fd: FdNumber,
1491    name_addr: UserCString,
1492    value_addr: UserAddress,
1493    size: usize,
1494    flags: u32,
1495) -> Result<(), Errno> {
1496    let file = current_task.files.get(fd)?;
1497    do_setxattr(locked, current_task, &file.name, name_addr, value_addr, size, flags)
1498}
1499
1500pub fn sys_lsetxattr(
1501    locked: &mut Locked<Unlocked>,
1502    current_task: &CurrentTask,
1503    path_addr: UserCString,
1504    name_addr: UserCString,
1505    value_addr: UserAddress,
1506    size: usize,
1507    flags: u32,
1508) -> Result<(), Errno> {
1509    let node =
1510        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::no_follow())?;
1511    do_setxattr(locked, current_task, &node, name_addr, value_addr, size, flags)
1512}
1513
1514pub fn sys_setxattr(
1515    locked: &mut Locked<Unlocked>,
1516    current_task: &CurrentTask,
1517    path_addr: UserCString,
1518    name_addr: UserCString,
1519    value_addr: UserAddress,
1520    size: usize,
1521    flags: u32,
1522) -> Result<(), Errno> {
1523    let node =
1524        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::default())?;
1525    do_setxattr(locked, current_task, &node, name_addr, value_addr, size, flags)
1526}
1527
1528fn do_removexattr(
1529    locked: &mut Locked<Unlocked>,
1530    current_task: &CurrentTask,
1531    node: &NamespaceNode,
1532    name_addr: UserCString,
1533) -> Result<(), Errno> {
1534    let mode = node.entry.node.info().mode;
1535    if mode.is_chr() || mode.is_fifo() {
1536        return error!(EPERM);
1537    }
1538    let name = read_xattr_name(current_task, name_addr)?;
1539    node.entry.node.remove_xattr(locked, current_task, &node.mount, name.as_ref())
1540}
1541
1542pub fn sys_removexattr(
1543    locked: &mut Locked<Unlocked>,
1544    current_task: &CurrentTask,
1545    path_addr: UserCString,
1546    name_addr: UserCString,
1547) -> Result<(), Errno> {
1548    let node =
1549        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::default())?;
1550    do_removexattr(locked, current_task, &node, name_addr)
1551}
1552
1553pub fn sys_lremovexattr(
1554    locked: &mut Locked<Unlocked>,
1555    current_task: &CurrentTask,
1556    path_addr: UserCString,
1557    name_addr: UserCString,
1558) -> Result<(), Errno> {
1559    let node =
1560        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::no_follow())?;
1561    do_removexattr(locked, current_task, &node, name_addr)
1562}
1563
1564pub fn sys_fremovexattr(
1565    locked: &mut Locked<Unlocked>,
1566    current_task: &CurrentTask,
1567    fd: FdNumber,
1568    name_addr: UserCString,
1569) -> Result<(), Errno> {
1570    let file = current_task.files.get(fd)?;
1571    do_removexattr(locked, current_task, &file.name, name_addr)
1572}
1573
1574fn do_listxattr(
1575    locked: &mut Locked<Unlocked>,
1576    current_task: &CurrentTask,
1577    node: &NamespaceNode,
1578    list_addr: UserAddress,
1579    size: usize,
1580) -> Result<usize, Errno> {
1581    let security_xattr = security::fs_node_listsecurity(current_task, &node.entry.node);
1582    let xattrs = match node.entry.node.list_xattrs(locked, current_task, size) {
1583        Ok(ValueOrSize::Size(s)) => return Ok(s + security_xattr.map_or(0, |s| s.len() + 1)),
1584        Ok(ValueOrSize::Value(mut v)) => {
1585            if let Some(security_value) = security_xattr {
1586                if !v.contains(&security_value) {
1587                    v.push(security_value);
1588                }
1589            }
1590            v
1591        }
1592        Err(e) => {
1593            if e.code != ENOTSUP || security_xattr.is_none() {
1594                return Err(e);
1595            }
1596            vec![security_xattr.unwrap()]
1597        }
1598    };
1599
1600    let mut list = vec![];
1601    for name in xattrs.iter() {
1602        list.extend_from_slice(name);
1603        list.push(b'\0');
1604    }
1605    if size == 0 {
1606        return Ok(list.len());
1607    }
1608    if size < list.len() {
1609        return error!(ERANGE);
1610    }
1611    current_task.write_memory(list_addr, &list)
1612}
1613
1614pub fn sys_listxattr(
1615    locked: &mut Locked<Unlocked>,
1616    current_task: &CurrentTask,
1617    path_addr: UserCString,
1618    list_addr: UserAddress,
1619    size: usize,
1620) -> Result<usize, Errno> {
1621    let node =
1622        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::default())?;
1623    do_listxattr(locked, current_task, &node, list_addr, size)
1624}
1625
1626pub fn sys_llistxattr(
1627    locked: &mut Locked<Unlocked>,
1628    current_task: &CurrentTask,
1629    path_addr: UserCString,
1630    list_addr: UserAddress,
1631    size: usize,
1632) -> Result<usize, Errno> {
1633    let node =
1634        lookup_at(locked, current_task, FdNumber::AT_FDCWD, path_addr, LookupFlags::no_follow())?;
1635    do_listxattr(locked, current_task, &node, list_addr, size)
1636}
1637
1638pub fn sys_flistxattr(
1639    locked: &mut Locked<Unlocked>,
1640    current_task: &CurrentTask,
1641    fd: FdNumber,
1642    list_addr: UserAddress,
1643    size: usize,
1644) -> Result<usize, Errno> {
1645    let file = current_task.files.get(fd)?;
1646    do_listxattr(locked, current_task, &file.name, list_addr, size)
1647}
1648
1649pub fn sys_getcwd(
1650    _locked: &mut Locked<Unlocked>,
1651    current_task: &CurrentTask,
1652    buf: UserAddress,
1653    size: usize,
1654) -> Result<usize, Errno> {
1655    let root = current_task.fs().root();
1656    let cwd = current_task.fs().cwd();
1657    let mut user_cwd = match cwd.path_from_root(Some(&root)) {
1658        PathWithReachability::Reachable(path) => path,
1659        PathWithReachability::Unreachable(mut path) => {
1660            let mut combined = vec![];
1661            combined.extend_from_slice(b"(unreachable)");
1662            combined.append(&mut path);
1663            combined.into()
1664        }
1665    };
1666    user_cwd.push(b'\0');
1667    if user_cwd.len() > size {
1668        return error!(ERANGE);
1669    }
1670    current_task.write_memory(buf, &user_cwd)?;
1671    Ok(user_cwd.len())
1672}
1673
1674pub fn sys_umask(
1675    _locked: &mut Locked<Unlocked>,
1676    current_task: &CurrentTask,
1677    umask: FileMode,
1678) -> Result<FileMode, Errno> {
1679    Ok(current_task.fs().set_umask(umask))
1680}
1681
1682fn get_fd_flags(flags: u32) -> FdFlags {
1683    if flags & O_CLOEXEC != 0 { FdFlags::CLOEXEC } else { FdFlags::empty() }
1684}
1685
1686pub fn sys_pipe2(
1687    locked: &mut Locked<Unlocked>,
1688    current_task: &CurrentTask,
1689    user_pipe: UserRef<FdNumber>,
1690    flags: u32,
1691) -> Result<(), Errno> {
1692    let supported_file_flags = OpenFlags::NONBLOCK | OpenFlags::DIRECT;
1693    if flags & !(O_CLOEXEC | supported_file_flags.bits()) != 0 {
1694        return error!(EINVAL);
1695    }
1696    let (read, write) = new_pipe(locked, current_task)?;
1697
1698    let file_flags = OpenFlags::from_bits_truncate(flags & supported_file_flags.bits());
1699    read.update_file_flags(file_flags, supported_file_flags);
1700    write.update_file_flags(file_flags, supported_file_flags);
1701
1702    let fd_flags = get_fd_flags(flags);
1703    let fd_read = current_task.add_file(locked, read, fd_flags)?;
1704    let fd_write = current_task.add_file(locked, write, fd_flags)?;
1705    log_trace!("pipe2 -> [{:#x}, {:#x}]", fd_read.raw(), fd_write.raw());
1706
1707    current_task.write_object(user_pipe, &fd_read)?;
1708    let user_pipe = user_pipe.next()?;
1709    current_task.write_object(user_pipe, &fd_write)?;
1710
1711    Ok(())
1712}
1713
1714pub fn sys_ioctl(
1715    locked: &mut Locked<Unlocked>,
1716    current_task: &CurrentTask,
1717    fd: FdNumber,
1718    request: u32,
1719    arg: SyscallArg,
1720) -> Result<SyscallResult, Errno> {
1721    match request {
1722        FIOCLEX => {
1723            current_task.files.set_fd_flags(fd, FdFlags::CLOEXEC)?;
1724            Ok(SUCCESS)
1725        }
1726        FIONCLEX => {
1727            current_task.files.set_fd_flags(fd, FdFlags::empty())?;
1728            Ok(SUCCESS)
1729        }
1730        _ => {
1731            let file = current_task.files.get(fd)?;
1732            file.ioctl(locked, current_task, request, arg)
1733        }
1734    }
1735}
1736
1737pub fn sys_symlinkat(
1738    locked: &mut Locked<Unlocked>,
1739    current_task: &CurrentTask,
1740    user_target: UserCString,
1741    new_dir_fd: FdNumber,
1742    user_path: UserCString,
1743) -> Result<(), Errno> {
1744    let target = current_task.read_path(user_target)?;
1745    if target.is_empty() {
1746        return error!(ENOENT);
1747    }
1748
1749    let path = current_task.read_path(user_path)?;
1750    // TODO: This check could probably be moved into parent.symlink(..).
1751    if path.is_empty() {
1752        return error!(ENOENT);
1753    }
1754
1755    let res = lookup_parent_at(
1756        locked,
1757        current_task,
1758        new_dir_fd,
1759        user_path,
1760        |locked, context, parent, basename| {
1761            // The path to a new symlink cannot end in `/`. That would imply that we are dereferencing
1762            // the symlink to a directory.
1763            //
1764            // See https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap03.html#tag_21_03_00_75
1765            if context.must_be_directory {
1766                return error!(ENOENT);
1767            }
1768            parent.create_symlink(locked, current_task, basename, target.as_ref())
1769        },
1770    );
1771    res?;
1772    Ok(())
1773}
1774
1775pub fn sys_dup(
1776    locked: &mut Locked<Unlocked>,
1777    current_task: &CurrentTask,
1778    oldfd: FdNumber,
1779) -> Result<FdNumber, Errno> {
1780    current_task.files.duplicate(
1781        locked,
1782        current_task,
1783        oldfd,
1784        TargetFdNumber::Default,
1785        FdFlags::empty(),
1786    )
1787}
1788
1789pub fn sys_dup3(
1790    locked: &mut Locked<Unlocked>,
1791    current_task: &CurrentTask,
1792    oldfd: FdNumber,
1793    newfd: FdNumber,
1794    flags: u32,
1795) -> Result<FdNumber, Errno> {
1796    if oldfd == newfd {
1797        return error!(EINVAL);
1798    }
1799    if flags & !O_CLOEXEC != 0 {
1800        return error!(EINVAL);
1801    }
1802    let fd_flags = get_fd_flags(flags);
1803    current_task.files.duplicate(
1804        locked,
1805        current_task,
1806        oldfd,
1807        TargetFdNumber::Specific(newfd),
1808        fd_flags,
1809    )?;
1810    Ok(newfd)
1811}
1812
1813/// A memfd file descriptor cannot have a name longer than 250 bytes, including
1814/// the null terminator.
1815///
1816/// See Errors section of https://man7.org/linux/man-pages/man2/memfd_create.2.html
1817const MEMFD_NAME_MAX_LEN: usize = 250;
1818
1819pub fn sys_memfd_create(
1820    locked: &mut Locked<Unlocked>,
1821    current_task: &CurrentTask,
1822    user_name: UserCString,
1823    flags: u32,
1824) -> Result<FdNumber, Errno> {
1825    const HUGE_SHIFTED_MASK: u32 = MFD_HUGE_MASK << MFD_HUGE_SHIFT;
1826
1827    if flags
1828        & !(MFD_CLOEXEC
1829            | MFD_ALLOW_SEALING
1830            | MFD_HUGETLB
1831            | HUGE_SHIFTED_MASK
1832            | MFD_NOEXEC_SEAL
1833            | MFD_EXEC)
1834        != 0
1835    {
1836        track_stub!(TODO("https://fxbug.dev/322875665"), "memfd_create unknown flags", flags);
1837        return error!(EINVAL);
1838    }
1839
1840    let _huge_page_size = if flags & MFD_HUGETLB != 0 {
1841        Some(flags & HUGE_SHIFTED_MASK)
1842    } else {
1843        if flags & HUGE_SHIFTED_MASK != 0 {
1844            return error!(EINVAL);
1845        }
1846        None
1847    };
1848
1849    let name = current_task
1850        .read_c_string_to_vec(user_name, MEMFD_NAME_MAX_LEN)
1851        .map_err(|e| if e == ENAMETOOLONG { errno!(EINVAL) } else { e })?;
1852
1853    // This behavior matches MEMFD_NOEXEC_SCOPE_EXEC, which states:
1854    //   > memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like MFD_EXEC was set.
1855    //
1856    // This behavior can be changed on Linux via sysctl vm.memfd_noexec, which is pid namespaced.
1857    // We do not currently support changing this behavior.
1858    let seals = if flags & MFD_NOEXEC_SEAL != 0 {
1859        SealFlags::NO_EXEC
1860    } else if flags & MFD_ALLOW_SEALING != 0 {
1861        SealFlags::empty()
1862    } else {
1863        // Forbid sealing, by sealing the seal operation.
1864        SealFlags::SEAL
1865    };
1866
1867    let file = new_memfd(locked, current_task, name, seals, OpenFlags::RDWR)?;
1868
1869    let mut fd_flags = FdFlags::empty();
1870    if flags & MFD_CLOEXEC != 0 {
1871        fd_flags |= FdFlags::CLOEXEC;
1872    }
1873    let fd = current_task.add_file(locked, file, fd_flags)?;
1874    Ok(fd)
1875}
1876
1877pub fn sys_mount(
1878    locked: &mut Locked<Unlocked>,
1879    current_task: &CurrentTask,
1880    source_addr: UserCString,
1881    target_addr: UserCString,
1882    filesystemtype_addr: UserCString,
1883    flags: u32,
1884    data_addr: UserCString,
1885) -> Result<(), Errno> {
1886    security::check_task_capable(current_task, CAP_SYS_ADMIN)?;
1887
1888    let flags = MountFlags::from_bits(flags).ok_or_else(|| {
1889        track_stub!(
1890            TODO("https://fxbug.dev/322875327"),
1891            "mount unknown flags",
1892            flags & !MountFlags::from_bits_truncate(flags).bits()
1893        );
1894        errno!(EINVAL)
1895    })?;
1896
1897    let target =
1898        lookup_at(locked, current_task, FdNumber::AT_FDCWD, target_addr, LookupFlags::default())?;
1899
1900    security::sb_mount(current_task, &target, flags)?;
1901
1902    if flags.contains(MountFlags::REMOUNT) {
1903        do_mount_remount(current_task, target, flags, data_addr)
1904    } else if flags.contains(MountFlags::BIND) {
1905        do_mount_bind(locked, current_task, source_addr, target, flags)
1906    } else if flags.intersects(MountFlags::SHARED | MountFlags::PRIVATE | MountFlags::DOWNSTREAM) {
1907        do_mount_change_propagation_type(current_task, target, flags)
1908    } else {
1909        do_mount_create(
1910            locked,
1911            current_task,
1912            source_addr,
1913            target,
1914            filesystemtype_addr,
1915            data_addr,
1916            flags,
1917        )
1918    }
1919}
1920
1921fn do_mount_remount(
1922    current_task: &CurrentTask,
1923    target: NamespaceNode,
1924    flags: MountFlags,
1925    data_addr: UserCString,
1926) -> Result<(), Errno> {
1927    if !data_addr.is_null() {
1928        track_stub!(TODO("https://fxbug.dev/322875506"), "MS_REMOUNT: Updating data");
1929    }
1930    let mount = target.mount_if_root()?;
1931
1932    let data = current_task.read_path_if_non_null(data_addr)?;
1933    let mount_options =
1934        security::sb_eat_lsm_opts(current_task.kernel(), &mut MountParams::parse(data.as_ref())?)?;
1935    security::sb_remount(current_task, &mount, mount_options)?;
1936    let updated_flags = flags & MountFlags::CHANGEABLE_WITH_REMOUNT;
1937    mount.update_flags(updated_flags);
1938    if !flags.contains(MountFlags::BIND) {
1939        // From <https://man7.org/linux/man-pages/man2/mount.2.html>
1940        //
1941        //   Since Linux 2.6.26, the MS_REMOUNT flag can be used with MS_BIND
1942        //   to modify only the per-mount-point flags.  This is particularly
1943        //   useful for setting or clearing the "read-only" flag on a mount
1944        //   without changing the underlying filesystem.
1945        track_stub!(TODO("https://fxbug.dev/322875215"), "MS_REMOUNT: Updating superblock flags");
1946    }
1947    Ok(())
1948}
1949
1950fn do_mount_bind(
1951    locked: &mut Locked<Unlocked>,
1952    current_task: &CurrentTask,
1953    source_addr: UserCString,
1954    target: NamespaceNode,
1955    flags: MountFlags,
1956) -> Result<(), Errno> {
1957    let source =
1958        lookup_at(locked, current_task, FdNumber::AT_FDCWD, source_addr, LookupFlags::default())?;
1959    log_trace!(
1960        source:% = source.path(current_task),
1961        target:% = target.path(current_task),
1962        flags:?;
1963        "do_mount_bind",
1964    );
1965    target.mount(WhatToMount::Bind(source), flags)
1966}
1967
1968fn do_mount_change_propagation_type(
1969    current_task: &CurrentTask,
1970    target: NamespaceNode,
1971    flags: MountFlags,
1972) -> Result<(), Errno> {
1973    log_trace!(
1974        target:% = target.path(current_task),
1975        flags:?;
1976        "do_mount_change_propagation_type",
1977    );
1978
1979    // Flag validation. Of the three propagation type flags, exactly one must be passed. The only
1980    // valid flags other than propagation type are MS_SILENT and MS_REC.
1981    //
1982    // Use if statements to find the first propagation type flag, then check for valid flags using
1983    // only the first propagation flag and MS_REC / MS_SILENT as valid flags.
1984    let propagation_flag = if flags.contains(MountFlags::SHARED) {
1985        MountFlags::SHARED
1986    } else if flags.contains(MountFlags::PRIVATE) {
1987        MountFlags::PRIVATE
1988    } else if flags.contains(MountFlags::DOWNSTREAM) {
1989        MountFlags::DOWNSTREAM
1990    } else {
1991        return error!(EINVAL);
1992    };
1993    if flags.intersects(!(propagation_flag | MountFlags::REC | MountFlags::SILENT)) {
1994        return error!(EINVAL);
1995    }
1996
1997    let mount = target.mount_if_root()?;
1998    mount.change_propagation(propagation_flag, flags.contains(MountFlags::REC));
1999    Ok(())
2000}
2001
2002fn do_mount_create(
2003    locked: &mut Locked<Unlocked>,
2004    current_task: &CurrentTask,
2005    source_addr: UserCString,
2006    target: NamespaceNode,
2007    filesystemtype_addr: UserCString,
2008    data_addr: UserCString,
2009    flags: MountFlags,
2010) -> Result<(), Errno> {
2011    let source = current_task.read_path_if_non_null(source_addr)?;
2012    let fs_type = current_task.read_path(filesystemtype_addr)?;
2013    let data = current_task.read_path_if_non_null(data_addr)?;
2014    log_trace!(
2015        source:%,
2016        target:% = target.path(current_task),
2017        fs_type:%,
2018        data:%;
2019        "do_mount_create",
2020    );
2021
2022    let options = FileSystemOptions {
2023        source: source.into(),
2024        flags: flags & MountFlags::STORED_ON_FILESYSTEM,
2025        params: MountParams::parse(data.as_ref())?,
2026    };
2027
2028    let fs = current_task.create_filesystem(locked, fs_type.as_ref(), options)?;
2029
2030    security::sb_kern_mount(current_task, &fs)?;
2031    target.mount(WhatToMount::Fs(fs), flags)
2032}
2033
2034pub fn sys_umount2(
2035    locked: &mut Locked<Unlocked>,
2036    current_task: &CurrentTask,
2037    target_addr: UserCString,
2038    flags: u32,
2039) -> Result<(), Errno> {
2040    security::check_task_capable(current_task, CAP_SYS_ADMIN)?;
2041
2042    let unmount_flags = UnmountFlags::from_bits(flags).ok_or_else(|| {
2043        track_stub!(
2044            TODO("https://fxbug.dev/322875327"),
2045            "unmount unknown flags",
2046            flags & !UnmountFlags::from_bits_truncate(flags).bits()
2047        );
2048        errno!(EINVAL)
2049    })?;
2050
2051    if unmount_flags.contains(UnmountFlags::EXPIRE)
2052        && (unmount_flags.contains(UnmountFlags::FORCE)
2053            || unmount_flags.contains(UnmountFlags::DETACH))
2054    {
2055        return error!(EINVAL);
2056    }
2057
2058    let lookup_flags = if unmount_flags.contains(UnmountFlags::NOFOLLOW) {
2059        LookupFlags::no_follow()
2060    } else {
2061        LookupFlags::default()
2062    };
2063    let target = lookup_at(locked, current_task, FdNumber::AT_FDCWD, target_addr, lookup_flags)?;
2064
2065    security::sb_umount(current_task, &target, unmount_flags)?;
2066
2067    target.unmount(unmount_flags)
2068}
2069
2070pub fn sys_eventfd2(
2071    locked: &mut Locked<Unlocked>,
2072    current_task: &CurrentTask,
2073    value: u32,
2074    flags: u32,
2075) -> Result<FdNumber, Errno> {
2076    if flags & !(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE) != 0 {
2077        return error!(EINVAL);
2078    }
2079    let blocking = (flags & EFD_NONBLOCK) == 0;
2080    let eventfd_type =
2081        if (flags & EFD_SEMAPHORE) == 0 { EventFdType::Counter } else { EventFdType::Semaphore };
2082    let file = new_eventfd(locked, current_task, value, eventfd_type, blocking);
2083    let fd_flags = if flags & EFD_CLOEXEC != 0 { FdFlags::CLOEXEC } else { FdFlags::empty() };
2084    let fd = current_task.add_file(locked, file, fd_flags)?;
2085    Ok(fd)
2086}
2087
2088pub fn sys_pidfd_open(
2089    locked: &mut Locked<Unlocked>,
2090    current_task: &CurrentTask,
2091    pid: pid_t,
2092    flags: u32,
2093) -> Result<FdNumber, Errno> {
2094    if flags & !PIDFD_NONBLOCK != 0 {
2095        return error!(EINVAL);
2096    }
2097    if pid <= 0 {
2098        return error!(EINVAL);
2099    }
2100
2101    let file = {
2102        let pid_table = current_task.kernel().pids.read();
2103
2104        let blocking = (flags & PIDFD_NONBLOCK) == 0;
2105        let open_flags = if blocking { OpenFlags::empty() } else { OpenFlags::NONBLOCK };
2106
2107        // Validate that a process (and not just a task) entry exists for the PID.
2108        let task = pid_table.get_task(pid);
2109        let file = match (pid_table.get_process(pid), task.upgrade()) {
2110            (Some(ProcessEntryRef::Process(proc)), Some(task)) => {
2111                new_pidfd(locked, current_task, &proc, &*task.mm()?, open_flags)
2112            }
2113            (Some(ProcessEntryRef::Zombie(_)), _) => {
2114                new_zombie_pidfd(locked, current_task, open_flags)
2115            }
2116            (None, Some(_)) => return error!(EINVAL),
2117            _ => return error!(ESRCH),
2118        };
2119        file
2120    };
2121
2122    current_task.add_file(locked, file, FdFlags::CLOEXEC)
2123}
2124
2125pub fn sys_pidfd_getfd(
2126    locked: &mut Locked<Unlocked>,
2127    current_task: &CurrentTask,
2128    pidfd: FdNumber,
2129    targetfd: FdNumber,
2130    flags: u32,
2131) -> Result<FdNumber, Errno> {
2132    if flags != 0 {
2133        return error!(EINVAL);
2134    }
2135
2136    let file = current_task.files.get(pidfd)?;
2137    let tg = file.as_thread_group_key()?;
2138    let tg = tg.upgrade().ok_or_else(|| errno!(ESRCH))?;
2139    let task = TempRef::into_static(tg.read().tasks().next().ok_or_else(|| errno!(ESRCH))?);
2140
2141    current_task.check_ptrace_access_mode(locked, PTRACE_MODE_ATTACH_REALCREDS, &task)?;
2142
2143    let target_file = task.files.get(targetfd)?;
2144    current_task.add_file(locked, target_file, FdFlags::CLOEXEC)
2145}
2146
2147pub fn sys_timerfd_create(
2148    locked: &mut Locked<Unlocked>,
2149    current_task: &CurrentTask,
2150    clock_id: u32,
2151    flags: u32,
2152) -> Result<FdNumber, Errno> {
2153    let timeline = match clock_id {
2154        CLOCK_MONOTONIC => Timeline::Monotonic,
2155        CLOCK_BOOTTIME | CLOCK_BOOTTIME_ALARM => Timeline::BootInstant,
2156        CLOCK_REALTIME | CLOCK_REALTIME_ALARM => Timeline::RealTime,
2157        _ => return error!(EINVAL),
2158    };
2159    let timer_type = match clock_id {
2160        CLOCK_MONOTONIC | CLOCK_BOOTTIME | CLOCK_REALTIME => TimerWakeup::Regular,
2161        CLOCK_BOOTTIME_ALARM | CLOCK_REALTIME_ALARM => {
2162            security::check_task_capable(current_task, CAP_WAKE_ALARM)?;
2163            TimerWakeup::Alarm
2164        }
2165        _ => return error!(EINVAL),
2166    };
2167    if flags & !(TFD_NONBLOCK | TFD_CLOEXEC) != 0 {
2168        track_stub!(TODO("https://fxbug.dev/322875488"), "timerfd_create unknown flags", flags);
2169        return error!(EINVAL);
2170    }
2171    log_trace!("timerfd_create(clock_id={:?}, flags={:#x})", clock_id, flags);
2172
2173    let mut open_flags = OpenFlags::RDWR;
2174    if flags & TFD_NONBLOCK != 0 {
2175        open_flags |= OpenFlags::NONBLOCK;
2176    }
2177
2178    let mut fd_flags = FdFlags::empty();
2179    if flags & TFD_CLOEXEC != 0 {
2180        fd_flags |= FdFlags::CLOEXEC;
2181    };
2182
2183    let timer = TimerFile::new_file(locked, current_task, timer_type, timeline, open_flags)?;
2184    let fd = current_task.add_file(locked, timer, fd_flags)?;
2185    Ok(fd)
2186}
2187
2188pub fn sys_timerfd_gettime(
2189    _locked: &mut Locked<Unlocked>,
2190    current_task: &CurrentTask,
2191    fd: FdNumber,
2192    user_current_value: ITimerSpecPtr,
2193) -> Result<(), Errno> {
2194    let file = current_task.files.get(fd)?;
2195    let timer_file = file.downcast_file::<TimerFile>().ok_or_else(|| errno!(EINVAL))?;
2196    let timer_info = timer_file.current_timer_spec();
2197    log_trace!("timerfd_gettime(fd={:?}, current_value={:?})", fd, timer_info);
2198    current_task.write_multi_arch_object(user_current_value, timer_info)?;
2199    Ok(())
2200}
2201
2202pub fn sys_timerfd_settime(
2203    _locked: &mut Locked<Unlocked>,
2204    current_task: &CurrentTask,
2205    fd: FdNumber,
2206    flags: u32,
2207    user_new_value: ITimerSpecPtr,
2208    user_old_value: ITimerSpecPtr,
2209) -> Result<(), Errno> {
2210    if flags & !(TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET) != 0 {
2211        track_stub!(TODO("https://fxbug.dev/322874722"), "timerfd_settime unknown flags", flags);
2212        return error!(EINVAL);
2213    }
2214
2215    let file = current_task.files.get(fd)?;
2216    let timer_file = file.downcast_file::<TimerFile>().ok_or_else(|| errno!(EINVAL))?;
2217
2218    let new_timer_spec = current_task.read_multi_arch_object(user_new_value)?;
2219    let old_timer_spec = timer_file.set_timer_spec(current_task, &file, new_timer_spec, flags)?;
2220    log_trace!(
2221        "timerfd_settime(fd={:?}, flags={:#x}, new_value={:?}, current_value={:?})",
2222        fd,
2223        flags,
2224        new_timer_spec,
2225        old_timer_spec
2226    );
2227    if !user_old_value.is_null() {
2228        current_task.write_multi_arch_object(user_old_value, old_timer_spec)?;
2229    }
2230    Ok(())
2231}
2232
2233fn deadline_after_timespec(
2234    current_task: &CurrentTask,
2235    user_timespec: TimeSpecPtr,
2236) -> Result<zx::MonotonicInstant, Errno> {
2237    if user_timespec.is_null() {
2238        Ok(zx::MonotonicInstant::INFINITE)
2239    } else {
2240        let timespec = current_task.read_multi_arch_object(user_timespec)?;
2241        Ok(zx::MonotonicInstant::after(duration_from_timespec(timespec)?))
2242    }
2243}
2244
2245static_assertions::assert_eq_size!(uapi::__kernel_fd_set, uapi::arch32::__kernel_fd_set);
2246
2247fn select(
2248    locked: &mut Locked<Unlocked>,
2249    current_task: &mut CurrentTask,
2250    nfds: u32,
2251    readfds_addr: UserRef<__kernel_fd_set>,
2252    writefds_addr: UserRef<__kernel_fd_set>,
2253    exceptfds_addr: UserRef<__kernel_fd_set>,
2254    deadline: zx::MonotonicInstant,
2255    sigmask_addr: UserRef<pselect6_sigmask>,
2256) -> Result<i32, Errno> {
2257    const BITS_PER_BYTE: usize = 8;
2258
2259    fn sizeof<T>(_: &T) -> usize {
2260        BITS_PER_BYTE * std::mem::size_of::<T>()
2261    }
2262    fn is_fd_set(set: &__kernel_fd_set, fd: usize) -> bool {
2263        let index = fd / sizeof(&set.fds_bits[0]);
2264        let remainder = fd % sizeof(&set.fds_bits[0]);
2265        set.fds_bits[index] & (1 << remainder) > 0
2266    }
2267    fn add_fd_to_set(set: &mut __kernel_fd_set, fd: usize) {
2268        let index = fd / sizeof(&set.fds_bits[0]);
2269        let remainder = fd % sizeof(&set.fds_bits[0]);
2270
2271        set.fds_bits[index] |= 1 << remainder;
2272    }
2273    let read_fd_set = |addr: UserRef<__kernel_fd_set>| {
2274        if addr.is_null() { Ok(Default::default()) } else { current_task.read_object(addr) }
2275    };
2276
2277    if nfds as usize > BITS_PER_BYTE * std::mem::size_of::<__kernel_fd_set>() {
2278        return error!(EINVAL);
2279    }
2280
2281    let read_events =
2282        FdEvents::from_bits_truncate(POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR);
2283    let write_events = FdEvents::from_bits_truncate(POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR);
2284    let except_events = FdEvents::from_bits_truncate(POLLPRI);
2285
2286    let readfds = read_fd_set(readfds_addr)?;
2287    let writefds = read_fd_set(writefds_addr)?;
2288    let exceptfds = read_fd_set(exceptfds_addr)?;
2289
2290    let sets = &[(read_events, &readfds), (write_events, &writefds), (except_events, &exceptfds)];
2291    let waiter = FileWaiter::<FdNumber>::default();
2292
2293    for fd in 0..nfds {
2294        let mut aggregated_events = FdEvents::empty();
2295        for (events, fds) in sets.iter() {
2296            if is_fd_set(fds, fd as usize) {
2297                aggregated_events |= *events;
2298            }
2299        }
2300        if !aggregated_events.is_empty() {
2301            let fd = FdNumber::from_raw(fd as i32);
2302            let file = current_task.files.get(fd)?;
2303            waiter.add(locked, current_task, fd, Some(&file), aggregated_events)?;
2304        }
2305    }
2306
2307    let mask = if !sigmask_addr.is_null() {
2308        let sigmask = current_task.read_object(sigmask_addr)?;
2309        let mask = if sigmask.ss.is_null() {
2310            current_task.read().signal_mask()
2311        } else {
2312            if sigmask.ss_len < std::mem::size_of::<sigset_t>() {
2313                return error!(EINVAL);
2314            }
2315            current_task.read_object(sigmask.ss.into())?
2316        };
2317        Some(mask)
2318    } else {
2319        None
2320    };
2321
2322    waiter.wait(locked, current_task, mask, deadline)?;
2323
2324    let mut num_fds = 0;
2325    let mut readfds_out: __kernel_fd_set = Default::default();
2326    let mut writefds_out: __kernel_fd_set = Default::default();
2327    let mut exceptfds_out: __kernel_fd_set = Default::default();
2328    let mut sets = [
2329        (read_events, &readfds, &mut readfds_out),
2330        (write_events, &writefds, &mut writefds_out),
2331        (except_events, &exceptfds, &mut exceptfds_out),
2332    ];
2333    let mut ready_items = waiter.ready_items.lock();
2334    for ReadyItem { key: ready_key, events: ready_events } in ready_items.drain(..) {
2335        let ready_key = assert_matches::assert_matches!(
2336            ready_key,
2337            ReadyItemKey::FdNumber(v) => v
2338        );
2339
2340        sets.iter_mut().for_each(|(events, fds, fds_out)| {
2341            let fd = ready_key.raw() as usize;
2342            if events.intersects(ready_events) && is_fd_set(fds, fd) {
2343                add_fd_to_set(fds_out, fd);
2344                num_fds += 1;
2345            }
2346        });
2347    }
2348
2349    let write_fd_set =
2350        |addr: UserRef<__kernel_fd_set>, value: __kernel_fd_set| -> Result<(), Errno> {
2351            if !addr.is_null() {
2352                current_task.write_object(addr, &value)?;
2353            }
2354            Ok(())
2355        };
2356    write_fd_set(readfds_addr, readfds_out)?;
2357    write_fd_set(writefds_addr, writefds_out)?;
2358    write_fd_set(exceptfds_addr, exceptfds_out)?;
2359    Ok(num_fds)
2360}
2361
2362pub fn sys_pselect6(
2363    locked: &mut Locked<Unlocked>,
2364    current_task: &mut CurrentTask,
2365    nfds: u32,
2366    readfds_addr: UserRef<__kernel_fd_set>,
2367    writefds_addr: UserRef<__kernel_fd_set>,
2368    exceptfds_addr: UserRef<__kernel_fd_set>,
2369    timeout_addr: TimeSpecPtr,
2370    sigmask_addr: UserRef<pselect6_sigmask>,
2371) -> Result<i32, Errno> {
2372    let deadline = deadline_after_timespec(current_task, timeout_addr)?;
2373
2374    let num_fds = select(
2375        locked,
2376        current_task,
2377        nfds,
2378        readfds_addr,
2379        writefds_addr,
2380        exceptfds_addr,
2381        deadline,
2382        sigmask_addr,
2383    )?;
2384
2385    if !timeout_addr.is_null()
2386        && !current_task
2387            .thread_group()
2388            .read()
2389            .personality
2390            .contains(PersonalityFlags::STICKY_TIMEOUTS)
2391    {
2392        let now = zx::MonotonicInstant::get();
2393        let remaining = std::cmp::max(deadline - now, zx::MonotonicDuration::from_seconds(0));
2394        current_task.write_multi_arch_object(timeout_addr, timespec_from_duration(remaining))?;
2395    }
2396
2397    Ok(num_fds)
2398}
2399
2400pub fn sys_select(
2401    locked: &mut Locked<Unlocked>,
2402    current_task: &mut CurrentTask,
2403    nfds: u32,
2404    readfds_addr: UserRef<__kernel_fd_set>,
2405    writefds_addr: UserRef<__kernel_fd_set>,
2406    exceptfds_addr: UserRef<__kernel_fd_set>,
2407    timeout_addr: TimeValPtr,
2408) -> Result<i32, Errno> {
2409    let start_time = zx::MonotonicInstant::get();
2410
2411    let deadline = if timeout_addr.is_null() {
2412        zx::MonotonicInstant::INFINITE
2413    } else {
2414        let timeval = current_task.read_multi_arch_object(timeout_addr)?;
2415        start_time + starnix_types::time::duration_from_timeval(timeval)?
2416    };
2417
2418    let num_fds = select(
2419        locked,
2420        current_task,
2421        nfds,
2422        readfds_addr,
2423        writefds_addr,
2424        exceptfds_addr,
2425        deadline,
2426        UserRef::<pselect6_sigmask>::default(),
2427    )?;
2428
2429    if !timeout_addr.is_null()
2430        && !current_task
2431            .thread_group()
2432            .read()
2433            .personality
2434            .contains(PersonalityFlags::STICKY_TIMEOUTS)
2435    {
2436        let now = zx::MonotonicInstant::get();
2437        let remaining = std::cmp::max(deadline - now, zx::MonotonicDuration::from_seconds(0));
2438        current_task.write_multi_arch_object(
2439            timeout_addr,
2440            starnix_types::time::timeval_from_duration(remaining),
2441        )?;
2442    }
2443
2444    Ok(num_fds)
2445}
2446
2447pub fn sys_epoll_create1(
2448    locked: &mut Locked<Unlocked>,
2449    current_task: &CurrentTask,
2450    flags: u32,
2451) -> Result<FdNumber, Errno> {
2452    if flags & !EPOLL_CLOEXEC != 0 {
2453        return error!(EINVAL);
2454    }
2455    let ep_file = EpollFileObject::new_file(locked, current_task);
2456    let fd_flags = if flags & EPOLL_CLOEXEC != 0 { FdFlags::CLOEXEC } else { FdFlags::empty() };
2457    let fd = current_task.add_file(locked, ep_file, fd_flags)?;
2458    Ok(fd)
2459}
2460
2461pub fn sys_epoll_ctl(
2462    locked: &mut Locked<Unlocked>,
2463    current_task: &CurrentTask,
2464    epfd: FdNumber,
2465    op: u32,
2466    fd: FdNumber,
2467    event: UserRef<EpollEvent>,
2468) -> Result<(), Errno> {
2469    let file = current_task.files.get(epfd)?;
2470    let epoll_file = file.downcast_file::<EpollFileObject>().ok_or_else(|| errno!(EINVAL))?;
2471    let operand_file = current_task.files.get(fd)?;
2472
2473    if Arc::ptr_eq(&file, &operand_file) {
2474        return error!(EINVAL);
2475    }
2476
2477    let epoll_event = match current_task.read_object(event) {
2478        Ok(mut epoll_event) => {
2479            // If EPOLLWAKEUP is specified in flags, but the caller does not have the CAP_BLOCK_SUSPEND
2480            // capability, then the EPOLLWAKEUP flag is silently ignored.
2481            // See https://man7.org/linux/man-pages/man2/epoll_ctl.2.html
2482            if epoll_event.events().contains(FdEvents::EPOLLWAKEUP) {
2483                if !security::is_task_capable_noaudit(current_task, CAP_BLOCK_SUSPEND) {
2484                    epoll_event.ignore(FdEvents::EPOLLWAKEUP);
2485                }
2486            }
2487            Ok(epoll_event)
2488        }
2489        result => result,
2490    };
2491
2492    match op {
2493        EPOLL_CTL_ADD => {
2494            epoll_file.add(locked, current_task, &operand_file, &file, epoll_event?)?;
2495            operand_file.register_epfd(&file);
2496        }
2497        EPOLL_CTL_MOD => {
2498            epoll_file.modify(locked, current_task, &operand_file, epoll_event?)?;
2499        }
2500        EPOLL_CTL_DEL => {
2501            epoll_file.delete(&operand_file)?;
2502            current_task
2503                .kernel()
2504                .suspend_resume_manager
2505                .remove_epoll(operand_file.id.as_epoll_key());
2506            operand_file.unregister_epfd(&file);
2507        }
2508        _ => return error!(EINVAL),
2509    }
2510    Ok(())
2511}
2512
2513// Backend for sys_epoll_pwait and sys_epoll_pwait2 that takes an already-decoded deadline.
2514fn do_epoll_pwait(
2515    locked: &mut Locked<Unlocked>,
2516    current_task: &mut CurrentTask,
2517    epfd: FdNumber,
2518    events: UserRef<EpollEvent>,
2519    unvalidated_max_events: i32,
2520    deadline: zx::MonotonicInstant,
2521    user_sigmask: UserRef<SigSet>,
2522) -> Result<usize, Errno> {
2523    let file = current_task.files.get(epfd)?;
2524    let epoll_file = file.downcast_file::<EpollFileObject>().ok_or_else(|| errno!(EINVAL))?;
2525
2526    // Max_events must be greater than 0.
2527    let max_events: usize = unvalidated_max_events.try_into().map_err(|_| errno!(EINVAL))?;
2528    if max_events == 0 {
2529        return error!(EINVAL);
2530    }
2531
2532    // Return early if the user passes an obviously invalid pointer. This avoids dropping events
2533    // for common pointer errors. When we catch bad pointers after the wait is complete when the
2534    // memory is actually written, the events will be lost. This check is not a guarantee.
2535    current_task
2536        .mm()?
2537        .check_plausible(events.addr(), max_events * std::mem::size_of::<EpollEvent>())?;
2538
2539    let active_events = if !user_sigmask.is_null() {
2540        let signal_mask = current_task.read_object(user_sigmask)?;
2541        current_task.wait_with_temporary_mask(locked, signal_mask, |locked, current_task| {
2542            epoll_file.wait(locked, current_task, max_events, deadline)
2543        })?
2544    } else {
2545        epoll_file.wait(locked, current_task, max_events, deadline)?
2546    };
2547
2548    current_task.write_objects(events, &active_events)?;
2549    Ok(active_events.len())
2550}
2551
2552pub fn sys_epoll_pwait(
2553    locked: &mut Locked<Unlocked>,
2554    current_task: &mut CurrentTask,
2555    epfd: FdNumber,
2556    events: UserRef<EpollEvent>,
2557    max_events: i32,
2558    timeout: i32,
2559    user_sigmask: UserRef<SigSet>,
2560) -> Result<usize, Errno> {
2561    let deadline = zx::MonotonicInstant::after(duration_from_poll_timeout(timeout)?);
2562    do_epoll_pwait(locked, current_task, epfd, events, max_events, deadline, user_sigmask)
2563}
2564
2565pub fn sys_epoll_pwait2(
2566    locked: &mut Locked<Unlocked>,
2567    current_task: &mut CurrentTask,
2568    epfd: FdNumber,
2569    events: UserRef<EpollEvent>,
2570    max_events: i32,
2571    user_timespec: TimeSpecPtr,
2572    user_sigmask: UserRef<SigSet>,
2573) -> Result<usize, Errno> {
2574    let deadline = deadline_after_timespec(current_task, user_timespec)?;
2575    do_epoll_pwait(locked, current_task, epfd, events, max_events, deadline, user_sigmask)
2576}
2577
2578struct FileWaiter<Key: Into<ReadyItemKey>> {
2579    waiter: Waiter,
2580    ready_items: Arc<Mutex<VecDeque<ReadyItem>>>,
2581    _marker: PhantomData<Key>,
2582}
2583
2584impl<Key: Into<ReadyItemKey>> Default for FileWaiter<Key> {
2585    fn default() -> Self {
2586        Self { waiter: Waiter::new(), ready_items: Default::default(), _marker: PhantomData }
2587    }
2588}
2589
2590impl<Key: Into<ReadyItemKey>> FileWaiter<Key> {
2591    fn add<L>(
2592        &self,
2593        locked: &mut Locked<L>,
2594        current_task: &CurrentTask,
2595        key: Key,
2596        file: Option<&FileHandle>,
2597        requested_events: FdEvents,
2598    ) -> Result<(), Errno>
2599    where
2600        L: LockEqualOrBefore<FileOpsCore>,
2601    {
2602        let key = key.into();
2603
2604        if let Some(file) = file {
2605            let sought_events = requested_events | FdEvents::POLLERR | FdEvents::POLLHUP;
2606
2607            let handler =
2608                EventHandler::Enqueue { key, queue: self.ready_items.clone(), sought_events };
2609            file.wait_async(locked, current_task, &self.waiter, sought_events, handler);
2610            let current_events = file.query_events(locked, current_task)? & sought_events;
2611            if !current_events.is_empty() {
2612                self.ready_items.lock().push_back(ReadyItem { key, events: current_events });
2613            }
2614        } else {
2615            self.ready_items.lock().push_back(ReadyItem { key, events: FdEvents::POLLNVAL });
2616        }
2617        Ok(())
2618    }
2619
2620    fn wait<L>(
2621        &self,
2622        locked: &mut Locked<L>,
2623        current_task: &mut CurrentTask,
2624        signal_mask: Option<SigSet>,
2625        deadline: zx::MonotonicInstant,
2626    ) -> Result<(), Errno>
2627    where
2628        L: LockEqualOrBefore<FileOpsCore>,
2629    {
2630        if self.ready_items.lock().is_empty() {
2631            // When wait_until() returns Ok() it means there was a wake up; however there may not
2632            // be a ready item, for example if waiting on a sync file with multiple sync points.
2633            // Keep waiting until there's at least one ready item.
2634            let signal_mask = signal_mask.unwrap_or_else(|| current_task.read().signal_mask());
2635            let mut result = current_task.wait_with_temporary_mask(
2636                locked,
2637                signal_mask,
2638                |locked, current_task| self.waiter.wait_until(locked, current_task, deadline),
2639            );
2640            loop {
2641                match result {
2642                    Err(err) if err == ETIMEDOUT => return Ok(()),
2643                    Ok(()) => {
2644                        if !self.ready_items.lock().is_empty() {
2645                            break;
2646                        }
2647                    }
2648                    result => result?,
2649                };
2650                result = self.waiter.wait_until(locked, current_task, deadline);
2651            }
2652        }
2653        Ok(())
2654    }
2655}
2656
2657pub fn poll(
2658    locked: &mut Locked<Unlocked>,
2659    current_task: &mut CurrentTask,
2660    user_pollfds: UserRef<pollfd>,
2661    num_fds: i32,
2662    mask: Option<SigSet>,
2663    deadline: zx::MonotonicInstant,
2664) -> Result<usize, Errno> {
2665    if num_fds < 0
2666        || num_fds as u64 > current_task.thread_group().get_rlimit(locked, Resource::NOFILE)
2667    {
2668        return error!(EINVAL);
2669    }
2670
2671    let mut pollfds = vec![pollfd::default(); num_fds as usize];
2672    let waiter = FileWaiter::<usize>::default();
2673
2674    for (index, poll_descriptor) in pollfds.iter_mut().enumerate() {
2675        *poll_descriptor = current_task.read_object(user_pollfds.at(index)?)?;
2676        poll_descriptor.revents = 0;
2677        if poll_descriptor.fd < 0 {
2678            continue;
2679        }
2680        let file = current_task.files.get(FdNumber::from_raw(poll_descriptor.fd)).ok();
2681        waiter.add(
2682            locked,
2683            current_task,
2684            index,
2685            file.as_ref(),
2686            FdEvents::from_bits_truncate(poll_descriptor.events as u32),
2687        )?;
2688    }
2689
2690    waiter.wait(locked, current_task, mask, deadline)?;
2691
2692    let mut ready_items = waiter.ready_items.lock();
2693    let mut unique_ready_items =
2694        bit_vec::BitVec::from_elem(usize::try_from(num_fds).unwrap(), false);
2695    for ReadyItem { key: ready_key, events: ready_events } in ready_items.drain(..) {
2696        let ready_key = assert_matches::assert_matches!(
2697            ready_key,
2698            ReadyItemKey::Usize(v) => v
2699        );
2700        let interested_events = FdEvents::from_bits_truncate(pollfds[ready_key].events as u32)
2701            | FdEvents::POLLERR
2702            | FdEvents::POLLHUP
2703            | FdEvents::POLLNVAL;
2704        let return_events = (interested_events & ready_events).bits();
2705        pollfds[ready_key].revents = return_events as i16;
2706        unique_ready_items.set(ready_key, true);
2707    }
2708
2709    for (index, poll_descriptor) in pollfds.iter().enumerate() {
2710        current_task.write_object(user_pollfds.at(index)?, poll_descriptor)?;
2711    }
2712
2713    Ok(unique_ready_items.into_iter().filter(Clone::clone).count())
2714}
2715
2716pub fn sys_ppoll(
2717    locked: &mut Locked<Unlocked>,
2718    current_task: &mut CurrentTask,
2719    user_fds: UserRef<pollfd>,
2720    num_fds: i32,
2721    user_timespec: TimeSpecPtr,
2722    user_mask: UserRef<SigSet>,
2723    sigset_size: usize,
2724) -> Result<usize, Errno> {
2725    let start_time = zx::MonotonicInstant::get();
2726
2727    let timeout = if user_timespec.is_null() {
2728        // Passing -1 to poll is equivalent to an infinite timeout.
2729        -1
2730    } else {
2731        let ts = current_task.read_multi_arch_object(user_timespec)?;
2732        duration_from_timespec::<zx::MonotonicTimeline>(ts)?.into_millis() as i32
2733    };
2734
2735    let deadline = start_time + duration_from_poll_timeout(timeout)?;
2736
2737    let mask = if !user_mask.is_null() {
2738        if sigset_size != std::mem::size_of::<SigSet>() {
2739            return error!(EINVAL);
2740        }
2741        let mask = current_task.read_object(user_mask)?;
2742        Some(mask)
2743    } else {
2744        None
2745    };
2746
2747    let poll_result = poll(locked, current_task, user_fds, num_fds, mask, deadline);
2748
2749    if user_timespec.is_null() {
2750        return poll_result;
2751    }
2752
2753    let now = zx::MonotonicInstant::get();
2754    let remaining = std::cmp::max(deadline - now, zx::MonotonicDuration::from_seconds(0));
2755    let remaining_timespec = timespec_from_duration(remaining);
2756
2757    // From gVisor: "ppoll is normally restartable if interrupted by something other than a signal
2758    // handled by the application (i.e. returns ERESTARTNOHAND). However, if
2759    // [copy out] failed, then the restarted ppoll would use the wrong timeout, so the
2760    // error should be left as EINTR."
2761    match (current_task.write_multi_arch_object(user_timespec, remaining_timespec), poll_result) {
2762        // If write was ok, and poll was ok, return poll result.
2763        (Ok(_), Ok(num_events)) => Ok(num_events),
2764        (Ok(_), Err(e)) if e == EINTR => {
2765            error!(ERESTARTNOHAND)
2766        }
2767        (Ok(_), poll_result) => poll_result,
2768        // If write was a failure, return the poll result unchanged.
2769        (Err(_), poll_result) => poll_result,
2770    }
2771}
2772
2773pub fn sys_flock(
2774    locked: &mut Locked<Unlocked>,
2775    current_task: &CurrentTask,
2776    fd: FdNumber,
2777    operation: u32,
2778) -> Result<(), Errno> {
2779    let file = current_task.files.get(fd)?;
2780    let operation = FlockOperation::from_flags(operation)?;
2781    security::check_file_lock_access(current_task, &file)?;
2782    file.flock(locked, current_task, operation)
2783}
2784
2785pub fn sys_sync(_locked: &mut Locked<Unlocked>, _current_task: &CurrentTask) -> Result<(), Errno> {
2786    track_stub!(TODO("https://fxbug.dev/322875826"), "sync()");
2787    Ok(())
2788}
2789
2790pub fn sys_syncfs(
2791    _locked: &mut Locked<Unlocked>,
2792    current_task: &CurrentTask,
2793    fd: FdNumber,
2794) -> Result<(), Errno> {
2795    let _file = current_task.files.get(fd)?;
2796    track_stub!(TODO("https://fxbug.dev/322875646"), "syncfs");
2797    Ok(())
2798}
2799
2800pub fn sys_fsync(
2801    _locked: &mut Locked<Unlocked>,
2802    current_task: &CurrentTask,
2803    fd: FdNumber,
2804) -> Result<(), Errno> {
2805    let file = current_task.files.get(fd)?;
2806    file.sync(current_task)
2807}
2808
2809pub fn sys_fdatasync(
2810    _locked: &mut Locked<Unlocked>,
2811    current_task: &CurrentTask,
2812    fd: FdNumber,
2813) -> Result<(), Errno> {
2814    let file = current_task.files.get(fd)?;
2815    file.data_sync(current_task)
2816}
2817
2818pub fn sys_sync_file_range(
2819    _locked: &mut Locked<Unlocked>,
2820    current_task: &CurrentTask,
2821    fd: FdNumber,
2822    offset: off_t,
2823    length: off_t,
2824    flags: u32,
2825) -> Result<(), Errno> {
2826    const KNOWN_FLAGS: u32 = uapi::SYNC_FILE_RANGE_WAIT_BEFORE
2827        | uapi::SYNC_FILE_RANGE_WRITE
2828        | uapi::SYNC_FILE_RANGE_WAIT_AFTER;
2829    if flags & !KNOWN_FLAGS != 0 {
2830        return error!(EINVAL);
2831    }
2832
2833    let file = current_task.files.get(fd)?;
2834
2835    if offset < 0 || length < 0 {
2836        return error!(EINVAL);
2837    }
2838
2839    checked_add_offset_and_length(offset as usize, length as usize)?;
2840
2841    // From <https://linux.die.net/man/2/sync_file_range>:
2842    //
2843    //   fd refers to something other than a regular file, a block device, a directory, or a symbolic link.
2844    let mode = file.node().info().mode;
2845    if !mode.is_reg() && !mode.is_blk() && !mode.is_dir() && !mode.is_lnk() {
2846        return error!(ESPIPE);
2847    }
2848
2849    if flags == 0 {
2850        return Ok(());
2851    }
2852
2853    // Syncing the whole file is much more than we need for sync_file_range, which only needs to
2854    // sync the specified data range.
2855    file.data_sync(current_task)
2856}
2857
2858pub fn sys_fadvise64(
2859    _locked: &mut Locked<Unlocked>,
2860    current_task: &CurrentTask,
2861    fd: FdNumber,
2862    offset: off_t,
2863    len: off_t,
2864    advice: u32,
2865) -> Result<(), Errno> {
2866    match advice {
2867        POSIX_FADV_NORMAL => track_stub!(TODO("https://fxbug.dev/297434181"), "POSIX_FADV_NORMAL"),
2868        POSIX_FADV_RANDOM => track_stub!(TODO("https://fxbug.dev/297434181"), "POSIX_FADV_RANDOM"),
2869        POSIX_FADV_SEQUENTIAL => {
2870            track_stub!(TODO("https://fxbug.dev/297434181"), "POSIX_FADV_SEQUENTIAL")
2871        }
2872        POSIX_FADV_WILLNEED => {
2873            track_stub!(TODO("https://fxbug.dev/297434181"), "POSIX_FADV_WILLNEED")
2874        }
2875        POSIX_FADV_DONTNEED => {
2876            track_stub!(TODO("https://fxbug.dev/297434181"), "POSIX_FADV_DONTNEED")
2877        }
2878        POSIX_FADV_NOREUSE => {
2879            track_stub!(TODO("https://fxbug.dev/297434181"), "POSIX_FADV_NOREUSE")
2880        }
2881        _ => {
2882            track_stub!(TODO("https://fxbug.dev/322875684"), "fadvise64 unknown advice", advice);
2883            return error!(EINVAL);
2884        }
2885    }
2886
2887    if offset < 0 || len < 0 {
2888        return error!(EINVAL);
2889    }
2890
2891    let file = current_task.files.get(fd)?;
2892    // fadvise does not work on pipes.
2893    if file.downcast_file::<PipeFileObject>().is_some() {
2894        return error!(ESPIPE);
2895    }
2896
2897    // fadvise does not work on paths.
2898    if file.flags().contains(OpenFlags::PATH) {
2899        return error!(EBADF);
2900    }
2901
2902    Ok(())
2903}
2904
2905pub fn sys_fallocate(
2906    locked: &mut Locked<Unlocked>,
2907    current_task: &CurrentTask,
2908    fd: FdNumber,
2909    mode: u32,
2910    offset: off_t,
2911    len: off_t,
2912) -> Result<(), Errno> {
2913    let file = current_task.files.get(fd)?;
2914
2915    // Offset must not be less than 0.
2916    // Length must not be less than or equal to 0.
2917    // See https://man7.org/linux/man-pages/man2/fallocate.2.html#ERRORS
2918    if offset < 0 || len <= 0 {
2919        return error!(EINVAL);
2920    }
2921
2922    let mode = FallocMode::from_bits(mode).ok_or_else(|| errno!(EINVAL))?;
2923    file.fallocate(locked, current_task, mode, offset as u64, len as u64)?;
2924
2925    Ok(())
2926}
2927
2928pub fn sys_inotify_init1(
2929    locked: &mut Locked<Unlocked>,
2930    current_task: &CurrentTask,
2931    flags: u32,
2932) -> Result<FdNumber, Errno> {
2933    if flags & !(IN_NONBLOCK | IN_CLOEXEC) != 0 {
2934        return error!(EINVAL);
2935    }
2936    let non_blocking = flags & IN_NONBLOCK != 0;
2937    let close_on_exec = flags & IN_CLOEXEC != 0;
2938    let inotify_file = InotifyFileObject::new_file(locked, current_task, non_blocking);
2939    let fd_flags = if close_on_exec { FdFlags::CLOEXEC } else { FdFlags::empty() };
2940    current_task.add_file(locked, inotify_file, fd_flags)
2941}
2942
2943pub fn sys_inotify_add_watch(
2944    locked: &mut Locked<Unlocked>,
2945    current_task: &CurrentTask,
2946    fd: FdNumber,
2947    user_path: UserCString,
2948    mask: u32,
2949) -> Result<WdNumber, Errno> {
2950    let mask = InotifyMask::from_bits(mask).ok_or_else(|| errno!(EINVAL))?;
2951    if !mask.intersects(InotifyMask::ALL_EVENTS) {
2952        // Mask must include at least 1 event.
2953        return error!(EINVAL);
2954    }
2955    let file = current_task.files.get(fd)?;
2956    let inotify_file = file.downcast_file::<InotifyFileObject>().ok_or_else(|| errno!(EINVAL))?;
2957    let options = if mask.contains(InotifyMask::DONT_FOLLOW) {
2958        LookupFlags::no_follow()
2959    } else {
2960        LookupFlags::default()
2961    };
2962    let watched_node = lookup_at(locked, current_task, FdNumber::AT_FDCWD, user_path, options)?;
2963    if mask.contains(InotifyMask::ONLYDIR) && !watched_node.entry.node.is_dir() {
2964        return error!(ENOTDIR);
2965    }
2966    inotify_file.add_watch(watched_node.entry, mask, &file)
2967}
2968
2969pub fn sys_inotify_rm_watch(
2970    _locked: &mut Locked<Unlocked>,
2971    current_task: &CurrentTask,
2972    fd: FdNumber,
2973    watch_id: WdNumber,
2974) -> Result<(), Errno> {
2975    let file = current_task.files.get(fd)?;
2976    let inotify_file = file.downcast_file::<InotifyFileObject>().ok_or_else(|| errno!(EINVAL))?;
2977    inotify_file.remove_watch(watch_id, &file)
2978}
2979
2980pub fn sys_utimensat(
2981    locked: &mut Locked<Unlocked>,
2982    current_task: &CurrentTask,
2983    dir_fd: FdNumber,
2984    user_path: UserCString,
2985    user_times: TimeSpecPtr,
2986    flags: u32,
2987) -> Result<(), Errno> {
2988    let (atime, mtime) = if user_times.addr().is_null() {
2989        // If user_times is null, the timestamps are updated to the current time.
2990        (TimeUpdateType::Now, TimeUpdateType::Now)
2991    } else {
2992        let ts = current_task.read_multi_arch_objects_to_vec(user_times, 2)?;
2993        let atime = ts[0];
2994        let mtime = ts[1];
2995        let parse_timespec = |spec: timespec| match spec.tv_nsec {
2996            UTIME_NOW => Ok(TimeUpdateType::Now),
2997            UTIME_OMIT => Ok(TimeUpdateType::Omit),
2998            _ => time_from_timespec(spec).map(TimeUpdateType::Time),
2999        };
3000        (parse_timespec(atime)?, parse_timespec(mtime)?)
3001    };
3002
3003    if let (TimeUpdateType::Omit, TimeUpdateType::Omit) = (atime, mtime) {
3004        return Ok(());
3005    };
3006
3007    // Non-standard feature: if user_path is null, the timestamps are updated on the file referred
3008    // to by dir_fd.
3009    // See https://man7.org/linux/man-pages/man2/utimensat.2.html
3010    let name = if user_path.addr().is_null() {
3011        if dir_fd == FdNumber::AT_FDCWD {
3012            return error!(EFAULT);
3013        }
3014        let (node, _) = current_task.resolve_dir_fd(
3015            locked,
3016            dir_fd,
3017            Default::default(),
3018            ResolveFlags::empty(),
3019        )?;
3020        node
3021    } else {
3022        let lookup_flags = LookupFlags::from_bits(flags, AT_SYMLINK_NOFOLLOW)?;
3023        lookup_at(locked, current_task, dir_fd, user_path, lookup_flags)?
3024    };
3025    name.entry.node.update_atime_mtime(locked, current_task, &name.mount, atime, mtime)?;
3026    let event_mask = match (atime, mtime) {
3027        (_, TimeUpdateType::Omit) => InotifyMask::ACCESS,
3028        (TimeUpdateType::Omit, _) => InotifyMask::MODIFY,
3029        (_, _) => InotifyMask::ATTRIB,
3030    };
3031    name.entry.notify_ignoring_excl_unlink(event_mask);
3032    Ok(())
3033}
3034
3035pub fn sys_splice(
3036    locked: &mut Locked<Unlocked>,
3037    current_task: &CurrentTask,
3038    fd_in: FdNumber,
3039    off_in: OffsetPtr,
3040    fd_out: FdNumber,
3041    off_out: OffsetPtr,
3042    len: usize,
3043    flags: u32,
3044) -> Result<usize, Errno> {
3045    splice::splice(locked, current_task, fd_in, off_in, fd_out, off_out, len, flags)
3046}
3047
3048pub fn sys_vmsplice(
3049    locked: &mut Locked<Unlocked>,
3050    current_task: &CurrentTask,
3051    fd: FdNumber,
3052    iovec_addr: IOVecPtr,
3053    iovec_count: UserValue<i32>,
3054    flags: u32,
3055) -> Result<usize, Errno> {
3056    splice::vmsplice(locked, current_task, fd, iovec_addr, iovec_count, flags)
3057}
3058
3059pub fn sys_copy_file_range(
3060    locked: &mut Locked<Unlocked>,
3061    current_task: &CurrentTask,
3062    fd_in: FdNumber,
3063    off_in: OffsetPtr,
3064    fd_out: FdNumber,
3065    off_out: OffsetPtr,
3066    len: usize,
3067    flags: u32,
3068) -> Result<usize, Errno> {
3069    splice::copy_file_range(locked, current_task, fd_in, off_in, fd_out, off_out, len, flags)
3070}
3071
3072pub fn sys_tee(
3073    locked: &mut Locked<Unlocked>,
3074    current_task: &CurrentTask,
3075    fd_in: FdNumber,
3076    fd_out: FdNumber,
3077    len: usize,
3078    flags: u32,
3079) -> Result<usize, Errno> {
3080    splice::tee(locked, current_task, fd_in, fd_out, len, flags)
3081}
3082
3083pub fn sys_readahead(
3084    _locked: &mut Locked<Unlocked>,
3085    current_task: &CurrentTask,
3086    fd: FdNumber,
3087    offset: off_t,
3088    length: usize,
3089) -> Result<(), Errno> {
3090    let file = current_task.files.get(fd)?;
3091    // Allow only non-negative values of `offset`. Some versions of Linux allow it to be negative,
3092    // but GVisor tests require `readahead()` to fail in this case.
3093    let offset: usize = offset.try_into().map_err(|_| errno!(EINVAL))?;
3094    file.readahead(current_task, offset, length)
3095}
3096
3097pub fn sys_io_setup(
3098    _locked: &mut Locked<Unlocked>,
3099    current_task: &CurrentTask,
3100    user_nr_events: UserValue<u32>,
3101    user_ctx_idp: MultiArchUserRef<uapi::aio_context_t, uapi::arch32::aio_context_t>,
3102) -> Result<(), Errno> {
3103    // From https://man7.org/linux/man-pages/man2/io_setup.2.html:
3104    //
3105    //   EINVAL ctx_idp is not initialized, or the specified nr_events
3106    //   exceeds internal limits.  nr_events should be greater than
3107    //   0.
3108    //
3109    // TODO: Determine what "internal limits" means.
3110    let max_operations =
3111        user_nr_events.validate(0..(i32::MAX as u32)).ok_or_else(|| errno!(EINVAL))? as usize;
3112    if current_task.read_multi_arch_object(user_ctx_idp)? != 0 {
3113        return error!(EINVAL);
3114    }
3115    let ctx_id = AioContext::create(current_task, max_operations)?;
3116    current_task.write_multi_arch_object(user_ctx_idp, ctx_id).map_err(|e| {
3117        let _ = current_task
3118            .mm()
3119            .expect("previous sys_io_setup code verified mm exists")
3120            .destroy_aio_context(ctx_id.into());
3121        e
3122    })?;
3123    Ok(())
3124}
3125
3126pub fn sys_io_submit(
3127    _locked: &mut Locked<Unlocked>,
3128    current_task: &CurrentTask,
3129    ctx_id: aio_context_t,
3130    user_nr: UserValue<i32>,
3131    mut iocb_addrs: IocbPtrPtr,
3132) -> Result<i32, Errno> {
3133    let nr = user_nr.validate(0..i32::MAX).ok_or_else(|| errno!(EINVAL))?;
3134    if nr == 0 {
3135        return Ok(0);
3136    }
3137    let ctx = current_task.mm()?.get_aio_context(ctx_id.into()).ok_or_else(|| errno!(EINVAL))?;
3138
3139    // `iocbpp` is an array of addresses to iocb's.
3140    let mut num_submitted: i32 = 0;
3141    loop {
3142        let iocb_ref = current_task.read_multi_arch_ptr(iocb_addrs)?;
3143        let control_block = current_task.read_multi_arch_object(iocb_ref)?;
3144
3145        match (num_submitted, ctx.submit(current_task, control_block, iocb_ref)) {
3146            (0, Err(e)) => return Err(e),
3147            (_, Err(_)) => break,
3148            (_, Ok(())) => {
3149                num_submitted += 1;
3150                if num_submitted == nr {
3151                    break;
3152                }
3153            }
3154        };
3155
3156        iocb_addrs = iocb_addrs.next()?;
3157    }
3158
3159    Ok(num_submitted)
3160}
3161
3162pub fn sys_io_getevents(
3163    _locked: &mut Locked<Unlocked>,
3164    current_task: &CurrentTask,
3165    ctx_id: aio_context_t,
3166    min_nr: i64,
3167    nr: i64,
3168    events_ref: UserRef<io_event>,
3169    user_timeout: TimeSpecPtr,
3170) -> Result<i32, Errno> {
3171    if min_nr < 0 || min_nr > nr || nr < 0 {
3172        return error!(EINVAL);
3173    }
3174    let min_results = min_nr as usize;
3175    let max_results = nr as usize;
3176    let deadline = deadline_after_timespec(current_task, user_timeout)?;
3177
3178    let ctx = current_task.mm()?.get_aio_context(ctx_id.into()).ok_or_else(|| errno!(EINVAL))?;
3179    let events = ctx.get_events(current_task, min_results, max_results, deadline)?;
3180    current_task.write_objects(events_ref, &events)?;
3181
3182    Ok(events.len() as i32)
3183}
3184
3185pub fn sys_io_cancel(
3186    _locked: &mut Locked<Unlocked>,
3187    current_task: &CurrentTask,
3188    ctx_id: aio_context_t,
3189    user_iocb: IocbPtr,
3190    _result: UserRef<io_event>,
3191) -> Result<(), Errno> {
3192    let iocb = current_task.read_multi_arch_object(user_iocb)?;
3193    let ctx = current_task.mm()?.get_aio_context(ctx_id.into()).ok_or_else(|| errno!(EINVAL))?;
3194
3195    ctx.cancel(current_task, iocb, user_iocb)?;
3196    // TODO: Correctly handle return. If the operation is successfully canceled, the event should be copied into the memory pointed to by result without being placed into the completion queue.
3197    track_stub!(TODO("https://fxbug.dev/297433877"), "io_cancel");
3198    Ok(())
3199}
3200
3201pub fn sys_io_destroy(
3202    _locked: &mut Locked<Unlocked>,
3203    current_task: &CurrentTask,
3204    ctx_id: aio_context_t,
3205) -> Result<(), Errno> {
3206    let aio_context = current_task.mm()?.destroy_aio_context(ctx_id.into())?;
3207    std::mem::drop(aio_context);
3208    Ok(())
3209}
3210
3211pub fn sys_io_uring_setup(
3212    locked: &mut Locked<Unlocked>,
3213    current_task: &CurrentTask,
3214    user_entries: UserValue<u32>,
3215    user_params: UserRef<io_uring_params>,
3216) -> Result<FdNumber, Errno> {
3217    // TODO: https://fxbug.dev/397186254 - we will want to do a no-audit CAP_IPC_LOCK capability
3218    // check; see "If not granted CAP_IPC_LOCK io_uring operations are accounted against the user's
3219    // RLIMIT_MEMLOCK limit" at
3220    // https://github.com/SELinuxProject/selinux-notebook/blob/main/src/auditing.md#capability-audit-exemptions
3221
3222    if !current_task.kernel().features.io_uring {
3223        return error!(ENOSYS);
3224    }
3225
3226    // Apply policy from /proc/sys/kernel/io_uring_disabled
3227    let limits = &current_task.kernel().system_limits;
3228    match limits.io_uring_disabled.load(atomic::Ordering::Relaxed) {
3229        0 => (),
3230        1 => {
3231            let io_uring_group = limits.io_uring_group.load(atomic::Ordering::Relaxed).try_into();
3232            if io_uring_group.is_err()
3233                || !current_task
3234                    .with_current_creds(|creds| creds.is_in_group(io_uring_group.unwrap()))
3235            {
3236                security::check_task_capable(current_task, CAP_SYS_ADMIN)?;
3237            }
3238        }
3239        _ => {
3240            return error!(EPERM);
3241        }
3242    }
3243
3244    let entries = user_entries.validate(1..IORING_MAX_ENTRIES).ok_or_else(|| errno!(EINVAL))?;
3245
3246    let mut params = current_task.read_object(user_params)?;
3247    for byte in params.resv {
3248        if byte != 0 {
3249            return error!(EINVAL);
3250        }
3251    }
3252
3253    let file = IoUringFileObject::new_file(locked, current_task, entries, &mut params)?;
3254
3255    // io_uring file descriptors are always created with CLOEXEC.
3256    let fd = current_task.add_file(locked, file, FdFlags::CLOEXEC)?;
3257    current_task.write_object(user_params, &params)?;
3258    Ok(fd)
3259}
3260
3261pub fn sys_io_uring_enter(
3262    locked: &mut Locked<Unlocked>,
3263    current_task: &CurrentTask,
3264    fd: FdNumber,
3265    to_submit: u32,
3266    min_complete: u32,
3267    flags: u32,
3268    _sig: UserRef<SigSet>,
3269    sigset_size: usize,
3270) -> Result<u32, Errno> {
3271    if !current_task.kernel().features.io_uring {
3272        return error!(ENOSYS);
3273    }
3274    if !_sig.is_null() {
3275        if sigset_size != std::mem::size_of::<SigSet>() {
3276            return error!(EINVAL);
3277        }
3278    }
3279    let file = current_task.files.get(fd)?;
3280    let io_uring = file.downcast_file::<IoUringFileObject>().ok_or_else(|| errno!(EOPNOTSUPP))?;
3281    // TODO(https://fxbug.dev/297431387): Use `_sig` to change the signal mask for `current_task`.
3282    io_uring.enter(locked, current_task, to_submit, min_complete, flags)
3283}
3284
3285pub fn sys_io_uring_register(
3286    locked: &mut Locked<Unlocked>,
3287    current_task: &CurrentTask,
3288    fd: FdNumber,
3289    opcode: u32,
3290    arg: UserAddress,
3291    nr_args: UserValue<u32>,
3292) -> Result<SyscallResult, Errno> {
3293    if !current_task.kernel().features.io_uring {
3294        return error!(ENOSYS);
3295    }
3296    let file = current_task.files.get(fd)?;
3297    let io_uring = file.downcast_file::<IoUringFileObject>().ok_or_else(|| errno!(EOPNOTSUPP))?;
3298    match opcode {
3299        IORING_REGISTER_BUFFERS => {
3300            // TODO(https://fxbug.dev/297431387): Check nr_args for zero and return EINVAL here.
3301            let iovec = IOVecPtr::new(current_task, arg);
3302            let buffers = current_task.read_iovec(iovec, nr_args)?;
3303            io_uring.register_buffers(locked, buffers);
3304            return Ok(SUCCESS);
3305        }
3306        IORING_UNREGISTER_BUFFERS => {
3307            if !arg.is_null() {
3308                return error!(EINVAL);
3309            }
3310            io_uring.unregister_buffers(locked);
3311            return Ok(SUCCESS);
3312        }
3313        IORING_REGISTER_IOWQ_MAX_WORKERS => {
3314            track_stub!(
3315                TODO("https://fxbug.dev/297431387"),
3316                "io_uring_register IORING_REGISTER_IOWQ_MAX_WORKERS",
3317                opcode
3318            );
3319            // The current implementation only ever use 1 worker for read and 1 for write.
3320            return Ok(SUCCESS);
3321        }
3322        IORING_REGISTER_RING_FDS => {
3323            track_stub!(
3324                TODO("https://fxbug.dev/297431387"),
3325                "io_uring_register IORING_REGISTER_RING_FDS",
3326                opcode
3327            );
3328            // The current implementation doesn't use any thread local specific identifier for
3329            // performance. Instead, when registering a fd, just return the passed fd as the value
3330            // to use.
3331            let nr_args: usize = nr_args.raw().try_into().map_err(|_| errno!(EINVAL))?;
3332            if nr_args > 16 {
3333                return error!(EINVAL);
3334            }
3335            let updates_addr = UserRef::<uapi::io_uring_rsrc_update>::from(arg);
3336            let mut updates = current_task
3337                .read_objects_to_smallvec::<uapi::io_uring_rsrc_update, 1>(updates_addr, nr_args)?;
3338            let mut result = 0;
3339            for update in updates.iter_mut() {
3340                if update.offset == u32::MAX {
3341                    update.offset = update.data.try_into().map_err(|_| errno!(EINVAL))?;
3342                    result += 1;
3343                }
3344            }
3345            current_task.write_objects(updates_addr, &updates)?;
3346            return Ok(result.into());
3347        }
3348        IORING_UNREGISTER_RING_FDS => {
3349            track_stub!(
3350                TODO("https://fxbug.dev/297431387"),
3351                "io_uring_register IORING_UNREGISTER_RING_FDS",
3352                opcode
3353            );
3354            // Because registering a fd doesn't use any resource currently, unregistering is free.
3355            return Ok(SUCCESS);
3356        }
3357        IORING_REGISTER_PBUF_RING => {
3358            let nr_args: usize = nr_args.raw().try_into().map_err(|_| errno!(EINVAL))?;
3359            if nr_args != 1 {
3360                return error!(EINVAL);
3361            }
3362            let buffer_definition: uapi::io_uring_buf_reg = current_task.read_object(arg.into())?;
3363            io_uring.register_ring_buffers(locked, buffer_definition)?;
3364            return Ok(SUCCESS);
3365        }
3366
3367        IORING_UNREGISTER_PBUF_RING => {
3368            let nr_args: usize = nr_args.raw().try_into().map_err(|_| errno!(EINVAL))?;
3369            if nr_args != 1 {
3370                return error!(EINVAL);
3371            }
3372            let buffer_definition: uapi::io_uring_buf_reg = current_task.read_object(arg.into())?;
3373            io_uring.unregister_ring_buffers(locked, buffer_definition)?;
3374            return Ok(SUCCESS);
3375        }
3376
3377        IORING_REGISTER_PBUF_STATUS => {
3378            let nr_args: usize = nr_args.raw().try_into().map_err(|_| errno!(EINVAL))?;
3379            if nr_args != 1 {
3380                return error!(EINVAL);
3381            }
3382            let buffer_status_addr = UserRef::<uapi::io_uring_buf_status>::from(arg);
3383            let mut buffer_status: uapi::io_uring_buf_status =
3384                current_task.read_object(buffer_status_addr)?;
3385            io_uring.ring_buffer_status(locked, &mut buffer_status)?;
3386            current_task.write_object(buffer_status_addr, &buffer_status)?;
3387            return Ok(SUCCESS);
3388        }
3389
3390        _ => {
3391            track_stub!(
3392                TODO("https://fxbug.dev/297431387"),
3393                "io_uring_register unknown op",
3394                opcode
3395            );
3396            return error!(EINVAL);
3397        }
3398    }
3399}
3400
3401// Syscalls for arch32 usage
3402#[cfg(target_arch = "aarch64")]
3403mod arch32 {
3404    use crate::mm::MemoryAccessorExt;
3405    use crate::task::CurrentTask;
3406    use crate::vfs::syscalls::{
3407        LookupFlags, OpenFlags, lookup_at, sys_dup3, sys_faccessat, sys_fallocate, sys_lseek,
3408        sys_mkdirat, sys_openat, sys_readlinkat, sys_unlinkat,
3409    };
3410    use crate::vfs::{FdNumber, FsNode};
3411    use linux_uapi::off_t;
3412    use starnix_sync::{Locked, Unlocked};
3413    use starnix_syscalls::SyscallArg;
3414    use starnix_types::time::duration_from_poll_timeout;
3415    use starnix_uapi::errors::Errno;
3416    use starnix_uapi::file_mode::FileMode;
3417    use starnix_uapi::signals::SigSet;
3418    use starnix_uapi::user_address::{MultiArchUserRef, UserAddress, UserCString, UserRef};
3419    use starnix_uapi::vfs::EpollEvent;
3420    use starnix_uapi::{AT_REMOVEDIR, errno, error, uapi};
3421
3422    type StatFs64Ptr = MultiArchUserRef<uapi::statfs, uapi::arch32::statfs64>;
3423
3424    fn merge_low_and_high(low: u32, high: u32) -> off_t {
3425        ((high as off_t) << 32) | (low as off_t)
3426    }
3427
3428    pub fn sys_arch32_open(
3429        locked: &mut Locked<Unlocked>,
3430        current_task: &CurrentTask,
3431        user_path: UserCString,
3432        flags: u32,
3433        mode: FileMode,
3434    ) -> Result<FdNumber, Errno> {
3435        sys_openat(locked, current_task, FdNumber::AT_FDCWD, user_path, flags, mode)
3436    }
3437
3438    pub fn sys_arch32_access(
3439        locked: &mut Locked<Unlocked>,
3440        current_task: &CurrentTask,
3441        user_path: UserCString,
3442        mode: u32,
3443    ) -> Result<(), Errno> {
3444        sys_faccessat(locked, current_task, FdNumber::AT_FDCWD, user_path, mode)
3445    }
3446    pub fn stat64(
3447        locked: &mut Locked<Unlocked>,
3448        current_task: &CurrentTask,
3449        node: &FsNode,
3450        arch32_stat_buf: UserRef<uapi::arch32::stat64>,
3451    ) -> Result<(), Errno> {
3452        let stat_buffer = node.stat(locked, current_task)?;
3453        let result: uapi::arch32::stat64 = stat_buffer.try_into().map_err(|_| errno!(EINVAL))?;
3454        // Now we copy to the arch32 version and write.
3455        current_task.write_object(arch32_stat_buf, &result)?;
3456        Ok(())
3457    }
3458
3459    pub fn sys_arch32_fstat64(
3460        locked: &mut Locked<Unlocked>,
3461        current_task: &CurrentTask,
3462        fd: FdNumber,
3463        arch32_stat_buf: UserRef<uapi::arch32::stat64>,
3464    ) -> Result<(), Errno> {
3465        let file = current_task.files.get_allowing_opath(fd)?;
3466        stat64(locked, current_task, file.node(), arch32_stat_buf)
3467    }
3468
3469    pub fn sys_arch32_fallocate(
3470        locked: &mut Locked<Unlocked>,
3471        current_task: &CurrentTask,
3472        fd: FdNumber,
3473        mode: u32,
3474        offset_low: u32,
3475        offset_high: u32,
3476        len_low: u32,
3477        len_high: u32,
3478    ) -> Result<(), Errno> {
3479        let offset = merge_low_and_high(offset_low, offset_high);
3480        let len = merge_low_and_high(len_low, len_high);
3481        sys_fallocate(locked, current_task, fd, mode, offset, len)
3482    }
3483
3484    pub fn sys_arch32_stat64(
3485        locked: &mut Locked<Unlocked>,
3486        current_task: &CurrentTask,
3487        user_path: UserCString,
3488        arch32_stat_buf: UserRef<uapi::arch32::stat64>,
3489    ) -> Result<(), Errno> {
3490        let name =
3491            lookup_at(locked, current_task, FdNumber::AT_FDCWD, user_path, LookupFlags::default())?;
3492        stat64(locked, current_task, &name.entry.node, arch32_stat_buf)
3493    }
3494
3495    pub fn sys_arch32_readlink(
3496        locked: &mut Locked<Unlocked>,
3497        current_task: &CurrentTask,
3498        user_path: UserCString,
3499        buffer: UserAddress,
3500        buffer_size: usize,
3501    ) -> Result<usize, Errno> {
3502        sys_readlinkat(locked, current_task, FdNumber::AT_FDCWD, user_path, buffer, buffer_size)
3503    }
3504
3505    pub fn sys_arch32_mkdir(
3506        locked: &mut Locked<Unlocked>,
3507        current_task: &CurrentTask,
3508        user_path: UserCString,
3509        mode: FileMode,
3510    ) -> Result<(), Errno> {
3511        sys_mkdirat(locked, current_task, FdNumber::AT_FDCWD, user_path, mode)
3512    }
3513
3514    pub fn sys_arch32_rmdir(
3515        locked: &mut Locked<Unlocked>,
3516        current_task: &CurrentTask,
3517        user_path: UserCString,
3518    ) -> Result<(), Errno> {
3519        sys_unlinkat(locked, current_task, FdNumber::AT_FDCWD, user_path, AT_REMOVEDIR)
3520    }
3521
3522    #[allow(non_snake_case)]
3523    pub fn sys_arch32__llseek(
3524        locked: &mut Locked<Unlocked>,
3525        current_task: &CurrentTask,
3526        fd: FdNumber,
3527        offset_high: u32,
3528        offset_low: u32,
3529        result: UserRef<off_t>,
3530        whence: u32,
3531    ) -> Result<(), Errno> {
3532        let offset = merge_low_and_high(offset_low, offset_high);
3533        let result_value = sys_lseek(locked, current_task, fd, offset, whence)?;
3534        current_task.write_object(result, &result_value).map(|_| ())
3535    }
3536
3537    pub fn sys_arch32_dup2(
3538        locked: &mut Locked<Unlocked>,
3539        current_task: &CurrentTask,
3540        oldfd: FdNumber,
3541        newfd: FdNumber,
3542    ) -> Result<FdNumber, Errno> {
3543        if oldfd == newfd {
3544            // O_PATH allowed for:
3545            //
3546            //  Duplicating the file descriptor (dup(2), fcntl(2)
3547            //  F_DUPFD, etc.).
3548            //
3549            // See https://man7.org/linux/man-pages/man2/open.2.html
3550            current_task.files.get_allowing_opath(oldfd)?;
3551            return Ok(newfd);
3552        }
3553        sys_dup3(locked, current_task, oldfd, newfd, 0)
3554    }
3555
3556    pub fn sys_arch32_unlink(
3557        locked: &mut Locked<Unlocked>,
3558        current_task: &CurrentTask,
3559        user_path: UserCString,
3560    ) -> Result<(), Errno> {
3561        sys_unlinkat(locked, current_task, FdNumber::AT_FDCWD, user_path, 0)
3562    }
3563
3564    pub fn sys_arch32_pread64(
3565        locked: &mut Locked<Unlocked>,
3566        current_task: &CurrentTask,
3567        fd: FdNumber,
3568        address: UserAddress,
3569        length: usize,
3570        _: SyscallArg,
3571        offset_low: u32,
3572        offset_high: u32,
3573    ) -> Result<usize, Errno> {
3574        super::sys_pread64(
3575            locked,
3576            current_task,
3577            fd,
3578            address,
3579            length,
3580            merge_low_and_high(offset_low, offset_high),
3581        )
3582    }
3583
3584    pub fn sys_arch32_pwrite64(
3585        locked: &mut Locked<Unlocked>,
3586        current_task: &CurrentTask,
3587        fd: FdNumber,
3588        address: UserAddress,
3589        length: usize,
3590        _: SyscallArg,
3591        offset_low: u32,
3592        offset_high: u32,
3593    ) -> Result<usize, Errno> {
3594        super::sys_pwrite64(
3595            locked,
3596            current_task,
3597            fd,
3598            address,
3599            length,
3600            merge_low_and_high(offset_low, offset_high),
3601        )
3602    }
3603
3604    pub fn sys_arch32_truncate64(
3605        locked: &mut Locked<Unlocked>,
3606        current_task: &CurrentTask,
3607        user_path: UserCString,
3608        _unused: SyscallArg,
3609        length_low: u32,
3610        length_high: u32,
3611    ) -> Result<(), Errno> {
3612        super::sys_truncate(
3613            locked,
3614            current_task,
3615            user_path,
3616            merge_low_and_high(length_low, length_high),
3617        )
3618    }
3619
3620    pub fn sys_arch32_ftruncate64(
3621        locked: &mut Locked<Unlocked>,
3622        current_task: &CurrentTask,
3623        fd: FdNumber,
3624        _: SyscallArg,
3625        length_low: u32,
3626        length_high: u32,
3627    ) -> Result<(), Errno> {
3628        super::sys_ftruncate(locked, current_task, fd, merge_low_and_high(length_low, length_high))
3629    }
3630
3631    pub fn sys_arch32_chmod(
3632        locked: &mut Locked<Unlocked>,
3633        current_task: &CurrentTask,
3634        user_path: UserCString,
3635        mode: FileMode,
3636    ) -> Result<(), Errno> {
3637        super::sys_fchmodat(locked, current_task, FdNumber::AT_FDCWD, user_path, mode)
3638    }
3639
3640    pub fn sys_arch32_chown32(
3641        locked: &mut Locked<Unlocked>,
3642        current_task: &CurrentTask,
3643        user_path: UserCString,
3644        owner: uapi::arch32::__kernel_uid32_t,
3645        group: uapi::arch32::__kernel_uid32_t,
3646    ) -> Result<(), Errno> {
3647        super::sys_fchownat(locked, current_task, FdNumber::AT_FDCWD, user_path, owner, group, 0)
3648    }
3649
3650    pub fn sys_arch32_poll(
3651        locked: &mut Locked<Unlocked>,
3652        current_task: &mut CurrentTask,
3653        user_fds: UserRef<uapi::pollfd>,
3654        num_fds: i32,
3655        timeout: i32,
3656    ) -> Result<usize, Errno> {
3657        let deadline = zx::MonotonicInstant::after(duration_from_poll_timeout(timeout)?);
3658        super::poll(locked, current_task, user_fds, num_fds, None, deadline)
3659    }
3660
3661    pub fn sys_arch32_epoll_create(
3662        locked: &mut Locked<Unlocked>,
3663        current_task: &CurrentTask,
3664        size: i32,
3665    ) -> Result<FdNumber, Errno> {
3666        if size < 1 {
3667            // The man page for epoll_create says the size was used in a previous implementation as
3668            // a hint but no longer does anything. But it's still required to be >= 1 to ensure
3669            // programs are backwards-compatible.
3670            return error!(EINVAL);
3671        }
3672        super::sys_epoll_create1(locked, current_task, 0)
3673    }
3674
3675    pub fn sys_arch32_epoll_wait(
3676        locked: &mut Locked<Unlocked>,
3677        current_task: &mut CurrentTask,
3678        epfd: FdNumber,
3679        events: UserRef<EpollEvent>,
3680        max_events: i32,
3681        timeout: i32,
3682    ) -> Result<usize, Errno> {
3683        super::sys_epoll_pwait(
3684            locked,
3685            current_task,
3686            epfd,
3687            events,
3688            max_events,
3689            timeout,
3690            UserRef::<SigSet>::default(),
3691        )
3692    }
3693
3694    pub fn sys_arch32_rename(
3695        locked: &mut Locked<Unlocked>,
3696        current_task: &CurrentTask,
3697        old_user_path: UserCString,
3698        new_user_path: UserCString,
3699    ) -> Result<(), Errno> {
3700        super::sys_renameat2(
3701            locked,
3702            current_task,
3703            FdNumber::AT_FDCWD,
3704            old_user_path,
3705            FdNumber::AT_FDCWD,
3706            new_user_path,
3707            0,
3708        )
3709    }
3710
3711    pub fn sys_arch32_creat(
3712        locked: &mut Locked<Unlocked>,
3713        current_task: &CurrentTask,
3714        user_path: UserCString,
3715        mode: FileMode,
3716    ) -> Result<FdNumber, Errno> {
3717        super::sys_openat(
3718            locked,
3719            current_task,
3720            FdNumber::AT_FDCWD,
3721            user_path,
3722            (OpenFlags::WRONLY | OpenFlags::CREAT | OpenFlags::TRUNC).bits(),
3723            mode,
3724        )
3725    }
3726
3727    pub fn sys_arch32_symlink(
3728        locked: &mut Locked<Unlocked>,
3729        current_task: &CurrentTask,
3730        user_target: UserCString,
3731        user_path: UserCString,
3732    ) -> Result<(), Errno> {
3733        super::sys_symlinkat(locked, current_task, user_target, FdNumber::AT_FDCWD, user_path)
3734    }
3735
3736    pub fn sys_arch32_eventfd(
3737        locked: &mut Locked<Unlocked>,
3738        current_task: &CurrentTask,
3739        value: u32,
3740    ) -> Result<FdNumber, Errno> {
3741        super::sys_eventfd2(locked, current_task, value, 0)
3742    }
3743
3744    pub fn sys_arch32_inotify_init(
3745        locked: &mut Locked<Unlocked>,
3746        current_task: &CurrentTask,
3747    ) -> Result<FdNumber, Errno> {
3748        super::sys_inotify_init1(locked, current_task, 0)
3749    }
3750
3751    pub fn sys_arch32_link(
3752        locked: &mut Locked<Unlocked>,
3753        current_task: &CurrentTask,
3754        old_user_path: UserCString,
3755        new_user_path: UserCString,
3756    ) -> Result<(), Errno> {
3757        super::sys_linkat(
3758            locked,
3759            current_task,
3760            FdNumber::AT_FDCWD,
3761            old_user_path,
3762            FdNumber::AT_FDCWD,
3763            new_user_path,
3764            0,
3765        )
3766    }
3767
3768    pub fn sys_arch32_fstatfs64(
3769        locked: &mut Locked<Unlocked>,
3770        current_task: &CurrentTask,
3771        fd: FdNumber,
3772        user_buf_len: u32,
3773        user_buf: StatFs64Ptr,
3774    ) -> Result<(), Errno> {
3775        if (user_buf_len as usize) < std::mem::size_of::<uapi::arch32::statfs64>() {
3776            return error!(EINVAL);
3777        }
3778        super::fstatfs(locked, current_task, fd, user_buf)
3779    }
3780
3781    pub fn sys_arch32_statfs64(
3782        locked: &mut Locked<Unlocked>,
3783        current_task: &CurrentTask,
3784        user_path: UserCString,
3785        user_buf_len: u32,
3786        user_buf: StatFs64Ptr,
3787    ) -> Result<(), Errno> {
3788        if (user_buf_len as usize) < std::mem::size_of::<uapi::arch32::statfs64>() {
3789            return error!(EINVAL);
3790        }
3791        super::statfs(locked, current_task, user_path, user_buf)
3792    }
3793
3794    pub fn sys_arch32_arm_fadvise64_64(
3795        locked: &mut Locked<Unlocked>,
3796        current_task: &CurrentTask,
3797        fd: FdNumber,
3798        advice: u32,
3799        offset_low: u32,
3800        offset_high: u32,
3801        len_low: u32,
3802        len_high: u32,
3803    ) -> Result<(), Errno> {
3804        let offset = merge_low_and_high(offset_low, offset_high);
3805        let len = merge_low_and_high(len_low, len_high);
3806        super::sys_fadvise64(locked, current_task, fd, offset, len, advice)
3807    }
3808
3809    pub fn sys_arch32_sendfile64(
3810        locked: &mut Locked<Unlocked>,
3811        current_task: &CurrentTask,
3812        out_fd: FdNumber,
3813        in_fd: FdNumber,
3814        user_offset: UserRef<uapi::off_t>,
3815        count: i32,
3816    ) -> Result<usize, Errno> {
3817        super::sys_sendfile(locked, current_task, out_fd, in_fd, user_offset.into(), count)
3818    }
3819
3820    pub use super::{
3821        sys_chdir as sys_arch32_chdir, sys_chroot as sys_arch32_chroot,
3822        sys_copy_file_range as sys_arch32_copy_file_range, sys_dup3 as sys_arch32_dup3,
3823        sys_epoll_create1 as sys_arch32_epoll_create1, sys_epoll_ctl as sys_arch32_epoll_ctl,
3824        sys_epoll_pwait as sys_arch32_epoll_pwait, sys_epoll_pwait2 as sys_arch32_epoll_pwait2,
3825        sys_eventfd2 as sys_arch32_eventfd2, sys_fchmod as sys_arch32_fchmod,
3826        sys_fchmodat as sys_arch32_fchmodat, sys_fchown as sys_arch32_fchown32,
3827        sys_fchown as sys_arch32_fchown, sys_fchownat as sys_arch32_fchownat,
3828        sys_fdatasync as sys_arch32_fdatasync, sys_flock as sys_arch32_flock,
3829        sys_fsetxattr as sys_arch32_fsetxattr, sys_fstatat64 as sys_arch32_fstatat64,
3830        sys_fstatfs as sys_arch32_fstatfs, sys_fsync as sys_arch32_fsync,
3831        sys_ftruncate as sys_arch32_ftruncate,
3832        sys_inotify_add_watch as sys_arch32_inotify_add_watch,
3833        sys_inotify_init1 as sys_arch32_inotify_init1,
3834        sys_inotify_rm_watch as sys_arch32_inotify_rm_watch, sys_io_cancel as sys_arch32_io_cancel,
3835        sys_io_destroy as sys_arch32_io_destroy, sys_io_getevents as sys_arch32_io_getevents,
3836        sys_io_setup as sys_arch32_io_setup, sys_io_submit as sys_arch32_io_submit,
3837        sys_io_uring_enter as sys_arch32_io_uring_enter,
3838        sys_io_uring_register as sys_arch32_io_uring_register,
3839        sys_io_uring_setup as sys_arch32_io_uring_setup, sys_lgetxattr as sys_arch32_lgetxattr,
3840        sys_linkat as sys_arch32_linkat, sys_listxattr as sys_arch32_listxattr,
3841        sys_llistxattr as sys_arch32_llistxattr, sys_lsetxattr as sys_arch32_lsetxattr,
3842        sys_mkdirat as sys_arch32_mkdirat, sys_mknodat as sys_arch32_mknodat,
3843        sys_pidfd_getfd as sys_arch32_pidfd_getfd, sys_pidfd_open as sys_arch32_pidfd_open,
3844        sys_ppoll as sys_arch32_ppoll, sys_preadv as sys_arch32_preadv,
3845        sys_pselect6 as sys_arch32_pselect6, sys_readv as sys_arch32_readv,
3846        sys_removexattr as sys_arch32_removexattr, sys_renameat2 as sys_arch32_renameat2,
3847        sys_select as sys_arch32__newselect, sys_sendfile as sys_arch32_sendfile,
3848        sys_setxattr as sys_arch32_setxattr, sys_splice as sys_arch32_splice,
3849        sys_statfs as sys_arch32_statfs, sys_statx as sys_arch32_statx,
3850        sys_symlinkat as sys_arch32_symlinkat, sys_sync as sys_arch32_sync,
3851        sys_tee as sys_arch32_tee, sys_timerfd_create as sys_arch32_timerfd_create,
3852        sys_timerfd_gettime as sys_arch32_timerfd_gettime,
3853        sys_timerfd_settime as sys_arch32_timerfd_settime, sys_truncate as sys_arch32_truncate,
3854        sys_umask as sys_arch32_umask, sys_utimensat as sys_arch32_utimensat,
3855        sys_vmsplice as sys_arch32_vmsplice,
3856    };
3857}
3858
3859#[cfg(target_arch = "aarch64")]
3860pub use arch32::*;
3861
3862#[cfg(test)]
3863mod tests {
3864    use super::*;
3865    use crate::testing::*;
3866    use starnix_types::vfs::default_statfs;
3867    use starnix_uapi::{O_RDONLY, SEEK_CUR, SEEK_END, SEEK_SET};
3868    use zerocopy::IntoBytes;
3869
3870    #[::fuchsia::test]
3871    async fn test_sys_lseek() -> Result<(), Errno> {
3872        spawn_kernel_and_run_with_pkgfs(async |locked, current_task| {
3873            let fd = FdNumber::from_raw(10);
3874            let file_handle =
3875                current_task.open_file(locked, "data/testfile.txt".into(), OpenFlags::RDONLY)?;
3876            let file_size = file_handle.node().stat(locked, current_task).unwrap().st_size;
3877            current_task.files.insert(locked, current_task, fd, file_handle).unwrap();
3878
3879            assert_eq!(sys_lseek(locked, current_task, fd, 0, SEEK_CUR)?, 0);
3880            assert_eq!(sys_lseek(locked, current_task, fd, 1, SEEK_CUR)?, 1);
3881            assert_eq!(sys_lseek(locked, current_task, fd, 3, SEEK_SET)?, 3);
3882            assert_eq!(sys_lseek(locked, current_task, fd, -3, SEEK_CUR)?, 0);
3883            assert_eq!(sys_lseek(locked, current_task, fd, 0, SEEK_END)?, file_size);
3884            assert_eq!(sys_lseek(locked, current_task, fd, -5, SEEK_SET), error!(EINVAL));
3885
3886            // Make sure that the failed call above did not change the offset.
3887            assert_eq!(sys_lseek(locked, current_task, fd, 0, SEEK_CUR)?, file_size);
3888
3889            // Prepare for an overflow.
3890            assert_eq!(sys_lseek(locked, current_task, fd, 3, SEEK_SET)?, 3);
3891
3892            // Check for overflow.
3893            assert_eq!(sys_lseek(locked, current_task, fd, i64::MAX, SEEK_CUR), error!(EINVAL));
3894
3895            Ok(())
3896        })
3897        .await
3898    }
3899
3900    #[::fuchsia::test]
3901    async fn test_sys_dup() -> Result<(), Errno> {
3902        spawn_kernel_and_run_with_pkgfs(async |locked, current_task| {
3903            let file_handle =
3904                current_task.open_file(locked, "data/testfile.txt".into(), OpenFlags::RDONLY)?;
3905            let oldfd = current_task.add_file(locked, file_handle, FdFlags::empty())?;
3906            let newfd = sys_dup(locked, current_task, oldfd)?;
3907
3908            assert_ne!(oldfd, newfd);
3909            let files = &current_task.files;
3910            assert!(Arc::ptr_eq(&files.get(oldfd).unwrap(), &files.get(newfd).unwrap()));
3911
3912            assert_eq!(sys_dup(locked, current_task, FdNumber::from_raw(3)), error!(EBADF));
3913
3914            Ok(())
3915        })
3916        .await
3917    }
3918
3919    #[::fuchsia::test]
3920    async fn test_sys_dup3() -> Result<(), Errno> {
3921        spawn_kernel_and_run_with_pkgfs(async |locked, current_task| {
3922            let file_handle =
3923                current_task.open_file(locked, "data/testfile.txt".into(), OpenFlags::RDONLY)?;
3924            let oldfd = current_task.add_file(locked, file_handle, FdFlags::empty())?;
3925            let newfd = FdNumber::from_raw(2);
3926            sys_dup3(locked, current_task, oldfd, newfd, O_CLOEXEC)?;
3927
3928            assert_ne!(oldfd, newfd);
3929            let files = &current_task.files;
3930            assert!(Arc::ptr_eq(&files.get(oldfd).unwrap(), &files.get(newfd).unwrap()));
3931            assert_eq!(files.get_fd_flags_allowing_opath(oldfd).unwrap(), FdFlags::empty());
3932            assert_eq!(files.get_fd_flags_allowing_opath(newfd).unwrap(), FdFlags::CLOEXEC);
3933
3934            assert_eq!(sys_dup3(locked, current_task, oldfd, oldfd, O_CLOEXEC), error!(EINVAL));
3935
3936            // Pass invalid flags.
3937            let invalid_flags = 1234;
3938            assert_eq!(sys_dup3(locked, current_task, oldfd, newfd, invalid_flags), error!(EINVAL));
3939
3940            // Makes sure that dup closes the old file handle before the fd points
3941            // to the new file handle.
3942            let second_file_handle =
3943                current_task.open_file(locked, "data/testfile.txt".into(), OpenFlags::RDONLY)?;
3944            let different_file_fd =
3945                current_task.add_file(locked, second_file_handle, FdFlags::empty())?;
3946            assert!(!Arc::ptr_eq(
3947                &files.get(oldfd).unwrap(),
3948                &files.get(different_file_fd).unwrap()
3949            ));
3950            sys_dup3(locked, current_task, oldfd, different_file_fd, O_CLOEXEC)?;
3951            assert!(Arc::ptr_eq(
3952                &files.get(oldfd).unwrap(),
3953                &files.get(different_file_fd).unwrap()
3954            ));
3955
3956            Ok(())
3957        })
3958        .await
3959    }
3960
3961    #[::fuchsia::test]
3962    async fn test_sys_open_cloexec() -> Result<(), Errno> {
3963        spawn_kernel_and_run_with_pkgfs(async |locked, current_task| {
3964            let path_addr = map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
3965            let path = b"data/testfile.txt\0";
3966            current_task.write_memory(path_addr, path)?;
3967            let fd = sys_openat(
3968                locked,
3969                &current_task,
3970                FdNumber::AT_FDCWD,
3971                UserCString::new(current_task, path_addr),
3972                O_RDONLY | O_CLOEXEC,
3973                FileMode::default(),
3974            )?;
3975            assert!(current_task.files.get_fd_flags_allowing_opath(fd)?.contains(FdFlags::CLOEXEC));
3976            Ok(())
3977        })
3978        .await
3979    }
3980
3981    #[::fuchsia::test]
3982    async fn test_sys_epoll() -> Result<(), Errno> {
3983        spawn_kernel_and_run_with_pkgfs(async |locked, current_task| {
3984            let epoll_fd =
3985                sys_epoll_create1(locked, current_task, 0).expect("sys_epoll_create1 failed");
3986            sys_close(locked, current_task, epoll_fd).expect("sys_close failed");
3987
3988            Ok(())
3989        })
3990        .await
3991    }
3992
3993    #[::fuchsia::test]
3994    async fn test_fstat_tmp_file() {
3995        spawn_kernel_and_run_with_pkgfs(async |locked, current_task| {
3996            // Create the file that will be used to stat.
3997            let file_path = "data/testfile.txt";
3998            let _file_handle =
3999                current_task.open_file(locked, file_path.into(), OpenFlags::RDONLY).unwrap();
4000
4001            // Write the path to user memory.
4002            let path_addr = map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
4003            current_task
4004                .write_memory(path_addr, file_path.as_bytes())
4005                .expect("failed to clear struct");
4006
4007            let memory_len = (path_addr + file_path.len()).expect("OOB memory allocation!");
4008            let user_stat = UserRef::new(memory_len);
4009            current_task
4010                .write_object(user_stat, &default_statfs(0))
4011                .expect("failed to clear struct");
4012
4013            let user_path = UserCString::new(current_task, path_addr);
4014
4015            assert_eq!(sys_statfs(locked, current_task, user_path, user_stat.into()), Ok(()));
4016
4017            let returned_stat = current_task.read_object(user_stat).expect("failed to read struct");
4018            assert_eq!(
4019                returned_stat.as_bytes(),
4020                default_statfs(u32::from_be_bytes(*b"f.io")).as_bytes()
4021            );
4022        })
4023        .await;
4024    }
4025
4026    #[::fuchsia::test]
4027    async fn test_unlinkat_dir() {
4028        spawn_kernel_and_run(async |locked, current_task| {
4029            // Create the dir that we will attempt to unlink later.
4030            let no_slash_path = b"testdir";
4031            let no_slash_path_addr =
4032                map_memory(locked, &current_task, UserAddress::default(), *PAGE_SIZE);
4033            current_task
4034                .write_memory(no_slash_path_addr, no_slash_path)
4035                .expect("failed to write path");
4036            let no_slash_user_path = UserCString::new(current_task, no_slash_path_addr);
4037            sys_mkdirat(
4038                locked,
4039                &current_task,
4040                FdNumber::AT_FDCWD,
4041                no_slash_user_path,
4042                FileMode::ALLOW_ALL.with_type(FileMode::IFDIR),
4043            )
4044            .unwrap();
4045
4046            let slash_path = b"testdir/";
4047            let slash_path_addr =
4048                map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
4049            current_task.write_memory(slash_path_addr, slash_path).expect("failed to write path");
4050            let slash_user_path = UserCString::new(current_task, slash_path_addr);
4051
4052            // Try to remove a directory without specifying AT_REMOVEDIR.
4053            // This should fail with EISDIR, irrespective of the terminating slash.
4054            let error = sys_unlinkat(locked, current_task, FdNumber::AT_FDCWD, slash_user_path, 0)
4055                .unwrap_err();
4056            assert_eq!(error, errno!(EISDIR));
4057            let error =
4058                sys_unlinkat(locked, current_task, FdNumber::AT_FDCWD, no_slash_user_path, 0)
4059                    .unwrap_err();
4060            assert_eq!(error, errno!(EISDIR));
4061
4062            // Success with AT_REMOVEDIR.
4063            sys_unlinkat(locked, current_task, FdNumber::AT_FDCWD, slash_user_path, AT_REMOVEDIR)
4064                .unwrap();
4065        })
4066        .await;
4067    }
4068
4069    #[::fuchsia::test]
4070    async fn test_rename_noreplace() {
4071        spawn_kernel_and_run_with_pkgfs(async |locked, current_task| {
4072            // Create the file that will be renamed.
4073            let old_user_path = "data/testfile.txt";
4074            let _old_file_handle =
4075                current_task.open_file(locked, old_user_path.into(), OpenFlags::RDONLY).unwrap();
4076
4077            // Write the path to user memory.
4078            let old_path_addr =
4079                map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
4080            current_task
4081                .write_memory(old_path_addr, old_user_path.as_bytes())
4082                .expect("failed to clear struct");
4083
4084            // Create a second file that we will attempt to rename to.
4085            let new_user_path = "data/testfile2.txt";
4086            let _new_file_handle =
4087                current_task.open_file(locked, new_user_path.into(), OpenFlags::RDONLY).unwrap();
4088
4089            // Write the path to user memory.
4090            let new_path_addr =
4091                map_memory(locked, current_task, UserAddress::default(), *PAGE_SIZE);
4092            current_task
4093                .write_memory(new_path_addr, new_user_path.as_bytes())
4094                .expect("failed to clear struct");
4095
4096            // Try to rename first file to second file's name with RENAME_NOREPLACE flag.
4097            // This should fail with EEXIST.
4098            let error = sys_renameat2(
4099                locked,
4100                &current_task,
4101                FdNumber::AT_FDCWD,
4102                UserCString::new(current_task, old_path_addr),
4103                FdNumber::AT_FDCWD,
4104                UserCString::new(current_task, new_path_addr),
4105                RenameFlags::NOREPLACE.bits(),
4106            )
4107            .unwrap_err();
4108            assert_eq!(error, errno!(EEXIST));
4109        })
4110        .await;
4111    }
4112}