// starnix_core/task/current_task.rs

1// Copyright 2023 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::arch::task::{decode_page_fault_exception_report, get_signal_for_general_exception};
6use crate::execution::{TaskInfo, create_zircon_process};
7use crate::mm::{DumpPolicy, MemoryAccessor, MemoryAccessorExt, TaskMemoryAccessor};
8use crate::ptrace::{PtraceCoreState, PtraceEvent, PtraceEventData, PtraceOptions, StopState};
9use crate::security;
10use crate::signals::{RunState, SignalDetail, SignalInfo, send_signal_first, send_standard_signal};
11use crate::task::loader::{ResolvedElf, load_executable, resolve_executable};
12use crate::task::waiter::WaiterOptions;
13use crate::task::{
14    ExitStatus, RobustListHeadPtr, SeccompFilter, SeccompFilterContainer, SeccompNotifierHandle,
15    SeccompState, SeccompStateValue, Task, TaskFlags, Waiter,
16};
17use crate::vfs::{
18    CheckAccessReason, FdFlags, FdNumber, FileHandle, FsStr, LookupContext, MAX_SYMLINK_FOLLOWS,
19    NamespaceNode, ResolveBase, SymlinkMode, SymlinkTarget, new_pidfd,
20};
21use extended_pstate::ExtendedPstateState;
22use futures::FutureExt;
23use linux_uapi::CLONE_PIDFD;
24use starnix_logging::{log_error, log_warn, track_file_not_found, track_stub};
25use starnix_registers::{HeapRegs, RegisterState, RegisterStorage, RegisterStorageEnum};
26use starnix_stack::clean_stack;
27use starnix_sync::{
28    EventWaitGuard, FileOpsCore, LockBefore, LockEqualOrBefore, Locked, MmDumpable,
29    ProcessGroupState, TaskRelease, Unlocked, WakeReason,
30};
31use starnix_syscalls::SyscallResult;
32use starnix_syscalls::decls::Syscall;
33use starnix_task_command::TaskCommand;
34use starnix_types::arch::ArchWidth;
35use starnix_types::futex_address::FutexAddress;
36use starnix_types::ownership::{OwnedRef, Releasable, TempRef, WeakRef, release_on_error};
37use starnix_uapi::auth::{
38    CAP_KILL, CAP_SYS_ADMIN, CAP_SYS_PTRACE, Credentials, FsCred, PTRACE_MODE_FSCREDS,
39    PTRACE_MODE_REALCREDS, PtraceAccessMode, UserAndOrGroupId,
40};
41use starnix_uapi::device_type::DeviceType;
42use starnix_uapi::errors::{Errno, ErrnoCode};
43use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
44use starnix_uapi::open_flags::OpenFlags;
45use starnix_uapi::signals::{
46    SIGBUS, SIGCHLD, SIGCONT, SIGILL, SIGKILL, SIGSEGV, SIGSYS, SIGTRAP, SigSet, Signal,
47    UncheckedSignal,
48};
49use starnix_uapi::user_address::{ArchSpecific, UserAddress, UserRef};
50use starnix_uapi::vfs::ResolveFlags;
51use starnix_uapi::{
52    CLONE_CHILD_CLEARTID, CLONE_CHILD_SETTID, CLONE_FILES, CLONE_FS, CLONE_INTO_CGROUP,
53    CLONE_NEWUTS, CLONE_PARENT, CLONE_PARENT_SETTID, CLONE_PTRACE, CLONE_SETTLS, CLONE_SIGHAND,
54    CLONE_SYSVSEM, CLONE_THREAD, CLONE_VFORK, CLONE_VM, FUTEX_OWNER_DIED, FUTEX_TID_MASK,
55    ROBUST_LIST_LIMIT, SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_NEW_LISTENER,
56    SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_TSYNC_ESRCH, clone_args, errno, error,
57    from_status_like_fdio, pid_t, sock_filter, ucred,
58};
59use std::cell::{Ref, RefCell};
60use std::collections::VecDeque;
61use std::ffi::CString;
62use std::fmt;
63use std::marker::PhantomData;
64use std::mem::MaybeUninit;
65use std::sync::Arc;
66use zx::sys::zx_restricted_state_t;
67
68use super::ThreadGroupLifecycleWaitValue;
69
/// A partially-constructed task, used while a new task is being created.
///
/// Holds the underlying `Task` together with the thread state that will be
/// carried into the `CurrentTask` (via the `From<TaskBuilder>` impl) once
/// construction completes.
pub struct TaskBuilder {
    /// The underlying task object.
    pub task: OwnedRef<Task>,

    /// Thread state staged for the task. Backed by `HeapRegs` rather than the
    /// runtime register storage -- presumably because no Zircon thread exists
    /// yet; confirm against `create_zircon_process`.
    pub thread_state: ThreadState<HeapRegs>,
}
76
77impl TaskBuilder {
78    pub fn new(task: OwnedRef<Task>) -> Self {
79        Self { task, thread_state: Default::default() }
80    }
81
82    #[inline(always)]
83    pub fn release<L>(self, locked: &mut Locked<L>)
84    where
85        L: LockBefore<TaskRelease>,
86    {
87        let locked = locked.cast_locked::<TaskRelease>();
88        Releasable::release(self, locked);
89    }
90}
91
92impl From<TaskBuilder> for CurrentTask {
93    fn from(builder: TaskBuilder) -> Self {
94        Self::new(builder.task, builder.thread_state.into())
95    }
96}
97
impl Releasable for TaskBuilder {
    type Context<'a> = &'a mut Locked<TaskRelease>;

    /// Tears down a task that was built but never converted into a running
    /// `CurrentTask`: detaches it from its thread group, then releases the
    /// underlying `Task`.
    fn release<'a>(self, locked: Self::Context<'a>) {
        // Keep the kernel alive independently of `self` while we hold the
        // pid-table write lock.
        let kernel = Arc::clone(self.kernel());
        let mut pids = kernel.pids.write();

        // We remove from the thread group here because the WeakRef in the pid
        // table to this task must be valid until this task is removed from the
        // thread group, and the code below will invalidate it.
        // Moreover, this requires an OwnedRef of the task to ensure the tasks of
        // the thread group are always valid.
        self.task.thread_group().remove(locked, &mut pids, &self.task);

        // Hand the staged thread state, lock token, and pid-table guard to the
        // task's own release.
        let context = (self.thread_state.into(), locked, pids);
        self.task.release(context);
    }
}
116
117impl std::ops::Deref for TaskBuilder {
118    type Target = Task;
119    fn deref(&self) -> &Self::Target {
120        &self.task
121    }
122}
123
/// The task object associated with the currently executing thread.
///
/// We often pass the `CurrentTask` as the first argument to functions if those functions need to
/// know contextual information about the thread on which they are running. For example, we often
/// use the `CurrentTask` to perform access checks, which ensures that the caller is authorized to
/// perform the requested operation.
///
/// The `CurrentTask` also has state that can be referenced only on the currently executing thread,
/// such as the register state for that thread. Syscalls are given a mutable references to the
/// `CurrentTask`, which lets them manipulate this state.
///
/// See also `Task` for more information about tasks.
pub struct CurrentTask {
    /// The underlying task object.
    pub task: OwnedRef<Task>,

    /// Register and extended-processor state for the thread running this task.
    pub thread_state: ThreadState<RegisterStorageEnum>,

    /// The current subjective credentials of the task.
    // TODO(https://fxbug.dev/433548348): Avoid interior mutability here by passing a
    // &mut CurrentTask around instead of &CurrentTask.
    pub current_creds: RefCell<CurrentCreds>,

    /// Makes CurrentTask neither Sync nor Send.
    _local_marker: PhantomData<*mut u8>,
}
150
/// Represents the current state of the task's subjective credentials.
pub enum CurrentCreds {
    /// The task does not have overridden credentials, the subjective creds are identical to the
    /// objective creds stored in the Task. Since credentials are often accessed from the current
    /// task, we hold a reference here that does not necessitate going through the RCU machinery to
    /// read.
    Cached(Arc<Credentials>),
    /// The task has overridden subjective credentials (see
    /// `CurrentTask::override_creds`); these differ from the objective creds.
    Overridden(Arc<Credentials>),
}
161
162impl CurrentCreds {
163    fn creds(&self) -> &Arc<Credentials> {
164        match self {
165            CurrentCreds::Cached(creds) => creds,
166            CurrentCreds::Overridden(creds) => creds,
167        }
168    }
169}
170
/// Per-architecture storage for a thread's extended processor state
/// (floating point / vector registers).
#[derive(Clone)]
pub enum ArchExtendedPstateStorage {
    // Storage for 64 bit restricted mode. Boxed, so the containing
    // `ThreadState` stays small and the state has a stable heap address for
    // `as_ptr`.
    State64(Box<ExtendedPstateState>),
}
176
177impl ArchExtendedPstateStorage {
178    pub fn as_ptr(&mut self) -> *mut ExtendedPstateState {
179        match self {
180            ArchExtendedPstateStorage::State64(state) => state.as_mut() as *mut _,
181        }
182    }
183
184    fn reset(&mut self) {
185        match self {
186            ArchExtendedPstateStorage::State64(state) => state.reset(),
187        }
188    }
189}
190
191impl Default for ArchExtendedPstateStorage {
192    fn default() -> Self {
193        Self::State64(Default::default())
194    }
195}
196
/// The thread related information of a `CurrentTask`. The information should never be used outside
/// of the thread owning the `CurrentTask`.
///
/// Generic over the register storage backend `T` (heap-backed while a task is
/// being built, enum-backed at runtime -- see the `From` impl below).
#[derive(Default)]
pub struct ThreadState<T: RegisterStorage> {
    /// A copy of the registers associated with the Zircon thread. Up-to-date values can be read
    /// from `self.handle.read_state_general_regs()`. To write these values back to the thread, call
    /// `self.handle.write_state_general_regs(self.thread_state.registers.into())`.
    pub registers: RegisterState<T>,

    /// Copy of the current extended processor state including floating point and vector registers.
    pub extended_pstate: ArchExtendedPstateStorage,

    /// The errno code (if any) that indicated this task should restart a syscall.
    pub restart_code: Option<ErrnoCode>,

    /// A custom function to resume a syscall that has been interrupted by SIGSTOP.
    /// To use, call set_syscall_restart_func and return ERESTART_RESTARTBLOCK. sys_restart_syscall
    /// will eventually call it.
    pub syscall_restart_func: Option<Box<SyscallRestartFunc>>,
}
217
impl<T: RegisterStorage> ThreadState<T> {
    /// Returns the register width of the code this thread is executing: on
    /// aarch64 it is derived from the CPSR arch-32 bit (see `is_arch32`); on
    /// all other architectures it is always 64-bit.
    pub fn arch_width(&self) -> ArchWidth {
        #[cfg(target_arch = "aarch64")]
        {
            return if self.is_arch32() { ArchWidth::Arch32 } else { ArchWidth::Arch64 };
        }
        #[cfg(not(target_arch = "aarch64"))]
        ArchWidth::Arch64
    }

    /// Returns a new `ThreadState` with the same `registers` as this one.
    ///
    /// The extended processor state is reset to its default (not copied), the
    /// restart code is carried over, and any pending restart function is
    /// dropped.
    fn snapshot<R: RegisterStorage>(&self) -> ThreadState<R>
    where
        RegisterState<R>: From<RegisterState<T>>,
    {
        ThreadState::<R> {
            registers: self.registers.clone().into(),
            extended_pstate: Default::default(),
            restart_code: self.restart_code,
            syscall_restart_func: None,
        }
    }

    /// Like `snapshot`, but also clones the extended processor state
    /// (floating point / vector registers) instead of resetting it.
    pub fn extended_snapshot<R: RegisterStorage>(&self) -> ThreadState<R>
    where
        RegisterState<R>: From<RegisterState<T>>,
    {
        ThreadState::<R> {
            registers: self.registers.clone().into(),
            extended_pstate: self.extended_pstate.clone(),
            restart_code: self.restart_code,
            syscall_restart_func: None,
        }
    }

    /// Overwrites this state's registers and extended processor state with
    /// those of `other`. `restart_code` and `syscall_restart_func` are left
    /// untouched.
    pub fn replace_registers<O: RegisterStorage>(&mut self, other: &ThreadState<O>) {
        self.registers.load(*other.registers);
        self.extended_pstate = other.extended_pstate.clone();
    }

    /// Reads the user-visible register identified by `offset` (as interpreted
    /// by `apply_user_register`), widened to `usize`. Errors if `offset` does
    /// not name a valid register.
    pub fn get_user_register(&mut self, offset: usize) -> Result<usize, Errno> {
        let mut result: usize = 0;
        self.registers.apply_user_register(offset, &mut |register| result = *register as usize)?;
        Ok(result)
    }

    /// Writes `value` to the user-visible register identified by `offset`.
    /// Errors if `offset` does not name a valid register.
    pub fn set_user_register(&mut self, offset: usize, value: usize) -> Result<(), Errno> {
        self.registers.apply_user_register(offset, &mut |register| *register = value as u64)
    }
}
268
269impl From<ThreadState<HeapRegs>> for ThreadState<RegisterStorageEnum> {
270    fn from(value: ThreadState<HeapRegs>) -> Self {
271        ThreadState {
272            registers: value.registers.into(),
273            extended_pstate: value.extended_pstate,
274            restart_code: value.restart_code,
275            syscall_restart_func: value.syscall_restart_func,
276        }
277    }
278}
279
impl<T: RegisterStorage> ArchSpecific for ThreadState<T> {
    /// Reports whether the thread is currently executing 32-bit (arch32) code.
    /// On aarch64 this is read from the CPSR arch-32 bit of the saved
    /// registers; on all other architectures it is always false.
    fn is_arch32(&self) -> bool {
        #[cfg(target_arch = "aarch64")]
        {
            (self.registers.cpsr as u64) & zx::sys::ZX_REG_CPSR_ARCH_32_MASK != 0
        }
        #[cfg(not(target_arch = "aarch64"))]
        {
            false
        }
    }
}
292
/// Signature of the closure installed by `CurrentTask::set_syscall_restart_func`;
/// invoked (once) by sys_restart_syscall to resume an interrupted syscall.
type SyscallRestartFunc = dyn FnOnce(&mut Locked<Unlocked>, &mut CurrentTask) -> Result<SyscallResult, Errno>
    + Send
    + Sync;
296
impl Releasable for CurrentTask {
    type Context<'a> = &'a mut Locked<TaskRelease>;

    /// Tears down the task as it exits: runs the robust-list and
    /// clear_child_tid userspace protocols, detaches the task from its thread
    /// group, then releases the underlying `Task`.
    fn release<'a>(self, locked: Self::Context<'a>) {
        // Process the robust futex list and the clear-child-tid address. The
        // result of the tid clear is deliberately ignored: the task is exiting
        // regardless.
        self.notify_robust_list();
        let _ignored = self.clear_child_tid_if_needed(locked);

        // Keep the kernel alive independently of `self` while we hold the
        // pid-table write lock.
        let kernel = Arc::clone(self.kernel());
        let mut pids = kernel.pids.write();

        // We remove from the thread group here because the WeakRef in the pid
        // table to this task must be valid until this task is removed from the
        // thread group, and the code below will invalidate it.
        // Moreover, this requires an OwnedRef of the task to ensure the tasks of
        // the thread group are always valid.
        self.task.thread_group().remove(locked, &mut pids, &self.task);

        let context = (self.thread_state, locked, pids);
        self.task.release(context);
    }
}
318
319impl std::ops::Deref for CurrentTask {
320    type Target = Task;
321    fn deref(&self) -> &Self::Target {
322        &self.task
323    }
324}
325
impl fmt::Debug for CurrentTask {
    /// Delegates formatting to the underlying task.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.task.fmt(f)
    }
}
331
332impl CurrentTask {
333    pub fn new(task: OwnedRef<Task>, thread_state: ThreadState<RegisterStorageEnum>) -> Self {
334        let current_creds = RefCell::new(CurrentCreds::Cached(task.clone_creds()));
335        Self { task, thread_state, current_creds, _local_marker: Default::default() }
336    }
337
338    /// Returns the current subjective credentials of the task.
339    ///
340    /// The subjective credentials are the credentials that are used to check permissions for
341    /// actions performed by the task.
342    pub fn current_creds(&self) -> Ref<'_, Arc<Credentials>> {
343        Ref::map(self.current_creds.borrow(), CurrentCreds::creds)
344    }
345
346    pub fn current_fscred(&self) -> FsCred {
347        self.current_creds().as_fscred()
348    }
349
350    pub fn current_ucred(&self) -> ucred {
351        let creds = self.current_creds();
352        ucred { pid: self.get_pid(), uid: creds.uid, gid: creds.gid }
353    }
354
    /// Temporarily replaces the task's subjective credentials with `new_creds`
    /// for the duration of `callback`, restoring the previous credentials when
    /// the callback's future completes.
    ///
    /// Only the "subjective" state of the CurrentTask, accessed with
    /// `current_creds()` and used to check permissions for actions performed
    /// by the task, is altered. The "objective" state, accessed through
    /// `Task::real_creds()` by other tasks and used to check permissions for
    /// actions performed on the task, is not altered, and changes to the
    /// credentials are not externally visible.
    ///
    /// NOTE(review): if `callback` panics, the saved credentials are not
    /// restored -- confirm this is acceptable to callers.
    pub async fn override_creds_async<R>(
        &self,
        new_creds: Arc<Credentials>,
        callback: impl AsyncFnOnce() -> R,
    ) -> R {
        // Swap the override in, run the callback, then swap the original back.
        let saved = self.current_creds.replace(CurrentCreds::Overridden(new_creds));
        let result = callback().await;
        self.current_creds.replace(saved);
        result
    }
373
    /// Synchronous form of [`CurrentTask::override_creds_async`]: runs
    /// `callback` with `new_creds` as the subjective credentials and restores
    /// the previous credentials afterwards.
    ///
    /// Only the subjective state (read via `current_creds()`) is altered; the
    /// objective state seen by other tasks via `Task::real_creds()` is not.
    pub fn override_creds<R>(
        &self,
        new_creds: Arc<Credentials>,
        callback: impl FnOnce() -> R,
    ) -> R {
        // The wrapped future never awaits, so it completes on the first poll;
        // `now_or_never` extracts the result synchronously.
        self.override_creds_async(new_creds, async move || callback())
            .now_or_never()
            .expect("Future should be ready")
    }
391
392    pub fn has_overridden_creds(&self) -> bool {
393        matches!(*self.current_creds.borrow(), CurrentCreds::Overridden(_))
394    }
395
396    pub fn trigger_delayed_releaser<L>(&self, locked: &mut Locked<L>)
397    where
398        L: LockEqualOrBefore<FileOpsCore>,
399    {
400        let locked = locked.cast_locked::<FileOpsCore>();
401        self.kernel().delayed_releaser.apply(locked, self);
402    }
403
404    pub fn weak_task(&self) -> WeakRef<Task> {
405        WeakRef::from(&self.task)
406    }
407
408    pub fn temp_task(&self) -> TempRef<'_, Task> {
409        TempRef::from(&self.task)
410    }
411
    /// Change the current and real creds of the task. This is invalid to call while temporary
    /// credentials are present.
    pub fn set_creds(&self, creds: Credentials) {
        // Overridden (temporary) credentials would be silently clobbered here.
        assert!(!self.has_overridden_creds());

        let creds = Arc::new(creds);
        // Update the cached subjective credentials first so `current_creds()`
        // immediately observes the new values.
        let mut current_creds = self.current_creds.borrow_mut();
        *current_creds = CurrentCreds::Cached(creds.clone());

        // SAFETY: this is allowed because we are the CurrentTask.
        unsafe {
            self.persistent_info.write_creds().update(creds);
        }
        // The /proc/pid directory's ownership is updated when the task's euid
        // or egid changes. See proc(5).
        let maybe_node = self.proc_pid_directory_cache.lock();
        if let Some(node) = &*maybe_node {
            let creds = self.real_creds().euid_as_fscred();
            // SAFETY: The /proc/pid directory held by `proc_pid_directory_cache` represents the
            // current task. Its owner and group are supposed to track the current task's euid and
            // egid.
            unsafe {
                node.force_chown(creds);
            }
        }
    }
438
439    #[inline(always)]
440    pub fn release<L>(self, locked: &mut Locked<L>)
441    where
442        L: LockBefore<TaskRelease>,
443    {
444        let locked = locked.cast_locked::<TaskRelease>();
445        Releasable::release(self, locked);
446    }
447
448    pub fn set_syscall_restart_func<R: Into<SyscallResult>>(
449        &mut self,
450        f: impl FnOnce(&mut Locked<Unlocked>, &mut CurrentTask) -> Result<R, Errno>
451        + Send
452        + Sync
453        + 'static,
454    ) {
455        self.thread_state.syscall_restart_func =
456            Some(Box::new(|locked, current_task| Ok(f(locked, current_task)?.into())));
457    }
458
    /// Adds `file` to this task's file-descriptor table with the given
    /// descriptor `flags`, returning the newly allocated fd number.
    pub fn add_file<L>(
        &self,
        locked: &mut Locked<L>,
        file: FileHandle,
        flags: FdFlags,
    ) -> Result<FdNumber, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.files.add(locked, self, file, flags)
    }
470
    /// Sets the task's signal mask to `signal_mask` and runs `wait_function`.
    ///
    /// Signals are dequeued prior to the original signal mask being restored. This is done by the
    /// signal machinery in the syscall dispatch loop.
    ///
    /// The returned result is the result returned from the wait function.
    pub fn wait_with_temporary_mask<F, T, L>(
        &mut self,
        locked: &mut Locked<L>,
        signal_mask: SigSet,
        wait_function: F,
    ) -> Result<T, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
        F: FnOnce(&mut Locked<L>, &CurrentTask) -> Result<T, Errno>,
    {
        {
            // Flag the mask as temporary so the signal machinery knows to
            // restore the original mask after dequeueing.
            let mut state = self.write();
            state.set_flags(TaskFlags::TEMPORARY_SIGNAL_MASK, true);
            state.set_temporary_signal_mask(signal_mask);
        }
        // The task state lock is released before waiting.
        wait_function(locked, self)
    }
494
    /// If waking, promotes from waking to awake.  If not waking, make waiter async
    /// wait until woken.  Returns true if woken.
    pub fn wake_or_wait_until_unstopped_async(&self, waiter: &Waiter) -> bool {
        // Acquire the thread-group state lock, then the task state lock.
        let group_state = self.thread_group().read();
        let mut task_state = self.write();

        // Wake up if
        //   a) we should wake up, meaning:
        //      i) we're in group stop, and the thread group has exited group stop, or
        //      ii) we're waking up,
        //   b) and ptrace isn't stopping us from waking up, but
        //   c) always wake up if we got a SIGKILL.
        let task_stop_state = self.load_stopped();
        let group_stop_state = self.thread_group().load_stopped();
        if ((task_stop_state == StopState::GroupStopped && group_stop_state.is_waking_or_awake())
            || task_stop_state.is_waking_or_awake())
            && (!task_state.is_ptrace_listening() || task_stop_state.is_force())
        {
            // Prefer the task's own target state; otherwise fall back to the
            // group's.
            let new_state = if task_stop_state.is_waking_or_awake() {
                task_stop_state.finalize()
            } else {
                group_stop_state.finalize()
            };
            if let Ok(new_state) = new_state {
                task_state.set_stopped(new_state, None, Some(self), None);
                // Both locks must be released before updating the thread
                // group's stop state below.
                drop(group_state);
                drop(task_state);
                // It is possible for the stop state to be changed by another
                // thread between when it is checked above and the following
                // invocation, but set_stopped does sufficient checking while
                // holding the lock to make sure that such a change won't result
                // in corrupted state.
                self.thread_group().set_stopped(new_state, None, false);
                return true;
            }
        }

        // We will wait.
        if self.thread_group().load_stopped().is_stopped() || task_stop_state.is_stopped() {
            // If we've stopped or PTRACE_LISTEN has been sent, wait for a
            // signal or instructions from the tracer.
            group_state
                .lifecycle_waiters
                .wait_async_value(&waiter, ThreadGroupLifecycleWaitValue::Stopped);
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.can_accept_ptrace_commands() {
            // If we're stopped because a tracer has seen the stop and not taken
            // further action, wait for further instructions from the tracer.
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.is_ptrace_listening() {
            // A PTRACE_LISTEN is a state where we can get signals and notify a
            // ptracer, but otherwise remain blocked.
            if let Some(ptrace) = &mut task_state.ptrace {
                ptrace.set_last_signal(Some(SignalInfo::kernel(SIGTRAP)));
                ptrace.set_last_event(Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0)));
            }
            task_state.wait_on_ptracer(&waiter);
            task_state.notify_ptracers();
        }
        false
    }
556
    /// Set the RunState for the current task to the given value and then call the given callback.
    ///
    /// When the callback is done, the run_state is restored to `RunState::Running`.
    ///
    /// This function is typically used just before blocking the current task on some operation.
    /// The given `run_state` registers the mechanism for interrupting the blocking operation with
    /// the task and the given `callback` actually blocks the task.
    ///
    /// This function can only be called in the `RunState::Running` state and cannot set the
    /// run state to `RunState::Running`. For this reason, this function cannot be reentered.
    ///
    /// Returns `EINTR` without running `callback` if a relevant signal is
    /// already pending (only SIGKILL matters when entering a `Frozen` state).
    pub fn run_in_state<F, T>(&self, run_state: RunState, callback: F) -> Result<T, Errno>
    where
        F: FnOnce() -> Result<T, Errno>,
    {
        assert_ne!(run_state, RunState::Running);

        // As an optimization, decommit unused pages of the stack to reduce memory pressure while
        // the thread is blocked.
        clean_stack();

        {
            let mut state = self.write();
            // Reentry guard: the task must not already be blocked.
            assert!(!state.is_blocked());

            if matches!(run_state, RunState::Frozen(_)) {
                // Freeze is a kernel signal and is handled before other user signals. A frozen task
                // ignores all other signals except SIGKILL until it is thawed.
                if state.has_signal_pending(SIGKILL) {
                    return error!(EINTR);
                }
            } else if state.is_any_signal_pending() && !state.is_ptrace_listening() {
                // A note on PTRACE_LISTEN - the thread cannot be scheduled
                // regardless of pending signals.
                return error!(EINTR);
            }
            state.set_run_state(run_state.clone());
        }

        let result = callback();

        {
            // Restore the Running state; the run state must not have been
            // changed behind our back while blocked.
            let mut state = self.write();
            assert_eq!(
                state.run_state(),
                run_state,
                "SignalState run state changed while waiting!"
            );
            state.set_run_state(RunState::Running);
        };

        result
    }
609
610    pub fn block_until(
611        &self,
612        guard: EventWaitGuard<'_>,
613        deadline: zx::MonotonicInstant,
614    ) -> Result<(), Errno> {
615        self.run_in_state(RunState::Event(guard.event().clone()), move || {
616            guard.block_until(None, deadline).map_err(|e| match e {
617                WakeReason::Interrupted => errno!(EINTR),
618                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
619            })
620        })
621    }
622
623    pub fn block_with_owner_until(
624        &self,
625        guard: EventWaitGuard<'_>,
626        new_owner: &zx::Thread,
627        deadline: zx::MonotonicInstant,
628    ) -> Result<(), Errno> {
629        self.run_in_state(RunState::Event(guard.event().clone()), move || {
630            guard.block_until(Some(new_owner), deadline).map_err(|e| match e {
631                WakeReason::Interrupted => errno!(EINTR),
632                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
633            })
634        })
635    }
636
    /// Determine namespace node indicated by the dir_fd.
    ///
    /// Returns the namespace node and the path to use relative to that node.
    pub fn resolve_dir_fd<'a, L>(
        &self,
        locked: &mut Locked<L>,
        dir_fd: FdNumber,
        mut path: &'a FsStr,
        flags: ResolveFlags,
    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let path_is_absolute = path.starts_with(b"/");
        if path_is_absolute {
            // RESOLVE_BENEATH forbids absolute paths entirely.
            if flags.contains(ResolveFlags::BENEATH) {
                return error!(EXDEV);
            }
            // Strip the leading '/' so the remainder resolves relative to the
            // starting directory chosen below.
            path = &path[1..];
        }

        // Choose the starting directory: the filesystem root for absolute
        // paths (unless RESOLVE_IN_ROOT redirects them), the cwd for
        // AT_FDCWD, otherwise the directory named by `dir_fd`.
        let dir = if path_is_absolute && !flags.contains(ResolveFlags::IN_ROOT) {
            self.fs().root()
        } else if dir_fd == FdNumber::AT_FDCWD {
            self.fs().cwd()
        } else {
            // O_PATH allowed for:
            //
            //   Passing the file descriptor as the dirfd argument of
            //   openat() and the other "*at()" system calls.  This
            //   includes linkat(2) with AT_EMPTY_PATH (or via procfs
            //   using AT_SYMLINK_FOLLOW) even if the file is not a
            //   directory.
            //
            // See https://man7.org/linux/man-pages/man2/open.2.html
            let file = self.files.get_allowing_opath(dir_fd)?;
            file.name.to_passive()
        };

        // A non-empty remaining path requires the starting node to be a
        // directory the task may search.
        if !path.is_empty() {
            if !dir.entry.node.is_dir() {
                return error!(ENOTDIR);
            }
            dir.check_access(
                locked,
                self,
                Access::EXEC,
                CheckAccessReason::InternalPermissionChecks,
            )?;
        }
        Ok((dir, path.into()))
    }
689
690    /// A convenient wrapper for opening files relative to FdNumber::AT_FDCWD.
691    ///
692    /// Returns a FileHandle but does not install the FileHandle in the FdTable
693    /// for this task.
694    pub fn open_file(
695        &self,
696        locked: &mut Locked<Unlocked>,
697        path: &FsStr,
698        flags: OpenFlags,
699    ) -> Result<FileHandle, Errno> {
700        if flags.contains(OpenFlags::CREAT) {
701            // In order to support OpenFlags::CREAT we would need to take a
702            // FileMode argument.
703            return error!(EINVAL);
704        }
705        self.open_file_at(
706            locked,
707            FdNumber::AT_FDCWD,
708            path,
709            flags,
710            FileMode::default(),
711            ResolveFlags::empty(),
712            AccessCheck::default(),
713        )
714    }
715
716    /// Resolves a path for open.
717    ///
718    /// If the final path component points to a symlink, the symlink is followed (as long as
719    /// the symlink traversal limit has not been reached).
720    ///
721    /// If the final path component (after following any symlinks, if enabled) does not exist,
722    /// and `flags` contains `OpenFlags::CREAT`, a new node is created at the location of the
723    /// final path component.
724    ///
725    /// This returns the resolved node, and a boolean indicating whether the node has been created.
726    fn resolve_open_path<L>(
727        &self,
728        locked: &mut Locked<L>,
729        context: &mut LookupContext,
730        dir: &NamespaceNode,
731        path: &FsStr,
732        mode: FileMode,
733        flags: OpenFlags,
734    ) -> Result<(NamespaceNode, bool), Errno>
735    where
736        L: LockEqualOrBefore<FileOpsCore>,
737    {
738        context.update_for_path(path);
739        let mut parent_content = context.with(SymlinkMode::Follow);
740        let (parent, basename) = self.lookup_parent(locked, &mut parent_content, dir, path)?;
741        context.remaining_follows = parent_content.remaining_follows;
742
743        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);
744
745        // Lookup the child, without following a symlink or expecting it to be a directory.
746        let mut child_context = context.with(SymlinkMode::NoFollow);
747        child_context.must_be_directory = false;
748
749        match parent.lookup_child(locked, self, &mut child_context, basename) {
750            Ok(name) => {
751                if name.entry.node.is_lnk() {
752                    if flags.contains(OpenFlags::PATH)
753                        && context.symlink_mode == SymlinkMode::NoFollow
754                    {
755                        // When O_PATH is specified in flags, if pathname is a symbolic link
756                        // and the O_NOFOLLOW flag is also specified, then the call returns
757                        // a file descriptor referring to the symbolic link.
758                        // See https://man7.org/linux/man-pages/man2/openat.2.html
759                        //
760                        // If the trailing component (i.e., basename) of
761                        // pathname is a symbolic link, how.resolve contains
762                        // RESOLVE_NO_SYMLINKS, and how.flags contains both
763                        // O_PATH and O_NOFOLLOW, then an O_PATH file
764                        // descriptor referencing the symbolic link will be
765                        // returned.
766                        // See https://man7.org/linux/man-pages/man2/openat2.2.html
767                        return Ok((name, false));
768                    }
769
770                    if (!flags.contains(OpenFlags::PATH)
771                        && context.symlink_mode == SymlinkMode::NoFollow)
772                        || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
773                        || context.remaining_follows == 0
774                    {
775                        if must_create {
776                            // Since `must_create` is set, and a node was found, this returns EEXIST
777                            // instead of ELOOP.
778                            return error!(EEXIST);
779                        }
780                        // A symlink was found, but one of the following is true:
781                        // * flags specified O_NOFOLLOW but not O_PATH.
782                        // * how.resolve contains RESOLVE_NO_SYMLINKS
783                        // * too many symlink traversals have been attempted
784                        return error!(ELOOP);
785                    }
786
787                    context.remaining_follows -= 1;
788                    match name.readlink(locked, self)? {
789                        SymlinkTarget::Path(path) => {
790                            let dir = if path[0] == b'/' { self.fs().root() } else { parent };
791                            self.resolve_open_path(
792                                locked,
793                                context,
794                                &dir,
795                                path.as_ref(),
796                                mode,
797                                flags,
798                            )
799                        }
800                        SymlinkTarget::Node(name) => {
801                            if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS)
802                                || name.entry.node.is_lnk()
803                            {
804                                error!(ELOOP)
805                            } else {
806                                Ok((name, false))
807                            }
808                        }
809                    }
810                } else {
811                    if must_create {
812                        return error!(EEXIST);
813                    }
814                    Ok((name, false))
815                }
816            }
817            Err(e) if e == errno!(ENOENT) && flags.contains(OpenFlags::CREAT) => {
818                if context.must_be_directory {
819                    return error!(EISDIR);
820                }
821                Ok((
822                    parent.open_create_node(
823                        locked,
824                        self,
825                        basename,
826                        mode.with_type(FileMode::IFREG),
827                        DeviceType::NONE,
828                        flags,
829                    )?,
830                    true,
831                ))
832            }
833            Err(e) => Err(e),
834        }
835    }
836
837    /// The primary entry point for opening files relative to a task.
838    ///
839    /// Absolute paths are resolve relative to the root of the FsContext for
840    /// this task. Relative paths are resolve relative to dir_fd. To resolve
841    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
842    /// dir_fd.
843    ///
844    /// Returns a FileHandle but does not install the FileHandle in the FdTable
845    /// for this task.
846    pub fn open_file_at(
847        &self,
848        locked: &mut Locked<Unlocked>,
849        dir_fd: FdNumber,
850        path: &FsStr,
851        flags: OpenFlags,
852        mode: FileMode,
853        resolve_flags: ResolveFlags,
854        access_check: AccessCheck,
855    ) -> Result<FileHandle, Errno> {
856        if path.is_empty() {
857            return error!(ENOENT);
858        }
859
860        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, resolve_flags)?;
861        self.open_namespace_node_at(locked, dir, path, flags, mode, resolve_flags, access_check)
862    }
863
    /// Opens `path` relative to the namespace node `dir`.
    ///
    /// This is the node-relative core of `open_file_at`: it normalizes the
    /// open flags (`O_LARGEFILE`, the `O_PATH` mask), resolves the path while
    /// honoring `resolve_flags`, handles `O_CREAT`/`O_EXCL` and `O_TMPFILE`
    /// creation, enforces directory/symlink constraints, applies `O_TRUNC`,
    /// and finally opens the resolved node.
    ///
    /// Returns a `FileHandle` that has not been installed in the `FdTable`.
    pub fn open_namespace_node_at(
        &self,
        locked: &mut Locked<Unlocked>,
        dir: NamespaceNode,
        path: &FsStr,
        flags: OpenFlags,
        mode: FileMode,
        mut resolve_flags: ResolveFlags,
        access_check: AccessCheck,
    ) -> Result<FileHandle, Errno> {
        // 64-bit kernels force the O_LARGEFILE flag to be on.
        let mut flags = flags | OpenFlags::LARGEFILE;
        let opath = flags.contains(OpenFlags::PATH);
        if opath {
            // When O_PATH is specified in flags, flag bits other than O_CLOEXEC,
            // O_DIRECTORY, and O_NOFOLLOW are ignored.
            const ALLOWED_FLAGS: OpenFlags = OpenFlags::from_bits_truncate(
                OpenFlags::PATH.bits()
                    | OpenFlags::CLOEXEC.bits()
                    | OpenFlags::DIRECTORY.bits()
                    | OpenFlags::NOFOLLOW.bits(),
            );
            flags &= ALLOWED_FLAGS;
        }

        // O_TMPFILE must be combined with write access; see open(2).
        if flags.contains(OpenFlags::TMPFILE) && !flags.can_write() {
            return error!(EINVAL);
        }

        let nofollow = flags.contains(OpenFlags::NOFOLLOW);
        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);

        // O_CREAT|O_EXCL must create the file itself, so a trailing symlink is
        // not followed in that case either.
        let symlink_mode =
            if nofollow || must_create { SymlinkMode::NoFollow } else { SymlinkMode::Follow };

        let resolve_base = match (
            resolve_flags.contains(ResolveFlags::BENEATH),
            resolve_flags.contains(ResolveFlags::IN_ROOT),
        ) {
            (false, false) => ResolveBase::None,
            (true, false) => ResolveBase::Beneath(dir.clone()),
            (false, true) => ResolveBase::InRoot(dir.clone()),
            // The two flags are mutually exclusive; see openat2(2).
            (true, true) => return error!(EINVAL),
        };

        // `RESOLVE_BENEATH` and `RESOLVE_IN_ROOT` imply `RESOLVE_NO_MAGICLINKS`. This matches
        // Linux behavior. Strictly speaking it is not really required, but it's hard to
        // implement `BENEATH` and `IN_ROOT` flags correctly otherwise.
        if resolve_base != ResolveBase::None {
            resolve_flags.insert(ResolveFlags::NO_MAGICLINKS);
        }

        let mut context = LookupContext {
            symlink_mode,
            remaining_follows: MAX_SYMLINK_FOLLOWS,
            must_be_directory: flags.contains(OpenFlags::DIRECTORY),
            resolve_flags,
            resolve_base,
        };
        let (name, created) =
            match self.resolve_open_path(locked, &mut context, &dir, path, mode, flags) {
                Ok((n, c)) => (n, c),
                Err(e) => {
                    // Record the absolute path that failed to resolve for
                    // diagnostics before propagating the error.
                    let mut abs_path = dir.path(&self.task);
                    abs_path.extend(&**path);
                    track_file_not_found(abs_path);
                    return Err(e);
                }
            };

        let name = if flags.contains(OpenFlags::TMPFILE) {
            // `O_TMPFILE` is incompatible with `O_CREAT`
            if flags.contains(OpenFlags::CREAT) {
                return error!(EINVAL);
            }
            name.create_tmpfile(locked, self, mode.with_type(FileMode::IFREG), flags)?
        } else {
            let mode = name.entry.node.info().mode;

            // These checks are not needed in the `O_TMPFILE` case because `mode` refers to the
            // file we are opening. With `O_TMPFILE`, that file is the regular file we just
            // created rather than the node we found by resolving the path.
            //
            // For example, we do not need to produce `ENOTDIR` when `must_be_directory` is set
            // because `must_be_directory` refers to the node we found by resolving the path.
            // If that node was not a directory, then `create_tmpfile` will produce an error.
            //
            // Similarly, we never need to call `truncate` because `O_TMPFILE` is newly created
            // and therefore already an empty file.

            if !opath && nofollow && mode.is_lnk() {
                return error!(ELOOP);
            }

            if mode.is_dir() {
                // Directories may only be opened read-only, and never created
                // or truncated through open().
                if flags.can_write()
                    || flags.contains(OpenFlags::CREAT)
                    || flags.contains(OpenFlags::TRUNC)
                {
                    return error!(EISDIR);
                }
                if flags.contains(OpenFlags::DIRECT) {
                    return error!(EINVAL);
                }
            } else if context.must_be_directory {
                return error!(ENOTDIR);
            }

            if flags.contains(OpenFlags::TRUNC) && mode.is_reg() && !created {
                // You might think we should check file.can_write() at this
                // point, which is what the docs suggest, but apparently we
                // are supposed to truncate the file if this task can write
                // to the underlying node, even if we are opening the file
                // as read-only. See OpenTest.CanTruncateReadOnly.
                name.truncate(locked, self, 0)?;
            }

            name
        };

        // If the node has been created, the open operation should not verify access right:
        // From <https://man7.org/linux/man-pages/man2/open.2.html>
        //
        // > Note that mode applies only to future accesses of the newly created file; the
        // > open() call that creates a read-only file may well return a  read/write  file
        // > descriptor.

        let access_check = if created { AccessCheck::skip() } else { access_check };
        name.open(locked, self, flags, access_check)
    }
994
995    /// A wrapper for FsContext::lookup_parent_at that resolves the given
996    /// dir_fd to a NamespaceNode.
997    ///
998    /// Absolute paths are resolve relative to the root of the FsContext for
999    /// this task. Relative paths are resolve relative to dir_fd. To resolve
1000    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
1001    /// dir_fd.
1002    pub fn lookup_parent_at<'a, L>(
1003        &self,
1004        locked: &mut Locked<L>,
1005        context: &mut LookupContext,
1006        dir_fd: FdNumber,
1007        path: &'a FsStr,
1008    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
1009    where
1010        L: LockEqualOrBefore<FileOpsCore>,
1011    {
1012        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, ResolveFlags::empty())?;
1013        self.lookup_parent(locked, context, &dir, path)
1014    }
1015
1016    /// Lookup the parent of a namespace node.
1017    ///
1018    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
1019    /// calling this function directly.
1020    ///
1021    /// This function resolves all but the last component of the given path.
1022    /// The function returns the parent directory of the last component as well
1023    /// as the last component.
1024    ///
1025    /// If path is empty, this function returns dir and an empty path.
1026    /// Similarly, if path ends with "." or "..", these components will be
1027    /// returned along with the parent.
1028    ///
1029    /// The returned parent might not be a directory.
1030    pub fn lookup_parent<'a, L>(
1031        &self,
1032        locked: &mut Locked<L>,
1033        context: &mut LookupContext,
1034        dir: &NamespaceNode,
1035        path: &'a FsStr,
1036    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
1037    where
1038        L: LockEqualOrBefore<FileOpsCore>,
1039    {
1040        context.update_for_path(path);
1041
1042        let mut current_node = dir.clone();
1043        let mut it = path.split(|c| *c == b'/').filter(|p| !p.is_empty()).map(<&FsStr>::from);
1044        let mut current_path_component = it.next().unwrap_or_default();
1045        for next_path_component in it {
1046            current_node =
1047                current_node.lookup_child(locked, self, context, current_path_component)?;
1048            current_path_component = next_path_component;
1049        }
1050        Ok((current_node, current_path_component))
1051    }
1052
1053    /// Lookup a namespace node.
1054    ///
1055    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
1056    /// calling this function directly.
1057    ///
1058    /// This function resolves the component of the given path.
1059    pub fn lookup_path<L>(
1060        &self,
1061        locked: &mut Locked<L>,
1062        context: &mut LookupContext,
1063        dir: NamespaceNode,
1064        path: &FsStr,
1065    ) -> Result<NamespaceNode, Errno>
1066    where
1067        L: LockEqualOrBefore<FileOpsCore>,
1068    {
1069        let (parent, basename) = self.lookup_parent(locked, context, &dir, path)?;
1070        parent.lookup_child(locked, self, context, basename)
1071    }
1072
1073    /// Lookup a namespace node starting at the root directory.
1074    ///
1075    /// Resolves symlinks.
1076    pub fn lookup_path_from_root<L>(
1077        &self,
1078        locked: &mut Locked<L>,
1079        path: &FsStr,
1080    ) -> Result<NamespaceNode, Errno>
1081    where
1082        L: LockEqualOrBefore<FileOpsCore>,
1083    {
1084        let mut context = LookupContext::default();
1085        self.lookup_path(locked, &mut context, self.fs().root(), path)
1086    }
1087
    /// Replaces the current process image with `executable`, per execve(2).
    ///
    /// Validates that `executable` is a regular file with execute permission,
    /// resolves it, and then commits the new image via `finish_exec`. If
    /// `finish_exec` fails, the old image is already gone, so the task is sent
    /// a forced SIGSEGV before the error is returned.
    ///
    /// `path` is the path used to invoke the executable; `argv` and `environ`
    /// become the new image's arguments and environment.
    pub fn exec(
        &mut self,
        locked: &mut Locked<Unlocked>,
        executable: FileHandle,
        path: CString,
        argv: Vec<CString>,
        environ: Vec<CString>,
    ) -> Result<(), Errno> {
        // Executable must be a regular file
        if !executable.name.entry.node.is_reg() {
            return error!(EACCES);
        }

        // File node must have EXEC mode permissions.
        // Note that the ability to execute a file is unrelated to the flags
        // used in the `open` call.
        executable.name.check_access(locked, self, Access::EXEC, CheckAccessReason::Exec)?;

        // Let the security module compute the credential state the new image
        // should run with.
        let elf_security_state = security::bprm_creds_for_exec(self, &executable.name)?;

        let resolved_elf = resolve_executable(
            locked,
            self,
            executable,
            path.clone(),
            argv,
            environ,
            elf_security_state,
        )?;

        // Honor set-user-ID/set-group-ID bits only when the kernel's suid
        // feature is enabled.
        let maybe_set_id = if self.kernel().features.enable_suid {
            resolved_elf.file.name.suid_and_sgid(&self)?
        } else {
            Default::default()
        };

        if self.thread_group().read().tasks_count() > 1 {
            track_stub!(TODO("https://fxbug.dev/297434895"), "exec on multithread process");
            return error!(EINVAL);
        }

        if let Err(err) = self.finish_exec(locked, path, resolved_elf, maybe_set_id) {
            log_warn!("unrecoverable error in exec: {err:?}");

            // The old address space has been replaced; the only safe way out
            // is a fatal signal.
            send_standard_signal(locked, self, SignalInfo::forced(SIGSEGV));
            return Err(err);
        }

        self.ptrace_event(locked, PtraceOptions::TRACEEXEC, self.task.tid as u64);
        self.signal_vfork();
        self.task.thread_group.sync_syscall_log_level();

        Ok(())
    }
1142
    /// After the memory is unmapped, any failure in exec is unrecoverable and results in the
    /// process crashing. This function is for that second half; any error returned from this
    /// function will be considered unrecoverable.
    ///
    /// Steps: notify robust-list futexes, replace the address space, update
    /// credentials and the dumpable policy, load the new executable, reset
    /// register/signal state and exit signals, unshare the fd table, and run
    /// the security module's exec hook.
    fn finish_exec(
        &mut self,
        locked: &mut Locked<Unlocked>,
        path: CString,
        resolved_elf: ResolvedElf,
        mut maybe_set_id: UserAndOrGroupId,
    ) -> Result<(), Errno> {
        // Now that the exec will definitely finish (or crash), notify owners of
        // locked futexes for the current process, which will be impossible to
        // update after process image is replaced.  See get_robust_list(2).
        self.notify_robust_list();

        // Passing arch32 information here ensures the replacement memory
        // layout matches the elf being executed.
        let mm = {
            let mm = self.mm()?;
            let new_mm = mm
                .exec(resolved_elf.file.name.to_passive(), resolved_elf.arch_width)
                .map_err(|status| from_status_like_fdio!(status))?;
            self.mm.update(Some(new_mm.clone()));
            new_mm
        };

        {
            let mut state = self.write();

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   The aforementioned transformations of the effective IDs are not
            //   performed (i.e., the set-user-ID and set-group-ID bits are
            //   ignored) if any of the following is true:
            //
            //   * the no_new_privs attribute is set for the calling thread (see
            //      prctl(2));
            //
            //   *  the underlying filesystem is mounted nosuid (the MS_NOSUID
            //      flag for mount(2)); or
            //
            //   *  the calling process is being ptraced.
            //
            // The MS_NOSUID check is in `NamespaceNode::suid_and_sgid()`.
            if state.no_new_privs() || state.is_ptraced() {
                maybe_set_id.clear();
            }

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   The process's "dumpable" attribute is set to the value 1,
            //   unless a set-user-ID program, a set-group-ID program, or a
            //   program with capabilities is being executed, in which case the
            //   dumpable flag may instead be reset to the value in
            //   /proc/sys/fs/suid_dumpable, in the circumstances described
            //   under PR_SET_DUMPABLE in prctl(2).
            let dumpable =
                if maybe_set_id.is_none() { DumpPolicy::User } else { DumpPolicy::Disable };
            *mm.dumpable.lock(locked) = dumpable;

            // TODO(https://fxbug.dev/433463756): Figure out whether this is the right place to
            // take the lock.
            // SAFETY: this is allowed because we are the CurrentTask.
            let mut writable_creds = unsafe { self.persistent_info.write_creds() };
            // The signal alternate stack and the robust futex list do not
            // survive the exec.
            state.set_sigaltstack(None);
            state.robust_list_head = RobustListHeadPtr::null(self);

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   If a set-user-ID or set-group-ID
            //   program is being executed, then the parent death signal set by
            //   prctl(2) PR_SET_PDEATHSIG flag is cleared.
            //
            // TODO(https://fxbug.dev/356684424): Implement the behavior above once we support
            // the PR_SET_PDEATHSIG flag.

            // TODO(tbodt): Check whether capability xattrs are set on the file, and grant/limit
            // capabilities accordingly.
            let mut new_creds = Credentials::clone(&self.current_creds());
            new_creds.exec(maybe_set_id);
            let new_creds = Arc::new(new_creds);
            writable_creds.update(new_creds.clone());
            *self.current_creds.borrow_mut() = CurrentCreds::Cached(new_creds);
        }

        let security_state = resolved_elf.security_state.clone();

        let start_info = load_executable(self, resolved_elf, &path)?;

        // Initialize the thread's register and extended-processor state from
        // the loaded program's start info.
        let regs: zx_restricted_state_t = start_info.into();
        self.thread_state.registers.load(regs);
        self.thread_state.extended_pstate.reset();
        self.thread_group().signal_actions.reset_for_exec();

        // The exit signal (and that of the children) is reset to SIGCHLD.
        let mut thread_group_state = self.thread_group().write();
        thread_group_state.exit_signal = Some(SIGCHLD);
        for (_, weak_child) in &mut thread_group_state.children {
            if let Some(child) = weak_child.upgrade() {
                let mut child_state = child.write();
                child_state.exit_signal = Some(SIGCHLD);
            }
        }

        // Explicitly release the thread group lock before the remaining steps.
        std::mem::drop(thread_group_state);

        // TODO(https://fxbug.dev/42082680): All threads other than the calling thread are destroyed.

        // TODO: POSIX timers are not preserved.

        // TODO: Ensure that the filesystem context is un-shared, undoing the effect of CLONE_FS.

        // The file descriptor table is unshared, undoing the effect of the CLONE_FILES flag of
        // clone(2).
        self.files.unshare();
        self.files.exec(locked, self);

        // If SELinux is enabled, enforce permissions related to inheritance of file descriptors
        // and resource limits. Then update the current task's SID.
        //
        // TODO: https://fxbug.dev/378655436 - After the above, enforce permissions related to
        // signal state inheritance.
        //
        // This needs to be called after closing any files marked "close-on-exec".
        security::exec_binprm(locked, self, &security_state)?;

        self.thread_group().write().did_exec = true;

        self.set_command_name(TaskCommand::from_path_bytes(path.to_bytes()));

        Ok(())
    }
1275
    /// Sets this task's command name and refreshes the logging context (e.g.
    /// after an exec; see `finish_exec`).
    pub fn set_command_name(&self, new_name: TaskCommand) {
        // set_command_name needs to run before leader_command() in cases where self is the leader.
        self.task.set_command_name(new_name.clone());
        let leader_command = self.thread_group().read().leader_command();
        starnix_logging::set_current_task_info(
            new_name,
            leader_command,
            self.thread_group().leader,
            self.tid,
        );
    }
1287
    /// Installs a new seccomp filter on this task, as for seccomp(2)
    /// `SECCOMP_SET_MODE_FILTER`.
    ///
    /// `code` is the cBPF filter program; `flags` is a bitmask of
    /// `SECCOMP_FILTER_FLAG_*` values. Handles `LOG`, `NEW_LISTENER` (the
    /// listener fd is returned on success), `TSYNC`, and `TSYNC_ESRCH`.
    pub fn add_seccomp_filter(
        &mut self,
        locked: &mut Locked<Unlocked>,
        code: Vec<sock_filter>,
        flags: u32,
    ) -> Result<SyscallResult, Errno> {
        let new_filter = Arc::new(SeccompFilter::from_cbpf(
            &code,
            self.thread_group().next_seccomp_filter_id.add(1),
            flags & SECCOMP_FILTER_FLAG_LOG != 0,
        )?);

        let mut maybe_fd: Option<FdNumber> = None;

        if flags & SECCOMP_FILTER_FLAG_NEW_LISTENER != 0 {
            maybe_fd = Some(SeccompFilterContainer::create_listener(locked, self)?);
        }

        // We take the process lock here because we can't change any of the threads
        // while doing a tsync.  So, you hold the process lock while making any changes.
        let state = self.thread_group().write();

        if flags & SECCOMP_FILTER_FLAG_TSYNC != 0 {
            // TSYNC synchronizes all filters for all threads in the current process to
            // the current thread's

            // We collect the filters for the current task upfront to save us acquiring
            // the task's lock a lot of times below.
            let mut filters: SeccompFilterContainer = self.read().seccomp_filters.clone();

            // For TSYNC to work, all of the other thread filters in this process have to
            // be a prefix of this thread's filters, and none of them can be in
            // strict mode.
            let tasks = state.tasks().collect::<Vec<_>>();
            for task in &tasks {
                if task.tid == self.tid {
                    continue;
                }
                let other_task_state = task.read();

                // Target threads cannot be in SECCOMP_MODE_STRICT
                if task.seccomp_filter_state.get() == SeccompStateValue::Strict {
                    return Self::seccomp_tsync_error(task.tid, flags);
                }

                // Target threads' filters must be a subsequence of this thread's
                if !other_task_state.seccomp_filters.can_sync_to(&filters) {
                    return Self::seccomp_tsync_error(task.tid, flags);
                }
            }

            // Now that we're sure we're allowed to do so, add the filter to all threads.
            filters.add_filter(new_filter, code.len() as u16)?;

            for task in &tasks {
                let mut other_task_state = task.write();

                // Every synchronized thread also gets no_new_privs enabled.
                other_task_state.enable_no_new_privs();
                other_task_state.seccomp_filters = filters.clone();
                task.set_seccomp_state(SeccompStateValue::UserDefined)?;
            }
        } else {
            let mut task_state = self.task.write();

            task_state.seccomp_filters.add_filter(new_filter, code.len() as u16)?;
            self.set_seccomp_state(SeccompStateValue::UserDefined)?;
        }

        // NEW_LISTENER returns the listener fd; otherwise plain success.
        if let Some(fd) = maybe_fd { Ok(fd.into()) } else { Ok(().into()) }
    }
1358
1359    pub fn run_seccomp_filters(
1360        &mut self,
1361        locked: &mut Locked<Unlocked>,
1362        syscall: &Syscall,
1363    ) -> Option<Result<SyscallResult, Errno>> {
1364        // Implementation of SECCOMP_FILTER_STRICT, which has slightly different semantics
1365        // from user-defined seccomp filters.
1366        if self.seccomp_filter_state.get() == SeccompStateValue::Strict {
1367            return SeccompState::do_strict(locked, self, syscall);
1368        }
1369
1370        // Run user-defined seccomp filters
1371        let result = self.task.read().seccomp_filters.run_all(self, syscall);
1372
1373        SeccompState::do_user_defined(locked, result, self, syscall)
1374    }
1375
1376    fn seccomp_tsync_error(id: i32, flags: u32) -> Result<SyscallResult, Errno> {
1377        // By default, TSYNC indicates failure state by returning the first thread
1378        // id not to be able to sync, rather than by returning -1 and setting
1379        // errno.  However, if TSYNC_ESRCH is set, it returns ESRCH.  This
1380        // prevents conflicts with fact that SECCOMP_FILTER_FLAG_NEW_LISTENER
1381        // makes seccomp return an fd.
1382        if flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH != 0 { error!(ESRCH) } else { Ok(id.into()) }
1383    }
1384
1385    // Notify all futexes in robust list.  The robust list is in user space, so we
1386    // are very careful about walking it, and there are a lot of quiet returns if
1387    // we fail to walk it.
1388    // TODO(https://fxbug.dev/42079081): This only sets the FUTEX_OWNER_DIED bit; it does
1389    // not wake up a waiter.
1390    pub fn notify_robust_list(&self) {
1391        let task_state = self.write();
1392        let robust_list_addr = task_state.robust_list_head.addr();
1393        if robust_list_addr == UserAddress::NULL {
1394            // No one has called set_robust_list.
1395            return;
1396        }
1397        let robust_list_res = self.read_multi_arch_object(task_state.robust_list_head);
1398
1399        let head = if let Ok(head) = robust_list_res {
1400            head
1401        } else {
1402            return;
1403        };
1404
1405        let offset = head.futex_offset;
1406
1407        let mut entries_count = 0;
1408        let mut curr_ptr = head.list.next;
1409        while curr_ptr.addr() != robust_list_addr.into() && entries_count < ROBUST_LIST_LIMIT {
1410            let curr_ref = self.read_multi_arch_object(curr_ptr);
1411
1412            let curr = if let Ok(curr) = curr_ref {
1413                curr
1414            } else {
1415                return;
1416            };
1417
1418            let Some(futex_base) = curr_ptr.addr().checked_add_signed(offset) else {
1419                return;
1420            };
1421
1422            let futex_addr = match FutexAddress::try_from(futex_base) {
1423                Ok(addr) => addr,
1424                Err(_) => {
1425                    return;
1426                }
1427            };
1428
1429            let Ok(mm) = self.mm() else {
1430                log_error!("Asked to notify robust list futexes in system task.");
1431                return;
1432            };
1433            let futex = if let Ok(futex) = mm.atomic_load_u32_relaxed(futex_addr) {
1434                futex
1435            } else {
1436                return;
1437            };
1438
1439            if (futex & FUTEX_TID_MASK) as i32 == self.tid {
1440                let owner_died = FUTEX_OWNER_DIED | futex;
1441                if mm.atomic_store_u32_relaxed(futex_addr, owner_died).is_err() {
1442                    return;
1443                }
1444            }
1445            curr_ptr = curr.next;
1446            entries_count += 1;
1447        }
1448    }
1449
1450    /// Returns a ref to this thread's SeccompNotifier.
1451    pub fn get_seccomp_notifier(&mut self) -> Option<SeccompNotifierHandle> {
1452        self.task.write().seccomp_filters.notifier.clone()
1453    }
1454
1455    pub fn set_seccomp_notifier(&mut self, notifier: Option<SeccompNotifierHandle>) {
1456        self.task.write().seccomp_filters.notifier = notifier;
1457    }
1458
    /// Processes a Zircon exception associated with this task.
    ///
    /// Maps each Zircon exception type to an `ExceptionResult`: usually the
    /// signal to deliver (SIGILL, SIGBUS, SIGTRAP, SIGSYS, ...), a page-fault
    /// handler invocation, or `Handled` when no signal is needed.
    pub fn process_exception(
        &self,
        locked: &mut Locked<Unlocked>,
        report: &zx::ExceptionReport,
    ) -> ExceptionResult {
        match report.ty {
            // Architecture-specific decode; unknown causes fall back to SIGILL.
            zx::ExceptionType::General => match get_signal_for_general_exception(&report.arch) {
                Some(sig) => ExceptionResult::Signal(SignalInfo::kernel(sig)),
                None => {
                    log_error!("Unrecognized general exception: {:?}", report);
                    ExceptionResult::Signal(SignalInfo::kernel(SIGILL))
                }
            },
            zx::ExceptionType::FatalPageFault { status } => {
                // Delegate page faults to the task's memory manager. A task
                // without a memory manager must never fault.
                let report = decode_page_fault_exception_report(&report.arch);
                if let Ok(mm) = self.mm() {
                    mm.handle_page_fault(locked, report, status)
                } else {
                    panic!(
                        "system task is handling a major page fault status={:?}, report={:?}",
                        status, report
                    );
                }
            }
            zx::ExceptionType::UndefinedInstruction => {
                ExceptionResult::Signal(SignalInfo::kernel(SIGILL))
            }
            zx::ExceptionType::UnalignedAccess => {
                ExceptionResult::Signal(SignalInfo::kernel(SIGBUS))
            }
            zx::ExceptionType::SoftwareBreakpoint | zx::ExceptionType::HardwareBreakpoint => {
                ExceptionResult::Signal(SignalInfo::kernel(SIGTRAP))
            }
            zx::ExceptionType::ProcessNameChanged => {
                // Informational only; nothing to deliver to the task.
                log_error!("Received unexpected process name changed exception");
                ExceptionResult::Handled
            }
            // Lifecycle exceptions are not expected on this path; treat them
            // (and any unknown exception below) as SIGSYS.
            zx::ExceptionType::ProcessStarting
            | zx::ExceptionType::ThreadStarting
            | zx::ExceptionType::ThreadExiting => {
                log_error!("Received unexpected task lifecycle exception");
                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
            }
            zx::ExceptionType::PolicyError(policy_code) => {
                log_error!(policy_code:?; "Received Zircon policy error exception");
                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
            }
            zx::ExceptionType::UnknownUserGenerated { code, data } => {
                log_error!(code:?, data:?; "Received unexpected unknown user generated exception");
                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
            }
            zx::ExceptionType::Unknown { ty, code, data } => {
                log_error!(ty:?, code:?, data:?; "Received unexpected exception");
                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
            }
        }
    }
1517
    /// Clone this task.
    ///
    /// Creates a new task object that shares some state with this task
    /// according to the given flags.
    ///
    /// Used by the clone() syscall to create both processes and threads.
    ///
    /// The exit signal is broken out from the flags parameter like clone3() rather than being
    /// bitwise-ORed like clone().
    ///
    /// `user_parent_tid`, `user_child_tid` and `user_pidfd` are out-pointers written only when
    /// the corresponding CLONE_PARENT_SETTID, CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID and
    /// CLONE_PIDFD flags are set.
    ///
    /// Returns a `TaskBuilder` for the new child on success; on failure any
    /// partially-constructed child is released via `release_on_error!`.
    pub fn clone_task<L>(
        &self,
        locked: &mut Locked<L>,
        flags: u64,
        child_exit_signal: Option<Signal>,
        user_parent_tid: UserRef<pid_t>,
        user_child_tid: UserRef<pid_t>,
        user_pidfd: UserRef<FdNumber>,
    ) -> Result<TaskBuilder, Errno>
    where
        L: LockBefore<MmDumpable>,
        L: LockBefore<TaskRelease>,
        L: LockBefore<ProcessGroupState>,
    {
        // Flags this implementation actually supports. Valid-but-unimplemented
        // flags produce ENOSYS below; invalid flags produce EINVAL.
        const IMPLEMENTED_FLAGS: u64 = (CLONE_VM
            | CLONE_FS
            | CLONE_FILES
            | CLONE_SIGHAND
            | CLONE_THREAD
            | CLONE_SYSVSEM
            | CLONE_SETTLS
            | CLONE_PARENT
            | CLONE_PARENT_SETTID
            | CLONE_PIDFD
            | CLONE_CHILD_CLEARTID
            | CLONE_CHILD_SETTID
            | CLONE_VFORK
            | CLONE_NEWUTS
            | CLONE_PTRACE) as u64;

        // A mask with all valid flags set, because we want to return a different error code for an
        // invalid flag vs an unimplemented flag. Subtracting 1 from the largest valid flag gives a
        // mask with all flags below it set. Shift up by one to make sure the largest flag is also
        // set.
        const VALID_FLAGS: u64 = (CLONE_INTO_CGROUP << 1) - 1;

        // CLONE_SETTLS is implemented by sys_clone.

        // Decode each flag bit once; the `as u64` casts widen the narrower
        // CLONE_* constants to match `flags`.
        let clone_files = flags & (CLONE_FILES as u64) != 0;
        let clone_fs = flags & (CLONE_FS as u64) != 0;
        let clone_parent = flags & (CLONE_PARENT as u64) != 0;
        let clone_parent_settid = flags & (CLONE_PARENT_SETTID as u64) != 0;
        let clone_pidfd = flags & (CLONE_PIDFD as u64) != 0;
        let clone_child_cleartid = flags & (CLONE_CHILD_CLEARTID as u64) != 0;
        let clone_child_settid = flags & (CLONE_CHILD_SETTID as u64) != 0;
        let clone_sysvsem = flags & (CLONE_SYSVSEM as u64) != 0;
        let clone_ptrace = flags & (CLONE_PTRACE as u64) != 0;
        let clone_thread = flags & (CLONE_THREAD as u64) != 0;
        let clone_vm = flags & (CLONE_VM as u64) != 0;
        let clone_sighand = flags & (CLONE_SIGHAND as u64) != 0;
        let clone_vfork = flags & (CLONE_VFORK as u64) != 0;
        let clone_newuts = flags & (CLONE_NEWUTS as u64) != 0;
        let clone_into_cgroup = flags & CLONE_INTO_CGROUP != 0;

        if clone_ptrace {
            track_stub!(TODO("https://fxbug.dev/322874630"), "CLONE_PTRACE");
        }

        if clone_sysvsem {
            track_stub!(TODO("https://fxbug.dev/322875185"), "CLONE_SYSVSEM");
        }

        if clone_into_cgroup {
            track_stub!(TODO("https://fxbug.dev/403612570"), "CLONE_INTO_CGROUP");
        }

        // Flag combinations rejected here: shared signal handlers require a
        // shared address space, and threads require shared signal handlers.
        if clone_sighand && !clone_vm {
            return error!(EINVAL);
        }
        if clone_thread && !clone_sighand {
            return error!(EINVAL);
        }

        if clone_pidfd && clone_thread {
            return error!(EINVAL);
        }
        if clone_pidfd && clone_parent_settid && user_parent_tid.addr() == user_pidfd.addr() {
            // `clone()` uses the same out-argument for these, so error out if they have the same
            // user address.
            return error!(EINVAL);
        }

        if flags & !VALID_FLAGS != 0 {
            return error!(EINVAL);
        }

        if clone_vm && !clone_thread {
            // TODO(https://fxbug.dev/42066087) Implement CLONE_VM for child processes (not just child
            // threads). Currently this executes CLONE_VM (explicitly passed to clone() or as
            // used by vfork()) as a fork (the VM in the child is copy-on-write) which is almost
            // always OK.
            //
            // CLONE_VM is primarily as an optimization to avoid making a copy-on-write version of a
            // process' VM that will be immediately replaced with a call to exec(). The main users
            // (libc and language runtimes) don't actually rely on the memory being shared between
            // the two processes. And the vfork() man page explicitly allows vfork() to be
            // implemented as fork() which is what we do here.
            if !clone_vfork {
                track_stub!(
                    TODO("https://fxbug.dev/322875227"),
                    "CLONE_VM without CLONE_THREAD or CLONE_VFORK"
                );
            }
        } else if clone_thread && !clone_vm {
            track_stub!(TODO("https://fxbug.dev/322875167"), "CLONE_THREAD without CLONE_VM");
            return error!(ENOSYS);
        }

        if flags & !IMPLEMENTED_FLAGS != 0 {
            track_stub!(
                TODO("https://fxbug.dev/322875130"),
                "clone unknown flags",
                flags & !IMPLEMENTED_FLAGS
            );
            return error!(ENOSYS);
        }

        // Share or fork the filesystem context and file table per the flags.
        let fs = if clone_fs { self.fs() } else { self.fs().fork() };
        let files = if clone_files { self.files.clone() } else { self.files.fork() };

        let kernel = self.kernel();

        let mut pids = kernel.pids.write();

        // Lock the cgroup process hierarchy so that the parent process cannot move to a different
        // cgroup while a new task or thread_group is created. This may be unnecessary if
        // CLONE_INTO_CGROUP is implemented and passed in.
        let mut cgroup2_pid_table = kernel.cgroups.lock_cgroup2_pid_table();
        // Create a `KernelSignal::Freeze` to put onto the new task, if the cgroup is frozen.
        let child_kernel_signals = cgroup2_pid_table
            .maybe_create_freeze_signal(self.thread_group())
            .into_iter()
            .collect::<VecDeque<_>>();

        // State captured from the parent under the locks below, then used to
        // build the child after those locks are dropped.
        let pid;
        let command;
        let creds;
        let scheduler_state;
        let no_new_privs;
        let seccomp_filters;
        // The child starts with a null robust futex list pointer.
        let robust_list_head = RobustListHeadPtr::null(self);
        let child_signal_mask;
        let timerslack_ns;
        let uts_ns;

        let TaskInfo { thread, thread_group, memory_manager } = {
            // These variables hold the original parent in case we need to switch the parent of the
            // new task because of CLONE_PARENT.
            let weak_original_parent;
            let original_parent;

            // Make sure to drop these locks ASAP to avoid inversion
            let thread_group_state = {
                let thread_group_state = self.thread_group().write();
                if clone_parent {
                    // With the CLONE_PARENT flag, the parent of the new task is our parent
                    // instead of ourselves.
                    weak_original_parent =
                        thread_group_state.parent.clone().ok_or_else(|| errno!(EINVAL))?;
                    std::mem::drop(thread_group_state);
                    original_parent = weak_original_parent.upgrade();
                    original_parent.write()
                } else {
                    thread_group_state
                }
            };

            let state = self.read();

            no_new_privs = state.no_new_privs();
            seccomp_filters = state.seccomp_filters.clone();
            child_signal_mask = state.signal_mask();

            pid = pids.allocate_pid();
            command = self.command();
            creds = self.current_creds().clone();
            scheduler_state = state.scheduler_state.fork();
            timerslack_ns = state.timerslack_ns;

            // A fresh UTS namespace requires CAP_SYS_ADMIN; otherwise the
            // namespace object is shared with the parent.
            uts_ns = if clone_newuts {
                security::check_task_capable(self, CAP_SYS_ADMIN)?;
                state.uts_ns.read().fork()
            } else {
                state.uts_ns.clone()
            };

            if clone_thread {
                TaskInfo {
                    thread: None,
                    thread_group: self.thread_group().clone(),
                    memory_manager: self.mm().ok(),
                }
            } else {
                // Drop the lock on this task before entering `create_zircon_process`, because it will
                // take a lock on the new thread group, and locks on thread groups have a higher
                // priority than locks on the task in the thread group.
                std::mem::drop(state);
                let signal_actions = if clone_sighand {
                    self.thread_group().signal_actions.clone()
                } else {
                    self.thread_group().signal_actions.fork()
                };
                let process_group = thread_group_state.process_group.clone();

                let task_info = create_zircon_process(
                    locked,
                    kernel,
                    Some(thread_group_state),
                    pid,
                    child_exit_signal,
                    process_group,
                    signal_actions,
                    command.clone(),
                    self.thread_state.arch_width(),
                )?;

                cgroup2_pid_table.inherit_cgroup(self.thread_group(), &task_info.thread_group);

                task_info
            }
        };

        // Drop the lock on the cgroup pid_table before creating the TaskBuilder.
        // If the TaskBuilder creation fails, the TaskBuilder is dropped, which calls
        // ThreadGroup::remove. ThreadGroup::remove takes the cgroup pid_table lock, causing
        // a cyclic lock dependency.
        std::mem::drop(cgroup2_pid_table);

        // Only create the vfork event when the caller requested CLONE_VFORK.
        let vfork_event = if clone_vfork { Some(Arc::new(zx::Event::create())) } else { None };

        let mut child = TaskBuilder::new(Task::new(
            pid,
            command,
            thread_group,
            thread,
            files,
            memory_manager,
            fs,
            creds,
            self.abstract_socket_namespace.clone(),
            self.abstract_vsock_namespace.clone(),
            child_signal_mask,
            child_kernel_signals,
            vfork_event,
            scheduler_state,
            uts_ns,
            no_new_privs,
            SeccompState::from(&self.seccomp_filter_state),
            seccomp_filters,
            robust_list_head,
            timerslack_ns,
        ));

        release_on_error!(child, locked, {
            let child_task = TempRef::from(&child.task);
            // Drop the pids lock as soon as possible after creating the child. Destroying the child
            // and removing it from the pids table itself requires the pids lock, so if an early exit
            // takes place we have a self deadlock.
            pids.add_task(&child_task);
            std::mem::drop(pids);

            // Child lock must be taken before this lock. Drop the lock on the task, take a writable
            // lock on the child and take the current state back.

            #[cfg(any(test, debug_assertions))]
            {
                // Take the lock on the thread group and its child in the correct order to ensure any wrong ordering
                // will trigger the tracing-mutex at the right call site.
                if !clone_thread {
                    let _l1 = self.thread_group().read();
                    let _l2 = child.thread_group().read();
                }
            }

            if clone_thread {
                self.thread_group().add(&child_task)?;
            } else {
                child.thread_group().add(&child_task)?;

                // These manipulations of the signal handling state appear to be related to
                // CLONE_SIGHAND and CLONE_VM rather than CLONE_THREAD. However, we do not support
                // all the combinations of these flags, which means doing these operations here
                // might actually be correct. However, if you find a test that fails because of the
                // placement of this logic here, we might need to move it.
                let mut child_state = child.write();
                let state = self.read();
                child_state.set_sigaltstack(state.sigaltstack());
                child_state.set_signal_mask(state.signal_mask());
            }

            if !clone_vm {
                // We do not support running threads in the same process with different
                // MemoryManagers.
                assert!(!clone_thread);
                self.mm()?.snapshot_to(locked, &child.mm()?)?;
            }

            if clone_parent_settid {
                self.write_object(user_parent_tid, &child.tid)?;
            }

            if clone_child_cleartid {
                child.write().clear_child_tid = user_child_tid;
            }

            if clone_child_settid {
                child.write_object(user_child_tid, &child.tid)?;
            }

            if clone_pidfd {
                let locked = locked.cast_locked::<TaskRelease>();
                let file = new_pidfd(
                    locked,
                    self,
                    child.thread_group(),
                    &*child.mm()?,
                    OpenFlags::empty(),
                );
                // The pidfd is installed in the parent's file table with
                // CLOEXEC set, and its number is written to `user_pidfd`.
                let pidfd = self.add_file(locked, file, FdFlags::CLOEXEC)?;
                self.write_object(user_pidfd, &pidfd)?;
            }

            // TODO(https://fxbug.dev/42066087): We do not support running different processes with
            // the same MemoryManager. Instead, we implement a rough approximation of that behavior
            // by making a copy-on-write clone of the memory from the original process.
            if clone_vm && !clone_thread {
                self.mm()?.snapshot_to(locked, &child.mm()?)?;
            }

            // Give the child a copy of this task's register state.
            child.thread_state = self.thread_state.snapshot::<HeapRegs>();
            Ok(())
        });

        // Take the lock on thread group and task in the correct order to ensure any wrong ordering
        // will trigger the tracing-mutex at the right call site.
        #[cfg(any(test, debug_assertions))]
        {
            let _l1 = child.thread_group().read();
            let _l2 = child.read();
        }

        Ok(child)
    }
1871
1872    /// Sets the stop state (per set_stopped), and also notifies all listeners,
1873    /// including the parent process and the tracer if appropriate.
1874    pub fn set_stopped_and_notify(&self, stopped: StopState, siginfo: Option<SignalInfo>) {
1875        let maybe_signal_info = {
1876            let mut state = self.write();
1877            state.copy_state_from(self);
1878            state.set_stopped(stopped, siginfo, Some(self), None);
1879            state.prepare_signal_info(stopped)
1880        };
1881
1882        if let Some((tracer, signal_info)) = maybe_signal_info {
1883            if let Some(tracer) = tracer.upgrade() {
1884                tracer.write().send_signal(signal_info);
1885            }
1886        }
1887
1888        if !stopped.is_in_progress() {
1889            let parent = self.thread_group().read().parent.clone();
1890            if let Some(parent) = parent {
1891                parent
1892                    .upgrade()
1893                    .write()
1894                    .lifecycle_waiters
1895                    .notify_value(ThreadGroupLifecycleWaitValue::ChildStatus);
1896            }
1897        }
1898    }
1899
1900    /// If the task is stopping, set it as stopped. return whether the caller
1901    /// should stop.  The task might also be waking up.
1902    pub fn finalize_stop_state(&mut self) -> bool {
1903        let stopped = self.load_stopped();
1904
1905        if !stopped.is_stopping_or_stopped() {
1906            // If we are waking up, potentially write back state a tracer may have modified.
1907            let captured_state = self.write().take_captured_state();
1908            if let Some(captured) = captured_state {
1909                if captured.dirty {
1910                    self.thread_state.replace_registers(&captured.thread_state);
1911                }
1912            }
1913        }
1914
1915        // Stopping because the thread group is stopping.
1916        // Try to flip to GroupStopped - will fail if we shouldn't.
1917        if self.thread_group().set_stopped(StopState::GroupStopped, None, true)
1918            == StopState::GroupStopped
1919        {
1920            let signal = self.thread_group().read().last_signal.clone();
1921            // stopping because the thread group has stopped
1922            let event = Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0));
1923            self.write().set_stopped(StopState::GroupStopped, signal, Some(self), event);
1924            return true;
1925        }
1926
1927        // Stopping because the task is stopping
1928        if stopped.is_stopping_or_stopped() {
1929            if let Ok(stopped) = stopped.finalize() {
1930                self.set_stopped_and_notify(stopped, None);
1931            }
1932            return true;
1933        }
1934
1935        false
1936    }
1937
    /// Block the execution of `current_task` as long as the task is stopped and
    /// not terminated.
    pub fn block_while_stopped(&mut self, locked: &mut Locked<Unlocked>) {
        // Upgrade the state from stopping to stopped if needed. Return if the task
        // should not be stopped.
        if !self.finalize_stop_state() {
            return;
        }

        // Wait with signals ignored: a stopped task is resumed via stop-state
        // changes rather than ordinary signal delivery.
        let waiter = Waiter::with_options(WaiterOptions::IGNORE_SIGNALS);
        loop {
            // If we've exited, unstop the threads and return without notifying
            // waiters.
            if self.is_exitted() {
                self.thread_group().set_stopped(StopState::ForceAwake, None, false);
                self.write().set_stopped(StopState::ForceAwake, None, Some(self), None);
                return;
            }

            // Returns true once the task is awake again; otherwise fall
            // through to the blocking wait below.
            if self.wake_or_wait_until_unstopped_async(&waiter) {
                return;
            }

            // Do the wait. Result is not needed, as this is not in a syscall.
            let _: Result<(), Errno> = waiter.wait(locked, self);

            // Maybe go from stopping to stopped, if we are currently stopping
            // again.
            self.finalize_stop_state();
        }
    }
1969
1970    /// For traced tasks, this will return the data neceessary for a cloned task
1971    /// to attach to the same tracer.
1972    pub fn get_ptrace_core_state_for_clone(
1973        &mut self,
1974        clone_args: &clone_args,
1975    ) -> (PtraceOptions, Option<PtraceCoreState>) {
1976        let state = self.write();
1977        if let Some(ptrace) = &state.ptrace {
1978            ptrace.get_core_state_for_clone(clone_args)
1979        } else {
1980            (PtraceOptions::empty(), None)
1981        }
1982    }
1983
    /// If currently being ptraced with the given option, emit the appropriate
    /// event.  PTRACE_EVENTMSG will return the given message.  Also emits the
    /// appropriate event for execve in the absence of TRACEEXEC.
    ///
    /// Note that the Linux kernel has a documented bug where, if TRACEEXIT is
    /// enabled, SIGKILL will trigger an event.  We do not exhibit this
    /// behavior.
    pub fn ptrace_event(
        &mut self,
        locked: &mut Locked<Unlocked>,
        trace_kind: PtraceOptions,
        msg: u64,
    ) {
        if !trace_kind.is_empty() {
            // Inner scope: the task state write lock must be released before
            // `block_while_stopped` is called below.
            {
                let mut state = self.write();
                if let Some(ptrace) = &mut state.ptrace {
                    if !ptrace.has_option(trace_kind) {
                        // If this would be a TRACEEXEC, but TRACEEXEC is not
                        // turned on, then send a SIGTRAP.
                        if trace_kind == PtraceOptions::TRACEEXEC && !ptrace.is_seized() {
                            // Send a SIGTRAP so that the parent can gain control.
                            // Note: `state` (the write guard) is moved into the call.
                            send_signal_first(locked, self, state, SignalInfo::kernel(SIGTRAP));
                        }

                        return;
                    }
                    // Encode the ptrace event number in the high byte of the
                    // SIGTRAP status, matching the ptrace(2) event-stop format.
                    let ptrace_event = PtraceEvent::from_option(&trace_kind) as u32;
                    let siginfo = SignalInfo::with_detail(
                        SIGTRAP,
                        ((ptrace_event << 8) | SIGTRAP.number()) as i32,
                        SignalDetail::None,
                    );
                    state.set_stopped(
                        StopState::PtraceEventStopping,
                        Some(siginfo),
                        None,
                        Some(PtraceEventData::new(trace_kind, msg)),
                    );
                } else {
                    // Not traced: nothing to report.
                    return;
                }
            }
            // Remain stopped until the tracer resumes us.
            self.block_while_stopped(locked);
        }
    }
2030
2031    /// Causes the current thread's thread group to exit, notifying any ptracer
2032    /// of this task first.
2033    pub fn thread_group_exit(&mut self, locked: &mut Locked<Unlocked>, exit_status: ExitStatus) {
2034        self.ptrace_event(
2035            locked,
2036            PtraceOptions::TRACEEXIT,
2037            exit_status.signal_info_status() as u64,
2038        );
2039        self.thread_group().exit(locked, exit_status, None);
2040    }
2041
2042    /// The flags indicates only the flags as in clone3(), and does not use the low 8 bits for the
2043    /// exit signal as in clone().
2044    pub fn clone_task_for_test<L>(
2045        &self,
2046        locked: &mut Locked<L>,
2047        flags: u64,
2048        exit_signal: Option<Signal>,
2049    ) -> crate::testing::AutoReleasableTask
2050    where
2051        L: LockBefore<MmDumpable>,
2052        L: LockBefore<TaskRelease>,
2053        L: LockBefore<ProcessGroupState>,
2054    {
2055        let result = self
2056            .clone_task(
2057                locked,
2058                flags,
2059                exit_signal,
2060                UserRef::default(),
2061                UserRef::default(),
2062                UserRef::default(),
2063            )
2064            .expect("failed to create task in test");
2065
2066        result.into()
2067    }
2068
2069    // See "Ptrace access mode checking" in https://man7.org/linux/man-pages/man2/ptrace.2.html
2070    pub fn check_ptrace_access_mode<L>(
2071        &self,
2072        locked: &mut Locked<L>,
2073        mode: PtraceAccessMode,
2074        target: &Task,
2075    ) -> Result<(), Errno>
2076    where
2077        L: LockBefore<MmDumpable>,
2078    {
2079        // (1)  If the calling thread and the target thread are in the same
2080        //      thread group, access is always allowed.
2081        if self.thread_group().leader == target.thread_group().leader {
2082            return Ok(());
2083        }
2084
2085        // (2)  If the access mode specifies PTRACE_MODE_FSCREDS, then, for
2086        //      the check in the next step, employ the caller's filesystem
2087        //      UID and GID.  (As noted in credentials(7), the filesystem
2088        //      UID and GID almost always have the same values as the
2089        //      corresponding effective IDs.)
2090        //
2091        //      Otherwise, the access mode specifies PTRACE_MODE_REALCREDS,
2092        //      so use the caller's real UID and GID for the checks in the
2093        //      next step.  (Most APIs that check the caller's UID and GID
2094        //      use the effective IDs.  For historical reasons, the
2095        //      PTRACE_MODE_REALCREDS check uses the real IDs instead.)
2096        let (uid, gid) = if mode.contains(PTRACE_MODE_FSCREDS) {
2097            let fscred = self.current_creds().as_fscred();
2098            (fscred.uid, fscred.gid)
2099        } else if mode.contains(PTRACE_MODE_REALCREDS) {
2100            let creds = self.current_creds();
2101            (creds.uid, creds.gid)
2102        } else {
2103            unreachable!();
2104        };
2105
2106        // (3)  Deny access if neither of the following is true:
2107        //
2108        //      -  The real, effective, and saved-set user IDs of the target
2109        //         match the caller's user ID, and the real, effective, and
2110        //         saved-set group IDs of the target match the caller's
2111        //         group ID.
2112        //
2113        //      -  The caller has the CAP_SYS_PTRACE capability in the user
2114        //         namespace of the target.
2115        let target_creds = target.real_creds();
2116        if !(target_creds.uid == uid
2117            && target_creds.euid == uid
2118            && target_creds.saved_uid == uid
2119            && target_creds.gid == gid
2120            && target_creds.egid == gid
2121            && target_creds.saved_gid == gid)
2122        {
2123            security::check_task_capable(self, CAP_SYS_PTRACE)?;
2124        }
2125
2126        // (4)  Deny access if the target process "dumpable" attribute has a
2127        //      value other than 1 (SUID_DUMP_USER; see the discussion of
2128        //      PR_SET_DUMPABLE in prctl(2)), and the caller does not have
2129        //      the CAP_SYS_PTRACE capability in the user namespace of the
2130        //      target process.
2131        let dumpable = *target.mm()?.dumpable.lock(locked);
2132        match dumpable {
2133            DumpPolicy::User => (),
2134            DumpPolicy::Disable => security::check_task_capable(self, CAP_SYS_PTRACE)?,
2135        }
2136
2137        // (5)  The kernel LSM security_ptrace_access_check() interface is
2138        //      invoked to see if ptrace access is permitted.
2139        security::ptrace_access_check(self, target, mode)?;
2140
2141        // (6)  If access has not been denied by any of the preceding steps,
2142        //      then access is allowed.
2143        Ok(())
2144    }
2145
2146    pub fn can_signal(
2147        &self,
2148        target: &Task,
2149        unchecked_signal: UncheckedSignal,
2150    ) -> Result<(), Errno> {
2151        // If both the tasks share a thread group the signal can be sent. This is not documented
2152        // in kill(2) because kill does not support task-level granularity in signal sending.
2153        if self.thread_group == target.thread_group {
2154            return Ok(());
2155        }
2156
2157        let self_creds = self.current_creds();
2158        let target_creds = target.real_creds();
2159        // From https://man7.org/linux/man-pages/man2/kill.2.html:
2160        //
2161        // > For a process to have permission to send a signal, it must either be
2162        // > privileged (under Linux: have the CAP_KILL capability in the user
2163        // > namespace of the target process), or the real or effective user ID of
2164        // > the sending process must equal the real or saved set- user-ID of the
2165        // > target process.
2166        //
2167        // Returns true if the credentials are considered to have the same user ID.
2168        if self_creds.euid == target_creds.saved_uid
2169            || self_creds.euid == target_creds.uid
2170            || self_creds.uid == target_creds.uid
2171            || self_creds.uid == target_creds.saved_uid
2172        {
2173            return Ok(());
2174        }
2175
2176        if Signal::try_from(unchecked_signal) == Ok(SIGCONT) {
2177            let target_session = target.thread_group().read().process_group.session.leader;
2178            let self_session = self.thread_group().read().process_group.session.leader;
2179            if target_session == self_session {
2180                return Ok(());
2181            }
2182        }
2183
2184        security::check_task_capable(self, CAP_KILL)
2185    }
2186}
2187
impl ArchSpecific for CurrentTask {
    // The architecture width is tracked by the thread state, so delegate.
    fn is_arch32(&self) -> bool {
        self.thread_state.is_arch32()
    }
}
2193
// Memory access for the currently running task. Every operation delegates to
// the task's `MemoryManager` through the `unified_*` entry points, passing
// `self` along; fails with the errno from `mm()` if the task has no memory
// manager.
impl MemoryAccessor for CurrentTask {
    fn read_memory<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.mm()?.unified_read_memory(self, addr, bytes)
    }

    fn read_memory_partial_until_null_byte<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.mm()?.unified_read_memory_partial_until_null_byte(self, addr, bytes)
    }

    fn read_memory_partial<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.mm()?.unified_read_memory_partial(self, addr, bytes)
    }

    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
        self.mm()?.unified_write_memory(self, addr, bytes)
    }

    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
        self.mm()?.unified_write_memory_partial(self, addr, bytes)
    }

    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
        self.mm()?.unified_zero(self, addr, length)
    }
}
2231
impl TaskMemoryAccessor for CurrentTask {
    // Returns the highest valid user address from the task's memory manager,
    // or `None` when `mm()` is unavailable.
    fn maximum_valid_address(&self) -> Option<UserAddress> {
        self.mm().ok().map(|mm| mm.maximum_valid_user_address)
    }
}
2237
/// The outcome of processing a Zircon exception for a task.
pub enum ExceptionResult {
    /// The exception was handled and no further action is required.
    Handled,

    /// The exception generated a signal that should be delivered.
    Signal(SignalInfo),
}
2245
#[cfg(test)]
mod tests {
    use crate::testing::spawn_kernel_and_run;
    use starnix_uapi::auth::Credentials;

    // This test will run `override_creds` and check it doesn't crash. This ensures that the
    // delegation to `override_creds_async` is correct.
    #[::fuchsia::test]
    async fn test_override_creds_can_delegate_to_async_version() {
        spawn_kernel_and_run(async move |_, current_task| {
            // The closure's return value should pass through unchanged.
            assert_eq!(current_task.override_creds(Credentials::root(), || 0), 0);
        })
        .await;
    }
}