starnix_core/task/
current_task.rs

1// Copyright 2023 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::arch::task::{decode_page_fault_exception_report, get_signal_for_general_exception};
6use crate::execution::{TaskInfo, create_zircon_process};
7use crate::mm::{DumpPolicy, MemoryAccessor, MemoryAccessorExt, TaskMemoryAccessor};
8use crate::ptrace::{PtraceCoreState, PtraceEvent, PtraceEventData, PtraceOptions, StopState};
9use crate::security;
10use crate::signals::{RunState, SignalInfo, send_signal_first, send_standard_signal};
11use crate::task::loader::{ResolvedElf, load_executable, resolve_executable};
12use crate::task::waiter::WaiterOptions;
13use crate::task::{
14    ExitStatus, RobustListHeadPtr, SeccompFilter, SeccompFilterContainer, SeccompNotifierHandle,
15    SeccompState, SeccompStateValue, Task, TaskFlags, Waiter,
16};
17use crate::vfs::{
18    CheckAccessReason, FdFlags, FdNumber, FileHandle, FsStr, LookupContext, MAX_SYMLINK_FOLLOWS,
19    NamespaceNode, ResolveBase, SymlinkMode, SymlinkTarget, new_pidfd,
20};
21use extended_pstate::ExtendedPstateState;
22use futures::FutureExt;
23use linux_uapi::CLONE_PIDFD;
24use starnix_logging::{log_error, log_warn, track_file_not_found, track_stub};
25use starnix_registers::{HeapRegs, RegisterState, RegisterStorage, RegisterStorageEnum};
26use starnix_stack::clean_stack;
27use starnix_sync::{
28    EventWaitGuard, FileOpsCore, LockBefore, LockEqualOrBefore, Locked, MmDumpable,
29    ProcessGroupState, TaskRelease, Unlocked, WakeReason,
30};
31use starnix_syscalls::SyscallResult;
32use starnix_syscalls::decls::Syscall;
33use starnix_task_command::TaskCommand;
34use starnix_types::arch::ArchWidth;
35use starnix_types::futex_address::FutexAddress;
36use starnix_types::ownership::{OwnedRef, Releasable, TempRef, WeakRef, release_on_error};
37use starnix_uapi::auth::{
38    CAP_KILL, CAP_SYS_ADMIN, CAP_SYS_PTRACE, Credentials, FsCred, PTRACE_MODE_FSCREDS,
39    PTRACE_MODE_REALCREDS, PtraceAccessMode, UserAndOrGroupId,
40};
41use starnix_uapi::device_type::DeviceType;
42use starnix_uapi::errors::{Errno, ErrnoCode};
43use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
44use starnix_uapi::open_flags::OpenFlags;
45use starnix_uapi::signals::{
46    SIGBUS, SIGCHLD, SIGCONT, SIGILL, SIGKILL, SIGSEGV, SIGSYS, SIGTRAP, SigSet, Signal,
47    UncheckedSignal,
48};
49use starnix_uapi::user_address::{ArchSpecific, UserAddress, UserRef};
50use starnix_uapi::vfs::ResolveFlags;
51use starnix_uapi::{
52    CLONE_CHILD_CLEARTID, CLONE_CHILD_SETTID, CLONE_FILES, CLONE_FS, CLONE_INTO_CGROUP,
53    CLONE_NEWUTS, CLONE_PARENT, CLONE_PARENT_SETTID, CLONE_PTRACE, CLONE_SETTLS, CLONE_SIGHAND,
54    CLONE_SYSVSEM, CLONE_THREAD, CLONE_VFORK, CLONE_VM, FUTEX_OWNER_DIED, FUTEX_TID_MASK,
55    ROBUST_LIST_LIMIT, SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_NEW_LISTENER,
56    SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_TSYNC_ESRCH, SI_KERNEL, clone_args, errno,
57    error, from_status_like_fdio, pid_t, sock_filter, ucred,
58};
59use std::cell::{Ref, RefCell};
60use std::collections::VecDeque;
61use std::ffi::CString;
62use std::fmt;
63use std::marker::PhantomData;
64use std::mem::MaybeUninit;
65use std::sync::Arc;
66use zx::sys::zx_restricted_state_t;
67
68use super::ThreadGroupLifecycleWaitValue;
69
/// Wrapper around a newly created [`Task`] plus the thread state used to start it,
/// before it becomes the [`CurrentTask`] of an executing thread.
pub struct TaskBuilder {
    /// The underlying task object.
    pub task: OwnedRef<Task>,

    /// Thread-local register/syscall state staged for the task, kept in
    /// heap-backed register storage (`HeapRegs`) until execution starts.
    pub thread_state: ThreadState<HeapRegs>,
}
76
77impl TaskBuilder {
78    pub fn new(task: OwnedRef<Task>) -> Self {
79        Self { task, thread_state: Default::default() }
80    }
81
82    #[inline(always)]
83    pub fn release<L>(self, locked: &mut Locked<L>)
84    where
85        L: LockBefore<TaskRelease>,
86    {
87        let locked = locked.cast_locked::<TaskRelease>();
88        Releasable::release(self, locked);
89    }
90}
91
92impl From<TaskBuilder> for CurrentTask {
93    fn from(builder: TaskBuilder) -> Self {
94        Self::new(builder.task, builder.thread_state.into())
95    }
96}
97
impl Releasable for TaskBuilder {
    type Context<'a> = &'a mut Locked<TaskRelease>;

    /// Tears down a task that never started running: detaches it from its
    /// thread group, then releases the underlying `Task`.
    fn release<'a>(self, locked: Self::Context<'a>) {
        let kernel = Arc::clone(self.kernel());
        // The pid table write lock is held across both the thread-group removal
        // and the task release below.
        let mut pids = kernel.pids.write();

        // We remove from the thread group here because the WeakRef in the pid
        // table to this task must be valid until this task is removed from the
        // thread group, and the code below will invalidate it.
        // Moreover, this requires a OwnedRef of the task to ensure the tasks of
        // the thread group are always valid.
        self.task.thread_group().remove(locked, &mut pids, &self.task);

        // `Task::release` consumes the staged thread state along with the lock
        // token and the pid table guard.
        let context = (self.thread_state.into(), locked, pids);
        self.task.release(context);
    }
}
116
117impl std::ops::Deref for TaskBuilder {
118    type Target = Task;
119    fn deref(&self) -> &Self::Target {
120        &self.task
121    }
122}
123
/// Task permissions are determined from the task's credentials and, if enabled,
/// from its SEStarnix security state.
#[derive(Debug, Clone)]
pub struct FullCredentials {
    /// The POSIX credentials (e.g. uid/gid) of the task.
    pub creds: Arc<Credentials>,
    /// The security-module (SEStarnix) state of the task.
    pub security_state: security::TaskState,
}
131
132impl FullCredentials {
133    pub fn for_kernel() -> Self {
134        Self { creds: Credentials::root(), security_state: security::task_alloc_for_kernel() }
135    }
136}
137
/// The task object associated with the currently executing thread.
///
/// We often pass the `CurrentTask` as the first argument to functions if those functions need to
/// know contextual information about the thread on which they are running. For example, we often
/// use the `CurrentTask` to perform access checks, which ensures that the caller is authorized to
/// perform the requested operation.
///
/// The `CurrentTask` also has state that can be referenced only on the currently executing thread,
/// such as the register state for that thread. Syscalls are given a mutable reference to the
/// `CurrentTask`, which lets them manipulate this state.
///
/// See also `Task` for more information about tasks.
pub struct CurrentTask {
    /// The underlying task object.
    pub task: OwnedRef<Task>,

    /// Register, restart, and architecture-width state for the thread running this task.
    pub thread_state: ThreadState<RegisterStorageEnum>,

    /// The current subjective credentials of the task.
    // TODO(https://fxbug.dev/433548348): Avoid interior mutability here by passing a
    // &mut CurrentTask around instead of &CurrentTask.
    pub current_creds: RefCell<CurrentCreds>,

    /// Makes CurrentTask neither Sync nor Send, so it cannot escape the thread it represents.
    _local_marker: PhantomData<*mut u8>,
}
164
/// Represents the current state of the task's subjective credentials.
pub enum CurrentCreds {
    /// The task does not have overridden credentials: the subjective creds are identical to the
    /// objective creds. Since credentials are often accessed from the current task, we hold a
    /// reference here that does not necessitate going through the Rcu machinery to read.
    /// The subjective security state is stored on the Task.
    Cached(Arc<Credentials>),
    /// The task has overridden credentials, with the given credentials and security state.
    /// Installed/removed by `override_creds_async` and queried via `has_overridden_creds`.
    // TODO(https://fxbug.dev/433463756): TaskState will soon move into Credentials.
    Overridden(Arc<Credentials>, security::TaskState),
}
176
177impl CurrentCreds {
178    fn creds(&self) -> &Arc<Credentials> {
179        match self {
180            CurrentCreds::Cached(creds) => creds,
181            CurrentCreds::Overridden(creds, _) => creds,
182        }
183    }
184}
185
/// The thread related information of a `CurrentTask`. The information should never be used outside
/// of the thread owning the `CurrentTask`.
#[derive(Default)]
pub struct ThreadState<T: RegisterStorage> {
    /// A copy of the registers associated with the Zircon thread. Up-to-date values can be read
    /// from `self.handle.read_state_general_regs()`. To write these values back to the thread, call
    /// `self.handle.write_state_general_regs(self.thread_state.registers.into())`.
    pub registers: RegisterState<T>,

    /// Copy of the current extended processor state including floating point and vector registers.
    pub extended_pstate: ExtendedPstateState,

    /// The errno code (if any) that indicated this task should restart a syscall.
    pub restart_code: Option<ErrnoCode>,

    /// A custom function to resume a syscall that has been interrupted by SIGSTOP.
    /// To use, call set_syscall_restart_func and return ERESTART_RESTARTBLOCK. sys_restart_syscall
    /// will eventually call it.
    pub syscall_restart_func: Option<Box<SyscallRestartFunc>>,

    /// An architecture agnostic enum indicating the width (32 or 64 bits) of the execution
    /// environment in use.
    pub arch_width: ArchWidth,
}
210
211impl<T: RegisterStorage> ThreadState<T> {
212    /// Returns a new `ThreadState` with the same `registers` as this one.
213    fn snapshot<R: RegisterStorage>(&self) -> ThreadState<R>
214    where
215        RegisterState<R>: From<RegisterState<T>>,
216    {
217        ThreadState::<R> {
218            registers: self.registers.clone().into(),
219            extended_pstate: Default::default(),
220            restart_code: self.restart_code,
221            syscall_restart_func: None,
222            arch_width: self.arch_width,
223        }
224    }
225
226    pub fn extended_snapshot<R: RegisterStorage>(&self) -> ThreadState<R>
227    where
228        RegisterState<R>: From<RegisterState<T>>,
229    {
230        ThreadState::<R> {
231            registers: self.registers.clone().into(),
232            extended_pstate: self.extended_pstate.clone(),
233            restart_code: self.restart_code,
234            syscall_restart_func: None,
235            arch_width: self.arch_width,
236        }
237    }
238
239    pub fn replace_registers<O: RegisterStorage>(&mut self, other: &ThreadState<O>) {
240        self.registers.load(*other.registers);
241        self.extended_pstate = other.extended_pstate;
242        self.arch_width = other.arch_width;
243    }
244
245    pub fn get_user_register(&mut self, offset: usize) -> Result<usize, Errno> {
246        let mut result: usize = 0;
247        self.registers.apply_user_register(offset, &mut |register| result = *register as usize)?;
248        Ok(result)
249    }
250
251    pub fn set_user_register(&mut self, offset: usize, value: usize) -> Result<(), Errno> {
252        self.registers.apply_user_register(offset, &mut |register| *register = value as u64)
253    }
254}
255
256impl From<ThreadState<HeapRegs>> for ThreadState<RegisterStorageEnum> {
257    fn from(value: ThreadState<HeapRegs>) -> Self {
258        ThreadState {
259            registers: value.registers.into(),
260            extended_pstate: value.extended_pstate,
261            restart_code: value.restart_code,
262            syscall_restart_func: value.syscall_restart_func,
263            arch_width: value.arch_width,
264        }
265    }
266}
267
impl<T: RegisterStorage> ArchSpecific for ThreadState<T> {
    /// A thread is 32-bit iff its recorded architecture width says so.
    fn is_arch32(&self) -> bool {
        self.arch_width.is_arch32()
    }
}
273
/// Signature of the closure installed by `set_syscall_restart_func`; eventually invoked by
/// `sys_restart_syscall` to resume an interrupted syscall.
type SyscallRestartFunc = dyn FnOnce(&mut Locked<Unlocked>, &mut CurrentTask) -> Result<SyscallResult, Errno>
    + Send
    + Sync;
277
impl Releasable for CurrentTask {
    type Context<'a> = &'a mut Locked<TaskRelease>;

    /// Final teardown of the task on this thread: runs the userspace-visible exit
    /// protocol (robust list, child tid) and then detaches and releases the `Task`.
    fn release<'a>(self, locked: Self::Context<'a>) {
        // Process the task's robust futex list before the task goes away.
        self.notify_robust_list();
        // Best effort: failure to clear the child tid is deliberately ignored.
        let _ignored = self.clear_child_tid_if_needed(locked);

        let kernel = Arc::clone(self.kernel());
        let mut pids = kernel.pids.write();

        // We remove from the thread group here because the WeakRef in the pid
        // table to this task must be valid until this task is removed from the
        // thread group, and the code below will invalidate it.
        // Moreover, this requires a OwnedRef of the task to ensure the tasks of
        // the thread group are always valid.
        self.task.thread_group().remove(locked, &mut pids, &self.task);

        // `Task::release` consumes the thread state along with the lock token
        // and the pid table guard.
        let context = (self.thread_state, locked, pids);
        self.task.release(context);
    }
}
299
300impl std::ops::Deref for CurrentTask {
301    type Target = Task;
302    fn deref(&self) -> &Self::Target {
303        &self.task
304    }
305}
306
impl fmt::Debug for CurrentTask {
    /// Delegates to the underlying task's `Debug` formatting.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.task.fmt(f)
    }
}
312
313impl CurrentTask {
314    pub fn new(task: OwnedRef<Task>, thread_state: ThreadState<RegisterStorageEnum>) -> Self {
315        let current_creds = RefCell::new(CurrentCreds::Cached(task.clone_creds()));
316        Self { task, thread_state, current_creds, _local_marker: Default::default() }
317    }
318
319    /// Returns the current subjective credentials of the task.
320    ///
321    /// The subjective credentials are the credentials that are used to check permissions for
322    /// actions performed by the task.
323    pub fn current_creds(&self) -> Ref<'_, Arc<Credentials>> {
324        Ref::map(self.current_creds.borrow(), CurrentCreds::creds)
325    }
326
327    /// Returns the current subjective credentials of the task, including the security state.
328    pub fn full_current_creds(&self) -> FullCredentials {
329        match *self.current_creds.borrow() {
330            CurrentCreds::Cached(ref creds) => FullCredentials {
331                creds: creds.clone(),
332                security_state: self.security_state.clone(),
333            },
334            CurrentCreds::Overridden(ref creds, ref security_state) => {
335                FullCredentials { creds: creds.clone(), security_state: security_state.clone() }
336            }
337        }
338    }
339
340    pub fn current_fscred(&self) -> FsCred {
341        self.current_creds().as_fscred()
342    }
343
344    pub fn current_ucred(&self) -> ucred {
345        let creds = self.current_creds();
346        ucred { pid: self.get_pid(), uid: creds.uid, gid: creds.gid }
347    }
348
    /// Save the current creds and security state, replace them with `new_creds`, then call
    /// `callback`.
    /// The creds and security state will be restored to their original values at the end of the
    /// call. Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and
    /// used to check permissions for actions performed by the task, is altered. The "objective"
    /// state, accessed through `Task::real_creds()` by other tasks and used to check permissions
    /// for actions performed on the task, is not altered, and changes to the credentials are not
    /// externally visible.
    ///
    /// NOTE(review): if `callback` panics, the saved creds are not restored — confirm callers
    /// do not rely on unwinding through this function.
    pub async fn override_creds_async<R>(
        &self,
        new_creds: FullCredentials,
        callback: impl AsyncFnOnce() -> R,
    ) -> R {
        // Swap in the override, run the callback, then restore the saved state.
        let saved = self
            .current_creds
            .replace(CurrentCreds::Overridden(new_creds.creds, new_creds.security_state));
        let result = callback().await;
        self.current_creds.replace(saved);
        result
    }
369
    /// Synchronous wrapper around `override_creds_async`: save the current creds and security
    /// state, replace them with `new_creds`, then call `callback`.
    /// The creds and security state will be restored to their original values at the end of the
    /// call. Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and
    /// used to check permissions for actions performed by the task, is altered. The "objective"
    /// state, accessed through `Task::real_creds()` by other tasks and used to check permissions
    /// for actions performed on the task, is not altered, and changes to the credentials are not
    /// externally visible.
    pub fn override_creds<R>(&self, new_creds: FullCredentials, callback: impl FnOnce() -> R) -> R {
        // The wrapped future completes without awaiting anything external, so
        // `now_or_never` always yields `Some`.
        self.override_creds_async(new_creds, async move || callback())
            .now_or_never()
            .expect("Future should be ready")
    }
383
384    pub fn has_overridden_creds(&self) -> bool {
385        matches!(*self.current_creds.borrow(), CurrentCreds::Overridden(_, _))
386    }
387
388    pub fn trigger_delayed_releaser<L>(&self, locked: &mut Locked<L>)
389    where
390        L: LockEqualOrBefore<FileOpsCore>,
391    {
392        let locked = locked.cast_locked::<FileOpsCore>();
393        self.kernel().delayed_releaser.apply(locked, self);
394    }
395
396    pub fn weak_task(&self) -> WeakRef<Task> {
397        WeakRef::from(&self.task)
398    }
399
400    pub fn temp_task(&self) -> TempRef<'_, Task> {
401        TempRef::from(&self.task)
402    }
403
    /// Change the current and real creds of the task. This is invalid to call while temporary
    /// credentials are present.
    pub fn set_creds(&self, creds: Credentials) {
        // Overridden (temporary) creds would be silently clobbered here; forbid it.
        assert!(!self.has_overridden_creds());

        let creds = Arc::new(creds);
        // Update the subjective (cached) creds first...
        let mut current_creds = self.current_creds.borrow_mut();
        *current_creds = CurrentCreds::Cached(creds.clone());

        // ...then the objective creds that other tasks observe.
        // SAFETY: this is allowed because we are the CurrentTask.
        unsafe {
            self.persistent_info.write_creds().update(creds);
        }
        // The /proc/pid directory's ownership is updated when the task's euid
        // or egid changes. See proc(5).
        let maybe_node = self.proc_pid_directory_cache.lock();
        if let Some(node) = &*maybe_node {
            let creds = self.real_creds().euid_as_fscred();
            // SAFETY: The /proc/pid directory held by `proc_pid_directory_cache` represents the
            // current task. Its owner and group are supposed to track the current task's euid and
            // egid.
            unsafe {
                node.force_chown(creds);
            }
        }
    }
430
431    #[inline(always)]
432    pub fn release<L>(self, locked: &mut Locked<L>)
433    where
434        L: LockBefore<TaskRelease>,
435    {
436        let locked = locked.cast_locked::<TaskRelease>();
437        Releasable::release(self, locked);
438    }
439
440    pub fn set_syscall_restart_func<R: Into<SyscallResult>>(
441        &mut self,
442        f: impl FnOnce(&mut Locked<Unlocked>, &mut CurrentTask) -> Result<R, Errno>
443        + Send
444        + Sync
445        + 'static,
446    ) {
447        self.thread_state.syscall_restart_func =
448            Some(Box::new(|locked, current_task| Ok(f(locked, current_task)?.into())));
449    }
450
    /// Installs `file` into this task's fd table with the given `flags`, returning the
    /// newly allocated fd number.
    pub fn add_file<L>(
        &self,
        locked: &mut Locked<L>,
        file: FileHandle,
        flags: FdFlags,
    ) -> Result<FdNumber, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.files.add(locked, self, file, flags)
    }
462
    /// Sets the task's signal mask to `signal_mask` and runs `wait_function`.
    ///
    /// Signals are dequeued prior to the original signal mask being restored. This is done by the
    /// signal machinery in the syscall dispatch loop.
    ///
    /// The returned result is the result returned from the wait function.
    pub fn wait_with_temporary_mask<F, T, L>(
        &mut self,
        locked: &mut Locked<L>,
        signal_mask: SigSet,
        wait_function: F,
    ) -> Result<T, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
        F: FnOnce(&mut Locked<L>, &CurrentTask) -> Result<T, Errno>,
    {
        {
            // Flag the mask as temporary so the dispatch loop knows to restore
            // the original one after dequeuing signals.
            let mut state = self.write();
            state.set_flags(TaskFlags::TEMPORARY_SIGNAL_MASK, true);
            state.set_temporary_signal_mask(signal_mask);
        }
        wait_function(locked, self)
    }
486
    /// If waking, promotes from waking to awake.  If not waking, make waiter async
    /// wait until woken.  Returns true if woken.
    ///
    /// On the wake path this also propagates the finalized stop state to the whole
    /// thread group; on the wait path the waiter is registered against thread-group
    /// lifecycle changes and/or the task's ptracer, depending on the stop state.
    pub fn wake_or_wait_until_unstopped_async(&self, waiter: &Waiter) -> bool {
        let group_state = self.thread_group().read();
        let mut task_state = self.write();

        // Wake up if
        //   a) we should wake up, meaning:
        //      i) we're in group stop, and the thread group has exited group stop, or
        //      ii) we're waking up,
        //   b) and ptrace isn't stopping us from waking up, but
        //   c) always wake up if we got a SIGKILL.
        let task_stop_state = self.load_stopped();
        let group_stop_state = self.thread_group().load_stopped();
        if ((task_stop_state == StopState::GroupStopped && group_stop_state.is_waking_or_awake())
            || task_stop_state.is_waking_or_awake())
            && (!task_state.is_ptrace_listening() || task_stop_state.is_force())
        {
            // Prefer the task's own transition target; fall back to the group's.
            let new_state = if task_stop_state.is_waking_or_awake() {
                task_stop_state.finalize()
            } else {
                group_stop_state.finalize()
            };
            if let Ok(new_state) = new_state {
                task_state.set_stopped(new_state, None, Some(self), None);
                drop(group_state);
                drop(task_state);
                // It is possible for the stop state to be changed by another
                // thread between when it is checked above and the following
                // invocation, but set_stopped does sufficient checking while
                // holding the lock to make sure that such a change won't result
                // in corrupted state.
                self.thread_group().set_stopped(new_state, None, false);
                return true;
            }
        }

        // We will wait.
        if self.thread_group().load_stopped().is_stopped() || task_stop_state.is_stopped() {
            // If we've stopped or PTRACE_LISTEN has been sent, wait for a
            // signal or instructions from the tracer.
            group_state
                .lifecycle_waiters
                .wait_async_value(&waiter, ThreadGroupLifecycleWaitValue::Stopped);
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.can_accept_ptrace_commands() {
            // If we're stopped because a tracer has seen the stop and not taken
            // further action, wait for further instructions from the tracer.
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.is_ptrace_listening() {
            // A PTRACE_LISTEN is a state where we can get signals and notify a
            // ptracer, but otherwise remain blocked.
            if let Some(ptrace) = &mut task_state.ptrace {
                ptrace.set_last_signal(Some(SignalInfo::default(SIGTRAP)));
                ptrace.set_last_event(Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0)));
            }
            task_state.wait_on_ptracer(&waiter);
            task_state.notify_ptracers();
        }
        false
    }
548
    /// Set the RunState for the current task to the given value and then call the given callback.
    ///
    /// When the callback is done, the run_state is restored to `RunState::Running`.
    ///
    /// This function is typically used just before blocking the current task on some operation.
    /// The given `run_state` registers the mechanism for interrupting the blocking operation with
    /// the task and the given `callback` actually blocks the task.
    ///
    /// This function can only be called in the `RunState::Running` state and cannot set the
    /// run state to `RunState::Running`. For this reason, this function cannot be reentered.
    ///
    /// # Panics
    ///
    /// Panics if `run_state` is `RunState::Running`, if the task is already blocked, or if the
    /// run state was changed by someone else while `callback` was executing.
    pub fn run_in_state<F, T>(&self, run_state: RunState, callback: F) -> Result<T, Errno>
    where
        F: FnOnce() -> Result<T, Errno>,
    {
        assert_ne!(run_state, RunState::Running);

        // As an optimization, decommit unused pages of the stack to reduce memory pressure while
        // the thread is blocked.
        clean_stack();

        {
            let mut state = self.write();
            assert!(!state.is_blocked());

            if matches!(run_state, RunState::Frozen(_)) {
                // Freeze is a kernel signal and is handled before other user signals. A frozen task
                // ignores all other signals except SIGKILL until it is thawed.
                if state.has_signal_pending(SIGKILL) {
                    return error!(EINTR);
                }
            } else if state.is_any_signal_pending() && !state.is_ptrace_listening() {
                // A note on PTRACE_LISTEN - the thread cannot be scheduled
                // regardless of pending signals.
                return error!(EINTR);
            }
            state.set_run_state(run_state.clone());
        }

        let result = callback();

        {
            // Restore `RunState::Running`, verifying nobody changed the state
            // out from under us while we were blocked.
            let mut state = self.write();
            assert_eq!(
                state.run_state(),
                run_state,
                "SignalState run state changed while waiting!"
            );
            state.set_run_state(RunState::Running);
        };

        result
    }
601
602    pub fn block_until(
603        &self,
604        guard: EventWaitGuard<'_>,
605        deadline: zx::MonotonicInstant,
606    ) -> Result<(), Errno> {
607        self.run_in_state(RunState::Event(guard.event().clone()), move || {
608            guard.block_until(None, deadline).map_err(|e| match e {
609                WakeReason::Interrupted => errno!(EINTR),
610                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
611            })
612        })
613    }
614
615    pub fn block_with_owner_until(
616        &self,
617        guard: EventWaitGuard<'_>,
618        new_owner: &zx::Thread,
619        deadline: zx::MonotonicInstant,
620    ) -> Result<(), Errno> {
621        self.run_in_state(RunState::Event(guard.event().clone()), move || {
622            guard.block_until(Some(new_owner), deadline).map_err(|e| match e {
623                WakeReason::Interrupted => errno!(EINTR),
624                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
625            })
626        })
627    }
628
    /// Determine namespace node indicated by the dir_fd.
    ///
    /// Returns the namespace node and the path to use relative to that node.
    ///
    /// Absolute paths resolve from the filesystem root (unless `RESOLVE_IN_ROOT` is set);
    /// `AT_FDCWD` resolves from the cwd; any other fd must name a directory-like node
    /// (O_PATH fds are allowed, per open(2)). `RESOLVE_BENEATH` with an absolute path
    /// fails with `EXDEV`.
    pub fn resolve_dir_fd<'a, L>(
        &self,
        locked: &mut Locked<L>,
        dir_fd: FdNumber,
        mut path: &'a FsStr,
        flags: ResolveFlags,
    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let path_is_absolute = path.starts_with(b"/");
        if path_is_absolute {
            if flags.contains(ResolveFlags::BENEATH) {
                return error!(EXDEV);
            }
            // Strip the leading '/' so the remainder is resolved relative to `dir`.
            path = &path[1..];
        }

        let dir = if path_is_absolute && !flags.contains(ResolveFlags::IN_ROOT) {
            self.fs().root()
        } else if dir_fd == FdNumber::AT_FDCWD {
            self.fs().cwd()
        } else {
            // O_PATH allowed for:
            //
            //   Passing the file descriptor as the dirfd argument of
            //   openat() and the other "*at()" system calls.  This
            //   includes linkat(2) with AT_EMPTY_PATH (or via procfs
            //   using AT_SYMLINK_FOLLOW) even if the file is not a
            //   directory.
            //
            // See https://man7.org/linux/man-pages/man2/open.2.html
            let file = self.files.get_allowing_opath(dir_fd)?;
            file.name.to_passive()
        };

        if !path.is_empty() {
            // A non-empty relative path requires `dir` to be a searchable directory.
            if !dir.entry.node.is_dir() {
                return error!(ENOTDIR);
            }
            dir.check_access(
                locked,
                self,
                Access::EXEC,
                CheckAccessReason::InternalPermissionChecks,
            )?;
        }
        Ok((dir, path.into()))
    }
681
682    /// A convenient wrapper for opening files relative to FdNumber::AT_FDCWD.
683    ///
684    /// Returns a FileHandle but does not install the FileHandle in the FdTable
685    /// for this task.
686    pub fn open_file(
687        &self,
688        locked: &mut Locked<Unlocked>,
689        path: &FsStr,
690        flags: OpenFlags,
691    ) -> Result<FileHandle, Errno> {
692        if flags.contains(OpenFlags::CREAT) {
693            // In order to support OpenFlags::CREAT we would need to take a
694            // FileMode argument.
695            return error!(EINVAL);
696        }
697        self.open_file_at(
698            locked,
699            FdNumber::AT_FDCWD,
700            path,
701            flags,
702            FileMode::default(),
703            ResolveFlags::empty(),
704            AccessCheck::default(),
705        )
706    }
707
708    /// Resolves a path for open.
709    ///
710    /// If the final path component points to a symlink, the symlink is followed (as long as
711    /// the symlink traversal limit has not been reached).
712    ///
713    /// If the final path component (after following any symlinks, if enabled) does not exist,
714    /// and `flags` contains `OpenFlags::CREAT`, a new node is created at the location of the
715    /// final path component.
716    ///
    /// Resolves `path` relative to `dir` in preparation for an open(2)-style call,
    /// honoring the symlink mode, `ResolveFlags`, and remaining-follow budget carried
    /// in `context`, and creating the final component when `flags` contains `O_CREAT`
    /// and the component does not exist.
    ///
    /// This returns the resolved node, and a boolean indicating whether the node has been created.
    fn resolve_open_path<L>(
        &self,
        locked: &mut Locked<L>,
        context: &mut LookupContext,
        dir: &NamespaceNode,
        path: &FsStr,
        mode: FileMode,
        flags: OpenFlags,
    ) -> Result<(NamespaceNode, bool), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        context.update_for_path(path);
        // Resolve the parent directory, always following symlinks in the
        // intermediate components regardless of the caller's symlink mode.
        let mut parent_content = context.with(SymlinkMode::Follow);
        let (parent, basename) = self.lookup_parent(locked, &mut parent_content, dir, path)?;
        // Carry over however much of the follow budget the parent walk consumed.
        context.remaining_follows = parent_content.remaining_follows;

        // O_CREAT|O_EXCL requires that this call itself create the final component.
        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);

        // Lookup the child, without following a symlink or expecting it to be a directory.
        let mut child_context = context.with(SymlinkMode::NoFollow);
        child_context.must_be_directory = false;

        match parent.lookup_child(locked, self, &mut child_context, basename) {
            Ok(name) => {
                if name.entry.node.is_lnk() {
                    if flags.contains(OpenFlags::PATH)
                        && context.symlink_mode == SymlinkMode::NoFollow
                    {
                        // When O_PATH is specified in flags, if pathname is a symbolic link
                        // and the O_NOFOLLOW flag is also specified, then the call returns
                        // a file descriptor referring to the symbolic link.
                        // See https://man7.org/linux/man-pages/man2/openat.2.html
                        //
                        // If the trailing component (i.e., basename) of
                        // pathname is a symbolic link, how.resolve contains
                        // RESOLVE_NO_SYMLINKS, and how.flags contains both
                        // O_PATH and O_NOFOLLOW, then an O_PATH file
                        // descriptor referencing the symbolic link will be
                        // returned.
                        // See https://man7.org/linux/man-pages/man2/openat2.2.html
                        return Ok((name, false));
                    }

                    if (!flags.contains(OpenFlags::PATH)
                        && context.symlink_mode == SymlinkMode::NoFollow)
                        || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
                        || context.remaining_follows == 0
                    {
                        if must_create {
                            // Since `must_create` is set, and a node was found, this returns EEXIST
                            // instead of ELOOP.
                            return error!(EEXIST);
                        }
                        // A symlink was found, but one of the following is true:
                        // * flags specified O_NOFOLLOW but not O_PATH.
                        // * how.resolve contains RESOLVE_NO_SYMLINKS
                        // * too many symlink traversals have been attempted
                        return error!(ELOOP);
                    }

                    // Consume one unit of the follow budget and recurse on the
                    // symlink target; the budget bounds the recursion depth.
                    context.remaining_follows -= 1;
                    match name.readlink(locked, self)? {
                        SymlinkTarget::Path(path) => {
                            // Absolute targets restart from the task's root;
                            // relative targets resolve from the symlink's parent.
                            let dir = if path[0] == b'/' { self.fs().root() } else { parent };
                            self.resolve_open_path(
                                locked,
                                context,
                                &dir,
                                path.as_ref(),
                                mode,
                                flags,
                            )
                        }
                        SymlinkTarget::Node(name) => {
                            // A "magic" symlink (e.g. /proc/self/fd entries) resolves
                            // directly to a node rather than to a path.
                            if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS)
                                || name.entry.node.is_lnk()
                            {
                                error!(ELOOP)
                            } else {
                                Ok((name, false))
                            }
                        }
                    }
                } else {
                    if must_create {
                        return error!(EEXIST);
                    }
                    Ok((name, false))
                }
            }
            Err(e) if e == errno!(ENOENT) && flags.contains(OpenFlags::CREAT) => {
                // The final component is missing and O_CREAT was given: create a
                // regular file with the requested mode in the parent directory.
                if context.must_be_directory {
                    return error!(EISDIR);
                }
                Ok((
                    parent.open_create_node(
                        locked,
                        self,
                        basename,
                        mode.with_type(FileMode::IFREG),
                        DeviceType::NONE,
                        flags,
                    )?,
                    true,
                ))
            }
            Err(e) => Err(e),
        }
    }
828
829    /// The primary entry point for opening files relative to a task.
830    ///
831    /// Absolute paths are resolve relative to the root of the FsContext for
832    /// this task. Relative paths are resolve relative to dir_fd. To resolve
833    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
834    /// dir_fd.
835    ///
836    /// Returns a FileHandle but does not install the FileHandle in the FdTable
837    /// for this task.
838    pub fn open_file_at(
839        &self,
840        locked: &mut Locked<Unlocked>,
841        dir_fd: FdNumber,
842        path: &FsStr,
843        flags: OpenFlags,
844        mode: FileMode,
845        resolve_flags: ResolveFlags,
846        access_check: AccessCheck,
847    ) -> Result<FileHandle, Errno> {
848        if path.is_empty() {
849            return error!(ENOENT);
850        }
851
852        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, resolve_flags)?;
853        self.open_namespace_node_at(locked, dir, path, flags, mode, resolve_flags, access_check)
854    }
855
    /// Opens `path` relative to the already-resolved `dir` node.
    ///
    /// Applies the open-time semantics of `O_PATH`, `O_TMPFILE`, `O_NOFOLLOW`,
    /// `O_CREAT`/`O_EXCL`, `O_DIRECTORY`, and `O_TRUNC`, plus the openat2(2)
    /// `ResolveFlags` restrictions. Returns a `FileHandle` without installing
    /// it in the task's `FdTable`.
    pub fn open_namespace_node_at(
        &self,
        locked: &mut Locked<Unlocked>,
        dir: NamespaceNode,
        path: &FsStr,
        flags: OpenFlags,
        mode: FileMode,
        mut resolve_flags: ResolveFlags,
        access_check: AccessCheck,
    ) -> Result<FileHandle, Errno> {
        // 64-bit kernels force the O_LARGEFILE flag to be on.
        let mut flags = flags | OpenFlags::LARGEFILE;
        let opath = flags.contains(OpenFlags::PATH);
        if opath {
            // When O_PATH is specified in flags, flag bits other than O_CLOEXEC,
            // O_DIRECTORY, and O_NOFOLLOW are ignored.
            const ALLOWED_FLAGS: OpenFlags = OpenFlags::from_bits_truncate(
                OpenFlags::PATH.bits()
                    | OpenFlags::CLOEXEC.bits()
                    | OpenFlags::DIRECTORY.bits()
                    | OpenFlags::NOFOLLOW.bits(),
            );
            flags &= ALLOWED_FLAGS;
        }

        // O_TMPFILE requires write access to the new anonymous file.
        if flags.contains(OpenFlags::TMPFILE) && !flags.can_write() {
            return error!(EINVAL);
        }

        let nofollow = flags.contains(OpenFlags::NOFOLLOW);
        // O_CREAT|O_EXCL means this call must be the one that creates the file.
        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);

        let symlink_mode =
            if nofollow || must_create { SymlinkMode::NoFollow } else { SymlinkMode::Follow };

        // RESOLVE_BENEATH and RESOLVE_IN_ROOT are mutually exclusive per openat2(2).
        let resolve_base = match (
            resolve_flags.contains(ResolveFlags::BENEATH),
            resolve_flags.contains(ResolveFlags::IN_ROOT),
        ) {
            (false, false) => ResolveBase::None,
            (true, false) => ResolveBase::Beneath(dir.clone()),
            (false, true) => ResolveBase::InRoot(dir.clone()),
            (true, true) => return error!(EINVAL),
        };

        // `RESOLVE_BENEATH` and `RESOLVE_IN_ROOT` imply `RESOLVE_NO_MAGICLINKS`. This matches
        // Linux behavior. Strictly speaking it's is not really required, but it's hard to
        // implement `BENEATH` and `IN_ROOT` flags correctly otherwise.
        if resolve_base != ResolveBase::None {
            resolve_flags.insert(ResolveFlags::NO_MAGICLINKS);
        }

        let mut context = LookupContext {
            symlink_mode,
            remaining_follows: MAX_SYMLINK_FOLLOWS,
            must_be_directory: flags.contains(OpenFlags::DIRECTORY),
            resolve_flags,
            resolve_base,
        };
        let (name, created) =
            match self.resolve_open_path(locked, &mut context, &dir, path, mode, flags) {
                Ok((n, c)) => (n, c),
                Err(e) => {
                    // Record the absolute path for not-found diagnostics before
                    // propagating the resolution error.
                    let mut abs_path = dir.path(&self.task);
                    abs_path.extend(&**path);
                    track_file_not_found(abs_path);
                    return Err(e);
                }
            };

        let name = if flags.contains(OpenFlags::TMPFILE) {
            // `O_TMPFILE` is incompatible with `O_CREAT`
            if flags.contains(OpenFlags::CREAT) {
                return error!(EINVAL);
            }
            name.create_tmpfile(locked, self, mode.with_type(FileMode::IFREG), flags)?
        } else {
            let mode = name.entry.node.info().mode;

            // These checks are not needed in the `O_TMPFILE` case because `mode` refers to the
            // file we are opening. With `O_TMPFILE`, that file is the regular file we just
            // created rather than the node we found by resolving the path.
            //
            // For example, we do not need to produce `ENOTDIR` when `must_be_directory` is set
            // because `must_be_directory` refers to the node we found by resolving the path.
            // If that node was not a directory, then `create_tmpfile` will produce an error.
            //
            // Similarly, we never need to call `truncate` because `O_TMPFILE` is newly created
            // and therefor already an empty file.

            // O_NOFOLLOW without O_PATH on a symlink fails with ELOOP; with
            // O_PATH the symlink itself was already returned above.
            if !opath && nofollow && mode.is_lnk() {
                return error!(ELOOP);
            }

            if mode.is_dir() {
                // Directories cannot be opened for writing, created, or truncated.
                if flags.can_write()
                    || flags.contains(OpenFlags::CREAT)
                    || flags.contains(OpenFlags::TRUNC)
                {
                    return error!(EISDIR);
                }
                if flags.contains(OpenFlags::DIRECT) {
                    return error!(EINVAL);
                }
            } else if context.must_be_directory {
                return error!(ENOTDIR);
            }

            if flags.contains(OpenFlags::TRUNC) && mode.is_reg() && !created {
                // You might think we should check file.can_write() at this
                // point, which is what the docs suggest, but apparently we
                // are supposed to truncate the file if this task can write
                // to the underlying node, even if we are opening the file
                // as read-only. See OpenTest.CanTruncateReadOnly.
                name.truncate(locked, self, 0)?;
            }

            name
        };

        // If the node has been created, the open operation should not verify access right:
        // From <https://man7.org/linux/man-pages/man2/open.2.html>
        //
        // > Note that mode applies only to future accesses of the newly created file; the
        // > open() call that creates a read-only file may well return a  read/write  file
        // > descriptor.

        let access_check = if created { AccessCheck::skip() } else { access_check };
        name.open(locked, self, flags, access_check)
    }
986
987    /// A wrapper for FsContext::lookup_parent_at that resolves the given
988    /// dir_fd to a NamespaceNode.
989    ///
990    /// Absolute paths are resolve relative to the root of the FsContext for
991    /// this task. Relative paths are resolve relative to dir_fd. To resolve
992    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
993    /// dir_fd.
994    pub fn lookup_parent_at<'a, L>(
995        &self,
996        locked: &mut Locked<L>,
997        context: &mut LookupContext,
998        dir_fd: FdNumber,
999        path: &'a FsStr,
1000    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
1001    where
1002        L: LockEqualOrBefore<FileOpsCore>,
1003    {
1004        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, ResolveFlags::empty())?;
1005        self.lookup_parent(locked, context, &dir, path)
1006    }
1007
1008    /// Lookup the parent of a namespace node.
1009    ///
1010    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
1011    /// calling this function directly.
1012    ///
1013    /// This function resolves all but the last component of the given path.
1014    /// The function returns the parent directory of the last component as well
1015    /// as the last component.
1016    ///
1017    /// If path is empty, this function returns dir and an empty path.
1018    /// Similarly, if path ends with "." or "..", these components will be
1019    /// returned along with the parent.
1020    ///
1021    /// The returned parent might not be a directory.
1022    pub fn lookup_parent<'a, L>(
1023        &self,
1024        locked: &mut Locked<L>,
1025        context: &mut LookupContext,
1026        dir: &NamespaceNode,
1027        path: &'a FsStr,
1028    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
1029    where
1030        L: LockEqualOrBefore<FileOpsCore>,
1031    {
1032        context.update_for_path(path);
1033
1034        let mut current_node = dir.clone();
1035        let mut it = path.split(|c| *c == b'/').filter(|p| !p.is_empty()).map(<&FsStr>::from);
1036        let mut current_path_component = it.next().unwrap_or_default();
1037        for next_path_component in it {
1038            current_node =
1039                current_node.lookup_child(locked, self, context, current_path_component)?;
1040            current_path_component = next_path_component;
1041        }
1042        Ok((current_node, current_path_component))
1043    }
1044
1045    /// Lookup a namespace node.
1046    ///
1047    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
1048    /// calling this function directly.
1049    ///
1050    /// This function resolves the component of the given path.
1051    pub fn lookup_path<L>(
1052        &self,
1053        locked: &mut Locked<L>,
1054        context: &mut LookupContext,
1055        dir: NamespaceNode,
1056        path: &FsStr,
1057    ) -> Result<NamespaceNode, Errno>
1058    where
1059        L: LockEqualOrBefore<FileOpsCore>,
1060    {
1061        let (parent, basename) = self.lookup_parent(locked, context, &dir, path)?;
1062        parent.lookup_child(locked, self, context, basename)
1063    }
1064
1065    /// Lookup a namespace node starting at the root directory.
1066    ///
1067    /// Resolves symlinks.
1068    pub fn lookup_path_from_root<L>(
1069        &self,
1070        locked: &mut Locked<L>,
1071        path: &FsStr,
1072    ) -> Result<NamespaceNode, Errno>
1073    where
1074        L: LockEqualOrBefore<FileOpsCore>,
1075    {
1076        let mut context = LookupContext::default();
1077        self.lookup_path(locked, &mut context, self.fs().root(), path)
1078    }
1079
    /// Replaces this process's image with `executable`, per execve(2).
    ///
    /// Checks that the file is a regular file with exec permission, resolves
    /// the ELF (and interpreter) while the old address space is still intact,
    /// and then delegates the unrecoverable portion to `finish_exec`. If
    /// `finish_exec` fails, the old image is already gone, so the task is sent
    /// a forced SIGSEGV.
    pub fn exec(
        &mut self,
        locked: &mut Locked<Unlocked>,
        executable: FileHandle,
        path: CString,
        argv: Vec<CString>,
        environ: Vec<CString>,
    ) -> Result<(), Errno> {
        // Executable must be a regular file
        if !executable.name.entry.node.is_reg() {
            return error!(EACCES);
        }

        // File node must have EXEC mode permissions.
        // Note that the ability to execute a file is unrelated to the flags
        // used in the `open` call.
        executable.name.check_access(locked, self, Access::EXEC, CheckAccessReason::Exec)?;

        // Let the security module compute the credentials the new image should run with.
        let elf_security_state = security::bprm_creds_for_exec(self, &executable.name)?;

        let resolved_elf = resolve_executable(
            locked,
            self,
            executable,
            path.clone(),
            argv,
            environ,
            elf_security_state,
        )?;

        // Determine any setuid/setgid transition, unless the suid feature is disabled.
        let maybe_set_id = if self.kernel().features.enable_suid {
            resolved_elf.file.name.suid_and_sgid(&self)?
        } else {
            Default::default()
        };

        if self.thread_group().read().tasks_count() > 1 {
            track_stub!(TODO("https://fxbug.dev/297434895"), "exec on multithread process");
            return error!(EINVAL);
        }

        if let Err(err) = self.finish_exec(locked, path, resolved_elf, maybe_set_id) {
            log_warn!("unrecoverable error in exec: {err:?}");

            // The old address space has been discarded: the task cannot resume,
            // so deliver a forced kernel-originated SIGSEGV.
            send_standard_signal(
                locked,
                self,
                SignalInfo { code: SI_KERNEL as i32, force: true, ..SignalInfo::default(SIGSEGV) },
            );
            return Err(err);
        }

        // Report PTRACE_EVENT_EXEC to a tracer and release any vfork parent.
        self.ptrace_event(locked, PtraceOptions::TRACEEXEC, self.task.tid as u64);
        self.signal_vfork();

        Ok(())
    }
1137
    /// After the memory is unmapped, any failure in exec is unrecoverable and results in the
    /// process crashing. This function is for that second half; any error returned from this
    /// function will be considered unrecoverable.
    ///
    /// `maybe_set_id` carries the pending setuid/setgid transition; it is
    /// cleared here if no_new_privs is set or the task is being ptraced.
    fn finish_exec(
        &mut self,
        locked: &mut Locked<Unlocked>,
        path: CString,
        resolved_elf: ResolvedElf,
        mut maybe_set_id: UserAndOrGroupId,
    ) -> Result<(), Errno> {
        // Now that the exec will definitely finish (or crash), notify owners of
        // locked futexes for the current process, which will be impossible to
        // update after process image is replaced.  See get_robust_list(2).
        self.notify_robust_list();

        // Passing arch32 information here ensures the replacement memory
        // layout matches the elf being executed.
        let mm = {
            let mm = self.mm()?;
            let new_mm = mm
                .exec(resolved_elf.file.name.to_passive(), resolved_elf.arch_width)
                .map_err(|status| from_status_like_fdio!(status))?;
            // Install the fresh address space on the task before loading into it.
            self.mm.update(Some(new_mm.clone()));
            new_mm
        };

        {
            let mut state = self.write();

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   The aforementioned transformations of the effective IDs are not
            //   performed (i.e., the set-user-ID and set-group-ID bits are
            //   ignored) if any of the following is true:
            //
            //   * the no_new_privs attribute is set for the calling thread (see
            //      prctl(2));
            //
            //   *  the underlying filesystem is mounted nosuid (the MS_NOSUID
            //      flag for mount(2)); or
            //
            //   *  the calling process is being ptraced.
            //
            // The MS_NOSUID check is in `NamespaceNode::suid_and_sgid()`.
            if state.no_new_privs() || state.is_ptraced() {
                maybe_set_id.clear();
            }

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   The process's "dumpable" attribute is set to the value 1,
            //   unless a set-user-ID program, a set-group-ID program, or a
            //   program with capabilities is being executed, in which case the
            //   dumpable flag may instead be reset to the value in
            //   /proc/sys/fs/suid_dumpable, in the circumstances described
            //   under PR_SET_DUMPABLE in prctl(2).
            let dumpable =
                if maybe_set_id.is_none() { DumpPolicy::User } else { DumpPolicy::Disable };
            *mm.dumpable.lock(locked) = dumpable;

            // TODO(https://fxbug.dev/433463756): Figure out whether this is the right place to
            // take the lock.
            // SAFETY: this is allowed because we are the CurrentTask.
            let mut writable_creds = unsafe { self.persistent_info.write_creds() };
            // The signal alternate stack and robust futex list do not survive exec.
            state.set_sigaltstack(None);
            state.robust_list_head = RobustListHeadPtr::null(self);

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   If a set-user-ID or set-group-ID
            //   program is being executed, then the parent death signal set by
            //   prctl(2) PR_SET_PDEATHSIG flag is cleared.
            //
            // TODO(https://fxbug.dev/356684424): Implement the behavior above once we support
            // the PR_SET_PDEATHSIG flag.

            // TODO(tbodt): Check whether capability xattrs are set on the file, and grant/limit
            // capabilities accordingly.
            let mut new_creds = Credentials::clone(&self.current_creds());
            new_creds.exec(maybe_set_id);
            let new_creds = Arc::new(new_creds);
            // Publish the post-exec credentials and refresh the cached copy.
            writable_creds.update(new_creds.clone());
            *self.current_creds.borrow_mut() = CurrentCreds::Cached(new_creds);
        }

        let security_state = resolved_elf.security_state.clone();

        let start_info = load_executable(self, resolved_elf, &path)?;
        // Before consuming start_info below, note if the task is 32-bit.
        self.thread_state.arch_width = start_info.arch_width;

        // Seed the register state for the new image from the loader's start info.
        let regs: zx_restricted_state_t = start_info.into();
        self.thread_state.registers.load(regs);
        self.thread_state.extended_pstate.reset();
        self.thread_group().signal_actions.reset_for_exec();

        // The exit signal (and that of the children) is reset to SIGCHLD.
        let mut thread_group_state = self.thread_group().write();
        thread_group_state.exit_signal = Some(SIGCHLD);
        for (_, weak_child) in &mut thread_group_state.children {
            if let Some(child) = weak_child.upgrade() {
                let mut child_state = child.write();
                child_state.exit_signal = Some(SIGCHLD);
            }
        }

        // Release the thread-group lock before touching the fd table and security state.
        std::mem::drop(thread_group_state);

        // TODO(https://fxbug.dev/42082680): All threads other than the calling thread are destroyed.

        // TODO: POSIX timers are not preserved.

        // TODO: Ensure that the filesystem context is un-shared, undoing the effect of CLONE_FS.

        // The file descriptor table is unshared, undoing the effect of the CLONE_FILES flag of
        // clone(2).
        self.files.unshare();
        self.files.exec(locked, self);

        // If SELinux is enabled, enforce permissions related to inheritance of file descriptors
        // and resource limits. Then update the current task's SID.
        //
        // TODO: https://fxbug.dev/378655436 - After the above, enforce permissions related to
        // signal state inheritance.
        //
        // This needs to be called after closing any files marked "close-on-exec".
        security::exec_binprm(locked, self, &security_state);

        self.thread_group().write().did_exec = true;

        self.set_command_name(TaskCommand::from_path_bytes(path.to_bytes()));

        Ok(())
    }
1272
1273    pub fn set_command_name(&self, new_name: TaskCommand) {
1274        // set_command_name needs to run before leader_command() in cases where self is the leader.
1275        self.task.set_command_name(new_name.clone());
1276        let leader_command = self.thread_group().read().leader_command();
1277        starnix_logging::set_current_task_info(
1278            new_name,
1279            leader_command,
1280            self.thread_group().leader,
1281            self.tid,
1282        );
1283    }
1284
    /// Installs a seccomp BPF filter on this task, per seccomp(2)
    /// `SECCOMP_SET_MODE_FILTER`.
    ///
    /// `code` is the classic-BPF program; `flags` is the union of
    /// `SECCOMP_FILTER_FLAG_*` bits. With `TSYNC`, the filter is propagated to
    /// every thread in the process (all of which must already have filters
    /// that are a prefix of this thread's). With `NEW_LISTENER`, returns the
    /// fd of a user-notification listener; otherwise returns 0 on success.
    pub fn add_seccomp_filter(
        &mut self,
        locked: &mut Locked<Unlocked>,
        code: Vec<sock_filter>,
        flags: u32,
    ) -> Result<SyscallResult, Errno> {
        // Compile the cBPF program up front; filter ids are process-unique.
        let new_filter = Arc::new(SeccompFilter::from_cbpf(
            &code,
            self.thread_group().next_seccomp_filter_id.add(1),
            flags & SECCOMP_FILTER_FLAG_LOG != 0,
        )?);

        let mut maybe_fd: Option<FdNumber> = None;

        if flags & SECCOMP_FILTER_FLAG_NEW_LISTENER != 0 {
            maybe_fd = Some(SeccompFilterContainer::create_listener(locked, self)?);
        }

        // We take the process lock here because we can't change any of the threads
        // while doing a tsync.  So, you hold the process lock while making any changes.
        // NOTE: `state` is deliberately kept alive even on the non-TSYNC path so
        // the guard is held for the remainder of the function.
        let state = self.thread_group().write();

        if flags & SECCOMP_FILTER_FLAG_TSYNC != 0 {
            // TSYNC synchronizes all filters for all threads in the current process to
            // the current thread's

            // We collect the filters for the current task upfront to save us acquiring
            // the task's lock a lot of times below.
            let mut filters: SeccompFilterContainer = self.read().seccomp_filters.clone();

            // For TSYNC to work, all of the other thread filters in this process have to
            // be a prefix of this thread's filters, and none of them can be in
            // strict mode.
            let tasks = state.tasks().collect::<Vec<_>>();
            for task in &tasks {
                if task.tid == self.tid {
                    continue;
                }
                let other_task_state = task.read();

                // Target threads cannot be in SECCOMP_MODE_STRICT
                if task.seccomp_filter_state.get() == SeccompStateValue::Strict {
                    return Self::seccomp_tsync_error(task.tid, flags);
                }

                // Target threads' filters must be a subsequence of this thread's
                if !other_task_state.seccomp_filters.can_sync_to(&filters) {
                    return Self::seccomp_tsync_error(task.tid, flags);
                }
            }

            // Now that we're sure we're allowed to do so, add the filter to all threads.
            filters.add_filter(new_filter, code.len() as u16)?;

            for task in &tasks {
                let mut other_task_state = task.write();

                // TSYNC implies no_new_privs on every synchronized thread.
                other_task_state.enable_no_new_privs();
                other_task_state.seccomp_filters = filters.clone();
                task.set_seccomp_state(SeccompStateValue::UserDefined)?;
            }
        } else {
            // Non-TSYNC: only the calling thread gets the new filter.
            let mut task_state = self.task.write();

            task_state.seccomp_filters.add_filter(new_filter, code.len() as u16)?;
            self.set_seccomp_state(SeccompStateValue::UserDefined)?;
        }

        if let Some(fd) = maybe_fd { Ok(fd.into()) } else { Ok(().into()) }
    }
1355
1356    pub fn run_seccomp_filters(
1357        &mut self,
1358        locked: &mut Locked<Unlocked>,
1359        syscall: &Syscall,
1360    ) -> Option<Result<SyscallResult, Errno>> {
1361        // Implementation of SECCOMP_FILTER_STRICT, which has slightly different semantics
1362        // from user-defined seccomp filters.
1363        if self.seccomp_filter_state.get() == SeccompStateValue::Strict {
1364            return SeccompState::do_strict(locked, self, syscall);
1365        }
1366
1367        // Run user-defined seccomp filters
1368        let result = self.task.read().seccomp_filters.run_all(self, syscall);
1369
1370        SeccompState::do_user_defined(locked, result, self, syscall)
1371    }
1372
1373    fn seccomp_tsync_error(id: i32, flags: u32) -> Result<SyscallResult, Errno> {
1374        // By default, TSYNC indicates failure state by returning the first thread
1375        // id not to be able to sync, rather than by returning -1 and setting
1376        // errno.  However, if TSYNC_ESRCH is set, it returns ESRCH.  This
1377        // prevents conflicts with fact that SECCOMP_FILTER_FLAG_NEW_LISTENER
1378        // makes seccomp return an fd.
1379        if flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH != 0 { error!(ESRCH) } else { Ok(id.into()) }
1380    }
1381
1382    // Notify all futexes in robust list.  The robust list is in user space, so we
1383    // are very careful about walking it, and there are a lot of quiet returns if
1384    // we fail to walk it.
1385    // TODO(https://fxbug.dev/42079081): This only sets the FUTEX_OWNER_DIED bit; it does
1386    // not wake up a waiter.
1387    pub fn notify_robust_list(&self) {
1388        let task_state = self.write();
1389        let robust_list_addr = task_state.robust_list_head.addr();
1390        if robust_list_addr == UserAddress::NULL {
1391            // No one has called set_robust_list.
1392            return;
1393        }
1394        let robust_list_res = self.read_multi_arch_object(task_state.robust_list_head);
1395
1396        let head = if let Ok(head) = robust_list_res {
1397            head
1398        } else {
1399            return;
1400        };
1401
1402        let offset = head.futex_offset;
1403
1404        let mut entries_count = 0;
1405        let mut curr_ptr = head.list.next;
1406        while curr_ptr.addr() != robust_list_addr.into() && entries_count < ROBUST_LIST_LIMIT {
1407            let curr_ref = self.read_multi_arch_object(curr_ptr);
1408
1409            let curr = if let Ok(curr) = curr_ref {
1410                curr
1411            } else {
1412                return;
1413            };
1414
1415            let Some(futex_base) = curr_ptr.addr().checked_add_signed(offset) else {
1416                return;
1417            };
1418
1419            let futex_addr = match FutexAddress::try_from(futex_base) {
1420                Ok(addr) => addr,
1421                Err(_) => {
1422                    return;
1423                }
1424            };
1425
1426            let Ok(mm) = self.mm() else {
1427                log_error!("Asked to notify robust list futexes in system task.");
1428                return;
1429            };
1430            let futex = if let Ok(futex) = mm.atomic_load_u32_relaxed(futex_addr) {
1431                futex
1432            } else {
1433                return;
1434            };
1435
1436            if (futex & FUTEX_TID_MASK) as i32 == self.tid {
1437                let owner_died = FUTEX_OWNER_DIED | futex;
1438                if mm.atomic_store_u32_relaxed(futex_addr, owner_died).is_err() {
1439                    return;
1440                }
1441            }
1442            curr_ptr = curr.next;
1443            entries_count += 1;
1444        }
1445    }
1446
1447    /// Returns a ref to this thread's SeccompNotifier.
1448    pub fn get_seccomp_notifier(&mut self) -> Option<SeccompNotifierHandle> {
1449        self.task.write().seccomp_filters.notifier.clone()
1450    }
1451
1452    pub fn set_seccomp_notifier(&mut self, notifier: Option<SeccompNotifierHandle>) {
1453        self.task.write().seccomp_filters.notifier = notifier;
1454    }
1455
1456    /// Processes a Zircon exception associated with this task.
1457    pub fn process_exception(
1458        &self,
1459        locked: &mut Locked<Unlocked>,
1460        report: &zx::ExceptionReport,
1461    ) -> ExceptionResult {
1462        match report.ty {
1463            zx::ExceptionType::General => match get_signal_for_general_exception(&report.arch) {
1464                Some(sig) => ExceptionResult::Signal(SignalInfo::default(sig)),
1465                None => {
1466                    log_error!("Unrecognized general exception: {:?}", report);
1467                    ExceptionResult::Signal(SignalInfo::default(SIGILL))
1468                }
1469            },
1470            zx::ExceptionType::FatalPageFault { status } => {
1471                let report = decode_page_fault_exception_report(&report.arch);
1472                if let Ok(mm) = self.mm() {
1473                    mm.handle_page_fault(locked, report, status)
1474                } else {
1475                    panic!(
1476                        "system task is handling a major page fault status={:?}, report={:?}",
1477                        status, report
1478                    );
1479                }
1480            }
1481            zx::ExceptionType::UndefinedInstruction => {
1482                ExceptionResult::Signal(SignalInfo::default(SIGILL))
1483            }
1484            zx::ExceptionType::UnalignedAccess => {
1485                ExceptionResult::Signal(SignalInfo::default(SIGBUS))
1486            }
1487            zx::ExceptionType::SoftwareBreakpoint | zx::ExceptionType::HardwareBreakpoint => {
1488                ExceptionResult::Signal(SignalInfo::default(SIGTRAP))
1489            }
1490            zx::ExceptionType::ProcessNameChanged => {
1491                log_error!("Received unexpected process name changed exception");
1492                ExceptionResult::Handled
1493            }
1494            zx::ExceptionType::ProcessStarting
1495            | zx::ExceptionType::ThreadStarting
1496            | zx::ExceptionType::ThreadExiting => {
1497                log_error!("Received unexpected task lifecycle exception");
1498                ExceptionResult::Signal(SignalInfo::default(SIGSYS))
1499            }
1500            zx::ExceptionType::PolicyError(policy_code) => {
1501                log_error!(policy_code:?; "Received Zircon policy error exception");
1502                ExceptionResult::Signal(SignalInfo::default(SIGSYS))
1503            }
1504            zx::ExceptionType::UnknownUserGenerated { code, data } => {
1505                log_error!(code:?, data:?; "Received unexpected unknown user generated exception");
1506                ExceptionResult::Signal(SignalInfo::default(SIGSYS))
1507            }
1508            zx::ExceptionType::Unknown { ty, code, data } => {
1509                log_error!(ty:?, code:?, data:?; "Received unexpected exception");
1510                ExceptionResult::Signal(SignalInfo::default(SIGSYS))
1511            }
1512        }
1513    }
1514
    /// Clone this task.
    ///
    /// Creates a new task object that shares some state with this task
    /// according to the given flags.
    ///
    /// Used by the clone() syscall to create both processes and threads.
    ///
    /// The exit signal is broken out from the flags parameter like clone3() rather than being
    /// bitwise-ORed like clone().
    pub fn clone_task<L>(
        &self,
        locked: &mut Locked<L>,
        flags: u64,
        child_exit_signal: Option<Signal>,
        user_parent_tid: UserRef<pid_t>,
        user_child_tid: UserRef<pid_t>,
        user_pidfd: UserRef<FdNumber>,
    ) -> Result<TaskBuilder, Errno>
    where
        L: LockBefore<MmDumpable>,
        L: LockBefore<TaskRelease>,
        L: LockBefore<ProcessGroupState>,
    {
        // Flags whose behavior is implemented (or deliberately stubbed via track_stub!) below.
        // Valid flags outside this set return ENOSYS rather than EINVAL.
        const IMPLEMENTED_FLAGS: u64 = (CLONE_VM
            | CLONE_FS
            | CLONE_FILES
            | CLONE_SIGHAND
            | CLONE_THREAD
            | CLONE_SYSVSEM
            | CLONE_SETTLS
            | CLONE_PARENT
            | CLONE_PARENT_SETTID
            | CLONE_PIDFD
            | CLONE_CHILD_CLEARTID
            | CLONE_CHILD_SETTID
            | CLONE_VFORK
            | CLONE_NEWUTS
            | CLONE_PTRACE) as u64;

        // A mask with all valid flags set, because we want to return a different error code for an
        // invalid flag vs an unimplemented flag. Subtracting 1 from the largest valid flag gives a
        // mask with all flags below it set. Shift up by one to make sure the largest flag is also
        // set.
        const VALID_FLAGS: u64 = (CLONE_INTO_CGROUP << 1) - 1;

        // CLONE_SETTLS is implemented by sys_clone.

        let clone_files = flags & (CLONE_FILES as u64) != 0;
        let clone_fs = flags & (CLONE_FS as u64) != 0;
        let clone_parent = flags & (CLONE_PARENT as u64) != 0;
        let clone_parent_settid = flags & (CLONE_PARENT_SETTID as u64) != 0;
        let clone_pidfd = flags & (CLONE_PIDFD as u64) != 0;
        let clone_child_cleartid = flags & (CLONE_CHILD_CLEARTID as u64) != 0;
        let clone_child_settid = flags & (CLONE_CHILD_SETTID as u64) != 0;
        let clone_sysvsem = flags & (CLONE_SYSVSEM as u64) != 0;
        let clone_ptrace = flags & (CLONE_PTRACE as u64) != 0;
        let clone_thread = flags & (CLONE_THREAD as u64) != 0;
        let clone_vm = flags & (CLONE_VM as u64) != 0;
        let clone_sighand = flags & (CLONE_SIGHAND as u64) != 0;
        let clone_vfork = flags & (CLONE_VFORK as u64) != 0;
        let clone_newuts = flags & (CLONE_NEWUTS as u64) != 0;
        let clone_into_cgroup = flags & CLONE_INTO_CGROUP != 0;

        if clone_ptrace {
            track_stub!(TODO("https://fxbug.dev/322874630"), "CLONE_PTRACE");
        }

        if clone_sysvsem {
            track_stub!(TODO("https://fxbug.dev/322875185"), "CLONE_SYSVSEM");
        }

        if clone_into_cgroup {
            track_stub!(TODO("https://fxbug.dev/403612570"), "CLONE_INTO_CGROUP");
        }

        // Flag combinations rejected with EINVAL, mirroring clone(2).
        if clone_sighand && !clone_vm {
            return error!(EINVAL);
        }
        if clone_thread && !clone_sighand {
            return error!(EINVAL);
        }

        if clone_pidfd && clone_thread {
            return error!(EINVAL);
        }
        if clone_pidfd && clone_parent_settid && user_parent_tid.addr() == user_pidfd.addr() {
            // `clone()` uses the same out-argument for these, so error out if they have the same
            // user address.
            return error!(EINVAL);
        }

        if flags & !VALID_FLAGS != 0 {
            return error!(EINVAL);
        }

        if clone_vm && !clone_thread {
            // TODO(https://fxbug.dev/42066087) Implement CLONE_VM for child processes (not just child
            // threads). Currently this executes CLONE_VM (explicitly passed to clone() or as
            // used by vfork()) as a fork (the VM in the child is copy-on-write) which is almost
            // always OK.
            //
            // CLONE_VM is primarily as an optimization to avoid making a copy-on-write version of a
            // process' VM that will be immediately replaced with a call to exec(). The main users
            // (libc and language runtimes) don't actually rely on the memory being shared between
            // the two processes. And the vfork() man page explicitly allows vfork() to be
            // implemented as fork() which is what we do here.
            if !clone_vfork {
                track_stub!(
                    TODO("https://fxbug.dev/322875227"),
                    "CLONE_VM without CLONE_THREAD or CLONE_VFORK"
                );
            }
        } else if clone_thread && !clone_vm {
            // NOTE(review): this branch looks unreachable — clone_thread requires clone_sighand
            // (EINVAL above), and clone_sighand without clone_vm already returned EINVAL. Verify
            // before relying on the ENOSYS here.
            track_stub!(TODO("https://fxbug.dev/322875167"), "CLONE_THREAD without CLONE_VM");
            return error!(ENOSYS);
        }

        if flags & !IMPLEMENTED_FLAGS != 0 {
            track_stub!(
                TODO("https://fxbug.dev/322875130"),
                "clone unknown flags",
                flags & !IMPLEMENTED_FLAGS
            );
            return error!(ENOSYS);
        }

        // Share or fork the FS context and file table per the flags.
        let fs = if clone_fs { self.fs() } else { self.fs().fork() };
        let files = if clone_files { self.files.clone() } else { self.files.fork() };

        let kernel = self.kernel();

        let mut pids = kernel.pids.write();

        // Lock the cgroup process hierarchy so that the parent process cannot move to a different
        // cgroup while a new task or thread_group is created. This may be unnecessary if
        // CLONE_INTO_CGROUP is implemented and passed in.
        let mut cgroup2_pid_table = kernel.cgroups.lock_cgroup2_pid_table();
        // Create a `KernelSignal::Freeze` to put onto the new task, if the cgroup is frozen.
        let child_kernel_signals = cgroup2_pid_table
            .maybe_create_freeze_signal(self.thread_group())
            .into_iter()
            .collect::<VecDeque<_>>();

        let pid;
        let command;
        let creds;
        let scheduler_state;
        let no_new_privs;
        let seccomp_filters;
        // The child does not inherit this task's robust futex list; it starts with a null head
        // and must register its own via set_robust_list.
        let robust_list_head = RobustListHeadPtr::null(self);
        let child_signal_mask;
        let timerslack_ns;
        let uts_ns;
        let security_state = security::task_alloc(&self, flags);

        let TaskInfo { thread, thread_group, memory_manager } = {
            // These variables hold the original parent in case we need to switch the parent of the
            // new task because of CLONE_PARENT.
            let weak_original_parent;
            let original_parent;

            // Make sure to drop these locks ASAP to avoid inversion
            let thread_group_state = {
                let thread_group_state = self.thread_group().write();
                if clone_parent {
                    // With the CLONE_PARENT flag, the parent of the new task is our parent
                    // instead of ourselves.
                    weak_original_parent =
                        thread_group_state.parent.clone().ok_or_else(|| errno!(EINVAL))?;
                    std::mem::drop(thread_group_state);
                    original_parent = weak_original_parent.upgrade();
                    original_parent.write()
                } else {
                    thread_group_state
                }
            };

            let state = self.read();

            no_new_privs = state.no_new_privs();
            seccomp_filters = state.seccomp_filters.clone();
            child_signal_mask = state.signal_mask();

            pid = pids.allocate_pid();
            command = self.command();
            creds = self.current_creds().clone();
            scheduler_state = state.scheduler_state.fork();
            timerslack_ns = state.timerslack_ns;

            // Creating a new UTS namespace requires CAP_SYS_ADMIN.
            uts_ns = if clone_newuts {
                security::check_task_capable(self, CAP_SYS_ADMIN)?;
                state.uts_ns.read().fork()
            } else {
                state.uts_ns.clone()
            };

            if clone_thread {
                TaskInfo {
                    thread: None,
                    thread_group: self.thread_group().clone(),
                    memory_manager: self.mm().ok(),
                }
            } else {
                // Drop the lock on this task before entering `create_zircon_process`, because it will
                // take a lock on the new thread group, and locks on thread groups have a higher
                // priority than locks on the task in the thread group.
                std::mem::drop(state);
                let signal_actions = if clone_sighand {
                    self.thread_group().signal_actions.clone()
                } else {
                    self.thread_group().signal_actions.fork()
                };
                let process_group = thread_group_state.process_group.clone();

                let task_info = create_zircon_process(
                    locked,
                    kernel,
                    Some(thread_group_state),
                    pid,
                    child_exit_signal,
                    process_group,
                    signal_actions,
                    command.clone(),
                )?;

                // Child processes start in the same cgroup as their parent.
                cgroup2_pid_table.inherit_cgroup(self.thread_group(), &task_info.thread_group);

                task_info
            }
        };

        // Drop the lock on the cgroup pid_table before creating the TaskBuilder.
        // If the TaskBuilder creation fails, the TaskBuilder is dropped, which calls
        // ThreadGroup::remove. ThreadGroup::remove takes the cgroup pid_table lock, causing
        // a cyclic lock dependency.
        std::mem::drop(cgroup2_pid_table);

        // Only create the vfork event when the caller requested CLONE_VFORK.
        let vfork_event = if clone_vfork { Some(Arc::new(zx::Event::create())) } else { None };

        let mut child = TaskBuilder::new(Task::new(
            pid,
            command,
            thread_group,
            thread,
            files,
            memory_manager,
            fs,
            creds,
            self.abstract_socket_namespace.clone(),
            self.abstract_vsock_namespace.clone(),
            child_signal_mask,
            child_kernel_signals,
            vfork_event,
            scheduler_state,
            uts_ns,
            no_new_privs,
            SeccompState::from(&self.seccomp_filter_state),
            seccomp_filters,
            robust_list_head,
            timerslack_ns,
            security_state,
        ));

        // If any step below fails, `release_on_error` releases the partially constructed child
        // before propagating the error.
        release_on_error!(child, locked, {
            let child_task = TempRef::from(&child.task);
            // Drop the pids lock as soon as possible after creating the child. Destroying the child
            // and removing it from the pids table itself requires the pids lock, so if an early exit
            // takes place we have a self deadlock.
            pids.add_task(&child_task);
            std::mem::drop(pids);

            // Child lock must be taken before this lock. Drop the lock on the task, take a writable
            // lock on the child and take the current state back.

            #[cfg(any(test, debug_assertions))]
            {
                // Take the lock on the thread group and its child in the correct order to ensure any wrong ordering
                // will trigger the tracing-mutex at the right call site.
                if !clone_thread {
                    let _l1 = self.thread_group().read();
                    let _l2 = child.thread_group().read();
                }
            }

            if clone_thread {
                self.thread_group().add(&child_task)?;
            } else {
                child.thread_group().add(&child_task)?;

                // These manipulations of the signal handling state appear to be related to
                // CLONE_SIGHAND and CLONE_VM rather than CLONE_THREAD. However, we do not support
                // all the combinations of these flags, which means doing these operations here
                // might actually be correct. However, if you find a test that fails because of the
                // placement of this logic here, we might need to move it.
                let mut child_state = child.write();
                let state = self.read();
                child_state.set_sigaltstack(state.sigaltstack());
                child_state.set_signal_mask(state.signal_mask());
            }

            if !clone_vm {
                // We do not support running threads in the same process with different
                // MemoryManagers.
                assert!(!clone_thread);
                self.mm()?.snapshot_to(locked, &child.mm()?)?;
            }

            if clone_parent_settid {
                self.write_object(user_parent_tid, &child.tid)?;
            }

            if clone_child_cleartid {
                child.write().clear_child_tid = user_child_tid;
            }

            if clone_child_settid {
                child.write_object(user_child_tid, &child.tid)?;
            }

            if clone_pidfd {
                let locked = locked.cast_locked::<TaskRelease>();
                let file = new_pidfd(
                    locked,
                    self,
                    child.thread_group(),
                    &*child.mm()?,
                    OpenFlags::empty(),
                );
                // pidfds are created with CLOEXEC, matching clone(2).
                let pidfd = self.add_file(locked, file, FdFlags::CLOEXEC)?;
                self.write_object(user_pidfd, &pidfd)?;
            }

            // TODO(https://fxbug.dev/42066087): We do not support running different processes with
            // the same MemoryManager. Instead, we implement a rough approximation of that behavior
            // by making a copy-on-write clone of the memory from the original process.
            if clone_vm && !clone_thread {
                self.mm()?.snapshot_to(locked, &child.mm()?)?;
            }

            // Seed the child's register state from a snapshot of this task's thread state.
            child.thread_state = self.thread_state.snapshot::<HeapRegs>();
            Ok(())
        });

        // Take the lock on thread group and task in the correct order to ensure any wrong ordering
        // will trigger the tracing-mutex at the right call site.
        #[cfg(any(test, debug_assertions))]
        {
            let _l1 = child.thread_group().read();
            let _l2 = child.read();
        }

        Ok(child)
    }
1869
1870    /// Sets the stop state (per set_stopped), and also notifies all listeners,
1871    /// including the parent process and the tracer if appropriate.
1872    pub fn set_stopped_and_notify(&self, stopped: StopState, siginfo: Option<SignalInfo>) {
1873        let maybe_signal_info = {
1874            let mut state = self.write();
1875            state.copy_state_from(self);
1876            state.set_stopped(stopped, siginfo, Some(self), None);
1877            state.prepare_signal_info(stopped)
1878        };
1879
1880        if let Some((tracer, signal_info)) = maybe_signal_info {
1881            if let Some(tracer) = tracer.upgrade() {
1882                tracer.write().send_signal(signal_info);
1883            }
1884        }
1885
1886        if !stopped.is_in_progress() {
1887            let parent = self.thread_group().read().parent.clone();
1888            if let Some(parent) = parent {
1889                parent
1890                    .upgrade()
1891                    .write()
1892                    .lifecycle_waiters
1893                    .notify_value(ThreadGroupLifecycleWaitValue::ChildStatus);
1894            }
1895        }
1896    }
1897
    /// If the task is stopping, set it as stopped. return whether the caller
    /// should stop.  The task might also be waking up.
    pub fn finalize_stop_state(&mut self) -> bool {
        let stopped = self.load_stopped();

        if !stopped.is_stopping_or_stopped() {
            // If we are waking up, potentially write back state a tracer may have modified.
            // Only registers marked dirty by the tracer are written back.
            let captured_state = self.write().take_captured_state();
            if let Some(captured) = captured_state {
                if captured.dirty {
                    self.thread_state.replace_registers(&captured.thread_state);
                }
            }
        }

        // Stopping because the thread group is stopping.
        // Try to flip to GroupStopped - will fail if we shouldn't.
        if self.thread_group().set_stopped(StopState::GroupStopped, None, true)
            == StopState::GroupStopped
        {
            let signal = self.thread_group().read().last_signal.clone();
            // stopping because the thread group has stopped
            let event = Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0));
            self.write().set_stopped(StopState::GroupStopped, signal, Some(self), event);
            return true;
        }

        // Stopping because the task is stopping
        if stopped.is_stopping_or_stopped() {
            if let Ok(stopped) = stopped.finalize() {
                self.set_stopped_and_notify(stopped, None);
            }
            return true;
        }

        // Neither the thread group nor the task is stopping: the caller should
        // keep running.
        false
    }
1935
    /// Block the execution of `current_task` as long as the task is stopped and
    /// not terminated.
    pub fn block_while_stopped(&mut self, locked: &mut Locked<Unlocked>) {
        // Upgrade the state from stopping to stopped if needed. Return if the task
        // should not be stopped.
        if !self.finalize_stop_state() {
            return;
        }

        // The waiter ignores signals: a stopped task must not be woken by
        // ordinary signal delivery, only by the stop state changing.
        let waiter = Waiter::with_options(WaiterOptions::IGNORE_SIGNALS);
        loop {
            // If we've exited, unstop the threads and return without notifying
            // waiters.
            if self.is_exitted() {
                self.thread_group().set_stopped(StopState::ForceAwake, None, false);
                self.write().set_stopped(StopState::ForceAwake, None, Some(self), None);
                return;
            }

            if self.wake_or_wait_until_unstopped_async(&waiter) {
                return;
            }

            // Do the wait. Result is not needed, as this is not in a syscall.
            let _: Result<(), Errno> = waiter.wait(locked, self);

            // Maybe go from stopping to stopped, if we are currently stopping
            // again.
            self.finalize_stop_state();
        }
    }
1967
1968    /// For traced tasks, this will return the data neceessary for a cloned task
1969    /// to attach to the same tracer.
1970    pub fn get_ptrace_core_state_for_clone(
1971        &mut self,
1972        clone_args: &clone_args,
1973    ) -> (PtraceOptions, Option<PtraceCoreState>) {
1974        let state = self.write();
1975        if let Some(ptrace) = &state.ptrace {
1976            ptrace.get_core_state_for_clone(clone_args)
1977        } else {
1978            (PtraceOptions::empty(), None)
1979        }
1980    }
1981
    /// If currently being ptraced with the given option, emit the appropriate
    /// event.  PTRACE_EVENTMSG will return the given message.  Also emits the
    /// appropriate event for execve in the absence of TRACEEXEC.
    ///
    /// Note that the Linux kernel has a documented bug where, if TRACEEXIT is
    /// enabled, SIGKILL will trigger an event.  We do not exhibit this
    /// behavior.
    pub fn ptrace_event(
        &mut self,
        locked: &mut Locked<Unlocked>,
        trace_kind: PtraceOptions,
        msg: u64,
    ) {
        if !trace_kind.is_empty() {
            // Inner scope ensures the task state write guard is released before
            // we block below.
            {
                let mut state = self.write();
                if let Some(ptrace) = &mut state.ptrace {
                    if !ptrace.has_option(trace_kind) {
                        // If this would be a TRACEEXEC, but TRACEEXEC is not
                        // turned on, then send a SIGTRAP.
                        if trace_kind == PtraceOptions::TRACEEXEC && !ptrace.is_seized() {
                            // Send a SIGTRAP so that the parent can gain control.
                            // `send_signal_first` takes ownership of the state guard.
                            send_signal_first(locked, self, state, SignalInfo::default(SIGTRAP));
                        }

                        return;
                    }
                    // Encode the ptrace event in the upper bits of si_code, with
                    // SIGTRAP in the low byte.
                    let mut siginfo = SignalInfo::default(starnix_uapi::signals::SIGTRAP);
                    siginfo.code = (((PtraceEvent::from_option(&trace_kind) as u32) << 8)
                        | linux_uapi::SIGTRAP) as i32;
                    state.set_stopped(
                        StopState::PtraceEventStopping,
                        Some(siginfo),
                        None,
                        Some(PtraceEventData::new(trace_kind, msg)),
                    );
                } else {
                    return;
                }
            }
            self.block_while_stopped(locked);
        }
    }
2025
2026    /// Causes the current thread's thread group to exit, notifying any ptracer
2027    /// of this task first.
2028    pub fn thread_group_exit(&mut self, locked: &mut Locked<Unlocked>, exit_status: ExitStatus) {
2029        self.ptrace_event(
2030            locked,
2031            PtraceOptions::TRACEEXIT,
2032            exit_status.signal_info_status() as u64,
2033        );
2034        self.thread_group().exit(locked, exit_status, None);
2035    }
2036
2037    /// The flags indicates only the flags as in clone3(), and does not use the low 8 bits for the
2038    /// exit signal as in clone().
2039    pub fn clone_task_for_test<L>(
2040        &self,
2041        locked: &mut Locked<L>,
2042        flags: u64,
2043        exit_signal: Option<Signal>,
2044    ) -> crate::testing::AutoReleasableTask
2045    where
2046        L: LockBefore<MmDumpable>,
2047        L: LockBefore<TaskRelease>,
2048        L: LockBefore<ProcessGroupState>,
2049    {
2050        let result = self
2051            .clone_task(
2052                locked,
2053                flags,
2054                exit_signal,
2055                UserRef::default(),
2056                UserRef::default(),
2057                UserRef::default(),
2058            )
2059            .expect("failed to create task in test");
2060
2061        result.into()
2062    }
2063
    /// Checks whether `self` is allowed ptrace-style access to `target` under
    /// the given access `mode`, returning `Ok(())` on success.
    ///
    /// See "Ptrace access mode checking" in https://man7.org/linux/man-pages/man2/ptrace.2.html
    pub fn check_ptrace_access_mode<L>(
        &self,
        locked: &mut Locked<L>,
        mode: PtraceAccessMode,
        target: &Task,
    ) -> Result<(), Errno>
    where
        L: LockBefore<MmDumpable>,
    {
        // (1)  If the calling thread and the target thread are in the same
        //      thread group, access is always allowed.
        if self.thread_group().leader == target.thread_group().leader {
            return Ok(());
        }

        // (2)  If the access mode specifies PTRACE_MODE_FSCREDS, then, for
        //      the check in the next step, employ the caller's filesystem
        //      UID and GID.  (As noted in credentials(7), the filesystem
        //      UID and GID almost always have the same values as the
        //      corresponding effective IDs.)
        //
        //      Otherwise, the access mode specifies PTRACE_MODE_REALCREDS,
        //      so use the caller's real UID and GID for the checks in the
        //      next step.  (Most APIs that check the caller's UID and GID
        //      use the effective IDs.  For historical reasons, the
        //      PTRACE_MODE_REALCREDS check uses the real IDs instead.)
        let (uid, gid) = if mode.contains(PTRACE_MODE_FSCREDS) {
            let fscred = self.current_creds().as_fscred();
            (fscred.uid, fscred.gid)
        } else if mode.contains(PTRACE_MODE_REALCREDS) {
            let creds = self.current_creds();
            (creds.uid, creds.gid)
        } else {
            // Callers must pass exactly one of FSCREDS/REALCREDS.
            unreachable!();
        };

        // (3)  Deny access if neither of the following is true:
        //
        //      -  The real, effective, and saved-set user IDs of the target
        //         match the caller's user ID, and the real, effective, and
        //         saved-set group IDs of the target match the caller's
        //         group ID.
        //
        //      -  The caller has the CAP_SYS_PTRACE capability in the user
        //         namespace of the target.
        let target_creds = target.real_creds();
        if !(target_creds.uid == uid
            && target_creds.euid == uid
            && target_creds.saved_uid == uid
            && target_creds.gid == gid
            && target_creds.egid == gid
            && target_creds.saved_gid == gid)
        {
            security::check_task_capable(self, CAP_SYS_PTRACE)?;
        }

        // (4)  Deny access if the target process "dumpable" attribute has a
        //      value other than 1 (SUID_DUMP_USER; see the discussion of
        //      PR_SET_DUMPABLE in prctl(2)), and the caller does not have
        //      the CAP_SYS_PTRACE capability in the user namespace of the
        //      target process.
        let dumpable = *target.mm()?.dumpable.lock(locked);
        match dumpable {
            DumpPolicy::User => (),
            DumpPolicy::Disable => security::check_task_capable(self, CAP_SYS_PTRACE)?,
        }

        // (5)  The kernel LSM security_ptrace_access_check() interface is
        //      invoked to see if ptrace access is permitted.
        security::ptrace_access_check(self, target, mode)?;

        // (6)  If access has not been denied by any of the preceding steps,
        //      then access is allowed.
        Ok(())
    }
2140
2141    pub fn can_signal(
2142        &self,
2143        target: &Task,
2144        unchecked_signal: UncheckedSignal,
2145    ) -> Result<(), Errno> {
2146        // If both the tasks share a thread group the signal can be sent. This is not documented
2147        // in kill(2) because kill does not support task-level granularity in signal sending.
2148        if self.thread_group == target.thread_group {
2149            return Ok(());
2150        }
2151
2152        let self_creds = self.current_creds();
2153        let target_creds = target.real_creds();
2154        // From https://man7.org/linux/man-pages/man2/kill.2.html:
2155        //
2156        // > For a process to have permission to send a signal, it must either be
2157        // > privileged (under Linux: have the CAP_KILL capability in the user
2158        // > namespace of the target process), or the real or effective user ID of
2159        // > the sending process must equal the real or saved set- user-ID of the
2160        // > target process.
2161        //
2162        // Returns true if the credentials are considered to have the same user ID.
2163        if self_creds.euid == target_creds.saved_uid
2164            || self_creds.euid == target_creds.uid
2165            || self_creds.uid == target_creds.uid
2166            || self_creds.uid == target_creds.saved_uid
2167        {
2168            return Ok(());
2169        }
2170
2171        if Signal::try_from(unchecked_signal) == Ok(SIGCONT) {
2172            let target_session = target.thread_group().read().process_group.session.leader;
2173            let self_session = self.thread_group().read().process_group.session.leader;
2174            if target_session == self_session {
2175                return Ok(());
2176            }
2177        }
2178
2179        security::check_task_capable(self, CAP_KILL)
2180    }
2181}
2182
impl ArchSpecific for CurrentTask {
    /// Delegates to the saved thread state's `is_arch32`.
    fn is_arch32(&self) -> bool {
        self.thread_state.is_arch32()
    }
}
2188
2189impl MemoryAccessor for CurrentTask {
2190    fn read_memory<'a>(
2191        &self,
2192        addr: UserAddress,
2193        bytes: &'a mut [MaybeUninit<u8>],
2194    ) -> Result<&'a mut [u8], Errno> {
2195        self.mm()?.unified_read_memory(self, addr, bytes)
2196    }
2197
2198    fn read_memory_partial_until_null_byte<'a>(
2199        &self,
2200        addr: UserAddress,
2201        bytes: &'a mut [MaybeUninit<u8>],
2202    ) -> Result<&'a mut [u8], Errno> {
2203        self.mm()?.unified_read_memory_partial_until_null_byte(self, addr, bytes)
2204    }
2205
2206    fn read_memory_partial<'a>(
2207        &self,
2208        addr: UserAddress,
2209        bytes: &'a mut [MaybeUninit<u8>],
2210    ) -> Result<&'a mut [u8], Errno> {
2211        self.mm()?.unified_read_memory_partial(self, addr, bytes)
2212    }
2213
2214    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
2215        self.mm()?.unified_write_memory(self, addr, bytes)
2216    }
2217
2218    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
2219        self.mm()?.unified_write_memory_partial(self, addr, bytes)
2220    }
2221
2222    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
2223        self.mm()?.unified_zero(self, addr, length)
2224    }
2225}
2226
2227impl TaskMemoryAccessor for CurrentTask {
2228    fn maximum_valid_address(&self) -> Option<UserAddress> {
2229        self.mm().ok().map(|mm| mm.maximum_valid_user_address)
2230    }
2231}
2232
/// Outcome of processing a Zircon exception for a task.
pub enum ExceptionResult {
    /// The exception was handled and no further action is required.
    Handled,

    /// The exception generated a signal that should be delivered.
    Signal(SignalInfo),
}
2240
#[cfg(test)]
mod tests {
    use crate::task::FullCredentials;
    use crate::testing::spawn_kernel_and_run;

    // Runs `override_creds` and checks that it doesn't crash and returns the closure's
    // value. This ensures that the delegation to `override_creds_async` is correct.
    #[::fuchsia::test]
    async fn test_override_creds_can_delegate_to_async_version() {
        spawn_kernel_and_run(async move |_, current_task| {
            assert_eq!(current_task.override_creds(FullCredentials::for_kernel(), || 0), 0);
        })
        .await;
    }
}