starnix_core/task/current_task.rs

// Copyright 2023 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use crate::arch::task::{decode_page_fault_exception_report, get_signal_for_general_exception};
use crate::execution::{TaskInfo, create_zircon_process};
use crate::mm::{DumpPolicy, MemoryAccessor, MemoryAccessorExt, TaskMemoryAccessor};
use crate::security;
use crate::signals::{RunState, SignalInfo, send_signal_first, send_standard_signal};
use crate::task::loader::{ResolvedElf, load_executable, resolve_executable};
use crate::task::{
    ExitStatus, PtraceCoreState, PtraceEvent, PtraceEventData, PtraceOptions, RobustListHeadPtr,
    SeccompFilter, SeccompFilterContainer, SeccompNotifierHandle, SeccompState, SeccompStateValue,
    StopState, Task, TaskFlags, Waiter,
};
use crate::vfs::{
    CheckAccessReason, FdFlags, FdNumber, FileHandle, FsStr, LookupContext, MAX_SYMLINK_FOLLOWS,
    NamespaceNode, ResolveBase, SymlinkMode, SymlinkTarget, new_pidfd,
};
use extended_pstate::ExtendedPstateState;
use futures::FutureExt;
use linux_uapi::CLONE_PIDFD;
use starnix_logging::{log_error, log_warn, track_file_not_found, track_stub};
use starnix_registers::RegisterState;
use starnix_stack::clean_stack;
use starnix_sync::{
    EventWaitGuard, FileOpsCore, LockBefore, LockEqualOrBefore, Locked, MmDumpable,
    ProcessGroupState, TaskRelease, Unlocked, WakeReason,
};
use starnix_syscalls::SyscallResult;
use starnix_syscalls::decls::Syscall;
use starnix_task_command::TaskCommand;
use starnix_types::arch::ArchWidth;
use starnix_types::futex_address::FutexAddress;
use starnix_types::ownership::{OwnedRef, Releasable, TempRef, WeakRef, release_on_error};
use starnix_uapi::auth::{
    CAP_KILL, CAP_SYS_ADMIN, CAP_SYS_PTRACE, Credentials, FsCred, PTRACE_MODE_FSCREDS,
    PTRACE_MODE_REALCREDS, PtraceAccessMode, UserAndOrGroupId,
};
use starnix_uapi::device_type::DeviceType;
use starnix_uapi::errors::{Errno, ErrnoCode};
use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
use starnix_uapi::open_flags::OpenFlags;
use starnix_uapi::signals::{
    SIGBUS, SIGCHLD, SIGCONT, SIGILL, SIGKILL, SIGSEGV, SIGSYS, SIGTRAP, SigSet, Signal,
    UncheckedSignal,
};
use starnix_uapi::user_address::{ArchSpecific, UserAddress, UserRef};
use starnix_uapi::vfs::ResolveFlags;
use starnix_uapi::{
    CLONE_CHILD_CLEARTID, CLONE_CHILD_SETTID, CLONE_FILES, CLONE_FS, CLONE_INTO_CGROUP,
    CLONE_NEWUTS, CLONE_PARENT, CLONE_PARENT_SETTID, CLONE_PTRACE, CLONE_SETTLS, CLONE_SIGHAND,
    CLONE_SYSVSEM, CLONE_THREAD, CLONE_VFORK, CLONE_VM, FUTEX_OWNER_DIED, FUTEX_TID_MASK,
    ROBUST_LIST_LIMIT, SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_NEW_LISTENER,
    SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_TSYNC_ESRCH, SI_KERNEL, clone_args, errno,
    error, from_status_like_fdio, pid_t, sock_filter, ucred,
};
use std::cell::RefCell;
use std::collections::VecDeque;
use std::ffi::CString;
use std::fmt;
use std::marker::PhantomData;
use std::mem::MaybeUninit;
use std::sync::Arc;
use zx::sys::zx_thread_state_general_regs_t;

use super::ThreadGroupLifecycleWaitValue;

pub struct TaskBuilder {
    /// The underlying task object.
    pub task: OwnedRef<Task>,

    pub thread_state: Box<ThreadState>,
}

impl TaskBuilder {
    pub fn new(task: OwnedRef<Task>) -> Self {
        Self { task, thread_state: Default::default() }
    }

    #[inline(always)]
    pub fn release<L>(self, locked: &mut Locked<L>)
    where
        L: LockBefore<TaskRelease>,
    {
        let locked = locked.cast_locked::<TaskRelease>();
        Releasable::release(self, locked);
    }
}

impl From<TaskBuilder> for CurrentTask {
    fn from(builder: TaskBuilder) -> Self {
        Self::new(builder.task, builder.thread_state)
    }
}

impl Releasable for TaskBuilder {
    type Context<'a> = &'a mut Locked<TaskRelease>;

    fn release<'a>(self, locked: Self::Context<'a>) {
        let kernel = Arc::clone(self.kernel());
        let mut pids = kernel.pids.write();

        // We remove from the thread group here because the WeakRef in the pid
        // table to this task must be valid until this task is removed from the
        // thread group, and the code below will invalidate it.
        // Moreover, this requires an OwnedRef of the task to ensure the tasks of
        // the thread group are always valid.
        self.task.thread_group().remove(locked, &mut pids, &self.task);

        let context = (self.thread_state, locked, pids);
        self.task.release(context);
    }
}

impl std::ops::Deref for TaskBuilder {
    type Target = Task;
    fn deref(&self) -> &Self::Target {
        &self.task
    }
}

/// Task permissions are determined from the task's credentials and, if enabled,
/// from its SEStarnix security state.
#[derive(Debug, Clone)]
pub struct FullCredentials {
    pub creds: Credentials,
    pub security_state: security::TaskState,
}

impl FullCredentials {
    pub fn for_kernel() -> Self {
        Self { creds: Credentials::root(), security_state: security::task_alloc_for_kernel() }
    }
}

/// The task object associated with the currently executing thread.
///
/// We often pass the `CurrentTask` as the first argument to functions if those functions need to
/// know contextual information about the thread on which they are running. For example, we often
/// use the `CurrentTask` to perform access checks, which ensures that the caller is authorized to
/// perform the requested operation.
///
/// The `CurrentTask` also has state that can be referenced only on the currently executing thread,
/// such as the register state for that thread. Syscalls are given a mutable reference to the
/// `CurrentTask`, which lets them manipulate this state.
///
/// See also `Task` for more information about tasks.
pub struct CurrentTask {
    /// The underlying task object.
    pub task: OwnedRef<Task>,

    pub thread_state: Box<ThreadState>,

    // TODO(https://fxbug.dev/433548348): Avoid interior mutability here by passing a
    // &mut CurrentTask around instead of &CurrentTask.
    pub overridden_creds: RefCell<Option<FullCredentials>>,

    /// Makes CurrentTask neither Sync nor Send.
    _local_marker: PhantomData<*mut u8>,
}

/// The thread-related information of a `CurrentTask`. This information should never be used
/// outside of the thread owning the `CurrentTask`.
#[derive(Default)]
pub struct ThreadState {
    /// A copy of the registers associated with the Zircon thread. Up-to-date values can be read
    /// from `self.handle.read_state_general_regs()`. To write these values back to the thread, call
    /// `self.handle.write_state_general_regs(self.thread_state.registers.into())`.
    pub registers: RegisterState,

    /// Copy of the current extended processor state including floating point and vector registers.
    pub extended_pstate: ExtendedPstateState,

    /// The errno code (if any) that indicated this task should restart a syscall.
    pub restart_code: Option<ErrnoCode>,

    /// A custom function to resume a syscall that has been interrupted by SIGSTOP.
    /// To use, call set_syscall_restart_func and return ERESTART_RESTARTBLOCK. sys_restart_syscall
    /// will eventually call it.
    pub syscall_restart_func: Option<Box<SyscallRestartFunc>>,

    /// An architecture-agnostic enum indicating the width (32 or 64 bits) of the execution
    /// environment in use.
    pub arch_width: ArchWidth,
}

impl ThreadState {
    /// Returns a new `ThreadState` with the same `registers` as this one.
    fn snapshot(&self) -> Box<Self> {
        Box::new(Self {
            registers: self.registers,
            extended_pstate: Default::default(),
            restart_code: self.restart_code,
            syscall_restart_func: None,
            arch_width: self.arch_width,
        })
    }

    pub fn extended_snapshot(&self) -> Self {
        Self {
            registers: self.registers.clone(),
            extended_pstate: self.extended_pstate.clone(),
            restart_code: self.restart_code,
            syscall_restart_func: None,
            arch_width: self.arch_width,
        }
    }

    pub fn replace_registers(&mut self, other: &ThreadState) {
        self.registers = other.registers;
        self.extended_pstate = other.extended_pstate;
        self.arch_width = other.arch_width;
    }

    pub fn get_user_register(&mut self, offset: usize) -> Result<usize, Errno> {
        let mut result: usize = 0;
        self.registers.apply_user_register(offset, &mut |register| result = *register as usize)?;
        Ok(result)
    }

    pub fn set_user_register(&mut self, offset: usize, value: usize) -> Result<(), Errno> {
        self.registers.apply_user_register(offset, &mut |register| *register = value as u64)
    }
}
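
// Illustrative sketch (not in the original source): `get_user_register` and
// `set_user_register` address a register by its offset into the user-visible
// register area. A hypothetical helper that increments one register could be
// built from the two accessors like this:
//
//     fn bump_register(state: &mut ThreadState, offset: usize) -> Result<(), Errno> {
//         let value = state.get_user_register(offset)?;
//         state.set_user_register(offset, value.wrapping_add(1))
//     }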

impl ArchSpecific for ThreadState {
    fn is_arch32(&self) -> bool {
        self.arch_width.is_arch32()
    }
}

type SyscallRestartFunc = dyn FnOnce(&mut Locked<Unlocked>, &mut CurrentTask) -> Result<SyscallResult, Errno>
    + Send
    + Sync;

impl Releasable for CurrentTask {
    type Context<'a> = &'a mut Locked<TaskRelease>;

    fn release<'a>(self, locked: Self::Context<'a>) {
        self.notify_robust_list();
        let _ignored = self.clear_child_tid_if_needed(locked);

        let kernel = Arc::clone(self.kernel());
        let mut pids = kernel.pids.write();

        // We remove from the thread group here because the WeakRef in the pid
        // table to this task must be valid until this task is removed from the
        // thread group, and the code below will invalidate it.
        // Moreover, this requires an OwnedRef of the task to ensure the tasks of
        // the thread group are always valid.
        self.task.thread_group().remove(locked, &mut pids, &self.task);

        let context = (self.thread_state, locked, pids);
        self.task.release(context);
    }
}

impl std::ops::Deref for CurrentTask {
    type Target = Task;
    fn deref(&self) -> &Self::Target {
        &self.task
    }
}

impl fmt::Debug for CurrentTask {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.task.fmt(f)
    }
}

impl CurrentTask {
    pub fn new(task: OwnedRef<Task>, thread_state: Box<ThreadState>) -> Self {
        Self {
            task,
            thread_state,
            overridden_creds: RefCell::new(None),
            _local_marker: Default::default(),
        }
    }

    /// Returns the current subjective credentials of the task.
    ///
    /// The subjective credentials are the credentials that are used to check permissions for
    /// actions performed by the task.
    pub fn current_creds(&self) -> Credentials {
        match self.overridden_creds.borrow().as_ref() {
            Some(full_creds) => full_creds.creds.clone(),
            None => self.real_creds(),
        }
    }

    pub fn with_current_creds<B, F>(&self, f: F) -> B
    where
        F: FnOnce(&Credentials) -> B,
    {
        match self.overridden_creds.borrow().as_ref() {
            Some(x) => f(&x.creds),
            None => self.with_real_creds(f),
        }
    }

    /// Returns the current subjective credentials of the task, including the security state.
    pub fn full_current_creds(&self) -> FullCredentials {
        match self.overridden_creds.borrow().as_ref() {
            Some(full_creds) => full_creds.clone(),
            None => FullCredentials {
                creds: self.real_creds(),
                security_state: self.security_state.clone(),
            },
        }
    }

    pub fn current_fscred(&self) -> FsCred {
        self.with_current_creds(|creds| creds.as_fscred())
    }

    pub fn current_ucred(&self) -> ucred {
        self.with_current_creds(|creds| ucred {
            pid: self.get_pid(),
            uid: creds.uid,
            gid: creds.gid,
        })
    }

    /// Save the current creds and security state, alter them by calling `alter_creds`, then call
    /// `callback`.
    ///
    /// The creds and security state are restored to their original values at the end of the call.
    /// Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and used
    /// to check permissions for actions performed by the task, is altered. The "objective" state,
    /// accessed through `Task::real_creds()` by other tasks and used to check permissions for
    /// actions performed on the task, is not altered, and changes to the credentials are not
    /// externally visible.
    pub async fn override_creds_async<R>(
        &self,
        alter_creds: impl FnOnce(&mut FullCredentials),
        callback: impl AsyncFnOnce() -> R,
    ) -> R {
        let saved = self.overridden_creds.take();
        let mut new_creds = saved.clone().unwrap_or_else(|| FullCredentials {
            creds: self.real_creds(),
            security_state: self.security_state.clone(),
        });
        alter_creds(&mut new_creds);

        self.overridden_creds.replace(Some(new_creds));

        let result = callback().await;

        self.overridden_creds.replace(saved);

        result
    }

    /// Save the current creds and security state, alter them by calling `alter_creds`, then call
    /// `callback`.
    ///
    /// The creds and security state are restored to their original values at the end of the call.
    /// Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and used
    /// to check permissions for actions performed by the task, is altered. The "objective" state,
    /// accessed through `Task::real_creds()` by other tasks and used to check permissions for
    /// actions performed on the task, is not altered, and changes to the credentials are not
    /// externally visible.
    pub fn override_creds<R>(
        &self,
        alter_creds: impl FnOnce(&mut FullCredentials),
        callback: impl FnOnce() -> R,
    ) -> R {
        self.override_creds_async(alter_creds, async move || callback())
            .now_or_never()
            .expect("Future should be ready")
    }
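
    // Illustrative sketch (not in the original source): a caller that needs to
    // perform a single operation with elevated privileges can do so with
    // `override_creds`, which restores the caller's subjective creds afterwards.
    // `Credentials::root()` is the constructor used by `FullCredentials::for_kernel`
    // above; `privileged_operation` is hypothetical.
    //
    //     let result = current_task.override_creds(
    //         |full| full.creds = Credentials::root(),
    //         || privileged_operation(),
    //     );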

    pub fn has_overridden_creds(&self) -> bool {
        self.overridden_creds.borrow().is_some()
    }

    pub fn trigger_delayed_releaser<L>(&self, locked: &mut Locked<L>)
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let locked = locked.cast_locked::<FileOpsCore>();
        self.kernel().delayed_releaser.apply(locked, self);
    }

    pub fn weak_task(&self) -> WeakRef<Task> {
        WeakRef::from(&self.task)
    }

    pub fn temp_task(&self) -> TempRef<'_, Task> {
        TempRef::from(&self.task)
    }

    /// Change the current and real creds of the task. This is invalid to call while temporary
    /// credentials are present.
    pub fn set_creds(&self, creds: Credentials) {
        let overridden_creds = self.overridden_creds.borrow();
        assert!(overridden_creds.is_none());
        #[allow(
            clippy::undocumented_unsafe_blocks,
            reason = "Force documented unsafe blocks in Starnix"
        )]
        unsafe {
            // SAFETY: this is allowed because we are the CurrentTask.
            *self.persistent_info.creds_mut() = creds;
        }
        // The /proc/pid directory's ownership is updated when the task's euid
        // or egid changes. See proc(5).
        let maybe_node = self.proc_pid_directory_cache.lock();
        if let Some(node) = &*maybe_node {
            let creds = self.real_creds().euid_as_fscred();
            // SAFETY: The /proc/pid directory held by `proc_pid_directory_cache` represents the
            // current task. Its owner and group are supposed to track the current task's euid and
            // egid.
            unsafe {
                node.force_chown(creds);
            }
        }
    }

    #[inline(always)]
    pub fn release<L>(self, locked: &mut Locked<L>)
    where
        L: LockBefore<TaskRelease>,
    {
        let locked = locked.cast_locked::<TaskRelease>();
        Releasable::release(self, locked);
    }

    pub fn set_syscall_restart_func<R: Into<SyscallResult>>(
        &mut self,
        f: impl FnOnce(&mut Locked<Unlocked>, &mut CurrentTask) -> Result<R, Errno>
        + Send
        + Sync
        + 'static,
    ) {
        self.thread_state.syscall_restart_func =
            Some(Box::new(|locked, current_task| Ok(f(locked, current_task)?.into())));
    }
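
    // Illustrative sketch (not in the original source): per the `ThreadState`
    // doc comment above, a blocking syscall that wants to be resumable via
    // restart_syscall(2) stashes its continuation and then returns
    // ERESTART_RESTARTBLOCK. `wait_until` and `deadline` are hypothetical.
    //
    //     current_task.set_syscall_restart_func(move |locked, current_task| {
    //         wait_until(locked, current_task, deadline)
    //     });
    //     return error!(ERESTART_RESTARTBLOCK);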

    /// Sets the task's signal mask to `signal_mask` and runs `wait_function`.
    ///
    /// Signals are dequeued prior to the original signal mask being restored. This is done by the
    /// signal machinery in the syscall dispatch loop.
    ///
    /// The returned result is the result returned from the wait function.
    pub fn wait_with_temporary_mask<F, T, L>(
        &mut self,
        locked: &mut Locked<L>,
        signal_mask: SigSet,
        wait_function: F,
    ) -> Result<T, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
        F: FnOnce(&mut Locked<L>, &CurrentTask) -> Result<T, Errno>,
    {
        {
            let mut state = self.write();
            state.set_flags(TaskFlags::TEMPORARY_SIGNAL_MASK, true);
            state.set_temporary_signal_mask(signal_mask);
        }
        wait_function(locked, self)
    }
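
    // Illustrative sketch (not in the original source): this mirrors the shape
    // of sigsuspend(2)/ppoll(2)-style syscalls, which swap in a caller-supplied
    // mask only for the duration of the wait. `do_poll`, `user_mask`, and
    // `deadline` are hypothetical.
    //
    //     current_task.wait_with_temporary_mask(locked, user_mask, |locked, task| {
    //         do_poll(locked, task, deadline)
    //     })?;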

    /// If waking, promotes the task from waking to awake. If not waking, makes the waiter
    /// wait asynchronously until the task is woken. Returns true if woken.
    pub fn wake_or_wait_until_unstopped_async(&self, waiter: &Waiter) -> bool {
        let group_state = self.thread_group().read();
        let mut task_state = self.write();

        // Wake up if
        //   a) we should wake up, meaning:
        //      i) we're in group stop, and the thread group has exited group stop, or
        //      ii) we're waking up,
        //   b) and ptrace isn't stopping us from waking up, but
        //   c) always wake up if we got a SIGKILL.
        let task_stop_state = self.load_stopped();
        let group_stop_state = self.thread_group().load_stopped();
        if ((task_stop_state == StopState::GroupStopped && group_stop_state.is_waking_or_awake())
            || task_stop_state.is_waking_or_awake())
            && (!task_state.is_ptrace_listening() || task_stop_state.is_force())
        {
            let new_state = if task_stop_state.is_waking_or_awake() {
                task_stop_state.finalize()
            } else {
                group_stop_state.finalize()
            };
            if let Ok(new_state) = new_state {
                task_state.set_stopped(new_state, None, Some(self), None);
                drop(group_state);
                drop(task_state);
                // It is possible for the stop state to be changed by another
                // thread between when it is checked above and the following
                // invocation, but set_stopped does sufficient checking while
                // holding the lock to make sure that such a change won't result
                // in corrupted state.
                self.thread_group().set_stopped(new_state, None, false);
                return true;
            }
        }

        // We will wait.
        if self.thread_group().load_stopped().is_stopped() || task_stop_state.is_stopped() {
            // If we've stopped or PTRACE_LISTEN has been sent, wait for a
            // signal or instructions from the tracer.
            group_state
                .lifecycle_waiters
                .wait_async_value(&waiter, ThreadGroupLifecycleWaitValue::Stopped);
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.can_accept_ptrace_commands() {
            // If we're stopped because a tracer has seen the stop and not taken
            // further action, wait for further instructions from the tracer.
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.is_ptrace_listening() {
            // A PTRACE_LISTEN is a state where we can get signals and notify a
            // ptracer, but otherwise remain blocked.
            if let Some(ptrace) = &mut task_state.ptrace {
                ptrace.set_last_signal(Some(SignalInfo::default(SIGTRAP)));
                ptrace.set_last_event(Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0)));
            }
            task_state.wait_on_ptracer(&waiter);
            task_state.notify_ptracers();
        }
        false
    }

    /// Set the RunState for the current task to the given value and then call the given callback.
    ///
    /// When the callback is done, the run_state is restored to `RunState::Running`.
    ///
    /// This function is typically used just before blocking the current task on some operation.
    /// The given `run_state` registers the mechanism for interrupting the blocking operation with
    /// the task and the given `callback` actually blocks the task.
    ///
    /// This function can only be called in the `RunState::Running` state and cannot set the
    /// run state to `RunState::Running`. For this reason, this function cannot be reentered.
    pub fn run_in_state<F, T>(&self, run_state: RunState, callback: F) -> Result<T, Errno>
    where
        F: FnOnce() -> Result<T, Errno>,
    {
        assert_ne!(run_state, RunState::Running);

        // As an optimization, decommit unused pages of the stack to reduce memory pressure while
        // the thread is blocked.
        clean_stack();

        {
            let mut state = self.write();
            assert!(!state.is_blocked());

            if matches!(run_state, RunState::Frozen(_)) {
                // Freeze is a kernel signal and is handled before other user signals. A frozen task
                // ignores all other signals except SIGKILL until it is thawed.
                if state.has_signal_pending(SIGKILL) {
                    return error!(EINTR);
                }
            } else if state.is_any_signal_pending() && !state.is_ptrace_listening() {
                // A note on PTRACE_LISTEN - the thread cannot be scheduled
                // regardless of pending signals.
                return error!(EINTR);
            }
            state.set_run_state(run_state.clone());
        }

        let result = callback();

        {
            let mut state = self.write();
            assert_eq!(
                state.run_state(),
                run_state,
                "SignalState run state changed while waiting!"
            );
            state.set_run_state(RunState::Running);
        };

        result
    }

    pub fn block_until(
        &self,
        guard: EventWaitGuard<'_>,
        deadline: zx::MonotonicInstant,
    ) -> Result<(), Errno> {
        self.run_in_state(RunState::Event(guard.event().clone()), move || {
            guard.block_until(None, deadline).map_err(|e| match e {
                WakeReason::Interrupted => errno!(EINTR),
                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
            })
        })
    }

    pub fn block_with_owner_until(
        &self,
        guard: EventWaitGuard<'_>,
        new_owner: &zx::Thread,
        deadline: zx::MonotonicInstant,
    ) -> Result<(), Errno> {
        self.run_in_state(RunState::Event(guard.event().clone()), move || {
            guard.block_until(Some(new_owner), deadline).map_err(|e| match e {
                WakeReason::Interrupted => errno!(EINTR),
                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
            })
        })
    }
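
    // Illustrative sketch (not in the original source): `block_until` is the
    // interruptible-sleep primitive built on `run_in_state` above. A caller
    // holding an `EventWaitGuard` (`guard`, obtained from the event it waits
    // on) can distinguish a timeout from an interruption:
    //
    //     match current_task.block_until(guard, deadline) {
    //         Err(e) if e == errno!(ETIMEDOUT) => { /* deadline passed; not an error here */ }
    //         other => other?, // EINTR propagates to the syscall dispatcher
    //     }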

    /// Determine the namespace node indicated by `dir_fd`.
    ///
    /// Returns the namespace node and the path to use relative to that node.
    pub fn resolve_dir_fd<'a, L>(
        &self,
        locked: &mut Locked<L>,
        dir_fd: FdNumber,
        mut path: &'a FsStr,
        flags: ResolveFlags,
    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let path_is_absolute = path.starts_with(b"/");
        if path_is_absolute {
            if flags.contains(ResolveFlags::BENEATH) {
                return error!(EXDEV);
            }
            path = &path[1..];
        }

        let dir = if path_is_absolute && !flags.contains(ResolveFlags::IN_ROOT) {
            self.fs().root()
        } else if dir_fd == FdNumber::AT_FDCWD {
            self.fs().cwd()
        } else {
            // O_PATH allowed for:
            //
            //   Passing the file descriptor as the dirfd argument of
            //   openat() and the other "*at()" system calls.  This
            //   includes linkat(2) with AT_EMPTY_PATH (or via procfs
            //   using AT_SYMLINK_FOLLOW) even if the file is not a
            //   directory.
            //
            // See https://man7.org/linux/man-pages/man2/open.2.html
            let file = self.files.get_allowing_opath(dir_fd)?;
            file.name.to_passive()
        };

        if !path.is_empty() {
            if !dir.entry.node.is_dir() {
                return error!(ENOTDIR);
            }
            dir.check_access(
                locked,
                self,
                Access::EXEC,
                CheckAccessReason::InternalPermissionChecks,
            )?;
        }
        Ok((dir, path.into()))
    }

    /// A convenient wrapper for opening files relative to FdNumber::AT_FDCWD.
    ///
    /// Returns a FileHandle but does not install the FileHandle in the FdTable
    /// for this task.
    pub fn open_file(
        &self,
        locked: &mut Locked<Unlocked>,
        path: &FsStr,
        flags: OpenFlags,
    ) -> Result<FileHandle, Errno> {
        if flags.contains(OpenFlags::CREAT) {
            // In order to support OpenFlags::CREAT we would need to take a
            // FileMode argument.
            return error!(EINVAL);
        }
        self.open_file_at(
            locked,
            FdNumber::AT_FDCWD,
            path,
            flags,
            FileMode::default(),
            ResolveFlags::empty(),
            AccessCheck::default(),
        )
    }
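
    // Illustrative sketch (not in the original source): opening a file for
    // reading relative to the task's current working directory. Installing the
    // returned handle in the fd table is the caller's job; the path literal is
    // hypothetical.
    //
    //     let file = current_task.open_file(locked, "data/config".into(), OpenFlags::RDONLY)?;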

    /// Resolves a path for open.
    ///
    /// If the final path component points to a symlink, the symlink is followed (as long as
    /// the symlink traversal limit has not been reached).
    ///
    /// If the final path component (after following any symlinks, if enabled) does not exist,
    /// and `flags` contains `OpenFlags::CREAT`, a new node is created at the location of the
    /// final path component.
    ///
    /// This returns the resolved node, and a boolean indicating whether the node has been created.
    fn resolve_open_path<L>(
        &self,
        locked: &mut Locked<L>,
        context: &mut LookupContext,
        dir: &NamespaceNode,
        path: &FsStr,
        mode: FileMode,
        flags: OpenFlags,
    ) -> Result<(NamespaceNode, bool), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        context.update_for_path(path);
        let mut parent_content = context.with(SymlinkMode::Follow);
        let (parent, basename) = self.lookup_parent(locked, &mut parent_content, dir, path)?;
        context.remaining_follows = parent_content.remaining_follows;

        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);

        // Lookup the child, without following a symlink or expecting it to be a directory.
        let mut child_context = context.with(SymlinkMode::NoFollow);
        child_context.must_be_directory = false;

        match parent.lookup_child(locked, self, &mut child_context, basename) {
            Ok(name) => {
                if name.entry.node.is_lnk() {
                    if flags.contains(OpenFlags::PATH)
                        && context.symlink_mode == SymlinkMode::NoFollow
                    {
                        // When O_PATH is specified in flags, if pathname is a symbolic link
                        // and the O_NOFOLLOW flag is also specified, then the call returns
                        // a file descriptor referring to the symbolic link.
                        // See https://man7.org/linux/man-pages/man2/openat.2.html
                        //
                        // If the trailing component (i.e., basename) of
                        // pathname is a symbolic link, how.resolve contains
                        // RESOLVE_NO_SYMLINKS, and how.flags contains both
                        // O_PATH and O_NOFOLLOW, then an O_PATH file
                        // descriptor referencing the symbolic link will be
                        // returned.
                        // See https://man7.org/linux/man-pages/man2/openat2.2.html
                        return Ok((name, false));
                    }

                    if (!flags.contains(OpenFlags::PATH)
                        && context.symlink_mode == SymlinkMode::NoFollow)
                        || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
                        || context.remaining_follows == 0
                    {
                        if must_create {
                            // Since `must_create` is set, and a node was found, this returns EEXIST
                            // instead of ELOOP.
                            return error!(EEXIST);
                        }
                        // A symlink was found, but one of the following is true:
                        // * flags specified O_NOFOLLOW but not O_PATH.
                        // * how.resolve contains RESOLVE_NO_SYMLINKS
                        // * too many symlink traversals have been attempted
                        return error!(ELOOP);
                    }

                    context.remaining_follows -= 1;
                    match name.readlink(locked, self)? {
                        SymlinkTarget::Path(path) => {
                            let dir = if path[0] == b'/' { self.fs().root() } else { parent };
                            self.resolve_open_path(
                                locked,
                                context,
                                &dir,
                                path.as_ref(),
                                mode,
                                flags,
                            )
                        }
                        SymlinkTarget::Node(name) => {
                            if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS)
                                || name.entry.node.is_lnk()
                            {
                                error!(ELOOP)
                            } else {
                                Ok((name, false))
                            }
                        }
                    }
                } else {
                    if must_create {
                        return error!(EEXIST);
                    }
                    Ok((name, false))
                }
            }
            Err(e) if e == errno!(ENOENT) && flags.contains(OpenFlags::CREAT) => {
                if context.must_be_directory {
                    return error!(EISDIR);
                }
                Ok((
                    parent.open_create_node(
                        locked,
                        self,
                        basename,
                        mode.with_type(FileMode::IFREG),
                        DeviceType::NONE,
                        flags,
                    )?,
                    true,
                ))
            }
            Err(e) => Err(e),
        }
    }

    /// The primary entry point for opening files relative to a task.
    ///
    /// Absolute paths are resolved relative to the root of the FsContext for
    /// this task. Relative paths are resolved relative to dir_fd. To resolve
    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
    /// dir_fd.
    ///
    /// Returns a FileHandle but does not install the FileHandle in the FdTable
    /// for this task.
    pub fn open_file_at(
        &self,
        locked: &mut Locked<Unlocked>,
        dir_fd: FdNumber,
        path: &FsStr,
        flags: OpenFlags,
        mode: FileMode,
        resolve_flags: ResolveFlags,
        access_check: AccessCheck,
    ) -> Result<FileHandle, Errno> {
        if path.is_empty() {
            return error!(ENOENT);
        }

        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, resolve_flags)?;
        self.open_namespace_node_at(locked, dir, path, flags, mode, resolve_flags, access_check)
    }

    pub fn open_namespace_node_at(
        &self,
        locked: &mut Locked<Unlocked>,
        dir: NamespaceNode,
        path: &FsStr,
        flags: OpenFlags,
        mode: FileMode,
        mut resolve_flags: ResolveFlags,
        access_check: AccessCheck,
    ) -> Result<FileHandle, Errno> {
        // 64-bit kernels force the O_LARGEFILE flag to be on.
        let mut flags = flags | OpenFlags::LARGEFILE;
        let opath = flags.contains(OpenFlags::PATH);
        if opath {
            // When O_PATH is specified in flags, flag bits other than O_CLOEXEC,
            // O_DIRECTORY, and O_NOFOLLOW are ignored.
            const ALLOWED_FLAGS: OpenFlags = OpenFlags::from_bits_truncate(
                OpenFlags::PATH.bits()
                    | OpenFlags::CLOEXEC.bits()
                    | OpenFlags::DIRECTORY.bits()
                    | OpenFlags::NOFOLLOW.bits(),
            );
            flags &= ALLOWED_FLAGS;
        }

        if flags.contains(OpenFlags::TMPFILE) && !flags.can_write() {
            return error!(EINVAL);
        }

        let nofollow = flags.contains(OpenFlags::NOFOLLOW);
        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);

        let symlink_mode =
            if nofollow || must_create { SymlinkMode::NoFollow } else { SymlinkMode::Follow };

        let resolve_base = match (
            resolve_flags.contains(ResolveFlags::BENEATH),
            resolve_flags.contains(ResolveFlags::IN_ROOT),
        ) {
            (false, false) => ResolveBase::None,
            (true, false) => ResolveBase::Beneath(dir.clone()),
            (false, true) => ResolveBase::InRoot(dir.clone()),
            (true, true) => return error!(EINVAL),
        };

        // `RESOLVE_BENEATH` and `RESOLVE_IN_ROOT` imply `RESOLVE_NO_MAGICLINKS`. This matches
        // Linux behavior. Strictly speaking it is not really required, but it's hard to
        // implement the `BENEATH` and `IN_ROOT` flags correctly otherwise.
        if resolve_base != ResolveBase::None {
            resolve_flags.insert(ResolveFlags::NO_MAGICLINKS);
        }

        let mut context = LookupContext {
            symlink_mode,
            remaining_follows: MAX_SYMLINK_FOLLOWS,
            must_be_directory: flags.contains(OpenFlags::DIRECTORY),
            resolve_flags,
            resolve_base,
        };
        let (name, created) =
            match self.resolve_open_path(locked, &mut context, &dir, path, mode, flags) {
                Ok((n, c)) => (n, c),
                Err(e) => {
                    let mut abs_path = dir.path(&self.task);
                    abs_path.extend(&**path);
                    track_file_not_found(abs_path);
                    return Err(e);
                }
            };

        let name = if flags.contains(OpenFlags::TMPFILE) {
            // `O_TMPFILE` is incompatible with `O_CREAT`.
            if flags.contains(OpenFlags::CREAT) {
                return error!(EINVAL);
            }
            name.create_tmpfile(locked, self, mode.with_type(FileMode::IFREG), flags)?
        } else {
            let mode = name.entry.node.info().mode;

            // These checks are not needed in the `O_TMPFILE` case because `mode` refers to the
            // file we are opening. With `O_TMPFILE`, that file is the regular file we just
            // created rather than the node we found by resolving the path.
            //
            // For example, we do not need to produce `ENOTDIR` when `must_be_directory` is set
            // because `must_be_directory` refers to the node we found by resolving the path.
            // If that node was not a directory, then `create_tmpfile` will produce an error.
            //
            // Similarly, we never need to call `truncate` because the `O_TMPFILE` file is newly
            // created and therefore already empty.

            if !opath && nofollow && mode.is_lnk() {
                return error!(ELOOP);
            }

            if mode.is_dir() {
                if flags.can_write()
                    || flags.contains(OpenFlags::CREAT)
                    || flags.contains(OpenFlags::TRUNC)
                {
                    return error!(EISDIR);
                }
                if flags.contains(OpenFlags::DIRECT) {
                    return error!(EINVAL);
                }
            } else if context.must_be_directory {
                return error!(ENOTDIR);
            }

            if flags.contains(OpenFlags::TRUNC) && mode.is_reg() && !created {
                // You might think we should check file.can_write() at this
                // point, which is what the docs suggest, but apparently we
                // are supposed to truncate the file if this task can write
                // to the underlying node, even if we are opening the file
                // as read-only. See OpenTest.CanTruncateReadOnly.
                name.truncate(locked, self, 0)?;
            }

            name
        };

        // If the node has been created, the open operation should not verify access rights:
        // From <https://man7.org/linux/man-pages/man2/open.2.html>
        //
        // > Note that mode applies only to future accesses of the newly created file; the
        // > open() call that creates a read-only file may well return a read/write file
        // > descriptor.

        let access_check = if created { AccessCheck::skip() } else { access_check };
        name.open(locked, self, flags, access_check)
    }

    /// A wrapper for FsContext::lookup_parent_at that resolves the given
    /// dir_fd to a NamespaceNode.
    ///
    /// Absolute paths are resolved relative to the root of the FsContext for
    /// this task. Relative paths are resolved relative to dir_fd. To resolve
    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
    /// dir_fd.
    pub fn lookup_parent_at<'a, L>(
        &self,
        locked: &mut Locked<L>,
        context: &mut LookupContext,
        dir_fd: FdNumber,
        path: &'a FsStr,
    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, ResolveFlags::empty())?;
        self.lookup_parent(locked, context, &dir, path)
    }

    /// Lookup the parent of a namespace node.
    ///
    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
    /// calling this function directly.
    ///
    /// This function resolves all but the last component of the given path.
    /// The function returns the parent directory of the last component as well
    /// as the last component.
    ///
    /// If path is empty, this function returns dir and an empty path.
    /// Similarly, if path ends with "." or "..", these components will be
    /// returned along with the parent.
    ///
    /// The returned parent might not be a directory.
    pub fn lookup_parent<'a, L>(
        &self,
        locked: &mut Locked<L>,
        context: &mut LookupContext,
        dir: &NamespaceNode,
        path: &'a FsStr,
    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        context.update_for_path(path);

        let mut current_node = dir.clone();
        let mut it = path.split(|c| *c == b'/').filter(|p| !p.is_empty()).map(<&FsStr>::from);
        let mut current_path_component = it.next().unwrap_or_default();
        for next_path_component in it {
            current_node =
                current_node.lookup_child(locked, self, context, current_path_component)?;
            current_path_component = next_path_component;
        }
        Ok((current_node, current_path_component))
    }
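
    // Illustrative examples of `lookup_parent` (not in the original source),
    // restating the doc comment above in example form:
    //
    //     lookup_parent(.., dir, "a/b/c") == Ok((<node for dir/a/b>, "c"))
    //     lookup_parent(.., dir, "a/..")  == Ok((<node for dir/a>, ".."))
    //     lookup_parent(.., dir, "")      == Ok((dir, ""))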

    /// Lookup a namespace node.
    ///
    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
    /// calling this function directly.
    ///
    /// This function resolves every component of the given path.
    pub fn lookup_path<L>(
        &self,
        locked: &mut Locked<L>,
        context: &mut LookupContext,
        dir: NamespaceNode,
        path: &FsStr,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let (parent, basename) = self.lookup_parent(locked, context, &dir, path)?;
        parent.lookup_child(locked, self, context, basename)
    }

    /// Lookup a namespace node starting at the root directory.
    ///
    /// Resolves symlinks.
    pub fn lookup_path_from_root<L>(
        &self,
        locked: &mut Locked<L>,
        path: &FsStr,
    ) -> Result<NamespaceNode, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let mut context = LookupContext::default();
        self.lookup_path(locked, &mut context, self.fs().root(), path)
    }
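
    // Illustrative sketch (not in the original source): resolving an absolute
    // path without opening it, e.g. to inspect a node; the path literal is
    // hypothetical.
    //
    //     let node = current_task.lookup_path_from_root(locked, "/dev/null".into())?;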

    pub fn exec(
        &mut self,
        locked: &mut Locked<Unlocked>,
        executable: FileHandle,
        path: CString,
        argv: Vec<CString>,
        environ: Vec<CString>,
    ) -> Result<(), Errno> {
        // The executable must be a regular file.
        if !executable.name.entry.node.is_reg() {
            return error!(EACCES);
        }

        // The file node must have EXEC mode permissions.
        // Note that the ability to execute a file is unrelated to the flags
        // used in the `open` call.
        executable.name.check_access(locked, self, Access::EXEC, CheckAccessReason::Exec)?;

        let elf_security_state = security::bprm_creds_for_exec(self, &executable.name)?;

        let resolved_elf = resolve_executable(
            locked,
            self,
            executable,
            path.clone(),
            argv,
            environ,
            elf_security_state,
        )?;

        let maybe_set_id = if self.kernel().features.enable_suid {
            resolved_elf.file.name.suid_and_sgid(&self)?
        } else {
            Default::default()
        };

        if self.thread_group().read().tasks_count() > 1 {
            track_stub!(TODO("https://fxbug.dev/297434895"), "exec on multithread process");
            return error!(EINVAL);
        }

        if let Err(err) = self.finish_exec(locked, path, resolved_elf, maybe_set_id) {
            log_warn!("unrecoverable error in exec: {err:?}");

            send_standard_signal(
                locked,
                self,
                SignalInfo { code: SI_KERNEL as i32, force: true, ..SignalInfo::default(SIGSEGV) },
            );
            return Err(err);
        }

        self.ptrace_event(locked, PtraceOptions::TRACEEXEC, self.task.tid as u64);
        self.signal_vfork();

        Ok(())
    }

    /// After the memory is unmapped, any failure in exec is unrecoverable and results in the
    /// process crashing. This function is for that second half; any error returned from this
    /// function will be considered unrecoverable.
    fn finish_exec<L>(
        &mut self,
        locked: &mut Locked<L>,
        path: CString,
        resolved_elf: ResolvedElf,
        mut maybe_set_id: UserAndOrGroupId,
    ) -> Result<(), Errno>
    where
        L: LockBefore<MmDumpable>,
    {
        // Now that the exec will definitely finish (or crash), notify owners of
        // locked futexes for the current process, which will be impossible to
        // update after the process image is replaced. See get_robust_list(2).
        self.notify_robust_list();

        // Passing arch32 information here ensures the replacement memory
        // layout matches the ELF being executed.
        let mm = {
            let mm = self.mm()?;
            let new_mm = mm
                .exec(resolved_elf.file.name.to_passive(), resolved_elf.arch_width)
                .map_err(|status| from_status_like_fdio!(status))?;
            self.mm.update(Some(new_mm.clone()));
            new_mm
        };

        {
            let mut state = self.write();

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   The aforementioned transformations of the effective IDs are not
            //   performed (i.e., the set-user-ID and set-group-ID bits are
            //   ignored) if any of the following is true:
            //
            //   *  the no_new_privs attribute is set for the calling thread (see
            //      prctl(2));
            //
            //   *  the underlying filesystem is mounted nosuid (the MS_NOSUID
            //      flag for mount(2)); or
            //
            //   *  the calling process is being ptraced.
            //
            // The MS_NOSUID check is in `NamespaceNode::suid_and_sgid()`.
            if state.no_new_privs() || state.is_ptraced() {
                maybe_set_id.clear();
            }

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   The process's "dumpable" attribute is set to the value 1,
            //   unless a set-user-ID program, a set-group-ID program, or a
            //   program with capabilities is being executed, in which case the
            //   dumpable flag may instead be reset to the value in
            //   /proc/sys/fs/suid_dumpable, in the circumstances described
            //   under PR_SET_DUMPABLE in prctl(2).
            let dumpable =
                if maybe_set_id.is_none() { DumpPolicy::User } else { DumpPolicy::Disable };
            *mm.dumpable.lock(locked) = dumpable;

            #[allow(
                clippy::undocumented_unsafe_blocks,
                reason = "Force documented unsafe blocks in Starnix"
            )]
            let mut creds = unsafe {
                // SAFETY: this is allowed because we are the CurrentTask.
                self.persistent_info.creds_mut()
            };
            state.set_sigaltstack(None);
            state.robust_list_head = RobustListHeadPtr::null(self);

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   If a set-user-ID or set-group-ID
            //   program is being executed, then the parent death signal set by
            //   prctl(2) PR_SET_PDEATHSIG flag is cleared.
            //
            // TODO(https://fxbug.dev/356684424): Implement the behavior above once we support
            // the PR_SET_PDEATHSIG flag.

            // TODO(tbodt): Check whether capability xattrs are set on the file, and grant/limit
            // capabilities accordingly.
            creds.exec(maybe_set_id);
        }

        let security_state = resolved_elf.security_state.clone();

        let start_info = load_executable(self, resolved_elf, &path)?;
        // Before consuming start_info below, note if the task is 32-bit.
        self.thread_state.arch_width = start_info.arch_width;

        let regs: zx_thread_state_general_regs_t = start_info.into();
        self.thread_state.registers = regs.into();
        self.thread_state.extended_pstate.reset();
        self.thread_group().signal_actions.reset_for_exec();

        // The exit signal (and that of the children) is reset to SIGCHLD.
        let mut thread_group_state = self.thread_group().write();
        thread_group_state.exit_signal = Some(SIGCHLD);
        for (_, weak_child) in &mut thread_group_state.children {
            if let Some(child) = weak_child.upgrade() {
                let mut child_state = child.write();
                child_state.exit_signal = Some(SIGCHLD);
            }
        }

        std::mem::drop(thread_group_state);

        // TODO(https://fxbug.dev/42082680): All threads other than the calling thread are destroyed.

        // TODO: POSIX timers are not preserved.

        // TODO: Ensure that the filesystem context is un-shared, undoing the effect of CLONE_FS.

        // The file descriptor table is unshared, undoing the effect of the CLONE_FILES flag of
        // clone(2).
        self.files.unshare();
        self.files.exec();

        // If SELinux is enabled, enforce permissions related to inheritance of file descriptors
        // and resource limits. Then update the current task's SID.
        //
        // TODO: https://fxbug.dev/378655436 - After the above, enforce permissions related to
        // signal state inheritance.
        //
        // This needs to be called after closing any files marked "close-on-exec".
        security::exec_binprm(&mut locked.cast_locked::<MmDumpable>(), self, &security_state);

        self.thread_group().write().did_exec = true;

        self.set_command_name(TaskCommand::from_path_bytes(path.to_bytes()));

        Ok(())
    }

    pub fn set_command_name(&self, new_name: TaskCommand) {
        // set_command_name needs to run before leader_command() in cases where self is the leader.
        self.task.set_command_name(new_name.clone());
        let leader_command = self.thread_group().read().leader_command();
        starnix_logging::set_current_task_info(
            new_name,
            leader_command,
            self.thread_group().leader,
            self.tid,
        );
    }

    pub fn add_seccomp_filter(
        &mut self,
        locked: &mut Locked<Unlocked>,
        code: Vec<sock_filter>,
        flags: u32,
    ) -> Result<SyscallResult, Errno> {
        let new_filter = Arc::new(SeccompFilter::from_cbpf(
            &code,
            self.thread_group().next_seccomp_filter_id.add(1),
            flags & SECCOMP_FILTER_FLAG_LOG != 0,
        )?);

        let mut maybe_fd: Option<FdNumber> = None;

        if flags & SECCOMP_FILTER_FLAG_NEW_LISTENER != 0 {
            maybe_fd = Some(SeccompFilterContainer::create_listener(locked, self)?);
        }

        // We can't change any of the threads while doing a tsync, so we hold the
        // process lock while making any changes.
        let state = self.thread_group().write();

        if flags & SECCOMP_FILTER_FLAG_TSYNC != 0 {
            // TSYNC synchronizes the filters for all threads in the current process to
            // the current thread's filters.

            // We collect the filters for the current task upfront to save us acquiring
            // the task's lock a lot of times below.
            let mut filters: SeccompFilterContainer = self.read().seccomp_filters.clone();

            // For TSYNC to work, all of the other thread filters in this process have to
            // be a prefix of this thread's filters, and none of them can be in
            // strict mode.
            let tasks = state.tasks().collect::<Vec<_>>();
            for task in &tasks {
                if task.tid == self.tid {
                    continue;
                }
                let other_task_state = task.read();

                // Target threads cannot be in SECCOMP_MODE_STRICT.
                if task.seccomp_filter_state.get() == SeccompStateValue::Strict {
                    return Self::seccomp_tsync_error(task.tid, flags);
                }

                // Target threads' filters must be a subsequence of this thread's.
                if !other_task_state.seccomp_filters.can_sync_to(&filters) {
                    return Self::seccomp_tsync_error(task.tid, flags);
                }
            }

            // Now that we're sure we're allowed to do so, add the filter to all threads.
            filters.add_filter(new_filter, code.len() as u16)?;

            for task in &tasks {
                let mut other_task_state = task.write();

                other_task_state.enable_no_new_privs();
                other_task_state.seccomp_filters = filters.clone();
                task.set_seccomp_state(SeccompStateValue::UserDefined)?;
            }
        } else {
            let mut task_state = self.task.write();

            task_state.seccomp_filters.add_filter(new_filter, code.len() as u16)?;
            self.set_seccomp_state(SeccompStateValue::UserDefined)?;
        }

        if let Some(fd) = maybe_fd { Ok(fd.into()) } else { Ok(().into()) }
    }
1335
1336    pub fn run_seccomp_filters(
1337        &mut self,
1338        locked: &mut Locked<Unlocked>,
1339        syscall: &Syscall,
1340    ) -> Option<Result<SyscallResult, Errno>> {
1341        // Implementation of SECCOMP_MODE_STRICT, which has slightly different semantics
1342        // from user-defined seccomp filters.
1343        if self.seccomp_filter_state.get() == SeccompStateValue::Strict {
1344            return SeccompState::do_strict(locked, self, syscall);
1345        }
1346
1347        // Run user-defined seccomp filters
1348        let result = self.task.read().seccomp_filters.run_all(self, syscall);
1349
1350        SeccompState::do_user_defined(locked, result, self, syscall)
1351    }
1352
1353    fn seccomp_tsync_error(id: i32, flags: u32) -> Result<SyscallResult, Errno> {
1354        // By default, TSYNC indicates failure by returning the id of the first thread
1355        // that could not sync, rather than by returning -1 and setting errno.
1356        // However, if TSYNC_ESRCH is set, it returns ESRCH instead.  This prevents a
1357        // conflict with the fact that SECCOMP_FILTER_FLAG_NEW_LISTENER makes seccomp
1358        // return an fd.
1359        if flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH != 0 { error!(ESRCH) } else { Ok(id.into()) }
1360    }
1361
1362    /// Notifies all futexes on this task's robust list.  The robust list lives in
1363    /// user space, so we walk it defensively and return quietly whenever a step of
1364    /// the walk fails.
1365    /// TODO(https://fxbug.dev/42079081): This only sets the FUTEX_OWNER_DIED bit; it does
1366    /// not wake up a waiter.
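    ///
    /// For reference, the user-space layout being walked (from <linux/futex.h>,
    /// expressed here in Rust terms) is sketched below; for each entry, the futex
    /// word lives at the same signed `futex_offset` from the entry's address.
    ///
    /// ```ignore
    /// struct robust_list { next: *mut robust_list }
    /// struct robust_list_head {
    ///     list: robust_list,                 // circular list of held locks
    ///     futex_offset: isize,               // entry address -> futex word
    ///     list_op_pending: *mut robust_list, // lock being acquired or released
    /// }
    /// ```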
1367    pub fn notify_robust_list(&self) {
1368        let task_state = self.write();
1369        let robust_list_addr = task_state.robust_list_head.addr();
1370        if robust_list_addr == UserAddress::NULL {
1371            // No one has called set_robust_list.
1372            return;
1373        }
1374        let Ok(head) = self.read_multi_arch_object(task_state.robust_list_head) else {
1375            return;
1376        };
1381
1382        let offset = head.futex_offset;
1383
1384        let mut entries_count = 0;
1385        let mut curr_ptr = head.list.next;
1386        while curr_ptr.addr() != robust_list_addr.into() && entries_count < ROBUST_LIST_LIMIT {
1387            let Ok(curr) = self.read_multi_arch_object(curr_ptr) else {
1388                return;
1389            };
1394
1395            let Some(futex_base) = curr_ptr.addr().checked_add_signed(offset) else {
1396                return;
1397            };
1398
1399            let Ok(futex_addr) = FutexAddress::try_from(futex_base) else {
1400                return;
1401            };
1405
1406            let Ok(mm) = self.mm() else {
1407                log_error!("Asked to notify robust list futexes in system task.");
1408                return;
1409            };
1410            let Ok(futex) = mm.atomic_load_u32_relaxed(futex_addr) else {
1411                return;
1412            };
1415
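            // Only mark futexes still owned by this exiting task: the low bits of the
            // futex word hold the owner's TID, and FUTEX_OWNER_DIED tells the next
            // acquirer that the owner died while holding the lock.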
1416            if (futex & FUTEX_TID_MASK) as i32 == self.tid {
1417                let owner_died = FUTEX_OWNER_DIED | futex;
1418                if mm.atomic_store_u32_relaxed(futex_addr, owner_died).is_err() {
1419                    return;
1420                }
1421            }
1422            curr_ptr = curr.next;
1423            entries_count += 1;
1424        }
1425    }
1426
1427    /// Returns a handle to this task's seccomp notifier, if one is installed.
1428    pub fn get_seccomp_notifier(&mut self) -> Option<SeccompNotifierHandle> {
1429        self.task.write().seccomp_filters.notifier.clone()
1430    }
1431
1432    pub fn set_seccomp_notifier(&mut self, notifier: Option<SeccompNotifierHandle>) {
1433        self.task.write().seccomp_filters.notifier = notifier;
1434    }
1435
1436    /// Processes a Zircon exception associated with this task.
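    ///
    /// Illustrative sketch (not compiled as a doctest) of how a caller might
    /// consume the result:
    ///
    /// ```ignore
    /// match current_task.process_exception(locked, &report) {
    ///     ExceptionResult::Handled => {}
    ///     ExceptionResult::Signal(siginfo) => {
    ///         // Deliver the signal to the task, e.g. via the send_signal machinery.
    ///     }
    /// }
    /// ```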
1437    pub fn process_exception(
1438        &self,
1439        locked: &mut Locked<Unlocked>,
1440        report: &zx::ExceptionReport,
1441    ) -> ExceptionResult {
1442        match report.ty {
1443            zx::ExceptionType::General => match get_signal_for_general_exception(&report.arch) {
1444                Some(sig) => ExceptionResult::Signal(SignalInfo::default(sig)),
1445                None => {
1446                    log_error!("Unrecognized general exception: {:?}", report);
1447                    ExceptionResult::Signal(SignalInfo::default(SIGILL))
1448                }
1449            },
1450            zx::ExceptionType::FatalPageFault { status } => {
1451                let report = decode_page_fault_exception_report(&report.arch);
1452                if let Ok(mm) = self.mm() {
1453                    mm.handle_page_fault(locked, report, status)
1454                } else {
1455                    panic!(
1456                        "system task is handling a major page fault status={:?}, report={:?}",
1457                        status, report
1458                    );
1459                }
1460            }
1461            zx::ExceptionType::UndefinedInstruction => {
1462                ExceptionResult::Signal(SignalInfo::default(SIGILL))
1463            }
1464            zx::ExceptionType::UnalignedAccess => {
1465                ExceptionResult::Signal(SignalInfo::default(SIGBUS))
1466            }
1467            zx::ExceptionType::SoftwareBreakpoint | zx::ExceptionType::HardwareBreakpoint => {
1468                ExceptionResult::Signal(SignalInfo::default(SIGTRAP))
1469            }
1470            zx::ExceptionType::ProcessNameChanged => {
1471                log_error!("Received unexpected process name changed exception");
1472                ExceptionResult::Handled
1473            }
1474            zx::ExceptionType::ProcessStarting
1475            | zx::ExceptionType::ThreadStarting
1476            | zx::ExceptionType::ThreadExiting => {
1477                log_error!("Received unexpected task lifecycle exception");
1478                ExceptionResult::Signal(SignalInfo::default(SIGSYS))
1479            }
1480            zx::ExceptionType::PolicyError(policy_code) => {
1481                log_error!(policy_code:?; "Received Zircon policy error exception");
1482                ExceptionResult::Signal(SignalInfo::default(SIGSYS))
1483            }
1484            zx::ExceptionType::UnknownUserGenerated { code, data } => {
1485                log_error!(code:?, data:?; "Received unexpected unknown user generated exception");
1486                ExceptionResult::Signal(SignalInfo::default(SIGSYS))
1487            }
1488            zx::ExceptionType::Unknown { ty, code, data } => {
1489                log_error!(ty:?, code:?, data:?; "Received unexpected exception");
1490                ExceptionResult::Signal(SignalInfo::default(SIGSYS))
1491            }
1492        }
1493    }
1494
1495    /// Clone this task.
1496    ///
1497    /// Creates a new task object that shares some state with this task
1498    /// according to the given flags.
1499    ///
1500    /// Used by the clone() syscall to create both processes and threads.
1501    ///
1502    /// The exit signal is broken out from the flags parameter like clone3() rather than being
1503    /// bitwise-ORed like clone().
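    ///
    /// Illustrative sketch (not compiled as a doctest): create a thread that
    /// shares this task's address space and signal handlers, roughly what
    /// pthread_create requests.
    ///
    /// ```ignore
    /// let builder = current_task.clone_task(
    ///     locked,
    ///     (CLONE_VM | CLONE_THREAD | CLONE_SIGHAND | CLONE_FILES | CLONE_FS) as u64,
    ///     None,               // threads have no exit signal
    ///     UserRef::default(), // user_parent_tid (unused without CLONE_PARENT_SETTID)
    ///     UserRef::default(), // user_child_tid (unused without CLONE_CHILD_*)
    ///     UserRef::default(), // user_pidfd (unused without CLONE_PIDFD)
    /// )?;
    /// ```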
1504    pub fn clone_task<L>(
1505        &self,
1506        locked: &mut Locked<L>,
1507        flags: u64,
1508        child_exit_signal: Option<Signal>,
1509        user_parent_tid: UserRef<pid_t>,
1510        user_child_tid: UserRef<pid_t>,
1511        user_pidfd: UserRef<FdNumber>,
1512    ) -> Result<TaskBuilder, Errno>
1513    where
1514        L: LockBefore<MmDumpable>,
1515        L: LockBefore<TaskRelease>,
1516        L: LockBefore<ProcessGroupState>,
1517    {
1518        const IMPLEMENTED_FLAGS: u64 = (CLONE_VM
1519            | CLONE_FS
1520            | CLONE_FILES
1521            | CLONE_SIGHAND
1522            | CLONE_THREAD
1523            | CLONE_SYSVSEM
1524            | CLONE_SETTLS
1525            | CLONE_PARENT
1526            | CLONE_PARENT_SETTID
1527            | CLONE_PIDFD
1528            | CLONE_CHILD_CLEARTID
1529            | CLONE_CHILD_SETTID
1530            | CLONE_VFORK
1531            | CLONE_NEWUTS
1532            | CLONE_PTRACE) as u64;
1533
1534        // A mask with all valid flags set, because we want to return a different error code for an
1535        // invalid flag vs an unimplemented flag. Shifting the largest valid flag up by one and then
1536        // subtracting 1 gives a mask with that flag and every flag below it set.
1538        const VALID_FLAGS: u64 = (CLONE_INTO_CGROUP << 1) - 1;
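        // For example, CLONE_INTO_CGROUP is 1 << 32, so VALID_FLAGS is 0x1_ffff_ffff.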
1539
1540        // CLONE_SETTLS is implemented by sys_clone.
1541
1542        let clone_files = flags & (CLONE_FILES as u64) != 0;
1543        let clone_fs = flags & (CLONE_FS as u64) != 0;
1544        let clone_parent = flags & (CLONE_PARENT as u64) != 0;
1545        let clone_parent_settid = flags & (CLONE_PARENT_SETTID as u64) != 0;
1546        let clone_pidfd = flags & (CLONE_PIDFD as u64) != 0;
1547        let clone_child_cleartid = flags & (CLONE_CHILD_CLEARTID as u64) != 0;
1548        let clone_child_settid = flags & (CLONE_CHILD_SETTID as u64) != 0;
1549        let clone_sysvsem = flags & (CLONE_SYSVSEM as u64) != 0;
1550        let clone_ptrace = flags & (CLONE_PTRACE as u64) != 0;
1551        let clone_thread = flags & (CLONE_THREAD as u64) != 0;
1552        let clone_vm = flags & (CLONE_VM as u64) != 0;
1553        let clone_sighand = flags & (CLONE_SIGHAND as u64) != 0;
1554        let clone_vfork = flags & (CLONE_VFORK as u64) != 0;
1555        let clone_newuts = flags & (CLONE_NEWUTS as u64) != 0;
1556        let clone_into_cgroup = flags & CLONE_INTO_CGROUP != 0;
1557
1558        if clone_ptrace {
1559            track_stub!(TODO("https://fxbug.dev/322874630"), "CLONE_PTRACE");
1560        }
1561
1562        if clone_sysvsem {
1563            track_stub!(TODO("https://fxbug.dev/322875185"), "CLONE_SYSVSEM");
1564        }
1565
1566        if clone_into_cgroup {
1567            track_stub!(TODO("https://fxbug.dev/403612570"), "CLONE_INTO_CGROUP");
1568        }
1569
1570        if clone_sighand && !clone_vm {
1571            return error!(EINVAL);
1572        }
1573        if clone_thread && !clone_sighand {
1574            return error!(EINVAL);
1575        }
1576
1577        if clone_pidfd && clone_thread {
1578            return error!(EINVAL);
1579        }
1580        if clone_pidfd && clone_parent_settid && user_parent_tid.addr() == user_pidfd.addr() {
1581            // `clone()` uses the same out-argument for these, so error out if they have the same
1582            // user address.
1583            return error!(EINVAL);
1584        }
1585
1586        if flags & !VALID_FLAGS != 0 {
1587            return error!(EINVAL);
1588        }
1589
1590        if clone_vm && !clone_thread {
1591            // TODO(https://fxbug.dev/42066087) Implement CLONE_VM for child processes (not just child
1592            // threads). Currently this executes CLONE_VM (explicitly passed to clone() or as
1593            // used by vfork()) as a fork (the VM in the child is copy-on-write) which is almost
1594            // always OK.
1595            //
1596            // CLONE_VM is primarily an optimization to avoid making a copy-on-write version of a
1597            // process' VM that will be immediately replaced with a call to exec(). The main users
1598            // (libc and language runtimes) don't actually rely on the memory being shared between
1599            // the two processes. And the vfork() man page explicitly allows vfork() to be
1600            // implemented as fork(), which is what we do here.
1601            if !clone_vfork {
1602                track_stub!(
1603                    TODO("https://fxbug.dev/322875227"),
1604                    "CLONE_VM without CLONE_THREAD or CLONE_VFORK"
1605                );
1606            }
1607        } else if clone_thread && !clone_vm {
1608            track_stub!(TODO("https://fxbug.dev/322875167"), "CLONE_THREAD without CLONE_VM");
1609            return error!(ENOSYS);
1610        }
1611
1612        if flags & !IMPLEMENTED_FLAGS != 0 {
1613            track_stub!(
1614                TODO("https://fxbug.dev/322875130"),
1615                "clone unknown flags",
1616                flags & !IMPLEMENTED_FLAGS
1617            );
1618            return error!(ENOSYS);
1619        }
1620
1621        let fs = if clone_fs { self.fs() } else { self.fs().fork() };
1622        let files = if clone_files { self.files.clone() } else { self.files.fork() };
1623
1624        let kernel = self.kernel();
1625
1626        // Lock the cgroup process hierarchy so that the parent process cannot move to a different
1627        // cgroup while a new task or thread_group is created. This may become unnecessary
1628        // once CLONE_INTO_CGROUP is implemented and passed in.
1629        let mut cgroup2_pid_table = kernel.cgroups.lock_cgroup2_pid_table();
1630        // Create a `KernelSignal::Freeze` to put onto the new task, if the cgroup is frozen.
1631        let child_kernel_signals = cgroup2_pid_table
1632            .maybe_create_freeze_signal(self.thread_group())
1633            .into_iter()
1634            .collect::<VecDeque<_>>();
1635
1636        let mut pids = kernel.pids.write();
1637
1638        let pid;
1639        let command;
1640        let creds;
1641        let scheduler_state;
1642        let no_new_privs;
1643        let seccomp_filters;
1644        let robust_list_head = RobustListHeadPtr::null(self);
1645        let child_signal_mask;
1646        let timerslack_ns;
1647        let uts_ns;
1648        let security_state = security::task_alloc(&self, flags);
1649
1650        let TaskInfo { thread, thread_group, memory_manager } = {
1651            // These variables hold the original parent in case we need to switch the parent of the
1652            // new task because of CLONE_PARENT.
1653            let weak_original_parent;
1654            let original_parent;
1655
1656            // Make sure to drop these locks as soon as possible to avoid lock inversion.
1657            let thread_group_state = {
1658                let thread_group_state = self.thread_group().write();
1659                if clone_parent {
1660                    // With the CLONE_PARENT flag, the parent of the new task is our parent
1661                    // instead of ourselves.
1662                    weak_original_parent =
1663                        thread_group_state.parent.clone().ok_or_else(|| errno!(EINVAL))?;
1664                    std::mem::drop(thread_group_state);
1665                    original_parent = weak_original_parent.upgrade();
1666                    original_parent.write()
1667                } else {
1668                    thread_group_state
1669                }
1670            };
1671
1672            let state = self.read();
1673
1674            no_new_privs = state.no_new_privs();
1675            seccomp_filters = state.seccomp_filters.clone();
1676            child_signal_mask = state.signal_mask();
1677
1678            pid = pids.allocate_pid();
1679            command = self.command();
1680            creds = self.current_creds();
1681            scheduler_state = state.scheduler_state.fork();
1682            timerslack_ns = state.timerslack_ns;
1683
1684            uts_ns = if clone_newuts {
1685                security::check_task_capable(self, CAP_SYS_ADMIN)?;
1686                state.uts_ns.read().fork()
1687            } else {
1688                state.uts_ns.clone()
1689            };
1690
1691            if clone_thread {
1692                TaskInfo {
1693                    thread: None,
1694                    thread_group: self.thread_group().clone(),
1695                    memory_manager: self.mm().ok(),
1696                }
1697            } else {
1698                // Drop the lock on this task before entering `create_zircon_process`, because it will
1699                // take a lock on the new thread group, and locks on thread groups have a higher
1700                // priority than locks on the task in the thread group.
1701                std::mem::drop(state);
1702                let signal_actions = if clone_sighand {
1703                    self.thread_group().signal_actions.clone()
1704                } else {
1705                    self.thread_group().signal_actions.fork()
1706                };
1707                let process_group = thread_group_state.process_group.clone();
1708
1709                let task_info = create_zircon_process(
1710                    locked,
1711                    kernel,
1712                    Some(thread_group_state),
1713                    pid,
1714                    child_exit_signal,
1715                    process_group,
1716                    signal_actions,
1717                    command.clone(),
1718                )?;
1719
1720                cgroup2_pid_table.inherit_cgroup(self.thread_group(), &task_info.thread_group);
1721
1722                task_info
1723            }
1724        };
1725
1726        // Only create the vfork event when the caller requested CLONE_VFORK.
1727        let vfork_event = if clone_vfork { Some(Arc::new(zx::Event::create())) } else { None };
1728
1729        let mut child = TaskBuilder::new(Task::new(
1730            pid,
1731            command,
1732            thread_group,
1733            thread,
1734            files,
1735            memory_manager,
1736            fs,
1737            creds,
1738            self.abstract_socket_namespace.clone(),
1739            self.abstract_vsock_namespace.clone(),
1740            child_signal_mask,
1741            child_kernel_signals,
1742            vfork_event,
1743            scheduler_state,
1744            uts_ns,
1745            no_new_privs,
1746            SeccompState::from(&self.seccomp_filter_state),
1747            seccomp_filters,
1748            robust_list_head,
1749            timerslack_ns,
1750            security_state,
1751        ));
1752
1753        release_on_error!(child, locked, {
1754            let child_task = TempRef::from(&child.task);
1755            // Drop the pids lock as soon as possible after creating the child. Destroying the child
1756            // and removing it from the pids table itself requires the pids lock, so an early exit
1757            // while it is held would self-deadlock.
1758            pids.add_task(&child_task);
1759            std::mem::drop(pids);
1760
1761            // The child's lock must be taken before this task's lock: drop the lock on this task,
1762            // take a writable lock on the child, and copy the current state back.
1763
1764            #[cfg(any(test, debug_assertions))]
1765            {
1766                // Take the lock on the thread group and its child in the correct order to ensure any wrong ordering
1767                // will trigger the tracing-mutex at the right call site.
1768                if !clone_thread {
1769                    let _l1 = self.thread_group().read();
1770                    let _l2 = child.thread_group().read();
1771                }
1772            }
1773
1774            if clone_thread {
1775                self.thread_group().add(&child_task)?;
1776            } else {
1777                child.thread_group().add(&child_task)?;
1778
1779                // These manipulations of the signal handling state appear to be related to
1780                // CLONE_SIGHAND and CLONE_VM rather than CLONE_THREAD. However, we do not support
1781                // all the combinations of these flags, which means doing these operations here
1782                // might actually be correct. That said, if a test fails because of the placement
1783                // of this logic here, we might need to move it.
1784                let mut child_state = child.write();
1785                let state = self.read();
1786                child_state.set_sigaltstack(state.sigaltstack());
1787                child_state.set_signal_mask(state.signal_mask());
1788            }
1789
1790            if !clone_vm {
1791                // We do not support running threads in the same process with different
1792                // MemoryManagers.
1793                assert!(!clone_thread);
1794                self.mm()?.snapshot_to(locked, &child.mm()?)?;
1795            }
1796
1797            if clone_parent_settid {
1798                self.write_object(user_parent_tid, &child.tid)?;
1799            }
1800
1801            if clone_child_cleartid {
1802                child.write().clear_child_tid = user_child_tid;
1803            }
1804
1805            if clone_child_settid {
1806                child.write_object(user_child_tid, &child.tid)?;
1807            }
1808
1809            if clone_pidfd {
1810                let locked = locked.cast_locked::<TaskRelease>();
1811                let file = new_pidfd(
1812                    locked,
1813                    self,
1814                    child.thread_group(),
1815                    &*child.mm()?,
1816                    OpenFlags::empty(),
1817                );
1818                let pidfd = self.add_file(locked, file, FdFlags::CLOEXEC)?;
1819                self.write_object(user_pidfd, &pidfd)?;
1820            }
1821
1822            // TODO(https://fxbug.dev/42066087): We do not support running different processes with
1823            // the same MemoryManager. Instead, we implement a rough approximation of that behavior
1824            // by making a copy-on-write clone of the memory from the original process.
1825            if clone_vm && !clone_thread {
1826                self.mm()?.snapshot_to(locked, &child.mm()?)?;
1827            }
1828
1829            child.thread_state = self.thread_state.snapshot();
1830            Ok(())
1831        });
1832
1833        // Take the lock on thread group and task in the correct order to ensure any wrong ordering
1834        // will trigger the tracing-mutex at the right call site.
1835        #[cfg(any(test, debug_assertions))]
1836        {
1837            let _l1 = child.thread_group().read();
1838            let _l2 = child.read();
1839        }
1840
1841        Ok(child)
1842    }
1843
1844    /// Sets the stop state (per set_stopped), and also notifies all listeners,
1845    /// including the parent process if appropriate.
1846    pub fn set_stopped_and_notify(&self, stopped: StopState, siginfo: Option<SignalInfo>) {
1847        {
1848            let mut state = self.write();
1849            state.copy_state_from(self);
1850            state.set_stopped(stopped, siginfo, Some(self), None);
1851        }
1852
1853        if !stopped.is_in_progress() {
1854            let parent = self.thread_group().read().parent.clone();
1855            if let Some(parent) = parent {
1856                parent
1857                    .upgrade()
1858                    .write()
1859                    .lifecycle_waiters
1860                    .notify_value(ThreadGroupLifecycleWaitValue::ChildStatus);
1861            }
1862        }
1863    }
1864
1865    /// If the task is stopping, sets it as stopped and returns whether the caller
1866    /// should stop.  The task might also be waking up.
1867    pub fn finalize_stop_state(&mut self) -> bool {
1868        let stopped = self.load_stopped();
1869
1870        if !stopped.is_stopping_or_stopped() {
1871            // If we are waking up, potentially write back state a tracer may have modified.
1872            let captured_state = self.write().take_captured_state();
1873            if let Some(captured) = captured_state {
1874                if captured.dirty {
1875                    self.thread_state.replace_registers(&captured.thread_state);
1876                }
1877            }
1878        }
1879
1880        // Handle stopping because the thread group is stopping: try to flip to
1881        // GroupStopped, which fails if we shouldn't stop.
1882        if self.thread_group().set_stopped(StopState::GroupStopped, None, true)
1883            == StopState::GroupStopped
1884        {
1885            let signal = self.thread_group().read().last_signal.clone();
1886            // Stopping because the thread group has stopped.
1887            let event = Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0));
1888            self.write().set_stopped(StopState::GroupStopped, signal, Some(self), event);
1889            return true;
1890        }
1891
1892        // Stopping because the task is stopping
1893        if stopped.is_stopping_or_stopped() {
1894            if let Ok(stopped) = stopped.finalize() {
1895                self.set_stopped_and_notify(stopped, None);
1896            }
1897            return true;
1898        }
1899
1900        false
1901    }
1902
1903    /// Blocks execution of this task as long as it is stopped and not
1904    /// terminated.
1905    pub fn block_while_stopped(&mut self, locked: &mut Locked<Unlocked>) {
1906        // Upgrade the state from stopping to stopped if needed. Return if the task
1907        // should not be stopped.
1908        if !self.finalize_stop_state() {
1909            return;
1910        }
1911
1912        let waiter = Waiter::new_ignoring_signals();
1913        loop {
1914            // If we've exited, unstop the threads and return without notifying
1915            // waiters.
1916            if self.is_exitted() {
1917                self.thread_group().set_stopped(StopState::ForceAwake, None, false);
1918                self.write().set_stopped(StopState::ForceAwake, None, Some(self), None);
1919                return;
1920            }
1921
1922            if self.wake_or_wait_until_unstopped_async(&waiter) {
1923                return;
1924            }
1925
1926            // Do the wait. Result is not needed, as this is not in a syscall.
1927            let _: Result<(), Errno> = waiter.wait(locked, self);
1928
1929            // Maybe go from stopping to stopped, if we are currently stopping
1930            // again.
1931            self.finalize_stop_state();
1932        }
1933    }
1934
1935    /// For traced tasks, this will return the data necessary for a cloned task
1936    /// to attach to the same tracer.
1937    pub fn get_ptrace_core_state_for_clone(
1938        &mut self,
1939        clone_args: &clone_args,
1940    ) -> (PtraceOptions, Option<PtraceCoreState>) {
1941        let state = self.write();
1942        if let Some(ptrace) = &state.ptrace {
1943            ptrace.get_core_state_for_clone(clone_args)
1944        } else {
1945            (PtraceOptions::empty(), None)
1946        }
1947    }
1948
1949    /// If currently being ptraced with the given option, emits the appropriate
1950    /// event.  PTRACE_EVENTMSG will return the given message.  For execve, if
1951    /// TRACEEXEC is not set, sends the legacy SIGTRAP instead.
1952    ///
1953    /// Note that the Linux kernel has a documented bug where, if TRACEEXIT is
1954    /// enabled, SIGKILL will trigger an event.  We do not exhibit this
1955    /// behavior.
1956    pub fn ptrace_event(
1957        &mut self,
1958        locked: &mut Locked<Unlocked>,
1959        trace_kind: PtraceOptions,
1960        msg: u64,
1961    ) {
1962        if !trace_kind.is_empty() {
1963            {
1964                let mut state = self.write();
1965                if let Some(ptrace) = &mut state.ptrace {
1966                    if !ptrace.has_option(trace_kind) {
1967                        // If this would be a TRACEEXEC, but TRACEEXEC is not
1968                        // turned on, then send a SIGTRAP.
1969                        if trace_kind == PtraceOptions::TRACEEXEC && !ptrace.is_seized() {
1970                            // Send a SIGTRAP so that the parent can gain control.
1971                            send_signal_first(locked, self, state, SignalInfo::default(SIGTRAP));
1972                        }
1973
1974                        return;
1975                    }
1976                    let mut siginfo = SignalInfo::default(starnix_uapi::signals::SIGTRAP);
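                    // Pack the ptrace event into the high byte of si_code, as Linux
                    // does: e.g. TRACEEXEC yields (PTRACE_EVENT_EXEC << 8) | SIGTRAP
                    // = (4 << 8) | 5 = 0x405.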
1977                    siginfo.code = (((PtraceEvent::from_option(&trace_kind) as u32) << 8)
1978                        | linux_uapi::SIGTRAP) as i32;
1979                    state.set_stopped(
1980                        StopState::PtraceEventStopping,
1981                        Some(siginfo),
1982                        None,
1983                        Some(PtraceEventData::new(trace_kind, msg)),
1984                    );
1985                } else {
1986                    return;
1987                }
1988            }
1989            self.block_while_stopped(locked);
1990        }
1991    }
1992
1993    /// Causes the current thread's thread group to exit, notifying any ptracer
1994    /// of this task first.
1995    pub fn thread_group_exit(&mut self, locked: &mut Locked<Unlocked>, exit_status: ExitStatus) {
1996        self.ptrace_event(
1997            locked,
1998            PtraceOptions::TRACEEXIT,
1999            exit_status.signal_info_status() as u64,
2000        );
2001        self.thread_group().exit(locked, exit_status, None);
2002    }
2003
2004    /// The flags argument contains only the flags as in clone3(), and does not use the low 8 bits for the
2005    /// exit signal as in clone().
2006    pub fn clone_task_for_test<L>(
2007        &self,
2008        locked: &mut Locked<L>,
2009        flags: u64,
2010        exit_signal: Option<Signal>,
2011    ) -> crate::testing::AutoReleasableTask
2012    where
2013        L: LockBefore<MmDumpable>,
2014        L: LockBefore<TaskRelease>,
2015        L: LockBefore<ProcessGroupState>,
2016    {
2017        let result = self
2018            .clone_task(
2019                locked,
2020                flags,
2021                exit_signal,
2022                UserRef::default(),
2023                UserRef::default(),
2024                UserRef::default(),
2025            )
2026            .expect("failed to create task in test");
2027
2028        result.into()
2029    }
2030
2031    /// See "Ptrace access mode checking" in https://man7.org/linux/man-pages/man2/ptrace.2.html
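    ///
    /// Minimal sketch (not compiled as a doctest): employ the caller's real
    /// UID/GID for the check, as a PTRACE_ATTACH-style caller would; real call
    /// sites may also set additional mode bits.
    ///
    /// ```ignore
    /// current_task.check_ptrace_access_mode(locked, PTRACE_MODE_REALCREDS, &target_task)?;
    /// ```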
2032    pub fn check_ptrace_access_mode<L>(
2033        &self,
2034        locked: &mut Locked<L>,
2035        mode: PtraceAccessMode,
2036        target: &Task,
2037    ) -> Result<(), Errno>
2038    where
2039        L: LockBefore<MmDumpable>,
2040    {
2041        // (1)  If the calling thread and the target thread are in the same
2042        //      thread group, access is always allowed.
2043        if self.thread_group().leader == target.thread_group().leader {
2044            return Ok(());
2045        }
2046
2047        // (2)  If the access mode specifies PTRACE_MODE_FSCREDS, then, for
2048        //      the check in the next step, employ the caller's filesystem
2049        //      UID and GID.  (As noted in credentials(7), the filesystem
2050        //      UID and GID almost always have the same values as the
2051        //      corresponding effective IDs.)
2052        //
2053        //      Otherwise, the access mode specifies PTRACE_MODE_REALCREDS,
2054        //      so use the caller's real UID and GID for the checks in the
2055        //      next step.  (Most APIs that check the caller's UID and GID
2056        //      use the effective IDs.  For historical reasons, the
2057        //      PTRACE_MODE_REALCREDS check uses the real IDs instead.)
2058        let (uid, gid) = self.with_current_creds(|creds| {
2059            if mode.contains(PTRACE_MODE_FSCREDS) {
2060                let fscred = creds.as_fscred();
2061                (fscred.uid, fscred.gid)
2062            } else if mode.contains(PTRACE_MODE_REALCREDS) {
2063                (creds.uid, creds.gid)
2064            } else {
2065                unreachable!();
2066            }
2067        });
2068
2069        // (3)  Deny access if neither of the following is true:
2070        //
2071        //      -  The real, effective, and saved-set user IDs of the target
2072        //         match the caller's user ID, and the real, effective, and
2073        //         saved-set group IDs of the target match the caller's
2074        //         group ID.
2075        //
2076        //      -  The caller has the CAP_SYS_PTRACE capability in the user
2077        //         namespace of the target.
2078        let target_creds = target.real_creds();
2079        if !(target_creds.uid == uid
2080            && target_creds.euid == uid
2081            && target_creds.saved_uid == uid
2082            && target_creds.gid == gid
2083            && target_creds.egid == gid
2084            && target_creds.saved_gid == gid)
2085        {
2086            security::check_task_capable(self, CAP_SYS_PTRACE)?;
2087        }
2088
2089        // (4)  Deny access if the target process "dumpable" attribute has a
2090        //      value other than 1 (SUID_DUMP_USER; see the discussion of
2091        //      PR_SET_DUMPABLE in prctl(2)), and the caller does not have
2092        //      the CAP_SYS_PTRACE capability in the user namespace of the
2093        //      target process.
2094        let dumpable = *target.mm()?.dumpable.lock(locked);
2095        match dumpable {
2096            DumpPolicy::User => (),
2097            DumpPolicy::Disable => security::check_task_capable(self, CAP_SYS_PTRACE)?,
2098        }
2099
2100        // (5)  The kernel LSM security_ptrace_access_check() interface is
2101        //      invoked to see if ptrace access is permitted.
2102        security::ptrace_access_check(self, target, mode)?;
2103
2104        // (6)  If access has not been denied by any of the preceding steps,
2105        //      then access is allowed.
2106        Ok(())
2107    }
2108
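    /// Checks whether this task has permission to send `unchecked_signal` to
    /// `target`, following the rules described in kill(2).
    ///
    /// Minimal sketch (not compiled as a doctest): gate a kill(2)-style delivery
    /// on this check before handing the signal to the delivery machinery.
    ///
    /// ```ignore
    /// current_task.can_signal(&target, SIGKILL.into())?;
    /// // ...then deliver the signal, e.g. via send_signal_first/send_standard_signal.
    /// ```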
2109    pub fn can_signal(
2110        &self,
2111        target: &Task,
2112        unchecked_signal: UncheckedSignal,
2113    ) -> Result<(), Errno> {
2114        // If both tasks share a thread group, the signal can be sent. This is not documented
2115        // in kill(2) because kill does not support task-level granularity in signal sending.
2116        if self.thread_group == target.thread_group {
2117            return Ok(());
2118        }
2119
2120        let (target_uid, target_saved_uid) =
2121            target.with_real_creds(|creds| (creds.uid, creds.saved_uid));
2122        if self.with_current_creds(|creds| {
2123            // From https://man7.org/linux/man-pages/man2/kill.2.html:
2124            //
2125            // > For a process to have permission to send a signal, it must either be
2126            // > privileged (under Linux: have the CAP_KILL capability in the user
2127            // > namespace of the target process), or the real or effective user ID of
2128            // > the sending process must equal the real or saved set- user-ID of the
2129            // > target process.
2130            //
2131            // Returns true if the credentials are considered to have the same user ID.
2132            creds.euid == target_saved_uid
2133                || creds.euid == target_uid
2134                || creds.uid == target_uid
2135                || creds.uid == target_saved_uid
2136        }) {
2137            return Ok(());
2138        }
2139
2140        if Signal::try_from(unchecked_signal) == Ok(SIGCONT) {
2141            let target_session = target.thread_group().read().process_group.session.leader;
2142            let self_session = self.thread_group().read().process_group.session.leader;
2143            if target_session == self_session {
2144                return Ok(());
2145            }
2146        }
2147
2148        security::check_task_capable(self, CAP_KILL)
2149    }
2150}
2151
2152impl ArchSpecific for CurrentTask {
2153    fn is_arch32(&self) -> bool {
2154        self.thread_state.is_arch32()
2155    }
2156}
2157
2158impl MemoryAccessor for CurrentTask {
2159    fn read_memory<'a>(
2160        &self,
2161        addr: UserAddress,
2162        bytes: &'a mut [MaybeUninit<u8>],
2163    ) -> Result<&'a mut [u8], Errno> {
2164        self.mm()?.unified_read_memory(self, addr, bytes)
2165    }
2166
2167    fn read_memory_partial_until_null_byte<'a>(
2168        &self,
2169        addr: UserAddress,
2170        bytes: &'a mut [MaybeUninit<u8>],
2171    ) -> Result<&'a mut [u8], Errno> {
2172        self.mm()?.unified_read_memory_partial_until_null_byte(self, addr, bytes)
2173    }
2174
2175    fn read_memory_partial<'a>(
2176        &self,
2177        addr: UserAddress,
2178        bytes: &'a mut [MaybeUninit<u8>],
2179    ) -> Result<&'a mut [u8], Errno> {
2180        self.mm()?.unified_read_memory_partial(self, addr, bytes)
2181    }
2182
2183    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
2184        self.mm()?.unified_write_memory(self, addr, bytes)
2185    }
2186
2187    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
2188        self.mm()?.unified_write_memory_partial(self, addr, bytes)
2189    }
2190
2191    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
2192        self.mm()?.unified_zero(self, addr, length)
2193    }
2194}
2195
2196impl TaskMemoryAccessor for CurrentTask {
2197    fn maximum_valid_address(&self) -> Option<UserAddress> {
2198        self.mm().ok().map(|mm| mm.maximum_valid_user_address)
2199    }
2200}
2201
2202pub enum ExceptionResult {
2203    /// The exception was handled and no further action is required.
2204    Handled,
2205
2206    /// The exception generated a signal that should be delivered.
2207    Signal(SignalInfo),
2208}
2209
2210#[cfg(test)]
2211mod tests {
2212    use crate::testing::spawn_kernel_and_run;
2213
2214    // This test runs `override_creds` and checks that it doesn't crash, ensuring that the
2215    // delegation to `override_creds_async` is correct.
2216    #[::fuchsia::test]
2217    async fn test_override_creds_can_delegate_to_async_version() {
2218        spawn_kernel_and_run(async move |_, current_task| {
2219            assert_eq!(current_task.override_creds(|_| {}, || 0), 0);
2220        })
2221        .await;
2222    }
2223}