// starnix_core/task/current_task.rs

1// Copyright 2023 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::arch::task::{decode_page_fault_exception_report, get_signal_for_general_exception};
6use crate::execution::{TaskInfo, create_zircon_process};
7use crate::mm::{DumpPolicy, MemoryAccessor, MemoryAccessorExt, TaskMemoryAccessor};
8use crate::ptrace::{PtraceCoreState, PtraceEvent, PtraceEventData, PtraceOptions, StopState};
9use crate::security;
10use crate::signals::{RunState, SignalDetail, SignalInfo, send_signal_first, send_standard_signal};
11use crate::task::loader::{ResolvedElf, load_executable, resolve_executable};
12use crate::task::waiter::WaiterOptions;
13use crate::task::{
14    ExitStatus, RobustListHeadPtr, SeccompFilter, SeccompFilterContainer, SeccompNotifierHandle,
15    SeccompState, SeccompStateValue, Task, TaskFlags, TaskLiveState, ThreadState, Waiter,
16};
17use crate::vfs::{
18    CheckAccessReason, FdFlags, FdNumber, FileHandle, FsContext, FsStr, LookupContext,
19    MAX_SYMLINK_FOLLOWS, NamespaceNode, ResolveBase, SymlinkMode, SymlinkTarget, new_pidfd,
20};
21use futures::FutureExt;
22use linux_uapi::CLONE_PIDFD;
23use starnix_logging::{log_error, log_warn, track_file_not_found, track_stub};
24use starnix_registers::{HeapRegs, RegisterStorageEnum};
25use starnix_stack::clean_stack;
26use starnix_sync::{
27    EventWaitGuard, FileOpsCore, LockBefore, LockEqualOrBefore, Locked, MmDumpable,
28    ProcessGroupState, TaskRelease, Unlocked, WakeReason,
29};
30use starnix_syscalls::SyscallResult;
31use starnix_syscalls::decls::Syscall;
32use starnix_task_command::TaskCommand;
33use starnix_types::futex_address::FutexAddress;
34use starnix_types::ownership::{OwnedRef, Releasable, TempRef, WeakRef, release_on_error};
35use starnix_uapi::auth::{
36    CAP_KILL, CAP_SYS_ADMIN, CAP_SYS_PTRACE, Credentials, FsCred, PTRACE_MODE_FSCREDS,
37    PTRACE_MODE_REALCREDS, PtraceAccessMode, UserAndOrGroupId,
38};
39use starnix_uapi::device_type::DeviceType;
40use starnix_uapi::errors::Errno;
41use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
42use starnix_uapi::open_flags::OpenFlags;
43use starnix_uapi::signals::{
44    SIGBUS, SIGCHLD, SIGCONT, SIGILL, SIGKILL, SIGSEGV, SIGSYS, SIGTRAP, SigSet, Signal,
45    UncheckedSignal,
46};
47use starnix_uapi::user_address::{ArchSpecific, UserAddress, UserRef};
48use starnix_uapi::vfs::ResolveFlags;
49use starnix_uapi::{
50    CLONE_CHILD_CLEARTID, CLONE_CHILD_SETTID, CLONE_FILES, CLONE_FS, CLONE_INTO_CGROUP,
51    CLONE_NEWUTS, CLONE_PARENT, CLONE_PARENT_SETTID, CLONE_PTRACE, CLONE_SETTLS, CLONE_SIGHAND,
52    CLONE_SYSVSEM, CLONE_THREAD, CLONE_VFORK, CLONE_VM, FUTEX_OWNER_DIED, FUTEX_TID_MASK,
53    ROBUST_LIST_LIMIT, SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_NEW_LISTENER,
54    SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_TSYNC_ESRCH, clone_args, errno, error,
55    from_status_like_fdio, pid_t, sock_filter, ucred,
56};
57use std::cell::{Ref, RefCell};
58use std::collections::VecDeque;
59use std::ffi::CString;
60use std::fmt;
61use std::marker::PhantomData;
62use std::mem::MaybeUninit;
63use std::sync::Arc;
64use zx::sys::zx_restricted_state_t;
65
66use super::ThreadGroupLifecycleWaitValue;
67
/// Owns a `Task` and its thread state while the task is being set up.
///
/// Convertible into a `CurrentTask` (see the `From<TaskBuilder>` impl) once
/// construction is complete, or released directly if setup fails.
pub struct TaskBuilder {
    /// The underlying task object.
    pub task: OwnedRef<Task>,

    /// Per-thread state accumulated during construction; moved into the
    /// `CurrentTask` on conversion.
    pub thread_state: ThreadState<HeapRegs>,
}
74
75impl TaskBuilder {
76    pub fn new(task: OwnedRef<Task>) -> Self {
77        Self { task, thread_state: Default::default() }
78    }
79
80    #[inline(always)]
81    pub fn release<L>(self, locked: &mut Locked<L>)
82    where
83        L: LockBefore<TaskRelease>,
84    {
85        let locked = locked.cast_locked::<TaskRelease>();
86        Releasable::release(self, locked);
87    }
88}
89
90impl From<TaskBuilder> for CurrentTask {
91    fn from(builder: TaskBuilder) -> Self {
92        Self::new(builder.task, builder.thread_state.into())
93    }
94}
95
impl Releasable for TaskBuilder {
    type Context<'a> = &'a mut Locked<TaskRelease>;

    fn release<'a>(self, locked: Self::Context<'a>) {
        // Keep the kernel alive for the duration of teardown; `self.task` is
        // consumed below.
        let kernel = Arc::clone(self.kernel());
        let mut pids = kernel.pids.write();

        // We remove from the thread group here because the WeakRef in the pid
        // table to this task must be valid until this task is removed from the
        // thread group, and the code below will invalidate it.
        // Moreover, this requires a OwnedRef of the task to ensure the tasks of
        // the thread group are always valid.
        self.task.thread_group().remove(locked, &mut pids, &self.task);

        // Convert the builder's heap-backed thread state into the form
        // `Task::release` expects, then finish tearing down the task.
        let context = (self.thread_state.into(), locked, pids);
        self.task.release(context);
    }
}
114
115impl std::ops::Deref for TaskBuilder {
116    type Target = Task;
117    fn deref(&self) -> &Self::Target {
118        &self.task
119    }
120}
121
/// The task object associated with the currently executing thread.
///
/// We often pass the `CurrentTask` as the first argument to functions if those functions need to
/// know contextual information about the thread on which they are running. For example, we often
/// use the `CurrentTask` to perform access checks, which ensures that the caller is authorized to
/// perform the requested operation.
///
/// The `CurrentTask` also has state that can be referenced only on the currently executing thread,
/// such as the register state for that thread. Syscalls are given a mutable references to the
/// `CurrentTask`, which lets them manipulate this state.
///
/// See also `Task` for more information about tasks.
pub struct CurrentTask {
    /// The underlying task object.
    pub task: OwnedRef<Task>,

    /// Per-thread state (registers, syscall restart function, etc.) for the
    /// currently executing thread.
    pub thread_state: ThreadState<RegisterStorageEnum>,

    /// The current subjective credentials of the task.
    // TODO(https://fxbug.dev/433548348): Avoid interior mutability here by passing a
    // &mut CurrentTask around instead of &CurrentTask.
    pub current_creds: RefCell<CurrentCreds>,

    /// Makes CurrentTask neither Sync nor Send, pinning it to the thread it
    /// represents.
    _local_marker: PhantomData<*mut u8>,
}
148
/// Represents the current state of the task's subjective credentials.
pub enum CurrentCreds {
    /// The task does not have overridden credentials, the subjective creds are identical to the
    /// objective creds stored in the Task. Since credentials are often accessed from the current
    /// task, we hold a reference here that does not necessitate going through the RCU machinery to
    /// read.
    Cached(Arc<Credentials>),
    /// The task has overridden subjective credentials (see
    /// `CurrentTask::override_creds*`).
    Overridden(Arc<Credentials>),
}
159
160impl CurrentCreds {
161    fn creds(&self) -> &Arc<Credentials> {
162        match self {
163            CurrentCreds::Cached(creds) => creds,
164            CurrentCreds::Overridden(creds) => creds,
165        }
166    }
167}
168
impl Releasable for CurrentTask {
    type Context<'a> = &'a mut Locked<TaskRelease>;

    fn release<'a>(self, locked: Self::Context<'a>) {
        // Perform the exit-time userspace notifications (robust futex list,
        // clear_child_tid) before dismantling any task state.
        self.notify_robust_list();
        let _ignored = self.clear_child_tid_if_needed(locked);

        // Keep the kernel alive for the duration of teardown; `self.task` is
        // consumed below.
        let kernel = Arc::clone(self.kernel());
        let mut pids = kernel.pids.write();

        // We remove from the thread group here because the WeakRef in the pid
        // table to this task must be valid until this task is removed from the
        // thread group, and the code below will invalidate it.
        // Moreover, this requires a OwnedRef of the task to ensure the tasks of
        // the thread group are always valid.
        self.task.thread_group().remove(locked, &mut pids, &self.task);

        let context = (self.thread_state, locked, pids);
        self.task.release(context);
    }
}
190
191impl std::ops::Deref for CurrentTask {
192    type Target = Task;
193    fn deref(&self) -> &Self::Target {
194        &self.task
195    }
196}
197
impl fmt::Debug for CurrentTask {
    /// Delegates to the underlying task's `Debug` output.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.task.fmt(f)
    }
}
203
204impl CurrentTask {
205    pub fn new(task: OwnedRef<Task>, thread_state: ThreadState<RegisterStorageEnum>) -> Self {
206        let current_creds = RefCell::new(CurrentCreds::Cached(task.clone_creds()));
207        Self { task, thread_state, current_creds, _local_marker: Default::default() }
208    }
209
    /// Returns the live state of the task.
    ///
    /// This panics if the task has already transitioned to a zombie state because `CurrentTask`
    /// only exists for live tasks.
    #[track_caller]
    pub fn live(&self) -> Arc<TaskLiveState> {
        self.task.live().expect("CurrentTask must have TaskLiveState")
    }
218
    /// Returns the task's filesystem context.
    ///
    /// Panics if the task is no longer live (see [`CurrentTask::live`]).
    pub fn fs(&self) -> Arc<FsContext> {
        self.live().fs()
    }
222
    /// Heuristically reports whether this task's `FsContext` is shared with
    /// another holder, based on the `Arc` strong count.
    pub fn has_shared_fs(&self) -> bool {
        let fs = self.fs();
        // This check is incorrect because someone else could be holding a temporary Arc to the
        // FsContext and therefore increasing the strong count.
        Arc::strong_count(&fs) > 2usize
    }
229
230    pub fn unshare_fs(&self) {
231        let new_fs = self.fs().fork();
232        self.live().fs.update(new_fs);
233    }
234
    /// Returns the current subjective credentials of the task.
    ///
    /// The subjective credentials are the credentials that are used to check permissions for
    /// actions performed by the task.
    ///
    /// The returned `Ref` keeps `current_creds` borrowed; holding it across a
    /// call to `set_creds` or `override_creds*` would panic the `RefCell`.
    pub fn current_creds(&self) -> Ref<'_, Arc<Credentials>> {
        Ref::map(self.current_creds.borrow(), CurrentCreds::creds)
    }
242
    /// The task's subjective credentials expressed as filesystem credentials.
    pub fn current_fscred(&self) -> FsCred {
        self.current_creds().as_fscred()
    }
246
    /// The task's pid plus subjective uid/gid, packaged as a `ucred`.
    pub fn current_ucred(&self) -> ucred {
        let creds = self.current_creds();
        ucred { pid: self.get_pid(), uid: creds.uid, gid: creds.gid }
    }
251
    /// Temporarily installs `new_creds` as the task's subjective credentials,
    /// awaits `callback`, and then restores the previous credentials.
    ///
    /// Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and
    /// used to check permissions for actions performed by the task, is altered. The "objective"
    /// state, accessed through `Task::real_creds()` by other tasks and used to check permissions
    /// for actions performed on the task, is not altered, and changes to the credentials are not
    /// externally visible.
    pub async fn override_creds_async<R>(
        &self,
        new_creds: Arc<Credentials>,
        callback: impl AsyncFnOnce() -> R,
    ) -> R {
        // Swap in the override, remembering whatever was installed before.
        let saved = self.current_creds.replace(CurrentCreds::Overridden(new_creds));
        let result = callback().await;
        // NOTE(review): if `callback` panics, the saved creds are not restored
        // — presumably unwinding tears down the task entirely; confirm.
        self.current_creds.replace(saved);
        result
    }
270
    /// Synchronous wrapper around [`CurrentTask::override_creds_async`]:
    /// installs `new_creds` as the subjective credentials, runs `callback`,
    /// then restores the previous credentials.
    ///
    /// Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and
    /// used to check permissions for actions performed by the task, is altered. The "objective"
    /// state, accessed through `Task::real_creds()` by other tasks and used to check permissions
    /// for actions performed on the task, is not altered, and changes to the credentials are not
    /// externally visible.
    pub fn override_creds<R>(
        &self,
        new_creds: Arc<Credentials>,
        callback: impl FnOnce() -> R,
    ) -> R {
        // A synchronous callback never suspends, so the async body completes
        // on its first poll and `now_or_never` always yields `Some`.
        self.override_creds_async(new_creds, async move || callback())
            .now_or_never()
            .expect("Future should be ready")
    }
288
289    pub fn has_overridden_creds(&self) -> bool {
290        matches!(*self.current_creds.borrow(), CurrentCreds::Overridden(_))
291    }
292
293    pub fn trigger_delayed_releaser<L>(&self, locked: &mut Locked<L>)
294    where
295        L: LockEqualOrBefore<FileOpsCore>,
296    {
297        let locked = locked.cast_locked::<FileOpsCore>();
298        self.kernel().delayed_releaser.apply(locked, self);
299    }
300
    /// Returns a weak reference to the underlying `Task`.
    pub fn weak_task(&self) -> WeakRef<Task> {
        WeakRef::from(&self.task)
    }
304
    /// Returns a temporary reference to the underlying `Task`, borrowed from
    /// this `CurrentTask`.
    pub fn temp_task(&self) -> TempRef<'_, Task> {
        TempRef::from(&self.task)
    }
308
    /// Change the current and real creds of the task. This is invalid to call while temporary
    /// credentials are present.
    pub fn set_creds(&self, creds: Credentials) {
        // Overridden creds would be silently clobbered by the cache update
        // below, so this is a caller bug.
        assert!(!self.has_overridden_creds());

        let creds = Arc::new(creds);
        // Update the subjective (cached) copy first.
        let mut current_creds = self.current_creds.borrow_mut();
        *current_creds = CurrentCreds::Cached(creds.clone());

        // SAFETY: this is allowed because we are the CurrentTask.
        unsafe {
            self.persistent_info.write_creds().update(creds);
        }
        // The /proc/pid directory's ownership is updated when the task's euid
        // or egid changes. See proc(5).
        let maybe_node = self.proc_pid_directory_cache.lock();
        if let Some(node) = &*maybe_node {
            let creds = self.real_creds().euid_as_fscred();
            // SAFETY: The /proc/pid directory held by `proc_pid_directory_cache` represents the
            // current task. It's owner and group are supposed to track the current task's euid and
            // egid.
            unsafe {
                node.force_chown(creds);
            }
        }
    }
335
336    #[inline(always)]
337    pub fn release<L>(self, locked: &mut Locked<L>)
338    where
339        L: LockBefore<TaskRelease>,
340    {
341        let locked = locked.cast_locked::<TaskRelease>();
342        Releasable::release(self, locked);
343    }
344
    /// Stores `f` to be invoked if/when the current syscall is restarted,
    /// boxing it and erasing the concrete result type into `SyscallResult`.
    pub fn set_syscall_restart_func<R: Into<SyscallResult>>(
        &mut self,
        f: impl FnOnce(&mut Locked<Unlocked>, &mut CurrentTask) -> Result<R, Errno>
        + Send
        + Sync
        + 'static,
    ) {
        self.thread_state.syscall_restart_func =
            Some(Box::new(|locked, current_task| Ok(f(locked, current_task)?.into())));
    }
355
    /// Installs `file` in the task's fd table with the given `flags`,
    /// returning the newly allocated fd number.
    ///
    /// Panics if the task is no longer live.
    pub fn add_file<L>(
        &self,
        locked: &mut Locked<L>,
        file: FileHandle,
        flags: FdFlags,
    ) -> Result<FdNumber, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.live().files.add(locked, self, file, flags)
    }
367
    /// Looks up `fd` in the task's fd table.
    pub fn get_file(&self, fd: FdNumber) -> Result<FileHandle, Errno> {
        self.live().files.get(fd)
    }
371
    /// Like [`CurrentTask::get_file`], but also returns files that were opened
    /// with `O_PATH`.
    pub fn get_file_allowing_opath(&self, fd: FdNumber) -> Result<FileHandle, Errno> {
        self.live().files.get_allowing_opath(fd)
    }
375
    /// Sets the task's signal mask to `signal_mask` and runs `wait_function`.
    ///
    /// Signals are dequeued prior to the original signal mask being restored. This is done by the
    /// signal machinery in the syscall dispatch loop.
    ///
    /// The returned result is the result returned from the wait function.
    pub fn wait_with_temporary_mask<F, T, L>(
        &mut self,
        locked: &mut Locked<L>,
        signal_mask: SigSet,
        wait_function: F,
    ) -> Result<T, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
        F: FnOnce(&mut Locked<L>, &CurrentTask) -> Result<T, Errno>,
    {
        {
            // Mark the mask as temporary so the dispatch loop knows to restore
            // the original mask after signal delivery.
            let mut state = self.write();
            state.set_flags(TaskFlags::TEMPORARY_SIGNAL_MASK, true);
            state.set_temporary_signal_mask(signal_mask);
        }
        // Run the wait with the task state lock released.
        wait_function(locked, self)
    }
399
    /// If waking, promotes from waking to awake.  If not waking, make waiter async
    /// wait until woken.  Returns true if woken.
    pub fn wake_or_wait_until_unstopped_async(&self, waiter: &Waiter) -> bool {
        // Lock order: thread-group state first, then this task's own state.
        let group_state = self.thread_group().read();
        let mut task_state = self.write();

        // Wake up if
        //   a) we should wake up, meaning:
        //      i) we're in group stop, and the thread group has exited group stop, or
        //      ii) we're waking up,
        //   b) and ptrace isn't stopping us from waking up, but
        //   c) always wake up if we got a SIGKILL.
        let task_stop_state = self.load_stopped();
        let group_stop_state = self.thread_group().load_stopped();
        if ((task_stop_state == StopState::GroupStopped && group_stop_state.is_waking_or_awake())
            || task_stop_state.is_waking_or_awake())
            && (!task_state.is_ptrace_listening() || task_stop_state.is_force())
        {
            // Finalize whichever stop state (task-level or group-level) is
            // mid-transition.
            let new_state = if task_stop_state.is_waking_or_awake() {
                task_stop_state.finalize()
            } else {
                group_stop_state.finalize()
            };
            if let Ok(new_state) = new_state {
                task_state.set_stopped(new_state, None, Some(self), None);
                // Release both locks before calling back into the thread group.
                drop(group_state);
                drop(task_state);
                // It is possible for the stop state to be changed by another
                // thread between when it is checked above and the following
                // invocation, but set_stopped does sufficient checking while
                // holding the lock to make sure that such a change won't result
                // in corrupted state.
                self.thread_group().set_stopped(new_state, None, false);
                return true;
            }
        }

        // We will wait.
        if self.thread_group().load_stopped().is_stopped() || task_stop_state.is_stopped() {
            // If we've stopped or PTRACE_LISTEN has been sent, wait for a
            // signal or instructions from the tracer.
            group_state
                .lifecycle_waiters
                .wait_async_value(&waiter, ThreadGroupLifecycleWaitValue::Stopped);
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.can_accept_ptrace_commands() {
            // If we're stopped because a tracer has seen the stop and not taken
            // further action, wait for further instructions from the tracer.
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.is_ptrace_listening() {
            // A PTRACE_LISTEN is a state where we can get signals and notify a
            // ptracer, but otherwise remain blocked.
            if let Some(ptrace) = &mut task_state.ptrace {
                ptrace.set_last_signal(Some(SignalInfo::kernel(SIGTRAP)));
                ptrace.set_last_event(Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0)));
            }
            task_state.wait_on_ptracer(&waiter);
            task_state.notify_ptracers();
        }
        false
    }
461
    /// Set the RunState for the current task to the given value and then call the given callback.
    ///
    /// When the callback is done, the run_state is restored to `RunState::Running`.
    ///
    /// This function is typically used just before blocking the current task on some operation.
    /// The given `run_state` registers the mechanism for interrupting the blocking operation with
    /// the task and the given `callback` actually blocks the task.
    ///
    /// This function can only be called in the `RunState::Running` state and cannot set the
    /// run state to `RunState::Running`. For this reason, this function cannot be reentered.
    ///
    /// Returns `EINTR` without running `callback` if a relevant signal is
    /// already pending; otherwise returns the callback's result.
    pub fn run_in_state<F, T>(&self, run_state: RunState, callback: F) -> Result<T, Errno>
    where
        F: FnOnce() -> Result<T, Errno>,
    {
        assert_ne!(run_state, RunState::Running);

        // As an optimization, decommit unused pages of the stack to reduce memory pressure while
        // the thread is blocked.
        clean_stack();

        {
            let mut state = self.write();
            assert!(!state.is_blocked());

            if matches!(run_state, RunState::Frozen(_)) {
                // Freeze is a kernel signal and is handled before other user signals. A frozen task
                // ignores all other signals except SIGKILL until it is thawed.
                if state.has_signal_pending(SIGKILL) {
                    return error!(EINTR);
                }
            } else if state.is_any_signal_pending() && !state.is_ptrace_listening() {
                // A note on PTRACE_LISTEN - the thread cannot be scheduled
                // regardless of pending signals.
                return error!(EINTR);
            }
            state.set_run_state(run_state.clone());
        }

        // Block (or otherwise wait) with the task state lock released.
        let result = callback();

        {
            let mut state = self.write();
            assert_eq!(
                state.run_state(),
                run_state,
                "SignalState run state changed while waiting!"
            );
            state.set_run_state(RunState::Running);
        };

        result
    }
514
    /// Blocks the task on `guard`'s event until `deadline`, running in
    /// `RunState::Event` so the wait can be interrupted.
    ///
    /// Maps an interrupted wait to `EINTR` and an expired deadline to
    /// `ETIMEDOUT`.
    pub fn block_until(
        &self,
        guard: EventWaitGuard<'_>,
        deadline: zx::MonotonicInstant,
    ) -> Result<(), Errno> {
        self.run_in_state(RunState::Event(guard.event().clone()), move || {
            guard.block_until(None, deadline).map_err(|e| match e {
                WakeReason::Interrupted => errno!(EINTR),
                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
            })
        })
    }
527
    /// Like [`CurrentTask::block_until`], but passes `new_owner` through to
    /// `EventWaitGuard::block_until` while blocked.
    pub fn block_with_owner_until(
        &self,
        guard: EventWaitGuard<'_>,
        new_owner: &zx::Thread,
        deadline: zx::MonotonicInstant,
    ) -> Result<(), Errno> {
        self.run_in_state(RunState::Event(guard.event().clone()), move || {
            guard.block_until(Some(new_owner), deadline).map_err(|e| match e {
                WakeReason::Interrupted => errno!(EINTR),
                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
            })
        })
    }
541
    /// Determine namespace node indicated by the dir_fd.
    ///
    /// Returns the namespace node and the path to use relative to that node.
    ///
    /// Errors: `EXDEV` for an absolute path with `RESOLVE_BENEATH`; `ENOTDIR`
    /// if a non-empty path starts from a non-directory; access errors from the
    /// EXEC permission check on the starting directory.
    pub fn resolve_dir_fd<'a, L>(
        &self,
        locked: &mut Locked<L>,
        dir_fd: FdNumber,
        mut path: &'a FsStr,
        flags: ResolveFlags,
    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let path_is_absolute = path.starts_with(b"/");
        if path_is_absolute {
            if flags.contains(ResolveFlags::BENEATH) {
                return error!(EXDEV);
            }
            // Strip the leading '/'; the starting directory chosen below
            // supplies the root.
            path = &path[1..];
        }

        let dir = if path_is_absolute && !flags.contains(ResolveFlags::IN_ROOT) {
            self.fs().root()
        } else if dir_fd == FdNumber::AT_FDCWD {
            self.fs().cwd()
        } else {
            // O_PATH allowed for:
            //
            //   Passing the file descriptor as the dirfd argument of
            //   openat() and the other "*at()" system calls.  This
            //   includes linkat(2) with AT_EMPTY_PATH (or via procfs
            //   using AT_SYMLINK_FOLLOW) even if the file is not a
            //   directory.
            //
            // See https://man7.org/linux/man-pages/man2/open.2.html
            let file = self.get_file_allowing_opath(dir_fd)?;
            file.name.to_passive()
        };

        if !path.is_empty() {
            if !dir.entry.node.is_dir() {
                return error!(ENOTDIR);
            }
            // Walking into a directory requires search (EXEC) permission on it.
            dir.check_access(
                locked,
                self,
                Access::EXEC,
                CheckAccessReason::InternalPermissionChecks,
            )?;
        }
        Ok((dir, path.into()))
    }
594
    /// A convenient wrapper for opening files relative to FdNumber::AT_FDCWD.
    ///
    /// Returns a FileHandle but does not install the FileHandle in the FdTable
    /// for this task.
    ///
    /// Returns `EINVAL` if `flags` contains `O_CREAT`, since creation would
    /// require a `FileMode` argument.
    pub fn open_file(
        &self,
        locked: &mut Locked<Unlocked>,
        path: &FsStr,
        flags: OpenFlags,
    ) -> Result<FileHandle, Errno> {
        if flags.contains(OpenFlags::CREAT) {
            // In order to support OpenFlags::CREAT we would need to take a
            // FileMode argument.
            return error!(EINVAL);
        }
        self.open_file_at(
            locked,
            FdNumber::AT_FDCWD,
            path,
            flags,
            FileMode::default(),
            ResolveFlags::empty(),
            AccessCheck::default(),
        )
    }
620
    /// Resolves a path for open.
    ///
    /// If the final path component points to a symlink, the symlink is followed (as long as
    /// the symlink traversal limit has not been reached).
    ///
    /// If the final path component (after following any symlinks, if enabled) does not exist,
    /// and `flags` contains `OpenFlags::CREAT`, a new node is created at the location of the
    /// final path component.
    ///
    /// This returns the resolved node, and a boolean indicating whether the node has been created.
    fn resolve_open_path<L>(
        &self,
        locked: &mut Locked<L>,
        context: &mut LookupContext,
        dir: &NamespaceNode,
        path: &FsStr,
        mode: FileMode,
        flags: OpenFlags,
    ) -> Result<(NamespaceNode, bool), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        context.update_for_path(path);
        // Resolve everything up to the final component, following symlinks in
        // intermediate components, and carry the remaining follow budget back
        // into `context`.
        let mut parent_content = context.with(SymlinkMode::Follow);
        let (parent, basename) = self.lookup_parent(locked, &mut parent_content, dir, path)?;
        context.remaining_follows = parent_content.remaining_follows;

        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);

        // Lookup the child, without following a symlink or expecting it to be a directory.
        let mut child_context = context.with(SymlinkMode::NoFollow);
        child_context.must_be_directory = false;

        match parent.lookup_child(locked, self, &mut child_context, basename) {
            Ok(name) => {
                if name.entry.node.is_lnk() {
                    if flags.contains(OpenFlags::PATH)
                        && context.symlink_mode == SymlinkMode::NoFollow
                    {
                        // When O_PATH is specified in flags, if pathname is a symbolic link
                        // and the O_NOFOLLOW flag is also specified, then the call returns
                        // a file descriptor referring to the symbolic link.
                        // See https://man7.org/linux/man-pages/man2/openat.2.html
                        //
                        // If the trailing component (i.e., basename) of
                        // pathname is a symbolic link, how.resolve contains
                        // RESOLVE_NO_SYMLINKS, and how.flags contains both
                        // O_PATH and O_NOFOLLOW, then an O_PATH file
                        // descriptor referencing the symbolic link will be
                        // returned.
                        // See https://man7.org/linux/man-pages/man2/openat2.2.html
                        return Ok((name, false));
                    }

                    if (!flags.contains(OpenFlags::PATH)
                        && context.symlink_mode == SymlinkMode::NoFollow)
                        || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
                        || context.remaining_follows == 0
                    {
                        if must_create {
                            // Since `must_create` is set, and a node was found, this returns EEXIST
                            // instead of ELOOP.
                            return error!(EEXIST);
                        }
                        // A symlink was found, but one of the following is true:
                        // * flags specified O_NOFOLLOW but not O_PATH.
                        // * how.resolve contains RESOLVE_NO_SYMLINKS
                        // * too many symlink traversals have been attempted
                        return error!(ELOOP);
                    }

                    // Follow the symlink, consuming one unit of the follow
                    // budget, and recurse on its target.
                    context.remaining_follows -= 1;
                    match name.readlink(locked, self)? {
                        SymlinkTarget::Path(path) => {
                            // An absolute target restarts from the root;
                            // otherwise resolution continues from the parent.
                            let dir = if path[0] == b'/' { self.fs().root() } else { parent };
                            self.resolve_open_path(
                                locked,
                                context,
                                &dir,
                                path.as_ref(),
                                mode,
                                flags,
                            )
                        }
                        SymlinkTarget::Node(name) => {
                            if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS)
                                || name.entry.node.is_lnk()
                            {
                                error!(ELOOP)
                            } else {
                                Ok((name, false))
                            }
                        }
                    }
                } else {
                    if must_create {
                        return error!(EEXIST);
                    }
                    Ok((name, false))
                }
            }
            Err(e) if e == errno!(ENOENT) && flags.contains(OpenFlags::CREAT) => {
                // The final component is missing and O_CREAT was requested:
                // create a regular file in the parent.
                if context.must_be_directory {
                    return error!(EISDIR);
                }
                Ok((
                    parent.open_create_node(
                        locked,
                        self,
                        basename,
                        mode.with_type(FileMode::IFREG),
                        DeviceType::NONE,
                        flags,
                    )?,
                    true,
                ))
            }
            Err(e) => Err(e),
        }
    }
741
    /// The primary entry point for opening files relative to a task.
    ///
    /// Absolute paths are resolved relative to the root of the FsContext for
    /// this task. Relative paths are resolved relative to dir_fd. To resolve
    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
    /// dir_fd.
    ///
    /// Returns a FileHandle but does not install the FileHandle in the FdTable
    /// for this task.
    ///
    /// Returns `ENOENT` for an empty path.
    pub fn open_file_at(
        &self,
        locked: &mut Locked<Unlocked>,
        dir_fd: FdNumber,
        path: &FsStr,
        flags: OpenFlags,
        mode: FileMode,
        resolve_flags: ResolveFlags,
        access_check: AccessCheck,
    ) -> Result<FileHandle, Errno> {
        if path.is_empty() {
            return error!(ENOENT);
        }

        // Determine the starting directory, then hand off the rest of the open
        // to the namespace-node-based implementation.
        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, resolve_flags)?;
        self.open_namespace_node_at(locked, dir, path, flags, mode, resolve_flags, access_check)
    }
768
    /// Opens `path` relative to the namespace node `dir`, implementing the core
    /// of openat(2)/openat2(2).
    ///
    /// Handles O_PATH flag masking, O_TMPFILE, O_CREAT/O_EXCL, O_NOFOLLOW,
    /// O_TRUNC, and the RESOLVE_* flags. Returns the opened FileHandle without
    /// installing it into the task's FdTable.
    pub fn open_namespace_node_at(
        &self,
        locked: &mut Locked<Unlocked>,
        dir: NamespaceNode,
        path: &FsStr,
        flags: OpenFlags,
        mode: FileMode,
        mut resolve_flags: ResolveFlags,
        access_check: AccessCheck,
    ) -> Result<FileHandle, Errno> {
        // 64-bit kernels force the O_LARGEFILE flag to be on.
        let mut flags = flags | OpenFlags::LARGEFILE;
        let opath = flags.contains(OpenFlags::PATH);
        if opath {
            // When O_PATH is specified in flags, flag bits other than O_CLOEXEC,
            // O_DIRECTORY, and O_NOFOLLOW are ignored.
            const ALLOWED_FLAGS: OpenFlags = OpenFlags::from_bits_truncate(
                OpenFlags::PATH.bits()
                    | OpenFlags::CLOEXEC.bits()
                    | OpenFlags::DIRECTORY.bits()
                    | OpenFlags::NOFOLLOW.bits(),
            );
            flags &= ALLOWED_FLAGS;
        }

        // O_TMPFILE creates a file intended to be written, so it requires a
        // writable open mode.
        if flags.contains(OpenFlags::TMPFILE) && !flags.can_write() {
            return error!(EINVAL);
        }

        let nofollow = flags.contains(OpenFlags::NOFOLLOW);
        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);

        // Both O_NOFOLLOW and O_CREAT|O_EXCL prevent following a trailing
        // symlink during resolution.
        let symlink_mode =
            if nofollow || must_create { SymlinkMode::NoFollow } else { SymlinkMode::Follow };

        let resolve_base = match (
            resolve_flags.contains(ResolveFlags::BENEATH),
            resolve_flags.contains(ResolveFlags::IN_ROOT),
        ) {
            (false, false) => ResolveBase::None,
            (true, false) => ResolveBase::Beneath(dir.clone()),
            (false, true) => ResolveBase::InRoot(dir.clone()),
            // The two flags are mutually exclusive.
            (true, true) => return error!(EINVAL),
        };

        // `RESOLVE_BENEATH` and `RESOLVE_IN_ROOT` imply `RESOLVE_NO_MAGICLINKS`. This matches
        // Linux behavior. Strictly speaking it is not really required, but it's hard to
        // implement `BENEATH` and `IN_ROOT` flags correctly otherwise.
        if resolve_base != ResolveBase::None {
            resolve_flags.insert(ResolveFlags::NO_MAGICLINKS);
        }

        let mut context = LookupContext {
            symlink_mode,
            remaining_follows: MAX_SYMLINK_FOLLOWS,
            must_be_directory: flags.contains(OpenFlags::DIRECTORY),
            resolve_flags,
            resolve_base,
        };
        let (name, created) =
            match self.resolve_open_path(locked, &mut context, &dir, path, mode, flags) {
                Ok((n, c)) => (n, c),
                Err(e) => {
                    // Record the full absolute path for diagnostics before
                    // propagating the resolution error.
                    let mut abs_path = dir.path(&self.fs());
                    abs_path.extend(&**path);
                    track_file_not_found(abs_path);
                    return Err(e);
                }
            };

        let name = if flags.contains(OpenFlags::TMPFILE) {
            // `O_TMPFILE` is incompatible with `O_CREAT`
            if flags.contains(OpenFlags::CREAT) {
                return error!(EINVAL);
            }
            name.create_tmpfile(locked, self, mode.with_type(FileMode::IFREG), flags)?
        } else {
            let mode = name.entry.node.info().mode;

            // These checks are not needed in the `O_TMPFILE` case because `mode` refers to the
            // file we are opening. With `O_TMPFILE`, that file is the regular file we just
            // created rather than the node we found by resolving the path.
            //
            // For example, we do not need to produce `ENOTDIR` when `must_be_directory` is set
            // because `must_be_directory` refers to the node we found by resolving the path.
            // If that node was not a directory, then `create_tmpfile` will produce an error.
            //
            // Similarly, we never need to call `truncate` because `O_TMPFILE` is newly created
            // and therefore already an empty file.

            if !opath && nofollow && mode.is_lnk() {
                return error!(ELOOP);
            }

            if mode.is_dir() {
                if flags.can_write()
                    || flags.contains(OpenFlags::CREAT)
                    || flags.contains(OpenFlags::TRUNC)
                {
                    return error!(EISDIR);
                }
                if flags.contains(OpenFlags::DIRECT) {
                    return error!(EINVAL);
                }
            } else if context.must_be_directory {
                return error!(ENOTDIR);
            }

            if flags.contains(OpenFlags::TRUNC) && mode.is_reg() && !created {
                // You might think we should check file.can_write() at this
                // point, which is what the docs suggest, but apparently we
                // are supposed to truncate the file if this task can write
                // to the underlying node, even if we are opening the file
                // as read-only. See OpenTest.CanTruncateReadOnly.
                name.truncate(locked, self, 0)?;
            }

            name
        };

        // If the node has been created, the open operation should not verify access right:
        // From <https://man7.org/linux/man-pages/man2/open.2.html>
        //
        // > Note that mode applies only to future accesses of the newly created file; the
        // > open() call that creates a read-only file may well return a  read/write  file
        // > descriptor.

        let access_check = if created { AccessCheck::skip() } else { access_check };
        name.open(locked, self, flags, access_check)
    }
899
900    /// A wrapper for FsContext::lookup_parent_at that resolves the given
901    /// dir_fd to a NamespaceNode.
902    ///
903    /// Absolute paths are resolve relative to the root of the FsContext for
904    /// this task. Relative paths are resolve relative to dir_fd. To resolve
905    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
906    /// dir_fd.
907    pub fn lookup_parent_at<'a, L>(
908        &self,
909        locked: &mut Locked<L>,
910        context: &mut LookupContext,
911        dir_fd: FdNumber,
912        path: &'a FsStr,
913    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
914    where
915        L: LockEqualOrBefore<FileOpsCore>,
916    {
917        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, ResolveFlags::empty())?;
918        self.lookup_parent(locked, context, &dir, path)
919    }
920
921    /// Lookup the parent of a namespace node.
922    ///
923    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
924    /// calling this function directly.
925    ///
926    /// This function resolves all but the last component of the given path.
927    /// The function returns the parent directory of the last component as well
928    /// as the last component.
929    ///
930    /// If path is empty, this function returns dir and an empty path.
931    /// Similarly, if path ends with "." or "..", these components will be
932    /// returned along with the parent.
933    ///
934    /// The returned parent might not be a directory.
935    pub fn lookup_parent<'a, L>(
936        &self,
937        locked: &mut Locked<L>,
938        context: &mut LookupContext,
939        dir: &NamespaceNode,
940        path: &'a FsStr,
941    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
942    where
943        L: LockEqualOrBefore<FileOpsCore>,
944    {
945        context.update_for_path(path);
946
947        let mut current_node = dir.clone();
948        let mut it = path.split(|c| *c == b'/').filter(|p| !p.is_empty()).map(<&FsStr>::from);
949        let mut current_path_component = it.next().unwrap_or_default();
950        for next_path_component in it {
951            current_node =
952                current_node.lookup_child(locked, self, context, current_path_component)?;
953            current_path_component = next_path_component;
954        }
955        Ok((current_node, current_path_component))
956    }
957
958    /// Lookup a namespace node.
959    ///
960    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
961    /// calling this function directly.
962    ///
963    /// This function resolves the component of the given path.
964    pub fn lookup_path<L>(
965        &self,
966        locked: &mut Locked<L>,
967        context: &mut LookupContext,
968        dir: NamespaceNode,
969        path: &FsStr,
970    ) -> Result<NamespaceNode, Errno>
971    where
972        L: LockEqualOrBefore<FileOpsCore>,
973    {
974        let (parent, basename) = self.lookup_parent(locked, context, &dir, path)?;
975        parent.lookup_child(locked, self, context, basename)
976    }
977
978    /// Lookup a namespace node starting at the root directory.
979    ///
980    /// Resolves symlinks.
981    pub fn lookup_path_from_root<L>(
982        &self,
983        locked: &mut Locked<L>,
984        path: &FsStr,
985    ) -> Result<NamespaceNode, Errno>
986    where
987        L: LockEqualOrBefore<FileOpsCore>,
988    {
989        let mut context = LookupContext::default();
990        self.lookup_path(locked, &mut context, self.fs().root(), path)
991    }
992
    /// Replaces the current process image with `executable`, as for execve(2).
    ///
    /// `path` is the path used to reach the executable (also becomes the new
    /// command name); `argv` and `environ` become the new process's arguments
    /// and environment.
    ///
    /// If the exec fails after the old memory image has been discarded, the
    /// failure is unrecoverable and a forced SIGSEGV is delivered to the task.
    pub fn exec(
        &mut self,
        locked: &mut Locked<Unlocked>,
        executable: FileHandle,
        path: CString,
        argv: Vec<CString>,
        environ: Vec<CString>,
    ) -> Result<(), Errno> {
        // Executable must be a regular file
        if !executable.name.entry.node.is_reg() {
            return error!(EACCES);
        }

        // File node must have EXEC mode permissions.
        // Note that the ability to execute a file is unrelated to the flags
        // used in the `open` call.
        executable.name.check_access(locked, self, Access::EXEC, CheckAccessReason::Exec)?;

        // Let the security module compute the credentials the new image runs with.
        let elf_security_state = security::bprm_creds_for_exec(self, &executable.name)?;

        let resolved_elf = resolve_executable(
            locked,
            self,
            executable,
            path.clone(),
            argv,
            environ,
            elf_security_state,
        )?;

        // Honor set-user-ID/set-group-ID handling only when the kernel's suid
        // feature is enabled; otherwise no id change occurs.
        let maybe_set_id = if self.kernel().features.enable_suid {
            resolved_elf.file.name.suid_and_sgid(&self)?
        } else {
            Default::default()
        };

        if self.thread_group().read().tasks_count() > 1 {
            track_stub!(TODO("https://fxbug.dev/297434895"), "exec on multithread process");
            return error!(EINVAL);
        }

        if let Err(err) = self.finish_exec(locked, path, resolved_elf, maybe_set_id) {
            // The old image is already gone; the process cannot continue.
            log_warn!("unrecoverable error in exec: {err:?}");

            send_standard_signal(locked, self, SignalInfo::forced(SIGSEGV));
            return Err(err);
        }

        // Report PTRACE_EVENT_EXEC to any tracer that requested it.
        self.ptrace_event(locked, PtraceOptions::TRACEEXEC, self.task.tid as u64);
        self.signal_vfork();
        self.task.thread_group.sync_syscall_log_level();

        Ok(())
    }
1047
    /// After the memory is unmapped, any failure in exec is unrecoverable and results in the
    /// process crashing. This function is for that second half; any error returned from this
    /// function will be considered unrecoverable.
    ///
    /// `path` names the executable, `resolved_elf` is the already-resolved
    /// program image, and `maybe_set_id` carries any setuid/setgid credential
    /// change (which may be cleared below for no_new_privs/ptraced tasks).
    fn finish_exec(
        &mut self,
        locked: &mut Locked<Unlocked>,
        path: CString,
        resolved_elf: ResolvedElf,
        mut maybe_set_id: UserAndOrGroupId,
    ) -> Result<(), Errno> {
        // Now that the exec will definitely finish (or crash), notify owners of
        // locked futexes for the current process, which will be impossible to
        // update after process image is replaced.  See get_robust_list(2).
        self.notify_robust_list();

        // Passing arch32 information here ensures the replacement memory
        // layout matches the elf being executed.
        let mm = {
            let mm = self.mm()?;
            let new_mm = mm
                .exec(resolved_elf.file.name.to_passive(), resolved_elf.arch_width)
                .map_err(|status| from_status_like_fdio!(status))?;
            self.live().mm.update(Some(new_mm.clone()));
            new_mm
        };

        {
            let mut state = self.write();

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   The aforementioned transformations of the effective IDs are not
            //   performed (i.e., the set-user-ID and set-group-ID bits are
            //   ignored) if any of the following is true:
            //
            //   * the no_new_privs attribute is set for the calling thread (see
            //      prctl(2));
            //
            //   *  the underlying filesystem is mounted nosuid (the MS_NOSUID
            //      flag for mount(2)); or
            //
            //   *  the calling process is being ptraced.
            //
            // The MS_NOSUID check is in `NamespaceNode::suid_and_sgid()`.
            if state.no_new_privs() || state.is_ptraced() {
                maybe_set_id.clear();
            }

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   The process's "dumpable" attribute is set to the value 1,
            //   unless a set-user-ID program, a set-group-ID program, or a
            //   program with capabilities is being executed, in which case the
            //   dumpable flag may instead be reset to the value in
            //   /proc/sys/fs/suid_dumpable, in the circumstances described
            //   under PR_SET_DUMPABLE in prctl(2).
            let dumpable =
                if maybe_set_id.is_none() { DumpPolicy::User } else { DumpPolicy::Disable };
            *mm.dumpable.lock(locked) = dumpable;

            // TODO(https://fxbug.dev/433463756): Figure out whether this is the right place to
            // take the lock.
            // SAFETY: this is allowed because we are the CurrentTask.
            let mut writable_creds = unsafe { self.persistent_info.write_creds() };
            // The alternate signal stack and robust futex list do not survive exec.
            state.set_sigaltstack(None);
            state.robust_list_head = RobustListHeadPtr::null(self);

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   If a set-user-ID or set-group-ID
            //   program is being executed, then the parent death signal set by
            //   prctl(2) PR_SET_PDEATHSIG flag is cleared.
            //
            // TODO(https://fxbug.dev/356684424): Implement the behavior above once we support
            // the PR_SET_PDEATHSIG flag.

            // TODO(tbodt): Check whether capability xattrs are set on the file, and grant/limit
            // capabilities accordingly.
            let mut new_creds = Credentials::clone(&self.current_creds());
            new_creds.exec(maybe_set_id);
            let new_creds = Arc::new(new_creds);
            // Publish the new credentials both persistently and in the cached copy.
            writable_creds.update(new_creds.clone());
            *self.current_creds.borrow_mut() = CurrentCreds::Cached(new_creds);
        }

        let security_state = resolved_elf.security_state.clone();

        // Load the new program image; `start_info` describes its initial state.
        let start_info = load_executable(self, resolved_elf, &path)?;

        // Point the thread's registers at the new program's entry state.
        let regs: zx_restricted_state_t = start_info.into();
        self.thread_state.registers.load(regs);
        self.thread_state.extended_pstate.reset();
        self.thread_group().signal_actions.reset_for_exec();

        // The exit signal (and that of the children) is reset to SIGCHLD.
        let mut thread_group_state = self.thread_group().write();
        thread_group_state.exit_signal = Some(SIGCHLD);
        for (_, weak_child) in &mut thread_group_state.children {
            if let Some(child) = weak_child.upgrade() {
                let mut child_state = child.write();
                child_state.exit_signal = Some(SIGCHLD);
            }
        }

        std::mem::drop(thread_group_state);

        // TODO(https://fxbug.dev/42082680): All threads other than the calling thread are destroyed.

        // TODO: POSIX timers are not preserved.

        // TODO: Ensure that the filesystem context is un-shared, undoing the effect of CLONE_FS.

        // The file descriptor table is unshared, undoing the effect of the CLONE_FILES flag of
        // clone(2).
        self.live().files.unshare();
        self.live().files.exec(locked, self);

        // If SELinux is enabled, enforce permissions related to inheritance of file descriptors
        // and resource limits. Then update the current task's SID.
        //
        // TODO: https://fxbug.dev/378655436 - After the above, enforce permissions related to
        // signal state inheritance.
        //
        // This needs to be called after closing any files marked "close-on-exec".
        security::exec_binprm(locked, self, &security_state)?;

        self.thread_group().write().did_exec = true;

        self.set_command_name(TaskCommand::from_path_bytes(path.to_bytes()));

        Ok(())
    }
1180
    /// Updates this task's command name and refreshes the per-task logging
    /// info (task command, leader command, leader pid, tid).
    pub fn set_command_name(&self, new_name: TaskCommand) {
        // set_command_name needs to run before leader_command() in cases where self is the leader.
        self.task.set_command_name(new_name.clone());
        let leader_command = self.thread_group().read().leader_command();
        starnix_logging::set_current_task_info(
            new_name,
            leader_command,
            self.thread_group().leader,
            self.tid,
        );
    }
1192
    /// Installs a new seccomp filter for this task, implementing
    /// seccomp(SECCOMP_SET_MODE_FILTER).
    ///
    /// `code` is the cBPF filter program and `flags` is a bitmask of
    /// SECCOMP_FILTER_FLAG_* values. Returns the listener fd when
    /// SECCOMP_FILTER_FLAG_NEW_LISTENER is requested, otherwise an empty
    /// success result.
    pub fn add_seccomp_filter(
        &mut self,
        locked: &mut Locked<Unlocked>,
        code: Vec<sock_filter>,
        flags: u32,
    ) -> Result<SyscallResult, Errno> {
        // Compile the cBPF program; each filter gets a unique, monotonically
        // increasing id from the thread group.
        let new_filter = Arc::new(SeccompFilter::from_cbpf(
            &code,
            self.thread_group().next_seccomp_filter_id.add(1),
            flags & SECCOMP_FILTER_FLAG_LOG != 0,
        )?);

        let mut maybe_fd: Option<FdNumber> = None;

        if flags & SECCOMP_FILTER_FLAG_NEW_LISTENER != 0 {
            maybe_fd = Some(SeccompFilterContainer::create_listener(locked, self)?);
        }

        // We take the process lock here because we can't change any of the threads
        // while doing a tsync.  So, you hold the process lock while making any changes.
        let state = self.thread_group().write();

        if flags & SECCOMP_FILTER_FLAG_TSYNC != 0 {
            // TSYNC synchronizes all filters for all threads in the current process to
            // the current thread's

            // We collect the filters for the current task upfront to save us acquiring
            // the task's lock a lot of times below.
            let mut filters: SeccompFilterContainer = self.read().seccomp_filters.clone();

            // For TSYNC to work, all of the other thread filters in this process have to
            // be a prefix of this thread's filters, and none of them can be in
            // strict mode.
            let tasks = state.tasks().collect::<Vec<_>>();
            for task in &tasks {
                if task.tid == self.tid {
                    continue;
                }
                let other_task_state = task.read();

                // Target threads cannot be in SECCOMP_MODE_STRICT
                if task.seccomp_filter_state.get() == SeccompStateValue::Strict {
                    return Self::seccomp_tsync_error(task.tid, flags);
                }

                // Target threads' filters must be a subsequence of this thread's
                if !other_task_state.seccomp_filters.can_sync_to(&filters) {
                    return Self::seccomp_tsync_error(task.tid, flags);
                }
            }

            // Now that we're sure we're allowed to do so, add the filter to all threads.
            filters.add_filter(new_filter, code.len() as u16)?;

            for task in &tasks {
                let mut other_task_state = task.write();

                other_task_state.enable_no_new_privs();
                other_task_state.seccomp_filters = filters.clone();
                task.set_seccomp_state(SeccompStateValue::UserDefined)?;
            }
        } else {
            // Non-TSYNC: only the current task's filter chain is modified.
            let mut task_state = self.task.write();

            task_state.seccomp_filters.add_filter(new_filter, code.len() as u16)?;
            self.set_seccomp_state(SeccompStateValue::UserDefined)?;
        }

        if let Some(fd) = maybe_fd { Ok(fd.into()) } else { Ok(().into()) }
    }
1263
    /// Evaluates this task's seccomp state against `syscall`.
    ///
    /// A `Some` value is the outcome seccomp imposes on the syscall; `None`
    /// lets normal syscall dispatch continue (exact semantics are determined
    /// by `SeccompState::do_strict` / `do_user_defined`).
    pub fn run_seccomp_filters(
        &mut self,
        locked: &mut Locked<Unlocked>,
        syscall: &Syscall,
    ) -> Option<Result<SyscallResult, Errno>> {
        // Implementation of SECCOMP_FILTER_STRICT, which has slightly different semantics
        // from user-defined seccomp filters.
        if self.seccomp_filter_state.get() == SeccompStateValue::Strict {
            return SeccompState::do_strict(locked, self, syscall);
        }

        // Run user-defined seccomp filters
        let result = self.task.read().seccomp_filters.run_all(self, syscall);

        SeccompState::do_user_defined(locked, result, self, syscall)
    }
1280
1281    fn seccomp_tsync_error(id: i32, flags: u32) -> Result<SyscallResult, Errno> {
1282        // By default, TSYNC indicates failure state by returning the first thread
1283        // id not to be able to sync, rather than by returning -1 and setting
1284        // errno.  However, if TSYNC_ESRCH is set, it returns ESRCH.  This
1285        // prevents conflicts with fact that SECCOMP_FILTER_FLAG_NEW_LISTENER
1286        // makes seccomp return an fd.
1287        if flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH != 0 { error!(ESRCH) } else { Ok(id.into()) }
1288    }
1289
1290    // Notify all futexes in robust list.  The robust list is in user space, so we
1291    // are very careful about walking it, and there are a lot of quiet returns if
1292    // we fail to walk it.
1293    // TODO(https://fxbug.dev/42079081): This only sets the FUTEX_OWNER_DIED bit; it does
1294    // not wake up a waiter.
1295    pub fn notify_robust_list(&self) {
1296        let task_state = self.write();
1297        let robust_list_addr = task_state.robust_list_head.addr();
1298        if robust_list_addr == UserAddress::NULL {
1299            // No one has called set_robust_list.
1300            return;
1301        }
1302        let robust_list_res = self.read_multi_arch_object(task_state.robust_list_head);
1303
1304        let head = if let Ok(head) = robust_list_res {
1305            head
1306        } else {
1307            return;
1308        };
1309
1310        let offset = head.futex_offset;
1311
1312        let mut entries_count = 0;
1313        let mut curr_ptr = head.list.next;
1314        while curr_ptr.addr() != robust_list_addr.into() && entries_count < ROBUST_LIST_LIMIT {
1315            let curr_ref = self.read_multi_arch_object(curr_ptr);
1316
1317            let curr = if let Ok(curr) = curr_ref {
1318                curr
1319            } else {
1320                return;
1321            };
1322
1323            let Some(futex_base) = curr_ptr.addr().checked_add_signed(offset) else {
1324                return;
1325            };
1326
1327            let futex_addr = match FutexAddress::try_from(futex_base) {
1328                Ok(addr) => addr,
1329                Err(_) => {
1330                    return;
1331                }
1332            };
1333
1334            let Ok(mm) = self.mm() else {
1335                log_error!("Asked to notify robust list futexes in system task.");
1336                return;
1337            };
1338            let futex = if let Ok(futex) = mm.atomic_load_u32_relaxed(futex_addr) {
1339                futex
1340            } else {
1341                return;
1342            };
1343
1344            if (futex & FUTEX_TID_MASK) as i32 == self.tid {
1345                let owner_died = FUTEX_OWNER_DIED | futex;
1346                if mm.atomic_store_u32_relaxed(futex_addr, owner_died).is_err() {
1347                    return;
1348                }
1349            }
1350            curr_ptr = curr.next;
1351            entries_count += 1;
1352        }
1353    }
1354
    /// Returns a handle to this thread's seccomp notifier, if one is set.
    pub fn get_seccomp_notifier(&mut self) -> Option<SeccompNotifierHandle> {
        self.task.write().seccomp_filters.notifier.clone()
    }
1359
    /// Replaces this thread's seccomp notifier (pass `None` to clear it).
    pub fn set_seccomp_notifier(&mut self, notifier: Option<SeccompNotifierHandle>) {
        self.task.write().seccomp_filters.notifier = notifier;
    }
1363
    /// Processes a Zircon exception associated with this task.
    ///
    /// Translates the exception into either a POSIX signal to deliver
    /// (`ExceptionResult::Signal`) or `ExceptionResult::Handled` when no
    /// further action is needed. Page faults are forwarded to the task's
    /// memory manager.
    pub fn process_exception(
        &self,
        locked: &mut Locked<Unlocked>,
        report: &zx::ExceptionReport,
    ) -> ExceptionResult {
        match report.ty {
            zx::ExceptionType::General => match get_signal_for_general_exception(&report.arch) {
                Some(sig) => ExceptionResult::Signal(SignalInfo::kernel(sig)),
                None => {
                    log_error!("Unrecognized general exception: {:?}", report);
                    ExceptionResult::Signal(SignalInfo::kernel(SIGILL))
                }
            },
            zx::ExceptionType::FatalPageFault { status } => {
                let report = decode_page_fault_exception_report(&report.arch);
                if let Ok(mm) = self.mm() {
                    mm.handle_page_fault(locked, report, status)
                } else {
                    // Only tasks without an address space (the system task)
                    // reach this branch; that should never fault.
                    panic!(
                        "system task is handling a major page fault status={:?}, report={:?}",
                        status, report
                    );
                }
            }
            zx::ExceptionType::UndefinedInstruction => {
                ExceptionResult::Signal(SignalInfo::kernel(SIGILL))
            }
            zx::ExceptionType::UnalignedAccess => {
                ExceptionResult::Signal(SignalInfo::kernel(SIGBUS))
            }
            zx::ExceptionType::SoftwareBreakpoint | zx::ExceptionType::HardwareBreakpoint => {
                ExceptionResult::Signal(SignalInfo::kernel(SIGTRAP))
            }
            zx::ExceptionType::ProcessNameChanged => {
                log_error!("Received unexpected process name changed exception");
                ExceptionResult::Handled
            }
            zx::ExceptionType::ProcessStarting
            | zx::ExceptionType::ThreadStarting
            | zx::ExceptionType::ThreadExiting => {
                log_error!("Received unexpected task lifecycle exception");
                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
            }
            zx::ExceptionType::PolicyError(policy_code) => {
                log_error!(policy_code:?; "Received Zircon policy error exception");
                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
            }
            zx::ExceptionType::UnknownUserGenerated { code, data } => {
                log_error!(code:?, data:?; "Received unexpected unknown user generated exception");
                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
            }
            zx::ExceptionType::Unknown { ty, code, data } => {
                log_error!(ty:?, code:?, data:?; "Received unexpected exception");
                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
            }
        }
    }
1422
1423    /// Clone this task.
1424    ///
1425    /// Creates a new task object that shares some state with this task
1426    /// according to the given flags.
1427    ///
1428    /// Used by the clone() syscall to create both processes and threads.
1429    ///
1430    /// The exit signal is broken out from the flags parameter like clone3() rather than being
1431    /// bitwise-ORed like clone().
1432    pub fn clone_task<L>(
1433        &self,
1434        locked: &mut Locked<L>,
1435        flags: u64,
1436        child_exit_signal: Option<Signal>,
1437        user_parent_tid: UserRef<pid_t>,
1438        user_child_tid: UserRef<pid_t>,
1439        user_pidfd: UserRef<FdNumber>,
1440    ) -> Result<TaskBuilder, Errno>
1441    where
1442        L: LockBefore<MmDumpable>,
1443        L: LockBefore<TaskRelease>,
1444        L: LockBefore<ProcessGroupState>,
1445    {
1446        const IMPLEMENTED_FLAGS: u64 = (CLONE_VM
1447            | CLONE_FS
1448            | CLONE_FILES
1449            | CLONE_SIGHAND
1450            | CLONE_THREAD
1451            | CLONE_SYSVSEM
1452            | CLONE_SETTLS
1453            | CLONE_PARENT
1454            | CLONE_PARENT_SETTID
1455            | CLONE_PIDFD
1456            | CLONE_CHILD_CLEARTID
1457            | CLONE_CHILD_SETTID
1458            | CLONE_VFORK
1459            | CLONE_NEWUTS
1460            | CLONE_PTRACE) as u64;
1461
1462        // A mask with all valid flags set, because we want to return a different error code for an
1463        // invalid flag vs an unimplemented flag. Subtracting 1 from the largest valid flag gives a
1464        // mask with all flags below it set. Shift up by one to make sure the largest flag is also
1465        // set.
1466        const VALID_FLAGS: u64 = (CLONE_INTO_CGROUP << 1) - 1;
1467
1468        // CLONE_SETTLS is implemented by sys_clone.
1469
1470        let clone_files = flags & (CLONE_FILES as u64) != 0;
1471        let clone_fs = flags & (CLONE_FS as u64) != 0;
1472        let clone_parent = flags & (CLONE_PARENT as u64) != 0;
1473        let clone_parent_settid = flags & (CLONE_PARENT_SETTID as u64) != 0;
1474        let clone_pidfd = flags & (CLONE_PIDFD as u64) != 0;
1475        let clone_child_cleartid = flags & (CLONE_CHILD_CLEARTID as u64) != 0;
1476        let clone_child_settid = flags & (CLONE_CHILD_SETTID as u64) != 0;
1477        let clone_sysvsem = flags & (CLONE_SYSVSEM as u64) != 0;
1478        let clone_ptrace = flags & (CLONE_PTRACE as u64) != 0;
1479        let clone_thread = flags & (CLONE_THREAD as u64) != 0;
1480        let clone_vm = flags & (CLONE_VM as u64) != 0;
1481        let clone_sighand = flags & (CLONE_SIGHAND as u64) != 0;
1482        let clone_vfork = flags & (CLONE_VFORK as u64) != 0;
1483        let clone_newuts = flags & (CLONE_NEWUTS as u64) != 0;
1484        let clone_into_cgroup = flags & CLONE_INTO_CGROUP != 0;
1485
1486        if clone_ptrace {
1487            track_stub!(TODO("https://fxbug.dev/322874630"), "CLONE_PTRACE");
1488        }
1489
1490        if clone_sysvsem {
1491            track_stub!(TODO("https://fxbug.dev/322875185"), "CLONE_SYSVSEM");
1492        }
1493
1494        if clone_into_cgroup {
1495            track_stub!(TODO("https://fxbug.dev/403612570"), "CLONE_INTO_CGROUP");
1496        }
1497
1498        if clone_sighand && !clone_vm {
1499            return error!(EINVAL);
1500        }
1501        if clone_thread && !clone_sighand {
1502            return error!(EINVAL);
1503        }
1504
1505        if clone_pidfd && clone_thread {
1506            return error!(EINVAL);
1507        }
1508        if clone_pidfd && clone_parent_settid && user_parent_tid.addr() == user_pidfd.addr() {
1509            // `clone()` uses the same out-argument for these, so error out if they have the same
1510            // user address.
1511            return error!(EINVAL);
1512        }
1513
1514        if flags & !VALID_FLAGS != 0 {
1515            return error!(EINVAL);
1516        }
1517
1518        if clone_vm && !clone_thread {
1519            // TODO(https://fxbug.dev/42066087) Implement CLONE_VM for child processes (not just child
1520            // threads). Currently this executes CLONE_VM (explicitly passed to clone() or as
1521            // used by vfork()) as a fork (the VM in the child is copy-on-write) which is almost
1522            // always OK.
1523            //
1524            // CLONE_VM is primarily as an optimization to avoid making a copy-on-write version of a
1525            // process' VM that will be immediately replaced with a call to exec(). The main users
1526            // (libc and language runtimes) don't actually rely on the memory being shared between
1527            // the two processes. And the vfork() man page explicitly allows vfork() to be
1528            // implemented as fork() which is what we do here.
1529            if !clone_vfork {
1530                track_stub!(
1531                    TODO("https://fxbug.dev/322875227"),
1532                    "CLONE_VM without CLONE_THREAD or CLONE_VFORK"
1533                );
1534            }
1535        } else if clone_thread && !clone_vm {
1536            track_stub!(TODO("https://fxbug.dev/322875167"), "CLONE_THREAD without CLONE_VM");
1537            return error!(ENOSYS);
1538        }
1539
1540        if flags & !IMPLEMENTED_FLAGS != 0 {
1541            track_stub!(
1542                TODO("https://fxbug.dev/322875130"),
1543                "clone unknown flags",
1544                flags & !IMPLEMENTED_FLAGS
1545            );
1546            return error!(ENOSYS);
1547        }
1548
1549        let fs = if clone_fs { self.fs() } else { self.fs().fork() };
1550        let files = if clone_files { self.live().files.clone() } else { self.live().files.fork() };
1551
1552        let kernel = self.kernel();
1553
1554        let mut pids = kernel.pids.write();
1555
1556        // Lock the cgroup process hierarchy so that the parent process cannot move to a different
1557        // cgroup while a new task or thread_group is created. This may be unnecessary if
1558        // CLONE_INTO_CGROUP is implemented and passed in.
1559        let mut cgroup2_pid_table = kernel.cgroups.lock_cgroup2_pid_table();
1560        // Create a `KernelSignal::Freeze` to put onto the new task, if the cgroup is frozen.
1561        let child_kernel_signals = cgroup2_pid_table
1562            .maybe_create_freeze_signal(self.thread_group())
1563            .into_iter()
1564            .collect::<VecDeque<_>>();
1565
1566        let pid;
1567        let command;
1568        let creds;
1569        let scheduler_state;
1570        let no_new_privs;
1571        let seccomp_filters;
1572        let robust_list_head = RobustListHeadPtr::null(self);
1573        let child_signal_mask;
1574        let timerslack_ns;
1575        let uts_ns;
1576
1577        let TaskInfo { thread, thread_group, memory_manager } = {
1578            // These variables hold the original parent in case we need to switch the parent of the
1579            // new task because of CLONE_PARENT.
1580            let weak_original_parent;
1581            let original_parent;
1582
1583            // Make sure to drop these locks ASAP to avoid inversion
1584            let thread_group_state = {
1585                let thread_group_state = self.thread_group().write();
1586                if clone_parent {
1587                    // With the CLONE_PARENT flag, the parent of the new task is our parent
1588                    // instead of ourselves.
1589                    weak_original_parent =
1590                        thread_group_state.parent.clone().ok_or_else(|| errno!(EINVAL))?;
1591                    std::mem::drop(thread_group_state);
1592                    original_parent = weak_original_parent.upgrade();
1593                    original_parent.write()
1594                } else {
1595                    thread_group_state
1596                }
1597            };
1598
1599            let state = self.read();
1600
1601            no_new_privs = state.no_new_privs();
1602            seccomp_filters = state.seccomp_filters.clone();
1603            child_signal_mask = state.signal_mask();
1604
1605            pid = pids.allocate_pid();
1606            command = self.command();
1607            creds = self.current_creds().clone();
1608            scheduler_state = state.scheduler_state.fork();
1609            timerslack_ns = state.timerslack_ns;
1610
1611            uts_ns = if clone_newuts {
1612                security::check_task_capable(self, CAP_SYS_ADMIN)?;
1613                state.uts_ns.read().fork()
1614            } else {
1615                state.uts_ns.clone()
1616            };
1617
1618            if clone_thread {
1619                TaskInfo {
1620                    thread: None,
1621                    thread_group: self.thread_group().clone(),
1622                    memory_manager: self.mm().ok(),
1623                }
1624            } else {
1625                // Drop the lock on this task before entering `create_zircon_process`, because it will
1626                // take a lock on the new thread group, and locks on thread groups have a higher
1627                // priority than locks on the task in the thread group.
1628                std::mem::drop(state);
1629                let signal_actions = if clone_sighand {
1630                    self.thread_group().signal_actions.clone()
1631                } else {
1632                    self.thread_group().signal_actions.fork()
1633                };
1634                let process_group = thread_group_state.process_group.clone();
1635
1636                let task_info = create_zircon_process(
1637                    locked,
1638                    kernel,
1639                    Some(thread_group_state),
1640                    pid,
1641                    child_exit_signal,
1642                    process_group,
1643                    signal_actions,
1644                    command.clone(),
1645                    self.thread_state.arch_width(),
1646                )?;
1647
1648                cgroup2_pid_table.inherit_cgroup(self.thread_group(), &task_info.thread_group);
1649
1650                task_info
1651            }
1652        };
1653
1654        // Drop the lock on the cgroup pid_table before creating the TaskBuilder.
1655        // If the TaskBuilder creation fails, the TaskBuilder is dropped, which calls
1656        // ThreadGroup::remove. ThreadGroup::remove takes the cgroup pid_table lock, causing
1657        // a cyclic lock dependency.
1658        std::mem::drop(cgroup2_pid_table);
1659
1660        // Only create the vfork event when the caller requested CLONE_VFORK.
1661        let vfork_event = if clone_vfork { Some(Arc::new(zx::Event::create())) } else { None };
1662
1663        let live = self.live();
1664        let mut child = TaskBuilder::new(Task::new(
1665            pid,
1666            command,
1667            thread_group,
1668            thread,
1669            files,
1670            memory_manager,
1671            fs,
1672            creds,
1673            live.abstract_socket_namespace.clone(),
1674            live.abstract_vsock_namespace.clone(),
1675            child_signal_mask,
1676            child_kernel_signals,
1677            vfork_event,
1678            scheduler_state,
1679            uts_ns,
1680            no_new_privs,
1681            SeccompState::from(&self.seccomp_filter_state),
1682            seccomp_filters,
1683            robust_list_head,
1684            timerslack_ns,
1685        ));
1686
1687        release_on_error!(child, locked, {
1688            let child_task = TempRef::from(&child.task);
1689            // Drop the pids lock as soon as possible after creating the child. Destroying the child
1690            // and removing it from the pids table itself requires the pids lock, so if an early exit
1691            // takes place we have a self deadlock.
1692            pids.add_task(&child_task);
1693            std::mem::drop(pids);
1694
1695            // Child lock must be taken before this lock. Drop the lock on the task, take a writable
1696            // lock on the child and take the current state back.
1697
1698            #[cfg(any(test, debug_assertions))]
1699            {
1700                // Take the lock on the thread group and its child in the correct order to ensure any wrong ordering
1701                // will trigger the tracing-mutex at the right call site.
1702                if !clone_thread {
1703                    let _l1 = self.thread_group().read();
1704                    let _l2 = child.thread_group().read();
1705                }
1706            }
1707
1708            if clone_thread {
1709                self.thread_group().add(&child_task)?;
1710            } else {
1711                child.thread_group().add(&child_task)?;
1712
1713                // These manipulations of the signal handling state appear to be related to
1714                // CLONE_SIGHAND and CLONE_VM rather than CLONE_THREAD. However, we do not support
1715                // all the combinations of these flags, which means doing these operations here
1716                // might actually be correct. However, if you find a test that fails because of the
1717                // placement of this logic here, we might need to move it.
1718                let mut child_state = child.write();
1719                let state = self.read();
1720                child_state.set_sigaltstack(state.sigaltstack());
1721                child_state.set_signal_mask(state.signal_mask());
1722            }
1723
1724            if !clone_vm {
1725                // We do not support running threads in the same process with different
1726                // MemoryManagers.
1727                assert!(!clone_thread);
1728                self.mm()?.snapshot_to(locked, &child.mm()?)?;
1729            }
1730
1731            if clone_parent_settid {
1732                self.write_object(user_parent_tid, &child.tid)?;
1733            }
1734
1735            if clone_child_cleartid {
1736                child.write().clear_child_tid = user_child_tid;
1737            }
1738
1739            if clone_child_settid {
1740                child.write_object(user_child_tid, &child.tid)?;
1741            }
1742
1743            if clone_pidfd {
1744                let locked = locked.cast_locked::<TaskRelease>();
1745                let file = new_pidfd(
1746                    locked,
1747                    self,
1748                    child.thread_group(),
1749                    &*child.mm()?,
1750                    OpenFlags::empty(),
1751                );
1752                let pidfd = self.add_file(locked, file, FdFlags::CLOEXEC)?;
1753                self.write_object(user_pidfd, &pidfd)?;
1754            }
1755
1756            // TODO(https://fxbug.dev/42066087): We do not support running different processes with
1757            // the same MemoryManager. Instead, we implement a rough approximation of that behavior
1758            // by making a copy-on-write clone of the memory from the original process.
1759            if clone_vm && !clone_thread {
1760                self.mm()?.snapshot_to(locked, &child.mm()?)?;
1761            }
1762
1763            child.thread_state = self.thread_state.snapshot::<HeapRegs>();
1764            Ok(())
1765        });
1766
1767        // Take the lock on thread group and task in the correct order to ensure any wrong ordering
1768        // will trigger the tracing-mutex at the right call site.
1769        #[cfg(any(test, debug_assertions))]
1770        {
1771            let _l1 = child.thread_group().read();
1772            let _l2 = child.read();
1773        }
1774
1775        Ok(child)
1776    }
1777
1778    /// Sets the stop state (per set_stopped), and also notifies all listeners,
1779    /// including the parent process and the tracer if appropriate.
1780    pub fn set_stopped_and_notify(&self, stopped: StopState, siginfo: Option<SignalInfo>) {
1781        let maybe_signal_info = {
1782            let mut state = self.write();
1783            state.copy_state_from(self);
1784            state.set_stopped(stopped, siginfo, Some(self), None);
1785            state.prepare_signal_info(stopped)
1786        };
1787
1788        if let Some((tracer, signal_info)) = maybe_signal_info {
1789            if let Some(tracer) = tracer.upgrade() {
1790                tracer.write().send_signal(signal_info);
1791            }
1792        }
1793
1794        if !stopped.is_in_progress() {
1795            let parent = self.thread_group().read().parent.clone();
1796            if let Some(parent) = parent {
1797                parent
1798                    .upgrade()
1799                    .write()
1800                    .lifecycle_waiters
1801                    .notify_value(ThreadGroupLifecycleWaitValue::ChildStatus);
1802            }
1803        }
1804    }
1805
    /// If the task is stopping, set it as stopped. return whether the caller
    /// should stop.  The task might also be waking up.
    pub fn finalize_stop_state(&mut self) -> bool {
        let stopped = self.load_stopped();

        // Neither stopping nor stopped: the task may be waking up.
        if !stopped.is_stopping_or_stopped() {
            // If we are waking up, potentially write back state a tracer may have modified.
            let captured_state = self.write().take_captured_state();
            if let Some(captured) = captured_state {
                if captured.dirty {
                    // Only restore registers when the tracer actually changed them.
                    self.thread_state.replace_registers(&captured.thread_state);
                }
            }
        }

        // Stopping because the thread group is stopping.
        // Try to flip to GroupStopped - will fail if we shouldn't.
        if self.thread_group().set_stopped(StopState::GroupStopped, None, true)
            == StopState::GroupStopped
        {
            // The group stop took effect; mirror it on this task, carrying the
            // group's last signal and a ptrace Stop event for any tracer.
            let signal = self.thread_group().read().last_signal.clone();
            // stopping because the thread group has stopped
            let event = Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0));
            self.write().set_stopped(StopState::GroupStopped, signal, Some(self), event);
            return true;
        }

        // Stopping because the task is stopping
        if stopped.is_stopping_or_stopped() {
            // Upgrade an in-progress stop to its final stopped state and
            // notify listeners (parent / tracer) if the transition succeeds.
            if let Ok(stopped) = stopped.finalize() {
                self.set_stopped_and_notify(stopped, None);
            }
            return true;
        }

        false
    }
1843
    /// Block the execution of `current_task` as long as the task is stopped and
    /// not terminated.
    pub fn block_while_stopped(&mut self, locked: &mut Locked<Unlocked>) {
        // Upgrade the state from stopping to stopped if needed. Return if the task
        // should not be stopped.
        if !self.finalize_stop_state() {
            return;
        }

        // The waiter ignores signals: a stopped task should only be woken by
        // changes to its stop state, not by ordinary signal delivery.
        let waiter = Waiter::with_options(WaiterOptions::IGNORE_SIGNALS);
        loop {
            // If we've exited, unstop the threads and return without notifying
            // waiters.
            if self.is_exitted() {
                self.thread_group().set_stopped(StopState::ForceAwake, None, false);
                self.write().set_stopped(StopState::ForceAwake, None, Some(self), None);
                return;
            }

            // Stop looping once waiting is no longer required.
            if self.wake_or_wait_until_unstopped_async(&waiter) {
                return;
            }

            // Do the wait. Result is not needed, as this is not in a syscall.
            let _: Result<(), Errno> = waiter.wait(locked, self);

            // Maybe go from stopping to stopped, if we are currently stopping
            // again.
            self.finalize_stop_state();
        }
    }
1875
1876    /// For traced tasks, this will return the data neceessary for a cloned task
1877    /// to attach to the same tracer.
1878    pub fn get_ptrace_core_state_for_clone(
1879        &mut self,
1880        clone_args: &clone_args,
1881    ) -> (PtraceOptions, Option<PtraceCoreState>) {
1882        let state = self.write();
1883        if let Some(ptrace) = &state.ptrace {
1884            ptrace.get_core_state_for_clone(clone_args)
1885        } else {
1886            (PtraceOptions::empty(), None)
1887        }
1888    }
1889
    /// If currently being ptraced with the given option, emit the appropriate
    /// event.  PTRACE_EVENTMSG will return the given message.  Also emits the
    /// appropriate event for execve in the absence of TRACEEXEC.
    ///
    /// Note that the Linux kernel has a documented bug where, if TRACEEXIT is
    /// enabled, SIGKILL will trigger an event.  We do not exhibit this
    /// behavior.
    pub fn ptrace_event(
        &mut self,
        locked: &mut Locked<Unlocked>,
        trace_kind: PtraceOptions,
        msg: u64,
    ) {
        if !trace_kind.is_empty() {
            // Scope the task write lock so it is released before blocking below.
            {
                let mut state = self.write();
                if let Some(ptrace) = &mut state.ptrace {
                    if !ptrace.has_option(trace_kind) {
                        // If this would be a TRACEEXEC, but TRACEEXEC is not
                        // turned on, then send a SIGTRAP.
                        if trace_kind == PtraceOptions::TRACEEXEC && !ptrace.is_seized() {
                            // Send a SIGTRAP so that the parent can gain control.
                            send_signal_first(locked, self, state, SignalInfo::kernel(SIGTRAP));
                        }

                        return;
                    }
                    // Encode the ptrace event alongside SIGTRAP in the signal
                    // code: (event << 8) | SIGTRAP.
                    let ptrace_event = PtraceEvent::from_option(&trace_kind) as u32;
                    let siginfo = SignalInfo::with_detail(
                        SIGTRAP,
                        ((ptrace_event << 8) | SIGTRAP.number()) as i32,
                        SignalDetail::None,
                    );
                    // Enter a ptrace-event stop carrying the event data; `msg`
                    // is what PTRACE_EVENTMSG will report (see doc comment).
                    state.set_stopped(
                        StopState::PtraceEventStopping,
                        Some(siginfo),
                        None,
                        Some(PtraceEventData::new(trace_kind, msg)),
                    );
                } else {
                    // Not traced: nothing to emit.
                    return;
                }
            }
            // Remain blocked for as long as the event stop is in effect.
            self.block_while_stopped(locked);
        }
    }
1936
1937    /// Causes the current thread's thread group to exit, notifying any ptracer
1938    /// of this task first.
1939    pub fn thread_group_exit(&mut self, locked: &mut Locked<Unlocked>, exit_status: ExitStatus) {
1940        self.ptrace_event(
1941            locked,
1942            PtraceOptions::TRACEEXIT,
1943            exit_status.signal_info_status() as u64,
1944        );
1945        self.thread_group().exit(locked, exit_status, None);
1946    }
1947
1948    /// The flags indicates only the flags as in clone3(), and does not use the low 8 bits for the
1949    /// exit signal as in clone().
1950    pub fn clone_task_for_test<L>(
1951        &self,
1952        locked: &mut Locked<L>,
1953        flags: u64,
1954        exit_signal: Option<Signal>,
1955    ) -> crate::testing::AutoReleasableTask
1956    where
1957        L: LockBefore<MmDumpable>,
1958        L: LockBefore<TaskRelease>,
1959        L: LockBefore<ProcessGroupState>,
1960    {
1961        let result = self
1962            .clone_task(
1963                locked,
1964                flags,
1965                exit_signal,
1966                UserRef::default(),
1967                UserRef::default(),
1968                UserRef::default(),
1969            )
1970            .expect("failed to create task in test");
1971
1972        result.into()
1973    }
1974
1975    // See "Ptrace access mode checking" in https://man7.org/linux/man-pages/man2/ptrace.2.html
1976    pub fn check_ptrace_access_mode<L>(
1977        &self,
1978        locked: &mut Locked<L>,
1979        mode: PtraceAccessMode,
1980        target: &Task,
1981    ) -> Result<(), Errno>
1982    where
1983        L: LockBefore<MmDumpable>,
1984    {
1985        // (1)  If the calling thread and the target thread are in the same
1986        //      thread group, access is always allowed.
1987        if self.thread_group().leader == target.thread_group().leader {
1988            return Ok(());
1989        }
1990
1991        // (2)  If the access mode specifies PTRACE_MODE_FSCREDS, then, for
1992        //      the check in the next step, employ the caller's filesystem
1993        //      UID and GID.  (As noted in credentials(7), the filesystem
1994        //      UID and GID almost always have the same values as the
1995        //      corresponding effective IDs.)
1996        //
1997        //      Otherwise, the access mode specifies PTRACE_MODE_REALCREDS,
1998        //      so use the caller's real UID and GID for the checks in the
1999        //      next step.  (Most APIs that check the caller's UID and GID
2000        //      use the effective IDs.  For historical reasons, the
2001        //      PTRACE_MODE_REALCREDS check uses the real IDs instead.)
2002        let (uid, gid) = if mode.contains(PTRACE_MODE_FSCREDS) {
2003            let fscred = self.current_creds().as_fscred();
2004            (fscred.uid, fscred.gid)
2005        } else if mode.contains(PTRACE_MODE_REALCREDS) {
2006            let creds = self.current_creds();
2007            (creds.uid, creds.gid)
2008        } else {
2009            unreachable!();
2010        };
2011
2012        // (3)  Deny access if neither of the following is true:
2013        //
2014        //      -  The real, effective, and saved-set user IDs of the target
2015        //         match the caller's user ID, and the real, effective, and
2016        //         saved-set group IDs of the target match the caller's
2017        //         group ID.
2018        //
2019        //      -  The caller has the CAP_SYS_PTRACE capability in the user
2020        //         namespace of the target.
2021        let target_creds = target.real_creds();
2022        if !(target_creds.uid == uid
2023            && target_creds.euid == uid
2024            && target_creds.saved_uid == uid
2025            && target_creds.gid == gid
2026            && target_creds.egid == gid
2027            && target_creds.saved_gid == gid)
2028        {
2029            security::check_task_capable(self, CAP_SYS_PTRACE)?;
2030        }
2031
2032        // (4)  Deny access if the target process "dumpable" attribute has a
2033        //      value other than 1 (SUID_DUMP_USER; see the discussion of
2034        //      PR_SET_DUMPABLE in prctl(2)), and the caller does not have
2035        //      the CAP_SYS_PTRACE capability in the user namespace of the
2036        //      target process.
2037        let dumpable = *target.mm()?.dumpable.lock(locked);
2038        match dumpable {
2039            DumpPolicy::User => (),
2040            DumpPolicy::Disable => security::check_task_capable(self, CAP_SYS_PTRACE)?,
2041        }
2042
2043        // (5)  The kernel LSM security_ptrace_access_check() interface is
2044        //      invoked to see if ptrace access is permitted.
2045        security::ptrace_access_check(self, target, mode)?;
2046
2047        // (6)  If access has not been denied by any of the preceding steps,
2048        //      then access is allowed.
2049        Ok(())
2050    }
2051
2052    pub fn can_signal(
2053        &self,
2054        target: &Task,
2055        unchecked_signal: UncheckedSignal,
2056    ) -> Result<(), Errno> {
2057        // If both the tasks share a thread group the signal can be sent. This is not documented
2058        // in kill(2) because kill does not support task-level granularity in signal sending.
2059        if self.thread_group == target.thread_group {
2060            return Ok(());
2061        }
2062
2063        let self_creds = self.current_creds();
2064        let target_creds = target.real_creds();
2065        // From https://man7.org/linux/man-pages/man2/kill.2.html:
2066        //
2067        // > For a process to have permission to send a signal, it must either be
2068        // > privileged (under Linux: have the CAP_KILL capability in the user
2069        // > namespace of the target process), or the real or effective user ID of
2070        // > the sending process must equal the real or saved set- user-ID of the
2071        // > target process.
2072        //
2073        // Returns true if the credentials are considered to have the same user ID.
2074        if self_creds.euid == target_creds.saved_uid
2075            || self_creds.euid == target_creds.uid
2076            || self_creds.uid == target_creds.uid
2077            || self_creds.uid == target_creds.saved_uid
2078        {
2079            return Ok(());
2080        }
2081
2082        if Signal::try_from(unchecked_signal) == Ok(SIGCONT) {
2083            let target_session = target.thread_group().read().process_group.session.leader;
2084            let self_session = self.thread_group().read().process_group.session.leader;
2085            if target_session == self_session {
2086                return Ok(());
2087            }
2088        }
2089
2090        security::check_task_capable(self, CAP_KILL)
2091    }
2092}
2093
impl ArchSpecific for CurrentTask {
    fn is_arch32(&self) -> bool {
        // Delegate to the thread state, which carries the task's architecture.
        self.thread_state.is_arch32()
    }
}
2099
// All memory operations delegate to the task's memory manager's `unified_*`
// accessors, passing `self` as the current task. Each method propagates an
// error from `mm()` when the task has no memory manager.
impl MemoryAccessor for CurrentTask {
    fn read_memory<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.mm()?.unified_read_memory(self, addr, bytes)
    }

    fn read_memory_partial_until_null_byte<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.mm()?.unified_read_memory_partial_until_null_byte(self, addr, bytes)
    }

    fn read_memory_partial<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.mm()?.unified_read_memory_partial(self, addr, bytes)
    }

    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
        self.mm()?.unified_write_memory(self, addr, bytes)
    }

    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
        self.mm()?.unified_write_memory_partial(self, addr, bytes)
    }

    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
        self.mm()?.unified_zero(self, addr, length)
    }
}
2137
impl TaskMemoryAccessor for CurrentTask {
    fn maximum_valid_address(&self) -> Option<UserAddress> {
        // `None` when the task has no memory manager.
        self.mm().ok().map(|mm| mm.maximum_valid_user_address)
    }
}
2143
/// The outcome of handling an exception.
pub enum ExceptionResult {
    /// The exception was handled and no further action is required.
    Handled,

    /// The exception generated a signal that should be delivered.
    Signal(SignalInfo),
}
2151
#[cfg(test)]
mod tests {
    use crate::testing::spawn_kernel_and_run;
    use starnix_uapi::auth::Credentials;

    // Runs `override_creds` and checks that it doesn't crash. This ensures
    // that the delegation to `override_creds_async` is correct.
    #[::fuchsia::test]
    async fn test_override_creds_can_delegate_to_async_version() {
        spawn_kernel_and_run(async move |_, current_task| {
            let result = current_task.override_creds(Credentials::root(), || 0);
            assert_eq!(result, 0);
        })
        .await;
    }
}