// starnix_core/task/current_task.rs

1// Copyright 2023 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::arch::task::{decode_page_fault_exception_report, get_signal_for_general_exception};
6use crate::execution::{TaskInfo, create_zircon_process};
7use crate::mm::{DumpPolicy, MemoryAccessor, MemoryAccessorExt, TaskMemoryAccessor};
8use crate::security;
9use crate::signals::{RunState, SignalInfo, send_signal_first, send_standard_signal};
10use crate::task::loader::{ResolvedElf, load_executable, resolve_executable};
11use crate::task::{
12    ExitStatus, PtraceCoreState, PtraceEvent, PtraceEventData, PtraceOptions, RobustListHeadPtr,
13    SeccompFilter, SeccompFilterContainer, SeccompNotifierHandle, SeccompState, SeccompStateValue,
14    StopState, Task, TaskFlags, Waiter,
15};
16use crate::vfs::{
17    CheckAccessReason, FdFlags, FdNumber, FileHandle, FsStr, LookupContext, MAX_SYMLINK_FOLLOWS,
18    NamespaceNode, ResolveBase, SymlinkMode, SymlinkTarget, new_pidfd,
19};
20use extended_pstate::ExtendedPstateState;
21use futures::FutureExt;
22use linux_uapi::CLONE_PIDFD;
23use starnix_logging::{log_error, log_warn, track_file_not_found, track_stub};
24use starnix_registers::RegisterState;
25use starnix_stack::clean_stack;
26use starnix_sync::{
27    EventWaitGuard, FileOpsCore, LockBefore, LockEqualOrBefore, Locked, MmDumpable,
28    ProcessGroupState, TaskRelease, Unlocked, WakeReason,
29};
30use starnix_syscalls::SyscallResult;
31use starnix_syscalls::decls::Syscall;
32use starnix_task_command::TaskCommand;
33use starnix_types::arch::ArchWidth;
34use starnix_types::futex_address::FutexAddress;
35use starnix_types::ownership::{OwnedRef, Releasable, TempRef, WeakRef, release_on_error};
36use starnix_uapi::auth::{
37    CAP_KILL, CAP_SYS_ADMIN, CAP_SYS_PTRACE, Credentials, FsCred, PTRACE_MODE_FSCREDS,
38    PTRACE_MODE_REALCREDS, PtraceAccessMode, UserAndOrGroupId,
39};
40use starnix_uapi::device_type::DeviceType;
41use starnix_uapi::errors::{Errno, ErrnoCode};
42use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
43use starnix_uapi::open_flags::OpenFlags;
44use starnix_uapi::signals::{
45    SIGBUS, SIGCHLD, SIGCONT, SIGILL, SIGKILL, SIGSEGV, SIGSYS, SIGTRAP, SigSet, Signal,
46    UncheckedSignal,
47};
48use starnix_uapi::user_address::{ArchSpecific, UserAddress, UserRef};
49use starnix_uapi::vfs::ResolveFlags;
50use starnix_uapi::{
51    CLONE_CHILD_CLEARTID, CLONE_CHILD_SETTID, CLONE_FILES, CLONE_FS, CLONE_INTO_CGROUP,
52    CLONE_NEWUTS, CLONE_PARENT, CLONE_PARENT_SETTID, CLONE_PTRACE, CLONE_SETTLS, CLONE_SIGHAND,
53    CLONE_SYSVSEM, CLONE_THREAD, CLONE_VFORK, CLONE_VM, FUTEX_OWNER_DIED, FUTEX_TID_MASK,
54    ROBUST_LIST_LIMIT, SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_NEW_LISTENER,
55    SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_TSYNC_ESRCH, SI_KERNEL, clone_args, errno,
56    error, from_status_like_fdio, pid_t, sock_filter, ucred,
57};
58use std::cell::RefCell;
59use std::collections::VecDeque;
60use std::ffi::CString;
61use std::fmt;
62use std::marker::PhantomData;
63use std::mem::MaybeUninit;
64use std::sync::Arc;
65use zx::sys::zx_restricted_state_t;
66
67use super::ThreadGroupLifecycleWaitValue;
68
/// A `Task` paired with its thread-local `ThreadState` while it is being set up;
/// converted into a `CurrentTask` via `From<TaskBuilder>` once construction is done.
pub struct TaskBuilder {
    /// The underlying task object.
    pub task: OwnedRef<Task>,

    /// Thread-local execution state (registers, restart info, arch width) for the task.
    pub thread_state: Box<ThreadState>,
}
75
76impl TaskBuilder {
77    pub fn new(task: OwnedRef<Task>) -> Self {
78        Self { task, thread_state: Default::default() }
79    }
80
81    #[inline(always)]
82    pub fn release<L>(self, locked: &mut Locked<L>)
83    where
84        L: LockBefore<TaskRelease>,
85    {
86        let locked = locked.cast_locked::<TaskRelease>();
87        Releasable::release(self, locked);
88    }
89}
90
91impl From<TaskBuilder> for CurrentTask {
92    fn from(builder: TaskBuilder) -> Self {
93        Self::new(builder.task, builder.thread_state)
94    }
95}
96
impl Releasable for TaskBuilder {
    type Context<'a> = &'a mut Locked<TaskRelease>;

    /// Tears the task down: detaches it from its thread group, then releases the
    /// underlying `Task` with the thread state and pid-table write guard.
    fn release<'a>(self, locked: Self::Context<'a>) {
        let kernel = Arc::clone(self.kernel());
        let mut pids = kernel.pids.write();

        // We remove from the thread group here because the WeakRef in the pid
        // table to this task must be valid until this task is removed from the
        // thread group, and the code below will invalidate it.
        // Moreover, this requires a OwnedRef of the task to ensure the tasks of
        // the thread group are always valid.
        self.task.thread_group().remove(locked, &mut pids, &self.task);

        // `Task::release` consumes the thread state, the lock context, and the
        // still-held pid-table write guard.
        let context = (self.thread_state, locked, pids);
        self.task.release(context);
    }
}
115
impl std::ops::Deref for TaskBuilder {
    type Target = Task;
    // Lets a `TaskBuilder` be used anywhere a `&Task` is expected.
    fn deref(&self) -> &Self::Target {
        &self.task
    }
}
122
/// Task permissions are determined from the task's credentials and, if enabled, from its
/// SEStarnix security state.
#[derive(Debug, Clone)]
pub struct FullCredentials {
    /// POSIX-style credentials (uid, gid, capabilities, ...).
    pub creds: Credentials,
    /// Security-module (SEStarnix) state associated with the task.
    pub security_state: security::TaskState,
}
130
131impl FullCredentials {
132    pub fn for_kernel() -> Self {
133        Self { creds: Credentials::root(), security_state: security::task_alloc_for_kernel() }
134    }
135}
136
/// The task object associated with the currently executing thread.
///
/// We often pass the `CurrentTask` as the first argument to functions if those functions need to
/// know contextual information about the thread on which they are running. For example, we often
/// use the `CurrentTask` to perform access checks, which ensures that the caller is authorized to
/// perform the requested operation.
///
/// The `CurrentTask` also has state that can be referenced only on the currently executing thread,
/// such as the register state for that thread. Syscalls are given a mutable reference to the
/// `CurrentTask`, which lets them manipulate this state.
///
/// See also `Task` for more information about tasks.
pub struct CurrentTask {
    /// The underlying task object.
    pub task: OwnedRef<Task>,

    /// Thread-local execution state: registers, extended processor state, restart info.
    pub thread_state: Box<ThreadState>,

    // TODO(https://fxbug.dev/433548348): Avoid interior mutability here by passing a
    // &mut CurrentTask around instead of &CurrentTask.
    /// Temporary subjective credential override installed by `override_creds`; `None`
    /// means the task's real credentials are in effect.
    pub overridden_creds: RefCell<Option<FullCredentials>>,

    /// Makes CurrentTask neither Sync nor Send.
    _local_marker: PhantomData<*mut u8>,
}
162
/// The thread related information of a `CurrentTask`. The information should never be used
/// outside of the thread owning the `CurrentTask`.
#[derive(Default)]
pub struct ThreadState {
    /// A copy of the registers associated with the Zircon thread. Up-to-date values can be read
    /// from `self.handle.read_state_general_regs()`. To write these values back to the thread, call
    /// `self.handle.write_state_general_regs(self.thread_state.registers.into())`.
    pub registers: RegisterState,

    /// Copy of the current extended processor state including floating point and vector registers.
    pub extended_pstate: ExtendedPstateState,

    /// The errno code (if any) that indicated this task should restart a syscall.
    pub restart_code: Option<ErrnoCode>,

    /// A custom function to resume a syscall that has been interrupted by SIGSTOP.
    /// To use, call set_syscall_restart_func and return ERESTART_RESTARTBLOCK. sys_restart_syscall
    /// will eventually call it.
    pub syscall_restart_func: Option<Box<SyscallRestartFunc>>,

    /// An architecture agnostic enum indicating the width (32 or 64 bits) of the execution
    /// environment in use.
    pub arch_width: ArchWidth,
}
187
188impl ThreadState {
189    /// Returns a new `ThreadState` with the same `registers` as this one.
190    fn snapshot(&self) -> Box<Self> {
191        Box::new(Self {
192            registers: self.registers,
193            extended_pstate: Default::default(),
194            restart_code: self.restart_code,
195            syscall_restart_func: None,
196            arch_width: self.arch_width,
197        })
198    }
199
200    pub fn extended_snapshot(&self) -> Self {
201        Self {
202            registers: self.registers.clone(),
203            extended_pstate: self.extended_pstate.clone(),
204            restart_code: self.restart_code,
205            syscall_restart_func: None,
206            arch_width: self.arch_width,
207        }
208    }
209
210    pub fn replace_registers(&mut self, other: &ThreadState) {
211        self.registers = other.registers;
212        self.extended_pstate = other.extended_pstate;
213        self.arch_width = other.arch_width;
214    }
215
216    pub fn get_user_register(&mut self, offset: usize) -> Result<usize, Errno> {
217        let mut result: usize = 0;
218        self.registers.apply_user_register(offset, &mut |register| result = *register as usize)?;
219        Ok(result)
220    }
221
222    pub fn set_user_register(&mut self, offset: usize, value: usize) -> Result<(), Errno> {
223        self.registers.apply_user_register(offset, &mut |register| *register = value as u64)
224    }
225}
226
impl ArchSpecific for ThreadState {
    // A thread is 32-bit iff its recorded `arch_width` says so.
    fn is_arch32(&self) -> bool {
        self.arch_width.is_arch32()
    }
}
232
/// Signature of the callback installed by `set_syscall_restart_func`; stored in
/// `ThreadState::syscall_restart_func` and eventually invoked by `sys_restart_syscall`.
type SyscallRestartFunc = dyn FnOnce(&mut Locked<Unlocked>, &mut CurrentTask) -> Result<SyscallResult, Errno>
    + Send
    + Sync;
236
impl Releasable for CurrentTask {
    type Context<'a> = &'a mut Locked<TaskRelease>;

    /// Final teardown for the thread's task: runs userspace exit protocols, detaches the
    /// task from its thread group, then releases the underlying `Task`.
    fn release<'a>(self, locked: Self::Context<'a>) {
        // Honor the robust-list and child-tid-clear protocols before dismantling the
        // task; a failure to clear the child tid is deliberately ignored.
        self.notify_robust_list();
        let _ignored = self.clear_child_tid_if_needed(locked);

        let kernel = Arc::clone(self.kernel());
        let mut pids = kernel.pids.write();

        // We remove from the thread group here because the WeakRef in the pid
        // table to this task must be valid until this task is removed from the
        // thread group, and the code below will invalidate it.
        // Moreover, this requires a OwnedRef of the task to ensure the tasks of
        // the thread group are always valid.
        self.task.thread_group().remove(locked, &mut pids, &self.task);

        // `Task::release` consumes the thread state, the lock context, and the
        // still-held pid-table write guard.
        let context = (self.thread_state, locked, pids);
        self.task.release(context);
    }
}
258
impl std::ops::Deref for CurrentTask {
    type Target = Task;
    // Lets a `CurrentTask` be used anywhere a `&Task` is expected.
    fn deref(&self) -> &Self::Target {
        &self.task
    }
}
265
impl fmt::Debug for CurrentTask {
    // Delegates debug formatting to the underlying task.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.task.fmt(f)
    }
}
271
272impl CurrentTask {
273    pub fn new(task: OwnedRef<Task>, thread_state: Box<ThreadState>) -> Self {
274        Self {
275            task,
276            thread_state,
277            overridden_creds: RefCell::new(None),
278            _local_marker: Default::default(),
279        }
280    }
281
282    /// Returns the current subjective credentials of the task.
283    ///
284    /// The subjective credentials are the credentials that are used to check permissions for
285    /// actions performed by the task.
286    pub fn current_creds(&self) -> Credentials {
287        match self.overridden_creds.borrow().as_ref() {
288            Some(full_creds) => full_creds.creds.clone(),
289            None => self.real_creds(),
290        }
291    }
292
293    pub fn with_current_creds<B, F>(&self, f: F) -> B
294    where
295        F: FnOnce(&Credentials) -> B,
296    {
297        match self.overridden_creds.borrow().as_ref() {
298            Some(x) => f(&x.creds),
299            None => self.with_real_creds(f),
300        }
301    }
302
303    /// Returns the current subjective credentials of the task, including the security state.
304    pub fn full_current_creds(&self) -> FullCredentials {
305        match self.overridden_creds.borrow().as_ref() {
306            Some(full_creds) => full_creds.clone(),
307            None => FullCredentials {
308                creds: self.real_creds(),
309                security_state: self.security_state.clone(),
310            },
311        }
312    }
313
    /// The subjective credentials expressed as filesystem credentials.
    pub fn current_fscred(&self) -> FsCred {
        self.with_current_creds(|creds| creds.as_fscred())
    }
317
318    pub fn current_ucred(&self) -> ucred {
319        self.with_current_creds(|creds| ucred {
320            pid: self.get_pid(),
321            uid: creds.uid,
322            gid: creds.gid,
323        })
324    }
325
    /// Save the current creds and security state, alter them by calling `alter_creds`, then call
    /// `callback`.
    /// The creds and security state will be restored to their original values at the end of the
    /// call. Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and
    /// used to check permissions for actions performed by the task, is altered. The "objective"
    /// state, accessed through `Task::real_creds()` by other tasks and used to check permissions
    /// for actions performed on the task, is not altered, and changes to the credentials are not
    /// externally visible.
    ///
    /// NOTE(review): if the returned future is dropped before `callback` completes, the
    /// override installed below is never removed from `overridden_creds`.
    pub async fn override_creds_async<R>(
        &self,
        alter_creds: impl FnOnce(&mut FullCredentials),
        callback: impl AsyncFnOnce() -> R,
    ) -> R {
        // Take any existing override so it can be restored afterwards; a nested override
        // starts from the currently-effective (possibly already overridden) state.
        let saved = self.overridden_creds.take();
        let mut new_creds = saved.clone().unwrap_or_else(|| FullCredentials {
            creds: self.real_creds(),
            security_state: self.security_state.clone(),
        });
        alter_creds(&mut new_creds);

        self.overridden_creds.replace(Some(new_creds));

        let result = callback().await;

        // Restore the previously effective override (or `None`).
        self.overridden_creds.replace(saved);

        result
    }
354
355    /// Save the current creds and security state, alter them by calling `alter_creds`, then call
356    /// `callback`.
357    /// The creds and security state will be restored to their original values at the end of the
358    /// call. Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and
359    ///  used to check permissions for actions performed by the task, is altered. The "objective"
360    ///  state, accessed through `Task::real_creds()` by other tasks and used to check permissions
361    /// for actions performed on the task, is not altered, and changes to the credentials are not
362    /// externally visible.
363    pub fn override_creds<R>(
364        &self,
365        alter_creds: impl FnOnce(&mut FullCredentials),
366        callback: impl FnOnce() -> R,
367    ) -> R {
368        self.override_creds_async(alter_creds, async move || callback())
369            .now_or_never()
370            .expect("Future should be ready")
371    }
372
    /// True while a temporary credential override (see `override_creds`) is installed.
    pub fn has_overridden_creds(&self) -> bool {
        self.overridden_creds.borrow().is_some()
    }
376
377    pub fn trigger_delayed_releaser<L>(&self, locked: &mut Locked<L>)
378    where
379        L: LockEqualOrBefore<FileOpsCore>,
380    {
381        let locked = locked.cast_locked::<FileOpsCore>();
382        self.kernel().delayed_releaser.apply(locked, self);
383    }
384
    /// A weak reference to the underlying `Task`.
    pub fn weak_task(&self) -> WeakRef<Task> {
        WeakRef::from(&self.task)
    }
388
    /// A temporary strong reference to the underlying `Task`, borrowed from `self`.
    pub fn temp_task(&self) -> TempRef<'_, Task> {
        TempRef::from(&self.task)
    }
392
    /// Change the current and real creds of the task. This is invalid to call while temporary
    /// credentials are present; the assert below panics in that case.
    pub fn set_creds(&self, creds: Credentials) {
        let overridden_creds = self.overridden_creds.borrow();
        assert!(overridden_creds.is_none());
        #[allow(
            clippy::undocumented_unsafe_blocks,
            reason = "Force documented unsafe blocks in Starnix"
        )]
        unsafe {
            // SAFETY: this is allowed because we are the CurrentTask.
            *self.persistent_info.creds_mut() = creds;
        }
        // The /proc/pid directory's ownership is updated when the task's euid
        // or egid changes. See proc(5).
        let maybe_node = self.proc_pid_directory_cache.lock();
        if let Some(node) = &*maybe_node {
            let creds = self.real_creds().euid_as_fscred();
            // SAFETY: The /proc/pid directory held by `proc_pid_directory_cache` represents the
            // current task. Its owner and group are supposed to track the current task's euid and
            // egid.
            unsafe {
                node.force_chown(creds);
            }
        }
    }
419
420    #[inline(always)]
421    pub fn release<L>(self, locked: &mut Locked<L>)
422    where
423        L: LockBefore<TaskRelease>,
424    {
425        let locked = locked.cast_locked::<TaskRelease>();
426        Releasable::release(self, locked);
427    }
428
    /// Installs the callback that `sys_restart_syscall` will invoke to resume an
    /// interrupted syscall (stored in `ThreadState::syscall_restart_func`). The
    /// callback's result is converted into a `SyscallResult`.
    pub fn set_syscall_restart_func<R: Into<SyscallResult>>(
        &mut self,
        f: impl FnOnce(&mut Locked<Unlocked>, &mut CurrentTask) -> Result<R, Errno>
        + Send
        + Sync
        + 'static,
    ) {
        self.thread_state.syscall_restart_func =
            Some(Box::new(|locked, current_task| Ok(f(locked, current_task)?.into())));
    }
439
    /// Adds `file` to this task's fd table with descriptor flags `flags`, returning the
    /// allocated fd number.
    pub fn add_file<L>(
        &self,
        locked: &mut Locked<L>,
        file: FileHandle,
        flags: FdFlags,
    ) -> Result<FdNumber, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.files.add(locked, self, file, flags)
    }
451
    /// Sets the task's signal mask to `signal_mask` and runs `wait_function`.
    ///
    /// Signals are dequeued prior to the original signal mask being restored. This is done by the
    /// signal machinery in the syscall dispatch loop.
    ///
    /// The returned result is the result returned from the wait function.
    pub fn wait_with_temporary_mask<F, T, L>(
        &mut self,
        locked: &mut Locked<L>,
        signal_mask: SigSet,
        wait_function: F,
    ) -> Result<T, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
        F: FnOnce(&mut Locked<L>, &CurrentTask) -> Result<T, Errno>,
    {
        {
            // Flag the mask as temporary so the signal machinery knows to restore the
            // original mask, then install the requested mask.
            let mut state = self.write();
            state.set_flags(TaskFlags::TEMPORARY_SIGNAL_MASK, true);
            state.set_temporary_signal_mask(signal_mask);
        }
        wait_function(locked, self)
    }
475
    /// If waking, promotes from waking to awake.  If not waking, make waiter async
    /// wait until woken.  Returns true if woken.
    ///
    /// When this returns false, `waiter` has been enqueued on the relevant wait queues
    /// (thread-group lifecycle and/or ptrace) so the caller can block on it.
    pub fn wake_or_wait_until_unstopped_async(&self, waiter: &Waiter) -> bool {
        let group_state = self.thread_group().read();
        let mut task_state = self.write();

        // Wake up if
        //   a) we should wake up, meaning:
        //      i) we're in group stop, and the thread group has exited group stop, or
        //      ii) we're waking up,
        //   b) and ptrace isn't stopping us from waking up, but
        //   c) always wake up if we got a SIGKILL.
        let task_stop_state = self.load_stopped();
        let group_stop_state = self.thread_group().load_stopped();
        if ((task_stop_state == StopState::GroupStopped && group_stop_state.is_waking_or_awake())
            || task_stop_state.is_waking_or_awake())
            && (!task_state.is_ptrace_listening() || task_stop_state.is_force())
        {
            // Finalize whichever stop state (task or group) was in the waking phase.
            let new_state = if task_stop_state.is_waking_or_awake() {
                task_stop_state.finalize()
            } else {
                group_stop_state.finalize()
            };
            if let Ok(new_state) = new_state {
                task_state.set_stopped(new_state, None, Some(self), None);
                // Both guards must be released before updating the thread group below.
                drop(group_state);
                drop(task_state);
                // It is possible for the stop state to be changed by another
                // thread between when it is checked above and the following
                // invocation, but set_stopped does sufficient checking while
                // holding the lock to make sure that such a change won't result
                // in corrupted state.
                self.thread_group().set_stopped(new_state, None, false);
                return true;
            }
        }

        // We will wait.
        if self.thread_group().load_stopped().is_stopped() || task_stop_state.is_stopped() {
            // If we've stopped or PTRACE_LISTEN has been sent, wait for a
            // signal or instructions from the tracer.
            group_state
                .lifecycle_waiters
                .wait_async_value(&waiter, ThreadGroupLifecycleWaitValue::Stopped);
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.can_accept_ptrace_commands() {
            // If we're stopped because a tracer has seen the stop and not taken
            // further action, wait for further instructions from the tracer.
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.is_ptrace_listening() {
            // A PTRACE_LISTEN is a state where we can get signals and notify a
            // ptracer, but otherwise remain blocked.
            if let Some(ptrace) = &mut task_state.ptrace {
                ptrace.set_last_signal(Some(SignalInfo::default(SIGTRAP)));
                ptrace.set_last_event(Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0)));
            }
            task_state.wait_on_ptracer(&waiter);
            task_state.notify_ptracers();
        }
        false
    }
537
    /// Set the RunState for the current task to the given value and then call the given callback.
    ///
    /// When the callback is done, the run_state is restored to `RunState::Running`.
    ///
    /// This function is typically used just before blocking the current task on some operation.
    /// The given `run_state` registers the mechanism for interrupting the blocking operation with
    /// the task and the given `callback` actually blocks the task.
    ///
    /// This function can only be called in the `RunState::Running` state and cannot set the
    /// run state to `RunState::Running`. For this reason, this function cannot be reentered.
    ///
    /// Returns `EINTR` without running `callback` if a relevant signal is already pending.
    pub fn run_in_state<F, T>(&self, run_state: RunState, callback: F) -> Result<T, Errno>
    where
        F: FnOnce() -> Result<T, Errno>,
    {
        assert_ne!(run_state, RunState::Running);

        // As an optimization, decommit unused pages of the stack to reduce memory pressure while
        // the thread is blocked.
        clean_stack();

        {
            let mut state = self.write();
            assert!(!state.is_blocked());

            if matches!(run_state, RunState::Frozen(_)) {
                // Freeze is a kernel signal and is handled before other user signals. A frozen task
                // ignores all other signals except SIGKILL until it is thawed.
                if state.has_signal_pending(SIGKILL) {
                    return error!(EINTR);
                }
            } else if state.is_any_signal_pending() && !state.is_ptrace_listening() {
                // A note on PTRACE_LISTEN - the thread cannot be scheduled
                // regardless of pending signals.
                return error!(EINTR);
            }
            state.set_run_state(run_state.clone());
        }

        let result = callback();

        {
            // The run state must not have been changed concurrently while we were blocked.
            let mut state = self.write();
            assert_eq!(
                state.run_state(),
                run_state,
                "SignalState run state changed while waiting!"
            );
            state.set_run_state(RunState::Running);
        };

        result
    }
590
591    pub fn block_until(
592        &self,
593        guard: EventWaitGuard<'_>,
594        deadline: zx::MonotonicInstant,
595    ) -> Result<(), Errno> {
596        self.run_in_state(RunState::Event(guard.event().clone()), move || {
597            guard.block_until(None, deadline).map_err(|e| match e {
598                WakeReason::Interrupted => errno!(EINTR),
599                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
600            })
601        })
602    }
603
604    pub fn block_with_owner_until(
605        &self,
606        guard: EventWaitGuard<'_>,
607        new_owner: &zx::Thread,
608        deadline: zx::MonotonicInstant,
609    ) -> Result<(), Errno> {
610        self.run_in_state(RunState::Event(guard.event().clone()), move || {
611            guard.block_until(Some(new_owner), deadline).map_err(|e| match e {
612                WakeReason::Interrupted => errno!(EINTR),
613                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
614            })
615        })
616    }
617
    /// Determine namespace node indicated by the dir_fd.
    ///
    /// Returns the namespace node and the path to use relative to that node.
    pub fn resolve_dir_fd<'a, L>(
        &self,
        locked: &mut Locked<L>,
        dir_fd: FdNumber,
        mut path: &'a FsStr,
        flags: ResolveFlags,
    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let path_is_absolute = path.starts_with(b"/");
        if path_is_absolute {
            // RESOLVE_BENEATH forbids absolute paths entirely.
            if flags.contains(ResolveFlags::BENEATH) {
                return error!(EXDEV);
            }
            // Strip the leading '/'; the starting directory chosen below supplies the root.
            path = &path[1..];
        }

        let dir = if path_is_absolute && !flags.contains(ResolveFlags::IN_ROOT) {
            self.fs().root()
        } else if dir_fd == FdNumber::AT_FDCWD {
            self.fs().cwd()
        } else {
            // O_PATH allowed for:
            //
            //   Passing the file descriptor as the dirfd argument of
            //   openat() and the other "*at()" system calls.  This
            //   includes linkat(2) with AT_EMPTY_PATH (or via procfs
            //   using AT_SYMLINK_FOLLOW) even if the file is not a
            //   directory.
            //
            // See https://man7.org/linux/man-pages/man2/open.2.html
            let file = self.files.get_allowing_opath(dir_fd)?;
            file.name.to_passive()
        };

        if !path.is_empty() {
            // A non-empty relative path must start from a directory, and traversing it
            // requires search (execute) permission on that directory.
            if !dir.entry.node.is_dir() {
                return error!(ENOTDIR);
            }
            dir.check_access(
                locked,
                self,
                Access::EXEC,
                CheckAccessReason::InternalPermissionChecks,
            )?;
        }
        Ok((dir, path.into()))
    }
670
    /// A convenient wrapper for opening files relative to FdNumber::AT_FDCWD.
    ///
    /// Returns a FileHandle but does not install the FileHandle in the FdTable
    /// for this task.
    ///
    /// Fails with `EINVAL` when `flags` contains `OpenFlags::CREAT`, because this wrapper
    /// accepts no `FileMode` for a newly created file.
    pub fn open_file(
        &self,
        locked: &mut Locked<Unlocked>,
        path: &FsStr,
        flags: OpenFlags,
    ) -> Result<FileHandle, Errno> {
        if flags.contains(OpenFlags::CREAT) {
            // In order to support OpenFlags::CREAT we would need to take a
            // FileMode argument.
            return error!(EINVAL);
        }
        self.open_file_at(
            locked,
            FdNumber::AT_FDCWD,
            path,
            flags,
            FileMode::default(),
            ResolveFlags::empty(),
            AccessCheck::default(),
        )
    }
696
697    /// Resolves a path for open.
698    ///
699    /// If the final path component points to a symlink, the symlink is followed (as long as
700    /// the symlink traversal limit has not been reached).
701    ///
702    /// If the final path component (after following any symlinks, if enabled) does not exist,
703    /// and `flags` contains `OpenFlags::CREAT`, a new node is created at the location of the
704    /// final path component.
705    ///
706    /// This returns the resolved node, and a boolean indicating whether the node has been created.
707    fn resolve_open_path<L>(
708        &self,
709        locked: &mut Locked<L>,
710        context: &mut LookupContext,
711        dir: &NamespaceNode,
712        path: &FsStr,
713        mode: FileMode,
714        flags: OpenFlags,
715    ) -> Result<(NamespaceNode, bool), Errno>
716    where
717        L: LockEqualOrBefore<FileOpsCore>,
718    {
719        context.update_for_path(path);
720        let mut parent_content = context.with(SymlinkMode::Follow);
721        let (parent, basename) = self.lookup_parent(locked, &mut parent_content, dir, path)?;
722        context.remaining_follows = parent_content.remaining_follows;
723
724        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);
725
726        // Lookup the child, without following a symlink or expecting it to be a directory.
727        let mut child_context = context.with(SymlinkMode::NoFollow);
728        child_context.must_be_directory = false;
729
730        match parent.lookup_child(locked, self, &mut child_context, basename) {
731            Ok(name) => {
732                if name.entry.node.is_lnk() {
733                    if flags.contains(OpenFlags::PATH)
734                        && context.symlink_mode == SymlinkMode::NoFollow
735                    {
736                        // When O_PATH is specified in flags, if pathname is a symbolic link
737                        // and the O_NOFOLLOW flag is also specified, then the call returns
738                        // a file descriptor referring to the symbolic link.
739                        // See https://man7.org/linux/man-pages/man2/openat.2.html
740                        //
741                        // If the trailing component (i.e., basename) of
742                        // pathname is a symbolic link, how.resolve contains
743                        // RESOLVE_NO_SYMLINKS, and how.flags contains both
744                        // O_PATH and O_NOFOLLOW, then an O_PATH file
745                        // descriptor referencing the symbolic link will be
746                        // returned.
747                        // See https://man7.org/linux/man-pages/man2/openat2.2.html
748                        return Ok((name, false));
749                    }
750
751                    if (!flags.contains(OpenFlags::PATH)
752                        && context.symlink_mode == SymlinkMode::NoFollow)
753                        || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
754                        || context.remaining_follows == 0
755                    {
756                        if must_create {
757                            // Since `must_create` is set, and a node was found, this returns EEXIST
758                            // instead of ELOOP.
759                            return error!(EEXIST);
760                        }
761                        // A symlink was found, but one of the following is true:
762                        // * flags specified O_NOFOLLOW but not O_PATH.
763                        // * how.resolve contains RESOLVE_NO_SYMLINKS
764                        // * too many symlink traversals have been attempted
765                        return error!(ELOOP);
766                    }
767
768                    context.remaining_follows -= 1;
769                    match name.readlink(locked, self)? {
770                        SymlinkTarget::Path(path) => {
771                            let dir = if path[0] == b'/' { self.fs().root() } else { parent };
772                            self.resolve_open_path(
773                                locked,
774                                context,
775                                &dir,
776                                path.as_ref(),
777                                mode,
778                                flags,
779                            )
780                        }
781                        SymlinkTarget::Node(name) => {
782                            if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS)
783                                || name.entry.node.is_lnk()
784                            {
785                                error!(ELOOP)
786                            } else {
787                                Ok((name, false))
788                            }
789                        }
790                    }
791                } else {
792                    if must_create {
793                        return error!(EEXIST);
794                    }
795                    Ok((name, false))
796                }
797            }
798            Err(e) if e == errno!(ENOENT) && flags.contains(OpenFlags::CREAT) => {
799                if context.must_be_directory {
800                    return error!(EISDIR);
801                }
802                Ok((
803                    parent.open_create_node(
804                        locked,
805                        self,
806                        basename,
807                        mode.with_type(FileMode::IFREG),
808                        DeviceType::NONE,
809                        flags,
810                    )?,
811                    true,
812                ))
813            }
814            Err(e) => Err(e),
815        }
816    }
817
818    /// The primary entry point for opening files relative to a task.
819    ///
820    /// Absolute paths are resolve relative to the root of the FsContext for
821    /// this task. Relative paths are resolve relative to dir_fd. To resolve
822    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
823    /// dir_fd.
824    ///
825    /// Returns a FileHandle but does not install the FileHandle in the FdTable
826    /// for this task.
827    pub fn open_file_at(
828        &self,
829        locked: &mut Locked<Unlocked>,
830        dir_fd: FdNumber,
831        path: &FsStr,
832        flags: OpenFlags,
833        mode: FileMode,
834        resolve_flags: ResolveFlags,
835        access_check: AccessCheck,
836    ) -> Result<FileHandle, Errno> {
837        if path.is_empty() {
838            return error!(ENOENT);
839        }
840
841        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, resolve_flags)?;
842        self.open_namespace_node_at(locked, dir, path, flags, mode, resolve_flags, access_check)
843    }
844
    /// Opens the node named by `path`, resolved relative to `dir`, honoring
    /// the open `flags`, creation `mode`, and openat2-style `resolve_flags`.
    ///
    /// Returns a `FileHandle` but does not install it in this task's
    /// `FdTable`. See openat(2) and openat2(2) for the flag semantics
    /// implemented below.
    pub fn open_namespace_node_at(
        &self,
        locked: &mut Locked<Unlocked>,
        dir: NamespaceNode,
        path: &FsStr,
        flags: OpenFlags,
        mode: FileMode,
        mut resolve_flags: ResolveFlags,
        access_check: AccessCheck,
    ) -> Result<FileHandle, Errno> {
        // 64-bit kernels force the O_LARGEFILE flag to be on.
        let mut flags = flags | OpenFlags::LARGEFILE;
        let opath = flags.contains(OpenFlags::PATH);
        if opath {
            // When O_PATH is specified in flags, flag bits other than O_CLOEXEC,
            // O_DIRECTORY, and O_NOFOLLOW are ignored.
            const ALLOWED_FLAGS: OpenFlags = OpenFlags::from_bits_truncate(
                OpenFlags::PATH.bits()
                    | OpenFlags::CLOEXEC.bits()
                    | OpenFlags::DIRECTORY.bits()
                    | OpenFlags::NOFOLLOW.bits(),
            );
            flags &= ALLOWED_FLAGS;
        }

        // O_TMPFILE requires write access (O_WRONLY or O_RDWR).
        if flags.contains(OpenFlags::TMPFILE) && !flags.can_write() {
            return error!(EINVAL);
        }

        let nofollow = flags.contains(OpenFlags::NOFOLLOW);
        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);

        // O_CREAT|O_EXCL must not follow a trailing symlink either: the open
        // fails rather than creating through the link.
        let symlink_mode =
            if nofollow || must_create { SymlinkMode::NoFollow } else { SymlinkMode::Follow };

        // RESOLVE_BENEATH and RESOLVE_IN_ROOT are mutually exclusive.
        let resolve_base = match (
            resolve_flags.contains(ResolveFlags::BENEATH),
            resolve_flags.contains(ResolveFlags::IN_ROOT),
        ) {
            (false, false) => ResolveBase::None,
            (true, false) => ResolveBase::Beneath(dir.clone()),
            (false, true) => ResolveBase::InRoot(dir.clone()),
            (true, true) => return error!(EINVAL),
        };

        // `RESOLVE_BENEATH` and `RESOLVE_IN_ROOT` imply `RESOLVE_NO_MAGICLINKS`. This matches
        // Linux behavior. Strictly speaking it is not really required, but it's hard to
        // implement `BENEATH` and `IN_ROOT` flags correctly otherwise.
        if resolve_base != ResolveBase::None {
            resolve_flags.insert(ResolveFlags::NO_MAGICLINKS);
        }

        let mut context = LookupContext {
            symlink_mode,
            remaining_follows: MAX_SYMLINK_FOLLOWS,
            must_be_directory: flags.contains(OpenFlags::DIRECTORY),
            resolve_flags,
            resolve_base,
        };
        // `created` records whether the open created a new node; used below to
        // skip the access check and the truncate for freshly-created files.
        let (name, created) =
            match self.resolve_open_path(locked, &mut context, &dir, path, mode, flags) {
                Ok((n, c)) => (n, c),
                Err(e) => {
                    // Record the absolute path of the failed lookup for diagnostics.
                    let mut abs_path = dir.path(&self.task);
                    abs_path.extend(&**path);
                    track_file_not_found(abs_path);
                    return Err(e);
                }
            };

        let name = if flags.contains(OpenFlags::TMPFILE) {
            // `O_TMPFILE` is incompatible with `O_CREAT`
            if flags.contains(OpenFlags::CREAT) {
                return error!(EINVAL);
            }
            name.create_tmpfile(locked, self, mode.with_type(FileMode::IFREG), flags)?
        } else {
            let mode = name.entry.node.info().mode;

            // These checks are not needed in the `O_TMPFILE` case because `mode` refers to the
            // file we are opening. With `O_TMPFILE`, that file is the regular file we just
            // created rather than the node we found by resolving the path.
            //
            // For example, we do not need to produce `ENOTDIR` when `must_be_directory` is set
            // because `must_be_directory` refers to the node we found by resolving the path.
            // If that node was not a directory, then `create_tmpfile` will produce an error.
            //
            // Similarly, we never need to call `truncate` because `O_TMPFILE` is newly created
            // and therefor already an empty file.

            if !opath && nofollow && mode.is_lnk() {
                return error!(ELOOP);
            }

            if mode.is_dir() {
                // Directories cannot be opened for writing, created, or truncated.
                if flags.can_write()
                    || flags.contains(OpenFlags::CREAT)
                    || flags.contains(OpenFlags::TRUNC)
                {
                    return error!(EISDIR);
                }
                if flags.contains(OpenFlags::DIRECT) {
                    return error!(EINVAL);
                }
            } else if context.must_be_directory {
                return error!(ENOTDIR);
            }

            if flags.contains(OpenFlags::TRUNC) && mode.is_reg() && !created {
                // You might think we should check file.can_write() at this
                // point, which is what the docs suggest, but apparently we
                // are supposed to truncate the file if this task can write
                // to the underlying node, even if we are opening the file
                // as read-only. See OpenTest.CanTruncateReadOnly.
                name.truncate(locked, self, 0)?;
            }

            name
        };

        // If the node has been created, the open operation should not verify access right:
        // From <https://man7.org/linux/man-pages/man2/open.2.html>
        //
        // > Note that mode applies only to future accesses of the newly created file; the
        // > open() call that creates a read-only file may well return a  read/write  file
        // > descriptor.

        let access_check = if created { AccessCheck::skip() } else { access_check };
        name.open(locked, self, flags, access_check)
    }
975
976    /// A wrapper for FsContext::lookup_parent_at that resolves the given
977    /// dir_fd to a NamespaceNode.
978    ///
979    /// Absolute paths are resolve relative to the root of the FsContext for
980    /// this task. Relative paths are resolve relative to dir_fd. To resolve
981    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
982    /// dir_fd.
983    pub fn lookup_parent_at<'a, L>(
984        &self,
985        locked: &mut Locked<L>,
986        context: &mut LookupContext,
987        dir_fd: FdNumber,
988        path: &'a FsStr,
989    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
990    where
991        L: LockEqualOrBefore<FileOpsCore>,
992    {
993        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, ResolveFlags::empty())?;
994        self.lookup_parent(locked, context, &dir, path)
995    }
996
997    /// Lookup the parent of a namespace node.
998    ///
999    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
1000    /// calling this function directly.
1001    ///
1002    /// This function resolves all but the last component of the given path.
1003    /// The function returns the parent directory of the last component as well
1004    /// as the last component.
1005    ///
1006    /// If path is empty, this function returns dir and an empty path.
1007    /// Similarly, if path ends with "." or "..", these components will be
1008    /// returned along with the parent.
1009    ///
1010    /// The returned parent might not be a directory.
1011    pub fn lookup_parent<'a, L>(
1012        &self,
1013        locked: &mut Locked<L>,
1014        context: &mut LookupContext,
1015        dir: &NamespaceNode,
1016        path: &'a FsStr,
1017    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
1018    where
1019        L: LockEqualOrBefore<FileOpsCore>,
1020    {
1021        context.update_for_path(path);
1022
1023        let mut current_node = dir.clone();
1024        let mut it = path.split(|c| *c == b'/').filter(|p| !p.is_empty()).map(<&FsStr>::from);
1025        let mut current_path_component = it.next().unwrap_or_default();
1026        for next_path_component in it {
1027            current_node =
1028                current_node.lookup_child(locked, self, context, current_path_component)?;
1029            current_path_component = next_path_component;
1030        }
1031        Ok((current_node, current_path_component))
1032    }
1033
1034    /// Lookup a namespace node.
1035    ///
1036    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
1037    /// calling this function directly.
1038    ///
1039    /// This function resolves the component of the given path.
1040    pub fn lookup_path<L>(
1041        &self,
1042        locked: &mut Locked<L>,
1043        context: &mut LookupContext,
1044        dir: NamespaceNode,
1045        path: &FsStr,
1046    ) -> Result<NamespaceNode, Errno>
1047    where
1048        L: LockEqualOrBefore<FileOpsCore>,
1049    {
1050        let (parent, basename) = self.lookup_parent(locked, context, &dir, path)?;
1051        parent.lookup_child(locked, self, context, basename)
1052    }
1053
1054    /// Lookup a namespace node starting at the root directory.
1055    ///
1056    /// Resolves symlinks.
1057    pub fn lookup_path_from_root<L>(
1058        &self,
1059        locked: &mut Locked<L>,
1060        path: &FsStr,
1061    ) -> Result<NamespaceNode, Errno>
1062    where
1063        L: LockEqualOrBefore<FileOpsCore>,
1064    {
1065        let mut context = LookupContext::default();
1066        self.lookup_path(locked, &mut context, self.fs().root(), path)
1067    }
1068
    /// Replaces the current process image with `executable`, per execve(2).
    ///
    /// `path` is the path used to locate `executable`; `argv` and `environ`
    /// become the new image's arguments and environment. If the replacement
    /// fails after the old image has been torn down, a forced SIGSEGV is
    /// delivered to the task and the error is returned.
    pub fn exec(
        &mut self,
        locked: &mut Locked<Unlocked>,
        executable: FileHandle,
        path: CString,
        argv: Vec<CString>,
        environ: Vec<CString>,
    ) -> Result<(), Errno> {
        // Executable must be a regular file
        if !executable.name.entry.node.is_reg() {
            return error!(EACCES);
        }

        // File node must have EXEC mode permissions.
        // Note that the ability to execute a file is unrelated to the flags
        // used in the `open` call.
        executable.name.check_access(locked, self, Access::EXEC, CheckAccessReason::Exec)?;

        // Ask the security module for the credentials state the new image
        // should run with.
        let elf_security_state = security::bprm_creds_for_exec(self, &executable.name)?;

        let resolved_elf = resolve_executable(
            locked,
            self,
            executable,
            path.clone(),
            argv,
            environ,
            elf_security_state,
        )?;

        // Honor set-user-ID/set-group-ID bits only when the suid feature is
        // enabled for this kernel.
        let maybe_set_id = if self.kernel().features.enable_suid {
            resolved_elf.file.name.suid_and_sgid(&self)?
        } else {
            Default::default()
        };

        if self.thread_group().read().tasks_count() > 1 {
            track_stub!(TODO("https://fxbug.dev/297434895"), "exec on multithread process");
            return error!(EINVAL);
        }

        if let Err(err) = self.finish_exec(locked, path, resolved_elf, maybe_set_id) {
            // The old image is already gone, so the error cannot be reported
            // to the caller through normal syscall return; kill the task.
            log_warn!("unrecoverable error in exec: {err:?}");

            send_standard_signal(
                locked,
                self,
                SignalInfo { code: SI_KERNEL as i32, force: true, ..SignalInfo::default(SIGSEGV) },
            );
            return Err(err);
        }

        self.ptrace_event(locked, PtraceOptions::TRACEEXEC, self.task.tid as u64);
        self.signal_vfork();

        Ok(())
    }
1126
    /// After the memory is unmapped, any failure in exec is unrecoverable and results in the
    /// process crashing. This function is for that second half; any error returned from this
    /// function will be considered unrecoverable.
    ///
    /// Replaces the address space, credentials, signal actions, and file
    /// descriptor table of the current task with those of the new image.
    fn finish_exec(
        &mut self,
        locked: &mut Locked<Unlocked>,
        path: CString,
        resolved_elf: ResolvedElf,
        mut maybe_set_id: UserAndOrGroupId,
    ) -> Result<(), Errno> {
        // Now that the exec will definitely finish (or crash), notify owners of
        // locked futexes for the current process, which will be impossible to
        // update after process image is replaced.  See get_robust_list(2).
        self.notify_robust_list();

        // Passing arch32 information here ensures the replacement memory
        // layout matches the elf being executed.
        let mm = {
            let mm = self.mm()?;
            let new_mm = mm
                .exec(resolved_elf.file.name.to_passive(), resolved_elf.arch_width)
                .map_err(|status| from_status_like_fdio!(status))?;
            self.mm.update(Some(new_mm.clone()));
            new_mm
        };

        {
            let mut state = self.write();

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   The aforementioned transformations of the effective IDs are not
            //   performed (i.e., the set-user-ID and set-group-ID bits are
            //   ignored) if any of the following is true:
            //
            //   * the no_new_privs attribute is set for the calling thread (see
            //      prctl(2));
            //
            //   *  the underlying filesystem is mounted nosuid (the MS_NOSUID
            //      flag for mount(2)); or
            //
            //   *  the calling process is being ptraced.
            //
            // The MS_NOSUID check is in `NamespaceNode::suid_and_sgid()`.
            if state.no_new_privs() || state.is_ptraced() {
                maybe_set_id.clear();
            }

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   The process's "dumpable" attribute is set to the value 1,
            //   unless a set-user-ID program, a set-group-ID program, or a
            //   program with capabilities is being executed, in which case the
            //   dumpable flag may instead be reset to the value in
            //   /proc/sys/fs/suid_dumpable, in the circumstances described
            //   under PR_SET_DUMPABLE in prctl(2).
            let dumpable =
                if maybe_set_id.is_none() { DumpPolicy::User } else { DumpPolicy::Disable };
            *mm.dumpable.lock(locked) = dumpable;

            #[allow(
                clippy::undocumented_unsafe_blocks,
                reason = "Force documented unsafe blocks in Starnix"
            )]
            let mut creds = unsafe {
                // SAFETY: this is allowed because we are the CurrentTask.
                self.persistent_info.creds_mut()
            };
            // The alternate signal stack and the robust futex list do not
            // survive exec; both refer to the old image's memory.
            state.set_sigaltstack(None);
            state.robust_list_head = RobustListHeadPtr::null(self);

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   If a set-user-ID or set-group-ID
            //   program is being executed, then the parent death signal set by
            //   prctl(2) PR_SET_PDEATHSIG flag is cleared.
            //
            // TODO(https://fxbug.dev/356684424): Implement the behavior above once we support
            // the PR_SET_PDEATHSIG flag.

            // TODO(tbodt): Check whether capability xattrs are set on the file, and grant/limit
            // capabilities accordingly.
            creds.exec(maybe_set_id);
        }

        let security_state = resolved_elf.security_state.clone();

        let start_info = load_executable(self, resolved_elf, &path)?;
        // Before consuming start_info below, note if the task is 32-bit.
        self.thread_state.arch_width = start_info.arch_width;

        // Point the task's registers at the new image's entry state.
        let regs: zx_restricted_state_t = start_info.into();
        self.thread_state.registers = regs.into();
        self.thread_state.extended_pstate.reset();
        self.thread_group().signal_actions.reset_for_exec();

        // The exit signal (and that of the children) is reset to SIGCHLD.
        let mut thread_group_state = self.thread_group().write();
        thread_group_state.exit_signal = Some(SIGCHLD);
        for (_, weak_child) in &mut thread_group_state.children {
            if let Some(child) = weak_child.upgrade() {
                let mut child_state = child.write();
                child_state.exit_signal = Some(SIGCHLD);
            }
        }

        std::mem::drop(thread_group_state);

        // TODO(https://fxbug.dev/42082680): All threads other than the calling thread are destroyed.

        // TODO: POSIX timers are not preserved.

        // TODO: Ensure that the filesystem context is un-shared, undoing the effect of CLONE_FS.

        // The file descriptor table is unshared, undoing the effect of the CLONE_FILES flag of
        // clone(2).
        self.files.unshare();
        self.files.exec(locked, self);

        // If SELinux is enabled, enforce permissions related to inheritance of file descriptors
        // and resource limits. Then update the current task's SID.
        //
        // TODO: https://fxbug.dev/378655436 - After the above, enforce permissions related to
        // signal state inheritance.
        //
        // This needs to be called after closing any files marked "close-on-exec".
        security::exec_binprm(locked, self, &security_state);

        self.thread_group().write().did_exec = true;

        self.set_command_name(TaskCommand::from_path_bytes(path.to_bytes()));

        Ok(())
    }
1261
1262    pub fn set_command_name(&self, new_name: TaskCommand) {
1263        // set_command_name needs to run before leader_command() in cases where self is the leader.
1264        self.task.set_command_name(new_name.clone());
1265        let leader_command = self.thread_group().read().leader_command();
1266        starnix_logging::set_current_task_info(
1267            new_name,
1268            leader_command,
1269            self.thread_group().leader,
1270            self.tid,
1271        );
1272    }
1273
    /// Installs a seccomp filter for this task, per seccomp(2)
    /// SECCOMP_SET_MODE_FILTER.
    ///
    /// `code` is the classic-BPF filter program and `flags` is a bitmask of
    /// `SECCOMP_FILTER_FLAG_*` values. With `SECCOMP_FILTER_FLAG_TSYNC`, the
    /// filter is synchronized to every thread in the process; with
    /// `SECCOMP_FILTER_FLAG_NEW_LISTENER`, the returned result is the new
    /// listener fd, otherwise a plain success value.
    pub fn add_seccomp_filter(
        &mut self,
        locked: &mut Locked<Unlocked>,
        code: Vec<sock_filter>,
        flags: u32,
    ) -> Result<SyscallResult, Errno> {
        let new_filter = Arc::new(SeccompFilter::from_cbpf(
            &code,
            self.thread_group().next_seccomp_filter_id.add(1),
            flags & SECCOMP_FILTER_FLAG_LOG != 0,
        )?);

        let mut maybe_fd: Option<FdNumber> = None;

        // Create the user-notification listener fd before taking any locks.
        if flags & SECCOMP_FILTER_FLAG_NEW_LISTENER != 0 {
            maybe_fd = Some(SeccompFilterContainer::create_listener(locked, self)?);
        }

        // We take the process lock here because we can't change any of the threads
        // while doing a tsync.  So, you hold the process lock while making any changes.
        let state = self.thread_group().write();

        if flags & SECCOMP_FILTER_FLAG_TSYNC != 0 {
            // TSYNC synchronizes all filters for all threads in the current process to
            // the current thread's

            // We collect the filters for the current task upfront to save us acquiring
            // the task's lock a lot of times below.
            let mut filters: SeccompFilterContainer = self.read().seccomp_filters.clone();

            // For TSYNC to work, all of the other thread filters in this process have to
            // be a prefix of this thread's filters, and none of them can be in
            // strict mode.
            let tasks = state.tasks().collect::<Vec<_>>();
            for task in &tasks {
                if task.tid == self.tid {
                    continue;
                }
                let other_task_state = task.read();

                // Target threads cannot be in SECCOMP_MODE_STRICT
                if task.seccomp_filter_state.get() == SeccompStateValue::Strict {
                    return Self::seccomp_tsync_error(task.tid, flags);
                }

                // Target threads' filters must be a subsequence of this thread's
                if !other_task_state.seccomp_filters.can_sync_to(&filters) {
                    return Self::seccomp_tsync_error(task.tid, flags);
                }
            }

            // Now that we're sure we're allowed to do so, add the filter to all threads.
            filters.add_filter(new_filter, code.len() as u16)?;

            for task in &tasks {
                let mut other_task_state = task.write();

                // TSYNC also forces no_new_privs on every thread it touches.
                other_task_state.enable_no_new_privs();
                other_task_state.seccomp_filters = filters.clone();
                task.set_seccomp_state(SeccompStateValue::UserDefined)?;
            }
        } else {
            // Non-TSYNC: only the current task gets the new filter.
            let mut task_state = self.task.write();

            task_state.seccomp_filters.add_filter(new_filter, code.len() as u16)?;
            self.set_seccomp_state(SeccompStateValue::UserDefined)?;
        }

        if let Some(fd) = maybe_fd { Ok(fd.into()) } else { Ok(().into()) }
    }
1344
1345    pub fn run_seccomp_filters(
1346        &mut self,
1347        locked: &mut Locked<Unlocked>,
1348        syscall: &Syscall,
1349    ) -> Option<Result<SyscallResult, Errno>> {
1350        // Implementation of SECCOMP_FILTER_STRICT, which has slightly different semantics
1351        // from user-defined seccomp filters.
1352        if self.seccomp_filter_state.get() == SeccompStateValue::Strict {
1353            return SeccompState::do_strict(locked, self, syscall);
1354        }
1355
1356        // Run user-defined seccomp filters
1357        let result = self.task.read().seccomp_filters.run_all(self, syscall);
1358
1359        SeccompState::do_user_defined(locked, result, self, syscall)
1360    }
1361
1362    fn seccomp_tsync_error(id: i32, flags: u32) -> Result<SyscallResult, Errno> {
1363        // By default, TSYNC indicates failure state by returning the first thread
1364        // id not to be able to sync, rather than by returning -1 and setting
1365        // errno.  However, if TSYNC_ESRCH is set, it returns ESRCH.  This
1366        // prevents conflicts with fact that SECCOMP_FILTER_FLAG_NEW_LISTENER
1367        // makes seccomp return an fd.
1368        if flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH != 0 { error!(ESRCH) } else { Ok(id.into()) }
1369    }
1370
1371    // Notify all futexes in robust list.  The robust list is in user space, so we
1372    // are very careful about walking it, and there are a lot of quiet returns if
1373    // we fail to walk it.
1374    // TODO(https://fxbug.dev/42079081): This only sets the FUTEX_OWNER_DIED bit; it does
1375    // not wake up a waiter.
1376    pub fn notify_robust_list(&self) {
1377        let task_state = self.write();
1378        let robust_list_addr = task_state.robust_list_head.addr();
1379        if robust_list_addr == UserAddress::NULL {
1380            // No one has called set_robust_list.
1381            return;
1382        }
1383        let robust_list_res = self.read_multi_arch_object(task_state.robust_list_head);
1384
1385        let head = if let Ok(head) = robust_list_res {
1386            head
1387        } else {
1388            return;
1389        };
1390
1391        let offset = head.futex_offset;
1392
1393        let mut entries_count = 0;
1394        let mut curr_ptr = head.list.next;
1395        while curr_ptr.addr() != robust_list_addr.into() && entries_count < ROBUST_LIST_LIMIT {
1396            let curr_ref = self.read_multi_arch_object(curr_ptr);
1397
1398            let curr = if let Ok(curr) = curr_ref {
1399                curr
1400            } else {
1401                return;
1402            };
1403
1404            let Some(futex_base) = curr_ptr.addr().checked_add_signed(offset) else {
1405                return;
1406            };
1407
1408            let futex_addr = match FutexAddress::try_from(futex_base) {
1409                Ok(addr) => addr,
1410                Err(_) => {
1411                    return;
1412                }
1413            };
1414
1415            let Ok(mm) = self.mm() else {
1416                log_error!("Asked to notify robust list futexes in system task.");
1417                return;
1418            };
1419            let futex = if let Ok(futex) = mm.atomic_load_u32_relaxed(futex_addr) {
1420                futex
1421            } else {
1422                return;
1423            };
1424
1425            if (futex & FUTEX_TID_MASK) as i32 == self.tid {
1426                let owner_died = FUTEX_OWNER_DIED | futex;
1427                if mm.atomic_store_u32_relaxed(futex_addr, owner_died).is_err() {
1428                    return;
1429                }
1430            }
1431            curr_ptr = curr.next;
1432            entries_count += 1;
1433        }
1434    }
1435
1436    /// Returns a ref to this thread's SeccompNotifier.
1437    pub fn get_seccomp_notifier(&mut self) -> Option<SeccompNotifierHandle> {
1438        self.task.write().seccomp_filters.notifier.clone()
1439    }
1440
1441    pub fn set_seccomp_notifier(&mut self, notifier: Option<SeccompNotifierHandle>) {
1442        self.task.write().seccomp_filters.notifier = notifier;
1443    }
1444
1445    /// Processes a Zircon exception associated with this task.
1446    pub fn process_exception(
1447        &self,
1448        locked: &mut Locked<Unlocked>,
1449        report: &zx::ExceptionReport,
1450    ) -> ExceptionResult {
1451        match report.ty {
1452            zx::ExceptionType::General => match get_signal_for_general_exception(&report.arch) {
1453                Some(sig) => ExceptionResult::Signal(SignalInfo::default(sig)),
1454                None => {
1455                    log_error!("Unrecognized general exception: {:?}", report);
1456                    ExceptionResult::Signal(SignalInfo::default(SIGILL))
1457                }
1458            },
1459            zx::ExceptionType::FatalPageFault { status } => {
1460                let report = decode_page_fault_exception_report(&report.arch);
1461                if let Ok(mm) = self.mm() {
1462                    mm.handle_page_fault(locked, report, status)
1463                } else {
1464                    panic!(
1465                        "system task is handling a major page fault status={:?}, report={:?}",
1466                        status, report
1467                    );
1468                }
1469            }
1470            zx::ExceptionType::UndefinedInstruction => {
1471                ExceptionResult::Signal(SignalInfo::default(SIGILL))
1472            }
1473            zx::ExceptionType::UnalignedAccess => {
1474                ExceptionResult::Signal(SignalInfo::default(SIGBUS))
1475            }
1476            zx::ExceptionType::SoftwareBreakpoint | zx::ExceptionType::HardwareBreakpoint => {
1477                ExceptionResult::Signal(SignalInfo::default(SIGTRAP))
1478            }
1479            zx::ExceptionType::ProcessNameChanged => {
1480                log_error!("Received unexpected process name changed exception");
1481                ExceptionResult::Handled
1482            }
1483            zx::ExceptionType::ProcessStarting
1484            | zx::ExceptionType::ThreadStarting
1485            | zx::ExceptionType::ThreadExiting => {
1486                log_error!("Received unexpected task lifecycle exception");
1487                ExceptionResult::Signal(SignalInfo::default(SIGSYS))
1488            }
1489            zx::ExceptionType::PolicyError(policy_code) => {
1490                log_error!(policy_code:?; "Received Zircon policy error exception");
1491                ExceptionResult::Signal(SignalInfo::default(SIGSYS))
1492            }
1493            zx::ExceptionType::UnknownUserGenerated { code, data } => {
1494                log_error!(code:?, data:?; "Received unexpected unknown user generated exception");
1495                ExceptionResult::Signal(SignalInfo::default(SIGSYS))
1496            }
1497            zx::ExceptionType::Unknown { ty, code, data } => {
1498                log_error!(ty:?, code:?, data:?; "Received unexpected exception");
1499                ExceptionResult::Signal(SignalInfo::default(SIGSYS))
1500            }
1501        }
1502    }
1503
    /// Clone this task.
    ///
    /// Creates a new task object that shares some state with this task
    /// according to the given flags.
    ///
    /// Used by the clone() syscall to create both processes and threads.
    ///
    /// The exit signal is broken out from the flags parameter like clone3() rather than being
    /// bitwise-ORed like clone().
    pub fn clone_task<L>(
        &self,
        locked: &mut Locked<L>,
        flags: u64,
        child_exit_signal: Option<Signal>,
        user_parent_tid: UserRef<pid_t>,
        user_child_tid: UserRef<pid_t>,
        user_pidfd: UserRef<FdNumber>,
    ) -> Result<TaskBuilder, Errno>
    where
        L: LockBefore<MmDumpable>,
        L: LockBefore<TaskRelease>,
        L: LockBefore<ProcessGroupState>,
    {
        // Flags this implementation actually honors. Valid-but-unimplemented
        // flags yield ENOSYS further below, after the EINVAL checks.
        const IMPLEMENTED_FLAGS: u64 = (CLONE_VM
            | CLONE_FS
            | CLONE_FILES
            | CLONE_SIGHAND
            | CLONE_THREAD
            | CLONE_SYSVSEM
            | CLONE_SETTLS
            | CLONE_PARENT
            | CLONE_PARENT_SETTID
            | CLONE_PIDFD
            | CLONE_CHILD_CLEARTID
            | CLONE_CHILD_SETTID
            | CLONE_VFORK
            | CLONE_NEWUTS
            | CLONE_PTRACE) as u64;

        // A mask with all valid flags set, because we want to return a different error code for an
        // invalid flag vs an unimplemented flag. Subtracting 1 from the largest valid flag gives a
        // mask with all flags below it set. Shift up by one to make sure the largest flag is also
        // set.
        const VALID_FLAGS: u64 = (CLONE_INTO_CGROUP << 1) - 1;

        // CLONE_SETTLS is implemented by sys_clone.

        // Decode each flag bit once up front; the rest of the function only
        // consults these booleans.
        let clone_files = flags & (CLONE_FILES as u64) != 0;
        let clone_fs = flags & (CLONE_FS as u64) != 0;
        let clone_parent = flags & (CLONE_PARENT as u64) != 0;
        let clone_parent_settid = flags & (CLONE_PARENT_SETTID as u64) != 0;
        let clone_pidfd = flags & (CLONE_PIDFD as u64) != 0;
        let clone_child_cleartid = flags & (CLONE_CHILD_CLEARTID as u64) != 0;
        let clone_child_settid = flags & (CLONE_CHILD_SETTID as u64) != 0;
        let clone_sysvsem = flags & (CLONE_SYSVSEM as u64) != 0;
        let clone_ptrace = flags & (CLONE_PTRACE as u64) != 0;
        let clone_thread = flags & (CLONE_THREAD as u64) != 0;
        let clone_vm = flags & (CLONE_VM as u64) != 0;
        let clone_sighand = flags & (CLONE_SIGHAND as u64) != 0;
        let clone_vfork = flags & (CLONE_VFORK as u64) != 0;
        let clone_newuts = flags & (CLONE_NEWUTS as u64) != 0;
        let clone_into_cgroup = flags & CLONE_INTO_CGROUP != 0;

        if clone_ptrace {
            track_stub!(TODO("https://fxbug.dev/322874630"), "CLONE_PTRACE");
        }

        if clone_sysvsem {
            track_stub!(TODO("https://fxbug.dev/322875185"), "CLONE_SYSVSEM");
        }

        if clone_into_cgroup {
            track_stub!(TODO("https://fxbug.dev/403612570"), "CLONE_INTO_CGROUP");
        }

        // Flag-combination validation; these combinations are rejected with
        // EINVAL, matching clone(2)'s documented error cases.
        if clone_sighand && !clone_vm {
            return error!(EINVAL);
        }
        if clone_thread && !clone_sighand {
            return error!(EINVAL);
        }

        if clone_pidfd && clone_thread {
            return error!(EINVAL);
        }
        if clone_pidfd && clone_parent_settid && user_parent_tid.addr() == user_pidfd.addr() {
            // `clone()` uses the same out-argument for these, so error out if they have the same
            // user address.
            return error!(EINVAL);
        }

        if flags & !VALID_FLAGS != 0 {
            return error!(EINVAL);
        }

        if clone_vm && !clone_thread {
            // TODO(https://fxbug.dev/42066087) Implement CLONE_VM for child processes (not just child
            // threads). Currently this executes CLONE_VM (explicitly passed to clone() or as
            // used by vfork()) as a fork (the VM in the child is copy-on-write) which is almost
            // always OK.
            //
            // CLONE_VM is primarily as an optimization to avoid making a copy-on-write version of a
            // process' VM that will be immediately replaced with a call to exec(). The main users
            // (libc and language runtimes) don't actually rely on the memory being shared between
            // the two processes. And the vfork() man page explicitly allows vfork() to be
            // implemented as fork() which is what we do here.
            if !clone_vfork {
                track_stub!(
                    TODO("https://fxbug.dev/322875227"),
                    "CLONE_VM without CLONE_THREAD or CLONE_VFORK"
                );
            }
        } else if clone_thread && !clone_vm {
            track_stub!(TODO("https://fxbug.dev/322875167"), "CLONE_THREAD without CLONE_VM");
            return error!(ENOSYS);
        }

        // Valid flags that we do not implement are reported and rejected with
        // ENOSYS (as opposed to the EINVAL cases above).
        if flags & !IMPLEMENTED_FLAGS != 0 {
            track_stub!(
                TODO("https://fxbug.dev/322875130"),
                "clone unknown flags",
                flags & !IMPLEMENTED_FLAGS
            );
            return error!(ENOSYS);
        }

        // Share or fork the filesystem context and fd table per the flags.
        let fs = if clone_fs { self.fs() } else { self.fs().fork() };
        let files = if clone_files { self.files.clone() } else { self.files.fork() };

        let kernel = self.kernel();

        // Lock the cgroup process hierarchy so that the parent process cannot move to a different
        // cgroup while a new task or thread_group is created. This may be unnecessary if
        // CLONE_INTO_CGROUP is implemented and passed in.
        let mut cgroup2_pid_table = kernel.cgroups.lock_cgroup2_pid_table();
        // Create a `KernelSignal::Freeze` to put onto the new task, if the cgroup is frozen.
        let child_kernel_signals = cgroup2_pid_table
            .maybe_create_freeze_signal(self.thread_group())
            .into_iter()
            .collect::<VecDeque<_>>();

        // Held until explicitly dropped inside `release_on_error!` below, after
        // the child has been registered in the pid table.
        let mut pids = kernel.pids.write();

        // State captured from this task (mostly under its lock below) and used
        // to initialize the child task.
        let pid;
        let command;
        let creds;
        let scheduler_state;
        let no_new_privs;
        let seccomp_filters;
        let robust_list_head = RobustListHeadPtr::null(self);
        let child_signal_mask;
        let timerslack_ns;
        let uts_ns;
        let security_state = security::task_alloc(&self, flags);

        let TaskInfo { thread, thread_group, memory_manager } = {
            // These variables hold the original parent in case we need to switch the parent of the
            // new task because of CLONE_PARENT.
            let weak_original_parent;
            let original_parent;

            // Make sure to drop these locks ASAP to avoid inversion
            let thread_group_state = {
                let thread_group_state = self.thread_group().write();
                if clone_parent {
                    // With the CLONE_PARENT flag, the parent of the new task is our parent
                    // instead of ourselves.
                    weak_original_parent =
                        thread_group_state.parent.clone().ok_or_else(|| errno!(EINVAL))?;
                    std::mem::drop(thread_group_state);
                    original_parent = weak_original_parent.upgrade();
                    original_parent.write()
                } else {
                    thread_group_state
                }
            };

            let state = self.read();

            no_new_privs = state.no_new_privs();
            seccomp_filters = state.seccomp_filters.clone();
            child_signal_mask = state.signal_mask();

            pid = pids.allocate_pid();
            command = self.command();
            creds = self.current_creds();
            scheduler_state = state.scheduler_state.fork();
            timerslack_ns = state.timerslack_ns;

            // CLONE_NEWUTS requires CAP_SYS_ADMIN and gives the child a forked
            // copy of the UTS namespace; otherwise the namespace is shared.
            uts_ns = if clone_newuts {
                security::check_task_capable(self, CAP_SYS_ADMIN)?;
                state.uts_ns.read().fork()
            } else {
                state.uts_ns.clone()
            };

            if clone_thread {
                // A new thread joins this task's thread group, process and
                // address space; no new Zircon process is created.
                TaskInfo {
                    thread: None,
                    thread_group: self.thread_group().clone(),
                    memory_manager: self.mm().ok(),
                }
            } else {
                // Drop the lock on this task before entering `create_zircon_process`, because it will
                // take a lock on the new thread group, and locks on thread groups have a higher
                // priority than locks on the task in the thread group.
                std::mem::drop(state);
                let signal_actions = if clone_sighand {
                    self.thread_group().signal_actions.clone()
                } else {
                    self.thread_group().signal_actions.fork()
                };
                let process_group = thread_group_state.process_group.clone();

                let task_info = create_zircon_process(
                    locked,
                    kernel,
                    Some(thread_group_state),
                    pid,
                    child_exit_signal,
                    process_group,
                    signal_actions,
                    command.clone(),
                )?;

                cgroup2_pid_table.inherit_cgroup(self.thread_group(), &task_info.thread_group);

                task_info
            }
        };

        // Only create the vfork event when the caller requested CLONE_VFORK.
        let vfork_event = if clone_vfork { Some(Arc::new(zx::Event::create())) } else { None };

        let mut child = TaskBuilder::new(Task::new(
            pid,
            command,
            thread_group,
            thread,
            files,
            memory_manager,
            fs,
            creds,
            self.abstract_socket_namespace.clone(),
            self.abstract_vsock_namespace.clone(),
            child_signal_mask,
            child_kernel_signals,
            vfork_event,
            scheduler_state,
            uts_ns,
            no_new_privs,
            SeccompState::from(&self.seccomp_filter_state),
            seccomp_filters,
            robust_list_head,
            timerslack_ns,
            security_state,
        ));

        // If any step below fails, `child` is released so partially-initialized
        // task state is torn down rather than leaked.
        release_on_error!(child, locked, {
            let child_task = TempRef::from(&child.task);
            // Drop the pids lock as soon as possible after creating the child. Destroying the child
            // and removing it from the pids table itself requires the pids lock, so if an early exit
            // takes place we have a self deadlock.
            pids.add_task(&child_task);
            std::mem::drop(pids);

            // Child lock must be taken before this lock. Drop the lock on the task, take a writable
            // lock on the child and take the current state back.

            #[cfg(any(test, debug_assertions))]
            {
                // Take the lock on the thread group and its child in the correct order to ensure any wrong ordering
                // will trigger the tracing-mutex at the right call site.
                if !clone_thread {
                    let _l1 = self.thread_group().read();
                    let _l2 = child.thread_group().read();
                }
            }

            if clone_thread {
                self.thread_group().add(&child_task)?;
            } else {
                child.thread_group().add(&child_task)?;

                // These manipulations of the signal handling state appear to be related to
                // CLONE_SIGHAND and CLONE_VM rather than CLONE_THREAD. However, we do not support
                // all the combinations of these flags, which means doing these operations here
                // might actually be correct. However, if you find a test that fails because of the
                // placement of this logic here, we might need to move it.
                let mut child_state = child.write();
                let state = self.read();
                child_state.set_sigaltstack(state.sigaltstack());
                child_state.set_signal_mask(state.signal_mask());
            }

            if !clone_vm {
                // We do not support running threads in the same process with different
                // MemoryManagers.
                assert!(!clone_thread);
                self.mm()?.snapshot_to(locked, &child.mm()?)?;
            }

            // Write the child's tid to the parent-requested user address.
            if clone_parent_settid {
                self.write_object(user_parent_tid, &child.tid)?;
            }

            // Record the address to clear (and futex-wake) when the child exits.
            if clone_child_cleartid {
                child.write().clear_child_tid = user_child_tid;
            }

            // Write the child's tid into the child's own memory.
            if clone_child_settid {
                child.write_object(user_child_tid, &child.tid)?;
            }

            // CLONE_PIDFD: install a pidfd for the child in this task's fd
            // table and report the fd number back to userspace.
            if clone_pidfd {
                let locked = locked.cast_locked::<TaskRelease>();
                let file = new_pidfd(
                    locked,
                    self,
                    child.thread_group(),
                    &*child.mm()?,
                    OpenFlags::empty(),
                );
                let pidfd = self.add_file(locked, file, FdFlags::CLOEXEC)?;
                self.write_object(user_pidfd, &pidfd)?;
            }

            // TODO(https://fxbug.dev/42066087): We do not support running different processes with
            // the same MemoryManager. Instead, we implement a rough approximation of that behavior
            // by making a copy-on-write clone of the memory from the original process.
            if clone_vm && !clone_thread {
                self.mm()?.snapshot_to(locked, &child.mm()?)?;
            }

            // The child begins execution with a snapshot of this task's thread
            // state (registers etc.).
            child.thread_state = self.thread_state.snapshot();
            Ok(())
        });

        // Take the lock on thread group and task in the correct order to ensure any wrong ordering
        // will trigger the tracing-mutex at the right call site.
        #[cfg(any(test, debug_assertions))]
        {
            let _l1 = child.thread_group().read();
            let _l2 = child.read();
        }

        Ok(child)
    }
1852
    /// Sets the stop state (per set_stopped), and also notifies all listeners,
    /// including the parent process and the tracer if appropriate.
    pub fn set_stopped_and_notify(&self, stopped: StopState, siginfo: Option<SignalInfo>) {
        let maybe_signal_info = {
            let mut state = self.write();
            // Capture this task's current state into the mutable state so
            // observers of the stop see up-to-date values.
            state.copy_state_from(self);
            state.set_stopped(stopped, siginfo, Some(self), None);
            // Returns the (tracer, signal) pair to deliver, if a tracer needs
            // to be notified of this transition.
            state.prepare_signal_info(stopped)
        };

        // Deliver the notification signal to the tracer, if it is still alive.
        if let Some((tracer, signal_info)) = maybe_signal_info {
            if let Some(tracer) = tracer.upgrade() {
                tracer.write().send_signal(signal_info);
            }
        }

        // Once the transition is complete (not merely in progress), wake any
        // parent blocked waiting on a child status change (e.g. in wait*()).
        if !stopped.is_in_progress() {
            let parent = self.thread_group().read().parent.clone();
            if let Some(parent) = parent {
                parent
                    .upgrade()
                    .write()
                    .lifecycle_waiters
                    .notify_value(ThreadGroupLifecycleWaitValue::ChildStatus);
            }
        }
    }
1880
    /// If the task is stopping, set it as stopped. return whether the caller
    /// should stop.  The task might also be waking up.
    pub fn finalize_stop_state(&mut self) -> bool {
        let stopped = self.load_stopped();

        if !stopped.is_stopping_or_stopped() {
            // If we are waking up, potentially write back state a tracer may have modified.
            let captured_state = self.write().take_captured_state();
            if let Some(captured) = captured_state {
                // Only write back if the tracer actually modified the captured
                // state while we were stopped.
                if captured.dirty {
                    self.thread_state.replace_registers(&captured.thread_state);
                }
            }
        }

        // Stopping because the thread group is stopping.
        // Try to flip to GroupStopped - will fail if we shouldn't.
        if self.thread_group().set_stopped(StopState::GroupStopped, None, true)
            == StopState::GroupStopped
        {
            let signal = self.thread_group().read().last_signal.clone();
            // stopping because the thread group has stopped
            let event = Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0));
            self.write().set_stopped(StopState::GroupStopped, signal, Some(self), event);
            return true;
        }

        // Stopping because the task is stopping
        if stopped.is_stopping_or_stopped() {
            // Finalize the in-progress stop state; on success notify listeners
            // (parent / tracer) of the completed stop.
            if let Ok(stopped) = stopped.finalize() {
                self.set_stopped_and_notify(stopped, None);
            }
            return true;
        }

        // Neither the group nor the task is stopping: the caller should not stop.
        false
    }
1918
    /// Block the execution of `current_task` as long as the task is stopped and
    /// not terminated.
    pub fn block_while_stopped(&mut self, locked: &mut Locked<Unlocked>) {
        // Upgrade the state from stopping to stopped if needed. Return if the task
        // should not be stopped.
        if !self.finalize_stop_state() {
            return;
        }

        // Use a waiter that ignores signals: a stopped task must not be woken
        // by ordinary signal delivery, only by the stop machinery.
        let waiter = Waiter::new_ignoring_signals();
        loop {
            // If we've exited, unstop the threads and return without notifying
            // waiters.
            if self.is_exitted() {
                self.thread_group().set_stopped(StopState::ForceAwake, None, false);
                self.write().set_stopped(StopState::ForceAwake, None, Some(self), None);
                return;
            }

            // Returns true once the task is no longer stopped (or has been
            // woken); otherwise registers the waiter for wakeup.
            if self.wake_or_wait_until_unstopped_async(&waiter) {
                return;
            }

            // Do the wait. Result is not needed, as this is not in a syscall.
            let _: Result<(), Errno> = waiter.wait(locked, self);

            // Maybe go from stopping to stopped, if we are currently stopping
            // again.
            self.finalize_stop_state();
        }
    }
1950
1951    /// For traced tasks, this will return the data neceessary for a cloned task
1952    /// to attach to the same tracer.
1953    pub fn get_ptrace_core_state_for_clone(
1954        &mut self,
1955        clone_args: &clone_args,
1956    ) -> (PtraceOptions, Option<PtraceCoreState>) {
1957        let state = self.write();
1958        if let Some(ptrace) = &state.ptrace {
1959            ptrace.get_core_state_for_clone(clone_args)
1960        } else {
1961            (PtraceOptions::empty(), None)
1962        }
1963    }
1964
    /// If currently being ptraced with the given option, emit the appropriate
    /// event.  PTRACE_EVENTMSG will return the given message.  Also emits the
    /// appropriate event for execve in the absence of TRACEEXEC.
    ///
    /// Note that the Linux kernel has a documented bug where, if TRACEEXIT is
    /// enabled, SIGKILL will trigger an event.  We do not exhibit this
    /// behavior.
    pub fn ptrace_event(
        &mut self,
        locked: &mut Locked<Unlocked>,
        trace_kind: PtraceOptions,
        msg: u64,
    ) {
        if !trace_kind.is_empty() {
            // Inner scope so the task's state lock is released before
            // `block_while_stopped` is called below.
            {
                let mut state = self.write();
                if let Some(ptrace) = &mut state.ptrace {
                    if !ptrace.has_option(trace_kind) {
                        // If this would be a TRACEEXEC, but TRACEEXEC is not
                        // turned on, then send a SIGTRAP.
                        if trace_kind == PtraceOptions::TRACEEXEC && !ptrace.is_seized() {
                            // Send a SIGTRAP so that the parent can gain control.
                            send_signal_first(locked, self, state, SignalInfo::default(SIGTRAP));
                        }

                        return;
                    }
                    // Encode the ptrace event in the upper bits of the SIGTRAP
                    // siginfo code, as userspace expects for ptrace stops.
                    let mut siginfo = SignalInfo::default(starnix_uapi::signals::SIGTRAP);
                    siginfo.code = (((PtraceEvent::from_option(&trace_kind) as u32) << 8)
                        | linux_uapi::SIGTRAP) as i32;
                    state.set_stopped(
                        StopState::PtraceEventStopping,
                        Some(siginfo),
                        None,
                        Some(PtraceEventData::new(trace_kind, msg)),
                    );
                } else {
                    // Not traced at all: nothing to emit.
                    return;
                }
            }
            // Remain stopped until the tracer resumes us.
            self.block_while_stopped(locked);
        }
    }
2008
2009    /// Causes the current thread's thread group to exit, notifying any ptracer
2010    /// of this task first.
2011    pub fn thread_group_exit(&mut self, locked: &mut Locked<Unlocked>, exit_status: ExitStatus) {
2012        self.ptrace_event(
2013            locked,
2014            PtraceOptions::TRACEEXIT,
2015            exit_status.signal_info_status() as u64,
2016        );
2017        self.thread_group().exit(locked, exit_status, None);
2018    }
2019
2020    /// The flags indicates only the flags as in clone3(), and does not use the low 8 bits for the
2021    /// exit signal as in clone().
2022    pub fn clone_task_for_test<L>(
2023        &self,
2024        locked: &mut Locked<L>,
2025        flags: u64,
2026        exit_signal: Option<Signal>,
2027    ) -> crate::testing::AutoReleasableTask
2028    where
2029        L: LockBefore<MmDumpable>,
2030        L: LockBefore<TaskRelease>,
2031        L: LockBefore<ProcessGroupState>,
2032    {
2033        let result = self
2034            .clone_task(
2035                locked,
2036                flags,
2037                exit_signal,
2038                UserRef::default(),
2039                UserRef::default(),
2040                UserRef::default(),
2041            )
2042            .expect("failed to create task in test");
2043
2044        result.into()
2045    }
2046
2047    // See "Ptrace access mode checking" in https://man7.org/linux/man-pages/man2/ptrace.2.html
2048    pub fn check_ptrace_access_mode<L>(
2049        &self,
2050        locked: &mut Locked<L>,
2051        mode: PtraceAccessMode,
2052        target: &Task,
2053    ) -> Result<(), Errno>
2054    where
2055        L: LockBefore<MmDumpable>,
2056    {
2057        // (1)  If the calling thread and the target thread are in the same
2058        //      thread group, access is always allowed.
2059        if self.thread_group().leader == target.thread_group().leader {
2060            return Ok(());
2061        }
2062
2063        // (2)  If the access mode specifies PTRACE_MODE_FSCREDS, then, for
2064        //      the check in the next step, employ the caller's filesystem
2065        //      UID and GID.  (As noted in credentials(7), the filesystem
2066        //      UID and GID almost always have the same values as the
2067        //      corresponding effective IDs.)
2068        //
2069        //      Otherwise, the access mode specifies PTRACE_MODE_REALCREDS,
2070        //      so use the caller's real UID and GID for the checks in the
2071        //      next step.  (Most APIs that check the caller's UID and GID
2072        //      use the effective IDs.  For historical reasons, the
2073        //      PTRACE_MODE_REALCREDS check uses the real IDs instead.)
2074        let (uid, gid) = self.with_current_creds(|creds| {
2075            if mode.contains(PTRACE_MODE_FSCREDS) {
2076                let fscred = creds.as_fscred();
2077                (fscred.uid, fscred.gid)
2078            } else if mode.contains(PTRACE_MODE_REALCREDS) {
2079                (creds.uid, creds.gid)
2080            } else {
2081                unreachable!();
2082            }
2083        });
2084
2085        // (3)  Deny access if neither of the following is true:
2086        //
2087        //      -  The real, effective, and saved-set user IDs of the target
2088        //         match the caller's user ID, and the real, effective, and
2089        //         saved-set group IDs of the target match the caller's
2090        //         group ID.
2091        //
2092        //      -  The caller has the CAP_SYS_PTRACE capability in the user
2093        //         namespace of the target.
2094        let target_creds = target.real_creds();
2095        if !(target_creds.uid == uid
2096            && target_creds.euid == uid
2097            && target_creds.saved_uid == uid
2098            && target_creds.gid == gid
2099            && target_creds.egid == gid
2100            && target_creds.saved_gid == gid)
2101        {
2102            security::check_task_capable(self, CAP_SYS_PTRACE)?;
2103        }
2104
2105        // (4)  Deny access if the target process "dumpable" attribute has a
2106        //      value other than 1 (SUID_DUMP_USER; see the discussion of
2107        //      PR_SET_DUMPABLE in prctl(2)), and the caller does not have
2108        //      the CAP_SYS_PTRACE capability in the user namespace of the
2109        //      target process.
2110        let dumpable = *target.mm()?.dumpable.lock(locked);
2111        match dumpable {
2112            DumpPolicy::User => (),
2113            DumpPolicy::Disable => security::check_task_capable(self, CAP_SYS_PTRACE)?,
2114        }
2115
2116        // (5)  The kernel LSM security_ptrace_access_check() interface is
2117        //      invoked to see if ptrace access is permitted.
2118        security::ptrace_access_check(self, target, mode)?;
2119
2120        // (6)  If access has not been denied by any of the preceding steps,
2121        //      then access is allowed.
2122        Ok(())
2123    }
2124
2125    pub fn can_signal(
2126        &self,
2127        target: &Task,
2128        unchecked_signal: UncheckedSignal,
2129    ) -> Result<(), Errno> {
2130        // If both the tasks share a thread group the signal can be sent. This is not documented
2131        // in kill(2) because kill does not support task-level granularity in signal sending.
2132        if self.thread_group == target.thread_group {
2133            return Ok(());
2134        }
2135
2136        let (target_uid, target_saved_uid) =
2137            target.with_real_creds(|creds| (creds.uid, creds.saved_uid));
2138        if self.with_current_creds(|creds| {
2139            // From https://man7.org/linux/man-pages/man2/kill.2.html:
2140            //
2141            // > For a process to have permission to send a signal, it must either be
2142            // > privileged (under Linux: have the CAP_KILL capability in the user
2143            // > namespace of the target process), or the real or effective user ID of
2144            // > the sending process must equal the real or saved set- user-ID of the
2145            // > target process.
2146            //
2147            // Returns true if the credentials are considered to have the same user ID.
2148            creds.euid == target_saved_uid
2149                || creds.euid == target_uid
2150                || creds.uid == target_uid
2151                || creds.uid == target_saved_uid
2152        }) {
2153            return Ok(());
2154        }
2155
2156        if Signal::try_from(unchecked_signal) == Ok(SIGCONT) {
2157            let target_session = target.thread_group().read().process_group.session.leader;
2158            let self_session = self.thread_group().read().process_group.session.leader;
2159            if target_session == self_session {
2160                return Ok(());
2161            }
2162        }
2163
2164        security::check_task_capable(self, CAP_KILL)
2165    }
2166}
2167
impl ArchSpecific for CurrentTask {
    fn is_arch32(&self) -> bool {
        // Delegates to the thread state, which carries the task's architecture
        // width.
        self.thread_state.is_arch32()
    }
}
2173
2174impl MemoryAccessor for CurrentTask {
2175    fn read_memory<'a>(
2176        &self,
2177        addr: UserAddress,
2178        bytes: &'a mut [MaybeUninit<u8>],
2179    ) -> Result<&'a mut [u8], Errno> {
2180        self.mm()?.unified_read_memory(self, addr, bytes)
2181    }
2182
2183    fn read_memory_partial_until_null_byte<'a>(
2184        &self,
2185        addr: UserAddress,
2186        bytes: &'a mut [MaybeUninit<u8>],
2187    ) -> Result<&'a mut [u8], Errno> {
2188        self.mm()?.unified_read_memory_partial_until_null_byte(self, addr, bytes)
2189    }
2190
2191    fn read_memory_partial<'a>(
2192        &self,
2193        addr: UserAddress,
2194        bytes: &'a mut [MaybeUninit<u8>],
2195    ) -> Result<&'a mut [u8], Errno> {
2196        self.mm()?.unified_read_memory_partial(self, addr, bytes)
2197    }
2198
2199    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
2200        self.mm()?.unified_write_memory(self, addr, bytes)
2201    }
2202
2203    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
2204        self.mm()?.unified_write_memory_partial(self, addr, bytes)
2205    }
2206
2207    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
2208        self.mm()?.unified_zero(self, addr, length)
2209    }
2210}
2211
2212impl TaskMemoryAccessor for CurrentTask {
2213    fn maximum_valid_address(&self) -> Option<UserAddress> {
2214        self.mm().ok().map(|mm| mm.maximum_valid_user_address)
2215    }
2216}
2217
/// Outcome of handling a Zircon exception on behalf of a task.
pub enum ExceptionResult {
    /// The exception was handled and no further action is required.
    Handled,

    /// The exception generated a signal that should be delivered.
    Signal(SignalInfo),
}
2225
#[cfg(test)]
mod tests {
    use crate::testing::spawn_kernel_and_run;

    // This test will run `override_creds` and check it doesn't crash. This ensures that the
    // delegation to `override_creds_async` is correct.
    #[::fuchsia::test]
    async fn test_override_creds_can_delegate_to_async_version() {
        spawn_kernel_and_run(async move |_, current_task| {
            // The closure's return value (0) must round-trip unchanged.
            assert_eq!(current_task.override_creds(|_| {}, || 0), 0);
        })
        .await;
    }
}