Skip to main content

starnix_core/task/
current_task.rs

1// Copyright 2023 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::arch::task::{decode_page_fault_exception_report, get_signal_for_general_exception};
6use crate::execution::{TaskInfo, create_zircon_process};
7use crate::mm::{DumpPolicy, MemoryAccessor, MemoryAccessorExt, MemoryManager, TaskMemoryAccessor};
8use crate::ptrace::{PtraceCoreState, PtraceEvent, PtraceEventData, PtraceOptions, StopState};
9use crate::security;
10use crate::signals::{RunState, SignalDetail, SignalInfo, send_signal_first, send_standard_signal};
11use crate::task::loader::{ResolvedElf, load_executable, resolve_executable};
12use crate::task::waiter::WaiterOptions;
13use crate::task::{
14    ExitStatus, RobustListHeadPtr, SeccompFilter, SeccompFilterContainer, SeccompNotifierHandle,
15    SeccompState, SeccompStateValue, Task, TaskFlags, TaskLiveState, ThreadState, Waiter,
16};
17use crate::vfs::{
18    CheckAccessReason, FdFlags, FdNumber, FileHandle, FsContext, FsStr, LookupContext, LookupVec,
19    MAX_SYMLINK_FOLLOWS, NamespaceNode, ResolveBase, SymlinkMode, SymlinkTarget, new_pidfd,
20};
21use fuchsia_rcu::RcuReadGuard;
22use futures::FutureExt;
23use linux_uapi::CLONE_PIDFD;
24use starnix_logging::{
25    CATEGORY_STARNIX, log_error, log_warn, trace_duration, track_file_not_found, track_stub,
26};
27use starnix_registers::{HeapRegs, RegisterStorageEnum};
28use starnix_stack::clean_stack;
29use starnix_sync::{
30    EventWaitGuard, FileOpsCore, LockBefore, LockEqualOrBefore, Locked, MmDumpable,
31    ProcessGroupState, TaskRelease, Unlocked, WakeReason,
32};
33use starnix_syscalls::SyscallResult;
34use starnix_syscalls::decls::Syscall;
35use starnix_task_command::TaskCommand;
36use starnix_types::futex_address::FutexAddress;
37use starnix_types::ownership::{OwnedRef, Releasable, TempRef, WeakRef, release_on_error};
38use starnix_uapi::auth::{
39    CAP_KILL, CAP_SYS_ADMIN, CAP_SYS_PTRACE, Credentials, FsCred, PTRACE_MODE_FSCREDS,
40    PTRACE_MODE_REALCREDS, PtraceAccessMode, UserAndOrGroupId,
41};
42use starnix_uapi::device_id::DeviceId;
43use starnix_uapi::errors::Errno;
44use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
45use starnix_uapi::open_flags::OpenFlags;
46use starnix_uapi::signals::{
47    SIGBUS, SIGCHLD, SIGCONT, SIGILL, SIGKILL, SIGSEGV, SIGSYS, SIGTRAP, SigSet, Signal,
48    UncheckedSignal,
49};
50use starnix_uapi::user_address::{ArchSpecific, UserAddress, UserRef};
51use starnix_uapi::vfs::ResolveFlags;
52use starnix_uapi::{
53    CLONE_CHILD_CLEARTID, CLONE_CHILD_SETTID, CLONE_FILES, CLONE_FS, CLONE_INTO_CGROUP,
54    CLONE_NEWUTS, CLONE_PARENT, CLONE_PARENT_SETTID, CLONE_PTRACE, CLONE_SETTLS, CLONE_SIGHAND,
55    CLONE_SYSVSEM, CLONE_THREAD, CLONE_VFORK, CLONE_VM, FUTEX_OWNER_DIED, FUTEX_TID_MASK,
56    ROBUST_LIST_LIMIT, SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_NEW_LISTENER,
57    SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_TSYNC_ESRCH, clone_args, errno, error, pid_t,
58    sock_filter, ucred,
59};
60use std::cell::{Ref, RefCell};
61use std::collections::VecDeque;
62use std::ffi::CString;
63use std::fmt;
64use std::marker::PhantomData;
65use std::mem::MaybeUninit;
66use std::sync::Arc;
67use zx::sys::zx_restricted_state_t;
68
69use super::ThreadGroupLifecycleWaitValue;
70
/// Builder holding a task that is being constructed, before it becomes the
/// `CurrentTask` of a running thread.
pub struct TaskBuilder {
    /// The underlying task object.
    pub task: OwnedRef<Task>,

    /// Thread state for the task under construction, with registers stored on
    /// the heap (converted into the running representation by `From<TaskBuilder>`).
    pub thread_state: ThreadState<HeapRegs>,
}
77
78impl TaskBuilder {
79    pub fn new(task: OwnedRef<Task>) -> Self {
80        Self { task, thread_state: Default::default() }
81    }
82
83    #[inline(always)]
84    pub fn release<L>(self, locked: &mut Locked<L>)
85    where
86        L: LockBefore<TaskRelease>,
87    {
88        let locked = locked.cast_locked::<TaskRelease>();
89        Releasable::release(self, locked);
90    }
91}
92
93impl From<TaskBuilder> for CurrentTask {
94    fn from(builder: TaskBuilder) -> Self {
95        Self::new(builder.task, builder.thread_state.into())
96    }
97}
98
impl Releasable for TaskBuilder {
    type Context<'a> = &'a mut Locked<TaskRelease>;

    /// Tears down a task that never started running: removes it from its
    /// thread group while the pid-table entry is still valid, then releases
    /// the task together with its thread state.
    fn release<'a>(self, locked: Self::Context<'a>) {
        let kernel = Arc::clone(self.kernel());
        let mut pids = kernel.pids.write();

        // We remove from the thread group here because the WeakRef in the pid
        // table to this task must be valid until this task is removed from the
        // thread group, and the code below will invalidate it.
        // Moreover, this requires a OwnedRef of the task to ensure the tasks of
        // the thread group are always valid.
        self.task.thread_group().remove(locked, &mut pids, &self.task);

        let context = (self.thread_state.into(), locked, pids);
        self.task.release(context);
    }
}
117
118impl std::ops::Deref for TaskBuilder {
119    type Target = Task;
120    fn deref(&self) -> &Self::Target {
121        &self.task
122    }
123}
124
/// The task object associated with the currently executing thread.
///
/// We often pass the `CurrentTask` as the first argument to functions if those functions need to
/// know contextual information about the thread on which they are running. For example, we often
/// use the `CurrentTask` to perform access checks, which ensures that the caller is authorized to
/// perform the requested operation.
///
/// The `CurrentTask` also has state that can be referenced only on the currently executing thread,
/// such as the register state for that thread. Syscalls are given a mutable references to the
/// `CurrentTask`, which lets them manipulate this state.
///
/// See also `Task` for more information about tasks.
pub struct CurrentTask {
    /// The underlying task object.
    pub task: OwnedRef<Task>,

    /// Per-thread execution state (registers, syscall restart closure, …) for
    /// the thread running this task.
    pub thread_state: ThreadState<RegisterStorageEnum>,

    /// The current subjective credentials of the task.
    // TODO(https://fxbug.dev/433548348): Avoid interior mutability here by passing a
    // &mut CurrentTask around instead of &CurrentTask.
    pub current_creds: RefCell<CurrentCreds>,

    /// Security-module (LSM) state scoped to the currently executing thread.
    pub security_state: security::CurrentTaskState,

    /// Makes CurrentTask neither Sync not Send.
    _local_marker: PhantomData<*mut u8>,
}
153
/// Represents the current state of the task's subjective credentials.
pub enum CurrentCreds {
    /// The task does not have overridden credentials, the subjective creds are identical to the
    /// objective creds stored in the Task. Since credentials are often accessed from the current
    /// task, we hold a reference here that does not necessitate going through the RCU machinery to
    /// read.
    Cached(Arc<Credentials>),
    /// The task has overridden subjective credentials, installed temporarily via
    /// `CurrentTask::override_creds`/`override_creds_async` and restored afterwards.
    Overridden(Arc<Credentials>),
}
164
165impl CurrentCreds {
166    fn creds(&self) -> &Arc<Credentials> {
167        match self {
168            CurrentCreds::Cached(creds) => creds,
169            CurrentCreds::Overridden(creds) => creds,
170        }
171    }
172}
173
impl Releasable for CurrentTask {
    type Context<'a> = &'a mut Locked<TaskRelease>;

    /// Tears down the task at thread exit: runs the userspace-visible exit
    /// steps (robust futex list, clear_child_tid) first, then removes the task
    /// from its thread group and releases it.
    fn release<'a>(self, locked: Self::Context<'a>) {
        self.notify_robust_list();
        // Best effort: a failure to clear the child tid is deliberately ignored.
        let _ignored = self.clear_child_tid_if_needed(locked);

        let kernel = Arc::clone(self.kernel());
        let mut pids = kernel.pids.write();

        // We remove from the thread group here because the WeakRef in the pid
        // table to this task must be valid until this task is removed from the
        // thread group, and the code below will invalidate it.
        // Moreover, this requires a OwnedRef of the task to ensure the tasks of
        // the thread group are always valid.
        self.task.thread_group().remove(locked, &mut pids, &self.task);

        let context = (self.thread_state, locked, pids);
        self.task.release(context);
    }
}
195
196impl std::ops::Deref for CurrentTask {
197    type Target = Task;
198    fn deref(&self) -> &Self::Target {
199        &self.task
200    }
201}
202
impl fmt::Debug for CurrentTask {
    /// Formats identically to the underlying task's `Debug` output.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.task.fmt(f)
    }
}
208
209impl CurrentTask {
210    pub fn new(task: OwnedRef<Task>, thread_state: ThreadState<RegisterStorageEnum>) -> Self {
211        let current_creds = RefCell::new(CurrentCreds::Cached(task.clone_creds()));
212        Self {
213            task,
214            thread_state,
215            current_creds,
216            security_state: Default::default(),
217            _local_marker: Default::default(),
218        }
219    }
220
221    /// Returns the [`TaskLiveState`] for the [`Task`].
222    ///
223    /// # Panics
224    ///
225    /// Calling `live()` on a [`CurrentTask`] for which the [`Task`] has no live state (i.e.
226    /// zombie tasks) panics. However, such tasks should not have a `CurrentTask`.
227    #[track_caller]
228    pub fn live(&self) -> RcuReadGuard<TaskLiveState> {
229        self.task.live().expect("CurrentTask must have TaskLiveState")
230    }
231
    /// Returns this task's `FsContext` (the source of its root and cwd).
    pub fn fs(&self) -> Arc<FsContext> {
        self.live().fs()
    }
235
    /// Heuristically reports whether this task's `FsContext` is shared with
    /// another task (as happens with CLONE_FS).
    pub fn has_shared_fs(&self) -> bool {
        let fs = self.fs();
        // This check is incorrect because someone else could be holding a temporary Arc to the
        // FsContext and therefore increasing the strong count.
        // (The `2` accounts for the task's own reference plus the local `fs` clone.)
        Arc::strong_count(&fs) > 2usize
    }
242
243    pub fn unshare_fs(&self) {
244        let new_fs = self.fs().fork();
245        self.live().fs.update(new_fs);
246    }
247
    /// Returns the current subjective credentials of the task.
    ///
    /// The subjective credentials are the credentials that are used to check permissions for
    /// actions performed by the task.
    ///
    /// The returned `Ref` keeps the internal `RefCell` borrowed; drop it before
    /// calling anything that mutates the creds (e.g. `set_creds`).
    pub fn current_creds(&self) -> Ref<'_, Arc<Credentials>> {
        Ref::map(self.current_creds.borrow(), CurrentCreds::creds)
    }
255
    /// Returns the subjective credentials converted to `FsCred` form (via
    /// `Credentials::as_fscred`) for file-system permission checks.
    pub fn current_fscred(&self) -> FsCred {
        self.current_creds().as_fscred()
    }
259
260    pub fn current_ucred(&self) -> ucred {
261        let creds = self.current_creds();
262        ucred { pid: self.get_pid(), uid: creds.uid, gid: creds.gid }
263    }
264
    /// Temporarily replaces the task's subjective credentials with `new_creds`
    /// for the duration of `callback`, then restores the previous credentials.
    ///
    /// Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and
    /// used to check permissions for actions performed by the task, is altered. The "objective"
    /// state, accessed through `Task::real_creds()` by other tasks and used to check permissions
    /// for actions performed on the task, is not altered, and changes to the credentials are not
    /// externally visible.
    ///
    /// NOTE(review): if the future returned by this function is dropped before
    /// completion, the saved credentials are not restored — confirm callers
    /// always poll to completion.
    pub async fn override_creds_async<R>(
        &self,
        new_creds: Arc<Credentials>,
        callback: impl AsyncFnOnce() -> R,
    ) -> R {
        let saved = self.current_creds.replace(CurrentCreds::Overridden(new_creds));
        let result = callback().await;
        self.current_creds.replace(saved);
        result
    }
283
    /// Synchronous wrapper around `override_creds_async`: temporarily replaces
    /// the task's subjective credentials with `new_creds` while `callback` runs,
    /// then restores the previous credentials.
    ///
    /// Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and
    /// used to check permissions for actions performed by the task, is altered. The "objective"
    /// state, accessed through `Task::real_creds()` by other tasks and used to check permissions
    /// for actions performed on the task, is not altered, and changes to the credentials are not
    /// externally visible.
    pub fn override_creds<R>(
        &self,
        new_creds: Arc<Credentials>,
        callback: impl FnOnce() -> R,
    ) -> R {
        // The wrapped future never awaits, so `now_or_never` always succeeds.
        self.override_creds_async(new_creds, async move || callback())
            .now_or_never()
            .expect("Future should be ready")
    }
301
    /// Returns true while temporary subjective credentials (installed by
    /// `override_creds`/`override_creds_async`) are in effect.
    pub fn has_overridden_creds(&self) -> bool {
        matches!(*self.current_creds.borrow(), CurrentCreds::Overridden(_))
    }
305
306    pub fn trigger_delayed_releaser<L>(&self, locked: &mut Locked<L>)
307    where
308        L: LockEqualOrBefore<FileOpsCore>,
309    {
310        let locked = locked.cast_locked::<FileOpsCore>();
311        self.kernel().delayed_releaser.apply(locked, self);
312    }
313
    /// Returns a weak reference to the underlying `Task`.
    pub fn weak_task(&self) -> WeakRef<Task> {
        WeakRef::from(&self.task)
    }
317
    /// Returns a temporary strong reference to the underlying `Task`, bounded
    /// by this `CurrentTask`'s lifetime.
    pub fn temp_task(&self) -> TempRef<'_, Task> {
        TempRef::from(&self.task)
    }
321
    /// Change the current and real creds of the task. This is invalid to call while temporary
    /// credentials are present.
    ///
    /// Updates the cached subjective creds, the task's persistent (objective)
    /// creds, and — because euid/egid may have changed — the ownership of the
    /// cached /proc/pid directory node.
    pub fn set_creds(&self, creds: Credentials) {
        // Overridden creds would be silently clobbered by the cache update below.
        assert!(!self.has_overridden_creds());

        let creds = Arc::new(creds);
        let mut current_creds = self.current_creds.borrow_mut();
        *current_creds = CurrentCreds::Cached(creds.clone());

        // SAFETY: this is allowed because we are the CurrentTask.
        unsafe {
            self.persistent_info.write_creds().update(creds);
        }
        // The /proc/pid directory's ownership is updated when the task's euid
        // or egid changes. See proc(5).
        let maybe_node = self.proc_pid_directory_cache.lock();
        if let Some(node) = &*maybe_node {
            let creds = self.real_creds().euid_as_fscred();
            // SAFETY: The /proc/pid directory held by `proc_pid_directory_cache` represents the
            // current task. It's owner and group are supposed to track the current task's euid and
            // egid.
            unsafe {
                node.force_chown(creds);
            }
        }
    }
348
    /// Releases this `CurrentTask`, narrowing the lock level to `TaskRelease`
    /// first. The actual teardown lives in the `Releasable` impl.
    #[inline(always)]
    pub fn release<L>(self, locked: &mut Locked<L>)
    where
        L: LockBefore<TaskRelease>,
    {
        let locked = locked.cast_locked::<TaskRelease>();
        Releasable::release(self, locked);
    }
357
358    pub fn set_syscall_restart_func<R: Into<SyscallResult>>(
359        &mut self,
360        f: impl FnOnce(&mut Locked<Unlocked>, &mut CurrentTask) -> Result<R, Errno>
361        + Send
362        + Sync
363        + 'static,
364    ) {
365        self.thread_state.syscall_restart_func =
366            Some(Box::new(|locked, current_task| Ok(f(locked, current_task)?.into())));
367    }
368
    /// Installs `file` in this task's fd table with the given fd flags and
    /// returns the newly allocated fd number.
    pub fn add_file<L>(
        &self,
        locked: &mut Locked<L>,
        file: FileHandle,
        flags: FdFlags,
    ) -> Result<FdNumber, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.live().files.add(locked, self, file, flags)
    }
380
    /// Returns the file registered at `fd` in this task's fd table.
    /// (Presumably rejects O_PATH descriptors, unlike
    /// `get_file_allowing_opath` — confirm against `FdTable::get`.)
    pub fn get_file(&self, fd: FdNumber) -> Result<FileHandle, Errno> {
        self.live().files.get(fd)
    }
384
    /// Returns the file registered at `fd`, including descriptors opened with
    /// O_PATH.
    pub fn get_file_allowing_opath(&self, fd: FdNumber) -> Result<FileHandle, Errno> {
        self.live().files.get_allowing_opath(fd)
    }
388
    /// Sets the task's signal mask to `signal_mask` and runs `wait_function`.
    ///
    /// Signals are dequeued prior to the original signal mask being restored. This is done by the
    /// signal machinery in the syscall dispatch loop.
    ///
    /// The returned result is the result returned from the wait function.
    pub fn wait_with_temporary_mask<F, T, L>(
        &mut self,
        locked: &mut Locked<L>,
        signal_mask: SigSet,
        wait_function: F,
    ) -> Result<T, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
        F: FnOnce(&mut Locked<L>, &CurrentTask) -> Result<T, Errno>,
    {
        {
            // Flag the mask as temporary so the dispatch loop knows to restore
            // the original mask after dequeuing signals.
            let mut state = self.write();
            state.set_flags(TaskFlags::TEMPORARY_SIGNAL_MASK, true);
            state.set_temporary_signal_mask(signal_mask);
        }
        wait_function(locked, self)
    }
412
    /// If waking, promotes from waking to awake.  If not waking, make waiter async
    /// wait until woken.  Returns true if woken.
    ///
    /// Note: when returning `false`, this only *registers* `waiter` with the
    /// relevant wait queues; the caller is responsible for actually blocking.
    pub fn wake_or_wait_until_unstopped_async(&self, waiter: &Waiter) -> bool {
        // Lock order: thread-group state before task state.
        let group_state = self.thread_group().read();
        let mut task_state = self.write();

        // Wake up if
        //   a) we should wake up, meaning:
        //      i) we're in group stop, and the thread group has exited group stop, or
        //      ii) we're waking up,
        //   b) and ptrace isn't stopping us from waking up, but
        //   c) always wake up if we got a SIGKILL.
        let task_stop_state = self.load_stopped();
        let group_stop_state = self.thread_group().load_stopped();
        if ((task_stop_state == StopState::GroupStopped && group_stop_state.is_waking_or_awake())
            || task_stop_state.is_waking_or_awake())
            && (!task_state.is_ptrace_listening() || task_stop_state.is_force())
        {
            // Prefer the task-level transition; fall back to the group's.
            let new_state = if task_stop_state.is_waking_or_awake() {
                task_stop_state.finalize()
            } else {
                group_stop_state.finalize()
            };
            if let Ok(new_state) = new_state {
                task_state.set_stopped(new_state, None, Some(self), None);
                // Both locks must be dropped before updating the thread group.
                drop(group_state);
                drop(task_state);
                // It is possible for the stop state to be changed by another
                // thread between when it is checked above and the following
                // invocation, but set_stopped does sufficient checking while
                // holding the lock to make sure that such a change won't result
                // in corrupted state.
                self.thread_group().set_stopped(new_state, None, false);
                return true;
            }
        }

        // We will wait.
        if self.thread_group().load_stopped().is_stopped() || task_stop_state.is_stopped() {
            // If we've stopped or PTRACE_LISTEN has been sent, wait for a
            // signal or instructions from the tracer.
            group_state
                .lifecycle_waiters
                .wait_async_value(&waiter, ThreadGroupLifecycleWaitValue::Stopped);
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.can_accept_ptrace_commands() {
            // If we're stopped because a tracer has seen the stop and not taken
            // further action, wait for further instructions from the tracer.
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.is_ptrace_listening() {
            // A PTRACE_LISTEN is a state where we can get signals and notify a
            // ptracer, but otherwise remain blocked.
            if let Some(ptrace) = &mut task_state.ptrace {
                ptrace.set_last_signal(Some(SignalInfo::kernel(SIGTRAP)));
                ptrace.set_last_event(Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0)));
            }
            task_state.wait_on_ptracer(&waiter);
            task_state.notify_ptracers();
        }
        false
    }
474
    /// Set the RunState for the current task to the given value and then call the given callback.
    ///
    /// When the callback is done, the run_state is restored to `RunState::Running`.
    ///
    /// This function is typically used just before blocking the current task on some operation.
    /// The given `run_state` registers the mechanism for interrupting the blocking operation with
    /// the task and the given `callback` actually blocks the task.
    ///
    /// This function can only be called in the `RunState::Running` state and cannot set the
    /// run state to `RunState::Running`. For this reason, this function cannot be reentered.
    pub fn run_in_state<F, T>(&self, run_state: RunState, callback: F) -> Result<T, Errno>
    where
        F: FnOnce() -> Result<T, Errno>,
    {
        assert_ne!(run_state, RunState::Running);

        // As an optimization, decommit unused pages of the stack to reduce memory pressure while
        // the thread is blocked.
        clean_stack();

        {
            let mut state = self.write();
            // Reentering (already blocked) would violate the contract above.
            assert!(!state.is_blocked());

            if matches!(run_state, RunState::Frozen(_)) {
                // Freeze is a kernel signal and is handled before other user signals. A frozen task
                // ignores all other signals except SIGKILL until it is thawed.
                if state.has_signal_pending(SIGKILL) {
                    return error!(EINTR);
                }
            } else if state.is_any_signal_pending() && !state.is_ptrace_listening() {
                // A note on PTRACE_LISTEN - the thread cannot be scheduled
                // regardless of pending signals.
                return error!(EINTR);
            }
            state.set_run_state(run_state.clone());
        }

        let result = callback();

        {
            // Restore Running; the run state must not have changed underneath us.
            let mut state = self.write();
            assert_eq!(
                state.run_state(),
                run_state,
                "SignalState run state changed while waiting!"
            );
            state.set_run_state(RunState::Running);
        };

        result
    }
527
528    pub fn block_until(
529        &self,
530        guard: EventWaitGuard<'_>,
531        deadline: zx::MonotonicInstant,
532    ) -> Result<(), Errno> {
533        self.run_in_state(RunState::Event(guard.event().clone()), move || {
534            guard.block_until(None, deadline).map_err(|e| match e {
535                WakeReason::Interrupted => errno!(EINTR),
536                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
537            })
538        })
539    }
540
541    pub fn block_with_owner_until(
542        &self,
543        guard: EventWaitGuard<'_>,
544        new_owner: &zx::Thread,
545        deadline: zx::MonotonicInstant,
546    ) -> Result<(), Errno> {
547        self.run_in_state(RunState::Event(guard.event().clone()), move || {
548            guard.block_until(Some(new_owner), deadline).map_err(|e| match e {
549                WakeReason::Interrupted => errno!(EINTR),
550                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
551            })
552        })
553    }
554
    /// Determine namespace node indicated by the dir_fd.
    ///
    /// Returns the namespace node and the path to use relative to that node.
    ///
    /// Absolute paths resolve from the task's root (unless `RESOLVE_IN_ROOT`
    /// keeps them relative to `dir_fd`); `AT_FDCWD` resolves from the cwd;
    /// otherwise the node comes from the file open at `dir_fd`.
    pub fn resolve_dir_fd<'a, L>(
        &self,
        locked: &mut Locked<L>,
        dir_fd: FdNumber,
        mut path: &'a FsStr,
        flags: ResolveFlags,
    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let path_is_absolute = path.starts_with(b"/");
        if path_is_absolute {
            // RESOLVE_BENEATH forbids escaping the starting directory, which an
            // absolute path would do.
            if flags.contains(ResolveFlags::BENEATH) {
                return error!(EXDEV);
            }
            path = &path[1..];
        }

        let dir = if path_is_absolute && !flags.contains(ResolveFlags::IN_ROOT) {
            self.fs().root()
        } else if dir_fd == FdNumber::AT_FDCWD {
            self.fs().cwd()
        } else {
            // O_PATH allowed for:
            //
            //   Passing the file descriptor as the dirfd argument of
            //   openat() and the other "*at()" system calls.  This
            //   includes linkat(2) with AT_EMPTY_PATH (or via procfs
            //   using AT_SYMLINK_FOLLOW) even if the file is not a
            //   directory.
            //
            // See https://man7.org/linux/man-pages/man2/open.2.html
            let file = self.get_file_allowing_opath(dir_fd)?;
            file.name.to_passive()
        };

        if !path.is_empty() {
            // A non-empty relative path requires `dir` to be a searchable directory.
            if !dir.entry.node.is_dir() {
                return error!(ENOTDIR);
            }
            dir.check_access(
                locked,
                self,
                Access::EXEC,
                CheckAccessReason::InternalPermissionChecks,
            )?;
        }
        Ok((dir, path.into()))
    }
607
    /// A convenient wrapper for opening files relative to FdNumber::AT_FDCWD.
    ///
    /// Returns a FileHandle but does not install the FileHandle in the FdTable
    /// for this task.
    ///
    /// Rejects `O_CREAT` with EINVAL, since creation would require a `FileMode`
    /// argument; use `open_file_at` for that.
    pub fn open_file(
        &self,
        locked: &mut Locked<Unlocked>,
        path: &FsStr,
        flags: OpenFlags,
    ) -> Result<FileHandle, Errno> {
        if flags.contains(OpenFlags::CREAT) {
            // In order to support OpenFlags::CREAT we would need to take a
            // FileMode argument.
            return error!(EINVAL);
        }
        self.open_file_at(
            locked,
            FdNumber::AT_FDCWD,
            path,
            flags,
            FileMode::default(),
            ResolveFlags::empty(),
            AccessCheck::default(),
        )
    }
633
    /// Resolves a path for open.
    ///
    /// If the final path component points to a symlink, the symlink is followed (as long as
    /// the symlink traversal limit has not been reached).
    ///
    /// If the final path component (after following any symlinks, if enabled) does not exist,
    /// and `flags` contains `OpenFlags::CREAT`, a new node is created at the location of the
    /// final path component.
    ///
    /// This returns the resolved node, and a boolean indicating whether the node has been created.
    fn resolve_open_path<L>(
        &self,
        locked: &mut Locked<L>,
        context: &mut LookupContext,
        dir: &NamespaceNode,
        path: &FsStr,
        mode: FileMode,
        flags: OpenFlags,
    ) -> Result<(NamespaceNode, bool), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        context.update_for_path(path);
        // Resolve everything up to the final component, following symlinks in
        // the intermediate components regardless of the caller's symlink mode.
        let mut parent_content = context.with(SymlinkMode::Follow);
        let (parent, basename) = self.lookup_parent(locked, &mut parent_content, dir, path)?;
        // Carry the consumed follow budget back into the caller's context.
        context.remaining_follows = parent_content.remaining_follows;

        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);

        // Lookup the child, without following a symlink or expecting it to be a directory.
        let mut child_context = context.with(SymlinkMode::NoFollow);
        child_context.must_be_directory = false;

        match parent.lookup_child(locked, self, &mut child_context, basename) {
            Ok(name) => {
                if name.entry.node.is_lnk() {
                    if flags.contains(OpenFlags::PATH)
                        && context.symlink_mode == SymlinkMode::NoFollow
                    {
                        // When O_PATH is specified in flags, if pathname is a symbolic link
                        // and the O_NOFOLLOW flag is also specified, then the call returns
                        // a file descriptor referring to the symbolic link.
                        // See https://man7.org/linux/man-pages/man2/openat.2.html
                        //
                        // If the trailing component (i.e., basename) of
                        // pathname is a symbolic link, how.resolve contains
                        // RESOLVE_NO_SYMLINKS, and how.flags contains both
                        // O_PATH and O_NOFOLLOW, then an O_PATH file
                        // descriptor referencing the symbolic link will be
                        // returned.
                        // See https://man7.org/linux/man-pages/man2/openat2.2.html
                        return Ok((name, false));
                    }

                    if (!flags.contains(OpenFlags::PATH)
                        && context.symlink_mode == SymlinkMode::NoFollow)
                        || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
                        || context.remaining_follows == 0
                    {
                        if must_create {
                            // Since `must_create` is set, and a node was found, this returns EEXIST
                            // instead of ELOOP.
                            return error!(EEXIST);
                        }
                        // A symlink was found, but one of the following is true:
                        // * flags specified O_NOFOLLOW but not O_PATH.
                        // * how.resolve contains RESOLVE_NO_SYMLINKS
                        // * too many symlink traversals have been attempted
                        return error!(ELOOP);
                    }

                    // Consume one unit of the follow budget and recurse on the
                    // symlink target.
                    context.remaining_follows -= 1;
                    match name.readlink(locked, self)? {
                        SymlinkTarget::Path(path) => {
                            // Absolute targets restart from the task's root.
                            let dir = if path[0] == b'/' { self.fs().root() } else { parent };
                            self.resolve_open_path(
                                locked,
                                context,
                                &dir,
                                path.as_ref(),
                                mode,
                                flags,
                            )
                        }
                        SymlinkTarget::Node(name) => {
                            // Magic links (e.g. /proc/*/fd entries) resolve directly to a node.
                            if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS)
                                || name.entry.node.is_lnk()
                            {
                                error!(ELOOP)
                            } else {
                                Ok((name, false))
                            }
                        }
                    }
                } else {
                    if must_create {
                        return error!(EEXIST);
                    }
                    Ok((name, false))
                }
            }
            Err(e) if e == errno!(ENOENT) && flags.contains(OpenFlags::CREAT) => {
                // Final component is missing and O_CREAT was requested: create
                // a regular file there (unless a directory was required).
                if context.must_be_directory {
                    return error!(EISDIR);
                }
                Ok((
                    parent.open_create_node(
                        locked,
                        self,
                        basename,
                        mode.with_type(FileMode::IFREG),
                        DeviceId::NONE,
                        flags,
                    )?,
                    true,
                ))
            }
            Err(e) => Err(e),
        }
    }
754
755    /// The primary entry point for opening files relative to a task.
756    ///
757    /// Absolute paths are resolve relative to the root of the FsContext for
758    /// this task. Relative paths are resolve relative to dir_fd. To resolve
759    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
760    /// dir_fd.
761    ///
762    /// Returns a FileHandle but does not install the FileHandle in the FdTable
763    /// for this task.
764    pub fn open_file_at(
765        &self,
766        locked: &mut Locked<Unlocked>,
767        dir_fd: FdNumber,
768        path: &FsStr,
769        flags: OpenFlags,
770        mode: FileMode,
771        resolve_flags: ResolveFlags,
772        access_check: AccessCheck,
773    ) -> Result<FileHandle, Errno> {
774        if path.is_empty() {
775            return error!(ENOENT);
776        }
777
778        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, resolve_flags)?;
779        self.open_namespace_node_at(locked, dir, path, flags, mode, resolve_flags, access_check)
780    }
781
    /// Opens `path` relative to the already-resolved `dir` node.
    ///
    /// This is the second half of `open_file_at`: it normalizes the open
    /// flags (O_LARGEFILE forcing, O_PATH masking), validates O_TMPFILE and
    /// RESOLVE_BENEATH/RESOLVE_IN_ROOT combinations, resolves the path,
    /// creates or truncates the node where the flags require it, and finally
    /// opens the node — skipping the access check when the node was just
    /// created.
    pub fn open_namespace_node_at(
        &self,
        locked: &mut Locked<Unlocked>,
        dir: NamespaceNode,
        path: &FsStr,
        flags: OpenFlags,
        mode: FileMode,
        mut resolve_flags: ResolveFlags,
        access_check: AccessCheck,
    ) -> Result<FileHandle, Errno> {
        // 64-bit kernels force the O_LARGEFILE flag to be on.
        let mut flags = flags | OpenFlags::LARGEFILE;
        let opath = flags.contains(OpenFlags::PATH);
        if opath {
            // When O_PATH is specified in flags, flag bits other than O_CLOEXEC,
            // O_DIRECTORY, and O_NOFOLLOW are ignored.
            const ALLOWED_FLAGS: OpenFlags = OpenFlags::from_bits_truncate(
                OpenFlags::PATH.bits()
                    | OpenFlags::CLOEXEC.bits()
                    | OpenFlags::DIRECTORY.bits()
                    | OpenFlags::NOFOLLOW.bits(),
            );
            flags &= ALLOWED_FLAGS;
        }

        // O_TMPFILE only makes sense for a writable file.
        if flags.contains(OpenFlags::TMPFILE) && !flags.can_write() {
            return error!(EINVAL);
        }

        let nofollow = flags.contains(OpenFlags::NOFOLLOW);
        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);

        // O_CREAT|O_EXCL must not resolve a trailing symlink: an existing
        // symlink has to surface as EEXIST rather than be followed.
        let symlink_mode =
            if nofollow || must_create { SymlinkMode::NoFollow } else { SymlinkMode::Follow };

        let resolve_base = match (
            resolve_flags.contains(ResolveFlags::BENEATH),
            resolve_flags.contains(ResolveFlags::IN_ROOT),
        ) {
            (false, false) => ResolveBase::None,
            (true, false) => ResolveBase::Beneath(dir.clone()),
            (false, true) => ResolveBase::InRoot(dir.clone()),
            (true, true) => return error!(EINVAL),
        };

        // `RESOLVE_BENEATH` and `RESOLVE_IN_ROOT` imply `RESOLVE_NO_MAGICLINKS`. This matches
        // Linux behavior. Strictly speaking it is not really required, but it's hard to
        // implement `BENEATH` and `IN_ROOT` flags correctly otherwise.
        if resolve_base != ResolveBase::None {
            resolve_flags.insert(ResolveFlags::NO_MAGICLINKS);
        }

        let mut context = LookupContext {
            symlink_mode,
            remaining_follows: MAX_SYMLINK_FOLLOWS,
            must_be_directory: flags.contains(OpenFlags::DIRECTORY),
            resolve_flags,
            resolve_base,
        };
        // On lookup failure, record the absolute path for diagnostics before
        // propagating the error.
        let (name, created) =
            match self.resolve_open_path(locked, &mut context, &dir, path, mode, flags) {
                Ok((n, c)) => (n, c),
                Err(e) => {
                    let mut abs_path = dir.path(&self.fs());
                    abs_path.extend(&**path);
                    track_file_not_found(abs_path);
                    return Err(e);
                }
            };

        let name = if flags.contains(OpenFlags::TMPFILE) {
            // `O_TMPFILE` is incompatible with `O_CREAT`
            if flags.contains(OpenFlags::CREAT) {
                return error!(EINVAL);
            }
            name.create_tmpfile(locked, self, mode.with_type(FileMode::IFREG), flags)?
        } else {
            let mode = name.entry.node.info().mode;

            // These checks are not needed in the `O_TMPFILE` case because `mode` refers to the
            // file we are opening. With `O_TMPFILE`, that file is the regular file we just
            // created rather than the node we found by resolving the path.
            //
            // For example, we do not need to produce `ENOTDIR` when `must_be_directory` is set
            // because `must_be_directory` refers to the node we found by resolving the path.
            // If that node was not a directory, then `create_tmpfile` will produce an error.
            //
            // Similarly, we never need to call `truncate` because `O_TMPFILE` is newly created
            // and therefor already an empty file.

            if !opath && nofollow && mode.is_lnk() {
                return error!(ELOOP);
            }

            if mode.is_dir() {
                if flags.can_write()
                    || flags.contains(OpenFlags::CREAT)
                    || flags.contains(OpenFlags::TRUNC)
                {
                    return error!(EISDIR);
                }
                if flags.contains(OpenFlags::DIRECT) {
                    return error!(EINVAL);
                }
            } else if context.must_be_directory {
                return error!(ENOTDIR);
            }

            if flags.contains(OpenFlags::TRUNC) && mode.is_reg() && !created {
                // You might think we should check file.can_write() at this
                // point, which is what the docs suggest, but apparently we
                // are supposed to truncate the file if this task can write
                // to the underlying node, even if we are opening the file
                // as read-only. See OpenTest.CanTruncateReadOnly.
                name.truncate(locked, self, 0)?;
            }

            name
        };

        // If the node has been created, the open operation should not verify access right:
        // From <https://man7.org/linux/man-pages/man2/open.2.html>
        //
        // > Note that mode applies only to future accesses of the newly created file; the
        // > open() call that creates a read-only file may well return a  read/write  file
        // > descriptor.

        let access_check = if created { AccessCheck::skip() } else { access_check };
        name.open(locked, self, flags, access_check)
    }
912
913    /// A wrapper for FsContext::lookup_parent_at that resolves the given
914    /// dir_fd to a NamespaceNode.
915    ///
916    /// Absolute paths are resolve relative to the root of the FsContext for
917    /// this task. Relative paths are resolve relative to dir_fd. To resolve
918    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
919    /// dir_fd.
920    pub fn lookup_parent_at<'a, L>(
921        &self,
922        locked: &mut Locked<L>,
923        context: &mut LookupContext,
924        dir_fd: FdNumber,
925        path: &'a FsStr,
926    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
927    where
928        L: LockEqualOrBefore<FileOpsCore>,
929    {
930        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, ResolveFlags::empty())?;
931        self.lookup_parent(locked, context, &dir, path)
932    }
933
934    /// Lookup the parent of a namespace node.
935    ///
936    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
937    /// calling this function directly.
938    ///
939    /// This function resolves all but the last component of the given path.
940    /// The function returns the parent directory of the last component as well
941    /// as the last component.
942    ///
943    /// If path is empty, this function returns dir and an empty path.
944    /// Similarly, if path ends with "." or "..", these components will be
945    /// returned along with the parent.
946    ///
947    /// The returned parent might not be a directory.
948    pub fn lookup_parent<'a, L>(
949        &self,
950        locked: &mut Locked<L>,
951        context: &mut LookupContext,
952        dir: &NamespaceNode,
953        path: &'a FsStr,
954    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
955    where
956        L: LockEqualOrBefore<FileOpsCore>,
957    {
958        context.update_for_path(path);
959
960        let components = split_path(path);
961        if components.is_empty() {
962            return Ok((dir.clone(), Default::default()));
963        }
964        let result =
965            dir.lookup_children(locked, self, context, &components[0..components.len() - 1])?;
966        Ok((result, components.last().unwrap()))
967    }
968
969    /// Lookup a namespace node.
970    ///
971    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
972    /// calling this function directly.
973    ///
974    /// This function resolves the component of the given path.
975    pub fn lookup_path<L>(
976        &self,
977        locked: &mut Locked<L>,
978        context: &mut LookupContext,
979        dir: NamespaceNode,
980        path: &FsStr,
981    ) -> Result<NamespaceNode, Errno>
982    where
983        L: LockEqualOrBefore<FileOpsCore>,
984    {
985        let components = split_path(path);
986        dir.lookup_children(locked, self, context, &components)
987    }
988
989    /// Lookup a namespace node starting at the root directory.
990    ///
991    /// Resolves symlinks.
992    pub fn lookup_path_from_root<L>(
993        &self,
994        locked: &mut Locked<L>,
995        path: &FsStr,
996    ) -> Result<NamespaceNode, Errno>
997    where
998        L: LockEqualOrBefore<FileOpsCore>,
999    {
1000        let mut context = LookupContext::default();
1001        self.lookup_path(locked, &mut context, self.fs().root(), path)
1002    }
1003
    /// Replaces this task's program image with `executable`, as for execve(2).
    ///
    /// `path` is the path the executable was resolved from; `argv` and
    /// `environ` become the arguments and environment of the new image.
    ///
    /// Errors returned before `finish_exec` leave the task intact. A failure
    /// inside `finish_exec` is unrecoverable: the task is sent SIGSEGV and the
    /// error is propagated.
    pub fn exec(
        &mut self,
        locked: &mut Locked<Unlocked>,
        executable: FileHandle,
        path: CString,
        argv: Vec<CString>,
        environ: Vec<CString>,
    ) -> Result<(), Errno> {
        // Executable must be a regular file
        if !executable.name.entry.node.is_reg() {
            return error!(EACCES);
        }

        // File node must have EXEC mode permissions.
        // Note that the ability to execute a file is unrelated to the flags
        // used in the `open` call.
        executable.name.check_access(locked, self, Access::EXEC, CheckAccessReason::Exec)?;

        // 1. Prepare a `ResolvedElf` to hold details of the binary to be executed, its credentials,
        //    etc.
        // TODO: https://fxbug.dev/483368940 - Split the initial `ResolvedElf` creation from the
        // resolution of the interpreter binary, if any.
        let mut resolved_elf =
            resolve_executable(locked, self, executable.clone(), path.clone(), argv, environ)?;

        // 2. Allow LSMs to perform access-checks on the target `executable`, and to update the
        //    `resolved_elf.creds` as necessary.
        security::bprm_creds_for_exec(self, &executable.name, &mut resolved_elf)?;

        // 3. Resolve details of the initial binary, whether the `executable` itself, or an
        //    interpreter, if `executable` is a script.
        // TODO: https://fxbug.dev/483368940 - Split the initial `ResolvedElf` creation from the
        // resolution of the interpreter binary, if any.

        // 4. Apply UID, GID and capabilities according to the attributes of the resolved binary.
        // TODO: https://fxbug.dev/503338788 - Collate this logic into a `bprm_creds_from_file()`.
        let maybe_set_id = if self.kernel().features.enable_suid {
            resolved_elf.file.name.suid_and_sgid(&self)?
        } else {
            Default::default()
        };

        if self.thread_group().read().tasks_count() > 1 {
            track_stub!(TODO("https://fxbug.dev/297434895"), "exec on multithread process");
            return error!(EINVAL);
        }

        // 5. Finalize the `exec()` operation by actually updating the task state based on the
        //    resolved details. Failures during this step are unrecoverable.
        if let Err(err) = self.finish_exec(locked, path, resolved_elf, maybe_set_id) {
            log_warn!("unrecoverable error in exec: {err:?}");

            send_standard_signal(locked, self, SignalInfo::forced(SIGSEGV));
            return Err(err);
        }

        // Notify any tracer of the exec, unblock a vfork()ed parent, and pick
        // up the thread group's syscall logging configuration.
        self.ptrace_event(locked, PtraceOptions::TRACEEXEC, self.task.tid as u64);
        self.signal_vfork();
        self.task.thread_group.sync_syscall_log_level();

        Ok(())
    }
1066
    /// After the memory is unmapped, any failure in exec is unrecoverable and results in the
    /// process crashing. This function is for that second half; any error returned from this
    /// function will be considered unrecoverable.
    ///
    /// Replaces the address space with one built for `resolved_elf`, commits
    /// the new credentials (applying `maybe_set_id` unless suppressed), loads
    /// the executable, and resets per-exec task and thread-group state.
    fn finish_exec(
        &mut self,
        locked: &mut Locked<Unlocked>,
        path: CString,
        mut resolved_elf: ResolvedElf,
        mut maybe_set_id: UserAndOrGroupId,
    ) -> Result<(), Errno> {
        // Now that the exec will definitely finish (or crash), notify owners of
        // locked futexes for the current process, which will be impossible to
        // update after process image is replaced.  See get_robust_list(2).
        self.notify_robust_list();

        // If there is already a `MemoryManager` then `exec()` will tear down the underlying Zircon
        // address-space, before creating an address-space configured ready to run `resolved_elf`.
        let mm = {
            let new_mm = MemoryManager::exec(
                self.thread_group().root_vmar.unowned(),
                self.mm().ok(),
                resolved_elf.file.name.to_passive(),
                resolved_elf.arch_width,
            )?;
            self.live().mm.update(Some(new_mm.clone()));
            new_mm
        };

        {
            let mut state = self.write();

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   The aforementioned transformations of the effective IDs are not
            //   performed (i.e., the set-user-ID and set-group-ID bits are
            //   ignored) if any of the following is true:
            //
            //   * the no_new_privs attribute is set for the calling thread (see
            //      prctl(2));
            //
            //   *  the underlying filesystem is mounted nosuid (the MS_NOSUID
            //      flag for mount(2)); or
            //
            //   *  the calling process is being ptraced.
            //
            // The MS_NOSUID check is in `NamespaceNode::suid_and_sgid()`.
            if state.no_new_privs() || state.is_ptraced() {
                maybe_set_id.clear();
            }

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   The process's "dumpable" attribute is set to the value 1,
            //   unless a set-user-ID program, a set-group-ID program, or a
            //   program with capabilities is being executed, in which case the
            //   dumpable flag may instead be reset to the value in
            //   /proc/sys/fs/suid_dumpable, in the circumstances described
            //   under PR_SET_DUMPABLE in prctl(2).
            let dumpable =
                if maybe_set_id.is_none() { DumpPolicy::User } else { DumpPolicy::Disable };
            *mm.dumpable.lock(locked) = dumpable;

            // TODO(https://fxbug.dev/433463756): Figure out whether this is the right place to
            // take the lock.
            // SAFETY: this is allowed because we are the CurrentTask.
            let mut writable_creds = unsafe { self.persistent_info.write_creds() };
            state.set_sigaltstack(None);
            state.robust_list_head = RobustListHeadPtr::null(self);

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   If a set-user-ID or set-group-ID
            //   program is being executed, then the parent death signal set by
            //   prctl(2) PR_SET_PDEATHSIG flag is cleared.
            //
            // TODO(https://fxbug.dev/356684424): Implement the behavior above once we support
            // the PR_SET_PDEATHSIG flag.

            // TODO(tbodt): Check whether capability xattrs are set on the file, and grant/limit
            // capabilities accordingly.
            resolved_elf.creds.exec(maybe_set_id);

            // TODO(https://fxbug.dev/503338788) - Migrate this (and other capabilities wrangling)
            // into a `common_cap::bprm_creds_from_file()` implementation.
            if state.no_new_privs() {
                resolved_elf.creds.cap_permitted &= self.current_creds().cap_permitted;
                resolved_elf.creds.cap_effective &= resolved_elf.creds.cap_permitted;
            }

            security::bprm_committing_creds(locked, self, &resolved_elf)?;

            let new_creds = Arc::new(resolved_elf.creds.clone());
            writable_creds.update(new_creds.clone());
            *self.current_creds.borrow_mut() = CurrentCreds::Cached(new_creds);
        }

        // Load the resolved binary into the new address space; `start_info`
        // carries the initial register state for the new image.
        let start_info = load_executable(self, resolved_elf, &path)?;

        let regs: zx_restricted_state_t = start_info.into();
        self.thread_state.registers.load(regs);
        self.thread_state.extended_pstate.reset();
        self.thread_group().signal_actions.reset_for_exec();

        // The exit signal (and that of the children) is reset to SIGCHLD.
        let mut thread_group_state = self.thread_group().write();
        thread_group_state.exit_signal = Some(SIGCHLD);
        for (_, weak_child) in &mut thread_group_state.children {
            if let Some(child) = weak_child.upgrade() {
                let mut child_state = child.write();
                child_state.exit_signal = Some(SIGCHLD);
            }
        }

        std::mem::drop(thread_group_state);

        // TODO(https://fxbug.dev/42082680): All threads other than the calling thread are destroyed.

        // TODO: POSIX timers are not preserved.

        // TODO: Ensure that the filesystem context is un-shared, undoing the effect of CLONE_FS.

        // The file descriptor table is unshared, undoing the effect of the CLONE_FILES flag of
        // clone(2).
        self.live().files.unshare();
        self.live().files.exec(locked, self);

        // If SELinux is enabled, enforce permissions related to inheritance of file descriptors
        // and resource limits. Then update the current task's SID.
        //
        // TODO: https://fxbug.dev/378655436 - After the above, enforce permissions related to
        // signal state inheritance.
        //
        // This needs to be called after closing any files marked "close-on-exec".
        security::bprm_committed_creds(locked, self)?;

        self.thread_group().write().did_exec = true;

        self.set_command_name(TaskCommand::from_path_bytes(path.to_bytes()));

        Ok(())
    }
1208
1209    pub fn set_command_name(&self, new_name: TaskCommand) {
1210        // set_command_name needs to run before leader_command() in cases where self is the leader.
1211        self.task.set_command_name(new_name.clone());
1212        let leader_command = self.thread_group().read().leader_command();
1213        starnix_logging::set_current_task_info(
1214            new_name,
1215            leader_command,
1216            self.thread_group().leader,
1217            self.tid,
1218        );
1219    }
1220
    /// Attaches a new seccomp filter to this task, implementing
    /// seccomp(SECCOMP_SET_MODE_FILTER).
    ///
    /// `code` is the cBPF filter program and `flags` carries the
    /// SECCOMP_FILTER_FLAG_* bits. With SECCOMP_FILTER_FLAG_TSYNC the filter
    /// is propagated to every task in the thread group (failing if any task
    /// cannot be synchronized); with SECCOMP_FILTER_FLAG_NEW_LISTENER the
    /// returned result is the fd of the new user-space notifier.
    pub fn add_seccomp_filter(
        &mut self,
        locked: &mut Locked<Unlocked>,
        code: Vec<sock_filter>,
        flags: u32,
    ) -> Result<SyscallResult, Errno> {
        let new_filter = Arc::new(SeccompFilter::from_cbpf(
            &code,
            self.thread_group().next_seccomp_filter_id.add(1),
            flags & SECCOMP_FILTER_FLAG_LOG != 0,
        )?);

        let mut maybe_fd: Option<FdNumber> = None;

        if flags & SECCOMP_FILTER_FLAG_NEW_LISTENER != 0 {
            maybe_fd = Some(SeccompFilterContainer::create_listener(locked, self)?);
        }

        // We take the process lock here because we can't change any of the threads
        // while doing a tsync.  So, you hold the process lock while making any changes.
        let state = self.thread_group().write();

        if flags & SECCOMP_FILTER_FLAG_TSYNC != 0 {
            // TSYNC synchronizes all filters for all threads in the current process to
            // the current thread's

            // We collect the filters for the current task upfront to save us acquiring
            // the task's lock a lot of times below.
            let mut filters: SeccompFilterContainer = self.read().seccomp_filters.clone();

            // For TSYNC to work, all of the other thread filters in this process have to
            // be a prefix of this thread's filters, and none of them can be in
            // strict mode.
            let tasks = state.tasks().collect::<Vec<_>>();
            for task in &tasks {
                if task.tid == self.tid {
                    continue;
                }
                let other_task_state = task.read();

                // Target threads cannot be in SECCOMP_MODE_STRICT
                if task.seccomp_filter_state.get() == SeccompStateValue::Strict {
                    return Self::seccomp_tsync_error(task.tid, flags);
                }

                // Target threads' filters must be a subsequence of this thread's
                if !other_task_state.seccomp_filters.can_sync_to(&filters) {
                    return Self::seccomp_tsync_error(task.tid, flags);
                }
            }

            // Now that we're sure we're allowed to do so, add the filter to all threads.
            filters.add_filter(new_filter, code.len() as u16)?;

            for task in &tasks {
                let mut other_task_state = task.write();

                other_task_state.enable_no_new_privs();
                other_task_state.seccomp_filters = filters.clone();
                task.set_seccomp_state(SeccompStateValue::UserDefined)?;
            }
        } else {
            let mut task_state = self.task.write();

            task_state.seccomp_filters.add_filter(new_filter, code.len() as u16)?;
            self.set_seccomp_state(SeccompStateValue::UserDefined)?;
        }

        // SECCOMP_FILTER_FLAG_NEW_LISTENER callers receive the notifier fd.
        if let Some(fd) = maybe_fd { Ok(fd.into()) } else { Ok(().into()) }
    }
1291
1292    pub fn run_seccomp_filters(
1293        &mut self,
1294        locked: &mut Locked<Unlocked>,
1295        syscall: &Syscall,
1296    ) -> Option<Result<SyscallResult, Errno>> {
1297        // Implementation of SECCOMP_FILTER_STRICT, which has slightly different semantics
1298        // from user-defined seccomp filters.
1299        if self.seccomp_filter_state.get() == SeccompStateValue::Strict {
1300            return SeccompState::do_strict(locked, self, syscall);
1301        }
1302
1303        // Run user-defined seccomp filters
1304        let result = self.task.read().seccomp_filters.run_all(self, syscall);
1305
1306        SeccompState::do_user_defined(locked, result, self, syscall)
1307    }
1308
1309    fn seccomp_tsync_error(id: i32, flags: u32) -> Result<SyscallResult, Errno> {
1310        // By default, TSYNC indicates failure state by returning the first thread
1311        // id not to be able to sync, rather than by returning -1 and setting
1312        // errno.  However, if TSYNC_ESRCH is set, it returns ESRCH.  This
1313        // prevents conflicts with fact that SECCOMP_FILTER_FLAG_NEW_LISTENER
1314        // makes seccomp return an fd.
1315        if flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH != 0 { error!(ESRCH) } else { Ok(id.into()) }
1316    }
1317
1318    // Notify all futexes in robust list.  The robust list is in user space, so we
1319    // are very careful about walking it, and there are a lot of quiet returns if
1320    // we fail to walk it.
1321    // TODO(https://fxbug.dev/42079081): This only sets the FUTEX_OWNER_DIED bit; it does
1322    // not wake up a waiter.
1323    pub fn notify_robust_list(&self) {
1324        let task_state = self.write();
1325        let robust_list_addr = task_state.robust_list_head.addr();
1326        if robust_list_addr == UserAddress::NULL {
1327            // No one has called set_robust_list.
1328            return;
1329        }
1330        let robust_list_res = self.read_multi_arch_object(task_state.robust_list_head);
1331
1332        let head = if let Ok(head) = robust_list_res {
1333            head
1334        } else {
1335            return;
1336        };
1337
1338        let offset = head.futex_offset;
1339
1340        let mut entries_count = 0;
1341        let mut curr_ptr = head.list.next;
1342        while curr_ptr.addr() != robust_list_addr.into() && entries_count < ROBUST_LIST_LIMIT {
1343            let curr_ref = self.read_multi_arch_object(curr_ptr);
1344
1345            let curr = if let Ok(curr) = curr_ref {
1346                curr
1347            } else {
1348                return;
1349            };
1350
1351            let Some(futex_base) = curr_ptr.addr().checked_add_signed(offset) else {
1352                return;
1353            };
1354
1355            let futex_addr = match FutexAddress::try_from(futex_base) {
1356                Ok(addr) => addr,
1357                Err(_) => {
1358                    return;
1359                }
1360            };
1361
1362            let Ok(mm) = self.mm() else {
1363                log_error!("Asked to notify robust list futexes in system task.");
1364                return;
1365            };
1366            let futex = if let Ok(futex) = mm.atomic_load_u32_relaxed(futex_addr) {
1367                futex
1368            } else {
1369                return;
1370            };
1371
1372            if (futex & FUTEX_TID_MASK) as i32 == self.tid {
1373                let owner_died = FUTEX_OWNER_DIED | futex;
1374                if mm.atomic_store_u32_relaxed(futex_addr, owner_died).is_err() {
1375                    return;
1376                }
1377            }
1378            curr_ptr = curr.next;
1379            entries_count += 1;
1380        }
1381    }
1382
    /// Returns a clone of this thread's `SeccompNotifier` handle, if one is
    /// set. (Note: a clone, not a reference.)
    pub fn get_seccomp_notifier(&mut self) -> Option<SeccompNotifierHandle> {
        self.task.write().seccomp_filters.notifier.clone()
    }
1387
    /// Installs (or clears, when `None`) this thread's `SeccompNotifier`.
    pub fn set_seccomp_notifier(&mut self, notifier: Option<SeccompNotifierHandle>) {
        self.task.write().seccomp_filters.notifier = notifier;
    }
1391
1392    /// Processes a Zircon exception associated with this task.
1393    pub fn process_exception(
1394        &self,
1395        locked: &mut Locked<Unlocked>,
1396        report: &zx::ExceptionReport,
1397    ) -> ExceptionResult {
1398        match report.ty {
1399            zx::ExceptionType::General => match get_signal_for_general_exception(&report.arch) {
1400                Some(sig) => ExceptionResult::Signal(SignalInfo::kernel(sig)),
1401                None => {
1402                    log_error!("Unrecognized general exception: {:?}", report);
1403                    ExceptionResult::Signal(SignalInfo::kernel(SIGILL))
1404                }
1405            },
1406            zx::ExceptionType::FatalPageFault { status } => {
1407                let report = decode_page_fault_exception_report(&report.arch);
1408                if let Ok(mm) = self.mm() {
1409                    mm.handle_page_fault(locked, report, status)
1410                } else {
1411                    panic!(
1412                        "system task is handling a major page fault status={:?}, report={:?}",
1413                        status, report
1414                    );
1415                }
1416            }
1417            zx::ExceptionType::UndefinedInstruction => {
1418                ExceptionResult::Signal(SignalInfo::kernel(SIGILL))
1419            }
1420            zx::ExceptionType::UnalignedAccess => {
1421                ExceptionResult::Signal(SignalInfo::kernel(SIGBUS))
1422            }
1423            zx::ExceptionType::SoftwareBreakpoint | zx::ExceptionType::HardwareBreakpoint => {
1424                ExceptionResult::Signal(SignalInfo::kernel(SIGTRAP))
1425            }
1426            zx::ExceptionType::ProcessNameChanged => {
1427                log_error!("Received unexpected process name changed exception");
1428                ExceptionResult::Handled
1429            }
1430            zx::ExceptionType::ProcessStarting
1431            | zx::ExceptionType::ThreadStarting
1432            | zx::ExceptionType::ThreadExiting => {
1433                log_error!("Received unexpected task lifecycle exception");
1434                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
1435            }
1436            zx::ExceptionType::PolicyError(policy_code) => {
1437                log_error!(policy_code:?; "Received Zircon policy error exception");
1438                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
1439            }
1440            zx::ExceptionType::UnknownUserGenerated { code, data } => {
1441                log_error!(code:?, data:?; "Received unexpected unknown user generated exception");
1442                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
1443            }
1444            zx::ExceptionType::Unknown { ty, code, data } => {
1445                log_error!(ty:?, code:?, data:?; "Received unexpected exception");
1446                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
1447            }
1448        }
1449    }
1450
    /// Clone this task.
    ///
    /// Creates a new task object that shares some state with this task
    /// according to the given flags.
    ///
    /// Used by the clone() syscall to create both processes and threads.
    ///
    /// The exit signal is broken out from the flags parameter like clone3() rather than being
    /// bitwise-ORed like clone().
    ///
    /// Returns `EINVAL` for invalid or inconsistent flag combinations and `ENOSYS` for flags
    /// that are valid but not yet implemented.
    pub fn clone_task<L>(
        &self,
        locked: &mut Locked<L>,
        flags: u64,
        child_exit_signal: Option<Signal>,
        user_parent_tid: UserRef<pid_t>,
        user_child_tid: UserRef<pid_t>,
        user_pidfd: UserRef<FdNumber>,
    ) -> Result<TaskBuilder, Errno>
    where
        L: LockBefore<MmDumpable>,
        L: LockBefore<TaskRelease>,
        L: LockBefore<ProcessGroupState>,
    {
        // The subset of valid clone flags this implementation honors; valid flags outside this
        // set are reported with ENOSYS further below.
        const IMPLEMENTED_FLAGS: u64 = (CLONE_VM
            | CLONE_FS
            | CLONE_FILES
            | CLONE_SIGHAND
            | CLONE_THREAD
            | CLONE_SYSVSEM
            | CLONE_SETTLS
            | CLONE_PARENT
            | CLONE_PARENT_SETTID
            | CLONE_PIDFD
            | CLONE_CHILD_CLEARTID
            | CLONE_CHILD_SETTID
            | CLONE_VFORK
            | CLONE_NEWUTS
            | CLONE_PTRACE) as u64;

        // A mask with all valid flags set, because we want to return a different error code for an
        // invalid flag vs an unimplemented flag. Subtracting 1 from the largest valid flag gives a
        // mask with all flags below it set. Shift up by one to make sure the largest flag is also
        // set.
        const VALID_FLAGS: u64 = (CLONE_INTO_CGROUP << 1) - 1;

        // CLONE_SETTLS is implemented by sys_clone.

        let clone_files = flags & (CLONE_FILES as u64) != 0;
        let clone_fs = flags & (CLONE_FS as u64) != 0;
        let clone_parent = flags & (CLONE_PARENT as u64) != 0;
        let clone_parent_settid = flags & (CLONE_PARENT_SETTID as u64) != 0;
        let clone_pidfd = flags & (CLONE_PIDFD as u64) != 0;
        let clone_child_cleartid = flags & (CLONE_CHILD_CLEARTID as u64) != 0;
        let clone_child_settid = flags & (CLONE_CHILD_SETTID as u64) != 0;
        let clone_sysvsem = flags & (CLONE_SYSVSEM as u64) != 0;
        let clone_ptrace = flags & (CLONE_PTRACE as u64) != 0;
        let clone_thread = flags & (CLONE_THREAD as u64) != 0;
        let clone_vm = flags & (CLONE_VM as u64) != 0;
        let clone_sighand = flags & (CLONE_SIGHAND as u64) != 0;
        let clone_vfork = flags & (CLONE_VFORK as u64) != 0;
        let clone_newuts = flags & (CLONE_NEWUTS as u64) != 0;
        let clone_into_cgroup = flags & CLONE_INTO_CGROUP != 0;

        if clone_ptrace {
            track_stub!(TODO("https://fxbug.dev/322874630"), "CLONE_PTRACE");
        }

        if clone_sysvsem {
            track_stub!(TODO("https://fxbug.dev/322875185"), "CLONE_SYSVSEM");
        }

        if clone_into_cgroup {
            track_stub!(TODO("https://fxbug.dev/403612570"), "CLONE_INTO_CGROUP");
        }

        // Per clone(2): CLONE_SIGHAND requires CLONE_VM, and CLONE_THREAD requires
        // CLONE_SIGHAND.
        if clone_sighand && !clone_vm {
            return error!(EINVAL);
        }
        if clone_thread && !clone_sighand {
            return error!(EINVAL);
        }

        if clone_pidfd && clone_thread {
            return error!(EINVAL);
        }
        if clone_pidfd && clone_parent_settid && user_parent_tid.addr() == user_pidfd.addr() {
            // `clone()` uses the same out-argument for these, so error out if they have the same
            // user address.
            return error!(EINVAL);
        }

        // Bits above the highest known flag are invalid (EINVAL), not merely unimplemented.
        if flags & !VALID_FLAGS != 0 {
            return error!(EINVAL);
        }

        if clone_vm && !clone_thread {
            // TODO(https://fxbug.dev/42066087) Implement CLONE_VM for child processes (not just child
            // threads). Currently this executes CLONE_VM (explicitly passed to clone() or as
            // used by vfork()) as a fork (the VM in the child is copy-on-write) which is almost
            // always OK.
            //
            // CLONE_VM is primarily as an optimization to avoid making a copy-on-write version of a
            // process' VM that will be immediately replaced with a call to exec(). The main users
            // (libc and language runtimes) don't actually rely on the memory being shared between
            // the two processes. And the vfork() man page explicitly allows vfork() to be
            // implemented as fork() which is what we do here.
            if !clone_vfork {
                track_stub!(
                    TODO("https://fxbug.dev/322875227"),
                    "CLONE_VM without CLONE_THREAD or CLONE_VFORK"
                );
            }
        } else if clone_thread && !clone_vm {
            track_stub!(TODO("https://fxbug.dev/322875167"), "CLONE_THREAD without CLONE_VM");
            return error!(ENOSYS);
        }

        if flags & !IMPLEMENTED_FLAGS != 0 {
            track_stub!(
                TODO("https://fxbug.dev/322875130"),
                "clone unknown flags",
                flags & !IMPLEMENTED_FLAGS
            );
            return error!(ENOSYS);
        }

        // Share or fork the filesystem context and file descriptor table as requested.
        let fs = if clone_fs { self.fs() } else { self.fs().fork() };
        let files = if clone_files { self.live().files.clone() } else { self.live().files.fork() };

        let kernel = self.kernel();

        // Held across pid allocation and insertion of the child into the pid table.
        let mut pids = kernel.pids.write();

        // Lock the cgroup process hierarchy so that the parent process cannot move to a different
        // cgroup while a new task or thread_group is created. This may be unnecessary if
        // CLONE_INTO_CGROUP is implemented and passed in.
        let mut cgroup2_pid_table = kernel.cgroups.lock_cgroup2_pid_table();
        // Create a `KernelSignal::Freeze` to put onto the new task, if the cgroup is frozen.
        let child_kernel_signals = cgroup2_pid_table
            .maybe_create_freeze_signal(self.thread_group())
            .into_iter()
            .collect::<VecDeque<_>>();

        // Values gathered under the task/thread-group locks below, consumed when building the
        // child after those locks are released.
        let pid;
        let command;
        let creds;
        let scheduler_state;
        let no_new_privs;
        let seccomp_filters;
        // The robust futex list is not inherited; the child starts with a null head.
        let robust_list_head = RobustListHeadPtr::null(self);
        let child_signal_mask;
        let timerslack_ns;
        let uts_ns;

        let TaskInfo { thread, thread_group, memory_manager } = {
            // These variables hold the original parent in case we need to switch the parent of the
            // new task because of CLONE_PARENT.
            let weak_original_parent;
            let original_parent;

            // Make sure to drop these locks ASAP to avoid inversion
            let thread_group_state = {
                let thread_group_state = self.thread_group().write();
                if clone_parent {
                    // With the CLONE_PARENT flag, the parent of the new task is our parent
                    // instead of ourselves.
                    weak_original_parent =
                        thread_group_state.parent.clone().ok_or_else(|| errno!(EINVAL))?;
                    std::mem::drop(thread_group_state);
                    original_parent = weak_original_parent.upgrade();
                    original_parent.write()
                } else {
                    thread_group_state
                }
            };

            let state = self.read();

            no_new_privs = state.no_new_privs();
            seccomp_filters = state.seccomp_filters.clone();
            child_signal_mask = state.signal_mask();

            pid = pids.allocate_pid();
            command = self.command();
            creds = self.current_creds().clone();
            scheduler_state = state.scheduler_state.fork();
            timerslack_ns = state.timerslack_ns;

            uts_ns = if clone_newuts {
                // Creating a new UTS namespace requires CAP_SYS_ADMIN.
                security::check_task_capable(self, CAP_SYS_ADMIN)?;
                state.uts_ns.read().fork()
            } else {
                state.uts_ns.clone()
            };

            if clone_thread {
                TaskInfo {
                    thread: None,
                    thread_group: self.thread_group().clone(),
                    memory_manager: self.mm().ok(),
                }
            } else {
                // Drop the lock on this task before entering `create_zircon_process`, because it will
                // take a lock on the new thread group, and locks on thread groups have a higher
                // priority than locks on the task in the thread group.
                std::mem::drop(state);
                let signal_actions = if clone_sighand {
                    self.thread_group().signal_actions.clone()
                } else {
                    self.thread_group().signal_actions.fork()
                };
                let process_group = thread_group_state.process_group.clone();

                let task_info = {
                    trace_duration!(CATEGORY_STARNIX, "create_zircon_process");
                    create_zircon_process(
                        locked,
                        kernel,
                        Some(thread_group_state),
                        pid,
                        child_exit_signal,
                        process_group,
                        signal_actions,
                        command.clone(),
                    )?
                };

                cgroup2_pid_table.inherit_cgroup(self.thread_group(), &task_info.thread_group);

                task_info
            }
        };

        // Drop the lock on the cgroup pid_table before creating the TaskBuilder.
        // If the TaskBuilder creation fails, the TaskBuilder is dropped, which calls
        // ThreadGroup::remove. ThreadGroup::remove takes the cgroup pid_table lock, causing
        // a cyclic lock dependency.
        std::mem::drop(cgroup2_pid_table);

        // Only create the vfork event when the caller requested CLONE_VFORK.
        let vfork_event = if clone_vfork { Some(Arc::new(zx::Event::create())) } else { None };

        // Clone live state in a nested scope to ensure that the RCU read scope is not held across
        // the release_on_error block.
        let abstract_socket_namespace;
        let abstract_vsock_namespace;
        {
            let live = self.live();
            abstract_socket_namespace = live.abstract_socket_namespace.clone();
            abstract_vsock_namespace = live.abstract_vsock_namespace.clone();
        }

        let mut child = TaskBuilder::new(Task::new(
            pid,
            command,
            thread_group,
            thread,
            files,
            memory_manager,
            fs,
            creds,
            abstract_socket_namespace,
            abstract_vsock_namespace,
            child_signal_mask,
            child_kernel_signals,
            vfork_event,
            scheduler_state,
            uts_ns,
            no_new_privs,
            SeccompState::from(&self.seccomp_filter_state),
            seccomp_filters,
            robust_list_head,
            timerslack_ns,
        ));

        release_on_error!(child, locked, {
            let child_task = TempRef::from(&child.task);
            // Drop the pids lock as soon as possible after creating the child. Destroying the child
            // and removing it from the pids table itself requires the pids lock, so if an early exit
            // takes place we have a self deadlock.
            pids.add_task(&child_task);
            std::mem::drop(pids);

            // Child lock must be taken before this lock. Drop the lock on the task, take a writable
            // lock on the child and take the current state back.

            #[cfg(any(test, debug_assertions))]
            {
                // Take the lock on the thread group and its child in the correct order to ensure any wrong ordering
                // will trigger the tracing-mutex at the right call site.
                if !clone_thread {
                    let _l1 = self.thread_group().read();
                    let _l2 = child.thread_group().read();
                }
            }

            if clone_thread {
                self.thread_group().add(&child_task)?;
            } else {
                child.thread_group().add(&child_task)?;

                // These manipulations of the signal handling state appear to be related to
                // CLONE_SIGHAND and CLONE_VM rather than CLONE_THREAD. However, we do not support
                // all the combinations of these flags, which means doing these operations here
                // might actually be correct. However, if you find a test that fails because of the
                // placement of this logic here, we might need to move it.
                let mut child_state = child.write();
                let state = self.read();
                child_state.set_sigaltstack(state.sigaltstack());
                child_state.set_signal_mask(state.signal_mask());
            }

            if !clone_vm {
                // We do not support running threads in the same process with different
                // MemoryManagers.
                assert!(!clone_thread);
                let child_mm = MemoryManager::snapshot_of(
                    locked,
                    &self.mm()?,
                    child.thread_group.root_vmar.unowned(),
                    self.thread_state.arch_width(),
                )?;
                child.live()?.mm.update(Some(child_mm));
            }

            if clone_parent_settid {
                self.write_object(user_parent_tid, &child.tid)?;
            }

            if clone_child_cleartid {
                child.write().clear_child_tid = user_child_tid;
            }

            if clone_child_settid {
                child.write_object(user_child_tid, &child.tid)?;
            }

            if clone_pidfd {
                let locked = locked.cast_locked::<TaskRelease>();
                let file = new_pidfd(
                    locked,
                    self,
                    child.thread_group(),
                    &*child.mm()?,
                    OpenFlags::empty(),
                );
                // pidfds are created with CLOEXEC per pidfd_open(2)/clone(2) semantics.
                let pidfd = self.add_file(locked, file, FdFlags::CLOEXEC)?;
                self.write_object(user_pidfd, &pidfd)?;
            }

            // TODO(https://fxbug.dev/42066087): We do not support running different processes with
            // the same MemoryManager. Instead, we implement a rough approximation of that behavior
            // by making a copy-on-write clone of the memory from the original process.
            // NOTE(review): together with the `!clone_vm` branch above, this means every
            // non-CLONE_THREAD clone snapshots the address space (CLONE_THREAD without CLONE_VM
            // was already rejected with ENOSYS earlier), so these two branches perform the same
            // operation under complementary conditions.
            if clone_vm && !clone_thread {
                let child_mm = MemoryManager::snapshot_of(
                    locked,
                    &self.mm()?,
                    child.thread_group.root_vmar.unowned(),
                    self.thread_state.arch_width(),
                )?;
                child.live()?.mm.update(Some(child_mm));
            }

            // The child starts with a copy of the parent's register/thread state.
            child.thread_state = self.thread_state.snapshot::<HeapRegs>();
            Ok(())
        });

        // Take the lock on thread group and task in the correct order to ensure any wrong ordering
        // will trigger the tracing-mutex at the right call site.
        #[cfg(any(test, debug_assertions))]
        {
            let _l1 = child.thread_group().read();
            let _l2 = child.read();
        }

        Ok(child)
    }
1828
1829    /// Sets the stop state (per set_stopped), and also notifies all listeners,
1830    /// including the parent process and the tracer if appropriate.
1831    pub fn set_stopped_and_notify(&self, stopped: StopState, siginfo: Option<SignalInfo>) {
1832        let maybe_signal_info = {
1833            let mut state = self.write();
1834            state.copy_state_from(self);
1835            state.set_stopped(stopped, siginfo, Some(self), None);
1836            state.prepare_signal_info(stopped)
1837        };
1838
1839        if let Some((tracer, signal_info)) = maybe_signal_info {
1840            if let Some(tracer) = tracer.upgrade() {
1841                tracer.write().send_signal(signal_info);
1842            }
1843        }
1844
1845        if !stopped.is_in_progress() {
1846            let parent = self.thread_group().read().parent.clone();
1847            if let Some(parent) = parent {
1848                parent
1849                    .upgrade()
1850                    .write()
1851                    .lifecycle_waiters
1852                    .notify_value(ThreadGroupLifecycleWaitValue::ChildStatus);
1853            }
1854        }
1855    }
1856
    /// If the task is stopping, set it as stopped. return whether the caller
    /// should stop.  The task might also be waking up.
    pub fn finalize_stop_state(&mut self) -> bool {
        let stopped = self.load_stopped();

        if !stopped.is_stopping_or_stopped() {
            // If we are waking up, potentially write back state a tracer may have modified.
            let captured_state = self.write().take_captured_state();
            if let Some(captured) = captured_state {
                if captured.dirty {
                    // Only restore registers when the tracer actually changed them.
                    self.thread_state.replace_registers(&captured.thread_state);
                }
            }
        }

        // Stopping because the thread group is stopping.
        // Try to flip to GroupStopped - will fail if we shouldn't.
        if self.thread_group().set_stopped(StopState::GroupStopped, None, true)
            == StopState::GroupStopped
        {
            let signal = self.thread_group().read().last_signal.clone();
            // stopping because the thread group has stopped
            let event = Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0));
            self.write().set_stopped(StopState::GroupStopped, signal, Some(self), event);
            return true;
        }

        // Stopping because the task is stopping
        if stopped.is_stopping_or_stopped() {
            // Promote the in-progress stop to its final state and notify listeners.
            if let Ok(stopped) = stopped.finalize() {
                self.set_stopped_and_notify(stopped, None);
            }
            return true;
        }

        // Neither the thread group nor the task is stopping: the caller should continue.
        false
    }
1894
    /// Block the execution of `current_task` as long as the task is stopped and
    /// not terminated.
    pub fn block_while_stopped(&mut self, locked: &mut Locked<Unlocked>) {
        // Upgrade the state from stopping to stopped if needed. Return if the task
        // should not be stopped.
        if !self.finalize_stop_state() {
            return;
        }

        // IGNORE_SIGNALS: the wait below must not be cut short by pending signals.
        let waiter = Waiter::with_options(WaiterOptions::IGNORE_SIGNALS);
        loop {
            // If we've exited, unstop the threads and return without notifying
            // waiters.
            if self.is_exitted() {
                self.thread_group().set_stopped(StopState::ForceAwake, None, false);
                self.write().set_stopped(StopState::ForceAwake, None, Some(self), None);
                return;
            }

            // Returns true once the task is no longer stopped.
            if self.wake_or_wait_until_unstopped_async(&waiter) {
                return;
            }

            // Do the wait. Result is not needed, as this is not in a syscall.
            let _: Result<(), Errno> = waiter.wait(locked, self);

            // Maybe go from stopping to stopped, if we are currently stopping
            // again.
            self.finalize_stop_state();
        }
    }
1926
1927    /// For traced tasks, this will return the data neceessary for a cloned task
1928    /// to attach to the same tracer.
1929    pub fn get_ptrace_core_state_for_clone(
1930        &mut self,
1931        clone_args: &clone_args,
1932    ) -> (PtraceOptions, Option<PtraceCoreState>) {
1933        let state = self.write();
1934        if let Some(ptrace) = &state.ptrace {
1935            ptrace.get_core_state_for_clone(clone_args)
1936        } else {
1937            (PtraceOptions::empty(), None)
1938        }
1939    }
1940
    /// If currently being ptraced with the given option, emit the appropriate
    /// event.  PTRACE_EVENTMSG will return the given message.  Also emits the
    /// appropriate event for execve in the absence of TRACEEXEC.
    ///
    /// Note that the Linux kernel has a documented bug where, if TRACEEXIT is
    /// enabled, SIGKILL will trigger an event.  We do not exhibit this
    /// behavior.
    pub fn ptrace_event(
        &mut self,
        locked: &mut Locked<Unlocked>,
        trace_kind: PtraceOptions,
        msg: u64,
    ) {
        if !trace_kind.is_empty() {
            // Scope the task state lock so it is released before blocking below.
            {
                let mut state = self.write();
                if let Some(ptrace) = &mut state.ptrace {
                    if !ptrace.has_option(trace_kind) {
                        // If this would be a TRACEEXEC, but TRACEEXEC is not
                        // turned on, then send a SIGTRAP.
                        if trace_kind == PtraceOptions::TRACEEXEC && !ptrace.is_seized() {
                            // Send a SIGTRAP so that the parent can gain control.
                            send_signal_first(locked, self, state, SignalInfo::kernel(SIGTRAP));
                        }

                        return;
                    }
                    // Encode the ptrace event number in the high byte of the signal status,
                    // alongside SIGTRAP in the low byte.
                    let ptrace_event = PtraceEvent::from_option(&trace_kind) as u32;
                    let siginfo = SignalInfo::with_detail(
                        SIGTRAP,
                        ((ptrace_event << 8) | SIGTRAP.number()) as i32,
                        SignalDetail::None,
                    );
                    state.set_stopped(
                        StopState::PtraceEventStopping,
                        Some(siginfo),
                        None,
                        Some(PtraceEventData::new(trace_kind, msg)),
                    );
                } else {
                    // Not traced: nothing to emit.
                    return;
                }
            }
            // Remain stopped here until the tracer resumes us.
            self.block_while_stopped(locked);
        }
    }
1987
1988    /// Causes the current thread's thread group to exit, notifying any ptracer
1989    /// of this task first.
1990    pub fn thread_group_exit(&mut self, locked: &mut Locked<Unlocked>, exit_status: ExitStatus) {
1991        self.ptrace_event(
1992            locked,
1993            PtraceOptions::TRACEEXIT,
1994            exit_status.signal_info_status() as u64,
1995        );
1996        self.thread_group().exit(locked, exit_status, None);
1997    }
1998
1999    /// The flags indicates only the flags as in clone3(), and does not use the low 8 bits for the
2000    /// exit signal as in clone().
2001    pub fn clone_task_for_test<L>(
2002        &self,
2003        locked: &mut Locked<L>,
2004        flags: u64,
2005        exit_signal: Option<Signal>,
2006    ) -> crate::testing::AutoReleasableTask
2007    where
2008        L: LockBefore<MmDumpable>,
2009        L: LockBefore<TaskRelease>,
2010        L: LockBefore<ProcessGroupState>,
2011    {
2012        let result = self
2013            .clone_task(
2014                locked,
2015                flags,
2016                exit_signal,
2017                UserRef::default(),
2018                UserRef::default(),
2019                UserRef::default(),
2020            )
2021            .expect("failed to create task in test");
2022        result.task.write().set_spawned();
2023
2024        result.into()
2025    }
2026
    /// Checks whether this task may ptrace-access `target` with the given `mode`.
    ///
    /// See "Ptrace access mode checking" in https://man7.org/linux/man-pages/man2/ptrace.2.html
    /// The numbered comments below mirror the steps in that man page section.
    pub fn check_ptrace_access_mode<L>(
        &self,
        locked: &mut Locked<L>,
        mode: PtraceAccessMode,
        target: &Task,
    ) -> Result<(), Errno>
    where
        L: LockBefore<MmDumpable>,
    {
        // (1)  If the calling thread and the target thread are in the same
        //      thread group, access is always allowed.
        if self.thread_group().leader == target.thread_group().leader {
            return Ok(());
        }

        // (2)  If the access mode specifies PTRACE_MODE_FSCREDS, then, for
        //      the check in the next step, employ the caller's filesystem
        //      UID and GID.  (As noted in credentials(7), the filesystem
        //      UID and GID almost always have the same values as the
        //      corresponding effective IDs.)
        //
        //      Otherwise, the access mode specifies PTRACE_MODE_REALCREDS,
        //      so use the caller's real UID and GID for the checks in the
        //      next step.  (Most APIs that check the caller's UID and GID
        //      use the effective IDs.  For historical reasons, the
        //      PTRACE_MODE_REALCREDS check uses the real IDs instead.)
        let (uid, gid) = if mode.contains(PTRACE_MODE_FSCREDS) {
            let fscred = self.current_creds().as_fscred();
            (fscred.uid, fscred.gid)
        } else if mode.contains(PTRACE_MODE_REALCREDS) {
            let creds = self.current_creds();
            (creds.uid, creds.gid)
        } else {
            // Callers must pass exactly one of FSCREDS/REALCREDS.
            unreachable!();
        };

        // (3)  Deny access if neither of the following is true:
        //
        //      -  The real, effective, and saved-set user IDs of the target
        //         match the caller's user ID, and the real, effective, and
        //         saved-set group IDs of the target match the caller's
        //         group ID.
        //
        //      -  The caller has the CAP_SYS_PTRACE capability in the user
        //         namespace of the target.
        let target_creds = target.real_creds();
        if !(target_creds.uid == uid
            && target_creds.euid == uid
            && target_creds.saved_uid == uid
            && target_creds.gid == gid
            && target_creds.egid == gid
            && target_creds.saved_gid == gid)
        {
            security::check_task_capable(self, CAP_SYS_PTRACE)?;
        }

        // (4)  Deny access if the target process "dumpable" attribute has a
        //      value other than 1 (SUID_DUMP_USER; see the discussion of
        //      PR_SET_DUMPABLE in prctl(2)), and the caller does not have
        //      the CAP_SYS_PTRACE capability in the user namespace of the
        //      target process.
        let dumpable = *target.mm()?.dumpable.lock(locked);
        match dumpable {
            DumpPolicy::User => (),
            DumpPolicy::Disable => security::check_task_capable(self, CAP_SYS_PTRACE)?,
        }

        // (5)  The kernel LSM security_ptrace_access_check() interface is
        //      invoked to see if ptrace access is permitted.
        security::ptrace_access_check(self, target, mode)?;

        // (6)  If access has not been denied by any of the preceding steps,
        //      then access is allowed.
        Ok(())
    }
2103
2104    pub fn can_signal(
2105        &self,
2106        target: &Task,
2107        unchecked_signal: UncheckedSignal,
2108    ) -> Result<(), Errno> {
2109        // If both the tasks share a thread group the signal can be sent. This is not documented
2110        // in kill(2) because kill does not support task-level granularity in signal sending.
2111        if self.thread_group == target.thread_group {
2112            return Ok(());
2113        }
2114
2115        let self_creds = self.current_creds();
2116        let target_creds = target.real_creds();
2117        // From https://man7.org/linux/man-pages/man2/kill.2.html:
2118        //
2119        // > For a process to have permission to send a signal, it must either be
2120        // > privileged (under Linux: have the CAP_KILL capability in the user
2121        // > namespace of the target process), or the real or effective user ID of
2122        // > the sending process must equal the real or saved set- user-ID of the
2123        // > target process.
2124        //
2125        // Returns true if the credentials are considered to have the same user ID.
2126        if self_creds.euid == target_creds.saved_uid
2127            || self_creds.euid == target_creds.uid
2128            || self_creds.uid == target_creds.uid
2129            || self_creds.uid == target_creds.saved_uid
2130        {
2131            return Ok(());
2132        }
2133
2134        if Signal::try_from(unchecked_signal) == Ok(SIGCONT) {
2135            let target_session = target.thread_group().read().process_group.session.leader;
2136            let self_session = self.thread_group().read().process_group.session.leader;
2137            if target_session == self_session {
2138                return Ok(());
2139            }
2140        }
2141
2142        security::check_task_capable(self, CAP_KILL)
2143    }
2144}
2145
impl ArchSpecific for CurrentTask {
    fn is_arch32(&self) -> bool {
        // Delegate to the architecture information captured in the thread state.
        self.thread_state.is_arch32()
    }
}
2151
// All memory accesses for the current task delegate to its `MemoryManager` through the
// `unified_*` entry points, passing `self` along as the accessor context. A task without a
// memory manager (`self.mm()` returning `Err`) fails every access with that error.
impl MemoryAccessor for CurrentTask {
    fn read_memory<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.mm()?.unified_read_memory(self, addr, bytes)
    }

    fn read_memory_partial_until_null_byte<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.mm()?.unified_read_memory_partial_until_null_byte(self, addr, bytes)
    }

    fn read_memory_partial<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        self.mm()?.unified_read_memory_partial(self, addr, bytes)
    }

    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
        self.mm()?.unified_write_memory(self, addr, bytes)
    }

    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
        self.mm()?.unified_write_memory_partial(self, addr, bytes)
    }

    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
        self.mm()?.unified_zero(self, addr, length)
    }
}
2189
2190impl TaskMemoryAccessor for CurrentTask {
2191    fn maximum_valid_address(&self) -> Option<UserAddress> {
2192        self.mm().ok().map(|mm| mm.maximum_valid_user_address)
2193    }
2194}
2195
/// The outcome of handling a Zircon exception on behalf of a task.
pub enum ExceptionResult {
    /// The exception was handled and no further action is required.
    Handled,

    /// The exception generated a signal that should be delivered.
    Signal(SignalInfo),
}
2203
/// Splits `path` on '/' into its non-empty components.
///
/// Empty segments produced by leading, trailing, or repeated separators are
/// filtered out, so e.g. "/a//b/" yields ["a", "b"].
fn split_path(path: &FsStr) -> LookupVec<&FsStr> {
    path.split(|c| *c == b'/').filter(|p| !p.is_empty()).map(<&FsStr>::from).collect()
}
2207
#[cfg(test)]
mod tests {
    use crate::testing::spawn_kernel_and_run;
    use starnix_uapi::auth::Credentials;

    // This test will run `override_creds` and check it doesn't crash. This ensures that the
    // delegation to `override_creds_async` is correct.
    #[::fuchsia::test]
    async fn test_override_creds_can_delegate_to_async_version() {
        spawn_kernel_and_run(async move |_, current_task| {
            // Root credentials with a trivial closure: only the round-trip return value matters.
            assert_eq!(current_task.override_creds(Credentials::root(), || 0), 0);
        })
        .await;
    }
}