Skip to main content

starnix_core/task/
current_task.rs

1// Copyright 2023 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::arch::task::handle_hardware_exception;
6use crate::execution::{TaskInfo, create_zircon_process};
7use crate::mm::{DumpPolicy, MemoryAccessor, MemoryAccessorExt, MemoryManager, TaskMemoryAccessor};
8use crate::ptrace::{PtraceCoreState, PtraceEvent, PtraceEventData, PtraceOptions, StopState};
9use crate::security;
10use crate::signals::{SignalDetail, SignalInfo, send_signal_first, send_standard_signal};
11use crate::task::loader::{ResolvedElf, load_executable, resolve_executable};
12use crate::task::waiter::WaiterOptions;
13use crate::task::{
14    CurrentTaskCredentialsWriteGuard, ExitStatus, PageFaultExceptionReport, RobustListHeadPtr,
15    RunState, SeccompFilter, SeccompFilterContainer, SeccompNotifierHandle, SeccompState,
16    SeccompStateValue, Task, TaskFlags, TaskRunningState, ThreadState, Waiter,
17};
18use crate::vfs::{
19    CheckAccessReason, FdFlags, FdNumber, FileHandle, FsContext, FsStr, LookupContext, LookupVec,
20    MAX_SYMLINK_FOLLOWS, NamespaceNode, ResolveBase, SymlinkMode, SymlinkTarget, new_pidfd,
21};
22use fuchsia_rcu::RcuReadGuard;
23use futures::FutureExt;
24use linux_uapi::CLONE_PIDFD;
25use starnix_logging::{CATEGORY_STARNIX, log_error, log_warn, track_file_not_found, track_stub};
26use starnix_registers::{HeapRegs, RegisterStorageEnum};
27use starnix_stack::clean_stack;
28use starnix_sync::{
29    EventWaitGuard, FileOpsCore, LockBefore, LockEqualOrBefore, Locked, MmDumpable,
30    ProcessGroupState, TaskRelease, UninterruptibleLock, Unlocked, WakeReason, assert_lock_level,
31};
32use starnix_syscalls::SyscallResult;
33use starnix_syscalls::decls::Syscall;
34use starnix_task_command::TaskCommand;
35use starnix_types::futex_address::FutexAddress;
36use starnix_types::ownership::{Releasable, release_on_error};
37use starnix_uapi::auth::{
38    CAP_KILL, CAP_SYS_ADMIN, CAP_SYS_PTRACE, Credentials, FsCred, PTRACE_MODE_FSCREDS,
39    PTRACE_MODE_REALCREDS, PtraceAccessMode,
40};
41use starnix_uapi::device_id::DeviceId;
42use starnix_uapi::errors::Errno;
43use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
44use starnix_uapi::open_flags::OpenFlags;
45use starnix_uapi::signals::{
46    SIGCHLD, SIGCONT, SIGILL, SIGKILL, SIGSEGV, SIGSYS, SIGTRAP, SigSet, Signal, UncheckedSignal,
47};
48use starnix_uapi::user_address::{ArchSpecific, UserAddress, UserRef};
49use starnix_uapi::vfs::ResolveFlags;
50use starnix_uapi::{
51    CLONE_CHILD_CLEARTID, CLONE_CHILD_SETTID, CLONE_CLEAR_SIGHAND, CLONE_FILES, CLONE_FS,
52    CLONE_INTO_CGROUP, CLONE_NEWUTS, CLONE_PARENT, CLONE_PARENT_SETTID, CLONE_PTRACE, CLONE_SETTLS,
53    CLONE_SIGHAND, CLONE_SYSVSEM, CLONE_THREAD, CLONE_VFORK, CLONE_VM, FUTEX_OWNER_DIED,
54    FUTEX_TID_MASK, ROBUST_LIST_LIMIT, SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_NEW_LISTENER,
55    SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_TSYNC_ESRCH, clone_args, errno, error, pid_t,
56    sock_filter, ucred,
57};
58use std::cell::{Ref, RefCell};
59use std::collections::VecDeque;
60use std::ffi::CString;
61use std::fmt;
62use std::marker::PhantomData;
63use std::mem::MaybeUninit;
64use std::sync::{Arc, Weak};
65use zx::sys::zx_restricted_state_t;
66
67use super::ThreadGroupLifecycleWaitValue;
68
/// Pairs a [`Task`] with a [`ThreadState`] whose registers are stored as `HeapRegs`.
///
/// A `TaskBuilder` can be converted into a [`CurrentTask`] through the `From<TaskBuilder>`
/// implementation, or torn down through `release`.
pub struct TaskBuilder {
    /// The underlying task object.
    pub task: Arc<Task>,

    /// Thread state carried alongside the task. It is converted with `.into()` whenever a
    /// `CurrentTask` is built from this builder.
    pub thread_state: ThreadState<HeapRegs>,
}
75
76impl TaskBuilder {
77    pub fn new(task: Arc<Task>) -> Self {
78        Self { task, thread_state: Default::default() }
79    }
80
81    #[inline(always)]
82    pub fn release<L>(self, locked: &mut Locked<L>)
83    where
84        L: LockBefore<TaskRelease>,
85    {
86        let locked = locked.cast_locked::<TaskRelease>();
87        Releasable::release(self, locked);
88    }
89}
90
91impl From<TaskBuilder> for CurrentTask {
92    fn from(builder: TaskBuilder) -> Self {
93        Self::new(builder.task, builder.thread_state.into())
94    }
95}
96
97impl Releasable for TaskBuilder {
98    type Context<'a> = &'a mut Locked<TaskRelease>;
99
100    fn release<'a>(self, locked: Self::Context<'a>) {
101        // Build a temporary CurrentTask to run release actions that require ThreadState.
102        let current_task = CurrentTask::new(self.task, self.thread_state.into());
103        current_task.exit(locked);
104    }
105}
106
107impl std::ops::Deref for TaskBuilder {
108    type Target = Task;
109    fn deref(&self) -> &Self::Target {
110        &self.task
111    }
112}
113
/// The task object associated with the currently executing thread.
///
/// We often pass the `CurrentTask` as the first argument to functions if those functions need to
/// know contextual information about the thread on which they are running. For example, we often
/// use the `CurrentTask` to perform access checks, which ensures that the caller is authorized to
/// perform the requested operation.
///
/// The `CurrentTask` also has state that can be referenced only on the currently executing thread,
/// such as the register state for that thread. Syscalls are given a mutable reference to the
/// `CurrentTask`, which lets them manipulate this state.
///
/// See also `Task` for more information about tasks.
pub struct CurrentTask {
    /// The underlying task object.
    pub task: Arc<Task>,

    /// Per-thread state (including the register storage) that is only accessible through the
    /// `CurrentTask`.
    pub thread_state: ThreadState<RegisterStorageEnum>,

    /// The current subjective credentials of the task.
    // TODO(https://fxbug.dev/433548348): Avoid interior mutability here by passing a
    // &mut CurrentTask around instead of &CurrentTask.
    pub current_creds: RefCell<CurrentCreds>,

    /// State owned by the `security` module for the current task. It is default-initialized
    /// by `CurrentTask::new`.
    pub security_state: security::CurrentTaskState,

    /// Makes `CurrentTask` neither `Sync` nor `Send`.
    _local_marker: PhantomData<*mut u8>,
}
142
/// Represents the current state of the task's subjective credentials.
///
/// Both variants carry the credentials that `CurrentTask::current_creds()` hands out; the
/// variant only records whether they are a temporary override.
pub enum CurrentCreds {
    /// The task does not have overridden credentials, the subjective creds are identical to the
    /// objective creds stored in the Task. Since credentials are often accessed from the current
    /// task, we hold a reference here that does not necessitate going through the RCU machinery to
    /// read.
    ///
    /// This is the state installed by `CurrentTask::new`.
    Cached(Arc<Credentials>),
    /// The task has overridden subjective credentials.
    ///
    /// This is the state installed by `CurrentTask::override_creds_async` for the duration of
    /// its callback.
    Overridden(Arc<Credentials>),
}
153
154impl CurrentCreds {
155    fn creds(&self) -> &Arc<Credentials> {
156        match self {
157            CurrentCreds::Cached(creds) => creds,
158            CurrentCreds::Overridden(creds) => creds,
159        }
160    }
161}
162
163impl Releasable for CurrentTask {
164    type Context<'a> = &'a mut Locked<TaskRelease>;
165
166    fn release<'a>(self, locked: Self::Context<'a>) {
167        self.exit(locked);
168    }
169}
170
171impl std::ops::Deref for CurrentTask {
172    type Target = Task;
173    fn deref(&self) -> &Self::Target {
174        &self.task
175    }
176}
177
/// A `CurrentTask` is formatted by delegating to the `fmt` of the wrapped task, so it prints
/// the same way the underlying task does.
impl fmt::Debug for CurrentTask {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.task.fmt(f)
    }
}
183
184impl CurrentTask {
185    pub fn new(task: Arc<Task>, thread_state: ThreadState<RegisterStorageEnum>) -> Self {
186        let current_creds = RefCell::new(CurrentCreds::Cached(task.clone_creds()));
187        Self {
188            task,
189            thread_state,
190            current_creds,
191            security_state: Default::default(),
192            _local_marker: Default::default(),
193        }
194    }
195
    /// Exit the task by dropping its running state.
    ///
    /// The steps run in a fixed order: robust-list / child-tid / vfork notifications, release
    /// of the running state, delayed releasers, removal from the thread group and finally the
    /// ptrace disconnect. `locked` is the `TaskRelease` lock context required by several of
    /// those steps.
    pub fn exit(&self, locked: &mut Locked<TaskRelease>) {
        // When this method returns, the following invariants must be met:
        // 1. No new references to running `Task` state must be obtainable.
        // 2. All externally-visible `Task` state must reflect that the `Task` has exited.
        // 3. All observers of `Task` exit events must be notified.

        self.notify_robust_list();
        // The result of clearing the child tid is deliberately discarded: exit proceeds
        // regardless of whether that step succeeded.
        let _ignored = self.clear_child_tid_if_needed(locked);

        self.signal_vfork();

        // Drop fields that can end up owning a FsNode to ensure no FsNode are owned by this task.
        // The running state may already be gone, in which case there is nothing to release.
        if let Ok(running_state) = self.task.running_state() {
            running_state.files.release();
            running_state.mm.update(None);
        }
        // Clear the running state itself so it can no longer be obtained from the task.
        self.running_state.update(None);

        self.trigger_delayed_releaser(locked);

        // We remove from the thread group here because the Weak in the pid
        // table to this task must be valid until this task is removed from the
        // thread group, and the code below will invalidate it.
        // Moreover, this requires an Arc of the task to ensure the tasks of
        // the thread group are always valid.
        self.task.thread_group().remove(locked, self.kernel().pids.write(), &self.task);

        self.ptrace_disconnect();
    }
226
227    /// Returns the [`TaskRunningState`] for the [`Task`].
228    ///
229    /// # Panics
230    ///
231    /// Calling `running_state()` on a [`CurrentTask`] for which the [`Task`] has no running state
232    /// (i.e. exited tasks) panics. However, such tasks should not have a `CurrentTask`.
233    #[track_caller]
234    pub fn running_state(&self) -> RcuReadGuard<TaskRunningState> {
235        self.task.running_state().expect("CurrentTask must have TaskRunningState")
236    }
237
238    pub fn fs(&self) -> Arc<FsContext> {
239        self.running_state().fs()
240    }
241
242    pub fn has_shared_fs(&self) -> bool {
243        let fs = self.fs();
244        // This check is incorrect because someone else could be holding a temporary Arc to the
245        // FsContext and therefore increasing the strong count.
246        Arc::strong_count(&fs) > 2usize
247    }
248
249    pub fn unshare_fs(&self) {
250        let new_fs = self.fs().fork();
251        self.running_state().fs.update(new_fs);
252    }
253
254    /// Returns the current subjective credentials of the task.
255    ///
256    /// The subjective credentials are the credentials that are used to check permissions for
257    /// actions performed by the task.
258    pub fn current_creds(&self) -> Ref<'_, Arc<Credentials>> {
259        Ref::map(self.current_creds.borrow(), CurrentCreds::creds)
260    }
261
262    pub fn current_fscred(&self) -> FsCred {
263        self.current_creds().as_fscred()
264    }
265
266    pub fn current_ucred(&self) -> ucred {
267        let creds = self.current_creds();
268        ucred { pid: self.get_pid(), uid: creds.uid, gid: creds.gid }
269    }
270
271    /// Save the current creds and security state, alter them by calling `alter_creds`, then call
272    /// `callback`.
273    /// The creds and security state will be restored to their original values at the end of the
274    /// call. Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and
275    ///  used to check permissions for actions performed by the task, is altered. The "objective"
276    ///  state, accessed through `Task::real_creds()` by other tasks and used to check permissions
277    /// for actions performed on the task, is not altered, and changes to the credentials are not
278    /// externally visible.
279    pub async fn override_creds_async<R>(
280        &self,
281        new_creds: Arc<Credentials>,
282        callback: impl AsyncFnOnce() -> R,
283    ) -> R {
284        let saved = self.current_creds.replace(CurrentCreds::Overridden(new_creds));
285        let result = callback().await;
286        self.current_creds.replace(saved);
287        result
288    }
289
290    /// Save the current creds and security state, alter them by calling `alter_creds`, then call
291    /// `callback`.
292    /// The creds and security state will be restored to their original values at the end of the
293    /// call. Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and
294    ///  used to check permissions for actions performed by the task, is altered. The "objective"
295    ///  state, accessed through `Task::real_creds()` by other tasks and used to check permissions
296    /// for actions performed on the task, is not altered, and changes to the credentials are not
297    /// externally visible.
298    pub fn override_creds<R>(
299        &self,
300        new_creds: Arc<Credentials>,
301        callback: impl FnOnce() -> R,
302    ) -> R {
303        self.override_creds_async(new_creds, async move || callback())
304            .now_or_never()
305            .expect("Future should be ready")
306    }
307
308    pub fn has_overridden_creds(&self) -> bool {
309        matches!(*self.current_creds.borrow(), CurrentCreds::Overridden(_))
310    }
311
312    pub fn trigger_delayed_releaser<L>(&self, locked: &mut Locked<L>)
313    where
314        L: LockEqualOrBefore<FileOpsCore>,
315    {
316        let locked = locked.cast_locked::<FileOpsCore>();
317        self.kernel().delayed_releaser.apply(locked, self);
318    }
319
320    pub fn weak_task(&self) -> Weak<Task> {
321        Arc::downgrade(&self.task)
322    }
323
324    /// Locks the `CurrentTask`'s credentials for writing, allowing readers to coordinate by using
325    /// `Task::lock_creds()` where necessary.  e.g. This is used to avoid ptrace attachment racing
326    /// with critical security checks affecting the task's `Credentials` during `exec()`.
327    pub fn write_creds(&self) -> CurrentTaskCredentialsWriteGuard {
328        assert!(!self.has_overridden_creds());
329        self.persistent_info.write_current_task_creds()
330    }
331
332    /// Change the current and real creds of the task. This is invalid to call while temporary
333    /// credentials are present.
334    pub fn set_creds(&self, creds: Credentials) {
335        let creds = Arc::new(creds);
336        self.write_creds().update(self, creds);
337    }
338
339    #[inline(always)]
340    pub fn release<L>(self, locked: &mut Locked<L>)
341    where
342        L: LockBefore<TaskRelease>,
343    {
344        let locked = locked.cast_locked::<TaskRelease>();
345        Releasable::release(self, locked);
346    }
347
348    pub fn set_syscall_restart_func<R: Into<SyscallResult>>(
349        &mut self,
350        f: impl FnOnce(&mut Locked<Unlocked>, &mut CurrentTask) -> Result<R, Errno>
351        + Send
352        + Sync
353        + 'static,
354    ) {
355        self.thread_state.syscall_restart_func =
356            Some(Box::new(|locked, current_task| Ok(f(locked, current_task)?.into())));
357    }
358
359    pub fn add_file<L>(
360        &self,
361        locked: &mut Locked<L>,
362        file: FileHandle,
363        flags: FdFlags,
364    ) -> Result<FdNumber, Errno>
365    where
366        L: LockEqualOrBefore<FileOpsCore>,
367    {
368        self.running_state().files.add(locked, self, file, flags)
369    }
370
371    pub fn get_file(&self, fd: FdNumber) -> Result<FileHandle, Errno> {
372        self.running_state().files.get(fd)
373    }
374
375    pub fn get_file_allowing_opath(&self, fd: FdNumber) -> Result<FileHandle, Errno> {
376        self.running_state().files.get_allowing_opath(fd)
377    }
378
379    /// Sets the task's signal mask to `signal_mask` and runs `wait_function`.
380    ///
381    /// Signals are dequeued prior to the original signal mask being restored. This is done by the
382    /// signal machinery in the syscall dispatch loop.
383    ///
384    /// The returned result is the result returned from the wait function.
385    pub fn wait_with_temporary_mask<F, T, L>(
386        &mut self,
387        locked: &mut Locked<L>,
388        signal_mask: SigSet,
389        wait_function: F,
390    ) -> Result<T, Errno>
391    where
392        L: LockEqualOrBefore<FileOpsCore>,
393        F: FnOnce(&mut Locked<L>, &CurrentTask) -> Result<T, Errno>,
394    {
395        {
396            let mut state = self.write();
397            state.set_flags(TaskFlags::TEMPORARY_SIGNAL_MASK, true);
398            state.set_temporary_signal_mask(signal_mask);
399        }
400        wait_function(locked, self)
401    }
402
    /// If waking, promotes from waking to awake.  If not waking, make waiter async
    /// wait until woken.  Returns true if woken.
    ///
    /// When `false` is returned, `waiter` may have been registered on the thread group's
    /// lifecycle waiters and/or on the ptracer, depending on which branch below applied.
    pub fn wake_or_wait_until_unstopped_async(&self, waiter: &Waiter) -> bool {
        // The thread group read lock is taken before the task write lock; both are held while
        // the stop states are inspected.
        let group_state = self.thread_group().read();
        let mut task_state = self.write();

        // Wake up if
        //   a) we should wake up, meaning:
        //      i) we're in group stop, and the thread group has exited group stop, or
        //      ii) we're waking up,
        //   b) and ptrace isn't stopping us from waking up, but
        //   c) always wake up if we got a SIGKILL.
        let task_stop_state = self.load_stopped();
        let group_stop_state = self.thread_group().load_stopped();
        if ((task_stop_state == StopState::GroupStopped && group_stop_state.is_waking_or_awake())
            || task_stop_state.is_waking_or_awake())
            && (!task_state.is_ptrace_listening() || task_stop_state.is_force())
        {
            // Prefer finalizing the task's own state when it is already waking; otherwise
            // follow the thread group's state.
            let new_state = if task_stop_state.is_waking_or_awake() {
                task_stop_state.finalize()
            } else {
                group_stop_state.finalize()
            };
            // If the state cannot be finalized, fall through to the waiting path below.
            if let Ok(new_state) = new_state {
                task_state.set_stopped(new_state, None, Some(self), None);
                // Both guards are released before updating the thread group's stop state.
                drop(group_state);
                drop(task_state);
                // It is possible for the stop state to be changed by another
                // thread between when it is checked above and the following
                // invocation, but set_stopped does sufficient checking while
                // holding the lock to make sure that such a change won't result
                // in corrupted state.
                self.thread_group().set_stopped(new_state, None, false);
                return true;
            }
        }

        // We will wait.
        if self.thread_group().load_stopped().is_stopped() || task_stop_state.is_stopped() {
            // If we've stopped or PTRACE_LISTEN has been sent, wait for a
            // signal or instructions from the tracer.
            group_state
                .lifecycle_waiters
                .wait_async_value(&waiter, ThreadGroupLifecycleWaitValue::Stopped);
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.can_accept_ptrace_commands() {
            // If we're stopped because a tracer has seen the stop and not taken
            // further action, wait for further instructions from the tracer.
            task_state.wait_on_ptracer(&waiter);
        } else if task_state.is_ptrace_listening() {
            // A PTRACE_LISTEN is a state where we can get signals and notify a
            // ptracer, but otherwise remain blocked.
            if let Some(ptrace) = &mut task_state.ptrace {
                ptrace.set_last_signal(Some(SignalInfo::kernel(SIGTRAP)));
                ptrace.set_last_event(Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0)));
            }
            task_state.wait_on_ptracer(&waiter);
            task_state.notify_ptracers();
        }
        false
    }
464
    /// Set the RunState for the current task to the given value and then call the given callback.
    ///
    /// When the callback is done, the run_state is restored to `RunState::Running`.
    ///
    /// This function is typically used just before blocking the current task on some operation.
    /// The given `run_state` registers the mechanism for interrupting the blocking operation with
    /// the task and the given `callback` actually blocks the task.
    ///
    /// This function can only be called in the `RunState::Running` state and cannot set the
    /// run state to `RunState::Running`. For this reason, this function cannot be reentered.
    ///
    /// # Errors
    ///
    /// Returns `EINTR` without invoking `callback` when:
    /// - `run_state` is `RunState::Frozen(_)` and a `SIGKILL` is pending, or
    /// - `run_state` is any other state, a signal is pending and the task is not in
    ///   `PTRACE_LISTEN`.
    ///
    /// Otherwise the result of `callback` is returned unchanged.
    ///
    /// # Panics
    ///
    /// Panics if `run_state` is `RunState::Running`, if the task is already blocked, or if the
    /// run state observed after `callback` returns differs from `run_state`.
    pub fn run_in_state<F, T>(&self, run_state: RunState, callback: F) -> Result<T, Errno>
    where
        F: FnOnce() -> Result<T, Errno>,
    {
        assert_ne!(run_state, RunState::Running);

        // Check we do not hold any uninterruptible lock
        assert_lock_level::<UninterruptibleLock>();
        // As an optimization, decommit unused pages of the stack to reduce memory pressure while
        // the thread is blocked.
        clean_stack();

        // The task state lock is only held while the run state is published; it is released
        // before `callback` runs.
        {
            let mut state = self.write();
            assert!(!state.is_blocked());

            if matches!(run_state, RunState::Frozen(_)) {
                // Freeze is a kernel signal and is handled before other user signals. A frozen task
                // ignores all other signals except SIGKILL until it is thawed.
                if state.has_signal_pending(SIGKILL) {
                    return error!(EINTR);
                }
            } else if state.is_any_signal_pending() && !state.is_ptrace_listening() {
                // A note on PTRACE_LISTEN - the thread cannot be scheduled
                // regardless of pending signals.
                return error!(EINTR);
            }
            state.set_run_state(run_state.clone());
        }

        // Lockup tracking is paused for as long as this guard is alive, i.e. while the
        // callback runs and until this function returns.
        let _waiting_guard = crate::task::ThreadLockupDetector::pause_tracking();
        let result = callback();

        // Restore `RunState::Running`, checking that nobody replaced the run state while the
        // callback was executing.
        {
            let mut state = self.write();
            assert_eq!(
                state.run_state(),
                run_state,
                "SignalState run state changed while waiting!"
            );
            state.set_run_state(RunState::Running);
        };

        result
    }
520
521    pub fn block_until(
522        &self,
523        guard: EventWaitGuard<'_>,
524        deadline: zx::MonotonicInstant,
525    ) -> Result<(), Errno> {
526        self.run_in_state(RunState::Event(guard.event().clone()), move || {
527            guard.block_until(None, deadline).map_err(|e| match e {
528                WakeReason::Interrupted => errno!(EINTR),
529                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
530            })
531        })
532    }
533
534    pub fn block_with_owner_until(
535        &self,
536        guard: EventWaitGuard<'_>,
537        new_owner: &zx::Thread,
538        deadline: zx::MonotonicInstant,
539    ) -> Result<(), Errno> {
540        self.run_in_state(RunState::Event(guard.event().clone()), move || {
541            guard.block_until(Some(new_owner), deadline).map_err(|e| match e {
542                WakeReason::Interrupted => errno!(EINTR),
543                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
544            })
545        })
546    }
547
548    /// Determine namespace node indicated by the dir_fd.
549    ///
550    /// Returns the namespace node and the path to use relative to that node.
551    pub fn resolve_dir_fd<'a, L>(
552        &self,
553        locked: &mut Locked<L>,
554        dir_fd: FdNumber,
555        mut path: &'a FsStr,
556        flags: ResolveFlags,
557    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
558    where
559        L: LockEqualOrBefore<FileOpsCore>,
560    {
561        let path_is_absolute = path.starts_with(b"/");
562        if path_is_absolute {
563            if flags.contains(ResolveFlags::BENEATH) {
564                return error!(EXDEV);
565            }
566            path = &path[1..];
567        }
568
569        let dir = if path_is_absolute && !flags.contains(ResolveFlags::IN_ROOT) {
570            self.fs().root()
571        } else if dir_fd == FdNumber::AT_FDCWD {
572            self.fs().cwd()
573        } else {
574            // O_PATH allowed for:
575            //
576            //   Passing the file descriptor as the dirfd argument of
577            //   openat() and the other "*at()" system calls.  This
578            //   includes linkat(2) with AT_EMPTY_PATH (or via procfs
579            //   using AT_SYMLINK_FOLLOW) even if the file is not a
580            //   directory.
581            //
582            // See https://man7.org/linux/man-pages/man2/open.2.html
583            let file = self.get_file_allowing_opath(dir_fd)?;
584            file.name.to_passive()
585        };
586
587        if !path.is_empty() {
588            if !dir.entry.node.is_dir() {
589                return error!(ENOTDIR);
590            }
591            dir.check_access(
592                locked,
593                self,
594                Access::EXEC,
595                CheckAccessReason::InternalPermissionChecks,
596            )?;
597        }
598        Ok((dir, path.into()))
599    }
600
601    /// A convenient wrapper for opening files relative to FdNumber::AT_FDCWD.
602    ///
603    /// Returns a FileHandle but does not install the FileHandle in the FdTable
604    /// for this task.
605    pub fn open_file(
606        &self,
607        locked: &mut Locked<Unlocked>,
608        path: &FsStr,
609        flags: OpenFlags,
610    ) -> Result<FileHandle, Errno> {
611        if flags.contains(OpenFlags::CREAT) {
612            // In order to support OpenFlags::CREAT we would need to take a
613            // FileMode argument.
614            return error!(EINVAL);
615        }
616        self.open_file_at(
617            locked,
618            FdNumber::AT_FDCWD,
619            path,
620            flags,
621            FileMode::default(),
622            ResolveFlags::empty(),
623            AccessCheck::default(),
624        )
625    }
626
627    /// Resolves a path for open.
628    ///
629    /// If the final path component points to a symlink, the symlink is followed (as long as
630    /// the symlink traversal limit has not been reached).
631    ///
632    /// If the final path component (after following any symlinks, if enabled) does not exist,
633    /// and `flags` contains `OpenFlags::CREAT`, a new node is created at the location of the
634    /// final path component.
635    ///
636    /// This returns the resolved node, and a boolean indicating whether the node has been created.
637    fn resolve_open_path<L>(
638        &self,
639        locked: &mut Locked<L>,
640        context: &mut LookupContext,
641        dir: &NamespaceNode,
642        path: &FsStr,
643        mode: FileMode,
644        flags: OpenFlags,
645    ) -> Result<(NamespaceNode, bool), Errno>
646    where
647        L: LockEqualOrBefore<FileOpsCore>,
648    {
649        context.update_for_path(path);
650        let mut parent_content = context.with(SymlinkMode::Follow);
651        let (parent, basename) = self.lookup_parent(locked, &mut parent_content, dir, path)?;
652        context.remaining_follows = parent_content.remaining_follows;
653
654        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);
655
656        // Lookup the child, without following a symlink or expecting it to be a directory.
657        let mut child_context = context.with(SymlinkMode::NoFollow);
658        child_context.must_be_directory = false;
659
660        match parent.lookup_child(locked, self, &mut child_context, basename) {
661            Ok(name) => {
662                if name.entry.node.is_lnk() {
663                    if flags.contains(OpenFlags::PATH)
664                        && context.symlink_mode == SymlinkMode::NoFollow
665                    {
666                        // When O_PATH is specified in flags, if pathname is a symbolic link
667                        // and the O_NOFOLLOW flag is also specified, then the call returns
668                        // a file descriptor referring to the symbolic link.
669                        // See https://man7.org/linux/man-pages/man2/openat.2.html
670                        //
671                        // If the trailing component (i.e., basename) of
672                        // pathname is a symbolic link, how.resolve contains
673                        // RESOLVE_NO_SYMLINKS, and how.flags contains both
674                        // O_PATH and O_NOFOLLOW, then an O_PATH file
675                        // descriptor referencing the symbolic link will be
676                        // returned.
677                        // See https://man7.org/linux/man-pages/man2/openat2.2.html
678                        return Ok((name, false));
679                    }
680
681                    if (!flags.contains(OpenFlags::PATH)
682                        && context.symlink_mode == SymlinkMode::NoFollow)
683                        || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
684                        || context.remaining_follows == 0
685                    {
686                        if must_create {
687                            // Since `must_create` is set, and a node was found, this returns EEXIST
688                            // instead of ELOOP.
689                            return error!(EEXIST);
690                        }
691                        // A symlink was found, but one of the following is true:
692                        // * flags specified O_NOFOLLOW but not O_PATH.
693                        // * how.resolve contains RESOLVE_NO_SYMLINKS
694                        // * too many symlink traversals have been attempted
695                        return error!(ELOOP);
696                    }
697
698                    context.remaining_follows -= 1;
699                    match name.readlink(locked, self)? {
700                        SymlinkTarget::Path(path) => {
701                            let dir = if path[0] == b'/' { self.fs().root() } else { parent };
702                            self.resolve_open_path(
703                                locked,
704                                context,
705                                &dir,
706                                path.as_ref(),
707                                mode,
708                                flags,
709                            )
710                        }
711                        SymlinkTarget::Node(name) => {
712                            if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS)
713                                || name.entry.node.is_lnk()
714                            {
715                                error!(ELOOP)
716                            } else {
717                                Ok((name, false))
718                            }
719                        }
720                    }
721                } else {
722                    if must_create {
723                        return error!(EEXIST);
724                    }
725                    Ok((name, false))
726                }
727            }
728            Err(e) if e == errno!(ENOENT) && flags.contains(OpenFlags::CREAT) => {
729                if context.must_be_directory {
730                    return error!(EISDIR);
731                }
732                Ok((
733                    parent.open_create_node(
734                        locked,
735                        self,
736                        basename,
737                        mode.with_type(FileMode::IFREG),
738                        DeviceId::NONE,
739                        flags,
740                    )?,
741                    true,
742                ))
743            }
744            Err(e) => Err(e),
745        }
746    }
747
748    /// The primary entry point for opening files relative to a task.
749    ///
750    /// Absolute paths are resolve relative to the root of the FsContext for
751    /// this task. Relative paths are resolve relative to dir_fd. To resolve
752    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
753    /// dir_fd.
754    ///
755    /// Returns a FileHandle but does not install the FileHandle in the FdTable
756    /// for this task.
757    pub fn open_file_at(
758        &self,
759        locked: &mut Locked<Unlocked>,
760        dir_fd: FdNumber,
761        path: &FsStr,
762        flags: OpenFlags,
763        mode: FileMode,
764        resolve_flags: ResolveFlags,
765        access_check: AccessCheck,
766    ) -> Result<FileHandle, Errno> {
767        if path.is_empty() {
768            return error!(ENOENT);
769        }
770
771        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, resolve_flags)?;
772        self.open_namespace_node_at(locked, dir, path, flags, mode, resolve_flags, access_check)
773    }
774
775    pub fn open_namespace_node_at(
776        &self,
777        locked: &mut Locked<Unlocked>,
778        dir: NamespaceNode,
779        path: &FsStr,
780        flags: OpenFlags,
781        mode: FileMode,
782        mut resolve_flags: ResolveFlags,
783        access_check: AccessCheck,
784    ) -> Result<FileHandle, Errno> {
785        // 64-bit kernels force the O_LARGEFILE flag to be on.
786        let mut flags = flags | OpenFlags::LARGEFILE;
787        let opath = flags.contains(OpenFlags::PATH);
788        if opath {
789            // When O_PATH is specified in flags, flag bits other than O_CLOEXEC,
790            // O_DIRECTORY, and O_NOFOLLOW are ignored.
791            const ALLOWED_FLAGS: OpenFlags = OpenFlags::from_bits_truncate(
792                OpenFlags::PATH.bits()
793                    | OpenFlags::CLOEXEC.bits()
794                    | OpenFlags::DIRECTORY.bits()
795                    | OpenFlags::NOFOLLOW.bits(),
796            );
797            flags &= ALLOWED_FLAGS;
798        }
799
800        if flags.contains(OpenFlags::TMPFILE) && !flags.can_write() {
801            return error!(EINVAL);
802        }
803
804        let nofollow = flags.contains(OpenFlags::NOFOLLOW);
805        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);
806
807        let symlink_mode =
808            if nofollow || must_create { SymlinkMode::NoFollow } else { SymlinkMode::Follow };
809
810        let resolve_base = match (
811            resolve_flags.contains(ResolveFlags::BENEATH),
812            resolve_flags.contains(ResolveFlags::IN_ROOT),
813        ) {
814            (false, false) => ResolveBase::None,
815            (true, false) => ResolveBase::Beneath(dir.clone()),
816            (false, true) => ResolveBase::InRoot(dir.clone()),
817            (true, true) => return error!(EINVAL),
818        };
819
820        // `RESOLVE_BENEATH` and `RESOLVE_IN_ROOT` imply `RESOLVE_NO_MAGICLINKS`. This matches
821        // Linux behavior. Strictly speaking it's is not really required, but it's hard to
822        // implement `BENEATH` and `IN_ROOT` flags correctly otherwise.
823        if resolve_base != ResolveBase::None {
824            resolve_flags.insert(ResolveFlags::NO_MAGICLINKS);
825        }
826
827        let mut context = LookupContext {
828            symlink_mode,
829            remaining_follows: MAX_SYMLINK_FOLLOWS,
830            must_be_directory: flags.contains(OpenFlags::DIRECTORY),
831            resolve_flags,
832            resolve_base,
833        };
834        let (name, created) =
835            match self.resolve_open_path(locked, &mut context, &dir, path, mode, flags) {
836                Ok((n, c)) => (n, c),
837                Err(e) => {
838                    let mut abs_path = dir.path(&self.fs());
839                    abs_path.extend(&**path);
840                    track_file_not_found(abs_path);
841                    return Err(e);
842                }
843            };
844
845        let name = if flags.contains(OpenFlags::TMPFILE) {
846            // `O_TMPFILE` is incompatible with `O_CREAT`
847            if flags.contains(OpenFlags::CREAT) {
848                return error!(EINVAL);
849            }
850            name.create_tmpfile(locked, self, mode.with_type(FileMode::IFREG), flags)?
851        } else {
852            let mode = name.entry.node.info().mode;
853
854            // These checks are not needed in the `O_TMPFILE` case because `mode` refers to the
855            // file we are opening. With `O_TMPFILE`, that file is the regular file we just
856            // created rather than the node we found by resolving the path.
857            //
858            // For example, we do not need to produce `ENOTDIR` when `must_be_directory` is set
859            // because `must_be_directory` refers to the node we found by resolving the path.
860            // If that node was not a directory, then `create_tmpfile` will produce an error.
861            //
862            // Similarly, we never need to call `truncate` because `O_TMPFILE` is newly created
863            // and therefor already an empty file.
864
865            if !opath && nofollow && mode.is_lnk() {
866                return error!(ELOOP);
867            }
868
869            if mode.is_dir() {
870                if flags.can_write()
871                    || flags.contains(OpenFlags::CREAT)
872                    || flags.contains(OpenFlags::TRUNC)
873                {
874                    return error!(EISDIR);
875                }
876                if flags.contains(OpenFlags::DIRECT) {
877                    return error!(EINVAL);
878                }
879            } else if context.must_be_directory {
880                return error!(ENOTDIR);
881            }
882
883            if flags.contains(OpenFlags::TRUNC) && mode.is_reg() && !created {
884                // You might think we should check file.can_write() at this
885                // point, which is what the docs suggest, but apparently we
886                // are supposed to truncate the file if this task can write
887                // to the underlying node, even if we are opening the file
888                // as read-only. See OpenTest.CanTruncateReadOnly.
889                name.truncate(locked, self, 0)?;
890            }
891
892            name
893        };
894
895        // If the node has been created, the open operation should not verify access right:
896        // From <https://man7.org/linux/man-pages/man2/open.2.html>
897        //
898        // > Note that mode applies only to future accesses of the newly created file; the
899        // > open() call that creates a read-only file may well return a  read/write  file
900        // > descriptor.
901        let access_check = if created { AccessCheck::skip() } else { access_check };
902        let file = name.open(locked, self, flags, access_check)?;
903
904        // If the new `FileHandle` represents an open file (rather than a handle to a location in
905        // the virtual file system, as created with `O_PATH`), then LSM permission checks may be
906        // required.
907        if !opath {
908            security::file_open(self, &file)?;
909        }
910
911        Ok(file)
912    }
913
914    /// A wrapper for FsContext::lookup_parent_at that resolves the given
915    /// dir_fd to a NamespaceNode.
916    ///
917    /// Absolute paths are resolve relative to the root of the FsContext for
918    /// this task. Relative paths are resolve relative to dir_fd. To resolve
919    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
920    /// dir_fd.
921    pub fn lookup_parent_at<'a, L>(
922        &self,
923        locked: &mut Locked<L>,
924        context: &mut LookupContext,
925        dir_fd: FdNumber,
926        path: &'a FsStr,
927    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
928    where
929        L: LockEqualOrBefore<FileOpsCore>,
930    {
931        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, ResolveFlags::empty())?;
932        self.lookup_parent(locked, context, &dir, path)
933    }
934
935    /// Lookup the parent of a namespace node.
936    ///
937    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
938    /// calling this function directly.
939    ///
940    /// This function resolves all but the last component of the given path.
941    /// The function returns the parent directory of the last component as well
942    /// as the last component.
943    ///
944    /// If path is empty, this function returns dir and an empty path.
945    /// Similarly, if path ends with "." or "..", these components will be
946    /// returned along with the parent.
947    ///
948    /// The returned parent might not be a directory.
949    pub fn lookup_parent<'a, L>(
950        &self,
951        locked: &mut Locked<L>,
952        context: &mut LookupContext,
953        dir: &NamespaceNode,
954        path: &'a FsStr,
955    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
956    where
957        L: LockEqualOrBefore<FileOpsCore>,
958    {
959        context.update_for_path(path);
960
961        let components = split_path(path);
962        if components.is_empty() {
963            return Ok((dir.clone(), Default::default()));
964        }
965        let result =
966            dir.lookup_children(locked, self, context, &components[0..components.len() - 1])?;
967        Ok((result, components.last().unwrap()))
968    }
969
970    /// Lookup a namespace node.
971    ///
972    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
973    /// calling this function directly.
974    ///
975    /// This function resolves the component of the given path.
976    pub fn lookup_path<L>(
977        &self,
978        locked: &mut Locked<L>,
979        context: &mut LookupContext,
980        dir: NamespaceNode,
981        path: &FsStr,
982    ) -> Result<NamespaceNode, Errno>
983    where
984        L: LockEqualOrBefore<FileOpsCore>,
985    {
986        let components = split_path(path);
987        dir.lookup_children(locked, self, context, &components)
988    }
989
990    /// Lookup a namespace node starting at the root directory.
991    ///
992    /// Resolves symlinks.
993    pub fn lookup_path_from_root<L>(
994        &self,
995        locked: &mut Locked<L>,
996        path: &FsStr,
997    ) -> Result<NamespaceNode, Errno>
998    where
999        L: LockEqualOrBefore<FileOpsCore>,
1000    {
1001        let mut context = LookupContext::default();
1002        self.lookup_path(locked, &mut context, self.fs().root(), path)
1003    }
1004
1005    pub fn exec(
1006        &mut self,
1007        locked: &mut Locked<Unlocked>,
1008        executable: FileHandle,
1009        path: CString,
1010        argv: Vec<CString>,
1011        environ: Vec<CString>,
1012    ) -> Result<(), Errno> {
1013        // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
1014        //
1015        //   EACCES: The file or a script interpreter is not a regular file.
1016        if !executable.name.entry.node.is_reg() {
1017            return error!(EACCES);
1018        }
1019
1020        // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
1021        //
1022        //   EACCES: Execute permission is denied for the file or a script or
1023        //   ELF interpreter.
1024        executable.name.check_access(locked, self, Access::EXEC, CheckAccessReason::Exec)?;
1025
1026        // Resolve the executable (and any interpreter) into a `ResolvedElf`.
1027        // TODO(https://fxbug.dev/483368940): Split initial resolution from interpreter resolution.
1028        let mut resolved_elf =
1029            resolve_executable(locked, self, executable.clone(), path.clone(), argv, environ)?;
1030
1031        // Serialize against ptrace_attach by holding the credentials write lock.
1032        let writable_creds = self.write_creds();
1033
1034        // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
1035        //
1036        //   The aforementioned transformations of the effective IDs are not
1037        //   performed (i.e., the set-user-ID and set-group-ID bits are
1038        //   ignored) if any of the following is true:
1039        //
1040        //   * the calling thread is being ptraced (see ptrace(2));
1041        //
1042        //   * the calling thread has a non-zero "no-new-privs" attribute
1043        //     (see prctl(2));
1044        let (no_new_privs, is_ptraced) = {
1045            let state = self.read();
1046            (state.no_new_privs(), state.is_ptraced())
1047        };
1048
1049        let enable_suid = self.kernel().features.enable_suid && !no_new_privs && !is_ptraced;
1050        if enable_suid {
1051            resolved_elf.file.name.apply_suid_and_sgid(&mut resolved_elf.creds);
1052        }
1053
1054        resolved_elf.secure_exec |= resolved_elf.creds.exec(&self.current_creds());
1055
1056        // TODO(tbodt): Check whether capability xattrs are set on the file, and grant/limit
1057        // capabilities accordingly.
1058        //
1059        // TODO(https://fxbug.dev/503338788) - Migrate this (and other capabilities wrangling)
1060        // into a `common_cap::bprm_creds_from_file()` implementation.
1061        if no_new_privs {
1062            resolved_elf.creds.cap_permitted &= self.current_creds().cap_permitted;
1063            resolved_elf.creds.cap_effective &= resolved_elf.creds.cap_permitted;
1064        }
1065
1066        // LSM hook: Perform access checks and allow LSM to update credentials.
1067        security::bprm_creds_for_exec(self, &executable.name, &mut resolved_elf)?;
1068
1069        if self.thread_group().read().tasks_count() > 1 {
1070            track_stub!(TODO("https://fxbug.dev/297434895"), "exec on multithread process");
1071            return error!(EINVAL);
1072        }
1073
1074        // Commit the exec. Failures after this point are unrecoverable.
1075        if let Err(err) = self.finish_exec(locked, path, resolved_elf, writable_creds) {
1076            log_warn!("unrecoverable error in exec: {err:?}");
1077
1078            send_standard_signal(locked, self, SignalInfo::forced(SIGSEGV));
1079            return Err(err);
1080        }
1081
1082        self.ptrace_event(locked, PtraceOptions::TRACEEXEC, self.task.tid as u64);
1083        self.signal_vfork();
1084        self.task.thread_group.sync_syscall_log_level();
1085
1086        Ok(())
1087    }
1088
    /// Commits an exec: the second, unrecoverable half of `exec`.
    ///
    /// After the memory is unmapped, any failure in exec is unrecoverable and results in the
    /// process crashing. This function is for that second half; any error returned from this
    /// function will be considered unrecoverable.
    ///
    /// In order, this function notifies the robust futex list, replaces the memory manager,
    /// unshares the file descriptor table and runs its exec handling, resets per-task state
    /// (dumpable policy, sigaltstack, robust list head), installs the new credentials through
    /// `writable_creds`, loads the executable, resets registers and signal actions, resets
    /// exit signals to `SIGCHLD`, and finally updates the command name from `path`.
    ///
    /// `resolved_elf` supplies the file, architecture width, credentials and `secure_exec`
    /// flag used along the way. `writable_creds` is consumed when the new credentials are
    /// installed.
    fn finish_exec(
        &mut self,
        locked: &mut Locked<Unlocked>,
        path: CString,
        resolved_elf: ResolvedElf,
        writable_creds: CurrentTaskCredentialsWriteGuard,
    ) -> Result<(), Errno> {
        // Now that the exec will definitely finish (or crash), notify owners of
        // locked futexes for the current process, which will be impossible to
        // update after process image is replaced.  See get_robust_list(2).
        self.notify_robust_list();

        // Tear down the old address space and create a new one for the resolved ELF.
        let mm = {
            let new_mm = MemoryManager::exec(
                self.thread_group().root_vmar.unowned(),
                self.mm().ok(),
                resolved_elf.file.name.to_passive(),
                resolved_elf.arch_width,
            )?;
            // Publish the new memory manager as this task's current one; a clone is kept in
            // `mm` for the dumpable update below.
            self.running_state().mm.update(Some(new_mm.clone()));
            new_mm
        };
        // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
        //
        //   All threads other than the calling thread are destroyed during an
        //   execve(). Mutual exclusion locks, condition variables, and other
        //   pthreads objects are not preserved.
        //
        // TODO(https://fxbug.dev/42082680): Implement thread destruction.

        // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
        //
        //   POSIX timers (timer_create(2)) are not preserved.
        //
        // TODO: Implement this.

        // TODO: Ensure that the filesystem context is un-shared, undoing the effect of CLONE_FS.

        // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
        //
        //   If the calling process was sharing its file descriptor table (via
        //   the use of CLONE_FILES with clone(2)), then this sharing is undone.
        self.running_state().files.unshare();
        // NOTE(review): presumably this applies exec-time handling to the descriptor table
        // (e.g. close-on-exec) -- confirm against the FdTable implementation.
        self.running_state().files.exec(locked, self);

        {
            // The task state write lock is held for this whole scope, including while the
            // `dumpable` lock of the new memory manager is taken.
            let mut state = self.write();

            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   The process's "dumpable" attribute is set to the value 1,
            //   unless a set-user-ID program, a set-group-ID program, or a
            //   program with capabilities is being executed, in which case the
            //   dumpable flag may instead be reset to the value in
            //   /proc/sys/fs/suid_dumpable, in the circumstances described
            //   under PR_SET_DUMPABLE in prctl(2).
            let dumpable =
                if resolved_elf.secure_exec { DumpPolicy::Disable } else { DumpPolicy::User };
            *mm.dumpable.lock(locked) = dumpable;

            // The alternate signal stack and the robust list head are cleared for the new
            // image.
            state.set_sigaltstack(None);
            state.robust_list_head = RobustListHeadPtr::null(self);
            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
            //
            //   If a set-user-ID or set-group-ID
            //   program is being executed, then the parent death signal set by
            //   prctl(2) PR_SET_PDEATHSIG flag is cleared.
            //
            // TODO(https://fxbug.dev/356684424): Implement the behavior above once we support
            // the PR_SET_PDEATHSIG flag.
        }

        // LSM hook that runs before the new credentials are installed.
        security::bprm_committing_creds(locked, self, &resolved_elf)?;

        // Install the credentials computed for the new image; this consumes the write guard
        // that was passed in.
        let new_creds = Arc::new(resolved_elf.creds.clone());
        writable_creds.update(self, new_creds);

        let start_info = load_executable(self, resolved_elf, &path)?;

        // Start the task from the state described by the loader and drop state carried over
        // from the old image (extended processor state, signal actions).
        let regs: zx_restricted_state_t = start_info.into();
        self.thread_state.registers.load(regs);
        self.thread_state.extended_pstate.reset();
        self.thread_group().signal_actions.reset_for_exec();

        // The exit signal (and that of the children) is reset to SIGCHLD.
        {
            let mut thread_group_state = self.thread_group().write();
            thread_group_state.exit_signal = Some(SIGCHLD);
            for (_, weak_child) in &mut thread_group_state.children {
                // Children that can no longer be upgraded are skipped.
                if let Some(child) = weak_child.upgrade() {
                    let mut child_state = child.write();
                    child_state.exit_signal = Some(SIGCHLD);
                }
            }
        }

        // LSM hook that runs after the new credentials have been installed.
        security::bprm_committed_creds(locked, self)?;

        // Record on the thread group that an exec has taken place.
        self.thread_group().write().did_exec = true;

        // The command name is derived from the bytes of `path`.
        self.set_command_name(TaskCommand::from_path_bytes(path.to_bytes()));

        Ok(())
    }
1197
1198    pub fn set_command_name(&self, new_name: TaskCommand) {
1199        // set_command_name needs to run before leader_command() in cases where self is the leader.
1200        self.task.set_command_name(new_name.clone());
1201        let leader_command = self.thread_group().read().leader_command();
1202        starnix_logging::set_current_task_info(
1203            new_name,
1204            leader_command,
1205            self.thread_group().leader,
1206            self.tid,
1207        );
1208    }
1209
1210    pub fn add_seccomp_filter(
1211        &mut self,
1212        locked: &mut Locked<Unlocked>,
1213        code: Vec<sock_filter>,
1214        flags: u32,
1215    ) -> Result<SyscallResult, Errno> {
1216        let new_filter = Arc::new(SeccompFilter::from_cbpf(
1217            &code,
1218            self.thread_group().next_seccomp_filter_id.add(1),
1219            flags & SECCOMP_FILTER_FLAG_LOG != 0,
1220        )?);
1221
1222        let mut maybe_fd: Option<FdNumber> = None;
1223
1224        if flags & SECCOMP_FILTER_FLAG_NEW_LISTENER != 0 {
1225            maybe_fd = Some(SeccompFilterContainer::create_listener(locked, self)?);
1226        }
1227
1228        // We take the process lock here because we can't change any of the threads
1229        // while doing a tsync.  So, you hold the process lock while making any changes.
1230        let state = self.thread_group().write();
1231
1232        if flags & SECCOMP_FILTER_FLAG_TSYNC != 0 {
1233            // TSYNC synchronizes all filters for all threads in the current process to
1234            // the current thread's
1235
1236            // We collect the filters for the current task upfront to save us acquiring
1237            // the task's lock a lot of times below.
1238            let mut filters: SeccompFilterContainer = self.read().seccomp_filters.clone();
1239
1240            // For TSYNC to work, all of the other thread filters in this process have to
1241            // be a prefix of this thread's filters, and none of them can be in
1242            // strict mode.
1243            let tasks = state.tasks();
1244            for task in &tasks {
1245                if task.tid == self.tid {
1246                    continue;
1247                }
1248                let other_task_state = task.read();
1249
1250                // Target threads cannot be in SECCOMP_MODE_STRICT
1251                if task.seccomp_filter_state.get() == SeccompStateValue::Strict {
1252                    return Self::seccomp_tsync_error(task.tid, flags);
1253                }
1254
1255                // Target threads' filters must be a subsequence of this thread's
1256                if !other_task_state.seccomp_filters.can_sync_to(&filters) {
1257                    return Self::seccomp_tsync_error(task.tid, flags);
1258                }
1259            }
1260
1261            // Now that we're sure we're allowed to do so, add the filter to all threads.
1262            filters.add_filter(new_filter, code.len() as u16)?;
1263
1264            for task in &tasks {
1265                let mut other_task_state = task.write();
1266
1267                other_task_state.enable_no_new_privs();
1268                other_task_state.seccomp_filters = filters.clone();
1269                task.set_seccomp_state(SeccompStateValue::UserDefined)?;
1270            }
1271        } else {
1272            let mut task_state = self.task.write();
1273
1274            task_state.seccomp_filters.add_filter(new_filter, code.len() as u16)?;
1275            self.set_seccomp_state(SeccompStateValue::UserDefined)?;
1276        }
1277
1278        if let Some(fd) = maybe_fd { Ok(fd.into()) } else { Ok(().into()) }
1279    }
1280
1281    pub fn run_seccomp_filters(
1282        &mut self,
1283        locked: &mut Locked<Unlocked>,
1284        syscall: &Syscall,
1285    ) -> Option<Result<SyscallResult, Errno>> {
1286        // Implementation of SECCOMP_FILTER_STRICT, which has slightly different semantics
1287        // from user-defined seccomp filters.
1288        if self.seccomp_filter_state.get() == SeccompStateValue::Strict {
1289            return SeccompState::do_strict(locked, self, syscall);
1290        }
1291
1292        // Run user-defined seccomp filters
1293        let result = self.task.read().seccomp_filters.run_all(self, syscall);
1294
1295        SeccompState::do_user_defined(locked, result, self, syscall)
1296    }
1297
1298    fn seccomp_tsync_error(id: i32, flags: u32) -> Result<SyscallResult, Errno> {
1299        // By default, TSYNC indicates failure state by returning the first thread
1300        // id not to be able to sync, rather than by returning -1 and setting
1301        // errno.  However, if TSYNC_ESRCH is set, it returns ESRCH.  This
1302        // prevents conflicts with fact that SECCOMP_FILTER_FLAG_NEW_LISTENER
1303        // makes seccomp return an fd.
1304        if flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH != 0 { error!(ESRCH) } else { Ok(id.into()) }
1305    }
1306
1307    // Notify all futexes in robust list.  The robust list is in user space, so we
1308    // are very careful about walking it, and there are a lot of quiet returns if
1309    // we fail to walk it.
1310    // TODO(https://fxbug.dev/42079081): This only sets the FUTEX_OWNER_DIED bit; it does
1311    // not wake up a waiter.
1312    pub fn notify_robust_list(&self) {
1313        let task_state = self.write();
1314        let robust_list_addr = task_state.robust_list_head.addr();
1315        if robust_list_addr == UserAddress::NULL {
1316            // No one has called set_robust_list.
1317            return;
1318        }
1319        let robust_list_res = self.read_multi_arch_object(task_state.robust_list_head);
1320
1321        let head = if let Ok(head) = robust_list_res {
1322            head
1323        } else {
1324            return;
1325        };
1326
1327        let offset = head.futex_offset;
1328
1329        let mut entries_count = 0;
1330        let mut curr_ptr = head.list.next;
1331        while curr_ptr.addr() != robust_list_addr.into() && entries_count < ROBUST_LIST_LIMIT {
1332            let curr_ref = self.read_multi_arch_object(curr_ptr);
1333
1334            let curr = if let Ok(curr) = curr_ref {
1335                curr
1336            } else {
1337                return;
1338            };
1339
1340            let Some(futex_base) = curr_ptr.addr().checked_add_signed(offset) else {
1341                return;
1342            };
1343
1344            let futex_addr = match FutexAddress::try_from(futex_base) {
1345                Ok(addr) => addr,
1346                Err(_) => {
1347                    return;
1348                }
1349            };
1350
1351            let Ok(mm) = self.mm() else {
1352                log_error!("Asked to notify robust list futexes in system task.");
1353                return;
1354            };
1355            let futex = if let Ok(futex) = mm.atomic_load_u32_relaxed(futex_addr) {
1356                futex
1357            } else {
1358                return;
1359            };
1360
1361            if (futex & FUTEX_TID_MASK) as i32 == self.tid {
1362                let owner_died = FUTEX_OWNER_DIED | futex;
1363                if mm.atomic_store_u32_relaxed(futex_addr, owner_died).is_err() {
1364                    return;
1365                }
1366            }
1367            curr_ptr = curr.next;
1368            entries_count += 1;
1369        }
1370    }
1371
1372    /// Returns a ref to this thread's SeccompNotifier.
1373    pub fn get_seccomp_notifier(&mut self) -> Option<SeccompNotifierHandle> {
1374        self.task.write().seccomp_filters.notifier.clone()
1375    }
1376
1377    pub fn set_seccomp_notifier(&mut self, notifier: Option<SeccompNotifierHandle>) {
1378        self.task.write().seccomp_filters.notifier = notifier;
1379    }
1380
1381    pub(crate) fn handle_page_fault(
1382        &self,
1383        locked: &mut Locked<Unlocked>,
1384        decoded: PageFaultExceptionReport,
1385        status: zx::Status,
1386    ) -> ExceptionResult {
1387        if let Ok(mm) = self.mm() {
1388            mm.handle_page_fault(locked, decoded, status)
1389        } else {
1390            panic!(
1391                "system task is handling a major page fault status={:?}, report={:?}",
1392                status, decoded
1393            );
1394        }
1395    }
1396
1397    /// Processes a Zircon exception associated with this task.
1398    pub fn process_exception(
1399        &self,
1400        locked: &mut Locked<Unlocked>,
1401        report: &zx::ExceptionReport,
1402    ) -> ExceptionResult {
1403        if let Some(result) = handle_hardware_exception(locked, self, report) {
1404            return result;
1405        }
1406
1407        match report.ty {
1408            zx::ExceptionType::General => {
1409                log_error!("Unrecognized general exception: {:?}", report);
1410                ExceptionResult::Signal(SignalInfo::kernel(SIGILL))
1411            }
1412            zx::ExceptionType::ProcessNameChanged => {
1413                log_error!("Received unexpected process name changed exception");
1414                ExceptionResult::Handled
1415            }
1416            zx::ExceptionType::ProcessStarting
1417            | zx::ExceptionType::ThreadStarting
1418            | zx::ExceptionType::ThreadExiting => {
1419                log_error!("Received unexpected task lifecycle exception");
1420                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
1421            }
1422            zx::ExceptionType::PolicyError(policy_code) => {
1423                log_error!(policy_code:?; "Received Zircon policy error exception");
1424                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
1425            }
1426            zx::ExceptionType::UnknownUserGenerated { code, data } => {
1427                log_error!(code:?, data:?; "Received unexpected unknown user generated exception");
1428                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
1429            }
1430            zx::ExceptionType::Unknown { ty, code, data } => {
1431                log_error!(ty:?, code:?, data:?; "Received unexpected exception");
1432                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
1433            }
1434            _ => {
1435                log_error!("Received unknown zircon exception: {:?}", report.ty);
1436                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
1437            }
1438        }
1439    }
1440
    /// Clone this task.
    ///
    /// Creates a new task object that shares some state with this task
    /// according to the given flags.
    ///
    /// Used by the clone() syscall to create both processes and threads.
    ///
    /// The exit signal is broken out from the flags parameter like clone3() rather than being
    /// bitwise-ORed like clone().
    ///
    /// # Parameters
    ///
    /// - `flags`: the `CLONE_*` bits selecting what the child shares with this task.
    /// - `child_exit_signal`: forwarded to `create_zircon_process` when a new thread group is
    ///   created; it is not used on the `CLONE_THREAD` path.
    /// - `user_parent_tid`: where the child's tid is written (through this task) when
    ///   `CLONE_PARENT_SETTID` is set.
    /// - `user_child_tid`: stored as the child's `clear_child_tid` when `CLONE_CHILD_CLEARTID` is
    ///   set, and where the child's tid is written (through the child) when `CLONE_CHILD_SETTID`
    ///   is set.
    /// - `user_pidfd`: where the new pidfd number is written when `CLONE_PIDFD` is set.
    ///
    /// # Errors
    ///
    /// - `EINVAL` for: `CLONE_SIGHAND` without `CLONE_VM`; `CLONE_CLEAR_SIGHAND` together with
    ///   `CLONE_SIGHAND`; `CLONE_THREAD` without `CLONE_SIGHAND`; `CLONE_PIDFD` together with
    ///   `CLONE_THREAD`; `CLONE_PIDFD` and `CLONE_PARENT_SETTID` targeting the same user address;
    ///   any bit outside `VALID_FLAGS`; `CLONE_PARENT` when this thread group has no parent.
    /// - `ENOSYS` for `CLONE_THREAD` without `CLONE_VM`, and for any valid flag that is not in
    ///   `IMPLEMENTED_FLAGS`.
    /// - Any error propagated with `?` from the helpers called below (capability check, process
    ///   creation, thread group insertion, memory snapshot, user memory writes, fd allocation).
    pub fn clone_task<L>(
        &self,
        locked: &mut Locked<L>,
        flags: u64,
        child_exit_signal: Option<Signal>,
        user_parent_tid: UserRef<pid_t>,
        user_child_tid: UserRef<pid_t>,
        user_pidfd: UserRef<FdNumber>,
    ) -> Result<TaskBuilder, Errno>
    where
        L: LockBefore<MmDumpable>,
        L: LockBefore<TaskRelease>,
        L: LockBefore<ProcessGroupState>,
    {
        // Flags this function accepts without returning ENOSYS. Some of them (see the
        // `track_stub!` calls below) are accepted but only partially implemented.
        const IMPLEMENTED_FLAGS: u64 = ((CLONE_VM
            | CLONE_FS
            | CLONE_FILES
            | CLONE_SIGHAND
            | CLONE_THREAD
            | CLONE_SYSVSEM
            | CLONE_SETTLS
            | CLONE_PARENT
            | CLONE_PARENT_SETTID
            | CLONE_PIDFD
            | CLONE_CHILD_CLEARTID
            | CLONE_CHILD_SETTID
            | CLONE_VFORK
            | CLONE_NEWUTS
            | CLONE_PTRACE) as u64)
            | CLONE_CLEAR_SIGHAND;

        // A mask with all valid flags set, because we want to return a different error code for an
        // invalid flag vs an unimplemented flag. Subtracting 1 from the largest valid flag gives a
        // mask with all flags below it set. Shift up by one to make sure the largest flag is also
        // set.
        const VALID_FLAGS: u64 = (CLONE_INTO_CGROUP << 1) - 1;

        // CLONE_SETTLS is implemented by sys_clone.

        // Decode every flag bit this function looks at into a named boolean.
        let clone_files = flags & (CLONE_FILES as u64) != 0;
        let clone_fs = flags & (CLONE_FS as u64) != 0;
        let clone_parent = flags & (CLONE_PARENT as u64) != 0;
        let clone_parent_settid = flags & (CLONE_PARENT_SETTID as u64) != 0;
        let clone_pidfd = flags & (CLONE_PIDFD as u64) != 0;
        let clone_child_cleartid = flags & (CLONE_CHILD_CLEARTID as u64) != 0;
        let clone_child_settid = flags & (CLONE_CHILD_SETTID as u64) != 0;
        let clone_sysvsem = flags & (CLONE_SYSVSEM as u64) != 0;
        let clone_ptrace = flags & (CLONE_PTRACE as u64) != 0;
        let clone_thread = flags & (CLONE_THREAD as u64) != 0;
        let clone_vm = flags & (CLONE_VM as u64) != 0;
        let clone_sighand = flags & (CLONE_SIGHAND as u64) != 0;
        let clone_vfork = flags & (CLONE_VFORK as u64) != 0;
        let clone_newuts = flags & (CLONE_NEWUTS as u64) != 0;
        let clone_into_cgroup = flags & CLONE_INTO_CGROUP != 0;
        let clone_clear_sighand = flags & (CLONE_CLEAR_SIGHAND as u64) != 0;

        // The next three flags only record a stub here; none of them changes what is built below.
        if clone_ptrace {
            track_stub!(TODO("https://fxbug.dev/322874630"), "CLONE_PTRACE");
        }

        if clone_sysvsem {
            track_stub!(TODO("https://fxbug.dev/322875185"), "CLONE_SYSVSEM");
        }

        // CLONE_INTO_CGROUP is not part of IMPLEMENTED_FLAGS, so a request carrying it is also
        // rejected with ENOSYS by the IMPLEMENTED_FLAGS check further down.
        if clone_into_cgroup {
            track_stub!(TODO("https://fxbug.dev/403612570"), "CLONE_INTO_CGROUP");
        }

        // Reject flag combinations that are invalid regardless of what is implemented.
        if clone_sighand && !clone_vm {
            return error!(EINVAL);
        }
        if clone_clear_sighand && clone_sighand {
            return error!(EINVAL);
        }
        if clone_thread && !clone_sighand {
            return error!(EINVAL);
        }

        if clone_pidfd && clone_thread {
            return error!(EINVAL);
        }
        if clone_pidfd && clone_parent_settid && user_parent_tid.addr() == user_pidfd.addr() {
            // `clone()` uses the same out-argument for these, so error out if they have the same
            // user address.
            return error!(EINVAL);
        }

        // Any bit above the largest known flag is invalid (as opposed to unimplemented).
        if flags & !VALID_FLAGS != 0 {
            return error!(EINVAL);
        }

        if clone_vm && !clone_thread {
            // TODO(https://fxbug.dev/42066087) Implement CLONE_VM for child processes (not just child
            // threads). Currently this executes CLONE_VM (explicitly passed to clone() or as
            // used by vfork()) as a fork (the VM in the child is copy-on-write) which is almost
            // always OK.
            //
            // CLONE_VM is primarily as an optimization to avoid making a copy-on-write version of a
            // process' VM that will be immediately replaced with a call to exec(). The main users
            // (libc and language runtimes) don't actually rely on the memory being shared between
            // the two processes. And the vfork() man page explicitly allows vfork() to be
            // implemented as fork() which is what we do here.
            if !clone_vfork {
                track_stub!(
                    TODO("https://fxbug.dev/322875227"),
                    "CLONE_VM without CLONE_THREAD or CLONE_VFORK"
                );
            }
        } else if clone_thread && !clone_vm {
            track_stub!(TODO("https://fxbug.dev/322875167"), "CLONE_THREAD without CLONE_VM");
            return error!(ENOSYS);
        }

        // Valid but unimplemented flags are reported as ENOSYS rather than EINVAL.
        if flags & !IMPLEMENTED_FLAGS != 0 {
            track_stub!(
                TODO("https://fxbug.dev/322875130"),
                "clone unknown flags",
                flags & !IMPLEMENTED_FLAGS
            );
            return error!(ENOSYS);
        }

        // CLONE_FS / CLONE_FILES hand the child this task's own object; otherwise the child gets
        // the result of `fork()` on it.
        let fs = if clone_fs { self.fs() } else { self.fs().fork() };
        let files = if clone_files {
            self.running_state().files.clone()
        } else {
            self.running_state().files.fork()
        };

        let kernel = self.kernel();

        let mut pids = kernel.pids.write();

        // Lock the cgroup process hierarchy so that the parent process cannot move to a different
        // cgroup while a new task or thread_group is created. This may be unnecessary if
        // CLONE_INTO_CGROUP is implemented and passed in.
        let mut cgroup2_pid_table = kernel.cgroups.lock_cgroup2_pid_table();
        // Create a `KernelSignal::Freeze` to put onto the new task, if the cgroup is frozen.
        let child_kernel_signals = cgroup2_pid_table
            .maybe_create_freeze_signal(self.thread_group())
            .into_iter()
            .collect::<VecDeque<_>>();

        // These are assigned inside the block below and consumed by `Task::new` afterwards.
        let pid;
        let command;
        let creds;
        let scheduler_state;
        let no_new_privs;
        let seccomp_filters;
        let robust_list_head = RobustListHeadPtr::null(self);
        let child_signal_mask;
        let timerslack_ns;
        let uts_ns;

        let TaskInfo { thread_group, memory_manager } = {
            // These variables hold the original parent in case we need to switch the parent of the
            // new task because of CLONE_PARENT.
            let weak_original_parent;
            let original_parent;

            // Make sure to drop these locks ASAP to avoid inversion
            let thread_group_state = {
                let thread_group_state = self.thread_group().write();
                if clone_parent {
                    // With the CLONE_PARENT flag, the parent of the new task is our parent
                    // instead of ourselves.
                    weak_original_parent =
                        thread_group_state.parent.clone().ok_or_else(|| errno!(EINVAL))?;
                    std::mem::drop(thread_group_state);
                    original_parent = weak_original_parent.upgrade();
                    original_parent.write()
                } else {
                    thread_group_state
                }
            };

            let state = self.read();

            no_new_privs = state.no_new_privs();
            seccomp_filters = state.seccomp_filters.clone();
            child_signal_mask = state.signal_mask();

            pid = pids.allocate_pid();
            command = self.command();
            creds = self.current_creds().clone();
            scheduler_state = state.scheduler_state.fork();
            timerslack_ns = state.timerslack_ns;

            // CLONE_NEWUTS requires CAP_SYS_ADMIN and gives the child a forked UTS namespace;
            // otherwise the child shares this task's namespace.
            uts_ns = if clone_newuts {
                security::check_task_capable(self, CAP_SYS_ADMIN)?;
                state.uts_ns.read().fork()
            } else {
                state.uts_ns.clone()
            };

            if clone_thread {
                // A new thread joins this task's thread group and uses this task's memory manager.
                TaskInfo {
                    thread_group: self.thread_group().clone(),
                    memory_manager: self.mm().ok(),
                }
            } else {
                // Drop the lock on this task before entering `create_zircon_process`, because it will
                // take a lock on the new thread group, and locks on thread groups have a higher
                // priority than locks on the task in the thread group.
                std::mem::drop(state);
                // CLONE_SIGHAND shares the signal actions; CLONE_CLEAR_SIGHAND forks them and then
                // calls `reset_for_exec` on the copy; otherwise the child gets a plain fork.
                let signal_actions = if clone_sighand {
                    self.thread_group().signal_actions.clone()
                } else if clone_clear_sighand {
                    let actions = self.thread_group().signal_actions.fork();
                    actions.reset_for_exec();
                    actions
                } else {
                    self.thread_group().signal_actions.fork()
                };
                let process_group = thread_group_state.process_group.clone();

                let task_info = {
                    fuchsia_trace::duration!(CATEGORY_STARNIX, "create_zircon_process");
                    create_zircon_process(
                        locked,
                        kernel,
                        Some(thread_group_state),
                        pid,
                        child_exit_signal,
                        process_group,
                        signal_actions,
                        command.clone(),
                    )?
                };

                cgroup2_pid_table.inherit_cgroup(self.thread_group(), &task_info.thread_group);

                task_info
            }
        };

        // Drop the lock on the cgroup pid_table before creating the TaskBuilder.
        // If the TaskBuilder creation fails, the TaskBuilder is dropped, which calls
        // ThreadGroup::remove. ThreadGroup::remove takes the cgroup pid_table lock, causing
        // a cyclic lock dependency.
        std::mem::drop(cgroup2_pid_table);

        // Only create the vfork event when the caller requested CLONE_VFORK.
        let vfork_event = if clone_vfork { Some(Arc::new(zx::Event::create())) } else { None };

        // Clone running state in a nested scope to ensure that the RCU read scope is not held
        // across the release_on_error block.
        let abstract_socket_namespace;
        let abstract_vsock_namespace;
        {
            let running_state = self.running_state();
            abstract_socket_namespace = running_state.abstract_socket_namespace.clone();
            abstract_vsock_namespace = running_state.abstract_vsock_namespace.clone();
        }

        let mut child = TaskBuilder::new(Task::new(
            pid,
            command,
            thread_group,
            files,
            memory_manager,
            fs,
            creds,
            abstract_socket_namespace,
            abstract_vsock_namespace,
            child_signal_mask,
            child_kernel_signals,
            vfork_event,
            scheduler_state,
            uts_ns,
            no_new_privs,
            SeccompState::from(&self.seccomp_filter_state),
            seccomp_filters,
            robust_list_head,
            timerslack_ns,
        ));

        release_on_error!(child, locked, {
            // Drop the pids lock as soon as possible after creating the child. Destroying the child
            // and removing it from the pids table itself requires the pids lock, so if an early exit
            // takes place we have a self deadlock.
            pids.add_task(Arc::clone(&child.task));
            std::mem::drop(pids);

            // Child lock must be taken before this lock. Drop the lock on the task, take a writable
            // lock on the child and take the current state back.

            #[cfg(any(test, debug_assertions))]
            {
                // Take the lock on the thread group and its child in the correct order to ensure any wrong ordering
                // will trigger the tracing-mutex at the right call site.
                if !clone_thread {
                    let _l1 = self.thread_group().read();
                    let _l2 = child.thread_group().read();
                }
            }

            if clone_thread {
                self.thread_group().add(Arc::clone(&child.task))?;
            } else {
                child.thread_group().add(Arc::clone(&child.task))?;

                // These manipulations of the signal handling state appear to be related to
                // CLONE_SIGHAND and CLONE_VM rather than CLONE_THREAD. However, we do not support
                // all the combinations of these flags, which means doing these operations here
                // might actually be correct. However, if you find a test that fails because of the
                // placement of this logic here, we might need to move it.
                let mut child_state = child.write();
                let state = self.read();
                child_state.set_sigaltstack(state.sigaltstack());
                child_state.set_signal_mask(state.signal_mask());
            }

            if !clone_vm {
                // We do not support running threads in the same process with different
                // MemoryManagers.
                assert!(!clone_thread);
                let child_mm = MemoryManager::snapshot_of(
                    locked,
                    &self.mm()?,
                    child.thread_group.root_vmar.unowned(),
                    self.thread_state.arch_width(),
                )?;
                child.running_state()?.mm.update(Some(child_mm));
            }

            // The parent-side tid is written through this task's memory accessor.
            if clone_parent_settid {
                self.write_object(user_parent_tid, &child.tid)?;
            }

            if clone_child_cleartid {
                child.write().clear_child_tid = user_child_tid;
            }

            // The child-side tid is written through the child's memory accessor.
            if clone_child_settid {
                child.write_object(user_child_tid, &child.tid)?;
            }

            // The pidfd is installed in this task's fd table with CLOEXEC, and its number is
            // written to this task's memory.
            if clone_pidfd {
                let locked = locked.cast_locked::<TaskRelease>();
                let file = new_pidfd(
                    locked,
                    self,
                    child.thread_group(),
                    &*child.mm()?,
                    OpenFlags::empty(),
                );
                let pidfd = self.add_file(locked, file, FdFlags::CLOEXEC)?;
                self.write_object(user_pidfd, &pidfd)?;
            }

            // TODO(https://fxbug.dev/42066087): We do not support running different processes with
            // the same MemoryManager. Instead, we implement a rough approximation of that behavior
            // by making a copy-on-write clone of the memory from the original process.
            if clone_vm && !clone_thread {
                let child_mm = MemoryManager::snapshot_of(
                    locked,
                    &self.mm()?,
                    child.thread_group.root_vmar.unowned(),
                    self.thread_state.arch_width(),
                )?;
                child.running_state()?.mm.update(Some(child_mm));
            }

            // The child starts from a snapshot of this task's thread state.
            child.thread_state = self.thread_state.snapshot::<HeapRegs>();
            Ok(())
        });

        // Take the lock on thread group and task in the correct order to ensure any wrong ordering
        // will trigger the tracing-mutex at the right call site.
        #[cfg(any(test, debug_assertions))]
        {
            let _l1 = child.thread_group().read();
            let _l2 = child.read();
        }

        Ok(child)
    }
1828
1829    /// Sets the stop state (per set_stopped), and also notifies all listeners,
1830    /// including the parent process and the tracer if appropriate.
1831    pub fn set_stopped_and_notify(&self, stopped: StopState, siginfo: Option<SignalInfo>) {
1832        let maybe_signal_info = {
1833            let mut state = self.write();
1834            state.copy_state_from(self);
1835            state.set_stopped(stopped, siginfo, Some(self), None);
1836            state.prepare_signal_info(stopped)
1837        };
1838
1839        if let Some((tracer, signal_info)) = maybe_signal_info {
1840            if let Some(tracer) = tracer.upgrade() {
1841                tracer.write().send_signal(signal_info);
1842            }
1843        }
1844
1845        if !stopped.is_in_progress() {
1846            let parent = self.thread_group().read().parent.clone();
1847            if let Some(parent) = parent {
1848                parent
1849                    .upgrade()
1850                    .write()
1851                    .lifecycle_waiters
1852                    .notify_value(ThreadGroupLifecycleWaitValue::ChildStatus);
1853            }
1854        }
1855    }
1856
1857    /// Finalizes the stop state of the task, and if the task should be stopped,
1858    /// blocks the execution of `current_task` as long as the task is stopped and
1859    /// not terminated.
1860    ///
1861    /// Returns true if the task was stopped and blocked (and has now woken up),
1862    /// or false if it was not stopped and returned immediately.
1863    pub fn block_if_stopped(&mut self, locked: &mut Locked<Unlocked>) -> bool {
1864        if self.finalize_stop_state() {
1865            self.block_while_stopped(locked);
1866            true
1867        } else {
1868            false
1869        }
1870    }
1871
1872    /// If the task is stopping, set it as stopped. return whether the caller
1873    /// should stop.  The task might also be waking up.
1874    fn finalize_stop_state(&mut self) -> bool {
1875        let stopped = self.load_stopped();
1876
1877        if !stopped.is_stopping_or_stopped() {
1878            // If we are waking up, potentially write back state a tracer may have modified.
1879            let captured_state = self.write().take_captured_state();
1880            if let Some(captured) = captured_state {
1881                if captured.dirty {
1882                    self.thread_state.replace_registers(&captured.thread_state);
1883                }
1884            }
1885        }
1886
1887        // Stopping because the thread group is stopping.
1888        // Try to flip to GroupStopped - will fail if we shouldn't.
1889        if self.thread_group().set_stopped(StopState::GroupStopped, None, true)
1890            == StopState::GroupStopped
1891        {
1892            let signal = self.thread_group().read().last_signal.clone();
1893            // stopping because the thread group has stopped
1894            let event = Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0));
1895            self.write().set_stopped(StopState::GroupStopped, signal, Some(self), event);
1896            return true;
1897        }
1898
1899        // Stopping because the task is stopping
1900        if stopped.is_stopping_or_stopped() {
1901            if let Ok(stopped) = stopped.finalize() {
1902                self.set_stopped_and_notify(stopped, None);
1903            }
1904            return true;
1905        }
1906
1907        false
1908    }
1909
1910    /// Block the execution of `current_task` as long as the task is stopped and
1911    /// not terminated.
1912    fn block_while_stopped(&mut self, locked: &mut Locked<Unlocked>) {
1913        let waiter = Waiter::with_options(WaiterOptions::IGNORE_SIGNALS);
1914        loop {
1915            // If we've exited, unstop the threads and return without notifying
1916            // waiters.
1917            if self.is_exitted() {
1918                self.thread_group().set_stopped(StopState::ForceAwake, None, false);
1919                self.write().set_stopped(StopState::ForceAwake, None, Some(self), None);
1920                return;
1921            }
1922
1923            if self.wake_or_wait_until_unstopped_async(&waiter) {
1924                return;
1925            }
1926
1927            // Do the wait. Result is not needed, as this is not in a syscall.
1928            let _: Result<(), Errno> = waiter.wait(locked, self);
1929
1930            // Maybe go from stopping to stopped, if we are currently stopping
1931            // again.
1932            self.finalize_stop_state();
1933        }
1934    }
1935
1936    /// For traced tasks, this will return the data neceessary for a cloned task
1937    /// to attach to the same tracer.
1938    pub fn get_ptrace_core_state_for_clone(
1939        &mut self,
1940        clone_args: &clone_args,
1941    ) -> (PtraceOptions, Option<PtraceCoreState>) {
1942        let state = self.write();
1943        if let Some(ptrace) = &state.ptrace {
1944            ptrace.get_core_state_for_clone(clone_args)
1945        } else {
1946            (PtraceOptions::empty(), None)
1947        }
1948    }
1949
1950    /// If currently being ptraced with the given option, emit the appropriate
1951    /// event.  PTRACE_EVENTMSG will return the given message.  Also emits the
1952    /// appropriate event for execve in the absence of TRACEEXEC.
1953    ///
1954    /// Note that the Linux kernel has a documented bug where, if TRACEEXIT is
1955    /// enabled, SIGKILL will trigger an event.  We do not exhibit this
1956    /// behavior.
1957    pub fn ptrace_event(
1958        &mut self,
1959        locked: &mut Locked<Unlocked>,
1960        trace_kind: PtraceOptions,
1961        msg: u64,
1962    ) {
1963        if !trace_kind.is_empty() {
1964            {
1965                let mut state = self.write();
1966                if let Some(ptrace) = &mut state.ptrace {
1967                    if !ptrace.has_option(trace_kind) {
1968                        // If this would be a TRACEEXEC, but TRACEEXEC is not
1969                        // turned on, then send a SIGTRAP.
1970                        if trace_kind == PtraceOptions::TRACEEXEC && !ptrace.is_seized() {
1971                            // Send a SIGTRAP so that the parent can gain control.
1972                            send_signal_first(locked, self, state, SignalInfo::kernel(SIGTRAP));
1973                        }
1974
1975                        return;
1976                    }
1977                    let ptrace_event = PtraceEvent::from_option(&trace_kind) as u32;
1978                    let siginfo = SignalInfo::with_detail(
1979                        SIGTRAP,
1980                        ((ptrace_event << 8) | SIGTRAP.number()) as i32,
1981                        SignalDetail::None,
1982                    );
1983                    state.set_stopped(
1984                        StopState::PtraceEventStopping,
1985                        Some(siginfo),
1986                        None,
1987                        Some(PtraceEventData::new(trace_kind, msg)),
1988                    );
1989                } else {
1990                    return;
1991                }
1992            }
1993            self.block_if_stopped(locked);
1994        }
1995    }
1996
1997    /// Causes the current thread's thread group to exit, notifying any ptracer
1998    /// of this task first.
1999    pub fn kill_thread_group(&mut self, locked: &mut Locked<Unlocked>, exit_status: ExitStatus) {
2000        self.ptrace_event(
2001            locked,
2002            PtraceOptions::TRACEEXIT,
2003            exit_status.signal_info_status() as u64,
2004        );
2005        self.thread_group().kill(locked, exit_status, None);
2006    }
2007
2008    /// The flags indicates only the flags as in clone3(), and does not use the low 8 bits for the
2009    /// exit signal as in clone().
2010    pub fn clone_task_builder_for_test<L>(
2011        &self,
2012        locked: &mut Locked<L>,
2013        flags: u64,
2014        exit_signal: Option<Signal>,
2015    ) -> TaskBuilder
2016    where
2017        L: LockBefore<MmDumpable>,
2018        L: LockBefore<TaskRelease>,
2019        L: LockBefore<ProcessGroupState>,
2020    {
2021        let result = self
2022            .clone_task(
2023                locked,
2024                flags,
2025                exit_signal,
2026                UserRef::default(),
2027                UserRef::default(),
2028                UserRef::default(),
2029            )
2030            .expect("failed to create task in test");
2031        result.task.write().set_spawned();
2032        result
2033    }
2034
2035    /// The flags indicates only the flags as in clone3(), and does not use the low 8 bits for the
2036    /// exit signal as in clone().
2037    pub fn clone_task_for_test<L>(
2038        &self,
2039        locked: &mut Locked<L>,
2040        flags: u64,
2041        exit_signal: Option<Signal>,
2042    ) -> crate::testing::AutoReleasableTask
2043    where
2044        L: LockBefore<MmDumpable>,
2045        L: LockBefore<TaskRelease>,
2046        L: LockBefore<ProcessGroupState>,
2047    {
2048        self.clone_task_builder_for_test(locked, flags, exit_signal).into()
2049    }
2050
2051    // See "Ptrace access mode checking" in https://man7.org/linux/man-pages/man2/ptrace.2.html
2052    pub fn check_ptrace_access_mode<L>(
2053        &self,
2054        locked: &mut Locked<L>,
2055        mode: PtraceAccessMode,
2056        target: &Task,
2057    ) -> Result<(), Errno>
2058    where
2059        L: LockBefore<MmDumpable>,
2060    {
2061        // (1)  If the calling thread and the target thread are in the same
2062        //      thread group, access is always allowed.
2063        if self.thread_group().leader == target.thread_group().leader {
2064            return Ok(());
2065        }
2066
2067        // (2)  If the access mode specifies PTRACE_MODE_FSCREDS, then, for
2068        //      the check in the next step, employ the caller's filesystem
2069        //      UID and GID.  (As noted in credentials(7), the filesystem
2070        //      UID and GID almost always have the same values as the
2071        //      corresponding effective IDs.)
2072        //
2073        //      Otherwise, the access mode specifies PTRACE_MODE_REALCREDS,
2074        //      so use the caller's real UID and GID for the checks in the
2075        //      next step.  (Most APIs that check the caller's UID and GID
2076        //      use the effective IDs.  For historical reasons, the
2077        //      PTRACE_MODE_REALCREDS check uses the real IDs instead.)
2078        let (uid, gid) = if mode.contains(PTRACE_MODE_FSCREDS) {
2079            let fscred = self.current_creds().as_fscred();
2080            (fscred.uid, fscred.gid)
2081        } else if mode.contains(PTRACE_MODE_REALCREDS) {
2082            let creds = self.current_creds();
2083            (creds.uid, creds.gid)
2084        } else {
2085            unreachable!();
2086        };
2087
2088        // (3)  Deny access if neither of the following is true:
2089        //
2090        //      -  The real, effective, and saved-set user IDs of the target
2091        //         match the caller's user ID, and the real, effective, and
2092        //         saved-set group IDs of the target match the caller's
2093        //         group ID.
2094        //
2095        //      -  The caller has the CAP_SYS_PTRACE capability in the user
2096        //         namespace of the target.
2097        let target_creds = target.persistent_info.lock_creds();
2098        if !(target_creds.uid == uid
2099            && target_creds.euid == uid
2100            && target_creds.saved_uid == uid
2101            && target_creds.gid == gid
2102            && target_creds.egid == gid
2103            && target_creds.saved_gid == gid)
2104        {
2105            security::check_task_capable(self, CAP_SYS_PTRACE)?;
2106        }
2107
2108        // (4)  Deny access if the target process "dumpable" attribute has a
2109        //      value other than 1 (SUID_DUMP_USER; see the discussion of
2110        //      PR_SET_DUMPABLE in prctl(2)), and the caller does not have
2111        //      the CAP_SYS_PTRACE capability in the user namespace of the
2112        //      target process.
2113        let dumpable = *target.mm()?.dumpable.lock(locked);
2114        match dumpable {
2115            DumpPolicy::User => (),
2116            DumpPolicy::Disable => security::check_task_capable(self, CAP_SYS_PTRACE)?,
2117        }
2118
2119        // (5)  The kernel LSM security_ptrace_access_check() interface is
2120        //      invoked to see if ptrace access is permitted.
2121        security::ptrace_access_check(self, target, mode)?;
2122
2123        // (6)  If access has not been denied by any of the preceding steps,
2124        //      then access is allowed.
2125        Ok(())
2126    }
2127
2128    pub fn can_signal(
2129        &self,
2130        target: &Task,
2131        unchecked_signal: UncheckedSignal,
2132    ) -> Result<(), Errno> {
2133        // If both the tasks share a thread group the signal can be sent. This is not documented
2134        // in kill(2) because kill does not support task-level granularity in signal sending.
2135        if self.thread_group == target.thread_group {
2136            return Ok(());
2137        }
2138
2139        let self_creds = self.current_creds();
2140        let target_creds = target.real_creds();
2141        // From https://man7.org/linux/man-pages/man2/kill.2.html:
2142        //
2143        // > For a process to have permission to send a signal, it must either be
2144        // > privileged (under Linux: have the CAP_KILL capability in the user
2145        // > namespace of the target process), or the real or effective user ID of
2146        // > the sending process must equal the real or saved set- user-ID of the
2147        // > target process.
2148        //
2149        // Returns true if the credentials are considered to have the same user ID.
2150        if self_creds.euid == target_creds.saved_uid
2151            || self_creds.euid == target_creds.uid
2152            || self_creds.uid == target_creds.uid
2153            || self_creds.uid == target_creds.saved_uid
2154        {
2155            return Ok(());
2156        }
2157
2158        if Signal::try_from(unchecked_signal) == Ok(SIGCONT) {
2159            let target_session = target.thread_group().read().process_group.session.leader;
2160            let self_session = self.thread_group().read().process_group.session.leader;
2161            if target_session == self_session {
2162                return Ok(());
2163            }
2164        }
2165
2166        security::check_task_capable(self, CAP_KILL)
2167    }
2168}
2169
2170impl ArchSpecific for CurrentTask {
2171    fn is_arch32(&self) -> bool {
2172        self.thread_state.is_arch32()
2173    }
2174}
2175
2176impl MemoryAccessor for CurrentTask {
2177    fn read_memory<'a>(
2178        &self,
2179        addr: UserAddress,
2180        bytes: &'a mut [MaybeUninit<u8>],
2181    ) -> Result<&'a mut [u8], Errno> {
2182        self.mm()?.unified_read_memory(self, addr, bytes)
2183    }
2184
2185    fn read_memory_partial_until_null_byte<'a>(
2186        &self,
2187        addr: UserAddress,
2188        bytes: &'a mut [MaybeUninit<u8>],
2189    ) -> Result<&'a mut [u8], Errno> {
2190        self.mm()?.unified_read_memory_partial_until_null_byte(self, addr, bytes)
2191    }
2192
2193    fn read_memory_partial<'a>(
2194        &self,
2195        addr: UserAddress,
2196        bytes: &'a mut [MaybeUninit<u8>],
2197    ) -> Result<&'a mut [u8], Errno> {
2198        self.mm()?.unified_read_memory_partial(self, addr, bytes)
2199    }
2200
2201    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
2202        self.mm()?.unified_write_memory(self, addr, bytes)
2203    }
2204
2205    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
2206        self.mm()?.unified_write_memory_partial(self, addr, bytes)
2207    }
2208
2209    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
2210        self.mm()?.unified_zero(self, addr, length)
2211    }
2212}
2213
2214impl TaskMemoryAccessor for CurrentTask {
2215    fn maximum_valid_address(&self) -> Option<UserAddress> {
2216        self.mm().ok().map(|mm| mm.maximum_valid_user_address)
2217    }
2218}
2219
/// Outcome of handling an exception: either it was fully handled, or it produced a signal to
/// deliver.
pub enum ExceptionResult {
    /// The exception was handled and no further action is required.
    Handled,

    /// The exception generated a signal that should be delivered.
    Signal(SignalInfo),
}
2227
2228fn split_path(path: &FsStr) -> LookupVec<&FsStr> {
2229    path.split(|c| *c == b'/').filter(|p| !p.is_empty()).map(<&FsStr>::from).collect()
2230}
2231
#[cfg(test)]
mod tests {
    use crate::testing::spawn_kernel_and_run;
    use starnix_uapi::auth::Credentials;

    // Smoke test: calling `override_creds` must complete without crashing and hand back the
    // closure's value, which shows that it delegates correctly to `override_creds_async`.
    #[::fuchsia::test]
    async fn test_override_creds_can_delegate_to_async_version() {
        let kernel_run = spawn_kernel_and_run(async move |_, current_task| {
            let returned = current_task.override_creds(Credentials::root(), || 0);
            assert_eq!(returned, 0);
        });
        kernel_run.await;
    }
}