Skip to main content

starnix_core/task/
current_task.rs

1// Copyright 2023 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::arch::task::{decode_page_fault_exception_report, get_signal_for_general_exception};
6use crate::execution::{TaskInfo, create_zircon_process};
7use crate::mm::{DumpPolicy, MemoryAccessor, MemoryAccessorExt, MemoryManager, TaskMemoryAccessor};
8use crate::ptrace::{PtraceCoreState, PtraceEvent, PtraceEventData, PtraceOptions, StopState};
9use crate::security;
10use crate::signals::{RunState, SignalDetail, SignalInfo, send_signal_first, send_standard_signal};
11use crate::task::loader::{ResolvedElf, load_executable, resolve_executable};
12use crate::task::waiter::WaiterOptions;
13use crate::task::{
14    ExitStatus, RobustListHeadPtr, SeccompFilter, SeccompFilterContainer, SeccompNotifierHandle,
15    SeccompState, SeccompStateValue, Task, TaskFlags, TaskLiveState, ThreadState, Waiter,
16};
17use crate::vfs::{
18    CheckAccessReason, FdFlags, FdNumber, FileHandle, FsContext, FsStr, LookupContext, LookupVec,
19    MAX_SYMLINK_FOLLOWS, NamespaceNode, ResolveBase, SymlinkMode, SymlinkTarget, new_pidfd,
20};
21use fuchsia_rcu::RcuReadGuard;
22use futures::FutureExt;
23use linux_uapi::CLONE_PIDFD;
24use starnix_logging::{
25    CATEGORY_STARNIX, log_error, log_warn, trace_duration, track_file_not_found, track_stub,
26};
27use starnix_registers::{HeapRegs, RegisterStorageEnum};
28use starnix_stack::clean_stack;
29use starnix_sync::{
30    EventWaitGuard, FileOpsCore, LockBefore, LockEqualOrBefore, Locked, MmDumpable,
31    ProcessGroupState, TaskRelease, UninterruptibleLock, Unlocked, WakeReason, assert_lock_level,
32};
33use starnix_syscalls::SyscallResult;
34use starnix_syscalls::decls::Syscall;
35use starnix_task_command::TaskCommand;
36use starnix_types::futex_address::FutexAddress;
37use starnix_types::ownership::{Releasable, release_on_error};
38use starnix_uapi::auth::{
39    CAP_KILL, CAP_SYS_ADMIN, CAP_SYS_PTRACE, Credentials, FsCred, PTRACE_MODE_FSCREDS,
40    PTRACE_MODE_REALCREDS, PtraceAccessMode, UserAndOrGroupId,
41};
42use starnix_uapi::device_id::DeviceId;
43use starnix_uapi::errors::Errno;
44use starnix_uapi::file_mode::{Access, AccessCheck, FileMode};
45use starnix_uapi::open_flags::OpenFlags;
46use starnix_uapi::signals::{
47    SIGBUS, SIGCHLD, SIGCONT, SIGILL, SIGKILL, SIGSEGV, SIGSYS, SIGTRAP, SigSet, Signal,
48    UncheckedSignal,
49};
50use starnix_uapi::user_address::{ArchSpecific, UserAddress, UserRef};
51use starnix_uapi::vfs::ResolveFlags;
52use starnix_uapi::{
53    CLONE_CHILD_CLEARTID, CLONE_CHILD_SETTID, CLONE_FILES, CLONE_FS, CLONE_INTO_CGROUP,
54    CLONE_NEWUTS, CLONE_PARENT, CLONE_PARENT_SETTID, CLONE_PTRACE, CLONE_SETTLS, CLONE_SIGHAND,
55    CLONE_SYSVSEM, CLONE_THREAD, CLONE_VFORK, CLONE_VM, FUTEX_OWNER_DIED, FUTEX_TID_MASK,
56    ROBUST_LIST_LIMIT, SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_NEW_LISTENER,
57    SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_TSYNC_ESRCH, clone_args, errno, error, pid_t,
58    sock_filter, ucred,
59};
60use std::cell::{Ref, RefCell};
61use std::collections::VecDeque;
62use std::ffi::CString;
63use std::fmt;
64use std::marker::PhantomData;
65use std::mem::MaybeUninit;
66use std::sync::{Arc, Weak};
67use zx::sys::zx_restricted_state_t;
68
69use super::ThreadGroupLifecycleWaitValue;
70
71pub struct TaskBuilder {
72    /// The underlying task object.
73    pub task: Arc<Task>,
74
75    pub thread_state: ThreadState<HeapRegs>,
76}
77
78impl TaskBuilder {
79    pub fn new(task: Arc<Task>) -> Self {
80        Self { task, thread_state: Default::default() }
81    }
82
83    #[inline(always)]
84    pub fn release<L>(self, locked: &mut Locked<L>)
85    where
86        L: LockBefore<TaskRelease>,
87    {
88        let locked = locked.cast_locked::<TaskRelease>();
89        Releasable::release(self, locked);
90    }
91}
92
93impl From<TaskBuilder> for CurrentTask {
94    fn from(builder: TaskBuilder) -> Self {
95        Self::new(builder.task, builder.thread_state.into())
96    }
97}
98
99impl Releasable for TaskBuilder {
100    type Context<'a> = &'a mut Locked<TaskRelease>;
101
102    fn release<'a>(self, locked: Self::Context<'a>) {
103        // Build a temporary CurrentTask to run release actions that require ThreadState.
104        let current_task = CurrentTask::new(self.task, self.thread_state.into());
105        current_task.exit(locked);
106    }
107}
108
109impl std::ops::Deref for TaskBuilder {
110    type Target = Task;
111    fn deref(&self) -> &Self::Target {
112        &self.task
113    }
114}
115
116/// The task object associated with the currently executing thread.
117///
118/// We often pass the `CurrentTask` as the first argument to functions if those functions need to
119/// know contextual information about the thread on which they are running. For example, we often
120/// use the `CurrentTask` to perform access checks, which ensures that the caller is authorized to
121/// perform the requested operation.
122///
123/// The `CurrentTask` also has state that can be referenced only on the currently executing thread,
124/// such as the register state for that thread. Syscalls are given a mutable references to the
125/// `CurrentTask`, which lets them manipulate this state.
126///
127/// See also `Task` for more information about tasks.
128pub struct CurrentTask {
129    /// The underlying task object.
130    pub task: Arc<Task>,
131
132    pub thread_state: ThreadState<RegisterStorageEnum>,
133
134    /// The current subjective credentials of the task.
135    // TODO(https://fxbug.dev/433548348): Avoid interior mutability here by passing a
136    // &mut CurrentTask around instead of &CurrentTask.
137    pub current_creds: RefCell<CurrentCreds>,
138
139    pub security_state: security::CurrentTaskState,
140
141    /// Makes CurrentTask neither Sync not Send.
142    _local_marker: PhantomData<*mut u8>,
143}
144
145/// Represents the current state of the task's subjective credentials.
146pub enum CurrentCreds {
147    /// The task does not have overridden credentials, the subjective creds are identical to the
148    /// objective creds stored in the Task. Since credentials are often accessed from the current
149    /// task, we hold a reference here that does not necessitate going through the RCU machinery to
150    /// read.
151    Cached(Arc<Credentials>),
152    /// The task has overridden subjective credentials.
153    Overridden(Arc<Credentials>),
154}
155
156impl CurrentCreds {
157    fn creds(&self) -> &Arc<Credentials> {
158        match self {
159            CurrentCreds::Cached(creds) => creds,
160            CurrentCreds::Overridden(creds) => creds,
161        }
162    }
163}
164
165impl Releasable for CurrentTask {
166    type Context<'a> = &'a mut Locked<TaskRelease>;
167
168    fn release<'a>(self, locked: Self::Context<'a>) {
169        self.exit(locked);
170    }
171}
172
173impl std::ops::Deref for CurrentTask {
174    type Target = Task;
175    fn deref(&self) -> &Self::Target {
176        &self.task
177    }
178}
179
180impl fmt::Debug for CurrentTask {
181    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
182        self.task.fmt(f)
183    }
184}
185
186impl CurrentTask {
187    pub fn new(task: Arc<Task>, thread_state: ThreadState<RegisterStorageEnum>) -> Self {
188        let current_creds = RefCell::new(CurrentCreds::Cached(task.clone_creds()));
189        Self {
190            task,
191            thread_state,
192            current_creds,
193            security_state: Default::default(),
194            _local_marker: Default::default(),
195        }
196    }
197
198    /// Exit the task by dropping its running state.
199    pub fn exit(&self, locked: &mut Locked<TaskRelease>) {
200        // When this method returns, the following invariants must be met:
201        // 1. No new references to live `Task` state must be obtainable.
202        // 2. All externally-visible `Task` state must reflect that the `Task` has exited.
203        // 3. All observers of `Task` exit events must be notified.
204
205        self.notify_robust_list();
206        let _ignored = self.clear_child_tid_if_needed(locked);
207
208        self.signal_vfork();
209
210        // Drop fields that can end up owning a FsNode to ensure no FsNode are owned by this task.
211        if let Ok(live) = self.task.live() {
212            live.files.release();
213            live.mm.update(None);
214        }
215        self.live_state.update(None);
216
217        self.trigger_delayed_releaser(locked);
218
219        // We remove from the thread group here because the Weak in the pid
220        // table to this task must be valid until this task is removed from the
221        // thread group, and the code below will invalidate it.
222        // Moreover, this requires an Arc of the task to ensure the tasks of
223        // the thread group are always valid.
224        let mut pids = self.kernel().pids.write();
225        self.task.thread_group().remove(locked, &mut pids, &self.task);
226        drop(pids);
227
228        self.ptrace_disconnect();
229    }
230
231    /// Returns the [`TaskLiveState`] for the [`Task`].
232    ///
233    /// # Panics
234    ///
235    /// Calling `live()` on a [`CurrentTask`] for which the [`Task`] has no live state (i.e.
236    /// zombie tasks) panics. However, such tasks should not have a `CurrentTask`.
237    #[track_caller]
238    pub fn live(&self) -> RcuReadGuard<TaskLiveState> {
239        self.task.live().expect("CurrentTask must have TaskLiveState")
240    }
241
242    pub fn fs(&self) -> Arc<FsContext> {
243        self.live().fs()
244    }
245
246    pub fn has_shared_fs(&self) -> bool {
247        let fs = self.fs();
248        // This check is incorrect because someone else could be holding a temporary Arc to the
249        // FsContext and therefore increasing the strong count.
250        Arc::strong_count(&fs) > 2usize
251    }
252
253    pub fn unshare_fs(&self) {
254        let new_fs = self.fs().fork();
255        self.live().fs.update(new_fs);
256    }
257
258    /// Returns the current subjective credentials of the task.
259    ///
260    /// The subjective credentials are the credentials that are used to check permissions for
261    /// actions performed by the task.
262    pub fn current_creds(&self) -> Ref<'_, Arc<Credentials>> {
263        Ref::map(self.current_creds.borrow(), CurrentCreds::creds)
264    }
265
266    pub fn current_fscred(&self) -> FsCred {
267        self.current_creds().as_fscred()
268    }
269
270    pub fn current_ucred(&self) -> ucred {
271        let creds = self.current_creds();
272        ucred { pid: self.get_pid(), uid: creds.uid, gid: creds.gid }
273    }
274
275    /// Save the current creds and security state, alter them by calling `alter_creds`, then call
276    /// `callback`.
277    /// The creds and security state will be restored to their original values at the end of the
278    /// call. Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and
279    ///  used to check permissions for actions performed by the task, is altered. The "objective"
280    ///  state, accessed through `Task::real_creds()` by other tasks and used to check permissions
281    /// for actions performed on the task, is not altered, and changes to the credentials are not
282    /// externally visible.
283    pub async fn override_creds_async<R>(
284        &self,
285        new_creds: Arc<Credentials>,
286        callback: impl AsyncFnOnce() -> R,
287    ) -> R {
288        let saved = self.current_creds.replace(CurrentCreds::Overridden(new_creds));
289        let result = callback().await;
290        self.current_creds.replace(saved);
291        result
292    }
293
294    /// Save the current creds and security state, alter them by calling `alter_creds`, then call
295    /// `callback`.
296    /// The creds and security state will be restored to their original values at the end of the
297    /// call. Only the "subjective" state of the CurrentTask, accessed with `current_creds()` and
298    ///  used to check permissions for actions performed by the task, is altered. The "objective"
299    ///  state, accessed through `Task::real_creds()` by other tasks and used to check permissions
300    /// for actions performed on the task, is not altered, and changes to the credentials are not
301    /// externally visible.
302    pub fn override_creds<R>(
303        &self,
304        new_creds: Arc<Credentials>,
305        callback: impl FnOnce() -> R,
306    ) -> R {
307        self.override_creds_async(new_creds, async move || callback())
308            .now_or_never()
309            .expect("Future should be ready")
310    }
311
312    pub fn has_overridden_creds(&self) -> bool {
313        matches!(*self.current_creds.borrow(), CurrentCreds::Overridden(_))
314    }
315
316    pub fn trigger_delayed_releaser<L>(&self, locked: &mut Locked<L>)
317    where
318        L: LockEqualOrBefore<FileOpsCore>,
319    {
320        let locked = locked.cast_locked::<FileOpsCore>();
321        self.kernel().delayed_releaser.apply(locked, self);
322    }
323
324    pub fn weak_task(&self) -> Weak<Task> {
325        Arc::downgrade(&self.task)
326    }
327
328    /// Change the current and real creds of the task. This is invalid to call while temporary
329    /// credentials are present.
330    pub fn set_creds(&self, creds: Credentials) {
331        assert!(!self.has_overridden_creds());
332
333        let creds = Arc::new(creds);
334        let mut current_creds = self.current_creds.borrow_mut();
335        *current_creds = CurrentCreds::Cached(creds.clone());
336
337        // SAFETY: this is allowed because we are the CurrentTask.
338        unsafe {
339            self.persistent_info.write_creds().update(creds);
340        }
341        // The /proc/pid directory's ownership is updated when the task's euid
342        // or egid changes. See proc(5).
343        let maybe_node = self.live().proc_pid_directory_cache.cloned();
344        if let Some(node) = maybe_node {
345            let creds = self.real_creds().euid_as_fscred();
346            // SAFETY: The /proc/pid directory held by `proc_pid_directory_cache` represents the
347            // current task. It's owner and group are supposed to track the current task's euid and
348            // egid.
349            unsafe {
350                node.force_chown(creds);
351            }
352        }
353    }
354
355    #[inline(always)]
356    pub fn release<L>(self, locked: &mut Locked<L>)
357    where
358        L: LockBefore<TaskRelease>,
359    {
360        let locked = locked.cast_locked::<TaskRelease>();
361        Releasable::release(self, locked);
362    }
363
364    pub fn set_syscall_restart_func<R: Into<SyscallResult>>(
365        &mut self,
366        f: impl FnOnce(&mut Locked<Unlocked>, &mut CurrentTask) -> Result<R, Errno>
367        + Send
368        + Sync
369        + 'static,
370    ) {
371        self.thread_state.syscall_restart_func =
372            Some(Box::new(|locked, current_task| Ok(f(locked, current_task)?.into())));
373    }
374
375    pub fn add_file<L>(
376        &self,
377        locked: &mut Locked<L>,
378        file: FileHandle,
379        flags: FdFlags,
380    ) -> Result<FdNumber, Errno>
381    where
382        L: LockEqualOrBefore<FileOpsCore>,
383    {
384        self.live().files.add(locked, self, file, flags)
385    }
386
387    pub fn get_file(&self, fd: FdNumber) -> Result<FileHandle, Errno> {
388        self.live().files.get(fd)
389    }
390
391    pub fn get_file_allowing_opath(&self, fd: FdNumber) -> Result<FileHandle, Errno> {
392        self.live().files.get_allowing_opath(fd)
393    }
394
395    /// Sets the task's signal mask to `signal_mask` and runs `wait_function`.
396    ///
397    /// Signals are dequeued prior to the original signal mask being restored. This is done by the
398    /// signal machinery in the syscall dispatch loop.
399    ///
400    /// The returned result is the result returned from the wait function.
401    pub fn wait_with_temporary_mask<F, T, L>(
402        &mut self,
403        locked: &mut Locked<L>,
404        signal_mask: SigSet,
405        wait_function: F,
406    ) -> Result<T, Errno>
407    where
408        L: LockEqualOrBefore<FileOpsCore>,
409        F: FnOnce(&mut Locked<L>, &CurrentTask) -> Result<T, Errno>,
410    {
411        {
412            let mut state = self.write();
413            state.set_flags(TaskFlags::TEMPORARY_SIGNAL_MASK, true);
414            state.set_temporary_signal_mask(signal_mask);
415        }
416        wait_function(locked, self)
417    }
418
419    /// If waking, promotes from waking to awake.  If not waking, make waiter async
420    /// wait until woken.  Returns true if woken.
421    pub fn wake_or_wait_until_unstopped_async(&self, waiter: &Waiter) -> bool {
422        let group_state = self.thread_group().read();
423        let mut task_state = self.write();
424
425        // Wake up if
426        //   a) we should wake up, meaning:
427        //      i) we're in group stop, and the thread group has exited group stop, or
428        //      ii) we're waking up,
429        //   b) and ptrace isn't stopping us from waking up, but
430        //   c) always wake up if we got a SIGKILL.
431        let task_stop_state = self.load_stopped();
432        let group_stop_state = self.thread_group().load_stopped();
433        if ((task_stop_state == StopState::GroupStopped && group_stop_state.is_waking_or_awake())
434            || task_stop_state.is_waking_or_awake())
435            && (!task_state.is_ptrace_listening() || task_stop_state.is_force())
436        {
437            let new_state = if task_stop_state.is_waking_or_awake() {
438                task_stop_state.finalize()
439            } else {
440                group_stop_state.finalize()
441            };
442            if let Ok(new_state) = new_state {
443                task_state.set_stopped(new_state, None, Some(self), None);
444                drop(group_state);
445                drop(task_state);
446                // It is possible for the stop state to be changed by another
447                // thread between when it is checked above and the following
448                // invocation, but set_stopped does sufficient checking while
449                // holding the lock to make sure that such a change won't result
450                // in corrupted state.
451                self.thread_group().set_stopped(new_state, None, false);
452                return true;
453            }
454        }
455
456        // We will wait.
457        if self.thread_group().load_stopped().is_stopped() || task_stop_state.is_stopped() {
458            // If we've stopped or PTRACE_LISTEN has been sent, wait for a
459            // signal or instructions from the tracer.
460            group_state
461                .lifecycle_waiters
462                .wait_async_value(&waiter, ThreadGroupLifecycleWaitValue::Stopped);
463            task_state.wait_on_ptracer(&waiter);
464        } else if task_state.can_accept_ptrace_commands() {
465            // If we're stopped because a tracer has seen the stop and not taken
466            // further action, wait for further instructions from the tracer.
467            task_state.wait_on_ptracer(&waiter);
468        } else if task_state.is_ptrace_listening() {
469            // A PTRACE_LISTEN is a state where we can get signals and notify a
470            // ptracer, but otherwise remain blocked.
471            if let Some(ptrace) = &mut task_state.ptrace {
472                ptrace.set_last_signal(Some(SignalInfo::kernel(SIGTRAP)));
473                ptrace.set_last_event(Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0)));
474            }
475            task_state.wait_on_ptracer(&waiter);
476            task_state.notify_ptracers();
477        }
478        false
479    }
480
481    /// Set the RunState for the current task to the given value and then call the given callback.
482    ///
483    /// When the callback is done, the run_state is restored to `RunState::Running`.
484    ///
485    /// This function is typically used just before blocking the current task on some operation.
486    /// The given `run_state` registers the mechanism for interrupting the blocking operation with
487    /// the task and the given `callback` actually blocks the task.
488    ///
489    /// This function can only be called in the `RunState::Running` state and cannot set the
490    /// run state to `RunState::Running`. For this reason, this function cannot be reentered.
491    pub fn run_in_state<F, T>(&self, run_state: RunState, callback: F) -> Result<T, Errno>
492    where
493        F: FnOnce() -> Result<T, Errno>,
494    {
495        assert_ne!(run_state, RunState::Running);
496
497        // Check we do not hold any uninterruptible lock
498        assert_lock_level::<UninterruptibleLock>();
499        // As an optimization, decommit unused pages of the stack to reduce memory pressure while
500        // the thread is blocked.
501        clean_stack();
502
503        {
504            let mut state = self.write();
505            assert!(!state.is_blocked());
506
507            if matches!(run_state, RunState::Frozen(_)) {
508                // Freeze is a kernel signal and is handled before other user signals. A frozen task
509                // ignores all other signals except SIGKILL until it is thawed.
510                if state.has_signal_pending(SIGKILL) {
511                    return error!(EINTR);
512                }
513            } else if state.is_any_signal_pending() && !state.is_ptrace_listening() {
514                // A note on PTRACE_LISTEN - the thread cannot be scheduled
515                // regardless of pending signals.
516                return error!(EINTR);
517            }
518            state.set_run_state(run_state.clone());
519        }
520
521        let result = callback();
522
523        {
524            let mut state = self.write();
525            assert_eq!(
526                state.run_state(),
527                run_state,
528                "SignalState run state changed while waiting!"
529            );
530            state.set_run_state(RunState::Running);
531        };
532
533        result
534    }
535
536    pub fn block_until(
537        &self,
538        guard: EventWaitGuard<'_>,
539        deadline: zx::MonotonicInstant,
540    ) -> Result<(), Errno> {
541        self.run_in_state(RunState::Event(guard.event().clone()), move || {
542            guard.block_until(None, deadline).map_err(|e| match e {
543                WakeReason::Interrupted => errno!(EINTR),
544                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
545            })
546        })
547    }
548
549    pub fn block_with_owner_until(
550        &self,
551        guard: EventWaitGuard<'_>,
552        new_owner: &zx::Thread,
553        deadline: zx::MonotonicInstant,
554    ) -> Result<(), Errno> {
555        self.run_in_state(RunState::Event(guard.event().clone()), move || {
556            guard.block_until(Some(new_owner), deadline).map_err(|e| match e {
557                WakeReason::Interrupted => errno!(EINTR),
558                WakeReason::DeadlineExpired => errno!(ETIMEDOUT),
559            })
560        })
561    }
562
563    /// Determine namespace node indicated by the dir_fd.
564    ///
565    /// Returns the namespace node and the path to use relative to that node.
566    pub fn resolve_dir_fd<'a, L>(
567        &self,
568        locked: &mut Locked<L>,
569        dir_fd: FdNumber,
570        mut path: &'a FsStr,
571        flags: ResolveFlags,
572    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
573    where
574        L: LockEqualOrBefore<FileOpsCore>,
575    {
576        let path_is_absolute = path.starts_with(b"/");
577        if path_is_absolute {
578            if flags.contains(ResolveFlags::BENEATH) {
579                return error!(EXDEV);
580            }
581            path = &path[1..];
582        }
583
584        let dir = if path_is_absolute && !flags.contains(ResolveFlags::IN_ROOT) {
585            self.fs().root()
586        } else if dir_fd == FdNumber::AT_FDCWD {
587            self.fs().cwd()
588        } else {
589            // O_PATH allowed for:
590            //
591            //   Passing the file descriptor as the dirfd argument of
592            //   openat() and the other "*at()" system calls.  This
593            //   includes linkat(2) with AT_EMPTY_PATH (or via procfs
594            //   using AT_SYMLINK_FOLLOW) even if the file is not a
595            //   directory.
596            //
597            // See https://man7.org/linux/man-pages/man2/open.2.html
598            let file = self.get_file_allowing_opath(dir_fd)?;
599            file.name.to_passive()
600        };
601
602        if !path.is_empty() {
603            if !dir.entry.node.is_dir() {
604                return error!(ENOTDIR);
605            }
606            dir.check_access(
607                locked,
608                self,
609                Access::EXEC,
610                CheckAccessReason::InternalPermissionChecks,
611            )?;
612        }
613        Ok((dir, path.into()))
614    }
615
616    /// A convenient wrapper for opening files relative to FdNumber::AT_FDCWD.
617    ///
618    /// Returns a FileHandle but does not install the FileHandle in the FdTable
619    /// for this task.
620    pub fn open_file(
621        &self,
622        locked: &mut Locked<Unlocked>,
623        path: &FsStr,
624        flags: OpenFlags,
625    ) -> Result<FileHandle, Errno> {
626        if flags.contains(OpenFlags::CREAT) {
627            // In order to support OpenFlags::CREAT we would need to take a
628            // FileMode argument.
629            return error!(EINVAL);
630        }
631        self.open_file_at(
632            locked,
633            FdNumber::AT_FDCWD,
634            path,
635            flags,
636            FileMode::default(),
637            ResolveFlags::empty(),
638            AccessCheck::default(),
639        )
640    }
641
642    /// Resolves a path for open.
643    ///
644    /// If the final path component points to a symlink, the symlink is followed (as long as
645    /// the symlink traversal limit has not been reached).
646    ///
647    /// If the final path component (after following any symlinks, if enabled) does not exist,
648    /// and `flags` contains `OpenFlags::CREAT`, a new node is created at the location of the
649    /// final path component.
650    ///
651    /// This returns the resolved node, and a boolean indicating whether the node has been created.
652    fn resolve_open_path<L>(
653        &self,
654        locked: &mut Locked<L>,
655        context: &mut LookupContext,
656        dir: &NamespaceNode,
657        path: &FsStr,
658        mode: FileMode,
659        flags: OpenFlags,
660    ) -> Result<(NamespaceNode, bool), Errno>
661    where
662        L: LockEqualOrBefore<FileOpsCore>,
663    {
664        context.update_for_path(path);
665        let mut parent_content = context.with(SymlinkMode::Follow);
666        let (parent, basename) = self.lookup_parent(locked, &mut parent_content, dir, path)?;
667        context.remaining_follows = parent_content.remaining_follows;
668
669        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);
670
671        // Lookup the child, without following a symlink or expecting it to be a directory.
672        let mut child_context = context.with(SymlinkMode::NoFollow);
673        child_context.must_be_directory = false;
674
675        match parent.lookup_child(locked, self, &mut child_context, basename) {
676            Ok(name) => {
677                if name.entry.node.is_lnk() {
678                    if flags.contains(OpenFlags::PATH)
679                        && context.symlink_mode == SymlinkMode::NoFollow
680                    {
681                        // When O_PATH is specified in flags, if pathname is a symbolic link
682                        // and the O_NOFOLLOW flag is also specified, then the call returns
683                        // a file descriptor referring to the symbolic link.
684                        // See https://man7.org/linux/man-pages/man2/openat.2.html
685                        //
686                        // If the trailing component (i.e., basename) of
687                        // pathname is a symbolic link, how.resolve contains
688                        // RESOLVE_NO_SYMLINKS, and how.flags contains both
689                        // O_PATH and O_NOFOLLOW, then an O_PATH file
690                        // descriptor referencing the symbolic link will be
691                        // returned.
692                        // See https://man7.org/linux/man-pages/man2/openat2.2.html
693                        return Ok((name, false));
694                    }
695
696                    if (!flags.contains(OpenFlags::PATH)
697                        && context.symlink_mode == SymlinkMode::NoFollow)
698                        || context.resolve_flags.contains(ResolveFlags::NO_SYMLINKS)
699                        || context.remaining_follows == 0
700                    {
701                        if must_create {
702                            // Since `must_create` is set, and a node was found, this returns EEXIST
703                            // instead of ELOOP.
704                            return error!(EEXIST);
705                        }
706                        // A symlink was found, but one of the following is true:
707                        // * flags specified O_NOFOLLOW but not O_PATH.
708                        // * how.resolve contains RESOLVE_NO_SYMLINKS
709                        // * too many symlink traversals have been attempted
710                        return error!(ELOOP);
711                    }
712
713                    context.remaining_follows -= 1;
714                    match name.readlink(locked, self)? {
715                        SymlinkTarget::Path(path) => {
716                            let dir = if path[0] == b'/' { self.fs().root() } else { parent };
717                            self.resolve_open_path(
718                                locked,
719                                context,
720                                &dir,
721                                path.as_ref(),
722                                mode,
723                                flags,
724                            )
725                        }
726                        SymlinkTarget::Node(name) => {
727                            if context.resolve_flags.contains(ResolveFlags::NO_MAGICLINKS)
728                                || name.entry.node.is_lnk()
729                            {
730                                error!(ELOOP)
731                            } else {
732                                Ok((name, false))
733                            }
734                        }
735                    }
736                } else {
737                    if must_create {
738                        return error!(EEXIST);
739                    }
740                    Ok((name, false))
741                }
742            }
743            Err(e) if e == errno!(ENOENT) && flags.contains(OpenFlags::CREAT) => {
744                if context.must_be_directory {
745                    return error!(EISDIR);
746                }
747                Ok((
748                    parent.open_create_node(
749                        locked,
750                        self,
751                        basename,
752                        mode.with_type(FileMode::IFREG),
753                        DeviceId::NONE,
754                        flags,
755                    )?,
756                    true,
757                ))
758            }
759            Err(e) => Err(e),
760        }
761    }
762
763    /// The primary entry point for opening files relative to a task.
764    ///
765    /// Absolute paths are resolve relative to the root of the FsContext for
766    /// this task. Relative paths are resolve relative to dir_fd. To resolve
767    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
768    /// dir_fd.
769    ///
770    /// Returns a FileHandle but does not install the FileHandle in the FdTable
771    /// for this task.
772    pub fn open_file_at(
773        &self,
774        locked: &mut Locked<Unlocked>,
775        dir_fd: FdNumber,
776        path: &FsStr,
777        flags: OpenFlags,
778        mode: FileMode,
779        resolve_flags: ResolveFlags,
780        access_check: AccessCheck,
781    ) -> Result<FileHandle, Errno> {
782        if path.is_empty() {
783            return error!(ENOENT);
784        }
785
786        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, resolve_flags)?;
787        self.open_namespace_node_at(locked, dir, path, flags, mode, resolve_flags, access_check)
788    }
789
790    pub fn open_namespace_node_at(
791        &self,
792        locked: &mut Locked<Unlocked>,
793        dir: NamespaceNode,
794        path: &FsStr,
795        flags: OpenFlags,
796        mode: FileMode,
797        mut resolve_flags: ResolveFlags,
798        access_check: AccessCheck,
799    ) -> Result<FileHandle, Errno> {
800        // 64-bit kernels force the O_LARGEFILE flag to be on.
801        let mut flags = flags | OpenFlags::LARGEFILE;
802        let opath = flags.contains(OpenFlags::PATH);
803        if opath {
804            // When O_PATH is specified in flags, flag bits other than O_CLOEXEC,
805            // O_DIRECTORY, and O_NOFOLLOW are ignored.
806            const ALLOWED_FLAGS: OpenFlags = OpenFlags::from_bits_truncate(
807                OpenFlags::PATH.bits()
808                    | OpenFlags::CLOEXEC.bits()
809                    | OpenFlags::DIRECTORY.bits()
810                    | OpenFlags::NOFOLLOW.bits(),
811            );
812            flags &= ALLOWED_FLAGS;
813        }
814
815        if flags.contains(OpenFlags::TMPFILE) && !flags.can_write() {
816            return error!(EINVAL);
817        }
818
819        let nofollow = flags.contains(OpenFlags::NOFOLLOW);
820        let must_create = flags.contains(OpenFlags::CREAT) && flags.contains(OpenFlags::EXCL);
821
822        let symlink_mode =
823            if nofollow || must_create { SymlinkMode::NoFollow } else { SymlinkMode::Follow };
824
825        let resolve_base = match (
826            resolve_flags.contains(ResolveFlags::BENEATH),
827            resolve_flags.contains(ResolveFlags::IN_ROOT),
828        ) {
829            (false, false) => ResolveBase::None,
830            (true, false) => ResolveBase::Beneath(dir.clone()),
831            (false, true) => ResolveBase::InRoot(dir.clone()),
832            (true, true) => return error!(EINVAL),
833        };
834
835        // `RESOLVE_BENEATH` and `RESOLVE_IN_ROOT` imply `RESOLVE_NO_MAGICLINKS`. This matches
836        // Linux behavior. Strictly speaking it's is not really required, but it's hard to
837        // implement `BENEATH` and `IN_ROOT` flags correctly otherwise.
838        if resolve_base != ResolveBase::None {
839            resolve_flags.insert(ResolveFlags::NO_MAGICLINKS);
840        }
841
842        let mut context = LookupContext {
843            symlink_mode,
844            remaining_follows: MAX_SYMLINK_FOLLOWS,
845            must_be_directory: flags.contains(OpenFlags::DIRECTORY),
846            resolve_flags,
847            resolve_base,
848        };
849        let (name, created) =
850            match self.resolve_open_path(locked, &mut context, &dir, path, mode, flags) {
851                Ok((n, c)) => (n, c),
852                Err(e) => {
853                    let mut abs_path = dir.path(&self.fs());
854                    abs_path.extend(&**path);
855                    track_file_not_found(abs_path);
856                    return Err(e);
857                }
858            };
859
860        let name = if flags.contains(OpenFlags::TMPFILE) {
861            // `O_TMPFILE` is incompatible with `O_CREAT`
862            if flags.contains(OpenFlags::CREAT) {
863                return error!(EINVAL);
864            }
865            name.create_tmpfile(locked, self, mode.with_type(FileMode::IFREG), flags)?
866        } else {
867            let mode = name.entry.node.info().mode;
868
869            // These checks are not needed in the `O_TMPFILE` case because `mode` refers to the
870            // file we are opening. With `O_TMPFILE`, that file is the regular file we just
871            // created rather than the node we found by resolving the path.
872            //
873            // For example, we do not need to produce `ENOTDIR` when `must_be_directory` is set
874            // because `must_be_directory` refers to the node we found by resolving the path.
875            // If that node was not a directory, then `create_tmpfile` will produce an error.
876            //
877            // Similarly, we never need to call `truncate` because `O_TMPFILE` is newly created
878            // and therefor already an empty file.
879
880            if !opath && nofollow && mode.is_lnk() {
881                return error!(ELOOP);
882            }
883
884            if mode.is_dir() {
885                if flags.can_write()
886                    || flags.contains(OpenFlags::CREAT)
887                    || flags.contains(OpenFlags::TRUNC)
888                {
889                    return error!(EISDIR);
890                }
891                if flags.contains(OpenFlags::DIRECT) {
892                    return error!(EINVAL);
893                }
894            } else if context.must_be_directory {
895                return error!(ENOTDIR);
896            }
897
898            if flags.contains(OpenFlags::TRUNC) && mode.is_reg() && !created {
899                // You might think we should check file.can_write() at this
900                // point, which is what the docs suggest, but apparently we
901                // are supposed to truncate the file if this task can write
902                // to the underlying node, even if we are opening the file
903                // as read-only. See OpenTest.CanTruncateReadOnly.
904                name.truncate(locked, self, 0)?;
905            }
906
907            name
908        };
909
910        // If the node has been created, the open operation should not verify access right:
911        // From <https://man7.org/linux/man-pages/man2/open.2.html>
912        //
913        // > Note that mode applies only to future accesses of the newly created file; the
914        // > open() call that creates a read-only file may well return a  read/write  file
915        // > descriptor.
916
917        let access_check = if created { AccessCheck::skip() } else { access_check };
918        name.open(locked, self, flags, access_check)
919    }
920
921    /// A wrapper for FsContext::lookup_parent_at that resolves the given
922    /// dir_fd to a NamespaceNode.
923    ///
924    /// Absolute paths are resolve relative to the root of the FsContext for
925    /// this task. Relative paths are resolve relative to dir_fd. To resolve
926    /// relative to the current working directory, pass FdNumber::AT_FDCWD for
927    /// dir_fd.
928    pub fn lookup_parent_at<'a, L>(
929        &self,
930        locked: &mut Locked<L>,
931        context: &mut LookupContext,
932        dir_fd: FdNumber,
933        path: &'a FsStr,
934    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
935    where
936        L: LockEqualOrBefore<FileOpsCore>,
937    {
938        let (dir, path) = self.resolve_dir_fd(locked, dir_fd, path, ResolveFlags::empty())?;
939        self.lookup_parent(locked, context, &dir, path)
940    }
941
942    /// Lookup the parent of a namespace node.
943    ///
944    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
945    /// calling this function directly.
946    ///
947    /// This function resolves all but the last component of the given path.
948    /// The function returns the parent directory of the last component as well
949    /// as the last component.
950    ///
951    /// If path is empty, this function returns dir and an empty path.
952    /// Similarly, if path ends with "." or "..", these components will be
953    /// returned along with the parent.
954    ///
955    /// The returned parent might not be a directory.
956    pub fn lookup_parent<'a, L>(
957        &self,
958        locked: &mut Locked<L>,
959        context: &mut LookupContext,
960        dir: &NamespaceNode,
961        path: &'a FsStr,
962    ) -> Result<(NamespaceNode, &'a FsStr), Errno>
963    where
964        L: LockEqualOrBefore<FileOpsCore>,
965    {
966        context.update_for_path(path);
967
968        let components = split_path(path);
969        if components.is_empty() {
970            return Ok((dir.clone(), Default::default()));
971        }
972        let result =
973            dir.lookup_children(locked, self, context, &components[0..components.len() - 1])?;
974        Ok((result, components.last().unwrap()))
975    }
976
977    /// Lookup a namespace node.
978    ///
979    /// Consider using Task::open_file_at or Task::lookup_parent_at rather than
980    /// calling this function directly.
981    ///
982    /// This function resolves the component of the given path.
983    pub fn lookup_path<L>(
984        &self,
985        locked: &mut Locked<L>,
986        context: &mut LookupContext,
987        dir: NamespaceNode,
988        path: &FsStr,
989    ) -> Result<NamespaceNode, Errno>
990    where
991        L: LockEqualOrBefore<FileOpsCore>,
992    {
993        let components = split_path(path);
994        dir.lookup_children(locked, self, context, &components)
995    }
996
997    /// Lookup a namespace node starting at the root directory.
998    ///
999    /// Resolves symlinks.
1000    pub fn lookup_path_from_root<L>(
1001        &self,
1002        locked: &mut Locked<L>,
1003        path: &FsStr,
1004    ) -> Result<NamespaceNode, Errno>
1005    where
1006        L: LockEqualOrBefore<FileOpsCore>,
1007    {
1008        let mut context = LookupContext::default();
1009        self.lookup_path(locked, &mut context, self.fs().root(), path)
1010    }
1011
1012    pub fn exec(
1013        &mut self,
1014        locked: &mut Locked<Unlocked>,
1015        executable: FileHandle,
1016        path: CString,
1017        argv: Vec<CString>,
1018        environ: Vec<CString>,
1019    ) -> Result<(), Errno> {
1020        // Executable must be a regular file
1021        if !executable.name.entry.node.is_reg() {
1022            return error!(EACCES);
1023        }
1024
1025        // File node must have EXEC mode permissions.
1026        // Note that the ability to execute a file is unrelated to the flags
1027        // used in the `open` call.
1028        executable.name.check_access(locked, self, Access::EXEC, CheckAccessReason::Exec)?;
1029
1030        // 1. Prepare a `ResolvedElf` to hold details of the binary to be executed, its credentials,
1031        //    etc.
1032        // TODO: https://fxbug.dev/483368940 - Split the initial `ResolvedElf` creation from the
1033        // resolution of the interpreter binary, if any.
1034        let mut resolved_elf =
1035            resolve_executable(locked, self, executable.clone(), path.clone(), argv, environ)?;
1036
1037        // 2. Allow LSMs to perform access-checks on the target `executable`, and to update the
1038        //    `resolved_elf.creds` as necessary.
1039        security::bprm_creds_for_exec(self, &executable.name, &mut resolved_elf)?;
1040
1041        // 3. Resolve details of the initial binary, whether the `executable` itself, or an
1042        //    interpreter, if `executable` is a script.
1043        // TODO: https://fxbug.dev/483368940 - Split the initial `ResolvedElf` creation from the
1044        // resolution of the interpreter binary, if any.
1045
1046        // 4. Apply UID, GID and capabilities according to the attributes of the resolved binary.
1047        // TODO: https://fxbug.dev/503338788 - Collate this logic into a `bprm_creds_from_file()`.
1048        let maybe_set_id = if self.kernel().features.enable_suid {
1049            resolved_elf.file.name.suid_and_sgid(&self)?
1050        } else {
1051            Default::default()
1052        };
1053
1054        if self.thread_group().read().tasks_count() > 1 {
1055            track_stub!(TODO("https://fxbug.dev/297434895"), "exec on multithread process");
1056            return error!(EINVAL);
1057        }
1058
1059        // 5. Finalize the `exec()` operation by actually updating the task state based on the
1060        //    resolved details. Failures during this step are unrecoverable.
1061        if let Err(err) = self.finish_exec(locked, path, resolved_elf, maybe_set_id) {
1062            log_warn!("unrecoverable error in exec: {err:?}");
1063
1064            send_standard_signal(locked, self, SignalInfo::forced(SIGSEGV));
1065            return Err(err);
1066        }
1067
1068        self.ptrace_event(locked, PtraceOptions::TRACEEXEC, self.task.tid as u64);
1069        self.signal_vfork();
1070        self.task.thread_group.sync_syscall_log_level();
1071
1072        Ok(())
1073    }
1074
1075    /// After the memory is unmapped, any failure in exec is unrecoverable and results in the
1076    /// process crashing. This function is for that second half; any error returned from this
1077    /// function will be considered unrecoverable.
1078    fn finish_exec(
1079        &mut self,
1080        locked: &mut Locked<Unlocked>,
1081        path: CString,
1082        mut resolved_elf: ResolvedElf,
1083        mut maybe_set_id: UserAndOrGroupId,
1084    ) -> Result<(), Errno> {
1085        // Now that the exec will definitely finish (or crash), notify owners of
1086        // locked futexes for the current process, which will be impossible to
1087        // update after process image is replaced.  See get_robust_list(2).
1088        self.notify_robust_list();
1089
1090        // If there is already a `MemoryManager` then `exec()` will tear down the underlying Zircon
1091        // address-space, before creating an address-space configured ready to run `resolved_elf`.
1092        let mm = {
1093            let new_mm = MemoryManager::exec(
1094                self.thread_group().root_vmar.unowned(),
1095                self.mm().ok(),
1096                resolved_elf.file.name.to_passive(),
1097                resolved_elf.arch_width,
1098            )?;
1099            self.live().mm.update(Some(new_mm.clone()));
1100            new_mm
1101        };
1102
1103        // TODO(https://fxbug.dev/42082680): All threads other than the calling thread are destroyed.
1104
1105        // TODO: POSIX timers are not preserved.
1106
1107        // TODO: Ensure that the filesystem context is un-shared, undoing the effect of CLONE_FS.
1108
1109        // The file descriptor table is unshared, undoing the effect of the CLONE_FILES flag of
1110        // clone(2).
1111        self.live().files.unshare();
1112        self.live().files.exec(locked, self);
1113
1114        {
1115            let mut state = self.write();
1116
1117            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
1118            //
1119            //   The aforementioned transformations of the effective IDs are not
1120            //   performed (i.e., the set-user-ID and set-group-ID bits are
1121            //   ignored) if any of the following is true:
1122            //
1123            //   * the no_new_privs attribute is set for the calling thread (see
1124            //      prctl(2));
1125            //
1126            //   *  the underlying filesystem is mounted nosuid (the MS_NOSUID
1127            //      flag for mount(2)); or
1128            //
1129            //   *  the calling process is being ptraced.
1130            //
1131            // The MS_NOSUID check is in `NamespaceNode::suid_and_sgid()`.
1132            if state.no_new_privs() || state.is_ptraced() {
1133                maybe_set_id.clear();
1134            }
1135
1136            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
1137            //
1138            //   The process's "dumpable" attribute is set to the value 1,
1139            //   unless a set-user-ID program, a set-group-ID program, or a
1140            //   program with capabilities is being executed, in which case the
1141            //   dumpable flag may instead be reset to the value in
1142            //   /proc/sys/fs/suid_dumpable, in the circumstances described
1143            //   under PR_SET_DUMPABLE in prctl(2).
1144            let dumpable =
1145                if maybe_set_id.is_none() { DumpPolicy::User } else { DumpPolicy::Disable };
1146            *mm.dumpable.lock(locked) = dumpable;
1147
1148            // TODO(https://fxbug.dev/433463756): Figure out whether this is the right place to
1149            // take the lock.
1150            // SAFETY: this is allowed because we are the CurrentTask.
1151            let mut writable_creds = unsafe { self.persistent_info.write_creds() };
1152            state.set_sigaltstack(None);
1153            state.robust_list_head = RobustListHeadPtr::null(self);
1154
1155            // From <https://man7.org/linux/man-pages/man2/execve.2.html>:
1156            //
1157            //   If a set-user-ID or set-group-ID
1158            //   program is being executed, then the parent death signal set by
1159            //   prctl(2) PR_SET_PDEATHSIG flag is cleared.
1160            //
1161            // TODO(https://fxbug.dev/356684424): Implement the behavior above once we support
1162            // the PR_SET_PDEATHSIG flag.
1163
1164            // TODO(tbodt): Check whether capability xattrs are set on the file, and grant/limit
1165            // capabilities accordingly.
1166            resolved_elf.creds.exec(maybe_set_id);
1167
1168            // TODO(https://fxbug.dev/503338788) - Migrate this (and other capabilities wrangling)
1169            // into a `common_cap::bprm_creds_from_file()` implementation.
1170            if state.no_new_privs() {
1171                resolved_elf.creds.cap_permitted &= self.current_creds().cap_permitted;
1172                resolved_elf.creds.cap_effective &= resolved_elf.creds.cap_permitted;
1173            }
1174
1175            security::bprm_committing_creds(locked, self, &resolved_elf)?;
1176
1177            let new_creds = Arc::new(resolved_elf.creds.clone());
1178            writable_creds.update(new_creds.clone());
1179            *self.current_creds.borrow_mut() = CurrentCreds::Cached(new_creds);
1180        }
1181
1182        let start_info = load_executable(self, resolved_elf, &path)?;
1183
1184        let regs: zx_restricted_state_t = start_info.into();
1185        self.thread_state.registers.load(regs);
1186        self.thread_state.extended_pstate.reset();
1187        self.thread_group().signal_actions.reset_for_exec();
1188
1189        // The exit signal (and that of the children) is reset to SIGCHLD.
1190        {
1191            let mut thread_group_state = self.thread_group().write();
1192            thread_group_state.exit_signal = Some(SIGCHLD);
1193            for (_, weak_child) in &mut thread_group_state.children {
1194                if let Some(child) = weak_child.upgrade() {
1195                    let mut child_state = child.write();
1196                    child_state.exit_signal = Some(SIGCHLD);
1197                }
1198            }
1199        }
1200
1201        security::bprm_committed_creds(locked, self)?;
1202
1203        self.thread_group().write().did_exec = true;
1204
1205        self.set_command_name(TaskCommand::from_path_bytes(path.to_bytes()));
1206
1207        Ok(())
1208    }
1209
1210    pub fn set_command_name(&self, new_name: TaskCommand) {
1211        // set_command_name needs to run before leader_command() in cases where self is the leader.
1212        self.task.set_command_name(new_name.clone());
1213        let leader_command = self.thread_group().read().leader_command();
1214        starnix_logging::set_current_task_info(
1215            new_name,
1216            leader_command,
1217            self.thread_group().leader,
1218            self.tid,
1219        );
1220    }
1221
1222    pub fn add_seccomp_filter(
1223        &mut self,
1224        locked: &mut Locked<Unlocked>,
1225        code: Vec<sock_filter>,
1226        flags: u32,
1227    ) -> Result<SyscallResult, Errno> {
1228        let new_filter = Arc::new(SeccompFilter::from_cbpf(
1229            &code,
1230            self.thread_group().next_seccomp_filter_id.add(1),
1231            flags & SECCOMP_FILTER_FLAG_LOG != 0,
1232        )?);
1233
1234        let mut maybe_fd: Option<FdNumber> = None;
1235
1236        if flags & SECCOMP_FILTER_FLAG_NEW_LISTENER != 0 {
1237            maybe_fd = Some(SeccompFilterContainer::create_listener(locked, self)?);
1238        }
1239
1240        // We take the process lock here because we can't change any of the threads
1241        // while doing a tsync.  So, you hold the process lock while making any changes.
1242        let state = self.thread_group().write();
1243
1244        if flags & SECCOMP_FILTER_FLAG_TSYNC != 0 {
1245            // TSYNC synchronizes all filters for all threads in the current process to
1246            // the current thread's
1247
1248            // We collect the filters for the current task upfront to save us acquiring
1249            // the task's lock a lot of times below.
1250            let mut filters: SeccompFilterContainer = self.read().seccomp_filters.clone();
1251
1252            // For TSYNC to work, all of the other thread filters in this process have to
1253            // be a prefix of this thread's filters, and none of them can be in
1254            // strict mode.
1255            let tasks = state.tasks();
1256            for task in &tasks {
1257                if task.tid == self.tid {
1258                    continue;
1259                }
1260                let other_task_state = task.read();
1261
1262                // Target threads cannot be in SECCOMP_MODE_STRICT
1263                if task.seccomp_filter_state.get() == SeccompStateValue::Strict {
1264                    return Self::seccomp_tsync_error(task.tid, flags);
1265                }
1266
1267                // Target threads' filters must be a subsequence of this thread's
1268                if !other_task_state.seccomp_filters.can_sync_to(&filters) {
1269                    return Self::seccomp_tsync_error(task.tid, flags);
1270                }
1271            }
1272
1273            // Now that we're sure we're allowed to do so, add the filter to all threads.
1274            filters.add_filter(new_filter, code.len() as u16)?;
1275
1276            for task in &tasks {
1277                let mut other_task_state = task.write();
1278
1279                other_task_state.enable_no_new_privs();
1280                other_task_state.seccomp_filters = filters.clone();
1281                task.set_seccomp_state(SeccompStateValue::UserDefined)?;
1282            }
1283        } else {
1284            let mut task_state = self.task.write();
1285
1286            task_state.seccomp_filters.add_filter(new_filter, code.len() as u16)?;
1287            self.set_seccomp_state(SeccompStateValue::UserDefined)?;
1288        }
1289
1290        if let Some(fd) = maybe_fd { Ok(fd.into()) } else { Ok(().into()) }
1291    }
1292
1293    pub fn run_seccomp_filters(
1294        &mut self,
1295        locked: &mut Locked<Unlocked>,
1296        syscall: &Syscall,
1297    ) -> Option<Result<SyscallResult, Errno>> {
1298        // Implementation of SECCOMP_FILTER_STRICT, which has slightly different semantics
1299        // from user-defined seccomp filters.
1300        if self.seccomp_filter_state.get() == SeccompStateValue::Strict {
1301            return SeccompState::do_strict(locked, self, syscall);
1302        }
1303
1304        // Run user-defined seccomp filters
1305        let result = self.task.read().seccomp_filters.run_all(self, syscall);
1306
1307        SeccompState::do_user_defined(locked, result, self, syscall)
1308    }
1309
1310    fn seccomp_tsync_error(id: i32, flags: u32) -> Result<SyscallResult, Errno> {
1311        // By default, TSYNC indicates failure state by returning the first thread
1312        // id not to be able to sync, rather than by returning -1 and setting
1313        // errno.  However, if TSYNC_ESRCH is set, it returns ESRCH.  This
1314        // prevents conflicts with fact that SECCOMP_FILTER_FLAG_NEW_LISTENER
1315        // makes seccomp return an fd.
1316        if flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH != 0 { error!(ESRCH) } else { Ok(id.into()) }
1317    }
1318
1319    // Notify all futexes in robust list.  The robust list is in user space, so we
1320    // are very careful about walking it, and there are a lot of quiet returns if
1321    // we fail to walk it.
1322    // TODO(https://fxbug.dev/42079081): This only sets the FUTEX_OWNER_DIED bit; it does
1323    // not wake up a waiter.
1324    pub fn notify_robust_list(&self) {
1325        let task_state = self.write();
1326        let robust_list_addr = task_state.robust_list_head.addr();
1327        if robust_list_addr == UserAddress::NULL {
1328            // No one has called set_robust_list.
1329            return;
1330        }
1331        let robust_list_res = self.read_multi_arch_object(task_state.robust_list_head);
1332
1333        let head = if let Ok(head) = robust_list_res {
1334            head
1335        } else {
1336            return;
1337        };
1338
1339        let offset = head.futex_offset;
1340
1341        let mut entries_count = 0;
1342        let mut curr_ptr = head.list.next;
1343        while curr_ptr.addr() != robust_list_addr.into() && entries_count < ROBUST_LIST_LIMIT {
1344            let curr_ref = self.read_multi_arch_object(curr_ptr);
1345
1346            let curr = if let Ok(curr) = curr_ref {
1347                curr
1348            } else {
1349                return;
1350            };
1351
1352            let Some(futex_base) = curr_ptr.addr().checked_add_signed(offset) else {
1353                return;
1354            };
1355
1356            let futex_addr = match FutexAddress::try_from(futex_base) {
1357                Ok(addr) => addr,
1358                Err(_) => {
1359                    return;
1360                }
1361            };
1362
1363            let Ok(mm) = self.mm() else {
1364                log_error!("Asked to notify robust list futexes in system task.");
1365                return;
1366            };
1367            let futex = if let Ok(futex) = mm.atomic_load_u32_relaxed(futex_addr) {
1368                futex
1369            } else {
1370                return;
1371            };
1372
1373            if (futex & FUTEX_TID_MASK) as i32 == self.tid {
1374                let owner_died = FUTEX_OWNER_DIED | futex;
1375                if mm.atomic_store_u32_relaxed(futex_addr, owner_died).is_err() {
1376                    return;
1377                }
1378            }
1379            curr_ptr = curr.next;
1380            entries_count += 1;
1381        }
1382    }
1383
1384    /// Returns a ref to this thread's SeccompNotifier.
1385    pub fn get_seccomp_notifier(&mut self) -> Option<SeccompNotifierHandle> {
1386        self.task.write().seccomp_filters.notifier.clone()
1387    }
1388
1389    pub fn set_seccomp_notifier(&mut self, notifier: Option<SeccompNotifierHandle>) {
1390        self.task.write().seccomp_filters.notifier = notifier;
1391    }
1392
1393    // On ARM32 Linux, some undefined instructions are treated as software breakpoints.
1394    // Read the instruction that caused the exception to handle it appropriately.
1395    fn is_arm32_breakpoint(&self) -> bool {
1396        #[cfg(target_arch = "aarch64")]
1397        if self.thread_state.arch_width().is_arch32() {
1398            let ip = self.thread_state.registers.instruction_pointer_register();
1399            let user_addr = UserAddress::from(ip);
1400
1401            if self.thread_state.registers.is_thumb() {
1402                // Read 2 bytes first to check the narrow Thumb instruction.
1403                if let Ok(insn_bytes_16) = self.read_memory_to_array::<2>(user_addr) {
1404                    let insn_u16 = u16::from_le_bytes(insn_bytes_16);
1405                    if insn_u16 == 0xde01 {
1406                        return true;
1407                    }
1408
1409                    // Next, read 4 bytes to check the wide Thumb instruction.
1410                    if let Ok(insn_bytes_32) = self.read_memory_to_array::<4>(user_addr) {
1411                        let insn_u32 = u32::from_le_bytes(insn_bytes_32);
1412                        if insn_u32 == 0xa000f7f0 {
1413                            return true;
1414                        }
1415                    }
1416                }
1417            } else {
1418                if let Ok(insn_bytes_32) = self.read_memory_to_array::<4>(user_addr) {
1419                    let insn_u32 = u32::from_le_bytes(insn_bytes_32);
1420                    if insn_u32 == 0xe7f001f0 {
1421                        return true;
1422                    }
1423                }
1424            }
1425        }
1426        false
1427    }
1428
1429    /// Processes a Zircon exception associated with this task.
1430    pub fn process_exception(
1431        &self,
1432        locked: &mut Locked<Unlocked>,
1433        report: &zx::ExceptionReport,
1434    ) -> ExceptionResult {
1435        match report.ty {
1436            zx::ExceptionType::General => match get_signal_for_general_exception(&report.arch) {
1437                Some(sig) => ExceptionResult::Signal(SignalInfo::kernel(sig)),
1438                None => {
1439                    log_error!("Unrecognized general exception: {:?}", report);
1440                    ExceptionResult::Signal(SignalInfo::kernel(SIGILL))
1441                }
1442            },
1443            zx::ExceptionType::FatalPageFault { status } => {
1444                let report = decode_page_fault_exception_report(&report.arch);
1445                if let Ok(mm) = self.mm() {
1446                    mm.handle_page_fault(locked, report, status)
1447                } else {
1448                    panic!(
1449                        "system task is handling a major page fault status={:?}, report={:?}",
1450                        status, report
1451                    );
1452                }
1453            }
1454            zx::ExceptionType::UndefinedInstruction => {
1455                if self.is_arm32_breakpoint() {
1456                    ExceptionResult::Signal(SignalInfo::kernel(SIGTRAP))
1457                } else {
1458                    ExceptionResult::Signal(SignalInfo::kernel(SIGILL))
1459                }
1460            }
1461            zx::ExceptionType::UnalignedAccess => {
1462                ExceptionResult::Signal(SignalInfo::kernel(SIGBUS))
1463            }
1464            zx::ExceptionType::SoftwareBreakpoint | zx::ExceptionType::HardwareBreakpoint => {
1465                ExceptionResult::Signal(SignalInfo::kernel(SIGTRAP))
1466            }
1467            zx::ExceptionType::ProcessNameChanged => {
1468                log_error!("Received unexpected process name changed exception");
1469                ExceptionResult::Handled
1470            }
1471            zx::ExceptionType::ProcessStarting
1472            | zx::ExceptionType::ThreadStarting
1473            | zx::ExceptionType::ThreadExiting => {
1474                log_error!("Received unexpected task lifecycle exception");
1475                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
1476            }
1477            zx::ExceptionType::PolicyError(policy_code) => {
1478                log_error!(policy_code:?; "Received Zircon policy error exception");
1479                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
1480            }
1481            zx::ExceptionType::UnknownUserGenerated { code, data } => {
1482                log_error!(code:?, data:?; "Received unexpected unknown user generated exception");
1483                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
1484            }
1485            zx::ExceptionType::Unknown { ty, code, data } => {
1486                log_error!(ty:?, code:?, data:?; "Received unexpected exception");
1487                ExceptionResult::Signal(SignalInfo::kernel(SIGSYS))
1488            }
1489        }
1490    }
1491
    /// Clone this task.
    ///
    /// Creates a new task object that shares some state with this task
    /// according to the given flags.
    ///
    /// Used by the clone() syscall to create both processes and threads.
    ///
    /// The exit signal is broken out from the flags parameter like clone3() rather than being
    /// bitwise-ORed like clone().
    ///
    /// # Arguments
    ///
    /// * `locked` - Lock-ordering token that is passed on to `create_zircon_process`,
    ///   `MemoryManager::snapshot_of`, `new_pidfd` and `add_file`.
    /// * `flags` - `CLONE_*` bits selecting what the child shares with this task.
    /// * `child_exit_signal` - Forwarded to `create_zircon_process` when a new thread group is
    ///   created; not used on the `CLONE_THREAD` path.
    /// * `user_parent_tid` - Address that receives the child's tid (written through this task)
    ///   when `CLONE_PARENT_SETTID` is set.
    /// * `user_child_tid` - Address stored as the child's `clear_child_tid` when
    ///   `CLONE_CHILD_CLEARTID` is set, and written with the child's tid (through the child) when
    ///   `CLONE_CHILD_SETTID` is set.
    /// * `user_pidfd` - Address that receives the new pidfd number when `CLONE_PIDFD` is set.
    ///
    /// # Errors
    ///
    /// * `EINVAL` - `CLONE_SIGHAND` without `CLONE_VM`; `CLONE_THREAD` without `CLONE_SIGHAND`;
    ///   `CLONE_PIDFD` together with `CLONE_THREAD`; `CLONE_PIDFD` and `CLONE_PARENT_SETTID` with
    ///   `user_parent_tid` and `user_pidfd` at the same address; any bit outside `VALID_FLAGS`;
    ///   or `CLONE_PARENT` when this thread group has no parent.
    /// * `ENOSYS` - a valid flag that is not part of `IMPLEMENTED_FLAGS`.
    /// * Any error propagated from the `CAP_SYS_ADMIN` check for `CLONE_NEWUTS`,
    ///   `create_zircon_process`, adding the task to its thread group, snapshotting the memory
    ///   manager, writing the tid/pidfd to user memory, or installing the pidfd.
    pub fn clone_task<L>(
        &self,
        locked: &mut Locked<L>,
        flags: u64,
        child_exit_signal: Option<Signal>,
        user_parent_tid: UserRef<pid_t>,
        user_child_tid: UserRef<pid_t>,
        user_pidfd: UserRef<FdNumber>,
    ) -> Result<TaskBuilder, Errno>
    where
        L: LockBefore<MmDumpable>,
        L: LockBefore<TaskRelease>,
        L: LockBefore<ProcessGroupState>,
    {
        // Flags this function accepts. Anything valid but outside this set yields ENOSYS below.
        const IMPLEMENTED_FLAGS: u64 = (CLONE_VM
            | CLONE_FS
            | CLONE_FILES
            | CLONE_SIGHAND
            | CLONE_THREAD
            | CLONE_SYSVSEM
            | CLONE_SETTLS
            | CLONE_PARENT
            | CLONE_PARENT_SETTID
            | CLONE_PIDFD
            | CLONE_CHILD_CLEARTID
            | CLONE_CHILD_SETTID
            | CLONE_VFORK
            | CLONE_NEWUTS
            | CLONE_PTRACE) as u64;

        // A mask with all valid flags set, because we want to return a different error code for an
        // invalid flag vs an unimplemented flag. Subtracting 1 from the largest valid flag gives a
        // mask with all flags below it set. Shift up by one to make sure the largest flag is also
        // set.
        const VALID_FLAGS: u64 = (CLONE_INTO_CGROUP << 1) - 1;

        // CLONE_SETTLS is implemented by sys_clone.

        // Decode each flag once so the rest of the function reads as plain booleans.
        let clone_files = flags & (CLONE_FILES as u64) != 0;
        let clone_fs = flags & (CLONE_FS as u64) != 0;
        let clone_parent = flags & (CLONE_PARENT as u64) != 0;
        let clone_parent_settid = flags & (CLONE_PARENT_SETTID as u64) != 0;
        let clone_pidfd = flags & (CLONE_PIDFD as u64) != 0;
        let clone_child_cleartid = flags & (CLONE_CHILD_CLEARTID as u64) != 0;
        let clone_child_settid = flags & (CLONE_CHILD_SETTID as u64) != 0;
        let clone_sysvsem = flags & (CLONE_SYSVSEM as u64) != 0;
        let clone_ptrace = flags & (CLONE_PTRACE as u64) != 0;
        let clone_thread = flags & (CLONE_THREAD as u64) != 0;
        let clone_vm = flags & (CLONE_VM as u64) != 0;
        let clone_sighand = flags & (CLONE_SIGHAND as u64) != 0;
        let clone_vfork = flags & (CLONE_VFORK as u64) != 0;
        let clone_newuts = flags & (CLONE_NEWUTS as u64) != 0;
        let clone_into_cgroup = flags & CLONE_INTO_CGROUP != 0;

        // The next three flags are only recorded as stubs here; no other code in this function
        // acts on them.
        if clone_ptrace {
            track_stub!(TODO("https://fxbug.dev/322874630"), "CLONE_PTRACE");
        }

        if clone_sysvsem {
            track_stub!(TODO("https://fxbug.dev/322875185"), "CLONE_SYSVSEM");
        }

        if clone_into_cgroup {
            track_stub!(TODO("https://fxbug.dev/403612570"), "CLONE_INTO_CGROUP");
        }

        // Flag combinations that are rejected as invalid.
        if clone_sighand && !clone_vm {
            return error!(EINVAL);
        }
        if clone_thread && !clone_sighand {
            return error!(EINVAL);
        }

        if clone_pidfd && clone_thread {
            return error!(EINVAL);
        }
        if clone_pidfd && clone_parent_settid && user_parent_tid.addr() == user_pidfd.addr() {
            // `clone()` uses the same out-argument for these, so error out if they have the same
            // user address.
            return error!(EINVAL);
        }

        // Bits above the largest known flag are invalid rather than merely unimplemented.
        if flags & !VALID_FLAGS != 0 {
            return error!(EINVAL);
        }

        if clone_vm && !clone_thread {
            // TODO(https://fxbug.dev/42066087) Implement CLONE_VM for child processes (not just child
            // threads). Currently this executes CLONE_VM (explicitly passed to clone() or as
            // used by vfork()) as a fork (the VM in the child is copy-on-write) which is almost
            // always OK.
            //
            // CLONE_VM is primarily as an optimization to avoid making a copy-on-write version of a
            // process' VM that will be immediately replaced with a call to exec(). The main users
            // (libc and language runtimes) don't actually rely on the memory being shared between
            // the two processes. And the vfork() man page explicitly allows vfork() to be
            // implemented as fork() which is what we do here.
            if !clone_vfork {
                track_stub!(
                    TODO("https://fxbug.dev/322875227"),
                    "CLONE_VM without CLONE_THREAD or CLONE_VFORK"
                );
            }
        } else if clone_thread && !clone_vm {
            // NOTE(review): the EINVAL checks above already require CLONE_SIGHAND for
            // CLONE_THREAD and CLONE_VM for CLONE_SIGHAND, so this branch looks unreachable.
            // TODO confirm before relying on the ENOSYS result.
            track_stub!(TODO("https://fxbug.dev/322875167"), "CLONE_THREAD without CLONE_VM");
            return error!(ENOSYS);
        }

        // Valid but unimplemented flags are reported with the offending bits attached.
        if flags & !IMPLEMENTED_FLAGS != 0 {
            track_stub!(
                TODO("https://fxbug.dev/322875130"),
                "clone unknown flags",
                flags & !IMPLEMENTED_FLAGS
            );
            return error!(ENOSYS);
        }

        // Share the filesystem context and file table when requested, otherwise fork them.
        let fs = if clone_fs { self.fs() } else { self.fs().fork() };
        let files = if clone_files { self.live().files.clone() } else { self.live().files.fork() };

        let kernel = self.kernel();

        // The pid table write lock is held from here until the child has been added to the table
        // inside the `release_on_error!` block below.
        let mut pids = kernel.pids.write();

        // Lock the cgroup process hierarchy so that the parent process cannot move to a different
        // cgroup while a new task or thread_group is created. This may be unnecessary if
        // CLONE_INTO_CGROUP is implemented and passed in.
        let mut cgroup2_pid_table = kernel.cgroups.lock_cgroup2_pid_table();
        // Create a `KernelSignal::Freeze` to put onto the new task, if the cgroup is frozen.
        let child_kernel_signals = cgroup2_pid_table
            .maybe_create_freeze_signal(self.thread_group())
            .into_iter()
            .collect::<VecDeque<_>>();

        // Declared here and assigned inside the block below, so that the values outlive the
        // thread-group and task-state locks taken in that block.
        let pid;
        let command;
        let creds;
        let scheduler_state;
        let no_new_privs;
        let seccomp_filters;
        // The child starts with a null robust list head rather than a copy of this task's.
        let robust_list_head = RobustListHeadPtr::null(self);
        let child_signal_mask;
        let timerslack_ns;
        let uts_ns;

        let TaskInfo { thread, thread_group, memory_manager } = {
            // These variables hold the original parent in case we need to switch the parent of the
            // new task because of CLONE_PARENT.
            let weak_original_parent;
            let original_parent;

            // Make sure to drop these locks ASAP to avoid inversion
            let thread_group_state = {
                let thread_group_state = self.thread_group().write();
                if clone_parent {
                    // With the CLONE_PARENT flag, the parent of the new task is our parent
                    // instead of ourselves.
                    weak_original_parent =
                        thread_group_state.parent.clone().ok_or_else(|| errno!(EINVAL))?;
                    // Release our own thread group's lock before locking the parent's.
                    std::mem::drop(thread_group_state);
                    original_parent = weak_original_parent.upgrade();
                    original_parent.write()
                } else {
                    thread_group_state
                }
            };

            let state = self.read();

            // Snapshot the per-task attributes the child inherits while the task state is locked.
            no_new_privs = state.no_new_privs();
            seccomp_filters = state.seccomp_filters.clone();
            child_signal_mask = state.signal_mask();

            pid = pids.allocate_pid();
            command = self.command();
            creds = self.current_creds().clone();
            scheduler_state = state.scheduler_state.fork();
            timerslack_ns = state.timerslack_ns;

            // A new UTS namespace requires CAP_SYS_ADMIN; otherwise the namespace is shared.
            uts_ns = if clone_newuts {
                security::check_task_capable(self, CAP_SYS_ADMIN)?;
                state.uts_ns.read().fork()
            } else {
                state.uts_ns.clone()
            };

            if clone_thread {
                // A new thread joins this task's thread group and reuses its memory manager; no
                // new `thread` handle is supplied here.
                TaskInfo {
                    thread: None,
                    thread_group: self.thread_group().clone(),
                    memory_manager: self.mm().ok(),
                }
            } else {
                // Drop the lock on this task before entering `create_zircon_process`, because it will
                // take a lock on the new thread group, and locks on thread groups have a higher
                // priority than locks on the task in the thread group.
                std::mem::drop(state);
                let signal_actions = if clone_sighand {
                    self.thread_group().signal_actions.clone()
                } else {
                    self.thread_group().signal_actions.fork()
                };
                let process_group = thread_group_state.process_group.clone();

                let task_info = {
                    trace_duration!(CATEGORY_STARNIX, "create_zircon_process");
                    create_zircon_process(
                        locked,
                        kernel,
                        Some(thread_group_state),
                        pid,
                        child_exit_signal,
                        process_group,
                        signal_actions,
                        command.clone(),
                    )?
                };

                // Register the new thread group in the same cgroup as this task's thread group.
                cgroup2_pid_table.inherit_cgroup(self.thread_group(), &task_info.thread_group);

                task_info
            }
        };

        // Drop the lock on the cgroup pid_table before creating the TaskBuilder.
        // If the TaskBuilder creation fails, the TaskBuilder is dropped, which calls
        // ThreadGroup::remove. ThreadGroup::remove takes the cgroup pid_table lock, causing
        // a cyclic lock dependency.
        std::mem::drop(cgroup2_pid_table);

        // Only create the vfork event when the caller requested CLONE_VFORK.
        let vfork_event = if clone_vfork { Some(Arc::new(zx::Event::create())) } else { None };

        // Clone live state in a nested scope to ensure that the RCU read scope is not held across
        // the release_on_error block.
        let abstract_socket_namespace;
        let abstract_vsock_namespace;
        {
            let live = self.live();
            abstract_socket_namespace = live.abstract_socket_namespace.clone();
            abstract_vsock_namespace = live.abstract_vsock_namespace.clone();
        }

        let mut child = TaskBuilder::new(Task::new(
            pid,
            command,
            thread_group,
            thread,
            files,
            memory_manager,
            fs,
            creds,
            abstract_socket_namespace,
            abstract_vsock_namespace,
            child_signal_mask,
            child_kernel_signals,
            vfork_event,
            scheduler_state,
            uts_ns,
            no_new_privs,
            // The child's seccomp state is derived from this task's current seccomp state.
            SeccompState::from(&self.seccomp_filter_state),
            seccomp_filters,
            robust_list_head,
            timerslack_ns,
        ));

        // Everything fallible from here on runs inside `release_on_error!`.
        // NOTE(review): presumably the macro releases `child` when the block returns an error;
        // confirm against the macro definition.
        release_on_error!(child, locked, {
            // Drop the pids lock as soon as possible after creating the child. Destroying the child
            // and removing it from the pids table itself requires the pids lock, so if an early exit
            // takes place we have a self deadlock.
            pids.add_task(Arc::clone(&child.task));
            std::mem::drop(pids);

            // Child lock must be taken before this lock. Drop the lock on the task, take a writable
            // lock on the child and take the current state back.

            #[cfg(any(test, debug_assertions))]
            {
                // Take the lock on the thread group and its child in the correct order to ensure any wrong ordering
                // will trigger the tracing-mutex at the right call site.
                if !clone_thread {
                    let _l1 = self.thread_group().read();
                    let _l2 = child.thread_group().read();
                }
            }

            if clone_thread {
                // New thread: register it in this task's thread group.
                self.thread_group().add(Arc::clone(&child.task))?;
            } else {
                // New process: register the task in the freshly created thread group.
                child.thread_group().add(Arc::clone(&child.task))?;

                // These manipulations of the signal handling state appear to be related to
                // CLONE_SIGHAND and CLONE_VM rather than CLONE_THREAD. However, we do not support
                // all the combinations of these flags, which means doing these operations here
                // might actually be correct. However, if you find a test that fails because of the
                // placement of this logic here, we might need to move it.
                let mut child_state = child.write();
                let state = self.read();
                child_state.set_sigaltstack(state.sigaltstack());
                child_state.set_signal_mask(state.signal_mask());
            }

            if !clone_vm {
                // We do not support running threads in the same process with different
                // MemoryManagers.
                assert!(!clone_thread);
                // Replace the child's memory manager with a snapshot of this task's.
                let child_mm = MemoryManager::snapshot_of(
                    locked,
                    &self.mm()?,
                    child.thread_group.root_vmar.unowned(),
                    self.thread_state.arch_width(),
                )?;
                child.live()?.mm.update(Some(child_mm));
            }

            if clone_parent_settid {
                // Written through this (the calling) task's memory.
                self.write_object(user_parent_tid, &child.tid)?;
            }

            if clone_child_cleartid {
                child.write().clear_child_tid = user_child_tid;
            }

            if clone_child_settid {
                // Written through the child's memory, unlike the parent tid above.
                child.write_object(user_child_tid, &child.tid)?;
            }

            if clone_pidfd {
                let locked = locked.cast_locked::<TaskRelease>();
                let file = new_pidfd(
                    locked,
                    self,
                    child.thread_group(),
                    &*child.mm()?,
                    OpenFlags::empty(),
                );
                // The pidfd is installed in the caller's fd table with close-on-exec set, and its
                // number is reported through the caller's memory.
                let pidfd = self.add_file(locked, file, FdFlags::CLOEXEC)?;
                self.write_object(user_pidfd, &pidfd)?;
            }

            // TODO(https://fxbug.dev/42066087): We do not support running different processes with
            // the same MemoryManager. Instead, we implement a rough approximation of that behavior
            // by making a copy-on-write clone of the memory from the original process.
            if clone_vm && !clone_thread {
                let child_mm = MemoryManager::snapshot_of(
                    locked,
                    &self.mm()?,
                    child.thread_group.root_vmar.unowned(),
                    self.thread_state.arch_width(),
                )?;
                child.live()?.mm.update(Some(child_mm));
            }

            // The child's thread state is a snapshot of the caller's current thread state.
            child.thread_state = self.thread_state.snapshot::<HeapRegs>();
            Ok(())
        });

        // Take the lock on thread group and task in the correct order to ensure any wrong ordering
        // will trigger the tracing-mutex at the right call site.
        #[cfg(any(test, debug_assertions))]
        {
            let _l1 = child.thread_group().read();
            let _l2 = child.read();
        }

        Ok(child)
    }
1868
1869    /// Sets the stop state (per set_stopped), and also notifies all listeners,
1870    /// including the parent process and the tracer if appropriate.
1871    pub fn set_stopped_and_notify(&self, stopped: StopState, siginfo: Option<SignalInfo>) {
1872        let maybe_signal_info = {
1873            let mut state = self.write();
1874            state.copy_state_from(self);
1875            state.set_stopped(stopped, siginfo, Some(self), None);
1876            state.prepare_signal_info(stopped)
1877        };
1878
1879        if let Some((tracer, signal_info)) = maybe_signal_info {
1880            if let Some(tracer) = tracer.upgrade() {
1881                tracer.write().send_signal(signal_info);
1882            }
1883        }
1884
1885        if !stopped.is_in_progress() {
1886            let parent = self.thread_group().read().parent.clone();
1887            if let Some(parent) = parent {
1888                parent
1889                    .upgrade()
1890                    .write()
1891                    .lifecycle_waiters
1892                    .notify_value(ThreadGroupLifecycleWaitValue::ChildStatus);
1893            }
1894        }
1895    }
1896
1897    /// If the task is stopping, set it as stopped. return whether the caller
1898    /// should stop.  The task might also be waking up.
1899    pub fn finalize_stop_state(&mut self) -> bool {
1900        let stopped = self.load_stopped();
1901
1902        if !stopped.is_stopping_or_stopped() {
1903            // If we are waking up, potentially write back state a tracer may have modified.
1904            let captured_state = self.write().take_captured_state();
1905            if let Some(captured) = captured_state {
1906                if captured.dirty {
1907                    self.thread_state.replace_registers(&captured.thread_state);
1908                }
1909            }
1910        }
1911
1912        // Stopping because the thread group is stopping.
1913        // Try to flip to GroupStopped - will fail if we shouldn't.
1914        if self.thread_group().set_stopped(StopState::GroupStopped, None, true)
1915            == StopState::GroupStopped
1916        {
1917            let signal = self.thread_group().read().last_signal.clone();
1918            // stopping because the thread group has stopped
1919            let event = Some(PtraceEventData::new_from_event(PtraceEvent::Stop, 0));
1920            self.write().set_stopped(StopState::GroupStopped, signal, Some(self), event);
1921            return true;
1922        }
1923
1924        // Stopping because the task is stopping
1925        if stopped.is_stopping_or_stopped() {
1926            if let Ok(stopped) = stopped.finalize() {
1927                self.set_stopped_and_notify(stopped, None);
1928            }
1929            return true;
1930        }
1931
1932        false
1933    }
1934
1935    /// Block the execution of `current_task` as long as the task is stopped and
1936    /// not terminated.
1937    pub fn block_while_stopped(&mut self, locked: &mut Locked<Unlocked>) {
1938        // Upgrade the state from stopping to stopped if needed. Return if the task
1939        // should not be stopped.
1940        if !self.finalize_stop_state() {
1941            return;
1942        }
1943
1944        let waiter = Waiter::with_options(WaiterOptions::IGNORE_SIGNALS);
1945        loop {
1946            // If we've exited, unstop the threads and return without notifying
1947            // waiters.
1948            if self.is_exitted() {
1949                self.thread_group().set_stopped(StopState::ForceAwake, None, false);
1950                self.write().set_stopped(StopState::ForceAwake, None, Some(self), None);
1951                return;
1952            }
1953
1954            if self.wake_or_wait_until_unstopped_async(&waiter) {
1955                return;
1956            }
1957
1958            // Do the wait. Result is not needed, as this is not in a syscall.
1959            let _: Result<(), Errno> = waiter.wait(locked, self);
1960
1961            // Maybe go from stopping to stopped, if we are currently stopping
1962            // again.
1963            self.finalize_stop_state();
1964        }
1965    }
1966
1967    /// For traced tasks, this will return the data neceessary for a cloned task
1968    /// to attach to the same tracer.
1969    pub fn get_ptrace_core_state_for_clone(
1970        &mut self,
1971        clone_args: &clone_args,
1972    ) -> (PtraceOptions, Option<PtraceCoreState>) {
1973        let state = self.write();
1974        if let Some(ptrace) = &state.ptrace {
1975            ptrace.get_core_state_for_clone(clone_args)
1976        } else {
1977            (PtraceOptions::empty(), None)
1978        }
1979    }
1980
1981    /// If currently being ptraced with the given option, emit the appropriate
1982    /// event.  PTRACE_EVENTMSG will return the given message.  Also emits the
1983    /// appropriate event for execve in the absence of TRACEEXEC.
1984    ///
1985    /// Note that the Linux kernel has a documented bug where, if TRACEEXIT is
1986    /// enabled, SIGKILL will trigger an event.  We do not exhibit this
1987    /// behavior.
1988    pub fn ptrace_event(
1989        &mut self,
1990        locked: &mut Locked<Unlocked>,
1991        trace_kind: PtraceOptions,
1992        msg: u64,
1993    ) {
1994        if !trace_kind.is_empty() {
1995            {
1996                let mut state = self.write();
1997                if let Some(ptrace) = &mut state.ptrace {
1998                    if !ptrace.has_option(trace_kind) {
1999                        // If this would be a TRACEEXEC, but TRACEEXEC is not
2000                        // turned on, then send a SIGTRAP.
2001                        if trace_kind == PtraceOptions::TRACEEXEC && !ptrace.is_seized() {
2002                            // Send a SIGTRAP so that the parent can gain control.
2003                            send_signal_first(locked, self, state, SignalInfo::kernel(SIGTRAP));
2004                        }
2005
2006                        return;
2007                    }
2008                    let ptrace_event = PtraceEvent::from_option(&trace_kind) as u32;
2009                    let siginfo = SignalInfo::with_detail(
2010                        SIGTRAP,
2011                        ((ptrace_event << 8) | SIGTRAP.number()) as i32,
2012                        SignalDetail::None,
2013                    );
2014                    state.set_stopped(
2015                        StopState::PtraceEventStopping,
2016                        Some(siginfo),
2017                        None,
2018                        Some(PtraceEventData::new(trace_kind, msg)),
2019                    );
2020                } else {
2021                    return;
2022                }
2023            }
2024            self.block_while_stopped(locked);
2025        }
2026    }
2027
2028    /// Causes the current thread's thread group to exit, notifying any ptracer
2029    /// of this task first.
2030    pub fn thread_group_exit(&mut self, locked: &mut Locked<Unlocked>, exit_status: ExitStatus) {
2031        self.ptrace_event(
2032            locked,
2033            PtraceOptions::TRACEEXIT,
2034            exit_status.signal_info_status() as u64,
2035        );
2036        self.thread_group().exit(locked, exit_status, None);
2037    }
2038
2039    /// The flags indicates only the flags as in clone3(), and does not use the low 8 bits for the
2040    /// exit signal as in clone().
2041    pub fn clone_task_builder_for_test<L>(
2042        &self,
2043        locked: &mut Locked<L>,
2044        flags: u64,
2045        exit_signal: Option<Signal>,
2046    ) -> TaskBuilder
2047    where
2048        L: LockBefore<MmDumpable>,
2049        L: LockBefore<TaskRelease>,
2050        L: LockBefore<ProcessGroupState>,
2051    {
2052        let result = self
2053            .clone_task(
2054                locked,
2055                flags,
2056                exit_signal,
2057                UserRef::default(),
2058                UserRef::default(),
2059                UserRef::default(),
2060            )
2061            .expect("failed to create task in test");
2062        result.task.write().set_spawned();
2063        result
2064    }
2065
2066    /// The flags indicates only the flags as in clone3(), and does not use the low 8 bits for the
2067    /// exit signal as in clone().
2068    pub fn clone_task_for_test<L>(
2069        &self,
2070        locked: &mut Locked<L>,
2071        flags: u64,
2072        exit_signal: Option<Signal>,
2073    ) -> crate::testing::AutoReleasableTask
2074    where
2075        L: LockBefore<MmDumpable>,
2076        L: LockBefore<TaskRelease>,
2077        L: LockBefore<ProcessGroupState>,
2078    {
2079        self.clone_task_builder_for_test(locked, flags, exit_signal).into()
2080    }
2081
2082    // See "Ptrace access mode checking" in https://man7.org/linux/man-pages/man2/ptrace.2.html
2083    pub fn check_ptrace_access_mode<L>(
2084        &self,
2085        locked: &mut Locked<L>,
2086        mode: PtraceAccessMode,
2087        target: &Task,
2088    ) -> Result<(), Errno>
2089    where
2090        L: LockBefore<MmDumpable>,
2091    {
2092        // (1)  If the calling thread and the target thread are in the same
2093        //      thread group, access is always allowed.
2094        if self.thread_group().leader == target.thread_group().leader {
2095            return Ok(());
2096        }
2097
2098        // (2)  If the access mode specifies PTRACE_MODE_FSCREDS, then, for
2099        //      the check in the next step, employ the caller's filesystem
2100        //      UID and GID.  (As noted in credentials(7), the filesystem
2101        //      UID and GID almost always have the same values as the
2102        //      corresponding effective IDs.)
2103        //
2104        //      Otherwise, the access mode specifies PTRACE_MODE_REALCREDS,
2105        //      so use the caller's real UID and GID for the checks in the
2106        //      next step.  (Most APIs that check the caller's UID and GID
2107        //      use the effective IDs.  For historical reasons, the
2108        //      PTRACE_MODE_REALCREDS check uses the real IDs instead.)
2109        let (uid, gid) = if mode.contains(PTRACE_MODE_FSCREDS) {
2110            let fscred = self.current_creds().as_fscred();
2111            (fscred.uid, fscred.gid)
2112        } else if mode.contains(PTRACE_MODE_REALCREDS) {
2113            let creds = self.current_creds();
2114            (creds.uid, creds.gid)
2115        } else {
2116            unreachable!();
2117        };
2118
2119        // (3)  Deny access if neither of the following is true:
2120        //
2121        //      -  The real, effective, and saved-set user IDs of the target
2122        //         match the caller's user ID, and the real, effective, and
2123        //         saved-set group IDs of the target match the caller's
2124        //         group ID.
2125        //
2126        //      -  The caller has the CAP_SYS_PTRACE capability in the user
2127        //         namespace of the target.
2128        let target_creds = target.real_creds();
2129        if !(target_creds.uid == uid
2130            && target_creds.euid == uid
2131            && target_creds.saved_uid == uid
2132            && target_creds.gid == gid
2133            && target_creds.egid == gid
2134            && target_creds.saved_gid == gid)
2135        {
2136            security::check_task_capable(self, CAP_SYS_PTRACE)?;
2137        }
2138
2139        // (4)  Deny access if the target process "dumpable" attribute has a
2140        //      value other than 1 (SUID_DUMP_USER; see the discussion of
2141        //      PR_SET_DUMPABLE in prctl(2)), and the caller does not have
2142        //      the CAP_SYS_PTRACE capability in the user namespace of the
2143        //      target process.
2144        let dumpable = *target.mm()?.dumpable.lock(locked);
2145        match dumpable {
2146            DumpPolicy::User => (),
2147            DumpPolicy::Disable => security::check_task_capable(self, CAP_SYS_PTRACE)?,
2148        }
2149
2150        // (5)  The kernel LSM security_ptrace_access_check() interface is
2151        //      invoked to see if ptrace access is permitted.
2152        security::ptrace_access_check(self, target, mode)?;
2153
2154        // (6)  If access has not been denied by any of the preceding steps,
2155        //      then access is allowed.
2156        Ok(())
2157    }
2158
2159    pub fn can_signal(
2160        &self,
2161        target: &Task,
2162        unchecked_signal: UncheckedSignal,
2163    ) -> Result<(), Errno> {
2164        // If both the tasks share a thread group the signal can be sent. This is not documented
2165        // in kill(2) because kill does not support task-level granularity in signal sending.
2166        if self.thread_group == target.thread_group {
2167            return Ok(());
2168        }
2169
2170        let self_creds = self.current_creds();
2171        let target_creds = target.real_creds();
2172        // From https://man7.org/linux/man-pages/man2/kill.2.html:
2173        //
2174        // > For a process to have permission to send a signal, it must either be
2175        // > privileged (under Linux: have the CAP_KILL capability in the user
2176        // > namespace of the target process), or the real or effective user ID of
2177        // > the sending process must equal the real or saved set- user-ID of the
2178        // > target process.
2179        //
2180        // Returns true if the credentials are considered to have the same user ID.
2181        if self_creds.euid == target_creds.saved_uid
2182            || self_creds.euid == target_creds.uid
2183            || self_creds.uid == target_creds.uid
2184            || self_creds.uid == target_creds.saved_uid
2185        {
2186            return Ok(());
2187        }
2188
2189        if Signal::try_from(unchecked_signal) == Ok(SIGCONT) {
2190            let target_session = target.thread_group().read().process_group.session.leader;
2191            let self_session = self.thread_group().read().process_group.session.leader;
2192            if target_session == self_session {
2193                return Ok(());
2194            }
2195        }
2196
2197        security::check_task_capable(self, CAP_KILL)
2198    }
2199}
2200
2201impl ArchSpecific for CurrentTask {
2202    fn is_arch32(&self) -> bool {
2203        self.thread_state.is_arch32()
2204    }
2205}
2206
2207impl MemoryAccessor for CurrentTask {
2208    fn read_memory<'a>(
2209        &self,
2210        addr: UserAddress,
2211        bytes: &'a mut [MaybeUninit<u8>],
2212    ) -> Result<&'a mut [u8], Errno> {
2213        self.mm()?.unified_read_memory(self, addr, bytes)
2214    }
2215
2216    fn read_memory_partial_until_null_byte<'a>(
2217        &self,
2218        addr: UserAddress,
2219        bytes: &'a mut [MaybeUninit<u8>],
2220    ) -> Result<&'a mut [u8], Errno> {
2221        self.mm()?.unified_read_memory_partial_until_null_byte(self, addr, bytes)
2222    }
2223
2224    fn read_memory_partial<'a>(
2225        &self,
2226        addr: UserAddress,
2227        bytes: &'a mut [MaybeUninit<u8>],
2228    ) -> Result<&'a mut [u8], Errno> {
2229        self.mm()?.unified_read_memory_partial(self, addr, bytes)
2230    }
2231
2232    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
2233        self.mm()?.unified_write_memory(self, addr, bytes)
2234    }
2235
2236    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
2237        self.mm()?.unified_write_memory_partial(self, addr, bytes)
2238    }
2239
2240    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
2241        self.mm()?.unified_zero(self, addr, length)
2242    }
2243}
2244
2245impl TaskMemoryAccessor for CurrentTask {
2246    fn maximum_valid_address(&self) -> Option<UserAddress> {
2247        self.mm().ok().map(|mm| mm.maximum_valid_user_address)
2248    }
2249}
2250
2251pub enum ExceptionResult {
2252    /// The exception was handled and no further action is required.
2253    Handled,
2254
2255    // The exception generated a signal that should be delivered.
2256    Signal(SignalInfo),
2257}
2258
2259fn split_path(path: &FsStr) -> LookupVec<&FsStr> {
2260    path.split(|c| *c == b'/').filter(|p| !p.is_empty()).map(<&FsStr>::from).collect()
2261}
2262
#[cfg(test)]
mod tests {
    use crate::testing::spawn_kernel_and_run;
    use starnix_uapi::auth::Credentials;

    // Smoke test for `override_creds`: call it with root credentials and a trivial closure,
    // and check that the closure's value comes back without a crash. This guards the
    // delegation to `override_creds_async`.
    #[::fuchsia::test]
    async fn test_override_creds_can_delegate_to_async_version() {
        spawn_kernel_and_run(async move |_, current_task| {
            let returned = current_task.override_creds(Credentials::root(), || 0);
            assert_eq!(returned, 0);
        })
        .await;
    }
}