starnix_core/task/
task.rs

// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use crate::mm::{MemoryAccessor, MemoryAccessorExt, MemoryManager, TaskMemoryAccessor};
use crate::mutable_state::{state_accessor, state_implementation};
use crate::ptrace::{
    AtomicStopState, PtraceEvent, PtraceEventData, PtraceState, PtraceStatus, StopState,
};
use crate::security;
use crate::signals::{KernelSignal, RunState, SignalDetail, SignalInfo, SignalState};
use crate::task::memory_attribution::MemoryAttributionLifecycleEvent;
use crate::task::tracing::KoidPair;
use crate::task::{
    AbstractUnixSocketNamespace, AbstractVsockSocketNamespace, CurrentTask, EventHandler, Kernel,
    NormalPriority, PidTable, ProcessEntryRef, ProcessExitInfo, RealtimePriority, SchedulerState,
    SchedulingPolicy, SeccompFilterContainer, SeccompState, SeccompStateValue, ThreadGroup,
    ThreadGroupKey, ThreadState, UtsNamespaceHandle, WaitCanceler, Waiter, ZombieProcess,
};
use crate::vfs::{FdTable, FsContext, FsNodeHandle, FsString};
use bitflags::bitflags;
use fuchsia_rcu::rcu_arc::RcuArc;
use fuchsia_rcu::rcu_option_arc::RcuOptionArc;
use fuchsia_rcu::rcu_ptr::RcuReadGuard;
use macro_rules_attribute::apply;
use starnix_logging::{log_warn, set_zx_name};
use starnix_registers::{HeapRegs, RegisterStorageEnum};
use starnix_sync::{
    LockBefore, Locked, Mutex, MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, TaskRelease,
    TerminalLock,
};
use starnix_task_command::TaskCommand;
use starnix_types::arch::ArchWidth;
use starnix_types::ownership::{OwnedRef, Releasable, ReleaseGuard, TempRef, WeakRef};
use starnix_types::stats::TaskTimeStats;
use starnix_uapi::auth::{Credentials, FsCred};
use starnix_uapi::errors::Errno;
use starnix_uapi::signals::{SIGCHLD, SigSet, Signal, sigaltstack_contains_pointer};
use starnix_uapi::user_address::{
    ArchSpecific, MappingMultiArchUserRef, UserAddress, UserCString, UserRef,
};
use starnix_uapi::{
    CLD_CONTINUED, CLD_DUMPED, CLD_EXITED, CLD_KILLED, CLD_STOPPED, CLD_TRAPPED,
    FUTEX_BITSET_MATCH_ANY, errno, error, from_status_like_fdio, pid_t, sigaction_t, sigaltstack,
    tid_t, uapi,
};
use std::collections::VecDeque;
use std::mem::MaybeUninit;
use std::ops::Deref;
use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
use std::sync::{Arc, Weak};
use std::{cmp, fmt};
use zx::{Signals, Task as _};

#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ExitStatus {
    Exit(u8),
    Kill(SignalInfo),
    CoreDump(SignalInfo),
    // The second field for Stop and Continue contains the type of ptrace stop
    // event that made it stop / continue, if applicable (PTRACE_EVENT_STOP,
    // PTRACE_EVENT_FORK, etc)
    Stop(SignalInfo, PtraceEvent),
    Continue(SignalInfo, PtraceEvent),
}
impl ExitStatus {
    /// Converts the given exit status to a status code suitable for returning from wait syscalls.
    pub fn wait_status(&self) -> i32 {
        match self {
            ExitStatus::Exit(status) => (*status as i32) << 8,
            ExitStatus::Kill(siginfo) => siginfo.signal.number() as i32,
            ExitStatus::CoreDump(siginfo) => (siginfo.signal.number() as i32) | 0x80,
            ExitStatus::Continue(siginfo, trace_event) => {
                let trace_event_val = *trace_event as u32;
                if trace_event_val != 0 {
                    (siginfo.signal.number() as i32) | (trace_event_val << 16) as i32
                } else {
                    0xffff
                }
            }
            ExitStatus::Stop(siginfo, trace_event) => {
                let trace_event_val = *trace_event as u32;
                (0x7f + ((siginfo.signal.number() as i32) << 8)) | (trace_event_val << 16) as i32
            }
        }
    }

    pub fn signal_info_code(&self) -> i32 {
        match self {
            ExitStatus::Exit(_) => CLD_EXITED as i32,
            ExitStatus::Kill(_) => CLD_KILLED as i32,
            ExitStatus::CoreDump(_) => CLD_DUMPED as i32,
            ExitStatus::Stop(_, _) => CLD_STOPPED as i32,
            ExitStatus::Continue(_, _) => CLD_CONTINUED as i32,
        }
    }

    pub fn signal_info_status(&self) -> i32 {
        match self {
            ExitStatus::Exit(status) => *status as i32,
            ExitStatus::Kill(siginfo)
            | ExitStatus::CoreDump(siginfo)
            | ExitStatus::Continue(siginfo, _)
            | ExitStatus::Stop(siginfo, _) => siginfo.signal.number() as i32,
        }
    }
}
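
// A minimal sketch (not part of the original source) of the wait-status bit
// layout produced by `wait_status()` above: a normal exit stores its code in
// bits 8..16, matching the encoding consumed by WEXITSTATUS in userspace.
#[cfg(test)]
mod exit_status_examples {
    use super::*;

    #[test]
    fn exit_code_is_stored_in_the_second_byte() {
        assert_eq!(ExitStatus::Exit(0).wait_status(), 0x0000);
        assert_eq!(ExitStatus::Exit(1).wait_status(), 0x0100);
        assert_eq!(ExitStatus::Exit(255).wait_status(), 0xff00);
    }
}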

bitflags! {
    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct TaskFlags: u8 {
        const EXITED = 0x1;
        const SIGNALS_AVAILABLE = 0x2;
        const TEMPORARY_SIGNAL_MASK = 0x4;
        /// Whether the executor should dump the stack of this task when it exits.
        /// Currently used to implement ExitStatus::CoreDump.
        const DUMP_ON_EXIT = 0x8;
    }
}

pub struct AtomicTaskFlags {
    flags: AtomicU8,
}

impl AtomicTaskFlags {
    fn new(flags: TaskFlags) -> Self {
        Self { flags: AtomicU8::new(flags.bits()) }
    }

    fn load(&self, ordering: Ordering) -> TaskFlags {
        let flags = self.flags.load(ordering);
        // We only ever store values from a `TaskFlags`.
        TaskFlags::from_bits_retain(flags)
    }

    fn swap(&self, flags: TaskFlags, ordering: Ordering) -> TaskFlags {
        let flags = self.flags.swap(flags.bits(), ordering);
        // We only ever store values from a `TaskFlags`.
        TaskFlags::from_bits_retain(flags)
    }
}
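
// Illustrative example (not in the original source): `AtomicTaskFlags` round-trips
// `TaskFlags` through the underlying `AtomicU8`, so a `swap` returns exactly the
// previously stored flags. This is the property `update_flags` relies on below.
#[cfg(test)]
mod atomic_task_flags_examples {
    use super::*;
    use std::sync::atomic::Ordering;

    #[test]
    fn swap_returns_previous_flags() {
        let flags = AtomicTaskFlags::new(TaskFlags::EXITED);
        let previous =
            flags.swap(TaskFlags::EXITED | TaskFlags::DUMP_ON_EXIT, Ordering::Relaxed);
        assert_eq!(previous, TaskFlags::EXITED);
        assert!(flags.load(Ordering::Relaxed).contains(TaskFlags::DUMP_ON_EXIT));
    }
}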

/// This contains thread state that tracers can inspect and modify.  It is
/// captured when a thread stops, and optionally copied back (if dirty) when a
/// thread starts again.  An alternative implementation would involve the
/// tracers acting on thread state directly; however, this would involve sharing
/// CurrentTask structures across multiple threads, which goes against the
/// intent of the design of CurrentTask.
pub struct CapturedThreadState {
    /// The thread state of the traced task.  This is copied out when the thread
    /// stops.
    pub thread_state: ThreadState<HeapRegs>,

    /// Indicates that the last ptrace operation changed the thread state, so it
    /// should be written back to the original thread.
    pub dirty: bool,
}

impl ArchSpecific for CapturedThreadState {
    fn is_arch32(&self) -> bool {
        self.thread_state.is_arch32()
    }
}

#[derive(Debug)]
pub struct RobustList {
    pub next: RobustListPtr,
}

pub type RobustListPtr =
    MappingMultiArchUserRef<RobustList, uapi::robust_list, uapi::arch32::robust_list>;

impl From<uapi::robust_list> for RobustList {
    fn from(robust_list: uapi::robust_list) -> Self {
        Self { next: RobustListPtr::from(robust_list.next) }
    }
}

#[cfg(target_arch = "aarch64")]
impl From<uapi::arch32::robust_list> for RobustList {
    fn from(robust_list: uapi::arch32::robust_list) -> Self {
        Self { next: RobustListPtr::from(robust_list.next) }
    }
}

#[derive(Debug)]
pub struct RobustListHead {
    pub list: RobustList,
    pub futex_offset: isize,
}

pub type RobustListHeadPtr =
    MappingMultiArchUserRef<RobustListHead, uapi::robust_list_head, uapi::arch32::robust_list_head>;

impl From<uapi::robust_list_head> for RobustListHead {
    fn from(robust_list_head: uapi::robust_list_head) -> Self {
        Self {
            list: robust_list_head.list.into(),
            futex_offset: robust_list_head.futex_offset as isize,
        }
    }
}

#[cfg(target_arch = "aarch64")]
impl From<uapi::arch32::robust_list_head> for RobustListHead {
    fn from(robust_list_head: uapi::arch32::robust_list_head) -> Self {
        Self {
            list: robust_list_head.list.into(),
            futex_offset: robust_list_head.futex_offset as isize,
        }
    }
}
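
// Sketch (illustrative, not in the original source) of the pointer arithmetic
// implied by `futex_offset`: per get_robust_list(2), the address of the futex
// word is each list entry's address plus the (possibly negative) offset stored
// in the list head. The addresses below are arbitrary example values.
#[cfg(test)]
mod robust_list_offset_examples {
    #[test]
    fn futex_address_is_entry_address_plus_offset() {
        let entry_addr: u64 = 0x4000_1000;
        let futex_offset: isize = -32;
        let futex_addr = entry_addr.wrapping_add_signed(futex_offset as i64);
        assert_eq!(futex_addr, 0x4000_0fe0);
    }
}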

pub struct TaskMutableState {
    // See https://man7.org/linux/man-pages/man2/set_tid_address.2.html
    pub clear_child_tid: UserRef<tid_t>,

    /// Signal handler related state. This is grouped together because atomicity is needed
    /// during signal sending and delivery.
    signals: SignalState,

    /// Internal signals that have a higher priority than regular signals.
    ///
    /// Keeping these in a separate queue outside of `SignalState` ensures that internal
    /// signals are never ignored or masked when dequeuing. The higher priority ensures
    /// that no user signal can jump the queue ahead of them (e.g. under ptrace, which
    /// delays the delivery of user signals).
    ///
    /// This design is not about observable consequence, but about convenient implementation.
    kernel_signals: VecDeque<KernelSignal>,

    /// The exit status that this task exited with.
    exit_status: Option<ExitStatus>,

    /// Desired scheduler state for the task.
    pub scheduler_state: SchedulerState,

    /// The UTS namespace assigned to this thread.
    ///
    /// This field is kept in the mutable state because the UTS namespace of a thread
    /// can be forked using `clone()` or `unshare()` syscalls.
    ///
    /// We use UtsNamespaceHandle because the UTS properties can be modified
    /// by any other thread that shares this namespace.
    pub uts_ns: UtsNamespaceHandle,

    /// Bit that determines whether a newly started program can have privileges its parent does
    /// not have.  See Documentation/prctl/no_new_privs.txt in the Linux kernel for details.
    /// Note that Starnix does not currently implement the relevant privileges (e.g.,
    /// setuid/setgid binaries).  So, you can set this, but it does nothing other than get
    /// propagated to children.
    ///
    /// The documentation indicates that this can only ever be set to
    /// true, and it cannot be reverted to false.  Accessor methods
    /// for this field ensure this property.
    no_new_privs: bool,

    /// Userspace hint about how to adjust the OOM score for this process.
    pub oom_score_adj: i32,

    /// List of currently installed seccomp_filters
    pub seccomp_filters: SeccompFilterContainer,

    /// A pointer to the head of the robust futex list of this thread in
    /// userspace. See get_robust_list(2)
    pub robust_list_head: RobustListHeadPtr,

    /// The timer slack used to group timer expirations for the calling thread.
    ///
    /// Timers may expire up to `timerslack_ns` late, but never early.
    ///
    /// If this value is 0, the task's default timerslack is used.
    pub timerslack_ns: u64,

    /// The default value for `timerslack_ns`. This value cannot change during the lifetime of a
    /// task.
    ///
    /// This value is set to the `timerslack_ns` of the creating thread, and thus is not constant
    /// across tasks.
    pub default_timerslack_ns: u64,

    /// Information that a tracer needs to communicate with this process, if it
    /// is being traced.
    pub ptrace: Option<Box<PtraceState>>,

    /// Information that a tracer needs to inspect this process.
    pub captured_thread_state: Option<Box<CapturedThreadState>>,
}

impl TaskMutableState {
    pub fn no_new_privs(&self) -> bool {
        self.no_new_privs
    }

    /// Sets the value of no_new_privs to true.  It is an error to set
    /// it to anything else.
    pub fn enable_no_new_privs(&mut self) {
        self.no_new_privs = true;
    }

    pub fn get_timerslack<T: zx::Timeline>(&self) -> zx::Duration<T> {
        zx::Duration::from_nanos(self.timerslack_ns as i64)
    }

    /// Sets the current timerslack of the task to `ns`.
    ///
    /// If `ns` is zero, the current timerslack gets reset to the task's default timerslack.
    pub fn set_timerslack_ns(&mut self, ns: u64) {
        if ns == 0 {
            self.timerslack_ns = self.default_timerslack_ns;
        } else {
            self.timerslack_ns = ns;
        }
    }

    pub fn is_ptraced(&self) -> bool {
        self.ptrace.is_some()
    }

    pub fn is_ptrace_listening(&self) -> bool {
        self.ptrace.as_ref().is_some_and(|ptrace| ptrace.stop_status == PtraceStatus::Listening)
    }

    pub fn ptrace_on_signal_consume(&mut self) -> bool {
        self.ptrace.as_mut().is_some_and(|ptrace: &mut Box<PtraceState>| {
            if ptrace.stop_status.is_continuing() {
                ptrace.stop_status = PtraceStatus::Default;
                false
            } else {
                true
            }
        })
    }

    pub fn notify_ptracers(&mut self) {
        if let Some(ptrace) = &self.ptrace {
            ptrace.tracer_waiters().notify_all();
        }
    }

    pub fn wait_on_ptracer(&self, waiter: &Waiter) {
        if let Some(ptrace) = &self.ptrace {
            ptrace.tracee_waiters.wait_async(&waiter);
        }
    }

    pub fn notify_ptracees(&mut self) {
        if let Some(ptrace) = &self.ptrace {
            ptrace.tracee_waiters.notify_all();
        }
    }

    pub fn take_captured_state(&mut self) -> Option<Box<CapturedThreadState>> {
        self.captured_thread_state.take()
    }

    pub fn copy_state_from(&mut self, current_task: &CurrentTask) {
        self.captured_thread_state = Some(Box::new(CapturedThreadState {
            thread_state: current_task.thread_state.extended_snapshot::<HeapRegs>(),
            dirty: false,
        }));
    }

    /// Returns the task's currently active signal mask.
    pub fn signal_mask(&self) -> SigSet {
        self.signals.mask()
    }

    /// Returns true if `signal` is currently blocked by this task's signal mask.
    pub fn is_signal_masked(&self, signal: Signal) -> bool {
        self.signals.mask().has_signal(signal)
    }

    /// Returns true if `signal` is blocked by the saved signal mask.
    ///
    /// Note that the current signal mask may still not be blocking the signal.
    pub fn is_signal_masked_by_saved_mask(&self, signal: Signal) -> bool {
        self.signals.saved_mask().is_some_and(|mask| mask.has_signal(signal))
    }

    /// Enqueues an internal signal at the back of the task's kernel signal queue.
    pub fn enqueue_kernel_signal(&mut self, signal: KernelSignal) {
        self.kernel_signals.push_back(signal);
    }

    /// Enqueues a signal at the back of the task's signal queue.
    pub fn enqueue_signal(&mut self, signal: SignalInfo) {
        self.signals.enqueue(signal);
    }

    /// Enqueues the signal, allowing the signal to skip straight to the front of the task's queue.
    ///
    /// `enqueue_signal` is the more common API to use.
    ///
    /// Note that this will not guarantee that the signal is dequeued before any process-directed
    /// signals.
    pub fn enqueue_signal_front(&mut self, signal: SignalInfo) {
        self.signals.enqueue_front(signal);
    }

    /// Sets the current signal mask of the task.
    pub fn set_signal_mask(&mut self, mask: SigSet) {
        self.signals.set_mask(mask);
    }

    /// Sets a temporary signal mask for the task.
    ///
    /// This mask should be removed by a matching call to `restore_signal_mask`.
    pub fn set_temporary_signal_mask(&mut self, mask: SigSet) {
        self.signals.set_temporary_mask(mask);
    }

    /// Removes the currently active temporary signal mask and restores the
    /// previously active signal mask.
    pub fn restore_signal_mask(&mut self) {
        self.signals.restore_mask();
    }

    /// Returns true if the task's current `RunState` is blocked.
    pub fn is_blocked(&self) -> bool {
        self.signals.run_state.is_blocked()
    }

    /// Sets the task's `RunState` to `run_state`.
    pub fn set_run_state(&mut self, run_state: RunState) {
        self.signals.run_state = run_state;
    }

    pub fn run_state(&self) -> RunState {
        self.signals.run_state.clone()
    }

    pub fn on_signal_stack(&self, stack_pointer_register: u64) -> bool {
        self.signals
            .alt_stack
            .map(|signal_stack| sigaltstack_contains_pointer(&signal_stack, stack_pointer_register))
            .unwrap_or(false)
    }

    pub fn set_sigaltstack(&mut self, stack: Option<sigaltstack>) {
        self.signals.alt_stack = stack;
    }

    pub fn sigaltstack(&self) -> Option<sigaltstack> {
        self.signals.alt_stack
    }

    pub fn wait_on_signal(&mut self, waiter: &Waiter) {
        self.signals.signal_wait.wait_async(waiter);
    }

    pub fn signals_mut(&mut self) -> &mut SignalState {
        &mut self.signals
    }

    pub fn wait_on_signal_fd_events(
        &self,
        waiter: &Waiter,
        mask: SigSet,
        handler: EventHandler,
    ) -> WaitCanceler {
        self.signals.signal_wait.wait_async_signal_mask(waiter, mask, handler)
    }

    pub fn notify_signal_waiters(&self, signal: &Signal) {
        self.signals.signal_wait.notify_signal(signal);
    }

    /// Thaws the task if it has been frozen.
    pub fn thaw(&mut self) {
        if let RunState::Frozen(waiter) = self.run_state() {
            waiter.notify();
        }
    }

    pub fn is_frozen(&self) -> bool {
        matches!(self.run_state(), RunState::Frozen(_))
    }

    #[cfg(test)]
    pub fn kernel_signals_for_test(&self) -> &VecDeque<KernelSignal> {
        &self.kernel_signals
    }
}

#[apply(state_implementation!)]
impl TaskMutableState<Base = Task> {
    pub fn set_stopped(
        &mut self,
        stopped: StopState,
        siginfo: Option<SignalInfo>,
        current_task: Option<&CurrentTask>,
        event: Option<PtraceEventData>,
    ) {
        if stopped.ptrace_only() && self.ptrace.is_none() {
            return;
        }

        if self.base.load_stopped().is_illegal_transition(stopped) {
            return;
        }

        // TODO(https://g-issues.fuchsia.dev/issues/306438676): When task can be
        // stopped inside user code, task will need to be either restarted or
        // stopped here.
        self.store_stopped(stopped);
        if stopped.is_stopped() {
            if let Some(ref current_task) = current_task {
                self.copy_state_from(current_task);
            }
        }
        if let Some(ptrace) = &mut self.ptrace {
            ptrace.set_last_signal(siginfo);
            ptrace.set_last_event(event);
        }
        if stopped == StopState::Waking || stopped == StopState::ForceWaking {
            self.notify_ptracees();
        }
        if !stopped.is_in_progress() {
            self.notify_ptracers();
        }
    }

    // Prepare a SignalInfo to be sent to the tracer, if any.
    pub fn prepare_signal_info(
        &mut self,
        stopped: StopState,
    ) -> Option<(Weak<ThreadGroup>, SignalInfo)> {
        if !stopped.is_stopped() {
            return None;
        }

        if let Some(ptrace) = &self.ptrace {
            if let Some(last_signal) = ptrace.get_last_signal_ref() {
                let signal_info = SignalInfo::new(
                    SIGCHLD,
                    CLD_TRAPPED as i32,
                    SignalDetail::SIGCHLD {
                        pid: self.base.tid,
                        uid: self.base.real_creds().uid,
                        status: last_signal.signal.number() as i32,
                    },
                );

                return Some((ptrace.core_state.thread_group.clone(), signal_info));
            }
        }

        None
    }

    pub fn set_ptrace(&mut self, tracer: Option<Box<PtraceState>>) -> Result<(), Errno> {
        if tracer.is_some() && self.ptrace.is_some() {
            return error!(EPERM);
        }

        if tracer.is_none() {
            // Handle the case where this is called while the thread group is being released.
            if let Ok(tg_stop_state) = self.base.thread_group().load_stopped().as_in_progress() {
                self.set_stopped(tg_stop_state, None, None, None);
            }
        }
        self.ptrace = tracer;
        Ok(())
    }

    pub fn can_accept_ptrace_commands(&mut self) -> bool {
        !self.base.load_stopped().is_waking_or_awake()
            && self.is_ptraced()
            && !self.is_ptrace_listening()
    }

    fn store_stopped(&mut self, state: StopState) {
        // We don't actually use the guard but we require it to enforce that the
        // caller holds the thread group's mutable state lock (identified by
        // mutable access to the thread group's mutable state).

        self.base.stop_state.store(state, Ordering::Relaxed)
    }

    pub fn update_flags(&mut self, clear: TaskFlags, set: TaskFlags) {
        // We don't actually use the guard but we require it to enforce that the
        // caller holds the task's mutable state lock (identified by mutable
        // access to the task's mutable state).

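        // Note: `clear ^ set == clear | set` holds exactly when `clear` and `set`
        // are disjoint, so the assert below rejects a flag passed in both sets.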
        debug_assert_eq!(clear ^ set, clear | set);
        let observed = self.base.flags();
        let swapped = self.base.flags.swap((observed | set) & !clear, Ordering::Relaxed);
        debug_assert_eq!(swapped, observed);
    }

    pub fn set_flags(&mut self, flag: TaskFlags, v: bool) {
        let (clear, set) = if v { (TaskFlags::empty(), flag) } else { (flag, TaskFlags::empty()) };

        self.update_flags(clear, set);
    }

    pub fn set_exit_status(&mut self, status: ExitStatus) {
        self.set_flags(TaskFlags::EXITED, true);
        self.exit_status = Some(status);
    }

    pub fn set_exit_status_if_not_already(&mut self, status: ExitStatus) {
        self.set_flags(TaskFlags::EXITED, true);
        self.exit_status.get_or_insert(status);
    }

    /// Returns the number of pending signals for this task, without considering the signal mask.
    pub fn pending_signal_count(&self) -> usize {
        self.signals.num_queued() + self.base.thread_group().pending_signals.lock().num_queued()
    }

    /// Returns `true` if `signal` is pending for this task, without considering the signal mask.
    pub fn has_signal_pending(&self, signal: Signal) -> bool {
        self.signals.has_queued(signal)
            || self.base.thread_group().pending_signals.lock().has_queued(signal)
    }

    /// The set of pending signals for the task, including the signals pending for the thread
    /// group.
    pub fn pending_signals(&self) -> SigSet {
        self.signals.pending() | self.base.thread_group().pending_signals.lock().pending()
    }

    /// The set of pending signals for the task specifically, not including the signals pending
    /// for the thread group.
    pub fn task_specific_pending_signals(&self) -> SigSet {
        self.signals.pending()
    }

    /// Returns true if any currently pending signal is allowed by `mask`.
    pub fn is_any_signal_allowed_by_mask(&self, mask: SigSet) -> bool {
        self.signals.is_any_allowed_by_mask(mask)
            || self.base.thread_group().pending_signals.lock().is_any_allowed_by_mask(mask)
    }

    /// Returns whether or not a signal is pending for this task, taking the current
    /// signal mask into account.
    pub fn is_any_signal_pending(&self) -> bool {
        let mask = self.signal_mask();
        self.signals.is_any_pending()
            || self.base.thread_group().pending_signals.lock().is_any_allowed_by_mask(mask)
    }

    /// Returns the next pending signal that passes `predicate`.
    fn take_next_signal_where<F>(&mut self, predicate: F) -> Option<SignalInfo>
    where
        F: Fn(&SignalInfo) -> bool,
    {
        if let Some(signal) =
            self.base.thread_group().pending_signals.lock().take_next_where(&predicate)
        {
            Some(signal)
        } else {
            self.signals.take_next_where(&predicate)
        }
    }

    /// Removes and returns the next pending `signal` for this task.
    ///
    /// Returns `None` if `siginfo` is a blocked signal, or no such signal is pending.
    pub fn take_specific_signal(&mut self, siginfo: SignalInfo) -> Option<SignalInfo> {
        let signal_mask = self.signal_mask();
        if signal_mask.has_signal(siginfo.signal) {
            return None;
        }

        let predicate = |s: &SignalInfo| s.signal == siginfo.signal;
        self.take_next_signal_where(predicate)
    }

    /// Removes and returns a pending signal that is unblocked by the current signal mask.
    ///
    /// Returns `None` if there are no unblocked signals pending.
    pub fn take_any_signal(&mut self) -> Option<SignalInfo> {
        self.take_signal_with_mask(self.signal_mask())
    }

    /// Removes and returns a pending signal that is unblocked by `signal_mask`.
    ///
    /// Returns `None` if there are no signals pending that are unblocked by `signal_mask`.
    pub fn take_signal_with_mask(&mut self, signal_mask: SigSet) -> Option<SignalInfo> {
        let predicate = |s: &SignalInfo| !signal_mask.has_signal(s.signal) || s.force;
        self.take_next_signal_where(predicate)
    }

    /// Removes and returns a pending internal signal.
    ///
    /// Returns `None` if there are no signals pending.
    pub fn take_kernel_signal(&mut self) -> Option<KernelSignal> {
        self.kernel_signals.pop_front()
    }

    #[cfg(test)]
    pub fn queued_signal_count(&self, signal: Signal) -> usize {
        self.signals.queued_count(signal)
            + self.base.thread_group().pending_signals.lock().queued_count(signal)
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TaskStateCode {
    // Task is being executed.
    Running,

    // Task is waiting for an event.
    Sleeping,

    // Tracing stop
    TracingStop,

    // Task has exited.
    Zombie,
}

impl TaskStateCode {
    pub fn code_char(&self) -> char {
        match self {
            TaskStateCode::Running => 'R',
            TaskStateCode::Sleeping => 'S',
            TaskStateCode::TracingStop => 't',
            TaskStateCode::Zombie => 'Z',
        }
    }

    pub fn name(&self) -> &'static str {
        match self {
            TaskStateCode::Running => "running",
            TaskStateCode::Sleeping => "sleeping",
            TaskStateCode::TracingStop => "tracing stop",
            TaskStateCode::Zombie => "zombie",
        }
    }
}
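
// Illustrative mapping (not in the original source): the single-character codes
// above mirror the process state field reported in /proc/<pid>/stat on Linux.
#[cfg(test)]
mod task_state_code_examples {
    use super::*;

    #[test]
    fn code_chars_match_names() {
        assert_eq!(TaskStateCode::Running.code_char(), 'R');
        assert_eq!(TaskStateCode::Zombie.code_char(), 'Z');
        assert_eq!(TaskStateCode::TracingStop.name(), "tracing stop");
    }
}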

/// The information about the task that needs to be available to the `ThreadGroup` while computing
/// which process a wait can target. It is necessary to share this data with the `ThreadGroup` so
/// that it remains available while the task is being dropped, at which point the task can no
/// longer be reached through a weak pointer.
#[derive(Debug)]
pub struct TaskPersistentInfoState {
    /// Immutable information about the task
    tid: tid_t,
    thread_group_key: ThreadGroupKey,

    /// The command of this task.
    command: Mutex<TaskCommand>,

    /// The security credentials for this task. These are only set at task creation or when the
    /// task is the CurrentTask.
    creds: RcuArc<Credentials>,

    // A lock for the security credentials. Writers must take the lock; readers that need to
    // ensure the credentials do not change may take the lock.
    creds_lock: RwLock<()>,
}

/// Guard for reading locked credentials.
pub struct CredentialsReadGuard<'a> {
    _lock: RwLockReadGuard<'a, ()>,
    creds: RcuReadGuard<Credentials>,
}

impl<'a> Deref for CredentialsReadGuard<'a> {
    type Target = Credentials;

    fn deref(&self) -> &Self::Target {
        self.creds.deref()
    }
}

/// Guard for writing credentials. No `CredentialsReadGuard` for the same task can exist
/// concurrently.
pub struct CredentialsWriteGuard<'a> {
    _lock: RwLockWriteGuard<'a, ()>,
    creds: &'a RcuArc<Credentials>,
}

impl<'a> CredentialsWriteGuard<'a> {
    pub fn update(&mut self, creds: Arc<Credentials>) {
        self.creds.update(creds);
    }
}

impl TaskPersistentInfoState {
    fn new(
        tid: tid_t,
        thread_group_key: ThreadGroupKey,
        command: TaskCommand,
        creds: Arc<Credentials>,
    ) -> TaskPersistentInfo {
        Arc::new(Self {
            tid,
            thread_group_key,
            command: Mutex::new(command),
            creds: RcuArc::new(creds),
            creds_lock: RwLock::new(()),
        })
    }

    pub fn tid(&self) -> tid_t {
        self.tid
    }

    pub fn pid(&self) -> pid_t {
        self.thread_group_key.pid()
    }

    pub fn command_guard(&self) -> MutexGuard<'_, TaskCommand> {
        self.command.lock()
    }

    /// Snapshots the credentials, returning a short-lived RCU-guarded reference.
    pub fn real_creds(&self) -> RcuReadGuard<Credentials> {
        self.creds.read()
    }

    /// Snapshots the credentials, returning a new reference. Use this if you need to stash the
    /// credentials somewhere.
    pub fn clone_creds(&self) -> Arc<Credentials> {
        self.creds.to_arc()
    }

    /// Returns a read lock on the credentials. This is appropriate if you need to guarantee that
    /// the Task's credentials will not change during a security-sensitive operation.
    pub fn lock_creds(&self) -> CredentialsReadGuard<'_> {
        let lock = self.creds_lock.read();
        CredentialsReadGuard { _lock: lock, creds: self.creds.read() }
    }

    /// Locks the credentials for writing.
    /// SAFETY: Only use from CurrentTask, and keep the subjective credentials stored in
    /// CurrentTask in sync.
    pub(in crate::task) unsafe fn write_creds(&self) -> CredentialsWriteGuard<'_> {
        let lock = self.creds_lock.write();
        CredentialsWriteGuard { _lock: lock, creds: &self.creds }
    }
}
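
// Usage sketch (illustrative, not in the original source; `info` stands for any
// `&TaskPersistentInfoState`): `real_creds()` gives a short-lived RCU snapshot
// for one-off checks, `lock_creds()` pins the credentials across a multi-step
// security decision, and `clone_creds()` returns an owned reference suitable
// for stashing, e.g.:
//
//     let uid = info.real_creds().uid;                     // cheap snapshot
//     let guard = info.lock_creds();                       // stable across checks
//     let owned: Arc<Credentials> = info.clone_creds();    // long-lived copy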

pub type TaskPersistentInfo = Arc<TaskPersistentInfoState>;

/// A unit of execution.
///
/// A task is the primary unit of execution in the Starnix kernel. Most tasks are *user* tasks,
/// which have an associated Zircon thread. The Zircon thread switches between restricted mode,
/// in which the thread runs userspace code, and normal mode, in which the thread runs Starnix
/// code.
///
/// Tasks track the resources used by userspace by referencing various objects, such as an
/// `FdTable`, a `MemoryManager`, and an `FsContext`. Many tasks can share references to these
/// objects. In principle, which objects are shared between which tasks can be largely arbitrary,
/// but there are common patterns of sharing. For example, tasks created with `pthread_create`
/// will share the `FdTable`, `MemoryManager`, and `FsContext` and are often called "threads" by
/// userspace programmers. Tasks created by `posix_spawn` do not share these objects and are often
/// called "processes" by userspace programmers. However, inside the kernel, there is no clear
/// definition of a "thread" or a "process".
///
/// During boot, the kernel creates the first task, often called `init`. The vast majority of other
/// tasks are created as transitive clones (e.g., using `clone(2)`) of that task. Sometimes, the
/// kernel will create new tasks from whole cloth, either with a corresponding userspace component
/// or to represent some background work inside the kernel.
///
/// See also `CurrentTask`, which represents the task corresponding to the thread that is currently
/// executing.
pub struct Task {
    /// Weak reference to the `OwnedRef` of this `Task`. This allows retrieving the
    /// `TempRef` from a raw `Task`.
    pub weak_self: WeakRef<Self>,

    /// A unique identifier for this task.
    ///
    /// This value can be read in userspace using `gettid(2)`. In general, this value
    /// is different from the value returned by `getpid(2)`, which returns the `id` of the leader
    /// of the `thread_group`.
    pub tid: tid_t,

    /// The process key of this task.
    pub thread_group_key: ThreadGroupKey,

    /// The kernel to which this thread group belongs.
    pub kernel: Arc<Kernel>,

    /// The thread group to which this task belongs.
    ///
    /// The group of tasks in a thread group roughly corresponds to the userspace notion of a
    /// process.
    pub thread_group: Arc<ThreadGroup>,

    /// A handle to the underlying Zircon thread object.
    ///
    /// Some tasks lack an underlying Zircon thread. These tasks are used internally by the
    /// Starnix kernel to track background work, typically on a `kthread`.
    pub thread: RwLock<Option<Arc<zx::Thread>>>,

    /// The file descriptor table for this task.
    ///
    /// This table can be shared by many tasks.
    pub files: FdTable,

    /// The memory manager for this task.  This is `None` only for system tasks.
    pub mm: RcuOptionArc<MemoryManager>,

    /// The file system for this task.
    fs: RcuOptionArc<FsContext>,

    /// The namespace for abstract AF_UNIX sockets for this task.
    pub abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,

    /// The namespace for AF_VSOCK for this task.
    pub abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,

    /// The stop state of the task, distinct from the stop state of the thread group.
    ///
    /// Must only be set when the `mutable_state` write lock is held.
    stop_state: AtomicStopState,

    /// The flags for the task.
    ///
    /// Must only be set when the `mutable_state` write lock is held.
    flags: AtomicTaskFlags,

    /// The mutable state of the Task.
    mutable_state: RwLock<TaskMutableState>,

    /// The information of the task that needs to be available to the `ThreadGroup` while computing
    /// which process a wait can target.
    /// Contains the command line, the task credentials and the exit signal.
    /// See `TaskPersistentInfo` for more information.
    pub persistent_info: TaskPersistentInfo,

    /// For vfork and clone() with CLONE_VFORK, this event is signaled when the task exits or
    /// calls execve(). It allows the calling task to block until the fork has been completed.
    /// Only populated when the task is created with the CLONE_VFORK flag.
    vfork_event: Option<Arc<zx::Event>>,

    /// Tells you whether there are currently seccomp filters installed, without
    /// holding a lock.
    pub seccomp_filter_state: SeccompState,

    /// Tells you whether syscall entry / exit is being traced, without holding a lock.
    pub trace_syscalls: AtomicBool,

    // The pid directory, so it doesn't have to be generated and thrown away on every access.
    // See https://fxbug.dev/291962828 for details.
    pub proc_pid_directory_cache: Mutex<Option<FsNodeHandle>>,

    /// The Linux Security Modules state for this thread group. This should be the last member of
    /// this struct.
    pub security_state: security::TaskState,
}

/// The decoded cross-platform parts we care about for page fault exception reports.
#[derive(Debug)]
pub struct PageFaultExceptionReport {
    pub faulting_address: u64,
    pub not_present: bool, // Set when the page fault was due to a not-present page.
    pub is_write: bool,    // Set when the triggering memory operation was a write.
    pub is_execute: bool,  // Set when the triggering memory operation was an execute.
}

impl Task {
    pub fn kernel(&self) -> &Arc<Kernel> {
        &self.kernel
    }

    pub fn thread_group(&self) -> &Arc<ThreadGroup> {
        &self.thread_group
    }

    pub fn has_same_address_space(&self, other: Option<&Arc<MemoryManager>>) -> bool {
        match (self.mm(), other) {
            (Ok(this), Some(other)) => Arc::ptr_eq(&this, other),
            (Err(_), None) => true,
            _ => false,
        }
    }

    pub fn flags(&self) -> TaskFlags {
        self.flags.load(Ordering::Relaxed)
    }

    /// When the task exits, if there is a notification that needs to propagate
    /// to a ptracer, make sure it will propagate.
    pub fn set_ptrace_zombie(&self, pids: &mut crate::task::PidTable) {
        let pgid = self.thread_group().read().process_group.leader;
        let exit_signal = self.thread_group().read().exit_signal.clone();
        let mut state = self.write();
        state.set_stopped(StopState::ForceAwake, None, None, None);
        if let Some(ptrace) = &mut state.ptrace {
            // Add a zombie that the ptracer will notice.
            ptrace.last_signal_waitable = true;
            let tracer_pid = ptrace.get_pid();
            let tracer_tg = pids.get_thread_group(tracer_pid);
            if let Some(tracer_tg) = tracer_tg {
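                // Drop the tracee's state lock before taking the tracer's thread-group
                // lock below.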
                drop(state);
                let mut tracer_state = tracer_tg.write();

                let exit_status = self.exit_status().unwrap_or_else(|| {
                    starnix_logging::log_error!("Exiting without an exit code.");
                    ExitStatus::Exit(u8::MAX)
                });
                let uid = self.real_creds().uid;
                let exit_info = ProcessExitInfo { status: exit_status, exit_signal };
                let zombie = ZombieProcess {
                    thread_group_key: self.thread_group_key.clone(),
                    pgid,
                    uid,
                    exit_info,
                    // ptrace doesn't need this.
                    time_stats: TaskTimeStats::default(),
                    is_canonical: false,
                };

                tracer_state.zombie_ptracees.add(pids, self.tid, zombie);
            };
        }
    }

    /// Disconnects this task from the tracer, if the tracer is still running.
    pub fn ptrace_disconnect(&mut self, pids: &PidTable) {
        let mut state = self.write();
        let ptracer_pid = state.ptrace.as_ref().map(|ptrace| ptrace.get_pid());
        if let Some(ptracer_pid) = ptracer_pid {
            let _ = state.set_ptrace(None);
            if let Some(ProcessEntryRef::Process(tg)) = pids.get_process(ptracer_pid) {
                let tid = self.get_tid();
                drop(state);
                tg.ptracees.lock().remove(&tid);
            }
        }
    }

    pub fn exit_status(&self) -> Option<ExitStatus> {
        self.is_exitted().then(|| self.read().exit_status.clone()).flatten()
    }

    pub fn is_exitted(&self) -> bool {
        self.flags().contains(TaskFlags::EXITED)
    }

    pub fn load_stopped(&self) -> StopState {
        self.stop_state.load(Ordering::Relaxed)
    }

    /// Upgrades a reference to a Task, returning an ESRCH errno if the reference cannot be
    /// borrowed.
    pub fn from_weak(weak: &WeakRef<Task>) -> Result<TempRef<'_, Task>, Errno> {
        weak.upgrade().ok_or_else(|| errno!(ESRCH))
    }

    /// Internal function for creating a Task object. Useful when you need to specify the value of
    /// every field. create_process and create_thread are more likely to be what you want.
    ///
    /// Any fields that should be initialized fresh for every task, even if the task was created
    /// with fork, are initialized to their defaults inside this function. All other fields are
    /// passed as parameters.
    #[allow(clippy::let_and_return)]
    pub fn new(
        tid: tid_t,
        command: TaskCommand,
        thread_group: Arc<ThreadGroup>,
        thread: Option<zx::Thread>,
        files: FdTable,
        mm: Option<Arc<MemoryManager>>,
        // The only case where fs should be None is when building the initial task that is
        // used to build the initial FsContext.
        fs: Arc<FsContext>,
        creds: Arc<Credentials>,
        abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,
        abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,
        signal_mask: SigSet,
        kernel_signals: VecDeque<KernelSignal>,
        vfork_event: Option<Arc<zx::Event>>,
        scheduler_state: SchedulerState,
        uts_ns: UtsNamespaceHandle,
        no_new_privs: bool,
        seccomp_filter_state: SeccompState,
        seccomp_filters: SeccompFilterContainer,
        robust_list_head: RobustListHeadPtr,
        timerslack_ns: u64,
        security_state: security::TaskState,
    ) -> OwnedRef<Self> {
        let thread_group_key = ThreadGroupKey::from(&thread_group);
        OwnedRef::new_cyclic(|weak_self| {
            let task = Task {
                weak_self,
                tid,
                thread_group_key: thread_group_key.clone(),
                kernel: Arc::clone(&thread_group.kernel),
                thread_group,
                thread: RwLock::new(thread.map(Arc::new)),
                files,
                mm: RcuOptionArc::new(mm),
                fs: RcuOptionArc::new(Some(fs)),
                abstract_socket_namespace,
                abstract_vsock_namespace,
                vfork_event,
                stop_state: AtomicStopState::new(StopState::Awake),
                flags: AtomicTaskFlags::new(TaskFlags::empty()),
                mutable_state: RwLock::new(TaskMutableState {
                    clear_child_tid: UserRef::default(),
                    signals: SignalState::with_mask(signal_mask),
                    kernel_signals,
                    exit_status: None,
                    scheduler_state,
                    uts_ns,
                    no_new_privs,
                    oom_score_adj: Default::default(),
                    seccomp_filters,
                    robust_list_head,
                    timerslack_ns,
                    // The default timerslack is set to the current timerslack of the creating thread.
                    default_timerslack_ns: timerslack_ns,
                    ptrace: None,
                    captured_thread_state: None,
                }),
                persistent_info: TaskPersistentInfoState::new(
                    tid,
                    thread_group_key,
                    command,
                    creds,
                ),
                seccomp_filter_state,
                trace_syscalls: AtomicBool::new(false),
                proc_pid_directory_cache: Mutex::new(None),
                security_state,
            };

            #[cfg(any(test, debug_assertions))]
            {
                // Note that `Kernel::pids` is already locked by the caller of `Task::new()`.
                let _l1 = task.read();
                let _l2 = task.persistent_info.lock_creds();
                let _l3 = task.persistent_info.command_guard();
            }
            task
        })
    }

    state_accessor!(Task, mutable_state);

    /// Returns the real credentials of the task as a short-lived RCU-guarded reference. These
    /// credentials are used to check permissions for actions performed on the task. If the task
    /// itself is performing an action, use `CurrentTask::current_creds` instead. This does not
    /// lock the credentials.
    pub fn real_creds(&self) -> RcuReadGuard<Credentials> {
        self.persistent_info.real_creds()
    }

    /// Returns a new long-lived reference to the real credentials of the task.  These credentials
    /// are used to check permissions for actions performed on the task. If the task itself is
    /// performing an action, use `CurrentTask::current_creds` instead. This does not lock the
    /// credentials.
    pub fn clone_creds(&self) -> Arc<Credentials> {
        self.persistent_info.clone_creds()
    }

    pub fn ptracer_task(&self) -> WeakRef<Task> {
        let ptracer = {
            let state = self.read();
            state.ptrace.as_ref().map(|p| p.core_state.pid)
        };

        let Some(ptracer) = ptracer else {
            return WeakRef::default();
        };

        self.get_task(ptracer)
    }

    pub fn fs(&self) -> Arc<FsContext> {
        self.fs.to_option_arc().expect("fs must be set")
    }

    pub fn has_shared_fs(&self) -> bool {
        let maybe_fs = self.fs.to_option_arc();
        // This check is incorrect because someone else could be holding a temporary Arc to the
        // FsContext and therefore increasing the strong count.
        maybe_fs.is_some_and(|fs| Arc::strong_count(&fs) > 2usize)
    }

    #[track_caller]
    pub fn mm(&self) -> Result<Arc<MemoryManager>, Errno> {
        self.mm.to_option_arc().ok_or_else(|| errno!(EINVAL))
    }

    pub fn unshare_fs(&self) {
        let fs = self.fs().fork();
        self.fs.update(Some(fs));
    }

    /// Modify the given elements of the scheduler state with new values and update the
    /// task's thread's role.
    pub(crate) fn set_scheduler_policy_priority_and_reset_on_fork(
        &self,
        policy: SchedulingPolicy,
        priority: RealtimePriority,
        reset_on_fork: bool,
    ) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|scheduler_state| {
            scheduler_state.policy = policy;
            scheduler_state.realtime_priority = priority;
            scheduler_state.reset_on_fork = reset_on_fork;
        })
    }

    /// Modify the scheduler state's priority and update the task's thread's role.
    pub(crate) fn set_scheduler_priority(&self, priority: RealtimePriority) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|scheduler_state| {
            scheduler_state.realtime_priority = priority
        })
    }

    /// Modify the scheduler state's nice and update the task's thread's role.
    pub(crate) fn set_scheduler_nice(&self, nice: NormalPriority) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|scheduler_state| {
            scheduler_state.normal_priority = nice
        })
    }

    /// Overwrite the existing scheduler state with a new one and update the task's thread's role.
    pub fn set_scheduler_state(&self, scheduler_state: SchedulerState) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|task_scheduler_state| {
            *task_scheduler_state = scheduler_state
        })
    }

    /// Update the task's thread's role based on its current scheduler state without making any
    /// changes to the state.
    ///
    /// This should be called on tasks that have newly created threads, e.g. after cloning.
    pub fn sync_scheduler_state_to_role(&self) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|_| {})
    }

    fn update_scheduler_state_then_role(
        &self,
        updater: impl FnOnce(&mut SchedulerState),
    ) -> Result<(), Errno> {
        let new_scheduler_state = {
            // Hold the task state lock as briefly as possible, it's not needed to update the role.
            let mut state = self.write();
            updater(&mut state.scheduler_state);
            state.scheduler_state
        };
        self.thread_group().kernel.scheduler.set_thread_role(self, new_scheduler_state)?;
        Ok(())
    }

    /// Signals the vfork event, if any, to unblock waiters.
    pub fn signal_vfork(&self) {
        if let Some(event) = &self.vfork_event {
            if let Err(status) = event.signal(Signals::NONE, Signals::USER_0) {
                log_warn!("Failed to set vfork signal {status}");
            }
        };
    }

    /// Blocks the caller until the task has exited or executed execve(). This is used to implement
    /// vfork() and clone(... CLONE_VFORK, ...). The task must have been created with the
    /// CLONE_VFORK flag.
    pub fn wait_for_execve(&self, task_to_wait: WeakRef<Task>) -> Result<(), Errno> {
        let event = task_to_wait.upgrade().and_then(|t| t.vfork_event.clone());
        if let Some(event) = event {
            event
                .wait_one(zx::Signals::USER_0, zx::MonotonicInstant::INFINITE)
                .map_err(|status| from_status_like_fdio!(status))?;
        }
        Ok(())
    }

    /// If needed, clear the child tid for this task.
    ///
    /// Userspace can ask us to clear the child tid and issue a futex wake at
    /// the child tid address when we tear down a task. For example, bionic
    /// uses this mechanism to implement pthread_join. The thread that calls
    /// pthread_join sleeps using FUTEX_WAIT on the child tid address. We wake
    /// them up here to let them know the thread is done.
    pub fn clear_child_tid_if_needed<L>(&self, locked: &mut Locked<L>) -> Result<(), Errno>
    where
        L: LockBefore<TerminalLock>,
    {
        let mut state = self.write();
        let user_tid = state.clear_child_tid;
        if !user_tid.is_null() {
            let zero: tid_t = 0;
            self.write_object(user_tid, &zero)?;
            self.kernel().shared_futexes.wake(
                locked,
                self,
                user_tid.addr(),
                usize::MAX,
                FUTEX_BITSET_MATCH_ANY,
            )?;
            state.clear_child_tid = UserRef::default();
        }
        Ok(())
    }
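
    // Sequence sketch for `clear_child_tid_if_needed` above (illustrative):
    // bionic's pthread_join blocks in userspace with FUTEX_WAIT on the joined
    // thread's tid slot; on exit, the code above writes 0 to that slot and then
    // issues a FUTEX_WAKE on the same address, releasing the joiner.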
1301
1302    pub fn get_task(&self, tid: tid_t) -> WeakRef<Task> {
1303        self.kernel().pids.read().get_task(tid)
1304    }
1305
1306    pub fn get_pid(&self) -> pid_t {
1307        self.thread_group_key.pid()
1308    }
1309
1310    pub fn get_tid(&self) -> tid_t {
1311        self.tid
1312    }
1313
1314    pub fn is_leader(&self) -> bool {
1315        self.get_pid() == self.get_tid()
1316    }
1317
1318    pub fn read_argv(&self, max_len: usize) -> Result<Vec<FsString>, Errno> {
1319        // argv is empty for kthreads
1320        let Ok(mm) = self.mm() else {
1321            return Ok(vec![]);
1322        };
1323        let (argv_start, argv_end) = {
1324            let mm_state = mm.state.read();
1325            (mm_state.argv_start, mm_state.argv_end)
1326        };
1327
1328        let len_to_read = std::cmp::min(argv_end - argv_start, max_len);
1329        self.read_nul_delimited_c_string_list(argv_start, len_to_read)
1330    }
1331
1332    pub fn read_argv0(&self) -> Result<FsString, Errno> {
1333        // argv is empty for kthreads
1334        let Ok(mm) = self.mm() else {
1335            return Ok(FsString::default());
1336        };
1337        let argv_start = {
1338            let mm_state = mm.state.read();
1339            mm_state.argv_start
1340        };
1341        // Assuming a 64-bit arch width is fine for a type that's just u8's on all arches.
1342        let argv_start = UserCString::new(&ArchWidth::Arch64, argv_start);
1343        self.read_path(argv_start)
1344    }
1345
1346    pub fn read_env(&self, max_len: usize) -> Result<Vec<FsString>, Errno> {
1347        // environment is empty for kthreads
1348        let Ok(mm) = self.mm() else { return Ok(vec![]) };
1349        let (env_start, env_end) = {
1350            let mm_state = mm.state.read();
1351            (mm_state.environ_start, mm_state.environ_end)
1352        };
1353
1354        let len_to_read = std::cmp::min(env_end - env_start, max_len);
1355        self.read_nul_delimited_c_string_list(env_start, len_to_read)
1356    }
1357
1358    pub fn thread_runtime_info(&self) -> Result<zx::TaskRuntimeInfo, Errno> {
1359        self.thread
1360            .read()
1361            .as_ref()
1362            .ok_or_else(|| errno!(EINVAL))?
1363            .get_runtime_info()
1364            .map_err(|status| from_status_like_fdio!(status))
1365    }
1366
1367    pub fn real_fscred(&self) -> FsCred {
1368        self.real_creds().as_fscred()
1369    }
1370
1371    /// Interrupts the current task.
1372    ///
1373    /// This will interrupt any blocking syscalls if the task is blocked on one.
1374    /// The signal_state of the task must not be locked.
1375    pub fn interrupt(&self) {
1376        self.read().signals.run_state.wake();
1377        if let Some(thread) = self.thread.read().as_ref() {
1378            #[allow(
1379                clippy::undocumented_unsafe_blocks,
1380                reason = "Force documented unsafe blocks in Starnix"
1381            )]
1382            let status = unsafe { zx::sys::zx_restricted_kick(thread.raw_handle(), 0) };
1383            if status != zx::sys::ZX_OK {
1384                // zx_restricted_kick() could return ZX_ERR_BAD_STATE if the target thread is already in the
1385                // DYING or DEAD states. That's fine since it means that the task is in the process of
1386                // tearing down, so allow it.
1387                assert_eq!(status, zx::sys::ZX_ERR_BAD_STATE);
1388            }
1389        }
1390    }
1391
1392    pub fn command(&self) -> TaskCommand {
1393        self.persistent_info.command.lock().clone()
1394    }
1395
    pub fn set_command_name(&self, mut new_name: TaskCommand) {
        // If we're going to update the process name, see if we can get a longer one than the
        // Linux uapi normally provides. Only choose the argv0-based name if it's a superset of
        // the uapi-provided name, to avoid clobbering a name chosen by the user.
        if let Ok(argv0) = self.read_argv0() {
            let argv0 = TaskCommand::from_path_bytes(&argv0);
            if let Some(embedded_name) = argv0.try_embed(&new_name) {
                new_name = embedded_name;
            }
        }

        // Acquire this guard before modifying Zircon state to keep the two consistent under
        // concurrent access. Ideally it would also cover the argv[0] read above, but that
        // would create lock cycles with SELinux checks.
        let mut command_guard = self.persistent_info.command_guard();

        // Set the name on the Linux thread.
        if let Some(thread) = self.thread.read().as_ref() {
            set_zx_name(&**thread, new_name.as_bytes());
        }

        // If this is the thread group leader, use this name for the process too.
        if self.is_leader() {
            set_zx_name(&self.thread_group().process, new_name.as_bytes());
            let _ = zx::Thread::raise_user_exception(
                zx::RaiseExceptionOptions::TARGET_JOB_DEBUGGER,
                zx::sys::ZX_EXCP_USER_CODE_PROCESS_NAME_CHANGED,
                0,
            );
        }

        // Avoid a lock cycle by dropping the guard before notifying memory attribution of the
        // change.
        *command_guard = new_name;
        drop(command_guard);

        if self.is_leader() {
            if let Some(notifier) = &self.thread_group().read().notifier {
                let _ = notifier.send(MemoryAttributionLifecycleEvent::name_change(self.tid));
            }
        }
    }

    pub fn set_seccomp_state(&self, state: SeccompStateValue) -> Result<(), Errno> {
        self.seccomp_filter_state.set(&state)
    }

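    /// Returns a coarse scheduling state for this task: zombie once an exit
    /// status is recorded, tracing-stop or sleeping while blocked, and
    /// running otherwise.
    ///
    /// A hedged sketch of how a caller might render it (illustrative, not a
    /// doctest; the letters follow Linux's /proc conventions):
    /// ```ignore
    /// let letter = match task.state_code() {
    ///     TaskStateCode::Running => 'R',
    ///     TaskStateCode::Sleeping => 'S',
    ///     TaskStateCode::TracingStop => 't',
    ///     TaskStateCode::Zombie => 'Z',
    /// };
    /// ```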
    pub fn state_code(&self) -> TaskStateCode {
        let status = self.read();
        if status.exit_status.is_some() {
            TaskStateCode::Zombie
        } else if status.signals.run_state.is_blocked() {
            let stop_state = self.load_stopped();
            if stop_state.ptrace_only() && stop_state.is_stopped() {
                TaskStateCode::TracingStop
            } else {
                TaskStateCode::Sleeping
            }
        } else {
            TaskStateCode::Running
        }
    }

    pub fn time_stats(&self) -> TaskTimeStats {
        use zx::Task;
        let info = match &*self.thread.read() {
            Some(thread) => thread.get_runtime_info().expect("Failed to get thread stats"),
            None => return TaskTimeStats::default(),
        };

        TaskTimeStats {
            user_time: zx::MonotonicDuration::from_nanos(info.cpu_time),
            // TODO(https://fxbug.dev/42078242): How can we calculate system time?
            system_time: zx::MonotonicDuration::default(),
        }
    }

    pub fn get_signal_action(&self, signal: Signal) -> sigaction_t {
        self.thread_group().signal_actions.get(signal)
    }

    pub fn record_pid_koid_mapping(&self) {
        let Some(ref mapping_table) = *self.kernel().pid_to_koid_mapping.read() else { return };

        let pkoid = self.thread_group().get_process_koid().ok();
        let tkoid = self.thread.read().as_ref().and_then(|t| t.koid().ok());
        mapping_table.write().insert(self.tid, KoidPair { process: pkoid, thread: tkoid });
    }
}

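// Task teardown runs in a fixed order: disconnect ptrace while the pid-table
// write lock is still held, drop that lock, release the fd table, signal any
// vfork waiters, drop the fs and mm references, rebuild a temporary
// CurrentTask to flush delayed releasers, and only then deconstruct the
// ownership wrappers around the task itself.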
impl Releasable for Task {
    type Context<'a> = (
        ThreadState<RegisterStorageEnum>,
        &'a mut Locked<TaskRelease>,
        RwLockWriteGuard<'a, PidTable>,
    );

    fn release<'a>(mut self, context: Self::Context<'a>) {
        let (thread_state, locked, pids) = context;

        *self.proc_pid_directory_cache.get_mut() = None;
        self.ptrace_disconnect(&pids);

        std::mem::drop(pids);

        self.files.release();

        self.signal_vfork();

        // Drop fields that can end up owning an FsNode, to ensure no FsNodes are owned by
        // this task.
        self.fs.update(None);
        self.mm.update(None);

        // Rebuild a temporary CurrentTask to run the release actions that require a
        // `CurrentTask`.
        let current_task = CurrentTask::new(OwnedRef::new(self), thread_state.into());

        // Apply any delayed releasers left.
        current_task.trigger_delayed_releaser(locked);

        // Drop the task now that it has been released. This requires taking it out of the
        // OwnedRef and then out of the resulting ReleaseGuard.
        let CurrentTask { mut task, .. } = current_task;
        let task = OwnedRef::take(&mut task).expect("task should not have been re-owned");
        let _task: Self = ReleaseGuard::take(task);
    }
}

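// A hedged usage sketch for the `MemoryAccessor` impl below (hypothetical
// `tracee: &Task`, `addr: UserAddress`, and buffer bindings):
//
//     let mut buf = vec![MaybeUninit::<u8>::uninit(); 16];
//     // Works even though `tracee`'s address space is not mapped into the
//     // calling thread: the read goes through the VMO.
//     let bytes = tracee.read_memory(addr, &mut buf)?;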
impl MemoryAccessor for Task {
    fn read_memory<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        // Using a `Task` to read memory generally indicates that the memory
        // is being read from a task different than the `CurrentTask`. When
        // this `Task` is not current, its address space is not mapped
        // so we need to go through the VMO.
        self.mm()?.syscall_read_memory(addr, bytes)
    }

    fn read_memory_partial_until_null_byte<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        // Using a `Task` to read memory generally indicates that the memory
        // is being read from a task different than the `CurrentTask`. When
        // this `Task` is not current, its address space is not mapped
        // so we need to go through the VMO.
        self.mm()?.syscall_read_memory_partial_until_null_byte(addr, bytes)
    }

    fn read_memory_partial<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        // Using a `Task` to read memory generally indicates that the memory
        // is being read from a task different than the `CurrentTask`. When
        // this `Task` is not current, its address space is not mapped
        // so we need to go through the VMO.
        self.mm()?.syscall_read_memory_partial(addr, bytes)
    }

    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
        // Using a `Task` to write memory generally indicates that the memory
        // is being written to a task different than the `CurrentTask`. When
        // this `Task` is not current, its address space is not mapped
        // so we need to go through the VMO.
        self.mm()?.syscall_write_memory(addr, bytes)
    }

    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
        // Using a `Task` to write memory generally indicates that the memory
        // is being written to a task different than the `CurrentTask`. When
        // this `Task` is not current, its address space is not mapped
        // so we need to go through the VMO.
        self.mm()?.syscall_write_memory_partial(addr, bytes)
    }

    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
        // Using a `Task` to zero memory generally indicates that the memory
        // is being zeroed from a task different than the `CurrentTask`. When
        // this `Task` is not current, its address space is not mapped
        // so we need to go through the VMO.
        self.mm()?.syscall_zero(addr, length)
    }
}

impl TaskMemoryAccessor for Task {
    fn maximum_valid_address(&self) -> Option<UserAddress> {
        self.mm().map(|mm| mm.maximum_valid_user_address).ok()
    }
}

impl fmt::Debug for Task {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "{}:{}[{}]",
            self.thread_group().leader,
            self.tid,
            self.persistent_info.command.lock()
        )
    }
}

impl cmp::PartialEq for Task {
    fn eq(&self, other: &Self) -> bool {
        // Tasks compare by identity, i.e. pointer equality.
        std::ptr::eq(self, other)
    }
}

impl cmp::Eq for Task {}

#[cfg(test)]
mod test {
    use super::*;
    use crate::testing::*;
    use starnix_uapi::auth::{CAP_SYS_ADMIN, Capabilities};
    use starnix_uapi::resource_limits::Resource;
    use starnix_uapi::signals::SIGCHLD;
    use starnix_uapi::{CLONE_SIGHAND, CLONE_THREAD, CLONE_VM, rlimit};

    #[::fuchsia::test]
    async fn test_tid_allocation() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            assert_eq!(current_task.get_tid(), 1);
            let another_current = create_task(locked, &kernel, "another-task");
            let another_tid = another_current.get_tid();
            assert!(another_tid >= 2);

            let pids = kernel.pids.read();
            assert_eq!(pids.get_task(1).upgrade().unwrap().get_tid(), 1);
            assert_eq!(pids.get_task(another_tid).upgrade().unwrap().get_tid(), another_tid);
        })
        .await;
    }

    #[::fuchsia::test]
    async fn test_clone_pid_and_parent_pid() {
        spawn_kernel_and_run(async |locked, current_task| {
            let thread = current_task.clone_task_for_test(
                locked,
                (CLONE_THREAD | CLONE_VM | CLONE_SIGHAND) as u64,
                Some(SIGCHLD),
            );
            assert_eq!(current_task.get_pid(), thread.get_pid());
            assert_ne!(current_task.get_tid(), thread.get_tid());
            assert_eq!(current_task.thread_group().leader, thread.thread_group().leader);

            let child_task = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
            assert_ne!(current_task.get_pid(), child_task.get_pid());
            assert_ne!(current_task.get_tid(), child_task.get_tid());
            assert_eq!(current_task.get_pid(), child_task.thread_group().read().get_ppid());
        })
        .await;
    }

    #[::fuchsia::test]
    async fn test_root_capabilities() {
        spawn_kernel_and_run(async |_, current_task| {
            assert!(security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN));
            assert_eq!(current_task.real_creds().cap_inheritable, Capabilities::empty());

            current_task.set_creds(Credentials::with_ids(1, 1));
            assert!(!security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN));
        })
        .await;
    }

    #[::fuchsia::test]
    async fn test_clone_rlimit() {
        spawn_kernel_and_run(async |locked, current_task| {
            let prev_fsize = current_task.thread_group().get_rlimit(locked, Resource::FSIZE);
            assert_ne!(prev_fsize, 10);
            current_task
                .thread_group()
                .limits
                .lock(locked)
                .set(Resource::FSIZE, rlimit { rlim_cur: 10, rlim_max: 100 });
            let current_fsize = current_task.thread_group().get_rlimit(locked, Resource::FSIZE);
            assert_eq!(current_fsize, 10);

            let child_task = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
            let child_fsize = child_task.thread_group().get_rlimit(locked, Resource::FSIZE);
            assert_eq!(child_fsize, 10);
        })
        .await;
    }
}