// starnix_core/task/task.rs
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5use crate::mm::{MemoryAccessor, MemoryAccessorExt, MemoryManager, TaskMemoryAccessor};
6use crate::mutable_state::{state_accessor, state_implementation};
7use crate::ptrace::{
8    AtomicStopState, PtraceEvent, PtraceEventData, PtraceState, PtraceStatus, StopState,
9};
10use crate::signals::{KernelSignal, RunState, SignalDetail, SignalInfo, SignalState};
11use crate::task::memory_attribution::MemoryAttributionLifecycleEvent;
12use crate::task::tracing::KoidPair;
13use crate::task::{
14    AbstractUnixSocketNamespace, AbstractVsockSocketNamespace, CurrentTask, EventHandler, Kernel,
15    NormalPriority, PidTable, ProcessEntryRef, ProcessExitInfo, RealtimePriority, SchedulerState,
16    SchedulingPolicy, SeccompFilterContainer, SeccompState, SeccompStateValue, ThreadGroup,
17    ThreadGroupKey, ThreadState, UtsNamespaceHandle, WaitCanceler, Waiter, ZombieProcess,
18};
19use crate::vfs::{FdTable, FsContext, FsNodeHandle, FsString};
20use atomic_bitflags::atomic_bitflags;
21use fuchsia_rcu::{RcuArc, RcuOptionArc, RcuOptionCell, RcuReadGuard};
22use macro_rules_attribute::apply;
23use starnix_logging::{log_warn, set_zx_name};
24use starnix_registers::{HeapRegs, RegisterStorageEnum};
25use starnix_sync::{
26    LockBefore, Locked, Mutex, MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, TaskRelease,
27    TerminalLock,
28};
29use starnix_task_command::TaskCommand;
30use starnix_types::arch::ArchWidth;
31use starnix_types::ownership::{OwnedRef, Releasable, ReleaseGuard, TempRef, WeakRef};
32use starnix_types::stats::TaskTimeStats;
33use starnix_uapi::auth::{Credentials, FsCred};
34use starnix_uapi::errors::Errno;
35use starnix_uapi::signals::{SIGCHLD, SigSet, Signal, sigaltstack_contains_pointer};
36use starnix_uapi::user_address::{
37    ArchSpecific, MappingMultiArchUserRef, UserAddress, UserCString, UserRef,
38};
39use starnix_uapi::{
40    CLD_CONTINUED, CLD_DUMPED, CLD_EXITED, CLD_KILLED, CLD_STOPPED, CLD_TRAPPED,
41    FUTEX_BITSET_MATCH_ANY, errno, error, from_status_like_fdio, pid_t, sigaction_t, sigaltstack,
42    tid_t, uapi,
43};
44use std::collections::VecDeque;
45use std::mem::MaybeUninit;
46use std::ops::Deref;
47use std::sync::atomic::{AtomicBool, Ordering};
48use std::sync::{Arc, Weak};
49use std::{cmp, fmt};
50use zx::{Signals, Task as _};
51
/// The status a task exited with, as observed by wait-family syscalls.
///
/// See [`ExitStatus::wait_status`] for the userspace wait-status encoding.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ExitStatus {
    /// Normal exit carrying the 8-bit exit code.
    Exit(u8),
    /// Terminated by the given signal.
    Kill(SignalInfo),
    /// Terminated by the given signal with a core dump.
    CoreDump(SignalInfo),
    // The second field for Stop and Continue contains the type of ptrace stop
    // event that made it stop / continue, if applicable (PTRACE_EVENT_STOP,
    // PTRACE_EVENT_FORK, etc)
    Stop(SignalInfo, PtraceEvent),
    Continue(SignalInfo, PtraceEvent),
}
63impl ExitStatus {
64    /// Converts the given exit status to a status code suitable for returning from wait syscalls.
65    pub fn wait_status(&self) -> i32 {
66        match self {
67            ExitStatus::Exit(status) => (*status as i32) << 8,
68            ExitStatus::Kill(siginfo) => siginfo.signal.number() as i32,
69            ExitStatus::CoreDump(siginfo) => (siginfo.signal.number() as i32) | 0x80,
70            ExitStatus::Continue(siginfo, trace_event) => {
71                let trace_event_val = *trace_event as u32;
72                if trace_event_val != 0 {
73                    (siginfo.signal.number() as i32) | (trace_event_val << 16) as i32
74                } else {
75                    0xffff
76                }
77            }
78            ExitStatus::Stop(siginfo, trace_event) => {
79                let trace_event_val = *trace_event as u32;
80                (0x7f + ((siginfo.signal.number() as i32) << 8)) | (trace_event_val << 16) as i32
81            }
82        }
83    }
84
85    pub fn signal_info_code(&self) -> i32 {
86        match self {
87            ExitStatus::Exit(_) => CLD_EXITED as i32,
88            ExitStatus::Kill(_) => CLD_KILLED as i32,
89            ExitStatus::CoreDump(_) => CLD_DUMPED as i32,
90            ExitStatus::Stop(_, _) => CLD_STOPPED as i32,
91            ExitStatus::Continue(_, _) => CLD_CONTINUED as i32,
92        }
93    }
94
95    pub fn signal_info_status(&self) -> i32 {
96        match self {
97            ExitStatus::Exit(status) => *status as i32,
98            ExitStatus::Kill(siginfo)
99            | ExitStatus::CoreDump(siginfo)
100            | ExitStatus::Continue(siginfo, _)
101            | ExitStatus::Stop(siginfo, _) => siginfo.signal.number() as i32,
102        }
103    }
104}
105
atomic_bitflags! {
    /// Per-task state bits that can be read and updated atomically.
    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct TaskFlags: u8 {
        /// The task has exited and an exit status has been recorded.
        const EXITED                   = 1 << 0;
        /// At least one signal is queued and pending for this task.
        const SIGNALS_AVAILABLE        = 1 << 1;
        /// A temporary signal mask is in effect; see `set_temporary_signal_mask` /
        /// `restore_signal_mask`.
        const TEMPORARY_SIGNAL_MASK    = 1 << 2;
        /// Whether the executor should dump the stack of this task when it exits.
        /// Currently used to implement ExitStatus::CoreDump.
        const DUMP_ON_EXIT             = 1 << 3;
        /// At least one kernel-internal signal is queued; see `enqueue_kernel_signal`.
        const KERNEL_SIGNALS_AVAILABLE = 1 << 4;
        /// Whether the executor has successfully spawned a thread for this task.
        const SPAWNED                  = 1 << 5;
    }
}
120
/// This contains thread state that tracers can inspect and modify.  It is
/// captured when a thread stops, and optionally copied back (if dirty) when a
/// thread starts again.  An alternative implementation would involve the
/// tracers acting on thread state directly; however, this would involve sharing
/// CurrentTask structures across multiple threads, which goes against the
/// intent of the design of CurrentTask.
pub struct CapturedThreadState {
    /// The thread state of the traced task.  This is copied out when the thread
    /// stops.
    pub thread_state: ThreadState<HeapRegs>,

    /// Indicates that the last ptrace operation changed the thread state, so it
    /// should be written back to the original thread.
    pub dirty: bool,
}
136
impl ArchSpecific for CapturedThreadState {
    /// Delegates to the captured register state's architecture flag.
    fn is_arch32(&self) -> bool {
        self.thread_state.is_arch32()
    }
}
142
/// In-kernel view of one userspace robust-futex list entry.
///
/// See get_robust_list(2) / set_robust_list(2).
#[derive(Debug)]
pub struct RobustList {
    /// Pointer to the next entry of the robust list in userspace.
    pub next: RobustListPtr,
}
147
/// Multi-arch user pointer to a [`RobustList`] (native or arch32 layout).
pub type RobustListPtr =
    MappingMultiArchUserRef<RobustList, uapi::robust_list, uapi::arch32::robust_list>;
150
151impl From<uapi::robust_list> for RobustList {
152    fn from(robust_list: uapi::robust_list) -> Self {
153        Self { next: RobustListPtr::from(robust_list.next) }
154    }
155}
156
#[cfg(target_arch = "aarch64")]
impl From<uapi::arch32::robust_list> for RobustList {
    /// Converts the 32-bit (arch32) uapi representation into the in-kernel form.
    fn from(robust_list: uapi::arch32::robust_list) -> Self {
        let next = RobustListPtr::from(robust_list.next);
        Self { next }
    }
}
163
/// In-kernel view of a userspace `robust_list_head`.
///
/// See get_robust_list(2) / set_robust_list(2).
#[derive(Debug)]
pub struct RobustListHead {
    /// The first entry of the robust futex list.
    pub list: RobustList,
    /// Offset from each list entry to the futex word it guards.
    pub futex_offset: isize,
}
169
/// Multi-arch user pointer to a [`RobustListHead`] (native or arch32 layout).
pub type RobustListHeadPtr =
    MappingMultiArchUserRef<RobustListHead, uapi::robust_list_head, uapi::arch32::robust_list_head>;
172
173impl From<uapi::robust_list_head> for RobustListHead {
174    fn from(robust_list_head: uapi::robust_list_head) -> Self {
175        Self {
176            list: robust_list_head.list.into(),
177            futex_offset: robust_list_head.futex_offset as isize,
178        }
179    }
180}
181
#[cfg(target_arch = "aarch64")]
impl From<uapi::arch32::robust_list_head> for RobustListHead {
    /// Converts the 32-bit (arch32) uapi representation into the in-kernel form.
    fn from(head: uapi::arch32::robust_list_head) -> Self {
        Self { list: head.list.into(), futex_offset: head.futex_offset as isize }
    }
}
191
/// The lock-protected mutable state of a task.
pub struct TaskMutableState {
    /// The mutable live state of the task.
    ///
    /// This is `None` for zombie tasks.
    pub live: Option<TaskMutableLiveState>,

    /// Address of the tid word to clear/notify on task exit.
    ///
    /// See https://man7.org/linux/man-pages/man2/set_tid_address.2.html
    pub clear_child_tid: UserRef<tid_t>,

    /// The exit status that this task exited with.
    exit_status: Option<ExitStatus>,

    /// Desired scheduler state for the task.
    pub scheduler_state: SchedulerState,

    /// The UTS namespace assigned to this thread.
    ///
    /// This field is kept in the mutable state because the UTS namespace of a thread
    /// can be forked using `clone()` or `unshare()` syscalls.
    ///
    /// We use UtsNamespaceHandle because the UTS properties can be modified
    /// by any other thread that shares this namespace.
    pub uts_ns: UtsNamespaceHandle,

    /// Bit that determines whether a newly started program can have privileges its parent does
    /// not have.  See Documentation/prctl/no_new_privs.txt in the Linux kernel for details.
    /// Note that Starnix does not currently implement the relevant privileges (e.g.,
    /// setuid/setgid binaries).  So, you can set this, but it does nothing other than get
    /// propagated to children.
    ///
    /// The documentation indicates that this can only ever be set to
    /// true, and it cannot be reverted to false.  Accessor methods
    /// for this field ensure this property.
    no_new_privs: bool,

    /// Userspace hint about how to adjust the OOM score for this process.
    pub oom_score_adj: i32,

    /// List of currently installed seccomp_filters
    pub seccomp_filters: SeccompFilterContainer,

    /// A pointer to the head of the robust futex list of this thread in
    /// userspace. See get_robust_list(2)
    pub robust_list_head: RobustListHeadPtr,

    /// The timer slack used to group timer expirations for the calling thread.
    ///
    /// Timers may expire up to `timerslack_ns` late, but never early.
    ///
    /// If this value is 0, the task's default timerslack is used.
    pub timerslack_ns: u64,

    /// The default value for `timerslack_ns`. This value cannot change during the lifetime of a
    /// task.
    ///
    /// This value is set to the `timerslack_ns` of the creating thread, and thus is not constant
    /// across tasks.
    pub default_timerslack_ns: u64,

    /// Information that a tracer needs to communicate with this process, if it
    /// is being traced.
    pub ptrace: Option<Box<PtraceState>>,

    /// Information that a tracer needs to inspect this process.
    pub captured_thread_state: Option<Box<CapturedThreadState>>,
}
258
259impl TaskMutableState {
260    /// Returns the [`TaskMutableLiveState`] for the [`TaskMutableState`].
261    ///
262    /// # Panics
263    ///
264    /// Calling `live()` on a [`TaskMutableState`] which has no live state (i.e. zombie tasks)
265    /// panics.
266    pub fn live(&self) -> &TaskMutableLiveState {
267        self.live.as_ref().expect("Operation requires TaskMutableLiveState")
268    }
269
270    /// Returns the mutable [`TaskMutableLiveState`] for the [`TaskMutableState`].
271    ///
272    /// # Panics
273    ///
274    /// Calling `live_mut()` on a [`TaskMutableState`] which has no live state (i.e. zombie tasks)
275    /// panics.
276    pub fn live_mut(&mut self) -> &mut TaskMutableLiveState {
277        self.live.as_mut().expect("Operation requires TaskMutableLiveState")
278    }
279
280    pub fn no_new_privs(&self) -> bool {
281        self.no_new_privs
282    }
283
284    /// Sets the value of no_new_privs to true.  It is an error to set
285    /// it to anything else.
286    pub fn enable_no_new_privs(&mut self) {
287        self.no_new_privs = true;
288    }
289
290    pub fn get_timerslack<T: zx::Timeline>(&self) -> zx::Duration<T> {
291        zx::Duration::from_nanos(self.timerslack_ns as i64)
292    }
293
294    /// Sets the current timerslack of the task to `ns`.
295    ///
296    /// If `ns` is zero, the current timerslack gets reset to the task's default timerslack.
297    pub fn set_timerslack_ns(&mut self, ns: u64) {
298        if ns == 0 {
299            self.timerslack_ns = self.default_timerslack_ns;
300        } else {
301            self.timerslack_ns = ns;
302        }
303    }
304
305    pub fn is_ptraced(&self) -> bool {
306        self.ptrace.is_some()
307    }
308
309    pub fn is_ptrace_listening(&self) -> bool {
310        self.ptrace.as_ref().is_some_and(|ptrace| ptrace.stop_status == PtraceStatus::Listening)
311    }
312
313    pub fn ptrace_on_signal_consume(&mut self) -> bool {
314        self.ptrace.as_mut().is_some_and(|ptrace: &mut Box<PtraceState>| {
315            if ptrace.stop_status.is_continuing() {
316                ptrace.stop_status = PtraceStatus::Default;
317                false
318            } else {
319                true
320            }
321        })
322    }
323
324    pub fn notify_ptracers(&mut self) {
325        if let Some(ptrace) = &self.ptrace {
326            ptrace.tracer_waiters().notify_all();
327        }
328    }
329
330    pub fn wait_on_ptracer(&self, waiter: &Waiter) {
331        if let Some(ptrace) = &self.ptrace {
332            ptrace.tracee_waiters.wait_async(&waiter);
333        }
334    }
335
336    pub fn notify_ptracees(&mut self) {
337        if let Some(ptrace) = &self.ptrace {
338            ptrace.tracee_waiters.notify_all();
339        }
340    }
341
342    pub fn take_captured_state(&mut self) -> Option<Box<CapturedThreadState>> {
343        if self.captured_thread_state.is_some() {
344            let mut state = None;
345            std::mem::swap(&mut state, &mut self.captured_thread_state);
346            return state;
347        }
348        None
349    }
350
351    pub fn copy_state_from(&mut self, current_task: &CurrentTask) {
352        self.captured_thread_state = Some(Box::new(CapturedThreadState {
353            thread_state: current_task.thread_state.extended_snapshot::<HeapRegs>(),
354            dirty: false,
355        }));
356    }
357
358    /// Returns the task's currently active signal mask.
359    pub fn signal_mask(&self) -> SigSet {
360        self.live().signals.mask()
361    }
362
363    /// Returns true if `signal` is currently blocked by this task's signal mask.
364    pub fn is_signal_masked(&self, signal: Signal) -> bool {
365        self.live().signals.mask().has_signal(signal)
366    }
367
368    /// Returns true if `signal` is blocked by the saved signal mask.
369    ///
370    /// Note that the current signal mask may still not be blocking the signal.
371    pub fn is_signal_masked_by_saved_mask(&self, signal: Signal) -> bool {
372        self.live().signals.saved_mask().is_some_and(|mask| mask.has_signal(signal))
373    }
374
375    /// Removes the currently active, temporary, signal mask and restores the
376    /// previously active signal mask.
377    pub fn restore_signal_mask(&mut self) {
378        self.live_mut().signals.restore_mask();
379    }
380
381    /// Returns true if the task's current `RunState` is blocked.
382    pub fn is_blocked(&self) -> bool {
383        self.live().signals.run_state.is_blocked()
384    }
385
386    /// Sets the task's `RunState` to `run_state`.
387    pub fn set_run_state(&mut self, run_state: RunState) {
388        self.live_mut().signals.run_state = run_state;
389    }
390
391    pub fn run_state(&self) -> RunState {
392        self.live().signals.run_state.clone()
393    }
394
395    pub fn on_signal_stack(&self, stack_pointer_register: u64) -> bool {
396        self.live()
397            .signals
398            .alt_stack
399            .map(|signal_stack| sigaltstack_contains_pointer(&signal_stack, stack_pointer_register))
400            .unwrap_or(false)
401    }
402
403    pub fn set_sigaltstack(&mut self, stack: Option<sigaltstack>) {
404        self.live_mut().signals.alt_stack = stack;
405    }
406
407    pub fn sigaltstack(&self) -> Option<sigaltstack> {
408        self.live().signals.alt_stack
409    }
410
411    pub fn wait_on_signal(&mut self, waiter: &Waiter) {
412        self.live_mut().signals.signal_wait.wait_async(waiter);
413    }
414
415    pub fn signals_mut(&mut self) -> &mut SignalState {
416        &mut self.live_mut().signals
417    }
418
419    pub fn wait_on_signal_fd_events(
420        &self,
421        waiter: &Waiter,
422        mask: SigSet,
423        handler: EventHandler,
424    ) -> WaitCanceler {
425        self.live().signals.signal_wait.wait_async_signal_mask(waiter, mask, handler)
426    }
427
428    pub fn notify_signal_waiters(&self, signal: &Signal) {
429        self.live().signals.signal_wait.notify_signal(signal);
430    }
431
432    /// Thaw the task if has been frozen
433    pub fn thaw(&mut self) {
434        if let RunState::Frozen(waiter) = self.run_state() {
435            waiter.notify();
436        }
437    }
438
439    pub fn is_frozen(&self) -> bool {
440        matches!(self.run_state(), RunState::Frozen(_))
441    }
442
443    #[cfg(test)]
444    pub fn kernel_signals_for_test(&self) -> &VecDeque<KernelSignal> {
445        &self.live().kernel_signals
446    }
447}
448
#[apply(state_implementation!)]
impl TaskMutableState<Base = Task> {
    /// Transitions the task's stop state to `stopped`, recording `siginfo` / `event` for an
    /// attached tracer and notifying waiters as appropriate.
    ///
    /// Ptrace-only states are ignored when no tracer is attached, as are illegal transitions
    /// from the current stop state. When entering a stopped state, the thread state of
    /// `current_task` (if provided) is captured for tracers.
    pub fn set_stopped(
        &mut self,
        stopped: StopState,
        siginfo: Option<SignalInfo>,
        current_task: Option<&CurrentTask>,
        event: Option<PtraceEventData>,
    ) {
        if stopped.ptrace_only() && self.ptrace.is_none() {
            return;
        }

        if self.base.load_stopped().is_illegal_transition(stopped) {
            return;
        }

        // TODO(https://g-issues.fuchsia.dev/issues/306438676): When task can be
        // stopped inside user code, task will need to be either restarted or
        // stopped here.
        self.store_stopped(stopped);
        if stopped.is_stopped() {
            if let Some(ref current_task) = current_task {
                self.copy_state_from(current_task);
            }
        }
        if let Some(ptrace) = &mut self.ptrace {
            ptrace.set_last_signal(siginfo);
            ptrace.set_last_event(event);
        }
        if stopped == StopState::Waking || stopped == StopState::ForceWaking {
            self.notify_ptracees();
        }
        if !stopped.is_in_progress() {
            self.notify_ptracers();
        }
    }

    /// Enqueues a signal at the back of the task's signal queue.
    pub fn enqueue_signal(&mut self, signal: SignalInfo) {
        self.live_mut().signals.enqueue(signal);
        self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.live().signals.is_any_pending());
    }

    /// Enqueues the signal, allowing the signal to skip straight to the front of the task's queue.
    ///
    /// `enqueue_signal` is the more common API to use.
    ///
    /// Note that this will not guarantee that the signal is dequeued before any process-directed
    /// signals.
    // NOTE(review): this currently calls the same `SignalState::enqueue` as `enqueue_signal`;
    // if front-of-queue placement is intended, confirm that `enqueue` honors it.
    pub fn enqueue_signal_front(&mut self, signal: SignalInfo) {
        self.live_mut().signals.enqueue(signal);
        self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.live().signals.is_any_pending());
    }

    /// Sets the current signal mask of the task.
    pub fn set_signal_mask(&mut self, mask: SigSet) {
        self.live_mut().signals.set_mask(mask);
        self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.live().signals.is_any_pending());
    }

    /// Sets a temporary signal mask for the task.
    ///
    /// This mask should be removed by a matching call to `restore_signal_mask`.
    pub fn set_temporary_signal_mask(&mut self, mask: SigSet) {
        self.live_mut().signals.set_temporary_mask(mask);
        self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.live().signals.is_any_pending());
    }

    /// Returns the number of pending signals for this task, without considering the signal mask.
    pub fn pending_signal_count(&self) -> usize {
        self.live().signals.num_queued() + self.base.thread_group().num_signals_queued()
    }

    /// Returns `true` if `signal` is pending for this task, without considering the signal mask.
    pub fn has_signal_pending(&self, signal: Signal) -> bool {
        self.live().signals.has_queued(signal) || self.base.thread_group().has_signal_queued(signal)
    }

    /// Prepares a `SIGCHLD` SignalInfo to be sent to the tracer, if any.
    ///
    /// Returns `None` unless `stopped` is a stopped state and a last signal was recorded
    /// by ptrace; otherwise returns the tracer's thread group and the prepared siginfo.
    pub fn prepare_signal_info(
        &mut self,
        stopped: StopState,
    ) -> Option<(Weak<ThreadGroup>, SignalInfo)> {
        if !stopped.is_stopped() {
            return None;
        }

        if let Some(ptrace) = &self.ptrace {
            if let Some(last_signal) = ptrace.get_last_signal_ref() {
                let signal_info = SignalInfo::with_detail(
                    SIGCHLD,
                    CLD_TRAPPED as i32,
                    SignalDetail::SIGCHLD {
                        pid: self.base.tid,
                        uid: self.base.real_creds().uid,
                        status: last_signal.signal.number() as i32,
                    },
                );

                return Some((ptrace.core_state.thread_group.clone(), signal_info));
            }
        }

        None
    }

    /// Attaches (`Some`) or detaches (`None`) a tracer.
    ///
    /// Attaching fails with `EPERM` if a tracer is already attached. On detach, the task is
    /// re-synchronized with the thread group's in-progress stop state, if any.
    pub fn set_ptrace(&mut self, tracer: Option<Box<PtraceState>>) -> Result<(), Errno> {
        if tracer.is_some() && self.ptrace.is_some() {
            return error!(EPERM);
        }

        if tracer.is_none() {
            // Handle the case where this is called while the thread group is being released.
            if let Ok(tg_stop_state) = self.base.thread_group().load_stopped().as_in_progress() {
                self.set_stopped(tg_stop_state, None, None, None);
            }
        }
        self.ptrace = tracer;
        Ok(())
    }

    /// Returns true if this task is traced, stopped (not waking/awake), and not in the
    /// ptrace "listening" state — i.e. it can process ptrace commands.
    pub fn can_accept_ptrace_commands(&mut self) -> bool {
        !self.base.load_stopped().is_waking_or_awake()
            && self.is_ptraced()
            && !self.is_ptrace_listening()
    }

    /// Atomically stores the task's stop state.
    fn store_stopped(&mut self, state: StopState) {
        // We don't actually use the guard but we require it to enforce that the
        // caller holds the thread group's mutable state lock (identified by
        // mutable access to the thread group's mutable state).

        self.base.stop_state.store(state, Ordering::Relaxed)
    }

    /// Atomically clears the `clear` flags and sets the `set` flags.
    ///
    /// `clear` and `set` must be disjoint (checked by the debug assertion).
    pub fn update_flags(&mut self, clear: TaskFlags, set: TaskFlags) {
        // We don't actually use the guard but we require it to enforce that the
        // caller holds the task's mutable state lock (identified by mutable
        // access to the task's mutable state).

        // `clear ^ set == clear | set` holds exactly when the two sets are disjoint.
        debug_assert_eq!(clear ^ set, clear | set);
        let observed = self.base.flags();
        let swapped = self.base.flags.swap((observed | set) & !clear, Ordering::Relaxed);
        debug_assert_eq!(swapped, observed);
    }

    /// Sets or clears a single flag according to `v`.
    pub fn set_flags(&mut self, flag: TaskFlags, v: bool) {
        let (clear, set) = if v { (TaskFlags::empty(), flag) } else { (flag, TaskFlags::empty()) };

        self.update_flags(clear, set);
    }

    /// Marks the task as having had its executor thread successfully spawned.
    pub fn set_spawned(&mut self) {
        self.set_flags(TaskFlags::SPAWNED, true);
    }

    /// Records `status` as the task's exit status and marks the task exited,
    /// overwriting any previously recorded status.
    pub fn set_exit_status(&mut self, status: ExitStatus) {
        self.set_flags(TaskFlags::EXITED, true);
        self.exit_status = Some(status);
    }

    /// Marks the task exited and records `status` only if no exit status was already set.
    pub fn set_exit_status_if_not_already(&mut self, status: ExitStatus) {
        self.set_flags(TaskFlags::EXITED, true);
        self.exit_status.get_or_insert(status);
    }

    /// The set of pending signals for the task, including the signals pending for the thread
    /// group.
    pub fn pending_signals(&self) -> SigSet {
        self.live().signals.pending() | self.base.thread_group().get_pending_signals()
    }

    /// The set of pending signals for the task specifically, not including the signals pending
    /// for the thread group.
    pub fn task_specific_pending_signals(&self) -> SigSet {
        self.live().signals.pending()
    }

    /// Returns true if any currently pending signal is allowed by `mask`.
    pub fn is_any_signal_allowed_by_mask(&self, mask: SigSet) -> bool {
        self.live().signals.is_any_allowed_by_mask(mask)
            || self.base.thread_group().is_any_signal_allowed_by_mask(mask)
    }

    /// Returns whether or not a signal is pending for this task, taking the current
    /// signal mask into account.
    pub fn is_any_signal_pending(&self) -> bool {
        let mask = self.signal_mask();
        self.live().signals.is_any_pending()
            || self.base.thread_group().is_any_signal_allowed_by_mask(mask)
    }

    /// Returns the next pending signal that passes `predicate`.
    ///
    /// Thread-group-directed signals are consulted first; task-directed signals second,
    /// with the SIGNALS_AVAILABLE flag refreshed after a task-directed dequeue.
    fn take_next_signal_where<F>(&mut self, predicate: F) -> Option<SignalInfo>
    where
        F: Fn(&SignalInfo) -> bool,
    {
        if let Some(signal) = self.base.thread_group().take_next_signal_where(&predicate) {
            Some(signal)
        } else {
            let s = self.live_mut().signals.take_next_where(&predicate);
            self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.live().signals.is_any_pending());
            s
        }
    }

    /// Removes and returns the next pending `signal` for this task.
    ///
    /// Returns `None` if `siginfo` is a blocked signal, or no such signal is pending.
    pub fn take_specific_signal(&mut self, siginfo: SignalInfo) -> Option<SignalInfo> {
        let signal_mask = self.signal_mask();
        if signal_mask.has_signal(siginfo.signal) {
            return None;
        }

        let predicate = |s: &SignalInfo| s.signal == siginfo.signal;
        self.take_next_signal_where(predicate)
    }

    /// Removes and returns a pending signal that is unblocked by the current signal mask.
    ///
    /// Returns `None` if there are no unblocked signals pending.
    pub fn take_any_signal(&mut self) -> Option<SignalInfo> {
        self.take_signal_with_mask(self.signal_mask())
    }

    /// Removes and returns a pending signal that is unblocked by `signal_mask`.
    ///
    /// Returns `None` if there are no signals pending that are unblocked by `signal_mask`.
    /// Forced signals (`s.force`) bypass the mask.
    pub fn take_signal_with_mask(&mut self, signal_mask: SigSet) -> Option<SignalInfo> {
        let predicate = |s: &SignalInfo| !signal_mask.has_signal(s.signal) || s.force;
        self.take_next_signal_where(predicate)
    }

    /// Enqueues an internal signal at the back of the task's kernel signal queue.
    pub fn enqueue_kernel_signal(&mut self, signal: KernelSignal) {
        self.live_mut().kernel_signals.push_back(signal);
        self.set_flags(TaskFlags::KERNEL_SIGNALS_AVAILABLE, true);
    }

    /// Removes and returns a pending internal signal.
    ///
    /// Returns `None` if there are no signals pending.
    pub fn take_kernel_signal(&mut self) -> Option<KernelSignal> {
        let signal = self.live_mut().kernel_signals.pop_front();
        if self.live().kernel_signals.is_empty() {
            self.set_flags(TaskFlags::KERNEL_SIGNALS_AVAILABLE, false);
        }
        signal
    }

    /// Test-only count of queued instances of `signal`, across task and thread group.
    #[cfg(test)]
    pub fn queued_signal_count(&self, signal: Signal) -> usize {
        self.live().signals.queued_count(signal)
            + self.base.thread_group().pending_signals.lock().queued_count(signal)
    }
}
707
/// The mutable state of a [`Task`] that is only relevant while the task is alive.
///
/// See also: [`TaskMutableState`], [`TaskLiveState`]
pub struct TaskMutableLiveState {
    /// Signal handler related state. This is grouped together for when atomicity is needed during
    /// signal sending and delivery.
    signals: SignalState,

    /// Internal signals that have a higher priority than a regular signal.
    ///
    /// Storing in a separate queue outside of `SignalState` ensures the internal signals will
    /// never be ignored or masked when dequeuing. Higher priority ensures that no user signals
    /// will jump the queue, e.g. ptrace, which delays the delivery.
    ///
    /// This design is not about observable consequence, but about convenient implementation.
    kernel_signals: VecDeque<KernelSignal>,
}
725
/// The state of a [`Task`] that is only relevant while the task is alive.
///
/// See also: [`TaskMutableLiveState`]
pub struct TaskLiveState {
    /// A handle to the underlying Zircon thread object.
    ///
    /// Some tasks lack an underlying Zircon thread. These tasks are used internally by the
    /// Starnix kernel to track background work, typically on a `kthread`.
    pub thread: RwLock<Option<Arc<zx::Thread>>>,

    /// The file descriptor table for this task.
    ///
    /// This table can be shared by many tasks.
    pub files: FdTable,

    /// The memory manager for this task.  This is `None` only for system tasks.
    pub mm: RcuOptionArc<MemoryManager>,

    /// The file system for this task.
    pub fs: RcuArc<FsContext>,

    /// The namespace for abstract AF_UNIX sockets for this task.
    pub abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,

    /// The namespace for AF_VSOCK for this task.
    pub abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,
}
753
754impl TaskLiveState {
755    pub fn mm(&self) -> Result<Arc<MemoryManager>, Errno> {
756        self.mm.to_option_arc().ok_or_else(|| errno!(EINVAL))
757    }
758
759    pub fn fs(&self) -> Arc<FsContext> {
760        self.fs.to_arc()
761    }
762}
763
/// Coarse scheduling state of a task, exposed as a one-character code and a
/// human-readable name (see `code_char` / `name`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TaskStateCode {
    /// Task is being executed.
    Running,

    /// Task is waiting for an event.
    Sleeping,

    /// Tracing stop.
    TracingStop,

    /// Task has exited.
    Zombie,
}
778
779impl TaskStateCode {
780    pub fn code_char(&self) -> char {
781        match self {
782            TaskStateCode::Running => 'R',
783            TaskStateCode::Sleeping => 'S',
784            TaskStateCode::TracingStop => 't',
785            TaskStateCode::Zombie => 'Z',
786        }
787    }
788
789    pub fn name(&self) -> &'static str {
790        match self {
791            TaskStateCode::Running => "running",
792            TaskStateCode::Sleeping => "sleeping",
793            TaskStateCode::TracingStop => "tracing stop",
794            TaskStateCode::Zombie => "zombie",
795        }
796    }
797}
798
/// The information of the task that needs to be available to the `ThreadGroup` while computing
/// which process a wait can target. It is necessary to share this data with the `ThreadGroup` so
/// that it is available while the task is being dropped and so is not accessible from a weak
/// pointer.
#[derive(Debug)]
pub struct TaskPersistentInfoState {
    /// Immutable information about the task: its thread id...
    tid: tid_t,
    /// ...and the key of its thread group.
    thread_group_key: ThreadGroupKey,

    /// The command of this task.
    command: Mutex<TaskCommand>,

    /// The security credentials for this task. These are only set when the task is the CurrentTask,
    /// or on task creation.
    creds: RcuArc<Credentials>,

    // A lock for the security credentials. Writers must take the lock, readers that need to ensure
    // that the task state does not change may take the lock.
    creds_lock: RwLock<()>,
}
820
/// Guard for reading locked credentials.
///
/// Holds the credentials read lock for its lifetime so the credentials cannot be
/// replaced while the guard is alive, and dereferences to the snapshot taken at
/// acquisition time.
pub struct CredentialsReadGuard<'a> {
    // Keeps `TaskPersistentInfoState::creds_lock` held for reading.
    _lock: RwLockReadGuard<'a, ()>,
    // RCU snapshot of the credentials, taken while the lock was held.
    creds: RcuReadGuard<Credentials>,
}
826
827impl<'a> Deref for CredentialsReadGuard<'a> {
828    type Target = Credentials;
829
830    fn deref(&self) -> &Self::Target {
831        self.creds.deref()
832    }
833}
834
/// Guard for writing credentials. No `CredentialsReadGuard` to the same task can concurrently
/// exist.
pub struct CredentialsWriteGuard<'a> {
    // Holds `TaskPersistentInfoState::creds_lock` for writing, excluding all readers.
    _lock: RwLockWriteGuard<'a, ()>,
    // The credentials cell to update while the lock is held.
    creds: &'a RcuArc<Credentials>,
}

impl<'a> CredentialsWriteGuard<'a> {
    /// Replaces the task's credentials with `creds`.
    pub fn update(&mut self, creds: Arc<Credentials>) {
        self.creds.update(creds);
    }
}
847
impl TaskPersistentInfoState {
    /// Creates a new `TaskPersistentInfo` (an `Arc` of this state) for the given
    /// identifiers, command and credentials.
    fn new(
        tid: tid_t,
        thread_group_key: ThreadGroupKey,
        command: TaskCommand,
        creds: Arc<Credentials>,
    ) -> TaskPersistentInfo {
        Arc::new(Self {
            tid,
            thread_group_key,
            command: Mutex::new(command),
            creds: RcuArc::new(creds),
            creds_lock: RwLock::new(()),
        })
    }

    /// The task's tid.
    pub fn tid(&self) -> tid_t {
        self.tid
    }

    /// The pid of the task's thread group.
    pub fn pid(&self) -> pid_t {
        self.thread_group_key.pid()
    }

    /// Locks and returns the task's command.
    pub fn command_guard(&self) -> MutexGuard<'_, TaskCommand> {
        self.command.lock()
    }

    /// Snapshots the credentials, returning a short-lived RCU-guarded reference.
    pub fn real_creds(&self) -> RcuReadGuard<Credentials> {
        self.creds.read()
    }

    /// Snapshots the credentials, returning a new reference. Use this if you need to stash the
    /// credentials somewhere.
    pub fn clone_creds(&self) -> Arc<Credentials> {
        self.creds.to_arc()
    }

    /// Returns a read lock on the credentials. This is appropriate if you need to guarantee that
    /// the Task's credentials will not change during a security-sensitive operation.
    pub fn lock_creds(&self) -> CredentialsReadGuard<'_> {
        // Take the lock before snapshotting so the snapshot cannot race with a writer.
        let lock = self.creds_lock.read();
        CredentialsReadGuard { _lock: lock, creds: self.creds.read() }
    }

    /// Locks the credentials for writing.
    ///
    /// SAFETY: Only use from CurrentTask, and keep the subjective credentials stored in
    /// CurrentTask in sync.
    pub(in crate::task) unsafe fn write_creds(&self) -> CredentialsWriteGuard<'_> {
        let lock = self.creds_lock.write();
        CredentialsWriteGuard { _lock: lock, creds: &self.creds }
    }
}
902
/// Shared handle to [`TaskPersistentInfoState`]; shared with the `ThreadGroup` so
/// the data remains available while the task is being dropped.
pub type TaskPersistentInfo = Arc<TaskPersistentInfoState>;
904
905/// A unit of execution.
906///
907/// A task is the primary unit of execution in the Starnix kernel. Most tasks are *user* tasks,
908/// which have an associated Zircon thread. The Zircon thread switches between restricted mode,
909/// in which the thread runs userspace code, and normal mode, in which the thread runs Starnix
910/// code.
911///
912/// Tasks track the resources used by userspace by referencing various objects, such as an
913/// `FdTable`, a `MemoryManager`, and an `FsContext`. Many tasks can share references to these
914/// objects. In principle, which objects are shared between which tasks can be largely arbitrary,
915/// but there are common patterns of sharing. For example, tasks created with `pthread_create`
916/// will share the `FdTable`, `MemoryManager`, and `FsContext` and are often called "threads" by
917/// userspace programmers. Tasks created by `posix_spawn` do not share these objects and are often
918/// called "processes" by userspace programmers. However, inside the kernel, there is no clear
919/// definition of a "thread" or a "process".
920///
921/// During boot, the kernel creates the first task, often called `init`. The vast majority of other
922/// tasks are created as transitive clones (e.g., using `clone(2)`) of that task. Sometimes, the
923/// kernel will create new tasks from whole cloth, either with a corresponding userspace component
924/// or to represent some background work inside the kernel.
925///
926/// See also `CurrentTask`, which represents the task corresponding to the thread that is currently
927/// executing.
pub struct Task {
    /// Weak reference to the `OwnedRef` of this `Task`. This allows retrieving the
    /// `TempRef` from a raw `Task`.
    pub weak_self: WeakRef<Self>,

    /// A unique identifier for this task.
    ///
    /// This value can be read in userspace using `gettid(2)`. In general, this value
    /// is different from the value returned by `getpid(2)`, which returns the `id` of the leader
    /// of the `thread_group`.
    pub tid: tid_t,

    /// The process key of this task.
    pub thread_group_key: ThreadGroupKey,

    /// The kernel to which this thread group belongs.
    pub kernel: Arc<Kernel>,

    /// The thread group to which this task belongs.
    ///
    /// The group of tasks in a thread group roughly corresponds to the userspace notion of a
    /// process.
    pub thread_group: Arc<ThreadGroup>,

    /// The live state of the task.
    ///
    /// This is `None` for zombie tasks.
    pub live_state: RcuOptionCell<TaskLiveState>,

    /// The stop state of the task, distinct from the stop state of the thread group.
    ///
    /// Must only be set when the `mutable_state` write lock is held.
    stop_state: AtomicStopState,

    /// The flags for the task.
    ///
    /// Must only be set when the `mutable_state` write lock is held.
    flags: AtomicTaskFlags,

    /// The mutable state of the Task.
    mutable_state: RwLock<TaskMutableState>,

    /// The information of the task that needs to be available to the `ThreadGroup` while computing
    /// which process a wait can target.
    /// Contains the command line, the task credentials and the exit signal.
    /// See `TaskPersistentInfo` for more information.
    pub persistent_info: TaskPersistentInfo,

    /// For vfork and clone() with CLONE_VFORK, this is set when the task exits or calls execve().
    /// It allows the calling task to block until the fork has been completed. Only populated
    /// when created with the CLONE_VFORK flag.
    vfork_event: Option<Arc<zx::Event>>,

    /// Variable that can tell you whether there are currently seccomp
    /// filters without holding a lock
    pub seccomp_filter_state: SeccompState,

    /// Tell you whether you are tracing syscall entry / exit without a lock.
    pub trace_syscalls: AtomicBool,

    // The pid directory, so it doesn't have to be generated and thrown away on every access.
    // See https://fxbug.dev/291962828 for details.
    pub proc_pid_directory_cache: Mutex<Option<FsNodeHandle>>,
}
992
/// The decoded cross-platform parts we care about for page fault exception reports.
#[derive(Debug)]
pub struct PageFaultExceptionReport {
    /// The address whose access triggered the fault.
    pub faulting_address: u64,
    /// Set when the page fault was due to a not-present page.
    pub not_present: bool,
    /// Set when the triggering memory operation was a write.
    pub is_write: bool,
    /// Set when the triggering memory operation was an execute.
    pub is_execute: bool,
}
1001
1002impl Task {
    /// Returns the kernel this task belongs to.
    pub fn kernel(&self) -> &Arc<Kernel> {
        &self.kernel
    }

    /// Returns the thread group (roughly, the userspace "process") this task belongs to.
    pub fn thread_group(&self) -> &Arc<ThreadGroup> {
        &self.thread_group
    }
1010
1011    pub fn has_same_address_space(&self, other: Option<&Arc<MemoryManager>>) -> bool {
1012        match (self.mm(), other) {
1013            (Ok(this), Some(other)) => Arc::ptr_eq(&this, other),
1014            (Err(_), None) => true,
1015            _ => false,
1016        }
1017    }
1018
    /// Returns a snapshot of the task's flags (relaxed atomic load).
    pub fn flags(&self) -> TaskFlags {
        self.flags.load(Ordering::Relaxed)
    }

    /// Returns whether the `SPAWNED` flag has been set on this task.
    pub fn is_spawned(&self) -> bool {
        self.flags().contains(TaskFlags::SPAWNED)
    }
1026
1027    /// When the task exits, if there is a notification that needs to propagate
1028    /// to a ptracer, make sure it will propagate.
1029    pub fn set_ptrace_zombie(&self, pids: &mut crate::task::PidTable) {
1030        let pgid = self.thread_group().read().process_group.leader;
1031        let exit_signal = self.thread_group().read().exit_signal.clone();
1032        let mut state = self.write();
1033        state.set_stopped(StopState::ForceAwake, None, None, None);
1034        if let Some(ptrace) = &mut state.ptrace {
1035            // Add a zombie that the ptracer will notice.
1036            ptrace.last_signal_waitable = true;
1037            let tracer_pid = ptrace.get_pid();
1038            let tracer_tg = pids.get_thread_group(tracer_pid);
1039            if let Some(tracer_tg) = tracer_tg {
1040                drop(state);
1041                let mut tracer_state = tracer_tg.write();
1042
1043                let exit_status = self.exit_status().unwrap_or_else(|| {
1044                    starnix_logging::log_error!("Exiting without an exit code.");
1045                    ExitStatus::Exit(u8::MAX)
1046                });
1047                let uid = self.real_creds().uid;
1048                let exit_info = ProcessExitInfo { status: exit_status, exit_signal };
1049                let zombie = ZombieProcess {
1050                    thread_group_key: self.thread_group_key.clone(),
1051                    pgid,
1052                    uid,
1053                    exit_info: exit_info,
1054                    // ptrace doesn't need this.
1055                    time_stats: TaskTimeStats::default(),
1056                    is_canonical: false,
1057                };
1058
1059                tracer_state.zombie_ptracees.add(pids, self.tid, zombie);
1060            };
1061        }
1062    }
1063
    /// Disconnects this task from the tracer, if the tracer is still running.
    pub fn ptrace_disconnect(&mut self, pids: &PidTable) {
        let mut state = self.write();
        let ptracer_pid = state.ptrace.as_ref().map(|ptrace| ptrace.get_pid());
        if let Some(ptracer_pid) = ptracer_pid {
            // Clear our own ptrace state first, then remove ourselves from the
            // tracer's ptracee table if the tracer process still exists.
            let _ = state.set_ptrace(None);
            if let Some(ProcessEntryRef::Process(tg)) = pids.get_process(ptracer_pid) {
                let tid = self.get_tid();
                // Drop our state lock before taking the tracer's ptracees lock.
                drop(state);
                tg.ptracees.lock().remove(&tid);
            }
        }
    }
1077
    /// Returns the task's exit status, if the task has already exited.
    pub fn exit_status(&self) -> Option<ExitStatus> {
        self.is_exitted().then(|| self.read().exit_status.clone()).flatten()
    }

    /// Returns whether the `EXITED` flag is set. (Historical spelling kept for
    /// compatibility with existing callers.)
    pub fn is_exitted(&self) -> bool {
        self.flags().contains(TaskFlags::EXITED)
    }

    /// Returns the task's stop state (relaxed atomic load).
    pub fn load_stopped(&self) -> StopState {
        self.stop_state.load(Ordering::Relaxed)
    }
1089
1090    /// Upgrade a Reference to a Task, returning a ESRCH errno if the reference cannot be borrowed.
1091    pub fn from_weak(weak: &WeakRef<Task>) -> Result<TempRef<'_, Task>, Errno> {
1092        weak.upgrade().ok_or_else(|| errno!(ESRCH))
1093    }
1094
    /// Internal function for creating a Task object. Useful when you need to specify the value of
    /// every field. create_process and create_thread are more likely to be what you want.
    ///
    /// Any fields that should be initialized fresh for every task, even if the task was created
    /// with fork, are initialized to their defaults inside this function. All other fields are
    /// passed as parameters.
    #[allow(clippy::let_and_return)]
    pub fn new(
        tid: tid_t,
        command: TaskCommand,
        thread_group: Arc<ThreadGroup>,
        thread: Option<zx::Thread>,
        files: FdTable,
        mm: Option<Arc<MemoryManager>>,
        // NOTE(review): stale comment — `fs` is no longer optional. Historically it could be
        // `None` only when building the initial task that is used to build the initial
        // FsContext.
        fs: Arc<FsContext>,
        creds: Arc<Credentials>,
        abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,
        abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,
        signal_mask: SigSet,
        kernel_signals: VecDeque<KernelSignal>,
        vfork_event: Option<Arc<zx::Event>>,
        scheduler_state: SchedulerState,
        uts_ns: UtsNamespaceHandle,
        no_new_privs: bool,
        seccomp_filter_state: SeccompState,
        seccomp_filters: SeccompFilterContainer,
        robust_list_head: RobustListHeadPtr,
        timerslack_ns: u64,
    ) -> OwnedRef<Self> {
        let thread_group_key = ThreadGroupKey::from(&thread_group);
        OwnedRef::new_cyclic(|weak_self| {
            let task = Task {
                weak_self,
                tid,
                thread_group_key: thread_group_key.clone(),
                kernel: Arc::clone(&thread_group.kernel),
                thread_group,
                live_state: RcuOptionCell::new(Some(TaskLiveState {
                    thread: RwLock::new(thread.map(Arc::new)),
                    files,
                    mm: RcuOptionArc::new(mm),
                    fs: RcuArc::new(fs),
                    abstract_socket_namespace,
                    abstract_vsock_namespace,
                })),
                vfork_event,
                // Fresh-per-task fields: every new task starts awake with no flags set.
                stop_state: AtomicStopState::new(StopState::Awake),
                flags: AtomicTaskFlags::new(TaskFlags::empty()),
                mutable_state: RwLock::new(TaskMutableState {
                    live: Some(TaskMutableLiveState {
                        signals: SignalState::with_mask(signal_mask),
                        kernel_signals,
                    }),
                    clear_child_tid: UserRef::default(),
                    exit_status: None,
                    scheduler_state,
                    uts_ns,
                    no_new_privs,
                    oom_score_adj: Default::default(),
                    seccomp_filters,
                    robust_list_head,
                    timerslack_ns,
                    // The default timerslack is set to the current timerslack of the creating thread.
                    default_timerslack_ns: timerslack_ns,
                    ptrace: None,
                    captured_thread_state: None,
                }),
                persistent_info: TaskPersistentInfoState::new(
                    tid,
                    thread_group_key,
                    command,
                    creds,
                ),
                seccomp_filter_state,
                trace_syscalls: AtomicBool::new(false),
                proc_pid_directory_cache: Mutex::new(None),
            };

            #[cfg(any(test, debug_assertions))]
            {
                // Exercise these lock acquisitions in debug builds (presumably a
                // lock-ordering check — TODO confirm).
                // Note that `Kernel::pids` is already locked by the caller of `Task::new()`.
                let _l1 = task.read();
                let _l2 = task.persistent_info.lock_creds();
                let _l3 = task.persistent_info.command_guard();
            }
            task
        })
    }
1185
    // Generates the `read()`/`write()` accessors for `mutable_state` used throughout this impl.
    state_accessor!(Task, mutable_state);
1187
    /// Returns the real credentials of the task as a short-lived RCU-guarded reference. These
    /// credentials are used to check permissions for actions performed on the task. If the task
    /// itself is performing an action, use `CurrentTask::current_creds` instead. This does not
    /// lock the credentials, so a concurrent writer may replace them immediately afterwards.
    pub fn real_creds(&self) -> RcuReadGuard<Credentials> {
        self.persistent_info.real_creds()
    }

    /// Returns a new long-lived reference to the real credentials of the task. These credentials
    /// are used to check permissions for actions performed on the task. If the task itself is
    /// performing an action, use `CurrentTask::current_creds` instead. This does not lock the
    /// credentials.
    pub fn clone_creds(&self) -> Arc<Credentials> {
        self.persistent_info.clone_creds()
    }
1203
1204    pub fn ptracer_task(&self) -> WeakRef<Task> {
1205        let ptracer = {
1206            let state = self.read();
1207            state.ptrace.as_ref().map(|p| p.core_state.pid)
1208        };
1209
1210        let Some(ptracer) = ptracer else {
1211            return WeakRef::default();
1212        };
1213
1214        self.get_task(ptracer)
1215    }
1216
    /// Returns the live state of the task, if it exists.
    ///
    /// The returned guard dereferences to [`TaskLiveState`].
    ///
    /// # Errors
    ///
    /// Returns [`Err(ESRCH)`] if the task has already transitioned to a zombie state and its live
    /// resources have been dropped.
    #[track_caller]
    pub fn live(&self) -> Result<RcuReadGuard<TaskLiveState>, Errno> {
        self.live_state.read().ok_or_else(|| errno!(ESRCH))
    }
1227
1228    /// Returns the memory manager of the task, if it exists.
1229    ///
1230    /// # Errors
1231    ///
1232    /// Returns [`Err(errno)`] where `errno` is:
1233    ///
1234    ///   - `ESRCH`: the task is dead and its live resources have been dropped.
1235    ///   - `EINVAL`: the task does not have a memory manager.
1236    #[track_caller]
1237    pub fn mm(&self) -> Result<Arc<MemoryManager>, Errno> {
1238        self.live()?.mm.to_option_arc().ok_or_else(|| errno!(EINVAL))
1239    }
1240
    /// Modify the given elements of the scheduler state with new values and update the
    /// task's thread's role. Propagates any error from the role update.
    pub(crate) fn set_scheduler_policy_priority_and_reset_on_fork(
        &self,
        policy: SchedulingPolicy,
        priority: RealtimePriority,
        reset_on_fork: bool,
    ) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|scheduler_state| {
            scheduler_state.policy = policy;
            scheduler_state.realtime_priority = priority;
            scheduler_state.reset_on_fork = reset_on_fork;
        })
    }

    /// Modify the scheduler state's realtime priority and update the task's thread's role.
    pub(crate) fn set_scheduler_priority(&self, priority: RealtimePriority) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|scheduler_state| {
            scheduler_state.realtime_priority = priority
        })
    }

    /// Modify the scheduler state's nice (normal priority) and update the task's thread's role.
    pub(crate) fn set_scheduler_nice(&self, nice: NormalPriority) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|scheduler_state| {
            scheduler_state.normal_priority = nice
        })
    }

    /// Overwrite the existing scheduler state with a new one and update the task's thread's role.
    pub fn set_scheduler_state(&self, scheduler_state: SchedulerState) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|task_scheduler_state| {
            *task_scheduler_state = scheduler_state
        })
    }

    /// Update the task's thread's role based on its current scheduler state without making any
    /// changes to the state.
    ///
    /// This should be called on tasks that have newly created threads, e.g. after cloning.
    pub fn sync_scheduler_state_to_role(&self) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|_| {})
    }
1284
1285    fn update_scheduler_state_then_role(
1286        &self,
1287        updater: impl FnOnce(&mut SchedulerState),
1288    ) -> Result<(), Errno> {
1289        let new_scheduler_state = {
1290            // Hold the task state lock as briefly as possible, it's not needed to update the role.
1291            let mut state = self.write();
1292            updater(&mut state.scheduler_state);
1293            state.scheduler_state
1294        };
1295        self.thread_group().kernel.scheduler.set_thread_role(self, new_scheduler_state)?;
1296        Ok(())
1297    }
1298
1299    /// Signals the vfork event, if any, to unblock waiters.
1300    pub fn signal_vfork(&self) {
1301        if let Some(event) = &self.vfork_event {
1302            if let Err(status) = event.signal(Signals::NONE, Signals::USER_0) {
1303                log_warn!("Failed to set vfork signal {status}");
1304            }
1305        };
1306    }
1307
1308    /// Blocks the caller until the task has exited or executed execve(). This is used to implement
1309    /// vfork() and clone(... CLONE_VFORK, ...). The task must have created with CLONE_EXECVE.
1310    pub fn wait_for_execve(&self, task_to_wait: WeakRef<Task>) -> Result<(), Errno> {
1311        let event = task_to_wait.upgrade().and_then(|t| t.vfork_event.clone());
1312        if let Some(event) = event {
1313            event
1314                .wait_one(zx::Signals::USER_0, zx::MonotonicInstant::INFINITE)
1315                .map_err(|status| from_status_like_fdio!(status))?;
1316        }
1317        Ok(())
1318    }
1319
    /// If needed, clear the child tid for this task.
    ///
    /// Userspace can ask us to clear the child tid and issue a futex wake at
    /// the child tid address when we tear down a task. For example, bionic
    /// uses this mechanism to implement pthread_join. The thread that calls
    /// pthread_join sleeps using FUTEX_WAIT on the child tid address. We wake
    /// them up here to let them know the thread is done.
    pub fn clear_child_tid_if_needed<L>(&self, locked: &mut Locked<L>) -> Result<(), Errno>
    where
        L: LockBefore<TerminalLock>,
    {
        let mut state = self.write();
        let user_tid = state.clear_child_tid;
        if !user_tid.is_null() {
            // Write 0 to the child tid address, then wake all waiters on it
            // (e.g. a pthread_join blocked in FUTEX_WAIT).
            let zero: tid_t = 0;
            self.write_object(user_tid, &zero)?;
            self.kernel().shared_futexes.wake(
                locked,
                self,
                user_tid.addr(),
                usize::MAX,
                FUTEX_BITSET_MATCH_ANY,
            )?;
            state.clear_child_tid = UserRef::default();
        }
        Ok(())
    }
1347
    /// Looks up a task by `tid` in the kernel's pid table.
    pub fn get_task(&self, tid: tid_t) -> WeakRef<Task> {
        self.kernel().pids.read().get_task(tid)
    }

    /// Returns the pid of this task's thread group (the `getpid(2)` value).
    pub fn get_pid(&self) -> pid_t {
        self.thread_group_key.pid()
    }

    /// Returns this task's tid (the `gettid(2)` value).
    pub fn get_tid(&self) -> tid_t {
        self.tid
    }

    /// Returns whether this task is the thread group leader (pid == tid).
    pub fn is_leader(&self) -> bool {
        self.get_pid() == self.get_tid()
    }
1363
    /// Reads the task's argv from its memory, limited to `max_len` bytes, as a
    /// list of NUL-delimited strings. Returns an empty list for kthreads.
    pub fn read_argv(&self, max_len: usize) -> Result<Vec<FsString>, Errno> {
        // argv is empty for kthreads
        let Ok(mm) = self.mm() else {
            return Ok(vec![]);
        };
        let (argv_start, argv_end) = {
            let mm_state = mm.state.read();
            (mm_state.argv_start, mm_state.argv_end)
        };

        let len_to_read = std::cmp::min(argv_end - argv_start, max_len);
        self.read_nul_delimited_c_string_list(argv_start, len_to_read)
    }

    /// Reads argv[0] from the task's memory as a path string. Returns an empty
    /// string for kthreads.
    pub fn read_argv0(&self) -> Result<FsString, Errno> {
        // argv is empty for kthreads
        let Ok(mm) = self.mm() else {
            return Ok(FsString::default());
        };
        let argv_start = {
            let mm_state = mm.state.read();
            mm_state.argv_start
        };
        // Assuming a 64-bit arch width is fine for a type that's just u8's on all arches.
        let argv_start = UserCString::new(&ArchWidth::Arch64, argv_start);
        self.read_path(argv_start)
    }

    /// Reads the task's environment from its memory, limited to `max_len`
    /// bytes, as a list of NUL-delimited strings. Returns an empty list for
    /// kthreads.
    pub fn read_env(&self, max_len: usize) -> Result<Vec<FsString>, Errno> {
        // environment is empty for kthreads
        let Ok(mm) = self.mm() else { return Ok(vec![]) };
        let (env_start, env_end) = {
            let mm_state = mm.state.read();
            (mm_state.environ_start, mm_state.environ_end)
        };

        let len_to_read = std::cmp::min(env_end - env_start, max_len);
        self.read_nul_delimited_c_string_list(env_start, len_to_read)
    }
1403
    /// Returns runtime info for the task's Zircon thread.
    ///
    /// Fails with `ESRCH` if the task is dead, `EINVAL` if it has no thread, or
    /// a status-derived errno if Zircon rejects the query.
    pub fn thread_runtime_info(&self) -> Result<zx::TaskRuntimeInfo, Errno> {
        self.live()?
            .thread
            .read()
            .as_ref()
            .ok_or_else(|| errno!(EINVAL))?
            .get_runtime_info()
            .map_err(|status| from_status_like_fdio!(status))
    }

    /// Returns the filesystem credentials derived from the task's real credentials.
    pub fn real_fscred(&self) -> FsCred {
        self.real_creds().as_fscred()
    }
1417
    /// Interrupts the current task.
    ///
    /// This will interrupt any blocking syscalls if the task is blocked on one.
    /// The signal_state of the task must not be locked.
    pub fn interrupt(&self) {
        let Ok(live) = self.live() else {
            log_warn!("Cannot interrupt dead task {}", self.get_tid());
            return;
        };

        // Wake whatever run state the task is currently blocked in.
        self.read().live().signals.run_state.wake();
        // Kick the Zircon thread so it leaves restricted mode and notices the wakeup.
        if let Some(thread) = live.thread.read().as_ref() {
            #[allow(
                clippy::undocumented_unsafe_blocks,
                reason = "Force documented unsafe blocks in Starnix"
            )]
            let status = unsafe { zx::sys::zx_restricted_kick(thread.raw_handle(), 0) };
            if status != zx::sys::ZX_OK {
                // zx_restricted_kick() could return ZX_ERR_BAD_STATE if the target thread is already in the
                // DYING or DEAD states. That's fine since it means that the task is in the process of
                // tearing down, so allow it.
                assert_eq!(status, zx::sys::ZX_ERR_BAD_STATE);
            }
        }
    }
1443
1444    pub fn command(&self) -> TaskCommand {
1445        self.persistent_info.command.lock().clone()
1446    }
1447
    /// Sets the task's command name, propagating it to the Zircon thread (and,
    /// for the thread group leader, to the process and memory attribution).
    pub fn set_command_name(&self, mut new_name: TaskCommand) {
        let Ok(live) = self.live() else {
            log_warn!("Cannot set command name for dead task {}", self.get_tid());
            return;
        };

        // If we're going to update the process name, see if we can get a longer one than normally
        // provided in the Linux uapi. Only choose the argv0-based name if it's a superset of the
        // uapi-provided name to avoid clobbering the name provided by the user.
        if let Ok(argv0) = self.read_argv0() {
            let argv0 = TaskCommand::from_path_bytes(&argv0);
            if let Some(embedded_name) = argv0.try_embed(&new_name) {
                new_name = embedded_name;
            }
        }

        // Acquire this before modifying Zircon state to ensure consistency under concurrent access.
        // Ideally this would also guard the logic above to read argv[0] but we can't due to lock
        // cycles with SELinux checks.
        let mut command_guard = self.persistent_info.command_guard();

        // Set the name on the Linux thread.
        if let Some(thread) = live.thread.read().as_ref() {
            set_zx_name(&**thread, new_name.as_bytes());
        }

        // If this is the thread group leader, use this name for the process too.
        if self.is_leader() {
            set_zx_name(&self.thread_group().process, new_name.as_bytes());
            // Best-effort notification to the job debugger that the process name changed.
            let _ = zx::Thread::raise_user_exception(
                zx::RaiseExceptionOptions::TARGET_JOB_DEBUGGER,
                zx::sys::ZX_EXCP_USER_CODE_PROCESS_NAME_CHANGED,
                0,
            );
        }

        // Avoid a lock cycle by dropping the guard before notifying memory attribution of the
        // change.
        *command_guard = new_name;
        drop(command_guard);

        if self.is_leader() {
            if let Some(notifier) = &self.thread_group().read().notifier {
                let _ = notifier.send(MemoryAttributionLifecycleEvent::name_change(self.tid));
            }
        }
    }
1495
    /// Sets the task's seccomp state.
    pub fn set_seccomp_state(&self, state: SeccompStateValue) -> Result<(), Errno> {
        self.seccomp_filter_state.set(&state)
    }

    /// Classifies the task's current state for reporting: `Zombie` if it has an
    /// exit status, `TracingStop` if blocked in a ptrace-only stop, `Sleeping`
    /// if otherwise blocked, and `Running` otherwise.
    pub fn state_code(&self) -> TaskStateCode {
        let status = self.read();
        if status.exit_status.is_some() {
            TaskStateCode::Zombie
        } else if status.live().signals.run_state.is_blocked() {
            let stop_state = self.load_stopped();
            if stop_state.ptrace_only() && stop_state.is_stopped() {
                TaskStateCode::TracingStop
            } else {
                TaskStateCode::Sleeping
            }
        } else {
            TaskStateCode::Running
        }
    }
1515
    /// Returns CPU time stats for the task, or defaults if the task or its
    /// thread is gone.
    pub fn time_stats(&self) -> TaskTimeStats {
        use zx::Task;
        // TODO(https://fxbug.dev/297440106): Return time stats for zombie tasks.
        let live = match self.live() {
            Ok(live) => live,
            Err(_) => return TaskTimeStats::default(),
        };
        let info = match &*live.thread.read() {
            Some(thread) => thread.get_runtime_info().expect("Failed to get thread stats"),
            None => return TaskTimeStats::default(),
        };

        TaskTimeStats {
            user_time: zx::MonotonicDuration::from_nanos(info.cpu_time),
            // TODO(https://fxbug.dev/42078242): How can we calculate system time?
            system_time: zx::MonotonicDuration::default(),
        }
    }

    /// Returns the thread group's registered action for `signal`.
    pub fn get_signal_action(&self, signal: Signal) -> sigaction_t {
        self.thread_group().signal_actions.get(signal)
    }

    /// Lock-free fast check for whether signal processing may be needed, based
    /// on this task's flags and the thread group's pending-signals flag.
    pub fn should_check_for_pending_signals(&self) -> bool {
        self.flags().intersects(
            TaskFlags::KERNEL_SIGNALS_AVAILABLE
                | TaskFlags::SIGNALS_AVAILABLE
                | TaskFlags::TEMPORARY_SIGNAL_MASK,
        ) || self.thread_group.has_pending_signals.load(Ordering::Relaxed)
    }

    /// Records this task's pid-to-koid mapping (process and thread koids) into
    /// the kernel's mapping table, if that table is enabled.
    pub fn record_pid_koid_mapping(&self) {
        let Ok(live) = self.live() else {
            log_warn!("Cannot record pid/koid mapping for dead task {}", self.get_tid());
            return;
        };

        // The mapping table is optional; do nothing when it is not configured.
        let Some(ref mapping_table) = *self.kernel().pid_to_koid_mapping.read() else { return };

        let pkoid = self.thread_group().get_process_koid().ok();
        let tkoid = live.thread.read().as_ref().and_then(|t| t.koid().ok());
        mapping_table.write().insert(self.tid, KoidPair { process: pkoid, thread: tkoid });
    }
1559}
1560
impl Releasable for Task {
    type Context<'a> = (
        ThreadState<RegisterStorageEnum>,
        &'a mut Locked<TaskRelease>,
        RwLockWriteGuard<'a, PidTable>,
    );

    /// Final teardown of a task: detaches from any tracer, wakes vfork waiters,
    /// drops live resources, runs delayed release actions, and finally drops the
    /// `Task` value itself.
    fn release<'a>(mut self, context: Self::Context<'a>) {
        let (thread_state, locked, pids) = context;

        *self.proc_pid_directory_cache.get_mut() = None;
        self.ptrace_disconnect(&pids);

        // The pid table lock is only needed for the ptrace disconnect above.
        std::mem::drop(pids);

        self.signal_vfork();

        // Drop fields that can end up owning a FsNode to ensure no FsNodes are owned by this task.
        if let Ok(live) = self.live() {
            live.files.release();
            live.mm.update(None);
        }
        self.live_state.update(None);

        // Rebuild a temporary CurrentTask to run the release actions that require a CurrentState.
        let current_task = CurrentTask::new(OwnedRef::new(self), thread_state.into());

        // Apply any delayed releasers left.
        current_task.trigger_delayed_releaser(locked);

        // Drop the task now that it has been released. This requires taking it from the OwnedRef
        // and from the resulting ReleaseGuard.
        let CurrentTask { mut task, .. } = current_task;
        let task = OwnedRef::take(&mut task).expect("task should not have been re-owned");
        let _task: Self = ReleaseGuard::take(task);
    }
}
1598
1599impl MemoryAccessor for Task {
1600    fn read_memory<'a>(
1601        &self,
1602        addr: UserAddress,
1603        bytes: &'a mut [MaybeUninit<u8>],
1604    ) -> Result<&'a mut [u8], Errno> {
1605        // Using a `Task` to read memory generally indicates that the memory
1606        // is being read from a task different than the `CurrentTask`. When
1607        // this `Task` is not current, its address space is not mapped
1608        // so we need to go through the VMO.
1609        self.mm()?.syscall_read_memory(addr, bytes)
1610    }
1611
1612    fn read_memory_partial_until_null_byte<'a>(
1613        &self,
1614        addr: UserAddress,
1615        bytes: &'a mut [MaybeUninit<u8>],
1616    ) -> Result<&'a mut [u8], Errno> {
1617        // Using a `Task` to read memory generally indicates that the memory
1618        // is being read from a task different than the `CurrentTask`. When
1619        // this `Task` is not current, its address space is not mapped
1620        // so we need to go through the VMO.
1621        self.mm()?.syscall_read_memory_partial_until_null_byte(addr, bytes)
1622    }
1623
1624    fn read_memory_partial<'a>(
1625        &self,
1626        addr: UserAddress,
1627        bytes: &'a mut [MaybeUninit<u8>],
1628    ) -> Result<&'a mut [u8], Errno> {
1629        // Using a `Task` to read memory generally indicates that the memory
1630        // is being read from a task different than the `CurrentTask`. When
1631        // this `Task` is not current, its address space is not mapped
1632        // so we need to go through the VMO.
1633        self.mm()?.syscall_read_memory_partial(addr, bytes)
1634    }
1635
1636    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
1637        // Using a `Task` to write memory generally indicates that the memory
1638        // is being written to a task different than the `CurrentTask`. When
1639        // this `Task` is not current, its address space is not mapped
1640        // so we need to go through the VMO.
1641        self.mm()?.syscall_write_memory(addr, bytes)
1642    }
1643
1644    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
1645        // Using a `Task` to write memory generally indicates that the memory
1646        // is being written to a task different than the `CurrentTask`. When
1647        // this `Task` is not current, its address space is not mapped
1648        // so we need to go through the VMO.
1649        self.mm()?.syscall_write_memory_partial(addr, bytes)
1650    }
1651
1652    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
1653        // Using a `Task` to zero memory generally indicates that the memory
1654        // is being zeroed from a task different than the `CurrentTask`. When
1655        // this `Task` is not current, its address space is not mapped
1656        // so we need to go through the VMO.
1657        self.mm()?.syscall_zero(addr, length)
1658    }
1659}
1660
1661impl TaskMemoryAccessor for Task {
1662    fn maximum_valid_address(&self) -> Option<UserAddress> {
1663        self.mm().map(|mm| mm.maximum_valid_user_address).ok()
1664    }
1665}
1666
1667impl fmt::Debug for Task {
1668    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1669        write!(
1670            f,
1671            "{}:{}[{}]",
1672            self.thread_group().leader,
1673            self.tid,
1674            self.persistent_info.command.lock()
1675        )
1676    }
1677}
1678
1679impl cmp::PartialEq for Task {
1680    fn eq(&self, other: &Self) -> bool {
1681        let ptr: *const Task = self;
1682        let other_ptr: *const Task = other;
1683        ptr == other_ptr
1684    }
1685}
1686
1687impl cmp::Eq for Task {}
1688
#[cfg(test)]
mod test {
    use super::*;
    use crate::security;
    use crate::testing::*;
    use starnix_uapi::auth::{CAP_SYS_ADMIN, Capabilities};
    use starnix_uapi::resource_limits::Resource;
    use starnix_uapi::signals::SIGCHLD;
    use starnix_uapi::{CLONE_SIGHAND, CLONE_THREAD, CLONE_VM, rlimit};

    #[::fuchsia::test]
    async fn test_tid_allocation() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            // The initial task always receives tid 1; subsequently created
            // tasks get strictly larger ids.
            assert_eq!(current_task.get_tid(), 1);
            let second_task = create_task(locked, &kernel, "another-task");
            let second_tid = second_task.get_tid();
            assert!(second_tid >= 2);

            // Both tasks must be resolvable back through the pid table.
            let pid_table = kernel.pids.read();
            assert_eq!(pid_table.get_task(1).upgrade().unwrap().get_tid(), 1);
            assert_eq!(pid_table.get_task(second_tid).upgrade().unwrap().get_tid(), second_tid);
        })
        .await;
    }

    #[::fuchsia::test]
    async fn test_clone_pid_and_parent_pid() {
        spawn_kernel_and_run(async |locked, current_task| {
            // A CLONE_THREAD clone joins the caller's thread group: same pid
            // and leader, distinct tid.
            let sibling = current_task.clone_task_for_test(
                locked,
                (CLONE_THREAD | CLONE_VM | CLONE_SIGHAND) as u64,
                Some(SIGCHLD),
            );
            assert_eq!(current_task.get_pid(), sibling.get_pid());
            assert_ne!(current_task.get_tid(), sibling.get_tid());
            assert_eq!(current_task.thread_group().leader, sibling.thread_group().leader);

            // A plain fork-style clone gets its own pid/tid and records the
            // caller as its parent.
            let child = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
            assert_ne!(current_task.get_pid(), child.get_pid());
            assert_ne!(current_task.get_tid(), child.get_tid());
            assert_eq!(current_task.get_pid(), child.thread_group().read().get_ppid());
        })
        .await;
    }

    #[::fuchsia::test]
    async fn test_root_capabilities() {
        spawn_kernel_and_run(async |_, current_task| {
            // The init task starts with root-like capabilities but an empty
            // inheritable set.
            assert!(security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN));
            assert_eq!(current_task.real_creds().cap_inheritable, Capabilities::empty());

            // Dropping to an unprivileged uid/gid must shed CAP_SYS_ADMIN.
            current_task.set_creds(Credentials::with_ids(1, 1));
            assert!(!security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN));
        })
        .await;
    }

    #[::fuchsia::test]
    async fn test_is_spawned() {
        spawn_kernel_and_run(async |locked, current_task| {
            // The init task should be marked as spawned, because it is executing.
            assert!(current_task.is_spawned());

            // A cloned task should not be marked as spawned, because it has not yet been executed.
            let unspawned_child = current_task
                .clone_task(
                    locked,
                    0,
                    Some(SIGCHLD),
                    UserRef::default(),
                    UserRef::default(),
                    UserRef::default(),
                )
                .expect("failed to create task in test");
            assert!(!unspawned_child.is_spawned());
            unspawned_child.release(locked);

            // A cloned task for a test should be marked as spawned, because we intentionally avoid
            // spawning threads for test tasks but want them to behave as normal tasks.
            let test_clone = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
            assert!(test_clone.is_spawned());
        })
        .await;
    }

    #[::fuchsia::test]
    async fn test_clone_rlimit() {
        spawn_kernel_and_run(async |locked, current_task| {
            // Pick an FSIZE limit that differs from the default, install it,
            // and verify a clone inherits the new value.
            let initial_fsize = current_task.thread_group().get_rlimit(locked, Resource::FSIZE);
            assert_ne!(initial_fsize, 10);
            current_task
                .thread_group()
                .limits
                .lock(locked)
                .set(Resource::FSIZE, rlimit { rlim_cur: 10, rlim_max: 100 });
            let updated_fsize = current_task.thread_group().get_rlimit(locked, Resource::FSIZE);
            assert_eq!(updated_fsize, 10);

            let child = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
            let child_fsize = child.thread_group().get_rlimit(locked, Resource::FSIZE);
            assert_eq!(child_fsize, 10);
        })
        .await;
    }
}