starnix_core/task/
task.rs

// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use crate::mm::{MemoryAccessor, MemoryAccessorExt, MemoryManager, TaskMemoryAccessor};
use crate::mutable_state::{state_accessor, state_implementation};
use crate::security;
use crate::signals::{KernelSignal, RunState, SignalDetail, SignalInfo, SignalState};
use crate::task::memory_attribution::MemoryAttributionLifecycleEvent;
use crate::task::tracing::KoidPair;
use crate::task::{
    AbstractUnixSocketNamespace, AbstractVsockSocketNamespace, CurrentTask, EventHandler, Kernel,
    NormalPriority, PidTable, ProcessEntryRef, ProcessExitInfo, PtraceEvent, PtraceEventData,
    PtraceState, PtraceStatus, RealtimePriority, SchedulerState, SchedulingPolicy,
    SeccompFilterContainer, SeccompState, SeccompStateValue, ThreadGroup, ThreadGroupKey,
    ThreadState, UtsNamespaceHandle, WaitCanceler, Waiter, ZombieProcess,
};
use crate::vfs::{FdTable, FsContext, FsNodeHandle, FsString};
use bitflags::bitflags;
use fuchsia_rcu::rcu_option_arc::RcuOptionArc;
use macro_rules_attribute::apply;
use starnix_logging::{log_warn, set_zx_name};
use starnix_sync::{
    LockBefore, Locked, Mutex, MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, TaskRelease,
    TerminalLock,
};
use starnix_task_command::TaskCommand;
use starnix_types::arch::ArchWidth;
use starnix_types::ownership::{OwnedRef, Releasable, ReleaseGuard, TempRef, WeakRef};
use starnix_types::stats::TaskTimeStats;
use starnix_uapi::auth::{Credentials, FsCred};
use starnix_uapi::errors::Errno;
use starnix_uapi::signals::{SIGCHLD, SigSet, Signal, sigaltstack_contains_pointer};
use starnix_uapi::user_address::{
    ArchSpecific, MappingMultiArchUserRef, UserAddress, UserCString, UserRef,
};
use starnix_uapi::{
    CLD_CONTINUED, CLD_DUMPED, CLD_EXITED, CLD_KILLED, CLD_STOPPED, CLD_TRAPPED,
    FUTEX_BITSET_MATCH_ANY, errno, error, from_status_like_fdio, pid_t, sigaction_t, sigaltstack,
    tid_t, uapi,
};
use std::collections::VecDeque;
use std::mem::MaybeUninit;
use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
use std::sync::{Arc, Weak};
use std::{cmp, fmt};
use zx::{
    AsHandleRef, Signals, Task as _, {self as zx},
};

#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ExitStatus {
    Exit(u8),
    Kill(SignalInfo),
    CoreDump(SignalInfo),
    // The second field for Stop and Continue contains the type of ptrace stop
    // event that made it stop / continue, if applicable (PTRACE_EVENT_STOP,
    // PTRACE_EVENT_FORK, etc)
    Stop(SignalInfo, PtraceEvent),
    Continue(SignalInfo, PtraceEvent),
}
impl ExitStatus {
    /// Converts the given exit status to a status code suitable for returning from wait syscalls.
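    ///
    /// A sketch of the resulting encoding (mirrors the Linux wait-status
    /// layout; illustrative only):
    ///
    /// ```ignore
    /// // WEXITSTATUS occupies bits 8..16, so Exit(1) encodes as 0x100.
    /// assert_eq!(ExitStatus::Exit(1).wait_status(), 1 << 8);
    /// ```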
    pub fn wait_status(&self) -> i32 {
        match self {
            ExitStatus::Exit(status) => (*status as i32) << 8,
            ExitStatus::Kill(siginfo) => siginfo.signal.number() as i32,
            ExitStatus::CoreDump(siginfo) => (siginfo.signal.number() as i32) | 0x80,
            ExitStatus::Continue(siginfo, trace_event) => {
                let trace_event_val = *trace_event as u32;
                if trace_event_val != 0 {
                    (siginfo.signal.number() as i32) | (trace_event_val << 16) as i32
                } else {
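                    // 0xffff is the standard WIFCONTINUED wait-status encoding.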
                    0xffff
                }
            }
            ExitStatus::Stop(siginfo, trace_event) => {
                let trace_event_val = *trace_event as u32;
                (0x7f + ((siginfo.signal.number() as i32) << 8)) | (trace_event_val << 16) as i32
            }
        }
    }

    pub fn signal_info_code(&self) -> i32 {
        match self {
            ExitStatus::Exit(_) => CLD_EXITED as i32,
            ExitStatus::Kill(_) => CLD_KILLED as i32,
            ExitStatus::CoreDump(_) => CLD_DUMPED as i32,
            ExitStatus::Stop(_, _) => CLD_STOPPED as i32,
            ExitStatus::Continue(_, _) => CLD_CONTINUED as i32,
        }
    }

    pub fn signal_info_status(&self) -> i32 {
        match self {
            ExitStatus::Exit(status) => *status as i32,
            ExitStatus::Kill(siginfo)
            | ExitStatus::CoreDump(siginfo)
            | ExitStatus::Continue(siginfo, _)
            | ExitStatus::Stop(siginfo, _) => siginfo.signal.number() as i32,
        }
    }
}

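/// A `StopState` that can be read and written atomically, allowing the task's
/// stop state to be inspected without taking the task's mutable-state lock.
///
/// A minimal usage sketch (illustrative only):
///
/// ```ignore
/// let state = AtomicStopState::new(StopState::Awake);
/// state.store(StopState::GroupStopping, Ordering::Relaxed);
/// assert!(state.load(Ordering::Relaxed).is_stopping());
/// ```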
pub struct AtomicStopState {
    inner: AtomicU8,
}

impl AtomicStopState {
    pub fn new(state: StopState) -> Self {
        Self { inner: AtomicU8::new(state as u8) }
    }

    pub fn load(&self, ordering: Ordering) -> StopState {
        let v = self.inner.load(ordering);
        // SAFETY: we only ever store to the atomic a value originating
        // from a valid `StopState`.
        unsafe { std::mem::transmute(v) }
    }

    pub fn store(&self, state: StopState, ordering: Ordering) {
        self.inner.store(state as u8, ordering)
    }
}

/// This enum describes the state that a task or thread group can be in when being stopped.
/// The names are taken from ptrace(2).
#[derive(Clone, Copy, Debug, PartialEq)]
#[repr(u8)]
pub enum StopState {
    /// In this state, the process has been told to wake up, but has not yet been woken.
    /// Individual threads may still be stopped.
    Waking,
    /// In this state, at least one thread is awake.
    Awake,
    /// Same as the above, but you are not allowed to make further transitions.  Used
    /// to kill the task / group.  These names are not in ptrace(2).
    ForceWaking,
    ForceAwake,

    /// In this state, the process has been told to stop via a signal, but has not yet stopped.
    GroupStopping,
    /// In this state, at least one thread of the process has stopped.
    GroupStopped,
    /// In this state, the task has received a signal, and it is being traced, so it will
    /// stop at the next opportunity.
    SignalDeliveryStopping,
    /// Same as the last one, but has stopped.
    SignalDeliveryStopped,
    /// Stop for a ptrace event: a variety of events defined by ptrace and
    /// enabled with the use of various ptrace features, such as the
    /// PTRACE_O_TRACE_* options.  The parameter indicates the type of
    /// event. Examples include PTRACE_EVENT_FORK (the event is a fork),
    /// PTRACE_EVENT_EXEC (the event is exec), and other similar events.
    PtraceEventStopping,
    /// Same as the last one, but has stopped.
    PtraceEventStopped,
    /// In this state, we have stopped before executing a syscall.
    SyscallEnterStopping,
    SyscallEnterStopped,
    /// In this state, we have stopped after executing a syscall.
    SyscallExitStopping,
    SyscallExitStopped,
}

impl StopState {
    /// This means a stop is either in progress or we've stopped.
    pub fn is_stopping_or_stopped(&self) -> bool {
        self.is_stopped() || self.is_stopping()
    }

    /// This means a stop is in progress.  Refers to any stop state ending in "ing".
    pub fn is_stopping(&self) -> bool {
        match *self {
            StopState::GroupStopping
            | StopState::SignalDeliveryStopping
            | StopState::PtraceEventStopping
            | StopState::SyscallEnterStopping
            | StopState::SyscallExitStopping => true,
            _ => false,
        }
    }

    /// This means the task is stopped.
    pub fn is_stopped(&self) -> bool {
        match *self {
            StopState::GroupStopped
            | StopState::SignalDeliveryStopped
            | StopState::PtraceEventStopped
            | StopState::SyscallEnterStopped
            | StopState::SyscallExitStopped => true,
            _ => false,
        }
    }

    /// Returns the "ed" version of this StopState, if it is "ing".
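    ///
    /// For example (illustrative only):
    ///
    /// ```ignore
    /// assert_eq!(StopState::GroupStopping.finalize(), Ok(StopState::GroupStopped));
    /// assert_eq!(StopState::Awake.finalize(), Err(()));
    /// ```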
    pub fn finalize(&self) -> Result<StopState, ()> {
        match *self {
            StopState::GroupStopping => Ok(StopState::GroupStopped),
            StopState::SignalDeliveryStopping => Ok(StopState::SignalDeliveryStopped),
            StopState::PtraceEventStopping => Ok(StopState::PtraceEventStopped),
            StopState::Waking => Ok(StopState::Awake),
            StopState::ForceWaking => Ok(StopState::ForceAwake),
            StopState::SyscallEnterStopping => Ok(StopState::SyscallEnterStopped),
            StopState::SyscallExitStopping => Ok(StopState::SyscallExitStopped),
            _ => Err(()),
        }
    }

    pub fn is_downgrade(&self, new_state: &StopState) -> bool {
        match *self {
            StopState::GroupStopped => *new_state == StopState::GroupStopping,
            StopState::SignalDeliveryStopped => *new_state == StopState::SignalDeliveryStopping,
            StopState::PtraceEventStopped => *new_state == StopState::PtraceEventStopping,
            StopState::SyscallEnterStopped => *new_state == StopState::SyscallEnterStopping,
            StopState::SyscallExitStopped => *new_state == StopState::SyscallExitStopping,
            StopState::Awake => *new_state == StopState::Waking,
            _ => false,
        }
    }

    pub fn is_waking_or_awake(&self) -> bool {
        *self == StopState::Waking
            || *self == StopState::Awake
            || *self == StopState::ForceWaking
            || *self == StopState::ForceAwake
    }

    /// Indicates whether the transition to the stopped / awake state is not yet finished.
    /// This function is typically used to determine when it is time to notify waiters.
    pub fn is_in_progress(&self) -> bool {
        *self == StopState::Waking
            || *self == StopState::ForceWaking
            || *self == StopState::GroupStopping
            || *self == StopState::SignalDeliveryStopping
            || *self == StopState::PtraceEventStopping
            || *self == StopState::SyscallEnterStopping
            || *self == StopState::SyscallExitStopping
    }

    pub fn ptrace_only(&self) -> bool {
        !self.is_waking_or_awake()
            && *self != StopState::GroupStopped
            && *self != StopState::GroupStopping
    }

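    /// Returns true for transitions that would corrupt the stop-state machine:
    /// leaving `ForceAwake`, leaving `ForceWaking` for anything other than
    /// `ForceAwake`, re-entering the current state, or downgrades (except
    /// `Awake` back to `Waking`, which a SIGCONT can cause). A sketch
    /// (illustrative only):
    ///
    /// ```ignore
    /// assert!(StopState::ForceAwake.is_illegal_transition(StopState::Waking));
    /// assert!(!StopState::Awake.is_illegal_transition(StopState::GroupStopping));
    /// ```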
    pub fn is_illegal_transition(&self, new_state: StopState) -> bool {
        *self == StopState::ForceAwake
            || (*self == StopState::ForceWaking && new_state != StopState::ForceAwake)
            || new_state == *self
            // Downgrades are generally a sign that something is screwed up, but
            // a SIGCONT can result in a downgrade from Awake to Waking, so we
            // allowlist it.
            || (self.is_downgrade(&new_state) && *self != StopState::Awake)
    }

    pub fn is_force(&self) -> bool {
        *self == StopState::ForceAwake || *self == StopState::ForceWaking
    }

    pub fn as_in_progress(&self) -> Result<StopState, ()> {
        match *self {
            StopState::GroupStopped => Ok(StopState::GroupStopping),
            StopState::SignalDeliveryStopped => Ok(StopState::SignalDeliveryStopping),
            StopState::PtraceEventStopped => Ok(StopState::PtraceEventStopping),
            StopState::Awake => Ok(StopState::Waking),
            StopState::ForceAwake => Ok(StopState::ForceWaking),
            StopState::SyscallEnterStopped => Ok(StopState::SyscallEnterStopping),
            StopState::SyscallExitStopped => Ok(StopState::SyscallExitStopping),
            _ => Ok(*self),
        }
    }
}

bitflags! {
    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct TaskFlags: u8 {
        const EXITED = 0x1;
        const SIGNALS_AVAILABLE = 0x2;
        const TEMPORARY_SIGNAL_MASK = 0x4;
        /// Whether the executor should dump the stack of this task when it exits.
        /// Currently used to implement ExitStatus::CoreDump.
        const DUMP_ON_EXIT = 0x8;
    }
}

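/// `TaskFlags` stored in an atomic, so the flags can be read without taking
/// the task's mutable-state lock.
///
/// A minimal usage sketch (illustrative only; these methods are private):
///
/// ```ignore
/// let flags = AtomicTaskFlags::new(TaskFlags::empty());
/// let previous = flags.swap(TaskFlags::EXITED, Ordering::Relaxed);
/// assert_eq!(previous, TaskFlags::empty());
/// ```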
pub struct AtomicTaskFlags {
    flags: AtomicU8,
}

impl AtomicTaskFlags {
    fn new(flags: TaskFlags) -> Self {
        Self { flags: AtomicU8::new(flags.bits()) }
    }

    fn load(&self, ordering: Ordering) -> TaskFlags {
        let flags = self.flags.load(ordering);
        // We only ever store values from a `TaskFlags`.
        TaskFlags::from_bits_retain(flags)
    }

    fn swap(&self, flags: TaskFlags, ordering: Ordering) -> TaskFlags {
        let flags = self.flags.swap(flags.bits(), ordering);
        // We only ever store values from a `TaskFlags`.
        TaskFlags::from_bits_retain(flags)
    }
}

/// This contains thread state that tracers can inspect and modify.  It is
/// captured when a thread stops, and optionally copied back (if dirty) when a
/// thread starts again.  An alternative implementation would involve the
/// tracers acting on thread state directly; however, this would involve sharing
/// CurrentTask structures across multiple threads, which goes against the
/// intent of the design of CurrentTask.
pub struct CapturedThreadState {
    /// The thread state of the traced task.  This is copied out when the thread
    /// stops.
    pub thread_state: ThreadState,

    /// Indicates that the last ptrace operation changed the thread state, so it
    /// should be written back to the original thread.
    pub dirty: bool,
}

impl ArchSpecific for CapturedThreadState {
    fn is_arch32(&self) -> bool {
        self.thread_state.is_arch32()
    }
}

#[derive(Debug)]
pub struct RobustList {
    pub next: RobustListPtr,
}

pub type RobustListPtr =
    MappingMultiArchUserRef<RobustList, uapi::robust_list, uapi::arch32::robust_list>;

impl From<uapi::robust_list> for RobustList {
    fn from(robust_list: uapi::robust_list) -> Self {
        Self { next: RobustListPtr::from(robust_list.next) }
    }
}

#[cfg(target_arch = "aarch64")]
impl From<uapi::arch32::robust_list> for RobustList {
    fn from(robust_list: uapi::arch32::robust_list) -> Self {
        Self { next: RobustListPtr::from(robust_list.next) }
    }
}

#[derive(Debug)]
pub struct RobustListHead {
    pub list: RobustList,
    pub futex_offset: isize,
}

pub type RobustListHeadPtr =
    MappingMultiArchUserRef<RobustListHead, uapi::robust_list_head, uapi::arch32::robust_list_head>;

impl From<uapi::robust_list_head> for RobustListHead {
    fn from(robust_list_head: uapi::robust_list_head) -> Self {
        Self {
            list: robust_list_head.list.into(),
            futex_offset: robust_list_head.futex_offset as isize,
        }
    }
}

#[cfg(target_arch = "aarch64")]
impl From<uapi::arch32::robust_list_head> for RobustListHead {
    fn from(robust_list_head: uapi::arch32::robust_list_head) -> Self {
        Self {
            list: robust_list_head.list.into(),
            futex_offset: robust_list_head.futex_offset as isize,
        }
    }
}

pub struct TaskMutableState {
    // See https://man7.org/linux/man-pages/man2/set_tid_address.2.html
    pub clear_child_tid: UserRef<tid_t>,

    /// Signal handler related state. This is grouped together for when atomicity is needed during
    /// signal sending and delivery.
    signals: SignalState,

    /// Internal signals that have a higher priority than a regular signal.
    ///
    /// Storing them in a separate queue outside of `SignalState` ensures that internal signals
    /// will never be ignored or masked when dequeuing. Their higher priority ensures that no
    /// user signal can jump the queue, e.g. during ptrace, which delays delivery.
    ///
    /// This design is not about observable consequences, but about convenience of implementation.
    kernel_signals: VecDeque<KernelSignal>,

    /// The exit status that this task exited with.
    exit_status: Option<ExitStatus>,

    /// Desired scheduler state for the task.
    pub scheduler_state: SchedulerState,

    /// The UTS namespace assigned to this thread.
    ///
    /// This field is kept in the mutable state because the UTS namespace of a thread
    /// can be forked using `clone()` or `unshare()` syscalls.
    ///
    /// We use UtsNamespaceHandle because the UTS properties can be modified
    /// by any other thread that shares this namespace.
    pub uts_ns: UtsNamespaceHandle,

    /// Bit that determines whether a newly started program can have privileges its parent does
    /// not have.  See Documentation/prctl/no_new_privs.txt in the Linux kernel for details.
    /// Note that Starnix does not currently implement the relevant privileges (e.g.,
    /// setuid/setgid binaries).  So, you can set this, but it does nothing other than get
    /// propagated to children.
    ///
    /// The documentation indicates that this can only ever be set to
    /// true, and it cannot be reverted to false.  Accessor methods
    /// for this field ensure this property.
    no_new_privs: bool,

    /// Userspace hint about how to adjust the OOM score for this process.
    pub oom_score_adj: i32,

    /// List of currently installed seccomp filters.
    pub seccomp_filters: SeccompFilterContainer,

    /// A pointer to the head of the robust futex list of this thread in
    /// userspace. See get_robust_list(2).
    pub robust_list_head: RobustListHeadPtr,

    /// The timer slack used to group timer expirations for the calling thread.
    ///
    /// Timers may expire up to `timerslack_ns` late, but never early.
    ///
    /// If this value is 0, the task's default timerslack is used.
    pub timerslack_ns: u64,

    /// The default value for `timerslack_ns`. This value cannot change during the lifetime of a
    /// task.
    ///
    /// This value is set to the `timerslack_ns` of the creating thread, and thus is not constant
    /// across tasks.
    pub default_timerslack_ns: u64,

    /// Information that a tracer needs to communicate with this process, if it
    /// is being traced.
    pub ptrace: Option<Box<PtraceState>>,

    /// Information that a tracer needs to inspect this process.
    pub captured_thread_state: Option<Box<CapturedThreadState>>,
}

impl TaskMutableState {
    pub fn no_new_privs(&self) -> bool {
        self.no_new_privs
    }

    /// Sets the value of no_new_privs to true.  It is an error to set
    /// it to anything else.
    pub fn enable_no_new_privs(&mut self) {
        self.no_new_privs = true;
    }

    pub fn get_timerslack<T: zx::Timeline>(&self) -> zx::Duration<T> {
        zx::Duration::from_nanos(self.timerslack_ns as i64)
    }

    /// Sets the current timerslack of the task to `ns`.
    ///
    /// If `ns` is zero, the current timerslack gets reset to the task's default timerslack.
    pub fn set_timerslack_ns(&mut self, ns: u64) {
        if ns == 0 {
            self.timerslack_ns = self.default_timerslack_ns;
        } else {
            self.timerslack_ns = ns;
        }
    }

    pub fn is_ptraced(&self) -> bool {
        self.ptrace.is_some()
    }

    pub fn is_ptrace_listening(&self) -> bool {
        self.ptrace.as_ref().is_some_and(|ptrace| ptrace.stop_status == PtraceStatus::Listening)
    }

    pub fn ptrace_on_signal_consume(&mut self) -> bool {
        self.ptrace.as_mut().is_some_and(|ptrace: &mut Box<PtraceState>| {
            if ptrace.stop_status.is_continuing() {
                ptrace.stop_status = PtraceStatus::Default;
                false
            } else {
                true
            }
        })
    }

    pub fn notify_ptracers(&mut self) {
        if let Some(ptrace) = &self.ptrace {
            ptrace.tracer_waiters().notify_all();
        }
    }

    pub fn wait_on_ptracer(&self, waiter: &Waiter) {
        if let Some(ptrace) = &self.ptrace {
            ptrace.tracee_waiters.wait_async(&waiter);
        }
    }

    pub fn notify_ptracees(&mut self) {
        if let Some(ptrace) = &self.ptrace {
            ptrace.tracee_waiters.notify_all();
        }
    }

    pub fn take_captured_state(&mut self) -> Option<Box<CapturedThreadState>> {
        self.captured_thread_state.take()
    }

    pub fn copy_state_from(&mut self, current_task: &CurrentTask) {
        self.captured_thread_state = Some(Box::new(CapturedThreadState {
            thread_state: current_task.thread_state.extended_snapshot(),
            dirty: false,
        }));
    }

    /// Returns the task's currently active signal mask.
    pub fn signal_mask(&self) -> SigSet {
        self.signals.mask()
    }

    /// Returns true if `signal` is currently blocked by this task's signal mask.
    pub fn is_signal_masked(&self, signal: Signal) -> bool {
        self.signals.mask().has_signal(signal)
    }

    /// Returns true if `signal` is blocked by the saved signal mask.
    ///
    /// Note that the current signal mask may still not be blocking the signal.
    pub fn is_signal_masked_by_saved_mask(&self, signal: Signal) -> bool {
        self.signals.saved_mask().is_some_and(|mask| mask.has_signal(signal))
    }

    /// Enqueues an internal signal at the back of the task's kernel signal queue.
    pub fn enqueue_kernel_signal(&mut self, signal: KernelSignal) {
        self.kernel_signals.push_back(signal);
    }

    /// Enqueues a signal at the back of the task's signal queue.
    pub fn enqueue_signal(&mut self, signal: SignalInfo) {
        self.signals.enqueue(signal);
    }

    /// Enqueues the signal, allowing the signal to skip straight to the front of the task's queue.
    ///
    /// `enqueue_signal` is the more common API to use.
    ///
    /// Note that this will not guarantee that the signal is dequeued before any process-directed
    /// signals.
    pub fn enqueue_signal_front(&mut self, signal: SignalInfo) {
        self.signals.enqueue(signal);
    }

    /// Sets the current signal mask of the task.
    pub fn set_signal_mask(&mut self, mask: SigSet) {
        self.signals.set_mask(mask);
    }

    /// Sets a temporary signal mask for the task.
    ///
    /// This mask should be removed by a matching call to `restore_signal_mask`.
    pub fn set_temporary_signal_mask(&mut self, mask: SigSet) {
        self.signals.set_temporary_mask(mask);
    }

    /// Removes the currently active, temporary, signal mask and restores the
    /// previously active signal mask.
    pub fn restore_signal_mask(&mut self) {
        self.signals.restore_mask();
    }

    /// Returns true if the task's current `RunState` is blocked.
    pub fn is_blocked(&self) -> bool {
        self.signals.run_state.is_blocked()
    }

    /// Sets the task's `RunState` to `run_state`.
    pub fn set_run_state(&mut self, run_state: RunState) {
        self.signals.run_state = run_state;
    }

    pub fn run_state(&self) -> RunState {
        self.signals.run_state.clone()
    }

    pub fn on_signal_stack(&self, stack_pointer_register: u64) -> bool {
        self.signals
            .alt_stack
            .map(|signal_stack| sigaltstack_contains_pointer(&signal_stack, stack_pointer_register))
            .unwrap_or(false)
    }

    pub fn set_sigaltstack(&mut self, stack: Option<sigaltstack>) {
        self.signals.alt_stack = stack;
    }

    pub fn sigaltstack(&self) -> Option<sigaltstack> {
        self.signals.alt_stack
    }

    pub fn wait_on_signal(&mut self, waiter: &Waiter) {
        self.signals.signal_wait.wait_async(waiter);
    }

    pub fn signals_mut(&mut self) -> &mut SignalState {
        &mut self.signals
    }

    pub fn wait_on_signal_fd_events(
        &self,
        waiter: &Waiter,
        mask: SigSet,
        handler: EventHandler,
    ) -> WaitCanceler {
        self.signals.signal_wait.wait_async_signal_mask(waiter, mask, handler)
    }

    pub fn notify_signal_waiters(&self, signal: &Signal) {
        self.signals.signal_wait.notify_signal(signal);
    }

    /// Thaw the task if it has been frozen.
    pub fn thaw(&mut self) {
        if let RunState::Frozen(waiter) = self.run_state() {
            waiter.notify();
        }
    }

    pub fn is_frozen(&self) -> bool {
        matches!(self.run_state(), RunState::Frozen(_))
    }

    #[cfg(test)]
    pub fn kernel_signals_for_test(&self) -> &VecDeque<KernelSignal> {
        &self.kernel_signals
    }
}

#[apply(state_implementation!)]
impl TaskMutableState<Base = Task> {
    pub fn set_stopped(
        &mut self,
        stopped: StopState,
        siginfo: Option<SignalInfo>,
        current_task: Option<&CurrentTask>,
        event: Option<PtraceEventData>,
    ) {
        if stopped.ptrace_only() && self.ptrace.is_none() {
            return;
        }

        if self.base.load_stopped().is_illegal_transition(stopped) {
            return;
        }

        // TODO(https://g-issues.fuchsia.dev/issues/306438676): When task can be
        // stopped inside user code, task will need to be either restarted or
        // stopped here.
        self.store_stopped(stopped);
        if stopped.is_stopped() {
            if let Some(ref current_task) = current_task {
                self.copy_state_from(current_task);
            }
        }
        if let Some(ptrace) = &mut self.ptrace {
            ptrace.set_last_signal(siginfo);
            ptrace.set_last_event(event);
        }
        if stopped == StopState::Waking || stopped == StopState::ForceWaking {
            self.notify_ptracees();
        }
        if !stopped.is_in_progress() {
            self.notify_ptracers();
        }
    }

    /// Prepares a SignalInfo to be sent to the tracer, if any.
    pub fn prepare_signal_info(
        &mut self,
        stopped: StopState,
    ) -> Option<(Weak<ThreadGroup>, SignalInfo)> {
        if !stopped.is_stopped() {
            return None;
        }

        if let Some(ptrace) = &self.ptrace {
            if let Some(last_signal) = ptrace.get_last_signal_ref() {
                let signal_info = SignalInfo::new(
                    SIGCHLD,
                    CLD_TRAPPED as i32,
                    SignalDetail::SIGCHLD {
                        pid: self.base.tid,
                        uid: self.base.real_creds().uid,
                        status: last_signal.signal.number() as i32,
                    },
                );

                return Some((ptrace.core_state.thread_group.clone(), signal_info));
            }
        }

        None
    }

    pub fn set_ptrace(&mut self, tracer: Option<Box<PtraceState>>) -> Result<(), Errno> {
        if tracer.is_some() && self.ptrace.is_some() {
            return error!(EPERM);
        }

        if tracer.is_none() {
            // Handle the case where this is called while the thread group is being released.
            if let Ok(tg_stop_state) = self.base.thread_group().load_stopped().as_in_progress() {
                self.set_stopped(tg_stop_state, None, None, None);
            }
        }
        self.ptrace = tracer;
        Ok(())
    }

    pub fn can_accept_ptrace_commands(&mut self) -> bool {
        !self.base.load_stopped().is_waking_or_awake()
            && self.is_ptraced()
            && !self.is_ptrace_listening()
    }

    fn store_stopped(&mut self, state: StopState) {
        // We don't actually use the guard but we require it to enforce that the
        // caller holds the thread group's mutable state lock (identified by
        // mutable access to the thread group's mutable state).

        self.base.stop_state.store(state, Ordering::Relaxed)
    }

    pub fn update_flags(&mut self, clear: TaskFlags, set: TaskFlags) {
        // We don't actually use the guard but we require it to enforce that the
        // caller holds the task's mutable state lock (identified by mutable
        // access to the task's mutable state).

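        // `clear` and `set` must be disjoint: XOR equals OR exactly when no
        // bit appears in both masks.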
        debug_assert_eq!(clear ^ set, clear | set);
        let observed = self.base.flags();
        let swapped = self.base.flags.swap((observed | set) & !clear, Ordering::Relaxed);
        debug_assert_eq!(swapped, observed);
    }

    pub fn set_flags(&mut self, flag: TaskFlags, v: bool) {
        let (clear, set) = if v { (TaskFlags::empty(), flag) } else { (flag, TaskFlags::empty()) };

        self.update_flags(clear, set);
    }

    pub fn set_exit_status(&mut self, status: ExitStatus) {
        self.set_flags(TaskFlags::EXITED, true);
        self.exit_status = Some(status);
    }

    pub fn set_exit_status_if_not_already(&mut self, status: ExitStatus) {
        self.set_flags(TaskFlags::EXITED, true);
        self.exit_status.get_or_insert(status);
    }

    /// Returns the number of pending signals for this task, without considering the signal mask.
    pub fn pending_signal_count(&self) -> usize {
        self.signals.num_queued() + self.base.thread_group().pending_signals.lock().num_queued()
    }

    /// Returns `true` if `signal` is pending for this task, without considering the signal mask.
    pub fn has_signal_pending(&self, signal: Signal) -> bool {
        self.signals.has_queued(signal)
            || self.base.thread_group().pending_signals.lock().has_queued(signal)
    }

    /// The set of pending signals for the task, including the signals pending for the thread
    /// group.
    pub fn pending_signals(&self) -> SigSet {
        self.signals.pending() | self.base.thread_group().pending_signals.lock().pending()
    }

    /// The set of pending signals for the task specifically, not including the signals pending
    /// for the thread group.
    pub fn task_specific_pending_signals(&self) -> SigSet {
        self.signals.pending()
    }

    /// Returns true if any currently pending signal is allowed by `mask`.
    pub fn is_any_signal_allowed_by_mask(&self, mask: SigSet) -> bool {
        self.signals.is_any_allowed_by_mask(mask)
            || self.base.thread_group().pending_signals.lock().is_any_allowed_by_mask(mask)
    }

    /// Returns whether or not a signal is pending for this task, taking the current
    /// signal mask into account.
    pub fn is_any_signal_pending(&self) -> bool {
        let mask = self.signal_mask();
        self.signals.is_any_pending()
            || self.base.thread_group().pending_signals.lock().is_any_allowed_by_mask(mask)
    }

    /// Returns the next pending signal that passes `predicate`.
    fn take_next_signal_where<F>(&mut self, predicate: F) -> Option<SignalInfo>
    where
        F: Fn(&SignalInfo) -> bool,
    {
        if let Some(signal) =
            self.base.thread_group().pending_signals.lock().take_next_where(&predicate)
        {
            Some(signal)
        } else {
            self.signals.take_next_where(&predicate)
        }
    }

    /// Removes and returns the next pending instance of `siginfo.signal` for this task.
    ///
    /// Returns `None` if the signal is blocked, or no such signal is pending.
    pub fn take_specific_signal(&mut self, siginfo: SignalInfo) -> Option<SignalInfo> {
        let signal_mask = self.signal_mask();
        if signal_mask.has_signal(siginfo.signal) {
            return None;
        }

        let predicate = |s: &SignalInfo| s.signal == siginfo.signal;
        self.take_next_signal_where(predicate)
    }

    /// Removes and returns a pending signal that is unblocked by the current signal mask.
    ///
    /// Returns `None` if there are no unblocked signals pending.
    pub fn take_any_signal(&mut self) -> Option<SignalInfo> {
        self.take_signal_with_mask(self.signal_mask())
    }

    /// Removes and returns a pending signal that is unblocked by `signal_mask`.
    ///
    /// Returns `None` if there are no signals pending that are unblocked by `signal_mask`.
    pub fn take_signal_with_mask(&mut self, signal_mask: SigSet) -> Option<SignalInfo> {
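        // Forced signals are delivered even when the mask would block them.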
        let predicate = |s: &SignalInfo| !signal_mask.has_signal(s.signal) || s.force;
        self.take_next_signal_where(predicate)
    }

    /// Removes and returns a pending internal signal.
    ///
    /// Returns `None` if there are no signals pending.
    pub fn take_kernel_signal(&mut self) -> Option<KernelSignal> {
        self.kernel_signals.pop_front()
    }

    #[cfg(test)]
    pub fn queued_signal_count(&self, signal: Signal) -> usize {
        self.signals.queued_count(signal)
            + self.base.thread_group().pending_signals.lock().queued_count(signal)
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TaskStateCode {
    // Task is being executed.
    Running,

    // Task is waiting for an event.
    Sleeping,

    // Tracing stop.
    TracingStop,

    // Task has exited.
    Zombie,
}

impl TaskStateCode {
    pub fn code_char(&self) -> char {
        match self {
            TaskStateCode::Running => 'R',
            TaskStateCode::Sleeping => 'S',
            TaskStateCode::TracingStop => 't',
            TaskStateCode::Zombie => 'Z',
        }
    }

    pub fn name(&self) -> &'static str {
        match self {
            TaskStateCode::Running => "running",
            TaskStateCode::Sleeping => "sleeping",
            TaskStateCode::TracingStop => "tracing stop",
            TaskStateCode::Zombie => "zombie",
        }
    }
}

/// The information about the task that needs to be available to the `ThreadGroup` while computing
/// which process a wait can target. It is necessary to share this data with the `ThreadGroup` so
/// that it remains available while the task is being dropped, when it can no longer be reached
/// through a weak pointer.
#[derive(Debug)]
pub struct TaskPersistentInfoState {
    /// Immutable information about the task.
    tid: tid_t,
    thread_group_key: ThreadGroupKey,

    /// The command of this task.
    command: Mutex<TaskCommand>,

    /// The security credentials for this task. These are only set when the task is the CurrentTask,
    /// or on task creation.
    creds: RwLock<Credentials>,
}

impl TaskPersistentInfoState {
    fn new(
        tid: tid_t,
        thread_group_key: ThreadGroupKey,
        command: TaskCommand,
        creds: Credentials,
    ) -> TaskPersistentInfo {
        Arc::new(Self {
            tid,
            thread_group_key,
            command: Mutex::new(command),
            creds: RwLock::new(creds),
        })
    }

    pub fn tid(&self) -> tid_t {
        self.tid
    }

    pub fn pid(&self) -> pid_t {
        self.thread_group_key.pid()
    }

    pub fn command_guard(&self) -> MutexGuard<'_, TaskCommand> {
        self.command.lock()
    }

    pub fn real_creds(&self) -> RwLockReadGuard<'_, Credentials> {
        self.creds.read()
    }

    /// SAFETY: Only use from CurrentTask. Changing credentials outside of the CurrentTask may
    /// introduce TOCTOU issues in access checks.
    pub(in crate::task) unsafe fn creds_mut(&self) -> RwLockWriteGuard<'_, Credentials> {
        self.creds.write()
    }
}

pub type TaskPersistentInfo = Arc<TaskPersistentInfoState>;

/// A unit of execution.
///
/// A task is the primary unit of execution in the Starnix kernel. Most tasks are *user* tasks,
/// which have an associated Zircon thread. The Zircon thread switches between restricted mode,
/// in which the thread runs userspace code, and normal mode, in which the thread runs Starnix
/// code.
///
/// Tasks track the resources used by userspace by referencing various objects, such as an
/// `FdTable`, a `MemoryManager`, and an `FsContext`. Many tasks can share references to these
/// objects. In principle, which objects are shared between which tasks can be largely arbitrary,
/// but there are common patterns of sharing. For example, tasks created with `pthread_create`
/// will share the `FdTable`, `MemoryManager`, and `FsContext` and are often called "threads" by
/// userspace programmers. Tasks created by `posix_spawn` do not share these objects and are often
/// called "processes" by userspace programmers. However, inside the kernel, there is no clear
/// definition of a "thread" or a "process".
///
/// During boot, the kernel creates the first task, often called `init`. The vast majority of other
/// tasks are created as transitive clones (e.g., using `clone(2)`) of that task. Sometimes, the
/// kernel will create new tasks from whole cloth, either with a corresponding userspace component
/// or to represent some background work inside the kernel.
///
/// See also `CurrentTask`, which represents the task corresponding to the thread that is currently
/// executing.
pub struct Task {
    /// Weak reference to the `OwnedRef` of this `Task`. This allows retrieving the
    /// `TempRef` from a raw `Task`.
    pub weak_self: WeakRef<Self>,

    /// A unique identifier for this task.
    ///
    /// This value can be read in userspace using `gettid(2)`. In general, this value
    /// is different from the value returned by `getpid(2)`, which returns the `id` of the leader
    /// of the `thread_group`.
    pub tid: tid_t,

    /// The process key of this task.
    pub thread_group_key: ThreadGroupKey,

    /// The kernel to which this task belongs.
    pub kernel: Arc<Kernel>,

    /// The thread group to which this task belongs.
    ///
    /// The group of tasks in a thread group roughly corresponds to the userspace notion of a
    /// process.
    pub thread_group: Arc<ThreadGroup>,

    /// A handle to the underlying Zircon thread object.
    ///
    /// Some tasks lack an underlying Zircon thread. These tasks are used internally by the
    /// Starnix kernel to track background work, typically on a `kthread`.
    pub thread: RwLock<Option<Arc<zx::Thread>>>,

    /// The file descriptor table for this task.
    ///
    /// This table can be shared by many tasks.
    pub files: FdTable,

    /// The memory manager for this task.  This is `None` only for system tasks.
    pub mm: RcuOptionArc<MemoryManager>,

    /// The file system for this task.
    fs: RcuOptionArc<FsContext>,

    /// The namespace for abstract AF_UNIX sockets for this task.
    pub abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,

    /// The namespace for AF_VSOCK for this task.
    pub abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,

    /// The stop state of the task, distinct from the stop state of the thread group.
    ///
    /// Must only be set when the `mutable_state` write lock is held.
    stop_state: AtomicStopState,

    /// The flags for the task.
    ///
    /// Must only be set when the `mutable_state` write lock is held.
    flags: AtomicTaskFlags,

    /// The mutable state of the Task.
    mutable_state: RwLock<TaskMutableState>,

    /// The information of the task that needs to be available to the `ThreadGroup` while computing
    /// which process a wait can target.
    /// Contains the command line, the task credentials and the exit signal.
    /// See `TaskPersistentInfo` for more information.
    pub persistent_info: TaskPersistentInfo,

    /// For vfork and clone() with CLONE_VFORK, this is signaled when the task exits or calls
    /// execve(). It allows the calling task to block until the fork has been completed. Only
    /// populated when created with the CLONE_VFORK flag.
    vfork_event: Option<Arc<zx::Event>>,

    /// Tells whether there are currently installed seccomp filters, without requiring
    /// a lock.
    pub seccomp_filter_state: SeccompState,

    /// Tells whether syscall entry / exit is being traced, without requiring a lock.
    pub trace_syscalls: AtomicBool,

    // The pid directory, so it doesn't have to be generated and thrown away on every access.
    // See https://fxbug.dev/291962828 for details.
    pub proc_pid_directory_cache: Mutex<Option<FsNodeHandle>>,

    /// The Linux Security Modules state for this thread group. This should be the last member of
    /// this struct.
    pub security_state: security::TaskState,
}

/// The decoded cross-platform parts we care about for page fault exception reports.
#[derive(Debug)]
pub struct PageFaultExceptionReport {
    pub faulting_address: u64,
    pub not_present: bool, // Set when the page fault was due to a not-present page.
    pub is_write: bool,    // Set when the triggering memory operation was a write.
    pub is_execute: bool,  // Set when the triggering memory operation was an execute.
}

impl Task {
    pub fn kernel(&self) -> &Arc<Kernel> {
        &self.kernel
    }

    pub fn thread_group(&self) -> &Arc<ThreadGroup> {
        &self.thread_group
    }

    pub fn has_same_address_space(&self, other: Option<&Arc<MemoryManager>>) -> bool {
        match (self.mm(), other) {
            (Ok(this), Some(other)) => Arc::ptr_eq(&this, other),
            (Err(_), None) => true,
            _ => false,
        }
    }

    pub fn flags(&self) -> TaskFlags {
        self.flags.load(Ordering::Relaxed)
    }

    /// When the task exits, if there is a notification that needs to propagate
    /// to a ptracer, make sure it will propagate.
    pub fn set_ptrace_zombie(&self, pids: &mut crate::task::PidTable) {
        let pgid = self.thread_group().read().process_group.leader;
        let exit_signal = self.thread_group().read().exit_signal.clone();
        let mut state = self.write();
        state.set_stopped(StopState::ForceAwake, None, None, None);
        if let Some(ptrace) = &mut state.ptrace {
            // Add a zombie that the ptracer will notice.
            ptrace.last_signal_waitable = true;
            let tracer_pid = ptrace.get_pid();
            let tracer_tg = pids.get_thread_group(tracer_pid);
            if let Some(tracer_tg) = tracer_tg {
                drop(state);
                let mut tracer_state = tracer_tg.write();

                let exit_status = self.exit_status().unwrap_or_else(|| {
                    starnix_logging::log_error!("Exiting without an exit code.");
                    ExitStatus::Exit(u8::MAX)
                });
                let uid = self.persistent_info.real_creds().uid;
                let exit_info = ProcessExitInfo { status: exit_status, exit_signal };
                let zombie = ZombieProcess {
                    thread_group_key: self.thread_group_key.clone(),
                    pgid,
                    uid,
                    exit_info,
                    // ptrace doesn't need this.
                    time_stats: TaskTimeStats::default(),
                    is_canonical: false,
                };

                tracer_state.zombie_ptracees.add(pids, self.tid, zombie);
            };
        }
    }

    /// Disconnects this task from the tracer, if the tracer is still running.
    pub fn ptrace_disconnect(&mut self, pids: &PidTable) {
        let mut state = self.write();
        let ptracer_pid = state.ptrace.as_ref().map(|ptrace| ptrace.get_pid());
        if let Some(ptracer_pid) = ptracer_pid {
            let _ = state.set_ptrace(None);
            if let Some(ProcessEntryRef::Process(tg)) = pids.get_process(ptracer_pid) {
                let tid = self.get_tid();
                drop(state);
                tg.ptracees.lock().remove(&tid);
            }
        }
    }

    pub fn exit_status(&self) -> Option<ExitStatus> {
        self.is_exitted().then(|| self.read().exit_status.clone()).flatten()
    }

    pub fn is_exitted(&self) -> bool {
        self.flags().contains(TaskFlags::EXITED)
    }

    pub fn load_stopped(&self) -> StopState {
        self.stop_state.load(Ordering::Relaxed)
    }

    /// Upgrade a Reference to a Task, returning an ESRCH errno if the reference cannot be borrowed.
    pub fn from_weak(weak: &WeakRef<Task>) -> Result<TempRef<'_, Task>, Errno> {
        weak.upgrade().ok_or_else(|| errno!(ESRCH))
    }

    /// Internal function for creating a Task object. Useful when you need to specify the value of
    /// every field. create_process and create_thread are more likely to be what you want.
    ///
    /// Any fields that should be initialized fresh for every task, even if the task was created
    /// with fork, are initialized to their defaults inside this function. All other fields are
    /// passed as parameters.
    #[allow(clippy::let_and_return)]
    pub fn new(
        tid: tid_t,
        command: TaskCommand,
        thread_group: Arc<ThreadGroup>,
        thread: Option<zx::Thread>,
        files: FdTable,
        mm: Option<Arc<MemoryManager>>,
        // The only case where fs should be None is when building the initial task, which is
        // used to build the initial FsContext.
        fs: Arc<FsContext>,
        creds: Credentials,
        abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,
        abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,
        signal_mask: SigSet,
        kernel_signals: VecDeque<KernelSignal>,
        vfork_event: Option<Arc<zx::Event>>,
        scheduler_state: SchedulerState,
        uts_ns: UtsNamespaceHandle,
        no_new_privs: bool,
        seccomp_filter_state: SeccompState,
        seccomp_filters: SeccompFilterContainer,
        robust_list_head: RobustListHeadPtr,
        timerslack_ns: u64,
        security_state: security::TaskState,
    ) -> OwnedRef<Self> {
        let thread_group_key = ThreadGroupKey::from(&thread_group);
        OwnedRef::new_cyclic(|weak_self| {
            let task = Task {
                weak_self,
                tid,
                thread_group_key: thread_group_key.clone(),
                kernel: Arc::clone(&thread_group.kernel),
                thread_group,
                thread: RwLock::new(thread.map(Arc::new)),
                files,
                mm: RcuOptionArc::new(mm),
                fs: RcuOptionArc::new(Some(fs)),
                abstract_socket_namespace,
                abstract_vsock_namespace,
                vfork_event,
                stop_state: AtomicStopState::new(StopState::Awake),
                flags: AtomicTaskFlags::new(TaskFlags::empty()),
                mutable_state: RwLock::new(TaskMutableState {
                    clear_child_tid: UserRef::default(),
                    signals: SignalState::with_mask(signal_mask),
                    kernel_signals,
                    exit_status: None,
                    scheduler_state,
                    uts_ns,
                    no_new_privs,
                    oom_score_adj: Default::default(),
                    seccomp_filters,
                    robust_list_head,
                    timerslack_ns,
                    // The default timerslack is set to the current timerslack of the creating thread.
                    default_timerslack_ns: timerslack_ns,
                    ptrace: None,
                    captured_thread_state: None,
                }),
                persistent_info: TaskPersistentInfoState::new(
                    tid,
                    thread_group_key,
                    command,
                    creds,
                ),
                seccomp_filter_state,
                trace_syscalls: AtomicBool::new(false),
                proc_pid_directory_cache: Mutex::new(None),
                security_state,
            };

            #[cfg(any(test, debug_assertions))]
            {
                // Note that `Kernel::pids` is already locked by the caller of `Task::new()`.
                let _l1 = task.read();
                let _l2 = task.persistent_info.real_creds();
                let _l3 = task.persistent_info.command_guard();
            }
            task
        })
    }

    state_accessor!(Task, mutable_state);

    /// Returns the real credentials of the task. These credentials are used to check permissions
    /// for actions performed on the task. If the task itself is performing an action, use
    /// `CurrentTask::current_creds` instead.
    pub fn real_creds(&self) -> Credentials {
        self.persistent_info.real_creds().clone()
    }

    pub fn with_real_creds<B, F>(&self, f: F) -> B
    where
        F: FnOnce(&Credentials) -> B,
    {
        f(&self.persistent_info.real_creds())
    }

    pub fn ptracer_task(&self) -> WeakRef<Task> {
        let ptracer = {
            let state = self.read();
            state.ptrace.as_ref().map(|p| p.core_state.pid)
        };

        let Some(ptracer) = ptracer else {
            return WeakRef::default();
        };

        self.get_task(ptracer)
    }

    pub fn fs(&self) -> Arc<FsContext> {
        self.fs.to_option_arc().expect("fs must be set")
    }

    pub fn has_shared_fs(&self) -> bool {
        let maybe_fs = self.fs.to_option_arc();
        // This check is incorrect because someone else could be holding a temporary Arc to the
        // FsContext and therefore increasing the strong count.
        maybe_fs.is_some_and(|fs| Arc::strong_count(&fs) > 2usize)
    }

    #[track_caller]
    pub fn mm(&self) -> Result<Arc<MemoryManager>, Errno> {
        self.mm.to_option_arc().ok_or_else(|| errno!(EINVAL))
    }

    pub fn unshare_fs(&self) {
        let fs = self.fs().fork();
        self.fs.update(Some(fs));
    }

    /// Modify the given elements of the scheduler state with new values and update the
    /// task's thread's role.
    pub(crate) fn set_scheduler_policy_priority_and_reset_on_fork(
        &self,
        policy: SchedulingPolicy,
        priority: RealtimePriority,
        reset_on_fork: bool,
    ) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|scheduler_state| {
            scheduler_state.policy = policy;
            scheduler_state.realtime_priority = priority;
            scheduler_state.reset_on_fork = reset_on_fork;
        })
    }

    /// Modify the scheduler state's priority and update the task's thread's role.
    pub(crate) fn set_scheduler_priority(&self, priority: RealtimePriority) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|scheduler_state| {
            scheduler_state.realtime_priority = priority
        })
    }

    /// Modify the scheduler state's nice and update the task's thread's role.
    pub(crate) fn set_scheduler_nice(&self, nice: NormalPriority) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|scheduler_state| {
            scheduler_state.normal_priority = nice
        })
    }

    /// Overwrite the existing scheduler state with a new one and update the task's thread's role.
    pub fn set_scheduler_state(&self, scheduler_state: SchedulerState) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|task_scheduler_state| {
            *task_scheduler_state = scheduler_state
        })
    }

    /// Update the task's thread's role based on its current scheduler state without making any
    /// changes to the state.
    ///
    /// This should be called on tasks that have newly created threads, e.g. after cloning.
    pub fn sync_scheduler_state_to_role(&self) -> Result<(), Errno> {
        self.update_scheduler_state_then_role(|_| {})
    }

    fn update_scheduler_state_then_role(
        &self,
        updater: impl FnOnce(&mut SchedulerState),
    ) -> Result<(), Errno> {
        let new_scheduler_state = {
            // Hold the task state lock as briefly as possible; it's not needed to update the role.
            let mut state = self.write();
            updater(&mut state.scheduler_state);
            state.scheduler_state
        };
        self.thread_group().kernel.scheduler.set_thread_role(self, new_scheduler_state)?;
        Ok(())
    }

    /// Signals the vfork event, if any, to unblock waiters.
    pub fn signal_vfork(&self) {
        if let Some(event) = &self.vfork_event {
            if let Err(status) = event.signal(Signals::NONE, Signals::USER_0) {
                log_warn!("Failed to set vfork signal {status}");
            }
        };
    }

    /// Blocks the caller until the task has exited or executed execve(). This is used to implement
    /// vfork() and clone(... CLONE_VFORK, ...). The task must have been created with CLONE_VFORK.
    pub fn wait_for_execve(&self, task_to_wait: WeakRef<Task>) -> Result<(), Errno> {
        let event = task_to_wait.upgrade().and_then(|t| t.vfork_event.clone());
        if let Some(event) = event {
            event
                .wait_one(zx::Signals::USER_0, zx::MonotonicInstant::INFINITE)
                .map_err(|status| from_status_like_fdio!(status))?;
        }
        Ok(())
    }

    /// If needed, clear the child tid for this task.
    ///
    /// Userspace can ask us to clear the child tid and issue a futex wake at
    /// the child tid address when we tear down a task. For example, bionic
    /// uses this mechanism to implement pthread_join. The thread that calls
    /// pthread_join sleeps using FUTEX_WAIT on the child tid address. We wake
    /// them up here to let them know the thread is done.
    pub fn clear_child_tid_if_needed<L>(&self, locked: &mut Locked<L>) -> Result<(), Errno>
    where
        L: LockBefore<TerminalLock>,
    {
        let mut state = self.write();
        let user_tid = state.clear_child_tid;
        if !user_tid.is_null() {
            let zero: tid_t = 0;
            self.write_object(user_tid, &zero)?;
1406            self.kernel().shared_futexes.wake(
1407                locked,
1408                self,
1409                user_tid.addr(),
1410                usize::MAX,
1411                FUTEX_BITSET_MATCH_ANY,
1412            )?;
1413            state.clear_child_tid = UserRef::default();
1414        }
1415        Ok(())
1416    }
1417
1418    pub fn get_task(&self, tid: tid_t) -> WeakRef<Task> {
1419        self.kernel().pids.read().get_task(tid)
1420    }
1421
1422    pub fn get_pid(&self) -> pid_t {
1423        self.thread_group_key.pid()
1424    }
1425
1426    pub fn get_tid(&self) -> tid_t {
1427        self.tid
1428    }
1429
1430    pub fn is_leader(&self) -> bool {
1431        self.get_pid() == self.get_tid()
1432    }
1433
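    /// Reads the task's command line as a list of NUL-delimited strings from
    /// the `[argv_start, argv_end)` range recorded in the memory manager,
    /// reading at most `max_len` bytes.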
    pub fn read_argv(&self, max_len: usize) -> Result<Vec<FsString>, Errno> {
        // argv is empty for kthreads
        let Ok(mm) = self.mm() else {
            return Ok(vec![]);
        };
        let (argv_start, argv_end) = {
            let mm_state = mm.state.read();
            (mm_state.argv_start, mm_state.argv_end)
        };

        let len_to_read = std::cmp::min(argv_end - argv_start, max_len);
        self.read_nul_delimited_c_string_list(argv_start, len_to_read)
    }

    pub fn read_argv0(&self) -> Result<FsString, Errno> {
        // argv is empty for kthreads
        let Ok(mm) = self.mm() else {
            return Ok(FsString::default());
        };
        let argv_start = {
            let mm_state = mm.state.read();
            mm_state.argv_start
        };
        // Assuming a 64-bit arch width is fine for a type that's just u8's on all arches.
        let argv_start = UserCString::new(&ArchWidth::Arch64, argv_start);
        self.read_path(argv_start)
    }

    pub fn read_env(&self, max_len: usize) -> Result<Vec<FsString>, Errno> {
        // environment is empty for kthreads
        let Ok(mm) = self.mm() else { return Ok(vec![]) };
        let (env_start, env_end) = {
            let mm_state = mm.state.read();
            (mm_state.environ_start, mm_state.environ_end)
        };

        let len_to_read = std::cmp::min(env_end - env_start, max_len);
        self.read_nul_delimited_c_string_list(env_start, len_to_read)
    }

    pub fn thread_runtime_info(&self) -> Result<zx::TaskRuntimeInfo, Errno> {
        self.thread
            .read()
            .as_ref()
            .ok_or_else(|| errno!(EINVAL))?
            .get_runtime_info()
            .map_err(|status| from_status_like_fdio!(status))
    }

    pub fn real_fscred(&self) -> FsCred {
        self.with_real_creds(|creds| creds.as_fscred())
    }

    /// Interrupts the current task.
    ///
    /// This will interrupt any blocking syscalls if the task is blocked on one.
    /// The signal_state of the task must not be locked.
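    ///
    /// Illustrative effect on a blocked task (a sketch; `waiter` and its
    /// setup are hypothetical):
    ///
    /// ```ignore
    /// // From another thread:
    /// task.interrupt();
    /// // The interrupted task's blocking wait returns EINTR, letting the
    /// // syscall machinery handle restart or signal delivery:
    /// assert_eq!(waiter.wait(locked, current_task), error!(EINTR));
    /// ```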
    pub fn interrupt(&self) {
        self.read().signals.run_state.wake();
        if let Some(thread) = self.thread.read().as_ref() {
            #[allow(
                clippy::undocumented_unsafe_blocks,
                reason = "Force documented unsafe blocks in Starnix"
            )]
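            // SAFETY: `thread` is kept alive by the read guard for the
            // duration of the call, so `raw_handle()` yields a valid handle;
            // zx_restricted_kick() only reads the handle and does not take
            // ownership of it.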
            let status = unsafe { zx::sys::zx_restricted_kick(thread.raw_handle(), 0) };
            if status != zx::sys::ZX_OK {
                // zx_restricted_kick() could return ZX_ERR_BAD_STATE if the target thread is
                // already in the DYING or DEAD states. That's fine since it means that the task
                // is in the process of tearing down, so allow it.
                assert_eq!(status, zx::sys::ZX_ERR_BAD_STATE);
            }
        }
    }

    pub fn command(&self) -> TaskCommand {
        self.persistent_info.command.lock().clone()
    }

    pub fn set_command_name(&self, mut new_name: TaskCommand) {
        // If we're going to update the process name, see if we can get a longer one than normally
        // provided in the Linux uapi. Only choose the argv0-based name if it's a superset of the
        // uapi-provided name to avoid clobbering the name provided by the user.
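        // For example (illustrative values): a uapi comm of "longrunningproc"
        // (truncated to the 15-byte comm limit) can be upgraded to an argv[0]
        // basename like "longrunningprocess", since the former is a prefix of
        // the latter.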
        if let Ok(argv0) = self.read_argv0() {
            let argv0 = TaskCommand::from_path_bytes(&argv0);
            if let Some(embedded_name) = argv0.try_embed(&new_name) {
                new_name = embedded_name;
            }
        }

        // Acquire this before modifying Zircon state to ensure consistency under concurrent
        // access. Ideally this would also guard the argv[0] read above, but we can't do that due
        // to lock cycles with SELinux checks.
        let mut command_guard = self.persistent_info.command_guard();

        // Set the name on the Linux thread.
        if let Some(thread) = self.thread.read().as_ref() {
            set_zx_name(&**thread, new_name.as_bytes());
        }

        // If this is the thread group leader, use this name for the process too.
        if self.is_leader() {
            set_zx_name(&self.thread_group().process, new_name.as_bytes());
            let _ = zx::Thread::raise_user_exception(
                zx::RaiseExceptionOptions::TARGET_JOB_DEBUGGER,
                zx::sys::ZX_EXCP_USER_CODE_PROCESS_NAME_CHANGED,
                0,
            );
        }

        // Avoid a lock cycle by dropping the guard before notifying memory attribution of the
        // change.
        *command_guard = new_name;
        drop(command_guard);

        if self.is_leader() {
            if let Some(notifier) = &self.thread_group().read().notifier {
                let _ = notifier.send(MemoryAttributionLifecycleEvent::name_change(self.tid));
            }
        }
    }

    pub fn set_seccomp_state(&self, state: SeccompStateValue) -> Result<(), Errno> {
        self.seccomp_filter_state.set(&state)
    }

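    /// Returns the state code reported to userspace (e.g. via the `State:`
    /// field of `/proc/<pid>/status`): zombie once an exit status is set,
    /// tracing-stop or sleeping while blocked, and running otherwise.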
    pub fn state_code(&self) -> TaskStateCode {
        let status = self.read();
        if status.exit_status.is_some() {
            TaskStateCode::Zombie
        } else if status.signals.run_state.is_blocked() {
            let stop_state = self.load_stopped();
            if stop_state.ptrace_only() && stop_state.is_stopped() {
                TaskStateCode::TracingStop
            } else {
                TaskStateCode::Sleeping
            }
        } else {
            TaskStateCode::Running
        }
    }

    pub fn time_stats(&self) -> TaskTimeStats {
        use zx::Task;
        let info = match &*self.thread.read() {
            Some(thread) => thread.get_runtime_info().expect("Failed to get thread stats"),
            None => return TaskTimeStats::default(),
        };

        TaskTimeStats {
            user_time: zx::MonotonicDuration::from_nanos(info.cpu_time),
            // TODO(https://fxbug.dev/42078242): How can we calculate system time?
            system_time: zx::MonotonicDuration::default(),
        }
    }

    pub fn get_signal_action(&self, signal: Signal) -> sigaction_t {
        self.thread_group().signal_actions.get(signal)
    }

    pub fn record_pid_koid_mapping(&self) {
        let Some(ref mapping_table) = *self.kernel().pid_to_koid_mapping.read() else { return };

        let pkoid = self.thread_group().get_process_koid().ok();
        let tkoid = self.thread.read().as_ref().and_then(|t| t.get_koid().ok());
        mapping_table.write().insert(self.tid, KoidPair { process: pkoid, thread: tkoid });
    }
}

impl Releasable for Task {
    type Context<'a> =
        (Box<ThreadState>, &'a mut Locked<TaskRelease>, RwLockWriteGuard<'a, PidTable>);

    fn release<'a>(mut self, context: Self::Context<'a>) {
        let (thread_state, locked, pids) = context;

        *self.proc_pid_directory_cache.get_mut() = None;
        self.ptrace_disconnect(&pids);

        std::mem::drop(pids);

        self.files.release();

        self.signal_vfork();

        // Drop fields that can end up owning an FsNode to ensure no FsNodes are owned by this
        // task.
        self.fs.update(None);
        self.mm.update(None);

        // Rebuild a temporary CurrentTask to run the release actions that require a CurrentTask.
        let current_task = CurrentTask::new(OwnedRef::new(self), thread_state);

        // Apply any remaining delayed releasers.
        current_task.trigger_delayed_releaser(locked);

        // Drop the task now that it has been released. This requires taking it out of the
        // OwnedRef and then out of the resulting ReleaseGuard.
        let CurrentTask { mut task, .. } = current_task;
        let task = OwnedRef::take(&mut task).expect("task should not have been re-owned");
        let _task: Self = ReleaseGuard::take(task);
    }
}

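/// Memory accesses on a bare `Task` go through the memory manager's
/// VMO-backed `syscall_*` paths, since the task's address space may not be
/// mapped into the calling thread.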
impl MemoryAccessor for Task {
    fn read_memory<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        // Using a `Task` to read memory generally indicates that the memory
        // is being read from a task different than the `CurrentTask`. When
        // this `Task` is not current, its address space is not mapped
        // so we need to go through the VMO.
        self.mm()?.syscall_read_memory(addr, bytes)
    }

    fn read_memory_partial_until_null_byte<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        // Using a `Task` to read memory generally indicates that the memory
        // is being read from a task different than the `CurrentTask`. When
        // this `Task` is not current, its address space is not mapped
        // so we need to go through the VMO.
        self.mm()?.syscall_read_memory_partial_until_null_byte(addr, bytes)
    }

    fn read_memory_partial<'a>(
        &self,
        addr: UserAddress,
        bytes: &'a mut [MaybeUninit<u8>],
    ) -> Result<&'a mut [u8], Errno> {
        // Using a `Task` to read memory generally indicates that the memory
        // is being read from a task different than the `CurrentTask`. When
        // this `Task` is not current, its address space is not mapped
        // so we need to go through the VMO.
        self.mm()?.syscall_read_memory_partial(addr, bytes)
    }

    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
        // Using a `Task` to write memory generally indicates that the memory
        // is being written to a task different than the `CurrentTask`. When
        // this `Task` is not current, its address space is not mapped
        // so we need to go through the VMO.
        self.mm()?.syscall_write_memory(addr, bytes)
    }

    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
        // Using a `Task` to write memory generally indicates that the memory
        // is being written to a task different than the `CurrentTask`. When
        // this `Task` is not current, its address space is not mapped
        // so we need to go through the VMO.
        self.mm()?.syscall_write_memory_partial(addr, bytes)
    }

    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
        // Using a `Task` to zero memory generally indicates that the memory
        // is being zeroed from a task different than the `CurrentTask`. When
        // this `Task` is not current, its address space is not mapped
        // so we need to go through the VMO.
        self.mm()?.syscall_zero(addr, length)
    }
}

impl TaskMemoryAccessor for Task {
    fn maximum_valid_address(&self) -> Option<UserAddress> {
        self.mm().map(|mm| mm.maximum_valid_user_address).ok()
    }
}

impl fmt::Debug for Task {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "{}:{}[{}]",
            self.thread_group().leader,
            self.tid,
            self.persistent_info.command.lock()
        )
    }
}

impl cmp::PartialEq for Task {
    fn eq(&self, other: &Self) -> bool {
        let ptr: *const Task = self;
        let other_ptr: *const Task = other;
        ptr == other_ptr
    }
}

impl cmp::Eq for Task {}

#[cfg(test)]
mod test {
    use super::*;
    use crate::testing::*;
    use starnix_uapi::auth::{CAP_SYS_ADMIN, Capabilities};
    use starnix_uapi::resource_limits::Resource;
    use starnix_uapi::signals::SIGCHLD;
    use starnix_uapi::{CLONE_SIGHAND, CLONE_THREAD, CLONE_VM, rlimit};

    #[::fuchsia::test]
    async fn test_tid_allocation() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            assert_eq!(current_task.get_tid(), 1);
            let another_current = create_task(locked, &kernel, "another-task");
            let another_tid = another_current.get_tid();
            assert!(another_tid >= 2);

            let pids = kernel.pids.read();
            assert_eq!(pids.get_task(1).upgrade().unwrap().get_tid(), 1);
            assert_eq!(pids.get_task(another_tid).upgrade().unwrap().get_tid(), another_tid);
        })
        .await;
    }

    #[::fuchsia::test]
    async fn test_clone_pid_and_parent_pid() {
        spawn_kernel_and_run(async |locked, current_task| {
            let thread = current_task.clone_task_for_test(
                locked,
                (CLONE_THREAD | CLONE_VM | CLONE_SIGHAND) as u64,
                Some(SIGCHLD),
            );
            assert_eq!(current_task.get_pid(), thread.get_pid());
            assert_ne!(current_task.get_tid(), thread.get_tid());
            assert_eq!(current_task.thread_group().leader, thread.thread_group().leader);

            let child_task = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
            assert_ne!(current_task.get_pid(), child_task.get_pid());
            assert_ne!(current_task.get_tid(), child_task.get_tid());
            assert_eq!(current_task.get_pid(), child_task.thread_group().read().get_ppid());
        })
        .await;
    }

    #[::fuchsia::test]
    async fn test_root_capabilities() {
        spawn_kernel_and_run(async |_, current_task| {
            assert!(security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN));
            assert_eq!(current_task.real_creds().cap_inheritable, Capabilities::empty());

            current_task.set_creds(Credentials::with_ids(1, 1));
            assert!(!security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN));
        })
        .await;
    }

    #[::fuchsia::test]
    async fn test_clone_rlimit() {
        spawn_kernel_and_run(async |locked, current_task| {
            let prev_fsize = current_task.thread_group().get_rlimit(locked, Resource::FSIZE);
            assert_ne!(prev_fsize, 10);
            current_task
                .thread_group()
                .limits
                .lock(locked)
                .set(Resource::FSIZE, rlimit { rlim_cur: 10, rlim_max: 100 });
            let current_fsize = current_task.thread_group().get_rlimit(locked, Resource::FSIZE);
            assert_eq!(current_fsize, 10);

            let child_task = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
            let child_fsize = child_task.thread_group().get_rlimit(locked, Resource::FSIZE);
            assert_eq!(child_fsize, 10)
        })
        .await;
    }
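
    // An additional check sketched with the same helpers as the tests above:
    // a CLONE_THREAD sibling shares the leader's pid but gets its own tid, so
    // it is not itself the thread group leader.
    #[::fuchsia::test]
    async fn test_thread_is_not_leader() {
        spawn_kernel_and_run(async |locked, current_task| {
            // The initial task is its own thread group leader.
            assert!(current_task.is_leader());

            let thread = current_task.clone_task_for_test(
                locked,
                (CLONE_THREAD | CLONE_VM | CLONE_SIGHAND) as u64,
                Some(SIGCHLD),
            );
            assert!(!thread.is_leader());
        })
        .await;
    }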
}