starnix_core/task/task.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mm::{MemoryAccessor, MemoryAccessorExt, MemoryManager, TaskMemoryAccessor};
6use crate::mutable_state::{state_accessor, state_implementation};
7use crate::security;
8use crate::signals::{KernelSignal, RunState, SignalInfo, SignalState};
9use crate::task::memory_attribution::MemoryAttributionLifecycleEvent;
10use crate::task::tracing::KoidPair;
11use crate::task::{
12    AbstractUnixSocketNamespace, AbstractVsockSocketNamespace, CurrentTask, EventHandler, Kernel,
13    NormalPriority, PidTable, ProcessEntryRef, ProcessExitInfo, PtraceEvent, PtraceEventData,
14    PtraceState, PtraceStatus, RealtimePriority, SchedulerState, SchedulingPolicy,
15    SeccompFilterContainer, SeccompState, SeccompStateValue, ThreadGroup, ThreadGroupKey,
16    ThreadState, UtsNamespaceHandle, WaitCanceler, Waiter, ZombieProcess,
17};
18use crate::vfs::{FdFlags, FdNumber, FdTable, FileHandle, FsContext, FsNodeHandle, FsString};
19use bitflags::bitflags;
20use fuchsia_rcu::rcu_option_arc::RcuOptionArc;
21use macro_rules_attribute::apply;
22use starnix_logging::{log_warn, set_zx_name};
23use starnix_sync::{
24    FileOpsCore, LockBefore, LockEqualOrBefore, Locked, Mutex, MutexGuard, RwLock, RwLockReadGuard,
25    RwLockWriteGuard, TaskRelease, TerminalLock,
26};
27use starnix_task_command::TaskCommand;
28use starnix_types::arch::ArchWidth;
29use starnix_types::ownership::{OwnedRef, Releasable, ReleaseGuard, TempRef, WeakRef};
30use starnix_types::stats::TaskTimeStats;
31use starnix_uapi::auth::{Credentials, FsCred};
32use starnix_uapi::errors::Errno;
33use starnix_uapi::signals::{SigSet, Signal, sigaltstack_contains_pointer};
34use starnix_uapi::user_address::{
35    ArchSpecific, MappingMultiArchUserRef, UserAddress, UserCString, UserRef,
36};
37use starnix_uapi::{
38    CLD_CONTINUED, CLD_DUMPED, CLD_EXITED, CLD_KILLED, CLD_STOPPED, FUTEX_BITSET_MATCH_ANY, errno,
39    error, from_status_like_fdio, pid_t, sigaction_t, sigaltstack, tid_t, uapi,
40};
41use std::collections::VecDeque;
42use std::mem::MaybeUninit;
43use std::sync::Arc;
44use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
45use std::{cmp, fmt};
46use zx::{
47    AsHandleRef, Signals, Task as _, {self as zx},
48};
49
50#[derive(Clone, Debug, Eq, PartialEq)]
51pub enum ExitStatus {
52    Exit(u8),
53    Kill(SignalInfo),
54    CoreDump(SignalInfo),
55    // The second field for Stop and Continue contains the type of ptrace stop
56    // event that made it stop / continue, if applicable (PTRACE_EVENT_STOP,
57    // PTRACE_EVENT_FORK, etc)
58    Stop(SignalInfo, PtraceEvent),
59    Continue(SignalInfo, PtraceEvent),
60}
61impl ExitStatus {
62    /// Converts the given exit status to a status code suitable for returning from wait syscalls.
63    pub fn wait_status(&self) -> i32 {
64        match self {
65            ExitStatus::Exit(status) => (*status as i32) << 8,
66            ExitStatus::Kill(siginfo) => siginfo.signal.number() as i32,
67            ExitStatus::CoreDump(siginfo) => (siginfo.signal.number() as i32) | 0x80,
68            ExitStatus::Continue(siginfo, trace_event) => {
69                let trace_event_val = *trace_event as u32;
70                if trace_event_val != 0 {
71                    (siginfo.signal.number() as i32) | (trace_event_val << 16) as i32
72                } else {
73                    0xffff
74                }
75            }
76            ExitStatus::Stop(siginfo, trace_event) => {
77                let trace_event_val = *trace_event as u32;
78                (0x7f + ((siginfo.signal.number() as i32) << 8)) | (trace_event_val << 16) as i32
79            }
80        }
81    }
82
83    pub fn signal_info_code(&self) -> i32 {
84        match self {
85            ExitStatus::Exit(_) => CLD_EXITED as i32,
86            ExitStatus::Kill(_) => CLD_KILLED as i32,
87            ExitStatus::CoreDump(_) => CLD_DUMPED as i32,
88            ExitStatus::Stop(_, _) => CLD_STOPPED as i32,
89            ExitStatus::Continue(_, _) => CLD_CONTINUED as i32,
90        }
91    }
92
93    pub fn signal_info_status(&self) -> i32 {
94        match self {
95            ExitStatus::Exit(status) => *status as i32,
96            ExitStatus::Kill(siginfo)
97            | ExitStatus::CoreDump(siginfo)
98            | ExitStatus::Continue(siginfo, _)
99            | ExitStatus::Stop(siginfo, _) => siginfo.signal.number() as i32,
100        }
101    }
102}
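// Illustrative example (not part of the original source): the encoding produced by
// `wait_status()` above follows the Linux wait(2) layout, e.g. a normal exit stores the
// exit code in bits 8..15 and `signal_info_code()` reports CLD_EXITED. For a group stop
// on SIGSTOP (signal 19) with no ptrace event, the encoding would be 0x7f + (19 << 8) =
// 0x137f. A minimal sketch, exercising only the variant that needs no `SignalInfo`:
#[cfg(test)]
mod exit_status_encoding_example {
    use super::*;

    #[test]
    fn normal_exit_encodes_code_in_the_second_byte() {
        let status = ExitStatus::Exit(42);
        // 42 << 8 == 0x2a00: the low byte (terminating signal) stays zero.
        assert_eq!(status.wait_status(), 42 << 8);
        assert_eq!(status.signal_info_code(), CLD_EXITED as i32);
        assert_eq!(status.signal_info_status(), 42);
    }
}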
103
104pub struct AtomicStopState {
105    inner: AtomicU8,
106}
107
108impl AtomicStopState {
109    pub fn new(state: StopState) -> Self {
110        Self { inner: AtomicU8::new(state as u8) }
111    }
112
113    pub fn load(&self, ordering: Ordering) -> StopState {
114        let v = self.inner.load(ordering);
115        // SAFETY: we only ever store to the atomic a value originating
116        // from a valid `StopState`.
117        unsafe { std::mem::transmute(v) }
118    }
119
120    pub fn store(&self, state: StopState, ordering: Ordering) {
121        self.inner.store(state as u8, ordering)
122    }
123}
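// Illustrative sketch (not in the original source): `AtomicStopState` is a lock-free
// wrapper around the `StopState` enum defined below; a store followed by a load returns
// the same variant.
#[cfg(test)]
mod atomic_stop_state_example {
    use super::*;

    #[test]
    fn store_then_load_round_trips() {
        let state = AtomicStopState::new(StopState::Awake);
        state.store(StopState::GroupStopping, Ordering::Relaxed);
        assert_eq!(state.load(Ordering::Relaxed), StopState::GroupStopping);
    }
}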
124
125/// This enum describes the state that a task or thread group can be in when being stopped.
126/// The names are taken from ptrace(2).
127#[derive(Clone, Copy, Debug, PartialEq)]
128#[repr(u8)]
129pub enum StopState {
130    /// In this state, the process has been told to wake up, but has not yet been woken.
131    /// Individual threads may still be stopped.
132    Waking,
133    /// In this state, at least one thread is awake.
134    Awake,
135    /// Same as the above, but you are not allowed to make further transitions.  Used
136    /// to kill the task / group.  These names are not in ptrace(2).
137    ForceWaking,
138    ForceAwake,
139
140    /// In this state, the process has been told to stop via a signal, but has not yet stopped.
141    GroupStopping,
142    /// In this state, at least one thread of the process has stopped
143    GroupStopped,
144    /// In this state, the task has received a signal, and it is being traced, so it will
145    /// stop at the next opportunity.
146    SignalDeliveryStopping,
147    /// Same as the last one, but has stopped.
148    SignalDeliveryStopped,
149    /// Stop for a ptrace event: a variety of events defined by ptrace and
150    /// enabled with the use of various ptrace features, such as the
151    /// PTRACE_O_TRACE_* options.  The parameter indicates the type of
152    /// event. Examples include PTRACE_EVENT_FORK (the event is a fork),
153    /// PTRACE_EVENT_EXEC (the event is exec), and other similar events.
154    PtraceEventStopping,
155    /// Same as the last one, but has stopped
156    PtraceEventStopped,
157    /// In this state, we have stopped before executing a syscall
158    SyscallEnterStopping,
159    SyscallEnterStopped,
160    /// In this state, we have stopped after executing a syscall
161    SyscallExitStopping,
162    SyscallExitStopped,
163}
164
165impl StopState {
166    /// This means a stop is either in progress or we've stopped.
167    pub fn is_stopping_or_stopped(&self) -> bool {
168        self.is_stopped() || self.is_stopping()
169    }
170
171    /// This means a stop is in progress.  Refers to any stop state ending in "ing".
172    pub fn is_stopping(&self) -> bool {
173        match *self {
174            StopState::GroupStopping
175            | StopState::SignalDeliveryStopping
176            | StopState::PtraceEventStopping
177            | StopState::SyscallEnterStopping
178            | StopState::SyscallExitStopping => true,
179            _ => false,
180        }
181    }
182
183    /// This means the task is stopped.
184    pub fn is_stopped(&self) -> bool {
185        match *self {
186            StopState::GroupStopped
187            | StopState::SignalDeliveryStopped
188            | StopState::PtraceEventStopped
189            | StopState::SyscallEnterStopped
190            | StopState::SyscallExitStopped => true,
191            _ => false,
192        }
193    }
194
195    /// Returns the "ed" version of this StopState, if it is "ing".
196    pub fn finalize(&self) -> Result<StopState, ()> {
197        match *self {
198            StopState::GroupStopping => Ok(StopState::GroupStopped),
199            StopState::SignalDeliveryStopping => Ok(StopState::SignalDeliveryStopped),
200            StopState::PtraceEventStopping => Ok(StopState::PtraceEventStopped),
201            StopState::Waking => Ok(StopState::Awake),
202            StopState::ForceWaking => Ok(StopState::ForceAwake),
203            StopState::SyscallEnterStopping => Ok(StopState::SyscallEnterStopped),
204            StopState::SyscallExitStopping => Ok(StopState::SyscallExitStopped),
205            _ => Err(()),
206        }
207    }
208
209    pub fn is_downgrade(&self, new_state: &StopState) -> bool {
210        match *self {
211            StopState::GroupStopped => *new_state == StopState::GroupStopping,
212            StopState::SignalDeliveryStopped => *new_state == StopState::SignalDeliveryStopping,
213            StopState::PtraceEventStopped => *new_state == StopState::PtraceEventStopping,
214            StopState::SyscallEnterStopped => *new_state == StopState::SyscallEnterStopping,
215            StopState::SyscallExitStopped => *new_state == StopState::SyscallExitStopping,
216            StopState::Awake => *new_state == StopState::Waking,
217            _ => false,
218        }
219    }
220
221    pub fn is_waking_or_awake(&self) -> bool {
222        *self == StopState::Waking
223            || *self == StopState::Awake
224            || *self == StopState::ForceWaking
225            || *self == StopState::ForceAwake
226    }
227
228    /// Indicates whether the transition to the stopped / awake state is not yet finished.  This
229    /// function is typically used to determine when it is time to notify waiters.
230    pub fn is_in_progress(&self) -> bool {
231        *self == StopState::Waking
232            || *self == StopState::ForceWaking
233            || *self == StopState::GroupStopping
234            || *self == StopState::SignalDeliveryStopping
235            || *self == StopState::PtraceEventStopping
236            || *self == StopState::SyscallEnterStopping
237            || *self == StopState::SyscallExitStopping
238    }
239
240    pub fn ptrace_only(&self) -> bool {
241        !self.is_waking_or_awake()
242            && *self != StopState::GroupStopped
243            && *self != StopState::GroupStopping
244    }
245
246    pub fn is_illegal_transition(&self, new_state: StopState) -> bool {
247        *self == StopState::ForceAwake
248            || (*self == StopState::ForceWaking && new_state != StopState::ForceAwake)
249            || new_state == *self
250            // Downgrades are generally a sign that something is screwed up, but
251            // a SIGCONT can result in a downgrade from Awake to Waking, so we
252            // allowlist it.
253            || (self.is_downgrade(&new_state) && *self != StopState::Awake)
254    }
255
256    pub fn is_force(&self) -> bool {
257        *self == StopState::ForceAwake || *self == StopState::ForceWaking
258    }
259
260    pub fn as_in_progress(&self) -> Result<StopState, ()> {
261        match *self {
262            StopState::GroupStopped => Ok(StopState::GroupStopping),
263            StopState::SignalDeliveryStopped => Ok(StopState::SignalDeliveryStopping),
264            StopState::PtraceEventStopped => Ok(StopState::PtraceEventStopping),
265            StopState::Awake => Ok(StopState::Waking),
266            StopState::ForceAwake => Ok(StopState::ForceWaking),
267            StopState::SyscallEnterStopped => Ok(StopState::SyscallEnterStopping),
268            StopState::SyscallExitStopped => Ok(StopState::SyscallExitStopping),
269            _ => Ok(*self),
270        }
271    }
272}
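// Illustrative sketch (not in the original source): a typical group-stop life cycle using
// the helpers above. The "ing" states finalize into their "ed" counterparts, and a
// downgrade back to the in-progress state is rejected as an illegal transition (only
// Awake -> Waking is allowlisted, for SIGCONT).
#[cfg(test)]
mod stop_state_transition_example {
    use super::*;

    #[test]
    fn group_stop_finalizes_and_rejects_downgrade() {
        let stopping = StopState::GroupStopping;
        assert!(stopping.is_stopping());

        let stopped = stopping.finalize().expect("in-progress states finalize");
        assert_eq!(stopped, StopState::GroupStopped);
        assert!(stopped.is_stopped());

        // Moving back to GroupStopping would be a downgrade, so it is illegal.
        assert!(stopped.is_illegal_transition(StopState::GroupStopping));
        // Waking the group back up is allowed.
        assert!(!stopped.is_illegal_transition(StopState::Waking));
    }
}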
273
274bitflags! {
275    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
276    pub struct TaskFlags: u8 {
277        const EXITED = 0x1;
278        const SIGNALS_AVAILABLE = 0x2;
279        const TEMPORARY_SIGNAL_MASK = 0x4;
280        /// Whether the executor should dump the stack of this task when it exits.
281        /// Currently used to implement ExitStatus::CoreDump.
282        const DUMP_ON_EXIT = 0x8;
283    }
284}
285
286pub struct AtomicTaskFlags {
287    flags: AtomicU8,
288}
289
290impl AtomicTaskFlags {
291    fn new(flags: TaskFlags) -> Self {
292        Self { flags: AtomicU8::new(flags.bits()) }
293    }
294
295    fn load(&self, ordering: Ordering) -> TaskFlags {
296        let flags = self.flags.load(ordering);
297        // We only ever store values from a `TaskFlags`.
298        TaskFlags::from_bits_retain(flags)
299    }
300
301    fn swap(&self, flags: TaskFlags, ordering: Ordering) -> TaskFlags {
302        let flags = self.flags.swap(flags.bits(), ordering);
303        // We only ever store values from a `TaskFlags`.
304        TaskFlags::from_bits_retain(flags)
305    }
306}
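// Illustrative sketch (not in the original source): `AtomicTaskFlags` stores the
// `TaskFlags` bits defined above in an `AtomicU8`, so the flags can be read without a
// lock; `swap` returns the previously stored set.
#[cfg(test)]
mod atomic_task_flags_example {
    use super::*;

    #[test]
    fn swap_returns_the_previous_flags() {
        let flags = AtomicTaskFlags::new(TaskFlags::EXITED);
        let previous =
            flags.swap(TaskFlags::EXITED | TaskFlags::DUMP_ON_EXIT, Ordering::Relaxed);
        assert_eq!(previous, TaskFlags::EXITED);
        assert!(flags.load(Ordering::Relaxed).contains(TaskFlags::DUMP_ON_EXIT));
    }
}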
307
308/// This contains thread state that tracers can inspect and modify.  It is
309/// captured when a thread stops, and optionally copied back (if dirty) when a
310/// thread starts again.  An alternative implementation would involve the
311/// tracers acting on thread state directly; however, this would involve sharing
312/// CurrentTask structures across multiple threads, which goes against the
313/// intent of the design of CurrentTask.
314pub struct CapturedThreadState {
315    /// The thread state of the traced task.  This is copied out when the thread
316    /// stops.
317    pub thread_state: ThreadState,
318
319    /// Indicates that the last ptrace operation changed the thread state, so it
320    /// should be written back to the original thread.
321    pub dirty: bool,
322}
323
324impl ArchSpecific for CapturedThreadState {
325    fn is_arch32(&self) -> bool {
326        self.thread_state.is_arch32()
327    }
328}
329
330#[derive(Debug)]
331pub struct RobustList {
332    pub next: RobustListPtr,
333}
334
335pub type RobustListPtr =
336    MappingMultiArchUserRef<RobustList, uapi::robust_list, uapi::arch32::robust_list>;
337
338impl From<uapi::robust_list> for RobustList {
339    fn from(robust_list: uapi::robust_list) -> Self {
340        Self { next: RobustListPtr::from(robust_list.next) }
341    }
342}
343
344#[cfg(target_arch = "aarch64")]
345impl From<uapi::arch32::robust_list> for RobustList {
346    fn from(robust_list: uapi::arch32::robust_list) -> Self {
347        Self { next: RobustListPtr::from(robust_list.next) }
348    }
349}
350
351#[derive(Debug)]
352pub struct RobustListHead {
353    pub list: RobustList,
354    pub futex_offset: isize,
355}
356
357pub type RobustListHeadPtr =
358    MappingMultiArchUserRef<RobustListHead, uapi::robust_list_head, uapi::arch32::robust_list_head>;
359
360impl From<uapi::robust_list_head> for RobustListHead {
361    fn from(robust_list_head: uapi::robust_list_head) -> Self {
362        Self {
363            list: robust_list_head.list.into(),
364            futex_offset: robust_list_head.futex_offset as isize,
365        }
366    }
367}
368
369#[cfg(target_arch = "aarch64")]
370impl From<uapi::arch32::robust_list_head> for RobustListHead {
371    fn from(robust_list_head: uapi::arch32::robust_list_head) -> Self {
372        Self {
373            list: robust_list_head.list.into(),
374            futex_offset: robust_list_head.futex_offset as isize,
375        }
376    }
377}
378
379pub struct TaskMutableState {
380    // See https://man7.org/linux/man-pages/man2/set_tid_address.2.html
381    pub clear_child_tid: UserRef<tid_t>,
382
383    /// Signal handler related state. This is grouped together for when atomicity is needed during
384    /// signal sending and delivery.
385    signals: SignalState,
386
387    /// Internal signals that have a higher priority than a regular signal.
388    ///
389    /// Storing in a separate queue outside of `SignalState` ensures the internal signals will
390    /// never be ignored or masked when dequeuing. Higher priority ensures that no user signals
391    /// will jump the queue, e.g. ptrace, which delays the delivery.
392    ///
393    /// This design is not about observable consequence, but about convenient implementation.
394    kernel_signals: VecDeque<KernelSignal>,
395
396    /// The exit status that this task exited with.
397    exit_status: Option<ExitStatus>,
398
399    /// Desired scheduler state for the task.
400    pub scheduler_state: SchedulerState,
401
402    /// The UTS namespace assigned to this thread.
403    ///
404    /// This field is kept in the mutable state because the UTS namespace of a thread
405    /// can be forked using `clone()` or `unshare()` syscalls.
406    ///
407    /// We use UtsNamespaceHandle because the UTS properties can be modified
408    /// by any other thread that shares this namespace.
409    pub uts_ns: UtsNamespaceHandle,
410
411    /// Bit that determines whether a newly started program can have privileges its parent does
412    /// not have.  See Documentation/prctl/no_new_privs.txt in the Linux kernel for details.
413    /// Note that Starnix does not currently implement the relevant privileges (e.g.,
414    /// setuid/setgid binaries).  So, you can set this, but it does nothing other than get
415    /// propagated to children.
416    ///
417    /// The documentation indicates that this can only ever be set to
418    /// true, and it cannot be reverted to false.  Accessor methods
419    /// for this field ensure this property.
420    no_new_privs: bool,
421
422    /// Userspace hint about how to adjust the OOM score for this process.
423    pub oom_score_adj: i32,
424
425    /// List of currently installed seccomp_filters
426    pub seccomp_filters: SeccompFilterContainer,
427
428    /// A pointer to the head of the robust futex list of this thread in
429    /// userspace. See get_robust_list(2)
430    pub robust_list_head: RobustListHeadPtr,
431
432    /// The timer slack used to group timer expirations for the calling thread.
433    ///
434    /// Timers may expire up to `timerslack_ns` late, but never early.
435    ///
436    /// If this value is 0, the task's default timerslack is used.
437    pub timerslack_ns: u64,
438
439    /// The default value for `timerslack_ns`. This value cannot change during the lifetime of a
440    /// task.
441    ///
442    /// This value is set to the `timerslack_ns` of the creating thread, and thus is not constant
443    /// across tasks.
444    pub default_timerslack_ns: u64,
445
446    /// Information that a tracer needs to communicate with this process, if it
447    /// is being traced.
448    pub ptrace: Option<Box<PtraceState>>,
449
450    /// Information that a tracer needs to inspect this process.
451    pub captured_thread_state: Option<Box<CapturedThreadState>>,
452}
453
454impl TaskMutableState {
455    pub fn no_new_privs(&self) -> bool {
456        self.no_new_privs
457    }
458
459    /// Sets the value of no_new_privs to true.  It is an error to set
460    /// it to anything else.
461    pub fn enable_no_new_privs(&mut self) {
462        self.no_new_privs = true;
463    }
464
465    pub fn get_timerslack<T: zx::Timeline>(&self) -> zx::Duration<T> {
466        zx::Duration::from_nanos(self.timerslack_ns as i64)
467    }
468
469    /// Sets the current timerslack of the task to `ns`.
470    ///
471    /// If `ns` is zero, the current timerslack gets reset to the task's default timerslack.
472    pub fn set_timerslack_ns(&mut self, ns: u64) {
473        if ns == 0 {
474            self.timerslack_ns = self.default_timerslack_ns;
475        } else {
476            self.timerslack_ns = ns;
477        }
478    }
479
480    pub fn is_ptraced(&self) -> bool {
481        self.ptrace.is_some()
482    }
483
484    pub fn is_ptrace_listening(&self) -> bool {
485        self.ptrace.as_ref().is_some_and(|ptrace| ptrace.stop_status == PtraceStatus::Listening)
486    }
487
488    pub fn ptrace_on_signal_consume(&mut self) -> bool {
489        self.ptrace.as_mut().is_some_and(|ptrace: &mut Box<PtraceState>| {
490            if ptrace.stop_status.is_continuing() {
491                ptrace.stop_status = PtraceStatus::Default;
492                false
493            } else {
494                true
495            }
496        })
497    }
498
499    pub fn notify_ptracers(&mut self) {
500        if let Some(ptrace) = &self.ptrace {
501            ptrace.tracer_waiters().notify_all();
502        }
503    }
504
505    pub fn wait_on_ptracer(&self, waiter: &Waiter) {
506        if let Some(ptrace) = &self.ptrace {
507            ptrace.tracee_waiters.wait_async(&waiter);
508        }
509    }
510
511    pub fn notify_ptracees(&mut self) {
512        if let Some(ptrace) = &self.ptrace {
513            ptrace.tracee_waiters.notify_all();
514        }
515    }
516
517    pub fn take_captured_state(&mut self) -> Option<Box<CapturedThreadState>> {
518        if self.captured_thread_state.is_some() {
519            let mut state = None;
520            std::mem::swap(&mut state, &mut self.captured_thread_state);
521            return state;
522        }
523        None
524    }
525
526    pub fn copy_state_from(&mut self, current_task: &CurrentTask) {
527        self.captured_thread_state = Some(Box::new(CapturedThreadState {
528            thread_state: current_task.thread_state.extended_snapshot(),
529            dirty: false,
530        }));
531    }
532
533    /// Returns the task's currently active signal mask.
534    pub fn signal_mask(&self) -> SigSet {
535        self.signals.mask()
536    }
537
538    /// Returns true if `signal` is currently blocked by this task's signal mask.
539    pub fn is_signal_masked(&self, signal: Signal) -> bool {
540        self.signals.mask().has_signal(signal)
541    }
542
543    /// Returns true if `signal` is blocked by the saved signal mask.
544    ///
545    /// Note that the current signal mask may still not be blocking the signal.
546    pub fn is_signal_masked_by_saved_mask(&self, signal: Signal) -> bool {
547        self.signals.saved_mask().is_some_and(|mask| mask.has_signal(signal))
548    }
549
550    /// Enqueues an internal signal at the back of the task's kernel signal queue.
551    pub fn enqueue_kernel_signal(&mut self, signal: KernelSignal) {
552        self.kernel_signals.push_back(signal);
553    }
554
555    /// Enqueues a signal at the back of the task's signal queue.
556    pub fn enqueue_signal(&mut self, signal: SignalInfo) {
557        self.signals.enqueue(signal);
558    }
559
560    /// Enqueues the signal, allowing the signal to skip straight to the front of the task's queue.
561    ///
562    /// `enqueue_signal` is the more common API to use.
563    ///
564    /// Note that this will not guarantee that the signal is dequeued before any process-directed
565    /// signals.
566    pub fn enqueue_signal_front(&mut self, signal: SignalInfo) {
567        self.signals.enqueue(signal);
568    }
569
570    /// Sets the current signal mask of the task.
571    pub fn set_signal_mask(&mut self, mask: SigSet) {
572        self.signals.set_mask(mask);
573    }
574
575    /// Sets a temporary signal mask for the task.
576    ///
577    /// This mask should be removed by a matching call to `restore_signal_mask`.
578    pub fn set_temporary_signal_mask(&mut self, mask: SigSet) {
579        self.signals.set_temporary_mask(mask);
580    }
581
582    /// Removes the currently active, temporary, signal mask and restores the
583    /// previously active signal mask.
584    pub fn restore_signal_mask(&mut self) {
585        self.signals.restore_mask();
586    }
587
588    /// Returns true if the task's current `RunState` is blocked.
589    pub fn is_blocked(&self) -> bool {
590        self.signals.run_state.is_blocked()
591    }
592
593    /// Sets the task's `RunState` to `run_state`.
594    pub fn set_run_state(&mut self, run_state: RunState) {
595        self.signals.run_state = run_state;
596    }
597
598    pub fn run_state(&self) -> RunState {
599        self.signals.run_state.clone()
600    }
601
602    pub fn on_signal_stack(&self, stack_pointer_register: u64) -> bool {
603        self.signals
604            .alt_stack
605            .map(|signal_stack| sigaltstack_contains_pointer(&signal_stack, stack_pointer_register))
606            .unwrap_or(false)
607    }
608
609    pub fn set_sigaltstack(&mut self, stack: Option<sigaltstack>) {
610        self.signals.alt_stack = stack;
611    }
612
613    pub fn sigaltstack(&self) -> Option<sigaltstack> {
614        self.signals.alt_stack
615    }
616
617    pub fn wait_on_signal(&mut self, waiter: &Waiter) {
618        self.signals.signal_wait.wait_async(waiter);
619    }
620
621    pub fn signals_mut(&mut self) -> &mut SignalState {
622        &mut self.signals
623    }
624
625    pub fn wait_on_signal_fd_events(
626        &self,
627        waiter: &Waiter,
628        mask: SigSet,
629        handler: EventHandler,
630    ) -> WaitCanceler {
631        self.signals.signal_wait.wait_async_signal_mask(waiter, mask, handler)
632    }
633
634    pub fn notify_signal_waiters(&self, signal: &Signal) {
635        self.signals.signal_wait.notify_signal(signal);
636    }
637
638    /// Thaw the task if it has been frozen.
639    pub fn thaw(&mut self) {
640        if let RunState::Frozen(waiter) = self.run_state() {
641            waiter.notify();
642        }
643    }
644
645    pub fn is_frozen(&self) -> bool {
646        matches!(self.run_state(), RunState::Frozen(_))
647    }
648
649    #[cfg(test)]
650    pub fn kernel_signals_for_test(&self) -> &VecDeque<KernelSignal> {
651        &self.kernel_signals
652    }
653}
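// Illustrative sketch (not in the original source): how callers might use the timerslack
// and temporary-signal-mask helpers above. These `example_*` functions are hypothetical,
// compile-only examples that exercise only APIs defined in this impl.
#[cfg(test)]
#[allow(dead_code)]
fn example_reset_timerslack_to_default(state: &mut TaskMutableState) {
    // Passing 0 restores the task's default timerslack instead of literally installing a
    // zero-length slack.
    state.set_timerslack_ns(0);
    assert_eq!(state.timerslack_ns, state.default_timerslack_ns);
}

#[cfg(test)]
#[allow(dead_code)]
fn example_temporarily_block_signals(state: &mut TaskMutableState, mask: SigSet) {
    // A temporary mask must be paired with a matching restore once the blocking operation
    // completes.
    state.set_temporary_signal_mask(mask);
    state.restore_signal_mask();
}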
654
655#[apply(state_implementation!)]
656impl TaskMutableState<Base = Task> {
657    pub fn set_stopped(
658        &mut self,
659        stopped: StopState,
660        siginfo: Option<SignalInfo>,
661        current_task: Option<&CurrentTask>,
662        event: Option<PtraceEventData>,
663    ) {
664        if stopped.ptrace_only() && self.ptrace.is_none() {
665            return;
666        }
667
668        if self.base.load_stopped().is_illegal_transition(stopped) {
669            return;
670        }
671
672        // TODO(https://g-issues.fuchsia.dev/issues/306438676): When task can be
673        // stopped inside user code, task will need to be either restarted or
674        // stopped here.
675        self.store_stopped(stopped);
676        if stopped.is_stopped() {
677            if let Some(ref current_task) = current_task {
678                self.copy_state_from(current_task);
679            }
680        }
681        if let Some(ptrace) = &mut self.ptrace {
682            ptrace.set_last_signal(siginfo);
683            ptrace.set_last_event(event);
684        }
685        if stopped == StopState::Waking || stopped == StopState::ForceWaking {
686            self.notify_ptracees();
687        }
688        if !stopped.is_in_progress() {
689            self.notify_ptracers();
690        }
691    }
692
693    pub fn set_ptrace(&mut self, tracer: Option<Box<PtraceState>>) -> Result<(), Errno> {
694        if tracer.is_some() && self.ptrace.is_some() {
695            return error!(EPERM);
696        }
697
698        if tracer.is_none() {
699            // Handle the case where this is called while the thread group is being released.
700            if let Ok(tg_stop_state) = self.base.thread_group().load_stopped().as_in_progress() {
701                self.set_stopped(tg_stop_state, None, None, None);
702            }
703        }
704        self.ptrace = tracer;
705        Ok(())
706    }
707
708    pub fn can_accept_ptrace_commands(&mut self) -> bool {
709        !self.base.load_stopped().is_waking_or_awake()
710            && self.is_ptraced()
711            && !self.is_ptrace_listening()
712    }
713
714    fn store_stopped(&mut self, state: StopState) {
715        // We don't actually use the guard but we require it to enforce that the
716        // caller holds the thread group's mutable state lock (identified by
717        // mutable access to the thread group's mutable state).
718
719        self.base.stop_state.store(state, Ordering::Relaxed)
720    }
721
722    pub fn update_flags(&mut self, clear: TaskFlags, set: TaskFlags) {
723        // We don't actually use the guard but we require it to enforce that the
724        // caller holds the task's mutable state lock (identified by mutable
725        // access to the task's mutable state).
726
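        // `clear` and `set` must be disjoint: xor equals or only when no bit is in both.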
727        debug_assert_eq!(clear ^ set, clear | set);
728        let observed = self.base.flags();
729        let swapped = self.base.flags.swap((observed | set) & !clear, Ordering::Relaxed);
730        debug_assert_eq!(swapped, observed);
731    }
732
733    pub fn set_flags(&mut self, flag: TaskFlags, v: bool) {
734        let (clear, set) = if v { (TaskFlags::empty(), flag) } else { (flag, TaskFlags::empty()) };
735
736        self.update_flags(clear, set);
737    }
738
739    pub fn set_exit_status(&mut self, status: ExitStatus) {
740        self.set_flags(TaskFlags::EXITED, true);
741        self.exit_status = Some(status);
742    }
743
744    pub fn set_exit_status_if_not_already(&mut self, status: ExitStatus) {
745        self.set_flags(TaskFlags::EXITED, true);
746        self.exit_status.get_or_insert(status);
747    }
748
749    /// Returns the number of pending signals for this task, without considering the signal mask.
750    pub fn pending_signal_count(&self) -> usize {
751        self.signals.num_queued() + self.base.thread_group().pending_signals.lock().num_queued()
752    }
753
754    /// Returns `true` if `signal` is pending for this task, without considering the signal mask.
755    pub fn has_signal_pending(&self, signal: Signal) -> bool {
756        self.signals.has_queued(signal)
757            || self.base.thread_group().pending_signals.lock().has_queued(signal)
758    }
759
760    /// The set of pending signals for the task, including the signals pending for the thread
761    /// group.
762    pub fn pending_signals(&self) -> SigSet {
763        self.signals.pending() | self.base.thread_group().pending_signals.lock().pending()
764    }
765
766    /// The set of pending signals for the task specifically, not including the signals pending
767    /// for the thread group.
768    pub fn task_specific_pending_signals(&self) -> SigSet {
769        self.signals.pending()
770    }
771
772    /// Returns true if any currently pending signal is allowed by `mask`.
773    pub fn is_any_signal_allowed_by_mask(&self, mask: SigSet) -> bool {
774        self.signals.is_any_allowed_by_mask(mask)
775            || self.base.thread_group().pending_signals.lock().is_any_allowed_by_mask(mask)
776    }
777
778    /// Returns whether or not a signal is pending for this task, taking the current
779    /// signal mask into account.
780    pub fn is_any_signal_pending(&self) -> bool {
781        let mask = self.signal_mask();
782        self.signals.is_any_pending()
783            || self.base.thread_group().pending_signals.lock().is_any_allowed_by_mask(mask)
784    }
785
786    /// Returns the next pending signal that passes `predicate`.
787    fn take_next_signal_where<F>(&mut self, predicate: F) -> Option<SignalInfo>
788    where
789        F: Fn(&SignalInfo) -> bool,
790    {
791        if let Some(signal) =
792            self.base.thread_group().pending_signals.lock().take_next_where(&predicate)
793        {
794            Some(signal)
795        } else {
796            self.signals.take_next_where(&predicate)
797        }
798    }
799
800    /// Removes and returns the next pending `signal` for this task.
801    ///
802    /// Returns `None` if `siginfo` is a blocked signal, or no such signal is pending.
803    pub fn take_specific_signal(&mut self, siginfo: SignalInfo) -> Option<SignalInfo> {
804        let signal_mask = self.signal_mask();
805        if signal_mask.has_signal(siginfo.signal) {
806            return None;
807        }
808
809        let predicate = |s: &SignalInfo| s.signal == siginfo.signal;
810        self.take_next_signal_where(predicate)
811    }
812
813    /// Removes and returns a pending signal that is unblocked by the current signal mask.
814    ///
815    /// Returns `None` if there are no unblocked signals pending.
816    pub fn take_any_signal(&mut self) -> Option<SignalInfo> {
817        self.take_signal_with_mask(self.signal_mask())
818    }
819
820    /// Removes and returns a pending signal that is unblocked by `signal_mask`.
821    ///
822    /// Returns `None` if there are no signals pending that are unblocked by `signal_mask`.
823    pub fn take_signal_with_mask(&mut self, signal_mask: SigSet) -> Option<SignalInfo> {
824        let predicate = |s: &SignalInfo| !signal_mask.has_signal(s.signal) || s.force;
825        self.take_next_signal_where(predicate)
826    }
827
828    /// Removes and returns a pending internal signal.
829    ///
830    /// Returns `None` if there are no signals pending.
831    pub fn take_kernel_signal(&mut self) -> Option<KernelSignal> {
832        self.kernel_signals.pop_front()
833    }
834
835    #[cfg(test)]
836    pub fn queued_signal_count(&self, signal: Signal) -> usize {
837        self.signals.queued_count(signal)
838            + self.base.thread_group().pending_signals.lock().queued_count(signal)
839    }
840}
841
842#[derive(Debug, Clone, Copy, PartialEq, Eq)]
843pub enum TaskStateCode {
844    // Task is being executed.
845    Running,
846
847    // Task is waiting for an event.
848    Sleeping,
849
850    // Tracing stop
851    TracingStop,
852
853    // Task has exited.
854    Zombie,
855}
856
857impl TaskStateCode {
858    pub fn code_char(&self) -> char {
859        match self {
860            TaskStateCode::Running => 'R',
861            TaskStateCode::Sleeping => 'S',
862            TaskStateCode::TracingStop => 't',
863            TaskStateCode::Zombie => 'Z',
864        }
865    }
866
867    pub fn name(&self) -> &'static str {
868        match self {
869            TaskStateCode::Running => "running",
870            TaskStateCode::Sleeping => "sleeping",
871            TaskStateCode::TracingStop => "tracing stop",
872            TaskStateCode::Zombie => "zombie",
873        }
874    }
875}
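// Illustrative sketch (not in the original source): `code_char` yields the single-letter
// state form (as in the state field of /proc/<pid>/stat on Linux) and `name` the long form.
#[cfg(test)]
mod task_state_code_example {
    use super::*;

    #[test]
    fn zombie_maps_to_z() {
        assert_eq!(TaskStateCode::Zombie.code_char(), 'Z');
        assert_eq!(TaskStateCode::Zombie.name(), "zombie");
    }
}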
876
877/// The information of the task that needs to be available to the `ThreadGroup` while computing
878/// which process a wait can target. It is necessary to share this data with the `ThreadGroup` so
879/// that it is available while the task is being dropped and so is not accessible from a weak
880/// pointer.
881#[derive(Debug)]
882pub struct TaskPersistentInfoState {
883    /// Immutable information about the task
884    tid: tid_t,
885    thread_group_key: ThreadGroupKey,
886
887    /// The command of this task.
888    command: Mutex<TaskCommand>,
889
890    /// The security credentials for this task. These are only set when the task is the CurrentTask,
891    /// or on task creation.
892    creds: RwLock<Credentials>,
893}
894
895impl TaskPersistentInfoState {
896    fn new(
897        tid: tid_t,
898        thread_group_key: ThreadGroupKey,
899        command: TaskCommand,
900        creds: Credentials,
901    ) -> TaskPersistentInfo {
902        Arc::new(Self {
903            tid,
904            thread_group_key,
905            command: Mutex::new(command),
906            creds: RwLock::new(creds),
907        })
908    }
909
910    pub fn tid(&self) -> tid_t {
911        self.tid
912    }
913
914    pub fn pid(&self) -> pid_t {
915        self.thread_group_key.pid()
916    }
917
918    pub fn command_guard(&self) -> MutexGuard<'_, TaskCommand> {
919        self.command.lock()
920    }
921
922    pub fn real_creds(&self) -> RwLockReadGuard<'_, Credentials> {
923        self.creds.read()
924    }
925
926    /// SAFETY: Only use from CurrentTask. Changing credentials outside of the CurrentTask may
927    /// introduce TOCTOU issues in access checks.
928    pub(in crate::task) unsafe fn creds_mut(&self) -> RwLockWriteGuard<'_, Credentials> {
929        self.creds.write()
930    }
931}
932
933pub type TaskPersistentInfo = Arc<TaskPersistentInfoState>;
934
935/// A unit of execution.
936///
937/// A task is the primary unit of execution in the Starnix kernel. Most tasks are *user* tasks,
938/// which have an associated Zircon thread. The Zircon thread switches between restricted mode,
939/// in which the thread runs userspace code, and normal mode, in which the thread runs Starnix
940/// code.
941///
942/// Tasks track the resources used by userspace by referencing various objects, such as an
943/// `FdTable`, a `MemoryManager`, and an `FsContext`. Many tasks can share references to these
944/// objects. In principle, which objects are shared between which tasks can be largely arbitrary,
945/// but there are common patterns of sharing. For example, tasks created with `pthread_create`
946/// will share the `FdTable`, `MemoryManager`, and `FsContext` and are often called "threads" by
947/// userspace programmers. Tasks created by `posix_spawn` do not share these objects and are often
948/// called "processes" by userspace programmers. However, inside the kernel, there is no clear
949/// definition of a "thread" or a "process".
950///
951/// During boot, the kernel creates the first task, often called `init`. The vast majority of other
952/// tasks are created as transitive clones (e.g., using `clone(2)`) of that task. Sometimes, the
953/// kernel will create new tasks from whole cloth, either with a corresponding userspace component
954/// or to represent some background work inside the kernel.
955///
956/// See also `CurrentTask`, which represents the task corresponding to the thread that is currently
957/// executing.
958pub struct Task {
959    /// Weak reference to the `OwnedRef` of this `Task`. This allows retrieving the
960    /// `TempRef` from a raw `Task`.
961    pub weak_self: WeakRef<Self>,
962
963    /// A unique identifier for this task.
964    ///
965    /// This value can be read in userspace using `gettid(2)`. In general, this value
966    /// is different from the value returned by `getpid(2)`, which returns the `id` of the leader
967    /// of the `thread_group`.
968    pub tid: tid_t,
969
970    /// The process key of this task.
971    pub thread_group_key: ThreadGroupKey,
972
973    /// The kernel to which this thread group belongs.
974    pub kernel: Arc<Kernel>,
975
976    /// The thread group to which this task belongs.
977    ///
978    /// The group of tasks in a thread group roughly corresponds to the userspace notion of a
979    /// process.
980    pub thread_group: Arc<ThreadGroup>,
981
982    /// A handle to the underlying Zircon thread object.
983    ///
984    /// Some tasks lack an underlying Zircon thread. These tasks are used internally by the
985    /// Starnix kernel to track background work, typically on a `kthread`.
986    pub thread: RwLock<Option<Arc<zx::Thread>>>,
987
988    /// The file descriptor table for this task.
989    ///
990    /// This table can be shared by many tasks.
991    pub files: FdTable,
992
993    /// The memory manager for this task.  This is `None` only for system tasks.
994    pub mm: RcuOptionArc<MemoryManager>,
995
996    /// The file system for this task.
997    fs: RcuOptionArc<FsContext>,
998
999    /// The namespace for abstract AF_UNIX sockets for this task.
1000    pub abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,
1001
1002    /// The namespace for AF_VSOCK for this task.
1003    pub abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,
1004
1005    /// The stop state of the task, distinct from the stop state of the thread group.
1006    ///
1007    /// Must only be set when the `mutable_state` write lock is held.
1008    stop_state: AtomicStopState,
1009
1010    /// The flags for the task.
1011    ///
1012    /// Must only be set when the `mutable_state` write lock is held.
1013    flags: AtomicTaskFlags,
1014
1015    /// The mutable state of the Task.
1016    mutable_state: RwLock<TaskMutableState>,
1017
1018    /// The information of the task that needs to be available to the `ThreadGroup` while computing
1019    /// which process a wait can target.
1020    /// Contains the command line, the task credentials and the exit signal.
1021    /// See `TaskPersistentInfo` for more information.
1022    pub persistent_info: TaskPersistentInfo,
1023
1024    /// For vfork and clone() with CLONE_VFORK, this is set when the task exits or calls execve().
1025    /// It allows the calling task to block until the fork has been completed. Only populated
1026    /// when created with the CLONE_VFORK flag.
1027    vfork_event: Option<Arc<zx::Event>>,
1028
1029    /// Tells you whether there are currently seccomp
1030    /// filters installed, without holding a lock.
1031    pub seccomp_filter_state: SeccompState,
1032
1033    /// Tells you whether you are tracing syscall entry / exit, without holding a lock.
1034    pub trace_syscalls: AtomicBool,
1035
1036    // The pid directory, so it doesn't have to be generated and thrown away on every access.
1037    // See https://fxbug.dev/291962828 for details.
1038    pub proc_pid_directory_cache: Mutex<Option<FsNodeHandle>>,
1039
1040    /// The Linux Security Modules state for this thread group. This should be the last member of
1041    /// this struct.
1042    pub security_state: security::TaskState,
1043}
1044
1045/// The decoded cross-platform parts we care about for page fault exception reports.
1046#[derive(Debug)]
1047pub struct PageFaultExceptionReport {
1048    pub faulting_address: u64,
1049    pub not_present: bool, // Set when the page fault was due to a not-present page.
1050    pub is_write: bool,    // Set when the triggering memory operation was a write.
1051    pub is_execute: bool,  // Set when the triggering memory operation was an execute.
1052}
1053
1054impl Task {
1055    pub fn kernel(&self) -> &Arc<Kernel> {
1056        &self.kernel
1057    }
1058
1059    pub fn thread_group(&self) -> &Arc<ThreadGroup> {
1060        &self.thread_group
1061    }
1062
1063    pub fn has_same_address_space(&self, other: Option<&Arc<MemoryManager>>) -> bool {
1064        match (self.mm(), other) {
1065            (Ok(this), Some(other)) => Arc::ptr_eq(&this, other),
1066            (Err(_), None) => true,
1067            _ => false,
1068        }
1069    }
1070
1071    pub fn flags(&self) -> TaskFlags {
1072        self.flags.load(Ordering::Relaxed)
1073    }
1074
1075    /// When the task exits, if there is a notification that needs to propagate
1076    /// to a ptracer, make sure it will propagate.
1077    pub fn set_ptrace_zombie(&self, pids: &mut crate::task::PidTable) {
1078        let pgid = self.thread_group().read().process_group.leader;
1079        let exit_signal = self.thread_group().read().exit_signal.clone();
1080        let mut state = self.write();
1081        state.set_stopped(StopState::ForceAwake, None, None, None);
1082        if let Some(ptrace) = &mut state.ptrace {
1083            // Add a zombie that the ptracer will notice.
1084            ptrace.last_signal_waitable = true;
1085            let tracer_pid = ptrace.get_pid();
1086            let tracer_tg = pids.get_thread_group(tracer_pid);
1087            if let Some(tracer_tg) = tracer_tg {
1088                drop(state);
1089                let mut tracer_state = tracer_tg.write();
1090
1091                let exit_status = self.exit_status().unwrap_or_else(|| {
1092                    starnix_logging::log_error!("Exiting without an exit code.");
1093                    ExitStatus::Exit(u8::MAX)
1094                });
1095                let uid = self.persistent_info.real_creds().uid;
1096                let exit_info = ProcessExitInfo { status: exit_status, exit_signal };
1097                let zombie = ZombieProcess {
1098                    thread_group_key: self.thread_group_key.clone(),
1099                    pgid,
1100                    uid,
1101                    exit_info: exit_info,
1102                    // ptrace doesn't need this.
1103                    time_stats: TaskTimeStats::default(),
1104                    is_canonical: false,
1105                };
1106
1107                tracer_state.zombie_ptracees.add(pids, self.tid, zombie);
1108            };
1109        }
1110    }
1111
1112    /// Disconnects this task from the tracer, if the tracer is still running.
1113    pub fn ptrace_disconnect(&mut self, pids: &PidTable) {
1114        let mut state = self.write();
1115        let ptracer_pid = state.ptrace.as_ref().map(|ptrace| ptrace.get_pid());
1116        if let Some(ptracer_pid) = ptracer_pid {
1117            let _ = state.set_ptrace(None);
1118            if let Some(ProcessEntryRef::Process(tg)) = pids.get_process(ptracer_pid) {
1119                let tid = self.get_tid();
1120                drop(state);
1121                tg.ptracees.lock().remove(&tid);
1122            }
1123        }
1124    }
1125
1126    pub fn exit_status(&self) -> Option<ExitStatus> {
1127        self.is_exitted().then(|| self.read().exit_status.clone()).flatten()
1128    }
1129
1130    pub fn is_exitted(&self) -> bool {
1131        self.flags().contains(TaskFlags::EXITED)
1132    }
1133
1134    pub fn load_stopped(&self) -> StopState {
1135        self.stop_state.load(Ordering::Relaxed)
1136    }
1137
1138    /// Upgrade a reference to a Task, returning an ESRCH errno if the reference cannot be borrowed.
1139    pub fn from_weak(weak: &WeakRef<Task>) -> Result<TempRef<'_, Task>, Errno> {
1140        weak.upgrade().ok_or_else(|| errno!(ESRCH))
1141    }
1142
1143    /// Internal function for creating a Task object. Useful when you need to specify the value of
1144    /// every field. create_process and create_thread are more likely to be what you want.
1145    ///
1146    /// Any fields that should be initialized fresh for every task, even if the task was created
1147    /// with fork, are initialized to their defaults inside this function. All other fields are
1148    /// passed as parameters.
1149    #[allow(clippy::let_and_return)]
1150    pub fn new(
1151        tid: tid_t,
1152        command: TaskCommand,
1153        thread_group: Arc<ThreadGroup>,
1154        thread: Option<zx::Thread>,
1155        files: FdTable,
1156        mm: Option<Arc<MemoryManager>>,
1157        // The only case where fs should be None is when building the initial task that is
1158        // used to build the initial FsContext.
1159        fs: Arc<FsContext>,
1160        creds: Credentials,
1161        abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,
1162        abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,
1163        signal_mask: SigSet,
1164        kernel_signals: VecDeque<KernelSignal>,
1165        vfork_event: Option<Arc<zx::Event>>,
1166        scheduler_state: SchedulerState,
1167        uts_ns: UtsNamespaceHandle,
1168        no_new_privs: bool,
1169        seccomp_filter_state: SeccompState,
1170        seccomp_filters: SeccompFilterContainer,
1171        robust_list_head: RobustListHeadPtr,
1172        timerslack_ns: u64,
1173        security_state: security::TaskState,
1174    ) -> OwnedRef<Self> {
1175        let thread_group_key = ThreadGroupKey::from(&thread_group);
1176        OwnedRef::new_cyclic(|weak_self| {
1177            let task = Task {
1178                weak_self,
1179                tid,
1180                thread_group_key: thread_group_key.clone(),
1181                kernel: Arc::clone(&thread_group.kernel),
1182                thread_group,
1183                thread: RwLock::new(thread.map(Arc::new)),
1184                files,
1185                mm: RcuOptionArc::new(mm),
1186                fs: RcuOptionArc::new(Some(fs)),
1187                abstract_socket_namespace,
1188                abstract_vsock_namespace,
1189                vfork_event,
1190                stop_state: AtomicStopState::new(StopState::Awake),
1191                flags: AtomicTaskFlags::new(TaskFlags::empty()),
1192                mutable_state: RwLock::new(TaskMutableState {
1193                    clear_child_tid: UserRef::default(),
1194                    signals: SignalState::with_mask(signal_mask),
1195                    kernel_signals,
1196                    exit_status: None,
1197                    scheduler_state,
1198                    uts_ns,
1199                    no_new_privs,
1200                    oom_score_adj: Default::default(),
1201                    seccomp_filters,
1202                    robust_list_head,
1203                    timerslack_ns,
1204                    // The default timerslack is set to the current timerslack of the creating thread.
1205                    default_timerslack_ns: timerslack_ns,
1206                    ptrace: None,
1207                    captured_thread_state: None,
1208                }),
1209                persistent_info: TaskPersistentInfoState::new(
1210                    tid,
1211                    thread_group_key,
1212                    command,
1213                    creds,
1214                ),
1215                seccomp_filter_state,
1216                trace_syscalls: AtomicBool::new(false),
1217                proc_pid_directory_cache: Mutex::new(None),
1218                security_state,
1219            };
1220
1221            #[cfg(any(test, debug_assertions))]
1222            {
1223                // Note that `Kernel::pids` is already locked by the caller of `Task::new()`.
1224                let _l1 = task.read();
1225                let _l2 = task.persistent_info.real_creds();
1226                let _l3 = task.persistent_info.command_guard();
1227            }
1228            task
1229        })
1230    }
1231
1232    state_accessor!(Task, mutable_state);
1233
1234    pub fn add_file<L>(
1235        &self,
1236        locked: &mut Locked<L>,
1237        file: FileHandle,
1238        flags: FdFlags,
1239    ) -> Result<FdNumber, Errno>
1240    where
1241        L: LockEqualOrBefore<FileOpsCore>,
1242    {
1243        self.files.add_with_flags(locked.cast_locked::<FileOpsCore>(), self, file, flags)
1244    }
1245
1246    /// Returns the real credentials of the task. These credentials are used to check permissions
1247    /// for actions performed on the task. If the task itself is performing an action, use
1248    /// `CurrentTask::current_creds` instead.
1249    pub fn real_creds(&self) -> Credentials {
1250        self.persistent_info.real_creds().clone()
1251    }
1252
1253    pub fn with_real_creds<B, F>(&self, f: F) -> B
1254    where
1255        F: FnOnce(&Credentials) -> B,
1256    {
1257        f(&self.persistent_info.real_creds())
1258    }
1259
1260    pub fn ptracer_task(&self) -> WeakRef<Task> {
1261        let ptracer = {
1262            let state = self.read();
1263            state.ptrace.as_ref().map(|p| p.core_state.pid)
1264        };
1265
1266        let Some(ptracer) = ptracer else {
1267            return WeakRef::default();
1268        };
1269
1270        self.get_task(ptracer)
1271    }
1272
1273    pub fn fs(&self) -> Arc<FsContext> {
1274        self.fs.to_option_arc().expect("fs must be set")
1275    }
1276
1277    pub fn has_shared_fs(&self) -> bool {
1278        let maybe_fs = self.fs.to_option_arc();
1279        // This check is incorrect because someone else could be holding a temporary Arc to the
1280        // FsContext and therefore increasing the strong count.
1281        maybe_fs.is_some_and(|fs| Arc::strong_count(&fs) > 2usize)
1282    }
1283
1284    #[track_caller]
1285    pub fn mm(&self) -> Result<Arc<MemoryManager>, Errno> {
1286        self.mm.to_option_arc().ok_or_else(|| errno!(EINVAL))
1287    }
1288
1289    pub fn unshare_fs(&self) {
1290        let fs = self.fs().fork();
1291        self.fs.update(Some(fs));
1292    }
1293
1294    /// Modify the given elements of the scheduler state with new values and update the
1295    /// task's thread's role.
1296    pub(crate) fn set_scheduler_policy_priority_and_reset_on_fork(
1297        &self,
1298        policy: SchedulingPolicy,
1299        priority: RealtimePriority,
1300        reset_on_fork: bool,
1301    ) -> Result<(), Errno> {
1302        self.update_scheduler_state_then_role(|scheduler_state| {
1303            scheduler_state.policy = policy;
1304            scheduler_state.realtime_priority = priority;
1305            scheduler_state.reset_on_fork = reset_on_fork;
1306        })
1307    }
1308
1309    /// Modify the scheduler state's priority and update the task's thread's role.
1310    pub(crate) fn set_scheduler_priority(&self, priority: RealtimePriority) -> Result<(), Errno> {
1311        self.update_scheduler_state_then_role(|scheduler_state| {
1312            scheduler_state.realtime_priority = priority
1313        })
1314    }
1315
1316    /// Modify the scheduler state's nice and update the task's thread's role.
1317    pub(crate) fn set_scheduler_nice(&self, nice: NormalPriority) -> Result<(), Errno> {
1318        self.update_scheduler_state_then_role(|scheduler_state| {
1319            scheduler_state.normal_priority = nice
1320        })
1321    }
1322
1323    /// Overwrite the existing scheduler state with a new one and update the task's thread's role.
1324    pub fn set_scheduler_state(&self, scheduler_state: SchedulerState) -> Result<(), Errno> {
1325        self.update_scheduler_state_then_role(|task_scheduler_state| {
1326            *task_scheduler_state = scheduler_state
1327        })
1328    }
1329
1330    /// Update the task's thread's role based on its current scheduler state without making any
1331    /// changes to the state.
1332    ///
1333    /// This should be called on tasks that have newly created threads, e.g. after cloning.
1334    pub fn sync_scheduler_state_to_role(&self) -> Result<(), Errno> {
1335        self.update_scheduler_state_then_role(|_| {})
1336    }
1337
1338    fn update_scheduler_state_then_role(
1339        &self,
1340        updater: impl FnOnce(&mut SchedulerState),
1341    ) -> Result<(), Errno> {
1342        let new_scheduler_state = {
1343            // Hold the task state lock as briefly as possible, it's not needed to update the role.
1344            let mut state = self.write();
1345            updater(&mut state.scheduler_state);
1346            state.scheduler_state
1347        };
1348        self.thread_group().kernel.scheduler.set_thread_role(self, new_scheduler_state)?;
1349        Ok(())
1350    }
1351
1352    /// Signals the vfork event, if any, to unblock waiters.
1353    pub fn signal_vfork(&self) {
1354        if let Some(event) = &self.vfork_event {
1355            if let Err(status) = event.signal_handle(Signals::NONE, Signals::USER_0) {
1356                log_warn!("Failed to set vfork signal {status}");
1357            }
1358        };
1359    }
1360
1361    /// Blocks the caller until the task has exited or executed execve(). This is used to implement
1362    /// vfork() and clone(... CLONE_VFORK, ...). The task must have been created with CLONE_VFORK.
1363    pub fn wait_for_execve(&self, task_to_wait: WeakRef<Task>) -> Result<(), Errno> {
1364        let event = task_to_wait.upgrade().and_then(|t| t.vfork_event.clone());
1365        if let Some(event) = event {
1366            event
1367                .wait_handle(zx::Signals::USER_0, zx::MonotonicInstant::INFINITE)
1368                .map_err(|status| from_status_like_fdio!(status))?;
1369        }
1370        Ok(())
1371    }
1372
1373    /// If needed, clear the child tid for this task.
1374    ///
1375    /// Userspace can ask us to clear the child tid and issue a futex wake at
1376    /// the child tid address when we tear down a task. For example, bionic
1377    /// uses this mechanism to implement pthread_join. The thread that calls
1378    /// pthread_join sleeps using FUTEX_WAIT on the child tid address. We wake
1379    /// them up here to let them know the thread is done.
1380    pub fn clear_child_tid_if_needed<L>(&self, locked: &mut Locked<L>) -> Result<(), Errno>
1381    where
1382        L: LockBefore<TerminalLock>,
1383    {
1384        let mut state = self.write();
1385        let user_tid = state.clear_child_tid;
1386        if !user_tid.is_null() {
1387            let zero: tid_t = 0;
1388            self.write_object(user_tid, &zero)?;
1389            self.kernel().shared_futexes.wake(
1390                locked,
1391                self,
1392                user_tid.addr(),
1393                usize::MAX,
1394                FUTEX_BITSET_MATCH_ANY,
1395            )?;
1396            state.clear_child_tid = UserRef::default();
1397        }
1398        Ok(())
1399    }
1400
1401    pub fn get_task(&self, tid: tid_t) -> WeakRef<Task> {
1402        self.kernel().pids.read().get_task(tid)
1403    }
1404
1405    pub fn get_pid(&self) -> pid_t {
1406        self.thread_group_key.pid()
1407    }
1408
1409    pub fn get_tid(&self) -> tid_t {
1410        self.tid
1411    }
1412
1413    pub fn is_leader(&self) -> bool {
1414        self.get_pid() == self.get_tid()
1415    }
1416
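    /// Reads the command line of this task from its memory manager.
    ///
    /// The arguments live in the `[argv_start, argv_end)` range tracked by the memory manager as
    /// a sequence of NUL-separated strings; at most `max_len` bytes are read. Returns an empty
    /// list for kernel threads, which have no user address space.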
1417    pub fn read_argv(&self, max_len: usize) -> Result<Vec<FsString>, Errno> {
1418        // argv is empty for kthreads
1419        let Ok(mm) = self.mm() else {
1420            return Ok(vec![]);
1421        };
1422        let (argv_start, argv_end) = {
1423            let mm_state = mm.state.read();
1424            (mm_state.argv_start, mm_state.argv_end)
1425        };
1426
1427        let len_to_read = std::cmp::min(argv_end - argv_start, max_len);
1428        self.read_nul_delimited_c_string_list(argv_start, len_to_read)
1429    }
1430
1431    pub fn read_argv0(&self) -> Result<FsString, Errno> {
1432        // argv is empty for kthreads
1433        let Ok(mm) = self.mm() else {
1434            return Ok(FsString::default());
1435        };
1436        let argv_start = {
1437            let mm_state = mm.state.read();
1438            mm_state.argv_start
1439        };
1440        // Assuming a 64-bit arch width is fine for a type that is just u8s on all arches.
1441        let argv_start = UserCString::new(&ArchWidth::Arch64, argv_start);
1442        self.read_path(argv_start)
1443    }
1444
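    /// Reads the environment of this task from the `[environ_start, environ_end)` range tracked
    /// by the memory manager, reading at most `max_len` bytes of NUL-separated strings. Returns
    /// an empty list for kernel threads.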
1445    pub fn read_env(&self, max_len: usize) -> Result<Vec<FsString>, Errno> {
1446        // environment is empty for kthreads
1447        let Ok(mm) = self.mm() else { return Ok(vec![]) };
1448        let (env_start, env_end) = {
1449            let mm_state = mm.state.read();
1450            (mm_state.environ_start, mm_state.environ_end)
1451        };
1452
1453        let len_to_read = std::cmp::min(env_end - env_start, max_len);
1454        self.read_nul_delimited_c_string_list(env_start, len_to_read)
1455    }
1456
1457    pub fn thread_runtime_info(&self) -> Result<zx::TaskRuntimeInfo, Errno> {
1458        self.thread
1459            .read()
1460            .as_ref()
1461            .ok_or_else(|| errno!(EINVAL))?
1462            .get_runtime_info()
1463            .map_err(|status| from_status_like_fdio!(status))
1464    }
1465
1466    pub fn real_fscred(&self) -> FsCred {
1467        self.with_real_creds(|creds| creds.as_fscred())
1468    }
1469
1470    /// Interrupts the current task.
1471    ///
1472    /// This will interrupt any blocking syscall if the task is blocked on one.
1473    /// The signal_state of the task must not be locked.
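    ///
    /// A minimal usage sketch (illustrative; `target` is a hypothetical `&Task` that may be
    /// parked in an interruptible wait):
    ///
    /// ```ignore
    /// // Wake the task so it returns from its blocking syscall and re-examines pending work,
    /// // e.g. a freshly queued signal.
    /// target.interrupt();
    /// ```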
1474    pub fn interrupt(&self) {
1475        self.read().signals.run_state.wake();
1476        if let Some(thread) = self.thread.read().as_ref() {
1477            #[allow(
1478                clippy::undocumented_unsafe_blocks,
1479                reason = "Force documented unsafe blocks in Starnix"
1480            )]
1481            let status = unsafe { zx::sys::zx_restricted_kick(thread.raw_handle(), 0) };
1482            if status != zx::sys::ZX_OK {
1483                // zx_restricted_kick() could return ZX_ERR_BAD_STATE if the target thread is
1484                // already in the DYING or DEAD states. That's fine: it means the task is in
1485                // the process of tearing down, so allow it.
1486                assert_eq!(status, zx::sys::ZX_ERR_BAD_STATE);
1487            }
1488        }
1489    }
1490
1491    pub fn command(&self) -> TaskCommand {
1492        self.persistent_info.command.lock().clone()
1493    }
1494
1495    pub fn set_command_name(&self, mut new_name: TaskCommand) {
1496        // If we're going to update the process name, see if we can get a longer one than normally
1497        // provided in the Linux uapi. Only choose the argv0-based name if it's a superset of the
1498        // uapi-provided name to avoid clobbering the name provided by the user.
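        // For example (hypothetical values): if the uapi-provided name is the truncated
        // "longrunning_wor" and argv[0] is "/system/bin/longrunning_worker", the longer
        // argv[0]-derived name is preferred; if argv[0] does not contain the uapi name,
        // the user-provided name is kept untouched.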
1499        if let Ok(argv0) = self.read_argv0() {
1500            let argv0 = TaskCommand::from_path_bytes(&argv0);
1501            if let Some(embedded_name) = argv0.try_embed(&new_name) {
1502                new_name = embedded_name;
1503            }
1504        }
1505
1506        // Acquire this before modifying Zircon state to ensure consistency under concurrent access.
1507        // Ideally this would also guard the argv[0] read above, but we can't due to lock
1508        // cycles with SELinux checks.
1509        let mut command_guard = self.persistent_info.command_guard();
1510
1511        // Set the name on the Linux thread.
1512        if let Some(thread) = self.thread.read().as_ref() {
1513            set_zx_name(&**thread, new_name.as_bytes());
1514        }
1515
1516        // If this is the thread group leader, use this name for the process too.
1517        if self.is_leader() {
1518            set_zx_name(&self.thread_group().process, new_name.as_bytes());
1519            let _ = zx::Thread::raise_user_exception(
1520                zx::RaiseExceptionOptions::TARGET_JOB_DEBUGGER,
1521                zx::sys::ZX_EXCP_USER_CODE_PROCESS_NAME_CHANGED,
1522                0,
1523            );
1524        }
1525
1526        // Avoid a lock cycle by dropping the guard before notifying memory attribution of the
1527        // change.
1528        *command_guard = new_name;
1529        drop(command_guard);
1530
1531        if self.is_leader() {
1532            if let Some(notifier) = &self.thread_group().read().notifier {
1533                let _ = notifier.send(MemoryAttributionLifecycleEvent::name_change(self.tid));
1534            }
1535        }
1536    }
1537
1538    pub fn set_seccomp_state(&self, state: SeccompStateValue) -> Result<(), Errno> {
1539        self.seccomp_filter_state.set(&state)
1540    }
1541
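    /// Computes the coarse state of this task: `Zombie` once an exit status has been recorded,
    /// `TracingStop` when it is stopped only for ptrace, `Sleeping` while blocked in a wait, and
    /// `Running` otherwise.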
1542    pub fn state_code(&self) -> TaskStateCode {
1543        let status = self.read();
1544        if status.exit_status.is_some() {
1545            TaskStateCode::Zombie
1546        } else if status.signals.run_state.is_blocked() {
1547            let stop_state = self.load_stopped();
1548            if stop_state.ptrace_only() && stop_state.is_stopped() {
1549                TaskStateCode::TracingStop
1550            } else {
1551                TaskStateCode::Sleeping
1552            }
1553        } else {
1554            TaskStateCode::Running
1555        }
1556    }
1557
1558    pub fn time_stats(&self) -> TaskTimeStats {
1559        use zx::Task;
1560        let info = match &*self.thread.read() {
1561            Some(thread) => thread.get_runtime_info().expect("Failed to get thread stats"),
1562            None => return TaskTimeStats::default(),
1563        };
1564
1565        TaskTimeStats {
1566            user_time: zx::MonotonicDuration::from_nanos(info.cpu_time),
1567            // TODO(https://fxbug.dev/42078242): How can we calculate system time?
1568            system_time: zx::MonotonicDuration::default(),
1569        }
1570    }
1571
1572    pub fn get_signal_action(&self, signal: Signal) -> sigaction_t {
1573        self.thread_group().signal_actions.get(signal)
1574    }
1575
1576    pub fn record_pid_koid_mapping(&self) {
1577        let Some(ref mapping_table) = *self.kernel().pid_to_koid_mapping.read() else { return };
1578
1579        let pkoid = self.thread_group().get_process_koid().ok();
1580        let tkoid = self.thread.read().as_ref().and_then(|t| t.get_koid().ok());
1581        mapping_table.write().insert(self.tid, KoidPair { process: pkoid, thread: tkoid });
1582    }
1583}
1584
1585impl Releasable for Task {
1586    type Context<'a> =
1587        (Box<ThreadState>, &'a mut Locked<TaskRelease>, RwLockWriteGuard<'a, PidTable>);
1588
1589    fn release<'a>(mut self, context: Self::Context<'a>) {
1590        let (thread_state, locked, pids) = context;
1591
1592        *self.proc_pid_directory_cache.get_mut() = None;
1593        self.ptrace_disconnect(&pids);
1594
1595        std::mem::drop(pids);
1596
1597        self.files.release();
1598
1599        self.signal_vfork();
1600
1601        // Drop fields that can end up owning a FsNode to ensure no FsNodes are owned by this task.
1602        self.fs.update(None);
1603        self.mm.update(None);
1604
1605        // Rebuild a temporary CurrentTask to run the release actions that require a CurrentTask.
1606        let current_task = CurrentTask::new(OwnedRef::new(self), thread_state);
1607
1608        // Apply any delayed releasers left.
1609        current_task.trigger_delayed_releaser(locked);
1610
1611        // Drop the task now that it has been released. This requires taking it out of the
1612        // OwnedRef and then out of the resulting ReleaseGuard.
1613        let CurrentTask { mut task, .. } = current_task;
1614        let task = OwnedRef::take(&mut task).expect("task should not have been re-owned");
1615        let _task: Self = ReleaseGuard::take(task);
1616    }
1617}
1618
1619impl MemoryAccessor for Task {
1620    fn read_memory<'a>(
1621        &self,
1622        addr: UserAddress,
1623        bytes: &'a mut [MaybeUninit<u8>],
1624    ) -> Result<&'a mut [u8], Errno> {
1625        // Using a `Task` to read memory generally indicates that the memory
1626        // is being read from a task different than the `CurrentTask`. When
1627        // this `Task` is not current, its address space is not mapped
1628        // so we need to go through the VMO.
1629        self.mm()?.syscall_read_memory(addr, bytes)
1630    }
1631
1632    fn read_memory_partial_until_null_byte<'a>(
1633        &self,
1634        addr: UserAddress,
1635        bytes: &'a mut [MaybeUninit<u8>],
1636    ) -> Result<&'a mut [u8], Errno> {
1637        // Using a `Task` to read memory generally indicates that the memory
1638        // is being read from a task different than the `CurrentTask`. When
1639        // this `Task` is not current, its address space is not mapped
1640        // so we need to go through the VMO.
1641        self.mm()?.syscall_read_memory_partial_until_null_byte(addr, bytes)
1642    }
1643
1644    fn read_memory_partial<'a>(
1645        &self,
1646        addr: UserAddress,
1647        bytes: &'a mut [MaybeUninit<u8>],
1648    ) -> Result<&'a mut [u8], Errno> {
1649        // Using a `Task` to read memory generally indicates that the memory
1650        // is being read from a task different than the `CurrentTask`. When
1651        // this `Task` is not current, its address space is not mapped
1652        // so we need to go through the VMO.
1653        self.mm()?.syscall_read_memory_partial(addr, bytes)
1654    }
1655
1656    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
1657        // Using a `Task` to write memory generally indicates that the memory
1658        // is being written to a task different than the `CurrentTask`. When
1659        // this `Task` is not current, its address space is not mapped
1660        // so we need to go through the VMO.
1661        self.mm()?.syscall_write_memory(addr, bytes)
1662    }
1663
1664    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
1665        // Using a `Task` to write memory generally indicates that the memory
1666        // is being written to a task different than the `CurrentTask`. When
1667        // this `Task` is not current, its address space is not mapped
1668        // so we need to go through the VMO.
1669        self.mm()?.syscall_write_memory_partial(addr, bytes)
1670    }
1671
1672    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
1673        // Using a `Task` to zero memory generally indicates that the memory
1674        // is being zeroed from a task different than the `CurrentTask`. When
1675        // this `Task` is not current, its address space is not mapped
1676        // so we need to go through the VMO.
1677        self.mm()?.syscall_zero(addr, length)
1678    }
1679}
1680
1681impl TaskMemoryAccessor for Task {
1682    fn maximum_valid_address(&self) -> Option<UserAddress> {
1683        self.mm().map(|mm| mm.maximum_valid_user_address).ok()
1684    }
1685}
1686
1687impl fmt::Debug for Task {
1688    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1689        write!(
1690            f,
1691            "{}:{}[{}]",
1692            self.thread_group().leader,
1693            self.tid,
1694            self.persistent_info.command.lock()
1695        )
1696    }
1697}
1698
1699impl cmp::PartialEq for Task {
1700    fn eq(&self, other: &Self) -> bool {
1701        let ptr: *const Task = self;
1702        let other_ptr: *const Task = other;
1703        ptr == other_ptr
1704    }
1705}
1706
1707impl cmp::Eq for Task {}
1708
1709#[cfg(test)]
1710mod test {
1711    use super::*;
1712    use crate::testing::*;
1713    use starnix_uapi::auth::{CAP_SYS_ADMIN, Capabilities};
1714    use starnix_uapi::resource_limits::Resource;
1715    use starnix_uapi::signals::SIGCHLD;
1716    use starnix_uapi::{CLONE_SIGHAND, CLONE_THREAD, CLONE_VM, rlimit};
1717
1718    #[::fuchsia::test]
1719    async fn test_tid_allocation() {
1720        spawn_kernel_and_run(async |locked, current_task| {
1721            let kernel = current_task.kernel();
1722            assert_eq!(current_task.get_tid(), 1);
1723            let another_current = create_task(locked, &kernel, "another-task");
1724            let another_tid = another_current.get_tid();
1725            assert!(another_tid >= 2);
1726
1727            let pids = kernel.pids.read();
1728            assert_eq!(pids.get_task(1).upgrade().unwrap().get_tid(), 1);
1729            assert_eq!(pids.get_task(another_tid).upgrade().unwrap().get_tid(), another_tid);
1730        })
1731        .await;
1732    }
1733
1734    #[::fuchsia::test]
1735    async fn test_clone_pid_and_parent_pid() {
1736        spawn_kernel_and_run(async |locked, current_task| {
1737            let thread = current_task.clone_task_for_test(
1738                locked,
1739                (CLONE_THREAD | CLONE_VM | CLONE_SIGHAND) as u64,
1740                Some(SIGCHLD),
1741            );
1742            assert_eq!(current_task.get_pid(), thread.get_pid());
1743            assert_ne!(current_task.get_tid(), thread.get_tid());
1744            assert_eq!(current_task.thread_group().leader, thread.thread_group().leader);
1745
1746            let child_task = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
1747            assert_ne!(current_task.get_pid(), child_task.get_pid());
1748            assert_ne!(current_task.get_tid(), child_task.get_tid());
1749            assert_eq!(current_task.get_pid(), child_task.thread_group().read().get_ppid());
1750        })
1751        .await;
1752    }
1753
1754    #[::fuchsia::test]
1755    async fn test_root_capabilities() {
1756        spawn_kernel_and_run(async |_, current_task| {
1757            assert!(security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN));
1758            assert_eq!(current_task.real_creds().cap_inheritable, Capabilities::empty());
1759
1760            current_task.set_creds(Credentials::with_ids(1, 1));
1761            assert!(!security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN));
1762        })
1763        .await;
1764    }
1765
1766    #[::fuchsia::test]
1767    async fn test_clone_rlimit() {
1768        spawn_kernel_and_run(async |locked, current_task| {
1769            let prev_fsize = current_task.thread_group().get_rlimit(locked, Resource::FSIZE);
1770            assert_ne!(prev_fsize, 10);
1771            current_task
1772                .thread_group()
1773                .limits
1774                .lock(locked)
1775                .set(Resource::FSIZE, rlimit { rlim_cur: 10, rlim_max: 100 });
1776            let current_fsize = current_task.thread_group().get_rlimit(locked, Resource::FSIZE);
1777            assert_eq!(current_fsize, 10);
1778
1779            let child_task = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
1780            let child_fsize = child_task.thread_group().get_rlimit(locked, Resource::FSIZE);
1781            assert_eq!(child_fsize, 10)
1782        })
1783        .await;
1784    }
1785}