Skip to main content

starnix_core/task/
task.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mm::{MemoryAccessor, MemoryAccessorExt, MemoryManager, TaskMemoryAccessor};
6use crate::mutable_state::{state_accessor, state_implementation};
7use crate::ptrace::{
8    AtomicStopState, PtraceEvent, PtraceEventData, PtraceState, PtraceStatus, StopState,
9};
10use crate::signals::{KernelSignal, SignalDetail, SignalInfo, SignalState};
11use crate::task::memory_attribution::MemoryAttributionLifecycleEvent;
12use crate::task::run_state::RunState;
13use crate::task::tracing::KoidPair;
14use crate::task::{
15    AbstractUnixSocketNamespace, AbstractVsockSocketNamespace, CurrentCreds, CurrentTask,
16    EventHandler, Kernel, NormalPriority, ProcessExitInfo, RealtimePriority, SchedulerState,
17    SchedulingPolicy, SeccompFilterContainer, SeccompState, SeccompStateValue, TaskRunningState,
18    ThreadGroup, ThreadGroupKey, ThreadState, UtsNamespaceHandle, WaitCanceler, Waiter,
19    ZombieProcess,
20};
21use crate::vfs::{FdTable, FsContext, FsString};
22use atomic_bitflags::atomic_bitflags;
23use fuchsia_rcu::{RcuArc, RcuOptionArc, RcuOptionBox, RcuReadGuard};
24use macro_rules_attribute::apply;
25use starnix_logging::{log_warn, set_zx_name};
26use starnix_registers::HeapRegs;
27use starnix_sync::{
28    FutexTableStateLock, LockBefore, LockDepGuard, LockDepMutex, Locked, RwLock, RwLockReadGuard,
29    RwLockWriteGuard, TaskCommandLevel,
30};
31use starnix_task_command::TaskCommand;
32use starnix_types::arch::ArchWidth;
33use starnix_types::stats::TaskTimeStats;
34use starnix_uapi::auth::{Credentials, FsCred};
35use starnix_uapi::errors::Errno;
36use starnix_uapi::signals::{SIGCHLD, SigSet, Signal, sigaltstack_contains_pointer};
37use starnix_uapi::user_address::{
38    ArchSpecific, MappingMultiArchUserRef, UserAddress, UserCString, UserRef,
39};
40use starnix_uapi::{
41    CLD_CONTINUED, CLD_DUMPED, CLD_EXITED, CLD_KILLED, CLD_STOPPED, CLD_TRAPPED,
42    FUTEX_BITSET_MATCH_ANY, errno, error, from_status_like_fdio, pid_t, sigaction_t, sigaltstack,
43    tid_t, uapi,
44};
45use std::collections::VecDeque;
46use std::mem::MaybeUninit;
47use std::ops::Deref;
48use std::sync::atomic::{AtomicBool, Ordering};
49use std::sync::{Arc, Weak};
50use std::{cmp, fmt};
51use zx::{Signals, Task as _};
52
/// How a task exited, stopped, or resumed, in the form reported to waiters.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ExitStatus {
    /// Normal exit carrying the exit code byte (reported as `CLD_EXITED`).
    Exit(u8),
    /// Terminated by the signal in the `SignalInfo` (reported as `CLD_KILLED`).
    Kill(SignalInfo),
    /// Terminated by the signal in the `SignalInfo`, reported as `CLD_DUMPED` with the 0x80 bit
    /// set in the wait status.
    CoreDump(SignalInfo),
    // The second field for Stop and Continue contains the type of ptrace stop
    // event that made it stop / continue, if applicable (PTRACE_EVENT_STOP,
    // PTRACE_EVENT_FORK, etc)
    /// Stopped by the signal in the `SignalInfo` (reported as `CLD_STOPPED`).
    Stop(SignalInfo, PtraceEvent),
    /// Resumed (reported as `CLD_CONTINUED`).
    Continue(SignalInfo, PtraceEvent),
}
64impl ExitStatus {
65    /// Converts the given exit status to a status code suitable for returning from wait syscalls.
66    pub fn wait_status(&self) -> i32 {
67        match self {
68            ExitStatus::Exit(status) => (*status as i32) << 8,
69            ExitStatus::Kill(siginfo) => siginfo.signal.number() as i32,
70            ExitStatus::CoreDump(siginfo) => (siginfo.signal.number() as i32) | 0x80,
71            ExitStatus::Continue(siginfo, trace_event) => {
72                let trace_event_val = *trace_event as u32;
73                if trace_event_val != 0 {
74                    (siginfo.signal.number() as i32) | (trace_event_val << 16) as i32
75                } else {
76                    0xffff
77                }
78            }
79            ExitStatus::Stop(siginfo, trace_event) => {
80                let trace_event_val = *trace_event as u32;
81                (0x7f + ((siginfo.signal.number() as i32) << 8)) | (trace_event_val << 16) as i32
82            }
83        }
84    }
85
86    pub fn signal_info_code(&self) -> i32 {
87        match self {
88            ExitStatus::Exit(_) => CLD_EXITED as i32,
89            ExitStatus::Kill(_) => CLD_KILLED as i32,
90            ExitStatus::CoreDump(_) => CLD_DUMPED as i32,
91            ExitStatus::Stop(_, _) => CLD_STOPPED as i32,
92            ExitStatus::Continue(_, _) => CLD_CONTINUED as i32,
93        }
94    }
95
96    pub fn signal_info_status(&self) -> i32 {
97        match self {
98            ExitStatus::Exit(status) => *status as i32,
99            ExitStatus::Kill(siginfo)
100            | ExitStatus::CoreDump(siginfo)
101            | ExitStatus::Continue(siginfo, _)
102            | ExitStatus::Stop(siginfo, _) => siginfo.signal.number() as i32,
103        }
104    }
105}
106
// Status bits stored on the task and updated through `update_flags` / `set_flags` on the
// task's mutable state.
atomic_bitflags! {
    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct TaskFlags: u8 {
        /// Set when an exit status is recorded for the task.
        const EXITED                   = 1 << 0;
        /// Tracks whether the task's own signal state reports a pending signal.
        const SIGNALS_AVAILABLE        = 1 << 1;
        // NOTE(review): no writer of this flag is visible in this part of the file; presumably
        // it marks that a temporary signal mask is installed — confirm.
        const TEMPORARY_SIGNAL_MASK    = 1 << 2;
        /// Whether the executor should dump the stack of this task when it exits.
        /// Currently used to implement ExitStatus::CoreDump.
        const DUMP_ON_EXIT             = 1 << 3;
        /// Set when a kernel signal is enqueued; cleared when the kernel signal queue drains.
        const KERNEL_SIGNALS_AVAILABLE = 1 << 4;
        /// Whether the executor has successfully spawned a thread for this task.
        const SPAWNED                  = 1 << 5;
    }
}
121
/// This contains thread state that tracers can inspect and modify.  It is
/// captured when a thread stops, and optionally copied back (if dirty) when a
/// thread starts again.  An alternative implementation would involve the
/// tracers acting on thread state directly; however, this would involve sharing
/// CurrentTask structures across multiple threads, which goes against the
/// intent of the design of CurrentTask.
pub struct CapturedThreadState {
    /// The thread state of the traced task.  This is copied out when the thread
    /// stops.
    pub thread_state: ThreadState<HeapRegs>,

    /// Indicates that the last ptrace operation changed the thread state, so it
    /// should be written back to the original thread.
    ///
    /// A freshly captured snapshot starts with this set to `false`.
    pub dirty: bool,
}
137
/// The captured state reports the same architecture width as the thread state it wraps.
impl ArchSpecific for CapturedThreadState {
    /// Delegates to the captured `thread_state`.
    fn is_arch32(&self) -> bool {
        self.thread_state.is_arch32()
    }
}
143
/// Architecture-independent form of a userspace robust futex list entry.
#[derive(Debug)]
pub struct RobustList {
    /// Userspace pointer to the next entry in the list.
    pub next: RobustListPtr,
}
148
/// Userspace reference to a robust list entry, read as `uapi::robust_list` or
/// `uapi::arch32::robust_list` and surfaced as a [`RobustList`].
pub type RobustListPtr =
    MappingMultiArchUserRef<RobustList, uapi::robust_list, uapi::arch32::robust_list>;
151
152impl From<uapi::robust_list> for RobustList {
153    fn from(robust_list: uapi::robust_list) -> Self {
154        Self { next: RobustListPtr::from(robust_list.next) }
155    }
156}
157
#[cfg(target_arch = "aarch64")]
impl From<uapi::arch32::robust_list> for RobustList {
    /// Wraps the 32-bit entry's `next` pointer in a [`RobustListPtr`].
    fn from(robust_list: uapi::arch32::robust_list) -> Self {
        let next: RobustListPtr = robust_list.next.into();
        Self { next }
    }
}
164
/// Architecture-independent form of the userspace robust futex list head.
#[derive(Debug)]
pub struct RobustListHead {
    /// First entry of the list.
    pub list: RobustList,
    /// The `futex_offset` field of the userspace structure, widened to `isize`.
    pub futex_offset: isize,
}
170
/// Userspace reference to a robust list head, read as `uapi::robust_list_head` or
/// `uapi::arch32::robust_list_head` and surfaced as a [`RobustListHead`].
pub type RobustListHeadPtr =
    MappingMultiArchUserRef<RobustListHead, uapi::robust_list_head, uapi::arch32::robust_list_head>;
173
174impl From<uapi::robust_list_head> for RobustListHead {
175    fn from(robust_list_head: uapi::robust_list_head) -> Self {
176        Self {
177            list: robust_list_head.list.into(),
178            futex_offset: robust_list_head.futex_offset as isize,
179        }
180    }
181}
182
#[cfg(target_arch = "aarch64")]
impl From<uapi::arch32::robust_list_head> for RobustListHead {
    /// Converts the 32-bit list head, widening `futex_offset` to `isize`.
    fn from(robust_list_head: uapi::arch32::robust_list_head) -> Self {
        let list = RobustList::from(robust_list_head.list);
        let futex_offset = robust_list_head.futex_offset as isize;
        Self { list, futex_offset }
    }
}
192
/// The portion of a task's state that changes over its lifetime.
///
/// Private fields are read and written through the accessor methods defined on this type.
pub struct TaskMutableState {
    /// Presumably the address registered through `set_tid_address(2)` — confirm against callers.
    // See https://man7.org/linux/man-pages/man2/set_tid_address.2.html
    pub clear_child_tid: UserRef<tid_t>,

    /// Signal handler related state. This is grouped together for when atomicity is needed during
    /// signal sending and delivery.
    signals: SignalState,

    /// The current run state of the task.
    pub run_state: RunState,

    /// Internal signals that have a higher priority than a regular signal.
    ///
    /// Storing in a separate queue outside of `SignalState` ensures the internal signals will
    /// never be ignored or masked when dequeuing. Higher priority ensures that no user signals
    /// will jump the queue, e.g. ptrace, which delays the delivery.
    ///
    /// This design is not about observable consequence, but about convenient implementation.
    kernel_signals: VecDeque<KernelSignal>,

    /// The exit status that this task exited with.
    ///
    /// Written by `set_exit_status` and `set_exit_status_if_not_already`.
    exit_status: Option<ExitStatus>,

    /// Desired scheduler state for the task.
    pub scheduler_state: SchedulerState,

    /// The UTS namespace assigned to this thread.
    ///
    /// This field is kept in the mutable state because the UTS namespace of a thread
    /// can be forked using `clone()` or `unshare()` syscalls.
    ///
    /// We use UtsNamespaceHandle because the UTS properties can be modified
    /// by any other thread that shares this namespace.
    pub uts_ns: UtsNamespaceHandle,

    /// Bit that determines whether a newly started program can have privileges its parent does
    /// not have.  See Documentation/prctl/no_new_privs.txt in the Linux kernel for details.
    /// Note that Starnix does not currently implement the relevant privileges (e.g.,
    /// setuid/setgid binaries).  So, you can set this, but it does nothing other than get
    /// propagated to children.
    ///
    /// The documentation indicates that this can only ever be set to
    /// true, and it cannot be reverted to false.  Accessor methods
    /// for this field ensure this property.
    no_new_privs: bool,

    /// Userspace hint about how to adjust the OOM score for this process.
    pub oom_score_adj: i32,

    /// List of currently installed seccomp_filters
    pub seccomp_filters: SeccompFilterContainer,

    /// A pointer to the head of the robust futex list of this thread in
    /// userspace. See get_robust_list(2)
    pub robust_list_head: RobustListHeadPtr,

    /// The timer slack used to group timer expirations for the calling thread.
    ///
    /// Timers may expire up to `timerslack_ns` late, but never early.
    ///
    /// If this value is 0, the task's default timerslack is used.
    /// `set_timerslack_ns` stores `default_timerslack_ns` when asked to store 0.
    pub timerslack_ns: u64,

    /// The default value for `timerslack_ns`. This value cannot change during the lifetime of a
    /// task.
    ///
    /// This value is set to the `timerslack_ns` of the creating thread, and thus is not constant
    /// across tasks.
    pub default_timerslack_ns: u64,

    /// Information that a tracer needs to communicate with this process, if it
    /// is being traced.
    ///
    /// `is_ptraced` reports whether this is `Some`.
    pub ptrace: Option<Box<PtraceState>>,

    /// Information that a tracer needs to inspect this process.
    ///
    /// Filled in by `copy_state_from` and emptied by `take_captured_state`.
    pub captured_thread_state: Option<Box<CapturedThreadState>>,
}
270
271impl TaskMutableState {
272    pub fn no_new_privs(&self) -> bool {
273        self.no_new_privs
274    }
275
276    /// Sets the value of no_new_privs to true.  It is an error to set
277    /// it to anything else.
278    pub fn enable_no_new_privs(&mut self) {
279        self.no_new_privs = true;
280    }
281
282    pub fn get_timerslack<T: zx::Timeline>(&self) -> zx::Duration<T> {
283        zx::Duration::from_nanos(self.timerslack_ns as i64)
284    }
285
286    /// Sets the current timerslack of the task to `ns`.
287    ///
288    /// If `ns` is zero, the current timerslack gets reset to the task's default timerslack.
289    pub fn set_timerslack_ns(&mut self, ns: u64) {
290        if ns == 0 {
291            self.timerslack_ns = self.default_timerslack_ns;
292        } else {
293            self.timerslack_ns = ns;
294        }
295    }
296
297    pub fn is_ptraced(&self) -> bool {
298        self.ptrace.is_some()
299    }
300
301    pub fn is_ptrace_listening(&self) -> bool {
302        self.ptrace.as_ref().is_some_and(|ptrace| ptrace.stop_status == PtraceStatus::Listening)
303    }
304
305    pub fn ptrace_on_signal_consume(&mut self) -> bool {
306        self.ptrace.as_mut().is_some_and(|ptrace: &mut Box<PtraceState>| {
307            if ptrace.stop_status.is_continuing() {
308                ptrace.stop_status = PtraceStatus::Default;
309                false
310            } else {
311                true
312            }
313        })
314    }
315
316    pub fn notify_ptracers(&mut self) {
317        if let Some(ptrace) = &self.ptrace {
318            ptrace.tracer_waiters().notify_all();
319        }
320    }
321
322    pub fn wait_on_ptracer(&self, waiter: &Waiter) {
323        if let Some(ptrace) = &self.ptrace {
324            ptrace.tracee_waiters.wait_async(&waiter);
325        }
326    }
327
328    pub fn notify_ptracees(&mut self) {
329        if let Some(ptrace) = &self.ptrace {
330            ptrace.tracee_waiters.notify_all();
331        }
332    }
333
334    pub fn take_captured_state(&mut self) -> Option<Box<CapturedThreadState>> {
335        if self.captured_thread_state.is_some() {
336            let mut state = None;
337            std::mem::swap(&mut state, &mut self.captured_thread_state);
338            return state;
339        }
340        None
341    }
342
343    pub fn copy_state_from(&mut self, current_task: &CurrentTask) {
344        self.captured_thread_state = Some(Box::new(CapturedThreadState {
345            thread_state: current_task.thread_state.extended_snapshot::<HeapRegs>(),
346            dirty: false,
347        }));
348    }
349
350    /// Returns the task's currently active signal mask.
351    pub fn signal_mask(&self) -> SigSet {
352        self.signals.mask()
353    }
354
355    /// Returns true if `signal` is currently blocked by this task's signal mask.
356    pub fn is_signal_masked(&self, signal: Signal) -> bool {
357        self.signals.mask().has_signal(signal)
358    }
359
360    /// Returns true if `signal` is blocked by the saved signal mask.
361    ///
362    /// Note that the current signal mask may still not be blocking the signal.
363    pub fn is_signal_masked_by_saved_mask(&self, signal: Signal) -> bool {
364        self.signals.saved_mask().is_some_and(|mask| mask.has_signal(signal))
365    }
366
367    /// Removes the currently active, temporary, signal mask and restores the
368    /// previously active signal mask.
369    pub fn restore_signal_mask(&mut self) {
370        self.signals.restore_mask();
371    }
372
373    /// Returns true if the task's current `RunState` is blocked.
374    pub fn is_blocked(&self) -> bool {
375        self.run_state.is_blocked()
376    }
377
378    /// Sets the task's `RunState` to `run_state`.
379    pub fn set_run_state(&mut self, run_state: RunState) {
380        self.run_state = run_state;
381    }
382
383    pub fn run_state(&self) -> RunState {
384        self.run_state.clone()
385    }
386
387    pub fn on_signal_stack(&self, stack_pointer_register: u64) -> bool {
388        self.signals
389            .alt_stack
390            .map(|signal_stack| sigaltstack_contains_pointer(&signal_stack, stack_pointer_register))
391            .unwrap_or(false)
392    }
393
394    pub fn set_sigaltstack(&mut self, stack: Option<sigaltstack>) {
395        self.signals.alt_stack = stack;
396    }
397
398    pub fn sigaltstack(&self) -> Option<sigaltstack> {
399        self.signals.alt_stack
400    }
401
402    pub fn wait_on_signal(&mut self, waiter: &Waiter) {
403        self.signals.signal_wait.wait_async(waiter);
404    }
405
406    pub fn signals_mut(&mut self) -> &mut SignalState {
407        &mut self.signals
408    }
409
410    pub fn wait_on_signal_fd_events(
411        &self,
412        waiter: &Waiter,
413        mask: SigSet,
414        handler: EventHandler,
415    ) -> WaitCanceler {
416        self.signals.signal_wait.wait_async_signal_mask(waiter, mask, handler)
417    }
418
419    pub fn notify_signal_waiters(&self, signal: &Signal) {
420        self.signals.signal_wait.notify_signal(signal);
421    }
422
423    /// Thaw the task if has been frozen
424    pub fn thaw(&mut self) {
425        if let RunState::Frozen(waiter) = self.run_state() {
426            waiter.notify();
427        }
428    }
429
430    pub fn is_frozen(&self) -> bool {
431        matches!(self.run_state(), RunState::Frozen(_))
432    }
433
434    #[cfg(test)]
435    pub fn kernel_signals_for_test(&self) -> &VecDeque<KernelSignal> {
436        &self.kernel_signals
437    }
438}
439
#[apply(state_implementation!)]
impl TaskMutableState<Base = Task> {
    /// Moves the task to stop state `stopped`, recording the signal and ptrace event that
    /// caused the change.
    ///
    /// The request is dropped without effect when `stopped` is a ptrace-only state and the
    /// task is not traced, or when the transition from the current stop state is illegal.
    /// When the new state is a stopped one and `current_task` is supplied, the thread state
    /// is captured for tracers. Tracee waiters are woken on `Waking` / `ForceWaking`, and
    /// tracer waiters are woken unless the new state is an in-progress one.
    pub fn set_stopped(
        &mut self,
        stopped: StopState,
        siginfo: Option<SignalInfo>,
        current_task: Option<&CurrentTask>,
        event: Option<PtraceEventData>,
    ) {
        if stopped.ptrace_only() && self.ptrace.is_none() {
            return;
        }

        if self.base.load_stopped().is_illegal_transition(stopped) {
            return;
        }

        // TODO(https://g-issues.fuchsia.dev/issues/306438676): When task can be
        // stopped inside user code, task will need to be either restarted or
        // stopped here.
        self.store_stopped(stopped);
        if stopped.is_stopped() {
            if let Some(ref current_task) = current_task {
                self.copy_state_from(current_task);
            }
        }
        if let Some(ptrace) = &mut self.ptrace {
            ptrace.set_last_signal(siginfo);
            ptrace.set_last_event(event);
        }
        if stopped == StopState::Waking || stopped == StopState::ForceWaking {
            self.notify_ptracees();
        }
        if !stopped.is_in_progress() {
            self.notify_ptracers();
        }
    }

    /// Enqueues a signal at the back of the task's signal queue.
    pub fn enqueue_signal(&mut self, signal: SignalInfo) {
        self.signals.enqueue(signal);
        self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.signals.is_any_pending());
    }

    /// Enqueues the signal, allowing the signal to skip straight to the front of the task's queue.
    ///
    /// `enqueue_signal` is the more common API to use.
    ///
    /// Note that this will not guarantee that the signal is dequeued before any process-directed
    /// signals.
    // NOTE(review): this body is identical to `enqueue_signal`; confirm whether
    // `SignalState::enqueue` itself decides placement or whether a front-insertion call is
    // intended here.
    pub fn enqueue_signal_front(&mut self, signal: SignalInfo) {
        self.signals.enqueue(signal);
        self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.signals.is_any_pending());
    }

    /// Sets the current signal mask of the task.
    ///
    /// `SIGNALS_AVAILABLE` is refreshed afterwards because the mask change may alter which
    /// signals count as pending.
    pub fn set_signal_mask(&mut self, mask: SigSet) {
        self.signals.set_mask(mask);
        self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.signals.is_any_pending());
    }

    /// Sets a temporary signal mask for the task.
    ///
    /// This mask should be removed by a matching call to `restore_signal_mask`.
    pub fn set_temporary_signal_mask(&mut self, mask: SigSet) {
        self.signals.set_temporary_mask(mask);
        self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.signals.is_any_pending());
    }

    /// Returns the number of pending signals for this task, without considering the signal mask.
    ///
    /// Counts both the task's own queue and the thread group's queue.
    pub fn pending_signal_count(&self) -> usize {
        self.signals.num_queued() + self.base.thread_group().num_signals_queued()
    }

    /// Returns `true` if `signal` is pending for this task, without considering the signal mask.
    ///
    /// Checks both the task's own queue and the thread group's queue.
    pub fn has_signal_pending(&self, signal: Signal) -> bool {
        self.signals.has_queued(signal) || self.base.thread_group().has_signal_queued(signal)
    }

    /// Prepares the SIGCHLD `SignalInfo` to be sent to the tracer, if any.
    ///
    /// Returns `None` unless `stopped` is a stopped state, the task is traced, and the ptrace
    /// state has a last signal recorded. Otherwise returns the thread group handle taken from
    /// the ptrace core state together with a `CLD_TRAPPED` SIGCHLD whose status is the number
    /// of that last signal.
    pub fn prepare_signal_info(
        &mut self,
        stopped: StopState,
    ) -> Option<(Weak<ThreadGroup>, SignalInfo)> {
        if !stopped.is_stopped() {
            return None;
        }

        if let Some(ptrace) = &self.ptrace {
            if let Some(last_signal) = ptrace.get_last_signal_ref() {
                let signal_info = SignalInfo::with_detail(
                    SIGCHLD,
                    CLD_TRAPPED as i32,
                    SignalDetail::SIGCHLD {
                        pid: self.base.tid,
                        uid: self.base.real_creds().uid,
                        status: last_signal.signal.number() as i32,
                    },
                );

                return Some((ptrace.core_state.thread_group.clone(), signal_info));
            }
        }

        None
    }

    /// Installs (`Some`) or clears (`None`) the ptrace state of this task.
    ///
    /// Fails with `EPERM` when asked to install ptrace state while some is already present.
    /// When clearing, the thread group's in-progress stop state (if it has one) is applied
    /// to this task before the ptrace state is dropped.
    pub fn set_ptrace(&mut self, tracer: Option<Box<PtraceState>>) -> Result<(), Errno> {
        if tracer.is_some() && self.ptrace.is_some() {
            return error!(EPERM);
        }

        if tracer.is_none() {
            // Handle the case where this is called while the thread group is being released.
            if let Ok(tg_stop_state) = self.base.thread_group().load_stopped().as_in_progress() {
                self.set_stopped(tg_stop_state, None, None, None);
            }
        }
        self.ptrace = tracer;
        Ok(())
    }

    /// Returns true if the task is traced, is not waking or awake, and is not in the ptrace
    /// `Listening` state.
    pub fn can_accept_ptrace_commands(&mut self) -> bool {
        !self.base.load_stopped().is_waking_or_awake()
            && self.is_ptraced()
            && !self.is_ptrace_listening()
    }

    /// Stores `state` as the task's stop state.
    fn store_stopped(&mut self, state: StopState) {
        // We don't actually use the guard but we require it to enforce that the
        // caller holds the mutable state lock (identified by mutable access to
        // the mutable state).
        // NOTE(review): this comment previously named the thread group's lock, but the
        // receiver here is the task's mutable state — confirm which lock is intended.

        self.base.stop_state.store(state, Ordering::Relaxed)
    }

    /// Clears the bits in `clear` and sets the bits in `set` on the task's flags.
    ///
    /// `clear` and `set` must not overlap; this is checked in debug builds.
    pub fn update_flags(&mut self, clear: TaskFlags, set: TaskFlags) {
        // We don't actually use the guard but we require it to enforce that the
        // caller holds the task's mutable state lock (identified by mutable
        // access to the task's mutable state).

        // `clear ^ set == clear | set` holds exactly when the two sets share no bit.
        debug_assert_eq!(clear ^ set, clear | set);
        let observed = self.base.flags();
        let swapped = self.base.flags.swap((observed | set) & !clear, Ordering::Relaxed);
        // The value replaced by the swap must be the one just read, i.e. nothing else
        // modified the flags in between.
        debug_assert_eq!(swapped, observed);
    }

    /// Sets `flag` when `v` is true and clears it when `v` is false.
    pub fn set_flags(&mut self, flag: TaskFlags, v: bool) {
        let (clear, set) = if v { (TaskFlags::empty(), flag) } else { (flag, TaskFlags::empty()) };

        self.update_flags(clear, set);
    }

    /// Marks the task as spawned by setting `TaskFlags::SPAWNED`.
    pub fn set_spawned(&mut self) {
        self.set_flags(TaskFlags::SPAWNED, true);
    }

    /// Marks the task as exited and records `status`, replacing any earlier status.
    pub fn set_exit_status(&mut self, status: ExitStatus) {
        self.set_flags(TaskFlags::EXITED, true);
        self.exit_status = Some(status);
    }

    /// Marks the task as exited and records `status` only if no status was recorded before.
    pub fn set_exit_status_if_not_already(&mut self, status: ExitStatus) {
        self.set_flags(TaskFlags::EXITED, true);
        self.exit_status.get_or_insert(status);
    }

    /// The set of pending signals for the task, including the signals pending for the thread
    /// group.
    pub fn pending_signals(&self) -> SigSet {
        self.signals.pending() | self.base.thread_group().get_pending_signals()
    }

    /// The set of pending signals for the task specifically, not including the signals pending
    /// for the thread group.
    pub fn task_specific_pending_signals(&self) -> SigSet {
        self.signals.pending()
    }

    /// Returns true if any currently pending signal is allowed by `mask`.
    ///
    /// Both the task's own signals and the thread group's signals are considered.
    pub fn is_any_signal_allowed_by_mask(&self, mask: SigSet) -> bool {
        self.signals.is_any_allowed_by_mask(mask)
            || self.base.thread_group().is_any_signal_allowed_by_mask(mask)
    }

    /// Returns whether or not a signal is pending for this task, taking the current
    /// signal mask into account.
    pub fn is_any_signal_pending(&self) -> bool {
        let mask = self.signal_mask();
        self.signals.is_any_pending()
            || self.base.thread_group().is_any_signal_allowed_by_mask(mask)
    }

    /// Returns the next pending signal that passes `predicate`.
    ///
    /// The thread group's queue is consulted first. `SIGNALS_AVAILABLE` is refreshed only
    /// when the lookup falls through to the task's own queue.
    fn take_next_signal_where<F>(&mut self, predicate: F) -> Option<SignalInfo>
    where
        F: Fn(&SignalInfo) -> bool,
    {
        if let Some(signal) = self.base.thread_group().take_next_signal_where(&predicate) {
            Some(signal)
        } else {
            let s = self.signals.take_next_where(&predicate);
            self.set_flags(TaskFlags::SIGNALS_AVAILABLE, self.signals.is_any_pending());
            s
        }
    }

    /// Removes and returns the next pending `signal` for this task.
    ///
    /// Returns `None` if `siginfo` is a blocked signal, or no such signal is pending.
    pub fn take_specific_signal(&mut self, siginfo: SignalInfo) -> Option<SignalInfo> {
        let signal_mask = self.signal_mask();
        if signal_mask.has_signal(siginfo.signal) {
            return None;
        }

        let predicate = |s: &SignalInfo| s.signal == siginfo.signal;
        self.take_next_signal_where(predicate)
    }

    /// Removes and returns a pending signal that is unblocked by the current signal mask.
    ///
    /// Returns `None` if there are no unblocked signals pending.
    pub fn take_any_signal(&mut self) -> Option<SignalInfo> {
        self.take_signal_with_mask(self.signal_mask())
    }

    /// Removes and returns a pending signal that is unblocked by `signal_mask`.
    ///
    /// A signal whose `force` field is set is taken even if `signal_mask` blocks it.
    ///
    /// Returns `None` if there are no signals pending that are unblocked by `signal_mask`.
    pub fn take_signal_with_mask(&mut self, signal_mask: SigSet) -> Option<SignalInfo> {
        let predicate = |s: &SignalInfo| !signal_mask.has_signal(s.signal) || s.force;
        self.take_next_signal_where(predicate)
    }

    /// Enqueues an internal signal at the back of the task's kernel signal queue.
    pub fn enqueue_kernel_signal(&mut self, signal: KernelSignal) {
        self.kernel_signals.push_back(signal);
        self.set_flags(TaskFlags::KERNEL_SIGNALS_AVAILABLE, true);
    }

    /// Removes and returns a pending internal signal.
    ///
    /// Clears `KERNEL_SIGNALS_AVAILABLE` once the queue is empty.
    ///
    /// Returns `None` if there are no signals pending.
    pub fn take_kernel_signal(&mut self) -> Option<KernelSignal> {
        let signal = self.kernel_signals.pop_front();
        if self.kernel_signals.is_empty() {
            self.set_flags(TaskFlags::KERNEL_SIGNALS_AVAILABLE, false);
        }
        signal
    }

    /// Test helper: number of queued instances of `signal` across the task's own queue and
    /// the thread group's queue.
    #[cfg(test)]
    pub fn queued_signal_count(&self, signal: Signal) -> usize {
        self.signals.queued_count(signal)
            + self.base.thread_group().pending_signals.lock().queued_count(signal)
    }
}
698
/// Coarse run state of a task, with a one-character code and a human-readable name.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TaskStateCode {
    /// Task is being executed.
    Running,

    /// Task is waiting for an event.
    Sleeping,

    /// Tracing stop
    TracingStop,

    /// Task has exited.
    Zombie,
}

impl TaskStateCode {
    /// Returns the single-character code for this state.
    pub fn code_char(&self) -> char {
        match *self {
            Self::Running => 'R',
            Self::Sleeping => 'S',
            Self::TracingStop => 't',
            Self::Zombie => 'Z',
        }
    }

    /// Returns the lowercase, human-readable name of this state.
    pub fn name(&self) -> &'static str {
        match *self {
            Self::Running => "running",
            Self::Sleeping => "sleeping",
            Self::TracingStop => "tracing stop",
            Self::Zombie => "zombie",
        }
    }
}
733
/// The information of the task that needs to be available to the `ThreadGroup` while computing
/// which process a wait can target. It is necessary to share this data with the `ThreadGroup` so
/// that it is available while the task is being dropped and so is not accessible from a weak
/// pointer.
#[derive(Debug)]
pub struct TaskPersistentInfoState {
    /// Immutable information about the task
    tid: tid_t,
    /// Key of the thread group this task belongs to; `pid()` is derived from it.
    thread_group_key: ThreadGroupKey,

    /// The command of this task.
    command: LockDepMutex<TaskCommand, TaskCommandLevel>,

    /// The security credentials for this task. These are only set when the task is the CurrentTask,
    /// or on task creation.
    creds: RcuArc<Credentials>,

    // A lock for the security credentials. Writers must take the lock, readers that need to ensure
    // that the task state does not change may take the lock.
    creds_lock: RwLock<()>,
}
755
/// Guard for reading locked credentials.
///
/// Dereferences to the `Credentials` snapshot it holds.
pub struct CredentialsReadGuard<'a> {
    /// Read guard on the credentials lock, held for as long as this guard lives.
    _lock: RwLockReadGuard<'a, ()>,
    /// RCU read guard over the credentials.
    creds: RcuReadGuard<Credentials>,
}
761
762impl<'a> Deref for CredentialsReadGuard<'a> {
763    type Target = Credentials;
764
765    fn deref(&self) -> &Self::Target {
766        self.creds.deref()
767    }
768}
769
/// Guard for writing credentials. No `CredentialsReadGuard` to the same task can concurrently
/// exist.
pub struct CredentialsWriteGuard<'a> {
    /// Write guard on the credentials lock, held for as long as this guard lives.
    _lock: RwLockWriteGuard<'a, ()>,
    /// The credentials slot that `update` writes to.
    creds: &'a RcuArc<Credentials>,
}
776
impl<'a> CredentialsWriteGuard<'a> {
    /// Replaces the guarded credentials with `creds`.
    pub fn update(&mut self, creds: Arc<Credentials>) {
        self.creds.update(creds);
    }
}
782
783impl TaskPersistentInfoState {
784    fn new(
785        tid: tid_t,
786        thread_group_key: ThreadGroupKey,
787        command: TaskCommand,
788        creds: Arc<Credentials>,
789    ) -> TaskPersistentInfo {
790        Arc::new(Self {
791            tid,
792            thread_group_key,
793            command: LockDepMutex::new(command),
794            creds: RcuArc::new(creds),
795            creds_lock: RwLock::new(()),
796        })
797    }
798
799    pub fn tid(&self) -> tid_t {
800        self.tid
801    }
802
803    pub fn pid(&self) -> pid_t {
804        self.thread_group_key.pid()
805    }
806
807    pub fn command_guard(&self) -> LockDepGuard<'_, TaskCommand> {
808        self.command.lock()
809    }
810
811    /// Snapshots the credentials, returning a short-lived RCU-guarded reference.
812    pub fn real_creds(&self) -> RcuReadGuard<Credentials> {
813        self.creds.read()
814    }
815
816    /// Snapshots the credentials, returning a new reference. Use this if you need to stash the
817    /// credentials somewhere.
818    pub fn clone_creds(&self) -> Arc<Credentials> {
819        self.creds.to_arc()
820    }
821
822    /// Returns a read lock on the credentials. This is appropriate if you need to guarantee that
823    ///  the Task's credentials will not change during a security-sensitive operation.
824    pub fn lock_creds(&self) -> CredentialsReadGuard<'_> {
825        let lock = self.creds_lock.read();
826        CredentialsReadGuard { _lock: lock, creds: self.creds.read() }
827    }
828
829    /// Locks the credentials for writing, returning a guard that the `CurrentTask` can use to
830    /// update both the objective `Task` credentials, and its own subjective cached copy.
831    pub(in crate::task) fn write_current_task_creds(
832        self: &Arc<Self>,
833    ) -> CurrentTaskCredentialsWriteGuard {
834        let persistent_info = self.clone();
835        // SAFETY: `creds_lock` remains live via the `persistent_info` reference to `Self`.
836        let lock = unsafe {
837            let raw_lock = self.creds_lock.write();
838            std::mem::transmute::<RwLockWriteGuard<'_, ()>, RwLockWriteGuard<'static, ()>>(raw_lock)
839        };
840        CurrentTaskCredentialsWriteGuard { _lock: lock, persistent_info }
841    }
842}
843
844pub type TaskPersistentInfo = Arc<TaskPersistentInfoState>;
845
846pub struct CurrentTaskCredentialsWriteGuard {
847    // Drop order is critical: the lock must be dropped BEFORE the persistent_info Arc.
848    // Rust drops fields in declaration order (top-to-bottom).
849    // So _lock is dropped first, then persistent_info.
850    _lock: RwLockWriteGuard<'static, ()>,
851    pub persistent_info: TaskPersistentInfo,
852}
853
854impl CurrentTaskCredentialsWriteGuard {
855    pub fn update(self, current_task: &CurrentTask, creds: Arc<Credentials>) {
856        self.persistent_info.creds.update(creds.clone());
857        *current_task.current_creds.borrow_mut() = CurrentCreds::Cached(creds);
858
859        // The /proc/pid directory's ownership is updated when the task's euid
860        // or egid changes. See proc(5).
861        let maybe_node = current_task.running_state().proc_pid_directory_cache.cloned();
862        if let Some(node) = maybe_node {
863            let creds = current_task.real_creds().euid_as_fscred();
864            // SAFETY: The /proc/pid directory held by `proc_pid_directory_cache` represents the
865            // current task. It's owner and group are supposed to track the current task's euid and
866            // egid.
867            unsafe {
868                node.force_chown(creds);
869            }
870        }
871    }
872}
873
874/// A unit of execution.
875///
876/// A task is the primary unit of execution in the Starnix kernel. Most tasks are *user* tasks,
877/// which have an associated Zircon thread. The Zircon thread switches between restricted mode,
878/// in which the thread runs userspace code, and normal mode, in which the thread runs Starnix
879/// code.
880///
881/// Tasks track the resources used by userspace by referencing various objects, such as an
882/// `FdTable`, a `MemoryManager`, and an `FsContext`. Many tasks can share references to these
883/// objects. In principle, which objects are shared between which tasks can be largely arbitrary,
884/// but there are common patterns of sharing. For example, tasks created with `pthread_create`
885/// will share the `FdTable`, `MemoryManager`, and `FsContext` and are often called "threads" by
886/// userspace programmers. Tasks created by `posix_spawn` do not share these objects and are often
887/// called "processes" by userspace programmers. However, inside the kernel, there is no clear
888/// definition of a "thread" or a "process".
889///
890/// During boot, the kernel creates the first task, often called `init`. The vast majority of other
891/// tasks are created as transitive clones (e.g., using `clone(2)`) of that task. Sometimes, the
892/// kernel will create new tasks from whole cloth, either with a corresponding userspace component
893/// or to represent some background work inside the kernel.
894///
895/// See also `CurrentTask`, which represents the task corresponding to the thread that is currently
896/// executing.
897pub struct Task {
898    /// Weak reference to this `Task`. This allows us to retrieve an `Arc` from a raw `Task`.
899    pub weak_self: Weak<Self>,
900
901    /// A unique identifier for this task.
902    ///
903    /// This value can be read in userspace using `gettid(2)`. In general, this value
904    /// is different from the value return by `getpid(2)`, which returns the `id` of the leader
905    /// of the `thread_group`.
906    pub tid: tid_t,
907
908    /// The process key of this task.
909    pub thread_group_key: ThreadGroupKey,
910
911    /// The kernel to which this thread group belongs.
912    pub kernel: Arc<Kernel>,
913
914    /// The thread group to which this task belongs.
915    ///
916    /// The group of tasks in a thread group roughly corresponds to the userspace notion of a
917    /// process.
918    pub thread_group: Arc<ThreadGroup>,
919
920    /// The running state of the task.
921    ///
922    /// This is `None` for exited tasks.
923    pub running_state: RcuOptionBox<TaskRunningState>,
924
925    /// The stop state of the task, distinct from the stop state of the thread group.
926    ///
927    /// Must only be set when the `mutable_state` write lock is held.
928    stop_state: AtomicStopState,
929
930    /// The flags for the task.
931    ///
932    /// Must only be set the then `mutable_state` write lock is held.
933    flags: AtomicTaskFlags,
934
935    /// The mutable state of the Task.
936    mutable_state: RwLock<TaskMutableState>,
937
938    /// The information of the task that needs to be available to the `ThreadGroup` while computing
939    /// which process a wait can target.
940    /// Contains the command line, the task credentials and the exit signal.
941    /// See `TaskPersistentInfo` for more information.
942    pub persistent_info: TaskPersistentInfo,
943
944    /// For vfork and clone() with CLONE_VFORK, this is set when the task exits or calls execve().
945    /// It allows the calling task to block until the fork has been completed. Only populated
946    /// when created with the CLONE_VFORK flag.
947    vfork_event: Option<Arc<zx::Event>>,
948
949    /// Variable that can tell you whether there are currently seccomp
950    /// filters without holding a lock
951    pub seccomp_filter_state: SeccompState,
952
953    /// Tell you whether you are tracing syscall entry / exit without a lock.
954    pub trace_syscalls: AtomicBool,
955}
956
/// The decoded, cross-platform subset of a page fault exception report that we care about.
#[derive(Debug)]
pub struct PageFaultExceptionReport {
    /// The address whose access faulted.
    pub faulting_address: u64,
    /// Set when the page fault was due to a not-present page.
    pub not_present: bool,
    /// Set when the triggering memory operation was a write.
    pub is_write: bool,
    /// Set when the triggering memory operation was an execute.
    pub is_execute: bool,
}
965
966impl Task {
967    pub fn kernel(&self) -> &Arc<Kernel> {
968        &self.kernel
969    }
970
971    pub fn thread_group(&self) -> &Arc<ThreadGroup> {
972        &self.thread_group
973    }
974
975    pub fn has_same_address_space(&self, other: Option<&Arc<MemoryManager>>) -> bool {
976        match (self.mm(), other) {
977            (Ok(this), Some(other)) => Arc::ptr_eq(&this, other),
978            (Err(_), None) => true,
979            _ => false,
980        }
981    }
982
983    pub fn flags(&self) -> TaskFlags {
984        self.flags.load(Ordering::Relaxed)
985    }
986
987    pub fn is_spawned(&self) -> bool {
988        self.flags().contains(TaskFlags::SPAWNED)
989    }
990
991    /// When the task exits, if there is a notification that needs to propagate
992    /// to a ptracer, make sure it will propagate.
993    pub fn set_ptrace_zombie(&self, pids: &mut crate::task::PidTable) {
994        let pgid = self.thread_group().read().process_group.leader;
995        let exit_signal = self.thread_group().read().exit_signal.clone();
996        let mut state = self.write();
997        state.set_stopped(StopState::ForceAwake, None, None, None);
998        if let Some(ptrace) = &mut state.ptrace {
999            // Add a zombie that the ptracer will notice.
1000            ptrace.last_signal_waitable = true;
1001            let tracer_pid = ptrace.get_pid();
1002            let tracer_tg = pids.get_thread_group(tracer_pid);
1003            if let Some(tracer_tg) = tracer_tg {
1004                drop(state);
1005                let mut tracer_state = tracer_tg.write();
1006
1007                let exit_status = self.exit_status().unwrap_or_else(|| {
1008                    starnix_logging::log_error!("Exiting without an exit code.");
1009                    ExitStatus::Exit(u8::MAX)
1010                });
1011                let uid = self.real_creds().uid;
1012                let exit_info = ProcessExitInfo { status: exit_status, exit_signal };
1013                let zombie = ZombieProcess {
1014                    thread_group_key: self.thread_group_key.clone(),
1015                    pgid,
1016                    uid,
1017                    exit_info: exit_info,
1018                    // ptrace doesn't need this.
1019                    time_stats: TaskTimeStats::default(),
1020                    is_canonical: false,
1021                };
1022
1023                tracer_state.zombie_ptracees.add(pids, self.tid, zombie);
1024            };
1025        }
1026    }
1027
1028    /// Disconnects this task from the tracer.
1029    pub fn ptrace_disconnect(&self) {
1030        // Get a reference to the ptracer thread group through the weak reference in PtraceCoreState
1031        // to avoid acquiring a PidTable lock.
1032        let tracer_tg = self
1033            .read()
1034            .ptrace
1035            .as_ref()
1036            .map(|p| p.core_state.thread_group.clone())
1037            .and_then(|tg| tg.upgrade());
1038        if let Some(tg) = tracer_tg {
1039            tg.ptracees.lock().remove(&self.tid);
1040        }
1041    }
1042
1043    pub fn exit_status(&self) -> Option<ExitStatus> {
1044        self.is_exitted().then(|| self.read().exit_status.clone()).flatten()
1045    }
1046
1047    pub fn is_exitted(&self) -> bool {
1048        self.flags().contains(TaskFlags::EXITED)
1049    }
1050
1051    pub fn load_stopped(&self) -> StopState {
1052        self.stop_state.load(Ordering::Relaxed)
1053    }
1054
1055    /// Upgrade a [`Weak<Task>`], returning [`Err(ESRCH)`] if the reference cannot be borrowed.
1056    pub fn from_weak(weak: &Weak<Task>) -> Result<Arc<Task>, Errno> {
1057        weak.upgrade().ok_or_else(|| errno!(ESRCH))
1058    }
1059
1060    /// Internal function for creating a Task object. Useful when you need to specify the value of
1061    /// every field. create_process and create_thread are more likely to be what you want.
1062    ///
1063    /// Any fields that should be initialized fresh for every task, even if the task was created
1064    /// with fork, are initialized to their defaults inside this function. All other fields are
1065    /// passed as parameters.
1066    #[allow(clippy::let_and_return)]
1067    pub fn new(
1068        tid: tid_t,
1069        command: TaskCommand,
1070        thread_group: Arc<ThreadGroup>,
1071        files: FdTable,
1072        mm: Option<Arc<MemoryManager>>,
1073        // The only case where fs should be None if when building the initial task that is the
1074        // used to build the initial FsContext.
1075        fs: Arc<FsContext>,
1076        creds: Arc<Credentials>,
1077        abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,
1078        abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,
1079        signal_mask: SigSet,
1080        kernel_signals: VecDeque<KernelSignal>,
1081        vfork_event: Option<Arc<zx::Event>>,
1082        scheduler_state: SchedulerState,
1083        uts_ns: UtsNamespaceHandle,
1084        no_new_privs: bool,
1085        seccomp_filter_state: SeccompState,
1086        seccomp_filters: SeccompFilterContainer,
1087        robust_list_head: RobustListHeadPtr,
1088        timerslack_ns: u64,
1089    ) -> Arc<Self> {
1090        let thread_group_key = ThreadGroupKey::from(&thread_group);
1091        Arc::new_cyclic(|weak_self| {
1092            let task = Task {
1093                weak_self: weak_self.clone(),
1094                tid,
1095                thread_group_key: thread_group_key.clone(),
1096                kernel: Arc::clone(&thread_group.kernel),
1097                thread_group,
1098                running_state: RcuOptionBox::new(Some(TaskRunningState {
1099                    thread: Default::default(),
1100                    files,
1101                    mm: RcuOptionArc::new(mm),
1102                    fs: RcuArc::new(fs),
1103                    abstract_socket_namespace,
1104                    abstract_vsock_namespace,
1105                    proc_pid_directory_cache: Default::default(),
1106                })),
1107                vfork_event,
1108                stop_state: AtomicStopState::new(StopState::Awake),
1109                flags: AtomicTaskFlags::new(TaskFlags::empty()),
1110                mutable_state: RwLock::new(TaskMutableState {
1111                    clear_child_tid: UserRef::default(),
1112                    signals: SignalState::with_mask(signal_mask),
1113                    run_state: RunState::default(),
1114                    kernel_signals,
1115                    exit_status: None,
1116                    scheduler_state,
1117                    uts_ns,
1118                    no_new_privs,
1119                    oom_score_adj: Default::default(),
1120                    seccomp_filters,
1121                    robust_list_head,
1122                    timerslack_ns,
1123                    // The default timerslack is set to the current timerslack of the creating thread.
1124                    default_timerslack_ns: timerslack_ns,
1125                    ptrace: None,
1126                    captured_thread_state: None,
1127                }),
1128                persistent_info: TaskPersistentInfoState::new(
1129                    tid,
1130                    thread_group_key,
1131                    command,
1132                    creds,
1133                ),
1134                seccomp_filter_state,
1135                trace_syscalls: AtomicBool::new(false),
1136            };
1137
1138            #[cfg(any(test, debug_assertions))]
1139            {
1140                // Note that `Kernel::pids` is already locked by the caller of `Task::new()`.
1141                let _l1 = task.persistent_info.lock_creds();
1142                let _l2 = task.read();
1143                let _l3 = task.persistent_info.command_guard();
1144            }
1145            task
1146        })
1147    }
1148
1149    state_accessor!(Task, mutable_state);
1150
1151    /// Returns the real credentials of the task as a short-lived RCU-guarded reference. These
1152    /// credentials are used to check permissions for actions performed on the task. If the task
1153    /// itself is performing an action, use `CurrentTask::current_creds` instead. This does not
1154    /// lock the credentials.
1155    pub fn real_creds(&self) -> RcuReadGuard<Credentials> {
1156        self.persistent_info.real_creds()
1157    }
1158
1159    /// Returns a new long-lived reference to the real credentials of the task.  These credentials
1160    /// are used to check permissions for actions performed on the task. If the task itself is
1161    /// performing an action, use `CurrentTask::current_creds` instead. This does not lock the
1162    /// credentials.
1163    pub fn clone_creds(&self) -> Arc<Credentials> {
1164        self.persistent_info.clone_creds()
1165    }
1166
1167    pub fn ptracer_task(&self) -> Option<Arc<Task>> {
1168        self.read().ptrace.as_ref().and_then(|p| p.core_state.task.upgrade())
1169    }
1170
1171    /// Determine whether the task is running.
1172    ///
1173    /// # Thread Safety
1174    ///
1175    /// The task may exit immediately after `is_running()` returns `true`.
1176    pub fn is_running(&self) -> bool {
1177        self.running_state.read().is_some()
1178    }
1179
1180    /// Returns the running state of the task, if it exists.
1181    ///
1182    /// # Errors
1183    ///
1184    /// Returns [`Err(ESRCH)`] if the task has already transitioned to a zombie state and its running
1185    /// resources have been dropped.
1186    #[track_caller]
1187    pub fn running_state(&self) -> Result<RcuReadGuard<TaskRunningState>, Errno> {
1188        self.running_state.read().ok_or_else(|| errno!(ESRCH))
1189    }
1190
1191    /// Returns the memory manager of the task, if it exists.
1192    ///
1193    /// # Errors
1194    ///
1195    /// Returns [`Err(errno)`] where `errno` is:
1196    ///
1197    ///   - `ESRCH`: the task is dead and its live resources have been dropped.
1198    ///   - `EINVAL`: the task does not have a memory manager.
1199    #[track_caller]
1200    pub fn mm(&self) -> Result<Arc<MemoryManager>, Errno> {
1201        self.running_state()?.mm.to_option_arc().ok_or_else(|| errno!(EINVAL))
1202    }
1203
1204    /// Modify the given elements of the scheduler state with new values and update the
1205    /// task's thread's role.
1206    pub(crate) fn set_scheduler_policy_priority_and_reset_on_fork(
1207        &self,
1208        policy: SchedulingPolicy,
1209        priority: RealtimePriority,
1210        reset_on_fork: bool,
1211    ) -> Result<(), Errno> {
1212        self.update_scheduler_state_then_role(|scheduler_state| {
1213            scheduler_state.policy = policy;
1214            scheduler_state.realtime_priority = priority;
1215            scheduler_state.reset_on_fork = reset_on_fork;
1216        })
1217    }
1218
1219    /// Modify the scheduler state's priority and update the task's thread's role.
1220    pub(crate) fn set_scheduler_priority(&self, priority: RealtimePriority) -> Result<(), Errno> {
1221        self.update_scheduler_state_then_role(|scheduler_state| {
1222            scheduler_state.realtime_priority = priority
1223        })
1224    }
1225
1226    /// Modify the scheduler state's nice and update the task's thread's role.
1227    pub(crate) fn set_scheduler_nice(&self, nice: NormalPriority) -> Result<(), Errno> {
1228        self.update_scheduler_state_then_role(|scheduler_state| {
1229            scheduler_state.normal_priority = nice
1230        })
1231    }
1232
1233    /// Overwrite the existing scheduler state with a new one and update the task's thread's role.
1234    pub fn set_scheduler_state(&self, scheduler_state: SchedulerState) -> Result<(), Errno> {
1235        self.update_scheduler_state_then_role(|task_scheduler_state| {
1236            *task_scheduler_state = scheduler_state
1237        })
1238    }
1239
1240    /// Update the task's thread's role based on its current scheduler state without making any
1241    /// changes to the state.
1242    ///
1243    /// This should be called on tasks that have newly created threads, e.g. after cloning.
1244    pub fn sync_scheduler_state_to_role(&self) -> Result<(), Errno> {
1245        self.update_scheduler_state_then_role(|_| {})
1246    }
1247
1248    fn update_scheduler_state_then_role(
1249        &self,
1250        updater: impl FnOnce(&mut SchedulerState),
1251    ) -> Result<(), Errno> {
1252        let new_scheduler_state = {
1253            // Hold the task state lock as briefly as possible, it's not needed to update the role.
1254            let mut state = self.write();
1255            updater(&mut state.scheduler_state);
1256            state.scheduler_state
1257        };
1258        self.thread_group().kernel.scheduler.set_thread_role(self, new_scheduler_state)?;
1259        Ok(())
1260    }
1261
1262    /// Signals the vfork event, if any, to unblock waiters.
1263    pub fn signal_vfork(&self) {
1264        if let Some(event) = &self.vfork_event {
1265            if let Err(status) = event.signal(Signals::NONE, Signals::USER_0) {
1266                log_warn!("Failed to set vfork signal {status}");
1267            }
1268        };
1269    }
1270
1271    /// Blocks the caller until the task has exited or executed execve(). This is used to implement
1272    /// vfork() and clone(... CLONE_VFORK, ...). The task must have created with CLONE_EXECVE.
1273    pub fn wait_for_execve(&self, task_to_wait: Weak<Task>) -> Result<(), Errno> {
1274        let event = task_to_wait.upgrade().and_then(|t| t.vfork_event.clone());
1275        if let Some(event) = event {
1276            event
1277                .wait_one(zx::Signals::USER_0, zx::MonotonicInstant::INFINITE)
1278                .map_err(|status| from_status_like_fdio!(status))?;
1279        }
1280        Ok(())
1281    }
1282
1283    /// If needed, clear the child tid for this task.
1284    ///
1285    /// Userspace can ask us to clear the child tid and issue a futex wake at
1286    /// the child tid address when we tear down a task. For example, bionic
1287    /// uses this mechanism to implement pthread_join. The thread that calls
1288    /// pthread_join sleeps using FUTEX_WAIT on the child tid address. We wake
1289    /// them up here to let them know the thread is done.
1290    pub fn clear_child_tid_if_needed<L>(&self, locked: &mut Locked<L>) -> Result<(), Errno>
1291    where
1292        L: LockBefore<TaskCommandLevel> + LockBefore<FutexTableStateLock>,
1293    {
1294        let mut state = self.write();
1295        let user_tid = state.clear_child_tid;
1296        if !user_tid.is_null() {
1297            let zero: tid_t = 0;
1298            self.write_object(user_tid, &zero)?;
1299            self.kernel().shared_futexes.wake(
1300                locked,
1301                self,
1302                user_tid.addr(),
1303                usize::MAX,
1304                FUTEX_BITSET_MATCH_ANY,
1305            )?;
1306            state.clear_child_tid = UserRef::default();
1307        }
1308        Ok(())
1309    }
1310
1311    pub fn get_task(&self, tid: tid_t) -> Result<Arc<Task>, Errno> {
1312        self.kernel().pids.read().get_task(tid)
1313    }
1314
1315    pub fn get_pid(&self) -> pid_t {
1316        self.thread_group_key.pid()
1317    }
1318
1319    pub fn get_tid(&self) -> tid_t {
1320        self.tid
1321    }
1322
1323    pub fn is_leader(&self) -> bool {
1324        self.get_pid() == self.get_tid()
1325    }
1326
1327    pub fn read_argv(&self, max_len: usize) -> Result<Vec<FsString>, Errno> {
1328        // argv is empty for kthreads
1329        let Ok(mm) = self.mm() else {
1330            return Ok(vec![]);
1331        };
1332        let (argv_start, argv_end) = {
1333            let mm_state = mm.state.read();
1334            (mm_state.argv_start, mm_state.argv_end)
1335        };
1336
1337        let len_to_read = std::cmp::min(argv_end - argv_start, max_len);
1338        self.read_nul_delimited_c_string_list(argv_start, len_to_read)
1339    }
1340
1341    pub fn read_argv0(&self) -> Result<FsString, Errno> {
1342        // argv is empty for kthreads
1343        let Ok(mm) = self.mm() else {
1344            return Ok(FsString::default());
1345        };
1346        let argv_start = {
1347            let mm_state = mm.state.read();
1348            mm_state.argv_start
1349        };
1350        // Assuming a 64-bit arch width is fine for a type that's just u8's on all arches.
1351        let argv_start = UserCString::new(&ArchWidth::Arch64, argv_start);
1352        self.read_path(argv_start)
1353    }
1354
1355    pub fn read_env(&self, max_len: usize) -> Result<Vec<FsString>, Errno> {
1356        // environment is empty for kthreads
1357        let Ok(mm) = self.mm() else { return Ok(vec![]) };
1358        let (env_start, env_end) = {
1359            let mm_state = mm.state.read();
1360            (mm_state.environ_start, mm_state.environ_end)
1361        };
1362
1363        let len_to_read = std::cmp::min(env_end - env_start, max_len);
1364        self.read_nul_delimited_c_string_list(env_start, len_to_read)
1365    }
1366
1367    pub fn thread_runtime_info(&self) -> Result<zx::TaskRuntimeInfo, Errno> {
1368        self.running_state()?
1369            .thread
1370            .get()
1371            .ok_or_else(|| errno!(EINVAL))?
1372            .get_runtime_info()
1373            .map_err(|status| from_status_like_fdio!(status))
1374    }
1375
1376    pub fn real_fscred(&self) -> FsCred {
1377        self.real_creds().as_fscred()
1378    }
1379
1380    /// Interrupts the current task.
1381    ///
1382    /// This will interrupt any blocking syscalls if the task is blocked on one.
1383    /// The signal_state of the task must not be locked.
1384    pub fn interrupt(&self) {
1385        let Ok(running_state) = self.running_state() else {
1386            log_warn!("Cannot interrupt dead task {}", self.get_tid());
1387            return;
1388        };
1389
1390        self.read().run_state.wake();
1391        if let Some(thread) = running_state.thread.get() {
1392            #[allow(
1393                clippy::undocumented_unsafe_blocks,
1394                reason = "Force documented unsafe blocks in Starnix"
1395            )]
1396            let status = unsafe { zx::sys::zx_restricted_kick(thread.raw_handle(), 0) };
1397            if status != zx::sys::ZX_OK {
1398                // zx_restricted_kick() could return ZX_ERR_BAD_STATE if the target thread is already in the
1399                // DYING or DEAD states. That's fine since it means that the task is in the process of
1400                // tearing down, so allow it.
1401                assert_eq!(status, zx::sys::ZX_ERR_BAD_STATE);
1402            }
1403        }
1404    }
1405
1406    pub fn command(&self) -> TaskCommand {
1407        self.persistent_info.command.lock().clone()
1408    }
1409
1410    pub fn set_command_name(&self, mut new_name: TaskCommand) {
1411        let Ok(running_state) = self.running_state() else {
1412            log_warn!("Cannot set command name for dead task {}", self.get_tid());
1413            return;
1414        };
1415
1416        // If we're going to update the process name, see if we can get a longer one than normally
1417        // provided in the Linux uapi. Only choose the argv0-based name if it's a superset of the
1418        // uapi-provided name to avoid clobbering the name provided by the user.
1419        if let Ok(argv0) = self.read_argv0() {
1420            let argv0 = TaskCommand::from_path_bytes(&argv0);
1421            if let Some(embedded_name) = argv0.try_embed(&new_name) {
1422                new_name = embedded_name;
1423            }
1424        }
1425
1426        // Acquire this before modifying Zircon state to ensure consistency under concurrent access.
1427        // Ideally this would also guard the logic above to read argv[0] but we can't due to lock
1428        // cycles with SELinux checks.
1429        let mut command_guard = self.persistent_info.command_guard();
1430
1431        // Set the name on the Linux thread.
1432        if let Some(thread) = running_state.thread.get() {
1433            set_zx_name(thread.thread.as_ref(), new_name.as_bytes());
1434        }
1435
1436        // If this is the thread group leader, use this name for the process too.
1437        if self.is_leader() {
1438            set_zx_name(&*self.thread_group().process, new_name.as_bytes());
1439            let _ = zx::Thread::raise_user_exception(
1440                zx::RaiseExceptionOptions::TARGET_JOB_DEBUGGER,
1441                zx::sys::ZX_EXCP_USER_CODE_PROCESS_NAME_CHANGED,
1442                0,
1443            );
1444        }
1445
1446        // Avoid a lock cycle by dropping the guard before notifying memory attribution of the
1447        // change.
1448        *command_guard = new_name;
1449        drop(command_guard);
1450
1451        if self.is_leader() {
1452            if let Some(notifier) = &self.thread_group().read().notifier {
1453                let _ = notifier.send(MemoryAttributionLifecycleEvent::name_change(self.tid));
1454            }
1455        }
1456    }
1457
1458    pub fn set_seccomp_state(&self, state: SeccompStateValue) -> Result<(), Errno> {
1459        self.seccomp_filter_state.set(&state)
1460    }
1461
1462    pub fn state_code(&self) -> TaskStateCode {
1463        let status = self.read();
1464        if status.exit_status.is_some() {
1465            TaskStateCode::Zombie
1466        } else if status.run_state.is_blocked() {
1467            let stop_state = self.load_stopped();
1468            if stop_state.ptrace_only() && stop_state.is_stopped() {
1469                TaskStateCode::TracingStop
1470            } else {
1471                TaskStateCode::Sleeping
1472            }
1473        } else {
1474            TaskStateCode::Running
1475        }
1476    }
1477
1478    pub fn time_stats(&self) -> TaskTimeStats {
1479        use zx::Task;
1480        // TODO(https://fxbug.dev/297440106): Return time stats for zombie tasks.
1481        let running_state = match self.running_state() {
1482            Ok(running_state) => running_state,
1483            Err(_) => return TaskTimeStats::default(),
1484        };
1485        let info = match running_state.thread.get() {
1486            Some(thread) => thread.get_runtime_info().expect("Failed to get thread stats"),
1487            None => return TaskTimeStats::default(),
1488        };
1489
1490        TaskTimeStats {
1491            user_time: zx::MonotonicDuration::from_nanos(info.cpu_time),
1492            // TODO(https://fxbug.dev/42078242): How can we calculate system time?
1493            system_time: zx::MonotonicDuration::default(),
1494        }
1495    }
1496
1497    pub fn get_signal_action(&self, signal: Signal) -> sigaction_t {
1498        self.thread_group().signal_actions.get(signal)
1499    }
1500
1501    pub fn should_check_for_pending_signals(&self) -> bool {
1502        self.flags().intersects(
1503            TaskFlags::KERNEL_SIGNALS_AVAILABLE
1504                | TaskFlags::SIGNALS_AVAILABLE
1505                | TaskFlags::TEMPORARY_SIGNAL_MASK,
1506        ) || self.thread_group.has_pending_signals.load(Ordering::Relaxed)
1507    }
1508
1509    pub fn record_pid_koid_mapping(&self) {
1510        let Ok(running_state) = self.running_state() else {
1511            log_warn!("Cannot record pid/koid mapping for dead task {}", self.get_tid());
1512            return;
1513        };
1514
1515        let Some(ref mapping_table) = *self.kernel().pid_to_koid_mapping.read() else { return };
1516
1517        let pkoid = self.thread_group().get_process_koid().ok();
1518        let tkoid = running_state.thread.get().map(|t| t.koid);
1519        mapping_table.write().insert(self.tid, KoidPair { process: pkoid, thread: tkoid });
1520    }
1521}
1522
1523impl Drop for Task {
1524    fn drop(&mut self) {
1525        debug_assert!(self.running_state.read().is_none());
1526    }
1527}
1528
1529impl MemoryAccessor for Task {
1530    fn read_memory<'a>(
1531        &self,
1532        addr: UserAddress,
1533        bytes: &'a mut [MaybeUninit<u8>],
1534    ) -> Result<&'a mut [u8], Errno> {
1535        // Using a `Task` to read memory generally indicates that the memory
1536        // is being read from a task different than the `CurrentTask`. When
1537        // this `Task` is not current, its address space is not mapped
1538        // so we need to go through the VMO.
1539        self.mm()?.syscall_read_memory(addr, bytes)
1540    }
1541
1542    fn read_memory_partial_until_null_byte<'a>(
1543        &self,
1544        addr: UserAddress,
1545        bytes: &'a mut [MaybeUninit<u8>],
1546    ) -> Result<&'a mut [u8], Errno> {
1547        // Using a `Task` to read memory generally indicates that the memory
1548        // is being read from a task different than the `CurrentTask`. When
1549        // this `Task` is not current, its address space is not mapped
1550        // so we need to go through the VMO.
1551        self.mm()?.syscall_read_memory_partial_until_null_byte(addr, bytes)
1552    }
1553
1554    fn read_memory_partial<'a>(
1555        &self,
1556        addr: UserAddress,
1557        bytes: &'a mut [MaybeUninit<u8>],
1558    ) -> Result<&'a mut [u8], Errno> {
1559        // Using a `Task` to read memory generally indicates that the memory
1560        // is being read from a task different than the `CurrentTask`. When
1561        // this `Task` is not current, its address space is not mapped
1562        // so we need to go through the VMO.
1563        self.mm()?.syscall_read_memory_partial(addr, bytes)
1564    }
1565
1566    fn write_memory(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
1567        // Using a `Task` to write memory generally indicates that the memory
1568        // is being written to a task different than the `CurrentTask`. When
1569        // this `Task` is not current, its address space is not mapped
1570        // so we need to go through the VMO.
1571        self.mm()?.syscall_write_memory(addr, bytes)
1572    }
1573
1574    fn write_memory_partial(&self, addr: UserAddress, bytes: &[u8]) -> Result<usize, Errno> {
1575        // Using a `Task` to write memory generally indicates that the memory
1576        // is being written to a task different than the `CurrentTask`. When
1577        // this `Task` is not current, its address space is not mapped
1578        // so we need to go through the VMO.
1579        self.mm()?.syscall_write_memory_partial(addr, bytes)
1580    }
1581
1582    fn zero(&self, addr: UserAddress, length: usize) -> Result<usize, Errno> {
1583        // Using a `Task` to zero memory generally indicates that the memory
1584        // is being zeroed from a task different than the `CurrentTask`. When
1585        // this `Task` is not current, its address space is not mapped
1586        // so we need to go through the VMO.
1587        self.mm()?.syscall_zero(addr, length)
1588    }
1589}
1590
1591impl TaskMemoryAccessor for Task {
1592    fn maximum_valid_address(&self) -> Option<UserAddress> {
1593        self.mm().map(|mm| mm.maximum_valid_user_address).ok()
1594    }
1595}
1596
1597impl fmt::Debug for Task {
1598    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1599        write!(
1600            f,
1601            "{}:{}[{}]",
1602            self.thread_group().leader,
1603            self.tid,
1604            *self.persistent_info.command.lock()
1605        )
1606    }
1607}
1608
1609impl cmp::PartialEq for Task {
1610    fn eq(&self, other: &Self) -> bool {
1611        let ptr: *const Task = self;
1612        let other_ptr: *const Task = other;
1613        ptr == other_ptr
1614    }
1615}
1616
1617impl cmp::Eq for Task {}
1618
1619#[cfg(test)]
1620mod test {
1621    use super::*;
1622    use crate::security;
1623    use crate::testing::*;
1624    use starnix_uapi::auth::{CAP_SYS_ADMIN, Capabilities};
1625    use starnix_uapi::resource_limits::Resource;
1626    use starnix_uapi::signals::SIGCHLD;
1627    use starnix_uapi::{CLONE_SIGHAND, CLONE_THREAD, CLONE_VM, rlimit};
1628
1629    #[::fuchsia::test]
1630    async fn test_tid_allocation() {
1631        spawn_kernel_and_run(async |locked, current_task| {
1632            let kernel = current_task.kernel();
1633            assert_eq!(current_task.get_tid(), 1);
1634            let another_current = create_task(locked, &kernel, "another-task");
1635            let another_tid = another_current.get_tid();
1636            assert!(another_tid >= 2);
1637
1638            let pids = kernel.pids.read();
1639            assert_eq!(pids.get_task(1).unwrap().get_tid(), 1);
1640            assert_eq!(pids.get_task(another_tid).unwrap().get_tid(), another_tid);
1641        })
1642        .await;
1643    }
1644
1645    #[::fuchsia::test]
1646    async fn test_clone_pid_and_parent_pid() {
1647        spawn_kernel_and_run(async |locked, current_task| {
1648            let thread = current_task.clone_task_for_test(
1649                locked,
1650                (CLONE_THREAD | CLONE_VM | CLONE_SIGHAND) as u64,
1651                Some(SIGCHLD),
1652            );
1653            assert_eq!(current_task.get_pid(), thread.get_pid());
1654            assert_ne!(current_task.get_tid(), thread.get_tid());
1655            assert_eq!(current_task.thread_group().leader, thread.thread_group().leader);
1656
1657            let child_task = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
1658            assert_ne!(current_task.get_pid(), child_task.get_pid());
1659            assert_ne!(current_task.get_tid(), child_task.get_tid());
1660            assert_eq!(current_task.get_pid(), child_task.thread_group().read().get_ppid());
1661        })
1662        .await;
1663    }
1664
    /// Checks the capabilities of the initial test task and how they change with its
    /// credentials: the task starts out capable of `CAP_SYS_ADMIN` with an empty inheritable
    /// set, and is no longer capable of `CAP_SYS_ADMIN` once its credentials are replaced.
    #[::fuchsia::test]
    async fn test_root_capabilities() {
        spawn_kernel_and_run(async |_, current_task| {
            // Initial state (presumably root credentials, per the test name).
            assert!(security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN));
            assert_eq!(current_task.real_creds().cap_inheritable, Capabilities::empty());

            // Switch to credentials built from ids (1, 1) — presumably a non-root uid/gid —
            // and confirm the capability is gone.
            current_task.set_creds(Credentials::with_ids(1, 1));
            assert!(!security::is_task_capable_noaudit(current_task, CAP_SYS_ADMIN));
        })
        .await;
    }
1676
    /// Checks `is_spawned()` for three kinds of task: the running initial task, a task
    /// freshly produced by `clone_task`, and a task produced by `clone_task_for_test`.
    #[::fuchsia::test]
    async fn test_is_spawned() {
        spawn_kernel_and_run(async |locked, current_task| {
            // The init task should be marked as spawned, because it is executing.
            assert!(current_task.is_spawned());

            // A cloned task should not be marked as spawned, because it has not yet been executed.
            let child = current_task
                .clone_task(
                    locked,
                    0,
                    Some(SIGCHLD),
                    UserRef::default(),
                    UserRef::default(),
                    UserRef::default(),
                )
                .expect("failed to create task in test");
            assert!(!child.is_spawned());
            // This task is never run, so the test releases it explicitly.
            child.release(locked);

            // A cloned task for a test should be marked as spawned, because we intentionally avoid
            // spawning threads for test tasks but want them to behave as normal tasks.
            let test_child = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
            assert!(test_child.is_spawned());
        })
        .await;
    }
1704
1705    #[::fuchsia::test]
1706    async fn test_clone_rlimit() {
1707        spawn_kernel_and_run(async |locked, current_task| {
1708            let prev_fsize = current_task.thread_group().get_rlimit(locked, Resource::FSIZE);
1709            assert_ne!(prev_fsize, 10);
1710            current_task
1711                .thread_group()
1712                .limits
1713                .lock(locked)
1714                .set(Resource::FSIZE, rlimit { rlim_cur: 10, rlim_max: 100 });
1715            let current_fsize = current_task.thread_group().get_rlimit(locked, Resource::FSIZE);
1716            assert_eq!(current_fsize, 10);
1717
1718            let child_task = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
1719            let child_fsize = child_task.thread_group().get_rlimit(locked, Resource::FSIZE);
1720            assert_eq!(child_fsize, 10)
1721        })
1722        .await;
1723    }
1724}