Skip to main content

starnix_core/execution/
task_creation.rs

1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mm::MemoryManager;
6use crate::security;
7use crate::signals::SignalActions;
8use crate::task::{
9    CurrentTask, Kernel, PidTable, ProcessGroup, RobustListHeadPtr, SeccompFilterContainer,
10    SeccompState, Task, TaskBuilder, ThreadGroup, ThreadGroupParent, ThreadGroupWriteGuard,
11};
12use crate::vfs::{FdTable, FsContext};
13use starnix_sync::{
14    LockBefore, Locked, ProcessGroupState, RwLockWriteGuard, TaskRelease, Unlocked,
15};
16use starnix_task_command::TaskCommand;
17use starnix_types::arch::ArchWidth;
18use starnix_types::ownership::TempRef;
19use starnix_types::release_on_error;
20use starnix_uapi::auth::Credentials;
21use starnix_uapi::errors::Errno;
22use starnix_uapi::resource_limits::Resource;
23use starnix_uapi::signals::{SIGCHLD, Signal};
24use starnix_uapi::{errno, error, from_status_like_fdio, pid_t, rlimit};
25use std::ffi::CString;
26use std::sync::Arc;
27
/// Result returned when creating new Zircon threads and processes for tasks.
///
/// A task-info factory hands one of these back to the task-creation code, which then moves the
/// individual fields into the `Task` it constructs.
pub struct TaskInfo {
    /// The thread that was created for the task.
    ///
    /// The factories visible in this file all leave this as `None`.
    pub thread: Option<zx::Thread>,

    /// The thread group that the task should be added to.
    pub thread_group: Arc<ThreadGroup>,

    /// The memory manager to use for the task.
    ///
    /// `create_system_task` passes `None` here because a system task has no underlying Zircon
    /// process; `create_zircon_process` always supplies one.
    pub memory_manager: Option<Arc<MemoryManager>>,
}
39
40pub fn create_zircon_process<L>(
41    locked: &mut Locked<L>,
42    kernel: &Arc<Kernel>,
43    parent: Option<ThreadGroupWriteGuard<'_>>,
44    pid: pid_t,
45    exit_signal: Option<Signal>,
46    process_group: Arc<ProcessGroup>,
47    signal_actions: Arc<SignalActions>,
48    name: TaskCommand,
49    arch_width: ArchWidth,
50) -> Result<TaskInfo, Errno>
51where
52    L: LockBefore<ProcessGroupState>,
53{
54    // Don't allow new processes to be created once the kernel has started shutting down.
55    if kernel.is_shutting_down() {
56        return error!(EBUSY);
57    }
58    let (process, root_vmar) =
59        create_shared(&kernel.kthreads.starnix_process, zx::ProcessOptions::empty(), name)
60            .map_err(|status| from_status_like_fdio!(status))?;
61
62    // Make sure that if this process panics in normal mode that the whole kernel's job is killed.
63    fuchsia_runtime::job_default()
64        .set_critical(zx::JobCriticalOptions::RETCODE_NONZERO, &process)
65        .map_err(|status| from_status_like_fdio!(status))?;
66
67    let memory_manager = Arc::new(
68        MemoryManager::new(root_vmar, arch_width)
69            .map_err(|status| from_status_like_fdio!(status))?,
70    );
71
72    let thread_group = ThreadGroup::new(
73        locked,
74        kernel.clone(),
75        process,
76        parent,
77        pid,
78        exit_signal,
79        process_group,
80        signal_actions,
81    );
82
83    Ok(TaskInfo { thread: None, thread_group, memory_manager: Some(memory_manager) })
84}
85
86/// Creates a process that shares half its address space with this process.
87///
88/// The created process will also share its handle table and futex context with `self`.
89///
90/// Returns the created process and a handle to the created process' restricted address space.
91///
92/// Wraps the
93/// [zx_process_create_shared](https://fuchsia.dev/fuchsia-src/reference/syscalls/process_create_shared.md)
94/// syscall.
95fn create_shared(
96    process: &zx::Process,
97    options: zx::ProcessOptions,
98    name: TaskCommand,
99) -> Result<(zx::Process, zx::Vmar), zx::Status> {
100    let self_raw = process.raw_handle();
101    let name_bytes = name.as_bytes();
102    let mut process_out = 0;
103    let mut restricted_vmar_out = 0;
104    #[allow(
105        clippy::undocumented_unsafe_blocks,
106        reason = "Force documented unsafe blocks in Starnix"
107    )]
108    let status = unsafe {
109        zx::sys::zx_process_create_shared(
110            self_raw,
111            options.bits(),
112            name_bytes.as_ptr(),
113            name_bytes.len(),
114            &mut process_out,
115            &mut restricted_vmar_out,
116        )
117    };
118    zx::ok(status)?;
119    #[allow(
120        clippy::undocumented_unsafe_blocks,
121        reason = "Force documented unsafe blocks in Starnix"
122    )]
123    unsafe {
124        Ok((
125            zx::Process::from(zx::NullableHandle::from_raw(process_out)),
126            zx::Vmar::from(zx::NullableHandle::from_raw(restricted_vmar_out)),
127        ))
128    }
129}
130
131/// Create a process that is a child of the `init` process.
132///
133/// The created process will be a task that is the leader of a new thread group.
134///
135/// Most processes are created by userspace and are descendants of the `init` process. In
136/// some situations, the kernel needs to create a process itself. This function is the
137/// preferred way of creating an actual userspace process because making the process a child of
138/// `init` means that `init` is responsible for waiting on the process when it dies and thereby
139/// cleaning up its zombie.
140///
141/// If you just need a kernel task, and not an entire userspace process, consider using
142/// `create_system_task` instead. Even better, consider using the `kthreads` threadpool.
143///
144/// If `seclabel` is set, or the container specified a `default_seclabel`, then it will be
145/// resolved against the `kernel`'s active security policy, and applied to the new task.
146/// Otherwise the task will inherit its LSM state from the "init" task.
147///
148/// This function creates an underlying Zircon process to host the new task.
149pub fn create_init_child_process<L>(
150    locked: &mut Locked<L>,
151    kernel: &Arc<Kernel>,
152    initial_name: TaskCommand,
153    mut creds: Credentials,
154    seclabel: Option<&CString>,
155) -> Result<TaskBuilder, Errno>
156where
157    L: LockBefore<TaskRelease>,
158{
159    let weak_init = kernel.pids.read().get_task(1);
160    let init_task = weak_init.upgrade().ok_or_else(|| errno!(EINVAL))?;
161    let init_live = init_task.live()?;
162
163    let security_state = if let Some(seclabel) = seclabel {
164        security::task_for_context(&init_task, seclabel.as_bytes().into())?
165    } else if let Some(default_seclabel) = kernel.features.default_seclabel.as_ref() {
166        security::task_for_context(&init_task, default_seclabel.as_bytes().into())?
167    } else {
168        // If SELinux is enabled then this call will fail with `EINVAL`.
169        security::task_for_context(&init_task, b"".into()).map_err(|_| {
170            errno!(EINVAL, "Container has SELinux enabled but no Security Context specified")
171        })?
172    };
173    creds.security_state = security_state;
174
175    let task = create_task(
176        locked,
177        kernel,
178        initial_name.clone(),
179        init_live.fs().fork(),
180        |locked, pid, process_group| {
181            create_zircon_process(
182                locked.cast_locked::<TaskRelease>(),
183                kernel,
184                None,
185                pid,
186                Some(SIGCHLD),
187                process_group,
188                SignalActions::default(),
189                initial_name.clone(),
190                ArchWidth::Arch64,
191            )
192        },
193        creds.into(),
194    )?;
195    {
196        let mut init_writer = init_task.thread_group().write();
197        let mut new_process_writer = task.thread_group().write();
198        new_process_writer.parent =
199            Some(ThreadGroupParent::new(Arc::downgrade(&init_task.thread_group())));
200        init_writer.children.insert(task.tid, Arc::downgrade(task.thread_group()));
201    }
202    // A child process created via fork(2) inherits its parent's
203    // resource limits.  Resource limits are preserved across execve(2).
204    let limits = init_task.thread_group().limits.lock(locked.cast_locked::<TaskRelease>()).clone();
205    *task.thread_group().limits.lock(locked.cast_locked::<TaskRelease>()) = limits;
206    Ok(task)
207}
208
209/// Creates the initial process for a kernel.
210///
211/// The created process will be a task that is the leader of a new thread group.
212///
213/// The init process is special because it's the root of the parent/child relationship between
214/// tasks. If a task dies, the init process is ultimately responsible for waiting on that task
215/// and removing it from the zombie list.
216///
217/// It's possible for the kernel to create tasks whose ultimate parent isn't init, but such
218/// tasks cannot be created by userspace directly.
219///
220/// This function should only be called as part of booting a kernel instance. To create a
221/// process after the kernel has already booted, consider `create_init_child_process`
222/// or `create_system_task`.
223///
224/// The process created by this function should always have pid 1. We require the caller to
225/// pass the `pid` as an argument to clarify that it's the callers responsibility to determine
226/// the pid for the process.
227pub fn create_init_process(
228    locked: &mut Locked<Unlocked>,
229    kernel: &Arc<Kernel>,
230    pid: pid_t,
231    initial_name: TaskCommand,
232    fs: Arc<FsContext>,
233    rlimits: &[(Resource, u64)],
234) -> Result<TaskBuilder, Errno> {
235    let pids = kernel.pids.write();
236    create_task_with_pid(
237        locked,
238        kernel,
239        pids,
240        pid,
241        initial_name.clone(),
242        fs,
243        |locked, pid, process_group| {
244            create_zircon_process(
245                locked,
246                kernel,
247                None,
248                pid,
249                Some(SIGCHLD),
250                process_group,
251                SignalActions::default(),
252                initial_name.clone(),
253                ArchWidth::Arch64,
254            )
255        },
256        Credentials::root(),
257        rlimits,
258    )
259}
260
261/// Create a task that runs inside the kernel.
262///
263/// There is no underlying Zircon process to host the task. Instead, the work done by this task
264/// is performed by a thread in the original Starnix process, possible as part of a thread
265/// pool.
266///
267/// This function is the preferred way to create a context for doing background work inside the
268/// kernel.
269///
270/// Rather than calling this function directly, consider using `kthreads`, which provides both
271/// a system task and a threadpool on which the task can do work.
272pub fn create_system_task<L>(
273    locked: &mut Locked<L>,
274    kernel: &Arc<Kernel>,
275    fs: Arc<FsContext>,
276) -> Result<CurrentTask, Errno>
277where
278    L: LockBefore<TaskRelease>,
279{
280    let builder = create_task(
281        locked,
282        kernel,
283        TaskCommand::new(b"kthreadd"),
284        fs,
285        |locked, pid, process_group| {
286            let process = zx::Process::from(zx::NullableHandle::invalid());
287            let thread_group = ThreadGroup::new(
288                locked.cast_locked::<TaskRelease>(),
289                kernel.clone(),
290                process,
291                None,
292                pid,
293                Some(SIGCHLD),
294                process_group,
295                SignalActions::default(),
296            );
297            Ok(TaskInfo { thread: None, thread_group, memory_manager: None }.into())
298        },
299        Credentials::root(),
300    )?;
301    Ok(builder.into())
302}
303
304pub fn create_task<F, L>(
305    locked: &mut Locked<L>,
306    kernel: &Kernel,
307    initial_name: TaskCommand,
308    root_fs: Arc<FsContext>,
309    task_info_factory: F,
310    creds: Arc<Credentials>,
311) -> Result<TaskBuilder, Errno>
312where
313    F: FnOnce(&mut Locked<L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
314    L: LockBefore<TaskRelease>,
315{
316    let mut pids = kernel.pids.write();
317    let pid = pids.allocate_pid();
318    create_task_with_pid(
319        locked,
320        kernel,
321        pids,
322        pid,
323        initial_name,
324        root_fs,
325        task_info_factory,
326        creds,
327        &[],
328    )
329}
330
331fn create_task_with_pid<F, L>(
332    locked: &mut Locked<L>,
333    kernel: &Kernel,
334    mut pids: RwLockWriteGuard<'_, PidTable>,
335    pid: pid_t,
336    initial_name: TaskCommand,
337    root_fs: Arc<FsContext>,
338    task_info_factory: F,
339    creds: Arc<Credentials>,
340    rlimits: &[(Resource, u64)],
341) -> Result<TaskBuilder, Errno>
342where
343    F: FnOnce(&mut Locked<L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
344    L: LockBefore<TaskRelease>,
345{
346    debug_assert!(pids.get_task(pid).upgrade().is_none());
347
348    let process_group = ProcessGroup::new(pid, None);
349    pids.add_process_group(process_group.clone());
350
351    let TaskInfo { thread, thread_group, memory_manager } =
352        task_info_factory(locked, pid, process_group.clone())?;
353
354    process_group.insert(locked.cast_locked::<TaskRelease>(), &thread_group);
355
356    // > The timer slack values of init (PID 1), the ancestor of all processes, are 50,000
357    // > nanoseconds (50 microseconds).  The timer slack value is inherited by a child created
358    // > via fork(2), and is preserved across execve(2).
359    // https://man7.org/linux/man-pages/man2/prctl.2.html
360    let default_timerslack = 50_000;
361    let builder = TaskBuilder {
362        task: Task::new(
363            pid,
364            initial_name,
365            thread_group,
366            thread,
367            FdTable::default(),
368            memory_manager,
369            root_fs,
370            creds,
371            Arc::clone(&kernel.default_abstract_socket_namespace),
372            Arc::clone(&kernel.default_abstract_vsock_namespace),
373            Default::default(),
374            Default::default(),
375            None,
376            Default::default(),
377            kernel.root_uts_ns.clone(),
378            false,
379            SeccompState::default(),
380            SeccompFilterContainer::default(),
381            RobustListHeadPtr::null(&ArchWidth::Arch64),
382            default_timerslack,
383        ),
384        thread_state: Default::default(),
385    };
386    release_on_error!(builder, locked, {
387        let temp_task = TempRef::from(&builder.task);
388        builder.thread_group().add(&temp_task)?;
389        for (resource, limit) in rlimits {
390            builder
391                .thread_group()
392                .limits
393                .lock(locked.cast_locked::<TaskRelease>())
394                .set(*resource, rlimit { rlim_cur: *limit, rlim_max: *limit });
395        }
396
397        pids.add_task(&temp_task);
398        Ok(())
399    });
400    Ok(builder)
401}
402
403/// Create a kernel task in the same ThreadGroup as the given `system_task`.
404///
405/// There is no underlying Zircon thread to host the task.
406pub fn create_kernel_thread<L>(
407    locked: &mut Locked<L>,
408    system_task: &Task,
409    initial_name: TaskCommand,
410) -> Result<CurrentTask, Errno>
411where
412    L: LockBefore<TaskRelease>,
413{
414    let mut pids = system_task.kernel().pids.write();
415    let pid = pids.allocate_pid();
416
417    let scheduler_state;
418    let uts_ns;
419    let default_timerslack_ns;
420    {
421        let state = system_task.read();
422        scheduler_state = state.scheduler_state;
423        uts_ns = state.uts_ns.clone();
424        default_timerslack_ns = state.default_timerslack_ns;
425    }
426
427    let live_system_task = system_task.live().unwrap();
428    let current_task: CurrentTask = TaskBuilder::new(Task::new(
429        pid,
430        initial_name,
431        system_task.thread_group().clone(),
432        None,
433        FdTable::default(),
434        live_system_task.mm.to_option_arc(),
435        live_system_task.fs.to_arc(),
436        system_task.clone_creds(),
437        Arc::clone(&live_system_task.abstract_socket_namespace),
438        Arc::clone(&live_system_task.abstract_vsock_namespace),
439        Default::default(),
440        Default::default(),
441        None,
442        scheduler_state,
443        uts_ns,
444        false,
445        SeccompState::default(),
446        SeccompFilterContainer::default(),
447        RobustListHeadPtr::null(&ArchWidth::Arch64),
448        default_timerslack_ns,
449    ))
450    .into();
451    release_on_error!(current_task, locked, {
452        let temp_task = current_task.temp_task();
453        current_task.thread_group().add(&temp_task)?;
454        pids.add_task(&temp_task);
455        Ok(())
456    });
457    Ok(current_task)
458}