starnix_core/execution/
task_creation.rs

1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mm::MemoryManager;
6use crate::security;
7use crate::signals::SignalActions;
8use crate::task::{
9    CurrentTask, Kernel, PidTable, ProcessGroup, RobustListHeadPtr, SeccompFilterContainer,
10    SeccompState, Task, TaskBuilder, ThreadGroup, ThreadGroupParent, ThreadGroupWriteGuard,
11};
12use crate::vfs::{FdTable, FsContext};
13use starnix_sync::{
14    LockBefore, Locked, ProcessGroupState, RwLockWriteGuard, TaskRelease, Unlocked,
15};
16use starnix_task_command::TaskCommand;
17use starnix_types::arch::ArchWidth;
18use starnix_types::ownership::TempRef;
19use starnix_types::release_on_error;
20use starnix_uapi::auth::Credentials;
21use starnix_uapi::errors::Errno;
22use starnix_uapi::resource_limits::Resource;
23use starnix_uapi::signals::{SIGCHLD, Signal};
24use starnix_uapi::{errno, error, from_status_like_fdio, pid_t, rlimit};
25use std::ffi::CString;
26use std::sync::Arc;
27use zx::AsHandleRef;
28
29/// Result returned when creating new Zircon threads and processes for tasks.
30pub struct TaskInfo {
31    /// The thread that was created for the task.
32    pub thread: Option<zx::Thread>,
33
34    /// The thread group that the task should be added to.
35    pub thread_group: Arc<ThreadGroup>,
36
37    /// The memory manager to use for the task.
38    pub memory_manager: Option<Arc<MemoryManager>>,
39}
40
41pub fn create_zircon_process<L>(
42    locked: &mut Locked<L>,
43    kernel: &Arc<Kernel>,
44    parent: Option<ThreadGroupWriteGuard<'_>>,
45    pid: pid_t,
46    exit_signal: Option<Signal>,
47    process_group: Arc<ProcessGroup>,
48    signal_actions: Arc<SignalActions>,
49    name: TaskCommand,
50) -> Result<TaskInfo, Errno>
51where
52    L: LockBefore<ProcessGroupState>,
53{
54    // Don't allow new processes to be created once the kernel has started shutting down.
55    if kernel.is_shutting_down() {
56        return error!(EBUSY);
57    }
58    let (process, root_vmar) =
59        create_shared(&kernel.kthreads.starnix_process, zx::ProcessOptions::empty(), name)
60            .map_err(|status| from_status_like_fdio!(status))?;
61
62    // Make sure that if this process panics in normal mode that the whole kernel's job is killed.
63    fuchsia_runtime::job_default()
64        .set_critical(zx::JobCriticalOptions::RETCODE_NONZERO, &process)
65        .map_err(|status| from_status_like_fdio!(status))?;
66
67    let memory_manager =
68        Arc::new(MemoryManager::new(root_vmar).map_err(|status| from_status_like_fdio!(status))?);
69
70    let thread_group = ThreadGroup::new(
71        locked,
72        kernel.clone(),
73        process,
74        parent,
75        pid,
76        exit_signal,
77        process_group,
78        signal_actions,
79    );
80
81    Ok(TaskInfo { thread: None, thread_group, memory_manager: Some(memory_manager) })
82}
83
84/// Creates a process that shares half its address space with this process.
85///
86/// The created process will also share its handle table and futex context with `self`.
87///
88/// Returns the created process and a handle to the created process' restricted address space.
89///
90/// Wraps the
91/// [zx_process_create_shared](https://fuchsia.dev/fuchsia-src/reference/syscalls/process_create_shared.md)
92/// syscall.
93fn create_shared(
94    process: &zx::Process,
95    options: zx::ProcessOptions,
96    name: TaskCommand,
97) -> Result<(zx::Process, zx::Vmar), zx::Status> {
98    let self_raw = process.raw_handle();
99    let name_bytes = name.as_bytes();
100    let mut process_out = 0;
101    let mut restricted_vmar_out = 0;
102    #[allow(
103        clippy::undocumented_unsafe_blocks,
104        reason = "Force documented unsafe blocks in Starnix"
105    )]
106    let status = unsafe {
107        zx::sys::zx_process_create_shared(
108            self_raw,
109            options.bits(),
110            name_bytes.as_ptr(),
111            name_bytes.len(),
112            &mut process_out,
113            &mut restricted_vmar_out,
114        )
115    };
116    zx::ok(status)?;
117    #[allow(
118        clippy::undocumented_unsafe_blocks,
119        reason = "Force documented unsafe blocks in Starnix"
120    )]
121    unsafe {
122        Ok((
123            zx::Process::from(zx::NullableHandle::from_raw(process_out)),
124            zx::Vmar::from(zx::NullableHandle::from_raw(restricted_vmar_out)),
125        ))
126    }
127}
128
129/// Create a process that is a child of the `init` process.
130///
131/// The created process will be a task that is the leader of a new thread group.
132///
133/// Most processes are created by userspace and are descendants of the `init` process. In
134/// some situations, the kernel needs to create a process itself. This function is the
135/// preferred way of creating an actual userspace process because making the process a child of
136/// `init` means that `init` is responsible for waiting on the process when it dies and thereby
137/// cleaning up its zombie.
138///
139/// If you just need a kernel task, and not an entire userspace process, consider using
140/// `create_system_task` instead. Even better, consider using the `kthreads` threadpool.
141///
142/// If `seclabel` is set, or the container specified a `default_seclabel`, then it will be
143/// resolved against the `kernel`'s active security policy, and applied to the new task.
144/// Otherwise the task will inherit its LSM state from the "init" task.
145///
146/// This function creates an underlying Zircon process to host the new task.
147pub fn create_init_child_process<L>(
148    locked: &mut Locked<L>,
149    kernel: &Arc<Kernel>,
150    initial_name: TaskCommand,
151    seclabel: Option<&CString>,
152) -> Result<TaskBuilder, Errno>
153where
154    L: LockBefore<TaskRelease>,
155{
156    let weak_init = kernel.pids.read().get_task(1);
157    let init_task = weak_init.upgrade().ok_or_else(|| errno!(EINVAL))?;
158
159    let security_context = if let Some(seclabel) = seclabel {
160        security::task_for_context(&init_task, seclabel.as_bytes().into())?
161    } else if let Some(default_seclabel) = kernel.features.default_seclabel.as_ref() {
162        security::task_for_context(&init_task, default_seclabel.as_bytes().into())?
163    } else {
164        // If SELinux is enabled then this call will fail with `EINVAL`.
165        security::task_for_context(&init_task, b"".into()).map_err(|_| {
166            errno!(EINVAL, "Container has SELinux enabled but no Security Context specified")
167        })?
168    };
169
170    let task = create_task(
171        locked,
172        kernel,
173        initial_name.clone(),
174        init_task.fs().fork(),
175        |locked, pid, process_group| {
176            create_zircon_process(
177                locked.cast_locked::<TaskRelease>(),
178                kernel,
179                None,
180                pid,
181                Some(SIGCHLD),
182                process_group,
183                SignalActions::default(),
184                initial_name.clone(),
185            )
186        },
187        security_context,
188    )?;
189    {
190        let mut init_writer = init_task.thread_group().write();
191        let mut new_process_writer = task.thread_group().write();
192        new_process_writer.parent =
193            Some(ThreadGroupParent::new(Arc::downgrade(&init_task.thread_group())));
194        init_writer.children.insert(task.tid, Arc::downgrade(task.thread_group()));
195    }
196    // A child process created via fork(2) inherits its parent's
197    // resource limits.  Resource limits are preserved across execve(2).
198    let limits = init_task.thread_group().limits.lock(locked.cast_locked::<TaskRelease>()).clone();
199    *task.thread_group().limits.lock(locked.cast_locked::<TaskRelease>()) = limits;
200    Ok(task)
201}
202
203/// Creates the initial process for a kernel.
204///
205/// The created process will be a task that is the leader of a new thread group.
206///
207/// The init process is special because it's the root of the parent/child relationship between
208/// tasks. If a task dies, the init process is ultimately responsible for waiting on that task
209/// and removing it from the zombie list.
210///
211/// It's possible for the kernel to create tasks whose ultimate parent isn't init, but such
212/// tasks cannot be created by userspace directly.
213///
214/// This function should only be called as part of booting a kernel instance. To create a
215/// process after the kernel has already booted, consider `create_init_child_process`
216/// or `create_system_task`.
217///
218/// The process created by this function should always have pid 1. We require the caller to
219/// pass the `pid` as an argument to clarify that it's the callers responsibility to determine
220/// the pid for the process.
221pub fn create_init_process(
222    locked: &mut Locked<Unlocked>,
223    kernel: &Arc<Kernel>,
224    pid: pid_t,
225    initial_name: TaskCommand,
226    fs: Arc<FsContext>,
227    rlimits: &[(Resource, u64)],
228) -> Result<TaskBuilder, Errno> {
229    let pids = kernel.pids.write();
230    create_task_with_pid(
231        locked,
232        kernel,
233        pids,
234        pid,
235        initial_name.clone(),
236        fs,
237        |locked, pid, process_group| {
238            create_zircon_process(
239                locked,
240                kernel,
241                None,
242                pid,
243                Some(SIGCHLD),
244                process_group,
245                SignalActions::default(),
246                initial_name.clone(),
247            )
248        },
249        Credentials::root(),
250        rlimits,
251        // If SELinux is enabled then `exec()` of the "init" executable will normally be
252        // configured by policy to transition to the desired init task Security Context.
253        security::task_alloc_for_kernel(),
254    )
255}
256
257/// Create a task that runs inside the kernel.
258///
259/// There is no underlying Zircon process to host the task. Instead, the work done by this task
260/// is performed by a thread in the original Starnix process, possible as part of a thread
261/// pool.
262///
263/// This function is the preferred way to create a context for doing background work inside the
264/// kernel.
265///
266/// Rather than calling this function directly, consider using `kthreads`, which provides both
267/// a system task and a threadpool on which the task can do work.
268pub fn create_system_task<L>(
269    locked: &mut Locked<L>,
270    kernel: &Arc<Kernel>,
271    fs: Arc<FsContext>,
272) -> Result<CurrentTask, Errno>
273where
274    L: LockBefore<TaskRelease>,
275{
276    let builder = create_task(
277        locked,
278        kernel,
279        TaskCommand::new(b"kthreadd"),
280        fs,
281        |locked, pid, process_group| {
282            let process = zx::Process::from(zx::NullableHandle::invalid());
283            let thread_group = ThreadGroup::new(
284                locked.cast_locked::<TaskRelease>(),
285                kernel.clone(),
286                process,
287                None,
288                pid,
289                Some(SIGCHLD),
290                process_group,
291                SignalActions::default(),
292            );
293            Ok(TaskInfo { thread: None, thread_group, memory_manager: None }.into())
294        },
295        security::task_alloc_for_kernel(),
296    )?;
297    Ok(builder.into())
298}
299
300pub fn create_task<F, L>(
301    locked: &mut Locked<L>,
302    kernel: &Kernel,
303    initial_name: TaskCommand,
304    root_fs: Arc<FsContext>,
305    task_info_factory: F,
306    security_state: security::TaskState,
307) -> Result<TaskBuilder, Errno>
308where
309    F: FnOnce(&mut Locked<L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
310    L: LockBefore<TaskRelease>,
311{
312    let mut pids = kernel.pids.write();
313    let pid = pids.allocate_pid();
314    create_task_with_pid(
315        locked,
316        kernel,
317        pids,
318        pid,
319        initial_name,
320        root_fs,
321        task_info_factory,
322        Credentials::root(),
323        &[],
324        security_state,
325    )
326}
327
328fn create_task_with_pid<F, L>(
329    locked: &mut Locked<L>,
330    kernel: &Kernel,
331    mut pids: RwLockWriteGuard<'_, PidTable>,
332    pid: pid_t,
333    initial_name: TaskCommand,
334    root_fs: Arc<FsContext>,
335    task_info_factory: F,
336    creds: Credentials,
337    rlimits: &[(Resource, u64)],
338    security_state: security::TaskState,
339) -> Result<TaskBuilder, Errno>
340where
341    F: FnOnce(&mut Locked<L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
342    L: LockBefore<TaskRelease>,
343{
344    debug_assert!(pids.get_task(pid).upgrade().is_none());
345
346    let process_group = ProcessGroup::new(pid, None);
347    pids.add_process_group(process_group.clone());
348
349    let TaskInfo { thread, thread_group, memory_manager } =
350        task_info_factory(locked, pid, process_group.clone())?;
351
352    process_group.insert(locked.cast_locked::<TaskRelease>(), &thread_group);
353
354    // > The timer slack values of init (PID 1), the ancestor of all processes, are 50,000
355    // > nanoseconds (50 microseconds).  The timer slack value is inherited by a child created
356    // > via fork(2), and is preserved across execve(2).
357    // https://man7.org/linux/man-pages/man2/prctl.2.html
358    let default_timerslack = 50_000;
359    let builder = TaskBuilder {
360        task: Task::new(
361            pid,
362            initial_name,
363            thread_group,
364            thread,
365            FdTable::default(),
366            memory_manager,
367            root_fs,
368            creds,
369            Arc::clone(&kernel.default_abstract_socket_namespace),
370            Arc::clone(&kernel.default_abstract_vsock_namespace),
371            Default::default(),
372            Default::default(),
373            None,
374            Default::default(),
375            kernel.root_uts_ns.clone(),
376            false,
377            SeccompState::default(),
378            SeccompFilterContainer::default(),
379            RobustListHeadPtr::null(&ArchWidth::Arch64),
380            default_timerslack,
381            security_state,
382        ),
383        thread_state: Default::default(),
384    };
385    release_on_error!(builder, locked, {
386        let temp_task = TempRef::from(&builder.task);
387        builder.thread_group().add(&temp_task)?;
388        for (resource, limit) in rlimits {
389            builder
390                .thread_group()
391                .limits
392                .lock(locked.cast_locked::<TaskRelease>())
393                .set(*resource, rlimit { rlim_cur: *limit, rlim_max: *limit });
394        }
395
396        pids.add_task(&temp_task);
397        Ok(())
398    });
399    Ok(builder)
400}
401
402/// Create a kernel task in the same ThreadGroup as the given `system_task`.
403///
404/// There is no underlying Zircon thread to host the task.
405pub fn create_kernel_thread<L>(
406    locked: &mut Locked<L>,
407    system_task: &Task,
408    initial_name: TaskCommand,
409) -> Result<CurrentTask, Errno>
410where
411    L: LockBefore<TaskRelease>,
412{
413    let mut pids = system_task.kernel().pids.write();
414    let pid = pids.allocate_pid();
415
416    let scheduler_state;
417    let uts_ns;
418    let default_timerslack_ns;
419    let security_state;
420    {
421        let state = system_task.read();
422        scheduler_state = state.scheduler_state;
423        uts_ns = state.uts_ns.clone();
424        default_timerslack_ns = state.default_timerslack_ns;
425        security_state = security::task_alloc_for_kernel();
426    }
427
428    let current_task: CurrentTask = TaskBuilder::new(Task::new(
429        pid,
430        initial_name,
431        system_task.thread_group().clone(),
432        None,
433        FdTable::default(),
434        system_task.mm().ok(),
435        system_task.fs(),
436        system_task.real_creds(),
437        Arc::clone(&system_task.abstract_socket_namespace),
438        Arc::clone(&system_task.abstract_vsock_namespace),
439        Default::default(),
440        Default::default(),
441        None,
442        scheduler_state,
443        uts_ns,
444        false,
445        SeccompState::default(),
446        SeccompFilterContainer::default(),
447        RobustListHeadPtr::null(&ArchWidth::Arch64),
448        default_timerslack_ns,
449        security_state,
450    ))
451    .into();
452    release_on_error!(current_task, locked, {
453        let temp_task = current_task.temp_task();
454        current_task.thread_group().add(&temp_task)?;
455        pids.add_task(&temp_task);
456        Ok(())
457    });
458    Ok(current_task)
459}