starnix_core/execution/
task_creation.rs

1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mm::MemoryManager;
6use crate::security;
7use crate::signals::SignalActions;
8use crate::task::{
9    CurrentTask, Kernel, PidTable, ProcessGroup, RobustListHeadPtr, SeccompFilterContainer,
10    SeccompState, Task, TaskBuilder, ThreadGroup, ThreadGroupParent, ThreadGroupWriteGuard,
11};
12use crate::vfs::{FdTable, FsContext};
13use starnix_sync::{
14    LockBefore, Locked, ProcessGroupState, RwLockWriteGuard, TaskRelease, Unlocked,
15};
16use starnix_task_command::TaskCommand;
17use starnix_types::arch::ArchWidth;
18use starnix_types::ownership::TempRef;
19use starnix_types::release_on_error;
20use starnix_uapi::auth::Credentials;
21use starnix_uapi::errors::Errno;
22use starnix_uapi::resource_limits::Resource;
23use starnix_uapi::signals::{SIGCHLD, Signal};
24use starnix_uapi::{errno, error, from_status_like_fdio, pid_t, rlimit};
25use std::ffi::CString;
26use std::sync::Arc;
27
28/// Result returned when creating new Zircon threads and processes for tasks.
29pub struct TaskInfo {
30    /// The thread that was created for the task.
31    pub thread: Option<zx::Thread>,
32
33    /// The thread group that the task should be added to.
34    pub thread_group: Arc<ThreadGroup>,
35
36    /// The memory manager to use for the task.
37    pub memory_manager: Option<Arc<MemoryManager>>,
38}
39
40pub fn create_zircon_process<L>(
41    locked: &mut Locked<L>,
42    kernel: &Arc<Kernel>,
43    parent: Option<ThreadGroupWriteGuard<'_>>,
44    pid: pid_t,
45    exit_signal: Option<Signal>,
46    process_group: Arc<ProcessGroup>,
47    signal_actions: Arc<SignalActions>,
48    name: TaskCommand,
49) -> Result<TaskInfo, Errno>
50where
51    L: LockBefore<ProcessGroupState>,
52{
53    // Don't allow new processes to be created once the kernel has started shutting down.
54    if kernel.is_shutting_down() {
55        return error!(EBUSY);
56    }
57    let (process, root_vmar) =
58        create_shared(&kernel.kthreads.starnix_process, zx::ProcessOptions::empty(), name)
59            .map_err(|status| from_status_like_fdio!(status))?;
60
61    // Make sure that if this process panics in normal mode that the whole kernel's job is killed.
62    fuchsia_runtime::job_default()
63        .set_critical(zx::JobCriticalOptions::RETCODE_NONZERO, &process)
64        .map_err(|status| from_status_like_fdio!(status))?;
65
66    let memory_manager =
67        Arc::new(MemoryManager::new(root_vmar).map_err(|status| from_status_like_fdio!(status))?);
68
69    let thread_group = ThreadGroup::new(
70        locked,
71        kernel.clone(),
72        process,
73        parent,
74        pid,
75        exit_signal,
76        process_group,
77        signal_actions,
78    );
79
80    Ok(TaskInfo { thread: None, thread_group, memory_manager: Some(memory_manager) })
81}
82
83/// Creates a process that shares half its address space with this process.
84///
85/// The created process will also share its handle table and futex context with `self`.
86///
87/// Returns the created process and a handle to the created process' restricted address space.
88///
89/// Wraps the
90/// [zx_process_create_shared](https://fuchsia.dev/fuchsia-src/reference/syscalls/process_create_shared.md)
91/// syscall.
92fn create_shared(
93    process: &zx::Process,
94    options: zx::ProcessOptions,
95    name: TaskCommand,
96) -> Result<(zx::Process, zx::Vmar), zx::Status> {
97    let self_raw = process.raw_handle();
98    let name_bytes = name.as_bytes();
99    let mut process_out = 0;
100    let mut restricted_vmar_out = 0;
101    #[allow(
102        clippy::undocumented_unsafe_blocks,
103        reason = "Force documented unsafe blocks in Starnix"
104    )]
105    let status = unsafe {
106        zx::sys::zx_process_create_shared(
107            self_raw,
108            options.bits(),
109            name_bytes.as_ptr(),
110            name_bytes.len(),
111            &mut process_out,
112            &mut restricted_vmar_out,
113        )
114    };
115    zx::ok(status)?;
116    #[allow(
117        clippy::undocumented_unsafe_blocks,
118        reason = "Force documented unsafe blocks in Starnix"
119    )]
120    unsafe {
121        Ok((
122            zx::Process::from(zx::NullableHandle::from_raw(process_out)),
123            zx::Vmar::from(zx::NullableHandle::from_raw(restricted_vmar_out)),
124        ))
125    }
126}
127
128/// Create a process that is a child of the `init` process.
129///
130/// The created process will be a task that is the leader of a new thread group.
131///
132/// Most processes are created by userspace and are descendants of the `init` process. In
133/// some situations, the kernel needs to create a process itself. This function is the
134/// preferred way of creating an actual userspace process because making the process a child of
135/// `init` means that `init` is responsible for waiting on the process when it dies and thereby
136/// cleaning up its zombie.
137///
138/// If you just need a kernel task, and not an entire userspace process, consider using
139/// `create_system_task` instead. Even better, consider using the `kthreads` threadpool.
140///
141/// If `seclabel` is set, or the container specified a `default_seclabel`, then it will be
142/// resolved against the `kernel`'s active security policy, and applied to the new task.
143/// Otherwise the task will inherit its LSM state from the "init" task.
144///
145/// This function creates an underlying Zircon process to host the new task.
146pub fn create_init_child_process<L>(
147    locked: &mut Locked<L>,
148    kernel: &Arc<Kernel>,
149    initial_name: TaskCommand,
150    seclabel: Option<&CString>,
151) -> Result<TaskBuilder, Errno>
152where
153    L: LockBefore<TaskRelease>,
154{
155    let weak_init = kernel.pids.read().get_task(1);
156    let init_task = weak_init.upgrade().ok_or_else(|| errno!(EINVAL))?;
157
158    let security_context = if let Some(seclabel) = seclabel {
159        security::task_for_context(&init_task, seclabel.as_bytes().into())?
160    } else if let Some(default_seclabel) = kernel.features.default_seclabel.as_ref() {
161        security::task_for_context(&init_task, default_seclabel.as_bytes().into())?
162    } else {
163        // If SELinux is enabled then this call will fail with `EINVAL`.
164        security::task_for_context(&init_task, b"".into()).map_err(|_| {
165            errno!(EINVAL, "Container has SELinux enabled but no Security Context specified")
166        })?
167    };
168
169    let task = create_task(
170        locked,
171        kernel,
172        initial_name.clone(),
173        init_task.fs().fork(),
174        |locked, pid, process_group| {
175            create_zircon_process(
176                locked.cast_locked::<TaskRelease>(),
177                kernel,
178                None,
179                pid,
180                Some(SIGCHLD),
181                process_group,
182                SignalActions::default(),
183                initial_name.clone(),
184            )
185        },
186        security_context,
187    )?;
188    {
189        let mut init_writer = init_task.thread_group().write();
190        let mut new_process_writer = task.thread_group().write();
191        new_process_writer.parent =
192            Some(ThreadGroupParent::new(Arc::downgrade(&init_task.thread_group())));
193        init_writer.children.insert(task.tid, Arc::downgrade(task.thread_group()));
194    }
195    // A child process created via fork(2) inherits its parent's
196    // resource limits.  Resource limits are preserved across execve(2).
197    let limits = init_task.thread_group().limits.lock(locked.cast_locked::<TaskRelease>()).clone();
198    *task.thread_group().limits.lock(locked.cast_locked::<TaskRelease>()) = limits;
199    Ok(task)
200}
201
202/// Creates the initial process for a kernel.
203///
204/// The created process will be a task that is the leader of a new thread group.
205///
206/// The init process is special because it's the root of the parent/child relationship between
207/// tasks. If a task dies, the init process is ultimately responsible for waiting on that task
208/// and removing it from the zombie list.
209///
210/// It's possible for the kernel to create tasks whose ultimate parent isn't init, but such
211/// tasks cannot be created by userspace directly.
212///
213/// This function should only be called as part of booting a kernel instance. To create a
214/// process after the kernel has already booted, consider `create_init_child_process`
215/// or `create_system_task`.
216///
217/// The process created by this function should always have pid 1. We require the caller to
218/// pass the `pid` as an argument to clarify that it's the callers responsibility to determine
219/// the pid for the process.
220pub fn create_init_process(
221    locked: &mut Locked<Unlocked>,
222    kernel: &Arc<Kernel>,
223    pid: pid_t,
224    initial_name: TaskCommand,
225    fs: Arc<FsContext>,
226    rlimits: &[(Resource, u64)],
227) -> Result<TaskBuilder, Errno> {
228    let pids = kernel.pids.write();
229    create_task_with_pid(
230        locked,
231        kernel,
232        pids,
233        pid,
234        initial_name.clone(),
235        fs,
236        |locked, pid, process_group| {
237            create_zircon_process(
238                locked,
239                kernel,
240                None,
241                pid,
242                Some(SIGCHLD),
243                process_group,
244                SignalActions::default(),
245                initial_name.clone(),
246            )
247        },
248        Credentials::root(),
249        rlimits,
250        // If SELinux is enabled then `exec()` of the "init" executable will normally be
251        // configured by policy to transition to the desired init task Security Context.
252        security::task_alloc_for_kernel(),
253    )
254}
255
256/// Create a task that runs inside the kernel.
257///
258/// There is no underlying Zircon process to host the task. Instead, the work done by this task
259/// is performed by a thread in the original Starnix process, possible as part of a thread
260/// pool.
261///
262/// This function is the preferred way to create a context for doing background work inside the
263/// kernel.
264///
265/// Rather than calling this function directly, consider using `kthreads`, which provides both
266/// a system task and a threadpool on which the task can do work.
267pub fn create_system_task<L>(
268    locked: &mut Locked<L>,
269    kernel: &Arc<Kernel>,
270    fs: Arc<FsContext>,
271) -> Result<CurrentTask, Errno>
272where
273    L: LockBefore<TaskRelease>,
274{
275    let builder = create_task(
276        locked,
277        kernel,
278        TaskCommand::new(b"kthreadd"),
279        fs,
280        |locked, pid, process_group| {
281            let process = zx::Process::from(zx::NullableHandle::invalid());
282            let thread_group = ThreadGroup::new(
283                locked.cast_locked::<TaskRelease>(),
284                kernel.clone(),
285                process,
286                None,
287                pid,
288                Some(SIGCHLD),
289                process_group,
290                SignalActions::default(),
291            );
292            Ok(TaskInfo { thread: None, thread_group, memory_manager: None }.into())
293        },
294        security::task_alloc_for_kernel(),
295    )?;
296    Ok(builder.into())
297}
298
299pub fn create_task<F, L>(
300    locked: &mut Locked<L>,
301    kernel: &Kernel,
302    initial_name: TaskCommand,
303    root_fs: Arc<FsContext>,
304    task_info_factory: F,
305    security_state: security::TaskState,
306) -> Result<TaskBuilder, Errno>
307where
308    F: FnOnce(&mut Locked<L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
309    L: LockBefore<TaskRelease>,
310{
311    let mut pids = kernel.pids.write();
312    let pid = pids.allocate_pid();
313    create_task_with_pid(
314        locked,
315        kernel,
316        pids,
317        pid,
318        initial_name,
319        root_fs,
320        task_info_factory,
321        Credentials::root(),
322        &[],
323        security_state,
324    )
325}
326
327fn create_task_with_pid<F, L>(
328    locked: &mut Locked<L>,
329    kernel: &Kernel,
330    mut pids: RwLockWriteGuard<'_, PidTable>,
331    pid: pid_t,
332    initial_name: TaskCommand,
333    root_fs: Arc<FsContext>,
334    task_info_factory: F,
335    creds: Arc<Credentials>,
336    rlimits: &[(Resource, u64)],
337    security_state: security::TaskState,
338) -> Result<TaskBuilder, Errno>
339where
340    F: FnOnce(&mut Locked<L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
341    L: LockBefore<TaskRelease>,
342{
343    debug_assert!(pids.get_task(pid).upgrade().is_none());
344
345    let process_group = ProcessGroup::new(pid, None);
346    pids.add_process_group(process_group.clone());
347
348    let TaskInfo { thread, thread_group, memory_manager } =
349        task_info_factory(locked, pid, process_group.clone())?;
350
351    process_group.insert(locked.cast_locked::<TaskRelease>(), &thread_group);
352
353    // > The timer slack values of init (PID 1), the ancestor of all processes, are 50,000
354    // > nanoseconds (50 microseconds).  The timer slack value is inherited by a child created
355    // > via fork(2), and is preserved across execve(2).
356    // https://man7.org/linux/man-pages/man2/prctl.2.html
357    let default_timerslack = 50_000;
358    let builder = TaskBuilder {
359        task: Task::new(
360            pid,
361            initial_name,
362            thread_group,
363            thread,
364            FdTable::default(),
365            memory_manager,
366            root_fs,
367            creds,
368            Arc::clone(&kernel.default_abstract_socket_namespace),
369            Arc::clone(&kernel.default_abstract_vsock_namespace),
370            Default::default(),
371            Default::default(),
372            None,
373            Default::default(),
374            kernel.root_uts_ns.clone(),
375            false,
376            SeccompState::default(),
377            SeccompFilterContainer::default(),
378            RobustListHeadPtr::null(&ArchWidth::Arch64),
379            default_timerslack,
380            security_state,
381        ),
382        thread_state: Default::default(),
383    };
384    release_on_error!(builder, locked, {
385        let temp_task = TempRef::from(&builder.task);
386        builder.thread_group().add(&temp_task)?;
387        for (resource, limit) in rlimits {
388            builder
389                .thread_group()
390                .limits
391                .lock(locked.cast_locked::<TaskRelease>())
392                .set(*resource, rlimit { rlim_cur: *limit, rlim_max: *limit });
393        }
394
395        pids.add_task(&temp_task);
396        Ok(())
397    });
398    Ok(builder)
399}
400
401/// Create a kernel task in the same ThreadGroup as the given `system_task`.
402///
403/// There is no underlying Zircon thread to host the task.
404pub fn create_kernel_thread<L>(
405    locked: &mut Locked<L>,
406    system_task: &Task,
407    initial_name: TaskCommand,
408) -> Result<CurrentTask, Errno>
409where
410    L: LockBefore<TaskRelease>,
411{
412    let mut pids = system_task.kernel().pids.write();
413    let pid = pids.allocate_pid();
414
415    let scheduler_state;
416    let uts_ns;
417    let default_timerslack_ns;
418    let security_state;
419    {
420        let state = system_task.read();
421        scheduler_state = state.scheduler_state;
422        uts_ns = state.uts_ns.clone();
423        default_timerslack_ns = state.default_timerslack_ns;
424        security_state = security::task_alloc_for_kernel();
425    }
426
427    let current_task: CurrentTask = TaskBuilder::new(Task::new(
428        pid,
429        initial_name,
430        system_task.thread_group().clone(),
431        None,
432        FdTable::default(),
433        system_task.mm().ok(),
434        system_task.fs(),
435        system_task.clone_creds(),
436        Arc::clone(&system_task.abstract_socket_namespace),
437        Arc::clone(&system_task.abstract_vsock_namespace),
438        Default::default(),
439        Default::default(),
440        None,
441        scheduler_state,
442        uts_ns,
443        false,
444        SeccompState::default(),
445        SeccompFilterContainer::default(),
446        RobustListHeadPtr::null(&ArchWidth::Arch64),
447        default_timerslack_ns,
448        security_state,
449    ))
450    .into();
451    release_on_error!(current_task, locked, {
452        let temp_task = current_task.temp_task();
453        current_task.thread_group().add(&temp_task)?;
454        pids.add_task(&temp_task);
455        Ok(())
456    });
457    Ok(current_task)
458}