Skip to main content

starnix_core/execution/
task_creation.rs

1// Copyright 2025 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::mm::MemoryManager;
6use crate::security;
7use crate::signals::SignalActions;
8use crate::task::{
9    CurrentTask, Kernel, PidTable, ProcessGroup, RobustListHeadPtr, SeccompFilterContainer,
10    SeccompState, Task, TaskBuilder, ThreadGroup, ThreadGroupParent, ThreadGroupWriteGuard,
11};
12use crate::vfs::{FdTable, FsContext};
13use starnix_sync::{
14    LockBefore, Locked, ProcessGroupState, RwLockWriteGuard, TaskRelease, Unlocked,
15};
16use starnix_task_command::TaskCommand;
17use starnix_types::arch::ArchWidth;
18use starnix_types::release_on_error;
19use starnix_uapi::auth::Credentials;
20use starnix_uapi::errors::Errno;
21use starnix_uapi::resource_limits::Resource;
22use starnix_uapi::signals::{SIGCHLD, Signal};
23use starnix_uapi::{errno, error, from_status_like_fdio, pid_t, rlimit};
24use std::ffi::CString;
25use std::sync::Arc;
26
27/// Result returned when creating new Zircon threads and processes for tasks.
28pub struct TaskInfo {
29    /// The thread that was created for the task.
30    pub thread: Option<zx::Thread>,
31
32    /// The thread group that the task should be added to.
33    pub thread_group: Arc<ThreadGroup>,
34
35    /// The memory manager to use for the task.
36    pub memory_manager: Option<Arc<MemoryManager>>,
37}
38
39pub fn create_zircon_process<L>(
40    locked: &mut Locked<L>,
41    kernel: &Arc<Kernel>,
42    parent: Option<ThreadGroupWriteGuard<'_>>,
43    pid: pid_t,
44    exit_signal: Option<Signal>,
45    process_group: Arc<ProcessGroup>,
46    signal_actions: Arc<SignalActions>,
47    name: TaskCommand,
48) -> Result<TaskInfo, Errno>
49where
50    L: LockBefore<ProcessGroupState>,
51{
52    // Don't allow new processes to be created once the kernel has started shutting down.
53    if kernel.is_shutting_down() {
54        return error!(EBUSY);
55    }
56    let (process, root_vmar) =
57        create_shared(&kernel.kthreads.starnix_process, zx::ProcessOptions::empty(), name)
58            .map_err(|status| from_status_like_fdio!(status))?;
59
60    // Make sure that if this process panics in normal mode that the whole kernel's job is killed.
61    fuchsia_runtime::job_default()
62        .set_critical(zx::JobCriticalOptions::RETCODE_NONZERO, &process)
63        .map_err(|status| from_status_like_fdio!(status))?;
64
65    let thread_group = ThreadGroup::new(
66        locked,
67        kernel.clone(),
68        process,
69        root_vmar,
70        parent,
71        pid,
72        exit_signal,
73        process_group,
74        signal_actions,
75    );
76
77    Ok(TaskInfo { thread: None, thread_group, memory_manager: None })
78}
79
80/// Creates a process that shares half its address space with this process.
81///
82/// The created process will also share its handle table and futex context with `self`.
83///
84/// Returns the created process and a handle to the created process' restricted address space.
85///
86/// Wraps the
87/// [zx_process_create_shared](https://fuchsia.dev/fuchsia-src/reference/syscalls/process_create_shared.md)
88/// syscall.
89fn create_shared(
90    process: &zx::Process,
91    options: zx::ProcessOptions,
92    name: TaskCommand,
93) -> Result<(zx::Process, zx::Vmar), zx::Status> {
94    let self_raw = process.raw_handle();
95    let name_bytes = name.as_bytes();
96    let mut process_out = 0;
97    let mut restricted_vmar_out = 0;
98    #[allow(
99        clippy::undocumented_unsafe_blocks,
100        reason = "Force documented unsafe blocks in Starnix"
101    )]
102    let status = unsafe {
103        zx::sys::zx_process_create_shared(
104            self_raw,
105            options.bits(),
106            name_bytes.as_ptr(),
107            name_bytes.len(),
108            &mut process_out,
109            &mut restricted_vmar_out,
110        )
111    };
112    zx::ok(status)?;
113    #[allow(
114        clippy::undocumented_unsafe_blocks,
115        reason = "Force documented unsafe blocks in Starnix"
116    )]
117    unsafe {
118        Ok((
119            zx::Process::from(zx::NullableHandle::from_raw(process_out)),
120            zx::Vmar::from(zx::NullableHandle::from_raw(restricted_vmar_out)),
121        ))
122    }
123}
124
125/// Create a process that is a child of the `init` process.
126///
127/// The created process will be a task that is the leader of a new thread group.
128///
129/// Most processes are created by userspace and are descendants of the `init` process. In
130/// some situations, the kernel needs to create a process itself. This function is the
131/// preferred way of creating an actual userspace process because making the process a child of
132/// `init` means that `init` is responsible for waiting on the process when it dies and thereby
133/// cleaning up its zombie.
134///
135/// If you just need a kernel task, and not an entire userspace process, consider using
136/// `create_system_task` instead. Even better, consider using the `kthreads` threadpool.
137///
138/// If `seclabel` is set, or the container specified a `default_seclabel`, then it will be
139/// resolved against the `kernel`'s active security policy, and applied to the new task.
140/// Otherwise the task will inherit its LSM state from the "init" task.
141///
142/// This function creates an underlying Zircon process to host the new task.
143pub fn create_init_child_process<L>(
144    locked: &mut Locked<L>,
145    kernel: &Arc<Kernel>,
146    initial_name: TaskCommand,
147    mut creds: Credentials,
148    seclabel: Option<&CString>,
149) -> Result<TaskBuilder, Errno>
150where
151    L: LockBefore<TaskRelease>,
152{
153    let init_task = kernel.pids.read().get_task(1).map_err(|_| errno!(EINVAL))?;
154
155    let fs = init_task.live()?.fs().fork();
156
157    let security_state = if let Some(seclabel) = seclabel {
158        security::task_for_context(&init_task, seclabel.as_bytes().into())?
159    } else if let Some(default_seclabel) = kernel.features.default_seclabel.as_ref() {
160        security::task_for_context(&init_task, default_seclabel.as_bytes().into())?
161    } else {
162        // If SELinux is enabled then this call will fail with `EINVAL`.
163        security::task_for_context(&init_task, b"".into()).map_err(|_| {
164            errno!(EINVAL, "Container has SELinux enabled but no Security Context specified")
165        })?
166    };
167    creds.security_state = security_state;
168
169    let task = create_task(
170        locked,
171        kernel,
172        initial_name.clone(),
173        fs,
174        |locked, pid, process_group| {
175            create_zircon_process(
176                locked.cast_locked::<TaskRelease>(),
177                kernel,
178                None,
179                pid,
180                Some(SIGCHLD),
181                process_group,
182                SignalActions::default(),
183                initial_name.clone(),
184            )
185        },
186        creds.into(),
187    )?;
188    {
189        let mut init_writer = init_task.thread_group().write();
190        let mut new_process_writer = task.thread_group().write();
191        new_process_writer.parent =
192            Some(ThreadGroupParent::new(Arc::downgrade(&init_task.thread_group())));
193        init_writer.children.insert(task.tid, Arc::downgrade(task.thread_group()));
194    }
195    // A child process created via fork(2) inherits its parent's
196    // resource limits.  Resource limits are preserved across execve(2).
197    let limits = init_task.thread_group().limits.lock(locked.cast_locked::<TaskRelease>()).clone();
198    *task.thread_group().limits.lock(locked.cast_locked::<TaskRelease>()) = limits;
199    Ok(task)
200}
201
202/// Creates the initial process for a kernel.
203///
204/// The created process will be a task that is the leader of a new thread group.
205///
206/// The init process is special because it's the root of the parent/child relationship between
207/// tasks. If a task dies, the init process is ultimately responsible for waiting on that task
208/// and removing it from the zombie list.
209///
210/// It's possible for the kernel to create tasks whose ultimate parent isn't init, but such
211/// tasks cannot be created by userspace directly.
212///
213/// This function should only be called as part of booting a kernel instance. To create a
214/// process after the kernel has already booted, consider `create_init_child_process`
215/// or `create_system_task`.
216///
217/// The process created by this function should always have pid 1. We require the caller to
218/// pass the `pid` as an argument to clarify that it's the callers responsibility to determine
219/// the pid for the process.
220pub fn create_init_process(
221    locked: &mut Locked<Unlocked>,
222    kernel: &Arc<Kernel>,
223    pid: pid_t,
224    initial_name: TaskCommand,
225    fs: Arc<FsContext>,
226    rlimits: &[(Resource, u64)],
227) -> Result<TaskBuilder, Errno> {
228    let pids = kernel.pids.write();
229    create_task_with_pid(
230        locked,
231        kernel,
232        pids,
233        pid,
234        initial_name.clone(),
235        fs,
236        |locked, pid, process_group| {
237            create_zircon_process(
238                locked,
239                kernel,
240                None,
241                pid,
242                Some(SIGCHLD),
243                process_group,
244                SignalActions::default(),
245                initial_name.clone(),
246            )
247        },
248        Credentials::root(),
249        rlimits,
250    )
251}
252
253/// Create a task that runs inside the kernel.
254///
255/// There is no underlying Zircon process to host the task. Instead, the work done by this task
256/// is performed by a thread in the original Starnix process, possible as part of a thread
257/// pool.
258///
259/// This function is the preferred way to create a context for doing background work inside the
260/// kernel.
261///
262/// Rather than calling this function directly, consider using `kthreads`, which provides both
263/// a system task and a threadpool on which the task can do work.
264pub fn create_system_task<L>(
265    locked: &mut Locked<L>,
266    kernel: &Arc<Kernel>,
267    fs: Arc<FsContext>,
268) -> Result<CurrentTask, Errno>
269where
270    L: LockBefore<TaskRelease>,
271{
272    let builder = create_task(
273        locked,
274        kernel,
275        TaskCommand::new(b"kthreadd"),
276        fs,
277        |locked, pid, process_group| {
278            let thread_group = ThreadGroup::for_system(
279                locked.cast_locked::<TaskRelease>(),
280                kernel.clone(),
281                pid,
282                process_group,
283            );
284            Ok(TaskInfo { thread: None, thread_group, memory_manager: None }.into())
285        },
286        Credentials::root(),
287    )?;
288    Ok(builder.into())
289}
290
291pub fn create_task<F, L>(
292    locked: &mut Locked<L>,
293    kernel: &Kernel,
294    initial_name: TaskCommand,
295    root_fs: Arc<FsContext>,
296    task_info_factory: F,
297    creds: Arc<Credentials>,
298) -> Result<TaskBuilder, Errno>
299where
300    F: FnOnce(&mut Locked<L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
301    L: LockBefore<TaskRelease>,
302{
303    let mut pids = kernel.pids.write();
304    let pid = pids.allocate_pid();
305    create_task_with_pid(
306        locked,
307        kernel,
308        pids,
309        pid,
310        initial_name,
311        root_fs,
312        task_info_factory,
313        creds,
314        &[],
315    )
316}
317
318fn create_task_with_pid<F, L>(
319    locked: &mut Locked<L>,
320    kernel: &Kernel,
321    mut pids: RwLockWriteGuard<'_, PidTable>,
322    pid: pid_t,
323    initial_name: TaskCommand,
324    root_fs: Arc<FsContext>,
325    task_info_factory: F,
326    creds: Arc<Credentials>,
327    rlimits: &[(Resource, u64)],
328) -> Result<TaskBuilder, Errno>
329where
330    F: FnOnce(&mut Locked<L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
331    L: LockBefore<TaskRelease>,
332{
333    debug_assert!(pids.get_task(pid).is_err());
334
335    let process_group = ProcessGroup::new(pid, None);
336    pids.add_process_group(process_group.clone());
337
338    let TaskInfo { thread, thread_group, memory_manager } =
339        task_info_factory(locked, pid, process_group.clone())?;
340
341    process_group.insert(locked.cast_locked::<TaskRelease>(), &thread_group);
342
343    // > The timer slack values of init (PID 1), the ancestor of all processes, are 50,000
344    // > nanoseconds (50 microseconds).  The timer slack value is inherited by a child created
345    // > via fork(2), and is preserved across execve(2).
346    // https://man7.org/linux/man-pages/man2/prctl.2.html
347    let default_timerslack = 50_000;
348    let builder = TaskBuilder {
349        task: Task::new(
350            pid,
351            initial_name,
352            thread_group,
353            thread,
354            FdTable::default(),
355            memory_manager,
356            root_fs,
357            creds,
358            Arc::clone(&kernel.default_abstract_socket_namespace),
359            Arc::clone(&kernel.default_abstract_vsock_namespace),
360            Default::default(),
361            Default::default(),
362            None,
363            Default::default(),
364            kernel.root_uts_ns.clone(),
365            false,
366            SeccompState::default(),
367            SeccompFilterContainer::default(),
368            RobustListHeadPtr::null(&ArchWidth::Arch64),
369            default_timerslack,
370        ),
371        thread_state: Default::default(),
372    };
373    release_on_error!(builder, locked, {
374        builder.thread_group().add(Arc::clone(&builder.task))?;
375        for (resource, limit) in rlimits {
376            builder
377                .thread_group()
378                .limits
379                .lock(locked.cast_locked::<TaskRelease>())
380                .set(*resource, rlimit { rlim_cur: *limit, rlim_max: *limit });
381        }
382
383        pids.add_task(Arc::clone(&builder.task));
384        Ok(())
385    });
386    Ok(builder)
387}
388
389/// Create a kernel task in the same ThreadGroup as the given `system_task`.
390///
391/// There is no underlying Zircon thread to host the task.
392pub fn create_kernel_thread<L>(
393    locked: &mut Locked<L>,
394    system_task: &Task,
395    initial_name: TaskCommand,
396) -> Result<CurrentTask, Errno>
397where
398    L: LockBefore<TaskRelease>,
399{
400    let mut pids = system_task.kernel().pids.write();
401    let pid = pids.allocate_pid();
402
403    let scheduler_state;
404    let uts_ns;
405    let default_timerslack_ns;
406    {
407        let state = system_task.read();
408        scheduler_state = state.scheduler_state;
409        uts_ns = state.uts_ns.clone();
410        default_timerslack_ns = state.default_timerslack_ns;
411    }
412
413    let mm;
414    let fs;
415    let abstract_socket_namespace;
416    let abstract_vsock_namespace;
417    {
418        let live = system_task.live()?;
419        mm = live.mm.to_option_arc();
420        fs = live.fs.to_arc();
421        abstract_socket_namespace = live.abstract_socket_namespace.clone();
422        abstract_vsock_namespace = live.abstract_vsock_namespace.clone();
423    }
424
425    let current_task: CurrentTask = TaskBuilder::new(Task::new(
426        pid,
427        initial_name,
428        system_task.thread_group().clone(),
429        None,
430        FdTable::default(),
431        mm,
432        fs,
433        system_task.clone_creds(),
434        abstract_socket_namespace,
435        abstract_vsock_namespace,
436        Default::default(),
437        Default::default(),
438        None,
439        scheduler_state,
440        uts_ns,
441        false,
442        SeccompState::default(),
443        SeccompFilterContainer::default(),
444        RobustListHeadPtr::null(&ArchWidth::Arch64),
445        default_timerslack_ns,
446    ))
447    .into();
448    release_on_error!(current_task, locked, {
449        current_task.thread_group().add(Arc::clone(&current_task.task))?;
450        pids.add_task(Arc::clone(&current_task.task));
451        Ok(())
452    });
453    Ok(current_task)
454}