Skip to main content

starnix_core/execution/
task_creation.rs

// Copyright 2025 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5use crate::mm::MemoryManager;
6use crate::security;
7use crate::signals::SignalActions;
8use crate::task::{
9    CurrentTask, Kernel, PidTable, ProcessGroup, RobustListHeadPtr, SeccompFilterContainer,
10    SeccompState, Task, TaskBuilder, ThreadGroup, ThreadGroupParent, ThreadGroupWriteGuard,
11};
12use crate::vfs::{FdTable, FsContext};
13use starnix_sync::{
14    LockBefore, Locked, ProcessGroupState, RwLockWriteGuard, TaskRelease, Unlocked,
15};
16use starnix_task_command::TaskCommand;
17use starnix_types::arch::ArchWidth;
18use starnix_types::ownership::TempRef;
19use starnix_types::release_on_error;
20use starnix_uapi::auth::Credentials;
21use starnix_uapi::errors::Errno;
22use starnix_uapi::resource_limits::Resource;
23use starnix_uapi::signals::{SIGCHLD, Signal};
24use starnix_uapi::{errno, error, from_status_like_fdio, pid_t, rlimit};
25use std::ffi::CString;
26use std::sync::Arc;
27
28/// Result returned when creating new Zircon threads and processes for tasks.
29pub struct TaskInfo {
30    /// The thread that was created for the task.
31    pub thread: Option<zx::Thread>,
32
33    /// The thread group that the task should be added to.
34    pub thread_group: Arc<ThreadGroup>,
35
36    /// The memory manager to use for the task.
37    pub memory_manager: Option<Arc<MemoryManager>>,
38}
39
40pub fn create_zircon_process<L>(
41    locked: &mut Locked<L>,
42    kernel: &Arc<Kernel>,
43    parent: Option<ThreadGroupWriteGuard<'_>>,
44    pid: pid_t,
45    exit_signal: Option<Signal>,
46    process_group: Arc<ProcessGroup>,
47    signal_actions: Arc<SignalActions>,
48    name: TaskCommand,
49) -> Result<TaskInfo, Errno>
50where
51    L: LockBefore<ProcessGroupState>,
52{
53    // Don't allow new processes to be created once the kernel has started shutting down.
54    if kernel.is_shutting_down() {
55        return error!(EBUSY);
56    }
57    let (process, root_vmar) =
58        create_shared(&kernel.kthreads.starnix_process, zx::ProcessOptions::empty(), name)
59            .map_err(|status| from_status_like_fdio!(status))?;
60
61    // Make sure that if this process panics in normal mode that the whole kernel's job is killed.
62    fuchsia_runtime::job_default()
63        .set_critical(zx::JobCriticalOptions::RETCODE_NONZERO, &process)
64        .map_err(|status| from_status_like_fdio!(status))?;
65
66    let thread_group = ThreadGroup::new(
67        locked,
68        kernel.clone(),
69        process,
70        root_vmar,
71        parent,
72        pid,
73        exit_signal,
74        process_group,
75        signal_actions,
76    );
77
78    Ok(TaskInfo { thread: None, thread_group, memory_manager: None })
79}
80
81/// Creates a process that shares half its address space with this process.
82///
83/// The created process will also share its handle table and futex context with `self`.
84///
85/// Returns the created process and a handle to the created process' restricted address space.
86///
87/// Wraps the
88/// [zx_process_create_shared](https://fuchsia.dev/fuchsia-src/reference/syscalls/process_create_shared.md)
89/// syscall.
90fn create_shared(
91    process: &zx::Process,
92    options: zx::ProcessOptions,
93    name: TaskCommand,
94) -> Result<(zx::Process, zx::Vmar), zx::Status> {
95    let self_raw = process.raw_handle();
96    let name_bytes = name.as_bytes();
97    let mut process_out = 0;
98    let mut restricted_vmar_out = 0;
99    #[allow(
100        clippy::undocumented_unsafe_blocks,
101        reason = "Force documented unsafe blocks in Starnix"
102    )]
103    let status = unsafe {
104        zx::sys::zx_process_create_shared(
105            self_raw,
106            options.bits(),
107            name_bytes.as_ptr(),
108            name_bytes.len(),
109            &mut process_out,
110            &mut restricted_vmar_out,
111        )
112    };
113    zx::ok(status)?;
114    #[allow(
115        clippy::undocumented_unsafe_blocks,
116        reason = "Force documented unsafe blocks in Starnix"
117    )]
118    unsafe {
119        Ok((
120            zx::Process::from(zx::NullableHandle::from_raw(process_out)),
121            zx::Vmar::from(zx::NullableHandle::from_raw(restricted_vmar_out)),
122        ))
123    }
124}
125
126/// Create a process that is a child of the `init` process.
127///
128/// The created process will be a task that is the leader of a new thread group.
129///
130/// Most processes are created by userspace and are descendants of the `init` process. In
131/// some situations, the kernel needs to create a process itself. This function is the
132/// preferred way of creating an actual userspace process because making the process a child of
133/// `init` means that `init` is responsible for waiting on the process when it dies and thereby
134/// cleaning up its zombie.
135///
136/// If you just need a kernel task, and not an entire userspace process, consider using
137/// `create_system_task` instead. Even better, consider using the `kthreads` threadpool.
138///
139/// If `seclabel` is set, or the container specified a `default_seclabel`, then it will be
140/// resolved against the `kernel`'s active security policy, and applied to the new task.
141/// Otherwise the task will inherit its LSM state from the "init" task.
142///
143/// This function creates an underlying Zircon process to host the new task.
144pub fn create_init_child_process<L>(
145    locked: &mut Locked<L>,
146    kernel: &Arc<Kernel>,
147    initial_name: TaskCommand,
148    mut creds: Credentials,
149    seclabel: Option<&CString>,
150) -> Result<TaskBuilder, Errno>
151where
152    L: LockBefore<TaskRelease>,
153{
154    let weak_init = kernel.pids.read().get_task(1);
155    let init_task = weak_init.upgrade().ok_or_else(|| errno!(EINVAL))?;
156
157    let fs = init_task.live()?.fs().fork();
158
159    let security_state = if let Some(seclabel) = seclabel {
160        security::task_for_context(&init_task, seclabel.as_bytes().into())?
161    } else if let Some(default_seclabel) = kernel.features.default_seclabel.as_ref() {
162        security::task_for_context(&init_task, default_seclabel.as_bytes().into())?
163    } else {
164        // If SELinux is enabled then this call will fail with `EINVAL`.
165        security::task_for_context(&init_task, b"".into()).map_err(|_| {
166            errno!(EINVAL, "Container has SELinux enabled but no Security Context specified")
167        })?
168    };
169    creds.security_state = security_state;
170
171    let task = create_task(
172        locked,
173        kernel,
174        initial_name.clone(),
175        fs,
176        |locked, pid, process_group| {
177            create_zircon_process(
178                locked.cast_locked::<TaskRelease>(),
179                kernel,
180                None,
181                pid,
182                Some(SIGCHLD),
183                process_group,
184                SignalActions::default(),
185                initial_name.clone(),
186            )
187        },
188        creds.into(),
189    )?;
190    {
191        let mut init_writer = init_task.thread_group().write();
192        let mut new_process_writer = task.thread_group().write();
193        new_process_writer.parent =
194            Some(ThreadGroupParent::new(Arc::downgrade(&init_task.thread_group())));
195        init_writer.children.insert(task.tid, Arc::downgrade(task.thread_group()));
196    }
197    // A child process created via fork(2) inherits its parent's
198    // resource limits.  Resource limits are preserved across execve(2).
199    let limits = init_task.thread_group().limits.lock(locked.cast_locked::<TaskRelease>()).clone();
200    *task.thread_group().limits.lock(locked.cast_locked::<TaskRelease>()) = limits;
201    Ok(task)
202}
203
204/// Creates the initial process for a kernel.
205///
206/// The created process will be a task that is the leader of a new thread group.
207///
208/// The init process is special because it's the root of the parent/child relationship between
209/// tasks. If a task dies, the init process is ultimately responsible for waiting on that task
210/// and removing it from the zombie list.
211///
212/// It's possible for the kernel to create tasks whose ultimate parent isn't init, but such
213/// tasks cannot be created by userspace directly.
214///
215/// This function should only be called as part of booting a kernel instance. To create a
216/// process after the kernel has already booted, consider `create_init_child_process`
217/// or `create_system_task`.
218///
219/// The process created by this function should always have pid 1. We require the caller to
220/// pass the `pid` as an argument to clarify that it's the callers responsibility to determine
221/// the pid for the process.
222pub fn create_init_process(
223    locked: &mut Locked<Unlocked>,
224    kernel: &Arc<Kernel>,
225    pid: pid_t,
226    initial_name: TaskCommand,
227    fs: Arc<FsContext>,
228    rlimits: &[(Resource, u64)],
229) -> Result<TaskBuilder, Errno> {
230    let pids = kernel.pids.write();
231    create_task_with_pid(
232        locked,
233        kernel,
234        pids,
235        pid,
236        initial_name.clone(),
237        fs,
238        |locked, pid, process_group| {
239            create_zircon_process(
240                locked,
241                kernel,
242                None,
243                pid,
244                Some(SIGCHLD),
245                process_group,
246                SignalActions::default(),
247                initial_name.clone(),
248            )
249        },
250        Credentials::root(),
251        rlimits,
252    )
253}
254
255/// Create a task that runs inside the kernel.
256///
257/// There is no underlying Zircon process to host the task. Instead, the work done by this task
258/// is performed by a thread in the original Starnix process, possible as part of a thread
259/// pool.
260///
261/// This function is the preferred way to create a context for doing background work inside the
262/// kernel.
263///
264/// Rather than calling this function directly, consider using `kthreads`, which provides both
265/// a system task and a threadpool on which the task can do work.
266pub fn create_system_task<L>(
267    locked: &mut Locked<L>,
268    kernel: &Arc<Kernel>,
269    fs: Arc<FsContext>,
270) -> Result<CurrentTask, Errno>
271where
272    L: LockBefore<TaskRelease>,
273{
274    let builder = create_task(
275        locked,
276        kernel,
277        TaskCommand::new(b"kthreadd"),
278        fs,
279        |locked, pid, process_group| {
280            let thread_group = ThreadGroup::for_system(
281                locked.cast_locked::<TaskRelease>(),
282                kernel.clone(),
283                pid,
284                process_group,
285            );
286            Ok(TaskInfo { thread: None, thread_group, memory_manager: None }.into())
287        },
288        Credentials::root(),
289    )?;
290    Ok(builder.into())
291}
292
293pub fn create_task<F, L>(
294    locked: &mut Locked<L>,
295    kernel: &Kernel,
296    initial_name: TaskCommand,
297    root_fs: Arc<FsContext>,
298    task_info_factory: F,
299    creds: Arc<Credentials>,
300) -> Result<TaskBuilder, Errno>
301where
302    F: FnOnce(&mut Locked<L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
303    L: LockBefore<TaskRelease>,
304{
305    let mut pids = kernel.pids.write();
306    let pid = pids.allocate_pid();
307    create_task_with_pid(
308        locked,
309        kernel,
310        pids,
311        pid,
312        initial_name,
313        root_fs,
314        task_info_factory,
315        creds,
316        &[],
317    )
318}
319
320fn create_task_with_pid<F, L>(
321    locked: &mut Locked<L>,
322    kernel: &Kernel,
323    mut pids: RwLockWriteGuard<'_, PidTable>,
324    pid: pid_t,
325    initial_name: TaskCommand,
326    root_fs: Arc<FsContext>,
327    task_info_factory: F,
328    creds: Arc<Credentials>,
329    rlimits: &[(Resource, u64)],
330) -> Result<TaskBuilder, Errno>
331where
332    F: FnOnce(&mut Locked<L>, i32, Arc<ProcessGroup>) -> Result<TaskInfo, Errno>,
333    L: LockBefore<TaskRelease>,
334{
335    debug_assert!(pids.get_task(pid).upgrade().is_none());
336
337    let process_group = ProcessGroup::new(pid, None);
338    pids.add_process_group(process_group.clone());
339
340    let TaskInfo { thread, thread_group, memory_manager } =
341        task_info_factory(locked, pid, process_group.clone())?;
342
343    process_group.insert(locked.cast_locked::<TaskRelease>(), &thread_group);
344
345    // > The timer slack values of init (PID 1), the ancestor of all processes, are 50,000
346    // > nanoseconds (50 microseconds).  The timer slack value is inherited by a child created
347    // > via fork(2), and is preserved across execve(2).
348    // https://man7.org/linux/man-pages/man2/prctl.2.html
349    let default_timerslack = 50_000;
350    let builder = TaskBuilder {
351        task: Task::new(
352            pid,
353            initial_name,
354            thread_group,
355            thread,
356            FdTable::default(),
357            memory_manager,
358            root_fs,
359            creds,
360            Arc::clone(&kernel.default_abstract_socket_namespace),
361            Arc::clone(&kernel.default_abstract_vsock_namespace),
362            Default::default(),
363            Default::default(),
364            None,
365            Default::default(),
366            kernel.root_uts_ns.clone(),
367            false,
368            SeccompState::default(),
369            SeccompFilterContainer::default(),
370            RobustListHeadPtr::null(&ArchWidth::Arch64),
371            default_timerslack,
372        ),
373        thread_state: Default::default(),
374    };
375    release_on_error!(builder, locked, {
376        let temp_task = TempRef::from(&builder.task);
377        builder.thread_group().add(&temp_task)?;
378        for (resource, limit) in rlimits {
379            builder
380                .thread_group()
381                .limits
382                .lock(locked.cast_locked::<TaskRelease>())
383                .set(*resource, rlimit { rlim_cur: *limit, rlim_max: *limit });
384        }
385
386        pids.add_task(&temp_task);
387        Ok(())
388    });
389    Ok(builder)
390}
391
392/// Create a kernel task in the same ThreadGroup as the given `system_task`.
393///
394/// There is no underlying Zircon thread to host the task.
395pub fn create_kernel_thread<L>(
396    locked: &mut Locked<L>,
397    system_task: &Task,
398    initial_name: TaskCommand,
399) -> Result<CurrentTask, Errno>
400where
401    L: LockBefore<TaskRelease>,
402{
403    let mut pids = system_task.kernel().pids.write();
404    let pid = pids.allocate_pid();
405
406    let scheduler_state;
407    let uts_ns;
408    let default_timerslack_ns;
409    {
410        let state = system_task.read();
411        scheduler_state = state.scheduler_state;
412        uts_ns = state.uts_ns.clone();
413        default_timerslack_ns = state.default_timerslack_ns;
414    }
415
416    let mm;
417    let fs;
418    let abstract_socket_namespace;
419    let abstract_vsock_namespace;
420    {
421        let live = system_task.live()?;
422        mm = live.mm.to_option_arc();
423        fs = live.fs.to_arc();
424        abstract_socket_namespace = live.abstract_socket_namespace.clone();
425        abstract_vsock_namespace = live.abstract_vsock_namespace.clone();
426    }
427
428    let current_task: CurrentTask = TaskBuilder::new(Task::new(
429        pid,
430        initial_name,
431        system_task.thread_group().clone(),
432        None,
433        FdTable::default(),
434        mm,
435        fs,
436        system_task.clone_creds(),
437        abstract_socket_namespace,
438        abstract_vsock_namespace,
439        Default::default(),
440        Default::default(),
441        None,
442        scheduler_state,
443        uts_ns,
444        false,
445        SeccompState::default(),
446        SeccompFilterContainer::default(),
447        RobustListHeadPtr::null(&ArchWidth::Arch64),
448        default_timerslack_ns,
449    ))
450    .into();
451    release_on_error!(current_task, locked, {
452        let temp_task = current_task.temp_task();
453        current_task.thread_group().add(&temp_task)?;
454        pids.add_task(&temp_task);
455        Ok(())
456    });
457    Ok(current_task)
458}