Skip to main content

starnix_core/task/
kernel.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::bpf::EbpfState;
6use crate::device::remote_block_device::RemoteBlockDeviceRegistry;
7use crate::device::{DeviceMode, DeviceRegistry};
8use crate::execution::CrashReporter;
9use crate::mm::{FutexTable, MappingSummary, MlockPinFlavor, SharedFutexKey};
10use crate::power::SuspendResumeManagerHandle;
11use crate::ptrace::StopState;
12use crate::security::{self, AuditLogger};
13use crate::task::container_namespace::ContainerNamespace;
14use crate::task::limits::SystemLimits;
15use crate::task::memory_attribution::MemoryAttributionManager;
16use crate::task::net::NetstackDevices;
17use crate::task::tracing::PidToKoidMap;
18use crate::task::{
19    AbstractUnixSocketNamespace, AbstractVsockSocketNamespace, CurrentTask, DelayedReleaser,
20    IpTables, KernelCgroups, KernelStats, KernelThreads, PidTable, SchedulerManager, Syslog, Task,
21    ThreadGroup, UtsNamespace, UtsNamespaceHandle,
22};
23use crate::time::{HrTimerManager, HrTimerManagerHandle};
24use crate::vdso::vdso_loader::Vdso;
25use crate::vfs::fs_args::MountParams;
26use crate::vfs::socket::{
27    GenericMessage, GenericNetlink, NetlinkAccessControl, NetlinkContextImpl,
28    NetlinkToClientSender, SocketAddress, SocketTokensStore,
29};
30use crate::vfs::{CacheConfig, FileOps, FsNodeHandle, FsString, Mounts, NamespaceNode};
31use bstr::{BString, ByteSlice};
32use devicetree::types::Devicetree;
33use expando::Expando;
34use fidl::endpoints::{
35    ClientEnd, ControlHandle, DiscoverableProtocolMarker, ProtocolMarker, create_endpoints,
36};
37use fidl_fuchsia_component_runner::{ComponentControllerControlHandle, ComponentStopInfo};
38use fidl_fuchsia_feedback::CrashReporterProxy;
39use fidl_fuchsia_io as fio;
40use fidl_fuchsia_memory_attribution as fattribution;
41use fidl_fuchsia_time_external::AdjustSynchronousProxy;
42use fuchsia_async as fasync;
43use fuchsia_inspect::ArrayProperty;
44use futures::FutureExt;
45use netlink::interfaces::InterfacesHandler;
46use netlink::{NETLINK_LOG_TAG, Netlink};
47use once_cell::sync::OnceCell;
48use starnix_lifecycle::AtomicCounter;
49use starnix_logging::{SyscallLogFilter, log_debug, log_error, log_info, log_warn};
50use starnix_sync::{
51    ComponentControllerLock, FileOpsCore, KernelSwapFiles, LockDepMutex, LockDepRwLock,
52    LockEqualOrBefore, Locked, OrderedMutex, PidToKoidMapLock, RwLock, SyscallLogFiltersLock,
53};
54use starnix_uapi::device_id::DeviceId;
55use starnix_uapi::errors::{Errno, errno};
56use starnix_uapi::open_flags::OpenFlags;
57use starnix_uapi::{VMADDR_CID_HOST, from_status_like_fdio};
58use std::borrow::Cow;
59use std::collections::{HashMap, HashSet};
60use std::num::NonZeroU64;
61use std::path::PathBuf;
62use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU16, Ordering};
63use std::sync::{Arc, OnceLock, Weak};
64use zx::CpuFeatureFlags;
65
/// Feature switches and defaults for a single `Kernel` instance.
///
/// Kernel features are specified in the component manifest of the starnix container
/// or explicitly provided to the kernel constructor in tests.
#[derive(Debug, Default, Clone)]
pub struct KernelFeatures {
    /// Whether the `bpf_v2` feature is enabled.
    // NOTE(review): the exact behavior gated by this flag is not visible in this file.
    pub bpf_v2: bool,

    /// Whether the kernel supports the S_ISUID and S_ISGID bits.
    ///
    /// For example, these bits are used by `sudo`.
    ///
    /// Enabling this feature is potentially a security risk because these bits allow privilege
    /// escalation.
    pub enable_suid: bool,

    /// Whether io_uring is enabled.
    ///
    /// TODO(https://fxbug.dev/297431387): Enable by default once the feature is completed.
    pub io_uring: bool,

    /// Whether the kernel should return an error to userspace, rather than panicking, if `reboot()`
    /// is requested but cannot be enacted because the kernel lacks the relevant capabilities.
    pub error_on_failed_reboot: bool,

    /// The default seclabel that is applied to components that are run in this kernel.
    ///
    /// Components can override this by setting the `seclabel` field in their program block.
    pub default_seclabel: Option<String>,

    /// Whether the kernel is being used to run the SELinux Test Suite.
    ///
    /// TODO: https://fxbug.dev/388077431 - remove this once we no longer need workarounds for the
    /// SELinux Test Suite.
    pub selinux_test_suite: bool,

    /// The default mount options to use when mounting directories from a component's namespace.
    ///
    /// The key is the path in the component's namespace, and the value is the mount options
    /// string. Looked up by `ns_mount_options()`.
    pub default_ns_mount_options: Option<HashMap<String, String>>,

    /// The default uid that is applied to components that are run in this kernel.
    ///
    /// Components can override this by setting the `uid` field in their program block.
    pub default_uid: u32,

    /// When set, mlock() never prefaults pages.
    pub mlock_always_onfault: bool,

    /// Implementation of mlock() to use for this kernel instance.
    pub mlock_pin_flavor: MlockPinFlavor,

    /// Whether excessive crash reports should be throttled.
    ///
    /// Forwarded to `CrashReporter::new` by `Kernel::new`.
    pub crash_report_throttling: bool,

    /// Whether or not to serve wifi support to Android.
    pub wifi: bool,

    /// The number of bytes to cache in pages for reading zx::MapInfo from VMARs.
    pub cached_zx_map_info_bytes: u32,

    /// The size of the Dirent LRU cache.
    pub dirent_cache_size: u32,

    /// Whether to expose a stub '/dev/ion' node, as a temporary workaround for compatibility.
    // TODO(https://fxbug.dev/485370648) remove when unnecessary
    pub fake_ion: bool,
}
133
134impl KernelFeatures {
135    /// Returns the `MountParams` to use when mounting the specified path from a component's
136    /// namespace.  This mechanism is also used to specified options for mounts created via
137    /// container features, by specifying a pseudo-path e.g. "#container".
138    pub fn ns_mount_options(&self, ns_path: &str) -> Result<MountParams, Errno> {
139        if let Some(all_options) = &self.default_ns_mount_options {
140            if let Some(options) = all_options.get(ns_path) {
141                return MountParams::parse(options.as_bytes().into());
142            }
143        }
144        Ok(MountParams::default())
145    }
146}
147
/// A kernel command line argument, represented as a name and an optional value.
///
/// Both fields borrow string data with lifetime `'a`.
pub struct ArgNameAndValue<'a> {
    /// The name of the argument.
    pub name: &'a str,
    /// The value of the argument, or `None` if there is no value.
    pub value: Option<&'a str>,
}
153
/// The shared, mutable state for the entire Starnix kernel.
///
/// The `Kernel` object holds all kernel threads, userspace tasks, and file system resources for a
/// single instance of the Starnix kernel. In production, there is one instance of this object for
/// the entire Starnix kernel. However, multiple instances of this object can be created in one
/// process during unit testing.
///
/// The structure of this object will likely need to evolve as we implement more namespacing and
/// isolation mechanisms, such as `namespaces(7)` and `pid_namespaces(7)`.
///
/// Instances are created by `Kernel::new`, which returns the kernel wrapped in an `Arc`.
pub struct Kernel {
    /// Weak reference to self. Avoids having to pass `&Arc<Kernel>` around in APIs.
    pub weak_self: Weak<Kernel>,

    /// The kernel threads running on behalf of this kernel.
    pub kthreads: KernelThreads,

    /// The features enabled for this kernel.
    pub features: KernelFeatures,

    /// The processes and threads running in this kernel, organized by pid_t.
    pub pids: RwLock<PidTable>,

    /// A weak reference to the init task (PID 1).
    ///
    /// Unset until the init task is recorded; read through `get_init_task()`.
    pub init_task: OnceLock<Weak<Task>>,

    /// Used to record the pid/tid to Koid mappings. Set when collecting trace data.
    pub pid_to_koid_mapping: Arc<LockDepRwLock<Option<PidToKoidMap>, PidToKoidMapLock>>,

    /// Subsystem-specific properties that hang off the Kernel object.
    ///
    /// Instead of adding yet another property to the Kernel object, consider storing the property
    /// in an expando if that property is only used by one part of the system, such as a module.
    pub expando: Expando,

    /// The default namespace for abstract AF_UNIX sockets in this kernel.
    ///
    /// Rather than use this default namespace, abstract socket addresses
    /// should be looked up in the AbstractSocketNamespace on each Task
    /// object because some Task objects might have a non-default namespace.
    pub default_abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,

    /// The default namespace for abstract AF_VSOCK sockets in this kernel.
    pub default_abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,

    /// The kernel command line. Shows up in /proc/cmdline.
    pub cmdline: BString,

    /// The devicetree passed to `Kernel::new`, if any.
    pub device_tree: Option<Devicetree>,

    /// Global state held by the Linux Security Modules subsystem.
    pub security_state: security::KernelState,

    /// The registry of device drivers.
    pub device_registry: DeviceRegistry,

    /// Mapping of top-level namespace entries to an associated proxy.
    /// For example, "/svc" to the respective proxy. Only the namespace entries
    /// which were known at component startup will be available by the kernel.
    pub container_namespace: ContainerNamespace,

    /// The registry of block devices backed by a remote fuchsia.io file.
    pub remote_block_device_registry: Arc<RemoteBlockDeviceRegistry>,

    /// The iptables used for filtering network packets.
    iptables: OnceLock<IpTables>,

    /// The futexes shared across processes.
    pub shared_futexes: Arc<FutexTable<SharedFutexKey>>,

    /// The default UTS namespace for all tasks.
    ///
    /// Because each task can have its own UTS namespace, you probably want to use
    /// the UTS namespace handle of the task, which may/may not point to this one.
    pub root_uts_ns: UtsNamespaceHandle,

    /// A struct containing a VMO with a vDSO implementation, if implemented for a given
    /// architecture, and possibly an offset for a sigreturn function.
    pub vdso: Vdso,

    /// A struct containing a VMO with an arch32-vDSO implementation, if implemented for a given
    /// architecture.
    // TODO(https://fxbug.dev/380431743) This could be made less clunky -- maybe a Vec<Vdso> above or
    // something else
    pub vdso_arch32: Option<Vdso>,

    /// The table of devices installed on the netstack and their associated
    /// state local to this `Kernel`.
    pub netstack_devices: Arc<NetstackDevices>,

    /// Files that are currently available for swapping.
    /// Note: Starnix never actually swaps memory to these files. We just need to track them
    /// to pass conformance tests.
    pub swap_files: OrderedMutex<Vec<FsNodeHandle>, KernelSwapFiles>,

    /// The implementation of generic Netlink protocol families.
    ///
    /// Lazily initialized by `generic_netlink()`.
    generic_netlink: OnceLock<GenericNetlink<NetlinkToClientSender<GenericMessage>>>,

    /// The implementation of networking-related Netlink protocol families.
    network_netlink: OnceLock<Netlink<NetlinkContextImpl>>,

    /// Inspect instrumentation for this kernel instance.
    pub inspect_node: fuchsia_inspect::Node,

    /// The kinds of seccomp action that gets logged, stored as a bit vector.
    /// Each potential SeccompAction gets a bit in the vector, as specified by
    /// SeccompAction::logged_bit_offset.  If the bit is set, that means the
    /// action should be logged when it is taken, subject to the caveats
    /// described in seccomp(2).  The value of the bit vector is exposed to users
    /// in a text form in the file /proc/sys/kernel/seccomp/actions_logged.
    pub actions_logged: AtomicU16,

    /// The manager for suspend/resume.
    pub suspend_resume_manager: SuspendResumeManagerHandle,

    /// Unique IDs for new mounts and mount namespaces.
    ///
    /// `Kernel::new` starts each of these three counters at 1.
    pub next_mount_id: AtomicCounter<u64>,
    pub next_peer_group_id: AtomicCounter<u64>,
    pub next_namespace_id: AtomicCounter<u64>,

    /// Unique IDs for file objects.
    pub next_file_object_id: AtomicCounter<u64>,

    /// Unique cookie used to link two inotify events, usually an IN_MOVE_FROM/IN_MOVE_TO pair.
    pub next_inotify_cookie: AtomicCounter<u32>,

    /// Controls which processes a process is allowed to ptrace.  See Documentation/security/Yama.txt
    ///
    /// `Kernel::new` initializes this to 0, which disables the YAMA checks.
    pub ptrace_scope: AtomicU8,

    /// The Fuchsia build version returned by `fuchsia.buildinfo.Provider`.
    pub build_version: OnceCell<String>,

    /// Shared `KernelStats` for this kernel.
    // NOTE(review): what is tracked by `KernelStats` is not visible in this file.
    pub stats: Arc<KernelStats>,

    /// Resource limits that are exposed, for example, via sysctl.
    pub system_limits: SystemLimits,

    /// The service to handle delayed releases. This is required for elements that need to
    /// execute some code when released and require a known context (both in terms of lock
    /// context, as well as `CurrentTask`).
    pub delayed_releaser: DelayedReleaser,

    /// Manages task priorities.
    pub scheduler: SchedulerManager,

    /// The syslog manager.
    pub syslog: Syslog,

    /// All mounts.
    ///
    /// Cleared during shutdown (see `run_shutdown`).
    pub mounts: Mounts,

    /// The manager for creating and managing high-resolution timers.
    pub hrtimer_manager: HrTimerManagerHandle,

    /// The manager for monitoring and reporting resources used by the kernel.
    pub memory_attribution_manager: MemoryAttributionManager,

    /// Handler for crashing Linux processes.
    pub crash_reporter: CrashReporter,

    /// Whether this kernel is shutting down. When shutting down, new processes may not be spawned.
    ///
    /// Set by `run_shutdown` and read through `is_shutting_down()`.
    shutting_down: AtomicBool,

    /// True to disable syslog access to unprivileged callers.  This also controls whether read
    /// access to /dev/kmsg requires privileged capabilities.
    pub restrict_dmesg: AtomicBool,

    /// Determines whether unprivileged BPF is permitted, or can be re-enabled.
    ///   0 - Unprivileged BPF is permitted.
    ///   1 - Unprivileged BPF is not permitted, and cannot be enabled.
    ///   2 - Unprivileged BPF is not permitted, but can be enabled by a privileged task.
    pub disable_unprivileged_bpf: AtomicU8,

    /// Control handle to the running container's ComponentController.
    ///
    /// Taken (left as `None`) by `run_shutdown` when it notifies CF that the container stopped.
    pub container_control_handle:
        LockDepMutex<Option<ComponentControllerControlHandle>, ComponentControllerLock>,

    /// eBPF state: loaded programs, eBPF maps, etc.
    pub ebpf_state: EbpfState,

    /// Cgroups of the kernel.
    pub cgroups: KernelCgroups,

    /// Used to communicate requests to adjust system time from within a Starnix
    /// container. Used from syscalls.
    pub time_adjustment_proxy: Option<AdjustSynchronousProxy>,

    /// Used to store tokens for sockets, particularly per-uid sharing domain sockets.
    pub socket_tokens_store: SocketTokensStore,

    /// Hardware capabilities to push onto stack when loading an ELF binary.
    pub hwcaps: HwCaps,

    /// Filters for syscall logging. Processes with names matching these filters will have syscalls
    /// logged at INFO level.
    pub syscall_log_filters: LockDepMutex<Vec<SyscallLogFilter>, SyscallLogFiltersLock>,
}
348
/// Hardware capabilities: the pair of values reported as `AT_HWCAP` and `AT_HWCAP2`.
///
/// The derived `Default` yields zero for both values.
#[derive(Debug, Clone, Copy, Default)]
pub struct HwCap {
    /// The value for `AT_HWCAP`.
    pub hwcap: u32,
    /// The value for `AT_HWCAP2`.
    pub hwcap2: u32,
}
357
/// Hardware capabilities for both 32-bit and 64-bit ELF binaries.
///
/// The `arch32` field only exists when compiling for `aarch64`.
#[derive(Debug, Clone, Copy, Default)]
pub struct HwCaps {
    /// For 32-bit binaries. Only present on `aarch64` targets.
    #[cfg(target_arch = "aarch64")]
    pub arch32: HwCap,
    /// For 64-bit binaries.
    pub arch64: HwCap,
}
367
/// An implementation of [`InterfacesHandler`].
///
/// This holds a `Weak<Kernel>` because it is held within a [`Netlink`] which
/// is itself held within an `Arc<Kernel>`. Holding an `Arc<T>` within an
/// `Arc<T>` prevents the `Arc`'s ref count from ever reaching 0, causing a
/// leak.
///
/// The single tuple field is that weak reference to the owning `Kernel`.
struct InterfacesHandlerImpl(Weak<Kernel>);
375
376impl InterfacesHandlerImpl {
377    fn kernel(&self) -> Option<Arc<Kernel>> {
378        self.0.upgrade()
379    }
380}
381
382impl InterfacesHandler for InterfacesHandlerImpl {
383    fn handle_new_link(&mut self, name: &str, interface_id: NonZeroU64) {
384        if let Some(kernel) = self.kernel() {
385            kernel.netstack_devices.add_device(&kernel, name.into(), interface_id);
386        }
387    }
388
389    fn handle_deleted_link(&mut self, name: &str) {
390        if let Some(kernel) = self.kernel() {
391            kernel.netstack_devices.remove_device(&kernel, name.into());
392        }
393    }
394
395    fn handle_idle_event(&mut self) {
396        let Some(kernel) = self.kernel() else {
397            log_error!("kernel went away while netlink is initializing");
398            return;
399        };
400        let (initialized, wq) = &kernel.netstack_devices.initialized_and_wq;
401        if initialized.swap(true, Ordering::SeqCst) {
402            log_error!("netlink initial devices should only be reported once");
403            return;
404        }
405        wq.notify_all()
406    }
407}
408
409impl Kernel {
    /// Creates a new `Kernel` and returns it wrapped in an `Arc`.
    ///
    /// The arguments are stored in the corresponding `Kernel` fields, except for
    /// `crash_reporter_proxy`, which is handed to `CrashReporter::new`. All remaining fields are
    /// initialized to defaults or freshly constructed values.
    ///
    /// After construction this initializes the device registry's objects and records two lazy
    /// inspect children, "thread_groups" and "cgroupv2", on `inspect_node`.
    ///
    /// The visible body always returns `Ok`; the `Result` return type is part of the signature.
    pub fn new(
        cmdline: BString,
        features: KernelFeatures,
        system_limits: SystemLimits,
        container_namespace: ContainerNamespace,
        scheduler: SchedulerManager,
        crash_reporter_proxy: Option<CrashReporterProxy>,
        inspect_node: fuchsia_inspect::Node,
        security_state: security::KernelState,
        time_adjustment_proxy: Option<AdjustSynchronousProxy>,
        device_tree: Option<Devicetree>,
    ) -> Result<Arc<Kernel>, zx::Status> {
        // Factories handed to the default abstract socket namespaces to turn a key into a
        // `SocketAddress`. Vsock addresses always use the host CID.
        let unix_address_maker =
            Box::new(|x: FsString| -> SocketAddress { SocketAddress::Unix(x) });
        let vsock_address_maker = Box::new(|x: u32| -> SocketAddress {
            SocketAddress::Vsock { port: x, cid: VMADDR_CID_HOST }
        });

        // NOTE(review): the 8-minute duration is presumably the throttling window; confirm
        // against `CrashReporter::new`.
        let crash_reporter = CrashReporter::new(
            &inspect_node,
            crash_reporter_proxy,
            zx::Duration::from_minutes(8),
            features.crash_report_throttling,
        );
        let hrtimer_manager = HrTimerManager::new(&inspect_node);

        // Fall back to an empty flag set when the feature flags cannot be queried.
        let cpu_feature_flags =
            zx::system_get_feature_flags::<CpuFeatureFlags>().unwrap_or_else(|e| {
                log_debug!("CPU feature flags are only supported on ARM64: {}, reporting 0", e);
                CpuFeatureFlags::empty()
            });
        let hwcaps = HwCaps::from_cpu_feature_flags(cpu_feature_flags);

        // `new_cyclic` provides a `Weak<Kernel>` during construction so that fields which need a
        // back-reference to the kernel can be given one.
        let this = Arc::new_cyclic(|kernel| Kernel {
            weak_self: kernel.clone(),
            kthreads: KernelThreads::new(kernel.clone()),
            features,
            pids: Default::default(),
            init_task: OnceLock::new(),
            pid_to_koid_mapping: Arc::new(LockDepRwLock::new(None)),
            expando: Default::default(),
            default_abstract_socket_namespace: AbstractUnixSocketNamespace::new(unix_address_maker),
            default_abstract_vsock_namespace: AbstractVsockSocketNamespace::new(
                vsock_address_maker,
            ),
            cmdline,
            device_tree,
            security_state,
            device_registry: Default::default(),
            container_namespace,
            remote_block_device_registry: Default::default(),
            iptables: OnceLock::new(),
            shared_futexes: Arc::<FutexTable<SharedFutexKey>>::default(),
            root_uts_ns: Arc::new(LockDepRwLock::new(UtsNamespace::default())),
            vdso: Vdso::new(),
            vdso_arch32: Vdso::new_arch32(),
            netstack_devices: Arc::default(),
            swap_files: Default::default(),
            generic_netlink: OnceLock::new(),
            network_netlink: OnceLock::new(),
            inspect_node,
            actions_logged: AtomicU16::new(0),
            suspend_resume_manager: Default::default(),
            next_mount_id: AtomicCounter::<u64>::new(1),
            next_peer_group_id: AtomicCounter::<u64>::new(1),
            next_namespace_id: AtomicCounter::<u64>::new(1),
            next_inotify_cookie: AtomicCounter::<u32>::new(1),
            next_file_object_id: Default::default(),
            system_limits,
            ptrace_scope: AtomicU8::new(0), // Disable YAMA checks by default.
            restrict_dmesg: AtomicBool::new(false),
            disable_unprivileged_bpf: AtomicU8::new(0), // Enable unprivileged BPF by default.
            build_version: OnceCell::new(),
            stats: Arc::new(KernelStats::default()),
            delayed_releaser: Default::default(),
            scheduler,
            syslog: Default::default(),
            mounts: Mounts::new(),
            hrtimer_manager,
            memory_attribution_manager: MemoryAttributionManager::new(kernel.clone()),
            crash_reporter,
            shutting_down: AtomicBool::new(false),
            container_control_handle: LockDepMutex::new(None),
            ebpf_state: Default::default(),
            cgroups: Default::default(),
            time_adjustment_proxy,
            socket_tokens_store: Default::default(),
            hwcaps,
            syscall_log_filters: Default::default(),
        });

        // Initialize the device registry before registering any devices.
        //
        // We will create sysfs recursively within this function.
        this.device_registry.objects.init(&mut this.kthreads.unlocked_for_async(), &this);

        // Make a copy of this Arc for the inspect lazy node to use but don't create an Arc cycle
        // because the inspect node that owns this reference is owned by the kernel.
        let kernel = Arc::downgrade(&this);
        this.inspect_node.record_lazy_child("thread_groups", move || {
            if let Some(kernel) = kernel.upgrade() {
                let inspector = kernel.get_thread_groups_inspect();
                async move { Ok(inspector) }.boxed()
            } else {
                async move { Err(anyhow::format_err!("kernel was dropped")) }.boxed()
            }
        });

        // Same pattern as above: the lazy child only holds a weak reference to the kernel.
        let kernel = Arc::downgrade(&this);
        this.inspect_node.record_lazy_child("cgroupv2", move || {
            if let Some(kernel) = kernel.upgrade() {
                async move { Ok(kernel.cgroups.cgroup2.get_cgroup_inspect()) }.boxed()
            } else {
                async move { Err(anyhow::format_err!("kernel was dropped")) }.boxed()
            }
        });

        Ok(this)
    }
529
530    /// Returns the init task for this kernel.
531    pub fn get_init_task(&self) -> Result<Arc<Task>, Errno> {
532        self.init_task.get().and_then(|t| t.upgrade()).ok_or_else(|| errno!(EINVAL))
533    }
534
535    /// Shuts down userspace and the kernel in an orderly fashion, eventually terminating the root
536    /// kernel process.
537    pub fn shut_down(self: &Arc<Self>) {
538        // Run shutdown code on a kthread in the main process so that it can be the last process
539        // alive.
540        self.kthreads.spawn_future(
541            {
542                let kernel = self.clone();
543                move || async move {
544                    kernel.run_shutdown().await;
545                }
546            },
547            "run_shutdown",
548        );
549    }
550
    /// Starts shutting down the Starnix kernel and any running container. Only one thread can drive
    /// shutdown at a time. This function will return immediately if shut down is already under way.
    ///
    /// Shutdown happens in several phases:
    ///
    /// 1. Disable launching new processes
    /// 2. Shut down individual ThreadGroups until only the init and system tasks remain
    /// 3. Repeat the above for the init task
    /// 4. Clean up kernel-internal structures that can hold processes alive
    /// 5. Ensure this process is the only one running in the kernel job.
    /// 6. Unmounts the kernel's mounts' FileSystems.
    /// 7. Tell CF the container component has stopped
    /// 8. Exit this process
    ///
    /// If a ThreadGroup does not shut down on its own (including after SIGKILL), that phase of
    /// shutdown will hang. To gracefully shut down any further we need the other kernel processes
    /// to do controlled exits that properly release access to shared state. If our orderly shutdown
    /// does hang, eventually CF will kill the container component which will lead to the job of
    /// this process being killed and shutdown will still complete.
    ///
    /// When this call is the one driving shutdown, it ends by exiting the process via
    /// `zx::Process::exit(0)`.
    async fn run_shutdown(&self) {
        const INIT_PID: i32 = 1;
        const SYSTEM_TASK_PID: i32 = 2;

        // Step 1: Prevent new processes from being created once they observe this update. We don't
        // want the thread driving shutdown to be racing with other threads creating new processes.
        //
        // The compare-exchange only succeeds for the first caller; every later caller sees the
        // flag already set and bails out.
        if self
            .shutting_down
            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
            .is_err()
        {
            log_info!("Additional thread tried to initiate shutdown while already in-progress.");
            return;
        }

        log_info!("Shutting down Starnix kernel.");

        // Step 2: Shut down thread groups in a loop until init and the system task are all that
        // remain.
        loop {
            let tgs = {
                // Exiting thread groups need to acquire a write lock for the pid table to
                // successfully exit so we need to acquire that lock in a reduced scope.
                self.pids
                    .read()
                    .get_thread_groups()
                    .into_iter()
                    .filter(|tg| tg.leader != SYSTEM_TASK_PID && tg.leader != INIT_PID)
                    .collect::<Vec<_>>()
            };
            if tgs.is_empty() {
                log_info!("pid table is empty except init and system task");
                break;
            }

            log_info!(tgs:?; "shutting down thread groups");
            // Start shutting down every remaining thread group, then wait for all of them before
            // re-reading the pid table.
            let mut tasks = vec![];
            for tg in tgs {
                let task = fasync::Task::local(ThreadGroup::shut_down(Arc::downgrade(&tg)));
                tasks.push(task);
            }
            futures::future::join_all(tasks).await;
        }

        // Step 3: Terminate the init process.
        let maybe_init = self.get_init_task().ok().map(|t| Arc::downgrade(&t.thread_group));
        if let Some(init) = maybe_init {
            log_info!("shutting down init");
            ThreadGroup::shut_down(init).await;
        } else {
            log_info!("init already terminated");
        }

        // Step 4: Clean up any structures that can keep non-Linux processes live in our job.
        log_info!("cleaning up pinned memory");
        self.expando.remove::<crate::mm::InfoCacheShadowProcess>();
        self.expando.remove::<crate::mm::MlockShadowProcess>();

        // Step 5: Make sure this is the only process running in the job. We already should have
        // cleared up all processes other than the system task at this point, but wait on any that
        // might be around for good measure.
        //
        // Use unwrap liberally since we're shutting down anyway and errors will still tear down the
        // kernel.
        let kernel_job = fuchsia_runtime::job_default();
        assert_eq!(kernel_job.children().unwrap(), &[], "starnix does not create any child jobs");
        let own_koid = fuchsia_runtime::process_self().koid().unwrap();

        log_info!("waiting for this to be the only process in the job");
        loop {
            let mut remaining_processes = kernel_job
                .processes()
                .unwrap()
                .into_iter()
                // Don't wait for ourselves to exit.
                .filter(|pid| pid != &own_koid)
                .peekable();
            if remaining_processes.peek().is_none() {
                log_info!("No stray Zircon processes.");
                break;
            }

            let mut terminated_signals = vec![];
            for pid in remaining_processes {
                // Processes whose handle cannot be obtained are skipped for this pass; the
                // enclosing loop lists the job's processes again afterwards.
                let handle = match kernel_job
                    .get_child(&pid, zx::Rights::BASIC | zx::Rights::PROPERTY | zx::Rights::DESTROY)
                {
                    Ok(h) => h,
                    Err(e) => {
                        log_info!(pid:?, e:?; "failed to get child process from job");
                        continue;
                    }
                };
                log_info!(
                    pid:?,
                    name:? = handle.get_name();
                    "waiting on process terminated signal"
                );
                terminated_signals
                    .push(fuchsia_async::OnSignals::new(handle, zx::Signals::PROCESS_TERMINATED));
            }
            log_info!("waiting on process terminated signals");
            futures::future::join_all(terminated_signals).await;
        }

        // Step 6: Forcibly unmounts the mounts' FileSystems.
        log_info!("clearing mounts");
        self.mounts.clear();

        // Step 7: Tell CF the container stopped.
        log_info!("all non-root processes killed, notifying CF container is stopped");
        // `take()` leaves `None` behind, so the control handle is consumed here.
        if let Some(control_handle) = self.container_control_handle.lock().take() {
            log_info!("Notifying CF that the container has stopped.");
            // NOTE(review): this `unwrap()` panics if sending the stop event fails; confirm that
            // is acceptable this late in shutdown.
            control_handle
                .send_on_stop(ComponentStopInfo {
                    termination_status: Some(zx::Status::OK.into_raw()),
                    exit_code: Some(0),
                    ..ComponentStopInfo::default()
                })
                .unwrap();
            control_handle.shutdown_with_epitaph(zx::Status::OK);
        } else {
            log_warn!("Shutdown invoked without a container controller control handle.");
        }

        // Step 8: exiting this process.
        log_info!("All tasks killed, exiting Starnix kernel root process.");
        // Normally a Rust program exits its process by calling `std::process::exit()` which goes
        // through libc to exit the program. This runs drop impls on any thread-local variables
        // which can cause issues during Starnix shutdown when we haven't yet integrated every
        // subsystem with the shutdown flow. While those issues are indicative of underlying
        // problems, we can't solve them without finishing the implementation of graceful shutdown.
        // Instead, ask Zircon to exit our process directly, bypassing any libc atexit handlers.
        // TODO(https://fxbug.dev/295073633) return from main instead of avoiding atexit handlers
        zx::Process::exit(0);
    }
706
    /// Returns whether shutdown of this kernel has been initiated.
    ///
    /// Reads the `shutting_down` flag, which `run_shutdown` sets as its first step.
    pub fn is_shutting_down(&self) -> bool {
        self.shutting_down.load(Ordering::Acquire)
    }
710
711    pub fn allow_unprivileged_bpf(&self) -> bool {
712        self.disable_unprivileged_bpf.load(Ordering::Relaxed) == 0
713    }
714
715    /// Opens a device file (driver) identified by `dev`.
716    pub fn open_device<L>(
717        &self,
718        locked: &mut Locked<L>,
719        current_task: &CurrentTask,
720        node: &NamespaceNode,
721        flags: OpenFlags,
722        dev: DeviceId,
723        mode: DeviceMode,
724    ) -> Result<Box<dyn FileOps>, Errno>
725    where
726        L: LockEqualOrBefore<FileOpsCore>,
727    {
728        self.device_registry.open_device(locked, current_task, node, flags, dev, mode)
729    }
730
731    /// Return a reference to the Audit Framework
732    ///
733    /// This function follows the lazy initialization pattern.
734    pub fn audit_logger(&self) -> Arc<AuditLogger> {
735        self.expando.get_or_init(|| AuditLogger::new(self))
736    }
737
738    /// Return a reference to the GenericNetlink implementation.
739    ///
740    /// This function follows the lazy initialization pattern, where the first
741    /// call will instantiate the Generic Netlink server in a separate kthread.
742    pub fn generic_netlink(&self) -> &GenericNetlink<NetlinkToClientSender<GenericMessage>> {
743        self.generic_netlink.get_or_init(|| {
744            let (generic_netlink, worker_params) = GenericNetlink::new();
745            let enable_nl80211 = self.features.wifi;
746            self.kthreads.spawn_future(
747                move || async move {
748                    crate::vfs::socket::run_generic_netlink_worker(worker_params, enable_nl80211)
749                        .await;
750                    log_error!("Generic Netlink future unexpectedly exited");
751                },
752                "generic_netlink_worker",
753            );
754            generic_netlink
755        })
756    }
757
758    /// Return a reference to the [`netlink::Netlink`] implementation.
759    ///
760    /// This function follows the lazy initialization pattern, where the first
761    /// call will instantiate the Netlink implementation.
762    pub fn network_netlink(self: &Arc<Self>) -> &Netlink<NetlinkContextImpl> {
763        self.network_netlink.get_or_init(|| {
764            let (network_netlink, worker_params) =
765                Netlink::new(InterfacesHandlerImpl(self.weak_self.clone()));
766
767            let kernel = self.clone();
768            self.kthreads.spawn_future(
769                move || async move {
770                    netlink::run_netlink_worker(
771                        worker_params,
772                        NetlinkAccessControl::new(kernel.kthreads.system_task()),
773                    )
774                    .await;
775                    log_error!(tag = NETLINK_LOG_TAG; "Netlink async worker unexpectedly exited");
776                },
777                "network_netlink_worker",
778            );
779            network_netlink
780        })
781    }
782
783    pub fn iptables(&self) -> &IpTables {
784        self.iptables.get_or_init(|| IpTables::new())
785    }
786
787    /// Returns a Proxy to the service used by the container at `filename`.
788    #[allow(unused)]
789    pub fn connect_to_named_protocol_at_container_svc<P: ProtocolMarker>(
790        &self,
791        filename: &str,
792    ) -> Result<ClientEnd<P>, Errno> {
793        match self.container_namespace.get_namespace_channel("/svc") {
794            Ok(channel) => {
795                let (client_end, server_end) = create_endpoints::<P>();
796                fdio::service_connect_at(channel.as_ref(), filename, server_end.into_channel())
797                    .map_err(|status| from_status_like_fdio!(status))?;
798                Ok(client_end)
799            }
800            Err(err) => {
801                log_error!("Unable to get /svc namespace channel! {}", err);
802                Err(errno!(ENOENT))
803            }
804        }
805    }
806
807    /// Returns a Proxy to the service `P` used by the container.
808    pub fn connect_to_protocol_at_container_svc<P: DiscoverableProtocolMarker>(
809        &self,
810    ) -> Result<ClientEnd<P>, Errno> {
811        self.connect_to_named_protocol_at_container_svc::<P>(P::PROTOCOL_NAME)
812    }
813
814    pub fn add_syscall_log_filter(&self, name: &str) {
815        let filter = SyscallLogFilter::new(name.to_string());
816        {
817            let mut filters = self.syscall_log_filters.lock();
818            if filters.contains(&filter) {
819                return;
820            }
821            filters.push(filter);
822        }
823        for headers in self.pids.read().get_thread_groups() {
824            headers.sync_syscall_log_level();
825        }
826    }
827
828    pub fn clear_syscall_log_filters(&self) {
829        {
830            let mut filters = self.syscall_log_filters.lock();
831            if filters.is_empty() {
832                return;
833            }
834            filters.clear();
835        }
836        for headers in self.pids.read().get_thread_groups() {
837            headers.sync_syscall_log_level();
838        }
839    }
840
841    fn get_thread_groups_inspect(&self) -> fuchsia_inspect::Inspector {
842        let inspector = fuchsia_inspect::Inspector::default();
843
844        let thread_groups = inspector.root();
845        let mut mm_summary = MappingSummary::default();
846        let mut mms_summarized = HashSet::new();
847
848        // Avoid holding locks for the entire iteration.
849        let all_thread_groups = {
850            let pid_table = self.pids.read();
851            pid_table.get_thread_groups()
852        };
853        for thread_group in all_thread_groups {
854            // Avoid holding the state lock while summarizing.
855            let (ppid, tasks) = {
856                let tg = thread_group.read();
857                (tg.get_ppid() as i64, tg.tasks())
858            };
859
860            let tg_node = thread_groups.create_child(format!("{}", thread_group.leader));
861            if let Ok(koid) = thread_group.process.koid() {
862                tg_node.record_int("koid", koid.raw_koid() as i64);
863            }
864            tg_node.record_int("pid", thread_group.leader as i64);
865            tg_node.record_int("ppid", ppid);
866            tg_node.record_bool("stopped", thread_group.load_stopped() == StopState::GroupStopped);
867
868            let tasks_node = tg_node.create_child("tasks");
869            for task in tasks {
870                if let Ok(mm) = task.mm() {
871                    if mms_summarized.insert(Arc::as_ptr(&mm) as usize) {
872                        mm.summarize(&mut mm_summary);
873                    }
874                }
875                let set_properties = |node: &fuchsia_inspect::Node| {
876                    node.record_string("command", task.command().to_string());
877
878                    let scheduler_state = task.read().scheduler_state;
879                    if !scheduler_state.is_default() {
880                        node.record_child("sched", |node| {
881                            node.record_string(
882                                "role_name",
883                                self.scheduler
884                                    .role_name(&task)
885                                    .map(|n| Cow::Borrowed(n))
886                                    .unwrap_or_else(|e| Cow::Owned(e.to_string())),
887                            );
888                            node.record_string("state", format!("{scheduler_state:?}"));
889                        });
890                    }
891                };
892                if task.tid == thread_group.leader {
893                    let mut argv = task.read_argv(256).unwrap_or_default();
894
895                    // Any runtime that overwrites argv is likely to leave a lot of trailing
896                    // nulls, no need to print those in inspect.
897                    argv.retain(|arg| !arg.is_empty());
898
899                    let inspect_argv = tg_node.create_string_array("argv", argv.len());
900                    for (i, arg) in argv.iter().enumerate() {
901                        inspect_argv.set(i, arg.to_string());
902                    }
903                    tg_node.record(inspect_argv);
904
905                    set_properties(&tg_node);
906                } else {
907                    tasks_node.record_child(task.tid.to_string(), |task_node| {
908                        set_properties(task_node);
909                    });
910                };
911            }
912            tg_node.record(tasks_node);
913            thread_groups.record(tg_node);
914        }
915
916        thread_groups.record_child("memory_managers", |node| mm_summary.record(node));
917
918        inspector
919    }
920
921    pub fn new_memory_attribution_observer(
922        &self,
923        control_handle: fattribution::ProviderControlHandle,
924    ) -> attribution_server::Observer {
925        self.memory_attribution_manager.new_observer(control_handle)
926    }
927
928    /// Opens and returns a directory proxy from the container's namespace, at
929    /// the requested path, using the provided flags. This method will open the
930    /// closest existing path from the namespace hierarchy, and then attempt
931    /// initialize an open on the remaining subdirectory path, using the given open_flags.
932    ///
933    /// For example, given the parameter provided is `/path/to/foo/bar` and there
934    /// are namespace entries already for `/path/to/foo` and `/path/to`. The entry
935    /// for /path/to/foo will be opened, and then the /bar will attempt to be opened
936    /// underneath that directory with the given open_flags. The returned value
937    /// will be the proxy to the parent (/path/to/foo) and the string to the child
938    /// path (/bar). The caller of this method can expect /bar to be initialized.
939    pub fn open_ns_dir(
940        &self,
941        path: &str,
942        open_flags: fio::Flags,
943    ) -> Result<(fio::DirectorySynchronousProxy, String), Errno> {
944        let ns_path = PathBuf::from(path);
945        match self.container_namespace.find_closest_channel(&ns_path) {
946            Ok((root_channel, remaining_subdir)) => {
947                let (_, server_end) = create_endpoints::<fio::DirectoryMarker>();
948                fdio::open_at(
949                    &root_channel,
950                    &remaining_subdir,
951                    open_flags,
952                    server_end.into_channel(),
953                )
954                .map_err(|e| {
955                    log_error!("Failed to intialize the subdirs: {}", e);
956                    errno!(EIO)
957                })?;
958
959                Ok((fio::DirectorySynchronousProxy::new(root_channel), remaining_subdir))
960            }
961            Err(err) => {
962                log_error!(
963                    "Unable to find a channel for {}. Received error: {}",
964                    ns_path.display(),
965                    err
966                );
967                Err(errno!(ENOENT))
968            }
969        }
970    }
971
972    /// Returns an iterator of the command line arguments.
973    pub fn cmdline_args_iter(&self) -> impl Iterator<Item = ArgNameAndValue<'_>> {
974        parse_cmdline(self.cmdline.to_str().unwrap_or_default()).filter_map(|arg| {
975            arg.split_once('=')
976                .map(|(name, value)| ArgNameAndValue { name: name, value: Some(value) })
977                .or(Some(ArgNameAndValue { name: arg, value: None }))
978        })
979    }
980
981    /// Returns the container-configured CacheConfig.
982    pub fn fs_cache_config(&self) -> CacheConfig {
983        CacheConfig { capacity: self.features.dirent_cache_size as usize }
984    }
985}
986
/// Splits a kernel command line into its space-separated arguments.
///
/// Arguments are returned as sub-slices of `cmdline`, in order, with nothing stripped:
/// quotes and backslashes remain part of the yielded text. Spaces inside a double-quoted
/// section do not end an argument. A `"` immediately preceded by a backslash does not open
/// or close a quoted section. Runs of spaces between arguments are skipped, and an
/// unterminated quoted section extends to the end of the input.
pub fn parse_cmdline(cmdline: &str) -> impl Iterator<Item = &str> {
    let mut tokens: Vec<&str> = Vec::new();
    // Byte offset at which the argument currently being scanned starts, if any.
    let mut token_begin: Option<usize> = None;
    let mut quoted = false;
    let mut last = ' ';

    for (offset, ch) in cmdline.char_indices() {
        match token_begin {
            // Between arguments: any non-space character starts a new one.
            None => {
                if ch != ' ' {
                    token_begin = Some(offset);
                    if ch == '"' {
                        quoted = true;
                    }
                }
            }
            // Inside an argument: an unquoted space ends it, an unescaped quote toggles
            // quoting, and everything else is simply part of the argument.
            Some(begin) => {
                if ch == ' ' && !quoted {
                    tokens.push(&cmdline[begin..offset]);
                    token_begin = None;
                } else if ch == '"' && last != '\\' {
                    quoted = !quoted;
                }
            }
        }
        last = ch;
    }

    // Flush the trailing argument, which has no terminating space.
    tokens.extend(token_begin.map(|begin| &cmdline[begin..]));
    tokens.into_iter()
}
1018
1019impl std::fmt::Debug for Kernel {
1020    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1021        f.debug_struct("Kernel").finish()
1022    }
1023}
1024
// TODO(https://fxbug.dev/380427153): move arch dependent code to `kernel/core/arch/*`.
/// Computes the `arch32` hardware capability words from the given CPU feature flags.
///
/// A fixed baseline of capabilities is always included in `hwcap`; further bits are added to
/// `hwcap` or `hwcap2` for each recognized feature flag. Unrecognized flags are ignored.
#[cfg(target_arch = "aarch64")]
fn arm32_hwcap(cpu_feature_flags: CpuFeatureFlags) -> HwCap {
    use starnix_uapi::arch32;

    // Baseline capabilities included regardless of the detected CPU features.
    const COMPAT_ARM32_ELF_HWCAP: u32 = arch32::HWCAP_HALF
        | arch32::HWCAP_THUMB
        | arch32::HWCAP_FAST_MULT
        | arch32::HWCAP_EDSP
        | arch32::HWCAP_TLS
        | arch32::HWCAP_IDIV // == IDIVA | IDIVT.
        | arch32::HWCAP_LPAE
        | arch32::HWCAP_EVTSTRM;

    let mut caps = COMPAT_ARM32_ELF_HWCAP;
    let mut caps2 = 0;
    for flag in cpu_feature_flags.iter() {
        match flag {
            CpuFeatureFlags::ARM64_FEATURE_ISA_FP => {
                caps |= arch32::HWCAP_VFP | arch32::HWCAP_VFPv3 | arch32::HWCAP_VFPv4
            }
            CpuFeatureFlags::ARM64_FEATURE_ISA_ASIMD => caps |= arch32::HWCAP_NEON,
            CpuFeatureFlags::ARM64_FEATURE_ISA_I8MM => caps |= arch32::HWCAP_I8MM,
            CpuFeatureFlags::ARM64_FEATURE_ISA_FHM => caps |= arch32::HWCAP_ASIMDFHM,
            CpuFeatureFlags::ARM64_FEATURE_ISA_DP => caps |= arch32::HWCAP_ASIMDDP,
            CpuFeatureFlags::ARM64_FEATURE_ISA_AES => caps2 |= arch32::HWCAP2_AES,
            CpuFeatureFlags::ARM64_FEATURE_ISA_PMULL => caps2 |= arch32::HWCAP2_PMULL,
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA1 => caps2 |= arch32::HWCAP2_SHA1,
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA256 => caps2 |= arch32::HWCAP2_SHA2,
            CpuFeatureFlags::ARM64_FEATURE_ISA_CRC32 => caps2 |= arch32::HWCAP2_CRC32,
            _ => {}
        }
    }
    HwCap { hwcap: caps, hwcap2: caps2 }
}
1059
/// Computes the 64-bit hardware capability words from the given CPU feature flags.
///
/// Both words start at zero; a bit is added to `hwcap` or `hwcap2` for each recognized
/// feature flag, and unrecognized flags are ignored.
///
/// See https://docs.kernel.org/arch/arm64/elf_hwcaps.html for details.
#[cfg(target_arch = "aarch64")]
fn arm64_hwcap(cpu_feature_flags: CpuFeatureFlags) -> HwCap {
    use starnix_uapi as uapi;

    let mut caps = 0;
    let mut caps2 = 0;
    for flag in cpu_feature_flags.iter() {
        match flag {
            // Capabilities reported in the first word.
            CpuFeatureFlags::ARM64_FEATURE_ISA_FP => caps |= uapi::HWCAP_FP,
            CpuFeatureFlags::ARM64_FEATURE_ISA_ASIMD => caps |= uapi::HWCAP_ASIMD,
            CpuFeatureFlags::ARM64_FEATURE_ISA_AES => caps |= uapi::HWCAP_AES,
            CpuFeatureFlags::ARM64_FEATURE_ISA_PMULL => caps |= uapi::HWCAP_PMULL,
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA1 => caps |= uapi::HWCAP_SHA1,
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA256 => caps |= uapi::HWCAP_SHA2,
            CpuFeatureFlags::ARM64_FEATURE_ISA_CRC32 => caps |= uapi::HWCAP_CRC32,
            CpuFeatureFlags::ARM64_FEATURE_ISA_FHM => caps |= uapi::HWCAP_ASIMDFHM,
            CpuFeatureFlags::ARM64_FEATURE_ISA_DP => caps |= uapi::HWCAP_ASIMDDP,
            CpuFeatureFlags::ARM64_FEATURE_ISA_SM3 => caps |= uapi::HWCAP_SM3,
            CpuFeatureFlags::ARM64_FEATURE_ISA_SM4 => caps |= uapi::HWCAP_SM4,
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA3 => caps |= uapi::HWCAP_SHA3,
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA512 => caps |= uapi::HWCAP_SHA512,
            CpuFeatureFlags::ARM64_FEATURE_ISA_ATOMICS => caps |= uapi::HWCAP_ATOMICS,
            CpuFeatureFlags::ARM64_FEATURE_ISA_RDM => caps |= uapi::HWCAP_ASIMDRDM,
            CpuFeatureFlags::ARM64_FEATURE_ISA_TS => caps |= uapi::HWCAP_FLAGM,
            CpuFeatureFlags::ARM64_FEATURE_ISA_DPB => caps |= uapi::HWCAP_DCPOP,
            // Capabilities reported in the second word.
            CpuFeatureFlags::ARM64_FEATURE_ISA_I8MM => caps2 |= uapi::HWCAP2_I8MM,
            CpuFeatureFlags::ARM64_FEATURE_ISA_RNDR => caps2 |= uapi::HWCAP2_RNG,
            _ => {}
        }
    }
    HwCap { hwcap: caps, hwcap2: caps2 }
}
1093
1094impl HwCaps {
1095    #[cfg(target_arch = "aarch64")]
1096    pub fn from_cpu_feature_flags(cpu_feature_flags: CpuFeatureFlags) -> Self {
1097        Self { arch32: arm32_hwcap(cpu_feature_flags), arch64: arm64_hwcap(cpu_feature_flags) }
1098    }
1099
1100    #[cfg(not(target_arch = "aarch64"))]
1101    pub fn from_cpu_feature_flags(_cpu_feature_flags: CpuFeatureFlags) -> Self {
1102        Self { arch64: HwCap::default() }
1103    }
1104}
1105
#[cfg(test)]
mod test {
    use super::parse_cmdline;

    /// Covers plain arguments, `name=value` arguments, quoted arguments containing spaces,
    /// and a quoted argument containing a backslash-escaped quote.
    #[test]
    fn test_parse_cmdline() {
        let parsed: Vec<&str> = parse_cmdline(
            r#"first second=third "fourth fifth" sixth="seventh eighth" "ninth\" tenth" eleventh"#,
        )
        .collect();

        // Quotes and backslashes are expected to be preserved verbatim.
        let expected = [
            "first",
            "second=third",
            r#""fourth fifth""#,
            r#"sixth="seventh eighth""#,
            r#""ninth\" tenth""#,
            "eleventh",
        ];
        assert_eq!(parsed, expected);
    }
}