starnix_core/task/
kernel.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::bpf::EbpfState;
6use crate::device::remote_block_device::RemoteBlockDeviceRegistry;
7use crate::device::{DeviceMode, DeviceRegistry};
8use crate::execution::CrashReporter;
9use crate::mm::{FutexTable, MappingSummary, MlockPinFlavor, SharedFutexKey};
10use crate::power::SuspendResumeManagerHandle;
11use crate::security::{self, AuditLogger};
12use crate::task::container_namespace::ContainerNamespace;
13use crate::task::limits::SystemLimits;
14use crate::task::memory_attribution::MemoryAttributionManager;
15use crate::task::net::NetstackDevices;
16use crate::task::tracing::PidToKoidMap;
17use crate::task::{
18    AbstractUnixSocketNamespace, AbstractVsockSocketNamespace, CurrentTask, DelayedReleaser,
19    HrTimerManager, HrTimerManagerHandle, IpTables, KernelCgroups, KernelStats, KernelThreads,
20    PidTable, SchedulerManager, StopState, Syslog, ThreadGroup, UtsNamespace, UtsNamespaceHandle,
21};
22use crate::vdso::vdso_loader::Vdso;
23use crate::vfs::fs_args::MountParams;
24use crate::vfs::pseudo::simple_directory::SimpleDirectoryMutator;
25use crate::vfs::socket::{
26    GenericMessage, GenericNetlink, NetlinkAccessControl, NetlinkContextImpl,
27    NetlinkToClientSender, SocketAddress, SocketTokensStore,
28};
29use crate::vfs::{FileOps, FsNodeHandle, FsString, Mounts, NamespaceNode};
30use bstr::{BString, ByteSlice};
31use devicetree::types::Devicetree;
32use expando::Expando;
33use fidl::endpoints::{
34    ClientEnd, ControlHandle, DiscoverableProtocolMarker, ProtocolMarker, create_endpoints,
35};
36use fidl_fuchsia_component_runner::{ComponentControllerControlHandle, ComponentStopInfo};
37use fidl_fuchsia_feedback::CrashReporterProxy;
38use fidl_fuchsia_time_external::AdjustSynchronousProxy;
39use fuchsia_inspect::ArrayProperty;
40use futures::FutureExt;
41use netlink::interfaces::InterfacesHandler;
42use netlink::{NETLINK_LOG_TAG, Netlink};
43use once_cell::sync::OnceCell;
44use starnix_lifecycle::{AtomicU32Counter, AtomicU64Counter};
45use starnix_logging::{log_debug, log_error, log_info, log_warn};
46use starnix_sync::{
47    FileOpsCore, KernelSwapFiles, LockEqualOrBefore, Locked, Mutex, OrderedMutex, RwLock,
48};
49use starnix_types::ownership::TempRef;
50use starnix_uapi::device_type::DeviceType;
51use starnix_uapi::errors::{Errno, errno};
52use starnix_uapi::open_flags::OpenFlags;
53use starnix_uapi::{VMADDR_CID_HOST, from_status_like_fdio};
54use std::borrow::Cow;
55use std::collections::{HashMap, HashSet};
56use std::num::NonZeroU64;
57use std::path::PathBuf;
58use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU16, Ordering};
59use std::sync::{Arc, OnceLock, Weak};
60use zx::{AsHandleRef, CpuFeatureFlags};
61use {
62    fidl_fuchsia_io as fio, fidl_fuchsia_memory_attribution as fattribution,
63    fuchsia_async as fasync,
64};
65
/// Feature switches and default settings that configure a `Kernel` instance.
///
/// The struct derives `Default`, so callers can start from `KernelFeatures::default()` and
/// override only the fields they care about.
#[derive(Debug, Default, Clone)]
pub struct KernelFeatures {
    /// NOTE(review): presumably selects the "v2" BPF implementation; confirm against the users
    /// of this flag.
    pub bpf_v2: bool,

    /// Whether the kernel supports the S_ISUID and S_ISGID bits.
    ///
    /// For example, these bits are used by `sudo`.
    ///
    /// Enabling this feature is potentially a security risk because they allow privilege
    /// escalation.
    pub enable_suid: bool,

    /// Whether io_uring is enabled.
    ///
    /// TODO(https://fxbug.dev/297431387): Enable by default once the feature is completed.
    pub io_uring: bool,

    /// Whether the kernel should return an error to userspace, rather than panicking, if `reboot()`
    /// is requested but cannot be enacted because the kernel lacks the relevant capabilities.
    pub error_on_failed_reboot: bool,

    /// The default seclabel that is applied to components that are run in this kernel.
    ///
    /// Components can override this by setting the `seclabel` field in their program block.
    pub default_seclabel: Option<String>,

    /// Whether the kernel is being used to run the SELinux Test Suite.
    ///
    /// TODO: https://fxbug.dev/388077431 - remove this once we no longer need workarounds for the
    /// SELinux Test Suite.
    pub selinux_test_suite: bool,

    /// The default mount options to use when mounting directories from a component's namespace.
    ///
    /// The key is the path in the component's namespace, and the value is the mount options
    /// string.
    pub default_ns_mount_options: Option<HashMap<String, String>>,

    /// The default uid that is applied to components that are run in this kernel.
    ///
    /// Components can override this by setting the `uid` field in their program block.
    pub default_uid: u32,

    /// mlock() never prefaults pages.
    pub mlock_always_onfault: bool,

    /// Implementation of mlock() to use for this kernel instance.
    pub mlock_pin_flavor: MlockPinFlavor,

    /// Whether excessive crash reports should be throttled.
    pub crash_report_throttling: bool,

    /// Whether or not to serve wifi support to Android.
    pub wifi: bool,

    /// The number of bytes to cache in pages for reading zx::MapInfo from VMARs.
    pub cached_zx_map_info_bytes: u32,
}
124
125impl KernelFeatures {
126    /// Returns the `MountParams` to use when mounting the specified path from a component's
127    /// namespace.  This mechanism is also used to specified options for mounts created via
128    /// container features, by specifying a pseudo-path e.g. "#container".
129    pub fn ns_mount_options(&self, ns_path: &str) -> Result<MountParams, Errno> {
130        if let Some(all_options) = &self.default_ns_mount_options {
131            if let Some(options) = all_options.get(ns_path) {
132                return MountParams::parse(options.as_bytes().into());
133            }
134        }
135        Ok(MountParams::default())
136    }
137}
138
/// Kernel command line argument structure.
///
/// Both the name and the value are borrowed for the lifetime `'a`.
pub struct ArgNameAndValue<'a> {
    /// The argument's name.
    pub name: &'a str,
    /// The argument's value, if it has one.
    // NOTE(review): presumably `None` for arguments given without `=value`; confirm against the
    // code that constructs this type.
    pub value: Option<&'a str>,
}
144
/// The shared, mutable state for the entire Starnix kernel.
///
/// The `Kernel` object holds all kernel threads, userspace tasks, and file system resources for a
/// single instance of the Starnix kernel. In production, there is one instance of this object for
/// the entire Starnix kernel. However, multiple instances of this object can be created in one
/// process during unit testing.
///
/// The structure of this object will likely need to evolve as we implement more namespacing and
/// isolation mechanisms, such as `namespaces(7)` and `pid_namespaces(7)`.
pub struct Kernel {
    /// Weak reference to self. Allows to not have to pass &Arc<Kernel> in apis.
    pub weak_self: Weak<Kernel>,

    /// The kernel threads running on behalf of this kernel.
    pub kthreads: KernelThreads,

    /// The features enabled for this kernel.
    pub features: KernelFeatures,

    /// The processes and threads running in this kernel, organized by pid_t.
    pub pids: RwLock<PidTable>,

    /// Used to record the pid/tid to Koid mappings. Set when collecting trace data.
    pub pid_to_koid_mapping: Arc<RwLock<Option<PidToKoidMap>>>,

    /// Subsystem-specific properties that hang off the Kernel object.
    ///
    /// Instead of adding yet another property to the Kernel object, consider storing the property
    /// in an expando if that property is only used by one part of the system, such as a module.
    pub expando: Expando,

    /// The default namespace for abstract AF_UNIX sockets in this kernel.
    ///
    /// Rather than use this default namespace, abstract socket addresses
    /// should be looked up in the AbstractSocketNamespace on each Task
    /// object because some Task objects might have a non-default namespace.
    pub default_abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,

    /// The default namespace for abstract AF_VSOCK sockets in this kernel.
    pub default_abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,

    /// The kernel command line. Shows up in /proc/cmdline.
    pub cmdline: BString,

    /// The devicetree handed to this kernel at construction, if any.
    pub device_tree: Option<Devicetree>,

    /// Global state held by the Linux Security Modules subsystem.
    pub security_state: security::KernelState,

    /// The registry of device drivers.
    pub device_registry: DeviceRegistry,

    /// Mapping of top-level namespace entries to an associated proxy.
    /// For example, "/svc" to the respective proxy. Only the namespace entries
    /// which were known at component startup will be available by the kernel.
    pub container_namespace: ContainerNamespace,

    /// The registry of block devices backed by a remote fuchsia.io file.
    pub remote_block_device_registry: Arc<RemoteBlockDeviceRegistry>,

    /// The iptables used for filtering network packets.
    ///
    /// Lazily created on first access through `Kernel::iptables()`.
    iptables: OnceLock<IpTables>,

    /// The futexes shared across processes.
    pub shared_futexes: Arc<FutexTable<SharedFutexKey>>,

    /// The default UTS namespace for all tasks.
    ///
    /// Because each task can have its own UTS namespace, you probably want to use
    /// the UTS namespace handle of the task, which may/may not point to this one.
    pub root_uts_ns: UtsNamespaceHandle,

    /// A struct containing a VMO with a vDSO implementation, if implemented for a given
    /// architecture, and possibly an offset for a sigreturn function.
    pub vdso: Vdso,

    /// A struct containing a VMO with a arch32-vDSO implementation, if implemented for a given
    /// architecture.
    // TODO(https://fxbug.dev/380431743) This could be made less clunky -- maybe a Vec<Vdso> above or
    // something else
    pub vdso_arch32: Option<Vdso>,

    /// The table of devices installed on the netstack and their associated
    /// state local to this `Kernel`.
    pub netstack_devices: Arc<NetstackDevices>,

    /// Files that are currently available for swapping.
    /// Note: Starnix never actually swaps memory to these files. We just need to track them
    /// to pass conformance tests.
    pub swap_files: OrderedMutex<Vec<FsNodeHandle>, KernelSwapFiles>,

    /// The implementation of generic Netlink protocol families.
    ///
    /// Lazily created on first access through `Kernel::generic_netlink()`.
    generic_netlink: OnceLock<GenericNetlink<NetlinkToClientSender<GenericMessage>>>,

    /// The implementation of networking-related Netlink protocol families.
    ///
    /// Lazily created on first access through `Kernel::network_netlink()`.
    network_netlink: OnceLock<Netlink<NetlinkContextImpl>>,

    /// Inspect instrumentation for this kernel instance.
    pub inspect_node: fuchsia_inspect::Node,

    /// The kinds of seccomp action that gets logged, stored as a bit vector.
    /// Each potential SeccompAction gets a bit in the vector, as specified by
    /// SeccompAction::logged_bit_offset.  If the bit is set, that means the
    /// action should be logged when it is taken, subject to the caveats
    /// described in seccomp(2).  The value of the bit vector is exposed to users
    /// in a text form in the file /proc/sys/kernel/seccomp/actions_logged.
    pub actions_logged: AtomicU16,

    /// The manager for suspend/resume.
    pub suspend_resume_manager: SuspendResumeManagerHandle,

    /// Unique IDs for new mounts and mount namespaces.
    pub next_mount_id: AtomicU64Counter,
    /// Counter for peer group IDs; grouped with the mount ID counters above.
    pub next_peer_group_id: AtomicU64Counter,
    /// Counter for namespace IDs; grouped with the mount ID counters above.
    pub next_namespace_id: AtomicU64Counter,

    /// Unique IDs for file objects.
    pub next_file_object_id: AtomicU64Counter,

    /// Unique cookie used to link two inotify events, usually an IN_MOVE_FROM/IN_MOVE_TO pair.
    pub next_inotify_cookie: AtomicU32Counter,

    /// Controls which processes a process is allowed to ptrace.  See Documentation/security/Yama.txt
    pub ptrace_scope: AtomicU8,

    /// The Fuchsia build version returned by `fuchsia.buildinfo.Provider`.
    pub build_version: OnceCell<String>,

    /// Shared `KernelStats` for this kernel instance.
    pub stats: Arc<KernelStats>,

    /// Resource limits that are exposed, for example, via sysctl.
    pub system_limits: SystemLimits,

    /// The service to handle delayed releases. This is required for elements that need to
    /// execute some code when released and require a known context (both in terms of lock
    /// context, as well as `CurrentTask`).
    pub delayed_releaser: DelayedReleaser,

    /// Manages task priorities.
    pub scheduler: SchedulerManager,

    /// The syslog manager.
    pub syslog: Syslog,

    /// All mounts.
    pub mounts: Mounts,

    /// The manager for creating and managing high-resolution timers.
    pub hrtimer_manager: HrTimerManagerHandle,

    /// The manager for monitoring and reporting resources used by the kernel.
    pub memory_attribution_manager: MemoryAttributionManager,

    /// Handler for crashing Linux processes.
    pub crash_reporter: CrashReporter,

    /// Vector of functions to be run when procfs is constructed. This is to allow
    /// modules to expose directories into /proc/device-tree.
    pub procfs_device_tree_setup: Vec<fn(&SimpleDirectoryMutator)>,

    /// Whether this kernel is shutting down. When shutting down, new processes may not be spawned.
    shutting_down: AtomicBool,

    /// True to disable syslog access to unprivileged callers.  This also controls whether read
    /// access to /dev/kmsg requires privileged capabilities.
    pub restrict_dmesg: AtomicBool,

    /// Determines whether unprivileged BPF is permitted, or can be re-enabled.
    ///   0 - Unprivileged BPF is permitted.
    ///   1 - Unprivileged BPF is not permitted, and cannot be enabled.
    ///   2 - Unprivileged BPF is not permitted, but can be enabled by a privileged task.
    pub disable_unprivileged_bpf: AtomicU8,

    /// Control handle to the running container's ComponentController.
    pub container_control_handle: Mutex<Option<ComponentControllerControlHandle>>,

    /// eBPF state: loaded programs, eBPF maps, etc.
    pub ebpf_state: EbpfState,

    /// Cgroups of the kernel.
    pub cgroups: KernelCgroups,

    /// Used to communicate requests to adjust system time from within a Starnix
    /// container. Used from syscalls.
    pub time_adjustment_proxy: Option<AdjustSynchronousProxy>,

    /// Used to store tokens for sockets, particularly per-uid sharing domain sockets.
    pub socket_tokens_store: SocketTokensStore,

    /// Hardware capabilities to push onto stack when loading an ELF binary.
    pub hwcaps: HwCaps,
}
335
/// Hardware capabilities.
///
/// Holds the pair of values reported through the `AT_HWCAP` and `AT_HWCAP2` entries.
#[derive(Debug, Clone, Copy, Default)]
pub struct HwCap {
    /// The value for `AT_HWCAP`.
    pub hwcap: u32,
    /// The value for `AT_HWCAP2`.
    pub hwcap2: u32,
}
344
/// Hardware capabilities for both 32-bit and 64-bit ELF binaries.
///
/// The `arch32` field only exists when compiling for `aarch64`.
#[derive(Debug, Clone, Copy, Default)]
pub struct HwCaps {
    /// For 32-bit binaries.
    #[cfg(target_arch = "aarch64")]
    pub arch32: HwCap,
    /// For 64-bit binaries.
    pub arch64: HwCap,
}
354
355/// An implementation of [`InterfacesHandler`].
356///
357/// This holds a `Weak<Kernel>` because it is held within a [`Netlink`] which
358/// is itself held within an `Arc<Kernel>`. Holding an `Arc<T>` within an
359/// `Arc<T>` prevents the `Arc`'s ref count from ever reaching 0, causing a
360/// leak.
361struct InterfacesHandlerImpl(Weak<Kernel>);
362
363impl InterfacesHandlerImpl {
364    fn kernel(&self) -> Option<Arc<Kernel>> {
365        self.0.upgrade()
366    }
367}
368
369impl InterfacesHandler for InterfacesHandlerImpl {
370    fn handle_new_link(&mut self, name: &str, interface_id: NonZeroU64) {
371        if let Some(kernel) = self.kernel() {
372            kernel.netstack_devices.add_device(&kernel, name.into(), interface_id);
373        }
374    }
375
376    fn handle_deleted_link(&mut self, name: &str) {
377        if let Some(kernel) = self.kernel() {
378            kernel.netstack_devices.remove_device(&kernel, name.into());
379        }
380    }
381
382    fn handle_idle_event(&mut self) {
383        let Some(kernel) = self.kernel() else {
384            log_error!("kernel went away while netlink is initializing");
385            return;
386        };
387        let (initialized, wq) = &kernel.netstack_devices.initialized_and_wq;
388        if initialized.swap(true, Ordering::SeqCst) {
389            log_error!("netlink initial devices should only be reported once");
390            return;
391        }
392        wq.notify_all()
393    }
394}
395
396impl Kernel {
    /// Creates a new `Kernel` and returns it wrapped in an `Arc`.
    ///
    /// The kernel is built with `Arc::new_cyclic` so that `weak_self`, `kthreads` and
    /// `memory_attribution_manager` can each receive a `Weak<Kernel>` referring to the instance
    /// under construction. After construction, the device registry's objects are initialized
    /// and a lazy `thread_groups` inspect child is recorded on `inspect_node`.
    ///
    /// Most parameters are stored directly in the field of the same name; `crash_reporter_proxy`
    /// is consumed by the `CrashReporter` built here.
    ///
    /// No code path in this function currently returns `Err`.
    pub fn new(
        cmdline: BString,
        features: KernelFeatures,
        system_limits: SystemLimits,
        container_namespace: ContainerNamespace,
        scheduler: SchedulerManager,
        crash_reporter_proxy: Option<CrashReporterProxy>,
        inspect_node: fuchsia_inspect::Node,
        security_state: security::KernelState,
        procfs_device_tree_setup: Vec<fn(&SimpleDirectoryMutator)>,
        time_adjustment_proxy: Option<AdjustSynchronousProxy>,
        device_tree: Option<Devicetree>,
    ) -> Result<Arc<Kernel>, zx::Status> {
        // Factories that turn a raw key into the `SocketAddress` variant used by each abstract
        // socket namespace.
        let unix_address_maker =
            Box::new(|x: FsString| -> SocketAddress { SocketAddress::Unix(x) });
        let vsock_address_maker = Box::new(|x: u32| -> SocketAddress {
            SocketAddress::Vsock { port: x, cid: VMADDR_CID_HOST }
        });

        // NOTE(review): the 8-minute duration is presumably the crash report throttling window;
        // confirm against `CrashReporter::new`.
        let crash_reporter = CrashReporter::new(
            &inspect_node,
            crash_reporter_proxy,
            zx::Duration::from_minutes(8),
            features.crash_report_throttling,
        );
        let hrtimer_manager = HrTimerManager::new(&inspect_node);

        // Fall back to an empty flag set when the feature flags cannot be queried.
        let cpu_feature_flags =
            zx::system_get_feature_flags::<CpuFeatureFlags>().unwrap_or_else(|e| {
                log_debug!("CPU feature flags are only supported on ARM64: {}, reporting 0", e);
                CpuFeatureFlags::empty()
            });
        let hwcaps = HwCaps::from_cpu_feature_flags(cpu_feature_flags);

        // `kernel` is a `&Weak<Kernel>` for the instance being built.
        let this = Arc::new_cyclic(|kernel| Kernel {
            weak_self: kernel.clone(),
            kthreads: KernelThreads::new(kernel.clone()),
            features,
            pids: Default::default(),
            pid_to_koid_mapping: Arc::new(RwLock::new(None)),
            expando: Default::default(),
            default_abstract_socket_namespace: AbstractUnixSocketNamespace::new(unix_address_maker),
            default_abstract_vsock_namespace: AbstractVsockSocketNamespace::new(
                vsock_address_maker,
            ),
            cmdline,
            device_tree,
            security_state,
            device_registry: Default::default(),
            container_namespace,
            remote_block_device_registry: Default::default(),
            iptables: OnceLock::new(),
            shared_futexes: Arc::<FutexTable<SharedFutexKey>>::default(),
            root_uts_ns: Arc::new(RwLock::new(UtsNamespace::default())),
            vdso: Vdso::new(),
            vdso_arch32: Vdso::new_arch32(),
            netstack_devices: Arc::default(),
            swap_files: Default::default(),
            generic_netlink: OnceLock::new(),
            network_netlink: OnceLock::new(),
            inspect_node,
            actions_logged: AtomicU16::new(0),
            suspend_resume_manager: Default::default(),
            next_mount_id: AtomicU64Counter::new(1),
            next_peer_group_id: AtomicU64Counter::new(1),
            next_namespace_id: AtomicU64Counter::new(1),
            next_inotify_cookie: AtomicU32Counter::new(1),
            next_file_object_id: Default::default(),
            system_limits,
            ptrace_scope: AtomicU8::new(0), // Disable YAMA checks by default.
            restrict_dmesg: AtomicBool::new(false),
            disable_unprivileged_bpf: AtomicU8::new(0), // Enable unprivileged BPF by default.
            build_version: OnceCell::new(),
            stats: Arc::new(KernelStats::default()),
            delayed_releaser: Default::default(),
            scheduler,
            syslog: Default::default(),
            mounts: Mounts::new(),
            hrtimer_manager,
            memory_attribution_manager: MemoryAttributionManager::new(kernel.clone()),
            crash_reporter,
            procfs_device_tree_setup,
            shutting_down: AtomicBool::new(false),
            container_control_handle: Mutex::new(None),
            ebpf_state: Default::default(),
            cgroups: Default::default(),
            time_adjustment_proxy,
            socket_tokens_store: Default::default(),
            hwcaps,
        });

        // Initialize the device registry before registering any devices.
        //
        // We will create sysfs recursively within this function.
        this.device_registry.objects.init(&mut this.kthreads.unlocked_for_async(), &this);

        // Make a copy of this Arc for the inspect lazy node to use but don't create an Arc cycle
        // because the inspect node that owns this reference is owned by the kernel.
        let kernel = Arc::downgrade(&this);
        this.inspect_node.record_lazy_child("thread_groups", move || {
            // The lazy node reports an error once the kernel itself has been dropped.
            if let Some(kernel) = kernel.upgrade() {
                let inspector = kernel.get_thread_groups_inspect();
                async move { Ok(inspector) }.boxed()
            } else {
                async move { Err(anyhow::format_err!("kernel was dropped")) }.boxed()
            }
        });

        Ok(this)
    }
507
508    /// Shuts down userspace and the kernel in an orderly fashion, eventually terminating the root
509    /// kernel process.
510    pub fn shut_down(self: &Arc<Self>) {
511        // Run shutdown code on a kthread in the main process so that it can be the last process
512        // alive.
513        self.kthreads.spawn_future({
514            let kernel = self.clone();
515            async move || {
516                kernel.run_shutdown().await;
517            }
518        });
519    }
520
521    /// Starts shutting down the Starnix kernel and any running container. Only one thread can drive
522    /// shutdown at a time. This function will return immediately if shut down is already under way.
523    ///
524    /// Shutdown happens in several phases:
525    ///
526    /// 1. Disable launching new processes
527    /// 2. Shut down individual ThreadGroups until only the init and system tasks remain
528    /// 3. Repeat the above for the init task
529    /// 4. Clean up kernel-internal structures that can hold processes alive
530    /// 5. Ensure this process is the only one running in the kernel job.
531    /// 6. Unmounts the kernel's mounts' FileSystems.
532    /// 7. Tell CF the container component has stopped
533    /// 8. Exit this process
534    ///
535    /// If a ThreadGroup does not shut down on its own (including after SIGKILL), that phase of
536    /// shutdown will hang. To gracefully shut down any further we need the other kernel processes
537    /// to do controlled exits that properly release access to shared state. If our orderly shutdown
538    /// does hang, eventually CF will kill the container component which will lead to the job of
539    /// this process being killed and shutdown will still complete.
540    async fn run_shutdown(&self) {
541        const INIT_PID: i32 = 1;
542        const SYSTEM_TASK_PID: i32 = 2;
543
544        // Step 1: Prevent new processes from being created once they observe this update. We don't
545        // want the thread driving shutdown to be racing with other threads creating new processes.
546        if self
547            .shutting_down
548            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
549            .is_err()
550        {
551            log_debug!("Additional thread tried to initiate shutdown while already in-progress.");
552            return;
553        }
554
555        log_info!("Shutting down Starnix kernel.");
556
557        // Step 2: Shut down thread groups in a loop until init and the system task are all that
558        // remain.
559        loop {
560            let tgs = {
561                // Exiting thread groups need to acquire a write lock for the pid table to
562                // successfully exit so we need to acquire that lock in a reduced scope.
563                self.pids
564                    .read()
565                    .get_thread_groups()
566                    .filter(|tg| tg.leader != SYSTEM_TASK_PID && tg.leader != INIT_PID)
567                    .collect::<Vec<_>>()
568            };
569            if tgs.is_empty() {
570                log_debug!("pid table is empty except init and system task");
571                break;
572            }
573
574            log_debug!(tgs:?; "shutting down thread groups");
575            let mut tasks = vec![];
576            for tg in tgs {
577                let task = fasync::Task::local(ThreadGroup::shut_down(Arc::downgrade(&tg)));
578                tasks.push(task);
579            }
580            futures::future::join_all(tasks).await;
581        }
582
583        // Step 3: Terminate the init process.
584        let maybe_init = {
585            // Exiting thread groups need to acquire a write lock for the pid table to successfully
586            // exit so we need to acquire that lock in a reduced scope.
587            self.pids.read().get_thread_group(1).map(|tg| Arc::downgrade(&tg))
588        };
589        if let Some(init) = maybe_init {
590            log_debug!("shutting down init");
591            ThreadGroup::shut_down(init).await;
592        } else {
593            log_debug!("init already terminated");
594        }
595
596        // Step 4: Clean up any structures that can keep non-Linux processes live in our job.
597        self.expando.remove::<memory_pinning::ShadowProcess>();
598
599        // Step 5: Make sure this is the only process running in the job. We already should have
600        // cleared up all processes other than the system task at this point, but wait on any that
601        // might be around for good measure.
602        //
603        // Use unwrap liberally since we're shutting down anyway and errors will still tear down the
604        // kernel.
605        let kernel_job = fuchsia_runtime::job_default();
606        assert_eq!(kernel_job.children().unwrap(), &[], "starnix does not create any child jobs");
607        let own_koid = fuchsia_runtime::process_self().get_koid().unwrap();
608
609        log_debug!("waiting for this to be the only process in the job");
610        loop {
611            let mut remaining_processes = kernel_job
612                .processes()
613                .unwrap()
614                .into_iter()
615                // Don't wait for ourselves to exit.
616                .filter(|pid| pid != &own_koid)
617                .peekable();
618            if remaining_processes.peek().is_none() {
619                log_debug!("No stray Zircon processes.");
620                break;
621            }
622
623            let mut terminated_signals = vec![];
624            for pid in remaining_processes {
625                let handle = match kernel_job
626                    .get_child(&pid, zx::Rights::BASIC | zx::Rights::PROPERTY | zx::Rights::DESTROY)
627                {
628                    Ok(h) => h,
629                    Err(e) => {
630                        log_debug!(pid:?, e:?; "failed to get child process from job");
631                        continue;
632                    }
633                };
634                terminated_signals
635                    .push(fuchsia_async::OnSignals::new(handle, zx::Signals::PROCESS_TERMINATED));
636            }
637            log_debug!("waiting on process terminated signals");
638            futures::future::join_all(terminated_signals).await;
639        }
640
641        // Step 6: Forcibly unmounts the mounts' FileSystems.
642        self.mounts.clear();
643
644        // Step 7: Tell CF the container stopped.
645        log_debug!("all non-root processes killed, notifying CF container is stopped");
646        if let Some(control_handle) = self.container_control_handle.lock().take() {
647            log_debug!("Notifying CF that the container has stopped.");
648            control_handle
649                .send_on_stop(ComponentStopInfo {
650                    termination_status: Some(zx::Status::OK.into_raw()),
651                    exit_code: Some(0),
652                    ..ComponentStopInfo::default()
653                })
654                .unwrap();
655            control_handle.shutdown_with_epitaph(zx::Status::OK);
656        } else {
657            log_warn!("Shutdown invoked without a container controller control handle.");
658        }
659
660        // Step 8: exiting this process.
661        log_info!("All tasks killed, exiting Starnix kernel root process.");
662        // Normally a Rust program exits its process by calling `std::process::exit()` which goes
663        // through libc to exit the program. This runs drop impls on any thread-local variables
664        // which can cause issues during Starnix shutdown when we haven't yet integrated every
665        // subsystem with the shutdown flow. While those issues are indicative of underlying
666        // problems, we can't solve them without finishing the implementation of graceful shutdown.
667        // Instead, ask Zircon to exit our process directly, bypassing any libc atexit handlers.
668        // TODO(https://fxbug.dev/295073633) return from main instead of avoiding atexit handlers
669        zx::Process::exit(0);
670    }
671
672    pub fn is_shutting_down(&self) -> bool {
673        self.shutting_down.load(Ordering::Acquire)
674    }
675
676    /// Opens a device file (driver) identified by `dev`.
677    pub fn open_device<L>(
678        &self,
679        locked: &mut Locked<L>,
680        current_task: &CurrentTask,
681        node: &NamespaceNode,
682        flags: OpenFlags,
683        dev: DeviceType,
684        mode: DeviceMode,
685    ) -> Result<Box<dyn FileOps>, Errno>
686    where
687        L: LockEqualOrBefore<FileOpsCore>,
688    {
689        self.device_registry.open_device(locked, current_task, node, flags, dev, mode)
690    }
691
692    /// Return a reference to the Audit Framework
693    ///
694    /// This function follows the lazy initialization pattern.
695    pub fn audit_logger(&self) -> Arc<AuditLogger> {
696        self.expando.get_or_init(|| AuditLogger::new(self))
697    }
698
699    /// Return a reference to the GenericNetlink implementation.
700    ///
701    /// This function follows the lazy initialization pattern, where the first
702    /// call will instantiate the Generic Netlink server in a separate kthread.
703    pub fn generic_netlink(&self) -> &GenericNetlink<NetlinkToClientSender<GenericMessage>> {
704        self.generic_netlink.get_or_init(|| {
705            let (generic_netlink, worker_params) = GenericNetlink::new();
706            let enable_nl80211 = self.features.wifi;
707            self.kthreads.spawn_future(async move || {
708                crate::vfs::socket::run_generic_netlink_worker(worker_params, enable_nl80211).await;
709                log_error!("Generic Netlink future unexpectedly exited");
710            });
711            generic_netlink
712        })
713    }
714
715    /// Return a reference to the [`netlink::Netlink`] implementation.
716    ///
717    /// This function follows the lazy initialization pattern, where the first
718    /// call will instantiate the Netlink implementation.
719    pub fn network_netlink(self: &Arc<Self>) -> &Netlink<NetlinkContextImpl> {
720        self.network_netlink.get_or_init(|| {
721            let (network_netlink, worker_params) =
722                Netlink::new(InterfacesHandlerImpl(self.weak_self.clone()));
723
724            let kernel = self.clone();
725            self.kthreads.spawn_future(async move || {
726                netlink::run_netlink_worker(
727                    worker_params,
728                    NetlinkAccessControl::new(kernel.kthreads.system_task()),
729                )
730                .await;
731                log_error!(tag = NETLINK_LOG_TAG; "Netlink async worker unexpectedly exited");
732            });
733            network_netlink
734        })
735    }
736
737    pub fn iptables(&self) -> &IpTables {
738        self.iptables.get_or_init(|| IpTables::new())
739    }
740
741    /// Returns a Proxy to the service used by the container at `filename`.
742    #[allow(unused)]
743    pub fn connect_to_named_protocol_at_container_svc<P: ProtocolMarker>(
744        &self,
745        filename: &str,
746    ) -> Result<ClientEnd<P>, Errno> {
747        match self.container_namespace.get_namespace_channel("/svc") {
748            Ok(channel) => {
749                let (client_end, server_end) = create_endpoints::<P>();
750                fdio::service_connect_at(channel.as_ref(), filename, server_end.into_channel())
751                    .map_err(|status| from_status_like_fdio!(status))?;
752                Ok(client_end)
753            }
754            Err(err) => {
755                log_error!("Unable to get /svc namespace channel! {}", err);
756                Err(errno!(ENOENT))
757            }
758        }
759    }
760
    /// Returns a client end for the discoverable protocol `P` used by the container.
    ///
    /// This is a convenience wrapper around
    /// `connect_to_named_protocol_at_container_svc` that uses `P::PROTOCOL_NAME` as the
    /// name to connect to; errors are propagated unchanged.
    pub fn connect_to_protocol_at_container_svc<P: DiscoverableProtocolMarker>(
        &self,
    ) -> Result<ClientEnd<P>, Errno> {
        self.connect_to_named_protocol_at_container_svc::<P>(P::PROTOCOL_NAME)
    }
767
    /// Builds a fresh inspector describing every thread group known to the pid table.
    ///
    /// For each thread group a child node named after the leader pid is recorded with its
    /// `koid` (when available), `pid`, `ppid`, `stopped` flag, the leader's `argv`, and a
    /// `tasks` child holding one node per non-leader task. Each task contributes its
    /// `command` and, when its scheduler state is not the default, a `sched` child.
    /// A `memory_managers` child on the root records a summary accumulated over every
    /// distinct memory manager seen while walking the tasks.
    fn get_thread_groups_inspect(&self) -> fuchsia_inspect::Inspector {
        let inspector = fuchsia_inspect::Inspector::default();

        let thread_groups = inspector.root();
        let mut mm_summary = MappingSummary::default();
        // Addresses of the memory managers already folded into `mm_summary`, so that a
        // memory manager reachable from several tasks is only summarized once.
        let mut mms_summarized = HashSet::new();

        // Avoid holding locks for the entire iteration.
        let all_thread_groups = {
            let pid_table = self.pids.read();
            pid_table.get_thread_groups().collect::<Vec<_>>()
        };
        for thread_group in all_thread_groups {
            // Avoid holding the state lock while summarizing.
            let (ppid, tasks) = {
                let tg = thread_group.read();
                (tg.get_ppid() as i64, tg.tasks().map(TempRef::into_static).collect::<Vec<_>>())
            };

            let tg_node = thread_groups.create_child(format!("{}", thread_group.leader));
            // The koid is optional: it is simply omitted when it cannot be retrieved.
            if let Ok(koid) = &thread_group.process.get_koid() {
                tg_node.record_int("koid", koid.raw_koid() as i64);
            }
            tg_node.record_int("pid", thread_group.leader as i64);
            tg_node.record_int("ppid", ppid);
            tg_node.record_bool("stopped", thread_group.load_stopped() == StopState::GroupStopped);

            let tasks_node = tg_node.create_child("tasks");
            for task in tasks {
                // Summarize the task's memory manager the first time its pointer is seen.
                if let Ok(mm) = task.mm() {
                    if mms_summarized.insert(Arc::as_ptr(&mm) as usize) {
                        mm.summarize(&mut mm_summary);
                    }
                }
                // Records the per-task properties on `node`; shared by the leader (which
                // writes onto the thread group node) and the other tasks (own child node).
                let set_properties = |node: &fuchsia_inspect::Node| {
                    node.record_string("command", task.command().to_string());

                    let scheduler_state = task.read().scheduler_state;
                    // Only non-default scheduler states get a `sched` child.
                    if !scheduler_state.is_default() {
                        node.record_child("sched", |node| {
                            // If the role name lookup fails, the error text is recorded in
                            // its place.
                            node.record_string(
                                "role_name",
                                self.scheduler
                                    .role_name(&task)
                                    .map(|n| Cow::Borrowed(n))
                                    .unwrap_or_else(|e| Cow::Owned(e.to_string())),
                            );
                            node.record_string("state", format!("{scheduler_state:?}"));
                        });
                    }
                };
                if task.tid == thread_group.leader {
                    // A failed argv read is reported as an empty argv.
                    let mut argv = task.read_argv(256).unwrap_or_default();

                    // Any runtime that overwrites argv is likely to leave a lot of trailing
                    // nulls, no need to print those in inspect.
                    argv.retain(|arg| !arg.is_empty());

                    let inspect_argv = tg_node.create_string_array("argv", argv.len());
                    for (i, arg) in argv.iter().enumerate() {
                        inspect_argv.set(i, arg.to_string());
                    }
                    tg_node.record(inspect_argv);

                    set_properties(&tg_node);
                } else {
                    tasks_node.record_child(task.tid.to_string(), |task_node| {
                        set_properties(task_node);
                    });
                };
            }
            // Hand the child nodes over to their parents.
            tg_node.record(tasks_node);
            thread_groups.record(tg_node);
        }

        thread_groups.record_child("memory_managers", |node| mm_summary.record(node));

        inspector
    }
847
    /// Creates a memory attribution observer for `control_handle`.
    ///
    /// Delegates to `self.memory_attribution_manager.new_observer` and returns its result.
    pub fn new_memory_attribution_observer(
        &self,
        control_handle: fattribution::ProviderControlHandle,
    ) -> attribution_server::Observer {
        self.memory_attribution_manager.new_observer(control_handle)
    }
854
855    /// Opens and returns a directory proxy from the container's namespace, at
856    /// the requested path, using the provided flags. This method will open the
857    /// closest existing path from the namespace hierarchy. For instance, if
858    /// the parameter provided is `/path/to/foo/bar` and the exists namespace
859    /// entries for `/path/to/foo` and `/path/to`, then the former will be used
860    /// as the root proxy and the subdir `bar` returned.
861    pub fn open_ns_dir(
862        &self,
863        path: &str,
864        open_flags: fio::Flags,
865    ) -> Result<(fio::DirectorySynchronousProxy, String), Errno> {
866        let ns_path = match path {
867            // TODO(379929394): This condition is specifically to soft
868            // transition the fstab file to the new format.
869            "" | "/" | "." => PathBuf::from("/data"),
870            _ => PathBuf::from(path),
871        };
872
873        match self.container_namespace.find_closest_channel(&ns_path) {
874            Ok((root_channel, remaining_subdir)) => {
875                let (_, server_end) = create_endpoints::<fio::DirectoryMarker>();
876                fdio::open_at(
877                    &root_channel,
878                    &remaining_subdir,
879                    open_flags,
880                    server_end.into_channel(),
881                )
882                .map_err(|e| {
883                    log_error!("Failed to intialize the subdirs: {}", e);
884                    errno!(EIO)
885                })?;
886
887                Ok((fio::DirectorySynchronousProxy::new(root_channel), remaining_subdir))
888            }
889            Err(err) => {
890                log_error!(
891                    "Unable to find a channel for {}. Received error: {}",
892                    ns_path.display(),
893                    err
894                );
895                Err(errno!(ENOENT))
896            }
897        }
898    }
899
900    /// Returns an iterator of the command line arguments.
901    pub fn cmdline_args_iter(&self) -> impl Iterator<Item = ArgNameAndValue<'_>> {
902        parse_cmdline(self.cmdline.to_str().unwrap_or_default()).filter_map(|arg| {
903            arg.split_once('=')
904                .map(|(name, value)| ArgNameAndValue { name: name, value: Some(value) })
905                .or(Some(ArgNameAndValue { name: arg, value: None }))
906        })
907    }
908}
909
/// Splits a kernel command line into its arguments.
///
/// Arguments are separated by one or more space characters (`' '`); no other whitespace
/// is treated as a separator. A double quote toggles quoting, and spaces inside quotes do
/// not end the argument. A double quote immediately preceded by a backslash does not
/// toggle quoting. Quotes and backslashes are left in place: every yielded item is an
/// unmodified slice of `cmdline`. An unterminated quote extends the last argument to the
/// end of the input.
pub fn parse_cmdline(cmdline: &str) -> impl Iterator<Item = &str> {
    let mut tokens: Vec<&str> = Vec::new();
    // Byte offset at which the argument currently being scanned begins, if any.
    let mut token_start: Option<usize> = None;
    let mut quoted = false;
    let mut prev = ' ';

    for (offset, ch) in cmdline.char_indices() {
        match (token_start, ch) {
            // Separator between arguments: nothing to do.
            (None, ' ') => {}
            // First character of a new argument, which may itself open a quote.
            (None, _) => {
                token_start = Some(offset);
                if ch == '"' {
                    quoted = true;
                }
            }
            // An unquoted space terminates the current argument.
            (Some(begin), ' ') if !quoted => {
                tokens.push(&cmdline[begin..offset]);
                token_start = None;
            }
            // An unescaped quote flips the quoting state.
            (Some(_), '"') if prev != '\\' => quoted = !quoted,
            (Some(_), _) => {}
        }
        prev = ch;
    }

    // Flush the trailing argument, if the input did not end on a separator.
    tokens.extend(token_start.map(|begin| &cmdline[begin..]));
    tokens.into_iter()
}
941
942impl std::fmt::Debug for Kernel {
943    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
944        f.debug_struct("Kernel").finish()
945    }
946}
947
948// TODO(https://fxbug.dev/380427153): move arch dependent code to `kernel/core/arch/*`.
/// Computes the ELF hardware capability words reported to 32-bit (arch32) tasks.
///
/// The result starts from a fixed baseline of capabilities and then adds the bits that
/// correspond to each feature present in `cpu_feature_flags`. Features without a mapping
/// are ignored.
#[cfg(target_arch = "aarch64")]
fn arm32_hwcap(cpu_feature_flags: CpuFeatureFlags) -> HwCap {
    use starnix_uapi::arch32;

    // Baseline bits that are always set, independent of the detected CPU features.
    const COMPAT_ARM32_ELF_HWCAP: u32 = arch32::HWCAP_HALF
        | arch32::HWCAP_THUMB
        | arch32::HWCAP_FAST_MULT
        | arch32::HWCAP_EDSP
        | arch32::HWCAP_TLS
        | arch32::HWCAP_IDIV // == IDIVA | IDIVT.
        | arch32::HWCAP_LPAE
        | arch32::HWCAP_EVTSTRM;

    let mut hwcap = COMPAT_ARM32_ELF_HWCAP;
    let mut hwcap2 = 0;
    for feature in cpu_feature_flags.iter() {
        // Each feature contributes a pair of (hwcap, hwcap2) bits.
        let (bits, bits2) = match feature {
            CpuFeatureFlags::ARM64_FEATURE_ISA_ASIMD => (arch32::HWCAP_NEON, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_AES => (0, arch32::HWCAP2_AES),
            CpuFeatureFlags::ARM64_FEATURE_ISA_PMULL => (0, arch32::HWCAP2_PMULL),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA1 => (0, arch32::HWCAP2_SHA1),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA256 => (0, arch32::HWCAP2_SHA2),
            CpuFeatureFlags::ARM64_FEATURE_ISA_CRC32 => (0, arch32::HWCAP2_CRC32),
            CpuFeatureFlags::ARM64_FEATURE_ISA_I8MM => (arch32::HWCAP_I8MM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_FHM => (arch32::HWCAP_ASIMDFHM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_DP => (arch32::HWCAP_ASIMDDP, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_FP => {
                (arch32::HWCAP_VFP | arch32::HWCAP_VFPv3 | arch32::HWCAP_VFPv4, 0)
            }
            _ => (0, 0),
        };
        hwcap |= bits;
        hwcap2 |= bits2;
    }
    HwCap { hwcap, hwcap2 }
}
982
/// Computes the ELF hardware capability words reported to 64-bit tasks.
///
/// Both words start at zero and receive the bits that correspond to each feature present
/// in `cpu_feature_flags`. Features without a mapping are ignored.
#[cfg(target_arch = "aarch64")]
fn arm64_hwcap(cpu_feature_flags: CpuFeatureFlags) -> HwCap {
    // See https://docs.kernel.org/arch/arm64/elf_hwcaps.html for details.
    use starnix_uapi as uapi;

    let mut hwcap = 0;
    let mut hwcap2 = 0;
    for feature in cpu_feature_flags.iter() {
        // Each feature contributes a pair of (hwcap, hwcap2) bits.
        let (bits, bits2) = match feature {
            CpuFeatureFlags::ARM64_FEATURE_ISA_FP => (uapi::HWCAP_FP, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_ASIMD => (uapi::HWCAP_ASIMD, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_AES => (uapi::HWCAP_AES, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_PMULL => (uapi::HWCAP_PMULL, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA1 => (uapi::HWCAP_SHA1, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA256 => (uapi::HWCAP_SHA2, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_CRC32 => (uapi::HWCAP_CRC32, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_I8MM => (0, uapi::HWCAP2_I8MM),
            CpuFeatureFlags::ARM64_FEATURE_ISA_FHM => (uapi::HWCAP_ASIMDFHM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_DP => (uapi::HWCAP_ASIMDDP, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SM3 => (uapi::HWCAP_SM3, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SM4 => (uapi::HWCAP_SM4, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA3 => (uapi::HWCAP_SHA3, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA512 => (uapi::HWCAP_SHA512, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_ATOMICS => (uapi::HWCAP_ATOMICS, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_RDM => (uapi::HWCAP_ASIMDRDM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_TS => (uapi::HWCAP_FLAGM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_DPB => (uapi::HWCAP_DCPOP, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_RNDR => (0, uapi::HWCAP2_RNG),
            _ => (0, 0),
        };
        hwcap |= bits;
        hwcap2 |= bits2;
    }
    HwCap { hwcap, hwcap2 }
}
1016
1017impl HwCaps {
1018    #[cfg(target_arch = "aarch64")]
1019    pub fn from_cpu_feature_flags(cpu_feature_flags: CpuFeatureFlags) -> Self {
1020        Self { arch32: arm32_hwcap(cpu_feature_flags), arch64: arm64_hwcap(cpu_feature_flags) }
1021    }
1022
1023    #[cfg(not(target_arch = "aarch64"))]
1024    pub fn from_cpu_feature_flags(_cpu_feature_flags: CpuFeatureFlags) -> Self {
1025        Self { arch64: HwCap::default() }
1026    }
1027}
1028
#[cfg(test)]
mod test {
    use super::parse_cmdline;

    /// Arguments are split on unquoted spaces only, and quote and backslash characters
    /// are preserved verbatim in the yielded slices.
    #[test]
    fn test_parse_cmdline() {
        let input =
            r#"first second=third "fourth fifth" sixth="seventh eighth" "ninth\" tenth" eleventh"#;

        let parsed: Vec<&str> = parse_cmdline(input).collect();

        assert_eq!(
            parsed,
            [
                "first",
                "second=third",
                r#""fourth fifth""#,
                r#"sixth="seventh eighth""#,
                r#""ninth\" tenth""#,
                "eleventh",
            ]
        );
    }
}