Skip to main content

starnix_core/task/
kernel.rs

1// Copyright 2021 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::bpf::EbpfState;
6use crate::device::remote_block_device::RemoteBlockDeviceRegistry;
7use crate::device::{DeviceMode, DeviceRegistry};
8use crate::execution::CrashReporter;
9use crate::mm::{FutexTable, MappingSummary, MlockPinFlavor, SharedFutexKey};
10use crate::power::SuspendResumeManagerHandle;
11use crate::ptrace::StopState;
12use crate::security::{self, AuditLogger};
13use crate::task::container_namespace::ContainerNamespace;
14use crate::task::limits::SystemLimits;
15use crate::task::memory_attribution::MemoryAttributionManager;
16use crate::task::net::NetstackDevices;
17use crate::task::tracing::PidToKoidMap;
18use crate::task::{
19    AbstractUnixSocketNamespace, AbstractVsockSocketNamespace, CurrentTask, DelayedReleaser,
20    IpTables, KernelCgroups, KernelStats, KernelThreads, PidTable, SchedulerManager, Syslog,
21    ThreadGroup, UtsNamespace, UtsNamespaceHandle,
22};
23use crate::time::{HrTimerManager, HrTimerManagerHandle};
24use crate::vdso::vdso_loader::Vdso;
25use crate::vfs::fs_args::MountParams;
26use crate::vfs::socket::{
27    GenericMessage, GenericNetlink, NetlinkAccessControl, NetlinkContextImpl,
28    NetlinkToClientSender, SocketAddress, SocketTokensStore,
29};
30use crate::vfs::{CacheConfig, FileOps, FsNodeHandle, FsString, Mounts, NamespaceNode};
31use bstr::{BString, ByteSlice};
32use devicetree::types::Devicetree;
33use expando::Expando;
34use fidl::endpoints::{
35    ClientEnd, ControlHandle, DiscoverableProtocolMarker, ProtocolMarker, create_endpoints,
36};
37use fidl_fuchsia_component_runner::{ComponentControllerControlHandle, ComponentStopInfo};
38use fidl_fuchsia_feedback::CrashReporterProxy;
39use fidl_fuchsia_io as fio;
40use fidl_fuchsia_memory_attribution as fattribution;
41use fidl_fuchsia_time_external::AdjustSynchronousProxy;
42use fuchsia_async as fasync;
43use fuchsia_inspect::ArrayProperty;
44use futures::FutureExt;
45use netlink::interfaces::InterfacesHandler;
46use netlink::{NETLINK_LOG_TAG, Netlink};
47use once_cell::sync::OnceCell;
48use starnix_lifecycle::AtomicCounter;
49use starnix_logging::{SyscallLogFilter, log_debug, log_error, log_info, log_warn};
50use starnix_sync::{
51    FileOpsCore, KernelSwapFiles, LockEqualOrBefore, Locked, Mutex, OrderedMutex, RwLock,
52};
53use starnix_uapi::device_id::DeviceId;
54use starnix_uapi::errors::{Errno, errno};
55use starnix_uapi::open_flags::OpenFlags;
56use starnix_uapi::{VMADDR_CID_HOST, from_status_like_fdio};
57use std::borrow::Cow;
58use std::collections::{HashMap, HashSet};
59use std::num::NonZeroU64;
60use std::path::PathBuf;
61use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU16, Ordering};
62use std::sync::{Arc, OnceLock, Weak};
63use zx::CpuFeatureFlags;
64
/// Kernel features are specified in the component manifest of the starnix container
/// or explicitly provided to the kernel constructor in tests.
///
/// `Default` is derived, so `KernelFeatures::default()` sets every field to its type's default
/// value.
#[derive(Debug, Default, Clone)]
pub struct KernelFeatures {
    /// Whether the `bpf_v2` feature is enabled.
    // NOTE(review): the behavior gated by this flag is not visible in this file; confirm against
    // its users before relying on this description.
    pub bpf_v2: bool,

    /// Whether the kernel supports the S_ISUID and S_ISGID bits.
    ///
    /// For example, these bits are used by `sudo`.
    ///
    /// Enabling this feature is potentially a security risk because these bits allow privilege
    /// escalation.
    pub enable_suid: bool,

    /// Whether io_uring is enabled.
    ///
    /// TODO(https://fxbug.dev/297431387): Enable by default once the feature is completed.
    pub io_uring: bool,

    /// Whether the kernel should return an error to userspace, rather than panicking, if `reboot()`
    /// is requested but cannot be enacted because the kernel lacks the relevant capabilities.
    pub error_on_failed_reboot: bool,

    /// The default seclabel that is applied to components that are run in this kernel.
    ///
    /// Components can override this by setting the `seclabel` field in their program block.
    pub default_seclabel: Option<String>,

    /// Whether the kernel is being used to run the SELinux Test Suite.
    ///
    /// TODO: https://fxbug.dev/388077431 - remove this once we no longer need workarounds for the
    /// SELinux Test Suite.
    pub selinux_test_suite: bool,

    /// The default mount options to use when mounting directories from a component's namespace.
    ///
    /// The key is the path in the component's namespace, and the value is the mount options
    /// string. `None` means no per-path options were configured.
    pub default_ns_mount_options: Option<HashMap<String, String>>,

    /// The default uid that is applied to components that are run in this kernel.
    ///
    /// Components can override this by setting the `uid` field in their program block.
    pub default_uid: u32,

    /// Whether mlock() never prefaults pages.
    pub mlock_always_onfault: bool,

    /// Implementation of mlock() to use for this kernel instance.
    pub mlock_pin_flavor: MlockPinFlavor,

    /// Whether excessive crash reports should be throttled.
    pub crash_report_throttling: bool,

    /// Whether or not to serve wifi support to Android.
    pub wifi: bool,

    /// The number of bytes to cache in pages for reading zx::MapInfo from VMARs.
    pub cached_zx_map_info_bytes: u32,

    /// The size of the Dirent LRU cache.
    pub dirent_cache_size: u32,

    /// Whether to expose a stub '/dev/ion' node, as a temporary workaround for compatibility.
    // TODO(https://fxbug.dev/485370648): remove when unnecessary.
    pub fake_ion: bool,
}
132
133impl KernelFeatures {
134    /// Returns the `MountParams` to use when mounting the specified path from a component's
135    /// namespace.  This mechanism is also used to specified options for mounts created via
136    /// container features, by specifying a pseudo-path e.g. "#container".
137    pub fn ns_mount_options(&self, ns_path: &str) -> Result<MountParams, Errno> {
138        if let Some(all_options) = &self.default_ns_mount_options {
139            if let Some(options) = all_options.get(ns_path) {
140                return MountParams::parse(options.as_bytes().into());
141            }
142        }
143        Ok(MountParams::default())
144    }
145}
146
/// A single kernel command line argument, borrowed from the text it was found in.
pub struct ArgNameAndValue<'a> {
    /// The argument's name.
    pub name: &'a str,
    /// The argument's value, or `None` when the argument carries no value.
    pub value: Option<&'a str>,
}
152
/// The shared, mutable state for the entire Starnix kernel.
///
/// The `Kernel` object holds all kernel threads, userspace tasks, and file system resources for a
/// single instance of the Starnix kernel. In production, there is one instance of this object for
/// the entire Starnix kernel. However, multiple instances of this object can be created in one
/// process during unit testing.
///
/// The structure of this object will likely need to evolve as we implement more namespacing and
/// isolation mechanisms, such as `namespaces(7)` and `pid_namespaces(7)`.
pub struct Kernel {
    /// Weak reference to self. Avoids having to pass `&Arc<Kernel>` through APIs.
    pub weak_self: Weak<Kernel>,

    /// The kernel threads running on behalf of this kernel.
    pub kthreads: KernelThreads,

    /// The features enabled for this kernel.
    pub features: KernelFeatures,

    /// The processes and threads running in this kernel, organized by pid_t.
    pub pids: RwLock<PidTable>,

    /// Used to record the pid/tid to Koid mappings. Set when collecting trace data.
    pub pid_to_koid_mapping: Arc<RwLock<Option<PidToKoidMap>>>,

    /// Subsystem-specific properties that hang off the Kernel object.
    ///
    /// Instead of adding yet another property to the Kernel object, consider storing the property
    /// in an expando if that property is only used by one part of the system, such as a module.
    pub expando: Expando,

    /// The default namespace for abstract AF_UNIX sockets in this kernel.
    ///
    /// Rather than use this default namespace, abstract socket addresses
    /// should be looked up in the AbstractSocketNamespace on each Task
    /// object because some Task objects might have a non-default namespace.
    pub default_abstract_socket_namespace: Arc<AbstractUnixSocketNamespace>,

    /// The default namespace for abstract AF_VSOCK sockets in this kernel.
    pub default_abstract_vsock_namespace: Arc<AbstractVsockSocketNamespace>,

    /// The kernel command line. Shows up in /proc/cmdline.
    pub cmdline: BString,

    /// The devicetree handed to `Kernel::new`, if one was provided.
    pub device_tree: Option<Devicetree>,

    /// Global state held by the Linux Security Modules subsystem.
    pub security_state: security::KernelState,

    /// The registry of device drivers.
    pub device_registry: DeviceRegistry,

    /// Mapping of top-level namespace entries to an associated proxy.
    /// For example, "/svc" to the respective proxy. Only the namespace entries
    /// which were known at component startup will be available to the kernel.
    pub container_namespace: ContainerNamespace,

    /// The registry of block devices backed by a remote fuchsia.io file.
    pub remote_block_device_registry: Arc<RemoteBlockDeviceRegistry>,

    /// The iptables used for filtering network packets.
    ///
    /// Starts out unset; the `OnceLock` allows it to be initialized at most once.
    iptables: OnceLock<IpTables>,

    /// The futexes shared across processes.
    pub shared_futexes: Arc<FutexTable<SharedFutexKey>>,

    /// The default UTS namespace for all tasks.
    ///
    /// Because each task can have its own UTS namespace, you probably want to use
    /// the UTS namespace handle of the task, which may/may not point to this one.
    pub root_uts_ns: UtsNamespaceHandle,

    /// A struct containing a VMO with a vDSO implementation, if implemented for a given
    /// architecture, and possibly an offset for a sigreturn function.
    pub vdso: Vdso,

    /// A struct containing a VMO with an arch32-vDSO implementation, if implemented for a given
    /// architecture.
    // TODO(https://fxbug.dev/380431743) This could be made less clunky -- maybe a Vec<Vdso> above or
    // something else
    pub vdso_arch32: Option<Vdso>,

    /// The table of devices installed on the netstack and their associated
    /// state local to this `Kernel`.
    pub netstack_devices: Arc<NetstackDevices>,

    /// Files that are currently available for swapping.
    /// Note: Starnix never actually swaps memory to these files. We just need to track them
    /// to pass conformance tests.
    pub swap_files: OrderedMutex<Vec<FsNodeHandle>, KernelSwapFiles>,

    /// The implementation of generic Netlink protocol families.
    ///
    /// Lazily initialized by `Kernel::generic_netlink()`.
    generic_netlink: OnceLock<GenericNetlink<NetlinkToClientSender<GenericMessage>>>,

    /// The implementation of networking-related Netlink protocol families.
    ///
    /// Lazily initialized by `Kernel::network_netlink()`.
    network_netlink: OnceLock<Netlink<NetlinkContextImpl>>,

    /// Inspect instrumentation for this kernel instance.
    pub inspect_node: fuchsia_inspect::Node,

    /// The kinds of seccomp action that gets logged, stored as a bit vector.
    /// Each potential SeccompAction gets a bit in the vector, as specified by
    /// SeccompAction::logged_bit_offset.  If the bit is set, that means the
    /// action should be logged when it is taken, subject to the caveats
    /// described in seccomp(2).  The value of the bit vector is exposed to users
    /// in a text form in the file /proc/sys/kernel/seccomp/actions_logged.
    pub actions_logged: AtomicU16,

    /// The manager for suspend/resume.
    pub suspend_resume_manager: SuspendResumeManagerHandle,

    /// Unique IDs for new mounts and mount namespaces (this counter and the two that follow).
    pub next_mount_id: AtomicCounter<u64>,
    /// Counter for peer group IDs; see `next_mount_id`.
    pub next_peer_group_id: AtomicCounter<u64>,
    /// Counter for namespace IDs; see `next_mount_id`.
    pub next_namespace_id: AtomicCounter<u64>,

    /// Unique IDs for file objects.
    pub next_file_object_id: AtomicCounter<u64>,

    /// Unique cookie used to link two inotify events, usually an IN_MOVE_FROM/IN_MOVE_TO pair.
    pub next_inotify_cookie: AtomicCounter<u32>,

    /// Controls which processes a process is allowed to ptrace.  See Documentation/security/Yama.txt
    pub ptrace_scope: AtomicU8,

    /// The Fuchsia build version returned by `fuchsia.buildinfo.Provider`.
    pub build_version: OnceCell<String>,

    /// Kernel-wide statistics.
    // NOTE(review): what is tracked is defined by `KernelStats`, which is not shown in this file.
    pub stats: Arc<KernelStats>,

    /// Resource limits that are exposed, for example, via sysctl.
    pub system_limits: SystemLimits,

    /// The service to handle delayed releases. This is required for elements that need to
    /// execute some code when released and require a known context (both in terms of lock
    /// context, as well as `CurrentTask`).
    pub delayed_releaser: DelayedReleaser,

    /// Manages task priorities.
    pub scheduler: SchedulerManager,

    /// The syslog manager.
    pub syslog: Syslog,

    /// All mounts.
    pub mounts: Mounts,

    /// The manager for creating and managing high-resolution timers.
    pub hrtimer_manager: HrTimerManagerHandle,

    /// The manager for monitoring and reporting resources used by the kernel.
    pub memory_attribution_manager: MemoryAttributionManager,

    /// Handler for crashing Linux processes.
    pub crash_reporter: CrashReporter,

    /// Whether this kernel is shutting down. When shutting down, new processes may not be spawned.
    ///
    /// Flipped to `true` by the shutdown sequence and read through `is_shutting_down()`.
    shutting_down: AtomicBool,

    /// True to disable syslog access to unprivileged callers.  This also controls whether read
    /// access to /dev/kmsg requires privileged capabilities.
    pub restrict_dmesg: AtomicBool,

    /// Determines whether unprivileged BPF is permitted, or can be re-enabled.
    ///   0 - Unprivileged BPF is permitted.
    ///   1 - Unprivileged BPF is not permitted, and cannot be enabled.
    ///   2 - Unprivileged BPF is not permitted, but can be enabled by a privileged task.
    pub disable_unprivileged_bpf: AtomicU8,

    /// Control handle to the running container's ComponentController.
    ///
    /// Taken (left as `None`) by the shutdown sequence when it notifies CF that the container
    /// has stopped.
    pub container_control_handle: Mutex<Option<ComponentControllerControlHandle>>,

    /// eBPF state: loaded programs, eBPF maps, etc.
    pub ebpf_state: EbpfState,

    /// Cgroups of the kernel.
    pub cgroups: KernelCgroups,

    /// Used to communicate requests to adjust system time from within a Starnix
    /// container. Used from syscalls.
    pub time_adjustment_proxy: Option<AdjustSynchronousProxy>,

    /// Used to store tokens for sockets, particularly per-uid sharing domain sockets.
    pub socket_tokens_store: SocketTokensStore,

    /// Hardware capabilities to push onto stack when loading an ELF binary.
    pub hwcaps: HwCaps,

    /// Filters for syscall logging. Processes with names matching these filters will have syscalls
    /// logged at INFO level.
    pub syscall_log_filters: Mutex<Vec<SyscallLogFilter>>,
}
343
/// Hardware capabilities: one pair of `AT_HWCAP` / `AT_HWCAP2` values.
#[derive(Debug, Clone, Copy, Default)]
pub struct HwCap {
    /// The value for `AT_HWCAP`.
    pub hwcap: u32,
    /// The value for `AT_HWCAP2`.
    pub hwcap2: u32,
}
352
/// Hardware capabilities for both 32-bit and 64-bit ELF binaries.
///
/// The 32-bit entry only exists when compiling for `aarch64`.
#[derive(Debug, Clone, Copy, Default)]
pub struct HwCaps {
    /// For 32-bit binaries. Only present on `aarch64` builds.
    #[cfg(target_arch = "aarch64")]
    pub arch32: HwCap,
    /// For 64-bit binaries.
    pub arch64: HwCap,
}
362
/// An implementation of [`InterfacesHandler`].
///
/// This holds a `Weak<Kernel>` because it is held within a [`Netlink`] which
/// is itself held within an `Arc<Kernel>`. Holding an `Arc<T>` within an
/// `Arc<T>` prevents the `Arc`'s ref count from ever reaching 0, causing a
/// leak.
///
/// Each callback upgrades the weak reference for the duration of the call, so the
/// handler must tolerate the kernel having already been dropped.
struct InterfacesHandlerImpl(Weak<Kernel>);
370
371impl InterfacesHandlerImpl {
372    fn kernel(&self) -> Option<Arc<Kernel>> {
373        self.0.upgrade()
374    }
375}
376
377impl InterfacesHandler for InterfacesHandlerImpl {
378    fn handle_new_link(&mut self, name: &str, interface_id: NonZeroU64) {
379        if let Some(kernel) = self.kernel() {
380            kernel.netstack_devices.add_device(&kernel, name.into(), interface_id);
381        }
382    }
383
384    fn handle_deleted_link(&mut self, name: &str) {
385        if let Some(kernel) = self.kernel() {
386            kernel.netstack_devices.remove_device(&kernel, name.into());
387        }
388    }
389
390    fn handle_idle_event(&mut self) {
391        let Some(kernel) = self.kernel() else {
392            log_error!("kernel went away while netlink is initializing");
393            return;
394        };
395        let (initialized, wq) = &kernel.netstack_devices.initialized_and_wq;
396        if initialized.swap(true, Ordering::SeqCst) {
397            log_error!("netlink initial devices should only be reported once");
398            return;
399        }
400        wq.notify_all()
401    }
402}
403
404impl Kernel {
    /// Creates a new `Kernel` and returns it wrapped in an `Arc`.
    ///
    /// The caller-provided arguments are stored in the corresponding fields (the crash reporter
    /// proxy is wrapped in a `CrashReporter`); the remaining fields start from their defaults or
    /// freshly constructed values. After the object is built this function also:
    ///
    /// * initializes the device registry's objects, and
    /// * registers the `thread_groups` and `cgroupv2` lazy children on `inspect_node`.
    ///
    /// # Errors
    ///
    /// The body shown here only ever returns `Ok`; the `Result` in the signature is part of the
    /// interface callers already handle.
    pub fn new(
        cmdline: BString,
        features: KernelFeatures,
        system_limits: SystemLimits,
        container_namespace: ContainerNamespace,
        scheduler: SchedulerManager,
        crash_reporter_proxy: Option<CrashReporterProxy>,
        inspect_node: fuchsia_inspect::Node,
        security_state: security::KernelState,
        time_adjustment_proxy: Option<AdjustSynchronousProxy>,
        device_tree: Option<Devicetree>,
    ) -> Result<Arc<Kernel>, zx::Status> {
        // Factories that turn a raw abstract-socket key into the matching `SocketAddress`.
        let unix_address_maker =
            Box::new(|x: FsString| -> SocketAddress { SocketAddress::Unix(x) });
        let vsock_address_maker = Box::new(|x: u32| -> SocketAddress {
            SocketAddress::Vsock { port: x, cid: VMADDR_CID_HOST }
        });

        // NOTE(review): the 8-minute duration is presumably the crash report throttling window;
        // confirm against `CrashReporter::new`.
        let crash_reporter = CrashReporter::new(
            &inspect_node,
            crash_reporter_proxy,
            zx::Duration::from_minutes(8),
            features.crash_report_throttling,
        );
        let hrtimer_manager = HrTimerManager::new(&inspect_node);

        // Fall back to an empty flag set when the feature query fails; the log message notes
        // that this is expected on anything other than ARM64.
        let cpu_feature_flags =
            zx::system_get_feature_flags::<CpuFeatureFlags>().unwrap_or_else(|e| {
                log_debug!("CPU feature flags are only supported on ARM64: {}, reporting 0", e);
                CpuFeatureFlags::empty()
            });
        let hwcaps = HwCaps::from_cpu_feature_flags(cpu_feature_flags);

        // `new_cyclic` hands the closure a `Weak<Kernel>` so that fields needing a back-reference
        // to the kernel can capture it while the kernel itself is still being built.
        let this = Arc::new_cyclic(|kernel| Kernel {
            weak_self: kernel.clone(),
            kthreads: KernelThreads::new(kernel.clone()),
            features,
            pids: Default::default(),
            pid_to_koid_mapping: Arc::new(RwLock::new(None)),
            expando: Default::default(),
            default_abstract_socket_namespace: AbstractUnixSocketNamespace::new(unix_address_maker),
            default_abstract_vsock_namespace: AbstractVsockSocketNamespace::new(
                vsock_address_maker,
            ),
            cmdline,
            device_tree,
            security_state,
            device_registry: Default::default(),
            container_namespace,
            remote_block_device_registry: Default::default(),
            iptables: OnceLock::new(),
            shared_futexes: Arc::<FutexTable<SharedFutexKey>>::default(),
            root_uts_ns: Arc::new(RwLock::new(UtsNamespace::default())),
            vdso: Vdso::new(),
            vdso_arch32: Vdso::new_arch32(),
            netstack_devices: Arc::default(),
            swap_files: Default::default(),
            generic_netlink: OnceLock::new(),
            network_netlink: OnceLock::new(),
            inspect_node,
            actions_logged: AtomicU16::new(0),
            suspend_resume_manager: Default::default(),
            next_mount_id: AtomicCounter::<u64>::new(1),
            next_peer_group_id: AtomicCounter::<u64>::new(1),
            next_namespace_id: AtomicCounter::<u64>::new(1),
            next_inotify_cookie: AtomicCounter::<u32>::new(1),
            next_file_object_id: Default::default(),
            system_limits,
            ptrace_scope: AtomicU8::new(0), // Disable YAMA checks by default.
            restrict_dmesg: AtomicBool::new(false),
            disable_unprivileged_bpf: AtomicU8::new(0), // Enable unprivileged BPF by default.
            build_version: OnceCell::new(),
            stats: Arc::new(KernelStats::default()),
            delayed_releaser: Default::default(),
            scheduler,
            syslog: Default::default(),
            mounts: Mounts::new(),
            hrtimer_manager,
            memory_attribution_manager: MemoryAttributionManager::new(kernel.clone()),
            crash_reporter,
            shutting_down: AtomicBool::new(false),
            container_control_handle: Mutex::new(None),
            ebpf_state: Default::default(),
            cgroups: Default::default(),
            time_adjustment_proxy,
            socket_tokens_store: Default::default(),
            hwcaps,
            syscall_log_filters: Default::default(),
        });

        // Initialize the device registry before registering any devices.
        //
        // We will create sysfs recursively within this function.
        this.device_registry.objects.init(&mut this.kthreads.unlocked_for_async(), &this);

        // Make a copy of this Arc for the inspect lazy node to use but don't create an Arc cycle
        // because the inspect node that owns this reference is owned by the kernel.
        let kernel = Arc::downgrade(&this);
        this.inspect_node.record_lazy_child("thread_groups", move || {
            if let Some(kernel) = kernel.upgrade() {
                let inspector = kernel.get_thread_groups_inspect();
                async move { Ok(inspector) }.boxed()
            } else {
                async move { Err(anyhow::format_err!("kernel was dropped")) }.boxed()
            }
        });

        // Same weak-reference pattern as above, for the cgroup v2 inspect data.
        let kernel = Arc::downgrade(&this);
        this.inspect_node.record_lazy_child("cgroupv2", move || {
            if let Some(kernel) = kernel.upgrade() {
                async move { Ok(kernel.cgroups.cgroup2.get_cgroup_inspect()) }.boxed()
            } else {
                async move { Err(anyhow::format_err!("kernel was dropped")) }.boxed()
            }
        });

        Ok(this)
    }
523
524    /// Shuts down userspace and the kernel in an orderly fashion, eventually terminating the root
525    /// kernel process.
526    pub fn shut_down(self: &Arc<Self>) {
527        // Run shutdown code on a kthread in the main process so that it can be the last process
528        // alive.
529        self.kthreads.spawn_future(
530            {
531                let kernel = self.clone();
532                move || async move {
533                    kernel.run_shutdown().await;
534                }
535            },
536            "run_shutdown",
537        );
538    }
539
540    /// Starts shutting down the Starnix kernel and any running container. Only one thread can drive
541    /// shutdown at a time. This function will return immediately if shut down is already under way.
542    ///
543    /// Shutdown happens in several phases:
544    ///
545    /// 1. Disable launching new processes
546    /// 2. Shut down individual ThreadGroups until only the init and system tasks remain
547    /// 3. Repeat the above for the init task
548    /// 4. Clean up kernel-internal structures that can hold processes alive
549    /// 5. Ensure this process is the only one running in the kernel job.
550    /// 6. Unmounts the kernel's mounts' FileSystems.
551    /// 7. Tell CF the container component has stopped
552    /// 8. Exit this process
553    ///
554    /// If a ThreadGroup does not shut down on its own (including after SIGKILL), that phase of
555    /// shutdown will hang. To gracefully shut down any further we need the other kernel processes
556    /// to do controlled exits that properly release access to shared state. If our orderly shutdown
557    /// does hang, eventually CF will kill the container component which will lead to the job of
558    /// this process being killed and shutdown will still complete.
559    async fn run_shutdown(&self) {
560        const INIT_PID: i32 = 1;
561        const SYSTEM_TASK_PID: i32 = 2;
562
563        // Step 1: Prevent new processes from being created once they observe this update. We don't
564        // want the thread driving shutdown to be racing with other threads creating new processes.
565        if self
566            .shutting_down
567            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
568            .is_err()
569        {
570            log_info!("Additional thread tried to initiate shutdown while already in-progress.");
571            return;
572        }
573
574        log_info!("Shutting down Starnix kernel.");
575
576        // Step 2: Shut down thread groups in a loop until init and the system task are all that
577        // remain.
578        loop {
579            let tgs = {
580                // Exiting thread groups need to acquire a write lock for the pid table to
581                // successfully exit so we need to acquire that lock in a reduced scope.
582                self.pids
583                    .read()
584                    .get_thread_groups()
585                    .into_iter()
586                    .filter(|tg| tg.leader != SYSTEM_TASK_PID && tg.leader != INIT_PID)
587                    .collect::<Vec<_>>()
588            };
589            if tgs.is_empty() {
590                log_info!("pid table is empty except init and system task");
591                break;
592            }
593
594            log_info!(tgs:?; "shutting down thread groups");
595            let mut tasks = vec![];
596            for tg in tgs {
597                let task = fasync::Task::local(ThreadGroup::shut_down(Arc::downgrade(&tg)));
598                tasks.push(task);
599            }
600            futures::future::join_all(tasks).await;
601        }
602
603        // Step 3: Terminate the init process.
604        let maybe_init = {
605            // Exiting thread groups need to acquire a write lock for the pid table to successfully
606            // exit so we need to acquire that lock in a reduced scope.
607            self.pids.read().get_thread_group(1).map(|tg| Arc::downgrade(&tg))
608        };
609        if let Some(init) = maybe_init {
610            log_info!("shutting down init");
611            ThreadGroup::shut_down(init).await;
612        } else {
613            log_info!("init already terminated");
614        }
615
616        // Step 4: Clean up any structures that can keep non-Linux processes live in our job.
617        log_info!("cleaning up pinned memory");
618        self.expando.remove::<crate::mm::InfoCacheShadowProcess>();
619        self.expando.remove::<crate::mm::MlockShadowProcess>();
620
621        // Step 5: Make sure this is the only process running in the job. We already should have
622        // cleared up all processes other than the system task at this point, but wait on any that
623        // might be around for good measure.
624        //
625        // Use unwrap liberally since we're shutting down anyway and errors will still tear down the
626        // kernel.
627        let kernel_job = fuchsia_runtime::job_default();
628        assert_eq!(kernel_job.children().unwrap(), &[], "starnix does not create any child jobs");
629        let own_koid = fuchsia_runtime::process_self().koid().unwrap();
630
631        log_info!("waiting for this to be the only process in the job");
632        loop {
633            let mut remaining_processes = kernel_job
634                .processes()
635                .unwrap()
636                .into_iter()
637                // Don't wait for ourselves to exit.
638                .filter(|pid| pid != &own_koid)
639                .peekable();
640            if remaining_processes.peek().is_none() {
641                log_info!("No stray Zircon processes.");
642                break;
643            }
644
645            let mut terminated_signals = vec![];
646            for pid in remaining_processes {
647                let handle = match kernel_job
648                    .get_child(&pid, zx::Rights::BASIC | zx::Rights::PROPERTY | zx::Rights::DESTROY)
649                {
650                    Ok(h) => h,
651                    Err(e) => {
652                        log_info!(pid:?, e:?; "failed to get child process from job");
653                        continue;
654                    }
655                };
656                log_info!(
657                    pid:?,
658                    name:? = handle.get_name();
659                    "waiting on process terminated signal"
660                );
661                terminated_signals
662                    .push(fuchsia_async::OnSignals::new(handle, zx::Signals::PROCESS_TERMINATED));
663            }
664            log_info!("waiting on process terminated signals");
665            futures::future::join_all(terminated_signals).await;
666        }
667
668        // Step 6: Forcibly unmounts the mounts' FileSystems.
669        log_info!("clearing mounts");
670        self.mounts.clear();
671
672        // Step 7: Tell CF the container stopped.
673        log_info!("all non-root processes killed, notifying CF container is stopped");
674        if let Some(control_handle) = self.container_control_handle.lock().take() {
675            log_info!("Notifying CF that the container has stopped.");
676            control_handle
677                .send_on_stop(ComponentStopInfo {
678                    termination_status: Some(zx::Status::OK.into_raw()),
679                    exit_code: Some(0),
680                    ..ComponentStopInfo::default()
681                })
682                .unwrap();
683            control_handle.shutdown_with_epitaph(zx::Status::OK);
684        } else {
685            log_warn!("Shutdown invoked without a container controller control handle.");
686        }
687
688        // Step 8: exiting this process.
689        log_info!("All tasks killed, exiting Starnix kernel root process.");
690        // Normally a Rust program exits its process by calling `std::process::exit()` which goes
691        // through libc to exit the program. This runs drop impls on any thread-local variables
692        // which can cause issues during Starnix shutdown when we haven't yet integrated every
693        // subsystem with the shutdown flow. While those issues are indicative of underlying
694        // problems, we can't solve them without finishing the implementation of graceful shutdown.
695        // Instead, ask Zircon to exit our process directly, bypassing any libc atexit handlers.
696        // TODO(https://fxbug.dev/295073633) return from main instead of avoiding atexit handlers
697        zx::Process::exit(0);
698    }
699
700    pub fn is_shutting_down(&self) -> bool {
701        self.shutting_down.load(Ordering::Acquire)
702    }
703
704    pub fn allow_unprivileged_bpf(&self) -> bool {
705        self.disable_unprivileged_bpf.load(Ordering::Relaxed) == 0
706    }
707
708    /// Opens a device file (driver) identified by `dev`.
709    pub fn open_device<L>(
710        &self,
711        locked: &mut Locked<L>,
712        current_task: &CurrentTask,
713        node: &NamespaceNode,
714        flags: OpenFlags,
715        dev: DeviceId,
716        mode: DeviceMode,
717    ) -> Result<Box<dyn FileOps>, Errno>
718    where
719        L: LockEqualOrBefore<FileOpsCore>,
720    {
721        self.device_registry.open_device(locked, current_task, node, flags, dev, mode)
722    }
723
724    /// Return a reference to the Audit Framework
725    ///
726    /// This function follows the lazy initialization pattern.
727    pub fn audit_logger(&self) -> Arc<AuditLogger> {
728        self.expando.get_or_init(|| AuditLogger::new(self))
729    }
730
731    /// Return a reference to the GenericNetlink implementation.
732    ///
733    /// This function follows the lazy initialization pattern, where the first
734    /// call will instantiate the Generic Netlink server in a separate kthread.
735    pub fn generic_netlink(&self) -> &GenericNetlink<NetlinkToClientSender<GenericMessage>> {
736        self.generic_netlink.get_or_init(|| {
737            let (generic_netlink, worker_params) = GenericNetlink::new();
738            let enable_nl80211 = self.features.wifi;
739            self.kthreads.spawn_future(
740                move || async move {
741                    crate::vfs::socket::run_generic_netlink_worker(worker_params, enable_nl80211)
742                        .await;
743                    log_error!("Generic Netlink future unexpectedly exited");
744                },
745                "generic_netlink_worker",
746            );
747            generic_netlink
748        })
749    }
750
751    /// Return a reference to the [`netlink::Netlink`] implementation.
752    ///
753    /// This function follows the lazy initialization pattern, where the first
754    /// call will instantiate the Netlink implementation.
755    pub fn network_netlink(self: &Arc<Self>) -> &Netlink<NetlinkContextImpl> {
756        self.network_netlink.get_or_init(|| {
757            let (network_netlink, worker_params) =
758                Netlink::new(InterfacesHandlerImpl(self.weak_self.clone()));
759
760            let kernel = self.clone();
761            self.kthreads.spawn_future(
762                move || async move {
763                    netlink::run_netlink_worker(
764                        worker_params,
765                        NetlinkAccessControl::new(kernel.kthreads.system_task()),
766                    )
767                    .await;
768                    log_error!(tag = NETLINK_LOG_TAG; "Netlink async worker unexpectedly exited");
769                },
770                "network_netlink_worker",
771            );
772            network_netlink
773        })
774    }
775
776    pub fn iptables(&self) -> &IpTables {
777        self.iptables.get_or_init(|| IpTables::new())
778    }
779
780    /// Returns a Proxy to the service used by the container at `filename`.
781    #[allow(unused)]
782    pub fn connect_to_named_protocol_at_container_svc<P: ProtocolMarker>(
783        &self,
784        filename: &str,
785    ) -> Result<ClientEnd<P>, Errno> {
786        match self.container_namespace.get_namespace_channel("/svc") {
787            Ok(channel) => {
788                let (client_end, server_end) = create_endpoints::<P>();
789                fdio::service_connect_at(channel.as_ref(), filename, server_end.into_channel())
790                    .map_err(|status| from_status_like_fdio!(status))?;
791                Ok(client_end)
792            }
793            Err(err) => {
794                log_error!("Unable to get /svc namespace channel! {}", err);
795                Err(errno!(ENOENT))
796            }
797        }
798    }
799
800    /// Returns a Proxy to the service `P` used by the container.
801    pub fn connect_to_protocol_at_container_svc<P: DiscoverableProtocolMarker>(
802        &self,
803    ) -> Result<ClientEnd<P>, Errno> {
804        self.connect_to_named_protocol_at_container_svc::<P>(P::PROTOCOL_NAME)
805    }
806
807    pub fn add_syscall_log_filter(&self, name: &str) {
808        let filter = SyscallLogFilter::new(name.to_string());
809        {
810            let mut filters = self.syscall_log_filters.lock();
811            if filters.contains(&filter) {
812                return;
813            }
814            filters.push(filter);
815        }
816        for headers in self.pids.read().get_thread_groups() {
817            headers.sync_syscall_log_level();
818        }
819    }
820
821    pub fn clear_syscall_log_filters(&self) {
822        {
823            let mut filters = self.syscall_log_filters.lock();
824            if filters.is_empty() {
825                return;
826            }
827            filters.clear();
828        }
829        for headers in self.pids.read().get_thread_groups() {
830            headers.sync_syscall_log_level();
831        }
832    }
833
834    fn get_thread_groups_inspect(&self) -> fuchsia_inspect::Inspector {
835        let inspector = fuchsia_inspect::Inspector::default();
836
837        let thread_groups = inspector.root();
838        let mut mm_summary = MappingSummary::default();
839        let mut mms_summarized = HashSet::new();
840
841        // Avoid holding locks for the entire iteration.
842        let all_thread_groups = {
843            let pid_table = self.pids.read();
844            pid_table.get_thread_groups()
845        };
846        for thread_group in all_thread_groups {
847            // Avoid holding the state lock while summarizing.
848            let (ppid, tasks) = {
849                let tg = thread_group.read();
850                (tg.get_ppid() as i64, tg.tasks())
851            };
852
853            let tg_node = thread_groups.create_child(format!("{}", thread_group.leader));
854            if let Ok(koid) = thread_group.process.koid() {
855                tg_node.record_int("koid", koid.raw_koid() as i64);
856            }
857            tg_node.record_int("pid", thread_group.leader as i64);
858            tg_node.record_int("ppid", ppid);
859            tg_node.record_bool("stopped", thread_group.load_stopped() == StopState::GroupStopped);
860
861            let tasks_node = tg_node.create_child("tasks");
862            for task in tasks {
863                if let Ok(mm) = task.mm() {
864                    if mms_summarized.insert(Arc::as_ptr(&mm) as usize) {
865                        mm.summarize(&mut mm_summary);
866                    }
867                }
868                let set_properties = |node: &fuchsia_inspect::Node| {
869                    node.record_string("command", task.command().to_string());
870
871                    let scheduler_state = task.read().scheduler_state;
872                    if !scheduler_state.is_default() {
873                        node.record_child("sched", |node| {
874                            node.record_string(
875                                "role_name",
876                                self.scheduler
877                                    .role_name(&task)
878                                    .map(|n| Cow::Borrowed(n))
879                                    .unwrap_or_else(|e| Cow::Owned(e.to_string())),
880                            );
881                            node.record_string("state", format!("{scheduler_state:?}"));
882                        });
883                    }
884                };
885                if task.tid == thread_group.leader {
886                    let mut argv = task.read_argv(256).unwrap_or_default();
887
888                    // Any runtime that overwrites argv is likely to leave a lot of trailing
889                    // nulls, no need to print those in inspect.
890                    argv.retain(|arg| !arg.is_empty());
891
892                    let inspect_argv = tg_node.create_string_array("argv", argv.len());
893                    for (i, arg) in argv.iter().enumerate() {
894                        inspect_argv.set(i, arg.to_string());
895                    }
896                    tg_node.record(inspect_argv);
897
898                    set_properties(&tg_node);
899                } else {
900                    tasks_node.record_child(task.tid.to_string(), |task_node| {
901                        set_properties(task_node);
902                    });
903                };
904            }
905            tg_node.record(tasks_node);
906            thread_groups.record(tg_node);
907        }
908
909        thread_groups.record_child("memory_managers", |node| mm_summary.record(node));
910
911        inspector
912    }
913
914    pub fn new_memory_attribution_observer(
915        &self,
916        control_handle: fattribution::ProviderControlHandle,
917    ) -> attribution_server::Observer {
918        self.memory_attribution_manager.new_observer(control_handle)
919    }
920
921    /// Opens and returns a directory proxy from the container's namespace, at
922    /// the requested path, using the provided flags. This method will open the
923    /// closest existing path from the namespace hierarchy, and then attempt
924    /// initialize an open on the remaining subdirectory path, using the given open_flags.
925    ///
926    /// For example, given the parameter provided is `/path/to/foo/bar` and there
927    /// are namespace entries already for `/path/to/foo` and `/path/to`. The entry
928    /// for /path/to/foo will be opened, and then the /bar will attempt to be opened
929    /// underneath that directory with the given open_flags. The returned value
930    /// will be the proxy to the parent (/path/to/foo) and the string to the child
931    /// path (/bar). The caller of this method can expect /bar to be initialized.
932    pub fn open_ns_dir(
933        &self,
934        path: &str,
935        open_flags: fio::Flags,
936    ) -> Result<(fio::DirectorySynchronousProxy, String), Errno> {
937        let ns_path = PathBuf::from(path);
938        match self.container_namespace.find_closest_channel(&ns_path) {
939            Ok((root_channel, remaining_subdir)) => {
940                let (_, server_end) = create_endpoints::<fio::DirectoryMarker>();
941                fdio::open_at(
942                    &root_channel,
943                    &remaining_subdir,
944                    open_flags,
945                    server_end.into_channel(),
946                )
947                .map_err(|e| {
948                    log_error!("Failed to intialize the subdirs: {}", e);
949                    errno!(EIO)
950                })?;
951
952                Ok((fio::DirectorySynchronousProxy::new(root_channel), remaining_subdir))
953            }
954            Err(err) => {
955                log_error!(
956                    "Unable to find a channel for {}. Received error: {}",
957                    ns_path.display(),
958                    err
959                );
960                Err(errno!(ENOENT))
961            }
962        }
963    }
964
965    /// Returns an iterator of the command line arguments.
966    pub fn cmdline_args_iter(&self) -> impl Iterator<Item = ArgNameAndValue<'_>> {
967        parse_cmdline(self.cmdline.to_str().unwrap_or_default()).filter_map(|arg| {
968            arg.split_once('=')
969                .map(|(name, value)| ArgNameAndValue { name: name, value: Some(value) })
970                .or(Some(ArgNameAndValue { name: arg, value: None }))
971        })
972    }
973
974    /// Returns the container-configured CacheConfig.
975    pub fn fs_cache_config(&self) -> CacheConfig {
976        CacheConfig { capacity: self.features.dirent_cache_size as usize }
977    }
978}
979
/// Splits `cmdline` into arguments separated by spaces.
///
/// A space inside double quotes does not end an argument. A `"` toggles the quoted state
/// unless the character right before it is a backslash. Quotes and backslashes are kept in
/// the returned slices, and an argument still open at the end of the input (including an
/// unterminated quote) is returned as-is.
pub fn parse_cmdline(cmdline: &str) -> impl Iterator<Item = &str> {
    let mut tokens = Vec::new();
    // Byte offset at which the argument being scanned began, if one is in progress.
    let mut token_start: Option<usize> = None;
    let mut quoted = false;
    let mut prev = ' ';

    for (idx, ch) in cmdline.char_indices() {
        match token_start {
            // Between arguments: the first non-space character opens a new one.
            None => {
                if ch != ' ' {
                    token_start = Some(idx);
                    if ch == '"' {
                        quoted = true;
                    }
                }
            }
            // Inside an argument: an unquoted space closes it, an unescaped quote toggles.
            Some(begin) => {
                if ch == ' ' && !quoted {
                    tokens.push(&cmdline[begin..idx]);
                    token_start = None;
                } else if ch == '"' && prev != '\\' {
                    quoted = !quoted;
                }
            }
        }
        prev = ch;
    }

    // Whatever is still open when the input ends forms the last argument.
    if let Some(begin) = token_start {
        tokens.push(&cmdline[begin..]);
    }
    tokens.into_iter()
}
1011
1012impl std::fmt::Debug for Kernel {
1013    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1014        f.debug_struct("Kernel").finish()
1015    }
1016}
1017
// TODO(https://fxbug.dev/380427153): move arch dependent code to `kernel/core/arch/*`.
/// Computes the 32-bit (arch32) hardware capability words for `cpu_feature_flags`.
///
/// `hwcap` starts from a fixed baseline set of capabilities; each recognized CPU feature
/// then adds its bits to either `hwcap` or `hwcap2`. Unrecognized features are ignored.
#[cfg(target_arch = "aarch64")]
fn arm32_hwcap(cpu_feature_flags: CpuFeatureFlags) -> HwCap {
    use starnix_uapi::arch32;
    // Capabilities reported regardless of the detected CPU features.
    const COMPAT_ARM32_ELF_HWCAP: u32 = arch32::HWCAP_HALF
        | arch32::HWCAP_THUMB
        | arch32::HWCAP_FAST_MULT
        | arch32::HWCAP_EDSP
        | arch32::HWCAP_TLS
        | arch32::HWCAP_IDIV // == IDIVA | IDIVT.
        | arch32::HWCAP_LPAE
        | arch32::HWCAP_EVTSTRM;

    let mut hwcap = COMPAT_ARM32_ELF_HWCAP;
    let mut hwcap2 = 0;
    for feature in cpu_feature_flags.iter() {
        // Each feature contributes a pair of (hwcap bits, hwcap2 bits).
        let (hwcap_bits, hwcap2_bits) = match feature {
            CpuFeatureFlags::ARM64_FEATURE_ISA_ASIMD => (arch32::HWCAP_NEON, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_AES => (0, arch32::HWCAP2_AES),
            CpuFeatureFlags::ARM64_FEATURE_ISA_PMULL => (0, arch32::HWCAP2_PMULL),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA1 => (0, arch32::HWCAP2_SHA1),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA256 => (0, arch32::HWCAP2_SHA2),
            CpuFeatureFlags::ARM64_FEATURE_ISA_CRC32 => (0, arch32::HWCAP2_CRC32),
            CpuFeatureFlags::ARM64_FEATURE_ISA_I8MM => (arch32::HWCAP_I8MM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_FHM => (arch32::HWCAP_ASIMDFHM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_DP => (arch32::HWCAP_ASIMDDP, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_FP => {
                (arch32::HWCAP_VFP | arch32::HWCAP_VFPv3 | arch32::HWCAP_VFPv4, 0)
            }
            _ => (0, 0),
        };
        hwcap |= hwcap_bits;
        hwcap2 |= hwcap2_bits;
    }
    HwCap { hwcap, hwcap2 }
}
1052
/// Computes the 64-bit hardware capability words for `cpu_feature_flags`.
///
/// Both words start at zero; each recognized CPU feature adds its bit to either `hwcap` or
/// `hwcap2`. Unrecognized features are ignored.
#[cfg(target_arch = "aarch64")]
fn arm64_hwcap(cpu_feature_flags: CpuFeatureFlags) -> HwCap {
    // See https://docs.kernel.org/arch/arm64/elf_hwcaps.html for details.
    use starnix_uapi;
    let mut hwcap = 0;
    let mut hwcap2 = 0;

    for feature in cpu_feature_flags.iter() {
        // Each feature contributes a pair of (hwcap bits, hwcap2 bits).
        let (hwcap_bits, hwcap2_bits) = match feature {
            CpuFeatureFlags::ARM64_FEATURE_ISA_FP => (starnix_uapi::HWCAP_FP, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_ASIMD => (starnix_uapi::HWCAP_ASIMD, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_AES => (starnix_uapi::HWCAP_AES, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_PMULL => (starnix_uapi::HWCAP_PMULL, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA1 => (starnix_uapi::HWCAP_SHA1, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA256 => (starnix_uapi::HWCAP_SHA2, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_CRC32 => (starnix_uapi::HWCAP_CRC32, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_I8MM => (0, starnix_uapi::HWCAP2_I8MM),
            CpuFeatureFlags::ARM64_FEATURE_ISA_FHM => (starnix_uapi::HWCAP_ASIMDFHM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_DP => (starnix_uapi::HWCAP_ASIMDDP, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SM3 => (starnix_uapi::HWCAP_SM3, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SM4 => (starnix_uapi::HWCAP_SM4, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA3 => (starnix_uapi::HWCAP_SHA3, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_SHA512 => (starnix_uapi::HWCAP_SHA512, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_ATOMICS => (starnix_uapi::HWCAP_ATOMICS, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_RDM => (starnix_uapi::HWCAP_ASIMDRDM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_TS => (starnix_uapi::HWCAP_FLAGM, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_DPB => (starnix_uapi::HWCAP_DCPOP, 0),
            CpuFeatureFlags::ARM64_FEATURE_ISA_RNDR => (0, starnix_uapi::HWCAP2_RNG),
            _ => (0, 0),
        };
        hwcap |= hwcap_bits;
        hwcap2 |= hwcap2_bits;
    }
    HwCap { hwcap, hwcap2 }
}
1086
1087impl HwCaps {
1088    #[cfg(target_arch = "aarch64")]
1089    pub fn from_cpu_feature_flags(cpu_feature_flags: CpuFeatureFlags) -> Self {
1090        Self { arch32: arm32_hwcap(cpu_feature_flags), arch64: arm64_hwcap(cpu_feature_flags) }
1091    }
1092
1093    #[cfg(not(target_arch = "aarch64"))]
1094    pub fn from_cpu_feature_flags(_cpu_feature_flags: CpuFeatureFlags) -> Self {
1095        Self { arch64: HwCap::default() }
1096    }
1097}
1098
#[cfg(test)]
mod test {
    use super::parse_cmdline;

    /// Collects the parsed arguments so expectations can be written as plain vectors.
    fn parse(cmdline: &str) -> Vec<&str> {
        parse_cmdline(cmdline).collect()
    }

    /// Quoted sections keep their spaces, and an escaped quote does not end the quoting.
    #[test]
    fn test_parse_cmdline() {
        let cmdline =
            r#"first second=third "fourth fifth" sixth="seventh eighth" "ninth\" tenth" eleventh"#;
        let expected = vec![
            "first",
            "second=third",
            "\"fourth fifth\"",
            "sixth=\"seventh eighth\"",
            "\"ninth\\\" tenth\"",
            "eleventh",
        ];
        assert_eq!(parse(cmdline), expected);
    }

    /// Empty and all-space command lines produce no arguments.
    #[test]
    fn test_parse_cmdline_empty_and_blank() {
        assert_eq!(parse(""), Vec::<&str>::new());
        assert_eq!(parse("    "), Vec::<&str>::new());
    }

    /// Leading, trailing and repeated spaces never produce empty arguments.
    #[test]
    fn test_parse_cmdline_repeated_spaces() {
        assert_eq!(parse("  first   second  "), vec!["first", "second"]);
    }

    /// A quote that is never closed swallows the rest of the line into one argument.
    #[test]
    fn test_parse_cmdline_unterminated_quote() {
        assert_eq!(parse("first \"second third"), vec!["first", "\"second third"]);
    }
}