Skip to main content

starnix_kernel_runner/
container.rs

// Copyright 2022 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
use crate::{
    Features, MountAction, expose_root, parse_features, parse_numbered_handles,
    run_container_features, serve_component_runner, serve_container_controller,
    serve_graphical_presenter, serve_lutex_controller,
    serve_memory_attribution_provider_container,
};
use anyhow::{Context, Error, anyhow, bail};
use attribution_server::{AttributionServer, AttributionServerHandle};
use bootreason::get_or_init_android_bootreason;
use bstr::{BString, ByteSlice};
use devicetree::parser::parse_devicetree;
use devicetree::types::Devicetree;
use fidl::endpoints::{ControlHandle, RequestStream, ServerEnd};
use fidl_fuchsia_boot as fboot;
use fidl_fuchsia_component as fcomponent;
use fidl_fuchsia_component_runner as frunner;
use fidl_fuchsia_component_runner::{TaskProviderRequest, TaskProviderRequestStream};
use fidl_fuchsia_element as felement;
use fidl_fuchsia_feedback::CrashReporterMarker;
use fidl_fuchsia_io as fio;
use fidl_fuchsia_mem as fmem;
use fidl_fuchsia_memory_attribution as fattribution;
use fidl_fuchsia_starnix_binder as fbinder;
use fidl_fuchsia_starnix_container as fstarcontainer;
use fidl_fuchsia_time_external::AdjustMarker;
use fuchsia_async as fasync;
use fuchsia_async::DurationExt;
use fuchsia_component::client::{connect_to_protocol, connect_to_protocol_sync};
use fuchsia_component::server::ServiceFs;
use fuchsia_inspect as inspect;
use fuchsia_runtime as fruntime;
use fuchsia_zbi as zbi;
use futures::channel::oneshot;
use futures::{FutureExt, StreamExt, TryStreamExt};
use serde::Deserialize;
use starnix_container_structured_config::Config as ContainerStructuredConfig;
use starnix_core::device::remote_block_device::remote_block_device_init;
use starnix_core::execution::{
    create_init_process, create_system_task, execute_task_with_prerun_result,
};
use starnix_core::fs::fuchsia::new_remotefs_in_root;
use starnix_core::fs::tmpfs::TmpFs;
use starnix_core::security;
use starnix_core::task::container_namespace::ContainerNamespace;
use starnix_core::task::{
    CurrentTask, ExitStatus, Kernel, RoleOverrides, SchedulerManager, parse_cmdline,
};
use starnix_core::vfs::{FileSystemOptions, FsContext, LookupContext, Namespace, WhatToMount};
use starnix_logging::{
    CATEGORY_STARNIX, NAME_CREATE_CONTAINER, log_debug, log_error, log_info, log_warn,
};
use starnix_modules::{init_common_devices, register_common_file_systems};
use starnix_modules_layeredfs::{LayeredFsBuilder, LayeredFsMounts};
use starnix_modules_magma::get_magma_params;
use starnix_modules_overlayfs::OverlayStack;
use starnix_modules_rtc::rtc_device_init;
use starnix_sync::{Locked, Unlocked};
use starnix_task_command::TaskCommand;
use starnix_uapi::errors::{ENOENT, SourceContext};
use starnix_uapi::open_flags::OpenFlags;
use starnix_uapi::resource_limits::Resource;
use starnix_uapi::{errno, tid_t};
use std::ffi::CString;
use std::ops::DerefMut;
use std::sync::{Arc, Weak};
use zx::Task as _;
74
75/// Manages the memory attribution protocol for a Starnix container.
76struct ContainerMemoryAttributionManager {
77    /// Holds state for the hanging-get attribution protocol.
78    memory_attribution_server: AttributionServerHandle,
79}
80
81impl ContainerMemoryAttributionManager {
82    /// Creates a new [ContainerMemoryAttributionManager] from a Starnix kernel and the moniker
83    /// token of the container component.
84    pub fn new(kernel: Weak<Kernel>, component_instance: zx::Event) -> Self {
85        let memory_attribution_server = AttributionServer::new(Box::new(move || {
86            let kernel_ref = match kernel.upgrade() {
87                None => return vec![],
88                Some(k) => k,
89            };
90            attribution_info_for_kernel(kernel_ref.as_ref(), &component_instance)
91        }));
92
93        ContainerMemoryAttributionManager { memory_attribution_server }
94    }
95
96    /// Creates a new observer for the attribution information from this container.
97    pub fn new_observer(
98        &self,
99        control_handle: fattribution::ProviderControlHandle,
100    ) -> attribution_server::Observer {
101        self.memory_attribution_server.new_observer(control_handle)
102    }
103}
104
105/// Generates the attribution information for the Starnix kernel ELF component. The attribution
106/// information for the container is handled by the container component, not the kernel
107/// component itself, even if both are hosted within the same kernel process.
108fn attribution_info_for_kernel(
109    kernel: &Kernel,
110    component_instance: &zx::Event,
111) -> Vec<fattribution::AttributionUpdate> {
112    // Start the server to handle the memory attribution requests for the container, and provide
113    // a handle to get detailed attribution. We start a new task as each incoming connection is
114    // independent.
115    let (client_end, server_end) =
116        fidl::endpoints::create_request_stream::<fattribution::ProviderMarker>();
117    fuchsia_async::Task::spawn(serve_memory_attribution_provider_container(server_end, kernel))
118        .detach();
119
120    let starnix_kernel_id = Some(1);
121    let starnix_kernel_principal = fattribution::NewPrincipal {
122        identifier: starnix_kernel_id,
123        description: Some(fattribution::Description::Part("starnix_kernel".to_string())),
124        principal_type: Some(fattribution::PrincipalType::Part),
125        // This part is created for accounting. It holds the resource used for starnix
126        // kernel operation. It neither has sub-principals, nor publishes attribution,
127        // hence it does not need to be tied to a provider server end.
128        detailed_attribution: None,
129        ..Default::default()
130    };
131
132    let starnix_kernel_attribution = fattribution::UpdatedPrincipal {
133        identifier: starnix_kernel_id, // Recipient.
134        resources: Some(fattribution::Resources::Data(fattribution::Data {
135            resources: vec![fattribution::Resource::ProcessMapped(fattribution::ProcessMapped {
136                process: fuchsia_runtime::process_self().koid().unwrap().raw_koid(),
137                base: 0, // Attribute all the range.
138                len: u64::max_value(),
139                hint_skip_handle_table: false,
140            })],
141        })),
142        ..Default::default()
143    };
144
145    let container_id = Some(2);
146    let new_principal = fattribution::NewPrincipal {
147        identifier: container_id,
148        description: Some(fattribution::Description::Component(
149            component_instance.duplicate_handle(zx::Rights::SAME_RIGHTS).unwrap(),
150        )),
151        principal_type: Some(fattribution::PrincipalType::Runnable),
152        detailed_attribution: Some(client_end),
153        ..Default::default()
154    };
155    let attribution = fattribution::UpdatedPrincipal {
156        identifier: container_id,
157        resources: Some(fattribution::Resources::Data(fattribution::Data {
158            resources: vec![fattribution::Resource::KernelObject(
159                fuchsia_runtime::job_default().koid().unwrap().raw_koid(),
160            )],
161        })),
162        ..Default::default()
163    };
164
165    vec![
166        fattribution::AttributionUpdate::Add(new_principal),
167        fattribution::AttributionUpdate::Add(starnix_kernel_principal),
168        fattribution::AttributionUpdate::Update(attribution),
169        fattribution::AttributionUpdate::Update(starnix_kernel_attribution),
170    ]
171}
172
173#[derive(Debug)]
174pub struct ContainerStartInfo {
175    /// Configuration specified by the component's `program` block.
176    pub program: ContainerProgram,
177
178    pub config: ContainerStructuredConfig,
179
180    /// The outgoing directory of the container, used to serve protocols on behalf of the container.
181    /// For example, the starnix_kernel serves a component runner in the containers' outgoing
182    /// directory.
183    outgoing_dir: Option<zx::Channel>,
184
185    /// Mapping of top-level namespace entries to an associated channel.
186    /// For example, "/svc" to the respective channel.
187    pub container_namespace: ContainerNamespace,
188
189    /// The runtime directory of the container, used to provide CF introspection.
190    runtime_dir: Option<ServerEnd<fio::DirectoryMarker>>,
191
192    /// An eventpair that debuggers can use to defer the launch of the container.
193    break_on_start: Option<zx::EventPair>,
194
195    /// Component moniker token for the container component. This token is used in various protocols
196    /// to uniquely identify a component.
197    component_instance: Option<zx::Event>,
198}
199
/// Error context attached when the start info carries no encoded structured config. It points
/// at the CML shard a container manifest is expected to include.
const MISSING_CONFIG_VMO_CONTEXT: &str = concat!(
    "Retrieving container config VMO. ",
    "If this fails, make sure your container CML includes ",
    "//src/starnix/containers/container.shard.cml.",
);
205
206impl ContainerStartInfo {
207    fn new(mut start_info: frunner::ComponentStartInfo) -> Result<Self, Error> {
208        let program = start_info.program.as_ref().context("retrieving program block")?;
209        let program: ContainerProgram =
210            runner::serde::deserialize_program(&program).context("parsing program block")?;
211
212        let encoded_config =
213            start_info.encoded_config.as_ref().context(MISSING_CONFIG_VMO_CONTEXT)?;
214        let config = match encoded_config {
215            fmem::Data::Bytes(b) => ContainerStructuredConfig::from_bytes(b),
216            fmem::Data::Buffer(b) => ContainerStructuredConfig::from_vmo(&b.vmo),
217            other => anyhow::bail!("unknown Data variant {other:?}"),
218        }
219        .context("parsing container structured config")?;
220
221        let ns = start_info.ns.take().context("retrieving container namespace")?;
222        let container_namespace = ContainerNamespace::from(ns);
223
224        let outgoing_dir = start_info.outgoing_dir.take().map(|dir| dir.into_channel());
225        let component_instance = start_info.component_instance;
226
227        Ok(Self {
228            program,
229            config,
230            outgoing_dir,
231            container_namespace,
232            component_instance,
233            break_on_start: start_info.break_on_start,
234            runtime_dir: start_info.runtime_dir,
235        })
236    }
237}
238
239#[derive(Debug, Default, Deserialize)]
240#[serde(deny_unknown_fields)]
241pub struct ContainerProgram {
242    /// The name of this container.
243    name: String,
244
245    /// The command line for the initial process for this container.
246    init: Vec<String>,
247
248    /// The command line for the kernel.
249    #[serde(default)]
250    kernel_cmdline: String,
251
252    /// The specifications for the file system mounts for this container.
253    #[serde(default)]
254    mounts: Vec<String>,
255
256    /// The features enabled for this container.
257    #[serde(default)]
258    pub features: Vec<String>,
259
260    /// The resource limits to apply to this container.
261    #[serde(default)]
262    rlimits: Vec<String>,
263
264    /// The path that the container will wait until exists before considering itself to have started.
265    #[serde(default)]
266    startup_file_path: String,
267
268    /// The default seclabel that is applied to components that are instantiated in this container.
269    ///
270    /// Components can override this by setting the `seclabel` field in their program block.
271    #[serde(default)]
272    pub default_seclabel: Option<String>,
273
274    /// The default uid that is applied to components that are instantiated in this container.
275    ///
276    /// Components can override this by setting the `uid` field in their program block.
277    #[serde(default = "default_uid")]
278    pub default_uid: runner::serde::StoreAsString<u32>,
279
280    /// The default mount options to use when mounting directories from a component's namespace.
281    ///
282    /// Each string is expected to follow the format: "<namespace_path>:<mount_options>".
283    pub default_ns_mount_options: Option<Vec<String>>,
284
285    /// Specifies role names to use for "realtime" tasks based on their process & thread names.
286    ///
287    /// Zircon's scheduler doesn't support configuring tasks to always preempt non-"realtime"
288    /// tasks without specifying a constant bandwidth profile. These profiles specify the period and
289    /// expected runtime of a "realtime" task, bounding the amount of work it is allowed to perform
290    /// at an elevated "realtime" priority.
291    ///
292    /// Because constant bandwidth profiles require workload-specific tuning, we can't uniformly
293    /// apply a single profile for all "realtime" tasks. Instead, this container configuration
294    /// allows us to specify different constant bandwidth profiles for different workloads.
295    #[serde(default)]
296    task_role_overrides: Vec<TaskSchedulerMapping>,
297}
298
299/// Specifies a role override for a class of tasks whose process and thread names match provided
300/// patterns.
301#[derive(Default, Deserialize)]
302struct TaskSchedulerMapping {
303    /// The role name to use for tasks matching the provided patterns.
304    role: String,
305    /// A regular expression that will be matched against the process' command.
306    process: String,
307    /// A regular expression that will be matched against the thread's command.
308    thread: String,
309}
310
311impl std::fmt::Debug for TaskSchedulerMapping {
312    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
313        write!(f, "process `{}` thread `{}` role `{}`", self.process, self.thread, self.role)
314    }
315}
316
317fn default_uid() -> runner::serde::StoreAsString<u32> {
318    runner::serde::StoreAsString(42)
319}
320
/// Creates a [`CString`] holding a copy of `str`.
///
/// # Panics
///
/// Panics if `str` contains an interior NUL byte, because such a string cannot be represented
/// as a C string.
fn to_cstr(str: &str) -> CString {
    // `CString::new` accepts `&str` directly, so no intermediate `String` is needed.
    CString::new(str).expect("string must not contain an interior NUL byte")
}
325
326#[must_use = "The container must run serve on this config"]
327pub struct ContainerServiceConfig {
328    start_info: ContainerStartInfo,
329    request_stream: frunner::ComponentControllerRequestStream,
330    receiver: oneshot::Receiver<Result<ExitStatus, Error>>,
331}
332
333pub struct Container {
334    /// The `Kernel` object that is associated with the container.
335    pub kernel: Arc<Kernel>,
336
337    memory_attribution_manager: ContainerMemoryAttributionManager,
338
339    /// Inspect node holding information about the state of the container.
340    _node: inspect::Node,
341
342    /// Until negative trait bound are implemented, using `*mut u8` to prevent transferring
343    /// Container across threads.
344    _thread_bound: std::marker::PhantomData<*mut u8>,
345}
346
347impl Container {
348    pub fn system_task(&self) -> &CurrentTask {
349        self.kernel.kthreads.system_task()
350    }
351
352    async fn serve_outgoing_directory(
353        &self,
354        outgoing_dir: Option<zx::Channel>,
355    ) -> Result<(), Error> {
356        if let Some(outgoing_dir) = outgoing_dir {
357            // Add `ComponentRunner` to the exposed services of the container, and then serve the
358            // outgoing directory.
359            let mut fs = ServiceFs::new_local();
360            fs.dir("svc")
361                .add_fidl_service(ExposedServices::ComponentRunner)
362                .add_fidl_service(ExposedServices::ContainerController)
363                .add_fidl_service(ExposedServices::GraphicalPresenter)
364                .add_fidl_service(ExposedServices::LutexController);
365
366            // Expose the root of the container's filesystem.
367            let (fs_root, fs_root_server_end) = fidl::endpoints::create_proxy();
368            fs.add_remote("fs_root", fs_root);
369            expose_root(
370                self.kernel.kthreads.unlocked_for_async().deref_mut(),
371                self.system_task(),
372                fs_root_server_end,
373            )?;
374
375            fs.serve_connection(outgoing_dir.into()).map_err(|_| errno!(EINVAL))?;
376
377            fs.for_each_concurrent(None, |request_stream| async {
378                match request_stream {
379                    ExposedServices::ComponentRunner(request_stream) => {
380                        match serve_component_runner(request_stream, self.system_task()).await {
381                            Ok(_) => {}
382                            Err(e) => {
383                                log_error!("Error serving component runner: {:?}", e);
384                            }
385                        }
386                    }
387                    ExposedServices::ContainerController(request_stream) => {
388                        serve_container_controller(request_stream, self.system_task())
389                            .await
390                            .expect("failed to start container.")
391                    }
392                    ExposedServices::GraphicalPresenter(request_stream) => {
393                        serve_graphical_presenter(request_stream, &self.kernel)
394                            .await
395                            .expect("failed to start GraphicalPresenter.")
396                    }
397                    ExposedServices::LutexController(request_stream) => {
398                        serve_lutex_controller(request_stream, self.system_task())
399                            .await
400                            .expect("failed to start LutexController.")
401                    }
402                }
403            })
404            .await
405        }
406        Ok(())
407    }
408
409    pub async fn serve(&self, service_config: ContainerServiceConfig) -> Result<(), Error> {
410        let (r, _) = futures::join!(
411            self.serve_outgoing_directory(service_config.start_info.outgoing_dir),
412            server_component_controller(
413                self.kernel.clone(),
414                service_config.request_stream,
415                service_config.receiver
416            )
417        );
418        r
419    }
420
421    pub fn new_memory_attribution_observer(
422        &self,
423        control_handle: fattribution::ProviderControlHandle,
424    ) -> attribution_server::Observer {
425        self.memory_attribution_manager.new_observer(control_handle)
426    }
427}
428
429/// The services that are exposed in the container component's outgoing directory.
430enum ExposedServices {
431    ComponentRunner(frunner::ComponentRunnerRequestStream),
432    ContainerController(fstarcontainer::ControllerRequestStream),
433    GraphicalPresenter(felement::GraphicalPresenterRequestStream),
434    LutexController(fbinder::LutexControllerRequestStream),
435}
436
437type TaskResult = Result<ExitStatus, Error>;
438
439async fn server_component_controller(
440    kernel: Arc<Kernel>,
441    request_stream: frunner::ComponentControllerRequestStream,
442    task_complete: oneshot::Receiver<TaskResult>,
443) {
444    *kernel.container_control_handle.lock() = Some(request_stream.control_handle());
445
446    enum Event<T, U> {
447        Controller(T),
448        Completion(U),
449    }
450
451    let mut stream = futures::stream::select(
452        request_stream.map(Event::Controller),
453        task_complete.into_stream().map(Event::Completion),
454    );
455
456    while let Some(event) = stream.next().await {
457        match event {
458            Event::Controller(Ok(frunner::ComponentControllerRequest::Stop { .. })) => {
459                log_info!("Stopping the container.");
460            }
461            Event::Controller(Ok(frunner::ComponentControllerRequest::Kill { control_handle })) => {
462                log_info!("Killing the container's job.");
463                control_handle.shutdown_with_epitaph(zx::Status::from_raw(
464                    fcomponent::Error::InstanceDied.into_primitive() as i32,
465                ));
466                fruntime::job_default().kill().expect("Failed to kill job");
467            }
468            Event::Controller(Ok(frunner::ComponentControllerRequest::_UnknownMethod {
469                ordinal,
470                method_type,
471                ..
472            })) => {
473                log_error!(ordinal, method_type:?; "Unknown component controller request received.");
474            }
475            Event::Controller(Err(e)) => {
476                log_warn!(e:?; "Container component controller channel encountered an error.");
477            }
478            Event::Completion(result) => {
479                log_info!(result:?; "init process exited.");
480            }
481        }
482
483        // We treat any event in the stream as an invitation to shut down.
484        if !kernel.is_shutting_down() {
485            kernel.shut_down();
486        }
487    }
488
489    log_debug!("done listening for container-terminating events");
490
491    // In case the stream ended without an event, shut down the kernel here.
492    if !kernel.is_shutting_down() {
493        kernel.shut_down();
494    }
495}
496
497pub async fn create_component_from_stream(
498    mut request_stream: frunner::ComponentRunnerRequestStream,
499    kernel_extra_features: Vec<String>,
500) -> Result<(Container, ContainerServiceConfig), Error> {
501    if let Some(event) = request_stream.try_next().await? {
502        match event {
503            frunner::ComponentRunnerRequest::Start { start_info, controller, .. } => {
504                let request_stream = controller.into_stream();
505                let mut start_info = ContainerStartInfo::new(start_info)?;
506                let (sender, receiver) = oneshot::channel::<TaskResult>();
507                let container = create_container(&mut start_info, &kernel_extra_features, sender)
508                    .await
509                    .with_source_context(|| {
510                        format!("creating container \"{}\"", start_info.program.name)
511                    })?;
512                let service_config =
513                    ContainerServiceConfig { start_info, request_stream, receiver };
514                return Ok((container, service_config));
515            }
516            frunner::ComponentRunnerRequest::_UnknownMethod { ordinal, .. } => {
517                log_warn!("Unknown ComponentRunner request: {ordinal}");
518            }
519        }
520    }
521    bail!("did not receive Start request");
522}
523
524async fn get_bootargs(device_tree: &Devicetree) -> Result<String, Error> {
525    device_tree
526        .root_node
527        .find("chosen")
528        .and_then(|n| {
529            n.get_property("bootargs").map(|p| {
530                let end =
531                    if p.value.last() == Some(&0) { p.value.len() - 1 } else { p.value.len() };
532                match std::str::from_utf8(&p.value[..end]) {
533                    Ok(s) => Ok(s.to_owned()),
534                    Err(e) => {
535                        log_warn!("Bootargs are not valid UTF-8: {e}");
536                        Err(anyhow!("Bootargs are not valid UTF-8"))
537                    }
538                }
539            })
540        })
541        .context("Couldn't find bootargs")?
542}
543
544async fn get_bootitems() -> Result<std::vec::Vec<u8>, Error> {
545    let items =
546        connect_to_protocol::<fboot::ItemsMarker>().context("Failed to connect to boot items")?;
547
548    let items_response = items
549        .get2(zbi::ZbiType::DeviceTree.into_raw(), None)
550        .await
551        .context("FIDL: Failed to get devicetree item")?
552        .map_err(|e| anyhow!("Failed to get devicetree item {:?}", e))?;
553
554    let Some(item) = items_response.last() else {
555        return Err(anyhow!("Failed to get items"));
556    };
557
558    let devicetree_vmo = &item.payload;
559    let bytes = devicetree_vmo
560        .read_to_vec(0, item.length as u64)
561        .context("Failed to read devicetree vmo")?;
562
563    Ok(bytes)
564}
565
566async fn create_container(
567    start_info: &mut ContainerStartInfo,
568    kernel_extra_features: &[String],
569    task_complete: oneshot::Sender<TaskResult>,
570) -> Result<Container, Error> {
571    fuchsia_trace::duration!(CATEGORY_STARNIX, NAME_CREATE_CONTAINER);
572    const DEFAULT_INIT: &str = "/container/init";
573
574    let pkg_channel = start_info.container_namespace.get_namespace_channel("/pkg").unwrap();
575    let pkg_dir_proxy = fio::DirectorySynchronousProxy::new(pkg_channel);
576
577    let device_tree: Option<Devicetree> = match get_bootitems().await {
578        Ok(items) => match parse_devicetree(&items) {
579            Ok(device_tree) => Some(device_tree),
580            Err(e) => {
581                log_warn!("Failed to parse devicetree: {e:?}");
582                None
583            }
584        },
585        Err(e) => {
586            log_warn!("Failed to get boot items for devicetree: {e:?}");
587            None
588        }
589    };
590    let mut features = parse_features(&start_info, kernel_extra_features)?;
591
592    log_debug!("Creating container with {:#?}", features);
593    let mut kernel_cmdline = BString::from(start_info.program.kernel_cmdline.as_bytes());
594    let mut android_provided_bootreason = None;
595
596    if features.android_serialno {
597        if let Some(device_tree) = &device_tree {
598            match get_bootargs(device_tree).await {
599                Ok(args) => {
600                    for item in parse_cmdline(&args) {
601                        if item.starts_with("androidboot.force_normal_boot") {
602                            // TODO(https://fxbug.dev/424152964): Support force_normal_boot.
603                            continue;
604                        }
605                        if item.starts_with("androidboot.bootreason") && features.android_bootreason
606                        {
607                            // androidboot.bootreason is sourced from the Fuchsia reboot reason.
608                            // It is still useful to log it from userspace to learn what the
609                            // possible values are.
610                            log_info!("Original devicetree bootarg {:?}", item);
611                            if let Some((_, v)) = item.split_once('=') {
612                                android_provided_bootreason = Some(v.to_string());
613                            }
614                            continue;
615                        }
616                        kernel_cmdline.extend(b" ");
617                        kernel_cmdline.extend(item.bytes());
618                    }
619                }
620                Err(err) => log_warn!("could not get bootargs: {err:?}"),
621            }
622        } else {
623            log_warn!("No devicetree available to get bootargs for android.serialno");
624        }
625    }
626    if features.android_bootreason {
627        kernel_cmdline.extend(b" androidboot.bootreason=");
628
629        let tmp_channel = start_info.container_namespace.get_namespace_channel("/tmp_lifecycle");
630        let tmp_proxy = match tmp_channel {
631            Ok(channel) => {
632                Some(fio::DirectoryProxy::new(fidl::AsyncChannel::from_channel(channel)))
633            }
634            _ => None,
635        };
636
637        match get_or_init_android_bootreason(tmp_proxy, android_provided_bootreason).await {
638            Ok(reason) => {
639                kernel_cmdline.extend(reason.bytes());
640            }
641            Err(err) => {
642                log_warn!("could not get android bootreason: {err:?}. falling back to 'unknown'");
643                kernel_cmdline.extend(b"unknown");
644            }
645        }
646    }
647    if let Some(supported_vendors) = &features.magma_supported_vendors {
648        kernel_cmdline.extend(b" ");
649        let params = get_magma_params(supported_vendors);
650        kernel_cmdline.extend(&*params);
651    }
652
653    // Check whether we actually have access to a role manager by trying to set our own
654    // thread's role.
655    let mut task_mappings = RoleOverrides::new();
656    for m in &start_info.program.task_role_overrides {
657        task_mappings.add(m.process.clone(), m.thread.clone(), m.role.clone());
658    }
659    let task_mappings = task_mappings.build().context("adding custom task role")?;
660    let scheduler_manager = SchedulerManager::new(task_mappings);
661
662    let crash_reporter = connect_to_protocol::<CrashReporterMarker>().unwrap();
663
664    let node = inspect::component::inspector().root().create_child("container");
665    let kernel_node = node.create_child("kernel");
666    kernel_node.record_int("created_at", zx::MonotonicInstant::get().into_nanos());
667    features.record_inspect(&kernel_node);
668
669    let security_state = security::kernel_init_security(
670        features.selinux.enabled,
671        features.selinux.options.clone(),
672        features.selinux.exceptions.clone(),
673        &kernel_node,
674    );
675
676    // `config.enable_utc_time_adjustment` is set through config capability
677    // `fuchsia.time.config.WritableUTCTime`.
678    let time_adjustment_proxy = if features.enable_utc_time_adjustment {
679        connect_to_protocol_sync::<AdjustMarker>()
680            .map_err(|e| log_error!("could not connect to fuchsia.time.external/Adjust: {:?}", e))
681            .ok()
682    } else {
683        // See the comment above. UTC adjustment is a per-product setting.
684        log_info!("UTC adjustment is forbidden.");
685        None
686    };
687
688    log_info!("final kernel cmdline: {kernel_cmdline:?}");
689    kernel_node.record_string("cmdline", kernel_cmdline.to_str_lossy());
690
691    let kernel = Kernel::new(
692        kernel_cmdline,
693        features.kernel.clone(),
694        std::mem::take(&mut features.system_limits),
695        start_info.container_namespace.try_clone()?,
696        scheduler_manager,
697        Some(crash_reporter),
698        kernel_node,
699        security_state,
700        time_adjustment_proxy,
701        device_tree,
702    )
703    .with_source_context(|| format!("creating Kernel: {}", start_info.program.name))?;
704    let (fs_context, feature_mounts) = create_fs_context(
705        kernel.kthreads.unlocked_for_async().deref_mut(),
706        &kernel,
707        &features,
708        start_info,
709        &pkg_dir_proxy,
710    )
711    .source_context("creating FsContext")?;
712    let init_pid = kernel.pids.write().allocate_pid();
713    // Lots of software assumes that the pid for the init process is 1.
714    debug_assert_eq!(init_pid, 1);
715
716    let system_task = create_system_task(
717        kernel.kthreads.unlocked_for_async().deref_mut(),
718        &kernel,
719        Arc::clone(&fs_context),
720    )
721    .source_context("create system task")?;
722    // The system task gives pid 2. This value is less critical than giving
723    // pid 1 to init, but this value matches what is supposed to happen.
724    debug_assert_eq!(system_task.tid, 2);
725
726    feature_mounts(kernel.kthreads.unlocked_for_async().deref_mut(), &system_task)
727        .source_context("mounting feature filesystems")?;
728
729    kernel.kthreads.init(system_task).source_context("initializing kthreads")?;
730    let system_task = kernel.kthreads.system_task();
731
732    kernel.syslog.init(&system_task).source_context("initializing syslog")?;
733
734    kernel.hrtimer_manager.init(system_task).source_context("initializing HrTimer manager")?;
735
736    log_info!("Initializing suspend resume manager.");
737    if let Err(e) = kernel.suspend_resume_manager.init(&system_task) {
738        log_warn!("Suspend/Resume manager initialization failed: ({e:?})");
739    }
740
741    // Real Time clock is present in all configuration.
742    log_info!("Initializing RTC device.");
743    rtc_device_init(kernel.kthreads.unlocked_for_async().deref_mut(), &system_task)
744        .context("in starnix_kernel_runner, while initializing RTC")?;
745
746    // Register common devices and add them in sysfs and devtmpfs.
747    log_info!("Registering devices and filesystems.");
748    init_common_devices(kernel.kthreads.unlocked_for_async().deref_mut(), &kernel)?;
749    register_common_file_systems(kernel.kthreads.unlocked_for_async().deref_mut(), &kernel);
750
751    log_info!("Mounting filesystems.");
752    mount_filesystems(
753        kernel.kthreads.unlocked_for_async().deref_mut(),
754        &system_task,
755        start_info,
756        &pkg_dir_proxy,
757    )
758    .source_context("mounting filesystems")?;
759
760    // Run all common features that were specified in the .cml.
761    {
762        log_info!("Running container features.");
763        run_container_features(
764            kernel.kthreads.unlocked_for_async().deref_mut(),
765            &system_task,
766            &features,
767        )?;
768    }
769
770    log_info!("Initializing remote block devices.");
771    init_remote_block_devices(kernel.kthreads.unlocked_for_async().deref_mut(), &system_task)
772        .source_context("initalizing remote block devices")?;
773
774    // If there is an init binary path, run it, optionally waiting for the
775    // startup_file_path to be created. The task struct is still used
776    // to initialize the system up until this point, regardless of whether
777    // or not there is an actual init to be run.
778    let argv = if start_info.program.init.is_empty() {
779        vec![DEFAULT_INIT.to_string()]
780    } else {
781        start_info.program.init.clone()
782    }
783    .iter()
784    .map(|s| to_cstr(s))
785    .collect::<Vec<_>>();
786
787    log_info!("Opening start_info file.");
788    let executable = system_task
789        .open_file(
790            kernel.kthreads.unlocked_for_async().deref_mut(),
791            argv[0].as_bytes().into(),
792            OpenFlags::RDONLY,
793        )
794        .with_source_context(|| format!("opening init: {:?}", argv[0]))?;
795
796    let initial_name = if start_info.program.init.is_empty() {
797        TaskCommand::default()
798    } else {
799        TaskCommand::new(start_info.program.init[0].as_bytes())
800    };
801
802    let rlimits = parse_rlimits(&start_info.program.rlimits)?;
803
804    // Serve the runtime directory.
805    log_info!("Starting runtime directory.");
806    if let Some(runtime_dir) = start_info.runtime_dir.take() {
807        kernel.kthreads.spawn_future(
808            move || async move { serve_runtime_dir(runtime_dir).await },
809            "serve_runtime_dir",
810        );
811    }
812
813    // At this point the runtime environment has been prepared but nothing is actually running yet.
814    // Pause here if a debugger needs time to attach to the job.
815    if let Some(break_on_start) = start_info.break_on_start.take() {
816        log_info!("Waiting for signal from debugger before spawning init process...");
817        if let Err(e) =
818            fuchsia_async::OnSignals::new(break_on_start, zx::Signals::EVENTPAIR_PEER_CLOSED).await
819        {
820            log_warn!(e:%; "Received break_on_start eventpair but couldn't wait for PEER_CLOSED.");
821        }
822    }
823
824    log_info!("Creating init process.");
825    let init_task = create_init_process(
826        kernel.kthreads.unlocked_for_async().deref_mut(),
827        &kernel,
828        init_pid,
829        initial_name,
830        Arc::clone(&fs_context),
831        &rlimits,
832    )
833    .with_source_context(|| format!("creating init task: {:?}", start_info.program.init))?;
834
835    execute_task_with_prerun_result(
836        kernel.kthreads.unlocked_for_async().deref_mut(),
837        init_task,
838        move |locked, init_task| {
839            parse_numbered_handles(locked, init_task, None, &init_task.running_state().files)
840                .expect("");
841            init_task.exec(locked, executable, argv[0].clone(), argv.clone(), vec![])
842        },
843        move |result| {
844            log_info!("Finished running init process: {:?}", result);
845            let _ = task_complete.send(result);
846        },
847        None,
848    )?;
849
850    if !start_info.program.startup_file_path.is_empty() {
851        wait_for_init_file(&start_info.program.startup_file_path, &system_task, init_pid).await?;
852    };
853
854    let memory_attribution_manager = ContainerMemoryAttributionManager::new(
855        Arc::downgrade(&kernel),
856        start_info.component_instance.take().ok_or_else(|| Error::msg("No component instance"))?,
857    );
858
859    Ok(Container {
860        kernel,
861        memory_attribution_manager,
862        _node: node,
863        _thread_bound: Default::default(),
864    })
865}
866
867fn create_fs_context(
868    locked: &mut Locked<Unlocked>,
869    kernel: &Kernel,
870    features: &Features,
871    start_info: &ContainerStartInfo,
872    pkg_dir_proxy: &fio::DirectorySynchronousProxy,
873) -> Result<(Arc<FsContext>, LayeredFsMounts), Error> {
874    // The mounts are applied in the order listed. Mounting will fail if the designated mount
875    // point doesn't exist in a previous mount. The root must be first so other mounts can be
876    // applied on top of it.
877    let mut mounts_iter =
878        start_info.program.mounts.iter().chain(start_info.config.additional_mounts.iter());
879    let root = MountAction::new_for_root(
880        locked,
881        kernel,
882        pkg_dir_proxy,
883        mounts_iter.next().ok_or_else(|| anyhow!("Mounts list is empty"))?,
884    )?;
885    if root.path != "/" {
886        anyhow::bail!("First mount in mounts list is not the root");
887    }
888
889    let mut builder = LayeredFsBuilder::new(root.fs);
890    if features.container {
891        // /container/component will be a tmpfs where component using the starnix kernel will have their
892        // package mounted.
893        let component_tmpfs_options = FileSystemOptions {
894            params: kernel
895                .features
896                .ns_mount_options("#component_tmpfs")
897                .context("#component_tmpfs options")?,
898            ..Default::default()
899        };
900        let component_tmpfs = TmpFs::new_fs_with_options(locked, kernel, component_tmpfs_options)?;
901
902        // /container will mount the container pkg
903        let container_remotefs_options = FileSystemOptions {
904            source: "data".into(),
905            params: kernel.features.ns_mount_options("#container").context("#container options")?,
906            ..Default::default()
907        };
908        let container_remotefs = new_remotefs_in_root(
909            locked,
910            kernel,
911            pkg_dir_proxy,
912            container_remotefs_options,
913            fio::PERM_READABLE | fio::PERM_EXECUTABLE,
914        )?;
915
916        builder.add("/container", container_remotefs);
917        builder.add("/container/component", component_tmpfs);
918    }
919    if features.custom_artifacts {
920        let mount_options = FileSystemOptions {
921            params: kernel
922                .features
923                .ns_mount_options("#custom_artifacts")
924                .context("#custom_artifacts options")?,
925            ..Default::default()
926        };
927        let fs = TmpFs::new_fs_with_options(locked, kernel, mount_options)?;
928        builder.add("/custom_artifacts", fs);
929    }
930    if features.test_data {
931        let mount_options = FileSystemOptions {
932            params: kernel.features.ns_mount_options("#test_data").context("#test_data options")?,
933            ..Default::default()
934        };
935        let fs = TmpFs::new_fs_with_options(locked, kernel, mount_options)?;
936        builder.add("/test_data", fs);
937    }
938
939    let (mut root_fs, feature_mounts) = builder.build(locked, kernel);
940    if features.rootfs_rw {
941        root_fs = OverlayStack::wrap_fs_in_writable_layer(locked, kernel, root_fs)?;
942    }
943
944    Ok((FsContext::new(Namespace::new_with_flags(root_fs, root.flags)), feature_mounts))
945}
946
947fn parse_rlimits(rlimits: &[String]) -> Result<Vec<(Resource, u64)>, Error> {
948    let mut res = Vec::new();
949
950    for rlimit in rlimits {
951        let (key, value) =
952            rlimit.split_once('=').ok_or_else(|| anyhow!("Invalid rlimit: {rlimit}"))?;
953        let value = value.parse::<u64>()?;
954        let kv = match key {
955            "RLIMIT_NOFILE" => (Resource::NOFILE, value),
956            "RLIMIT_RTPRIO" => (Resource::RTPRIO, value),
957            _ => bail!("Unknown rlimit: {key}"),
958        };
959        res.push(kv);
960    }
961
962    Ok(res)
963}
964
965fn mount_filesystems(
966    locked: &mut Locked<Unlocked>,
967    system_task: &CurrentTask,
968    start_info: &ContainerStartInfo,
969    pkg_dir_proxy: &fio::DirectorySynchronousProxy,
970) -> Result<(), Error> {
971    // Skip the first mount, that was used to create the root filesystem.
972    let mut mounts_iter =
973        start_info.program.mounts.iter().chain(start_info.config.additional_mounts.iter());
974    let _ = mounts_iter.next();
975    for mount_spec in mounts_iter {
976        let action = MountAction::from_spec(locked, system_task, pkg_dir_proxy, mount_spec)
977            .with_source_context(|| format!("creating filesystem from spec: {}", mount_spec))?;
978        let mount_point = system_task
979            .lookup_path_from_root(locked, action.path.as_ref())
980            .with_source_context(|| format!("lookup path from root: {}", action.path))?;
981        mount_point.mount(WhatToMount::Fs(action.fs), action.flags)?;
982    }
983    Ok(())
984}
985
986fn init_remote_block_devices(
987    locked: &mut Locked<Unlocked>,
988    system_task: &CurrentTask,
989) -> Result<(), Error> {
990    remote_block_device_init(locked, system_task);
991    let entries = match std::fs::read_dir("/block") {
992        Ok(entries) => entries,
993        Err(e) => {
994            log_warn!("Failed to read block directory: {}", e);
995            return Ok(());
996        }
997    };
998    for entry in entries {
999        let entry = entry?;
1000        let path_buf = entry.path();
1001        let path = path_buf.to_str().ok_or_else(|| anyhow!("Invalid block device path"))?;
1002        let (client_end, server_end) = fidl::endpoints::create_endpoints();
1003        match fdio::service_connect(
1004            &format!("{}/fuchsia.storage.block.Block", path),
1005            server_end.into(),
1006        ) {
1007            Ok(()) => (),
1008            Err(e) => {
1009                log_warn!("Failed to connect to block device at {}: {}", path, e);
1010                continue;
1011            }
1012        }
1013        system_task.kernel().remote_block_device_registry.create_remote_block_device(
1014            locked,
1015            system_task,
1016            entry.file_name().to_str().unwrap(),
1017            client_end,
1018        )?;
1019    }
1020    Ok(())
1021}
1022
1023async fn wait_for_init_file(
1024    startup_file_path: &str,
1025    current_task: &CurrentTask,
1026    init_tid: tid_t,
1027) -> Result<(), Error> {
1028    // TODO(https://fxbug.dev/42178400): Use inotify machinery to wait for the file.
1029    loop {
1030        fasync::Timer::new(fasync::MonotonicDuration::from_millis(100).after_now()).await;
1031
1032        let creds = security::creds_start_internal_operation(current_task);
1033        if let Some(result) = current_task.override_creds(creds, || {
1034            let root = current_task.fs().root();
1035            let mut context = LookupContext::default();
1036
1037            match current_task.lookup_path(
1038                current_task.kernel().kthreads.unlocked_for_async().deref_mut(),
1039                &mut context,
1040                root,
1041                startup_file_path.into(),
1042            ) {
1043                Ok(_) => return Some(Ok(())),
1044                Err(error) if error == ENOENT => {}
1045                Err(error) => return Some(Err(anyhow::Error::from(error))),
1046            };
1047
1048            if current_task.get_task(init_tid).is_err() {
1049                return Some(Err(anyhow!(
1050                    "Init task terminated before startup_file_path was ready"
1051                )));
1052            }
1053
1054            None
1055        }) {
1056            return result;
1057        }
1058    }
1059}
1060
1061async fn serve_runtime_dir(runtime_dir: ServerEnd<fio::DirectoryMarker>) {
1062    let mut fs = fuchsia_component::server::ServiceFs::new();
1063    match create_job_id_vmo() {
1064        Ok(vmo) => {
1065            fs.dir("elf").add_vmo_file_at("job_id", vmo);
1066        }
1067        Err(e) => log_error!(e:%; "failed to create vmo with job id for debuggers"),
1068    }
1069    match fs.serve_connection(runtime_dir) {
1070        Ok(_) => {
1071            fs.add_fidl_service(|job_requests: TaskProviderRequestStream| {
1072                fuchsia_async::Task::local(async move {
1073                    if let Err(e) = serve_task_provider(job_requests).await {
1074                        log_warn!(e:?; "Error serving TaskProvider");
1075                    }
1076                })
1077                .detach();
1078            });
1079            fs.collect::<()>().await;
1080        }
1081        Err(e) => log_error!("Couldn't serve runtime directory: {e:?}"),
1082    }
1083}
1084
1085fn create_job_id_vmo() -> Result<zx::Vmo, Error> {
1086    let job_id = fuchsia_runtime::job_default().koid().context("reading own job koid")?;
1087    let job_id_str = job_id.raw_koid().to_string();
1088    let job_id_vmo = zx::Vmo::create(job_id_str.len() as u64).context("creating job id vmo")?;
1089    job_id_vmo.write(job_id_str.as_bytes(), 0).context("write job id to vmo")?;
1090    Ok(job_id_vmo)
1091}
1092
1093async fn serve_task_provider(mut job_requests: TaskProviderRequestStream) -> Result<(), Error> {
1094    while let Some(request) = job_requests.next().await {
1095        match request.context("getting next TaskProvider request")? {
1096            TaskProviderRequest::GetJob { responder } => {
1097                responder
1098                    .send(
1099                        fuchsia_runtime::job_default()
1100                            .duplicate_handle(zx::Rights::SAME_RIGHTS)
1101                            .map_err(|s| s.into_raw()),
1102                    )
1103                    .context("sending job for runtime dir")?;
1104            }
1105            unknown => bail!("Unknown TaskProvider method {unknown:?}"),
1106        }
1107    }
1108    Ok(())
1109}
1110
#[cfg(test)]
mod test {
    use super::wait_for_init_file;
    use fuchsia_async as fasync;
    use futures::{SinkExt, StreamExt};
    #[allow(deprecated, reason = "pre-existing usage")]
    use starnix_core::testing::create_kernel_task_and_unlocked;
    use starnix_core::vfs::FdNumber;
    use starnix_uapi::CLONE_FS;
    use starnix_uapi::file_mode::{AccessCheck, FileMode};
    use starnix_uapi::open_flags::OpenFlags;
    use starnix_uapi::signals::SIGCHLD;
    use starnix_uapi::vfs::ResolveFlags;

    /// The wait completes successfully when the startup file exists before polling begins.
    #[fuchsia::test]
    async fn test_init_file_already_exists() {
        const STARTUP_FILE_PATH: &str = "/path";

        #[allow(deprecated, reason = "pre-existing usage")]
        let (_kernel, current_task, locked) = create_kernel_task_and_unlocked();
        let (mut progress_tx, mut progress_rx) = futures::channel::mpsc::unbounded();

        // Create the startup file up front.
        current_task
            .open_file_at(
                locked,
                FdNumber::AT_FDCWD,
                STARTUP_FILE_PATH.into(),
                OpenFlags::CREAT,
                FileMode::default(),
                ResolveFlags::empty(),
                AccessCheck::default(),
            )
            .expect("Failed to create file");

        fasync::Task::local(async move {
            wait_for_init_file(STARTUP_FILE_PATH, &current_task, current_task.get_tid())
                .await
                .expect("failed to wait for file");
            progress_tx.send(()).await.expect("failed to send message");
        })
        .detach();

        // The background task reports once the existing file has been detected.
        assert_eq!(progress_rx.next().await, Some(()));
    }

    /// The wait keeps polling and completes once the startup file is created later on.
    #[fuchsia::test]
    async fn test_init_file_wait_required() {
        const STARTUP_FILE_PATH: &str = "/path";

        #[allow(deprecated, reason = "pre-existing usage")]
        let (_kernel, current_task, locked) = create_kernel_task_and_unlocked();
        let (mut progress_tx, mut progress_rx) = futures::channel::mpsc::unbounded();

        let init_task = current_task.clone_task_for_test(locked, CLONE_FS as u64, Some(SIGCHLD));

        let test_init_tid = current_task.get_tid();
        fasync::Task::local(async move {
            progress_tx.send(()).await.expect("failed to send message");
            wait_for_init_file(STARTUP_FILE_PATH, &init_task, test_init_tid)
                .await
                .expect("failed to wait for file");
            progress_tx.send(()).await.expect("failed to send message");
        })
        .detach();

        // First message: the background task is about to start checking for the file.
        assert_eq!(progress_rx.next().await, Some(()));

        // Now create the file that is being waited on.
        current_task
            .open_file_at(
                locked,
                FdNumber::AT_FDCWD,
                STARTUP_FILE_PATH.into(),
                OpenFlags::CREAT,
                FileMode::default(),
                ResolveFlags::empty(),
                AccessCheck::default(),
            )
            .expect("Failed to create file");

        // Second message: the newly created file has been detected.
        assert_eq!(progress_rx.next().await, Some(()));
    }

    /// The wait fails when the init task goes away before the startup file appears.
    #[fuchsia::test]
    async fn test_init_exits_before_file_exists() {
        const STARTUP_FILE_PATH: &str = "/path";

        #[allow(deprecated, reason = "pre-existing usage")]
        let (_kernel, current_task, locked) = create_kernel_task_and_unlocked();
        let (mut progress_tx, mut progress_rx) = futures::channel::mpsc::unbounded();

        let init_task = current_task.clone_task_for_test(locked, CLONE_FS as u64, Some(SIGCHLD));

        let test_init_tid = init_task.get_tid();
        fasync::Task::local(async move {
            progress_tx.send(()).await.expect("failed to send message");
            wait_for_init_file(STARTUP_FILE_PATH, &current_task, test_init_tid)
                .await
                .expect_err("Did not detect init exit");
            progress_tx.send(()).await.expect("failed to send message");
        })
        .detach();

        // First message: the background task is about to start checking for the file.
        assert_eq!(progress_rx.next().await, Some(()));

        // Drop the `init_task` without ever creating the startup file.
        std::mem::drop(init_task);

        // Second message: the wait returned an error because init is gone.
        assert_eq!(progress_rx.next().await, Some(()));
    }
}