1use crate::{
6 Features, MountAction, expose_root, parse_features, parse_numbered_handles,
7 run_container_features, serve_component_runner, serve_container_controller,
8 serve_graphical_presenter, serve_lutex_controller,
9};
10use anyhow::{Context, Error, anyhow, bail};
11use bootreason::get_or_init_android_bootreason;
12use bstr::{BString, ByteSlice};
13use devicetree::parser::parse_devicetree;
14use devicetree::types::Devicetree;
15use fidl::endpoints::{ControlHandle, RequestStream, ServerEnd};
16use fidl_fuchsia_component_runner::{TaskProviderRequest, TaskProviderRequestStream};
17use fidl_fuchsia_feedback::CrashReporterMarker;
18use fidl_fuchsia_time_external::AdjustMarker;
19use fuchsia_async::DurationExt;
20use fuchsia_component::client::{connect_to_protocol, connect_to_protocol_sync};
21use fuchsia_component::server::ServiceFs;
22use futures::channel::oneshot;
23use futures::{FutureExt, StreamExt, TryStreamExt};
24use serde::Deserialize;
25use starnix_container_structured_config::Config as ContainerStructuredConfig;
26use starnix_core::device::remote_block_device::remote_block_device_init;
27use starnix_core::execution::{
28 create_init_process, create_system_task, execute_task_with_prerun_result,
29};
30use starnix_core::fs::fuchsia::new_remotefs_in_root;
31use starnix_core::fs::tmpfs::TmpFs;
32use starnix_core::security;
33use starnix_core::task::container_namespace::ContainerNamespace;
34use starnix_core::task::{
35 CurrentTask, ExitStatus, Kernel, RoleOverrides, SchedulerManager, parse_cmdline,
36};
37use starnix_core::vfs::{FileSystemOptions, FsContext, LookupContext, Namespace, WhatToMount};
38use starnix_logging::{
39 CATEGORY_STARNIX, NAME_CREATE_CONTAINER, log_debug, log_error, log_info, log_warn,
40 trace_duration,
41};
42use starnix_modules::{init_common_devices, register_common_file_systems};
43use starnix_modules_layeredfs::LayeredFs;
44use starnix_modules_magma::get_magma_params;
45use starnix_modules_overlayfs::OverlayStack;
46use starnix_modules_rtc::rtc_device_init;
47use starnix_sync::{Locked, Unlocked};
48use starnix_task_command::TaskCommand;
49use starnix_uapi::errors::{ENOENT, SourceContext};
50use starnix_uapi::open_flags::OpenFlags;
51use starnix_uapi::resource_limits::Resource;
52use starnix_uapi::{errno, tid_t};
53use std::collections::BTreeMap;
54use std::ffi::CString;
55use std::ops::DerefMut;
56use std::sync::Arc;
57use zx::Task as _;
58use {
59 fidl_fuchsia_boot as fboot, fidl_fuchsia_component as fcomponent,
60 fidl_fuchsia_component_runner as frunner, fidl_fuchsia_element as felement,
61 fidl_fuchsia_io as fio, fidl_fuchsia_mem as fmem,
62 fidl_fuchsia_memory_attribution as fattribution, fidl_fuchsia_starnix_binder as fbinder,
63 fidl_fuchsia_starnix_container as fstarcontainer, fuchsia_async as fasync,
64 fuchsia_inspect as inspect, fuchsia_runtime as fruntime, fuchsia_zbi as zbi,
65};
66
67use std::sync::Weak;
68
69use crate::serve_memory_attribution_provider_container;
70use attribution_server::{AttributionServer, AttributionServerHandle};
71use fidl::HandleBased;
72
/// Serves memory-attribution information for a container through an
/// `AttributionServer`, so a memory monitor can learn what the kernel and
/// the container own.
struct ContainerMemoryAttributionManager {
    // Handle used both to register observers and to publish updates.
    memory_attribution_server: AttributionServerHandle,
}
78
79impl ContainerMemoryAttributionManager {
80 pub fn new(kernel: Weak<Kernel>, component_instance: zx::Event) -> Self {
83 let memory_attribution_server = AttributionServer::new(Box::new(move || {
84 let kernel_ref = match kernel.upgrade() {
85 None => return vec![],
86 Some(k) => k,
87 };
88 attribution_info_for_kernel(kernel_ref.as_ref(), &component_instance)
89 }));
90
91 ContainerMemoryAttributionManager { memory_attribution_server }
92 }
93
94 pub fn new_observer(
96 &self,
97 control_handle: fattribution::ProviderControlHandle,
98 ) -> attribution_server::Observer {
99 self.memory_attribution_server.new_observer(control_handle)
100 }
101}
102
/// Builds the initial set of attribution updates describing the Starnix
/// kernel and the container it runs.
///
/// Two principals are reported:
/// * id 1 — the kernel, attributed every mapping of this process
///   (base 0, maximal length), and
/// * id 2 — the container, attributed the default job's kernel object,
///   with detailed attribution served over a dedicated Provider channel.
fn attribution_info_for_kernel(
    kernel: &Kernel,
    component_instance: &zx::Event,
) -> Vec<fattribution::AttributionUpdate> {
    // Spawn a detached server for the container's detailed (per-task)
    // attribution; the matching client end is embedded in the principal below.
    let (client_end, server_end) =
        fidl::endpoints::create_request_stream::<fattribution::ProviderMarker>();
    fuchsia_async::Task::spawn(serve_memory_attribution_provider_container(server_end, kernel))
        .detach();

    // Principal 1: the Starnix kernel component itself.
    let starnix_kernel_id = Some(1);
    let starnix_kernel_principal = fattribution::NewPrincipal {
        identifier: starnix_kernel_id,
        description: Some(fattribution::Description::Part("starnix_kernel".to_string())),
        principal_type: Some(fattribution::PrincipalType::Part),
        detailed_attribution: None,
        ..Default::default()
    };

    // Attribute this process's entire address range to the kernel principal.
    let starnix_kernel_attribution = fattribution::UpdatedPrincipal {
        identifier: starnix_kernel_id, resources: Some(fattribution::Resources::Data(fattribution::Data {
            resources: vec![fattribution::Resource::ProcessMapped(fattribution::ProcessMapped {
                process: fuchsia_runtime::process_self().koid().unwrap().raw_koid(),
                base: 0, len: u64::max_value(),
                hint_skip_handle_table: false,
            })],
        })),
        ..Default::default()
    };

    // Principal 2: the container, identified by its component instance token.
    let container_id = Some(2);
    let new_principal = fattribution::NewPrincipal {
        identifier: container_id,
        description: Some(fattribution::Description::Component(
            component_instance.duplicate_handle(zx::Rights::SAME_RIGHTS).unwrap(),
        )),
        principal_type: Some(fattribution::PrincipalType::Runnable),
        detailed_attribution: Some(client_end),
        ..Default::default()
    };
    // The container owns the default job's kernel object.
    let attribution = fattribution::UpdatedPrincipal {
        identifier: container_id,
        resources: Some(fattribution::Resources::Data(fattribution::Data {
            resources: vec![fattribution::Resource::KernelObject(
                fuchsia_runtime::job_default().koid().unwrap().raw_koid(),
            )],
        })),
        ..Default::default()
    };

    vec![
        fattribution::AttributionUpdate::Add(new_principal),
        fattribution::AttributionUpdate::Add(starnix_kernel_principal),
        fattribution::AttributionUpdate::Update(attribution),
        fattribution::AttributionUpdate::Update(starnix_kernel_attribution),
    ]
}
170
/// Decoded `ComponentStartInfo` for a container component, holding everything
/// needed to create and then serve the container.
#[derive(Debug)]
pub struct ContainerStartInfo {
    /// Parsed `program` block from the component manifest.
    pub program: ContainerProgram,

    /// Structured configuration delivered with the component.
    pub config: ContainerStructuredConfig,

    /// Server end of the component's outgoing directory, if provided.
    outgoing_dir: Option<zx::Channel>,

    /// The component's incoming namespace entries.
    pub container_namespace: ContainerNamespace,

    /// Server end of the component's runtime directory, if provided.
    runtime_dir: Option<ServerEnd<fio::DirectoryMarker>>,

    /// Eventpair a debugger closes to signal that init may be spawned.
    break_on_start: Option<zx::EventPair>,

    /// Token identifying this component instance; consumed when wiring up
    /// memory attribution.
    component_instance: Option<zx::Event>,
}
197
/// Error context attached when the container's structured-config data is
/// missing from the start info.
const MISSING_CONFIG_VMO_CONTEXT: &str = concat!(
    "Retrieving container config VMO. ",
    "If this fails, make sure your container CML includes ",
    "//src/starnix/containers/container.shard.cml.",
);
203
impl ContainerStartInfo {
    /// Parses a raw `ComponentStartInfo` into a [`ContainerStartInfo`].
    ///
    /// Fails if the program block, structured config, or namespace is
    /// missing or malformed.
    fn new(mut start_info: frunner::ComponentStartInfo) -> Result<Self, Error> {
        let program = start_info.program.as_ref().context("retrieving program block")?;
        let program: ContainerProgram =
            runner::serde::deserialize_program(&program).context("parsing program block")?;

        // The config may arrive inline as bytes or in a VMO-backed buffer.
        let encoded_config =
            start_info.encoded_config.as_ref().context(MISSING_CONFIG_VMO_CONTEXT)?;
        let config = match encoded_config {
            fmem::Data::Bytes(b) => ContainerStructuredConfig::from_bytes(b),
            fmem::Data::Buffer(b) => ContainerStructuredConfig::from_vmo(&b.vmo),
            other => anyhow::bail!("unknown Data variant {other:?}"),
        }
        .context("parsing container structured config")?;

        let ns = start_info.ns.take().context("retrieving container namespace")?;
        let container_namespace = ContainerNamespace::from(ns);

        let outgoing_dir = start_info.outgoing_dir.take().map(|dir| dir.into_channel());
        let component_instance = start_info.component_instance;

        Ok(Self {
            program,
            config,
            outgoing_dir,
            container_namespace,
            component_instance,
            break_on_start: start_info.break_on_start,
            runtime_dir: start_info.runtime_dir,
        })
    }
}
236
/// The `program` block of a container component manifest, as deserialized by
/// the runner. Unknown fields are rejected.
#[derive(Debug, Default, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ContainerProgram {
    /// Name of the container, used in diagnostics and error contexts.
    name: String,

    /// argv of the container's init process; when empty, a default init
    /// path is used instead.
    init: Vec<String>,

    /// Base kernel command line; devicetree/feature arguments are appended.
    #[serde(default)]
    kernel_cmdline: String,

    /// Mount specs; the first entry must target `/` (the root mount).
    #[serde(default)]
    mounts: Vec<String>,

    /// Feature strings enabling optional container/kernel functionality.
    #[serde(default)]
    pub features: Vec<String>,

    /// rlimit specs of the form `NAME=VALUE` applied to the init process.
    #[serde(default)]
    rlimits: Vec<String>,

    /// Path whose appearance signals that startup is complete; empty means
    /// don't wait.
    #[serde(default)]
    startup_file_path: String,

    /// Default security label; not referenced in this file — presumably
    /// consumed by feature/security setup elsewhere (confirm at callers).
    #[serde(default)]
    pub default_seclabel: Option<String>,

    /// Default UID; stored as a string in the manifest.
    #[serde(default = "default_uid")]
    pub default_uid: runner::serde::StoreAsString<u32>,

    /// Mount options for namespace-backed mounts; not referenced in this
    /// file — presumably feeds `Features::ns_mount_options` (confirm).
    pub default_ns_mount_options: Option<Vec<String>>,

    /// Overrides mapping (process, thread) name pairs to scheduler roles.
    #[serde(default)]
    task_role_overrides: Vec<TaskSchedulerMapping>,
}
296
/// Maps tasks, matched by process and thread name, to a scheduler role.
#[derive(Default, Deserialize)]
struct TaskSchedulerMapping {
    // Scheduler role name to apply.
    role: String,
    // Process name to match.
    process: String,
    // Thread name to match.
    thread: String,
}
308
309impl std::fmt::Debug for TaskSchedulerMapping {
310 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
311 write!(f, "process `{}` thread `{}` role `{}`", self.process, self.thread, self.role)
312 }
313}
314
/// Serde default for `ContainerProgram::default_uid`.
// NOTE(review): 42 looks like an arbitrary non-root default — confirm the
// intended value against container documentation.
fn default_uid() -> runner::serde::StoreAsString<u32> {
    runner::serde::StoreAsString(42)
}
318
/// Converts a string slice into an owned C string.
///
/// # Panics
///
/// Panics if `str` contains an interior NUL byte, which cannot be
/// represented in a C string.
fn to_cstr(str: &str) -> CString {
    // `CString::new` accepts `&str` directly (via `Into<Vec<u8>>`); the
    // previous intermediate `to_string()` was redundant.
    CString::new(str).expect("string contained an interior NUL byte")
}
323
/// Everything needed to serve an already-created container: its start info,
/// the component-controller channel, and the init-exit notification.
#[must_use = "The container must run serve on this config"]
pub struct ContainerServiceConfig {
    // Start info the container was created from (holds the outgoing dir).
    start_info: ContainerStartInfo,
    // Request stream for the component controller protocol.
    request_stream: frunner::ComponentControllerRequestStream,
    // Resolved when the init process exits.
    receiver: oneshot::Receiver<Result<ExitStatus, Error>>,
}
330
/// A running Starnix container: the kernel plus its supporting services.
pub struct Container {
    /// The kernel executing this container's tasks.
    pub kernel: Arc<Kernel>,

    // Serves memory attribution for this container.
    memory_attribution_manager: ContainerMemoryAttributionManager,

    // Inspect node kept alive for the container's lifetime.
    _node: inspect::Node,

    // `*mut u8` is neither `Send` nor `Sync`, so this marker pins the
    // Container to the thread that created it.
    _thread_bound: std::marker::PhantomData<*mut u8>,
}
344
impl Container {
    /// Returns the kernel's system task, used to run kernel-side operations.
    pub fn system_task(&self) -> &CurrentTask {
        self.kernel.kthreads.system_task()
    }

    /// Serves the container's outgoing directory: the runner, controller,
    /// presenter, and lutex protocols under `svc/`, plus the container's
    /// root filesystem under `fs_root`.
    ///
    /// Returns `Ok(())` immediately when no outgoing directory was provided.
    async fn serve_outgoing_directory(
        &self,
        outgoing_dir: Option<zx::Channel>,
    ) -> Result<(), Error> {
        if let Some(outgoing_dir) = outgoing_dir {
            let mut fs = ServiceFs::new_local();
            fs.dir("svc")
                .add_fidl_service(ExposedServices::ComponentRunner)
                .add_fidl_service(ExposedServices::ContainerController)
                .add_fidl_service(ExposedServices::GraphicalPresenter)
                .add_fidl_service(ExposedServices::LutexController);

            // Bridge the container's root filesystem into the outgoing dir.
            let (fs_root, fs_root_server_end) = fidl::endpoints::create_proxy();
            fs.add_remote("fs_root", fs_root);
            expose_root(
                self.kernel.kthreads.unlocked_for_async().deref_mut(),
                self.system_task(),
                fs_root_server_end,
            )?;

            fs.serve_connection(outgoing_dir.into()).map_err(|_| errno!(EINVAL))?;

            // Dispatch every incoming connection to its protocol server,
            // serving all connections concurrently.
            fs.for_each_concurrent(None, |request_stream| async {
                match request_stream {
                    ExposedServices::ComponentRunner(request_stream) => {
                        match serve_component_runner(request_stream, self.system_task()).await {
                            Ok(_) => {}
                            Err(e) => {
                                log_error!("Error serving component runner: {:?}", e);
                            }
                        }
                    }
                    ExposedServices::ContainerController(request_stream) => {
                        serve_container_controller(request_stream, self.system_task())
                            .await
                            .expect("failed to start container.")
                    }
                    ExposedServices::GraphicalPresenter(request_stream) => {
                        serve_graphical_presenter(request_stream, &self.kernel)
                            .await
                            .expect("failed to start GraphicalPresenter.")
                    }
                    ExposedServices::LutexController(request_stream) => {
                        serve_lutex_controller(request_stream, self.system_task())
                            .await
                            .expect("failed to start LutexController.")
                    }
                }
            })
            .await
        }
        Ok(())
    }

    /// Serves the container: the outgoing directory and the component
    /// controller run concurrently; the controller side also observes
    /// init's exit. Returns the outgoing-directory result.
    pub async fn serve(&self, service_config: ContainerServiceConfig) -> Result<(), Error> {
        let (r, _) = futures::join!(
            self.serve_outgoing_directory(service_config.start_info.outgoing_dir),
            server_component_controller(
                self.kernel.clone(),
                service_config.request_stream,
                service_config.receiver
            )
        );
        r
    }

    /// Registers a new memory-attribution observer for this container.
    pub fn new_memory_attribution_observer(
        &self,
        control_handle: fattribution::ProviderControlHandle,
    ) -> attribution_server::Observer {
        self.memory_attribution_manager.new_observer(control_handle)
    }
}
426
/// Protocol connections accepted on the container's outgoing `svc/` dir.
enum ExposedServices {
    ComponentRunner(frunner::ComponentRunnerRequestStream),
    ContainerController(fstarcontainer::ControllerRequestStream),
    GraphicalPresenter(felement::GraphicalPresenterRequestStream),
    LutexController(fbinder::LutexControllerRequestStream),
}
434
/// Outcome of the container's init task.
type TaskResult = Result<ExitStatus, Error>;

/// Serves `fuchsia.component.runner/ComponentController` for the container
/// while also observing init's exit via `task_complete`.
///
/// NOTE(review): kernel shutdown is triggered after *any* received event,
/// including unknown methods and stream errors — confirm that is intended.
async fn server_component_controller(
    kernel: Arc<Kernel>,
    request_stream: frunner::ComponentControllerRequestStream,
    task_complete: oneshot::Receiver<TaskResult>,
) {
    // Publish the control handle so other kernel code can signal the
    // component framework.
    *kernel.container_control_handle.lock() = Some(request_stream.control_handle());

    // Merge controller requests and the init-exit notification into a
    // single stream of events.
    enum Event<T, U> {
        Controller(T),
        Completion(U),
    }

    let mut stream = futures::stream::select(
        request_stream.map(Event::Controller),
        task_complete.into_stream().map(Event::Completion),
    );

    while let Some(event) = stream.next().await {
        match event {
            Event::Controller(Ok(frunner::ComponentControllerRequest::Stop { .. })) => {
                log_info!("Stopping the container.");
            }
            Event::Controller(Ok(frunner::ComponentControllerRequest::Kill { control_handle })) => {
                // Report InstanceDied as the epitaph, then kill the job.
                log_info!("Killing the container's job.");
                control_handle.shutdown_with_epitaph(zx::Status::from_raw(
                    fcomponent::Error::InstanceDied.into_primitive() as i32,
                ));
                fruntime::job_default().kill().expect("Failed to kill job");
            }
            Event::Controller(Ok(frunner::ComponentControllerRequest::_UnknownMethod {
                ordinal,
                method_type,
                ..
            })) => {
                log_error!(ordinal, method_type:?; "Unknown component controller request received.");
            }
            Event::Controller(Err(e)) => {
                log_warn!(e:?; "Container component controller channel encountered an error.");
            }
            Event::Completion(result) => {
                log_info!(result:?; "init process exited.");
            }
        }

        // Begin kernel shutdown in response to the event (guarded so it
        // only starts once).
        if !kernel.is_shutting_down() {
            kernel.shut_down();
        }
    }

    log_debug!("done listening for container-terminating events");

    // Also shut down if the stream ended without delivering any event.
    if !kernel.is_shutting_down() {
        kernel.shut_down();
    }
}
494
/// Waits for the first `Start` request on the `ComponentRunner` stream and
/// builds the container it describes.
///
/// `kernel_extra_features` supplements the features declared by the
/// container's own program block. Errors if the stream closes before a
/// `Start` request is seen.
pub async fn create_component_from_stream(
    mut request_stream: frunner::ComponentRunnerRequestStream,
    kernel_extra_features: Vec<String>,
) -> Result<(Container, ContainerServiceConfig), Error> {
    if let Some(event) = request_stream.try_next().await? {
        match event {
            frunner::ComponentRunnerRequest::Start { start_info, controller, .. } => {
                let request_stream = controller.into_stream();
                let mut start_info = ContainerStartInfo::new(start_info)?;
                // `sender` fires when init exits; `receiver` is handed to
                // the service config for the controller loop to observe.
                let (sender, receiver) = oneshot::channel::<TaskResult>();
                let container = create_container(&mut start_info, &kernel_extra_features, sender)
                    .await
                    .with_source_context(|| {
                        format!("creating container \"{}\"", start_info.program.name)
                    })?;
                let service_config =
                    ContainerServiceConfig { start_info, request_stream, receiver };
                return Ok((container, service_config));
            }
            frunner::ComponentRunnerRequest::_UnknownMethod { ordinal, .. } => {
                log_warn!("Unknown ComponentRunner request: {ordinal}");
            }
        }
    }
    bail!("did not receive Start request");
}
521
/// Extracts the `bootargs` property from the devicetree's `/chosen` node.
///
/// A trailing NUL byte is stripped before UTF-8 validation. Errors if the
/// node/property is missing or the value is not valid UTF-8.
async fn get_bootargs(device_tree: &Devicetree) -> Result<String, Error> {
    device_tree
        .root_node
        .find("chosen")
        .and_then(|n| {
            n.get_property("bootargs").map(|p| {
                // Drop the C-string terminator, if present.
                let end =
                    if p.value.last() == Some(&0) { p.value.len() - 1 } else { p.value.len() };
                match std::str::from_utf8(&p.value[..end]) {
                    Ok(s) => Ok(s.to_owned()),
                    Err(e) => {
                        log_warn!("Bootargs are not valid UTF-8: {e}");
                        Err(anyhow!("Bootargs are not valid UTF-8"))
                    }
                }
            })
        })
        .context("Couldn't find bootargs")?
}
541
/// Reads the devicetree ZBI item's raw bytes from `fuchsia.boot.Items`.
///
/// If multiple devicetree items are returned, the last one is used.
async fn get_bootitems() -> Result<std::vec::Vec<u8>, Error> {
    let items =
        connect_to_protocol::<fboot::ItemsMarker>().context("Failed to connect to boot items")?;

    let items_response = items
        .get2(zbi::ZbiType::DeviceTree.into_raw(), None)
        .await
        .context("FIDL: Failed to get devicetree item")?
        .map_err(|e| anyhow!("Failed to get devicetree item {:?}", e))?;

    let Some(item) = items_response.last() else {
        return Err(anyhow!("Failed to get items"));
    };

    // Copy the devicetree blob out of the item's VMO.
    let devicetree_vmo = &item.payload;
    let bytes = devicetree_vmo
        .read_to_vec(0, item.length as u64)
        .context("Failed to read devicetree vmo")?;

    Ok(bytes)
}
563
/// Creates a [`Container`]: builds the kernel, mounts filesystems, runs
/// features, spawns the init process, and (optionally) waits for a startup
/// file before returning.
///
/// `task_complete` is resolved with init's exit status when it terminates.
/// The steps below are order-dependent: the fs context needs the kernel,
/// the system task needs the fs context, and init needs all of them.
async fn create_container(
    start_info: &mut ContainerStartInfo,
    kernel_extra_features: &[String],
    task_complete: oneshot::Sender<TaskResult>,
) -> Result<Container, Error> {
    trace_duration!(CATEGORY_STARNIX, NAME_CREATE_CONTAINER);
    // Fallback init path used when the program block supplies none.
    const DEFAULT_INIT: &str = "/container/init";

    let pkg_channel = start_info.container_namespace.get_namespace_channel("/pkg").unwrap();
    let pkg_dir_proxy = fio::DirectorySynchronousProxy::new(pkg_channel);

    // The devicetree is optional; failure only disables devicetree-derived
    // behavior (e.g. bootargs forwarding below).
    let device_tree: Option<Devicetree> = match get_bootitems().await {
        Ok(items) => match parse_devicetree(&items) {
            Ok(device_tree) => Some(device_tree),
            Err(e) => {
                log_warn!("Failed to parse devicetree: {e:?}");
                None
            }
        },
        Err(e) => {
            log_warn!("Failed to get boot items for devicetree: {e:?}");
            None
        }
    };
    let mut features = parse_features(&start_info, kernel_extra_features)?;

    log_debug!("Creating container with {:#?}", features);
    let mut kernel_cmdline = BString::from(start_info.program.kernel_cmdline.as_bytes());
    let mut android_provided_bootreason = None;

    // Forward devicetree bootargs onto the kernel cmdline, filtering out
    // arguments that are handled specially.
    if features.android_serialno {
        if let Some(device_tree) = &device_tree {
            match get_bootargs(device_tree).await {
                Ok(args) => {
                    for item in parse_cmdline(&args) {
                        if item.starts_with("androidboot.force_normal_boot") {
                            continue;
                        }
                        // Capture (rather than forward) the bootreason; it
                        // is re-emitted after normalization below.
                        if item.starts_with("androidboot.bootreason") && features.android_bootreason
                        {
                            log_info!("Original devicetree bootarg {:?}", item);
                            if let Some((_, v)) = item.split_once('=') {
                                android_provided_bootreason = Some(v.to_string());
                            }
                            continue;
                        }
                        kernel_cmdline.extend(b" ");
                        kernel_cmdline.extend(item.bytes());
                    }
                }
                Err(err) => log_warn!("could not get bootargs: {err:?}"),
            }
        } else {
            log_warn!("No devicetree available to get bootargs for android.serialno");
        }
    }
    if features.android_bootreason {
        kernel_cmdline.extend(b" androidboot.bootreason=");

        // /tmp_lifecycle is optional; when absent, the bootreason helper is
        // called without a directory (behavior defined by the helper).
        let tmp_channel = start_info.container_namespace.get_namespace_channel("/tmp_lifecycle");
        let tmp_proxy = match tmp_channel {
            Ok(channel) => {
                Some(fio::DirectoryProxy::new(fidl::AsyncChannel::from_channel(channel)))
            }
            _ => None,
        };

        match get_or_init_android_bootreason(tmp_proxy, android_provided_bootreason).await {
            Ok(reason) => {
                kernel_cmdline.extend(reason.bytes());
            }
            Err(err) => {
                log_warn!("could not get android bootreason: {err:?}. falling back to 'unknown'");
                kernel_cmdline.extend(b"unknown");
            }
        }
    }
    if let Some(supported_vendors) = &features.magma_supported_vendors {
        kernel_cmdline.extend(b" ");
        let params = get_magma_params(supported_vendors);
        kernel_cmdline.extend(&*params);
    }

    // Build scheduler role overrides from the program block.
    let mut task_mappings = RoleOverrides::new();
    for m in &start_info.program.task_role_overrides {
        task_mappings.add(m.process.clone(), m.thread.clone(), m.role.clone());
    }
    let task_mappings = task_mappings.build().context("adding custom task role")?;
    let scheduler_manager = SchedulerManager::new(task_mappings);

    let crash_reporter = connect_to_protocol::<CrashReporterMarker>().unwrap();

    // Inspect hierarchy: container -> kernel, recording creation time,
    // enabled features, and (below) the final cmdline.
    let node = inspect::component::inspector().root().create_child("container");
    let kernel_node = node.create_child("kernel");
    kernel_node.record_int("created_at", zx::MonotonicInstant::get().into_nanos());
    features.record_inspect(&kernel_node);

    let security_state = security::kernel_init_security(
        features.selinux.enabled,
        features.selinux.options.clone(),
        features.selinux.exceptions.clone(),
        &kernel_node,
    );

    // UTC adjustment is only wired up when the feature enables it; a failed
    // connection downgrades to None with an error log.
    let time_adjustment_proxy = if features.enable_utc_time_adjustment {
        connect_to_protocol_sync::<AdjustMarker>()
            .map_err(|e| log_error!("could not connect to fuchsia.time.external/Adjust: {:?}", e))
            .ok()
    } else {
        log_info!("UTC adjustment is forbidden.");
        None
    };

    log_info!("final kernel cmdline: {kernel_cmdline:?}");
    kernel_node.record_string("cmdline", kernel_cmdline.to_str_lossy());

    let kernel = Kernel::new(
        kernel_cmdline,
        features.kernel.clone(),
        std::mem::take(&mut features.system_limits),
        start_info.container_namespace.try_clone()?,
        scheduler_manager,
        Some(crash_reporter),
        kernel_node,
        security_state,
        time_adjustment_proxy,
        device_tree,
    )
    .with_source_context(|| format!("creating Kernel: {}", &start_info.program.name))?;
    let fs_context = create_fs_context(
        kernel.kthreads.unlocked_for_async().deref_mut(),
        &kernel,
        &features,
        start_info,
        &pkg_dir_proxy,
    )
    .source_context("creating FsContext")?;
    // Reserve pid 1 for init before any other task is created.
    let init_pid = kernel.pids.write().allocate_pid();
    debug_assert_eq!(init_pid, 1);

    let system_task = create_system_task(
        kernel.kthreads.unlocked_for_async().deref_mut(),
        &kernel,
        Arc::clone(&fs_context),
    )
    .source_context("create system task")?;
    // The system task takes the next id after init's reserved pid.
    debug_assert_eq!(system_task.tid, 2);

    kernel.kthreads.init(system_task).source_context("initializing kthreads")?;
    let system_task = kernel.kthreads.system_task();

    kernel.syslog.init(&system_task).source_context("initializing syslog")?;

    kernel.hrtimer_manager.init(system_task).source_context("initializing HrTimer manager")?;

    // Suspend/resume support is best-effort; failure is logged, not fatal.
    log_info!("Initializing suspend resume manager.");
    if let Err(e) = kernel.suspend_resume_manager.init(&system_task) {
        log_warn!("Suspend/Resume manager initialization failed: ({e:?})");
    }

    log_info!("Initializing RTC device.");
    rtc_device_init(kernel.kthreads.unlocked_for_async().deref_mut(), &system_task)
        .context("in starnix_kernel_runner, while initializing RTC")?;

    log_info!("Registering devices and filesystems.");
    init_common_devices(kernel.kthreads.unlocked_for_async().deref_mut(), &kernel)?;
    register_common_file_systems(kernel.kthreads.unlocked_for_async().deref_mut(), &kernel);

    log_info!("Mounting filesystems.");
    mount_filesystems(
        kernel.kthreads.unlocked_for_async().deref_mut(),
        &system_task,
        start_info,
        &pkg_dir_proxy,
    )
    .source_context("mounting filesystems")?;

    {
        log_info!("Running container features.");
        run_container_features(
            kernel.kthreads.unlocked_for_async().deref_mut(),
            &system_task,
            &features,
        )?;
    }

    log_info!("Initializing remote block devices.");
    init_remote_block_devices(kernel.kthreads.unlocked_for_async().deref_mut(), &system_task)
        .source_context("initalizing remote block devices")?;

    // Resolve init's argv, falling back to the default path when unset.
    let argv = if start_info.program.init.is_empty() {
        vec![DEFAULT_INIT.to_string()]
    } else {
        start_info.program.init.clone()
    }
    .iter()
    .map(|s| to_cstr(s))
    .collect::<Vec<_>>();

    log_info!("Opening start_info file.");
    let executable = system_task
        .open_file(
            kernel.kthreads.unlocked_for_async().deref_mut(),
            argv[0].as_bytes().into(),
            OpenFlags::RDONLY,
        )
        .with_source_context(|| format!("opening init: {:?}", &argv[0]))?;

    let initial_name = if start_info.program.init.is_empty() {
        TaskCommand::default()
    } else {
        TaskCommand::new(start_info.program.init[0].as_bytes())
    };

    let rlimits = parse_rlimits(&start_info.program.rlimits)?;

    log_info!("Starting runtime directory.");
    if let Some(runtime_dir) = start_info.runtime_dir.take() {
        kernel.kthreads.spawn_future(
            move || async move { serve_runtime_dir(runtime_dir).await },
            "serve_runtime_dir",
        );
    }

    // A debugger signals readiness by closing its end of the eventpair.
    if let Some(break_on_start) = start_info.break_on_start.take() {
        log_info!("Waiting for signal from debugger before spawning init process...");
        if let Err(e) =
            fuchsia_async::OnSignals::new(break_on_start, zx::Signals::EVENTPAIR_PEER_CLOSED).await
        {
            log_warn!(e:%; "Received break_on_start eventpair but couldn't wait for PEER_CLOSED.");
        }
    }

    log_info!("Creating init process.");
    let init_task = create_init_process(
        kernel.kthreads.unlocked_for_async().deref_mut(),
        &kernel,
        init_pid,
        initial_name,
        Arc::clone(&fs_context),
        &rlimits,
    )
    .with_source_context(|| format!("creating init task: {:?}", &start_info.program.init))?;

    // Execute init; the completion callback resolves `task_complete` when
    // the process exits.
    execute_task_with_prerun_result(
        kernel.kthreads.unlocked_for_async().deref_mut(),
        init_task,
        move |locked, init_task| {
            parse_numbered_handles(locked, init_task, None, &init_task.live().files).expect("");
            init_task.exec(locked, executable, argv[0].clone(), argv.clone(), vec![])
        },
        move |result| {
            log_info!("Finished running init process: {:?}", result);
            let _ = task_complete.send(result);
        },
        None,
    )?;

    // Optionally block until the container signals readiness via a file.
    if !start_info.program.startup_file_path.is_empty() {
        wait_for_init_file(&start_info.program.startup_file_path, &system_task, init_pid).await?;
    };

    let memory_attribution_manager = ContainerMemoryAttributionManager::new(
        Arc::downgrade(&kernel),
        start_info.component_instance.take().ok_or_else(|| Error::msg("No component instance"))?,
    );

    Ok(Container {
        kernel,
        memory_attribution_manager,
        _node: node,
        _thread_bound: Default::default(),
    })
}
860
/// Builds the container's root [`FsContext`] from the first entry in the
/// mounts list, layering in feature-dependent filesystems.
///
/// The first mount spec must target `/`; remaining specs are handled later
/// by `mount_filesystems`. Depending on enabled features, `container`,
/// `custom_artifacts`, and `test_data` entries are layered over the root,
/// and the root may be wrapped in a writable overlay.
fn create_fs_context(
    locked: &mut Locked<Unlocked>,
    kernel: &Kernel,
    features: &Features,
    start_info: &ContainerStartInfo,
    pkg_dir_proxy: &fio::DirectorySynchronousProxy,
) -> Result<Arc<FsContext>, Error> {
    // Program mounts come first, then structured-config additions.
    let mut mounts_iter =
        start_info.program.mounts.iter().chain(start_info.config.additional_mounts.iter());
    let mut root = MountAction::new_for_root(
        locked,
        kernel,
        pkg_dir_proxy,
        mounts_iter.next().ok_or_else(|| anyhow!("Mounts list is empty"))?,
    )?;
    if root.path != "/" {
        anyhow::bail!("First mount in mounts list is not the root");
    }

    // Extra top-level directories layered over the root, keyed by name.
    let mut mappings = vec![];
    if features.container {
        // `container` is remotefs-backed ("data" in the package dir) with a
        // tmpfs layered at its `component` subdirectory.
        let component_tmpfs_options = FileSystemOptions {
            params: kernel
                .features
                .ns_mount_options("#component_tmpfs")
                .context("#component_tmpfs options")?,
            ..Default::default()
        };
        let component_tmpfs = TmpFs::new_fs_with_options(locked, kernel, component_tmpfs_options)?;

        let container_remotefs_options = FileSystemOptions {
            source: "data".into(),
            params: kernel.features.ns_mount_options("#container").context("#container options")?,
            ..Default::default()
        };
        let container_remotefs = new_remotefs_in_root(
            locked,
            kernel,
            pkg_dir_proxy,
            container_remotefs_options,
            fio::PERM_READABLE | fio::PERM_EXECUTABLE,
        )?;

        let container_fs = LayeredFs::new_fs(
            locked,
            kernel,
            container_remotefs,
            BTreeMap::from([("component".into(), component_tmpfs)]),
        );
        mappings.push(("container".into(), container_fs));
    }
    if features.custom_artifacts {
        let mount_options = FileSystemOptions {
            params: kernel
                .features
                .ns_mount_options("#custom_artifacts")
                .context("#custom_artifacts options")?,
            ..Default::default()
        };
        mappings.push((
            "custom_artifacts".into(),
            TmpFs::new_fs_with_options(locked, kernel, mount_options)?,
        ));
    }
    if features.test_data {
        let mount_options = FileSystemOptions {
            params: kernel.features.ns_mount_options("#test_data").context("#test_data options")?,
            ..Default::default()
        };
        mappings
            .push(("test_data".into(), TmpFs::new_fs_with_options(locked, kernel, mount_options)?));
    }

    if !mappings.is_empty() {
        root.fs = LayeredFs::new_fs(locked, kernel, root.fs, mappings.into_iter().collect());
    }
    // rootfs_rw makes the root writable via an overlayfs layer.
    if features.rootfs_rw {
        root.fs = OverlayStack::wrap_fs_in_writable_layer(locked, kernel, root.fs)?;
    }
    Ok(FsContext::new(Namespace::new_with_flags(root.fs, root.flags)))
}
951
952fn parse_rlimits(rlimits: &[String]) -> Result<Vec<(Resource, u64)>, Error> {
953 let mut res = Vec::new();
954
955 for rlimit in rlimits {
956 let (key, value) =
957 rlimit.split_once('=').ok_or_else(|| anyhow!("Invalid rlimit: {rlimit}"))?;
958 let value = value.parse::<u64>()?;
959 let kv = match key {
960 "RLIMIT_NOFILE" => (Resource::NOFILE, value),
961 "RLIMIT_RTPRIO" => (Resource::RTPRIO, value),
962 _ => bail!("Unknown rlimit: {key}"),
963 };
964 res.push(kv);
965 }
966
967 Ok(res)
968}
969
970fn mount_filesystems(
971 locked: &mut Locked<Unlocked>,
972 system_task: &CurrentTask,
973 start_info: &ContainerStartInfo,
974 pkg_dir_proxy: &fio::DirectorySynchronousProxy,
975) -> Result<(), Error> {
976 let mut mounts_iter =
978 start_info.program.mounts.iter().chain(start_info.config.additional_mounts.iter());
979 let _ = mounts_iter.next();
980 for mount_spec in mounts_iter {
981 let action = MountAction::from_spec(locked, system_task, pkg_dir_proxy, mount_spec)
982 .with_source_context(|| format!("creating filesystem from spec: {}", &mount_spec))?;
983 let mount_point = system_task
984 .lookup_path_from_root(locked, action.path.as_ref())
985 .with_source_context(|| format!("lookup path from root: {}", action.path))?;
986 mount_point.mount(WhatToMount::Fs(action.fs), action.flags)?;
987 }
988 Ok(())
989}
990
/// Registers remote block devices by scanning `/block` in this component's
/// own (Fuchsia) namespace.
///
/// A missing `/block` directory is not an error; individual devices that
/// fail to connect are skipped with a warning.
fn init_remote_block_devices(
    locked: &mut Locked<Unlocked>,
    system_task: &CurrentTask,
) -> Result<(), Error> {
    remote_block_device_init(locked, system_task);
    let entries = match std::fs::read_dir("/block") {
        Ok(entries) => entries,
        Err(e) => {
            log_warn!("Failed to read block directory: {}", e);
            return Ok(());
        }
    };
    for entry in entries {
        let entry = entry?;
        let path_buf = entry.path();
        let path = path_buf.to_str().ok_or_else(|| anyhow!("Invalid block device path"))?;
        // Connect to the Block protocol exposed under the device's directory.
        let (client_end, server_end) = fidl::endpoints::create_endpoints();
        match fdio::service_connect(
            &format!("{}/fuchsia.storage.block.Block", path),
            server_end.into(),
        ) {
            Ok(()) => (),
            Err(e) => {
                log_warn!("Failed to connect to block device at {}: {}", path, e);
                continue;
            }
        }
        system_task.kernel().remote_block_device_registry.create_remote_block_device(
            locked,
            system_task,
            entry.file_name().to_str().unwrap(),
            client_end,
        )?;
    }
    Ok(())
}
1027
/// Polls every 100ms until `startup_file_path` becomes resolvable, returning
/// an error if the init task exits first or the lookup fails with anything
/// other than ENOENT.
async fn wait_for_init_file(
    startup_file_path: &str,
    current_task: &CurrentTask,
    init_tid: tid_t,
) -> Result<(), Error> {
    loop {
        // No creation notification is used here; re-check on a timer.
        fasync::Timer::new(fasync::MonotonicDuration::from_millis(100).after_now()).await;

        // Run the lookup under internal-operation credentials — presumably
        // to exempt this kernel-side check from the container's security
        // policy; confirm in `security::creds_start_internal_operation`.
        let creds = security::creds_start_internal_operation(current_task);
        if let Some(result) = current_task.override_creds(creds, || {
            let root = current_task.fs().root();
            let mut context = LookupContext::default();

            match current_task.lookup_path(
                current_task.kernel().kthreads.unlocked_for_async().deref_mut(),
                &mut context,
                root,
                startup_file_path.into(),
            ) {
                Ok(_) => return Some(Ok(())),
                // Not there yet: keep polling.
                Err(error) if error == ENOENT => {}
                Err(error) => return Some(Err(anyhow::Error::from(error))),
            };

            // If init is gone, the file will never appear.
            if current_task.get_task(init_tid).upgrade().is_none() {
                return Some(Err(anyhow!(
                    "Init task terminated before startup_file_path was ready"
                )));
            }

            // `None` means "undecided" — loop again after the timer.
            None
        }) {
            return result;
        }
    }
}
1065
/// Serves the component's runtime directory: an `elf/job_id` file for
/// debuggers plus a `TaskProvider` service handing out the default job.
async fn serve_runtime_dir(runtime_dir: ServerEnd<fio::DirectoryMarker>) {
    let mut fs = fuchsia_component::server::ServiceFs::new();
    // Failure to publish the job id is non-fatal; debuggers just lose it.
    match create_job_id_vmo() {
        Ok(vmo) => {
            fs.dir("elf").add_vmo_file_at("job_id", vmo);
        }
        Err(e) => log_error!(e:%; "failed to create vmo with job id for debuggers"),
    }
    match fs.serve_connection(runtime_dir) {
        Ok(_) => {
            // Each TaskProvider connection is served on its own local task.
            fs.add_fidl_service(|job_requests: TaskProviderRequestStream| {
                fuchsia_async::Task::local(async move {
                    if let Err(e) = serve_task_provider(job_requests).await {
                        log_warn!(e:?; "Error serving TaskProvider");
                    }
                })
                .detach();
            });
            fs.collect::<()>().await;
        }
        Err(e) => log_error!("Couldn't serve runtime directory: {e:?}"),
    }
}
1089
1090fn create_job_id_vmo() -> Result<zx::Vmo, Error> {
1091 let job_id = fuchsia_runtime::job_default().koid().context("reading own job koid")?;
1092 let job_id_str = job_id.raw_koid().to_string();
1093 let job_id_vmo = zx::Vmo::create(job_id_str.len() as u64).context("creating job id vmo")?;
1094 job_id_vmo.write(job_id_str.as_bytes(), 0).context("write job id to vmo")?;
1095 Ok(job_id_vmo)
1096}
1097
1098async fn serve_task_provider(mut job_requests: TaskProviderRequestStream) -> Result<(), Error> {
1099 while let Some(request) = job_requests.next().await {
1100 match request.context("getting next TaskProvider request")? {
1101 TaskProviderRequest::GetJob { responder } => {
1102 responder
1103 .send(
1104 fuchsia_runtime::job_default()
1105 .duplicate_handle(zx::Rights::SAME_RIGHTS)
1106 .map_err(|s| s.into_raw()),
1107 )
1108 .context("sending job for runtime dir")?;
1109 }
1110 unknown => bail!("Unknown TaskProvider method {unknown:?}"),
1111 }
1112 }
1113 Ok(())
1114}
1115
#[cfg(test)]
mod test {
    use super::wait_for_init_file;
    use fuchsia_async as fasync;
    use futures::{SinkExt, StreamExt};
    #[allow(deprecated, reason = "pre-existing usage")]
    use starnix_core::testing::create_kernel_task_and_unlocked;
    use starnix_core::vfs::FdNumber;
    use starnix_uapi::CLONE_FS;
    use starnix_uapi::file_mode::{AccessCheck, FileMode};
    use starnix_uapi::open_flags::OpenFlags;
    use starnix_uapi::signals::SIGCHLD;
    use starnix_uapi::vfs::ResolveFlags;

    /// When the startup file already exists, `wait_for_init_file` completes promptly.
    #[fuchsia::test]
    async fn test_init_file_already_exists() {
        #[allow(deprecated, reason = "pre-existing usage")]
        let (_kernel, current_task, locked) = create_kernel_task_and_unlocked();
        let (mut sender, mut receiver) = futures::channel::mpsc::unbounded();

        // Create the file before starting the waiter.
        let path = "/path";
        current_task
            .open_file_at(
                locked,
                FdNumber::AT_FDCWD,
                path.into(),
                OpenFlags::CREAT,
                FileMode::default(),
                ResolveFlags::empty(),
                AccessCheck::default(),
            )
            .expect("Failed to create file");

        // BUG FIX: `¤t_task` was mojibake for `&current_task` (`&curren;` HTML-entity
        // corruption); restored the reference so the module compiles.
        fasync::Task::local(async move {
            wait_for_init_file(path, &current_task, current_task.get_tid())
                .await
                .expect("failed to wait for file");
            sender.send(()).await.expect("failed to send message");
        })
        .detach();

        // The waiter signals completion via the channel.
        assert!(receiver.next().await.is_some());
    }

    /// The waiter blocks until the startup file is created, then completes.
    #[fuchsia::test]
    async fn test_init_file_wait_required() {
        #[allow(deprecated, reason = "pre-existing usage")]
        let (_kernel, current_task, locked) = create_kernel_task_and_unlocked();
        let (mut sender, mut receiver) = futures::channel::mpsc::unbounded();

        let init_task = current_task.clone_task_for_test(locked, CLONE_FS as u64, Some(SIGCHLD));
        let path = "/path";

        let test_init_tid = current_task.get_tid();
        fasync::Task::local(async move {
            // First message: the waiter task has started.
            sender.send(()).await.expect("failed to send message");
            wait_for_init_file(path, &init_task, test_init_tid)
                .await
                .expect("failed to wait for file");
            // Second message: the file was observed.
            sender.send(()).await.expect("failed to send message");
        })
        .detach();

        assert!(receiver.next().await.is_some());

        // Creating the file unblocks the waiter.
        current_task
            .open_file_at(
                locked,
                FdNumber::AT_FDCWD,
                path.into(),
                OpenFlags::CREAT,
                FileMode::default(),
                ResolveFlags::empty(),
                AccessCheck::default(),
            )
            .expect("Failed to create file");

        assert!(receiver.next().await.is_some());
    }

    /// If the init task exits before the file is created, the waiter reports an error.
    #[fuchsia::test]
    async fn test_init_exits_before_file_exists() {
        #[allow(deprecated, reason = "pre-existing usage")]
        let (_kernel, current_task, locked) = create_kernel_task_and_unlocked();
        let (mut sender, mut receiver) = futures::channel::mpsc::unbounded();

        let init_task = current_task.clone_task_for_test(locked, CLONE_FS as u64, Some(SIGCHLD));
        const STARTUP_FILE_PATH: &str = "/path";

        let test_init_tid = init_task.get_tid();
        // BUG FIX: `¤t_task` was mojibake for `&current_task`; restored the reference.
        fasync::Task::local(async move {
            sender.send(()).await.expect("failed to send message");
            wait_for_init_file(STARTUP_FILE_PATH, &current_task, test_init_tid)
                .await
                .expect_err("Did not detect init exit");
            sender.send(()).await.expect("failed to send message");
        })
        .detach();

        assert!(receiver.next().await.is_some());

        // Dropping the task releases it, simulating init exiting before file creation.
        std::mem::drop(init_task);

        assert!(receiver.next().await.is_some());
    }
}