1use crate::signals::{SignalInfo, send_freeze_signal};
11use crate::task::waiter::WaiterOptions;
12use crate::task::{Kernel, ThreadGroup, ThreadGroupKey, WaitQueue, Waiter};
13use crate::vfs::{FsStr, FsString, PathBuilder};
14use starnix_logging::{CATEGORY_STARNIX, log_warn, trace_duration, track_stub};
15use starnix_sync::{FileOpsCore, LockBefore, Locked, Mutex, MutexGuard, ThreadGroupLimits};
16use starnix_uapi::errors::Errno;
17use starnix_uapi::signals::SIGKILL;
18use starnix_uapi::{errno, error, pid_t};
19use std::collections::{BTreeMap, HashMap, HashSet, btree_map, hash_map};
20use std::ops::{Deref, DerefMut};
21use std::sync::atomic::{AtomicU64, Ordering};
22use std::sync::{Arc, Weak};
23
24use crate::signals::KernelSignal;
25
/// Cgroup state owned by the kernel.
#[derive(Debug)]
pub struct KernelCgroups {
    /// Root of the `cgroup2` hierarchy.
    pub cgroup2: Arc<CgroupRoot>,
}
33
34impl KernelCgroups {
35 pub fn lock_cgroup2_pid_table(&self) -> MutexGuard<'_, CgroupPidTable> {
42 self.cgroup2.pid_table.lock()
43 }
44}
45
46impl Default for KernelCgroups {
47 fn default() -> Self {
48 Self { cgroup2: CgroupRoot::new() }
49 }
50}
51
/// Freezer state of a cgroup.
///
/// The variant order is significant: the derived `Ord` makes `Frozen` compare greater than
/// `Thawed`, so the maximum of two states is `Frozen` whenever either of them is frozen.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
pub enum FreezerState {
    /// Not frozen. This is the default state.
    #[default]
    Thawed,
    /// Frozen.
    Frozen,
}
63
64impl std::fmt::Display for FreezerState {
65 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
66 match self {
67 FreezerState::Frozen => write!(f, "1"),
68 FreezerState::Thawed => write!(f, "0"),
69 }
70 }
71}
72
73#[derive(Default)]
74pub struct CgroupFreezerState {
75 pub self_freezer_state: FreezerState,
77 pub effective_freezer_state: FreezerState,
81}
82
/// Operations shared by the root of the hierarchy (`CgroupRoot`) and by every cgroup below it
/// (`Cgroup`).
pub trait CgroupOps: Send + Sync + 'static {
    /// Returns the id of this cgroup. The root reports `0`; every other cgroup reports the id it
    /// was created with.
    fn id(&self) -> u64;

    /// Makes `thread_group` a member of this cgroup, removing it from the cgroup it was
    /// previously tracked in, if any.
    fn add_process(
        &self,
        locked: &mut Locked<FileOpsCore>,
        thread_group: &ThreadGroup,
    ) -> Result<(), Errno>;

    /// Creates a direct child named `name` and returns a handle to it. Fails with `EEXIST` when
    /// a child with that name already exists.
    fn new_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno>;

    /// Returns handles to all direct children.
    fn get_children(&self) -> Result<Vec<CgroupHandle>, Errno>;

    /// Returns the direct child named `name`, or `ENOENT` when there is none.
    fn get_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno>;

    /// Removes the direct child named `name` and returns it. Fails with `ENOENT` when there is
    /// no such child and with `EBUSY` when the child still has processes or children.
    fn remove_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno>;

    /// Returns the pids of the processes that belong to this cgroup.
    fn get_pids(&self, kernel: &Kernel) -> Vec<pid_t>;

    /// Sends `SIGKILL` to the processes of this cgroup and of its descendants. The root
    /// implementation treats this call as unreachable.
    fn kill(&self);

    /// Returns whether this cgroup or any of its descendants has processes. The root
    /// implementation always returns `false`.
    fn is_populated(&self) -> bool;

    /// Returns the self and effective freezer states of this cgroup.
    fn get_freezer_state(&self) -> CgroupFreezerState;

    /// Freezes this cgroup and its descendants. The root implementation treats this call as
    /// unreachable.
    fn freeze(&self, locked: &mut Locked<FileOpsCore>);

    /// Thaws this cgroup. The root implementation treats this call as unreachable.
    fn thaw(&self);
}
127
/// Maps each tracked thread group to a weak reference to the non-root cgroup it belongs to.
///
/// Thread groups without an entry are reported by the root cgroup (see `CgroupRoot::get_pids`).
#[derive(Debug, Default)]
pub struct CgroupPidTable(HashMap<ThreadGroupKey, Weak<Cgroup>>);
132impl Deref for CgroupPidTable {
133 type Target = HashMap<ThreadGroupKey, Weak<Cgroup>>;
134
135 fn deref(&self) -> &Self::Target {
136 &self.0
137 }
138}
139impl DerefMut for CgroupPidTable {
140 fn deref_mut(&mut self) -> &mut Self::Target {
141 &mut self.0
142 }
143}
144
145impl CgroupPidTable {
146 pub fn inherit_cgroup(&mut self, parent: &ThreadGroup, child: &ThreadGroup) {
149 assert!(child.read().tasks_count() == 0, "threadgroup must be newly created");
150 if let Some(weak_cgroup) = self.0.get(&parent.into()).cloned() {
151 let Some(cgroup) = weak_cgroup.upgrade() else {
152 log_warn!("ignored attempt to inherit a non-existant cgroup");
153 return;
154 };
155 assert!(
156 self.0.insert(child.into(), weak_cgroup).map(|c| c.strong_count() == 0).is_none(),
157 "child pid should not exist when inheriting"
158 );
159 cgroup.state.lock().processes.insert(child.into());
161 }
162 }
163
164 pub fn maybe_create_freeze_signal<TG: Copy + Into<ThreadGroupKey>>(
166 &self,
167 tg: TG,
168 ) -> Option<KernelSignal> {
169 let Some(weak_cgroup) = self.0.get(&tg.into()) else {
170 return None;
171 };
172 let Some(cgroup) = weak_cgroup.upgrade() else {
173 return None;
174 };
175 let state = cgroup.state.lock();
176 if state.get_effective_freezer_state() != FreezerState::Frozen {
177 return None;
178 }
179 Some(KernelSignal::Freeze(state.create_freeze_waiter()))
180 }
181
182 pub fn remove_process(&mut self, thread_group_key: ThreadGroupKey) {
184 if let Some(entry) = self.remove(&thread_group_key) {
185 if let Some(cgroup) = entry.upgrade() {
186 cgroup.state.lock().processes.remove(&thread_group_key);
187 }
188 }
189 }
190}
191
/// Root of a cgroup hierarchy.
///
/// The root does not keep its own process set: `get_pids` reports every kernel process whose
/// thread group has no live entry in `pid_table`.
#[derive(Debug)]
pub struct CgroupRoot {
    /// Maps thread groups to the non-root cgroup they currently belong to.
    pid_table: Mutex<CgroupPidTable>,

    /// Direct children of the root, keyed by name.
    children: Mutex<CgroupChildren>,

    /// Weak reference to this root, handed to cgroups created directly under it.
    weak_self: Weak<CgroupRoot>,

    /// Id handed to the next cgroup that is created. Starts at 1; the root itself reports id 0.
    next_id: AtomicU64,
}
217
218impl CgroupRoot {
219 pub fn new() -> Arc<CgroupRoot> {
220 Arc::new_cyclic(|weak_self| Self {
221 pid_table: Default::default(),
222 children: Default::default(),
223 weak_self: weak_self.clone(),
224 next_id: AtomicU64::new(1),
225 })
226 }
227
228 fn get_next_id(&self) -> u64 {
229 self.next_id.fetch_add(1, Ordering::Relaxed)
230 }
231
232 pub fn get_cgroup<TG: Copy + Into<ThreadGroupKey>>(&self, tg: TG) -> Option<Weak<Cgroup>> {
233 self.pid_table.lock().get(&tg.into()).cloned()
234 }
235
236 pub fn get_cgroup_inspect(&self) -> fuchsia_inspect::Inspector {
237 let inspector = fuchsia_inspect::Inspector::default();
238 let cgroups = inspector.root();
239 cgroups.record_uint("pids", self.pid_table.lock().len() as u64);
240 cgroups.record_uint("count", self.children.lock().count_descendants());
241 inspector
242 }
243}
244
245impl CgroupOps for CgroupRoot {
246 fn id(&self) -> u64 {
247 0
248 }
249
250 fn add_process(
251 &self,
252 locked: &mut Locked<FileOpsCore>,
253 thread_group: &ThreadGroup,
254 ) -> Result<(), Errno> {
255 let mut pid_table = self.pid_table.lock();
256 if let Some(entry) = pid_table.remove(&thread_group.into()) {
257 if let Some(cgroup) = entry.upgrade() {
259 cgroup.state.lock().remove_process(locked, thread_group)?;
260 }
261 }
262 Ok(())
266 }
267
268 fn new_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno> {
269 let id = self.get_next_id();
270 let new_child = Cgroup::new(id, name, &self.weak_self, None);
271 let mut children = self.children.lock();
272 children.insert_child(name.into(), new_child)
273 }
274
275 fn get_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno> {
276 let children = self.children.lock();
277 children.get_child(name).ok_or_else(|| errno!(ENOENT))
278 }
279
280 fn remove_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno> {
281 let mut children = self.children.lock();
282 children.remove_child(name)
283 }
284
285 fn get_children(&self) -> Result<Vec<CgroupHandle>, Errno> {
286 let children = self.children.lock();
287 Ok(children.get_children())
288 }
289
290 fn get_pids(&self, kernel: &Kernel) -> Vec<pid_t> {
291 let controlled_pids: HashSet<pid_t> =
292 self.pid_table.lock().keys().filter_map(|v| v.upgrade().map(|tg| tg.leader)).collect();
293 let kernel_pids = kernel.pids.read().process_ids();
294 kernel_pids.into_iter().filter(|pid| !controlled_pids.contains(pid)).collect()
295 }
296
297 fn kill(&self) {
298 unreachable!("Root cgroup cannot kill its processes.");
299 }
300
301 fn is_populated(&self) -> bool {
302 false
303 }
304
305 fn get_freezer_state(&self) -> CgroupFreezerState {
306 Default::default()
307 }
308
309 fn freeze(&self, _locked: &mut Locked<FileOpsCore>) {
310 unreachable!("Root cgroup cannot freeze any processes.");
311 }
312
313 fn thaw(&self) {
314 unreachable!("Root cgroup cannot thaw any processes.");
315 }
316}
317
/// Child cgroups of a single parent, keyed by name.
#[derive(Debug, Default)]
struct CgroupChildren(BTreeMap<FsString, CgroupHandle>);
320impl CgroupChildren {
321 fn insert_child(&mut self, name: FsString, child: CgroupHandle) -> Result<CgroupHandle, Errno> {
322 let btree_map::Entry::Vacant(child_entry) = self.0.entry(name) else {
323 return error!(EEXIST);
324 };
325 Ok(child_entry.insert(child).clone())
326 }
327
328 fn remove_child(&mut self, name: &FsStr) -> Result<CgroupHandle, Errno> {
329 let btree_map::Entry::Occupied(child_entry) = self.0.entry(name.into()) else {
330 return error!(ENOENT);
331 };
332 let child = child_entry.get();
333
334 let mut child_state = child.state.lock();
335 assert!(!child_state.deleted, "child cannot be deleted");
336
337 child_state.update_processes();
338 if !child_state.processes.is_empty() {
339 return error!(EBUSY);
340 }
341 if !child_state.children.is_empty() {
342 return error!(EBUSY);
343 }
344
345 child_state.deleted = true;
346 drop(child_state);
347
348 Ok(child_entry.remove())
349 }
350
351 fn get_child(&self, name: &FsStr) -> Option<CgroupHandle> {
352 self.0.get(name).cloned()
353 }
354
355 fn get_children(&self) -> Vec<CgroupHandle> {
356 self.0.values().cloned().collect()
357 }
358
359 fn count_descendants(&self) -> u64 {
360 self.0.values().map(|child| 1 + child.count_descendants()).sum()
361 }
362}
363
364impl Deref for CgroupChildren {
365 type Target = BTreeMap<FsString, CgroupHandle>;
366
367 fn deref(&self) -> &Self::Target {
368 &self.0
369 }
370}
371
/// Mutable state of a `Cgroup`, kept behind the cgroup's `state` mutex.
#[derive(Debug, Default)]
struct CgroupState {
    /// Direct children of this cgroup, keyed by name.
    children: CgroupChildren,

    /// Thread groups that are direct members of this cgroup.
    processes: HashSet<ThreadGroupKey>,

    /// Set once the cgroup has been removed from its parent. Adding or removing processes and
    /// creating children fail with `ENOENT` afterwards.
    deleted: bool,

    /// Queue on which freeze waiters are registered; it is notified when the cgroup becomes
    /// effectively thawed.
    wait_queue: WaitQueue,

    /// Freezer state requested on this cgroup itself via `freeze` / `thaw`.
    self_freezer_state: FreezerState,

    /// Freezer state imposed by the ancestors of this cgroup.
    inherited_freezer_state: FreezerState,
}
392
393impl CgroupState {
394 fn create_freeze_waiter(&self) -> Waiter {
397 let waiter = Waiter::with_options(WaiterOptions::IGNORE_SIGNALS);
398 self.wait_queue.wait_async(&waiter);
399 waiter
400 }
401
402 fn update_processes(&mut self) {
404 self.processes.retain(|thread_group| {
405 let Some(thread_group) = thread_group.upgrade() else {
406 return false;
407 };
408 let terminating = thread_group.read().is_terminating();
409 !terminating
410 });
411 }
412
413 fn freeze_thread_group<L>(&self, locked: &mut Locked<L>, thread_group: &ThreadGroup)
414 where
415 L: LockBefore<ThreadGroupLimits>,
416 {
417 let tasks = thread_group.read().tasks();
418 for task in tasks {
419 send_freeze_signal(locked, &task, self.create_freeze_waiter())
420 .expect("sending freeze signal should not fail");
421 }
422 }
423
424 fn thaw_thread_group<L>(&self, _locked: &mut Locked<L>, thread_group: &ThreadGroup)
425 where
426 L: LockBefore<ThreadGroupLimits>,
427 {
428 let tasks = thread_group.read().tasks();
429 for task in tasks {
430 task.write().thaw();
431 task.interrupt();
432 }
433 }
434
435 fn get_effective_freezer_state(&self) -> FreezerState {
436 std::cmp::max(self.self_freezer_state, self.inherited_freezer_state)
437 }
438
439 fn add_process<L>(
440 &mut self,
441 locked: &mut Locked<L>,
442 thread_group: &ThreadGroup,
443 ) -> Result<(), Errno>
444 where
445 L: LockBefore<ThreadGroupLimits>,
446 {
447 if self.deleted {
448 return error!(ENOENT);
449 }
450 self.processes.insert(thread_group.into());
451
452 if self.get_effective_freezer_state() == FreezerState::Frozen {
453 self.freeze_thread_group(locked, &thread_group);
454 }
455 Ok(())
456 }
457
458 fn remove_process<L>(
459 &mut self,
460 locked: &mut Locked<L>,
461 thread_group: &ThreadGroup,
462 ) -> Result<(), Errno>
463 where
464 L: LockBefore<ThreadGroupLimits>,
465 {
466 if self.deleted {
467 return error!(ENOENT);
468 }
469 self.processes.remove(&thread_group.into());
470
471 if self.get_effective_freezer_state() == FreezerState::Frozen {
472 self.thaw_thread_group(locked, thread_group);
473 }
474 Ok(())
475 }
476
477 fn propagate_freeze<L>(&mut self, locked: &mut Locked<L>, inherited_freezer_state: FreezerState)
478 where
479 L: LockBefore<ThreadGroupLimits>,
480 {
481 let prev_effective_freezer_state = self.get_effective_freezer_state();
482 self.inherited_freezer_state = inherited_freezer_state;
483 if prev_effective_freezer_state == FreezerState::Frozen {
484 return;
485 }
486
487 for thread_group in self.processes.iter() {
488 let Some(thread_group) = thread_group.upgrade() else {
489 continue;
490 };
491 self.freeze_thread_group(locked, &thread_group);
492 }
493
494 for child in self.children.get_children() {
496 child.state.lock().propagate_freeze(locked, FreezerState::Frozen);
497 }
498 }
499
500 fn propagate_thaw(&mut self, inherited_freezer_state: FreezerState) {
501 self.inherited_freezer_state = inherited_freezer_state;
502 if self.get_effective_freezer_state() == FreezerState::Thawed {
503 self.wait_queue.notify_all();
504 for child in self.children.get_children() {
505 child.state.lock().propagate_thaw(FreezerState::Thawed);
506 }
507 }
508 }
509
510 fn propagate_kill(&self) {
511 for thread_group in self.processes.iter() {
512 let Some(thread_group) = thread_group.upgrade() else {
513 continue;
514 };
515 thread_group.write().send_signal(SignalInfo::kernel(SIGKILL));
516 }
517
518 for child in self.children.get_children() {
520 child.state.lock().propagate_kill();
521 }
522 }
523}
524
/// A non-root cgroup.
#[derive(Debug)]
pub struct Cgroup {
    /// The root of the hierarchy this cgroup belongs to.
    root: Weak<CgroupRoot>,

    /// Id assigned to this cgroup at creation.
    id: u64,

    /// Name of this cgroup, used as its path element.
    name: FsString,

    /// Parent cgroup, or `None` when this cgroup was created directly under the root.
    parent: Option<Weak<Cgroup>>,

    /// Mutable state: children, member processes, deletion flag and freezer state.
    state: Mutex<CgroupState>,

    /// Weak reference to this cgroup, stored in the pid table and handed to children.
    weak_self: Weak<Cgroup>,
}
/// Shared, reference-counted handle to a `Cgroup`.
pub type CgroupHandle = Arc<Cgroup>;
546
547pub fn path_from_root(weak_cgroup: Option<Weak<Cgroup>>) -> Result<FsString, Errno> {
549 let cgroup = match weak_cgroup {
550 Some(weak_cgroup) => Weak::upgrade(&weak_cgroup).ok_or_else(|| errno!(ENODEV))?,
551 None => return Ok("/".into()),
552 };
553 let mut path = PathBuilder::new();
554 let mut current = Some(cgroup);
555 while let Some(cgroup) = current {
556 path.prepend_element(cgroup.name());
557 current = cgroup.parent()?;
558 }
559 Ok(path.build_absolute())
560}
561
562impl Cgroup {
563 pub fn new(
564 id: u64,
565 name: &FsStr,
566 root: &Weak<CgroupRoot>,
567 parent: Option<Weak<Cgroup>>,
568 ) -> CgroupHandle {
569 Arc::new_cyclic(|weak| Self {
570 id,
571 root: root.clone(),
572 name: name.to_owned(),
573 parent,
574 state: Default::default(),
575 weak_self: weak.clone(),
576 })
577 }
578
579 pub fn name(&self) -> &FsStr {
580 self.name.as_ref()
581 }
582
583 fn root(&self) -> Result<Arc<CgroupRoot>, Errno> {
584 self.root.upgrade().ok_or_else(|| errno!(ENODEV))
585 }
586
587 fn parent(&self) -> Result<Option<CgroupHandle>, Errno> {
590 self.parent.as_ref().map(|weak| weak.upgrade().ok_or_else(|| errno!(ENODEV))).transpose()
591 }
592
593 fn count_descendants(&self) -> u64 {
594 self.state.lock().children.count_descendants()
595 }
596}
597
598impl CgroupOps for Cgroup {
599 fn id(&self) -> u64 {
600 self.id
601 }
602
603 fn add_process(
604 &self,
605 locked: &mut Locked<FileOpsCore>,
606 thread_group: &ThreadGroup,
607 ) -> Result<(), Errno> {
608 let root = self.root()?;
609 let mut pid_table = root.pid_table.lock();
610 match pid_table.entry(thread_group.into()) {
611 hash_map::Entry::Occupied(mut entry) => {
612 if std::ptr::eq(self, entry.get().as_ptr()) {
615 return Ok(());
616 }
617
618 track_stub!(TODO("https://fxbug.dev/383374687"), "check permissions");
620 if let Some(other_cgroup) = entry.get().upgrade() {
621 other_cgroup.state.lock().remove_process(locked, thread_group)?;
622 }
623
624 self.state.lock().add_process(locked, thread_group)?;
625 entry.insert(self.weak_self.clone());
626 }
627 hash_map::Entry::Vacant(entry) => {
628 self.state.lock().add_process(locked, thread_group)?;
629 entry.insert(self.weak_self.clone());
630 }
631 }
632
633 Ok(())
634 }
635
636 fn new_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno> {
637 let id = self.root()?.get_next_id();
638 let new_child = Cgroup::new(id, name, &self.root, Some(self.weak_self.clone()));
639 let mut state = self.state.lock();
640 if state.deleted {
641 return error!(ENOENT);
642 }
643 new_child.state.lock().inherited_freezer_state = state.get_effective_freezer_state();
645 state.children.insert_child(name.into(), new_child)
646 }
647
648 fn get_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno> {
649 let state = self.state.lock();
650 state.children.get_child(name).ok_or_else(|| errno!(ENOENT))
651 }
652
653 fn remove_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno> {
654 let mut state = self.state.lock();
655 if state.deleted {
656 return error!(ENOENT);
657 }
658 state.children.remove_child(name)
659 }
660
661 fn get_children(&self) -> Result<Vec<CgroupHandle>, Errno> {
662 let state = self.state.lock();
663 if state.deleted {
664 return error!(ENOENT);
665 }
666 Ok(state.children.get_children())
667 }
668
669 fn get_pids(&self, _kernel: &Kernel) -> Vec<pid_t> {
670 let mut state = self.state.lock();
671 state.update_processes();
672 state.processes.iter().filter_map(|v| v.upgrade().map(|tg| tg.leader)).collect()
673 }
674
675 fn kill(&self) {
676 trace_duration!(CATEGORY_STARNIX, "CgroupKill");
677 let state = self.state.lock();
678 state.propagate_kill();
679 }
680
681 fn is_populated(&self) -> bool {
682 let mut state = self.state.lock();
683 if state.deleted {
684 return false;
685 }
686 state.update_processes();
687 if !state.processes.is_empty() {
688 return true;
689 }
690
691 state.children.get_children().into_iter().any(|child| child.is_populated())
692 }
693
694 fn get_freezer_state(&self) -> CgroupFreezerState {
695 let state = self.state.lock();
696 CgroupFreezerState {
697 self_freezer_state: state.self_freezer_state,
698 effective_freezer_state: state.get_effective_freezer_state(),
699 }
700 }
701
702 fn freeze(&self, locked: &mut Locked<FileOpsCore>) {
703 trace_duration!(CATEGORY_STARNIX, "CgroupFreeze");
704 let mut state = self.state.lock();
705 let inherited_freezer_state = state.inherited_freezer_state;
706 state.propagate_freeze(locked, inherited_freezer_state);
707 state.self_freezer_state = FreezerState::Frozen;
708 }
709
710 fn thaw(&self) {
711 trace_duration!(CATEGORY_STARNIX, "CgroupThaw");
712 let mut state = self.state.lock();
713 state.self_freezer_state = FreezerState::Thawed;
714 let inherited_freezer_state = state.inherited_freezer_state;
715 state.propagate_thaw(inherited_freezer_state);
716 }
717}
718
#[cfg(test)]
mod test {
    use super::*;
    use crate::testing::spawn_kernel_and_run;
    use assert_matches::assert_matches;
    use starnix_uapi::signals::SIGCHLD;
    use starnix_uapi::{CLONE_SIGHAND, CLONE_THREAD, CLONE_VM};

    /// `path_from_root` joins the names of a cgroup and its ancestors into an absolute path.
    #[::fuchsia::test]
    async fn cgroup_path_from_root() {
        spawn_kernel_and_run(async |_, _| {
            let root = CgroupRoot::new();

            let test_cgroup =
                root.new_child("test".into()).expect("new_child on root cgroup succeeds");
            let child_cgroup = test_cgroup
                .new_child("child".into())
                .expect("new_child on non-root cgroup succeeds");

            assert_eq!(path_from_root(Some(Arc::downgrade(&test_cgroup))), Ok("/test".into()));
            assert_eq!(
                path_from_root(Some(Arc::downgrade(&child_cgroup))),
                Ok("/test/child".into())
            );
        })
        .await;
    }

    /// A thread cloned inside a process that belongs to a frozen cgroup starts out with a
    /// pending freeze signal.
    #[::fuchsia::test]
    async fn cgroup_clone_task_in_frozen_cgroup() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root = &kernel.cgroups.cgroup2;
            let cgroup = root.new_child("test".into()).expect("new_child on root cgroup succeeds");

            // Put a new process into the cgroup, then freeze the cgroup.
            let process = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
            cgroup
                .add_process(locked.cast_locked(), process.thread_group())
                .expect("add process to cgroup");
            cgroup.freeze(locked.cast_locked());
            assert_eq!(cgroup.get_pids(&kernel).first(), Some(process.get_pid()).as_ref());
            assert_eq!(
                root.get_cgroup(process.thread_group()).unwrap().as_ptr(),
                Arc::as_ptr(&cgroup)
            );

            // Clone a thread within the frozen process.
            let thread = process.clone_task_for_test(
                locked,
                (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM) as u64,
                Some(SIGCHLD),
            );

            // The first queued kernel signal of the new thread must be a freeze signal.
            let thread_state = thread.read();
            let kernel_signals = thread_state.kernel_signals_for_test();
            assert_matches!(kernel_signals.front(), Some(KernelSignal::Freeze(_)));
        })
        .await;
    }

    /// Dropping a process removes its entry from the root's pid table.
    #[::fuchsia::test]
    async fn cgroup_tg_release_removes_pid() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root = &kernel.cgroups.cgroup2;
            let cgroup = root.new_child("test".into()).expect("new_child on root cgroup succeeds");

            let process = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
            cgroup
                .add_process(locked.cast_locked(), process.thread_group())
                .expect("add process to cgroup");

            assert_eq!(
                root.get_cgroup(process.thread_group()).unwrap().as_ptr(),
                Arc::as_ptr(&cgroup)
            );

            // The assertion below expects that dropping the task is enough to get its thread
            // group removed from the pid table.
            drop(process);

            assert!(root.pid_table.lock().is_empty());
        })
        .await;
    }
}