1use crate::signals::{SignalInfo, send_freeze_signal};
11use crate::task::waiter::WaiterOptions;
12use crate::task::{Kernel, ThreadGroup, ThreadGroupKey, WaitQueue, Waiter};
13use crate::vfs::{FsStr, FsString, PathBuilder};
14use starnix_logging::{CATEGORY_STARNIX, log_warn, trace_duration, track_stub};
15use starnix_sync::{FileOpsCore, LockBefore, Locked, Mutex, MutexGuard, ThreadGroupLimits};
16use starnix_types::ownership::TempRef;
17use starnix_uapi::errors::Errno;
18use starnix_uapi::signals::SIGKILL;
19use starnix_uapi::{errno, error, pid_t};
20use std::collections::{BTreeMap, HashMap, HashSet, btree_map, hash_map};
21use std::ops::{Deref, DerefMut};
22use std::sync::atomic::{AtomicU64, Ordering};
23use std::sync::{Arc, Weak};
24
25use crate::signals::KernelSignal;
26
/// Top-level collection of cgroup hierarchies owned by the kernel.
#[derive(Debug)]
pub struct KernelCgroups {
    /// Root of the cgroup v2 (unified) hierarchy.
    pub cgroup2: Arc<CgroupRoot>,
}
34
impl KernelCgroups {
    /// Acquires the lock on the v2 hierarchy's process-to-cgroup table.
    pub fn lock_cgroup2_pid_table(&self) -> MutexGuard<'_, CgroupPidTable> {
        self.cgroup2.pid_table.lock()
    }
}
46
impl Default for KernelCgroups {
    // Cannot be derived: `CgroupRoot::new` must construct the root through
    // `Arc::new_cyclic` so it can hold a weak reference to itself.
    fn default() -> Self {
        Self { cgroup2: CgroupRoot::new() }
    }
}
52
/// Freezer state of a cgroup.
///
/// NOTE: the variant order matters. `Ord` is derived, and
/// `Frozen > Thawed` is relied upon when computing the effective state as
/// the `max` of the self and inherited states.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum FreezerState {
    Thawed,
    Frozen,
}
58
impl Default for FreezerState {
    /// Cgroups start out running (thawed).
    fn default() -> Self {
        FreezerState::Thawed
    }
}
64
65impl std::fmt::Display for FreezerState {
66 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
67 match self {
68 FreezerState::Frozen => write!(f, "1"),
69 FreezerState::Thawed => write!(f, "0"),
70 }
71 }
72}
73
/// Snapshot of a cgroup's freezer state.
#[derive(Default)]
pub struct CgroupFreezerState {
    /// State requested directly on this cgroup (its own `cgroup.freeze`).
    pub self_freezer_state: FreezerState,
    /// Combined state: frozen if this cgroup or any ancestor is frozen.
    pub effective_freezer_state: FreezerState,
}
83
/// Operations common to all cgroups, implemented by both the hierarchy root
/// (`CgroupRoot`) and non-root cgroups (`Cgroup`).
pub trait CgroupOps: Send + Sync + 'static {
    /// Unique id of this cgroup; 0 is reserved for the root.
    fn id(&self) -> u64;

    /// Moves `thread_group` into this cgroup, detaching it from whichever
    /// cgroup currently contains it.
    fn add_process(
        &self,
        locked: &mut Locked<FileOpsCore>,
        thread_group: &ThreadGroup,
    ) -> Result<(), Errno>;

    /// Creates a child cgroup named `name`; fails with EEXIST if a child of
    /// that name already exists.
    fn new_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno>;

    /// Returns all immediate children.
    fn get_children(&self) -> Result<Vec<CgroupHandle>, Errno>;

    /// Returns the immediate child named `name`, or ENOENT.
    fn get_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno>;

    /// Removes and returns the child named `name`; fails with EBUSY if the
    /// child still has processes or children of its own.
    fn remove_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno>;

    /// Returns the pids of the processes belonging to this cgroup.
    fn get_pids(&self, kernel: &Kernel) -> Vec<pid_t>;

    /// Sends SIGKILL to all processes in this cgroup and its descendants.
    fn kill(&self);

    /// Whether this cgroup or any descendant contains live processes.
    fn is_populated(&self) -> bool;

    /// Returns the self and effective freezer states.
    fn get_freezer_state(&self) -> CgroupFreezerState;

    /// Freezes all processes in this cgroup and its descendants.
    fn freeze(&self, locked: &mut Locked<FileOpsCore>);

    /// Thaws this cgroup; descendants stay frozen if frozen in their own
    /// right or via another frozen ancestor.
    fn thaw(&self);
}
128
/// Maps each tracked thread group to the (weak) non-root cgroup that
/// contains it. Thread groups absent from this table implicitly belong to
/// the root cgroup.
#[derive(Debug, Default)]
pub struct CgroupPidTable(HashMap<ThreadGroupKey, Weak<Cgroup>>);
impl Deref for CgroupPidTable {
    type Target = HashMap<ThreadGroupKey, Weak<Cgroup>>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
impl DerefMut for CgroupPidTable {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}
145
146impl CgroupPidTable {
147 pub fn inherit_cgroup(&mut self, parent: &ThreadGroup, child: &ThreadGroup) {
150 assert!(child.read().tasks_count() == 0, "threadgroup must be newly created");
151 if let Some(weak_cgroup) = self.0.get(&parent.into()).cloned() {
152 let Some(cgroup) = weak_cgroup.upgrade() else {
153 log_warn!("ignored attempt to inherit a non-existant cgroup");
154 return;
155 };
156 assert!(
157 self.0.insert(child.into(), weak_cgroup).map(|c| c.strong_count() == 0).is_none(),
158 "child pid should not exist when inheriting"
159 );
160 cgroup.state.lock().processes.insert(child.into());
162 }
163 }
164
165 pub fn maybe_create_freeze_signal<TG: Copy + Into<ThreadGroupKey>>(
167 &self,
168 tg: TG,
169 ) -> Option<KernelSignal> {
170 let Some(weak_cgroup) = self.0.get(&tg.into()) else {
171 return None;
172 };
173 let Some(cgroup) = weak_cgroup.upgrade() else {
174 return None;
175 };
176 let state = cgroup.state.lock();
177 if state.get_effective_freezer_state() != FreezerState::Frozen {
178 return None;
179 }
180 Some(KernelSignal::Freeze(state.create_freeze_waiter()))
181 }
182
183 pub fn remove_process(&mut self, thread_group_key: ThreadGroupKey) {
185 if let Some(entry) = self.remove(&thread_group_key) {
186 if let Some(cgroup) = entry.upgrade() {
187 cgroup.state.lock().processes.remove(&thread_group_key);
188 }
189 }
190 }
191}
192
/// The root of the cgroup2 hierarchy. Unlike non-root cgroups, the root
/// tracks process membership via a global pid table and cannot be frozen,
/// killed, or removed.
#[derive(Debug)]
pub struct CgroupRoot {
    /// Maps thread groups to the non-root cgroup containing them; absent
    /// entries implicitly belong to the root itself.
    pid_table: Mutex<CgroupPidTable>,

    /// Immediate children of the root.
    children: Mutex<CgroupChildren>,

    /// Weak handle to self, handed to children as their `root` pointer.
    weak_self: Weak<CgroupRoot>,

    /// Source of unique cgroup ids; starts at 1 because 0 is the root.
    next_id: AtomicU64,
}
218
219impl CgroupRoot {
220 pub fn new() -> Arc<CgroupRoot> {
221 Arc::new_cyclic(|weak_self| Self {
222 pid_table: Default::default(),
223 children: Default::default(),
224 weak_self: weak_self.clone(),
225 next_id: AtomicU64::new(1),
226 })
227 }
228
229 fn get_next_id(&self) -> u64 {
230 self.next_id.fetch_add(1, Ordering::Relaxed)
231 }
232
233 pub fn get_cgroup<TG: Copy + Into<ThreadGroupKey>>(&self, tg: TG) -> Option<Weak<Cgroup>> {
234 self.pid_table.lock().get(&tg.into()).cloned()
235 }
236
237 pub fn get_cgroup_inspect(&self) -> fuchsia_inspect::Inspector {
238 let inspector = fuchsia_inspect::Inspector::default();
239 let cgroups = inspector.root();
240 cgroups.record_uint("pids", self.pid_table.lock().len() as u64);
241 cgroups.record_uint("count", self.children.lock().count_descendants());
242 inspector
243 }
244}
245
impl CgroupOps for CgroupRoot {
    /// Id 0 is reserved for the root cgroup.
    fn id(&self) -> u64 {
        0
    }

    /// "Adding" a process to the root means removing it from whichever
    /// non-root cgroup currently holds it; a process absent from
    /// `pid_table` implicitly belongs to the root.
    fn add_process(
        &self,
        locked: &mut Locked<FileOpsCore>,
        thread_group: &ThreadGroup,
    ) -> Result<(), Errno> {
        let mut pid_table = self.pid_table.lock();
        if let Some(entry) = pid_table.remove(&thread_group.into()) {
            if let Some(cgroup) = entry.upgrade() {
                // `remove_process` also thaws the thread group if its
                // previous cgroup was effectively frozen.
                cgroup.state.lock().remove_process(locked, thread_group)?;
            }
        }
        Ok(())
    }

    /// Creates a direct child of the root with a freshly allocated id and
    /// no parent pointer.
    fn new_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno> {
        let id = self.get_next_id();
        let new_child = Cgroup::new(id, name, &self.weak_self, None);
        let mut children = self.children.lock();
        children.insert_child(name.into(), new_child)
    }

    fn get_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno> {
        let children = self.children.lock();
        children.get_child(name).ok_or_else(|| errno!(ENOENT))
    }

    fn remove_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno> {
        let mut children = self.children.lock();
        children.remove_child(name)
    }

    fn get_children(&self) -> Result<Vec<CgroupHandle>, Errno> {
        let children = self.children.lock();
        Ok(children.get_children())
    }

    /// Returns every live pid NOT claimed by a descendant cgroup — i.e. the
    /// processes that implicitly belong to the root.
    fn get_pids(&self, kernel: &Kernel) -> Vec<pid_t> {
        let controlled_pids: HashSet<pid_t> =
            self.pid_table.lock().keys().filter_map(|v| v.upgrade().map(|tg| tg.leader)).collect();
        let kernel_pids = kernel.pids.read().process_ids();
        kernel_pids.into_iter().filter(|pid| !controlled_pids.contains(pid)).collect()
    }

    fn kill(&self) {
        unreachable!("Root cgroup cannot kill its processes.");
    }

    /// The root is never reported as populated.
    fn is_populated(&self) -> bool {
        false
    }

    /// The root cannot be frozen, so its state is always the default
    /// (thawed / thawed).
    fn get_freezer_state(&self) -> CgroupFreezerState {
        Default::default()
    }

    fn freeze(&self, _locked: &mut Locked<FileOpsCore>) {
        unreachable!("Root cgroup cannot freeze any processes.");
    }

    fn thaw(&self) {
        unreachable!("Root cgroup cannot thaw any processes.");
    }
}
318
/// Name-keyed map of a cgroup's immediate children.
#[derive(Debug, Default)]
struct CgroupChildren(BTreeMap<FsString, CgroupHandle>);
321impl CgroupChildren {
322 fn insert_child(&mut self, name: FsString, child: CgroupHandle) -> Result<CgroupHandle, Errno> {
323 let btree_map::Entry::Vacant(child_entry) = self.0.entry(name) else {
324 return error!(EEXIST);
325 };
326 Ok(child_entry.insert(child).clone())
327 }
328
329 fn remove_child(&mut self, name: &FsStr) -> Result<CgroupHandle, Errno> {
330 let btree_map::Entry::Occupied(child_entry) = self.0.entry(name.into()) else {
331 return error!(ENOENT);
332 };
333 let child = child_entry.get();
334
335 let mut child_state = child.state.lock();
336 assert!(!child_state.deleted, "child cannot be deleted");
337
338 child_state.update_processes();
339 if !child_state.processes.is_empty() {
340 return error!(EBUSY);
341 }
342 if !child_state.children.is_empty() {
343 return error!(EBUSY);
344 }
345
346 child_state.deleted = true;
347 drop(child_state);
348
349 Ok(child_entry.remove())
350 }
351
352 fn get_child(&self, name: &FsStr) -> Option<CgroupHandle> {
353 self.0.get(name).cloned()
354 }
355
356 fn get_children(&self) -> Vec<CgroupHandle> {
357 self.0.values().cloned().collect()
358 }
359
360 fn count_descendants(&self) -> u64 {
361 self.0.values().map(|child| 1 + child.count_descendants()).sum()
362 }
363}
364
impl Deref for CgroupChildren {
    type Target = BTreeMap<FsString, CgroupHandle>;

    /// Allows read-only map operations directly on the newtype.
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
372
/// Mutable state of a non-root cgroup, guarded by `Cgroup::state`.
#[derive(Debug, Default)]
struct CgroupState {
    // Immediate children of this cgroup.
    children: CgroupChildren,

    // Thread groups currently in this cgroup. Entries may go stale (the
    // weak key fails to upgrade); `update_processes` prunes them.
    processes: HashSet<ThreadGroupKey>,

    // Set once the cgroup is removed from its parent; mutating operations
    // fail with ENOENT afterwards.
    deleted: bool,

    // Frozen tasks wait on this queue; thawing notifies it to release them.
    wait_queue: WaitQueue,

    // Freezer state set directly on this cgroup.
    self_freezer_state: FreezerState,

    // Freezer state inherited from the parent chain.
    inherited_freezer_state: FreezerState,
}
393
impl CgroupState {
    /// Creates a waiter registered on this cgroup's wait queue; a frozen
    /// task blocks on it until `propagate_thaw` notifies the queue. Signals
    /// are ignored so a pending signal cannot wake a frozen task.
    fn create_freeze_waiter(&self) -> Waiter {
        let waiter = Waiter::with_options(WaiterOptions::IGNORE_SIGNALS);
        self.wait_queue.wait_async(&waiter);
        waiter
    }

    /// Drops entries for thread groups that have exited (weak key fails to
    /// upgrade) or are terminating.
    fn update_processes(&mut self) {
        self.processes.retain(|thread_group| {
            let Some(thread_group) = thread_group.upgrade() else {
                return false;
            };
            let terminating = thread_group.read().is_terminating();
            !terminating
        });
    }

    /// Sends a freeze signal, each carrying its own waiter, to every task in
    /// `thread_group`.
    fn freeze_thread_group<L>(&self, locked: &mut Locked<L>, thread_group: &ThreadGroup)
    where
        L: LockBefore<ThreadGroupLimits>,
    {
        // Snapshot the task list so the thread group's read lock is released
        // before signaling.
        let tasks = thread_group.read().tasks().map(TempRef::into_static).collect::<Vec<_>>();
        for task in tasks {
            send_freeze_signal(locked, &task, self.create_freeze_waiter())
                .expect("sending freeze signal should not fail");
        }
    }

    /// Clears the frozen state on every task in `thread_group` and
    /// interrupts each so it can resume.
    fn thaw_thread_group<L>(&self, _locked: &mut Locked<L>, thread_group: &ThreadGroup)
    where
        L: LockBefore<ThreadGroupLimits>,
    {
        let tasks = thread_group.read().tasks().map(TempRef::into_static).collect::<Vec<_>>();
        for task in tasks {
            task.write().thaw();
            task.interrupt();
        }
    }

    /// Frozen if either this cgroup or an ancestor froze it. Relies on the
    /// derived ordering `Frozen > Thawed`.
    fn get_effective_freezer_state(&self) -> FreezerState {
        std::cmp::max(self.self_freezer_state, self.inherited_freezer_state)
    }

    /// Adds `thread_group` to this cgroup, freezing it immediately if the
    /// cgroup is effectively frozen. Fails with ENOENT if deleted.
    fn add_process<L>(
        &mut self,
        locked: &mut Locked<L>,
        thread_group: &ThreadGroup,
    ) -> Result<(), Errno>
    where
        L: LockBefore<ThreadGroupLimits>,
    {
        if self.deleted {
            return error!(ENOENT);
        }
        self.processes.insert(thread_group.into());

        if self.get_effective_freezer_state() == FreezerState::Frozen {
            self.freeze_thread_group(locked, &thread_group);
        }
        Ok(())
    }

    /// Removes `thread_group` from this cgroup, thawing it if this cgroup
    /// had it frozen. Fails with ENOENT if deleted.
    fn remove_process<L>(
        &mut self,
        locked: &mut Locked<L>,
        thread_group: &ThreadGroup,
    ) -> Result<(), Errno>
    where
        L: LockBefore<ThreadGroupLimits>,
    {
        if self.deleted {
            return error!(ENOENT);
        }
        self.processes.remove(&thread_group.into());

        if self.get_effective_freezer_state() == FreezerState::Frozen {
            self.thaw_thread_group(locked, thread_group);
        }
        Ok(())
    }

    /// Records the freezer state inherited from the parent and, unless this
    /// cgroup was already effectively frozen, freezes its own processes and
    /// recursively freezes its descendants. Children always inherit
    /// `Frozen`, since from their perspective this cgroup is now frozen.
    fn propagate_freeze<L>(&mut self, locked: &mut Locked<L>, inherited_freezer_state: FreezerState)
    where
        L: LockBefore<ThreadGroupLimits>,
    {
        let prev_effective_freezer_state = self.get_effective_freezer_state();
        self.inherited_freezer_state = inherited_freezer_state;
        // Already effectively frozen: processes and descendants were frozen
        // when that transition happened, so there is nothing more to do.
        if prev_effective_freezer_state == FreezerState::Frozen {
            return;
        }

        for thread_group in self.processes.iter() {
            let Some(thread_group) = thread_group.upgrade() else {
                continue;
            };
            self.freeze_thread_group(locked, &thread_group);
        }

        for child in self.children.get_children() {
            child.state.lock().propagate_freeze(locked, FreezerState::Frozen);
        }
    }

    /// Records the inherited freezer state and, if this cgroup is now
    /// effectively thawed, wakes its frozen tasks and recursively thaws
    /// descendants. A descendant frozen in its own right stays frozen (its
    /// self state keeps its effective state `Frozen`).
    fn propagate_thaw(&mut self, inherited_freezer_state: FreezerState) {
        self.inherited_freezer_state = inherited_freezer_state;
        if self.get_effective_freezer_state() == FreezerState::Thawed {
            self.wait_queue.notify_all();
            for child in self.children.get_children() {
                child.state.lock().propagate_thaw(FreezerState::Thawed);
            }
        }
    }

    /// Sends SIGKILL to every live process in this cgroup and, recursively,
    /// in all descendants.
    fn propagate_kill(&self) {
        for thread_group in self.processes.iter() {
            let Some(thread_group) = thread_group.upgrade() else {
                continue;
            };
            thread_group.write().send_signal(SignalInfo::default(SIGKILL));
        }

        for child in self.children.get_children() {
            child.state.lock().propagate_kill();
        }
    }
}
531
/// A non-root cgroup in the v2 hierarchy.
#[derive(Debug)]
pub struct Cgroup {
    /// The hierarchy root, used e.g. to reach the shared pid table.
    root: Weak<CgroupRoot>,

    /// Unique id assigned by the root at creation time.
    id: u64,

    /// Name of this cgroup (the final path component).
    name: FsString,

    /// `None` when the parent is the root cgroup.
    parent: Option<Weak<Cgroup>>,

    /// Mutable state: children, processes, freezer, deletion flag.
    state: Mutex<CgroupState>,

    /// Weak handle to self, stored into the pid table and passed to
    /// children as their parent pointer.
    weak_self: Weak<Cgroup>,
}
pub type CgroupHandle = Arc<Cgroup>;
553
554pub fn path_from_root(weak_cgroup: Option<Weak<Cgroup>>) -> Result<FsString, Errno> {
556 let cgroup = match weak_cgroup {
557 Some(weak_cgroup) => Weak::upgrade(&weak_cgroup).ok_or_else(|| errno!(ENODEV))?,
558 None => return Ok("/".into()),
559 };
560 let mut path = PathBuilder::new();
561 let mut current = Some(cgroup);
562 while let Some(cgroup) = current {
563 path.prepend_element(cgroup.name());
564 current = cgroup.parent()?;
565 }
566 Ok(path.build_absolute())
567}
568
569impl Cgroup {
570 pub fn new(
571 id: u64,
572 name: &FsStr,
573 root: &Weak<CgroupRoot>,
574 parent: Option<Weak<Cgroup>>,
575 ) -> CgroupHandle {
576 Arc::new_cyclic(|weak| Self {
577 id,
578 root: root.clone(),
579 name: name.to_owned(),
580 parent,
581 state: Default::default(),
582 weak_self: weak.clone(),
583 })
584 }
585
586 pub fn name(&self) -> &FsStr {
587 self.name.as_ref()
588 }
589
590 fn root(&self) -> Result<Arc<CgroupRoot>, Errno> {
591 self.root.upgrade().ok_or_else(|| errno!(ENODEV))
592 }
593
594 fn parent(&self) -> Result<Option<CgroupHandle>, Errno> {
597 self.parent.as_ref().map(|weak| weak.upgrade().ok_or_else(|| errno!(ENODEV))).transpose()
598 }
599
600 fn count_descendants(&self) -> u64 {
601 self.state.lock().children.count_descendants()
602 }
603}
604
impl CgroupOps for Cgroup {
    fn id(&self) -> u64 {
        self.id
    }

    /// Moves `thread_group` into this cgroup. The root's pid table lock is
    /// held for the entire move so membership stays consistent.
    fn add_process(
        &self,
        locked: &mut Locked<FileOpsCore>,
        thread_group: &ThreadGroup,
    ) -> Result<(), Errno> {
        let root = self.root()?;
        let mut pid_table = root.pid_table.lock();
        match pid_table.entry(thread_group.into()) {
            hash_map::Entry::Occupied(mut entry) => {
                // Already in this cgroup: nothing to do.
                if std::ptr::eq(self, entry.get().as_ptr()) {
                    return Ok(());
                }

                track_stub!(TODO("https://fxbug.dev/383374687"), "check permissions");
                // Detach from the previous cgroup first (which also thaws
                // the process if that cgroup had it frozen).
                if let Some(other_cgroup) = entry.get().upgrade() {
                    other_cgroup.state.lock().remove_process(locked, thread_group)?;
                }

                self.state.lock().add_process(locked, thread_group)?;
                entry.insert(self.weak_self.clone());
            }
            hash_map::Entry::Vacant(entry) => {
                self.state.lock().add_process(locked, thread_group)?;
                entry.insert(self.weak_self.clone());
            }
        }

        Ok(())
    }

    /// Creates a child that inherits this cgroup's effective freezer state,
    /// so a child created under a frozen cgroup starts out frozen.
    fn new_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno> {
        let id = self.root()?.get_next_id();
        let new_child = Cgroup::new(id, name, &self.root, Some(self.weak_self.clone()));
        let mut state = self.state.lock();
        if state.deleted {
            return error!(ENOENT);
        }
        new_child.state.lock().inherited_freezer_state = state.get_effective_freezer_state();
        state.children.insert_child(name.into(), new_child)
    }

    fn get_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno> {
        let state = self.state.lock();
        state.children.get_child(name).ok_or_else(|| errno!(ENOENT))
    }

    fn remove_child(&self, name: &FsStr) -> Result<CgroupHandle, Errno> {
        let mut state = self.state.lock();
        if state.deleted {
            return error!(ENOENT);
        }
        state.children.remove_child(name)
    }

    fn get_children(&self) -> Result<Vec<CgroupHandle>, Errno> {
        let state = self.state.lock();
        if state.deleted {
            return error!(ENOENT);
        }
        Ok(state.children.get_children())
    }

    /// Returns leader pids of live processes in this cgroup; stale entries
    /// are pruned first.
    fn get_pids(&self, _kernel: &Kernel) -> Vec<pid_t> {
        let mut state = self.state.lock();
        state.update_processes();
        state.processes.iter().filter_map(|v| v.upgrade().map(|tg| tg.leader)).collect()
    }

    /// SIGKILLs every process in this cgroup's subtree.
    fn kill(&self) {
        trace_duration!(CATEGORY_STARNIX, "CgroupKill");
        let state = self.state.lock();
        state.propagate_kill();
    }

    /// True if this cgroup or any descendant still holds a live process.
    fn is_populated(&self) -> bool {
        let mut state = self.state.lock();
        if state.deleted {
            return false;
        }
        state.update_processes();
        if !state.processes.is_empty() {
            return true;
        }

        state.children.get_children().into_iter().any(|child| child.is_populated())
    }

    fn get_freezer_state(&self) -> CgroupFreezerState {
        let state = self.state.lock();
        CgroupFreezerState {
            self_freezer_state: state.self_freezer_state,
            effective_freezer_state: state.get_effective_freezer_state(),
        }
    }

    /// Freezes this cgroup's subtree. NOTE: `propagate_freeze` runs before
    /// `self_freezer_state` is set so it observes the pre-freeze effective
    /// state and can skip work when the cgroup was already frozen.
    fn freeze(&self, locked: &mut Locked<FileOpsCore>) {
        trace_duration!(CATEGORY_STARNIX, "CgroupFreeze");
        let mut state = self.state.lock();
        let inherited_freezer_state = state.inherited_freezer_state;
        state.propagate_freeze(locked, inherited_freezer_state);
        state.self_freezer_state = FreezerState::Frozen;
    }

    /// Thaws this cgroup. NOTE: `self_freezer_state` is cleared before
    /// `propagate_thaw` so the thaw only takes effect when no ancestor
    /// still holds this cgroup frozen.
    fn thaw(&self) {
        trace_duration!(CATEGORY_STARNIX, "CgroupThaw");
        let mut state = self.state.lock();
        state.self_freezer_state = FreezerState::Thawed;
        let inherited_freezer_state = state.inherited_freezer_state;
        state.propagate_thaw(inherited_freezer_state);
    }
}
725
#[cfg(test)]
mod test {
    use super::*;
    use crate::testing::spawn_kernel_and_run;
    use assert_matches::assert_matches;
    use starnix_uapi::signals::SIGCHLD;
    use starnix_uapi::{CLONE_SIGHAND, CLONE_THREAD, CLONE_VM};

    /// `path_from_root` renders absolute paths for nested cgroups.
    #[::fuchsia::test]
    async fn cgroup_path_from_root() {
        spawn_kernel_and_run(async |_, _| {
            let root = CgroupRoot::new();

            let test_cgroup =
                root.new_child("test".into()).expect("new_child on root cgroup succeeds");
            let child_cgroup = test_cgroup
                .new_child("child".into())
                .expect("new_child on non-root cgroup succeeds");

            assert_eq!(path_from_root(Some(Arc::downgrade(&test_cgroup))), Ok("/test".into()));
            assert_eq!(
                path_from_root(Some(Arc::downgrade(&child_cgroup))),
                Ok("/test/child".into())
            );
        })
        .await;
    }

    /// A task cloned into a frozen cgroup receives a pending kernel freeze
    /// signal, so it freezes as soon as it runs.
    #[::fuchsia::test]
    async fn cgroup_clone_task_in_frozen_cgroup() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root = &kernel.cgroups.cgroup2;
            let cgroup = root.new_child("test".into()).expect("new_child on root cgroup succeeds");

            // Put a process into the cgroup, then freeze the cgroup.
            let process = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
            cgroup
                .add_process(locked.cast_locked(), process.thread_group())
                .expect("add process to cgroup");
            cgroup.freeze(locked.cast_locked());
            assert_eq!(cgroup.get_pids(&kernel).first(), Some(process.get_pid()).as_ref());
            assert_eq!(
                root.get_cgroup(process.thread_group()).unwrap().as_ptr(),
                Arc::as_ptr(&cgroup)
            );

            // Clone a thread inside the frozen process.
            let thread = process.clone_task_for_test(
                locked,
                (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM) as u64,
                Some(SIGCHLD),
            );

            // The new thread must carry a queued freeze signal.
            let thread_state = thread.read();
            let kernel_signals = thread_state.kernel_signals_for_test();
            assert_matches!(kernel_signals.front(), Some(KernelSignal::Freeze(_)));
        })
        .await;
    }

    /// Dropping the last reference to a process removes its entry from the
    /// root's pid table.
    #[::fuchsia::test]
    async fn cgroup_tg_release_removes_pid() {
        spawn_kernel_and_run(async |locked, current_task| {
            let kernel = current_task.kernel();
            let root = &kernel.cgroups.cgroup2;
            let cgroup = root.new_child("test".into()).expect("new_child on root cgroup succeeds");

            let process = current_task.clone_task_for_test(locked, 0, Some(SIGCHLD));
            cgroup
                .add_process(locked.cast_locked(), process.thread_group())
                .expect("add process to cgroup");

            assert_eq!(
                root.get_cgroup(process.temp_task().thread_group()).unwrap().as_ptr(),
                Arc::as_ptr(&cgroup)
            );

            drop(process);

            assert!(root.pid_table.lock().is_empty());
        })
        .await;
    }
}