Skip to main content

starnix_modules_overlayfs/
lib.rs

1// Copyright 2023 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#![recursion_limit = "512"]
6
7use fuchsia_rcu::RcuReadScope;
8use once_cell::sync::OnceCell;
9use rand::Rng;
10use starnix_core::fs::tmpfs::{TmpFs, TmpFsDirectory};
11use starnix_core::mm::memory::MemoryObject;
12use starnix_core::security::{self, PermissionFlags};
13use starnix_core::task::{CurrentTask, Kernel};
14use starnix_core::vfs::fs_args::MountParams;
15use starnix_core::vfs::rw_queue::{RwQueueReadGuard, RwQueueWriteGuard};
16use starnix_core::vfs::{
17    AppendLockWriteGuard, CacheMode, CheckAccessReason, DirEntry, DirEntryHandle,
18    DirectoryEntryType, DirentSink, FallocMode, FileHandle, FileObject, FileOps, FileSystem,
19    FileSystemHandle, FileSystemOps, FileSystemOptions, FsNode, FsNodeFlags, FsNodeHandle,
20    FsNodeInfo, FsNodeOps, FsStr, FsString, InputBuffer, MountInfo, OutputBuffer, RenameFlags,
21    SeekTarget, SymlinkTarget, UnlinkKind, ValueOrSize, VecInputBuffer, VecOutputBuffer, XattrOp,
22    default_seek, emit_dotdot, fileops_impl_directory, fileops_impl_noop_sync,
23    fileops_impl_seekable,
24};
25use starnix_logging::{log_error, log_warn, track_stub};
26use starnix_sync::{
27    BeforeFsNodeAppend, FileOpsCore, FsNodeAppend, LockEqualOrBefore, Locked, RwLock,
28    RwLockReadGuard, RwLockWriteGuard, Unlocked,
29};
30use starnix_uapi::auth::{Credentials, FsCred};
31use starnix_uapi::device_id::DeviceId;
32use starnix_uapi::errors::{EEXIST, ENOENT, Errno};
33use starnix_uapi::file_mode::{FileMode, mode};
34use starnix_uapi::open_flags::OpenFlags;
35use starnix_uapi::{errno, error, ino_t, off_t, statfs};
36use std::collections::BTreeSet;
37use std::sync::Arc;
38use syncio::zxio_node_attr_has_t;
39
// Name and value for the xattr used to mark opaque directories in the upper FS.
// See https://docs.kernel.org/filesystems/overlayfs.html#whiteouts-and-opaque-directories
//
// A directory is treated as opaque when the xattr named `OPAQUE_DIR_XATTR` holds exactly
// `OPAQUE_DIR_XATTR_VALUE`.
const OPAQUE_DIR_XATTR: &str = "trusted.overlay.opaque";
const OPAQUE_DIR_XATTR_VALUE: &str = "y";
44
/// A directory entry captured during a `readdir()` pass: the entry name together with the
/// inode number and entry type that were reported to the `DirentSink`.
#[derive(Clone)]
struct DirEntryInfo {
    name: FsString,
    inode_num: ino_t,
    entry_type: DirectoryEntryType,
}
51
/// A list of captured directory entries.
type DirEntries = Vec<DirEntryInfo>;
53
/// A `DirentSink` that accumulates the reported entries in memory.
#[derive(Default)]
struct DirentSinkAdapter {
    // Entries collected so far. Reserved names are not recorded.
    items: Vec<DirEntryInfo>,
    // Offset passed to the most recent `add()` call.
    offset: off_t,
}
59
60impl DirentSink for DirentSinkAdapter {
61    fn add(
62        &mut self,
63        inode_num: ino_t,
64        offset: off_t,
65        entry_type: DirectoryEntryType,
66        name: &FsStr,
67    ) -> Result<(), Errno> {
68        if !DirEntry::is_reserved_name(name) {
69            self.items.push(DirEntryInfo { name: name.to_owned(), inode_num, entry_type });
70        }
71        self.offset = offset;
72        Ok(())
73    }
74
75    fn offset(&self) -> off_t {
76        self.offset
77    }
78}
79
/// Selects how much of a lower node is reproduced when it is promoted to the upper FS.
#[derive(Copy, Clone, Eq, PartialEq)]
enum UpperCopyMode {
    // Regular files are created in the upper FS without copying their content.
    MetadataOnly,
    // Regular files are created in the upper FS and their content is copied as well.
    CopyAll,
}
85
/// A `DirEntry` associated with the mount options. This is required because OverlayFs mostly
/// works at the `DirEntry` level (mounts on the lower, upper and work directories are ignored),
/// but operations must still depend on mount options.
#[derive(Clone)]
struct ActiveEntry {
    // The underlying directory entry.
    entry: DirEntryHandle,
    // Mount information passed along to operations performed on `entry`.
    mount: MountInfo,
}
94
95impl ActiveEntry {
96    fn mapper<'a>(entry: &'a ActiveEntry) -> impl Fn(DirEntryHandle) -> ActiveEntry + 'a {
97        |dir_entry| ActiveEntry { entry: dir_entry, mount: entry.mount.clone() }
98    }
99
    /// Returns the `DirEntryHandle` held by this `ActiveEntry`.
    fn entry(&self) -> &DirEntryHandle {
        &self.entry
    }
103
    /// Returns the `MountInfo` held by this `ActiveEntry`.
    fn mount(&self) -> &MountInfo {
        &self.mount
    }
107
108    fn component_lookup<L>(
109        &self,
110        locked: &mut Locked<L>,
111        current_task: &CurrentTask,
112        name: &FsStr,
113    ) -> Result<Self, Errno>
114    where
115        L: LockEqualOrBefore<FileOpsCore>,
116    {
117        self.entry()
118            .component_lookup(locked, current_task, self.mount(), name)
119            .map(ActiveEntry::mapper(self))
120    }
121
122    fn create_entry<L>(
123        &self,
124        locked: &mut Locked<L>,
125        current_task: &CurrentTask,
126        name: &FsStr,
127        create_node_fn: impl FnOnce(
128            &mut Locked<L>,
129            &FsNodeHandle,
130            &MountInfo,
131            &FsStr,
132        ) -> Result<FsNodeHandle, Errno>,
133    ) -> Result<Self, Errno>
134    where
135        L: LockEqualOrBefore<FileOpsCore>,
136    {
137        self.entry()
138            .create_entry(locked, current_task, self.mount(), name, create_node_fn)
139            .map(ActiveEntry::mapper(self))
140    }
141
142    /// Sets an xattr to mark the directory referenced by `entry` as opaque. Directories that are
143    /// marked as opaque in the upper FS are not merged with the corresponding directories in the
144    /// lower FS.
145    fn set_opaque_xattr<L>(
146        &self,
147        locked: &mut Locked<L>,
148        current_task: &CurrentTask,
149    ) -> Result<(), Errno>
150    where
151        L: LockEqualOrBefore<FileOpsCore>,
152    {
153        self.entry().node.set_xattr(
154            locked,
155            current_task,
156            self.mount(),
157            OPAQUE_DIR_XATTR.into(),
158            OPAQUE_DIR_XATTR_VALUE.into(),
159            XattrOp::Set,
160        )
161    }
162
163    /// Checks if the `entry` is marked as opaque.
164    fn is_opaque_node<L>(&self, locked: &mut Locked<L>, current_task: &CurrentTask) -> bool
165    where
166        L: LockEqualOrBefore<FileOpsCore>,
167    {
168        match self.entry().node.get_xattr(
169            locked,
170            current_task,
171            self.mount(),
172            OPAQUE_DIR_XATTR.into(),
173            OPAQUE_DIR_XATTR_VALUE.len(),
174        ) {
175            Ok(ValueOrSize::Value(v)) if v == OPAQUE_DIR_XATTR_VALUE => true,
176            _ => false,
177        }
178    }
179
180    /// Creates a "whiteout" entry in the directory called `name`. Whiteouts are created by
181    /// overlayfs to denote files and directories that were removed and should not be listed in the
182    /// directory. This is necessary because we cannot remove entries from the lower FS.
183    fn create_whiteout<L>(
184        &self,
185        locked: &mut Locked<L>,
186        current_task: &CurrentTask,
187        name: &FsStr,
188    ) -> Result<ActiveEntry, Errno>
189    where
190        L: LockEqualOrBefore<FileOpsCore>,
191    {
192        self.create_entry(locked, current_task, name, |locked, dir, mount, name| {
193            dir.create_node(
194                locked,
195                current_task,
196                mount,
197                name,
198                FileMode::IFCHR,
199                DeviceId::NONE,
200                FsCred::root(),
201            )
202        })
203    }
204
205    /// Returns `true` if this is a "whiteout".
206    fn is_whiteout(&self) -> bool {
207        let info = self.entry().node.info();
208        info.mode.is_chr() && info.rdev == DeviceId::NONE
209    }
210
211    /// Checks whether the child of this entry represented by `info` is a "whiteout".
212    ///
213    /// Only looks up the corresponding `DirEntry` when necessary.
214    fn is_whiteout_child<L>(
215        &self,
216        locked: &mut Locked<L>,
217        current_task: &CurrentTask,
218        info: &DirEntryInfo,
219    ) -> Result<bool, Errno>
220    where
221        L: LockEqualOrBefore<FileOpsCore>,
222    {
223        // We need to lookup the node only if the file is a char device.
224        if info.entry_type != DirectoryEntryType::CHR {
225            return Ok(false);
226        }
227        let entry = self.component_lookup(locked, current_task, info.name.as_ref())?;
228        Ok(entry.is_whiteout())
229    }
230
231    fn read_dir_entries<L>(
232        &self,
233        locked: &mut Locked<L>,
234        current_task: &CurrentTask,
235    ) -> Result<Vec<DirEntryInfo>, Errno>
236    where
237        L: LockEqualOrBefore<FileOpsCore>,
238    {
239        let mut sink = DirentSinkAdapter::default();
240        self.entry().open_anonymous(locked, current_task, OpenFlags::DIRECTORY)?.readdir(
241            locked,
242            current_task,
243            &mut sink,
244        )?;
245        Ok(sink.items)
246    }
247}
248
/// State of a single node in the overlay: the corresponding entries in the upper and lower
/// filesystems plus what is needed to create the upper entry on demand.
struct OverlayNode {
    // The overlay stack this node belongs to.
    stack: Arc<OverlayStack>,

    // Corresponding `DirEntries` in the lower and the upper filesystems. At least one must be
    // set. Note that we don't care about `NamespaceNode`: overlayfs overlays filesystems
    // (i.e. not namespace subtrees). These directories may not be mounted anywhere.
    // `upper` may be created dynamically whenever write access is required.
    upper: OnceCell<ActiveEntry>,
    lower: Option<ActiveEntry>,

    // `prepare_to_unlink()` may mark `upper` as opaque. In that case we want to skip merging
    // with `lower` in `readdir()`.
    upper_is_opaque: OnceCell<()>,

    // Parent overlay node. It is used to create `upper` when it is missing, so it must be set
    // whenever the node is constructed without an upper entry.
    parent: Option<Arc<OverlayNode>>,
}
265
266impl OverlayNode {
267    fn new(
268        stack: Arc<OverlayStack>,
269        lower: Option<ActiveEntry>,
270        upper: Option<ActiveEntry>,
271        parent: Option<Arc<OverlayNode>>,
272    ) -> Arc<Self> {
273        assert!(upper.is_some() || parent.is_some());
274
275        let upper = match upper {
276            Some(entry) => OnceCell::with_value(entry),
277            None => OnceCell::new(),
278        };
279
280        Arc::new(OverlayNode { stack, upper, lower, upper_is_opaque: OnceCell::new(), parent })
281    }
282
283    fn from_fs_node(node: &FsNodeHandle) -> Result<&Arc<Self>, Errno> {
284        Ok(&node.downcast_ops::<OverlayNodeOps>().ok_or_else(|| errno!(EIO))?.node)
285    }
286
287    fn main_entry(&self) -> &ActiveEntry {
288        self.upper.get().or(self.lower.as_ref()).expect("Expected either upper or lower node")
289    }
290
291    fn init_fs_node_for_child(
292        self: &Arc<OverlayNode>,
293        node: &FsNode,
294        lower: Option<ActiveEntry>,
295        upper: Option<ActiveEntry>,
296    ) -> FsNodeHandle {
297        let entry = upper.as_ref().or(lower.as_ref()).expect("expect either lower or upper node");
298        let ino = entry.entry().node.ino;
299        let info = entry.entry().node.info().clone();
300
301        // Parent may be needed to initialize `upper`. We don't need to pass it if we have `upper`.
302        let parent = if upper.is_some() { None } else { Some(self.clone()) };
303
304        let overlay_node =
305            OverlayNodeOps { node: OverlayNode::new(self.stack.clone(), lower, upper, parent) };
306        FsNode::new_uncached(ino, overlay_node, &node.fs(), info, FsNodeFlags::empty())
307    }
308
    /// If the file is currently in the lower FS, then promote it to the upper FS. No-op if the
    /// file is already in the upper FS.
    ///
    /// Delegates to `ensure_upper_maybe_copy()` with `UpperCopyMode::CopyAll` and returns the
    /// upper entry on success.
    fn ensure_upper<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        fs: &FileSystem,
    ) -> Result<&ActiveEntry, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.ensure_upper_maybe_copy(locked, current_task, UpperCopyMode::CopyAll, fs)
    }
322
    /// Same as `ensure_upper()`, but allows to skip copying of the file content.
    ///
    /// When `upper` is already set it is returned as is. Otherwise the parent is promoted
    /// first and a counterpart of the lower node is created under the parent's upper entry,
    /// with the same name, mode and credentials as the lower node:
    ///  - symlinks are recreated with the same target path (`EIO` if the target is a node),
    ///  - regular files are created and their content copied when `copy_mode` is `CopyAll`,
    ///  - everything else (including regular files with `MetadataOnly`) is created with
    ///    `create_node()` using the lower node's mode and device id.
    ///
    /// Panics if `upper` is missing and either `lower` or `parent` is not set.
    fn ensure_upper_maybe_copy<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        copy_mode: UpperCopyMode,
        fs: &FileSystem,
    ) -> Result<&ActiveEntry, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        // The initializer runs only when `upper` is not set yet; an error leaves it unset.
        self.upper.get_or_try_init(|| {
            let lower = self.lower.as_ref().expect("lower is expected when upper is missing");
            let parent = self.parent.as_ref().expect("Parent is expected when upper is missing");
            // The new entry is created under the parent's upper entry, so promote it first.
            let parent_upper = parent.ensure_upper(locked, current_task, fs)?;
            let name = lower.entry.local_name(&RcuReadScope::new()).to_owned();
            // Snapshot the lower node's info so it is not borrowed during the copy-up.
            let info = {
                let info = lower.entry.node.info();
                info.clone()
            };
            let cred = info.cred();

            // The copy-up runs with a copy of the mounter's credentials, which the security
            // hook is given a chance to adjust.
            let mut copy_up_creds = Credentials::clone(&self.stack.mounter);
            security::fs_node_copy_up(current_task, &lower.entry.node, fs, &mut copy_up_creds);
            let res = current_task.override_creds(Arc::new(copy_up_creds), || {
                if info.mode.is_lnk() {
                    let link_target = lower.entry.node.readlink(locked, current_task)?;
                    // Only path targets can be reproduced in the upper FS.
                    let link_path = match &link_target {
                        SymlinkTarget::Node(_) => return error!(EIO),
                        SymlinkTarget::Path(path) => path,
                    };
                    parent_upper.create_entry(
                        locked,
                        current_task,
                        name.as_ref(),
                        |locked, dir, mount, name| {
                            dir.create_symlink(
                                locked,
                                current_task,
                                mount,
                                name,
                                link_path.as_ref(),
                                cred,
                            )
                        },
                    )
                } else if info.mode.is_reg() && copy_mode == UpperCopyMode::CopyAll {
                    // Regular files need to be copied from lower FS to upper FS.
                    // NOTE(review): `create_upper_entry()` is defined outside this view; it
                    // presumably stages the entry before moving it into place - confirm.
                    self.stack.create_upper_entry(
                        locked,
                        current_task,
                        parent_upper,
                        name.as_ref(),
                        |locked, dir, name| {
                            dir.create_entry(
                                locked,
                                current_task,
                                name,
                                |locked, dir_node, mount, name| {
                                    dir_node.create_node(
                                        locked,
                                        current_task,
                                        mount,
                                        name,
                                        info.mode,
                                        DeviceId::NONE,
                                        cred,
                                    )
                                },
                            )
                        },
                        |locked, entry| copy_file_content(locked, current_task, lower, &entry),
                    )
                } else {
                    // Directories, special nodes and metadata-only copies of regular files:
                    // create a node with the lower node's mode and device id.
                    parent_upper.create_entry(
                        locked,
                        current_task,
                        name.as_ref(),
                        |locked, dir, mount, name| {
                            dir.create_node(
                                locked,
                                current_task,
                                mount,
                                name,
                                info.mode,
                                info.rdev,
                                cred,
                            )
                        },
                    )
                }
            });

            track_stub!(TODO("https://fxbug.dev/322874151"), "overlayfs copy xattrs");
            res
        })
    }
420
    /// Checks if this node exists in the lower FS, i.e. whether `lower` is set.
    fn has_lower(&self) -> bool {
        self.lower.is_some()
    }
425
426    /// Check that an item isn't present in the lower FS.
427    fn lower_entry_exists<L>(
428        &self,
429        locked: &mut Locked<L>,
430        current_task: &CurrentTask,
431        name: &FsStr,
432    ) -> Result<bool, Errno>
433    where
434        L: LockEqualOrBefore<FileOpsCore>,
435    {
436        match &self.lower {
437            Some(lower) => match lower.component_lookup(locked, current_task, name) {
438                Ok(entry) => Ok(!entry.is_whiteout()),
439                Err(err) if err.code == ENOENT => Ok(false),
440                Err(err) => Err(err),
441            },
442            None => Ok(false),
443        }
444    }
445
446    /// Helper used to create a new entry in the directory. It first checks that the target node
447    /// doesn't exist. Then `do_create` is called to create the new node in the work dir, which
448    /// is then moved to the target dir in the upper file system.
449    ///
450    /// It's assumed that the calling `DirEntry` has the current directory locked, so it is not
451    /// supposed to change while this method is executed. Note that OveralayFS doesn't handle
452    /// the case when the underlying file systems are changed directly, but that restriction
453    /// is not enforced.
454    fn create_entry<F, L>(
455        self: &Arc<OverlayNode>,
456        locked: &mut Locked<L>,
457        node: &FsNode,
458        current_task: &CurrentTask,
459        name: &FsStr,
460        do_create: F,
461    ) -> Result<ActiveEntry, Errno>
462    where
463        F: Fn(&mut Locked<L>, &ActiveEntry, &FsStr) -> Result<ActiveEntry, Errno>,
464        L: LockEqualOrBefore<FileOpsCore>,
465    {
466        let upper = self.ensure_upper(locked, current_task, &node.fs())?;
467
468        match upper.component_lookup(locked, current_task, name) {
469            Ok(existing) => {
470                // If there is an entry in the upper dir, then it must be a whiteout.
471                if !existing.is_whiteout() {
472                    return error!(EEXIST);
473                }
474            }
475
476            Err(e) if e.code == ENOENT => {
477                // If we don't have the entry in the upper fs, then check lower.
478                if self.lower_entry_exists(locked, current_task, name)? {
479                    return error!(EEXIST);
480                }
481            }
482            Err(e) => return Err(e),
483        };
484
485        self.stack.create_upper_entry(
486            locked,
487            current_task,
488            upper,
489            name,
490            |locked, entry, fs| do_create(locked, entry, fs),
491            |_, _entry| Ok(()),
492        )
493    }
494
495    /// An overlay directory may appear empty when the corresponding upper dir isn't empty:
496    /// it may contain a number of whiteout entries. In that case the whiteouts need to be
497    /// unlinked before the upper directory can be unlinked as well.
498    /// `prepare_to_unlink()` checks that the directory doesn't contain anything other
499    /// than whiteouts and if that is the case then it unlinks all of them.
500    fn prepare_to_unlink<L>(
501        self: &Arc<OverlayNode>,
502        locked: &mut Locked<L>,
503        current_task: &CurrentTask,
504    ) -> Result<(), Errno>
505    where
506        L: LockEqualOrBefore<FileOpsCore>,
507    {
508        if self.main_entry().entry().node.is_dir() {
509            let mut lower_entries = BTreeSet::new();
510            if let Some(dir) = &self.lower {
511                for item in dir.read_dir_entries(locked, current_task)?.drain(..) {
512                    if !dir.is_whiteout_child(locked, current_task, &item)? {
513                        lower_entries.insert(item.name);
514                    }
515                }
516            }
517
518            if let Some(dir) = self.upper.get() {
519                let mut to_remove = Vec::<FsString>::new();
520                for item in dir.read_dir_entries(locked, current_task)?.drain(..) {
521                    if !dir.is_whiteout_child(locked, current_task, &item)? {
522                        return error!(ENOTEMPTY);
523                    }
524                    lower_entries.remove(&item.name);
525                    to_remove.push(item.name);
526                }
527
528                if !lower_entries.is_empty() {
529                    return error!(ENOTEMPTY);
530                }
531
532                // Mark the directory as opaque. Children can be removed after this.
533                dir.set_opaque_xattr(locked, current_task)?;
534                let _ = self.upper_is_opaque.set(());
535
536                // Finally, remove the children.
537                for name in to_remove.iter() {
538                    dir.entry().unlink(
539                        locked,
540                        current_task,
541                        dir.mount(),
542                        name.as_ref(),
543                        UnlinkKind::NonDirectory,
544                        false,
545                    )?;
546                }
547            }
548        }
549
550        Ok(())
551    }
552
553    fn as_mounter<R, F: FnOnce() -> R>(&self, current_task: &CurrentTask, do_work: F) -> R {
554        current_task.override_creds(self.stack.mounter.clone(), do_work)
555    }
556}
557
/// `FsNodeOps` implementation for overlay nodes. It wraps the shared `OverlayNode` state that
/// the node operations act on.
struct OverlayNodeOps {
    node: Arc<OverlayNode>,
}
561
562impl FsNodeOps for OverlayNodeOps {
563    fn check_access(
564        &self,
565        locked: &mut Locked<FileOpsCore>,
566        node: &FsNode,
567        current_task: &CurrentTask,
568        access: security::PermissionFlags,
569        info: &RwLock<FsNodeInfo>,
570        reason: CheckAccessReason,
571        audit_context: security::Auditable<'_>,
572    ) -> Result<(), Errno> {
573        node.default_check_access_impl(current_task, access, reason, info.read(), audit_context)?;
574
575        self.node.as_mounter(current_task, || {
576            if let Some(entry) = self.node.upper.get() {
577                entry.entry.node.check_access(
578                    locked,
579                    current_task,
580                    entry.mount(),
581                    access,
582                    reason,
583                    audit_context,
584                )
585            } else {
586                let entry = self.node.lower.as_ref().expect("Either upper or lower node is set");
587                let lower_node = &entry.entry.node;
588
589                // If the lower node is a regular file, directory or symlink then opening it for
590                // write access will cause it to be copied-up, so the mounter only requires read
591                // access to the underlying node.
592                //
593                // If the lower node is "special" (i.e. a device, FIFO or socket) then writes will
594                // affect the underlying resource, so to avoid privilege escalation via overlays,
595                // the mounter is still required to have write access to the node. This works
596                // even if the lower filesystem is readonly because special nodes remain writable
597                // in that case (though they may not be modified or unlinked, which would require
598                // actually writing to the filesystem).
599                let mut access = access;
600                if access.contains(PermissionFlags::WRITE) && !lower_node.info().mode.is_special() {
601                    // Verify that the mounter will be able to write to copy-up the node.
602                    // TODO: https://fxbug.dev/403260093 - Fix this to also verify discretionary
603                    // write access to the mounter, while correctly taking into account the
604                    // `context=` mount option (if any) for the mandatory write access check.
605                    security::fs_node_permission(
606                        current_task,
607                        node,
608                        PermissionFlags::WRITE,
609                        audit_context,
610                    )?;
611
612                    access |= PermissionFlags::READ;
613                    access &= !(PermissionFlags::WRITE | PermissionFlags::APPEND);
614                }
615
616                lower_node.check_access(
617                    locked,
618                    current_task,
619                    &entry.mount,
620                    access,
621                    reason,
622                    audit_context,
623                )
624            }
625        })
626    }
627
628    fn create_file_ops(
629        &self,
630        locked: &mut Locked<FileOpsCore>,
631        node: &FsNode,
632        current_task: &CurrentTask,
633        flags: OpenFlags,
634    ) -> Result<Box<dyn FileOps>, Errno> {
635        self.node.as_mounter(current_task, || {
636            if flags.can_write() {
637                // Only upper FS can be writable.
638                let copy_mode = if flags.contains(OpenFlags::TRUNC) {
639                    UpperCopyMode::MetadataOnly
640                } else {
641                    UpperCopyMode::CopyAll
642                };
643                self.node.ensure_upper_maybe_copy(locked, current_task, copy_mode, &node.fs())?;
644            }
645
646            let ops: Box<dyn FileOps> = if node.is_dir() {
647                Box::new(OverlayDirectory {
648                    node: self.node.clone(),
649                    dir_entries: Default::default(),
650                })
651            } else {
652                let state =
653                    match (self.node.upper.get(), &self.node.lower) {
654                        (Some(upper), _) => OverlayFileState::Upper(upper.entry().open_anonymous(
655                            locked,
656                            current_task,
657                            flags,
658                        )?),
659                        (None, Some(lower)) => OverlayFileState::Lower(
660                            lower.entry().open_anonymous(locked, current_task, flags)?,
661                        ),
662                        _ => panic!("Expected either upper or lower node"),
663                    };
664
665                Box::new(OverlayFile { node: self.node.clone(), flags, state: RwLock::new(state) })
666            };
667
668            Ok(ops)
669        })
670    }
671
672    fn lookup(
673        &self,
674        locked: &mut Locked<FileOpsCore>,
675        node: &FsNode,
676        current_task: &CurrentTask,
677        name: &FsStr,
678    ) -> Result<FsNodeHandle, Errno> {
679        self.node.as_mounter(current_task, || {
680            let resolve_child = |locked: &mut Locked<FileOpsCore>,
681                                 dir_opt: Option<&ActiveEntry>| {
682                // TODO(sergeyu): lookup() checks access, but we don't need that here.
683                dir_opt
684                    .as_ref()
685                    .map(|dir| match dir.component_lookup(locked, current_task, name) {
686                        Ok(entry) => Some(Ok(entry)),
687                        Err(e) if e.code == ENOENT => None,
688                        Err(e) => Some(Err(e)),
689                    })
690                    .flatten()
691                    .transpose()
692            };
693
694            let upper: Option<ActiveEntry> = resolve_child(locked, self.node.upper.get())?;
695
696            let (upper_is_dir, upper_is_opaque) = match &upper {
697                Some(upper) if upper.is_whiteout() => return error!(ENOENT),
698                Some(upper) => {
699                    let is_dir = upper.entry().node.is_dir();
700                    let is_opaque = !is_dir || upper.is_opaque_node(locked, current_task);
701                    (is_dir, is_opaque)
702                }
703                None => (false, false),
704            };
705
706            let parent_upper_is_opaque = self.node.upper_is_opaque.get().is_some();
707
708            // We don't need to resolve the lower node if we have an opaque node in the upper dir.
709            let lookup_lower = !parent_upper_is_opaque && !upper_is_opaque;
710            let lower: Option<ActiveEntry> = if lookup_lower {
711                match resolve_child(locked, self.node.lower.as_ref())? {
712                    // If the upper node is a directory and the lower isn't then ignore the lower node.
713                    Some(lower) if upper_is_dir && !lower.entry().node.is_dir() => None,
714                    Some(lower) if lower.is_whiteout() => None,
715                    result => result,
716                }
717            } else {
718                None
719            };
720
721            if upper.is_none() && lower.is_none() {
722                return error!(ENOENT);
723            }
724
725            Ok(self.node.init_fs_node_for_child(node, lower, upper))
726        })
727    }
728
729    fn mknod(
730        &self,
731        locked: &mut Locked<FileOpsCore>,
732        node: &FsNode,
733        current_task: &CurrentTask,
734        name: &FsStr,
735        mode: FileMode,
736        dev: DeviceId,
737        owner: FsCred,
738    ) -> Result<FsNodeHandle, Errno> {
739        if mode.fmt() == FileMode::IFCHR && dev == DeviceId::NONE {
740            // Callers are blocked from creating character device nodes with Id zero, which would
741            // be indistuinguishable from those created to represent whiteouts.
742            return error!(EPERM);
743        }
744        let mut creds = Credentials::clone(&self.node.stack.mounter);
745        security::dentry_create_files_as(current_task, node, mode, name, &mut creds)?;
746        current_task.override_creds(Arc::new(creds), || {
747            let new_upper_node = self.node.create_entry(
748                locked,
749                node,
750                current_task,
751                name,
752                |locked, dir, temp_name| {
753                    dir.create_entry(
754                        locked,
755                        current_task,
756                        temp_name,
757                        |locked, dir_node, mount, name| {
758                            dir_node.create_node(
759                                locked,
760                                current_task,
761                                mount,
762                                name,
763                                mode,
764                                dev,
765                                owner.clone(),
766                            )
767                        },
768                    )
769                },
770            )?;
771            Ok(self.node.init_fs_node_for_child(node, None, Some(new_upper_node)))
772        })
773    }
774
775    fn mkdir(
776        &self,
777        locked: &mut Locked<FileOpsCore>,
778        node: &FsNode,
779        current_task: &CurrentTask,
780        name: &FsStr,
781        mode: FileMode,
782        owner: FsCred,
783    ) -> Result<FsNodeHandle, Errno> {
784        let mut creds = Credentials::clone(&self.node.stack.mounter);
785        security::dentry_create_files_as(current_task, node, mode, name, &mut creds)?;
786        current_task.override_creds(Arc::new(creds), || {
787            let new_upper_node = self.node.create_entry(
788                locked,
789                node,
790                current_task,
791                name,
792                |locked, dir, temp_name| {
793                    let entry = dir.create_entry(
794                        locked,
795                        current_task,
796                        temp_name,
797                        |locked, dir_node, mount, name| {
798                            dir_node.create_node(
799                                locked,
800                                current_task,
801                                mount,
802                                name,
803                                mode,
804                                DeviceId::NONE,
805                                owner.clone(),
806                            )
807                        },
808                    )?;
809
810                    // Set opaque attribute to ensure the new directory is not merged with lower.
811                    entry.set_opaque_xattr(locked, current_task)?;
812
813                    Ok(entry)
814                },
815            )?;
816
817            Ok(self.node.init_fs_node_for_child(node, None, Some(new_upper_node)))
818        })
819    }
820
821    fn create_symlink(
822        &self,
823        locked: &mut Locked<FileOpsCore>,
824        node: &FsNode,
825        current_task: &CurrentTask,
826        name: &FsStr,
827        target: &FsStr,
828        owner: FsCred,
829    ) -> Result<FsNodeHandle, Errno> {
830        let mut creds = Credentials::clone(&self.node.stack.mounter);
831        security::dentry_create_files_as(current_task, node, FileMode::IFLNK, name, &mut creds)?;
832        current_task.override_creds(Arc::new(creds), || {
833            let new_upper_node = self.node.create_entry(
834                locked,
835                node,
836                current_task,
837                name,
838                |locked, dir, temp_name| {
839                    dir.create_entry(
840                        locked,
841                        current_task,
842                        temp_name,
843                        |locked, dir_node, mount, name| {
844                            dir_node.create_symlink(
845                                locked,
846                                current_task,
847                                mount,
848                                name,
849                                target,
850                                owner.clone(),
851                            )
852                        },
853                    )
854                },
855            )?;
856            Ok(self.node.init_fs_node_for_child(node, None, Some(new_upper_node)))
857        })
858    }
859
860    fn readlink(
861        &self,
862        locked: &mut Locked<FileOpsCore>,
863        _node: &FsNode,
864        current_task: &CurrentTask,
865    ) -> Result<SymlinkTarget, Errno> {
866        self.node.as_mounter(current_task, || {
867            self.node.main_entry().entry().node.readlink(locked, current_task)
868        })
869    }
870
871    fn link(
872        &self,
873        locked: &mut Locked<FileOpsCore>,
874        node: &FsNode,
875        current_task: &CurrentTask,
876        name: &FsStr,
877        child: &FsNodeHandle,
878    ) -> Result<(), Errno> {
879        self.node.as_mounter(current_task, || {
880            let child_overlay = OverlayNode::from_fs_node(child)?;
881            let upper_child = child_overlay.ensure_upper(locked, current_task, &node.fs())?;
882            self.node.create_entry(
883                locked,
884                node,
885                current_task,
886                name,
887                |locked, dir, temp_name| {
888                    dir.create_entry(
889                        locked,
890                        current_task,
891                        temp_name,
892                        |locked, dir_node, mount, name| {
893                            dir_node.link(
894                                locked,
895                                current_task,
896                                mount,
897                                name,
898                                &upper_child.entry().node,
899                            )
900                        },
901                    )
902                },
903            )?;
904            Ok(())
905        })
906    }
907
908    fn unlink(
909        &self,
910        locked: &mut Locked<FileOpsCore>,
911        node: &FsNode,
912        current_task: &CurrentTask,
913        name: &FsStr,
914        child: &FsNodeHandle,
915    ) -> Result<(), Errno> {
916        self.node.as_mounter(current_task, || {
917            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?;
918            let child_overlay = OverlayNode::from_fs_node(child)?;
919            child_overlay.prepare_to_unlink(locked, current_task)?;
920
921            let need_whiteout = self.node.lower_entry_exists(locked, current_task, name)?;
922            if need_whiteout {
923                self.node.stack.create_upper_entry(
924                    locked,
925                    current_task,
926                    &upper,
927                    &name,
928                    |locked, work, name| work.create_whiteout(locked, current_task, name),
929                    |_, _entry| Ok(()),
930                )?;
931            } else if let Some(child_upper) = child_overlay.upper.get() {
932                let kind = if child_upper.entry().node.is_dir() {
933                    UnlinkKind::Directory
934                } else {
935                    UnlinkKind::NonDirectory
936                };
937                upper.entry().unlink(locked, current_task, upper.mount(), name, kind, false)?;
938            }
939
940            Ok(())
941        })
942    }
943
944    fn fetch_and_refresh_info<'a>(
945        &self,
946        locked: &mut Locked<FileOpsCore>,
947        _node: &FsNode,
948        current_task: &CurrentTask,
949        info: &'a RwLock<FsNodeInfo>,
950    ) -> Result<RwLockReadGuard<'a, FsNodeInfo>, Errno> {
951        self.node.as_mounter(current_task, || {
952            let underlying_node = &self.node.main_entry().entry().node;
953            // Work-around to ensure that mounter `getattr` access is required when a caller tries
954            // to `stat()` a file.
955            security::check_fs_node_getattr_access(current_task, underlying_node)?;
956            let real_info = underlying_node.fetch_and_refresh_info(locked, current_task)?.clone();
957            let mut lock = info.write();
958            *lock = real_info;
959            Ok(RwLockWriteGuard::downgrade(lock))
960        })
961    }
962
963    // Work-around to allow the append-only writes to proceed without `getattr` access checks,
964    // which `fetch_and_refresh_info()`, above, would otherwise introduce.
965    fn get_size(
966        &self,
967        locked: &mut Locked<FileOpsCore>,
968        _node: &FsNode,
969        current_task: &CurrentTask,
970    ) -> Result<usize, Errno> {
971        self.node.as_mounter(current_task, || {
972            self.node.main_entry().entry().node.get_size(locked, current_task)
973        })
974    }
975
976    fn update_attributes(
977        &self,
978        locked: &mut Locked<FileOpsCore>,
979        node: &FsNode,
980        current_task: &CurrentTask,
981        new_info: &FsNodeInfo,
982        has: zxio_node_attr_has_t,
983    ) -> Result<(), Errno> {
984        self.node.as_mounter(current_task, || {
985            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?.entry();
986            upper.node.update_attributes(locked, current_task, |info| {
987                if has.modification_time {
988                    info.time_modify = new_info.time_modify;
989                }
990                if has.access_time {
991                    info.time_access = new_info.time_access;
992                }
993                if has.mode {
994                    info.mode = new_info.mode;
995                }
996                if has.uid {
997                    info.uid = new_info.uid;
998                }
999                if has.gid {
1000                    info.gid = new_info.gid;
1001                }
1002                if has.rdev {
1003                    info.rdev = new_info.rdev;
1004                }
1005                Ok(())
1006            })
1007        })
1008    }
1009
1010    fn append_lock_read<'a>(
1011        &'a self,
1012        locked: &'a mut Locked<BeforeFsNodeAppend>,
1013        node: &'a FsNode,
1014        current_task: &CurrentTask,
1015    ) -> Result<(RwQueueReadGuard<'a, FsNodeAppend>, &'a mut Locked<FsNodeAppend>), Errno> {
1016        self.node.as_mounter(current_task, || {
1017            let upper_node =
1018                self.node.ensure_upper(locked, current_task, &node.fs())?.entry.node.as_ref();
1019            upper_node.ops().append_lock_read(locked, upper_node, current_task)
1020        })
1021    }
1022
1023    fn append_lock_write<'a>(
1024        &'a self,
1025        locked: &'a mut Locked<BeforeFsNodeAppend>,
1026        node: &'a FsNode,
1027        current_task: &CurrentTask,
1028    ) -> Result<(RwQueueWriteGuard<'a, FsNodeAppend>, &'a mut Locked<FsNodeAppend>), Errno> {
1029        self.node.as_mounter(current_task, || {
1030            let upper_node =
1031                self.node.ensure_upper(locked, current_task, &node.fs())?.entry.node.as_ref();
1032            upper_node.ops().append_lock_write(locked, upper_node, current_task)
1033        })
1034    }
1035
1036    fn truncate(
1037        &self,
1038        locked: &mut Locked<FileOpsCore>,
1039        guard: &AppendLockWriteGuard<'_>,
1040        node: &FsNode,
1041        current_task: &CurrentTask,
1042        length: u64,
1043    ) -> Result<(), Errno> {
1044        self.node.as_mounter(current_task, || {
1045            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?;
1046
1047            upper.entry().node.truncate_locked(locked, guard, current_task, length)
1048        })
1049    }
1050
1051    fn allocate(
1052        &self,
1053        locked: &mut Locked<FileOpsCore>,
1054        guard: &AppendLockWriteGuard<'_>,
1055        node: &FsNode,
1056        current_task: &CurrentTask,
1057        mode: FallocMode,
1058        offset: u64,
1059        length: u64,
1060    ) -> Result<(), Errno> {
1061        self.node.as_mounter(current_task, || {
1062            let node = &self.node.ensure_upper(locked, current_task, &node.fs())?.entry().node;
1063            node.fallocate_locked(locked, guard, current_task, mode, offset, length)
1064        })
1065    }
1066
1067    fn get_xattr(
1068        &self,
1069        locked: &mut Locked<FileOpsCore>,
1070        _node: &FsNode,
1071        current_task: &CurrentTask,
1072        name: &FsStr,
1073        max_size: usize,
1074    ) -> Result<ValueOrSize<FsString>, Errno> {
1075        let entry = self
1076            .node
1077            .upper
1078            .get()
1079            .or(self.node.lower.as_ref())
1080            .expect("expect either lower or upper node");
1081        self.node.as_mounter(current_task, || {
1082            entry.entry().node.get_xattr(locked, current_task, &entry.mount, name, max_size)
1083        })
1084    }
1085
1086    fn set_xattr(
1087        &self,
1088        locked: &mut Locked<FileOpsCore>,
1089        node: &FsNode,
1090        current_task: &CurrentTask,
1091        name: &FsStr,
1092        value: &FsStr,
1093        op: XattrOp,
1094    ) -> Result<(), Errno> {
1095        self.node.as_mounter(current_task, || {
1096            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?;
1097            upper.entry().node.set_xattr(locked, current_task, &upper.mount, name, value, op)
1098        })
1099    }
1100
1101    fn remove_xattr(
1102        &self,
1103        locked: &mut Locked<FileOpsCore>,
1104        node: &FsNode,
1105        current_task: &CurrentTask,
1106        name: &FsStr,
1107    ) -> Result<(), Errno> {
1108        self.node.as_mounter(current_task, || {
1109            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?;
1110            upper.entry().node.remove_xattr(locked, current_task, &upper.mount, name)
1111        })
1112    }
1113
1114    fn list_xattrs(
1115        &self,
1116        locked: &mut Locked<FileOpsCore>,
1117        _node: &FsNode,
1118        current_task: &CurrentTask,
1119        max_size: usize,
1120    ) -> Result<ValueOrSize<Vec<FsString>>, Errno> {
1121        self.node.as_mounter(current_task, || {
1122            let entry = self
1123                .node
1124                .upper
1125                .get()
1126                .or(self.node.lower.as_ref())
1127                .expect("expect either lower or upper node");
1128            entry.entry().node.list_xattrs(locked, current_task, max_size)
1129        })
1130    }
1131}
/// `FileOps` state for an open overlay directory.
struct OverlayDirectory {
    /// The overlay node this directory was opened from.
    node: Arc<OverlayNode>,
    /// Cached listing; rebuilt by `refresh_dir_entries()` and served by `readdir()`.
    dir_entries: RwLock<DirEntries>,
}
1136
1137impl OverlayDirectory {
1138    fn refresh_dir_entries<L>(
1139        &self,
1140        locked: &mut Locked<L>,
1141        current_task: &CurrentTask,
1142    ) -> Result<(), Errno>
1143    where
1144        L: LockEqualOrBefore<FileOpsCore>,
1145    {
1146        let mut entries = DirEntries::new();
1147
1148        let upper_is_opaque = self.node.upper_is_opaque.get().is_some();
1149        let merge_with_lower = self.node.lower.is_some() && !upper_is_opaque;
1150
1151        // First enumerate entries in the upper dir. Then enumerate the lower dir and add only
1152        // items that are not present in the upper.
1153        let mut upper_set = BTreeSet::new();
1154        if let Some(dir) = self.node.upper.get() {
1155            for item in dir.read_dir_entries(locked, current_task)?.drain(..) {
1156                // Fill `upper_set` only if we will need it later.
1157                if merge_with_lower {
1158                    upper_set.insert(item.name.clone());
1159                }
1160                if !dir.is_whiteout_child(locked, current_task, &item)? {
1161                    entries.push(item);
1162                }
1163            }
1164        }
1165
1166        if merge_with_lower {
1167            if let Some(dir) = &self.node.lower {
1168                for item in dir.read_dir_entries(locked, current_task)?.drain(..) {
1169                    if !upper_set.contains(&item.name)
1170                        && !dir.is_whiteout_child(locked, current_task, &item)?
1171                    {
1172                        entries.push(item);
1173                    }
1174                }
1175            }
1176        }
1177
1178        *self.dir_entries.write() = entries;
1179
1180        Ok(())
1181    }
1182}
1183
1184impl FileOps for OverlayDirectory {
1185    fileops_impl_directory!();
1186    fileops_impl_noop_sync!();
1187
1188    fn seek(
1189        &self,
1190        _locked: &mut Locked<FileOpsCore>,
1191        _file: &FileObject,
1192        current_task: &CurrentTask,
1193        current_offset: off_t,
1194        target: SeekTarget,
1195    ) -> Result<off_t, Errno> {
1196        self.node
1197            .as_mounter(current_task, || default_seek(current_offset, target, || error!(EINVAL)))
1198    }
1199
1200    fn readdir(
1201        &self,
1202        locked: &mut Locked<FileOpsCore>,
1203        file: &FileObject,
1204        current_task: &CurrentTask,
1205        sink: &mut dyn DirentSink,
1206    ) -> Result<(), Errno> {
1207        self.node.as_mounter(current_task, || {
1208            if sink.offset() == 0 {
1209                self.refresh_dir_entries(locked, current_task)?;
1210            }
1211
1212            emit_dotdot(file, sink)?;
1213
1214            for item in self.dir_entries.read().iter().skip(sink.offset() as usize - 2) {
1215                sink.add(item.inode_num, sink.offset() + 1, item.entry_type, item.name.as_ref())?;
1216            }
1217
1218            Ok(())
1219        })
1220    }
1221}
1222
/// The underlying open file that backs an `OverlayFile`.
enum OverlayFileState {
    /// Backed by a lower-layer file; `OverlayFile::read()` replaces it with `Upper` once the
    /// node has an upper entry.
    Lower(FileHandle),
    /// Backed by an upper-layer file; the only state `OverlayFile::write()` accepts.
    Upper(FileHandle),
}
1227
1228impl OverlayFileState {
1229    fn file(&self) -> &FileHandle {
1230        match self {
1231            Self::Lower(f) | Self::Upper(f) => f,
1232        }
1233    }
1234}
1235
/// `FileOps` state for an open overlay file.
struct OverlayFile {
    /// The overlay node this file was opened from.
    node: Arc<OverlayNode>,
    /// Open flags; reused when `read()` reopens the file from the upper FS.
    flags: OpenFlags,
    /// The underlying file that operations are currently forwarded to.
    state: RwLock<OverlayFileState>,
}
1241
1242impl FileOps for OverlayFile {
1243    fileops_impl_seekable!();
1244
1245    fn read(
1246        &self,
1247        locked: &mut Locked<FileOpsCore>,
1248        _file: &FileObject,
1249        current_task: &CurrentTask,
1250        offset: usize,
1251        data: &mut dyn OutputBuffer,
1252    ) -> Result<usize, Errno> {
1253        self.node.as_mounter(current_task, || {
1254            let mut state = self.state.read();
1255
1256            // Check if the file was promoted to the upper FS. In that case we need to reopen it
1257            // from there.
1258            if let Some(upper) = self.node.upper.get() {
1259                if matches!(*state, OverlayFileState::Lower(_)) {
1260                    std::mem::drop(state);
1261
1262                    {
1263                        let mut write_state = self.state.write();
1264
1265                        // TODO(mariagl): don't hold write_state while calling open_anonymous.
1266                        // It may call back into read(), causing lock order inversion.
1267                        *write_state = OverlayFileState::Upper(upper.entry().open_anonymous(
1268                            locked,
1269                            current_task,
1270                            self.flags,
1271                        )?);
1272                    }
1273                    state = self.state.read();
1274                }
1275            }
1276
1277            // TODO(mariagl): Drop state here
1278            let file = state.file();
1279            security::file_permission(current_task, &file, security::PermissionFlags::READ)?;
1280            file.ops().read(locked, file, current_task, offset, data)
1281        })
1282    }
1283
1284    fn write(
1285        &self,
1286        locked: &mut Locked<FileOpsCore>,
1287        _file: &FileObject,
1288        current_task: &CurrentTask,
1289        offset: usize,
1290        data: &mut dyn InputBuffer,
1291    ) -> Result<usize, Errno> {
1292        self.node.as_mounter(current_task, || {
1293            let state = self.state.read();
1294            let file = match &*state {
1295                OverlayFileState::Upper(f) => f.clone(),
1296
1297                // `write()` should be called only for files that were opened for write, and that
1298                // required the file to be promoted to the upper FS.
1299                OverlayFileState::Lower(_) => panic!("write() called for a lower FS file."),
1300            };
1301            std::mem::drop(state);
1302            security::file_permission(current_task, &file, security::PermissionFlags::WRITE)?;
1303            file.ops().write(locked, &file, current_task, offset, data)
1304        })
1305    }
1306
1307    fn sync(&self, _file: &FileObject, current_task: &CurrentTask) -> Result<(), Errno> {
1308        self.node.as_mounter(current_task, || {
1309            let state = self.state.read();
1310            let file = state.file();
1311            file.ops().sync(file, current_task)
1312        })
1313    }
1314
1315    fn get_memory(
1316        &self,
1317        locked: &mut Locked<FileOpsCore>,
1318        _file: &FileObject,
1319        current_task: &CurrentTask,
1320        length: Option<usize>,
1321        prot: starnix_core::mm::ProtectionFlags,
1322    ) -> Result<Arc<MemoryObject>, Errno> {
1323        self.node.as_mounter(current_task, || {
1324            let state = self.state.read();
1325            let file = state.file();
1326            // Not that the VMO returned here will not updated if the file is promoted to upper FS
1327            // later. This is consistent with OverlayFS behavior on Linux, see
1328            // https://docs.kernel.org/filesystems/overlayfs.html#non-standard-behavior .
1329            file.ops().get_memory(locked, file, current_task, length, prot)
1330        })
1331    }
1332}
1333
/// Creates a new overlayfs instance from mount `options`.
///
/// Thin public wrapper around `OverlayStack::new_fs()`; any error it reports is returned
/// unchanged.
pub fn new_overlay_fs(
    locked: &mut Locked<Unlocked>,
    current_task: &CurrentTask,
    options: FileSystemOptions,
) -> Result<FileSystemHandle, Errno> {
    OverlayStack::new_fs(locked, current_task, options)
}
1341
/// State shared by all nodes of one overlayfs instance: the underlying file systems, the work
/// directory and the mounter's credentials.
pub struct OverlayStack {
    // Keep references to the underlying file systems to ensure they outlive `overlayfs` since
    // they may be unmounted before overlayfs.
    #[allow(unused)]
    lower_fs: FileSystemHandle,
    /// The upper file system; `OverlayFs::statfs()` reports its statistics.
    upper_fs: FileSystemHandle,

    /// Work directory in which `create_upper_entry()` creates temporary entries before
    /// renaming them into place.
    work: ActiveEntry,

    // Used when interacting with the `upper_fs`, `lower_fs` or `work` directories.
    mounter: Arc<Credentials>,
}
1354
1355impl OverlayStack {
1356    fn new_fs(
1357        locked: &mut Locked<Unlocked>,
1358        current_task: &CurrentTask,
1359        options: FileSystemOptions,
1360    ) -> Result<FileSystemHandle, Errno> {
1361        match options.params.get("redirect_dir".as_bytes()) {
1362            None => (),
1363            Some(o) if o == "off" => (),
1364            Some(_) => {
1365                track_stub!(TODO("https://fxbug.dev/322874205"), "overlayfs redirect_dir");
1366                return error!(ENOTSUP);
1367            }
1368        }
1369
1370        let lower = resolve_dir_param(locked, current_task, &options.params, "lowerdir".into())?;
1371        let upper = resolve_dir_param(locked, current_task, &options.params, "upperdir".into())?;
1372        let work = resolve_dir_param(locked, current_task, &options.params, "workdir".into())?;
1373
1374        let lower_fs = lower.entry().node.fs();
1375        let upper_fs = upper.entry().node.fs();
1376
1377        if !Arc::ptr_eq(&upper_fs, &work.entry().node.fs()) {
1378            log_error!("overlayfs: upperdir and workdir must be on the same FS");
1379            return error!(EINVAL);
1380        }
1381
1382        let kernel = current_task.kernel();
1383        let mounter = current_task.current_creds().clone();
1384        let stack = Arc::new(OverlayStack { lower_fs, upper_fs, work, mounter });
1385        let root_node = OverlayNode::new(stack.clone(), Some(lower), Some(upper), None);
1386        let fs =
1387            FileSystem::new(locked, kernel, CacheMode::Uncached, OverlayFs { stack }, options)?;
1388        let root_ino = fs.allocate_ino();
1389        fs.create_root(root_ino, OverlayNodeOps { node: root_node });
1390        Ok(fs)
1391    }
1392
1393    /// Given a filesystem, wraps it in a tmpfs-backed writable overlayfs.
1394    pub fn wrap_fs_in_writable_layer<L>(
1395        locked: &mut Locked<L>,
1396        kernel: &Kernel,
1397        rootfs: FileSystemHandle,
1398    ) -> Result<FileSystemHandle, Errno>
1399    where
1400        L: LockEqualOrBefore<FileOpsCore>,
1401    {
1402        let lower = ActiveEntry { entry: rootfs.root().clone(), mount: MountInfo::detached() };
1403
1404        // Create upper and work directories in an invisible tmpfs.
1405        let invisible_tmp = TmpFs::new_fs(locked, kernel);
1406
1407        let create_directory = |fs: &FileSystemHandle| {
1408            let ino = fs.allocate_ino();
1409            let info = FsNodeInfo::new(mode!(IFDIR, 0o777), FsCred::root());
1410            let node = fs.create_detached_node(ino, TmpFsDirectory::new(), info);
1411            let dir_entry = DirEntry::new(node, None, FsString::default());
1412
1413            // TODO: https://fxbug.dev/455771186 - Revise FsNode initialization to better ensure
1414            // that all the things are appropriately labeled.
1415            security::fs_node_init_with_dentry_deferred(kernel, &dir_entry);
1416
1417            dir_entry
1418        };
1419
1420        let upper =
1421            ActiveEntry { entry: create_directory(&invisible_tmp), mount: MountInfo::detached() };
1422        let work =
1423            ActiveEntry { entry: create_directory(&invisible_tmp), mount: MountInfo::detached() };
1424
1425        let lower_fs = rootfs;
1426        let upper_fs = invisible_tmp;
1427
1428        let mounter = Credentials::root();
1429        let stack = Arc::new(OverlayStack { lower_fs, upper_fs, work, mounter });
1430        let root_node = OverlayNode::new(stack.clone(), Some(lower), Some(upper), None);
1431        let fs = FileSystem::new(
1432            locked,
1433            kernel,
1434            CacheMode::Uncached,
1435            OverlayFs { stack },
1436            FileSystemOptions::default(),
1437        )?;
1438        let root_ino = fs.allocate_ino();
1439        fs.create_root(root_ino, OverlayNodeOps { node: root_node });
1440        Ok(fs)
1441    }
1442
1443    // Helper used to create new entry called `name` in `target_dir` in the upper FS.
1444    // 1. Calls `try_create` to create a new entry in `work`. It is called repeateadly with a
1445    //    new name until it returns any result other than `EEXIST`.
1446    // 2. `do_init` is called to initilize the contents and the attributes of the new entry, etc.
1447    // 3. The new entry is moved to `target_dir`. If there is an existing entry called `name` in
1448    //    `target_dir` then it's replaced with the new entry.
1449    // The temp file is cleared from the work dir if either of the last two steps fails.
1450    fn create_upper_entry<FCreate, FInit, L>(
1451        &self,
1452        locked: &mut Locked<L>,
1453        current_task: &CurrentTask,
1454        target_dir: &ActiveEntry,
1455        name: &FsStr,
1456        try_create: FCreate,
1457        do_init: FInit,
1458    ) -> Result<ActiveEntry, Errno>
1459    where
1460        L: LockEqualOrBefore<FileOpsCore>,
1461        FCreate: Fn(&mut Locked<L>, &ActiveEntry, &FsStr) -> Result<ActiveEntry, Errno>,
1462        FInit: FnOnce(&mut Locked<L>, &ActiveEntry) -> Result<(), Errno>,
1463    {
1464        let mut rng = rand::rng();
1465        let (temp_name, entry) = loop {
1466            let x: u64 = rng.random();
1467            let temp_name = FsString::from(format!("tmp{:x}", x));
1468            match try_create(locked, &self.work, temp_name.as_ref()) {
1469                Err(err) if err.code == EEXIST => continue,
1470                Err(err) => return Err(err),
1471                Ok(entry) => break (temp_name, entry),
1472            }
1473        };
1474
1475        do_init(locked, &entry)
1476            .and_then(|()| {
1477                DirEntry::rename(
1478                    locked,
1479                    current_task,
1480                    self.work.entry(),
1481                    self.work.mount(),
1482                    temp_name.as_ref(),
1483                    target_dir.entry(),
1484                    target_dir.mount(),
1485                    name,
1486                    RenameFlags::REPLACE_ANY,
1487                )
1488            })
1489            .map_err(|e| {
1490                // Remove the temp entry in case of a failure.
1491                self.work
1492                    .entry()
1493                    .unlink(
1494                        locked,
1495                        current_task,
1496                        self.work.mount(),
1497                        temp_name.as_ref(),
1498                        UnlinkKind::NonDirectory,
1499                        false,
1500                    )
1501                    .unwrap_or_else(|e| {
1502                        log_error!("Failed to cleanup work dir after an error: {}", e)
1503                    });
1504                e
1505            })?;
1506
1507        Ok(entry)
1508    }
1509}
1510
/// `FileSystemOps` implementation for an overlayfs instance.
struct OverlayFs {
    /// Shared state of the instance (underlying file systems, work dir, mounter credentials).
    stack: Arc<OverlayStack>,
}
1514
1515impl FileSystemOps for OverlayFs {
1516    fn statfs(
1517        &self,
1518        locked: &mut Locked<FileOpsCore>,
1519        _fs: &FileSystem,
1520        current_task: &CurrentTask,
1521    ) -> Result<statfs, Errno> {
1522        current_task.override_creds(self.stack.mounter.clone(), || {
1523            self.stack.upper_fs.statfs(locked, current_task)
1524        })
1525    }
1526
1527    fn name(&self) -> &'static FsStr {
1528        "overlay".into()
1529    }
1530
1531    fn rename(
1532        &self,
1533        locked: &mut Locked<FileOpsCore>,
1534        _fs: &FileSystem,
1535        current_task: &CurrentTask,
1536        old_parent: &FsNodeHandle,
1537        old_name: &FsStr,
1538        new_parent: &FsNodeHandle,
1539        new_name: &FsStr,
1540        renamed: &FsNodeHandle,
1541        _replaced: Option<&FsNodeHandle>,
1542    ) -> Result<(), Errno> {
1543        current_task.override_creds(self.stack.mounter.clone(), || {
1544            let renamed_overlay = OverlayNode::from_fs_node(renamed)?;
1545            if renamed_overlay.has_lower() && renamed_overlay.main_entry().entry().node.is_dir() {
1546                // Return EXDEV for directory renames. Potentially they may be handled with the
1547                // `redirect_dir` feature, but it's not implemented here yet.
1548                // See https://docs.kernel.org/filesystems/overlayfs.html#renaming-directories
1549                return error!(EXDEV);
1550            }
1551            renamed_overlay.ensure_upper(locked, current_task, &renamed.fs())?;
1552
1553            let old_parent_overlay = OverlayNode::from_fs_node(old_parent)?;
1554            let old_parent_upper =
1555                old_parent_overlay.ensure_upper(locked, current_task, &renamed.fs())?;
1556
1557            let new_parent_overlay = OverlayNode::from_fs_node(new_parent)?;
1558            let new_parent_upper =
1559                new_parent_overlay.ensure_upper(locked, current_task, &renamed.fs())?;
1560
1561            let need_whiteout =
1562                old_parent_overlay.lower_entry_exists(locked, current_task, old_name)?;
1563
1564            DirEntry::rename(
1565                locked,
1566                current_task,
1567                old_parent_upper.entry(),
1568                old_parent_upper.mount(),
1569                old_name,
1570                new_parent_upper.entry(),
1571                new_parent_upper.mount(),
1572                new_name,
1573                RenameFlags::REPLACE_ANY,
1574            )?;
1575
1576            // If the old node existed in lower FS, then override it in the upper FS with a
1577            // whiteout.
1578            if need_whiteout {
1579                match old_parent_upper.create_whiteout(locked, current_task, old_name) {
1580                    Err(e) => log_warn!("overlayfs: failed to create whiteout for {old_name}: {e}"),
1581                    Ok(_) => (),
1582                }
1583            }
1584
1585            Ok(())
1586        })
1587    }
1588
1589    fn unmount(&self) {}
1590}
1591
1592/// Helper used to resolve directories passed in mount options. The directory is resolved in the
1593/// namespace of the calling process, but only `DirEntry` is returned (detached from the
1594/// namespace). The corresponding file systems may be unmounted before overlayfs that uses them.
1595fn resolve_dir_param(
1596    locked: &mut Locked<Unlocked>,
1597    current_task: &CurrentTask,
1598    params: &MountParams,
1599    name: &FsStr,
1600) -> Result<ActiveEntry, Errno> {
1601    let path = params.get(&**name).ok_or_else(|| {
1602        log_error!("overlayfs: {name} was not specified");
1603        errno!(EINVAL)
1604    })?;
1605
1606    current_task
1607        .open_file(locked, path.as_ref(), OpenFlags::RDONLY | OpenFlags::DIRECTORY)
1608        .map(|f| ActiveEntry { entry: f.name.entry.clone(), mount: f.name.mount.clone() })
1609        .map_err(|e| {
1610            log_error!("overlayfs: Failed to lookup {path}: {}", e);
1611            e
1612        })
1613}
1614
1615/// Copies file content from one file to another.
1616fn copy_file_content<L>(
1617    locked: &mut Locked<L>,
1618    current_task: &CurrentTask,
1619    from: &ActiveEntry,
1620    to: &ActiveEntry,
1621) -> Result<(), Errno>
1622where
1623    L: LockEqualOrBefore<FileOpsCore>,
1624{
1625    let locked = locked.cast_locked::<FileOpsCore>();
1626    let from_file = from.entry().open_anonymous(locked, current_task, OpenFlags::RDONLY)?;
1627    let to_file = to.entry().open_anonymous(locked, current_task, OpenFlags::WRONLY)?;
1628
1629    security::fs_node_permission(
1630        current_task,
1631        from_file.node().as_ref(),
1632        security::PermissionFlags::READ,
1633        (&**from_file).into(),
1634    )?;
1635    security::fs_node_permission(
1636        current_task,
1637        to_file.node().as_ref(),
1638        security::PermissionFlags::WRITE,
1639        (&**to_file).into(),
1640    )?;
1641
1642    const BUFFER_SIZE: usize = 4096;
1643
1644    let mut read_offset = 0;
1645    let mut write_offset = 0;
1646    loop {
1647        // TODO(sergeyu): Reuse buffer between iterations.
1648
1649        let mut output_buffer = VecOutputBuffer::new(BUFFER_SIZE);
1650        let bytes_read = from_file.ops().read(
1651            locked,
1652            &from_file,
1653            current_task,
1654            read_offset,
1655            &mut output_buffer,
1656        )?;
1657        if bytes_read == 0 {
1658            break;
1659        }
1660        read_offset += bytes_read;
1661
1662        let buffer: Vec<u8> = output_buffer.into();
1663        let mut input_buffer = VecInputBuffer::from(buffer);
1664        while input_buffer.available() > 0 {
1665            write_offset += to_file.ops().write(
1666                locked,
1667                &to_file,
1668                current_task,
1669                write_offset,
1670                &mut input_buffer,
1671            )?;
1672        }
1673    }
1674
1675    to_file.ops().data_sync(&to_file, current_task)?;
1676
1677    Ok(())
1678}