Skip to main content

starnix_modules_overlayfs/
lib.rs

1// Copyright 2023 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#![recursion_limit = "512"]
6
7use fuchsia_rcu::RcuReadScope;
8use once_cell::sync::OnceCell;
9use rand::Rng;
10use starnix_core::fs::tmpfs::{TmpFs, TmpFsDirectory};
11use starnix_core::mm::memory::MemoryObject;
12use starnix_core::security::{self, PermissionFlags};
13use starnix_core::task::{CurrentTask, Kernel};
14use starnix_core::vfs::fs_args::MountParams;
15use starnix_core::vfs::rw_queue::{RwQueueReadGuard, RwQueueWriteGuard};
16use starnix_core::vfs::{
17    AppendLockGuard, CacheMode, CheckAccessReason, DirEntry, DirEntryHandle, DirectoryEntryType,
18    DirentSink, FallocMode, FileHandle, FileObject, FileOps, FileSystem, FileSystemHandle,
19    FileSystemOps, FileSystemOptions, FsNode, FsNodeHandle, FsNodeInfo, FsNodeOps, FsStr, FsString,
20    InputBuffer, MountInfo, OutputBuffer, RenameFlags, SeekTarget, SymlinkTarget, UnlinkKind,
21    ValueOrSize, VecInputBuffer, VecOutputBuffer, XattrOp, default_seek, emit_dotdot,
22    fileops_impl_directory, fileops_impl_noop_sync, fileops_impl_seekable,
23};
24use starnix_logging::{log_error, log_warn, track_stub};
25use starnix_sync::{
26    BeforeFsNodeAppend, FileOpsCore, FsNodeAppend, LockEqualOrBefore, Locked, RwLock,
27    RwLockReadGuard, RwLockWriteGuard, Unlocked,
28};
29use starnix_uapi::auth::{Credentials, FsCred};
30use starnix_uapi::device_id::DeviceId;
31use starnix_uapi::errors::{EEXIST, ENOENT, Errno};
32use starnix_uapi::file_mode::{FileMode, mode};
33use starnix_uapi::open_flags::OpenFlags;
34use starnix_uapi::{errno, error, ino_t, off_t, statfs};
35use std::collections::BTreeSet;
36use std::sync::Arc;
37use syncio::zxio_node_attr_has_t;
38
// Name and value for the xattr used to mark opaque directories in the upper FS.
// An "opaque" upper directory is presented as-is, without merging in entries from
// the corresponding lower directory.
// See https://docs.kernel.org/filesystems/overlayfs.html#whiteouts-and-opaque-directories
const OPAQUE_DIR_XATTR: &str = "trusted.overlay.opaque";
// Value stored in `OPAQUE_DIR_XATTR` to mark a directory opaque ("y", as in Linux).
const OPAQUE_DIR_XATTR_VALUE: &str = "y";
43
/// A single directory entry as collected from `readdir()` on an underlying filesystem.
#[derive(Clone)]
struct DirEntryInfo {
    // Entry name within its parent directory.
    name: FsString,
    // Inode number reported by the underlying FS.
    inode_num: ino_t,
    // Entry type (directory, regular file, char device, ...), used e.g. to detect
    // potential whiteouts without a lookup.
    entry_type: DirectoryEntryType,
}
50
/// The full entry listing of one directory, as produced by `read_dir_entries()`.
type DirEntries = Vec<DirEntryInfo>;
52
/// A `DirentSink` that simply accumulates entries into a `Vec`, used to snapshot a
/// directory's contents via `readdir()`.
#[derive(Default)]
struct DirentSinkAdapter {
    // Entries collected so far (reserved "." / ".." names excluded).
    items: Vec<DirEntryInfo>,
    // Offset reported by the last `add()` call.
    offset: off_t,
}
58
59impl DirentSink for DirentSinkAdapter {
60    fn add(
61        &mut self,
62        inode_num: ino_t,
63        offset: off_t,
64        entry_type: DirectoryEntryType,
65        name: &FsStr,
66    ) -> Result<(), Errno> {
67        if !DirEntry::is_reserved_name(name) {
68            self.items.push(DirEntryInfo { name: name.to_owned(), inode_num, entry_type });
69        }
70        self.offset = offset;
71        Ok(())
72    }
73
74    fn offset(&self) -> off_t {
75        self.offset
76    }
77}
78
/// Controls how much of a lower-FS file is copied when it is promoted ("copied up")
/// to the upper FS.
#[derive(Copy, Clone, Eq, PartialEq)]
enum UpperCopyMode {
    // Create the upper node with matching metadata but do not copy file content
    // (used e.g. when the file is being opened with O_TRUNC).
    MetadataOnly,
    // Copy metadata and full file content.
    CopyAll,
}
84
/// A `DirEntry` associated with the mount options. This is required because OverlayFs mostly
/// works at the `DirEntry` level (mounts on the lower, upper and work directories are ignored),
/// but operation must still depend on mount options.
#[derive(Clone)]
struct ActiveEntry {
    // The underlying directory entry in the lower or upper FS.
    entry: DirEntryHandle,
    // Mount options that apply to operations performed through this entry.
    mount: MountInfo,
}
93
impl ActiveEntry {
    /// Returns a closure that wraps a plain `DirEntryHandle` into an `ActiveEntry`
    /// sharing `entry`'s mount options. Used to map results of `DirEntry` operations.
    fn mapper<'a>(entry: &'a ActiveEntry) -> impl Fn(DirEntryHandle) -> ActiveEntry + 'a {
        |dir_entry| ActiveEntry { entry: dir_entry, mount: entry.mount.clone() }
    }

    /// The underlying `DirEntry`.
    fn entry(&self) -> &DirEntryHandle {
        &self.entry
    }

    /// The mount options used for operations through this entry.
    fn mount(&self) -> &MountInfo {
        &self.mount
    }

    /// Looks up the child named `name`, wrapping the result with this entry's mount
    /// options.
    fn component_lookup<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
    ) -> Result<Self, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.entry()
            .component_lookup(locked, current_task, self.mount(), name)
            .map(ActiveEntry::mapper(self))
    }

    /// Creates a child entry named `name`; the new node itself is produced by
    /// `create_node_fn`. The result is wrapped with this entry's mount options.
    fn create_entry<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
        create_node_fn: impl FnOnce(
            &mut Locked<L>,
            &FsNodeHandle,
            &MountInfo,
            &FsStr,
        ) -> Result<FsNodeHandle, Errno>,
    ) -> Result<Self, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.entry()
            .create_entry(locked, current_task, self.mount(), name, create_node_fn)
            .map(ActiveEntry::mapper(self))
    }

    /// Sets an xattr to mark the directory referenced by `entry` as opaque. Directories that are
    /// marked as opaque in the upper FS are not merged with the corresponding directories in the
    /// lower FS.
    fn set_opaque_xattr<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.entry().node.set_xattr(
            locked,
            current_task,
            self.mount(),
            OPAQUE_DIR_XATTR.into(),
            OPAQUE_DIR_XATTR_VALUE.into(),
            XattrOp::Set,
        )
    }

    /// Checks if the `entry` is marked as opaque.
    fn is_opaque_node<L>(&self, locked: &mut Locked<L>, current_task: &CurrentTask) -> bool
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        // Any failure to read the xattr (including its absence) is treated as "not opaque".
        match self.entry().node.get_xattr(
            locked,
            current_task,
            self.mount(),
            OPAQUE_DIR_XATTR.into(),
            OPAQUE_DIR_XATTR_VALUE.len(),
        ) {
            Ok(ValueOrSize::Value(v)) if v == OPAQUE_DIR_XATTR_VALUE => true,
            _ => false,
        }
    }

    /// Creates a "whiteout" entry in the directory called `name`. Whiteouts are created by
    /// overlayfs to denote files and directories that were removed and should not be listed in the
    /// directory. This is necessary because we cannot remove entries from the lower FS.
    fn create_whiteout<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
    ) -> Result<ActiveEntry, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        // A whiteout is a char device with device id 0 (see `is_whiteout()` below).
        self.create_entry(locked, current_task, name, |locked, dir, mount, name| {
            dir.create_node(
                locked,
                current_task,
                mount,
                name,
                FileMode::IFCHR,
                DeviceId::NONE,
                FsCred::root(),
            )
        })
    }

    /// Returns `true` if this is a "whiteout".
    fn is_whiteout(&self) -> bool {
        // Whiteouts are represented as char devices with device id 0.
        let info = self.entry().node.info();
        info.mode.is_chr() && info.rdev == DeviceId::NONE
    }

    /// Checks whether the child of this entry represented by `info` is a "whiteout".
    ///
    /// Only looks up the corresponding `DirEntry` when necessary.
    fn is_whiteout_child<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        info: &DirEntryInfo,
    ) -> Result<bool, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        // We need to lookup the node only if the file is a char device.
        if info.entry_type != DirectoryEntryType::CHR {
            return Ok(false);
        }
        let entry = self.component_lookup(locked, current_task, info.name.as_ref())?;
        Ok(entry.is_whiteout())
    }

    /// Reads the full contents of this directory (excluding "." and ".."), in the
    /// order produced by the underlying FS's `readdir()`.
    fn read_dir_entries<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
    ) -> Result<Vec<DirEntryInfo>, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let mut sink = DirentSinkAdapter::default();
        self.entry().open_anonymous(locked, current_task, OpenFlags::DIRECTORY)?.readdir(
            locked,
            current_task,
            &mut sink,
        )?;
        Ok(sink.items)
    }
}
247
/// Per-node state of an overlay filesystem: the pairing of a lower-FS entry with its
/// (possibly lazily created) upper-FS counterpart.
struct OverlayNode {
    // The overlay stack this node belongs to.
    stack: Arc<OverlayStack>,

    // Corresponding `DirEntries` in the lower and the upper filesystems. At least one must be
    // set. Note that we don't care about `NamespaceNode`: overlayfs overlays filesystems
    // (i.e. not namespace subtrees). These directories may not be mounted anywhere.
    // `upper` may be created dynamically whenever write access is required.
    upper: OnceCell<ActiveEntry>,
    lower: Option<ActiveEntry>,

    // `prepare_to_unlink()` may mark `upper` as opaque. In that case we want to skip merging
    // with `lower` in `readdir()`.
    upper_is_opaque: OnceCell<()>,

    // Parent overlay node; used to copy-up this node (the parent's upper dir must exist
    // first). `None` once/when `upper` is known to be set.
    parent: Option<Arc<OverlayNode>>,
}
264
impl OverlayNode {
    /// Creates a new overlay node. At least one of `upper` or `parent` must be set:
    /// a node without an upper entry needs its parent in order to copy itself up later.
    fn new(
        stack: Arc<OverlayStack>,
        lower: Option<ActiveEntry>,
        upper: Option<ActiveEntry>,
        parent: Option<Arc<OverlayNode>>,
    ) -> Arc<Self> {
        assert!(upper.is_some() || parent.is_some());

        let upper = match upper {
            Some(entry) => OnceCell::with_value(entry),
            None => OnceCell::new(),
        };

        Arc::new(OverlayNode { stack, upper, lower, upper_is_opaque: OnceCell::new(), parent })
    }

    /// Recovers the `OverlayNode` from a generic `FsNode`. Fails with `EIO` if the
    /// node's ops are not `OverlayNodeOps`.
    fn from_fs_node(node: &FsNodeHandle) -> Result<&Arc<Self>, Errno> {
        Ok(&node.downcast_ops::<OverlayNodeOps>().ok_or_else(|| errno!(EIO))?.node)
    }

    /// The entry this node currently presents: upper if it exists, otherwise lower.
    fn main_entry(&self) -> &ActiveEntry {
        self.upper.get().or(self.lower.as_ref()).expect("Expected either upper or lower node")
    }

    /// Builds a new uncached `FsNode` for a child of this node, reusing the backing
    /// entry's inode number and info.
    fn init_fs_node_for_child(
        self: &Arc<OverlayNode>,
        node: &FsNode,
        lower: Option<ActiveEntry>,
        upper: Option<ActiveEntry>,
    ) -> FsNodeHandle {
        let entry = upper.as_ref().or(lower.as_ref()).expect("expect either lower or upper node");
        let ino = entry.entry().node.ino;
        let info = entry.entry().node.info().clone();

        // Parent may be needed to initialize `upper`. We don't need to pass it if we have `upper`.
        let parent = if upper.is_some() { None } else { Some(self.clone()) };

        let overlay_node =
            OverlayNodeOps { node: OverlayNode::new(self.stack.clone(), lower, upper, parent) };
        FsNode::new_uncached(ino, overlay_node, &node.fs(), info)
    }

    /// If the file is currently in the lower FS, then promote it to the upper FS. No-op if the
    /// file is already in the upper FS.
    fn ensure_upper<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        fs: &FileSystem,
    ) -> Result<&ActiveEntry, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        self.ensure_upper_maybe_copy(locked, current_task, UpperCopyMode::CopyAll, fs)
    }

    /// Same as `ensure_upper()`, but allows to skip copying of the file content.
    ///
    /// The copy-up runs with credentials derived from the mounter (possibly adjusted by
    /// the security module) and recursively ensures the parent's upper dir exists first.
    fn ensure_upper_maybe_copy<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        copy_mode: UpperCopyMode,
        fs: &FileSystem,
    ) -> Result<&ActiveEntry, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        // `get_or_try_init` makes the copy-up happen at most once per node.
        self.upper.get_or_try_init(|| {
            let lower = self.lower.as_ref().expect("lower is expected when upper is missing");
            let parent = self.parent.as_ref().expect("Parent is expected when upper is missing");
            let parent_upper = parent.ensure_upper(locked, current_task, fs)?;
            let name = lower.entry.local_name(&RcuReadScope::new()).to_owned();
            let info = {
                let info = lower.entry.node.info();
                info.clone()
            };
            let cred = info.cred();

            let mut copy_up_creds = Credentials::clone(&self.stack.mounter);
            security::fs_node_copy_up(current_task, &lower.entry.node, fs, &mut copy_up_creds);
            let res = current_task.override_creds(Arc::new(copy_up_creds), || {
                if info.mode.is_lnk() {
                    // Symlinks are recreated in the upper FS from their target path.
                    let link_target = lower.entry.node.readlink(locked, current_task)?;
                    let link_path = match &link_target {
                        SymlinkTarget::Node(_) => return error!(EIO),
                        SymlinkTarget::Path(path) => path,
                    };
                    parent_upper.create_entry(
                        locked,
                        current_task,
                        name.as_ref(),
                        |locked, dir, mount, name| {
                            dir.create_symlink(
                                locked,
                                current_task,
                                mount,
                                name,
                                link_path.as_ref(),
                                cred,
                            )
                        },
                    )
                } else if info.mode.is_reg() && copy_mode == UpperCopyMode::CopyAll {
                    // Regular files need to be copied from lower FS to upper FS.
                    self.stack.create_upper_entry(
                        locked,
                        current_task,
                        parent_upper,
                        name.as_ref(),
                        |locked, dir, name| {
                            dir.create_entry(
                                locked,
                                current_task,
                                name,
                                |locked, dir_node, mount, name| {
                                    dir_node.create_node(
                                        locked,
                                        current_task,
                                        mount,
                                        name,
                                        info.mode,
                                        DeviceId::NONE,
                                        cred,
                                    )
                                },
                            )
                        },
                        |locked, entry| copy_file_content(locked, current_task, lower, &entry),
                    )
                } else {
                    // Directories, special files, and `MetadataOnly` regular files:
                    // create an upper node with matching metadata, no content copy.
                    parent_upper.create_entry(
                        locked,
                        current_task,
                        name.as_ref(),
                        |locked, dir, mount, name| {
                            dir.create_node(
                                locked,
                                current_task,
                                mount,
                                name,
                                info.mode,
                                info.rdev,
                                cred,
                            )
                        },
                    )
                }
            });

            track_stub!(TODO("https://fxbug.dev/322874151"), "overlayfs copy xattrs");
            res
        })
    }

    /// Checks if this node exists in the lower FS.
    fn has_lower(&self) -> bool {
        self.lower.is_some()
    }

    /// Checks whether a non-whiteout entry called `name` exists in the lower FS.
    /// Whiteouts and a missing lower dir both count as "does not exist".
    fn lower_entry_exists<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        name: &FsStr,
    ) -> Result<bool, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        match &self.lower {
            Some(lower) => match lower.component_lookup(locked, current_task, name) {
                Ok(entry) => Ok(!entry.is_whiteout()),
                Err(err) if err.code == ENOENT => Ok(false),
                Err(err) => Err(err),
            },
            None => Ok(false),
        }
    }

    /// Helper used to create a new entry in the directory. It first checks that the target node
    /// doesn't exist. Then `do_create` is called to create the new node in the work dir, which
    /// is then moved to the target dir in the upper file system.
    ///
    /// It's assumed that the calling `DirEntry` has the current directory locked, so it is not
    /// supposed to change while this method is executed. Note that OverlayFS doesn't handle
    /// the case when the underlying file systems are changed directly, but that restriction
    /// is not enforced.
    fn create_entry<F, L>(
        self: &Arc<OverlayNode>,
        locked: &mut Locked<L>,
        node: &FsNode,
        current_task: &CurrentTask,
        name: &FsStr,
        do_create: F,
    ) -> Result<ActiveEntry, Errno>
    where
        F: Fn(&mut Locked<L>, &ActiveEntry, &FsStr) -> Result<ActiveEntry, Errno>,
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let upper = self.ensure_upper(locked, current_task, &node.fs())?;

        match upper.component_lookup(locked, current_task, name) {
            Ok(existing) => {
                // If there is an entry in the upper dir, then it must be a whiteout.
                if !existing.is_whiteout() {
                    return error!(EEXIST);
                }
            }

            Err(e) if e.code == ENOENT => {
                // If we don't have the entry in the upper fs, then check lower.
                if self.lower_entry_exists(locked, current_task, name)? {
                    return error!(EEXIST);
                }
            }
            Err(e) => return Err(e),
        };

        self.stack.create_upper_entry(
            locked,
            current_task,
            upper,
            name,
            |locked, entry, fs| do_create(locked, entry, fs),
            |_, _entry| Ok(()),
        )
    }

    /// An overlay directory may appear empty when the corresponding upper dir isn't empty:
    /// it may contain a number of whiteout entries. In that case the whiteouts need to be
    /// unlinked before the upper directory can be unlinked as well.
    /// `prepare_to_unlink()` checks that the directory doesn't contain anything other
    /// than whiteouts and if that is the case then it unlinks all of them.
    fn prepare_to_unlink<L>(
        self: &Arc<OverlayNode>,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        if self.main_entry().entry().node.is_dir() {
            // Collect the names visible in the lower dir (i.e. not whiteouts).
            let mut lower_entries = BTreeSet::new();
            if let Some(dir) = &self.lower {
                for item in dir.read_dir_entries(locked, current_task)?.drain(..) {
                    if !dir.is_whiteout_child(locked, current_task, &item)? {
                        lower_entries.insert(item.name);
                    }
                }
            }

            if let Some(dir) = self.upper.get() {
                let mut to_remove = Vec::<FsString>::new();
                for item in dir.read_dir_entries(locked, current_task)?.drain(..) {
                    // Anything other than a whiteout in the upper dir means the merged
                    // directory is not empty.
                    if !dir.is_whiteout_child(locked, current_task, &item)? {
                        return error!(ENOTEMPTY);
                    }
                    // This whiteout hides the corresponding lower entry (if any).
                    lower_entries.remove(&item.name);
                    to_remove.push(item.name);
                }

                // Lower entries not hidden by a whiteout make the directory non-empty.
                if !lower_entries.is_empty() {
                    return error!(ENOTEMPTY);
                }

                // Mark the directory as opaque. Children can be removed after this.
                dir.set_opaque_xattr(locked, current_task)?;
                let _ = self.upper_is_opaque.set(());

                // Finally, remove the children.
                for name in to_remove.iter() {
                    dir.entry().unlink(
                        locked,
                        current_task,
                        dir.mount(),
                        name.as_ref(),
                        UnlinkKind::NonDirectory,
                        false,
                    )?;
                }
            }
        }

        Ok(())
    }

    /// Runs `do_work` with the credentials of the task that mounted the overlay, since
    /// operations on the underlying filesystems are performed on the mounter's behalf.
    fn as_mounter<R, F: FnOnce() -> R>(&self, current_task: &CurrentTask, do_work: F) -> R {
        current_task.override_creds(self.stack.mounter.clone(), do_work)
    }
}
556
/// `FsNodeOps` implementation that forwards node operations to an `OverlayNode`.
struct OverlayNodeOps {
    // The overlay node backing this `FsNode`.
    node: Arc<OverlayNode>,
}
560
561impl FsNodeOps for OverlayNodeOps {
    /// Checks `access` twice: first against the overlay node's own metadata, then (as
    /// the mounter) against the backing upper or lower node, since it is the mounter
    /// who actually accesses the underlying filesystems.
    fn check_access(
        &self,
        locked: &mut Locked<FileOpsCore>,
        node: &FsNode,
        current_task: &CurrentTask,
        access: security::PermissionFlags,
        info: &RwLock<FsNodeInfo>,
        reason: CheckAccessReason,
        audit_context: security::Auditable<'_>,
    ) -> Result<(), Errno> {
        // The caller's own access check against the overlay node.
        node.default_check_access_impl(current_task, access, reason, info.read(), audit_context)?;

        // The remaining checks are performed with the mounter's credentials.
        self.node.as_mounter(current_task, || {
            if let Some(entry) = self.node.upper.get() {
                entry.entry.node.check_access(
                    locked,
                    current_task,
                    entry.mount(),
                    access,
                    reason,
                    audit_context,
                )
            } else {
                let entry = self.node.lower.as_ref().expect("Either upper or lower node is set");
                let lower_node = &entry.entry.node;

                // If the lower node is a regular file, directory or symlink then opening it for
                // write access will cause it to be copied-up, so the mounter only requires read
                // access to the underlying node.
                //
                // If the lower node is "special" (i.e. a device, FIFO or socket) then writes will
                // affect the underlying resource, so to avoid privilege escalation via overlays,
                // the mounter is still required to have write access to the node. This works
                // even if the lower filesystem is readonly because special nodes remain writable
                // in that case (though they may not be modified or unlinked, which would require
                // actually writing to the filesystem).
                let mut access = access;
                if access.contains(PermissionFlags::WRITE) && !lower_node.info().mode.is_special() {
                    // Verify that the mounter will be able to write to copy-up the node.
                    // TODO: https://fxbug.dev/403260093 - Fix this to also verify discretionary
                    // write access to the mounter, while correctly taking into account the
                    // `context=` mount option (if any) for the mandatory write access check.
                    security::fs_node_permission(
                        current_task,
                        node,
                        PermissionFlags::WRITE,
                        audit_context,
                    )?;

                    // Copy-up only needs to read the lower node; drop write-type flags.
                    access |= PermissionFlags::READ;
                    access &= !(PermissionFlags::WRITE | PermissionFlags::APPEND);
                }

                lower_node.check_access(
                    locked,
                    current_task,
                    &entry.mount,
                    access,
                    reason,
                    audit_context,
                )
            }
        })
    }
626
    /// Opens the node, copying it up first when opened for writing (only the upper FS
    /// is writable). Directories get merged `OverlayDirectory` ops; everything else
    /// opens the upper entry if present, otherwise the lower one.
    fn create_file_ops(
        &self,
        locked: &mut Locked<FileOpsCore>,
        node: &FsNode,
        current_task: &CurrentTask,
        flags: OpenFlags,
    ) -> Result<Box<dyn FileOps>, Errno> {
        self.node.as_mounter(current_task, || {
            if flags.can_write() {
                // Only upper FS can be writable.
                // With O_TRUNC the content would be discarded anyway, so skip copying it.
                let copy_mode = if flags.contains(OpenFlags::TRUNC) {
                    UpperCopyMode::MetadataOnly
                } else {
                    UpperCopyMode::CopyAll
                };
                self.node.ensure_upper_maybe_copy(locked, current_task, copy_mode, &node.fs())?;
            }

            let ops: Box<dyn FileOps> = if node.is_dir() {
                Box::new(OverlayDirectory {
                    node: self.node.clone(),
                    dir_entries: Default::default(),
                })
            } else {
                // Prefer the upper entry; fall back to lower for read-only opens.
                let state =
                    match (self.node.upper.get(), &self.node.lower) {
                        (Some(upper), _) => OverlayFileState::Upper(upper.entry().open_anonymous(
                            locked,
                            current_task,
                            flags,
                        )?),
                        (None, Some(lower)) => OverlayFileState::Lower(
                            lower.entry().open_anonymous(locked, current_task, flags)?,
                        ),
                        _ => panic!("Expected either upper or lower node"),
                    };

                Box::new(OverlayFile { node: self.node.clone(), flags, state: RwLock::new(state) })
            };

            Ok(ops)
        })
    }
670
    /// Resolves `name` in the merged view: looks it up in the upper dir first, then —
    /// unless masked by a whiteout or an opaque upper dir — in the lower dir, and
    /// builds a child `FsNode` from whichever entries were found.
    fn lookup(
        &self,
        locked: &mut Locked<FileOpsCore>,
        node: &FsNode,
        current_task: &CurrentTask,
        name: &FsStr,
    ) -> Result<FsNodeHandle, Errno> {
        self.node.as_mounter(current_task, || {
            // Looks up `name` in `dir_opt`; `Ok(None)` when the dir is absent or the
            // entry doesn't exist, so ENOENT is not treated as a hard error here.
            let resolve_child = |locked: &mut Locked<FileOpsCore>,
                                 dir_opt: Option<&ActiveEntry>| {
                // TODO(sergeyu): lookup() checks access, but we don't need that here.
                dir_opt
                    .as_ref()
                    .map(|dir| match dir.component_lookup(locked, current_task, name) {
                        Ok(entry) => Some(Ok(entry)),
                        Err(e) if e.code == ENOENT => None,
                        Err(e) => Some(Err(e)),
                    })
                    .flatten()
                    .transpose()
            };

            let upper: Option<ActiveEntry> = resolve_child(locked, self.node.upper.get())?;

            // A whiteout in the upper dir hides the name entirely. A non-directory
            // upper entry, or one marked opaque, masks any lower entry.
            let (upper_is_dir, upper_is_opaque) = match &upper {
                Some(upper) if upper.is_whiteout() => return error!(ENOENT),
                Some(upper) => {
                    let is_dir = upper.entry().node.is_dir();
                    let is_opaque = !is_dir || upper.is_opaque_node(locked, current_task);
                    (is_dir, is_opaque)
                }
                None => (false, false),
            };

            let parent_upper_is_opaque = self.node.upper_is_opaque.get().is_some();

            // We don't need to resolve the lower node if we have an opaque node in the upper dir.
            let lookup_lower = !parent_upper_is_opaque && !upper_is_opaque;
            let lower: Option<ActiveEntry> = if lookup_lower {
                match resolve_child(locked, self.node.lower.as_ref())? {
                    // If the upper node is a directory and the lower isn't then ignore the lower node.
                    Some(lower) if upper_is_dir && !lower.entry().node.is_dir() => None,
                    Some(lower) if lower.is_whiteout() => None,
                    result => result,
                }
            } else {
                None
            };

            if upper.is_none() && lower.is_none() {
                return error!(ENOENT);
            }

            Ok(self.node.init_fs_node_for_child(node, lower, upper))
        })
    }
727
    /// Creates a new (non-directory) node in the upper FS, running as the mounter with
    /// credentials possibly adjusted by the security module, and wraps it in a child
    /// overlay `FsNode`.
    fn mknod(
        &self,
        locked: &mut Locked<FileOpsCore>,
        node: &FsNode,
        current_task: &CurrentTask,
        name: &FsStr,
        mode: FileMode,
        dev: DeviceId,
        owner: FsCred,
    ) -> Result<FsNodeHandle, Errno> {
        let mut creds = Credentials::clone(&self.node.stack.mounter);
        security::dentry_create_files_as(current_task, node, mode, name, &mut creds)?;
        current_task.override_creds(Arc::new(creds), || {
            // `create_entry` builds the node in the work dir under `temp_name`, then
            // moves it into place in the upper dir.
            let new_upper_node = self.node.create_entry(
                locked,
                node,
                current_task,
                name,
                |locked, dir, temp_name| {
                    dir.create_entry(
                        locked,
                        current_task,
                        temp_name,
                        |locked, dir_node, mount, name| {
                            dir_node.create_node(
                                locked,
                                current_task,
                                mount,
                                name,
                                mode,
                                dev,
                                owner.clone(),
                            )
                        },
                    )
                },
            )?;
            Ok(self.node.init_fs_node_for_child(node, None, Some(new_upper_node)))
        })
    }
768
    /// Creates a new directory in the upper FS and marks it opaque so it is never
    /// merged with a same-named lower directory. Runs as the mounter.
    fn mkdir(
        &self,
        locked: &mut Locked<FileOpsCore>,
        node: &FsNode,
        current_task: &CurrentTask,
        name: &FsStr,
        mode: FileMode,
        owner: FsCred,
    ) -> Result<FsNodeHandle, Errno> {
        let mut creds = Credentials::clone(&self.node.stack.mounter);
        security::dentry_create_files_as(current_task, node, mode, name, &mut creds)?;
        current_task.override_creds(Arc::new(creds), || {
            let new_upper_node = self.node.create_entry(
                locked,
                node,
                current_task,
                name,
                |locked, dir, temp_name| {
                    let entry = dir.create_entry(
                        locked,
                        current_task,
                        temp_name,
                        |locked, dir_node, mount, name| {
                            dir_node.create_node(
                                locked,
                                current_task,
                                mount,
                                name,
                                mode,
                                DeviceId::NONE,
                                owner.clone(),
                            )
                        },
                    )?;

                    // Set opaque attribute to ensure the new directory is not merged with lower.
                    entry.set_opaque_xattr(locked, current_task)?;

                    Ok(entry)
                },
            )?;

            Ok(self.node.init_fs_node_for_child(node, None, Some(new_upper_node)))
        })
    }
814
    fn create_symlink(
        &self,
        locked: &mut Locked<FileOpsCore>,
        node: &FsNode,
        current_task: &CurrentTask,
        name: &FsStr,
        target: &FsStr,
        owner: FsCred,
    ) -> Result<FsNodeHandle, Errno> {
        // Symlinks are always created in the upper FS. Act as the mounter, with credentials
        // possibly adjusted by the security policy for this new dentry.
        let mut creds = Credentials::clone(&self.node.stack.mounter);
        security::dentry_create_files_as(current_task, node, FileMode::IFLNK, name, &mut creds)?;
        current_task.override_creds(Arc::new(creds), || {
            // `create_entry` hands the inner closure a directory and a temporary name to
            // create the symlink under before it becomes visible as `name`.
            let new_upper_node = self.node.create_entry(
                locked,
                node,
                current_task,
                name,
                |locked, dir, temp_name| {
                    dir.create_entry(
                        locked,
                        current_task,
                        temp_name,
                        |locked, dir_node, mount, name| {
                            dir_node.create_symlink(
                                locked,
                                current_task,
                                mount,
                                name,
                                target,
                                owner.clone(),
                            )
                        },
                    )
                },
            )?;
            // Wrap the new upper entry in an overlay node (no lower counterpart).
            Ok(self.node.init_fs_node_for_child(node, None, Some(new_upper_node)))
        })
    }
853
854    fn readlink(
855        &self,
856        locked: &mut Locked<FileOpsCore>,
857        _node: &FsNode,
858        current_task: &CurrentTask,
859    ) -> Result<SymlinkTarget, Errno> {
860        self.node.as_mounter(current_task, || {
861            self.node.main_entry().entry().node.readlink(locked, current_task)
862        })
863    }
864
    fn link(
        &self,
        locked: &mut Locked<FileOpsCore>,
        node: &FsNode,
        current_task: &CurrentTask,
        name: &FsStr,
        child: &FsNodeHandle,
    ) -> Result<(), Errno> {
        self.node.as_mounter(current_task, || {
            // Hard links can only be created within a single FS, so first make sure the
            // link target exists in the upper FS.
            let child_overlay = OverlayNode::from_fs_node(child)?;
            let upper_child = child_overlay.ensure_upper(locked, current_task, &node.fs())?;
            // `create_entry` hands the inner closure a directory and a temporary name to
            // create the link under before it becomes visible as `name`.
            self.node.create_entry(
                locked,
                node,
                current_task,
                name,
                |locked, dir, temp_name| {
                    dir.create_entry(
                        locked,
                        current_task,
                        temp_name,
                        |locked, dir_node, mount, name| {
                            dir_node.link(
                                locked,
                                current_task,
                                mount,
                                name,
                                &upper_child.entry().node,
                            )
                        },
                    )
                },
            )?;
            Ok(())
        })
    }
901
    fn unlink(
        &self,
        locked: &mut Locked<FileOpsCore>,
        node: &FsNode,
        current_task: &CurrentTask,
        name: &FsStr,
        child: &FsNodeHandle,
    ) -> Result<(), Errno> {
        self.node.as_mounter(current_task, || {
            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?;
            let child_overlay = OverlayNode::from_fs_node(child)?;
            child_overlay.prepare_to_unlink(locked, current_task)?;

            // If the entry also exists in the lower FS, removing only the upper entry would
            // expose the lower one again. Replace it with a whiteout instead.
            // See https://docs.kernel.org/filesystems/overlayfs.html#whiteouts-and-opaque-directories
            let need_whiteout = self.node.lower_entry_exists(locked, current_task, name)?;
            if need_whiteout {
                self.node.stack.create_upper_entry(
                    locked,
                    current_task,
                    &upper,
                    &name,
                    |locked, work, name| work.create_whiteout(locked, current_task, name),
                    |_, _entry| Ok(()),
                )?;
            } else if let Some(child_upper) = child_overlay.upper.get() {
                // Upper-only entry: a plain unlink in the upper FS is sufficient.
                let kind = if child_upper.entry().node.is_dir() {
                    UnlinkKind::Directory
                } else {
                    UnlinkKind::NonDirectory
                };
                upper.entry().unlink(locked, current_task, upper.mount(), name, kind, false)?;
            }

            Ok(())
        })
    }
937
938    fn fetch_and_refresh_info<'a>(
939        &self,
940        locked: &mut Locked<FileOpsCore>,
941        _node: &FsNode,
942        current_task: &CurrentTask,
943        info: &'a RwLock<FsNodeInfo>,
944    ) -> Result<RwLockReadGuard<'a, FsNodeInfo>, Errno> {
945        self.node.as_mounter(current_task, || {
946            let real_info = self
947                .node
948                .main_entry()
949                .entry()
950                .node
951                .fetch_and_refresh_info(locked, current_task)?
952                .clone();
953            let mut lock = info.write();
954            *lock = real_info;
955            Ok(RwLockWriteGuard::downgrade(lock))
956        })
957    }
958
959    fn update_attributes(
960        &self,
961        locked: &mut Locked<FileOpsCore>,
962        node: &FsNode,
963        current_task: &CurrentTask,
964        new_info: &FsNodeInfo,
965        has: zxio_node_attr_has_t,
966    ) -> Result<(), Errno> {
967        self.node.as_mounter(current_task, || {
968            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?.entry();
969            upper.node.update_attributes(locked, current_task, |info| {
970                if has.modification_time {
971                    info.time_modify = new_info.time_modify;
972                }
973                if has.access_time {
974                    info.time_access = new_info.time_access;
975                }
976                if has.mode {
977                    info.mode = new_info.mode;
978                }
979                if has.uid {
980                    info.uid = new_info.uid;
981                }
982                if has.gid {
983                    info.gid = new_info.gid;
984                }
985                if has.rdev {
986                    info.rdev = new_info.rdev;
987                }
988                Ok(())
989            })
990        })
991    }
992
993    fn append_lock_read<'a>(
994        &'a self,
995        locked: &'a mut Locked<BeforeFsNodeAppend>,
996        node: &'a FsNode,
997        current_task: &CurrentTask,
998    ) -> Result<(RwQueueReadGuard<'a, FsNodeAppend>, &'a mut Locked<FsNodeAppend>), Errno> {
999        self.node.as_mounter(current_task, || {
1000            let upper_node =
1001                self.node.ensure_upper(locked, current_task, &node.fs())?.entry.node.as_ref();
1002            upper_node.ops().append_lock_read(locked, upper_node, current_task)
1003        })
1004    }
1005
1006    fn append_lock_write<'a>(
1007        &'a self,
1008        locked: &'a mut Locked<BeforeFsNodeAppend>,
1009        node: &'a FsNode,
1010        current_task: &CurrentTask,
1011    ) -> Result<(RwQueueWriteGuard<'a, FsNodeAppend>, &'a mut Locked<FsNodeAppend>), Errno> {
1012        self.node.as_mounter(current_task, || {
1013            let upper_node =
1014                self.node.ensure_upper(locked, current_task, &node.fs())?.entry.node.as_ref();
1015            upper_node.ops().append_lock_write(locked, upper_node, current_task)
1016        })
1017    }
1018
1019    fn truncate(
1020        &self,
1021        locked: &mut Locked<FileOpsCore>,
1022        guard: &AppendLockGuard<'_>,
1023        node: &FsNode,
1024        current_task: &CurrentTask,
1025        length: u64,
1026    ) -> Result<(), Errno> {
1027        self.node.as_mounter(current_task, || {
1028            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?;
1029
1030            upper.entry().node.truncate_locked(locked, guard, current_task, length)
1031        })
1032    }
1033
1034    fn allocate(
1035        &self,
1036        locked: &mut Locked<FileOpsCore>,
1037        guard: &AppendLockGuard<'_>,
1038        node: &FsNode,
1039        current_task: &CurrentTask,
1040        mode: FallocMode,
1041        offset: u64,
1042        length: u64,
1043    ) -> Result<(), Errno> {
1044        self.node.as_mounter(current_task, || {
1045            let node = &self.node.ensure_upper(locked, current_task, &node.fs())?.entry().node;
1046            node.fallocate_locked(locked, guard, current_task, mode, offset, length)
1047        })
1048    }
1049
1050    fn get_xattr(
1051        &self,
1052        locked: &mut Locked<FileOpsCore>,
1053        _node: &FsNode,
1054        current_task: &CurrentTask,
1055        name: &FsStr,
1056        max_size: usize,
1057    ) -> Result<ValueOrSize<FsString>, Errno> {
1058        let entry = self
1059            .node
1060            .upper
1061            .get()
1062            .or(self.node.lower.as_ref())
1063            .expect("expect either lower or upper node");
1064        self.node.as_mounter(current_task, || {
1065            entry.entry().node.get_xattr(locked, current_task, &entry.mount, name, max_size)
1066        })
1067    }
1068
1069    fn set_xattr(
1070        &self,
1071        locked: &mut Locked<FileOpsCore>,
1072        node: &FsNode,
1073        current_task: &CurrentTask,
1074        name: &FsStr,
1075        value: &FsStr,
1076        op: XattrOp,
1077    ) -> Result<(), Errno> {
1078        self.node.as_mounter(current_task, || {
1079            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?;
1080            upper.entry().node.set_xattr(locked, current_task, &upper.mount, name, value, op)
1081        })
1082    }
1083
1084    fn remove_xattr(
1085        &self,
1086        locked: &mut Locked<FileOpsCore>,
1087        node: &FsNode,
1088        current_task: &CurrentTask,
1089        name: &FsStr,
1090    ) -> Result<(), Errno> {
1091        self.node.as_mounter(current_task, || {
1092            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?;
1093            upper.entry().node.remove_xattr(locked, current_task, &upper.mount, name)
1094        })
1095    }
1096
1097    fn list_xattrs(
1098        &self,
1099        locked: &mut Locked<FileOpsCore>,
1100        _node: &FsNode,
1101        current_task: &CurrentTask,
1102        max_size: usize,
1103    ) -> Result<ValueOrSize<Vec<FsString>>, Errno> {
1104        self.node.as_mounter(current_task, || {
1105            let entry = self
1106                .node
1107                .upper
1108                .get()
1109                .or(self.node.lower.as_ref())
1110                .expect("expect either lower or upper node");
1111            entry.entry().node.list_xattrs(locked, current_task, max_size)
1112        })
1113    }
1114}
/// `FileOps` state for an opened overlay directory: the backing overlay node together with a
/// cached merged listing of its entries.
struct OverlayDirectory {
    node: Arc<OverlayNode>,
    // Merged upper+lower listing; rebuilt by `refresh_dir_entries()` when reading from offset 0.
    dir_entries: RwLock<DirEntries>,
}
1119
impl OverlayDirectory {
    /// Rebuilds the cached merged directory listing.
    ///
    /// Upper-directory entries are listed first; lower-directory entries are appended unless
    /// shadowed by an upper entry of the same name. Whiteout entries are filtered out of both
    /// layers. The lower layer is skipped entirely when there is no lower directory or the
    /// upper directory is marked opaque.
    fn refresh_dir_entries<L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
    ) -> Result<(), Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let mut entries = DirEntries::new();

        let upper_is_opaque = self.node.upper_is_opaque.get().is_some();
        let merge_with_lower = self.node.lower.is_some() && !upper_is_opaque;

        // First enumerate entries in the upper dir. Then enumerate the lower dir and add only
        // items that are not present in the upper.
        let mut upper_set = BTreeSet::new();
        if let Some(dir) = self.node.upper.get() {
            for item in dir.read_dir_entries(locked, current_task)?.drain(..) {
                // Fill `upper_set` only if we will need it later.
                if merge_with_lower {
                    upper_set.insert(item.name.clone());
                }
                // Whiteouts mark deleted lower entries; hide them from the listing.
                if !dir.is_whiteout_child(locked, current_task, &item)? {
                    entries.push(item);
                }
            }
        }

        if merge_with_lower {
            if let Some(dir) = &self.node.lower {
                for item in dir.read_dir_entries(locked, current_task)?.drain(..) {
                    if !upper_set.contains(&item.name)
                        && !dir.is_whiteout_child(locked, current_task, &item)?
                    {
                        entries.push(item);
                    }
                }
            }
        }

        // Publish the new snapshot atomically.
        *self.dir_entries.write() = entries;

        Ok(())
    }
}
1166
impl FileOps for OverlayDirectory {
    fileops_impl_directory!();
    fileops_impl_noop_sync!();

    /// Standard directory seek, performed with the mounter's credentials.
    fn seek(
        &self,
        _locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        current_offset: off_t,
        target: SeekTarget,
    ) -> Result<off_t, Errno> {
        self.node
            .as_mounter(current_task, || default_seek(current_offset, target, || error!(EINVAL)))
    }

    fn readdir(
        &self,
        locked: &mut Locked<FileOpsCore>,
        file: &FileObject,
        current_task: &CurrentTask,
        sink: &mut dyn DirentSink,
    ) -> Result<(), Errno> {
        self.node.as_mounter(current_task, || {
            // Rebuild the merged listing when reading from the start (e.g. after rewinddir).
            if sink.offset() == 0 {
                self.refresh_dir_entries(locked, current_task)?;
            }

            emit_dotdot(file, sink)?;

            // Offsets 0 and 1 are consumed by "." and ".." above, so cached entry i is
            // emitted at offset i + 2.
            for item in self.dir_entries.read().iter().skip(sink.offset() as usize - 2) {
                sink.add(item.inode_num, sink.offset() + 1, item.entry_type, item.name.as_ref())?;
            }

            Ok(())
        })
    }
}
1205
// Tracks which layer an opened overlay file is currently served from. A file opened from the
// lower layer is reopened from the upper layer once the node is promoted (see
// `OverlayFile::read`).
enum OverlayFileState {
    Lower(FileHandle),
    Upper(FileHandle),
}
1210
1211impl OverlayFileState {
1212    fn file(&self) -> &FileHandle {
1213        match self {
1214            Self::Lower(f) | Self::Upper(f) => f,
1215        }
1216    }
1217}
1218
// An opened overlay file. May initially be backed by the lower layer and switch to the upper
// layer when the node is promoted.
struct OverlayFile {
    node: Arc<OverlayNode>,
    // Flags the file was opened with; reused when reopening from the upper layer.
    flags: OpenFlags,
    state: RwLock<OverlayFileState>,
}
1224
impl FileOps for OverlayFile {
    fileops_impl_seekable!();

    fn read(
        &self,
        locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        offset: usize,
        data: &mut dyn OutputBuffer,
    ) -> Result<usize, Errno> {
        self.node.as_mounter(current_task, || {
            let mut state = self.state.read();

            // Check if the file was promoted to the upper FS. In that case we need to reopen it
            // from there.
            if let Some(upper) = self.node.upper.get() {
                if matches!(*state, OverlayFileState::Lower(_)) {
                    // Release the read lock before taking the write lock below.
                    std::mem::drop(state);

                    {
                        let mut write_state = self.state.write();

                        // TODO(mariagl): don't hold write_state while calling open_anonymous.
                        // It may call back into read(), causing lock order inversion.
                        *write_state = OverlayFileState::Upper(upper.entry().open_anonymous(
                            locked,
                            current_task,
                            self.flags,
                        )?);
                    }
                    state = self.state.read();
                }
            }

            // TODO(mariagl): Drop state here
            let file = state.file();
            security::file_permission(current_task, &file, security::PermissionFlags::READ)?;
            file.ops().read(locked, file, current_task, offset, data)
        })
    }

    fn write(
        &self,
        locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        offset: usize,
        data: &mut dyn InputBuffer,
    ) -> Result<usize, Errno> {
        self.node.as_mounter(current_task, || {
            let state = self.state.read();
            let file = match &*state {
                OverlayFileState::Upper(f) => f.clone(),

                // `write()` should be called only for files that were opened for write, and that
                // required the file to be promoted to the upper FS.
                OverlayFileState::Lower(_) => panic!("write() called for a lower FS file."),
            };
            // Drop the lock before delegating the write to the inner file.
            std::mem::drop(state);
            security::file_permission(current_task, &file, security::PermissionFlags::WRITE)?;
            file.ops().write(locked, &file, current_task, offset, data)
        })
    }

    fn sync(&self, _file: &FileObject, current_task: &CurrentTask) -> Result<(), Errno> {
        self.node.as_mounter(current_task, || {
            let state = self.state.read();
            let file = state.file();
            file.ops().sync(file, current_task)
        })
    }

    fn get_memory(
        &self,
        locked: &mut Locked<FileOpsCore>,
        _file: &FileObject,
        current_task: &CurrentTask,
        length: Option<usize>,
        prot: starnix_core::mm::ProtectionFlags,
    ) -> Result<Arc<MemoryObject>, Errno> {
        self.node.as_mounter(current_task, || {
            let state = self.state.read();
            let file = state.file();
            // Note that the VMO returned here will not be updated if the file is promoted to
            // the upper FS later. This is consistent with OverlayFS behavior on Linux, see
            // https://docs.kernel.org/filesystems/overlayfs.html#non-standard-behavior .
            file.ops().get_memory(locked, file, current_task, length, prot)
        })
    }
}
1316
/// Creates a new overlayfs `FileSystem` from mount `options` (expects the `lowerdir`,
/// `upperdir` and `workdir` parameters).
pub fn new_overlay_fs(
    locked: &mut Locked<Unlocked>,
    current_task: &CurrentTask,
    options: FileSystemOptions,
) -> Result<FileSystemHandle, Errno> {
    OverlayStack::new_fs(locked, current_task, options)
}
1324
/// Shared state of a mounted overlayfs: the participating file systems, the work directory,
/// and the credentials used to act on them.
pub struct OverlayStack {
    // Keep references to the underlying file systems to ensure they outlive `overlayfs` since
    // they may be unmounted before overlayfs.
    #[allow(unused)]
    lower_fs: FileSystemHandle,
    upper_fs: FileSystemHandle,

    // Directory on the upper FS where new entries are staged under temporary names before
    // being renamed into place (see `create_upper_entry()`).
    work: ActiveEntry,

    // Used when interacting with the `upper_fs`, `lower_fs` or `work` directories.
    mounter: Arc<Credentials>,
}
1337
impl OverlayStack {
    /// Creates an overlayfs from mount options. `lowerdir`, `upperdir` and `workdir` are all
    /// required; `upperdir` and `workdir` must be on the same file system.
    fn new_fs(
        locked: &mut Locked<Unlocked>,
        current_task: &CurrentTask,
        options: FileSystemOptions,
    ) -> Result<FileSystemHandle, Errno> {
        // `redirect_dir` is not implemented; accept only the default ("off").
        match options.params.get("redirect_dir".as_bytes()) {
            None => (),
            Some(o) if o == "off" => (),
            Some(_) => {
                track_stub!(TODO("https://fxbug.dev/322874205"), "overlayfs redirect_dir");
                return error!(ENOTSUP);
            }
        }

        let lower = resolve_dir_param(locked, current_task, &options.params, "lowerdir".into())?;
        let upper = resolve_dir_param(locked, current_task, &options.params, "upperdir".into())?;
        let work = resolve_dir_param(locked, current_task, &options.params, "workdir".into())?;

        let lower_fs = lower.entry().node.fs();
        let upper_fs = upper.entry().node.fs();

        // Entries are renamed from `workdir` into `upperdir`, which only works within one FS.
        if !Arc::ptr_eq(&upper_fs, &work.entry().node.fs()) {
            log_error!("overlayfs: upperdir and workdir must be on the same FS");
            return error!(EINVAL);
        }

        let kernel = current_task.kernel();
        // Capture the mounter's credentials: operations on the underlying file systems are
        // performed as the mounter, not as the task issuing each request.
        let mounter = current_task.current_creds().clone();
        let stack = Arc::new(OverlayStack { lower_fs, upper_fs, work, mounter });
        let root_node = OverlayNode::new(stack.clone(), Some(lower), Some(upper), None);
        let fs =
            FileSystem::new(locked, kernel, CacheMode::Uncached, OverlayFs { stack }, options)?;
        let root_ino = fs.allocate_ino();
        fs.create_root(root_ino, OverlayNodeOps { node: root_node });
        Ok(fs)
    }

    /// Given a filesystem, wraps it in a tmpfs-backed writable overlayfs.
    pub fn wrap_fs_in_writable_layer<L>(
        locked: &mut Locked<L>,
        kernel: &Kernel,
        rootfs: FileSystemHandle,
    ) -> Result<FileSystemHandle, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
    {
        let lower = ActiveEntry { entry: rootfs.root().clone(), mount: MountInfo::detached() };

        // Create upper and work directories in an invisible tmpfs.
        let invisible_tmp = TmpFs::new_fs(locked, kernel);

        // Builds a detached, root-owned 0777 directory on `fs`.
        let create_directory = |fs: &FileSystemHandle| {
            let ino = fs.allocate_ino();
            let info = FsNodeInfo::new(mode!(IFDIR, 0o777), FsCred::root());
            let node = fs.create_detached_node(ino, TmpFsDirectory::new(), info);
            let dir_entry = DirEntry::new(node, None, FsString::default());

            // TODO: https://fxbug.dev/455771186 - Revise FsNode initialization to better ensure
            // that all the things are appropriately labeled.
            security::fs_node_init_with_dentry_deferred(kernel, &dir_entry);

            dir_entry
        };

        let upper =
            ActiveEntry { entry: create_directory(&invisible_tmp), mount: MountInfo::detached() };
        let work =
            ActiveEntry { entry: create_directory(&invisible_tmp), mount: MountInfo::detached() };

        let lower_fs = rootfs;
        let upper_fs = invisible_tmp;

        let mounter = Credentials::root();
        let stack = Arc::new(OverlayStack { lower_fs, upper_fs, work, mounter });
        let root_node = OverlayNode::new(stack.clone(), Some(lower), Some(upper), None);
        let fs = FileSystem::new(
            locked,
            kernel,
            CacheMode::Uncached,
            OverlayFs { stack },
            FileSystemOptions::default(),
        )?;
        let root_ino = fs.allocate_ino();
        fs.create_root(root_ino, OverlayNodeOps { node: root_node });
        Ok(fs)
    }

    // Helper used to create new entry called `name` in `target_dir` in the upper FS.
    // 1. Calls `try_create` to create a new entry in `work`. It is called repeatedly with a
    //    new name until it returns any result other than `EEXIST`.
    // 2. `do_init` is called to initialize the contents and the attributes of the new entry, etc.
    // 3. The new entry is moved to `target_dir`. If there is an existing entry called `name` in
    //    `target_dir` then it's replaced with the new entry.
    // The temp file is cleared from the work dir if either of the last two steps fails.
    fn create_upper_entry<FCreate, FInit, L>(
        &self,
        locked: &mut Locked<L>,
        current_task: &CurrentTask,
        target_dir: &ActiveEntry,
        name: &FsStr,
        try_create: FCreate,
        do_init: FInit,
    ) -> Result<ActiveEntry, Errno>
    where
        L: LockEqualOrBefore<FileOpsCore>,
        FCreate: Fn(&mut Locked<L>, &ActiveEntry, &FsStr) -> Result<ActiveEntry, Errno>,
        FInit: FnOnce(&mut Locked<L>, &ActiveEntry) -> Result<(), Errno>,
    {
        // Pick random temp names until one doesn't collide with an existing work entry.
        let mut rng = rand::rng();
        let (temp_name, entry) = loop {
            let x: u64 = rng.random();
            let temp_name = FsString::from(format!("tmp{:x}", x));
            match try_create(locked, &self.work, temp_name.as_ref()) {
                Err(err) if err.code == EEXIST => continue,
                Err(err) => return Err(err),
                Ok(entry) => break (temp_name, entry),
            }
        };

        do_init(locked, &entry)
            .and_then(|()| {
                // Atomically move the initialized entry into place, replacing any existing one.
                DirEntry::rename(
                    locked,
                    current_task,
                    self.work.entry(),
                    self.work.mount(),
                    temp_name.as_ref(),
                    target_dir.entry(),
                    target_dir.mount(),
                    name,
                    RenameFlags::REPLACE_ANY,
                )
            })
            .map_err(|e| {
                // Remove the temp entry in case of a failure.
                // NOTE(review): this always unlinks with `UnlinkKind::NonDirectory`; if
                // `try_create` can produce a directory, cleanup would fail here — confirm.
                self.work
                    .entry()
                    .unlink(
                        locked,
                        current_task,
                        self.work.mount(),
                        temp_name.as_ref(),
                        UnlinkKind::NonDirectory,
                        false,
                    )
                    .unwrap_or_else(|e| {
                        log_error!("Failed to cleanup work dir after an error: {}", e)
                    });
                e
            })?;

        Ok(entry)
    }
}
1493
/// `FileSystemOps` implementation for overlayfs; holds the shared overlay state.
struct OverlayFs {
    stack: Arc<OverlayStack>,
}
1497
1498impl FileSystemOps for OverlayFs {
1499    fn statfs(
1500        &self,
1501        locked: &mut Locked<FileOpsCore>,
1502        _fs: &FileSystem,
1503        current_task: &CurrentTask,
1504    ) -> Result<statfs, Errno> {
1505        current_task.override_creds(self.stack.mounter.clone(), || {
1506            self.stack.upper_fs.statfs(locked, current_task)
1507        })
1508    }
1509
1510    fn name(&self) -> &'static FsStr {
1511        "overlay".into()
1512    }
1513
1514    fn rename(
1515        &self,
1516        locked: &mut Locked<FileOpsCore>,
1517        _fs: &FileSystem,
1518        current_task: &CurrentTask,
1519        old_parent: &FsNodeHandle,
1520        old_name: &FsStr,
1521        new_parent: &FsNodeHandle,
1522        new_name: &FsStr,
1523        renamed: &FsNodeHandle,
1524        _replaced: Option<&FsNodeHandle>,
1525    ) -> Result<(), Errno> {
1526        current_task.override_creds(self.stack.mounter.clone(), || {
1527            let renamed_overlay = OverlayNode::from_fs_node(renamed)?;
1528            if renamed_overlay.has_lower() && renamed_overlay.main_entry().entry().node.is_dir() {
1529                // Return EXDEV for directory renames. Potentially they may be handled with the
1530                // `redirect_dir` feature, but it's not implemented here yet.
1531                // See https://docs.kernel.org/filesystems/overlayfs.html#renaming-directories
1532                return error!(EXDEV);
1533            }
1534            renamed_overlay.ensure_upper(locked, current_task, &renamed.fs())?;
1535
1536            let old_parent_overlay = OverlayNode::from_fs_node(old_parent)?;
1537            let old_parent_upper =
1538                old_parent_overlay.ensure_upper(locked, current_task, &renamed.fs())?;
1539
1540            let new_parent_overlay = OverlayNode::from_fs_node(new_parent)?;
1541            let new_parent_upper =
1542                new_parent_overlay.ensure_upper(locked, current_task, &renamed.fs())?;
1543
1544            let need_whiteout =
1545                old_parent_overlay.lower_entry_exists(locked, current_task, old_name)?;
1546
1547            DirEntry::rename(
1548                locked,
1549                current_task,
1550                old_parent_upper.entry(),
1551                old_parent_upper.mount(),
1552                old_name,
1553                new_parent_upper.entry(),
1554                new_parent_upper.mount(),
1555                new_name,
1556                RenameFlags::REPLACE_ANY,
1557            )?;
1558
1559            // If the old node existed in lower FS, then override it in the upper FS with a
1560            // whiteout.
1561            if need_whiteout {
1562                match old_parent_upper.create_whiteout(locked, current_task, old_name) {
1563                    Err(e) => log_warn!("overlayfs: failed to create whiteout for {old_name}: {e}"),
1564                    Ok(_) => (),
1565                }
1566            }
1567
1568            Ok(())
1569        })
1570    }
1571
1572    fn unmount(&self) {}
1573}
1574
1575/// Helper used to resolve directories passed in mount options. The directory is resolved in the
1576/// namespace of the calling process, but only `DirEntry` is returned (detached from the
1577/// namespace). The corresponding file systems may be unmounted before overlayfs that uses them.
1578fn resolve_dir_param(
1579    locked: &mut Locked<Unlocked>,
1580    current_task: &CurrentTask,
1581    params: &MountParams,
1582    name: &FsStr,
1583) -> Result<ActiveEntry, Errno> {
1584    let path = params.get(&**name).ok_or_else(|| {
1585        log_error!("overlayfs: {name} was not specified");
1586        errno!(EINVAL)
1587    })?;
1588
1589    current_task
1590        .open_file(locked, path.as_ref(), OpenFlags::RDONLY | OpenFlags::DIRECTORY)
1591        .map(|f| ActiveEntry { entry: f.name.entry.clone(), mount: f.name.mount.clone() })
1592        .map_err(|e| {
1593            log_error!("overlayfs: Failed to lookup {path}: {}", e);
1594            e
1595        })
1596}
1597
1598/// Copies file content from one file to another.
1599fn copy_file_content<L>(
1600    locked: &mut Locked<L>,
1601    current_task: &CurrentTask,
1602    from: &ActiveEntry,
1603    to: &ActiveEntry,
1604) -> Result<(), Errno>
1605where
1606    L: LockEqualOrBefore<FileOpsCore>,
1607{
1608    let locked = locked.cast_locked::<FileOpsCore>();
1609    let from_file = from.entry().open_anonymous(locked, current_task, OpenFlags::RDONLY)?;
1610    let to_file = to.entry().open_anonymous(locked, current_task, OpenFlags::WRONLY)?;
1611
1612    security::fs_node_permission(
1613        current_task,
1614        from_file.node().as_ref(),
1615        security::PermissionFlags::READ,
1616        (&**from_file).into(),
1617    )?;
1618    security::fs_node_permission(
1619        current_task,
1620        to_file.node().as_ref(),
1621        security::PermissionFlags::WRITE,
1622        (&**to_file).into(),
1623    )?;
1624
1625    const BUFFER_SIZE: usize = 4096;
1626
1627    let mut read_offset = 0;
1628    let mut write_offset = 0;
1629    loop {
1630        // TODO(sergeyu): Reuse buffer between iterations.
1631
1632        let mut output_buffer = VecOutputBuffer::new(BUFFER_SIZE);
1633        let bytes_read = from_file.ops().read(
1634            locked,
1635            &from_file,
1636            current_task,
1637            read_offset,
1638            &mut output_buffer,
1639        )?;
1640        if bytes_read == 0 {
1641            break;
1642        }
1643        read_offset += bytes_read;
1644
1645        let buffer: Vec<u8> = output_buffer.into();
1646        let mut input_buffer = VecInputBuffer::from(buffer);
1647        while input_buffer.available() > 0 {
1648            write_offset += to_file.ops().write(
1649                locked,
1650                &to_file,
1651                current_task,
1652                write_offset,
1653                &mut input_buffer,
1654            )?;
1655        }
1656    }
1657
1658    to_file.ops().data_sync(&to_file, current_task)?;
1659
1660    Ok(())
1661}