Skip to main content

starnix_modules_overlayfs/
lib.rs

1// Copyright 2023 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#![recursion_limit = "512"]
6
7use fuchsia_rcu::RcuReadScope;
8use once_cell::sync::OnceCell;
9use rand::Rng;
10use starnix_core::fs::tmpfs::{TmpFs, TmpFsDirectory};
11use starnix_core::mm::memory::MemoryObject;
12use starnix_core::security;
13use starnix_core::task::{CurrentTask, Kernel};
14use starnix_core::vfs::fs_args::MountParams;
15use starnix_core::vfs::rw_queue::RwQueueReadGuard;
16use starnix_core::vfs::{
17    AlreadyLockedAppendLockStrategy, AppendLockGuard, CacheMode, DirEntry, DirEntryHandle,
18    DirectoryEntryType, DirentSink, FallocMode, FileHandle, FileObject, FileOps, FileSystem,
19    FileSystemHandle, FileSystemOps, FileSystemOptions, FsNode, FsNodeHandle, FsNodeInfo,
20    FsNodeOps, FsStr, FsString, InputBuffer, MountInfo, OutputBuffer, RenameFlags, SeekTarget,
21    SymlinkTarget, UnlinkKind, ValueOrSize, VecInputBuffer, VecOutputBuffer, XattrOp, default_seek,
22    emit_dotdot, fileops_impl_directory, fileops_impl_noop_sync, fileops_impl_seekable,
23};
24use starnix_logging::{log_error, log_warn, track_stub};
25use starnix_sync::{
26    BeforeFsNodeAppend, FileOpsCore, FsNodeAppend, LockEqualOrBefore, Locked, RwLock,
27    RwLockReadGuard, RwLockWriteGuard, Unlocked,
28};
29use starnix_uapi::auth::{Credentials, FsCred};
30use starnix_uapi::device_type::DeviceType;
31use starnix_uapi::errors::{EEXIST, ENOENT, Errno};
32use starnix_uapi::file_mode::{FileMode, mode};
33use starnix_uapi::open_flags::OpenFlags;
34use starnix_uapi::{errno, error, ino_t, off_t, statfs};
35use std::collections::BTreeSet;
36use std::sync::Arc;
37use syncio::zxio_node_attr_has_t;
38
/// Name of the xattr used to mark opaque directories in the upper FS.
/// See https://docs.kernel.org/filesystems/overlayfs.html#whiteouts-and-opaque-directories
const OPAQUE_DIR_XATTR: &str = "trusted.overlay.opaque";

/// Value stored in the `OPAQUE_DIR_XATTR` xattr of a directory that is marked as opaque.
const OPAQUE_DIR_XATTR_VALUE: &str = "y";
43
44#[derive(Clone)]
45struct DirEntryInfo {
46    name: FsString,
47    inode_num: ino_t,
48    entry_type: DirectoryEntryType,
49}
50
51type DirEntries = Vec<DirEntryInfo>;
52
53#[derive(Default)]
54struct DirentSinkAdapter {
55    items: Vec<DirEntryInfo>,
56    offset: off_t,
57}
58
59impl DirentSink for DirentSinkAdapter {
60    fn add(
61        &mut self,
62        inode_num: ino_t,
63        offset: off_t,
64        entry_type: DirectoryEntryType,
65        name: &FsStr,
66    ) -> Result<(), Errno> {
67        if !DirEntry::is_reserved_name(name) {
68            self.items.push(DirEntryInfo { name: name.to_owned(), inode_num, entry_type });
69        }
70        self.offset = offset;
71        Ok(())
72    }
73
74    fn offset(&self) -> off_t {
75        self.offset
76    }
77}
78
/// Selects how much of a lower file is copied when the file is promoted to the upper FS.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum UpperCopyMode {
    /// Create the upper entry without copying the content of a regular file. Used when the
    /// file is opened for writing with `O_TRUNC`.
    MetadataOnly,

    /// Create the upper entry and, for regular files, copy the file content as well.
    CopyAll,
}
84
85/// An `DirEntry` associated with the mount options. This is required because OverlayFs mostly
86/// works at the `DirEntry` level (mounts on the lower, upper and work directories are ignored),
87/// but operation must still depend on mount options.
88#[derive(Clone)]
89struct ActiveEntry {
90    entry: DirEntryHandle,
91    mount: MountInfo,
92}
93
94impl ActiveEntry {
95    fn mapper<'a>(entry: &'a ActiveEntry) -> impl Fn(DirEntryHandle) -> ActiveEntry + 'a {
96        |dir_entry| ActiveEntry { entry: dir_entry, mount: entry.mount.clone() }
97    }
98
99    fn entry(&self) -> &DirEntryHandle {
100        &self.entry
101    }
102
103    fn mount(&self) -> &MountInfo {
104        &self.mount
105    }
106
107    fn component_lookup<L>(
108        &self,
109        locked: &mut Locked<L>,
110        current_task: &CurrentTask,
111        name: &FsStr,
112    ) -> Result<Self, Errno>
113    where
114        L: LockEqualOrBefore<FileOpsCore>,
115    {
116        self.entry()
117            .component_lookup(locked, current_task, self.mount(), name)
118            .map(ActiveEntry::mapper(self))
119    }
120
121    fn create_entry<L>(
122        &self,
123        locked: &mut Locked<L>,
124        current_task: &CurrentTask,
125        name: &FsStr,
126        create_node_fn: impl FnOnce(
127            &mut Locked<L>,
128            &FsNodeHandle,
129            &MountInfo,
130            &FsStr,
131        ) -> Result<FsNodeHandle, Errno>,
132    ) -> Result<Self, Errno>
133    where
134        L: LockEqualOrBefore<FileOpsCore>,
135    {
136        self.entry()
137            .create_entry(locked, current_task, self.mount(), name, create_node_fn)
138            .map(ActiveEntry::mapper(self))
139    }
140
141    /// Sets an xattr to mark the directory referenced by `entry` as opaque. Directories that are
142    /// marked as opaque in the upper FS are not merged with the corresponding directories in the
143    /// lower FS.
144    fn set_opaque_xattr<L>(
145        &self,
146        locked: &mut Locked<L>,
147        current_task: &CurrentTask,
148    ) -> Result<(), Errno>
149    where
150        L: LockEqualOrBefore<FileOpsCore>,
151    {
152        self.entry().node.set_xattr(
153            locked,
154            current_task,
155            self.mount(),
156            OPAQUE_DIR_XATTR.into(),
157            OPAQUE_DIR_XATTR_VALUE.into(),
158            XattrOp::Set,
159        )
160    }
161
162    /// Checks if the `entry` is marked as opaque.
163    fn is_opaque_node<L>(&self, locked: &mut Locked<L>, current_task: &CurrentTask) -> bool
164    where
165        L: LockEqualOrBefore<FileOpsCore>,
166    {
167        match self.entry().node.get_xattr(
168            locked,
169            current_task,
170            self.mount(),
171            OPAQUE_DIR_XATTR.into(),
172            OPAQUE_DIR_XATTR_VALUE.len(),
173        ) {
174            Ok(ValueOrSize::Value(v)) if v == OPAQUE_DIR_XATTR_VALUE => true,
175            _ => false,
176        }
177    }
178
179    /// Creates a "whiteout" entry in the directory called `name`. Whiteouts are created by
180    /// overlayfs to denote files and directories that were removed and should not be listed in the
181    /// directory. This is necessary because we cannot remove entries from the lower FS.
182    fn create_whiteout<L>(
183        &self,
184        locked: &mut Locked<L>,
185        current_task: &CurrentTask,
186        name: &FsStr,
187    ) -> Result<ActiveEntry, Errno>
188    where
189        L: LockEqualOrBefore<FileOpsCore>,
190    {
191        self.create_entry(locked, current_task, name, |locked, dir, mount, name| {
192            dir.create_node(
193                locked,
194                current_task,
195                mount,
196                name,
197                FileMode::IFCHR,
198                DeviceType::NONE,
199                FsCred::root(),
200            )
201        })
202    }
203
204    /// Returns `true` if this is a "whiteout".
205    fn is_whiteout(&self) -> bool {
206        let info = self.entry().node.info();
207        info.mode.is_chr() && info.rdev == DeviceType::NONE
208    }
209
210    /// Checks whether the child of this entry represented by `info` is a "whiteout".
211    ///
212    /// Only looks up the corresponding `DirEntry` when necessary.
213    fn is_whiteout_child<L>(
214        &self,
215        locked: &mut Locked<L>,
216        current_task: &CurrentTask,
217        info: &DirEntryInfo,
218    ) -> Result<bool, Errno>
219    where
220        L: LockEqualOrBefore<FileOpsCore>,
221    {
222        // We need to lookup the node only if the file is a char device.
223        if info.entry_type != DirectoryEntryType::CHR {
224            return Ok(false);
225        }
226        let entry = self.component_lookup(locked, current_task, info.name.as_ref())?;
227        Ok(entry.is_whiteout())
228    }
229
230    fn read_dir_entries<L>(
231        &self,
232        locked: &mut Locked<L>,
233        current_task: &CurrentTask,
234    ) -> Result<Vec<DirEntryInfo>, Errno>
235    where
236        L: LockEqualOrBefore<FileOpsCore>,
237    {
238        let mut sink = DirentSinkAdapter::default();
239        self.entry().open_anonymous(locked, current_task, OpenFlags::DIRECTORY)?.readdir(
240            locked,
241            current_task,
242            &mut sink,
243        )?;
244        Ok(sink.items)
245    }
246}
247
248struct OverlayNode {
249    stack: Arc<OverlayStack>,
250
251    // Corresponding `DirEntries` in the lower and the upper filesystems. At least one must be
252    // set. Note that we don't care about `NamespaceNode`: overlayfs overlays filesystems
253    // (i.e. not namespace subtrees). These directories may not be mounted anywhere.
254    // `upper` may be created dynamically whenever write access is required.
255    upper: OnceCell<ActiveEntry>,
256    lower: Option<ActiveEntry>,
257
258    // `prepare_to_unlink()` may mark `upper` as opaque. In that case we want to skip merging
259    // with `lower` in `readdir()`.
260    upper_is_opaque: OnceCell<()>,
261
262    parent: Option<Arc<OverlayNode>>,
263}
264
265impl OverlayNode {
266    fn new(
267        stack: Arc<OverlayStack>,
268        lower: Option<ActiveEntry>,
269        upper: Option<ActiveEntry>,
270        parent: Option<Arc<OverlayNode>>,
271    ) -> Arc<Self> {
272        assert!(upper.is_some() || parent.is_some());
273
274        let upper = match upper {
275            Some(entry) => OnceCell::with_value(entry),
276            None => OnceCell::new(),
277        };
278
279        Arc::new(OverlayNode { stack, upper, lower, upper_is_opaque: OnceCell::new(), parent })
280    }
281
282    fn from_fs_node(node: &FsNodeHandle) -> Result<&Arc<Self>, Errno> {
283        Ok(&node.downcast_ops::<OverlayNodeOps>().ok_or_else(|| errno!(EIO))?.node)
284    }
285
286    fn main_entry(&self) -> &ActiveEntry {
287        self.upper.get().or(self.lower.as_ref()).expect("Expected either upper or lower node")
288    }
289
290    fn init_fs_node_for_child(
291        self: &Arc<OverlayNode>,
292        node: &FsNode,
293        lower: Option<ActiveEntry>,
294        upper: Option<ActiveEntry>,
295    ) -> FsNodeHandle {
296        let entry = upper.as_ref().or(lower.as_ref()).expect("expect either lower or upper node");
297        let ino = entry.entry().node.ino;
298        let info = entry.entry().node.info().clone();
299
300        // Parent may be needed to initialize `upper`. We don't need to pass it if we have `upper`.
301        let parent = if upper.is_some() { None } else { Some(self.clone()) };
302
303        let overlay_node =
304            OverlayNodeOps { node: OverlayNode::new(self.stack.clone(), lower, upper, parent) };
305        FsNode::new_uncached(ino, overlay_node, &node.fs(), info)
306    }
307
308    /// If the file is currently in the lower FS, then promote it to the upper FS. No-op if the
309    /// file is already in the upper FS.
310    fn ensure_upper<L>(
311        &self,
312        locked: &mut Locked<L>,
313        current_task: &CurrentTask,
314        fs: &FileSystem,
315    ) -> Result<&ActiveEntry, Errno>
316    where
317        L: LockEqualOrBefore<FileOpsCore>,
318    {
319        self.ensure_upper_maybe_copy(locked, current_task, UpperCopyMode::CopyAll, fs)
320    }
321
322    /// Same as `ensure_upper()`, but allows to skip copying of the file content.
323    fn ensure_upper_maybe_copy<L>(
324        &self,
325        locked: &mut Locked<L>,
326        current_task: &CurrentTask,
327        copy_mode: UpperCopyMode,
328        fs: &FileSystem,
329    ) -> Result<&ActiveEntry, Errno>
330    where
331        L: LockEqualOrBefore<FileOpsCore>,
332    {
333        self.upper.get_or_try_init(|| {
334            let lower = self.lower.as_ref().expect("lower is expected when upper is missing");
335            let parent = self.parent.as_ref().expect("Parent is expected when upper is missing");
336            let parent_upper = parent.ensure_upper(locked, current_task, fs)?;
337            let name = lower.entry.local_name(&RcuReadScope::new()).to_owned();
338            let info = {
339                let info = lower.entry.node.info();
340                info.clone()
341            };
342            let cred = info.cred();
343
344            let mut copy_up_creds = Credentials::clone(&self.stack.mounter);
345            security::fs_node_copy_up(current_task, &lower.entry.node, fs, &mut copy_up_creds);
346            let res = current_task.override_creds(Arc::new(copy_up_creds), || {
347                if info.mode.is_lnk() {
348                    let link_target = lower.entry.node.readlink(locked, current_task)?;
349                    let link_path = match &link_target {
350                        SymlinkTarget::Node(_) => return error!(EIO),
351                        SymlinkTarget::Path(path) => path,
352                    };
353                    parent_upper.create_entry(
354                        locked,
355                        current_task,
356                        name.as_ref(),
357                        |locked, dir, mount, name| {
358                            dir.create_symlink(
359                                locked,
360                                current_task,
361                                mount,
362                                name,
363                                link_path.as_ref(),
364                                cred,
365                            )
366                        },
367                    )
368                } else if info.mode.is_reg() && copy_mode == UpperCopyMode::CopyAll {
369                    // Regular files need to be copied from lower FS to upper FS.
370                    self.stack.create_upper_entry(
371                        locked,
372                        current_task,
373                        parent_upper,
374                        name.as_ref(),
375                        |locked, dir, name| {
376                            dir.create_entry(
377                                locked,
378                                current_task,
379                                name,
380                                |locked, dir_node, mount, name| {
381                                    dir_node.create_node(
382                                        locked,
383                                        current_task,
384                                        mount,
385                                        name,
386                                        info.mode,
387                                        DeviceType::NONE,
388                                        cred,
389                                    )
390                                },
391                            )
392                        },
393                        |locked, entry| copy_file_content(locked, current_task, lower, &entry),
394                    )
395                } else {
396                    parent_upper.create_entry(
397                        locked,
398                        current_task,
399                        name.as_ref(),
400                        |locked, dir, mount, name| {
401                            dir.create_node(
402                                locked,
403                                current_task,
404                                mount,
405                                name,
406                                info.mode,
407                                info.rdev,
408                                cred,
409                            )
410                        },
411                    )
412                }
413            });
414
415            track_stub!(TODO("https://fxbug.dev/322874151"), "overlayfs copy xattrs");
416            res
417        })
418    }
419
420    /// Checks if this node exists in the lower FS.
421    fn has_lower(&self) -> bool {
422        self.lower.is_some()
423    }
424
425    /// Check that an item isn't present in the lower FS.
426    fn lower_entry_exists<L>(
427        &self,
428        locked: &mut Locked<L>,
429        current_task: &CurrentTask,
430        name: &FsStr,
431    ) -> Result<bool, Errno>
432    where
433        L: LockEqualOrBefore<FileOpsCore>,
434    {
435        match &self.lower {
436            Some(lower) => match lower.component_lookup(locked, current_task, name) {
437                Ok(entry) => Ok(!entry.is_whiteout()),
438                Err(err) if err.code == ENOENT => Ok(false),
439                Err(err) => Err(err),
440            },
441            None => Ok(false),
442        }
443    }
444
445    /// Helper used to create a new entry in the directory. It first checks that the target node
446    /// doesn't exist. Then `do_create` is called to create the new node in the work dir, which
447    /// is then moved to the target dir in the upper file system.
448    ///
449    /// It's assumed that the calling `DirEntry` has the current directory locked, so it is not
450    /// supposed to change while this method is executed. Note that OveralayFS doesn't handle
451    /// the case when the underlying file systems are changed directly, but that restriction
452    /// is not enforced.
453    fn create_entry<F, L>(
454        self: &Arc<OverlayNode>,
455        locked: &mut Locked<L>,
456        node: &FsNode,
457        current_task: &CurrentTask,
458        name: &FsStr,
459        do_create: F,
460    ) -> Result<ActiveEntry, Errno>
461    where
462        F: Fn(&mut Locked<L>, &ActiveEntry, &FsStr) -> Result<ActiveEntry, Errno>,
463        L: LockEqualOrBefore<FileOpsCore>,
464    {
465        let upper = self.ensure_upper(locked, current_task, &node.fs())?;
466
467        match upper.component_lookup(locked, current_task, name) {
468            Ok(existing) => {
469                // If there is an entry in the upper dir, then it must be a whiteout.
470                if !existing.is_whiteout() {
471                    return error!(EEXIST);
472                }
473            }
474
475            Err(e) if e.code == ENOENT => {
476                // If we don't have the entry in the upper fs, then check lower.
477                if self.lower_entry_exists(locked, current_task, name)? {
478                    return error!(EEXIST);
479                }
480            }
481            Err(e) => return Err(e),
482        };
483
484        self.stack.create_upper_entry(
485            locked,
486            current_task,
487            upper,
488            name,
489            |locked, entry, fs| do_create(locked, entry, fs),
490            |_, _entry| Ok(()),
491        )
492    }
493
494    /// An overlay directory may appear empty when the corresponding upper dir isn't empty:
495    /// it may contain a number of whiteout entries. In that case the whiteouts need to be
496    /// unlinked before the upper directory can be unlinked as well.
497    /// `prepare_to_unlink()` checks that the directory doesn't contain anything other
498    /// than whiteouts and if that is the case then it unlinks all of them.
499    fn prepare_to_unlink<L>(
500        self: &Arc<OverlayNode>,
501        locked: &mut Locked<L>,
502        current_task: &CurrentTask,
503    ) -> Result<(), Errno>
504    where
505        L: LockEqualOrBefore<FileOpsCore>,
506    {
507        if self.main_entry().entry().node.is_dir() {
508            let mut lower_entries = BTreeSet::new();
509            if let Some(dir) = &self.lower {
510                for item in dir.read_dir_entries(locked, current_task)?.drain(..) {
511                    if !dir.is_whiteout_child(locked, current_task, &item)? {
512                        lower_entries.insert(item.name);
513                    }
514                }
515            }
516
517            if let Some(dir) = self.upper.get() {
518                let mut to_remove = Vec::<FsString>::new();
519                for item in dir.read_dir_entries(locked, current_task)?.drain(..) {
520                    if !dir.is_whiteout_child(locked, current_task, &item)? {
521                        return error!(ENOTEMPTY);
522                    }
523                    lower_entries.remove(&item.name);
524                    to_remove.push(item.name);
525                }
526
527                if !lower_entries.is_empty() {
528                    return error!(ENOTEMPTY);
529                }
530
531                // Mark the directory as opaque. Children can be removed after this.
532                dir.set_opaque_xattr(locked, current_task)?;
533                let _ = self.upper_is_opaque.set(());
534
535                // Finally, remove the children.
536                for name in to_remove.iter() {
537                    dir.entry().unlink(
538                        locked,
539                        current_task,
540                        dir.mount(),
541                        name.as_ref(),
542                        UnlinkKind::NonDirectory,
543                        false,
544                    )?;
545                }
546            }
547        }
548
549        Ok(())
550    }
551
552    fn as_mounter<R, F: FnOnce() -> R>(&self, current_task: &CurrentTask, do_work: F) -> R {
553        current_task.override_creds(self.stack.mounter.clone(), do_work)
554    }
555}
556
557struct OverlayNodeOps {
558    node: Arc<OverlayNode>,
559}
560
561impl FsNodeOps for OverlayNodeOps {
562    fn create_file_ops(
563        &self,
564        locked: &mut Locked<FileOpsCore>,
565        node: &FsNode,
566        current_task: &CurrentTask,
567        flags: OpenFlags,
568    ) -> Result<Box<dyn FileOps>, Errno> {
569        self.node.as_mounter(current_task, || {
570            if flags.can_write() {
571                // Only upper FS can be writable.
572                let copy_mode = if flags.contains(OpenFlags::TRUNC) {
573                    UpperCopyMode::MetadataOnly
574                } else {
575                    UpperCopyMode::CopyAll
576                };
577                self.node.ensure_upper_maybe_copy(locked, current_task, copy_mode, &node.fs())?;
578            }
579
580            let ops: Box<dyn FileOps> = if node.is_dir() {
581                Box::new(OverlayDirectory {
582                    node: self.node.clone(),
583                    dir_entries: Default::default(),
584                })
585            } else {
586                let state =
587                    match (self.node.upper.get(), &self.node.lower) {
588                        (Some(upper), _) => OverlayFileState::Upper(upper.entry().open_anonymous(
589                            locked,
590                            current_task,
591                            flags,
592                        )?),
593                        (None, Some(lower)) => OverlayFileState::Lower(
594                            lower.entry().open_anonymous(locked, current_task, flags)?,
595                        ),
596                        _ => panic!("Expected either upper or lower node"),
597                    };
598
599                Box::new(OverlayFile { node: self.node.clone(), flags, state: RwLock::new(state) })
600            };
601
602            Ok(ops)
603        })
604    }
605
606    fn lookup(
607        &self,
608        locked: &mut Locked<FileOpsCore>,
609        node: &FsNode,
610        current_task: &CurrentTask,
611        name: &FsStr,
612    ) -> Result<FsNodeHandle, Errno> {
613        self.node.as_mounter(current_task, || {
614            let resolve_child = |locked: &mut Locked<FileOpsCore>,
615                                 dir_opt: Option<&ActiveEntry>| {
616                // TODO(sergeyu): lookup() checks access, but we don't need that here.
617                dir_opt
618                    .as_ref()
619                    .map(|dir| match dir.component_lookup(locked, current_task, name) {
620                        Ok(entry) => Some(Ok(entry)),
621                        Err(e) if e.code == ENOENT => None,
622                        Err(e) => Some(Err(e)),
623                    })
624                    .flatten()
625                    .transpose()
626            };
627
628            let upper: Option<ActiveEntry> = resolve_child(locked, self.node.upper.get())?;
629
630            let (upper_is_dir, upper_is_opaque) = match &upper {
631                Some(upper) if upper.is_whiteout() => return error!(ENOENT),
632                Some(upper) => {
633                    let is_dir = upper.entry().node.is_dir();
634                    let is_opaque = !is_dir || upper.is_opaque_node(locked, current_task);
635                    (is_dir, is_opaque)
636                }
637                None => (false, false),
638            };
639
640            let parent_upper_is_opaque = self.node.upper_is_opaque.get().is_some();
641
642            // We don't need to resolve the lower node if we have an opaque node in the upper dir.
643            let lookup_lower = !parent_upper_is_opaque && !upper_is_opaque;
644            let lower: Option<ActiveEntry> = if lookup_lower {
645                match resolve_child(locked, self.node.lower.as_ref())? {
646                    // If the upper node is a directory and the lower isn't then ignore the lower node.
647                    Some(lower) if upper_is_dir && !lower.entry().node.is_dir() => None,
648                    Some(lower) if lower.is_whiteout() => None,
649                    result => result,
650                }
651            } else {
652                None
653            };
654
655            if upper.is_none() && lower.is_none() {
656                return error!(ENOENT);
657            }
658
659            Ok(self.node.init_fs_node_for_child(node, lower, upper))
660        })
661    }
662
663    fn mknod(
664        &self,
665        locked: &mut Locked<FileOpsCore>,
666        node: &FsNode,
667        current_task: &CurrentTask,
668        name: &FsStr,
669        mode: FileMode,
670        dev: DeviceType,
671        owner: FsCred,
672    ) -> Result<FsNodeHandle, Errno> {
673        let mut creds = Credentials::clone(&self.node.stack.mounter);
674        security::dentry_create_files_as(current_task, node, mode, name, &mut creds)?;
675        current_task.override_creds(Arc::new(creds), || {
676            let new_upper_node = self.node.create_entry(
677                locked,
678                node,
679                current_task,
680                name,
681                |locked, dir, temp_name| {
682                    dir.create_entry(
683                        locked,
684                        current_task,
685                        temp_name,
686                        |locked, dir_node, mount, name| {
687                            dir_node.create_node(
688                                locked,
689                                current_task,
690                                mount,
691                                name,
692                                mode,
693                                dev,
694                                owner.clone(),
695                            )
696                        },
697                    )
698                },
699            )?;
700            Ok(self.node.init_fs_node_for_child(node, None, Some(new_upper_node)))
701        })
702    }
703
704    fn mkdir(
705        &self,
706        locked: &mut Locked<FileOpsCore>,
707        node: &FsNode,
708        current_task: &CurrentTask,
709        name: &FsStr,
710        mode: FileMode,
711        owner: FsCred,
712    ) -> Result<FsNodeHandle, Errno> {
713        let mut creds = Credentials::clone(&self.node.stack.mounter);
714        security::dentry_create_files_as(current_task, node, mode, name, &mut creds)?;
715        current_task.override_creds(Arc::new(creds), || {
716            let new_upper_node = self.node.create_entry(
717                locked,
718                node,
719                current_task,
720                name,
721                |locked, dir, temp_name| {
722                    let entry = dir.create_entry(
723                        locked,
724                        current_task,
725                        temp_name,
726                        |locked, dir_node, mount, name| {
727                            dir_node.create_node(
728                                locked,
729                                current_task,
730                                mount,
731                                name,
732                                mode,
733                                DeviceType::NONE,
734                                owner.clone(),
735                            )
736                        },
737                    )?;
738
739                    // Set opaque attribute to ensure the new directory is not merged with lower.
740                    entry.set_opaque_xattr(locked, current_task)?;
741
742                    Ok(entry)
743                },
744            )?;
745
746            Ok(self.node.init_fs_node_for_child(node, None, Some(new_upper_node)))
747        })
748    }
749
750    fn create_symlink(
751        &self,
752        locked: &mut Locked<FileOpsCore>,
753        node: &FsNode,
754        current_task: &CurrentTask,
755        name: &FsStr,
756        target: &FsStr,
757        owner: FsCred,
758    ) -> Result<FsNodeHandle, Errno> {
759        let mut creds = Credentials::clone(&self.node.stack.mounter);
760        security::dentry_create_files_as(current_task, node, FileMode::IFLNK, name, &mut creds)?;
761        current_task.override_creds(Arc::new(creds), || {
762            let new_upper_node = self.node.create_entry(
763                locked,
764                node,
765                current_task,
766                name,
767                |locked, dir, temp_name| {
768                    dir.create_entry(
769                        locked,
770                        current_task,
771                        temp_name,
772                        |locked, dir_node, mount, name| {
773                            dir_node.create_symlink(
774                                locked,
775                                current_task,
776                                mount,
777                                name,
778                                target,
779                                owner.clone(),
780                            )
781                        },
782                    )
783                },
784            )?;
785            Ok(self.node.init_fs_node_for_child(node, None, Some(new_upper_node)))
786        })
787    }
788
789    fn readlink(
790        &self,
791        locked: &mut Locked<FileOpsCore>,
792        _node: &FsNode,
793        current_task: &CurrentTask,
794    ) -> Result<SymlinkTarget, Errno> {
795        self.node.as_mounter(current_task, || {
796            self.node.main_entry().entry().node.readlink(locked, current_task)
797        })
798    }
799
800    fn link(
801        &self,
802        locked: &mut Locked<FileOpsCore>,
803        node: &FsNode,
804        current_task: &CurrentTask,
805        name: &FsStr,
806        child: &FsNodeHandle,
807    ) -> Result<(), Errno> {
808        self.node.as_mounter(current_task, || {
809            let child_overlay = OverlayNode::from_fs_node(child)?;
810            let upper_child = child_overlay.ensure_upper(locked, current_task, &node.fs())?;
811            self.node.create_entry(
812                locked,
813                node,
814                current_task,
815                name,
816                |locked, dir, temp_name| {
817                    dir.create_entry(
818                        locked,
819                        current_task,
820                        temp_name,
821                        |locked, dir_node, mount, name| {
822                            dir_node.link(
823                                locked,
824                                current_task,
825                                mount,
826                                name,
827                                &upper_child.entry().node,
828                            )
829                        },
830                    )
831                },
832            )?;
833            Ok(())
834        })
835    }
836
837    fn unlink(
838        &self,
839        locked: &mut Locked<FileOpsCore>,
840        node: &FsNode,
841        current_task: &CurrentTask,
842        name: &FsStr,
843        child: &FsNodeHandle,
844    ) -> Result<(), Errno> {
845        self.node.as_mounter(current_task, || {
846            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?;
847            let child_overlay = OverlayNode::from_fs_node(child)?;
848            child_overlay.prepare_to_unlink(locked, current_task)?;
849
850            let need_whiteout = self.node.lower_entry_exists(locked, current_task, name)?;
851            if need_whiteout {
852                self.node.stack.create_upper_entry(
853                    locked,
854                    current_task,
855                    &upper,
856                    &name,
857                    |locked, work, name| work.create_whiteout(locked, current_task, name),
858                    |_, _entry| Ok(()),
859                )?;
860            } else if let Some(child_upper) = child_overlay.upper.get() {
861                let kind = if child_upper.entry().node.is_dir() {
862                    UnlinkKind::Directory
863                } else {
864                    UnlinkKind::NonDirectory
865                };
866                upper.entry().unlink(locked, current_task, upper.mount(), name, kind, false)?;
867            }
868
869            Ok(())
870        })
871    }
872
873    fn fetch_and_refresh_info<'a>(
874        &self,
875        locked: &mut Locked<FileOpsCore>,
876        _node: &FsNode,
877        current_task: &CurrentTask,
878        info: &'a RwLock<FsNodeInfo>,
879    ) -> Result<RwLockReadGuard<'a, FsNodeInfo>, Errno> {
880        self.node.as_mounter(current_task, || {
881            let real_info = self
882                .node
883                .main_entry()
884                .entry()
885                .node
886                .fetch_and_refresh_info(locked, current_task)?
887                .clone();
888            let mut lock = info.write();
889            *lock = real_info;
890            Ok(RwLockWriteGuard::downgrade(lock))
891        })
892    }
893
894    fn update_attributes(
895        &self,
896        locked: &mut Locked<FileOpsCore>,
897        node: &FsNode,
898        current_task: &CurrentTask,
899        new_info: &FsNodeInfo,
900        has: zxio_node_attr_has_t,
901    ) -> Result<(), Errno> {
902        self.node.as_mounter(current_task, || {
903            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?.entry();
904            upper.node.update_attributes(locked, current_task, |info| {
905                if has.modification_time {
906                    info.time_modify = new_info.time_modify;
907                }
908                if has.access_time {
909                    info.time_access = new_info.time_access;
910                }
911                if has.mode {
912                    info.mode = new_info.mode;
913                }
914                if has.uid {
915                    info.uid = new_info.uid;
916                }
917                if has.gid {
918                    info.gid = new_info.gid;
919                }
920                if has.rdev {
921                    info.rdev = new_info.rdev;
922                }
923                Ok(())
924            })
925        })
926    }
927
928    fn append_lock_read<'a>(
929        &'a self,
930        locked: &'a mut Locked<BeforeFsNodeAppend>,
931        node: &'a FsNode,
932        current_task: &CurrentTask,
933    ) -> Result<(RwQueueReadGuard<'a, FsNodeAppend>, &'a mut Locked<FsNodeAppend>), Errno> {
934        self.node.as_mounter(current_task, || {
935            let upper_node =
936                self.node.ensure_upper(locked, current_task, &node.fs())?.entry.node.as_ref();
937            upper_node.ops().append_lock_read(locked, upper_node, current_task)
938        })
939    }
940
941    fn truncate(
942        &self,
943        locked: &mut Locked<FileOpsCore>,
944        guard: &AppendLockGuard<'_>,
945        node: &FsNode,
946        current_task: &CurrentTask,
947        length: u64,
948    ) -> Result<(), Errno> {
949        self.node.as_mounter(current_task, || {
950            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?;
951
952            upper.entry().node.truncate_with_strategy(
953                locked,
954                AlreadyLockedAppendLockStrategy::new(guard),
955                current_task,
956                upper.mount(),
957                length,
958            )
959        })
960    }
961
962    fn allocate(
963        &self,
964        locked: &mut Locked<FileOpsCore>,
965        guard: &AppendLockGuard<'_>,
966        node: &FsNode,
967        current_task: &CurrentTask,
968        mode: FallocMode,
969        offset: u64,
970        length: u64,
971    ) -> Result<(), Errno> {
972        self.node.as_mounter(current_task, || {
973            let node = &self.node.ensure_upper(locked, current_task, &node.fs())?.entry().node;
974            node.fallocate_with_strategy(
975                locked,
976                AlreadyLockedAppendLockStrategy::new(guard),
977                current_task,
978                mode,
979                offset,
980                length,
981            )
982        })
983    }
984
985    fn get_xattr(
986        &self,
987        locked: &mut Locked<FileOpsCore>,
988        _node: &FsNode,
989        current_task: &CurrentTask,
990        name: &FsStr,
991        max_size: usize,
992    ) -> Result<ValueOrSize<FsString>, Errno> {
993        let entry = self
994            .node
995            .upper
996            .get()
997            .or(self.node.lower.as_ref())
998            .expect("expect either lower or upper node");
999        self.node.as_mounter(current_task, || {
1000            entry.entry().node.get_xattr(locked, current_task, &entry.mount, name, max_size)
1001        })
1002    }
1003
1004    fn set_xattr(
1005        &self,
1006        locked: &mut Locked<FileOpsCore>,
1007        node: &FsNode,
1008        current_task: &CurrentTask,
1009        name: &FsStr,
1010        value: &FsStr,
1011        op: XattrOp,
1012    ) -> Result<(), Errno> {
1013        self.node.as_mounter(current_task, || {
1014            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?;
1015            upper.entry().node.set_xattr(locked, current_task, &upper.mount, name, value, op)
1016        })
1017    }
1018
1019    fn remove_xattr(
1020        &self,
1021        locked: &mut Locked<FileOpsCore>,
1022        node: &FsNode,
1023        current_task: &CurrentTask,
1024        name: &FsStr,
1025    ) -> Result<(), Errno> {
1026        self.node.as_mounter(current_task, || {
1027            let upper = self.node.ensure_upper(locked, current_task, &node.fs())?;
1028            upper.entry().node.remove_xattr(locked, current_task, &upper.mount, name)
1029        })
1030    }
1031
1032    fn list_xattrs(
1033        &self,
1034        locked: &mut Locked<FileOpsCore>,
1035        _node: &FsNode,
1036        current_task: &CurrentTask,
1037        max_size: usize,
1038    ) -> Result<ValueOrSize<Vec<FsString>>, Errno> {
1039        self.node.as_mounter(current_task, || {
1040            let entry = self
1041                .node
1042                .upper
1043                .get()
1044                .or(self.node.lower.as_ref())
1045                .expect("expect either lower or upper node");
1046            entry.entry().node.list_xattrs(locked, current_task, max_size)
1047        })
1048    }
1049}
1050struct OverlayDirectory {
1051    node: Arc<OverlayNode>,
1052    dir_entries: RwLock<DirEntries>,
1053}
1054
1055impl OverlayDirectory {
1056    fn refresh_dir_entries<L>(
1057        &self,
1058        locked: &mut Locked<L>,
1059        current_task: &CurrentTask,
1060    ) -> Result<(), Errno>
1061    where
1062        L: LockEqualOrBefore<FileOpsCore>,
1063    {
1064        let mut entries = DirEntries::new();
1065
1066        let upper_is_opaque = self.node.upper_is_opaque.get().is_some();
1067        let merge_with_lower = self.node.lower.is_some() && !upper_is_opaque;
1068
1069        // First enumerate entries in the upper dir. Then enumerate the lower dir and add only
1070        // items that are not present in the upper.
1071        let mut upper_set = BTreeSet::new();
1072        if let Some(dir) = self.node.upper.get() {
1073            for item in dir.read_dir_entries(locked, current_task)?.drain(..) {
1074                // Fill `upper_set` only if we will need it later.
1075                if merge_with_lower {
1076                    upper_set.insert(item.name.clone());
1077                }
1078                if !dir.is_whiteout_child(locked, current_task, &item)? {
1079                    entries.push(item);
1080                }
1081            }
1082        }
1083
1084        if merge_with_lower {
1085            if let Some(dir) = &self.node.lower {
1086                for item in dir.read_dir_entries(locked, current_task)?.drain(..) {
1087                    if !upper_set.contains(&item.name)
1088                        && !dir.is_whiteout_child(locked, current_task, &item)?
1089                    {
1090                        entries.push(item);
1091                    }
1092                }
1093            }
1094        }
1095
1096        *self.dir_entries.write() = entries;
1097
1098        Ok(())
1099    }
1100}
1101
1102impl FileOps for OverlayDirectory {
1103    fileops_impl_directory!();
1104    fileops_impl_noop_sync!();
1105
1106    fn seek(
1107        &self,
1108        _locked: &mut Locked<FileOpsCore>,
1109        _file: &FileObject,
1110        current_task: &CurrentTask,
1111        current_offset: off_t,
1112        target: SeekTarget,
1113    ) -> Result<off_t, Errno> {
1114        self.node
1115            .as_mounter(current_task, || default_seek(current_offset, target, || error!(EINVAL)))
1116    }
1117
1118    fn readdir(
1119        &self,
1120        locked: &mut Locked<FileOpsCore>,
1121        file: &FileObject,
1122        current_task: &CurrentTask,
1123        sink: &mut dyn DirentSink,
1124    ) -> Result<(), Errno> {
1125        self.node.as_mounter(current_task, || {
1126            if sink.offset() == 0 {
1127                self.refresh_dir_entries(locked, current_task)?;
1128            }
1129
1130            emit_dotdot(file, sink)?;
1131
1132            for item in self.dir_entries.read().iter().skip(sink.offset() as usize - 2) {
1133                sink.add(item.inode_num, sink.offset() + 1, item.entry_type, item.name.as_ref())?;
1134            }
1135
1136            Ok(())
1137        })
1138    }
1139}
1140
1141enum OverlayFileState {
1142    Lower(FileHandle),
1143    Upper(FileHandle),
1144}
1145
1146impl OverlayFileState {
1147    fn file(&self) -> &FileHandle {
1148        match self {
1149            Self::Lower(f) | Self::Upper(f) => f,
1150        }
1151    }
1152}
1153
1154struct OverlayFile {
1155    node: Arc<OverlayNode>,
1156    flags: OpenFlags,
1157    state: RwLock<OverlayFileState>,
1158}
1159
1160impl FileOps for OverlayFile {
1161    fileops_impl_seekable!();
1162
1163    fn read(
1164        &self,
1165        locked: &mut Locked<FileOpsCore>,
1166        _file: &FileObject,
1167        current_task: &CurrentTask,
1168        offset: usize,
1169        data: &mut dyn OutputBuffer,
1170    ) -> Result<usize, Errno> {
1171        self.node.as_mounter(current_task, || {
1172            let mut state = self.state.read();
1173
1174            // Check if the file was promoted to the upper FS. In that case we need to reopen it
1175            // from there.
1176            if let Some(upper) = self.node.upper.get() {
1177                if matches!(*state, OverlayFileState::Lower(_)) {
1178                    std::mem::drop(state);
1179
1180                    {
1181                        let mut write_state = self.state.write();
1182
1183                        // TODO(mariagl): don't hold write_state while calling open_anonymous.
1184                        // It may call back into read(), causing lock order inversion.
1185                        *write_state = OverlayFileState::Upper(upper.entry().open_anonymous(
1186                            locked,
1187                            current_task,
1188                            self.flags,
1189                        )?);
1190                    }
1191                    state = self.state.read();
1192                }
1193            }
1194
1195            // TODO(mariagl): Drop state here
1196            state.file().read_at(locked, current_task, offset, data)
1197        })
1198    }
1199
1200    fn write(
1201        &self,
1202        locked: &mut Locked<FileOpsCore>,
1203        _file: &FileObject,
1204        current_task: &CurrentTask,
1205        offset: usize,
1206        data: &mut dyn InputBuffer,
1207    ) -> Result<usize, Errno> {
1208        self.node.as_mounter(current_task, || {
1209            let state = self.state.read();
1210            let file = match &*state {
1211                OverlayFileState::Upper(f) => f.clone(),
1212
1213                // `write()` should be called only for files that were opened for write, and that
1214                // required the file to be promoted to the upper FS.
1215                OverlayFileState::Lower(_) => panic!("write() called for a lower FS file."),
1216            };
1217            std::mem::drop(state);
1218            file.write_at(locked, current_task, offset, data)
1219        })
1220    }
1221
1222    fn sync(&self, _file: &FileObject, current_task: &CurrentTask) -> Result<(), Errno> {
1223        self.node.as_mounter(current_task, || self.state.read().file().sync(current_task))
1224    }
1225
1226    fn get_memory(
1227        &self,
1228        locked: &mut Locked<FileOpsCore>,
1229        _file: &FileObject,
1230        current_task: &CurrentTask,
1231        length: Option<usize>,
1232        prot: starnix_core::mm::ProtectionFlags,
1233    ) -> Result<Arc<MemoryObject>, Errno> {
1234        self.node.as_mounter(current_task, || {
1235            // Not that the VMO returned here will not updated if the file is promoted to upper FS
1236            // later. This is consistent with OverlayFS behavior on Linux, see
1237            // https://docs.kernel.org/filesystems/overlayfs.html#non-standard-behavior .
1238            self.state.read().file().get_memory(locked, current_task, length, prot)
1239        })
1240    }
1241}
1242
1243pub fn new_overlay_fs(
1244    locked: &mut Locked<Unlocked>,
1245    current_task: &CurrentTask,
1246    options: FileSystemOptions,
1247) -> Result<FileSystemHandle, Errno> {
1248    OverlayStack::new_fs(locked, current_task, options)
1249}
1250
1251pub struct OverlayStack {
1252    // Keep references to the underlying file systems to ensure they outlive `overlayfs` since
1253    // they may be unmounted before overlayfs.
1254    #[allow(unused)]
1255    lower_fs: FileSystemHandle,
1256    upper_fs: FileSystemHandle,
1257
1258    work: ActiveEntry,
1259
1260    // Used when interacting with the `upper_fs`, `lower_fs` or `work` directories.
1261    mounter: Arc<Credentials>,
1262}
1263
1264impl OverlayStack {
1265    fn new_fs(
1266        locked: &mut Locked<Unlocked>,
1267        current_task: &CurrentTask,
1268        options: FileSystemOptions,
1269    ) -> Result<FileSystemHandle, Errno> {
1270        match options.params.get("redirect_dir".as_bytes()) {
1271            None => (),
1272            Some(o) if o == "off" => (),
1273            Some(_) => {
1274                track_stub!(TODO("https://fxbug.dev/322874205"), "overlayfs redirect_dir");
1275                return error!(ENOTSUP);
1276            }
1277        }
1278
1279        let lower = resolve_dir_param(locked, current_task, &options.params, "lowerdir".into())?;
1280        let upper = resolve_dir_param(locked, current_task, &options.params, "upperdir".into())?;
1281        let work = resolve_dir_param(locked, current_task, &options.params, "workdir".into())?;
1282
1283        let lower_fs = lower.entry().node.fs();
1284        let upper_fs = upper.entry().node.fs();
1285
1286        if !Arc::ptr_eq(&upper_fs, &work.entry().node.fs()) {
1287            log_error!("overlayfs: upperdir and workdir must be on the same FS");
1288            return error!(EINVAL);
1289        }
1290
1291        let kernel = current_task.kernel();
1292        let mounter = current_task.current_creds().clone();
1293        let stack = Arc::new(OverlayStack { lower_fs, upper_fs, work, mounter });
1294        let root_node = OverlayNode::new(stack.clone(), Some(lower), Some(upper), None);
1295        let fs =
1296            FileSystem::new(locked, kernel, CacheMode::Uncached, OverlayFs { stack }, options)?;
1297        let root_ino = fs.allocate_ino();
1298        fs.create_root(root_ino, OverlayNodeOps { node: root_node });
1299        Ok(fs)
1300    }
1301
1302    /// Given a filesystem, wraps it in a tmpfs-backed writable overlayfs.
1303    pub fn wrap_fs_in_writable_layer<L>(
1304        locked: &mut Locked<L>,
1305        kernel: &Kernel,
1306        rootfs: FileSystemHandle,
1307    ) -> Result<FileSystemHandle, Errno>
1308    where
1309        L: LockEqualOrBefore<FileOpsCore>,
1310    {
1311        let lower = ActiveEntry { entry: rootfs.root().clone(), mount: MountInfo::detached() };
1312
1313        // Create upper and work directories in an invisible tmpfs.
1314        let invisible_tmp = TmpFs::new_fs(locked, kernel);
1315
1316        let create_directory = |fs: &FileSystemHandle| {
1317            let ino = fs.allocate_ino();
1318            let info = FsNodeInfo::new(mode!(IFDIR, 0o777), FsCred::root());
1319            let node = fs.create_detached_node(ino, TmpFsDirectory::new(), info);
1320            let dir_entry = DirEntry::new(node, None, FsString::default());
1321
1322            // TODO: https://fxbug.dev/455771186 - Revise FsNode initialization to better ensure
1323            // that all the things are appropriately labeled.
1324            security::fs_node_init_with_dentry_deferred(kernel, &dir_entry);
1325
1326            dir_entry
1327        };
1328
1329        let upper =
1330            ActiveEntry { entry: create_directory(&invisible_tmp), mount: MountInfo::detached() };
1331        let work =
1332            ActiveEntry { entry: create_directory(&invisible_tmp), mount: MountInfo::detached() };
1333
1334        let lower_fs = rootfs;
1335        let upper_fs = invisible_tmp;
1336
1337        let mounter = Credentials::root();
1338        let stack = Arc::new(OverlayStack { lower_fs, upper_fs, work, mounter });
1339        let root_node = OverlayNode::new(stack.clone(), Some(lower), Some(upper), None);
1340        let fs = FileSystem::new(
1341            locked,
1342            kernel,
1343            CacheMode::Uncached,
1344            OverlayFs { stack },
1345            FileSystemOptions::default(),
1346        )?;
1347        let root_ino = fs.allocate_ino();
1348        fs.create_root(root_ino, OverlayNodeOps { node: root_node });
1349        Ok(fs)
1350    }
1351
1352    // Helper used to create new entry called `name` in `target_dir` in the upper FS.
1353    // 1. Calls `try_create` to create a new entry in `work`. It is called repeateadly with a
1354    //    new name until it returns any result other than `EEXIST`.
1355    // 2. `do_init` is called to initilize the contents and the attributes of the new entry, etc.
1356    // 3. The new entry is moved to `target_dir`. If there is an existing entry called `name` in
1357    //    `target_dir` then it's replaced with the new entry.
1358    // The temp file is cleared from the work dir if either of the last two steps fails.
1359    fn create_upper_entry<FCreate, FInit, L>(
1360        &self,
1361        locked: &mut Locked<L>,
1362        current_task: &CurrentTask,
1363        target_dir: &ActiveEntry,
1364        name: &FsStr,
1365        try_create: FCreate,
1366        do_init: FInit,
1367    ) -> Result<ActiveEntry, Errno>
1368    where
1369        L: LockEqualOrBefore<FileOpsCore>,
1370        FCreate: Fn(&mut Locked<L>, &ActiveEntry, &FsStr) -> Result<ActiveEntry, Errno>,
1371        FInit: FnOnce(&mut Locked<L>, &ActiveEntry) -> Result<(), Errno>,
1372    {
1373        let mut rng = rand::rng();
1374        let (temp_name, entry) = loop {
1375            let x: u64 = rng.random();
1376            let temp_name = FsString::from(format!("tmp{:x}", x));
1377            match try_create(locked, &self.work, temp_name.as_ref()) {
1378                Err(err) if err.code == EEXIST => continue,
1379                Err(err) => return Err(err),
1380                Ok(entry) => break (temp_name, entry),
1381            }
1382        };
1383
1384        do_init(locked, &entry)
1385            .and_then(|()| {
1386                DirEntry::rename(
1387                    locked,
1388                    current_task,
1389                    self.work.entry(),
1390                    self.work.mount(),
1391                    temp_name.as_ref(),
1392                    target_dir.entry(),
1393                    target_dir.mount(),
1394                    name,
1395                    RenameFlags::REPLACE_ANY,
1396                )
1397            })
1398            .map_err(|e| {
1399                // Remove the temp entry in case of a failure.
1400                self.work
1401                    .entry()
1402                    .unlink(
1403                        locked,
1404                        current_task,
1405                        self.work.mount(),
1406                        temp_name.as_ref(),
1407                        UnlinkKind::NonDirectory,
1408                        false,
1409                    )
1410                    .unwrap_or_else(|e| {
1411                        log_error!("Failed to cleanup work dir after an error: {}", e)
1412                    });
1413                e
1414            })?;
1415
1416        Ok(entry)
1417    }
1418}
1419
1420struct OverlayFs {
1421    stack: Arc<OverlayStack>,
1422}
1423
1424impl FileSystemOps for OverlayFs {
1425    fn statfs(
1426        &self,
1427        locked: &mut Locked<FileOpsCore>,
1428        _fs: &FileSystem,
1429        current_task: &CurrentTask,
1430    ) -> Result<statfs, Errno> {
1431        current_task.override_creds(self.stack.mounter.clone(), || {
1432            self.stack.upper_fs.statfs(locked, current_task)
1433        })
1434    }
1435
1436    fn name(&self) -> &'static FsStr {
1437        "overlay".into()
1438    }
1439
1440    fn rename(
1441        &self,
1442        locked: &mut Locked<FileOpsCore>,
1443        _fs: &FileSystem,
1444        current_task: &CurrentTask,
1445        old_parent: &FsNodeHandle,
1446        old_name: &FsStr,
1447        new_parent: &FsNodeHandle,
1448        new_name: &FsStr,
1449        renamed: &FsNodeHandle,
1450        _replaced: Option<&FsNodeHandle>,
1451    ) -> Result<(), Errno> {
1452        current_task.override_creds(self.stack.mounter.clone(), || {
1453            let renamed_overlay = OverlayNode::from_fs_node(renamed)?;
1454            if renamed_overlay.has_lower() && renamed_overlay.main_entry().entry().node.is_dir() {
1455                // Return EXDEV for directory renames. Potentially they may be handled with the
1456                // `redirect_dir` feature, but it's not implemented here yet.
1457                // See https://docs.kernel.org/filesystems/overlayfs.html#renaming-directories
1458                return error!(EXDEV);
1459            }
1460            renamed_overlay.ensure_upper(locked, current_task, &renamed.fs())?;
1461
1462            let old_parent_overlay = OverlayNode::from_fs_node(old_parent)?;
1463            let old_parent_upper =
1464                old_parent_overlay.ensure_upper(locked, current_task, &renamed.fs())?;
1465
1466            let new_parent_overlay = OverlayNode::from_fs_node(new_parent)?;
1467            let new_parent_upper =
1468                new_parent_overlay.ensure_upper(locked, current_task, &renamed.fs())?;
1469
1470            let need_whiteout =
1471                old_parent_overlay.lower_entry_exists(locked, current_task, old_name)?;
1472
1473            DirEntry::rename(
1474                locked,
1475                current_task,
1476                old_parent_upper.entry(),
1477                old_parent_upper.mount(),
1478                old_name,
1479                new_parent_upper.entry(),
1480                new_parent_upper.mount(),
1481                new_name,
1482                RenameFlags::REPLACE_ANY,
1483            )?;
1484
1485            // If the old node existed in lower FS, then override it in the upper FS with a
1486            // whiteout.
1487            if need_whiteout {
1488                match old_parent_upper.create_whiteout(locked, current_task, old_name) {
1489                    Err(e) => log_warn!("overlayfs: failed to create whiteout for {old_name}: {e}"),
1490                    Ok(_) => (),
1491                }
1492            }
1493
1494            Ok(())
1495        })
1496    }
1497
1498    fn unmount(&self) {}
1499}
1500
1501/// Helper used to resolve directories passed in mount options. The directory is resolved in the
1502/// namespace of the calling process, but only `DirEntry` is returned (detached from the
1503/// namespace). The corresponding file systems may be unmounted before overlayfs that uses them.
1504fn resolve_dir_param(
1505    locked: &mut Locked<Unlocked>,
1506    current_task: &CurrentTask,
1507    params: &MountParams,
1508    name: &FsStr,
1509) -> Result<ActiveEntry, Errno> {
1510    let path = params.get(&**name).ok_or_else(|| {
1511        log_error!("overlayfs: {name} was not specified");
1512        errno!(EINVAL)
1513    })?;
1514
1515    current_task
1516        .open_file(locked, path.as_ref(), OpenFlags::RDONLY | OpenFlags::DIRECTORY)
1517        .map(|f| ActiveEntry { entry: f.name.entry.clone(), mount: f.name.mount.clone() })
1518        .map_err(|e| {
1519            log_error!("overlayfs: Failed to lookup {path}: {}", e);
1520            e
1521        })
1522}
1523
1524/// Copies file content from one file to another.
1525fn copy_file_content<L>(
1526    locked: &mut Locked<L>,
1527    current_task: &CurrentTask,
1528    from: &ActiveEntry,
1529    to: &ActiveEntry,
1530) -> Result<(), Errno>
1531where
1532    L: LockEqualOrBefore<FileOpsCore>,
1533{
1534    let from_file = from.entry().open_anonymous(locked, current_task, OpenFlags::RDONLY)?;
1535    let to_file = to.entry().open_anonymous(locked, current_task, OpenFlags::WRONLY)?;
1536
1537    const BUFFER_SIZE: usize = 4096;
1538
1539    loop {
1540        // TODO(sergeyu): Reuse buffer between iterations.
1541
1542        let mut output_buffer = VecOutputBuffer::new(BUFFER_SIZE);
1543        let bytes_read = from_file.read(locked, current_task, &mut output_buffer)?;
1544        if bytes_read == 0 {
1545            break;
1546        }
1547
1548        let buffer: Vec<u8> = output_buffer.into();
1549        let mut input_buffer = VecInputBuffer::from(buffer);
1550        while input_buffer.available() > 0 {
1551            to_file.write(locked, current_task, &mut input_buffer)?;
1552        }
1553    }
1554
1555    to_file.data_sync(current_task)?;
1556
1557    Ok(())
1558}