starnix_core/device/
remote_block_device.rs

1// Copyright 2024 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use crate::device::DeviceMode;
6use crate::device::kobject::DeviceMetadata;
7use crate::fs::sysfs::{BlockDeviceInfo, build_block_device_directory};
8use crate::mm::MemoryAccessorExt;
9use crate::task::CurrentTask;
10use crate::vfs::buffers::{InputBuffer, OutputBuffer};
11use crate::vfs::{
12    FileObject, FileOps, FsString, NamespaceNode, SeekTarget, default_ioctl, default_seek,
13};
14use anyhow::{Context as _, Error};
15use block_client::{BufferSlice, MutableBufferSlice, RemoteBlockClientSync};
16use fidl::endpoints::ClientEnd;
17use fidl_fuchsia_hardware_block_volume::VolumeMarker;
18use starnix_sync::{FileOpsCore, LockEqualOrBefore, Locked, Mutex, Unlocked};
19use starnix_syscalls::{SUCCESS, SyscallArg, SyscallResult};
20use starnix_uapi::device_type::{BLOCK_EXTENDED_MAJOR, DeviceType};
21use starnix_uapi::errors::Errno;
22use starnix_uapi::open_flags::OpenFlags;
23use starnix_uapi::user_address::UserRef;
24use starnix_uapi::{BLKGETSIZE, BLKGETSIZE64, errno, from_status_like_fdio, off_t};
25use std::collections::btree_map::BTreeMap;
26use std::sync::Arc;
27use std::sync::atomic::{AtomicU32, Ordering};
28
29/// A block device, backed by a partition hosted by Fuchsia.
30pub struct RemoteBlockDevice {
31    block_client: Arc<RemoteBlockClientSync>,
32}
33
34impl RemoteBlockDevice {
35    pub fn read(&self, offset: u64, buf: &mut [u8]) -> Result<(), Error> {
36        self.block_client
37            .read_at(MutableBufferSlice::Memory(buf), offset as u64)
38            .context("read_at failed")
39    }
40
41    fn new<L>(
42        locked: &mut Locked<L>,
43        current_task: &CurrentTask,
44        minor: u32,
45        name: &str,
46        block: ClientEnd<VolumeMarker>,
47    ) -> Result<Arc<Self>, Errno>
48    where
49        L: LockEqualOrBefore<FileOpsCore>,
50    {
51        let kernel = current_task.kernel();
52        let registry = &kernel.device_registry;
53        let device_name = FsString::from(name);
54        let virtual_block_class = registry.objects.virtual_block_class();
55        let block_client = RemoteBlockClientSync::new(block.into_channel().into())
56            .map_err(|status| from_status_like_fdio!(status))?;
57        let device = Arc::new(Self { block_client: Arc::new(block_client) });
58        let device_weak = Arc::<RemoteBlockDevice>::downgrade(&device);
59        registry.add_device(
60            locked,
61            current_task,
62            device_name.as_ref(),
63            DeviceMetadata::new(
64                device_name.clone(),
65                DeviceType::new(BLOCK_EXTENDED_MAJOR, minor),
66                DeviceMode::Block,
67            ),
68            virtual_block_class,
69            |device, dir| build_block_device_directory(device, device_weak, dir),
70        )?;
71        Ok(device)
72    }
73
74    pub fn create_file_ops(&self) -> Box<dyn FileOps> {
75        Box::new(RemoteBlockDeviceFile { block_client: self.block_client.clone() })
76    }
77}
78
79impl BlockDeviceInfo for RemoteBlockDevice {
80    fn size(&self) -> Result<usize, Errno> {
81        (self.block_client.block_count() as usize)
82            .checked_mul(self.block_client.block_size() as usize)
83            .ok_or_else(|| errno!(EINVAL))
84    }
85}
86
87struct RemoteBlockDeviceFile {
88    block_client: Arc<RemoteBlockClientSync>,
89}
90
91impl RemoteBlockDeviceFile {
92    fn size(&self) -> Result<usize, Errno> {
93        (self.block_client.block_count() as usize)
94            .checked_mul(self.block_client.block_size() as usize)
95            .ok_or_else(|| errno!(EINVAL))
96    }
97}
98
99impl FileOps for RemoteBlockDeviceFile {
100    fn has_persistent_offsets(&self) -> bool {
101        true
102    }
103
104    fn is_seekable(&self) -> bool {
105        true
106    }
107
108    // Manually implement seek, because default_eof_offset uses st_size (which is not used for block
109    // devices).
110    fn seek(
111        &self,
112        _locked: &mut Locked<FileOpsCore>,
113        _file: &FileObject,
114        _current_task: &CurrentTask,
115        current_offset: off_t,
116        target: SeekTarget,
117    ) -> Result<off_t, Errno> {
118        default_seek(current_offset, target, || self.size()?.try_into().map_err(|_| errno!(EINVAL)))
119    }
120
121    fn read(
122        &self,
123        _locked: &mut Locked<FileOpsCore>,
124        _file: &FileObject,
125        _current_task: &CurrentTask,
126        mut offset: usize,
127        data: &mut dyn OutputBuffer,
128    ) -> Result<usize, Errno> {
129        let size = self.size()?;
130        let block_size = self.block_client.block_size() as usize;
131        const MAX_CHUNK_SIZE: usize = 32 * 1024; // 32KB
132
133        let mut total_read = 0;
134        while data.available() > 0 && offset < size {
135            let chunk_len = std::cmp::min(data.available(), MAX_CHUNK_SIZE);
136            let chunk_len = std::cmp::min(chunk_len, size - offset);
137            if chunk_len == 0 {
138                break;
139            }
140
141            let aligned_offset = offset - offset % block_size;
142            let end_offset = offset + chunk_len;
143            let aligned_end_offset = std::cmp::min(
144                end_offset.checked_next_multiple_of(block_size).ok_or_else(|| errno!(EINVAL))?,
145                size,
146            );
147            let aligned_data_length = aligned_end_offset - aligned_offset;
148
149            let mut read_data = vec![0u8; aligned_data_length];
150            self.block_client
151                .read_at(MutableBufferSlice::Memory(&mut read_data), aligned_offset as u64)
152                .map_err(|status| from_status_like_fdio!(status))?;
153
154            let read_offset = offset - aligned_offset;
155            let read_end = read_offset + chunk_len;
156            let bytes_written = data.write(&read_data[read_offset..read_end])?;
157
158            offset += bytes_written;
159            total_read += bytes_written;
160
161            if bytes_written < chunk_len {
162                break;
163            }
164        }
165        Ok(total_read)
166    }
167
168    fn write(
169        &self,
170        _locked: &mut Locked<FileOpsCore>,
171        _file: &FileObject,
172        _current_task: &CurrentTask,
173        mut offset: usize,
174        data: &mut dyn InputBuffer,
175    ) -> Result<usize, Errno> {
176        let size = self.size()?;
177        let block_size = self.block_client.block_size() as usize;
178        const MAX_CHUNK_SIZE: usize = 32 * 1024; // 32KB
179
180        let mut total_written = 0;
181        while data.available() > 0 && offset < size {
182            let chunk_len = std::cmp::min(data.available(), MAX_CHUNK_SIZE);
183            let chunk_len = std::cmp::min(chunk_len, size - offset);
184            if chunk_len == 0 {
185                break;
186            }
187
188            let aligned_offset = offset - offset % block_size;
189            let end_offset = offset + chunk_len;
190            let aligned_end_offset = std::cmp::min(
191                end_offset.checked_next_multiple_of(block_size).ok_or_else(|| errno!(EINVAL))?,
192                size,
193            );
194            let aligned_data_length = aligned_end_offset - aligned_offset;
195
196            let mut write_buf = vec![0u8; aligned_data_length];
197
198            // Read-Modify-Write: If the write is not block-aligned at the start or end,
199            // we need to read the existing data for the first and/or last block to preserve it.
200            let head_unaligned = offset > aligned_offset;
201            let tail_unaligned = end_offset < aligned_end_offset;
202
203            if head_unaligned {
204                self.block_client
205                    .read_at(
206                        MutableBufferSlice::Memory(&mut write_buf[..block_size]),
207                        aligned_offset as u64,
208                    )
209                    .map_err(|status| from_status_like_fdio!(status))?;
210            }
211
212            if tail_unaligned {
213                let last_block_start = aligned_data_length - block_size;
214                // If we already read the first block and it's the same as the last block, don't
215                // read again.
216                if !head_unaligned || last_block_start > 0 {
217                    self.block_client
218                        .read_at(
219                            MutableBufferSlice::Memory(&mut write_buf[last_block_start..]),
220                            (aligned_offset + last_block_start) as u64,
221                        )
222                        .map_err(|status| from_status_like_fdio!(status))?;
223                }
224            }
225
226            let write_offset = offset - aligned_offset;
227            let write_slice = &mut write_buf[write_offset..write_offset + chunk_len];
228            // SAFETY: We are writing to a buffer of u8, which is always initialized.
229            // We can safely cast &mut [u8] to &mut [MaybeUninit<u8>].
230            let write_slice_uninit = unsafe {
231                std::slice::from_raw_parts_mut(
232                    write_slice.as_mut_ptr() as *mut std::mem::MaybeUninit<u8>,
233                    write_slice.len(),
234                )
235            };
236            let bytes_read = data.read(write_slice_uninit)?;
237
238            self.block_client
239                .write_at(BufferSlice::Memory(&write_buf), aligned_offset as u64)
240                .map_err(|status| from_status_like_fdio!(status))?;
241
242            offset += bytes_read;
243            total_written += bytes_read;
244
245            if bytes_read < chunk_len {
246                break;
247            }
248        }
249        Ok(total_written)
250    }
251
252    fn sync(&self, _file: &FileObject, _current_task: &CurrentTask) -> Result<(), Errno> {
253        self.block_client.flush().map_err(|status| from_status_like_fdio!(status))
254    }
255
256    fn ioctl(
257        &self,
258        locked: &mut Locked<Unlocked>,
259        file: &FileObject,
260        current_task: &CurrentTask,
261        request: u32,
262        arg: SyscallArg,
263    ) -> Result<SyscallResult, Errno> {
264        match request {
265            BLKGETSIZE => {
266                let user_size = UserRef::<u64>::from(arg);
267                let size = self.block_client.block_count();
268                current_task.write_object(user_size, &size)?;
269                Ok(SUCCESS)
270            }
271            BLKGETSIZE64 => {
272                let user_size = UserRef::<u64>::from(arg);
273                let size = self.size()? as u64;
274                current_task.write_object(user_size, &size)?;
275                Ok(SUCCESS)
276            }
277            _ => default_ioctl(file, locked, current_task, request, arg),
278        }
279    }
280}
281
282fn open_remote_block_device(
283    _locked: &mut Locked<FileOpsCore>,
284    current_task: &CurrentTask,
285    id: DeviceType,
286    _node: &NamespaceNode,
287    _flags: OpenFlags,
288) -> Result<Box<dyn FileOps>, Errno> {
289    Ok(current_task.kernel().remote_block_device_registry.open(id.minor())?.create_file_ops())
290}
291
292pub fn remote_block_device_init(locked: &mut Locked<Unlocked>, current_task: &CurrentTask) {
293    current_task
294        .kernel()
295        .device_registry
296        .register_major(
297            locked,
298            "remote-block".into(),
299            DeviceMode::Block,
300            BLOCK_EXTENDED_MAJOR,
301            open_remote_block_device,
302        )
303        .expect("remote block device register failed.");
304}
305
306#[derive(Default)]
307pub struct RemoteBlockDeviceRegistry {
308    devices: Mutex<BTreeMap<u32, Arc<RemoteBlockDevice>>>,
309    next_minor: AtomicU32,
310}
311
312impl RemoteBlockDeviceRegistry {
313    pub fn create_remote_block_device<L>(
314        &self,
315        locked: &mut Locked<L>,
316        current_task: &CurrentTask,
317        name: &str,
318        block: ClientEnd<VolumeMarker>,
319    ) -> Result<(), Error>
320    where
321        L: LockEqualOrBefore<FileOpsCore>,
322    {
323        let mut devices = self.devices.lock();
324        let minor = self.next_minor.fetch_add(1, Ordering::Relaxed);
325        let device = RemoteBlockDevice::new(locked, current_task, minor, name, block)?;
326        devices.insert(minor, device);
327        Ok(())
328    }
329
330    pub fn open(&self, minor: u32) -> Result<Arc<RemoteBlockDevice>, Errno> {
331        self.devices.lock().get(&minor).ok_or_else(|| errno!(ENODEV)).cloned()
332    }
333}
334
#[cfg(test)]
mod tests {
    use super::*;
    use crate::testing::{anon_test_file, map_object_anywhere, spawn_kernel_and_run};
    use crate::vfs::{SeekTarget, VecInputBuffer, VecOutputBuffer};
    use starnix_uapi::open_flags::OpenFlags;
    use starnix_uapi::{BLKGETSIZE, BLKGETSIZE64};
    use vmo_backed_block_server::{VmoBackedServer, VmoBackedServerTestingExt};
    use zerocopy::FromBytes as _;

    /// Shared test fixture: registers the remote-block driver, serves a VMO-backed block device
    /// with `$block_count` blocks of 512 bytes initialized from `$contents` on a background
    /// thread, registers it under the name "test", and evaluates to a read/write file opened on
    /// minor 0.
    ///
    /// A macro rather than a function so that the harness-provided `locked` and `current_task`
    /// bindings can be used without naming their types.
    macro_rules! open_test_device {
        ($locked:ident, $current_task:ident, $block_count:expr, $contents:expr) => {{
            let kernel = $current_task.kernel();
            remote_block_device_init($locked, &$current_task);
            let registry = kernel.remote_block_device_registry.clone();
            let server = Arc::new(VmoBackedServer::new($block_count, 512, $contents));
            let (client, server_end) = fidl::endpoints::create_endpoints::<VolumeMarker>();
            std::thread::spawn(move || {
                let mut executor = fuchsia_async::LocalExecutor::default();
                executor.run_singlethreaded(async move {
                    use fidl::endpoints::RequestStream;
                    server.serve(server_end.into_stream().cast_stream()).await.unwrap();
                });
            });

            registry
                .create_remote_block_device($locked, &$current_task, "test", client)
                .expect("create_remote_block_device failed.");

            let device = registry.open(0).expect("open failed.");
            anon_test_file($locked, &$current_task, device.create_file_ops(), OpenFlags::RDWR)
        }};
    }

    /// End-to-end check of the size ioctls plus a sub-block write and read-back.
    #[::fuchsia::test]
    async fn test_remote_block_device_registry() {
        spawn_kernel_and_run(async |locked, current_task| {
            // 2 blocks of 512 bytes = 1024 bytes
            let file = open_test_device!(locked, current_task, 2, &[0u8; 1024]);

            let arg_addr = map_object_anywhere(locked, &current_task, &0u64);
            let mut raw_result = [0u8; 8];

            file.ioctl(locked, &current_task, BLKGETSIZE64, arg_addr.into()).expect("ioctl failed");
            current_task.read_memory_to_slice(arg_addr, &mut raw_result).unwrap();
            assert_eq!(u64::read_from_bytes(&raw_result).unwrap(), 1024);

            file.ioctl(locked, &current_task, BLKGETSIZE, arg_addr.into()).expect("ioctl failed");
            current_task.read_memory_to_slice(arg_addr, &mut raw_result).unwrap();
            assert_eq!(u64::read_from_bytes(&raw_result).unwrap(), 2);

            // The buffer size here is intentionally not a multiple of the block size: reads
            // originate from sources we do not control, so the device has to cope with the
            // alignment itself.
            let mut initial = VecOutputBuffer::new(256);
            file.read(locked, &current_task, &mut initial).expect("read failed.");
            assert_eq!(initial.data(), &[0u8; 256]);

            let mut ones = VecInputBuffer::from(vec![1u8; 256]);
            file.seek(locked, &current_task, SeekTarget::Set(0)).expect("seek failed");
            file.write(locked, &current_task, &mut ones).expect("write failed.");

            let mut readback = VecOutputBuffer::new(256);
            file.seek(locked, &current_task, SeekTarget::Set(0)).expect("seek failed");
            file.read(locked, &current_task, &mut readback).expect("read failed.");
            assert_eq!(readback.data(), &[1u8; 256]);
        })
        .await;
    }

    /// Reads and writes positioned at the end of the device transfer zero bytes.
    #[::fuchsia::test]
    async fn test_read_write_past_eof() {
        spawn_kernel_and_run(async |locked, current_task| {
            // 2 blocks of 512 bytes = 1024 bytes
            let file = open_test_device!(locked, current_task, 2, &[0u8; 1024]);

            file.seek(locked, &current_task, SeekTarget::End(0)).expect("seek failed");

            let mut sink = VecOutputBuffer::new(512);
            let bytes_read = file.read(locked, &current_task, &mut sink).expect("read failed.");
            assert_eq!(bytes_read, 0);

            let mut source = VecInputBuffer::from(vec![1u8; 512]);
            let bytes_written =
                file.write(locked, &current_task, &mut source).expect("write failed.");
            assert_eq!(bytes_written, 0);
        })
        .await;
    }

    /// A write that starts and ends mid-block and covers a whole block in between round-trips
    /// and leaves the neighbouring bytes untouched.
    #[::fuchsia::test]
    async fn test_unaligned_read_write_spanning_blocks() {
        spawn_kernel_and_run(async |locked, current_task| {
            // 3 blocks of 512 bytes = 1536 bytes
            let file = open_test_device!(locked, current_task, 3, &[0u8; 1536]);

            // Write 516 bytes starting at offset 510, i.e. the range 510..1026: the last 2 bytes
            // of the 1st block, all 512 bytes of the 2nd block and the first 2 bytes of the 3rd.
            let mut pattern = VecInputBuffer::from(vec![0xAAu8; 516]);
            file.seek(locked, &current_task, SeekTarget::Set(510)).expect("seek failed");
            let bytes_written =
                file.write(locked, &current_task, &mut pattern).expect("write failed.");
            assert_eq!(bytes_written, 516);

            // The same range reads back as written.
            let mut readback = VecOutputBuffer::new(516);
            file.seek(locked, &current_task, SeekTarget::Set(510)).expect("seek failed");
            let bytes_read =
                file.read(locked, &current_task, &mut readback).expect("read failed.");
            assert_eq!(bytes_read, 516);
            assert_eq!(readback.data(), &[0xAAu8; 516]);

            // The byte just before the written range is still 0...
            let mut before = VecOutputBuffer::new(1);
            file.seek(locked, &current_task, SeekTarget::Set(509)).expect("seek failed");
            let bytes_read = file.read(locked, &current_task, &mut before).expect("read failed.");
            assert_eq!(bytes_read, 1);
            assert_eq!(before.data(), &[0u8]);

            // ...and so is the byte just after it.
            let mut after = VecOutputBuffer::new(1);
            file.seek(locked, &current_task, SeekTarget::Set(1026)).expect("seek failed");
            let bytes_read = file.read(locked, &current_task, &mut after).expect("read failed.");
            assert_eq!(bytes_read, 1);
            assert_eq!(after.data(), &[0u8]);
        })
        .await;
    }

    /// Writes ending exactly at the end of the device succeed, and reads crossing the end are
    /// truncated to the bytes that exist.
    #[::fuchsia::test]
    async fn test_exact_eof_boundary() {
        spawn_kernel_and_run(async |locked, current_task| {
            // 2 blocks of 512 bytes = 1024 bytes
            let file = open_test_device!(locked, current_task, 2, &[0u8; 1024]);

            // 4 bytes written at offset 1020 end exactly at EOF (1024).
            let mut tail = VecInputBuffer::from(vec![0xBBu8; 4]);
            file.seek(locked, &current_task, SeekTarget::Set(1020)).expect("seek failed");
            let bytes_written =
                file.write(locked, &current_task, &mut tail).expect("write failed.");
            assert_eq!(bytes_written, 4);

            // Those 4 bytes read back as written.
            let mut readback = VecOutputBuffer::new(4);
            file.seek(locked, &current_task, SeekTarget::Set(1020)).expect("seek failed");
            let bytes_read =
                file.read(locked, &current_task, &mut readback).expect("read failed.");
            assert_eq!(bytes_read, 4);
            assert_eq!(readback.data(), &[0xBBu8; 4]);

            // Asking for 5 bytes from offset 1020 would run past EOF; only 4 are returned.
            let mut oversized = VecOutputBuffer::new(5);
            file.seek(locked, &current_task, SeekTarget::Set(1020)).expect("seek failed");
            let bytes_read =
                file.read(locked, &current_task, &mut oversized).expect("read failed.");
            assert_eq!(bytes_read, 4);
            assert_eq!(oversized.data()[..4], [0xBBu8; 4]);
        })
        .await;
    }

    /// A small write in the middle of a block keeps the rest of that block intact.
    #[::fuchsia::test]
    async fn test_rmw_preserves_data() {
        spawn_kernel_and_run(async |locked, current_task| {
            // 3 blocks of 512 bytes = 1536 bytes, pre-filled with a known pattern (0xFF).
            let file = open_test_device!(locked, current_task, 3, &[0xFFu8; 1536]);

            // 100 bytes at offset 600 fall strictly inside the second block (bytes 512..1024),
            // so the write has to go through read-modify-write for that block.
            let mut patch = VecInputBuffer::from(vec![0xAAu8; 100]);
            file.seek(locked, &current_task, SeekTarget::Set(600)).expect("seek failed");
            let bytes_written =
                file.write(locked, &current_task, &mut patch).expect("write failed.");
            assert_eq!(bytes_written, 100);

            // Fetch the whole second block to inspect it.
            let mut second_block = VecOutputBuffer::new(512);
            file.seek(locked, &current_task, SeekTarget::Set(512)).expect("seek failed");
            let bytes_read =
                file.read(locked, &current_task, &mut second_block).expect("read failed.");
            assert_eq!(bytes_read, 512);
            let data = second_block.data();

            // 512..600 (88 bytes) keeps the original 0xFF.
            assert_eq!(&data[0..88], &[0xFFu8; 88]);
            // 600..700 (100 bytes) holds the newly written 0xAA.
            assert_eq!(&data[88..188], &[0xAAu8; 100]);
            // 700..1024 (324 bytes) keeps the original 0xFF.
            assert_eq!(&data[188..512], &[0xFFu8; 324]);
        })
        .await;
    }
}
578}