bstr/
io.rs

/*!
Utilities for working with I/O using byte strings.

This module currently exports a single trait, `BufReadExt`, together with the
`ByteLines` and `ByteRecords` iterator types that it returns. They provide
facilities for conveniently and efficiently working with lines (and, more
generally, byte-terminated records) as byte strings.

More APIs may be added in the future.
*/
9
10use alloc::{vec, vec::Vec};
11
12use std::io;
13
14use crate::{ext_slice::ByteSlice, ext_vec::ByteVec};
15
16/// An extension trait for
17/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
18/// which provides convenience APIs for dealing with byte strings.
19pub trait BufReadExt: io::BufRead {
20    /// Returns an iterator over the lines of this reader, where each line
21    /// is represented as a byte string.
22    ///
23    /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
24    /// an error is yielded if there was a problem reading from the underlying
25    /// reader.
26    ///
27    /// On success, the next line in the iterator is returned. The line does
28    /// *not* contain a trailing `\n` or `\r\n`.
29    ///
30    /// # Examples
31    ///
32    /// Basic usage:
33    ///
34    /// ```
35    /// use std::io;
36    ///
37    /// use bstr::io::BufReadExt;
38    ///
39    /// # fn example() -> Result<(), io::Error> {
40    /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
41    ///
42    /// let mut lines = vec![];
43    /// for result in cursor.byte_lines() {
44    ///     let line = result?;
45    ///     lines.push(line);
46    /// }
47    /// assert_eq!(lines.len(), 3);
48    /// assert_eq!(lines[0], "lorem".as_bytes());
49    /// assert_eq!(lines[1], "ipsum".as_bytes());
50    /// assert_eq!(lines[2], "dolor".as_bytes());
51    /// # Ok(()) }; example().unwrap()
52    /// ```
53    fn byte_lines(self) -> ByteLines<Self>
54    where
55        Self: Sized,
56    {
57        ByteLines { buf: self }
58    }
59
60    /// Returns an iterator over byte-terminated records of this reader, where
61    /// each record is represented as a byte string.
62    ///
63    /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
64    /// an error is yielded if there was a problem reading from the underlying
65    /// reader.
66    ///
67    /// On success, the next record in the iterator is returned. The record
68    /// does *not* contain its trailing terminator.
69    ///
70    /// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in
71    /// that it has no special handling for `\r`.
72    ///
73    /// # Examples
74    ///
75    /// Basic usage:
76    ///
77    /// ```
78    /// use std::io;
79    ///
80    /// use bstr::io::BufReadExt;
81    ///
82    /// # fn example() -> Result<(), io::Error> {
83    /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
84    ///
85    /// let mut records = vec![];
86    /// for result in cursor.byte_records(b'\x00') {
87    ///     let record = result?;
88    ///     records.push(record);
89    /// }
90    /// assert_eq!(records.len(), 3);
91    /// assert_eq!(records[0], "lorem".as_bytes());
92    /// assert_eq!(records[1], "ipsum".as_bytes());
93    /// assert_eq!(records[2], "dolor".as_bytes());
94    /// # Ok(()) }; example().unwrap()
95    /// ```
96    fn byte_records(self, terminator: u8) -> ByteRecords<Self>
97    where
98        Self: Sized,
99    {
100        ByteRecords { terminator, buf: self }
101    }
102
103    /// Executes the given closure on each line in the underlying reader.
104    ///
105    /// If the closure returns an error (or if the underlying reader returns an
106    /// error), then iteration is stopped and the error is returned. If false
107    /// is returned, then iteration is stopped and no error is returned.
108    ///
109    /// The closure given is called on exactly the same values as yielded by
110    /// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
111    /// iterator. Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes.
112    ///
113    /// This routine is useful for iterating over lines as quickly as
114    /// possible. Namely, a single allocation is reused for each line.
115    ///
116    /// # Examples
117    ///
118    /// Basic usage:
119    ///
120    /// ```
121    /// use std::io;
122    ///
123    /// use bstr::io::BufReadExt;
124    ///
125    /// # fn example() -> Result<(), io::Error> {
126    /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
127    ///
128    /// let mut lines = vec![];
129    /// cursor.for_byte_line(|line| {
130    ///     lines.push(line.to_vec());
131    ///     Ok(true)
132    /// })?;
133    /// assert_eq!(lines.len(), 3);
134    /// assert_eq!(lines[0], "lorem".as_bytes());
135    /// assert_eq!(lines[1], "ipsum".as_bytes());
136    /// assert_eq!(lines[2], "dolor".as_bytes());
137    /// # Ok(()) }; example().unwrap()
138    /// ```
139    fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()>
140    where
141        Self: Sized,
142        F: FnMut(&[u8]) -> io::Result<bool>,
143    {
144        self.for_byte_line_with_terminator(|line| {
145            for_each_line(&trim_line_slice(&line))
146        })
147    }
148
149    /// Executes the given closure on each byte-terminated record in the
150    /// underlying reader.
151    ///
152    /// If the closure returns an error (or if the underlying reader returns an
153    /// error), then iteration is stopped and the error is returned. If false
154    /// is returned, then iteration is stopped and no error is returned.
155    ///
156    /// The closure given is called on exactly the same values as yielded by
157    /// the [`byte_records`](trait.BufReadExt.html#method.byte_records)
158    /// iterator. Namely, records do _not_ contain a trailing terminator byte.
159    ///
160    /// This routine is useful for iterating over records as quickly as
161    /// possible. Namely, a single allocation is reused for each record.
162    ///
163    /// # Examples
164    ///
165    /// Basic usage:
166    ///
167    /// ```
168    /// use std::io;
169    ///
170    /// use bstr::io::BufReadExt;
171    ///
172    /// # fn example() -> Result<(), io::Error> {
173    /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
174    ///
175    /// let mut records = vec![];
176    /// cursor.for_byte_record(b'\x00', |record| {
177    ///     records.push(record.to_vec());
178    ///     Ok(true)
179    /// })?;
180    /// assert_eq!(records.len(), 3);
181    /// assert_eq!(records[0], "lorem".as_bytes());
182    /// assert_eq!(records[1], "ipsum".as_bytes());
183    /// assert_eq!(records[2], "dolor".as_bytes());
184    /// # Ok(()) }; example().unwrap()
185    /// ```
186    fn for_byte_record<F>(
187        &mut self,
188        terminator: u8,
189        mut for_each_record: F,
190    ) -> io::Result<()>
191    where
192        Self: Sized,
193        F: FnMut(&[u8]) -> io::Result<bool>,
194    {
195        self.for_byte_record_with_terminator(terminator, |chunk| {
196            for_each_record(&trim_record_slice(&chunk, terminator))
197        })
198    }
199
200    /// Executes the given closure on each line in the underlying reader.
201    ///
202    /// If the closure returns an error (or if the underlying reader returns an
203    /// error), then iteration is stopped and the error is returned. If false
204    /// is returned, then iteration is stopped and no error is returned.
205    ///
206    /// Unlike
207    /// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line),
208    /// the lines given to the closure *do* include the line terminator, if one
209    /// exists.
210    ///
211    /// This routine is useful for iterating over lines as quickly as
212    /// possible. Namely, a single allocation is reused for each line.
213    ///
214    /// This is identical to `for_byte_record_with_terminator` with a
215    /// terminator of `\n`.
216    ///
217    /// # Examples
218    ///
219    /// Basic usage:
220    ///
221    /// ```
222    /// use std::io;
223    ///
224    /// use bstr::io::BufReadExt;
225    ///
226    /// # fn example() -> Result<(), io::Error> {
227    /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
228    ///
229    /// let mut lines = vec![];
230    /// cursor.for_byte_line_with_terminator(|line| {
231    ///     lines.push(line.to_vec());
232    ///     Ok(true)
233    /// })?;
234    /// assert_eq!(lines.len(), 3);
235    /// assert_eq!(lines[0], "lorem\n".as_bytes());
236    /// assert_eq!(lines[1], "ipsum\r\n".as_bytes());
237    /// assert_eq!(lines[2], "dolor".as_bytes());
238    /// # Ok(()) }; example().unwrap()
239    /// ```
240    fn for_byte_line_with_terminator<F>(
241        &mut self,
242        for_each_line: F,
243    ) -> io::Result<()>
244    where
245        Self: Sized,
246        F: FnMut(&[u8]) -> io::Result<bool>,
247    {
248        self.for_byte_record_with_terminator(b'\n', for_each_line)
249    }
250
251    /// Executes the given closure on each byte-terminated record in the
252    /// underlying reader.
253    ///
254    /// If the closure returns an error (or if the underlying reader returns an
255    /// error), then iteration is stopped and the error is returned. If false
256    /// is returned, then iteration is stopped and no error is returned.
257    ///
258    /// Unlike
259    /// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record),
260    /// the lines given to the closure *do* include the record terminator, if
261    /// one exists.
262    ///
263    /// This routine is useful for iterating over records as quickly as
264    /// possible. Namely, a single allocation is reused for each record.
265    ///
266    /// # Examples
267    ///
268    /// Basic usage:
269    ///
270    /// ```
271    /// use std::io;
272    ///
273    /// use bstr::{io::BufReadExt, B};
274    ///
275    /// # fn example() -> Result<(), io::Error> {
276    /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
277    ///
278    /// let mut records = vec![];
279    /// cursor.for_byte_record_with_terminator(b'\x00', |record| {
280    ///     records.push(record.to_vec());
281    ///     Ok(true)
282    /// })?;
283    /// assert_eq!(records.len(), 3);
284    /// assert_eq!(records[0], B(b"lorem\x00"));
285    /// assert_eq!(records[1], B("ipsum\x00"));
286    /// assert_eq!(records[2], B("dolor"));
287    /// # Ok(()) }; example().unwrap()
288    /// ```
289    fn for_byte_record_with_terminator<F>(
290        &mut self,
291        terminator: u8,
292        mut for_each_record: F,
293    ) -> io::Result<()>
294    where
295        Self: Sized,
296        F: FnMut(&[u8]) -> io::Result<bool>,
297    {
298        let mut bytes = vec![];
299        let mut res = Ok(());
300        let mut consumed = 0;
301        'outer: loop {
302            // Lend out complete record slices from our buffer
303            {
304                let mut buf = self.fill_buf()?;
305                while let Some(index) = buf.find_byte(terminator) {
306                    let (record, rest) = buf.split_at(index + 1);
307                    buf = rest;
308                    consumed += record.len();
309                    match for_each_record(&record) {
310                        Ok(false) => break 'outer,
311                        Err(err) => {
312                            res = Err(err);
313                            break 'outer;
314                        }
315                        _ => (),
316                    }
317                }
318
319                // Copy the final record fragment to our local buffer. This
320                // saves read_until() from re-scanning a buffer we know
321                // contains no remaining terminators.
322                bytes.extend_from_slice(&buf);
323                consumed += buf.len();
324            }
325
326            self.consume(consumed);
327            consumed = 0;
328
329            // N.B. read_until uses a different version of memchr that may
330            // be slower than the memchr crate that bstr uses. However, this
331            // should only run for a fairly small number of records, assuming a
332            // decent buffer size.
333            self.read_until(terminator, &mut bytes)?;
334            if bytes.is_empty() || !for_each_record(&bytes)? {
335                break;
336            }
337            bytes.clear();
338        }
339        self.consume(consumed);
340        res
341    }
342}
343
344impl<B: io::BufRead> BufReadExt for B {}
345
/// An iterator over the lines read from an implementation of
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
///
/// Values of this type are normally obtained through the
/// [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
/// method of the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteLines<R> {
    buf: R,
}
358
/// An iterator over the records read from an implementation of
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
///
/// A byte record is a run of bytes that ends with a terminator byte picked by
/// the caller. NUL separated byte strings, for instance, are NUL-terminated
/// byte records.
///
/// Values of this type are normally obtained through the
/// [`byte_records`](trait.BufReadExt.html#method.byte_records)
/// method of the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteRecords<R> {
    buf: R,
    terminator: u8,
}
376
377impl<B: io::BufRead> Iterator for ByteLines<B> {
378    type Item = io::Result<Vec<u8>>;
379
380    fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
381        let mut bytes = vec![];
382        match self.buf.read_until(b'\n', &mut bytes) {
383            Err(e) => Some(Err(e)),
384            Ok(0) => None,
385            Ok(_) => {
386                trim_line(&mut bytes);
387                Some(Ok(bytes))
388            }
389        }
390    }
391}
392
393impl<B: io::BufRead> Iterator for ByteRecords<B> {
394    type Item = io::Result<Vec<u8>>;
395
396    fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
397        let mut bytes = vec![];
398        match self.buf.read_until(self.terminator, &mut bytes) {
399            Err(e) => Some(Err(e)),
400            Ok(0) => None,
401            Ok(_) => {
402                trim_record(&mut bytes, self.terminator);
403                Some(Ok(bytes))
404            }
405        }
406    }
407}
408
409fn trim_line(line: &mut Vec<u8>) {
410    if line.last_byte() == Some(b'\n') {
411        line.pop_byte();
412        if line.last_byte() == Some(b'\r') {
413            line.pop_byte();
414        }
415    }
416}
417
/// Returns `line` without its trailing `\n`, also dropping a `\r` that
/// immediately precedes that `\n`.
///
/// A slice that does not end in `\n` is returned unchanged; in particular a
/// lone trailing `\r` is kept.
fn trim_line_slice(line: &[u8]) -> &[u8] {
    match line.strip_suffix(b"\n") {
        // Only look for `\r` once a `\n` has actually been removed.
        Some(rest) => rest.strip_suffix(b"\r").unwrap_or(rest),
        None => line,
    }
}
427
428fn trim_record(record: &mut Vec<u8>, terminator: u8) {
429    if record.last_byte() == Some(terminator) {
430        record.pop_byte();
431    }
432}
433
/// Returns `record` without one trailing `terminator` byte.
///
/// A slice that does not end in `terminator` (including an empty slice) is
/// returned unchanged. At most one terminator byte is removed.
fn trim_record_slice(record: &[u8], terminator: u8) -> &[u8] {
    record.strip_suffix(&[terminator]).unwrap_or(record)
}
440
#[cfg(all(test, feature = "std"))]
mod tests {
    use crate::bstring::BString;

    use super::BufReadExt;

    /// One table entry: the input text and the lines it must split into.
    type Case = (&'static str, &'static [&'static str]);

    /// Splits `input` with `for_byte_line`, i.e. with line endings stripped.
    fn split_trimmed(input: &str) -> Vec<BString> {
        let mut reader = input.as_bytes();
        let mut lines = vec![];
        reader
            .for_byte_line(|line| {
                lines.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        lines
    }

    /// Splits `input` with `for_byte_line_with_terminator`, i.e. with line
    /// endings preserved.
    fn split_terminated(input: &str) -> Vec<BString> {
        let mut reader = input.as_bytes();
        let mut lines = vec![];
        reader
            .for_byte_line_with_terminator(|line| {
                lines.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        lines
    }

    /// Runs `split` over every case, in order, and compares the result with
    /// the expected lines.
    fn check(split: fn(&str) -> Vec<BString>, cases: &[Case]) {
        for &(input, expected) in cases {
            assert_eq!(split(input), expected);
        }
    }

    #[test]
    fn lines_without_terminator() {
        let cases: &[Case] = &[
            ("", &[]),
            // `\n` line endings
            ("\n", &[""]),
            ("\n\n", &["", ""]),
            ("a\nb\n", &["a", "b"]),
            ("a\nb", &["a", "b"]),
            ("abc\nxyz\n", &["abc", "xyz"]),
            ("abc\nxyz", &["abc", "xyz"]),
            // `\r\n` line endings
            ("\r\n", &[""]),
            ("\r\n\r\n", &["", ""]),
            ("a\r\nb\r\n", &["a", "b"]),
            ("a\r\nb", &["a", "b"]),
            ("abc\r\nxyz\r\n", &["abc", "xyz"]),
            ("abc\r\nxyz", &["abc", "xyz"]),
            // a lone `\r` is not a line ending
            ("abc\rxyz", &["abc\rxyz"]),
        ];
        check(split_trimmed, cases);
    }

    #[test]
    fn lines_with_terminator() {
        let cases: &[Case] = &[
            ("", &[]),
            // `\n` line endings
            ("\n", &["\n"]),
            ("\n\n", &["\n", "\n"]),
            ("a\nb\n", &["a\n", "b\n"]),
            ("a\nb", &["a\n", "b"]),
            ("abc\nxyz\n", &["abc\n", "xyz\n"]),
            ("abc\nxyz", &["abc\n", "xyz"]),
            // `\r\n` line endings
            ("\r\n", &["\r\n"]),
            ("\r\n\r\n", &["\r\n", "\r\n"]),
            ("a\r\nb\r\n", &["a\r\n", "b\r\n"]),
            ("a\r\nb", &["a\r\n", "b"]),
            ("abc\r\nxyz\r\n", &["abc\r\n", "xyz\r\n"]),
            ("abc\r\nxyz", &["abc\r\n", "xyz"]),
            // a lone `\r` is not a line ending
            ("abc\rxyz", &["abc\rxyz"]),
        ];
        check(split_terminated, cases);
    }
}
515}