csv/
byte_record.rs

Help
1use std::{
2    cmp, fmt,
3    iter::FromIterator,
4    ops::{self, Range},
5    result,
6};
7
8use serde::de::Deserialize;
9
10use crate::{
11    deserializer::deserialize_byte_record,
12    error::{new_utf8_error, Result, Utf8Error},
13    string_record::StringRecord,
14};
15
16/// A single CSV record stored as raw bytes.
17///
18/// A byte record permits reading or writing CSV rows that are not UTF-8.
19/// In general, you should prefer using a
20/// [`StringRecord`](struct.StringRecord.html)
21/// since it is more ergonomic, but a `ByteRecord` is provided in case you need
22/// it.
23///
24/// If you are using the Serde (de)serialization APIs, then you probably never
25/// need to interact with a `ByteRecord` or a `StringRecord`. However, there
26/// are some circumstances in which you might need to use a raw record type
27/// while still using Serde. For example, if you need to deserialize possibly
28/// invalid UTF-8 fields, then you'll need to first read your record into a
29/// `ByteRecord`, and then use `ByteRecord::deserialize` to run Serde. Another
30/// reason for using the raw record deserialization APIs is if you're using
31/// Serde to read into borrowed data such as a `&'a str` or a `&'a [u8]`.
32///
33/// Two `ByteRecord`s are compared on the basis of their field data. Any
34/// position information associated with the records is ignored.
// `PartialEq` is written by hand (rather than derived) so that equality
// looks only at field data and never at the record's position; `Clone` and
// `Eq` are derived. The inner state is boxed, so the record itself is a
// single pointer.
#[derive(Clone, Eq)]
pub struct ByteRecord(Box<ByteRecordInner>);
37
38impl PartialEq for ByteRecord {
39    fn eq(&self, other: &ByteRecord) -> bool {
40        if self.len() != other.len() {
41            return false;
42        }
43        self.iter().zip(other.iter()).all(|e| e.0 == e.1)
44    }
45}
46
47impl<T: AsRef<[u8]>> PartialEq<Vec<T>> for ByteRecord {
48    fn eq(&self, other: &Vec<T>) -> bool {
49        self.iter_eq(other)
50    }
51}
52
53impl<'a, T: AsRef<[u8]>> PartialEq<Vec<T>> for &'a ByteRecord {
54    fn eq(&self, other: &Vec<T>) -> bool {
55        self.iter_eq(other)
56    }
57}
58
59impl<T: AsRef<[u8]>> PartialEq<[T]> for ByteRecord {
60    fn eq(&self, other: &[T]) -> bool {
61        self.iter_eq(other)
62    }
63}
64
65impl<'a, T: AsRef<[u8]>> PartialEq<[T]> for &'a ByteRecord {
66    fn eq(&self, other: &[T]) -> bool {
67        self.iter_eq(other)
68    }
69}
70
71impl fmt::Debug for ByteRecord {
72    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
73        write!(f, "ByteRecord(")?;
74        f.debug_list()
75            .entries(self.iter().map(crate::debug::Bytes))
76            .finish()?;
77        write!(f, ")")?;
78        Ok(())
79    }
80}
81
82/// The inner portion of a byte record.
83///
84/// We use this memory layout so that moving a `ByteRecord` only requires
85/// moving a single pointer. The optimization is dubious at best, but does
86/// seem to result in slightly better numbers in microbenchmarks. Methinks this
87/// may heavily depend on the underlying allocator.
#[derive(Clone, Debug, Eq, PartialEq)]
struct ByteRecordInner {
    /// The position of this byte record.
    ///
    /// This is `None` for a freshly constructed record and is only changed
    /// through `ByteRecord::set_position`.
    pos: Option<Position>,
    /// All fields in this record, stored contiguously.
    ///
    /// The buffer may be longer than the data in use: only
    /// `fields[..bounds.end()]` holds field bytes; the rest is spare,
    /// zero-filled space.
    fields: Vec<u8>,
    /// The number of and location of each field in this record.
    bounds: Bounds,
}
97
98impl Default for ByteRecord {
99    #[inline]
100    fn default() -> ByteRecord {
101        ByteRecord::new()
102    }
103}
104
105impl ByteRecord {
106    /// Create a new empty `ByteRecord`.
107    ///
108    /// Note that you may find the `ByteRecord::from` constructor more
109    /// convenient, which is provided by an impl on the `From` trait.
110    ///
111    /// # Example: create an empty record
112    ///
113    /// ```
114    /// use csv::ByteRecord;
115    ///
116    /// let record = ByteRecord::new();
117    /// assert_eq!(record.len(), 0);
118    /// ```
119    ///
120    /// # Example: initialize a record from a `Vec`
121    ///
122    /// ```
123    /// use csv::ByteRecord;
124    ///
125    /// let record = ByteRecord::from(vec!["a", "b", "c"]);
126    /// assert_eq!(record.len(), 3);
127    /// ```
128    #[inline]
129    pub fn new() -> ByteRecord {
130        ByteRecord::with_capacity(0, 0)
131    }
132
133    /// Create a new empty `ByteRecord` with the given capacity settings.
134    ///
135    /// `buffer` refers to the capacity of the buffer used to store the
136    /// actual row contents. `fields` refers to the number of fields one
137    /// might expect to store.
138    #[inline]
139    pub fn with_capacity(buffer: usize, fields: usize) -> ByteRecord {
140        ByteRecord(Box::new(ByteRecordInner {
141            pos: None,
142            fields: vec![0; buffer],
143            bounds: Bounds::with_capacity(fields),
144        }))
145    }
146
147    /// Deserialize this record.
148    ///
149    /// The `D` type parameter refers to the type that this record should be
150    /// deserialized into. The `'de` lifetime refers to the lifetime of the
151    /// `ByteRecord`. The `'de` lifetime permits deserializing into structs
152    /// that borrow field data from this record.
153    ///
154    /// An optional `headers` parameter permits deserializing into a struct
155    /// based on its field names (corresponding to header values) rather than
156    /// the order in which the fields are defined.
157    ///
158    /// # Example: without headers
159    ///
160    /// This shows how to deserialize a single row into a struct based on the
161    /// order in which fields occur. This example also shows how to borrow
162    /// fields from the `ByteRecord`, which results in zero allocation
163    /// deserialization.
164    ///
165    /// ```
166    /// use std::error::Error;
167    ///
168    /// use csv::ByteRecord;
169    /// use serde::Deserialize;
170    ///
171    /// #[derive(Deserialize)]
172    /// struct Row<'a> {
173    ///     city: &'a str,
174    ///     country: &'a str,
175    ///     population: u64,
176    /// }
177    ///
178    /// # fn main() { example().unwrap() }
179    /// fn example() -> Result<(), Box<dyn Error>> {
180    ///     let record = ByteRecord::from(vec![
181    ///         "Boston", "United States", "4628910",
182    ///     ]);
183    ///
184    ///     let row: Row = record.deserialize(None)?;
185    ///     assert_eq!(row.city, "Boston");
186    ///     assert_eq!(row.country, "United States");
187    ///     assert_eq!(row.population, 4628910);
188    ///     Ok(())
189    /// }
190    /// ```
191    ///
192    /// # Example: with headers
193    ///
194    /// This example is like the previous one, but shows how to deserialize
195    /// into a struct based on the struct's field names. For this to work,
196    /// you must provide a header row.
197    ///
198    /// This example also shows that you can deserialize into owned data
199    /// types (e.g., `String`) instead of borrowed data types (e.g., `&str`).
200    ///
201    /// ```
202    /// use std::error::Error;
203    ///
204    /// use csv::ByteRecord;
205    /// use serde::Deserialize;
206    ///
207    /// #[derive(Deserialize)]
208    /// struct Row {
209    ///     city: String,
210    ///     country: String,
211    ///     population: u64,
212    /// }
213    ///
214    /// # fn main() { example().unwrap() }
215    /// fn example() -> Result<(), Box<dyn Error>> {
216    ///     // Notice that the fields are not in the same order
217    ///     // as the fields in the struct!
218    ///     let header = ByteRecord::from(vec![
219    ///         "country", "city", "population",
220    ///     ]);
221    ///     let record = ByteRecord::from(vec![
222    ///         "United States", "Boston", "4628910",
223    ///     ]);
224    ///
225    ///     let row: Row = record.deserialize(Some(&header))?;
226    ///     assert_eq!(row.city, "Boston");
227    ///     assert_eq!(row.country, "United States");
228    ///     assert_eq!(row.population, 4628910);
229    ///     Ok(())
230    /// }
231    /// ```
232    pub fn deserialize<'de, D: Deserialize<'de>>(
233        &'de self,
234        headers: Option<&'de ByteRecord>,
235    ) -> Result<D> {
236        deserialize_byte_record(self, headers)
237    }
238
239    /// Returns an iterator over all fields in this record.
240    ///
241    /// # Example
242    ///
243    /// This example shows how to iterate over each field in a `ByteRecord`.
244    ///
245    /// ```
246    /// use csv::ByteRecord;
247    ///
248    /// let record = ByteRecord::from(vec!["a", "b", "c"]);
249    /// for field in record.iter() {
250    ///     assert!(field == b"a" || field == b"b" || field == b"c");
251    /// }
252    /// ```
253    #[inline]
254    pub fn iter(&self) -> ByteRecordIter {
255        self.into_iter()
256    }
257
258    /// Return the field at index `i`.
259    ///
260    /// If no field at index `i` exists, then this returns `None`.
261    ///
262    /// # Example
263    ///
264    /// ```
265    /// use csv::ByteRecord;
266    ///
267    /// let record = ByteRecord::from(vec!["a", "b", "c"]);
268    /// assert_eq!(record.get(1), Some(&b"b"[..]));
269    /// assert_eq!(record.get(3), None);
270    /// ```
271    #[inline]
272    pub fn get(&self, i: usize) -> Option<&[u8]> {
273        self.0.bounds.get(i).map(|range| &self.0.fields[range])
274    }
275
276    /// Returns true if and only if this record is empty.
277    ///
278    /// # Example
279    ///
280    /// ```
281    /// use csv::ByteRecord;
282    ///
283    /// assert!(ByteRecord::new().is_empty());
284    /// ```
285    #[inline]
286    pub fn is_empty(&self) -> bool {
287        self.len() == 0
288    }
289
290    /// Returns the number of fields in this record.
291    ///
292    /// # Example
293    ///
294    /// ```
295    /// use csv::ByteRecord;
296    ///
297    /// let record = ByteRecord::from(vec!["a", "b", "c"]);
298    /// assert_eq!(record.len(), 3);
299    /// ```
300    #[inline]
301    pub fn len(&self) -> usize {
302        self.0.bounds.len()
303    }
304
305    /// Truncate this record to `n` fields.
306    ///
307    /// If `n` is greater than the number of fields in this record, then this
308    /// has no effect.
309    ///
310    /// # Example
311    ///
312    /// ```
313    /// use csv::ByteRecord;
314    ///
315    /// let mut record = ByteRecord::from(vec!["a", "b", "c"]);
316    /// assert_eq!(record.len(), 3);
317    /// record.truncate(1);
318    /// assert_eq!(record.len(), 1);
319    /// assert_eq!(record, vec!["a"]);
320    /// ```
321    #[inline]
322    pub fn truncate(&mut self, n: usize) {
323        if n <= self.len() {
324            self.0.bounds.len = n;
325        }
326    }
327
328    /// Clear this record so that it has zero fields.
329    ///
330    /// This is equivalent to calling `truncate(0)`.
331    ///
332    /// Note that it is not necessary to clear the record to reuse it with
333    /// the CSV reader.
334    ///
335    /// # Example
336    ///
337    /// ```
338    /// use csv::ByteRecord;
339    ///
340    /// let mut record = ByteRecord::from(vec!["a", "b", "c"]);
341    /// assert_eq!(record.len(), 3);
342    /// record.clear();
343    /// assert_eq!(record.len(), 0);
344    /// ```
345    #[inline]
346    pub fn clear(&mut self) {
347        self.truncate(0);
348    }
349
350    /// Trim the fields of this record so that leading and trailing whitespace
351    /// is removed.
352    ///
353    /// This method uses the ASCII definition of whitespace. That is, only
354    /// bytes in the class `[\t\n\v\f\r ]` are trimmed.
355    ///
356    /// # Example
357    ///
358    /// ```
359    /// use csv::ByteRecord;
360    ///
361    /// let mut record = ByteRecord::from(vec![
362    ///     "  ", "\tfoo", "bar  ", "b a z",
363    /// ]);
364    /// record.trim();
365    /// assert_eq!(record, vec!["", "foo", "bar", "b a z"]);
366    /// ```
367    pub fn trim(&mut self) {
368        let length = self.len();
369        if length == 0 {
370            return;
371        }
372        // TODO: We could likely do this in place, but for now, we allocate.
373        let mut trimmed =
374            ByteRecord::with_capacity(self.as_slice().len(), self.len());
375        trimmed.set_position(self.position().cloned());
376        for field in self.iter() {
377            trimmed.push_field(trim_ascii(field));
378        }
379        *self = trimmed;
380    }
381
382    /// Add a new field to this record.
383    ///
384    /// # Example
385    ///
386    /// ```
387    /// use csv::ByteRecord;
388    ///
389    /// let mut record = ByteRecord::new();
390    /// record.push_field(b"foo");
391    /// assert_eq!(&record[0], b"foo");
392    /// ```
393    #[inline]
394    pub fn push_field(&mut self, field: &[u8]) {
395        let (s, e) = (self.0.bounds.end(), self.0.bounds.end() + field.len());
396        while e > self.0.fields.len() {
397            self.expand_fields();
398        }
399        self.0.fields[s..e].copy_from_slice(field);
400        self.0.bounds.add(e);
401    }
402
403    /// Return the position of this record, if available.
404    ///
405    /// # Example
406    ///
407    /// ```
408    /// use std::error::Error;
409    ///
410    /// use csv::{ByteRecord, ReaderBuilder};
411    ///
412    /// # fn main() { example().unwrap(); }
413    /// fn example() -> Result<(), Box<dyn Error>> {
414    ///     let mut record = ByteRecord::new();
415    ///     let mut rdr = ReaderBuilder::new()
416    ///         .has_headers(false)
417    ///         .from_reader("a,b,c\nx,y,z".as_bytes());
418    ///
419    ///     assert!(rdr.read_byte_record(&mut record)?);
420    ///     {
421    ///         let pos = record.position().expect("a record position");
422    ///         assert_eq!(pos.byte(), 0);
423    ///         assert_eq!(pos.line(), 1);
424    ///         assert_eq!(pos.record(), 0);
425    ///     }
426    ///
427    ///     assert!(rdr.read_byte_record(&mut record)?);
428    ///     {
429    ///         let pos = record.position().expect("a record position");
430    ///         assert_eq!(pos.byte(), 6);
431    ///         assert_eq!(pos.line(), 2);
432    ///         assert_eq!(pos.record(), 1);
433    ///     }
434    ///
435    ///     // Finish the CSV reader for good measure.
436    ///     assert!(!rdr.read_byte_record(&mut record)?);
437    ///     Ok(())
438    /// }
439    /// ```
440    #[inline]
441    pub fn position(&self) -> Option<&Position> {
442        self.0.pos.as_ref()
443    }
444
445    /// Set the position of this record.
446    ///
447    /// # Example
448    ///
449    /// ```
450    /// use csv::{ByteRecord, Position};
451    ///
452    /// let mut record = ByteRecord::from(vec!["a", "b", "c"]);
453    /// let mut pos = Position::new();
454    /// pos.set_byte(100);
455    /// pos.set_line(4);
456    /// pos.set_record(2);
457    ///
458    /// record.set_position(Some(pos.clone()));
459    /// assert_eq!(record.position(), Some(&pos));
460    /// ```
461    #[inline]
462    pub fn set_position(&mut self, pos: Option<Position>) {
463        self.0.pos = pos;
464    }
465
466    /// Return the start and end position of a field in this record.
467    ///
468    /// If no such field exists at the given index, then return `None`.
469    ///
470    /// The range returned can be used with the slice returned by `as_slice`.
471    ///
472    /// # Example
473    ///
474    /// ```
475    /// use csv::ByteRecord;
476    ///
477    /// let record = ByteRecord::from(vec!["foo", "quux", "z"]);
478    /// let range = record.range(1).expect("a record range");
479    /// assert_eq!(&record.as_slice()[range], &b"quux"[..]);
480    /// ```
481    #[inline]
482    pub fn range(&self, i: usize) -> Option<Range<usize>> {
483        self.0.bounds.get(i)
484    }
485
486    /// Return the entire row as a single byte slice. The slice returned stores
487    /// all fields contiguously. The boundaries of each field can be determined
488    /// via the `range` method.
489    ///
490    /// # Example
491    ///
492    /// ```
493    /// use csv::ByteRecord;
494    ///
495    /// let record = ByteRecord::from(vec!["foo", "quux", "z"]);
496    /// assert_eq!(record.as_slice(), &b"fooquuxz"[..]);
497    /// ```
498    #[inline]
499    pub fn as_slice(&self) -> &[u8] {
500        &self.0.fields[..self.0.bounds.end()]
501    }
502
503    /// Clone this record, but only copy `fields` up to the end of bounds. This
504    /// is useful when one wants to copy a record, but not necessarily any
505    /// excess capacity in that record.
506    #[inline]
507    pub(crate) fn clone_truncated(&self) -> ByteRecord {
508        let mut br = ByteRecord::new();
509        br.0.pos = self.0.pos.clone();
510        br.0.bounds = self.0.bounds.clone();
511        br.0.fields = self.0.fields[..self.0.bounds.end()].to_vec();
512        br
513    }
514
515    /// Retrieve the underlying parts of a byte record.
516    #[inline]
517    pub(crate) fn as_parts(&mut self) -> (&mut Vec<u8>, &mut Vec<usize>) {
518        let inner = &mut *self.0;
519        (&mut inner.fields, &mut inner.bounds.ends)
520    }
521
    /// Set the number of fields in the given record.
    ///
    /// The count is written directly; it is not checked against the number
    /// of field end offsets currently stored.
    #[inline]
    pub(crate) fn set_len(&mut self, len: usize) {
        self.0.bounds.len = len;
    }
527
528    /// Expand the capacity for storing fields.
529    #[inline]
530    pub(crate) fn expand_fields(&mut self) {
531        let new_len = self.0.fields.len().checked_mul(2).unwrap();
532        self.0.fields.resize(cmp::max(4, new_len), 0);
533    }
534
535    /// Expand the capacity for storing field ending positions.
536    #[inline]
537    pub(crate) fn expand_ends(&mut self) {
538        self.0.bounds.expand();
539    }
540
541    /// Validate the given record as UTF-8.
542    ///
543    /// If it's not UTF-8, return an error.
544    #[inline]
545    pub(crate) fn validate(&self) -> result::Result<(), Utf8Error> {
546        // If the entire buffer is ASCII, then we have nothing to fear.
547        if self.0.fields[..self.0.bounds.end()].is_ascii() {
548            return Ok(());
549        }
550        // Otherwise, we must check each field individually to ensure that
551        // it's valid UTF-8.
552        for (i, field) in self.iter().enumerate() {
553            if let Err(err) = std::str::from_utf8(field) {
554                return Err(new_utf8_error(i, err.valid_up_to()));
555            }
556        }
557        Ok(())
558    }
559
560    /// Compare the given byte record with the iterator of fields for equality.
561    pub(crate) fn iter_eq<I, T>(&self, other: I) -> bool
562    where
563        I: IntoIterator<Item = T>,
564        T: AsRef<[u8]>,
565    {
566        let mut it_record = self.iter();
567        let mut it_other = other.into_iter();
568        loop {
569            match (it_record.next(), it_other.next()) {
570                (None, None) => return true,
571                (None, Some(_)) | (Some(_), None) => return false,
572                (Some(x), Some(y)) => {
573                    if x != y.as_ref() {
574                        return false;
575                    }
576                }
577            }
578        }
579    }
580}
581
582/// A position in CSV data.
583///
584/// A position is used to report errors in CSV data. All positions include the
585/// byte offset, line number and record index at which the error occurred.
586///
587/// Byte offsets and record indices start at `0`. Line numbers start at `1`.
588///
589/// A CSV reader will automatically assign the position of each record.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Position {
    /// Byte offset; the first byte is at `0`.
    byte: u64,
    /// Line number; the first line is `1`.
    line: u64,
    /// Record index; the first record is `0`.
    record: u64,
}

impl Position {
    /// Creates the position of the very start of CSV data: byte `0`,
    /// line `1`, record `0`.
    #[inline]
    pub fn new() -> Self {
        Self { byte: 0, line: 1, record: 0 }
    }

    /// Returns the byte offset of this position (offsets start at `0`).
    #[inline]
    pub fn byte(&self) -> u64 {
        self.byte
    }

    /// Returns the line number of this position (lines start at `1`).
    #[inline]
    pub fn line(&self) -> u64 {
        self.line
    }

    /// Returns the record index of this position (the first record is `0`).
    #[inline]
    pub fn record(&self) -> u64 {
        self.record
    }

    /// Updates the byte offset and returns `self` so calls can be chained.
    #[inline]
    pub fn set_byte(&mut self, byte: u64) -> &mut Self {
        self.byte = byte;
        self
    }

    /// Updates the line number and returns `self` so calls can be chained.
    ///
    /// # Panics
    ///
    /// Panics if `line` is `0`, since line numbers start at `1`.
    #[inline]
    pub fn set_line(&mut self, line: u64) -> &mut Self {
        assert!(line > 0);
        self.line = line;
        self
    }

    /// Updates the record index and returns `self` so calls can be chained.
    #[inline]
    pub fn set_record(&mut self, record: u64) -> &mut Self {
        self.record = record;
        self
    }
}
644
/// Where each field of a single record ends inside the record's byte buffer.
#[derive(Clone, Debug, Eq, PartialEq)]
struct Bounds {
    /// The ending index of each field. Only the first `len` entries are
    /// meaningful; the remainder is pre-allocated, zero-filled space.
    ends: Vec<usize>,
    /// The number of fields in this record.
    ///
    /// Technically, we could drop this field and maintain an invariant that
    /// `ends.len()` is always the number of fields, but doing that efficiently
    /// requires attention to safety. We play it safe at essentially no cost.
    len: usize,
}

impl Default for Bounds {
    /// Empty bounds with no pre-allocated slots.
    #[inline]
    fn default() -> Bounds {
        Bounds::with_capacity(0)
    }
}

impl Bounds {
    /// Creates empty bounds with `capacity` zero-filled slots reserved for
    /// field ends.
    #[inline]
    fn with_capacity(capacity: usize) -> Bounds {
        let ends = vec![0; capacity];
        Bounds { ends, len: 0 }
    }

    /// Returns the byte range of field `i`, or `None` if there is no such
    /// field.
    #[inline]
    fn get(&self, i: usize) -> Option<Range<usize>> {
        if i >= self.len {
            return None;
        }
        let end = *self.ends.get(i)?;
        // A field starts where its predecessor ends; the first field starts
        // at offset zero.
        let start = match i {
            0 => 0,
            _ => self.ends.get(i - 1).copied().unwrap_or(0),
        };
        Some(start..end)
    }

    /// Returns the end offsets of the fields currently in use.
    #[inline]
    fn ends(&self) -> &[usize] {
        let Bounds { ends, len } = self;
        &ends[..*len]
    }

    /// Returns the end offset of the last field, or `0` when there are no
    /// fields.
    #[inline]
    fn end(&self) -> usize {
        self.ends().last().copied().unwrap_or(0)
    }

    /// Returns how many fields these bounds describe.
    #[inline]
    fn len(&self) -> usize {
        self.len
    }

    /// Doubles the number of slots for field ends, with a floor of 4 slots.
    #[inline]
    fn expand(&mut self) {
        let doubled = self.ends.len().checked_mul(2).unwrap();
        let target = cmp::max(4, doubled);
        self.ends.resize(target, 0);
    }

    /// Appends a field whose data ends at byte offset `pos`.
    #[inline]
    fn add(&mut self, pos: usize) {
        let slot = self.len;
        // Make room first if every slot is already taken.
        if slot >= self.ends.len() {
            self.expand();
        }
        self.ends[slot] = pos;
        self.len = slot + 1;
    }
}
727
728impl ops::Index<usize> for ByteRecord {
729    type Output = [u8];
730    #[inline]
731    fn index(&self, i: usize) -> &[u8] {
732        self.get(i).unwrap()
733    }
734}
735
736impl From<StringRecord> for ByteRecord {
737    #[inline]
738    fn from(record: StringRecord) -> ByteRecord {
739        record.into_byte_record()
740    }
741}
742
743impl<T: AsRef<[u8]>> From<Vec<T>> for ByteRecord {
744    #[inline]
745    fn from(xs: Vec<T>) -> ByteRecord {
746        ByteRecord::from_iter(&xs)
747    }
748}
749
750impl<'a, T: AsRef<[u8]>> From<&'a [T]> for ByteRecord {
751    #[inline]
752    fn from(xs: &'a [T]) -> ByteRecord {
753        ByteRecord::from_iter(xs)
754    }
755}
756
757impl<T: AsRef<[u8]>> FromIterator<T> for ByteRecord {
758    #[inline]
759    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> ByteRecord {
760        let mut record = ByteRecord::new();
761        record.extend(iter);
762        record
763    }
764}
765
766impl<T: AsRef<[u8]>> Extend<T> for ByteRecord {
767    #[inline]
768    fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
769        for x in iter {
770            self.push_field(x.as_ref());
771        }
772    }
773}
774
775/// A double-ended iterator over the fields in a byte record.
776///
777/// The `'r` lifetime variable refers to the lifetime of the `ByteRecord` that
778/// is being iterated over.
779#[derive(Clone)]
780pub struct ByteRecordIter<'r> {
781    /// The record we are iterating over.
782    r: &'r ByteRecord,
783    /// The starting index of the previous field. (For reverse iteration.)
784    last_start: usize,
785    /// The ending index of the previous field. (For forward iteration.)
786    last_end: usize,
787    /// The index of forward iteration.
788    i_forward: usize,
789    /// The index of reverse iteration.
790    i_reverse: usize,
791}
792
793impl<'r> IntoIterator for &'r ByteRecord {
794    type IntoIter = ByteRecordIter<'r>;
795    type Item = &'r [u8];
796
797    #[inline]
798    fn into_iter(self) -> ByteRecordIter<'r> {
799        ByteRecordIter {
800            r: self,
801            last_start: self.as_slice().len(),
802            last_end: 0,
803            i_forward: 0,
804            i_reverse: self.len(),
805        }
806    }
807}
808
// Relies on `size_hint` reporting the same value (the number of fields not
// yet yielded from either end) as both its lower and upper bound.
impl<'r> ExactSizeIterator for ByteRecordIter<'r> {}
810
811impl<'r> Iterator for ByteRecordIter<'r> {
812    type Item = &'r [u8];
813
814    #[inline]
815    fn next(&mut self) -> Option<&'r [u8]> {
816        if self.i_forward == self.i_reverse {
817            None
818        } else {
819            let start = self.last_end;
820            let end = self.r.0.bounds.ends()[self.i_forward];
821            self.i_forward += 1;
822            self.last_end = end;
823            Some(&self.r.0.fields[start..end])
824        }
825    }
826
827    #[inline]
828    fn size_hint(&self) -> (usize, Option<usize>) {
829        let x = self.i_reverse - self.i_forward;
830        (x, Some(x))
831    }
832
833    #[inline]
834    fn count(self) -> usize {
835        self.len()
836    }
837}
838
839impl<'r> DoubleEndedIterator for ByteRecordIter<'r> {
840    #[inline]
841    fn next_back(&mut self) -> Option<&'r [u8]> {
842        if self.i_forward == self.i_reverse {
843            None
844        } else {
845            self.i_reverse -= 1;
846            let start = self
847                .i_reverse
848                .checked_sub(1)
849                .map(|i| self.r.0.bounds.ends()[i])
850                .unwrap_or(0);
851            let end = self.last_start;
852            self.last_start = start;
853            Some(&self.r.0.fields[start..end])
854        }
855    }
856}
857
/// Returns true if `byte` belongs to the ASCII whitespace class
/// `[\t\n\v\f\r ]` that `ByteRecord::trim` documents as being trimmed.
///
/// `u8::is_ascii_whitespace` does not treat vertical tab (`\v`, 0x0B) as
/// whitespace, so that byte is matched explicitly here.
fn is_trimmable_ascii(byte: u8) -> bool {
    byte.is_ascii_whitespace() || byte == b'\x0B'
}

/// Returns `bytes` with leading and trailing ASCII whitespace removed.
///
/// The result is always a sub-slice of the input; nothing is copied.
fn trim_ascii(bytes: &[u8]) -> &[u8] {
    trim_ascii_start(trim_ascii_end(bytes))
}

/// Returns `bytes` with leading ASCII whitespace removed.
fn trim_ascii_start(bytes: &[u8]) -> &[u8] {
    // Index of the first byte to keep; if every byte is whitespace (or the
    // input is empty) the result is the empty tail.
    let start = bytes
        .iter()
        .position(|&b| !is_trimmable_ascii(b))
        .unwrap_or(bytes.len());
    &bytes[start..]
}

/// Returns `bytes` with trailing ASCII whitespace removed.
fn trim_ascii_end(bytes: &[u8]) -> &[u8] {
    // One past the last byte to keep; zero if there is nothing to keep.
    let end = bytes
        .iter()
        .rposition(|&b| !is_trimmable_ascii(b))
        .map_or(0, |last| last + 1);
    &bytes[..end]
}
883
884#[cfg(test)]
885mod tests {
886    use crate::string_record::StringRecord;
887
888    use super::ByteRecord;
889
890    fn b(s: &str) -> &[u8] {
891        s.as_bytes()
892    }
893
894    #[test]
895    fn record_1() {
896        let mut rec = ByteRecord::new();
897        rec.push_field(b"foo");
898
899        assert_eq!(rec.len(), 1);
900        assert_eq!(rec.get(0), Some(b("foo")));
901        assert_eq!(rec.get(1), None);
902        assert_eq!(rec.get(2), None);
903    }
904
905    #[test]
906    fn record_2() {
907        let mut rec = ByteRecord::new();
908        rec.push_field(b"foo");
909        rec.push_field(b"quux");
910
911        assert_eq!(rec.len(), 2);
912        assert_eq!(rec.get(0), Some(b("foo")));
913        assert_eq!(rec.get(1), Some(b("quux")));
914        assert_eq!(rec.get(2), None);
915        assert_eq!(rec.get(3), None);
916    }
917
918    #[test]
919    fn empty_record() {
920        let rec = ByteRecord::new();
921
922        assert_eq!(rec.len(), 0);
923        assert_eq!(rec.get(0), None);
924        assert_eq!(rec.get(1), None);
925    }
926
927    #[test]
928    fn trim_whitespace_only() {
929        let mut rec = ByteRecord::from(vec![b" \t\n\r\x0c"]);
930        rec.trim();
931        assert_eq!(rec.get(0), Some(b("")));
932    }
933
934    #[test]
935    fn trim_front() {
936        let mut rec = ByteRecord::from(vec![b" abc"]);
937        rec.trim();
938        assert_eq!(rec.get(0), Some(b("abc")));
939
940        let mut rec = ByteRecord::from(vec![b(" abc"), b("  xyz")]);
941        rec.trim();
942        assert_eq!(rec.get(0), Some(b("abc")));
943        assert_eq!(rec.get(1), Some(b("xyz")));
944    }
945
946    #[test]
947    fn trim_back() {
948        let mut rec = ByteRecord::from(vec![b"abc "]);
949        rec.trim();
950        assert_eq!(rec.get(0), Some(b("abc")));
951
952        let mut rec = ByteRecord::from(vec![b("abc "), b("xyz  ")]);
953        rec.trim();
954        assert_eq!(rec.get(0), Some(b("abc")));
955        assert_eq!(rec.get(1), Some(b("xyz")));
956    }
957
958    #[test]
959    fn trim_both() {
960        let mut rec = ByteRecord::from(vec![b" abc "]);
961        rec.trim();
962        assert_eq!(rec.get(0), Some(b("abc")));
963
964        let mut rec = ByteRecord::from(vec![b(" abc "), b("  xyz  ")]);
965        rec.trim();
966        assert_eq!(rec.get(0), Some(b("abc")));
967        assert_eq!(rec.get(1), Some(b("xyz")));
968    }
969
970    #[test]
971    fn trim_does_not_panic_on_empty_records_1() {
972        let mut rec = ByteRecord::from(vec![b""]);
973        rec.trim();
974        assert_eq!(rec.get(0), Some(b("")));
975    }
976
977    #[test]
978    fn trim_does_not_panic_on_empty_records_2() {
979        let mut rec = ByteRecord::from(vec![b"", b""]);
980        rec.trim();
981        assert_eq!(rec.get(0), Some(b("")));
982        assert_eq!(rec.get(1), Some(b("")));
983    }
984
985    #[test]
986    fn trim_does_not_panic_on_empty_records_3() {
987        let mut rec = ByteRecord::new();
988        rec.trim();
989        assert_eq!(rec.as_slice().len(), 0);
990    }
991
992    #[test]
993    fn empty_field_1() {
994        let mut rec = ByteRecord::new();
995        rec.push_field(b"");
996
997        assert_eq!(rec.len(), 1);
998        assert_eq!(rec.get(0), Some(b("")));
999        assert_eq!(rec.get(1), None);
1000        assert_eq!(rec.get(2), None);
1001    }
1002
1003    #[test]
1004    fn empty_field_2() {
1005        let mut rec = ByteRecord::new();
1006        rec.push_field(b"");
1007        rec.push_field(b"");
1008
1009        assert_eq!(rec.len(), 2);
1010        assert_eq!(rec.get(0), Some(b("")));
1011        assert_eq!(rec.get(1), Some(b("")));
1012        assert_eq!(rec.get(2), None);
1013        assert_eq!(rec.get(3), None);
1014    }
1015
1016    #[test]
1017    fn empty_surround_1() {
1018        let mut rec = ByteRecord::new();
1019        rec.push_field(b"foo");
1020        rec.push_field(b"");
1021        rec.push_field(b"quux");
1022
1023        assert_eq!(rec.len(), 3);
1024        assert_eq!(rec.get(0), Some(b("foo")));
1025        assert_eq!(rec.get(1), Some(b("")));
1026        assert_eq!(rec.get(2), Some(b("quux")));
1027        assert_eq!(rec.get(3), None);
1028        assert_eq!(rec.get(4), None);
1029    }
1030
1031    #[test]
1032    fn empty_surround_2() {
1033        let mut rec = ByteRecord::new();
1034        rec.push_field(b"foo");
1035        rec.push_field(b"");
1036        rec.push_field(b"quux");
1037        rec.push_field(b"");
1038
1039        assert_eq!(rec.len(), 4);
1040        assert_eq!(rec.get(0), Some(b("foo")));
1041        assert_eq!(rec.get(1), Some(b("")));
1042        assert_eq!(rec.get(2), Some(b("quux")));
1043        assert_eq!(rec.get(3), Some(b("")));
1044        assert_eq!(rec.get(4), None);
1045        assert_eq!(rec.get(5), None);
1046    }
1047
1048    #[test]
1049    fn utf8_error_1() {
1050        let mut rec = ByteRecord::new();
1051        rec.push_field(b"foo");
1052        rec.push_field(b"b\xFFar");
1053
1054        let err = StringRecord::from_byte_record(rec).unwrap_err();
1055        assert_eq!(err.utf8_error().field(), 1);
1056        assert_eq!(err.utf8_error().valid_up_to(), 1);
1057    }
1058
1059    #[test]
1060    fn utf8_error_2() {
1061        let mut rec = ByteRecord::new();
1062        rec.push_field(b"\xFF");
1063
1064        let err = StringRecord::from_byte_record(rec).unwrap_err();
1065        assert_eq!(err.utf8_error().field(), 0);
1066        assert_eq!(err.utf8_error().valid_up_to(), 0);
1067    }
1068
1069    #[test]
1070    fn utf8_error_3() {
1071        let mut rec = ByteRecord::new();
1072        rec.push_field(b"a\xFF");
1073
1074        let err = StringRecord::from_byte_record(rec).unwrap_err();
1075        assert_eq!(err.utf8_error().field(), 0);
1076        assert_eq!(err.utf8_error().valid_up_to(), 1);
1077    }
1078
1079    #[test]
1080    fn utf8_error_4() {
1081        let mut rec = ByteRecord::new();
1082        rec.push_field(b"a");
1083        rec.push_field(b"b");
1084        rec.push_field(b"c");
1085        rec.push_field(b"d");
1086        rec.push_field(b"xyz\xFF");
1087
1088        let err = StringRecord::from_byte_record(rec).unwrap_err();
1089        assert_eq!(err.utf8_error().field(), 4);
1090        assert_eq!(err.utf8_error().valid_up_to(), 3);
1091    }
1092
1093    #[test]
1094    fn utf8_error_5() {
1095        let mut rec = ByteRecord::new();
1096        rec.push_field(b"a");
1097        rec.push_field(b"b");
1098        rec.push_field(b"c");
1099        rec.push_field(b"d");
1100        rec.push_field(b"\xFFxyz");
1101
1102        let err = StringRecord::from_byte_record(rec).unwrap_err();
1103        assert_eq!(err.utf8_error().field(), 4);
1104        assert_eq!(err.utf8_error().valid_up_to(), 0);
1105    }
1106
1107    // This tests a tricky case where a single field on its own isn't valid
1108    // UTF-8, but the concatenation of all fields is.
1109    #[test]
1110    fn utf8_error_6() {
1111        let mut rec = ByteRecord::new();
1112        rec.push_field(b"a\xc9");
1113        rec.push_field(b"\x91b");
1114
1115        let err = StringRecord::from_byte_record(rec).unwrap_err();
1116        assert_eq!(err.utf8_error().field(), 0);
1117        assert_eq!(err.utf8_error().valid_up_to(), 1);
1118    }
1119
1120    // This tests that we can always clear a `ByteRecord` and get a guaranteed
1121    // successful conversion to UTF-8. This permits reusing the allocation.
1122    #[test]
1123    fn utf8_clear_ok() {
1124        let mut rec = ByteRecord::new();
1125        rec.push_field(b"\xFF");
1126        assert!(StringRecord::from_byte_record(rec).is_err());
1127
1128        let mut rec = ByteRecord::new();
1129        rec.push_field(b"\xFF");
1130        rec.clear();
1131        assert!(StringRecord::from_byte_record(rec).is_ok());
1132    }
1133
1134    #[test]
1135    fn iter() {
1136        let data = vec!["foo", "bar", "baz", "quux", "wat"];
1137        let rec = ByteRecord::from(&*data);
1138        let got: Vec<&str> =
1139            rec.iter().map(|x| ::std::str::from_utf8(x).unwrap()).collect();
1140        assert_eq!(data, got);
1141    }
1142
1143    #[test]
1144    fn iter_reverse() {
1145        let mut data = vec!["foo", "bar", "baz", "quux", "wat"];
1146        let rec = ByteRecord::from(&*data);
1147        let got: Vec<&str> = rec
1148            .iter()
1149            .rev()
1150            .map(|x| ::std::str::from_utf8(x).unwrap())
1151            .collect();
1152        data.reverse();
1153        assert_eq!(data, got);
1154    }
1155
1156    #[test]
1157    fn iter_forward_and_reverse() {
1158        let data = vec!["foo", "bar", "baz", "quux", "wat"];
1159        let rec = ByteRecord::from(data);
1160        let mut it = rec.iter();
1161
1162        assert_eq!(it.next_back(), Some(b("wat")));
1163        assert_eq!(it.next(), Some(b("foo")));
1164        assert_eq!(it.next(), Some(b("bar")));
1165        assert_eq!(it.next_back(), Some(b("quux")));
1166        assert_eq!(it.next(), Some(b("baz")));
1167        assert_eq!(it.next_back(), None);
1168        assert_eq!(it.next(), None);
1169    }
1170
1171    // Check that record equality respects field boundaries.
1172    //
1173    // Regression test for #138.
1174    #[test]
1175    fn eq_field_boundaries() {
1176        let test1 = ByteRecord::from(vec!["12", "34"]);
1177        let test2 = ByteRecord::from(vec!["123", "4"]);
1178
1179        assert_ne!(test1, test2);
1180    }
1181
1182    // Check that record equality respects number of fields.
1183    //
1184    // Regression test for #138.
1185    #[test]
1186    fn eq_record_len() {
1187        let test1 = ByteRecord::from(vec!["12", "34", "56"]);
1188        let test2 = ByteRecord::from(vec!["12", "34"]);
1189        assert_ne!(test1, test2);
1190    }
1191}
csv/byte_record.rs

csv/
byte_record.rs