csv/reader.rs
1use std::{
2 fs::File,
3 io::{self, BufRead, Seek},
4 marker::PhantomData,
5 path::Path,
6 result,
7};
8
9use {
10 csv_core::{Reader as CoreReader, ReaderBuilder as CoreReaderBuilder},
11 serde::de::DeserializeOwned,
12};
13
14use crate::{
15 byte_record::{ByteRecord, Position},
16 error::{Error, ErrorKind, Result, Utf8Error},
17 string_record::StringRecord,
18 {Terminator, Trim},
19};
20
21/// Builds a CSV reader with various configuration knobs.
22///
23/// This builder can be used to tweak the field delimiter, record terminator
24/// and more. Once a CSV `Reader` is built, its configuration cannot be
25/// changed.
26#[derive(Debug)]
27pub struct ReaderBuilder {
28 capacity: usize,
29 flexible: bool,
30 has_headers: bool,
31 trim: Trim,
32 /// The underlying CSV parser builder.
33 ///
34 /// We explicitly put this on the heap because CoreReaderBuilder embeds an
35 /// entire DFA transition table, which along with other things, tallies up
36 /// to almost 500 bytes on the stack.
37 builder: Box<CoreReaderBuilder>,
38}
39
40impl Default for ReaderBuilder {
41 fn default() -> ReaderBuilder {
42 ReaderBuilder {
43 capacity: 8 * (1 << 10),
44 flexible: false,
45 has_headers: true,
46 trim: Trim::default(),
47 builder: Box::new(CoreReaderBuilder::default()),
48 }
49 }
50}
51
52impl ReaderBuilder {
53 /// Create a new builder for configuring CSV parsing.
54 ///
55 /// To convert a builder into a reader, call one of the methods starting
56 /// with `from_`.
57 ///
58 /// # Example
59 ///
60 /// ```
61 /// use std::error::Error;
62 /// use csv::{ReaderBuilder, StringRecord};
63 ///
64 /// # fn main() { example().unwrap(); }
65 /// fn example() -> Result<(), Box<dyn Error>> {
66 /// let data = "\
67 /// city,country,pop
68 /// Boston,United States,4628910
69 /// Concord,United States,42695
70 /// ";
71 /// let mut rdr = ReaderBuilder::new().from_reader(data.as_bytes());
72 ///
73 /// let records = rdr
74 /// .records()
75 /// .collect::<Result<Vec<StringRecord>, csv::Error>>()?;
76 /// assert_eq!(records, vec![
77 /// vec!["Boston", "United States", "4628910"],
78 /// vec!["Concord", "United States", "42695"],
79 /// ]);
80 /// Ok(())
81 /// }
82 /// ```
83 pub fn new() -> ReaderBuilder {
84 ReaderBuilder::default()
85 }
86
87 /// Build a CSV parser from this configuration that reads data from the
88 /// given file path.
89 ///
90 /// If there was a problem opening the file at the given path, then this
91 /// returns the corresponding error.
92 ///
93 /// # Example
94 ///
95 /// ```no_run
96 /// use std::error::Error;
97 /// use csv::ReaderBuilder;
98 ///
99 /// # fn main() { example().unwrap(); }
100 /// fn example() -> Result<(), Box<dyn Error>> {
101 /// let mut rdr = ReaderBuilder::new().from_path("foo.csv")?;
102 /// for result in rdr.records() {
103 /// let record = result?;
104 /// println!("{:?}", record);
105 /// }
106 /// Ok(())
107 /// }
108 /// ```
109 pub fn from_path<P: AsRef<Path>>(&self, path: P) -> Result<Reader<File>> {
110 Ok(Reader::new(self, File::open(path)?))
111 }
112
113 /// Build a CSV parser from this configuration that reads data from `rdr`.
114 ///
115 /// Note that the CSV reader is buffered automatically, so you should not
116 /// wrap `rdr` in a buffered reader like `io::BufReader`.
117 ///
118 /// # Example
119 ///
120 /// ```
121 /// use std::error::Error;
122 /// use csv::ReaderBuilder;
123 ///
124 /// # fn main() { example().unwrap(); }
125 /// fn example() -> Result<(), Box<dyn Error>> {
126 /// let data = "\
127 /// city,country,pop
128 /// Boston,United States,4628910
129 /// Concord,United States,42695
130 /// ";
131 /// let mut rdr = ReaderBuilder::new().from_reader(data.as_bytes());
132 /// for result in rdr.records() {
133 /// let record = result?;
134 /// println!("{:?}", record);
135 /// }
136 /// Ok(())
137 /// }
138 /// ```
139 pub fn from_reader<R: io::Read>(&self, rdr: R) -> Reader<R> {
140 Reader::new(self, rdr)
141 }
142
143 /// The field delimiter to use when parsing CSV.
144 ///
145 /// The default is `b','`.
146 ///
147 /// # Example
148 ///
149 /// ```
150 /// use std::error::Error;
151 /// use csv::ReaderBuilder;
152 ///
153 /// # fn main() { example().unwrap(); }
154 /// fn example() -> Result<(), Box<dyn Error>> {
155 /// let data = "\
156 /// city;country;pop
157 /// Boston;United States;4628910
158 /// ";
159 /// let mut rdr = ReaderBuilder::new()
160 /// .delimiter(b';')
161 /// .from_reader(data.as_bytes());
162 ///
163 /// if let Some(result) = rdr.records().next() {
164 /// let record = result?;
165 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
166 /// Ok(())
167 /// } else {
168 /// Err(From::from("expected at least one record but got none"))
169 /// }
170 /// }
171 /// ```
172 pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder {
173 self.builder.delimiter(delimiter);
174 self
175 }
176
177 /// Whether to treat the first row as a special header row.
178 ///
179 /// By default, the first row is treated as a special header row, which
180 /// means the header is never returned by any of the record reading methods
181 /// or iterators. When this is disabled (`yes` set to `false`), the first
182 /// row is not treated specially.
183 ///
184 /// Note that the `headers` and `byte_headers` methods are unaffected by
185 /// whether this is set. Those methods always return the first record.
186 ///
187 /// # Example
188 ///
189 /// This example shows what happens when `has_headers` is disabled.
190 /// Namely, the first row is treated just like any other row.
191 ///
192 /// ```
193 /// use std::error::Error;
194 /// use csv::ReaderBuilder;
195 ///
196 /// # fn main() { example().unwrap(); }
197 /// fn example() -> Result<(), Box<dyn Error>> {
198 /// let data = "\
199 /// city,country,pop
200 /// Boston,United States,4628910
201 /// ";
202 /// let mut rdr = ReaderBuilder::new()
203 /// .has_headers(false)
204 /// .from_reader(data.as_bytes());
205 /// let mut iter = rdr.records();
206 ///
207 /// // Read the first record.
208 /// if let Some(result) = iter.next() {
209 /// let record = result?;
210 /// assert_eq!(record, vec!["city", "country", "pop"]);
211 /// } else {
212 /// return Err(From::from(
213 /// "expected at least two records but got none"));
214 /// }
215 ///
216 /// // Read the second record.
217 /// if let Some(result) = iter.next() {
218 /// let record = result?;
219 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
220 /// } else {
221 /// return Err(From::from(
222 /// "expected at least two records but got one"))
223 /// }
224 /// Ok(())
225 /// }
226 /// ```
227 pub fn has_headers(&mut self, yes: bool) -> &mut ReaderBuilder {
228 self.has_headers = yes;
229 self
230 }
231
232 /// Whether the number of fields in records is allowed to change or not.
233 ///
234 /// When disabled (which is the default), parsing CSV data will return an
235 /// error if a record is found with a number of fields different from the
236 /// number of fields in a previous record.
237 ///
238 /// When enabled, this error checking is turned off.
239 ///
240 /// # Example: flexible records enabled
241 ///
242 /// ```
243 /// use std::error::Error;
244 /// use csv::ReaderBuilder;
245 ///
246 /// # fn main() { example().unwrap(); }
247 /// fn example() -> Result<(), Box<dyn Error>> {
248 /// // Notice that the first row is missing the population count.
249 /// let data = "\
250 /// city,country,pop
251 /// Boston,United States
252 /// ";
253 /// let mut rdr = ReaderBuilder::new()
254 /// .flexible(true)
255 /// .from_reader(data.as_bytes());
256 ///
257 /// if let Some(result) = rdr.records().next() {
258 /// let record = result?;
259 /// assert_eq!(record, vec!["Boston", "United States"]);
260 /// Ok(())
261 /// } else {
262 /// Err(From::from("expected at least one record but got none"))
263 /// }
264 /// }
265 /// ```
266 ///
267 /// # Example: flexible records disabled
268 ///
269 /// This shows the error that appears when records of unequal length
270 /// are found and flexible records have been disabled (which is the
271 /// default).
272 ///
273 /// ```
274 /// use std::error::Error;
275 /// use csv::{ErrorKind, ReaderBuilder};
276 ///
277 /// # fn main() { example().unwrap(); }
278 /// fn example() -> Result<(), Box<dyn Error>> {
279 /// // Notice that the first row is missing the population count.
280 /// let data = "\
281 /// city,country,pop
282 /// Boston,United States
283 /// ";
284 /// let mut rdr = ReaderBuilder::new()
285 /// .flexible(false)
286 /// .from_reader(data.as_bytes());
287 ///
288 /// if let Some(Err(err)) = rdr.records().next() {
289 /// match *err.kind() {
290 /// ErrorKind::UnequalLengths { expected_len, len, .. } => {
291 /// // The header row has 3 fields...
292 /// assert_eq!(expected_len, 3);
293 /// // ... but the first row has only 2 fields.
294 /// assert_eq!(len, 2);
295 /// Ok(())
296 /// }
297 /// ref wrong => {
298 /// Err(From::from(format!(
299 /// "expected UnequalLengths error but got {:?}",
300 /// wrong)))
301 /// }
302 /// }
303 /// } else {
304 /// Err(From::from(
305 /// "expected at least one errored record but got none"))
306 /// }
307 /// }
308 /// ```
309 pub fn flexible(&mut self, yes: bool) -> &mut ReaderBuilder {
310 self.flexible = yes;
311 self
312 }
313
314 /// Whether fields are trimmed of leading and trailing whitespace or not.
315 ///
316 /// By default, no trimming is performed. This method permits one to
317 /// override that behavior and choose one of the following options:
318 ///
319 /// 1. `Trim::Headers` trims only header values.
320 /// 2. `Trim::Fields` trims only non-header or "field" values.
321 /// 3. `Trim::All` trims both header and non-header values.
322 ///
323 /// A value is only interpreted as a header value if this CSV reader is
324 /// configured to read a header record (which is the default).
325 ///
326 /// When reading string records, characters meeting the definition of
327 /// Unicode whitespace are trimmed. When reading byte records, characters
328 /// meeting the definition of ASCII whitespace are trimmed. ASCII
329 /// whitespace characters correspond to the set `[\t\n\v\f\r ]`.
330 ///
331 /// # Example
332 ///
333 /// This example shows what happens when all values are trimmed.
334 ///
335 /// ```
336 /// use std::error::Error;
337 /// use csv::{ReaderBuilder, StringRecord, Trim};
338 ///
339 /// # fn main() { example().unwrap(); }
340 /// fn example() -> Result<(), Box<dyn Error>> {
341 /// let data = "\
342 /// city , country , pop
343 /// Boston,\"
344 /// United States\",4628910
345 /// Concord, United States ,42695
346 /// ";
347 /// let mut rdr = ReaderBuilder::new()
348 /// .trim(Trim::All)
349 /// .from_reader(data.as_bytes());
350 /// let records = rdr
351 /// .records()
352 /// .collect::<Result<Vec<StringRecord>, csv::Error>>()?;
353 /// assert_eq!(records, vec![
354 /// vec!["Boston", "United States", "4628910"],
355 /// vec!["Concord", "United States", "42695"],
356 /// ]);
357 /// Ok(())
358 /// }
359 /// ```
360 pub fn trim(&mut self, trim: Trim) -> &mut ReaderBuilder {
361 self.trim = trim;
362 self
363 }
364
365 /// The record terminator to use when parsing CSV.
366 ///
367 /// A record terminator can be any single byte. The default is a special
368 /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n`
369 /// or `\r\n` as a single record terminator.
370 ///
371 /// # Example: `$` as a record terminator
372 ///
373 /// ```
374 /// use std::error::Error;
375 /// use csv::{ReaderBuilder, Terminator};
376 ///
377 /// # fn main() { example().unwrap(); }
378 /// fn example() -> Result<(), Box<dyn Error>> {
379 /// let data = "city,country,pop$Boston,United States,4628910";
380 /// let mut rdr = ReaderBuilder::new()
381 /// .terminator(Terminator::Any(b'$'))
382 /// .from_reader(data.as_bytes());
383 ///
384 /// if let Some(result) = rdr.records().next() {
385 /// let record = result?;
386 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
387 /// Ok(())
388 /// } else {
389 /// Err(From::from("expected at least one record but got none"))
390 /// }
391 /// }
392 /// ```
393 pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder {
394 self.builder.terminator(term.to_core());
395 self
396 }
397
398 /// The quote character to use when parsing CSV.
399 ///
400 /// The default is `b'"'`.
401 ///
402 /// # Example: single quotes instead of double quotes
403 ///
404 /// ```
405 /// use std::error::Error;
406 /// use csv::ReaderBuilder;
407 ///
408 /// # fn main() { example().unwrap(); }
409 /// fn example() -> Result<(), Box<dyn Error>> {
410 /// let data = "\
411 /// city,country,pop
412 /// Boston,'United States',4628910
413 /// ";
414 /// let mut rdr = ReaderBuilder::new()
415 /// .quote(b'\'')
416 /// .from_reader(data.as_bytes());
417 ///
418 /// if let Some(result) = rdr.records().next() {
419 /// let record = result?;
420 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
421 /// Ok(())
422 /// } else {
423 /// Err(From::from("expected at least one record but got none"))
424 /// }
425 /// }
426 /// ```
427 pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder {
428 self.builder.quote(quote);
429 self
430 }
431
432 /// The escape character to use when parsing CSV.
433 ///
434 /// In some variants of CSV, quotes are escaped using a special escape
435 /// character like `\` (instead of escaping quotes by doubling them).
436 ///
437 /// By default, recognizing these idiosyncratic escapes is disabled.
438 ///
439 /// # Example
440 ///
441 /// ```
442 /// use std::error::Error;
443 /// use csv::ReaderBuilder;
444 ///
445 /// # fn main() { example().unwrap(); }
446 /// fn example() -> Result<(), Box<dyn Error>> {
447 /// let data = "\
448 /// city,country,pop
449 /// Boston,\"The \\\"United\\\" States\",4628910
450 /// ";
451 /// let mut rdr = ReaderBuilder::new()
452 /// .escape(Some(b'\\'))
453 /// .from_reader(data.as_bytes());
454 ///
455 /// if let Some(result) = rdr.records().next() {
456 /// let record = result?;
457 /// assert_eq!(record, vec![
458 /// "Boston", "The \"United\" States", "4628910",
459 /// ]);
460 /// Ok(())
461 /// } else {
462 /// Err(From::from("expected at least one record but got none"))
463 /// }
464 /// }
465 /// ```
466 pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder {
467 self.builder.escape(escape);
468 self
469 }
470
471 /// Enable double quote escapes.
472 ///
473 /// This is enabled by default, but it may be disabled. When disabled,
474 /// doubled quotes are not interpreted as escapes.
475 ///
476 /// # Example
477 ///
478 /// ```
479 /// use std::error::Error;
480 /// use csv::ReaderBuilder;
481 ///
482 /// # fn main() { example().unwrap(); }
483 /// fn example() -> Result<(), Box<dyn Error>> {
484 /// let data = "\
485 /// city,country,pop
486 /// Boston,\"The \"\"United\"\" States\",4628910
487 /// ";
488 /// let mut rdr = ReaderBuilder::new()
489 /// .double_quote(false)
490 /// .from_reader(data.as_bytes());
491 ///
492 /// if let Some(result) = rdr.records().next() {
493 /// let record = result?;
494 /// assert_eq!(record, vec![
495 /// "Boston", "The \"United\"\" States\"", "4628910",
496 /// ]);
497 /// Ok(())
498 /// } else {
499 /// Err(From::from("expected at least one record but got none"))
500 /// }
501 /// }
502 /// ```
503 pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder {
504 self.builder.double_quote(yes);
505 self
506 }
507
508 /// Enable or disable quoting.
509 ///
510 /// This is enabled by default, but it may be disabled. When disabled,
511 /// quotes are not treated specially.
512 ///
513 /// # Example
514 ///
515 /// ```
516 /// use std::error::Error;
517 /// use csv::ReaderBuilder;
518 ///
519 /// # fn main() { example().unwrap(); }
520 /// fn example() -> Result<(), Box<dyn Error>> {
521 /// let data = "\
522 /// city,country,pop
523 /// Boston,\"The United States,4628910
524 /// ";
525 /// let mut rdr = ReaderBuilder::new()
526 /// .quoting(false)
527 /// .from_reader(data.as_bytes());
528 ///
529 /// if let Some(result) = rdr.records().next() {
530 /// let record = result?;
531 /// assert_eq!(record, vec![
532 /// "Boston", "\"The United States", "4628910",
533 /// ]);
534 /// Ok(())
535 /// } else {
536 /// Err(From::from("expected at least one record but got none"))
537 /// }
538 /// }
539 /// ```
540 pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder {
541 self.builder.quoting(yes);
542 self
543 }
544
545 /// The comment character to use when parsing CSV.
546 ///
547 /// If a record starts with the byte given here, then that line is
548 /// ignored by the CSV parser.
549 ///
550 /// This is disabled by default.
551 ///
552 /// # Example
553 ///
554 /// ```
555 /// use std::error::Error;
556 /// use csv::ReaderBuilder;
557 ///
558 /// # fn main() { example().unwrap(); }
559 /// fn example() -> Result<(), Box<dyn Error>> {
560 /// let data = "\
561 /// city,country,pop
562 /// #Concord,United States,42695
563 /// Boston,United States,4628910
564 /// ";
565 /// let mut rdr = ReaderBuilder::new()
566 /// .comment(Some(b'#'))
567 /// .from_reader(data.as_bytes());
568 ///
569 /// if let Some(result) = rdr.records().next() {
570 /// let record = result?;
571 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
572 /// Ok(())
573 /// } else {
574 /// Err(From::from("expected at least one record but got none"))
575 /// }
576 /// }
577 /// ```
578 pub fn comment(&mut self, comment: Option<u8>) -> &mut ReaderBuilder {
579 self.builder.comment(comment);
580 self
581 }
582
583 /// A convenience method for specifying a configuration to read ASCII
584 /// delimited text.
585 ///
586 /// This sets the delimiter and record terminator to the ASCII unit
587 /// separator (`\x1F`) and record separator (`\x1E`), respectively.
588 ///
589 /// # Example
590 ///
591 /// ```
592 /// use std::error::Error;
593 /// use csv::ReaderBuilder;
594 ///
595 /// # fn main() { example().unwrap(); }
596 /// fn example() -> Result<(), Box<dyn Error>> {
597 /// let data = "\
598 /// city\x1Fcountry\x1Fpop\x1EBoston\x1FUnited States\x1F4628910";
599 /// let mut rdr = ReaderBuilder::new()
600 /// .ascii()
601 /// .from_reader(data.as_bytes());
602 ///
603 /// if let Some(result) = rdr.records().next() {
604 /// let record = result?;
605 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
606 /// Ok(())
607 /// } else {
608 /// Err(From::from("expected at least one record but got none"))
609 /// }
610 /// }
611 /// ```
612 pub fn ascii(&mut self) -> &mut ReaderBuilder {
613 self.builder.ascii();
614 self
615 }
616
617 /// Set the capacity (in bytes) of the buffer used in the CSV reader.
618 /// This defaults to 8 KiB (8,192 bytes).
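    ///
    /// # Example
    ///
    /// A minimal sketch of requesting a larger buffer; the 64 KiB value
    /// below is purely illustrative, not a recommendation.
    ///
    /// ```
    /// use csv::ReaderBuilder;
    ///
    /// let data = "city,country,pop\nBoston,United States,4628910\n";
    /// let mut rdr = ReaderBuilder::new()
    ///     .buffer_capacity(64 * (1 << 10))
    ///     .from_reader(data.as_bytes());
    /// assert!(rdr.records().next().is_some());
    /// ```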
619 pub fn buffer_capacity(&mut self, capacity: usize) -> &mut ReaderBuilder {
620 self.capacity = capacity;
621 self
622 }
623
624 /// Enable or disable the NFA for parsing CSV.
625 ///
626 /// This is intended to be a debug option. The NFA is always slower than
627 /// the DFA.
628 #[doc(hidden)]
629 pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder {
630 self.builder.nfa(yes);
631 self
632 }
633}
634
635/// An already configured CSV reader.
636///
637/// A CSV reader takes as input CSV data and transforms that into standard Rust
638/// values. The most flexible way to read CSV data is as a sequence of records,
639/// where a record is a sequence of fields and each field is a string. However,
640/// a reader can also deserialize CSV data into Rust types like `i64` or
641/// `(String, f64, f64, f64)` or even a custom struct automatically using
642/// Serde.
643///
644/// # Configuration
645///
646/// A CSV reader has a couple of convenient constructor methods like `from_path`
647/// and `from_reader`. However, if you want to configure the CSV reader to use
648/// a different delimiter or quote character (among many other things), then
649/// you should use a [`ReaderBuilder`](struct.ReaderBuilder.html) to construct
650/// a `Reader`. For example, to change the field delimiter:
651///
652/// ```
653/// use std::error::Error;
654/// use csv::ReaderBuilder;
655///
656/// # fn main() { example().unwrap(); }
657/// fn example() -> Result<(), Box<dyn Error>> {
658/// let data = "\
659/// city;country;pop
660/// Boston;United States;4628910
661/// ";
662/// let mut rdr = ReaderBuilder::new()
663/// .delimiter(b';')
664/// .from_reader(data.as_bytes());
665///
666/// if let Some(result) = rdr.records().next() {
667/// let record = result?;
668/// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
669/// Ok(())
670/// } else {
671/// Err(From::from("expected at least one record but got none"))
672/// }
673/// }
674/// ```
675///
676/// # Error handling
677///
678/// In general, CSV *parsing* does not ever return an error. That is, there is
679/// no such thing as malformed CSV data. Instead, this reader will prioritize
680/// finding a parse over rejecting CSV data that it does not understand. This
681/// choice was inspired by other popular CSV parsers, but also because it is
682/// pragmatic. CSV data varies wildly, so even if the CSV data is malformed,
683/// it might still be possible to work with the data. In the land of CSV, there
684/// is no "right" or "wrong," only "right" and "less right."
685///
686/// With that said, a number of errors can occur while reading CSV data:
687///
688/// * By default, all records in CSV data must have the same number of fields.
689/// If a record is found with a different number of fields than a prior
690/// record, then an error is returned. This behavior can be disabled by
691/// enabling flexible parsing via the `flexible` method on
692/// [`ReaderBuilder`](struct.ReaderBuilder.html).
693/// * When reading CSV data from a resource (like a file), it is possible for
694/// reading from the underlying resource to fail. This will return an error.
695/// After such an error is encountered, subsequent calls to the `Reader`
696/// (unless `seek` is used) will behave as if end of file had been reached,
697/// in order to avoid running into an infinite loop of repeatedly attempting
698/// to read a record that has already errored.
699/// * When reading CSV data into `String` or `&str` fields (e.g., via a
700/// [`StringRecord`](struct.StringRecord.html)), UTF-8 is strictly
701/// enforced. If CSV data is invalid UTF-8, then an error is returned. If
702/// you want to read invalid UTF-8, then you should use the byte oriented
703/// APIs such as [`ByteRecord`](struct.ByteRecord.html); a short sketch of
704/// this appears in the example below. If you need explicit support for
705/// another encoding entirely, then you'll need to use another crate to
706/// transcode your CSV data to UTF-8 before parsing it.
706/// * When using Serde to deserialize CSV data into Rust types, it is possible
707/// for a number of additional errors to occur. For example, deserializing
708/// a field `xyz` into an `i32` field will result in an error.
709///
710/// For more details on the precise semantics of errors, see the
711/// [`Error`](enum.Error.html) type.
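///
/// # Example: reading invalid UTF-8
///
/// A small sketch (with made-up data) of falling back to the byte oriented
/// API when a field may not be valid UTF-8:
///
/// ```
/// use std::error::Error;
/// use csv::Reader;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<(), Box<dyn Error>> {
///     // The second field of the data row contains the invalid UTF-8
///     // byte `\xFF`.
///     let data = b"city,country\nBoston,\xFFUnited States\n";
///     let mut rdr = Reader::from_reader(&data[..]);
///
///     // `records()` would return a UTF-8 error for this row, but
///     // `byte_records()` hands the raw bytes back untouched.
///     if let Some(result) = rdr.byte_records().next() {
///         let record = result?;
///         assert_eq!(record.get(1), Some(&b"\xFFUnited States"[..]));
///         Ok(())
///     } else {
///         Err(From::from("expected at least one record but got none"))
///     }
/// }
/// ```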
712#[derive(Debug)]
713pub struct Reader<R> {
714 /// The underlying CSV parser.
715 ///
716 /// We explicitly put this on the heap because CoreReader embeds an entire
717 /// DFA transition table, which along with other things, tallies up to
718 /// almost 500 bytes on the stack.
719 core: Box<CoreReader>,
720 /// The underlying reader.
721 rdr: io::BufReader<R>,
722 /// Various state tracking.
723 ///
724 /// There is more state embedded in the `CoreReader`.
725 state: ReaderState,
726}
727
728#[derive(Debug)]
729struct ReaderState {
730 /// When set, this contains the first row of any parsed CSV data.
731 ///
732 /// This is always populated, regardless of whether `has_headers` is set.
733 headers: Option<Headers>,
734 /// When set, the first row of parsed CSV data is excluded from things
735 /// that read records, like iterators and `read_record`.
736 has_headers: bool,
737 /// When set, there is no restriction on the length of records. When not
738 /// set, every record must have the same number of fields, or else an error
739 /// is reported.
740 flexible: bool,
741 trim: Trim,
742 /// The number of fields in the first record parsed.
743 first_field_count: Option<u64>,
744 /// The current position of the parser.
745 ///
746 /// Note that this position is only observable by callers at the start
747 /// of a record. More granular positions are not supported.
748 cur_pos: Position,
749 /// Whether the first record has been read or not.
750 first: bool,
751 /// Whether the reader has been seeked or not.
752 seeked: bool,
753 /// Whether EOF of the underlying reader has been reached or not.
754 ///
755 /// IO errors on the underlying reader will be considered as an EOF for
756 /// subsequent read attempts, as it would be incorrect to keep on trying
757 /// to read when the underlying reader has broken.
758 ///
759 /// For clarity, to get the best `Debug` impl, and in case the two ever
760 /// need to be treated differently, we record whether the `EOF` state was
761 /// reached because an actual EOF happened or because we encountered an
762 /// IO error.
763 /// This has no additional runtime cost.
764 eof: ReaderEofState,
765}
766
767/// Whether EOF of the underlying reader has been reached or not.
768///
769/// IO errors on the underlying reader will be considered as an EOF for
770/// subsequent read attempts, as it would be incorrect to keep on trying
771/// to read when the underlying reader has broken.
772///
773/// For clarity, to get the best `Debug` impl, and in case the two ever
774/// need to be treated differently, we record whether the `EOF` state was
775/// reached because an actual EOF happened or because we encountered an
776/// IO error.
777#[derive(Debug, Clone, Copy, PartialEq, Eq)]
778enum ReaderEofState {
779 NotEof,
780 Eof,
781 IOError,
782}
783
784/// Headers encapsulates any data associated with the headers of CSV data.
785///
786/// The headers always correspond to the first row.
787#[derive(Debug)]
788struct Headers {
789 /// The header, as raw bytes.
790 byte_record: ByteRecord,
791 /// The header, as valid UTF-8 (or a UTF-8 error).
792 string_record: result::Result<StringRecord, Utf8Error>,
793}
794
795impl Reader<File> {
796 /// Create a new CSV parser with a default configuration for the given
797 /// file path.
798 ///
799 /// To customize CSV parsing, use a `ReaderBuilder`.
800 ///
801 /// # Example
802 ///
803 /// ```no_run
804 /// use std::error::Error;
805 /// use csv::Reader;
806 ///
807 /// # fn main() { example().unwrap(); }
808 /// fn example() -> Result<(), Box<dyn Error>> {
809 /// let mut rdr = Reader::from_path("foo.csv")?;
810 /// for result in rdr.records() {
811 /// let record = result?;
812 /// println!("{:?}", record);
813 /// }
814 /// Ok(())
815 /// }
816 /// ```
817 pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Reader<File>> {
818 ReaderBuilder::new().from_path(path)
819 }
820}
821
822impl<R: io::Read> Reader<R> {
823 /// Create a new CSV reader given a builder and a source of underlying
824 /// bytes.
825 fn new(builder: &ReaderBuilder, rdr: R) -> Reader<R> {
826 Reader {
827 core: Box::new(builder.builder.build()),
828 rdr: io::BufReader::with_capacity(builder.capacity, rdr),
829 state: ReaderState {
830 headers: None,
831 has_headers: builder.has_headers,
832 flexible: builder.flexible,
833 trim: builder.trim,
834 first_field_count: None,
835 cur_pos: Position::new(),
836 first: false,
837 seeked: false,
838 eof: ReaderEofState::NotEof,
839 },
840 }
841 }
842
843 /// Create a new CSV parser with a default configuration for the given
844 /// reader.
845 ///
846 /// To customize CSV parsing, use a `ReaderBuilder`.
847 ///
848 /// # Example
849 ///
850 /// ```
851 /// use std::error::Error;
852 /// use csv::Reader;
853 ///
854 /// # fn main() { example().unwrap(); }
855 /// fn example() -> Result<(), Box<dyn Error>> {
856 /// let data = "\
857 /// city,country,pop
858 /// Boston,United States,4628910
859 /// Concord,United States,42695
860 /// ";
861 /// let mut rdr = Reader::from_reader(data.as_bytes());
862 /// for result in rdr.records() {
863 /// let record = result?;
864 /// println!("{:?}", record);
865 /// }
866 /// Ok(())
867 /// }
868 /// ```
869 pub fn from_reader(rdr: R) -> Reader<R> {
870 ReaderBuilder::new().from_reader(rdr)
871 }
872
873 /// Returns a borrowed iterator over deserialized records.
874 ///
875 /// Each item yielded by this iterator is a `Result<D, Error>`.
876 /// Therefore, in order to access the record, callers must handle the
877 /// possibility of error (typically with `try!` or `?`).
878 ///
879 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
880 /// default), then this does not include the first record. Additionally,
881 /// if `has_headers` is enabled, then deserializing into a struct will
882 /// automatically align the values in each row to the fields of a struct
883 /// based on the header row.
884 ///
885 /// # Example
886 ///
887 /// This shows how to deserialize CSV data into normal Rust structs. The
888 /// fields of the header row are used to match up the values in each row
889 /// to the fields of the struct.
890 ///
891 /// ```
892 /// use std::error::Error;
893 ///
894 /// #[derive(Debug, serde::Deserialize, Eq, PartialEq)]
895 /// struct Row {
896 /// city: String,
897 /// country: String,
898 /// #[serde(rename = "popcount")]
899 /// population: u64,
900 /// }
901 ///
902 /// # fn main() { example().unwrap(); }
903 /// fn example() -> Result<(), Box<dyn Error>> {
904 /// let data = "\
905 /// city,country,popcount
906 /// Boston,United States,4628910
907 /// ";
908 /// let mut rdr = csv::Reader::from_reader(data.as_bytes());
909 /// let mut iter = rdr.deserialize();
910 ///
911 /// if let Some(result) = iter.next() {
912 /// let record: Row = result?;
913 /// assert_eq!(record, Row {
914 /// city: "Boston".to_string(),
915 /// country: "United States".to_string(),
916 /// population: 4628910,
917 /// });
918 /// Ok(())
919 /// } else {
920 /// Err(From::from("expected at least one record but got none"))
921 /// }
922 /// }
923 /// ```
924 ///
925 /// # Rules
926 ///
927 /// For the most part, any Rust type that maps straightforwardly to a CSV
928 /// record is supported. This includes maps, structs, tuples and tuple
929 /// structs. Other Rust types, such as `Vec`s, arrays, and enums have
930 /// a more complicated story. In general, when working with CSV data, one
931 /// should avoid *nested sequences* as much as possible.
932 ///
933 /// Maps, structs, tuples and tuple structs map to CSV records in a simple
934 /// way. Tuples and tuple structs decode their fields in the order that
935 /// they are defined. Structs will do the same only if `has_headers` has
936 /// been disabled using [`ReaderBuilder`](struct.ReaderBuilder.html),
937 /// otherwise, structs and maps are deserialized based on the fields
938 /// defined in the header row. (If there is no header row, then
939 /// deserializing into a map will result in an error.)
940 ///
941 /// Nested sequences are supported in a limited capacity. Namely, they
942 /// are flattened. As a result, it's often useful to use a `Vec` to capture
943 /// a "tail" of fields in a record:
944 ///
945 /// ```
946 /// use std::error::Error;
947 ///
948 /// #[derive(Debug, serde::Deserialize, Eq, PartialEq)]
949 /// struct Row {
950 /// label: String,
951 /// values: Vec<i32>,
952 /// }
953 ///
954 /// # fn main() { example().unwrap(); }
955 /// fn example() -> Result<(), Box<dyn Error>> {
956 /// let data = "foo,1,2,3";
957 /// let mut rdr = csv::ReaderBuilder::new()
958 /// .has_headers(false)
959 /// .from_reader(data.as_bytes());
960 /// let mut iter = rdr.deserialize();
961 ///
962 /// if let Some(result) = iter.next() {
963 /// let record: Row = result?;
964 /// assert_eq!(record, Row {
965 /// label: "foo".to_string(),
966 /// values: vec![1, 2, 3],
967 /// });
968 /// Ok(())
969 /// } else {
970 /// Err(From::from("expected at least one record but got none"))
971 /// }
972 /// }
973 /// ```
974 ///
975 /// In the above example, adding another field to the `Row` struct after
976 /// the `values` field will result in a deserialization error. This is
977 /// because the deserializer doesn't know when to stop reading fields
978 /// into the `values` vector, so it will consume the rest of the fields in
979 /// the record, leaving none left over for the additional field.
980 ///
981 /// Finally, simple enums in Rust can be deserialized as well. Namely,
982 /// enums must either be variants with no arguments or variants with a
983 /// single argument. Variants with no arguments are deserialized based on
984 /// which variant name the field matches. Variants with a single argument
985 /// are deserialized based on which variant can store the data. The latter
986 /// is only supported when using "untagged" enum deserialization. The
987 /// following example shows both forms in action:
988 ///
989 /// ```
990 /// use std::error::Error;
991 ///
992 /// #[derive(Debug, serde::Deserialize, PartialEq)]
993 /// struct Row {
994 /// label: Label,
995 /// value: Number,
996 /// }
997 ///
998 /// #[derive(Debug, serde::Deserialize, PartialEq)]
999 /// #[serde(rename_all = "lowercase")]
1000 /// enum Label {
1001 /// Celsius,
1002 /// Fahrenheit,
1003 /// }
1004 ///
1005 /// #[derive(Debug, serde::Deserialize, PartialEq)]
1006 /// #[serde(untagged)]
1007 /// enum Number {
1008 /// Integer(i64),
1009 /// Float(f64),
1010 /// }
1011 ///
1012 /// # fn main() { example().unwrap(); }
1013 /// fn example() -> Result<(), Box<dyn Error>> {
1014 /// let data = "\
1015 /// label,value
1016 /// celsius,22.2222
1017 /// fahrenheit,72
1018 /// ";
1019 /// let mut rdr = csv::Reader::from_reader(data.as_bytes());
1020 /// let mut iter = rdr.deserialize();
1021 ///
1022 /// // Read the first record.
1023 /// if let Some(result) = iter.next() {
1024 /// let record: Row = result?;
1025 /// assert_eq!(record, Row {
1026 /// label: Label::Celsius,
1027 /// value: Number::Float(22.2222),
1028 /// });
1029 /// } else {
1030 /// return Err(From::from(
1031 /// "expected at least two records but got none"));
1032 /// }
1033 ///
1034 /// // Read the second record.
1035 /// if let Some(result) = iter.next() {
1036 /// let record: Row = result?;
1037 /// assert_eq!(record, Row {
1038 /// label: Label::Fahrenheit,
1039 /// value: Number::Integer(72),
1040 /// });
1041 /// Ok(())
1042 /// } else {
1043 /// Err(From::from(
1044 /// "expected at least two records but got only one"))
1045 /// }
1046 /// }
1047 /// ```
1048 pub fn deserialize<D>(&mut self) -> DeserializeRecordsIter<R, D>
1049 where
1050 D: DeserializeOwned,
1051 {
1052 DeserializeRecordsIter::new(self)
1053 }
1054
1055 /// Returns an owned iterator over deserialized records.
1056 ///
1057 /// Each item yielded by this iterator is a `Result<D, Error>`.
1058 /// Therefore, in order to access the record, callers must handle the
1059 /// possibility of error (typically with `try!` or `?`).
1060 ///
1061 /// This is mostly useful when you want to return a CSV iterator or store
1062 /// it somewhere.
1063 ///
1064 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1065 /// default), then this does not include the first record. Additionally,
1066 /// if `has_headers` is enabled, then deserializing into a struct will
1067 /// automatically align the values in each row to the fields of a struct
1068 /// based on the header row.
1069 ///
1070 /// For more detailed deserialization rules, see the documentation on the
1071 /// `deserialize` method.
1072 ///
1073 /// # Example
1074 ///
1075 /// ```
1076 /// use std::error::Error;
1077 ///
1078 /// #[derive(Debug, serde::Deserialize, Eq, PartialEq)]
1079 /// struct Row {
1080 /// city: String,
1081 /// country: String,
1082 /// #[serde(rename = "popcount")]
1083 /// population: u64,
1084 /// }
1085 ///
1086 /// # fn main() { example().unwrap(); }
1087 /// fn example() -> Result<(), Box<dyn Error>> {
1088 /// let data = "\
1089 /// city,country,popcount
1090 /// Boston,United States,4628910
1091 /// ";
1092 /// let rdr = csv::Reader::from_reader(data.as_bytes());
1093 /// let mut iter = rdr.into_deserialize();
1094 ///
1095 /// if let Some(result) = iter.next() {
1096 /// let record: Row = result?;
1097 /// assert_eq!(record, Row {
1098 /// city: "Boston".to_string(),
1099 /// country: "United States".to_string(),
1100 /// population: 4628910,
1101 /// });
1102 /// Ok(())
1103 /// } else {
1104 /// Err(From::from("expected at least one record but got none"))
1105 /// }
1106 /// }
1107 /// ```
1108 pub fn into_deserialize<D>(self) -> DeserializeRecordsIntoIter<R, D>
1109 where
1110 D: DeserializeOwned,
1111 {
1112 DeserializeRecordsIntoIter::new(self)
1113 }
1114
1115 /// Returns a borrowed iterator over all records as strings.
1116 ///
1117 /// Each item yielded by this iterator is a `Result<StringRecord, Error>`.
1118 /// Therefore, in order to access the record, callers must handle the
1119 /// possibility of error (typically with `try!` or `?`).
1120 ///
1121 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1122 /// default), then this does not include the first record.
1123 ///
1124 /// # Example
1125 ///
1126 /// ```
1127 /// use std::error::Error;
1128 /// use csv::Reader;
1129 ///
1130 /// # fn main() { example().unwrap(); }
1131 /// fn example() -> Result<(), Box<dyn Error>> {
1132 /// let data = "\
1133 /// city,country,pop
1134 /// Boston,United States,4628910
1135 /// ";
1136 /// let mut rdr = Reader::from_reader(data.as_bytes());
1137 /// let mut iter = rdr.records();
1138 ///
1139 /// if let Some(result) = iter.next() {
1140 /// let record = result?;
1141 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1142 /// Ok(())
1143 /// } else {
1144 /// Err(From::from("expected at least one record but got none"))
1145 /// }
1146 /// }
1147 /// ```
1148 pub fn records(&mut self) -> StringRecordsIter<R> {
1149 StringRecordsIter::new(self)
1150 }
1151
1152 /// Returns an owned iterator over all records as strings.
1153 ///
1154 /// Each item yielded by this iterator is a `Result<StringRecord, Error>`.
1155 /// Therefore, in order to access the record, callers must handle the
1156 /// possibility of error (typically with `try!` or `?`).
1157 ///
1158 /// This is mostly useful when you want to return a CSV iterator or store
1159 /// it somewhere.
1160 ///
1161 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1162 /// default), then this does not include the first record.
1163 ///
1164 /// # Example
1165 ///
1166 /// ```
1167 /// use std::error::Error;
1168 /// use csv::Reader;
1169 ///
1170 /// # fn main() { example().unwrap(); }
1171 /// fn example() -> Result<(), Box<dyn Error>> {
1172 /// let data = "\
1173 /// city,country,pop
1174 /// Boston,United States,4628910
1175 /// ";
1176 /// let rdr = Reader::from_reader(data.as_bytes());
1177 /// let mut iter = rdr.into_records();
1178 ///
1179 /// if let Some(result) = iter.next() {
1180 /// let record = result?;
1181 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1182 /// Ok(())
1183 /// } else {
1184 /// Err(From::from("expected at least one record but got none"))
1185 /// }
1186 /// }
1187 /// ```
1188 pub fn into_records(self) -> StringRecordsIntoIter<R> {
1189 StringRecordsIntoIter::new(self)
1190 }
1191
1192 /// Returns a borrowed iterator over all records as raw bytes.
1193 ///
1194 /// Each item yielded by this iterator is a `Result<ByteRecord, Error>`.
1195 /// Therefore, in order to access the record, callers must handle the
1196 /// possibility of error (typically with `try!` or `?`).
1197 ///
1198 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1199 /// default), then this does not include the first record.
1200 ///
1201 /// # Example
1202 ///
1203 /// ```
1204 /// use std::error::Error;
1205 /// use csv::Reader;
1206 ///
1207 /// # fn main() { example().unwrap(); }
1208 /// fn example() -> Result<(), Box<dyn Error>> {
1209 /// let data = "\
1210 /// city,country,pop
1211 /// Boston,United States,4628910
1212 /// ";
1213 /// let mut rdr = Reader::from_reader(data.as_bytes());
1214 /// let mut iter = rdr.byte_records();
1215 ///
1216 /// if let Some(result) = iter.next() {
1217 /// let record = result?;
1218 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1219 /// Ok(())
1220 /// } else {
1221 /// Err(From::from("expected at least one record but got none"))
1222 /// }
1223 /// }
1224 /// ```
1225 pub fn byte_records(&mut self) -> ByteRecordsIter<R> {
1226 ByteRecordsIter::new(self)
1227 }
1228
1229 /// Returns an owned iterator over all records as raw bytes.
1230 ///
1231 /// Each item yielded by this iterator is a `Result<ByteRecord, Error>`.
1232 /// Therefore, in order to access the record, callers must handle the
1233 /// possibility of error (typically with `try!` or `?`).
1234 ///
1235 /// This is mostly useful when you want to return a CSV iterator or store
1236 /// it somewhere.
1237 ///
1238 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1239 /// default), then this does not include the first record.
1240 ///
1241 /// # Example
1242 ///
1243 /// ```
1244 /// use std::error::Error;
1245 /// use csv::Reader;
1246 ///
1247 /// # fn main() { example().unwrap(); }
1248 /// fn example() -> Result<(), Box<dyn Error>> {
1249 /// let data = "\
1250 /// city,country,pop
1251 /// Boston,United States,4628910
1252 /// ";
1253 /// let rdr = Reader::from_reader(data.as_bytes());
1254 /// let mut iter = rdr.into_byte_records();
1255 ///
1256 /// if let Some(result) = iter.next() {
1257 /// let record = result?;
1258 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1259 /// Ok(())
1260 /// } else {
1261 /// Err(From::from("expected at least one record but got none"))
1262 /// }
1263 /// }
1264 /// ```
1265 pub fn into_byte_records(self) -> ByteRecordsIntoIter<R> {
1266 ByteRecordsIntoIter::new(self)
1267 }
1268
1269 /// Returns a reference to the first row read by this parser.
1270 ///
1271 /// If no row has been read yet, then this will force parsing of the first
1272 /// row.
1273 ///
1274 /// If there was a problem parsing the row or if it wasn't valid UTF-8,
1275 /// then this returns an error.
1276 ///
1277 /// If the underlying reader emits EOF before any data, then this returns
1278 /// an empty record.
1279 ///
1280 /// Note that this method may be used regardless of whether `has_headers`
1281 /// was enabled (but it is enabled by default).
1282 ///
1283 /// # Example
1284 ///
1285 /// This example shows how to get the header row of CSV data. Notice that
1286 /// the header row does not appear as a record in the iterator!
1287 ///
1288 /// ```
1289 /// use std::error::Error;
1290 /// use csv::Reader;
1291 ///
1292 /// # fn main() { example().unwrap(); }
1293 /// fn example() -> Result<(), Box<dyn Error>> {
1294 /// let data = "\
1295 /// city,country,pop
1296 /// Boston,United States,4628910
1297 /// ";
1298 /// let mut rdr = Reader::from_reader(data.as_bytes());
1299 ///
1300 /// // We can read the headers before iterating.
1301 /// {
1302 /// // `headers` borrows from the reader, so we put this in its
1303 /// // own scope. That way, the borrow ends before we try iterating
1304 /// // below. Alternatively, we could clone the headers.
1305 /// let headers = rdr.headers()?;
1306 /// assert_eq!(headers, vec!["city", "country", "pop"]);
1307 /// }
1308 ///
1309 /// if let Some(result) = rdr.records().next() {
1310 /// let record = result?;
1311 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1312 /// } else {
1313 /// return Err(From::from(
1314 /// "expected at least one record but got none"))
1315 /// }
1316 ///
1317 /// // We can also read the headers after iterating.
1318 /// let headers = rdr.headers()?;
1319 /// assert_eq!(headers, vec!["city", "country", "pop"]);
1320 /// Ok(())
1321 /// }
1322 /// ```
1323 pub fn headers(&mut self) -> Result<&StringRecord> {
1324 if self.state.headers.is_none() {
1325 let mut record = ByteRecord::new();
1326 self.read_byte_record_impl(&mut record)?;
1327 self.set_headers_impl(Err(record));
1328 }
1329 let headers = self.state.headers.as_ref().unwrap();
1330 match headers.string_record {
1331 Ok(ref record) => Ok(record),
1332 Err(ref err) => Err(Error::new(ErrorKind::Utf8 {
1333 pos: headers.byte_record.position().map(Clone::clone),
1334 err: err.clone(),
1335 })),
1336 }
1337 }
1338
1339 /// Returns a reference to the first row read by this parser as raw bytes.
1340 ///
1341 /// If no row has been read yet, then this will force parsing of the first
1342 /// row.
1343 ///
1344 /// If there was a problem parsing the row then this returns an error.
1345 ///
1346 /// If the underlying reader emits EOF before any data, then this returns
1347 /// an empty record.
1348 ///
1349 /// Note that this method may be used regardless of whether `has_headers`
1350 /// was enabled (but it is enabled by default).
1351 ///
1352 /// # Example
1353 ///
1354 /// This example shows how to get the header row of CSV data. Notice that
1355 /// the header row does not appear as a record in the iterator!
1356 ///
1357 /// ```
1358 /// use std::error::Error;
1359 /// use csv::Reader;
1360 ///
1361 /// # fn main() { example().unwrap(); }
1362 /// fn example() -> Result<(), Box<dyn Error>> {
1363 /// let data = "\
1364 /// city,country,pop
1365 /// Boston,United States,4628910
1366 /// ";
1367 /// let mut rdr = Reader::from_reader(data.as_bytes());
1368 ///
1369 /// // We can read the headers before iterating.
1370 /// {
1371 /// // `headers` borrows from the reader, so we put this in its
1372 /// // own scope. That way, the borrow ends before we try iterating
1373 /// // below. Alternatively, we could clone the headers.
1374 /// let headers = rdr.byte_headers()?;
1375 /// assert_eq!(headers, vec!["city", "country", "pop"]);
1376 /// }
1377 ///
1378 /// if let Some(result) = rdr.byte_records().next() {
1379 /// let record = result?;
1380 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1381 /// } else {
1382 /// return Err(From::from(
1383 /// "expected at least one record but got none"))
1384 /// }
1385 ///
1386 /// // We can also read the headers after iterating.
1387 /// let headers = rdr.byte_headers()?;
1388 /// assert_eq!(headers, vec!["city", "country", "pop"]);
1389 /// Ok(())
1390 /// }
1391 /// ```
1392 pub fn byte_headers(&mut self) -> Result<&ByteRecord> {
1393 if self.state.headers.is_none() {
1394 let mut record = ByteRecord::new();
1395 self.read_byte_record_impl(&mut record)?;
1396 self.set_headers_impl(Err(record));
1397 }
1398 Ok(&self.state.headers.as_ref().unwrap().byte_record)
1399 }
1400
1401 /// Set the headers of this CSV parser manually.
1402 ///
1403 /// This overrides any other setting (including `set_byte_headers`). Any
1404 /// automatic detection of headers is disabled. This may be called at any
1405 /// time.
1406 ///
1407 /// # Example
1408 ///
1409 /// ```
1410 /// use std::error::Error;
1411 /// use csv::{Reader, StringRecord};
1412 ///
1413 /// # fn main() { example().unwrap(); }
1414 /// fn example() -> Result<(), Box<dyn Error>> {
1415 /// let data = "\
1416 /// city,country,pop
1417 /// Boston,United States,4628910
1418 /// ";
1419 /// let mut rdr = Reader::from_reader(data.as_bytes());
1420 ///
1421 /// assert_eq!(rdr.headers()?, vec!["city", "country", "pop"]);
1422 /// rdr.set_headers(StringRecord::from(vec!["a", "b", "c"]));
1423 /// assert_eq!(rdr.headers()?, vec!["a", "b", "c"]);
1424 ///
1425 /// Ok(())
1426 /// }
1427 /// ```
1428 pub fn set_headers(&mut self, headers: StringRecord) {
1429 self.set_headers_impl(Ok(headers));
1430 }
1431
1432 /// Set the headers of this CSV parser manually as raw bytes.
1433 ///
1434 /// This overrides any other setting (including `set_headers`). Any
1435 /// automatic detection of headers is disabled. This may be called at any
1436 /// time.
1437 ///
1438 /// # Example
1439 ///
1440 /// ```
1441 /// use std::error::Error;
1442 /// use csv::{Reader, ByteRecord};
1443 ///
1444 /// # fn main() { example().unwrap(); }
1445 /// fn example() -> Result<(), Box<dyn Error>> {
1446 /// let data = "\
1447 /// city,country,pop
1448 /// Boston,United States,4628910
1449 /// ";
1450 /// let mut rdr = Reader::from_reader(data.as_bytes());
1451 ///
1452 /// assert_eq!(rdr.byte_headers()?, vec!["city", "country", "pop"]);
1453 /// rdr.set_byte_headers(ByteRecord::from(vec!["a", "b", "c"]));
1454 /// assert_eq!(rdr.byte_headers()?, vec!["a", "b", "c"]);
1455 ///
1456 /// Ok(())
1457 /// }
1458 /// ```
1459 pub fn set_byte_headers(&mut self, headers: ByteRecord) {
1460 self.set_headers_impl(Err(headers));
1461 }
1462
1463 fn set_headers_impl(
1464 &mut self,
1465 headers: result::Result<StringRecord, ByteRecord>,
1466 ) {
1467 // If we have string headers, then get byte headers. But if we have
1468 // byte headers, then get the string headers (or a UTF-8 error).
1469 let (mut str_headers, mut byte_headers) = match headers {
1470 Ok(string) => {
1471 let bytes = string.clone().into_byte_record();
1472 (Ok(string), bytes)
1473 }
1474 Err(bytes) => {
1475 match StringRecord::from_byte_record(bytes.clone()) {
1476 Ok(str_headers) => (Ok(str_headers), bytes),
1477 Err(err) => (Err(err.utf8_error().clone()), bytes),
1478 }
1479 }
1480 };
1481 if self.state.trim.should_trim_headers() {
1482 if let Ok(ref mut str_headers) = str_headers.as_mut() {
1483 str_headers.trim();
1484 }
1485 byte_headers.trim();
1486 }
1487 self.state.headers = Some(Headers {
1488 byte_record: byte_headers,
1489 string_record: str_headers,
1490 });
1491 }
1492
1493 /// Read a single row into the given record. Returns false when no more
1494 /// records could be read.
1495 ///
1496 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1497 /// default), then this will never read the first record.
1498 ///
1499 /// This method is useful when you want to read records as fast as
1500 /// possible. It's less ergonomic than an iterator, but it permits the
1501 /// caller to reuse the `StringRecord` allocation, which usually results
1502 /// in higher throughput.
1503 ///
1504 /// Records read via this method are guaranteed to have a position set
1505 /// on them, even if the reader is at EOF or if an error is returned.
1506 ///
1507 /// # Example
1508 ///
1509 /// ```
1510 /// use std::error::Error;
1511 /// use csv::{Reader, StringRecord};
1512 ///
1513 /// # fn main() { example().unwrap(); }
1514 /// fn example() -> Result<(), Box<dyn Error>> {
1515 /// let data = "\
1516 /// city,country,pop
1517 /// Boston,United States,4628910
1518 /// ";
1519 /// let mut rdr = Reader::from_reader(data.as_bytes());
1520 /// let mut record = StringRecord::new();
1521 ///
1522 /// if rdr.read_record(&mut record)? {
1523 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1524 /// Ok(())
1525 /// } else {
1526 /// Err(From::from("expected at least one record but got none"))
1527 /// }
1528 /// }
1529 /// ```
1530 pub fn read_record(&mut self, record: &mut StringRecord) -> Result<bool> {
1531 let result = record.read(self);
1532 // We need to trim again because trimming string records includes
1533 // Unicode whitespace. (ByteRecord trimming only includes ASCII
1534 // whitespace.)
1535 if self.state.trim.should_trim_fields() {
1536 record.trim();
1537 }
1538 result
1539 }
1540
1541 /// Read a single row into the given byte record. Returns false when no
1542 /// more records could be read.
1543 ///
1544 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1545 /// default), then this will never read the first record.
1546 ///
1547 /// This method is useful when you want to read records as fast as
1548 /// possible. It's less ergonomic than an iterator, but it permits the
1549 /// caller to reuse the `ByteRecord` allocation, which usually results
1550 /// in higher throughput.
1551 ///
1552 /// Records read via this method are guaranteed to have a position set
1553 /// on them, even if the reader is at EOF or if an error is returned.
1554 ///
1555 /// # Example
1556 ///
1557 /// ```
1558 /// use std::error::Error;
1559 /// use csv::{ByteRecord, Reader};
1560 ///
1561 /// # fn main() { example().unwrap(); }
1562 /// fn example() -> Result<(), Box<dyn Error>> {
1563 /// let data = "\
1564 /// city,country,pop
1565 /// Boston,United States,4628910
1566 /// ";
1567 /// let mut rdr = Reader::from_reader(data.as_bytes());
1568 /// let mut record = ByteRecord::new();
1569 ///
1570 /// if rdr.read_byte_record(&mut record)? {
1571 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1572 /// Ok(())
1573 /// } else {
1574 /// Err(From::from("expected at least one record but got none"))
1575 /// }
1576 /// }
1577 /// ```
1578 pub fn read_byte_record(
1579 &mut self,
1580 record: &mut ByteRecord,
1581 ) -> Result<bool> {
1582 if !self.state.seeked && !self.state.has_headers && !self.state.first {
1583 // If the caller indicated "no headers" and we haven't yielded the
1584 // first record yet, then we should yield our header row if we have
1585 // one.
1586 if let Some(ref headers) = self.state.headers {
1587 self.state.first = true;
1588 record.clone_from(&headers.byte_record);
1589 if self.state.trim.should_trim_fields() {
1590 record.trim();
1591 }
1592 return Ok(!record.is_empty());
1593 }
1594 }
1595 let ok = self.read_byte_record_impl(record)?;
1596 self.state.first = true;
1597 if !self.state.seeked && self.state.headers.is_none() {
1598 self.set_headers_impl(Err(record.clone()));
1599 // If the end user indicated that we have headers, then we should
1600 // never return the first row. Instead, we should attempt to
1601 // read and return the next one.
1602 if self.state.has_headers {
1603 let result = self.read_byte_record_impl(record);
1604 if self.state.trim.should_trim_fields() {
1605 record.trim();
1606 }
1607 return result;
1608 }
1609 }
1610 if self.state.trim.should_trim_fields() {
1611 record.trim();
1612 }
1613 Ok(ok)
1614 }
1615
1616 /// Read a byte record from the underlying CSV reader, without accounting
1617 /// for headers.
1618 #[inline(always)]
1619 fn read_byte_record_impl(
1620 &mut self,
1621 record: &mut ByteRecord,
1622 ) -> Result<bool> {
1623 use csv_core::ReadRecordResult::*;
1624
1625 record.clear();
1626 record.set_position(Some(self.state.cur_pos.clone()));
1627 if self.state.eof != ReaderEofState::NotEof {
1628 return Ok(false);
1629 }
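        // Running totals of how many field bytes (`outlen`) and field end
        // offsets (`endlen`) have been written into `record`, so parsing can
        // resume where it left off after the buffers are grown.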
1630 let (mut outlen, mut endlen) = (0, 0);
1631 loop {
1632 let (res, nin, nout, nend) = {
1633 let input_res = self.rdr.fill_buf();
1634 if input_res.is_err() {
1635 self.state.eof = ReaderEofState::IOError;
1636 }
1637 let input = input_res?;
1638 let (fields, ends) = record.as_parts();
1639 self.core.read_record(
1640 input,
1641 &mut fields[outlen..],
1642 &mut ends[endlen..],
1643 )
1644 };
1645 self.rdr.consume(nin);
1646 let byte = self.state.cur_pos.byte();
1647 self.state
1648 .cur_pos
1649 .set_byte(byte + nin as u64)
1650 .set_line(self.core.line());
1651 outlen += nout;
1652 endlen += nend;
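            // `res` reports why csv-core stopped: it needs more input, needs
            // more room in the field or field-ends buffers, finished a whole
            // record, or reached the end of the data.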
1653 match res {
1654 InputEmpty => continue,
1655 OutputFull => {
1656 record.expand_fields();
1657 continue;
1658 }
1659 OutputEndsFull => {
1660 record.expand_ends();
1661 continue;
1662 }
1663 Record => {
1664 record.set_len(endlen);
1665 self.state.add_record(record)?;
1666 return Ok(true);
1667 }
1668 End => {
1669 self.state.eof = ReaderEofState::Eof;
1670 return Ok(false);
1671 }
1672 }
1673 }
1674 }
1675
1676 /// Return the current position of this CSV reader.
1677 ///
1678 /// The byte offset in the position returned can be used to `seek` this
1679 /// reader. In particular, seeking to a position returned here on the same
1680 /// data will result in parsing the same subsequent record.
1681 ///
1682 /// # Example: reading the position
1683 ///
1684 /// ```
1685 /// use std::{error::Error, io};
1686 /// use csv::{Reader, Position};
1687 ///
1688 /// # fn main() { example().unwrap(); }
1689 /// fn example() -> Result<(), Box<dyn Error>> {
1690 /// let data = "\
1691 /// city,country,popcount
1692 /// Boston,United States,4628910
1693 /// Concord,United States,42695
1694 /// ";
1695 /// let rdr = Reader::from_reader(io::Cursor::new(data));
1696 /// let mut iter = rdr.into_records();
1697 /// let mut pos = Position::new();
1698 /// loop {
1699 /// // Read the position immediately before each record.
1700 /// let next_pos = iter.reader().position().clone();
1701 /// if iter.next().is_none() {
1702 /// break;
1703 /// }
1704 /// pos = next_pos;
1705 /// }
1706 ///
1707 /// // `pos` should now be the position immediately before the last
1708 /// // record.
1709 /// assert_eq!(pos.byte(), 51);
1710 /// assert_eq!(pos.line(), 3);
1711 /// assert_eq!(pos.record(), 2);
1712 /// Ok(())
1713 /// }
1714 /// ```
1715 pub fn position(&self) -> &Position {
1716 &self.state.cur_pos
1717 }
1718
1719 /// Returns true if and only if this reader has been exhausted.
1720 ///
1721 /// When this returns true, no more records can be read from this reader
1722 /// (unless it has been seeked to another position).
1723 ///
1724 /// # Example
1725 ///
1726 /// ```
1727 /// use std::{error::Error, io};
1728 /// use csv::{Reader, Position};
1729 ///
1730 /// # fn main() { example().unwrap(); }
1731 /// fn example() -> Result<(), Box<dyn Error>> {
1732 /// let data = "\
1733 /// city,country,popcount
1734 /// Boston,United States,4628910
1735 /// Concord,United States,42695
1736 /// ";
1737 /// let mut rdr = Reader::from_reader(io::Cursor::new(data));
1738 /// assert!(!rdr.is_done());
1739 /// for result in rdr.records() {
1740 /// let _ = result?;
1741 /// }
1742 /// assert!(rdr.is_done());
1743 /// Ok(())
1744 /// }
1745 /// ```
1746 pub fn is_done(&self) -> bool {
1747 self.state.eof != ReaderEofState::NotEof
1748 }
1749
1750 /// Returns true if and only if this reader has been configured to
1751 /// interpret the first record as a header record.
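    ///
    /// # Example
    ///
    /// A minimal sketch: readers built with default settings treat the
    /// first record as headers, while `has_headers(false)` on the builder
    /// turns that off.
    ///
    /// ```
    /// use csv::ReaderBuilder;
    ///
    /// let with = ReaderBuilder::new().from_reader("a,b\n1,2".as_bytes());
    /// assert!(with.has_headers());
    ///
    /// let without = ReaderBuilder::new()
    ///     .has_headers(false)
    ///     .from_reader("a,b\n1,2".as_bytes());
    /// assert!(!without.has_headers());
    /// ```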
1752 pub fn has_headers(&self) -> bool {
1753 self.state.has_headers
1754 }
1755
1756 /// Returns a reference to the underlying reader.
1757 pub fn get_ref(&self) -> &R {
1758 self.rdr.get_ref()
1759 }
1760
1761 /// Returns a mutable reference to the underlying reader.
1762 pub fn get_mut(&mut self) -> &mut R {
1763 self.rdr.get_mut()
1764 }
1765
1766 /// Unwraps this CSV reader, returning the underlying reader.
1767 ///
1768 /// Note that any leftover data inside this reader's internal buffer is
1769 /// lost.
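    ///
    /// # Example
    ///
    /// A minimal sketch that recovers the wrapped reader after parsing:
    ///
    /// ```
    /// use std::io;
    /// use csv::Reader;
    ///
    /// let mut rdr = Reader::from_reader(io::Cursor::new("a,b\n1,2\n"));
    /// assert_eq!(rdr.records().count(), 1);
    ///
    /// // Only data buffered inside the CSV reader is discarded; the
    /// // underlying cursor itself comes back intact.
    /// let cursor = rdr.into_inner();
    /// assert_eq!(cursor.get_ref(), &"a,b\n1,2\n");
    /// ```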
1770 pub fn into_inner(self) -> R {
1771 self.rdr.into_inner()
1772 }
1773}
1774
1775impl<R: io::Read + io::Seek> Reader<R> {
1776 /// Seeks the underlying reader to the position given.
1777 ///
1778 /// This comes with a few caveats:
1779 ///
1780 /// * Any internal buffer associated with this reader is cleared.
1781 /// * If the given position does not correspond to a position immediately
1782 /// before the start of a record, then the behavior of this reader is
1783 /// unspecified.
1784 /// * Any special logic that skips the first record in the CSV reader
1785 /// when reading or iterating over records is disabled.
1786 ///
1787 /// If the given position has a byte offset equivalent to the current
1788 /// position, then no seeking is performed.
1789 ///
1790 /// If the header row has not already been read, then this will attempt
1791 /// to read the header row before seeking. Therefore, it is possible that
1792 /// this returns an error associated with reading CSV data.
1793 ///
1794 /// Note that seeking is performed based only on the byte offset in the
1795    /// given position. Namely, the record or line numbers in the position may
1796    /// be incorrect, in which case any future positions generated by this CSV
1797    /// reader will be similarly incorrect.
1798 ///
1799 /// # Example: seek to parse a record twice
1800 ///
1801 /// ```
1802 /// use std::{error::Error, io};
1803 /// use csv::{Reader, Position};
1804 ///
1805 /// # fn main() { example().unwrap(); }
1806 /// fn example() -> Result<(), Box<dyn Error>> {
1807 /// let data = "\
1808 /// city,country,popcount
1809 /// Boston,United States,4628910
1810 /// Concord,United States,42695
1811 /// ";
1812 /// let rdr = Reader::from_reader(io::Cursor::new(data));
1813 /// let mut iter = rdr.into_records();
1814 /// let mut pos = Position::new();
1815 /// loop {
1816 /// // Read the position immediately before each record.
1817 /// let next_pos = iter.reader().position().clone();
1818 /// if iter.next().is_none() {
1819 /// break;
1820 /// }
1821 /// pos = next_pos;
1822 /// }
1823 ///
1824 /// // Now seek the reader back to `pos`. This will let us read the
1825 /// // last record again.
1826 /// iter.reader_mut().seek(pos)?;
1827 /// let mut iter = iter.into_reader().into_records();
1828 /// if let Some(result) = iter.next() {
1829 /// let record = result?;
1830 /// assert_eq!(record, vec!["Concord", "United States", "42695"]);
1831 /// Ok(())
1832 /// } else {
1833 /// Err(From::from("expected at least one record but got none"))
1834 /// }
1835 /// }
1836 /// ```
1837 pub fn seek(&mut self, pos: Position) -> Result<()> {
1838 self.byte_headers()?;
1839 self.state.seeked = true;
1840 if pos.byte() == self.state.cur_pos.byte() {
1841 return Ok(());
1842 }
1843 self.rdr.seek(io::SeekFrom::Start(pos.byte()))?;
1844 self.core.reset();
1845 self.core.set_line(pos.line());
1846 self.state.cur_pos = pos;
1847 self.state.eof = ReaderEofState::NotEof;
1848 Ok(())
1849 }
1850
1851 /// This is like `seek`, but provides direct control over how the seeking
1852 /// operation is performed via `io::SeekFrom`.
1853 ///
1854    /// The `pos` position given *should* correspond to the position
1855    /// indicated by `seek_from`, but this is not required. If the `pos` position
1856 /// given is incorrect, then the position information returned by this
1857 /// reader will be similarly incorrect.
1858 ///
1859 /// If the header row has not already been read, then this will attempt
1860 /// to read the header row before seeking. Therefore, it is possible that
1861 /// this returns an error associated with reading CSV data.
1862 ///
1863 /// Unlike `seek`, this will always cause an actual seek to be performed.
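    ///
    /// # Example: rewinding with `seek_raw`
    ///
    /// A minimal sketch that rewinds to the start of the data. As with
    /// `seek`, the logic that skips the header row is disabled afterwards,
    /// so the header row comes back as an ordinary record.
    ///
    /// ```
    /// use std::{error::Error, io};
    /// use csv::{Position, Reader};
    ///
    /// # fn main() { example().unwrap(); }
    /// fn example() -> Result<(), Box<dyn Error>> {
    ///     let data = "\
    /// city,country,popcount
    /// Boston,United States,4628910
    /// ";
    ///     let mut rdr = Reader::from_reader(io::Cursor::new(data));
    ///     assert_eq!(rdr.records().count(), 1);
    ///
    ///     // Rewind to byte 0 and hand the reader a matching position.
    ///     rdr.seek_raw(io::SeekFrom::Start(0), Position::new())?;
    ///     assert_eq!(rdr.records().count(), 2);
    ///     Ok(())
    /// }
    /// ```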
1864 pub fn seek_raw(
1865 &mut self,
1866 seek_from: io::SeekFrom,
1867 pos: Position,
1868 ) -> Result<()> {
1869 self.byte_headers()?;
1870 self.state.seeked = true;
1871 self.rdr.seek(seek_from)?;
1872 self.core.reset();
1873 self.core.set_line(pos.line());
1874 self.state.cur_pos = pos;
1875 self.state.eof = ReaderEofState::NotEof;
1876 Ok(())
1877 }
1878}
1879
1880impl ReaderState {
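    /// Bookkeeping performed after each parsed record: bump the record
    /// number in the current position and, unless the reader is `flexible`,
    /// check that the record has the same number of fields as the first
    /// record seen.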
1881 #[inline(always)]
1882 fn add_record(&mut self, record: &ByteRecord) -> Result<()> {
1883 let i = self.cur_pos.record();
1884 self.cur_pos.set_record(i.checked_add(1).unwrap());
1885 if !self.flexible {
1886 match self.first_field_count {
1887 None => self.first_field_count = Some(record.len() as u64),
1888 Some(expected) => {
1889 if record.len() as u64 != expected {
1890 return Err(Error::new(ErrorKind::UnequalLengths {
1891 pos: record.position().map(Clone::clone),
1892 expected_len: expected,
1893 len: record.len() as u64,
1894 }));
1895 }
1896 }
1897 }
1898 }
1899 Ok(())
1900 }
1901}
1902
1903/// An owned iterator over deserialized records.
1904///
1905/// The type parameter `R` refers to the underlying `io::Read` type, and `D`
1906/// refers to the type that this iterator will deserialize a record into.
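///
/// # Example
///
/// A minimal sketch using `Reader::into_deserialize` with a tuple target,
/// so no serde derive is needed:
///
/// ```
/// use std::error::Error;
/// use csv::Reader;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<(), Box<dyn Error>> {
///     let data = "\
/// city,popcount
/// Boston,4628910
/// ";
///     let rdr = Reader::from_reader(data.as_bytes());
///     let rows = rdr
///         .into_deserialize::<(String, u64)>()
///         .collect::<Result<Vec<_>, csv::Error>>()?;
///     assert_eq!(rows, vec![("Boston".to_string(), 4628910)]);
///     Ok(())
/// }
/// ```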
1907pub struct DeserializeRecordsIntoIter<R, D> {
1908 rdr: Reader<R>,
1909 rec: StringRecord,
1910 headers: Option<StringRecord>,
1911 _priv: PhantomData<D>,
1912}
1913
1914impl<R: io::Read, D: DeserializeOwned> DeserializeRecordsIntoIter<R, D> {
1915 fn new(mut rdr: Reader<R>) -> DeserializeRecordsIntoIter<R, D> {
1916 let headers = if !rdr.state.has_headers {
1917 None
1918 } else {
1919 rdr.headers().ok().map(Clone::clone)
1920 };
1921 DeserializeRecordsIntoIter {
1922 rdr,
1923 rec: StringRecord::new(),
1924 headers,
1925 _priv: PhantomData,
1926 }
1927 }
1928
1929 /// Return a reference to the underlying CSV reader.
1930 pub fn reader(&self) -> &Reader<R> {
1931 &self.rdr
1932 }
1933
1934 /// Return a mutable reference to the underlying CSV reader.
1935 pub fn reader_mut(&mut self) -> &mut Reader<R> {
1936 &mut self.rdr
1937 }
1938
1939 /// Drop this iterator and return the underlying CSV reader.
1940 pub fn into_reader(self) -> Reader<R> {
1941 self.rdr
1942 }
1943}
1944
1945impl<R: io::Read, D: DeserializeOwned> Iterator
1946 for DeserializeRecordsIntoIter<R, D>
1947{
1948 type Item = Result<D>;
1949
1950 fn next(&mut self) -> Option<Result<D>> {
1951 match self.rdr.read_record(&mut self.rec) {
1952 Err(err) => Some(Err(err)),
1953 Ok(false) => None,
1954 Ok(true) => Some(self.rec.deserialize(self.headers.as_ref())),
1955 }
1956 }
1957}
1958
1959/// A borrowed iterator over deserialized records.
1960///
1961/// The lifetime parameter `'r` refers to the lifetime of the underlying
1962/// CSV `Reader`. The type parameter `R` refers to the underlying `io::Read`
1963/// type, and `D` refers to the type that this iterator will deserialize a
1964/// record into.
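///
/// # Example
///
/// A minimal sketch using the borrowing `Reader::deserialize` method, again
/// with a tuple target:
///
/// ```
/// use std::error::Error;
/// use csv::Reader;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<(), Box<dyn Error>> {
///     let data = "\
/// city,popcount
/// Concord,42695
/// ";
///     let mut rdr = Reader::from_reader(data.as_bytes());
///     for row in rdr.deserialize() {
///         let (city, pop): (String, u64) = row?;
///         println!("{}: {}", city, pop);
///     }
///     Ok(())
/// }
/// ```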
1965pub struct DeserializeRecordsIter<'r, R: 'r, D> {
1966 rdr: &'r mut Reader<R>,
1967 rec: StringRecord,
1968 headers: Option<StringRecord>,
1969 _priv: PhantomData<D>,
1970}
1971
1972impl<'r, R: io::Read, D: DeserializeOwned> DeserializeRecordsIter<'r, R, D> {
1973 fn new(rdr: &'r mut Reader<R>) -> DeserializeRecordsIter<'r, R, D> {
1974 let headers = if !rdr.state.has_headers {
1975 None
1976 } else {
1977 rdr.headers().ok().map(Clone::clone)
1978 };
1979 DeserializeRecordsIter {
1980 rdr,
1981 rec: StringRecord::new(),
1982 headers,
1983 _priv: PhantomData,
1984 }
1985 }
1986
1987 /// Return a reference to the underlying CSV reader.
1988 pub fn reader(&self) -> &Reader<R> {
1989 &self.rdr
1990 }
1991
1992 /// Return a mutable reference to the underlying CSV reader.
1993 pub fn reader_mut(&mut self) -> &mut Reader<R> {
1994 &mut self.rdr
1995 }
1996}
1997
1998impl<'r, R: io::Read, D: DeserializeOwned> Iterator
1999 for DeserializeRecordsIter<'r, R, D>
2000{
2001 type Item = Result<D>;
2002
2003 fn next(&mut self) -> Option<Result<D>> {
2004 match self.rdr.read_record(&mut self.rec) {
2005 Err(err) => Some(Err(err)),
2006 Ok(false) => None,
2007 Ok(true) => Some(self.rec.deserialize(self.headers.as_ref())),
2008 }
2009 }
2010}
2011
2012/// An owned iterator over records as strings.
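///
/// # Example
///
/// A minimal sketch built with `Reader::into_records`:
///
/// ```
/// use csv::Reader;
///
/// let data = "city,popcount\nBoston,4628910\n";
/// let mut iter = Reader::from_reader(data.as_bytes()).into_records();
/// let record = iter.next().unwrap().unwrap();
/// assert_eq!(record, vec!["Boston", "4628910"]);
/// assert!(iter.next().is_none());
/// ```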
2013pub struct StringRecordsIntoIter<R> {
2014 rdr: Reader<R>,
2015 rec: StringRecord,
2016}
2017
2018impl<R: io::Read> StringRecordsIntoIter<R> {
2019 fn new(rdr: Reader<R>) -> StringRecordsIntoIter<R> {
2020 StringRecordsIntoIter { rdr, rec: StringRecord::new() }
2021 }
2022
2023 /// Return a reference to the underlying CSV reader.
2024 pub fn reader(&self) -> &Reader<R> {
2025 &self.rdr
2026 }
2027
2028 /// Return a mutable reference to the underlying CSV reader.
2029 pub fn reader_mut(&mut self) -> &mut Reader<R> {
2030 &mut self.rdr
2031 }
2032
2033 /// Drop this iterator and return the underlying CSV reader.
2034 pub fn into_reader(self) -> Reader<R> {
2035 self.rdr
2036 }
2037}
2038
2039impl<R: io::Read> Iterator for StringRecordsIntoIter<R> {
2040 type Item = Result<StringRecord>;
2041
2042 fn next(&mut self) -> Option<Result<StringRecord>> {
2043 match self.rdr.read_record(&mut self.rec) {
2044 Err(err) => Some(Err(err)),
2045 Ok(true) => Some(Ok(self.rec.clone_truncated())),
2046 Ok(false) => None,
2047 }
2048 }
2049}
2050
2051/// A borrowed iterator over records as strings.
2052///
2053/// The lifetime parameter `'r` refers to the lifetime of the underlying
2054/// CSV `Reader`.
2055pub struct StringRecordsIter<'r, R: 'r> {
2056 rdr: &'r mut Reader<R>,
2057 rec: StringRecord,
2058}
2059
2060impl<'r, R: io::Read> StringRecordsIter<'r, R> {
2061 fn new(rdr: &'r mut Reader<R>) -> StringRecordsIter<'r, R> {
2062 StringRecordsIter { rdr, rec: StringRecord::new() }
2063 }
2064
2065 /// Return a reference to the underlying CSV reader.
2066 pub fn reader(&self) -> &Reader<R> {
2067 &self.rdr
2068 }
2069
2070 /// Return a mutable reference to the underlying CSV reader.
2071 pub fn reader_mut(&mut self) -> &mut Reader<R> {
2072 &mut self.rdr
2073 }
2074}
2075
2076impl<'r, R: io::Read> Iterator for StringRecordsIter<'r, R> {
2077 type Item = Result<StringRecord>;
2078
2079 fn next(&mut self) -> Option<Result<StringRecord>> {
2080 match self.rdr.read_record(&mut self.rec) {
2081 Err(err) => Some(Err(err)),
2082 Ok(true) => Some(Ok(self.rec.clone_truncated())),
2083 Ok(false) => None,
2084 }
2085 }
2086}
2087
2088/// An owned iterator over records as raw bytes.
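///
/// # Example
///
/// A minimal sketch built with `Reader::into_byte_records`; byte records
/// skip UTF-8 validation, so the non-UTF-8 field below is fine:
///
/// ```
/// use csv::Reader;
///
/// let data: &[u8] = b"key,value\nfoo,\xFF\n";
/// let mut iter = Reader::from_reader(data).into_byte_records();
/// let record = iter.next().unwrap().unwrap();
/// assert_eq!(b"\xFF", &record[1]);
/// assert!(iter.next().is_none());
/// ```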
2089pub struct ByteRecordsIntoIter<R> {
2090 rdr: Reader<R>,
2091 rec: ByteRecord,
2092}
2093
2094impl<R: io::Read> ByteRecordsIntoIter<R> {
2095 fn new(rdr: Reader<R>) -> ByteRecordsIntoIter<R> {
2096 ByteRecordsIntoIter { rdr, rec: ByteRecord::new() }
2097 }
2098
2099 /// Return a reference to the underlying CSV reader.
2100 pub fn reader(&self) -> &Reader<R> {
2101 &self.rdr
2102 }
2103
2104 /// Return a mutable reference to the underlying CSV reader.
2105 pub fn reader_mut(&mut self) -> &mut Reader<R> {
2106 &mut self.rdr
2107 }
2108
2109 /// Drop this iterator and return the underlying CSV reader.
2110 pub fn into_reader(self) -> Reader<R> {
2111 self.rdr
2112 }
2113}
2114
2115impl<R: io::Read> Iterator for ByteRecordsIntoIter<R> {
2116 type Item = Result<ByteRecord>;
2117
2118 fn next(&mut self) -> Option<Result<ByteRecord>> {
2119 match self.rdr.read_byte_record(&mut self.rec) {
2120 Err(err) => Some(Err(err)),
2121 Ok(true) => Some(Ok(self.rec.clone_truncated())),
2122 Ok(false) => None,
2123 }
2124 }
2125}
2126
2127/// A borrowed iterator over records as raw bytes.
2128///
2129/// The lifetime parameter `'r` refers to the lifetime of the underlying
2130/// CSV `Reader`.
2131pub struct ByteRecordsIter<'r, R: 'r> {
2132 rdr: &'r mut Reader<R>,
2133 rec: ByteRecord,
2134}
2135
2136impl<'r, R: io::Read> ByteRecordsIter<'r, R> {
2137 fn new(rdr: &'r mut Reader<R>) -> ByteRecordsIter<'r, R> {
2138 ByteRecordsIter { rdr, rec: ByteRecord::new() }
2139 }
2140
2141 /// Return a reference to the underlying CSV reader.
2142 pub fn reader(&self) -> &Reader<R> {
2143 &self.rdr
2144 }
2145
2146 /// Return a mutable reference to the underlying CSV reader.
2147 pub fn reader_mut(&mut self) -> &mut Reader<R> {
2148 &mut self.rdr
2149 }
2150}
2151
2152impl<'r, R: io::Read> Iterator for ByteRecordsIter<'r, R> {
2153 type Item = Result<ByteRecord>;
2154
2155 fn next(&mut self) -> Option<Result<ByteRecord>> {
2156 match self.rdr.read_byte_record(&mut self.rec) {
2157 Err(err) => Some(Err(err)),
2158 Ok(true) => Some(Ok(self.rec.clone_truncated())),
2159 Ok(false) => None,
2160 }
2161 }
2162}
2163
2164#[cfg(test)]
2165mod tests {
2166 use std::io;
2167
2168 use crate::{
2169 byte_record::ByteRecord, error::ErrorKind, string_record::StringRecord,
2170 };
2171
2172 use super::{Position, ReaderBuilder, Trim};
2173
2174 fn b(s: &str) -> &[u8] {
2175 s.as_bytes()
2176 }
2177 fn s(b: &[u8]) -> &str {
2178 ::std::str::from_utf8(b).unwrap()
2179 }
2180
2181 fn newpos(byte: u64, line: u64, record: u64) -> Position {
2182 let mut p = Position::new();
2183 p.set_byte(byte).set_line(line).set_record(record);
2184 p
2185 }
2186
2187 #[test]
2188 fn read_byte_record() {
2189 let data = b("foo,\"b,ar\",baz\nabc,mno,xyz");
2190 let mut rdr =
2191 ReaderBuilder::new().has_headers(false).from_reader(data);
2192 let mut rec = ByteRecord::new();
2193
2194 assert!(rdr.read_byte_record(&mut rec).unwrap());
2195 assert_eq!(3, rec.len());
2196 assert_eq!("foo", s(&rec[0]));
2197 assert_eq!("b,ar", s(&rec[1]));
2198 assert_eq!("baz", s(&rec[2]));
2199
2200 assert!(rdr.read_byte_record(&mut rec).unwrap());
2201 assert_eq!(3, rec.len());
2202 assert_eq!("abc", s(&rec[0]));
2203 assert_eq!("mno", s(&rec[1]));
2204 assert_eq!("xyz", s(&rec[2]));
2205
2206 assert!(!rdr.read_byte_record(&mut rec).unwrap());
2207 }
2208
2209 #[test]
2210 fn read_trimmed_records_and_headers() {
2211 let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t");
2212 let mut rdr = ReaderBuilder::new()
2213 .has_headers(true)
2214 .trim(Trim::All)
2215 .from_reader(data);
2216 let mut rec = ByteRecord::new();
2217 assert!(rdr.read_byte_record(&mut rec).unwrap());
2218 assert_eq!("1", s(&rec[0]));
2219 assert_eq!("2", s(&rec[1]));
2220 assert_eq!("3", s(&rec[2]));
2221 let mut rec = StringRecord::new();
2222 assert!(rdr.read_record(&mut rec).unwrap());
2223 assert_eq!("1", &rec[0]);
2224 assert_eq!("", &rec[1]);
2225 assert_eq!("3", &rec[2]);
2226 {
2227 let headers = rdr.headers().unwrap();
2228 assert_eq!(3, headers.len());
2229 assert_eq!("foo", &headers[0]);
2230 assert_eq!("bar", &headers[1]);
2231 assert_eq!("baz", &headers[2]);
2232 }
2233 }
2234
2235 #[test]
2236 fn read_trimmed_header() {
2237 let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t");
2238 let mut rdr = ReaderBuilder::new()
2239 .has_headers(true)
2240 .trim(Trim::Headers)
2241 .from_reader(data);
2242 let mut rec = ByteRecord::new();
2243 assert!(rdr.read_byte_record(&mut rec).unwrap());
2244 assert_eq!(" 1", s(&rec[0]));
2245 assert_eq!(" 2", s(&rec[1]));
2246 assert_eq!(" 3", s(&rec[2]));
2247 {
2248 let headers = rdr.headers().unwrap();
2249 assert_eq!(3, headers.len());
2250 assert_eq!("foo", &headers[0]);
2251 assert_eq!("bar", &headers[1]);
2252 assert_eq!("baz", &headers[2]);
2253 }
2254 }
2255
2256 #[test]
2257    fn read_trimmed_header_invalid_utf8() {
2258 let data = &b"foo, b\xFFar,\tbaz\na,b,c\nd,e,f"[..];
2259 let mut rdr = ReaderBuilder::new()
2260 .has_headers(true)
2261 .trim(Trim::Headers)
2262 .from_reader(data);
2263 let mut rec = StringRecord::new();
2264
2265        // Force the headers to be read.
2266        let _ = rdr.read_record(&mut rec);
2267        // Check that the byte headers are trimmed.
2268 {
2269 let headers = rdr.byte_headers().unwrap();
2270 assert_eq!(3, headers.len());
2271 assert_eq!(b"foo", &headers[0]);
2272 assert_eq!(b"b\xFFar", &headers[1]);
2273 assert_eq!(b"baz", &headers[2]);
2274 }
2275 match *rdr.headers().unwrap_err().kind() {
2276 ErrorKind::Utf8 { pos: Some(ref pos), ref err } => {
2277 assert_eq!(pos, &newpos(0, 1, 0));
2278 assert_eq!(err.field(), 1);
2279 assert_eq!(err.valid_up_to(), 3);
2280 }
2281 ref err => panic!("match failed, got {:?}", err),
2282 }
2283 }
2284
2285 #[test]
2286 fn read_trimmed_records() {
2287 let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t");
2288 let mut rdr = ReaderBuilder::new()
2289 .has_headers(true)
2290 .trim(Trim::Fields)
2291 .from_reader(data);
2292 let mut rec = ByteRecord::new();
2293 assert!(rdr.read_byte_record(&mut rec).unwrap());
2294 assert_eq!("1", s(&rec[0]));
2295 assert_eq!("2", s(&rec[1]));
2296 assert_eq!("3", s(&rec[2]));
2297 {
2298 let headers = rdr.headers().unwrap();
2299 assert_eq!(3, headers.len());
2300 assert_eq!("foo", &headers[0]);
2301 assert_eq!(" bar", &headers[1]);
2302 assert_eq!("\tbaz", &headers[2]);
2303 }
2304 }
2305
2306 #[test]
2307 fn read_trimmed_records_without_headers() {
2308 let data = b("a1, b1\t,\t c1\t\n");
2309 let mut rdr = ReaderBuilder::new()
2310 .has_headers(false)
2311 .trim(Trim::All)
2312 .from_reader(data);
2313 let mut rec = ByteRecord::new();
2314 assert!(rdr.read_byte_record(&mut rec).unwrap());
2315 assert_eq!("a1", s(&rec[0]));
2316 assert_eq!("b1", s(&rec[1]));
2317 assert_eq!("c1", s(&rec[2]));
2318 }
2319
2320 #[test]
2321 fn read_record_unequal_fails() {
2322 let data = b("foo\nbar,baz");
2323 let mut rdr =
2324 ReaderBuilder::new().has_headers(false).from_reader(data);
2325 let mut rec = ByteRecord::new();
2326
2327 assert!(rdr.read_byte_record(&mut rec).unwrap());
2328 assert_eq!(1, rec.len());
2329 assert_eq!("foo", s(&rec[0]));
2330
2331 match rdr.read_byte_record(&mut rec) {
2332 Err(err) => match *err.kind() {
2333 ErrorKind::UnequalLengths {
2334 expected_len: 1,
2335 ref pos,
2336 len: 2,
2337 } => {
2338 assert_eq!(pos, &Some(newpos(4, 2, 1)));
2339 }
2340 ref wrong => panic!("match failed, got {:?}", wrong),
2341 },
2342 wrong => panic!("match failed, got {:?}", wrong),
2343 }
2344 }
2345
2346 #[test]
2347 fn read_record_unequal_ok() {
2348 let data = b("foo\nbar,baz");
2349 let mut rdr = ReaderBuilder::new()
2350 .has_headers(false)
2351 .flexible(true)
2352 .from_reader(data);
2353 let mut rec = ByteRecord::new();
2354
2355 assert!(rdr.read_byte_record(&mut rec).unwrap());
2356 assert_eq!(1, rec.len());
2357 assert_eq!("foo", s(&rec[0]));
2358
2359 assert!(rdr.read_byte_record(&mut rec).unwrap());
2360 assert_eq!(2, rec.len());
2361 assert_eq!("bar", s(&rec[0]));
2362 assert_eq!("baz", s(&rec[1]));
2363
2364 assert!(!rdr.read_byte_record(&mut rec).unwrap());
2365 }
2366
2367 // This tests that even if we get a CSV error, we can continue reading
2368 // if we want.
2369 #[test]
2370 fn read_record_unequal_continue() {
2371 let data = b("foo\nbar,baz\nquux");
2372 let mut rdr =
2373 ReaderBuilder::new().has_headers(false).from_reader(data);
2374 let mut rec = ByteRecord::new();
2375
2376 assert!(rdr.read_byte_record(&mut rec).unwrap());
2377 assert_eq!(1, rec.len());
2378 assert_eq!("foo", s(&rec[0]));
2379
2380 match rdr.read_byte_record(&mut rec) {
2381 Err(err) => match err.kind() {
2382 &ErrorKind::UnequalLengths {
2383 expected_len: 1,
2384 ref pos,
2385 len: 2,
2386 } => {
2387 assert_eq!(pos, &Some(newpos(4, 2, 1)));
2388 }
2389 wrong => panic!("match failed, got {:?}", wrong),
2390 },
2391 wrong => panic!("match failed, got {:?}", wrong),
2392 }
2393
2394 assert!(rdr.read_byte_record(&mut rec).unwrap());
2395 assert_eq!(1, rec.len());
2396 assert_eq!("quux", s(&rec[0]));
2397
2398 assert!(!rdr.read_byte_record(&mut rec).unwrap());
2399 }
2400
2401 #[test]
2402 fn read_record_headers() {
2403 let data = b("foo,bar,baz\na,b,c\nd,e,f");
2404 let mut rdr = ReaderBuilder::new().has_headers(true).from_reader(data);
2405 let mut rec = StringRecord::new();
2406
2407 assert!(rdr.read_record(&mut rec).unwrap());
2408 assert_eq!(3, rec.len());
2409 assert_eq!("a", &rec[0]);
2410
2411 assert!(rdr.read_record(&mut rec).unwrap());
2412 assert_eq!(3, rec.len());
2413 assert_eq!("d", &rec[0]);
2414
2415 assert!(!rdr.read_record(&mut rec).unwrap());
2416
2417 {
2418 let headers = rdr.byte_headers().unwrap();
2419 assert_eq!(3, headers.len());
2420 assert_eq!(b"foo", &headers[0]);
2421 assert_eq!(b"bar", &headers[1]);
2422 assert_eq!(b"baz", &headers[2]);
2423 }
2424 {
2425 let headers = rdr.headers().unwrap();
2426 assert_eq!(3, headers.len());
2427 assert_eq!("foo", &headers[0]);
2428 assert_eq!("bar", &headers[1]);
2429 assert_eq!("baz", &headers[2]);
2430 }
2431 }
2432
2433 #[test]
2434 fn read_record_headers_invalid_utf8() {
2435 let data = &b"foo,b\xFFar,baz\na,b,c\nd,e,f"[..];
2436 let mut rdr = ReaderBuilder::new().has_headers(true).from_reader(data);
2437 let mut rec = StringRecord::new();
2438
2439 assert!(rdr.read_record(&mut rec).unwrap());
2440 assert_eq!(3, rec.len());
2441 assert_eq!("a", &rec[0]);
2442
2443 assert!(rdr.read_record(&mut rec).unwrap());
2444 assert_eq!(3, rec.len());
2445 assert_eq!("d", &rec[0]);
2446
2447 assert!(!rdr.read_record(&mut rec).unwrap());
2448
2449 // Check that we can read the headers as raw bytes, but that
2450 // if we read them as strings, we get an appropriate UTF-8 error.
2451 {
2452 let headers = rdr.byte_headers().unwrap();
2453 assert_eq!(3, headers.len());
2454 assert_eq!(b"foo", &headers[0]);
2455 assert_eq!(b"b\xFFar", &headers[1]);
2456 assert_eq!(b"baz", &headers[2]);
2457 }
2458 match *rdr.headers().unwrap_err().kind() {
2459 ErrorKind::Utf8 { pos: Some(ref pos), ref err } => {
2460 assert_eq!(pos, &newpos(0, 1, 0));
2461 assert_eq!(err.field(), 1);
2462 assert_eq!(err.valid_up_to(), 1);
2463 }
2464 ref err => panic!("match failed, got {:?}", err),
2465 }
2466 }
2467
2468 #[test]
2469 fn read_record_no_headers_before() {
2470 let data = b("foo,bar,baz\na,b,c\nd,e,f");
2471 let mut rdr =
2472 ReaderBuilder::new().has_headers(false).from_reader(data);
2473 let mut rec = StringRecord::new();
2474
2475 {
2476 let headers = rdr.headers().unwrap();
2477 assert_eq!(3, headers.len());
2478 assert_eq!("foo", &headers[0]);
2479 assert_eq!("bar", &headers[1]);
2480 assert_eq!("baz", &headers[2]);
2481 }
2482
2483 assert!(rdr.read_record(&mut rec).unwrap());
2484 assert_eq!(3, rec.len());
2485 assert_eq!("foo", &rec[0]);
2486
2487 assert!(rdr.read_record(&mut rec).unwrap());
2488 assert_eq!(3, rec.len());
2489 assert_eq!("a", &rec[0]);
2490
2491 assert!(rdr.read_record(&mut rec).unwrap());
2492 assert_eq!(3, rec.len());
2493 assert_eq!("d", &rec[0]);
2494
2495 assert!(!rdr.read_record(&mut rec).unwrap());
2496 }
2497
2498 #[test]
2499 fn read_record_no_headers_after() {
2500 let data = b("foo,bar,baz\na,b,c\nd,e,f");
2501 let mut rdr =
2502 ReaderBuilder::new().has_headers(false).from_reader(data);
2503 let mut rec = StringRecord::new();
2504
2505 assert!(rdr.read_record(&mut rec).unwrap());
2506 assert_eq!(3, rec.len());
2507 assert_eq!("foo", &rec[0]);
2508
2509 assert!(rdr.read_record(&mut rec).unwrap());
2510 assert_eq!(3, rec.len());
2511 assert_eq!("a", &rec[0]);
2512
2513 assert!(rdr.read_record(&mut rec).unwrap());
2514 assert_eq!(3, rec.len());
2515 assert_eq!("d", &rec[0]);
2516
2517 assert!(!rdr.read_record(&mut rec).unwrap());
2518
2519 let headers = rdr.headers().unwrap();
2520 assert_eq!(3, headers.len());
2521 assert_eq!("foo", &headers[0]);
2522 assert_eq!("bar", &headers[1]);
2523 assert_eq!("baz", &headers[2]);
2524 }
2525
2526 #[test]
2527 fn seek() {
2528 let data = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
2529 let mut rdr = ReaderBuilder::new().from_reader(io::Cursor::new(data));
2530 rdr.seek(newpos(18, 3, 2)).unwrap();
2531
2532 let mut rec = StringRecord::new();
2533
2534 assert_eq!(18, rdr.position().byte());
2535 assert!(rdr.read_record(&mut rec).unwrap());
2536 assert_eq!(3, rec.len());
2537 assert_eq!("d", &rec[0]);
2538
2539 assert_eq!(24, rdr.position().byte());
2540 assert_eq!(4, rdr.position().line());
2541 assert_eq!(3, rdr.position().record());
2542 assert!(rdr.read_record(&mut rec).unwrap());
2543 assert_eq!(3, rec.len());
2544 assert_eq!("g", &rec[0]);
2545
2546 assert!(!rdr.read_record(&mut rec).unwrap());
2547 }
2548
2549 // Test that we can read headers after seeking even if the headers weren't
2550    // explicitly read before seeking.
2551 #[test]
2552 fn seek_headers_after() {
2553 let data = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
2554 let mut rdr = ReaderBuilder::new().from_reader(io::Cursor::new(data));
2555 rdr.seek(newpos(18, 3, 2)).unwrap();
2556 assert_eq!(rdr.headers().unwrap(), vec!["foo", "bar", "baz"]);
2557 }
2558
2559 // Test that we can read headers after seeking if the headers were read
2560 // before seeking.
2561 #[test]
2562 fn seek_headers_before_after() {
2563 let data = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
2564 let mut rdr = ReaderBuilder::new().from_reader(io::Cursor::new(data));
2565 let headers = rdr.headers().unwrap().clone();
2566 rdr.seek(newpos(18, 3, 2)).unwrap();
2567 assert_eq!(&headers, rdr.headers().unwrap());
2568 }
2569
2570 // Test that even if we didn't read headers before seeking, if we seek to
2571 // the current byte offset, then no seeking is done and therefore we can
2572 // still read headers after seeking.
2573 #[test]
2574 fn seek_headers_no_actual_seek() {
2575 let data = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
2576 let mut rdr = ReaderBuilder::new().from_reader(io::Cursor::new(data));
2577 rdr.seek(Position::new()).unwrap();
2578 assert_eq!("foo", &rdr.headers().unwrap()[0]);
2579 }
2580
2581 // Test that position info is reported correctly in absence of headers.
2582    // Test that position info is reported correctly in the absence of headers.
2583 fn positions_no_headers() {
2584 let mut rdr = ReaderBuilder::new()
2585 .has_headers(false)
2586 .from_reader("a,b,c\nx,y,z".as_bytes())
2587 .into_records();
2588
2589 let pos = rdr.next().unwrap().unwrap().position().unwrap().clone();
2590 assert_eq!(pos.byte(), 0);
2591 assert_eq!(pos.line(), 1);
2592 assert_eq!(pos.record(), 0);
2593
2594 let pos = rdr.next().unwrap().unwrap().position().unwrap().clone();
2595 assert_eq!(pos.byte(), 6);
2596 assert_eq!(pos.line(), 2);
2597 assert_eq!(pos.record(), 1);
2598 }
2599
2600 // Test that position info is reported correctly with headers.
2601 #[test]
2602 fn positions_headers() {
2603 let mut rdr = ReaderBuilder::new()
2604 .has_headers(true)
2605 .from_reader("a,b,c\nx,y,z".as_bytes())
2606 .into_records();
2607
2608 let pos = rdr.next().unwrap().unwrap().position().unwrap().clone();
2609 assert_eq!(pos.byte(), 6);
2610 assert_eq!(pos.line(), 2);
2611 assert_eq!(pos.record(), 1);
2612 }
2613
2614 // Test that reading headers on empty data yields an empty record.
2615 #[test]
2616 fn headers_on_empty_data() {
2617 let mut rdr = ReaderBuilder::new().from_reader("".as_bytes());
2618 let r = rdr.byte_headers().unwrap();
2619 assert_eq!(r.len(), 0);
2620 }
2621
2622 // Test that reading the first record on empty data works.
2623 #[test]
2624 fn no_headers_on_empty_data() {
2625 let mut rdr =
2626 ReaderBuilder::new().has_headers(false).from_reader("".as_bytes());
2627 assert_eq!(rdr.records().count(), 0);
2628 }
2629
2630 // Test that reading the first record on empty data works, even if
2631    // we've tried to read headers beforehand.
2632 #[test]
2633 fn no_headers_on_empty_data_after_headers() {
2634 let mut rdr =
2635 ReaderBuilder::new().has_headers(false).from_reader("".as_bytes());
2636 assert_eq!(rdr.headers().unwrap().len(), 0);
2637 assert_eq!(rdr.records().count(), 0);
2638 }
2639}