csv_core/
reader.rs

1use core::fmt;
2
3use crate::Terminator;
4
5// BE ADVISED
6//
7// This may just be one of the more complicated CSV parsers you'll come across.
8// The implementation never allocates and consists of both a functional NFA
9// parser and a DFA parser. The DFA parser is the work horse and we could elide
10// much of the work involved in making the NFA parser work, but the NFA parser
11// is much easier to debug. The NFA parser is tested alongside the DFA parser,
12// so they should never be out of sync.
13//
14// The basic structure of the implementation is to encode the NFA parser as
15// an explicit state machine in code. The DFA is then generated by populating
16// a transition table on the stack by exhaustively enumerating all possible
17// states on all possible inputs (this is possible because the number of states
18// and the number of inputs is very small).
19//
20// Note that some pieces of the NFA parser (such as the NFA state machine) are
21// required. In particular, the translation from the NFA to the DFA depends on
22// the configuration of the CSV parser as given by the caller, and indeed, this
23// is one of the key performance benefits of the DFA: it doesn't have any
24// overhead (other than a bigger transition table) associated with the number
25// of configuration options.
26//
27// ADVICE FOR HACKERS
28//
29// This code is too clever for its own good. As such, changes to some parts of
30// the code may have a non-obvious impact on other parts. This is mostly
31// motivated by trying to keep the DFA transition table as small as possible,
32// since it is stored on the stack. Here are some tips that may save you some
33// time:
34//
35// * If you add a new NFA state, then you also need to consider how it impacts
36//   the DFA. If all of the incoming transitions into an NFA state are
37//   epsilon transitions, then it probably isn't materialized in the DFA.
38//   If the NFA state indicates that a field or a record has been parsed, then
39//   it should be considered final. Let the comments in `NfaState` be your
40//   guide.
41// * If you add a new configuration knob to the parser, then you may need to
42//   modify the `TRANS_CLASSES` constant below. The `TRANS_CLASSES` constant
43//   indicates the total number of discriminating bytes in the DFA. And if you
44//   modify `TRANS_CLASSES`, you probably also need to modify `build_dfa` to
45//   add a new class. For example, in order to add parsing support for
46//   comments, I bumped `TRANS_CLASSES` from `6` to `7` and added the comment
47//   byte (if one exists) to the list of classes in `build_dfa`.
48// * The special DFA start state doubles as the final state once all input
49//   from the caller has been exhausted. We must be careful to guard this
50//   case analysis on whether the input is actually exhausted, since the start
51//   state is an otherwise valid state.
52
53/// A pull based CSV reader.
54///
55/// This reader parses CSV data using a finite state machine. Callers can
56/// extract parsed data incrementally using one of the `read` methods.
57///
58/// Note that this CSV reader is somewhat encoding agnostic. The source data
59/// needs to be at least ASCII compatible. There is no support for specifying
60/// the full gamut of Unicode delimiters/terminators/quotes/escapes. Instead,
61/// any byte can be used, although callers probably want to stick to the ASCII
62/// subset (`<= 0x7F`).
63///
64/// # Usage
65///
66/// A reader has two different ways to read CSV data, each with their own
67/// trade offs.
68///
69/// * `read_field` - Copies a single CSV field into an output buffer while
70///   unescaping quotes. This is simple to use and doesn't require storing an
71///   entire record contiguously in memory, but it is slower.
72/// * `read_record` - Copies an entire CSV record into an output buffer while
73///   unescaping quotes. The ending positions of each field are copied into
74///   an additional buffer. This is harder to use and requires larger output
75///   buffers, but it is faster than `read_field` since it amortizes more
76///   costs.
77///
78/// # RFC 4180
79///
80/// [RFC 4180](https://tools.ietf.org/html/rfc4180)
81/// is the closest thing to a specification for CSV data. Unfortunately,
82/// CSV data that is seen in the wild can vary significantly. Often, the CSV
83/// data is outright invalid. Instead of fixing the producers of bad CSV data,
84/// we have seen fit to make consumers much more flexible in what they accept.
85/// This reader continues that tradition, and therefore, isn't technically
86/// compliant with RFC 4180. In particular, this reader will never return an
87/// error and will always find *a* parse.
88///
89/// Here are some detailed differences from RFC 4180:
90///
91/// * CRLF, LF and CR are each treated as a single record terminator by
92///   default.
93/// * Records are permitted to be of varying length.
94/// * Empty lines (that do not include other whitespace) are ignored.
#[derive(Clone, Debug)]
pub struct Reader {
    /// A table-based DFA for parsing CSV.
    dfa: Dfa,
    /// The current DFA state, if the DFA is used.
    dfa_state: DfaState,
    /// The current NFA state, if the NFA is used.
    nfa_state: NfaState,
    /// The delimiter that separates fields.
    delimiter: u8,
    /// The terminator that separates records.
    term: Terminator,
    /// The quotation byte.
    quote: u8,
    /// The escape byte used for escaping quotes, if one is enabled.
    escape: Option<u8>,
    /// Whether to recognize doubled quotes as escaped quotes.
    double_quote: bool,
    /// If enabled, lines beginning with this byte are ignored.
    comment: Option<u8>,
    /// If enabled (the default), then quotes are respected. When disabled,
    /// quotes are not treated specially.
    quoting: bool,
    /// Whether to use the NFA for parsing.
    ///
    /// Generally this is for debugging. There's otherwise no good reason
    /// to avoid the DFA.
    use_nfa: bool,
    /// The current line number, as measured by occurrences of `\n`.
    line: u64,
    /// Whether this parser has ever read anything.
    has_read: bool,
    /// The current position in the output buffer when reading a record.
    output_pos: usize,
}
130
131impl Default for Reader {
132    fn default() -> Reader {
133        Reader {
134            dfa: Dfa::new(),
135            dfa_state: DfaState::start(),
136            nfa_state: NfaState::StartRecord,
137            delimiter: b',',
138            term: Terminator::default(),
139            quote: b'"',
140            escape: None,
141            double_quote: true,
142            comment: None,
143            quoting: true,
144            use_nfa: false,
145            line: 1,
146            has_read: false,
147            output_pos: 0,
148        }
149    }
150}
151
152/// Builds a CSV reader with various configuration knobs.
153///
154/// This builder can be used to tweak the field delimiter, record terminator
155/// and more for parsing CSV. Once a CSV `Reader` is built, its configuration
156/// cannot be changed.
#[derive(Debug, Default)]
pub struct ReaderBuilder {
    /// The reader under construction. `build` clones this and then
    /// computes the cloned reader's DFA transition table.
    rdr: Reader,
}
161
impl ReaderBuilder {
    /// Create a new builder.
    pub fn new() -> ReaderBuilder {
        ReaderBuilder::default()
    }

    /// Build a CSV parser from this configuration.
    ///
    /// This does not consume the builder, so it may be reused to build
    /// additional parsers.
    pub fn build(&self) -> Reader {
        let mut rdr = self.rdr.clone();
        rdr.build_dfa();
        rdr
    }

    /// The field delimiter to use when parsing CSV.
    ///
    /// The default is `b','`.
    pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder {
        self.rdr.delimiter = delimiter;
        self
    }

    /// The record terminator to use when parsing CSV.
    ///
    /// A record terminator can be any single byte. The default is a special
    /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n`
    /// or `\r\n` as a single record terminator.
    pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder {
        self.rdr.term = term;
        self
    }

    /// The quote character to use when parsing CSV.
    ///
    /// The default is `b'"'`.
    pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder {
        self.rdr.quote = quote;
        self
    }

    /// The escape character to use when parsing CSV.
    ///
    /// In some variants of CSV, quotes are escaped using a special escape
    /// character like `\` (instead of escaping quotes by doubling them).
    ///
    /// By default, recognizing these idiosyncratic escapes is disabled.
    pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder {
        self.rdr.escape = escape;
        self
    }

    /// Enable double quote escapes.
    ///
    /// This is enabled by default, but it may be disabled. When disabled,
    /// doubled quotes are not interpreted as escapes.
    pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.double_quote = yes;
        self
    }

    /// Enable or disable quoting.
    ///
    /// This is enabled by default, but it may be disabled. When disabled,
    /// quotes are not treated specially.
    pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.quoting = yes;
        self
    }

    /// The comment character to use when parsing CSV.
    ///
    /// If the start of a record begins with the byte given here, then that
    /// line is ignored by the CSV parser.
    ///
    /// This is disabled by default.
    pub fn comment(&mut self, comment: Option<u8>) -> &mut ReaderBuilder {
        self.rdr.comment = comment;
        self
    }

    /// A convenience method for specifying a configuration to read ASCII
    /// delimited text.
    ///
    /// This sets the delimiter and record terminator to the ASCII unit
    /// separator (`\x1F`) and record separator (`\x1E`), respectively.
    pub fn ascii(&mut self) -> &mut ReaderBuilder {
        self.delimiter(b'\x1F').terminator(Terminator::Any(b'\x1E'))
    }

    /// Enable or disable the NFA for parsing CSV.
    ///
    /// This is intended as a debugging aid. The NFA is always slower than
    /// the DFA.
    #[doc(hidden)]
    pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.use_nfa = yes;
        self
    }
}
260
261/// The result of parsing at most one field from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    ///
    /// The caller should provide more input (or an empty buffer, to signal
    /// the end of the data) on the next call.
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    OutputFull,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
284
285impl ReadFieldResult {
286    fn from_nfa(
287        state: NfaState,
288        inpdone: bool,
289        outdone: bool,
290    ) -> ReadFieldResult {
291        match state {
292            NfaState::End => ReadFieldResult::End,
293            NfaState::EndRecord | NfaState::CRLF => {
294                ReadFieldResult::Field { record_end: true }
295            }
296            NfaState::EndFieldDelim => {
297                ReadFieldResult::Field { record_end: false }
298            }
299            _ => {
300                assert!(!state.is_field_final());
301                if !inpdone && outdone {
302                    ReadFieldResult::OutputFull
303                } else {
304                    ReadFieldResult::InputEmpty
305                }
306            }
307        }
308    }
309}
310
311/// The result of parsing at most one field from CSV data while ignoring the
312/// output.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldNoCopyResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    InputEmpty,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    ///
    /// Since field data is not copied anywhere, there is no `OutputFull`
    /// variant in this result type.
    End,
}
332
333/// The result of parsing at most one record from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordResult {
    /// The caller provided input was exhausted before the end of a record was
    /// found.
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    OutputFull,
    /// The caller provided output buffer of field end positions was filled
    /// before the next field could be parsed.
    OutputEndsFull,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
353
354impl ReadRecordResult {
355    fn is_record(&self) -> bool {
356        *self == ReadRecordResult::Record
357    }
358
359    fn from_nfa(
360        state: NfaState,
361        inpdone: bool,
362        outdone: bool,
363        endsdone: bool,
364    ) -> ReadRecordResult {
365        match state {
366            NfaState::End => ReadRecordResult::End,
367            NfaState::EndRecord | NfaState::CRLF => ReadRecordResult::Record,
368            _ => {
369                assert!(!state.is_record_final());
370                if !inpdone && outdone {
371                    ReadRecordResult::OutputFull
372                } else if !inpdone && endsdone {
373                    ReadRecordResult::OutputEndsFull
374                } else {
375                    ReadRecordResult::InputEmpty
376                }
377            }
378        }
379    }
380}
381
382/// The result of parsing at most one record from CSV data while ignoring
383/// output.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordNoCopyResult {
    /// The caller provided input was exhausted before the end of a record was
    /// found.
    InputEmpty,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    ///
    /// Since record data is not copied anywhere, there are no
    /// `OutputFull`/`OutputEndsFull` variants in this result type.
    End,
}
397
/// What should be done with input bytes during an NFA transition.
#[derive(Clone, Debug, Eq, PartialEq)]
enum NfaInputAction {
    // Do not consume an input byte.
    Epsilon,
    // Copy input byte to a caller-provided output buffer.
    CopyToOutput,
    // Consume but do not copy input byte (for example, seeing a field
    // delimiter will consume an input byte but should not copy it to the
    // output buffer).
    Discard,
}
410
/// An NFA state is a state that can be visited in the NFA parser.
///
/// Given the simplicity of the machine, a subset of NFA states double as DFA
/// states. NFA states that only have incoming epsilon transitions are
/// optimized out when converting the machine to a DFA.
///
/// NOTE: The explicit discriminants below are load-bearing. The DFA code
/// detects "final field"/"final record" states with ordered comparisons
/// (e.g., `state >= self.dfa.final_field` in `read_record_dfa`), which
/// relies on the relative ordering of the DFA states here.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum NfaState {
    // These states aren't used in the DFA, so we
    // assign them meaningless numbers.
    EndFieldTerm = 200,
    InRecordTerm = 201,
    End = 202,

    // All states below are DFA states.
    StartRecord = 0,
    StartField = 1,
    InField = 2,
    InQuotedField = 3,
    InEscapedQuote = 4,
    InDoubleEscapedQuote = 5,
    InComment = 6,
    // All states below are "final field" states.
    // Namely, they indicate that a field has been parsed.
    EndFieldDelim = 7,
    // All states below are "final record" states.
    // Namely, they indicate that a record has been parsed.
    EndRecord = 8,
    CRLF = 9,
}
440
441/// A list of NFA states that have an explicit representation in the DFA.
442const NFA_STATES: &'static [NfaState] = &[
443    NfaState::StartRecord,
444    NfaState::StartField,
445    NfaState::EndFieldDelim,
446    NfaState::InField,
447    NfaState::InQuotedField,
448    NfaState::InEscapedQuote,
449    NfaState::InDoubleEscapedQuote,
450    NfaState::InComment,
451    NfaState::EndRecord,
452    NfaState::CRLF,
453];
454
455impl NfaState {
456    /// Returns true if this state indicates that a field has been parsed.
457    fn is_field_final(&self) -> bool {
458        match *self {
459            NfaState::End
460            | NfaState::EndRecord
461            | NfaState::CRLF
462            | NfaState::EndFieldDelim => true,
463            _ => false,
464        }
465    }
466
467    /// Returns true if this state indicates that a record has been parsed.
468    fn is_record_final(&self) -> bool {
469        match *self {
470            NfaState::End | NfaState::EndRecord | NfaState::CRLF => true,
471            _ => false,
472        }
473    }
474}
475
476impl Reader {
477    /// Create a new CSV reader with a default parser configuration.
478    pub fn new() -> Reader {
479        ReaderBuilder::new().build()
480    }
481
482    /// Reset the parser such that it behaves as if it had never been used.
483    ///
484    /// This may be useful when reading CSV data in a random access pattern.
485    pub fn reset(&mut self) {
486        self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
487        self.nfa_state = NfaState::StartRecord;
488        self.line = 1;
489        self.has_read = false;
490        self.output_pos = 0;
491    }
492
    /// Return the current line number as measured by the number of occurrences
    /// of `\n`.
    ///
    /// Line numbers start at `1` and are reset when `reset` is called.
    pub fn line(&self) -> u64 {
        self.line
    }
500
    /// Set the line number.
    ///
    /// This is useful after a call to `reset` where the caller knows the
    /// line number from some additional context. Subsequent calls to `line`
    /// count `\n` occurrences starting from the value given here.
    pub fn set_line(&mut self, line: u64) {
        self.line = line;
    }
508
509    /// Parse a single CSV field in `input` and copy field data to `output`.
510    ///
511    /// This routine requires a caller provided buffer of CSV data as the
512    /// `input` and a caller provided buffer, `output`, in which to store field
513    /// data extracted from `input`. The field data copied to `output` will
514    /// have its quotes unescaped.
515    ///
516    /// Calling this routine parses at most a single field and returns
517    /// three values indicating the state of the parser. The first value, a
518    /// `ReadFieldResult`, tells the caller what to do next. For example, if
519    /// the entire input was read or if the output buffer was filled before
520    /// a full field had been read, then `ReadFieldResult::InputEmpty` or
521    /// `ReadFieldResult::OutputFull` is returned, respectively. See the
522    /// documentation for `ReadFieldResult` for more details.
523    ///
524    /// The other two values returned correspond to the number of bytes
525    /// read from `input` and written to `output`, respectively.
526    ///
527    /// # Termination
528    ///
529    /// This reader interprets an empty `input` buffer as an indication that
530    /// there is no CSV data left to read. Namely, when the caller has
531    /// exhausted all CSV data, the caller should continue to call `read` with
532    /// an empty input buffer until `ReadFieldResult::End` is returned.
533    ///
534    /// # Errors
535    ///
536    /// This CSV reader can never return an error. Instead, it prefers *a*
537    /// parse over *no* parse.
538    pub fn read_field(
539        &mut self,
540        input: &[u8],
541        output: &mut [u8],
542    ) -> (ReadFieldResult, usize, usize) {
543        let (input, bom_nin) = self.strip_utf8_bom(input);
544        let (res, nin, nout) = if self.use_nfa {
545            self.read_field_nfa(input, output)
546        } else {
547            self.read_field_dfa(input, output)
548        };
549        self.has_read = true;
550        (res, nin + bom_nin, nout)
551    }
552
553    /// Parse a single CSV record in `input` and copy each field contiguously
554    /// to `output`, with the end position of each field written to `ends`.
555    ///
556    /// **NOTE**: This method is more cumbersome to use than `read_field`, but
557    /// it can be faster since it amortizes more work.
558    ///
559    /// This routine requires a caller provided buffer of CSV data as the
560    /// `input` and two caller provided buffers to store the unescaped field
561    /// data (`output`) and the end position of each field in the record
562    /// (`fields`).
563    ///
564    /// Calling this routine parses at most a single record and returns four
565    /// values indicating the state of the parser. The first value, a
566    /// `ReadRecordResult`, tells the caller what to do next. For example, if
567    /// the entire input was read or if the output buffer was filled before a
568    /// full field had been read, then `ReadRecordResult::InputEmpty` or
569    /// `ReadRecordResult::OutputFull` is returned, respectively. Similarly, if
570    /// the `ends` buffer is full, then `ReadRecordResult::OutputEndsFull` is
571    /// returned. See the documentation for `ReadRecordResult` for more
572    /// details.
573    ///
574    /// The other three values correspond to the number of bytes read from
575    /// `input`, the number of bytes written to `output` and the number of
576    /// end positions written to `ends`, respectively.
577    ///
578    /// The end positions written to `ends` are constructed as if there was
579    /// a single contiguous buffer in memory containing the entire row, even
580    /// if `ReadRecordResult::OutputFull` was returned in the middle of reading
581    /// a row.
582    ///
583    /// # Termination
584    ///
585    /// This reader interprets an empty `input` buffer as an indication that
586    /// there is no CSV data left to read. Namely, when the caller has
587    /// exhausted all CSV data, the caller should continue to call `read` with
588    /// an empty input buffer until `ReadRecordResult::End` is returned.
589    ///
590    /// # Errors
591    ///
592    /// This CSV reader can never return an error. Instead, it prefers *a*
593    /// parse over *no* parse.
594    pub fn read_record(
595        &mut self,
596        input: &[u8],
597        output: &mut [u8],
598        ends: &mut [usize],
599    ) -> (ReadRecordResult, usize, usize, usize) {
600        let (input, bom_nin) = self.strip_utf8_bom(input);
601        let (res, nin, nout, nend) = if self.use_nfa {
602            self.read_record_nfa(input, output, ends)
603        } else {
604            self.read_record_dfa(input, output, ends)
605        };
606        self.has_read = true;
607        (res, nin + bom_nin, nout, nend)
608    }
609
610    /// Strip off a possible UTF-8 BOM at the start of a file. Quick note that
611    /// this method will fail to strip off the BOM if only part of the BOM is
612    /// buffered. Hopefully that won't happen very often.
613    fn strip_utf8_bom<'a>(&self, input: &'a [u8]) -> (&'a [u8], usize) {
614        let (input, nin) = if {
615            !self.has_read
616                && input.len() >= 3
617                && &input[0..3] == b"\xef\xbb\xbf"
618        } {
619            (&input[3..], 3)
620        } else {
621            (input, 0)
622        };
623        (input, nin)
624    }
625
    /// DFA-based implementation of `read_record`.
    ///
    /// Returns the result along with the number of bytes consumed from
    /// `input`, the number of bytes written to `output` and the number of
    /// end positions written to `ends`.
    #[inline(always)]
    fn read_record_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        // An empty input buffer means the caller has exhausted all CSV
        // data, so take the special end-of-input transition.
        if input.is_empty() {
            let s = self.transition_final_dfa(self.dfa_state);
            let res =
                self.dfa.new_read_record_result(s, true, false, false, false);
            // This part is a little tricky. When reading the final record,
            // the last result the caller will get is an InputEmpty, and while
            // they'll have everything they need in `output`, they'll be
            // missing the final end position of the final field in `ends`.
            // We insert that here, but we must take care to handle the case
            // where `ends` doesn't have enough space. If it doesn't have
            // enough space, then we also can't transition to the next state.
            return match res {
                ReadRecordResult::Record => {
                    if ends.is_empty() {
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.dfa_state = s;
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.dfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        // Bail out early when either caller-provided buffer has no room at
        // all; the loop below assumes it can make progress.
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        let (mut nin, mut nout, mut nend) = (0, 0, 0);
        let mut state = self.dfa_state;
        // The hot loop: advance the DFA one input byte at a time until one
        // of the three buffers is exhausted or the record ends.
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, has_out) = self.dfa.get_output(state, input[nin]);
            self.line += (input[nin] == b'\n') as u64;
            state = s;
            if has_out {
                output[nout] = input[nin];
                nout += 1;
            }
            nin += 1;
            // "Final field" and above means a field just ended; record its
            // end position relative to the whole logical row (output_pos
            // carries the offset across OutputFull resumptions).
            if state >= self.dfa.final_field {
                ends[nend] = self.output_pos + nout;
                nend += 1;
                // Strictly greater than final_field means a "final record"
                // state, i.e., the record itself is done.
                if state > self.dfa.final_field {
                    break;
                }
            }
            // Fast path: while inside ordinary field data, bulk-copy bytes
            // until the next discriminating byte. (scan_and_copy is assumed
            // to advance nin/nout together — see the Dfa implementation.)
            if state == self.dfa.in_field || state == self.dfa.in_quoted {
                self.dfa
                    .classes
                    .scan_and_copy(input, &mut nin, output, &mut nout);
            }
        }
        let res = self.dfa.new_read_record_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.dfa_state = state;
        // A completed record resets the logical row offset; otherwise carry
        // the bytes written so far into the next call.
        if res.is_record() {
            self.output_pos = 0;
        } else {
            self.output_pos += nout;
        }
        (res, nin, nout, nend)
    }
705
    /// DFA-based implementation of `read_field`.
    ///
    /// Returns the result along with the number of bytes consumed from
    /// `input` and the number of bytes written to `output`.
    #[inline(always)]
    fn read_field_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        // An empty input buffer means the caller has exhausted all CSV
        // data, so take the special end-of-input transition.
        if input.is_empty() {
            self.dfa_state = self.transition_final_dfa(self.dfa_state);
            let res = self.dfa.new_read_field_result(
                self.dfa_state,
                true,
                false,
                false,
            );
            return (res, 0, 0);
        }
        // Bail out early when the output buffer has no room at all; the
        // loop below assumes it can make progress.
        if output.is_empty() {
            return (ReadFieldResult::OutputFull, 0, 0);
        }
        let (mut nin, mut nout) = (0, 0);
        let mut state = self.dfa_state;
        // The hot loop: advance the DFA one input byte at a time until a
        // field ends or either buffer is exhausted.
        while nin < input.len() && nout < output.len() {
            let b = input[nin];
            self.line += (b == b'\n') as u64;
            let (s, has_out) = self.dfa.get_output(state, b);
            state = s;
            if has_out {
                output[nout] = b;
                nout += 1;
            }
            nin += 1;
            // Any state at or past final_field means a field (and possibly
            // the record too) has been fully parsed.
            if state >= self.dfa.final_field {
                break;
            }
        }
        let res = self.dfa.new_read_field_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
        );
        self.dfa_state = state;
        (res, nin, nout)
    }
750
    /// Perform the final state transition, i.e., when the caller indicates
    /// that the input has been exhausted.
    fn transition_final_dfa(&self, state: DfaState) -> DfaState {
        // If we've already emitted a record or think we're ready to start
        // parsing a new record, then we should sink into the final state
        // and never move from there. (pro-tip: the start state doubles as
        // the final state!)
        if state >= self.dfa.final_record || state.is_start() {
            self.dfa.new_state_final_end()
        } else {
            // Otherwise we were mid-record, so emit a final record first.
            self.dfa.new_state_final_record()
        }
    }
764
    /// Write the transition tables for the DFA based on this parser's
    /// configuration.
    fn build_dfa(&mut self) {
        // A naive DFA transition table has
        // `cells = (# number of states) * (# size of alphabet)`. While we
        // could get away with that, the table would have `10 * 256 = 2560`
        // entries. Even worse, in order to avoid a multiplication instruction
        // when computing the next transition, we store the starting index of
        // each state's row, which would not be representable in a single
        // byte. So we'd need a `u16`, which doubles our transition table size
        // to ~5KB. This is a lot to put on the stack, even though it probably
        // fits in the L1 cache of most modern CPUs.
        //
        // To avoid this, we note that while our "true" alphabet
        // has 256 distinct possibilities, the DFA itself is only
        // discriminatory on a very small subset of that alphabet. For
        // example, assuming neither `a` nor `b` are set as special
        // quote/comment/escape/delimiter/terminator bytes, they are otherwise
        // indistinguishable to the DFA, so it would be OK to treat them as
        // if they were equivalent. That is, they are in the same equivalence
        // class.
        //
        // As it turns out, using this logic, we can shrink our effective
        // alphabet down to 7 equivalence classes:
        //
        //   1. The field delimiter.
        //   2. The record terminator.
        //   3. If the record terminator is CRLF, then CR and LF are
        //      distinct equivalence classes.
        //   4. The quote byte.
        //   5. The escape byte.
        //   6. The comment byte.
        //   7. Everything else.
        //
        // We add those equivalence classes here. If more configuration knobs
        // are added to the parser with more discriminating bytes, then this
        // logic will need to be adjusted further.
        //
        // Even though this requires an extra bit of indirection when computing
        // the next transition, microbenchmarks say that it doesn't make much
        // of a difference. Perhaps because everything fits into the L1 cache.
        self.dfa.classes.add(self.delimiter);
        // The quote and escape bytes only discriminate when quoting is on.
        if self.quoting {
            self.dfa.classes.add(self.quote);
            if let Some(escape) = self.escape {
                self.dfa.classes.add(escape);
            }
        }
        if let Some(comment) = self.comment {
            self.dfa.classes.add(comment);
        }
        match self.term {
            Terminator::Any(b) => self.dfa.classes.add(b),
            Terminator::CRLF => {
                self.dfa.classes.add(b'\r');
                self.dfa.classes.add(b'\n');
            }
            // No other `Terminator` variants are constructible here; a new
            // variant would require updating this match.
            _ => unreachable!(),
        }
        // Build the DFA transition table by computing the DFA state for all
        // possible combinations of state and input byte.
        for &state in NFA_STATES {
            for c in (0..256).map(|c| c as u8) {
                let mut nfa_result = (state, NfaInputAction::Epsilon);
                // Consume NFA states until we hit a non-epsilon transition.
                while nfa_result.0 != NfaState::End
                    && nfa_result.1 == NfaInputAction::Epsilon
                {
                    nfa_result = self.transition_nfa(nfa_result.0, c);
                }
                let from = self.dfa.new_state(state);
                let to = self.dfa.new_state(nfa_result.0);
                // Record whether this transition writes the input byte to
                // the caller's output buffer.
                self.dfa.set(
                    from,
                    c,
                    to,
                    nfa_result.1 == NfaInputAction::CopyToOutput,
                );
            }
        }
        self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
        self.dfa.finish();
    }
848
849    // The NFA implementation follows. The transition_final_nfa and
850    // transition_nfa methods are required for the DFA to operate. The
851    // rest are included for completeness (and debugging). Note that this
852    // NFA implementation is included in most of the CSV parser tests below.
853
    /// Feed `input` through the NFA until at most one full record has been
    /// parsed.
    ///
    /// Field bytes are copied into `output`, and the end offset of each
    /// parsed field is recorded in `ends`. Returns the result along with the
    /// number of input bytes consumed, the number of output bytes written
    /// and the number of field end offsets recorded.
    #[inline(always)]
    fn read_record_nfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        if input.is_empty() {
            // No more input: run the final transition to flush any record
            // that was still in progress when the input ran out.
            let s = self.transition_final_nfa(self.nfa_state);
            let res = ReadRecordResult::from_nfa(s, false, false, false);
            return match res {
                ReadRecordResult::Record => {
                    // The final field still needs a slot for its end offset.
                    if ends.is_empty() {
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.nfa_state = s;
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.nfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        // With no room in either buffer we can never make progress.
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        // `nout` resumes from `output_pos`, which carries the in-progress
        // record's write position across calls (see the tail of this fn).
        let (mut nin, mut nout, mut nend) = (0, self.output_pos, 0);
        let mut state = self.nfa_state;
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, io) = self.transition_nfa(state, input[nin]);
            match io {
                NfaInputAction::CopyToOutput => {
                    output[nout] = input[nin];
                    nout += 1;
                    nin += 1;
                }
                NfaInputAction::Discard => {
                    nin += 1;
                }
                // Epsilon transitions consume no input.
                NfaInputAction::Epsilon => {}
            }
            state = s;
            if state.is_field_final() {
                // A field just ended: record its end offset. Keep going only
                // if a delimiter ended it; any other final state means the
                // record itself is done.
                ends[nend] = nout;
                nend += 1;
                if state != NfaState::EndFieldDelim {
                    break;
                }
            }
        }
        let res = ReadRecordResult::from_nfa(
            state,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.nfa_state = state;
        // Reset the write position once a record completes; otherwise save
        // it so the next call continues where this one left off.
        self.output_pos = if res.is_record() { 0 } else { nout };
        (res, nin, nout, nend)
    }
920
921    #[inline(always)]
922    fn read_field_nfa(
923        &mut self,
924        input: &[u8],
925        output: &mut [u8],
926    ) -> (ReadFieldResult, usize, usize) {
927        if input.is_empty() {
928            self.nfa_state = self.transition_final_nfa(self.nfa_state);
929            let res = ReadFieldResult::from_nfa(self.nfa_state, false, false);
930            return (res, 0, 0);
931        }
932        if output.is_empty() {
933            // If the output buffer is empty, then we can never make progress,
934            // so just quit now.
935            return (ReadFieldResult::OutputFull, 0, 0);
936        }
937        let (mut nin, mut nout) = (0, 0);
938        let mut state = self.nfa_state;
939        while nin < input.len() && nout < output.len() {
940            let (s, io) = self.transition_nfa(state, input[nin]);
941            match io {
942                NfaInputAction::CopyToOutput => {
943                    output[nout] = input[nin];
944                    nout += 1;
945                    nin += 1;
946                }
947                NfaInputAction::Discard => {
948                    nin += 1;
949                }
950                NfaInputAction::Epsilon => (),
951            }
952            state = s;
953            if state.is_field_final() {
954                break;
955            }
956        }
957        let res = ReadFieldResult::from_nfa(
958            state,
959            nin >= input.len(),
960            nout >= output.len(),
961        );
962        self.nfa_state = state;
963        (res, nin, nout)
964    }
965
966    /// Compute the final NFA transition after all caller-provided input has
967    /// been exhausted.
968    #[inline(always)]
969    fn transition_final_nfa(&self, state: NfaState) -> NfaState {
970        use self::NfaState::*;
971        match state {
972            End | StartRecord | EndRecord | InComment | CRLF => End,
973            StartField | EndFieldDelim | EndFieldTerm | InField
974            | InQuotedField | InEscapedQuote | InDoubleEscapedQuote
975            | InRecordTerm => EndRecord,
976        }
977    }
978
    /// Compute the next NFA state given the current NFA state and the current
    /// input byte.
    ///
    /// This returns the next NFA state along with an NfaInputAction that
    /// indicates what should be done with the input byte (nothing for an epsilon
    /// transition, copied to a caller provided output buffer, or discarded).
    ///
    /// NOTE: the order of the checks within each arm matters. For example,
    /// in `StartField` the quote byte is tested before the delimiter, so if
    /// both are configured to the same byte, quoting wins (see the
    /// `quote_delimiter` test below).
    #[inline(always)]
    fn transition_nfa(
        &self,
        state: NfaState,
        c: u8,
    ) -> (NfaState, NfaInputAction) {
        use self::NfaState::*;
        match state {
            // `End` is terminal: it absorbs everything without consuming.
            End => (End, NfaInputAction::Epsilon),
            StartRecord => {
                if self.term.equals(c) {
                    // Skip over empty lines between records.
                    (StartRecord, NfaInputAction::Discard)
                } else if self.comment == Some(c) {
                    (InComment, NfaInputAction::Discard)
                } else {
                    (StartField, NfaInputAction::Epsilon)
                }
            }
            EndRecord => (StartRecord, NfaInputAction::Epsilon),
            StartField => {
                if self.quoting && self.quote == c {
                    (InQuotedField, NfaInputAction::Discard)
                } else if self.delimiter == c {
                    // An immediate delimiter yields an empty field.
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    // Epsilon: the terminator byte itself is consumed later,
                    // in `InRecordTerm`.
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            EndFieldDelim => (StartField, NfaInputAction::Epsilon),
            EndFieldTerm => (InRecordTerm, NfaInputAction::Epsilon),
            InField => {
                if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            InQuotedField => {
                if self.quoting && self.quote == c {
                    // Either a closing quote or the first half of a doubled
                    // quote; disambiguated in `InDoubleEscapedQuote`.
                    (InDoubleEscapedQuote, NfaInputAction::Discard)
                } else if self.quoting && self.escape == Some(c) {
                    (InEscapedQuote, NfaInputAction::Discard)
                } else {
                    (InQuotedField, NfaInputAction::CopyToOutput)
                }
            }
            // The byte following an escape is copied verbatim.
            InEscapedQuote => (InQuotedField, NfaInputAction::CopyToOutput),
            InDoubleEscapedQuote => {
                if self.quoting && self.double_quote && self.quote == c {
                    // A doubled quote: emit one literal quote byte.
                    (InQuotedField, NfaInputAction::CopyToOutput)
                } else if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    // Data after a closing quote continues as unquoted.
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            InComment => {
                // Comments run to the next LF, regardless of the configured
                // record terminator.
                if b'\n' == c {
                    (StartRecord, NfaInputAction::Discard)
                } else {
                    (InComment, NfaInputAction::Discard)
                }
            }
            InRecordTerm => {
                if self.term.is_crlf() && b'\r' == c {
                    (CRLF, NfaInputAction::Discard)
                } else {
                    (EndRecord, NfaInputAction::Discard)
                }
            }
            CRLF => {
                // Consume the LF of a CRLF pair; a lone CR also terminates,
                // in which case the next byte is left for `StartRecord`.
                if b'\n' == c {
                    (StartRecord, NfaInputAction::Discard)
                } else {
                    (StartRecord, NfaInputAction::Epsilon)
                }
            }
        }
    }
1070}
1071
/// The number of slots in the DFA transition table.
///
/// This number is computed by multiplying the maximum number of transition
/// classes (7) by the total number of NFA states that are used in the DFA
/// (10).
///
/// The number of transition classes is determined by an equivalence class of
/// bytes, where every byte in the same equivalence class is
/// indistinguishable from any other byte with respect to the DFA. For example,
/// if neither `a` nor `b` are specified as a delimiter/quote/terminator/escape,
/// then the DFA will never discriminate between `a` or `b`, so they can
/// effectively be treated as identical. This reduces storage space
/// substantially.
///
/// The total number of NFA states (13) is greater than the total number of
/// NFA states that are in the DFA. In particular, any NFA state that can only
/// be reached by epsilon transitions will never have explicit usage in the
/// DFA.
const TRANS_CLASSES: usize = 7;
const DFA_STATES: usize = 10;
const TRANS_SIZE: usize = TRANS_CLASSES * DFA_STATES;

/// The size of the equivalence class map, i.e., the number of possible input
/// bytes. Each byte is mapped to one of at most `TRANS_CLASSES` classes.
const CLASS_SIZE: usize = 256;
1097
/// A representation of a DFA.
///
/// For the most part, this is a transition table, but various optimizations
/// have been applied to reduce its memory footprint.
struct Dfa {
    /// The core transition table. Each row corresponds to the transitions for
    /// each input equivalence class. (Input bytes are mapped to their
    /// corresponding equivalence class with the `classes` map.)
    ///
    /// DFA states are represented as an index corresponding to the start of
    /// its row in this table.
    trans: [DfaState; TRANS_SIZE],
    /// A table with the same layout as `trans`, except its values indicate
    /// whether a particular `(state, equivalence class)` pair should emit an
    /// output byte.
    has_output: [bool; TRANS_SIZE],
    /// A map from input byte to equivalence class.
    ///
    /// This is responsible for reducing the effective alphabet size from
    /// 256 to `TRANS_CLASSES`.
    classes: DfaClasses,
    /// The DFA state corresponding to being inside an unquoted field.
    in_field: DfaState,
    /// The DFA state corresponding to being inside a quoted field.
    in_quoted: DfaState,
    /// The minimum DFA state that indicates a field has been parsed. All DFA
    /// states greater than this are also final-field states.
    final_field: DfaState,
    /// The minimum DFA state that indicates a record has been parsed. All DFA
    /// states greater than this are also final-record states.
    final_record: DfaState,
}
1130
impl Dfa {
    /// Create a new empty DFA with every transition pointing at state 0.
    fn new() -> Dfa {
        Dfa {
            trans: [DfaState(0); TRANS_SIZE],
            has_output: [false; TRANS_SIZE],
            classes: DfaClasses::new(),
            in_field: DfaState(0),
            in_quoted: DfaState(0),
            final_field: DfaState(0),
            final_record: DfaState(0),
        }
    }

    /// Return the DFA state corresponding to the given NFA state.
    ///
    /// A DFA state is the starting index of its row in the transition table,
    /// i.e., `nfa_state * num_classes`. The `checked_mul` guards against the
    /// index overflowing a `u8`.
    fn new_state(&self, nfa_state: NfaState) -> DfaState {
        let nclasses = self.classes.num_classes() as u8;
        let idx = (nfa_state as u8).checked_mul(nclasses).unwrap();
        DfaState(idx)
    }

    /// The DFA state reported once all parsing has terminated.
    fn new_state_final_end(&self) -> DfaState {
        self.new_state(NfaState::StartRecord)
    }

    /// The DFA state reported when a record has been fully parsed.
    fn new_state_final_record(&self) -> DfaState {
        self.new_state(NfaState::EndRecord)
    }

    /// Look up the transition for `state` on input byte `c`, returning the
    /// next state and whether `c` should be copied to the output.
    fn get_output(&self, state: DfaState, c: u8) -> (DfaState, bool) {
        let cls = self.classes.classes[c as usize];
        let idx = state.0 as usize + cls as usize;
        (self.trans[idx], self.has_output[idx])
    }

    /// Set the transition out of `from` on input byte `c` -- more precisely,
    /// on every byte in `c`'s equivalence class.
    fn set(&mut self, from: DfaState, c: u8, to: DfaState, output: bool) {
        let cls = self.classes.classes[c as usize];
        let idx = from.0 as usize + cls as usize;
        self.trans[idx] = to;
        self.has_output[idx] = output;
    }

    /// Cache the special DFA states once the transition table is complete.
    fn finish(&mut self) {
        self.in_field = self.new_state(NfaState::InField);
        self.in_quoted = self.new_state(NfaState::InQuotedField);
        self.final_field = self.new_state(NfaState::EndFieldDelim);
        self.final_record = self.new_state(NfaState::EndRecord);
    }

    /// Translate a DFA state into a `ReadFieldResult` for the caller.
    ///
    /// The ordering comparisons rely on DFA state indices following the
    /// declaration order of `NfaState` (see `new_state`).
    fn new_read_field_result(
        &self,
        state: DfaState,
        is_final_trans: bool,
        inpdone: bool,
        outdone: bool,
    ) -> ReadFieldResult {
        if state >= self.final_record {
            ReadFieldResult::Field { record_end: true }
        } else if state == self.final_field {
            ReadFieldResult::Field { record_end: false }
        } else if is_final_trans && state.is_start() {
            ReadFieldResult::End
        } else {
            debug_assert!(state < self.final_field);
            // Distinguish "output buffer filled" from "ran out of input".
            if !inpdone && outdone {
                ReadFieldResult::OutputFull
            } else {
                ReadFieldResult::InputEmpty
            }
        }
    }

    /// Translate a DFA state into a `ReadRecordResult` for the caller.
    fn new_read_record_result(
        &self,
        state: DfaState,
        is_final_trans: bool,
        inpdone: bool,
        outdone: bool,
        endsdone: bool,
    ) -> ReadRecordResult {
        if state >= self.final_record {
            ReadRecordResult::Record
        } else if is_final_trans && state.is_start() {
            ReadRecordResult::End
        } else {
            debug_assert!(state < self.final_record);
            if !inpdone && outdone {
                ReadRecordResult::OutputFull
            } else if !inpdone && endsdone {
                ReadRecordResult::OutputEndsFull
            } else {
                ReadRecordResult::InputEmpty
            }
        }
    }
}
1225
/// A map from input byte to equivalence class.
struct DfaClasses {
    /// For each of the 256 possible input bytes, the equivalence class it
    /// belongs to. Class 0 is the catch-all "not special" class.
    classes: [u8; CLASS_SIZE],
    /// The next class index to hand out (class 0 is implicitly taken).
    next_class: usize,
}
1231
1232impl DfaClasses {
1233    fn new() -> DfaClasses {
1234        DfaClasses { classes: [0; CLASS_SIZE], next_class: 1 }
1235    }
1236
1237    fn add(&mut self, b: u8) {
1238        if self.next_class > CLASS_SIZE {
1239            panic!("added too many classes")
1240        }
1241        self.classes[b as usize] = self.next_class as u8;
1242        self.next_class = self.next_class + 1;
1243    }
1244
1245    fn num_classes(&self) -> usize {
1246        self.next_class as usize
1247    }
1248
1249    /// Scan and copy the input bytes to the output buffer quickly.
1250    ///
1251    /// This assumes that the current state of the DFA is either `InField` or
1252    /// `InQuotedField`. In this case, all bytes corresponding to the first
1253    /// equivalence class (i.e., not a delimiter/quote/escape/etc.) are
1254    /// guaranteed to never result in a state transition out of the current
1255    /// state. This function takes advantage of that copies every byte from
1256    /// `input` in the first equivalence class to `output`. Once a byte is seen
1257    /// outside the first equivalence class, we quit and should fall back to
1258    /// the main DFA loop.
1259    #[inline(always)]
1260    fn scan_and_copy(
1261        &self,
1262        input: &[u8],
1263        nin: &mut usize,
1264        output: &mut [u8],
1265        nout: &mut usize,
1266    ) {
1267        while *nin < input.len()
1268            && *nout < output.len()
1269            && self.classes[input[*nin] as usize] == 0
1270        {
1271            output[*nout] = input[*nin];
1272            *nin += 1;
1273            *nout += 1;
1274        }
1275    }
1276}
1277
/// A single DFA state.
///
/// A DFA state is represented by the starting index of its corresponding row
/// in the DFA transition table. This representation allows us to elide a
/// single multiplication instruction when computing the next transition for
/// a particular input byte.
///
/// A `u8` suffices because the table has at most `TRANS_SIZE` (70) entries.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
struct DfaState(u8);
1286
1287impl DfaState {
1288    fn start() -> DfaState {
1289        DfaState(0)
1290    }
1291
1292    fn is_start(&self) -> bool {
1293        self.0 == 0
1294    }
1295}
1296
1297impl fmt::Debug for Dfa {
1298    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1299        write!(f, "Dfa(N/A)")
1300    }
1301}
1302
1303impl fmt::Debug for DfaClasses {
1304    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1305        write!(
1306            f,
1307            "DfaClasses {{ classes: N/A, next_class: {:?} }}",
1308            self.next_class
1309        )
1310    }
1311}
1312
1313impl Clone for Dfa {
1314    fn clone(&self) -> Dfa {
1315        let mut dfa = Dfa::new();
1316        dfa.trans.copy_from_slice(&self.trans);
1317        dfa
1318    }
1319}
1320
1321impl Clone for DfaClasses {
1322    fn clone(&self) -> DfaClasses {
1323        let mut x = DfaClasses::new();
1324        x.classes.copy_from_slice(&self.classes);
1325        x
1326    }
1327}
1328
1329#[cfg(test)]
1330mod tests {
1331    use core::str;
1332
1333    use arrayvec::{ArrayString, ArrayVec};
1334
1335    use super::{ReadFieldResult, Reader, ReaderBuilder, Terminator};
1336
    // In-memory CSV representation used by the tests: up to 10 rows of up to
    // 10 fields of up to 10 bytes each, all stack-allocated.
    type Csv = ArrayVec<[Row; 10]>;
    type Row = ArrayVec<[Field; 10]>;
    type Field = ArrayString<[u8; 10]>;
1340
    // OMG I HATE BYTE STRING LITERALS SO MUCH.
    //
    // Convenience for converting a `&str` literal to its raw bytes.
    fn b(s: &str) -> &[u8] {
        s.as_bytes()
    }
1345
    // Build a `Csv` value from row/field literals, e.g.
    // `csv![["a", "b"], ["c", "d"]]`. The inner `fn` exists so that
    // `#[allow(unused_mut)]` applies to the generated bindings.
    macro_rules! csv {
        ($([$($field:expr),*]),*) => {{
            #[allow(unused_mut)]
            fn x() -> Csv {
                let mut csv = Csv::new();
                $(
                    let mut row = Row::new();
                    $(
                        row.push(Field::from($field).unwrap());
                    )*
                    csv.push(row);
                )*
                csv
            }
            x()
        }}
    }
1363
    // Assert that parsing `$data` produces `$expected`, running the same
    // input through all four reader configurations: {NFA, DFA} crossed with
    // the field-at-a-time and record-at-a-time APIs. The optional `$config`
    // closure customizes the `ReaderBuilder` before building each reader.
    macro_rules! parses_to {
        ($name:ident, $data:expr, $expected:expr) => {
            parses_to!($name, $data, $expected, |builder| builder);
        };
        ($name:ident, $data:expr, $expected:expr, $config:expr) => {
            #[test]
            fn $name() {
                let mut builder = ReaderBuilder::new();
                builder.nfa(true);
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_field(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "nfa by field");

                let mut builder = ReaderBuilder::new();
                builder.nfa(true);
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_record(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "nfa by record");

                let mut builder = ReaderBuilder::new();
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_field(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "dfa by field");

                let mut builder = ReaderBuilder::new();
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_record(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "dfa by record");
            }
        };
    }
1403
    // Parse `data` to completion one field at a time, collecting the results
    // into a `Csv`. Panics if the reader stops making progress on non-empty
    // input or if a field overflows the 10-byte test buffer.
    fn parse_by_field(rdr: &mut Reader, data: &str) -> Csv {
        let mut data = data.as_bytes();
        let mut field = [0u8; 10];
        let mut csv = Csv::new();
        let mut row = Row::new();
        let mut outpos = 0;
        loop {
            let (res, nin, nout) = rdr.read_field(data, &mut field[outpos..]);
            data = &data[nin..];
            outpos += nout;

            match res {
                ReadFieldResult::InputEmpty => {
                    if !data.is_empty() {
                        panic!("missing input data")
                    }
                }
                ReadFieldResult::OutputFull => panic!("field too large"),
                ReadFieldResult::Field { record_end } => {
                    let s = str::from_utf8(&field[..outpos]).unwrap();
                    row.push(Field::from(s).unwrap());
                    outpos = 0;
                    if record_end {
                        csv.push(row);
                        row = Row::new();
                    }
                }
                ReadFieldResult::End => {
                    return csv;
                }
            }
        }
    }
1437
    // Parse `data` to completion one record at a time, collecting the
    // results into a `Csv`. Field boundaries are recovered from the `ends`
    // offsets, which index into the `record` output buffer.
    fn parse_by_record(rdr: &mut Reader, data: &str) -> Csv {
        use crate::ReadRecordResult::*;

        let mut data = data.as_bytes();
        let mut record = [0; 1024];
        let mut ends = [0; 10];

        let mut csv = Csv::new();
        let (mut outpos, mut endpos) = (0, 0);
        loop {
            let (res, nin, nout, nend) = rdr.read_record(
                data,
                &mut record[outpos..],
                &mut ends[endpos..],
            );
            data = &data[nin..];
            outpos += nout;
            endpos += nend;

            match res {
                InputEmpty => {
                    if !data.is_empty() {
                        panic!("missing input data")
                    }
                }
                OutputFull => panic!("record too large (out buffer)"),
                OutputEndsFull => panic!("record too large (end buffer)"),
                Record => {
                    // Split the flat output buffer into fields using the
                    // recorded end offsets.
                    let s = str::from_utf8(&record[..outpos]).unwrap();
                    let mut start = 0;
                    let mut row = Row::new();
                    for &end in &ends[..endpos] {
                        row.push(Field::from(&s[start..end]).unwrap());
                        start = end;
                    }
                    csv.push(row);
                    outpos = 0;
                    endpos = 0;
                }
                End => return csv,
            }
        }
    }
1481
    // Single- and multi-row parsing across LF, CRLF and lone-CR record
    // terminators, plus leading/trailing empty lines around records.
    parses_to!(one_row_one_field, "a", csv![["a"]]);
    parses_to!(one_row_many_fields, "a,b,c", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma, "a,b,", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_lf, "a\n", csv![["a"]]);
    parses_to!(one_row_many_fields_lf, "a,b,c\n", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_lf, "a,b,\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_crlf, "a\r\n", csv![["a"]]);
    parses_to!(one_row_many_fields_crlf, "a,b,c\r\n", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_crlf, "a,b,\r\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_cr, "a\r", csv![["a"]]);
    parses_to!(one_row_many_fields_cr, "a,b,c\r", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_cr, "a,b,\r", csv![["a", "b", ""]]);

    parses_to!(many_rows_one_field, "a\nb", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields,
        "a,b,c\nx,y,z",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma,
        "a,b,\nx,y,",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_lf, "a\nb\n", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_lf,
        "a,b,c\nx,y,z\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_lf,
        "a,b,\nx,y,\n",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_crlf, "a\r\nb\r\n", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_crlf,
        "a,b,c\r\nx,y,z\r\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_crlf,
        "a,b,\r\nx,y,\r\n",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_cr, "a\rb\r", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_cr,
        "a,b,c\rx,y,z\r",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_cr,
        "a,b,\rx,y,\r",
        csv![["a", "b", ""], ["x", "y", ""]]
    );

    parses_to!(
        trailing_lines_no_record,
        "\n\n\na,b,c\nx,y,z\n\n\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        trailing_lines_no_record_cr,
        "\r\r\ra,b,c\rx,y,z\r\r\r",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        trailing_lines_no_record_crlf,
        "\r\n\r\n\r\na,b,c\r\nx,y,z\r\n\r\n\r\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
1555
    // Empty inputs and blank-line handling (blank lines never produce
    // records), plus custom terminators, ASCII delimiters, BOM handling and
    // basic quoting.
    parses_to!(empty, "", csv![]);
    parses_to!(empty_lines, "\n\n\n\n", csv![]);
    parses_to!(
        empty_lines_interspersed,
        "\n\na,b\n\n\nx,y\n\n\nm,n\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_crlf, "\r\n\r\n\r\n\r\n", csv![]);
    parses_to!(
        empty_lines_interspersed_crlf,
        "\r\n\r\na,b\r\n\r\n\r\nx,y\r\n\r\n\r\nm,n\r\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_mixed, "\r\n\n\r\n\n", csv![]);
    parses_to!(
        empty_lines_interspersed_mixed,
        "\n\r\na,b\r\n\n\r\nx,y\r\n\n\r\nm,n\r\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_cr, "\r\r\r\r", csv![]);
    parses_to!(
        empty_lines_interspersed_cr,
        "\r\ra,b\r\r\rx,y\r\r\rm,n\r",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );

    parses_to!(
        term_weird,
        "zza,bzc,dzz",
        csv![["a", "b"], ["c", "d"]],
        |b: &mut ReaderBuilder| {
            b.terminator(Terminator::Any(b'z'));
        }
    );

    parses_to!(
        ascii_delimited,
        "a\x1fb\x1ec\x1fd",
        csv![["a", "b"], ["c", "d"]],
        |b: &mut ReaderBuilder| {
            b.ascii();
        }
    );

    parses_to!(bom_at_start, "\u{feff}a", csv![["a"]]);
    parses_to!(bom_in_field, "a\u{feff}", csv![["a\u{feff}"]]);
    parses_to!(bom_at_field_start, "a,\u{feff}b", csv![["a", "\u{feff}b"]]);

    parses_to!(quote_empty, "\"\"", csv![[""]]);
    parses_to!(quote_lf, "\"\"\n", csv![[""]]);
    parses_to!(quote_space, "\" \"", csv![[" "]]);
    parses_to!(quote_inner_space, "\" a \"", csv![[" a "]]);
    parses_to!(quote_outer_space, "  \"a\"  ", csv![["  \"a\"  "]]);
1609
1610    parses_to!(quote_change, "zaz", csv![["a"]], |b: &mut ReaderBuilder| {
1611        b.quote(b'z');
1612    });
1613
1614    // This one is pretty hokey.
1615    // I don't really know what the "right" behavior is.
1616    parses_to!(
1617        quote_delimiter,
1618        ",a,,b",
1619        csv![["a,b"]],
1620        |b: &mut ReaderBuilder| {
1621            b.quote(b',');
1622        }
1623    );
1624
    // With no escape byte configured, a backslash is plain data and the
    // quote after it ends the quoted region.
    parses_to!(quote_no_escapes, r#""a\"b""#, csv![[r#"a\b""#]]);
    // With doubled-quote handling disabled, `""` no longer collapses to a
    // single literal quote.
    parses_to!(
        quote_escapes_no_double,
        r#""a""b""#,
        csv![[r#"a"b""#]],
        |b: &mut ReaderBuilder| {
            b.double_quote(false);
        }
    );
    // With a backslash escape configured, `\"` yields a literal quote.
    parses_to!(
        quote_escapes,
        r#""a\"b""#,
        csv![[r#"a"b"#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'\\'));
        }
    );
    // The escape byte itself is configurable.
    parses_to!(
        quote_escapes_change,
        r#""az"b""#,
        csv![[r#"a"b"#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'z'));
        }
    );

    // Escaped quotes protect a delimiter inside the field.
    parses_to!(
        quote_escapes_with_comma,
        r#""\"A,B\"""#,
        csv![[r#""A,B""#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'\\')).double_quote(false);
        }
    );
1659
    // With quoting disabled entirely, quote bytes are ordinary data and the
    // comma still splits the input into two fields.
    parses_to!(
        quoting_disabled,
        r#""abc,foo""#,
        csv![[r#""abc"#, r#"foo""#]],
        |b: &mut ReaderBuilder| {
            b.quoting(false);
        }
    );

    // The field delimiter is configurable to any byte.
    parses_to!(
        delimiter_tabs,
        "a\tb",
        csv![["a", "b"]],
        |b: &mut ReaderBuilder| {
            b.delimiter(b'\t');
        }
    );
    parses_to!(
        delimiter_weird,
        "azb",
        csv![["a", "b"]],
        |b: &mut ReaderBuilder| {
            b.delimiter(b'z');
        }
    );
1685
    // A trailing line terminator must not produce a phantom empty record.
    parses_to!(extra_record_crlf_1, "foo\n1\n", csv![["foo"], ["1"]]);
    parses_to!(extra_record_crlf_2, "foo\r\n1\r\n", csv![["foo"], ["1"]]);
1688
    // Comment handling. comment_1: a line starting with the comment byte is
    // dropped. comment_2: a leading space means the `#` is not the first
    // byte of the line, so the line is kept. comment_3: with `\n` as the
    // comment byte, the `# hi` line is untouched. comment_4/comment_5: a
    // `#` inside a record — even at the start of a field — is plain data.
    parses_to!(
        comment_1,
        "foo\n# hi\nbar\n",
        csv![["foo"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_2,
        "foo\n # hi\nbar\n",
        csv![["foo"], [" # hi"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_3,
        "foo\n# hi\nbar\n",
        csv![["foo"], ["# hi"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'\n'));
        }
    );
    parses_to!(
        comment_4,
        "foo,b#ar,baz",
        csv![["foo", "b#ar", "baz"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_5,
        "foo,#bar,baz",
        csv![["foo", "#bar", "baz"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
1729
    // Drives `read_field` once and asserts, in order: bytes of input
    // consumed, bytes of output written, and the `ReadFieldResult`.
    macro_rules! assert_read {
        (
            $rdr:expr, $input:expr, $output:expr,
            $expect_in:expr, $expect_out:expr, $expect_res:expr
        ) => {{
            let (res, nin, nout) = $rdr.read_field($input, $output);
            assert_eq!($expect_in, nin);
            assert_eq!($expect_out, nout);
            assert_eq!($expect_res, res);
        }};
    }
1741
1742    // This tests that feeding a new reader with an empty buffer sends us
1743    // straight to End.
1744    #[test]
1745    fn stream_empty() {
1746        use crate::ReadFieldResult::*;
1747
1748        let mut rdr = Reader::new();
1749        assert_read!(rdr, &[], &mut [], 0, 0, End);
1750    }
1751
1752    // Test that a single space is treated as a single field.
1753    #[test]
1754    fn stream_space() {
1755        use crate::ReadFieldResult::*;
1756
1757        let mut rdr = Reader::new();
1758        assert_read!(rdr, b(" "), &mut [0], 1, 1, InputEmpty);
1759        assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
1760        assert_read!(rdr, &[], &mut [0], 0, 0, End);
1761    }
1762
1763    // Test that a single comma ...
1764    #[test]
1765    fn stream_comma() {
1766        use crate::ReadFieldResult::*;
1767
1768        let mut rdr = Reader::new();
1769        assert_read!(rdr, b(","), &mut [0], 1, 0, Field { record_end: false });
1770        assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
1771        assert_read!(rdr, &[], &mut [0], 0, 0, End);
1772    }
1773
1774    // Test that we can read a single large field in multiple output
1775    // buffers.
1776    #[test]
1777    fn stream_output_chunks() {
1778        use crate::ReadFieldResult::*;
1779
1780        let mut inp = b("fooquux");
1781        let out = &mut [0; 2];
1782        let mut rdr = Reader::new();
1783
1784        assert_read!(rdr, inp, out, 2, 2, OutputFull);
1785        assert_eq!(out, b("fo"));
1786        inp = &inp[2..];
1787
1788        assert_read!(rdr, inp, out, 2, 2, OutputFull);
1789        assert_eq!(out, b("oq"));
1790        inp = &inp[2..];
1791
1792        assert_read!(rdr, inp, out, 2, 2, OutputFull);
1793        assert_eq!(out, b("uu"));
1794        inp = &inp[2..];
1795
1796        assert_read!(rdr, inp, out, 1, 1, InputEmpty);
1797        assert_eq!(&out[..1], b("x"));
1798        inp = &inp[1..];
1799        assert!(inp.is_empty());
1800
1801        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
1802        assert_read!(rdr, inp, out, 0, 0, End);
1803    }
1804
1805    // Test that we can read a single large field across multiple input
1806    // buffers.
1807    #[test]
1808    fn stream_input_chunks() {
1809        use crate::ReadFieldResult::*;
1810
1811        let out = &mut [0; 10];
1812        let mut rdr = Reader::new();
1813
1814        assert_read!(rdr, b("fo"), out, 2, 2, InputEmpty);
1815        assert_eq!(&out[..2], b("fo"));
1816
1817        assert_read!(rdr, b("oq"), &mut out[2..], 2, 2, InputEmpty);
1818        assert_eq!(&out[..4], b("fooq"));
1819
1820        assert_read!(rdr, b("uu"), &mut out[4..], 2, 2, InputEmpty);
1821        assert_eq!(&out[..6], b("fooquu"));
1822
1823        assert_read!(rdr, b("x"), &mut out[6..], 1, 1, InputEmpty);
1824        assert_eq!(&out[..7], b("fooquux"));
1825
1826        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
1827        assert_read!(rdr, &[], out, 0, 0, End);
1828    }
1829
1830    // Test we can read doubled quotes correctly in a stream.
1831    #[test]
1832    fn stream_doubled_quotes() {
1833        use crate::ReadFieldResult::*;
1834
1835        let out = &mut [0; 10];
1836        let mut rdr = Reader::new();
1837
1838        assert_read!(rdr, b("\"fo\""), out, 4, 2, InputEmpty);
1839        assert_eq!(&out[..2], b("fo"));
1840
1841        assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
1842        assert_eq!(&out[..4], b("fo\"o"));
1843
1844        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
1845        assert_read!(rdr, &[], out, 0, 0, End);
1846    }
1847
1848    // Test we can read escaped quotes correctly in a stream.
1849    #[test]
1850    fn stream_escaped_quotes() {
1851        use crate::ReadFieldResult::*;
1852
1853        let out = &mut [0; 10];
1854        let mut builder = ReaderBuilder::new();
1855        let mut rdr = builder.escape(Some(b'\\')).build();
1856
1857        assert_read!(rdr, b("\"fo\\"), out, 4, 2, InputEmpty);
1858        assert_eq!(&out[..2], b("fo"));
1859
1860        assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
1861        assert_eq!(&out[..4], b("fo\"o"));
1862
1863        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
1864        assert_read!(rdr, &[], out, 0, 0, End);
1865    }
1866
1867    // Test that empty output buffers don't wreak havoc.
1868    #[test]
1869    fn stream_empty_output() {
1870        use crate::ReadFieldResult::*;
1871
1872        let out = &mut [0; 10];
1873        let mut rdr = Reader::new();
1874
1875        assert_read!(
1876            rdr,
1877            b("foo,bar"),
1878            out,
1879            4,
1880            3,
1881            Field { record_end: false }
1882        );
1883        assert_eq!(&out[..3], b("foo"));
1884
1885        assert_read!(rdr, b("bar"), &mut [], 0, 0, OutputFull);
1886
1887        assert_read!(rdr, b("bar"), out, 3, 3, InputEmpty);
1888        assert_eq!(&out[..3], b("bar"));
1889
1890        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
1891        assert_read!(rdr, &[], out, 0, 0, End);
1892    }
1893
    // Test that we can reset the parser mid-stream and count on it to do
    // the right thing.
    #[test]
    fn reset_works() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        // Start an (unterminated) quoted field.
        assert_read!(rdr, b("\"foo"), out, 4, 3, InputEmpty);
        assert_eq!(&out[..3], b("foo"));

        // Without resetting the parser state, the reader will remember that
        // we're in a quoted field, and therefore interpret the leading double
        // quotes below as a single quote and the trailing quote as a matching
        // terminator. With the reset, however, the parser forgets the quoted
        // field and treats the leading double quotes as a syntax quirk and
        // drops them, in addition to hanging on to the trailing unmatched
        // quote. (Matches Python's behavior.)
        rdr.reset();

        assert_read!(rdr, b("\"\"bar\""), out, 6, 4, InputEmpty);
        assert_eq!(&out[..4], b("bar\""));
    }
1918
1919    // Test the line number reporting is correct.
1920    #[test]
1921    fn line_numbers() {
1922        use crate::ReadFieldResult::*;
1923
1924        let out = &mut [0; 10];
1925        let mut rdr = Reader::new();
1926
1927        assert_eq!(1, rdr.line());
1928
1929        assert_read!(rdr, b("\n\n\n\n"), out, 4, 0, InputEmpty);
1930        assert_eq!(5, rdr.line());
1931
1932        assert_read!(rdr, b("foo,"), out, 4, 3, Field { record_end: false });
1933        assert_eq!(5, rdr.line());
1934
1935        assert_read!(rdr, b("bar\n"), out, 4, 3, Field { record_end: true });
1936        assert_eq!(6, rdr.line());
1937
1938        assert_read!(rdr, &[], &mut [0], 0, 0, End);
1939        assert_eq!(6, rdr.line());
1940    }
1941
    // Drives `read_record` once and asserts the result first (so a state
    // mismatch is reported before the counts), then input consumed, output
    // written, and field-end offsets recorded.
    macro_rules! assert_read_record {
        (
            $rdr:expr, $input:expr, $output:expr, $ends:expr,
            $expect_in:expr, $expect_out:expr,
            $expect_end:expr, $expect_res:expr
        ) => {{
            let (res, nin, nout, nend) =
                $rdr.read_record($input, $output, $ends);
            assert_eq!($expect_res, res, "result");
            assert_eq!($expect_in, nin, "input");
            assert_eq!($expect_out, nout, "output");
            assert_eq!($expect_end, nend, "ends");
        }};
    }
1956
1957    // Test that we can incrementally read a record.
1958    #[test]
1959    fn stream_record() {
1960        use crate::ReadRecordResult::*;
1961
1962        let mut inp = b("foo,bar\nbaz");
1963        let out = &mut [0; 1024];
1964        let ends = &mut [0; 10];
1965        let mut rdr = Reader::new();
1966
1967        assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
1968        assert_eq!(ends[0], 3);
1969        assert_eq!(ends[1], 6);
1970        inp = &inp[8..];
1971
1972        assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
1973        inp = &inp[3..];
1974
1975        assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
1976        assert_eq!(ends[0], 3);
1977
1978        assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
1979    }
1980
    // Test that if our output ends are full during the last read that
    // we get an appropriate state returned.
    #[test]
    fn stream_record_last_end_output_full() {
        use crate::ReadRecordResult::*;

        let mut inp = b("foo,bar\nbaz");
        let out = &mut [0; 1024];
        let ends = &mut [0; 10];
        let mut rdr = Reader::new();

        // First record parses normally with two field-end offsets.
        assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
        assert_eq!(ends[0], 3);
        assert_eq!(ends[1], 6);
        inp = &inp[8..];

        // "baz" is consumed but its record is not yet terminated.
        assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
        inp = &inp[3..];

        // An empty `ends` buffer can't record the final field boundary, so
        // the reader reports OutputEndsFull without consuming anything...
        assert_read_record!(rdr, &inp, out, &mut [], 0, 0, 0, OutputEndsFull);
        // ...and retrying with room flushes the final record.
        assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
        assert_eq!(ends[0], 3);

        assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
    }
2006
2007    #[test]
2008    fn reset_input_partial() {
2009        use crate::ReadRecordResult::*;
2010
2011        let inp = b("foo,bar\nbaz");
2012        let out = &mut [0; 1024];
2013        let ends = &mut [0; 10];
2014        let mut rdr = Reader::new();
2015
2016        assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
2017
2018        // Try to read incomplete record.
2019        let (result, _, _, _) = rdr.read_record(&inp[8..], out, ends);
2020        assert_eq!(result, InputEmpty);
2021
2022        rdr.reset();
2023
2024        let inp = b("baz,raz\n");
2025        let (result, _, _, _) = rdr.read_record(inp, out, ends);
2026        assert_eq!(result, Record);
2027        assert_eq!(ends[0], 3);
2028    }
2029}