// bstr/escape_bytes.rs

1/// An iterator of `char` values that represent an escaping of arbitrary bytes.
2///
3/// The lifetime parameter `'a` refers to the lifetime of the bytes being
4/// escaped.
5///
6/// This iterator is created by the
7/// [`ByteSlice::escape_bytes`](crate::ByteSlice::escape_bytes) method.
8#[derive(Clone, Debug)]
9pub struct EscapeBytes<'a> {
10    remaining: &'a [u8],
11    state: EscapeState,
12}
13
14impl<'a> EscapeBytes<'a> {
15    pub(crate) fn new(bytes: &'a [u8]) -> EscapeBytes {
16        EscapeBytes { remaining: bytes, state: EscapeState::Start }
17    }
18}
19
20impl<'a> Iterator for EscapeBytes<'a> {
21    type Item = char;
22
23    #[inline]
24    fn next(&mut self) -> Option<char> {
25        use self::EscapeState::*;
26
27        match self.state {
28            Start => {
29                let byte = match crate::decode_utf8(self.remaining) {
30                    (None, 0) => return None,
31                    // If we see invalid UTF-8 or ASCII, then we always just
32                    // peel one byte off. If it's printable ASCII, we'll pass
33                    // it through as-is below. Otherwise, below, it will get
34                    // escaped in some way.
35                    (None, _) | (Some(_), 1) => {
36                        let byte = self.remaining[0];
37                        self.remaining = &self.remaining[1..];
38                        byte
39                    }
40                    // For any valid UTF-8 that is not ASCII, we pass it
41                    // through as-is. We don't do any Unicode escaping.
42                    (Some(ch), size) => {
43                        self.remaining = &self.remaining[size..];
44                        return Some(ch);
45                    }
46                };
47                self.state = match byte {
48                    0x21..=0x5B | 0x5D..=0x7E => {
49                        return Some(char::from(byte))
50                    }
51                    b'\0' => SpecialEscape('0'),
52                    b'\n' => SpecialEscape('n'),
53                    b'\r' => SpecialEscape('r'),
54                    b'\t' => SpecialEscape('t'),
55                    b'\\' => SpecialEscape('\\'),
56                    _ => HexEscapeX(byte),
57                };
58                Some('\\')
59            }
60            SpecialEscape(ch) => {
61                self.state = Start;
62                Some(ch)
63            }
64            HexEscapeX(byte) => {
65                self.state = HexEscapeHighNybble(byte);
66                Some('x')
67            }
68            HexEscapeHighNybble(byte) => {
69                self.state = HexEscapeLowNybble(byte);
70                let nybble = byte >> 4;
71                Some(hexdigit_to_char(nybble))
72            }
73            HexEscapeLowNybble(byte) => {
74                self.state = Start;
75                let nybble = byte & 0xF;
76                Some(hexdigit_to_char(nybble))
77            }
78        }
79    }
80}
81
82impl<'a> core::fmt::Display for EscapeBytes<'a> {
83    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
84        use core::fmt::Write;
85        for ch in self.clone() {
86            f.write_char(ch)?;
87        }
88        Ok(())
89    }
90}
91
/// The states of the finite state machine that drives the escaping
/// iterator.
#[derive(Clone, Debug)]
enum EscapeState {
    /// Take the next item off the front of 'remaining'. If 'remaining' is
    /// empty, iteration is over and None is returned.
    ///
    /// A byte in [\x21-\x5B\x5D-\x7E] is emitted as-is. For \n, \r, \t, \\
    /// and \0, a '\' is emitted and the state becomes
    /// 'SpecialEscape(n | r | t | \ | 0)'. For any other byte, a '\' is
    /// emitted and the state becomes 'HexEscapeX(byte)'.
    Start,
    /// A '\' has just been emitted. Emit the given codepoint as-is and go
    /// back to 'Start'.
    SpecialEscape(char),
    /// A '\' has just been emitted. Emit the 'x' of a hex escape and move
    /// on to 'HexEscapeHighNybble(byte)'.
    HexEscapeX(u8),
    /// '\x' has just been emitted. Emit the high nybble of the byte as a
    /// hexadecimal digit and move on to 'HexEscapeLowNybble(byte)'.
    HexEscapeHighNybble(u8),
    /// '\xZ' has just been emitted, where 'Z' is the high nybble of this
    /// byte. Emit the low nybble as a hexadecimal digit and go back to
    /// 'Start'.
    HexEscapeLowNybble(u8),
}

/// A lazy iterator yielding the `u8` values obtained by unescaping a
/// sequence of codepoints.
///
/// The type parameter `I` is the iterator of codepoints being unescaped.
///
/// This iterator is currently crate-private: the only thing exposed is a
/// `ByteVec::unescape` method, which needs an allocator. That is the most
/// convenient form, but in principle this type could also be exposed for
/// core-only use cases. It just isn't clear yet what that API should look
/// like.
#[cfg(feature = "alloc")]
#[derive(Clone, Debug)]
pub(crate) struct UnescapeBytes<I> {
    /// The source of codepoints that still have to be unescaped.
    it: I,
    /// The current position in the unescaping state machine.
    state: UnescapeState,
}

#[cfg(feature = "alloc")]
impl<I: Iterator<Item = char>> UnescapeBytes<I> {
    /// Builds an unescaping iterator from anything that can be turned into
    /// an iterator of type `I`, starting in the `Start` state.
    pub(crate) fn new<T>(t: T) -> UnescapeBytes<I>
    where
        T: IntoIterator<IntoIter = I>,
    {
        let it = t.into_iter();
        UnescapeBytes { it, state: UnescapeState::Start }
    }
}

#[cfg(feature = "alloc")]
impl<I: Iterator<Item = char>> Iterator for UnescapeBytes<I> {
    type Item = u8;

    /// Produces the next unescaped byte. Returns `None` when the state
    /// machine is in `Start` and the codepoint iterator has nothing left.
    ///
    /// Escape sequences that are cut short or malformed are not errors:
    /// they unescape as themselves.
    fn next(&mut self) -> Option<u8> {
        use self::UnescapeState::*;

        // Each pass either returns a byte or only advances the state
        // machine, in which case we go around again.
        loop {
            match self.state {
                Start => {
                    self.state = match self.it.next()? {
                        '\\' => Escape,
                        // Anything else is queued up as its UTF-8 encoding.
                        literal => UnescapeState::bytes(&[], literal),
                    };
                }
                Bytes { buf, cur, len } => {
                    let byte = buf[cur];
                    let following = cur + 1;
                    self.state = if following < len {
                        Bytes { buf, cur: following, len }
                    } else {
                        Start
                    };
                    return Some(byte);
                }
                Escape => match self.it.next() {
                    // A trailing lone backslash unescapes as itself.
                    None => {
                        self.state = Start;
                        return Some(b'\\');
                    }
                    Some('x') => self.state = HexFirst,
                    Some(ch) => {
                        let unescaped = match ch {
                            '0' => b'\0',
                            '\\' => b'\\',
                            'r' => b'\r',
                            'n' => b'\n',
                            't' => b'\t',
                            _ => {
                                // An unknown escape unescapes as itself:
                                // queue the backslash followed by `ch`.
                                self.state =
                                    UnescapeState::bytes(&[b'\\'], ch);
                                continue;
                            }
                        };
                        self.state = Start;
                        return Some(unescaped);
                    }
                },
                HexFirst => match self.it.next() {
                    Some(digit) if digit.is_ascii_hexdigit() => {
                        self.state = HexSecond(digit);
                    }
                    // `\x` followed by a non-hex codepoint: emit the
                    // backslash now and queue `x` plus that codepoint.
                    Some(other) => {
                        self.state = UnescapeState::bytes(&[b'x'], other);
                        return Some(b'\\');
                    }
                    // `\x` at the very end: emit the backslash now and
                    // queue the `x`.
                    None => {
                        self.state = UnescapeState::bytes_raw(&[b'x']);
                        return Some(b'\\');
                    }
                },
                HexSecond(hi) => {
                    let pending = match self.it.next() {
                        Some(lo) if lo.is_ascii_hexdigit() => {
                            self.state = Start;
                            let hi_nybble = char_to_hexdigit(hi);
                            let lo_nybble = char_to_hexdigit(lo);
                            return Some((hi_nybble << 4) | lo_nybble);
                        }
                        // `\xN` followed by a non-hex codepoint: queue `x`,
                        // the first digit and the offending codepoint.
                        Some(other) => {
                            UnescapeState::bytes2(&[b'x'], hi, other)
                        }
                        // `\xN` at the very end: queue `x` and the digit.
                        None => UnescapeState::bytes(&[b'x'], hi),
                    };
                    // In both failure cases the backslash goes out first.
                    self.state = pending;
                    return Some(b'\\');
                }
            }
        }
    }
}

/// The states of the finite state machine that drives the unescaping
/// iterator.
#[cfg(feature = "alloc")]
#[derive(Clone, Debug)]
enum UnescapeState {
    /// The initial state: look for the beginning of an escape sequence, or
    /// else emit the next codepoint as-is.
    Start,
    /// Emit the byte at `buf[cur]`.
    ///
    /// Whenever this state is visited, `cur < len` is assumed to hold. It
    /// must therefore never be constructed with `cur >= len`.
    Bytes { buf: [u8; 11], cur: usize, len: usize },
    /// A `\` has just been seen.
    Escape,
    /// A `\x` has just been seen.
    HexFirst,
    /// A `\xN` has just been seen, where `N` is in `[0-9A-Fa-f]`. The
    /// payload is the codepoint `N`.
    HexSecond(char),
}

#[cfg(feature = "alloc")]
impl UnescapeState {
    /// Builds a `Bytes` state that will emit exactly the given bytes.
    ///
    /// # Panics
    ///
    /// Panics if `bytes.len() > 11`.
    fn bytes_raw(bytes: &[u8]) -> UnescapeState {
        let len = bytes.len();
        // The limit can be raised, provided 'buf' in the 'Bytes' state is
        // grown to match.
        assert!(len <= 11, "no more than 11 bytes allowed");
        let mut buf = [0u8; 11];
        buf[..len].copy_from_slice(bytes);
        UnescapeState::Bytes { buf, cur: 0, len }
    }

    /// Builds a `Bytes` state that will emit `prefix` followed by the UTF-8
    /// encoding of `ch`.
    ///
    /// # Panics
    ///
    /// Panics if `prefix.len() > 3`.
    fn bytes(prefix: &[u8], ch: char) -> UnescapeState {
        let prefix_len = prefix.len();
        // The limit can be raised, provided 'buf' in the 'Bytes' state is
        // grown to match.
        assert!(prefix_len <= 3, "no more than 3 bytes allowed");
        let mut buf = [0u8; 11];
        buf[..prefix_len].copy_from_slice(prefix);
        let encoded_len = ch.encode_utf8(&mut buf[prefix_len..]).len();
        UnescapeState::Bytes { buf, cur: 0, len: prefix_len + encoded_len }
    }

    /// Builds a `Bytes` state that will emit `prefix` followed by the UTF-8
    /// encoding of `ch1` and then that of `ch2`.
    ///
    /// # Panics
    ///
    /// Panics if `prefix.len() > 3`.
    fn bytes2(prefix: &[u8], ch1: char, ch2: char) -> UnescapeState {
        let first_at = prefix.len();
        // The limit can be raised, provided 'buf' in the 'Bytes' state is
        // grown to match.
        assert!(first_at <= 3, "no more than 3 bytes allowed");
        let mut buf = [0u8; 11];
        buf[..first_at].copy_from_slice(prefix);
        // Each char is written directly after whatever precedes it.
        let second_at = first_at + ch1.encode_utf8(&mut buf[first_at..]).len();
        let end = second_at + ch2.encode_utf8(&mut buf[second_at..]).len();
        UnescapeState::Bytes { buf, cur: 0, len: end }
    }
}

/// Returns the numeric value (0 through 15) of the given hexadecimal digit
/// codepoint.
///
/// # Panics
///
/// This panics if `ch` is not in `[0-9A-Fa-f]`.
#[cfg(feature = "alloc")]
fn char_to_hexdigit(ch: char) -> u8 {
    // `to_digit` yields `None` for anything that is not a base-16 digit,
    // which is where the documented panic comes from.
    let value: u32 = ch.to_digit(16).unwrap();
    // A base-16 digit is at most 15, so the narrowing cannot fail.
    u8::try_from(value).unwrap()
}

/// Returns the uppercase hexadecimal codepoint (`0`-`9`, `A`-`F`) for the
/// given digit value.
///
/// # Panics
///
/// This panics when `digit > 15`.
fn hexdigit_to_char(digit: u8) -> char {
    // `from_digit` produces lowercase letters for 10 through 15 and `None`
    // for anything that is not a valid base-16 digit.
    let lowercase = char::from_digit(u32::from(digit), 16).unwrap();
    lowercase.to_ascii_uppercase()
}

#[cfg(all(test, feature = "std"))]
mod tests {
    use crate::BString;

    use super::*;

    /// Shorthand for building an owned byte string from anything byte-like.
    // The upper-case name keeps this helper visually distinct in assertions.
    #[allow(non_snake_case)]
    fn B<T: AsRef<[u8]>>(bytes: T) -> BString {
        BString::from(bytes.as_ref())
    }

    /// Escapes `bytes` and collects the output into a `String`.
    fn e<T: AsRef<[u8]>>(bytes: T) -> String {
        let escaped = EscapeBytes::new(bytes.as_ref());
        escaped.to_string()
    }

    /// Unescapes `string` and collects the output into a byte string.
    fn u(string: &str) -> BString {
        UnescapeBytes::new(string.chars()).collect::<BString>()
    }

    /// Asserts that escaping `input` produces exactly `expected`.
    #[track_caller]
    fn assert_escapes_to<T: AsRef<[u8]>>(expected: &str, input: T) {
        assert_eq!(expected, e(input));
    }

    /// Asserts that unescaping `input` produces exactly `expected`.
    #[track_caller]
    fn assert_unescapes_to<T: AsRef<[u8]>>(expected: T, input: &str) {
        assert_eq!(B(expected), u(input));
    }

    #[test]
    fn escape() {
        // Printable ASCII and the boundaries of the pass-through range.
        assert_escapes_to(r"a", br"a");
        assert_escapes_to(r"\\x61", br"\x61");
        assert_escapes_to(r"a", b"\x61");
        assert_escapes_to(r"~", b"\x7E");
        assert_escapes_to(r"\x7F", b"\x7F");

        // Bytes with a dedicated short escape.
        assert_escapes_to(r"\n", b"\n");
        assert_escapes_to(r"\r", b"\r");
        assert_escapes_to(r"\t", b"\t");
        assert_escapes_to(r"\\", b"\\");
        assert_escapes_to(r"\0", b"\0");
        assert_escapes_to(r"\0", b"\x00");

        // Bytes that are never valid UTF-8 on their own.
        assert_escapes_to(r"\x88", b"\x88");
        assert_escapes_to(r"\x8F", b"\x8F");
        assert_escapes_to(r"\xF8", b"\xF8");
        assert_escapes_to(r"\xFF", b"\xFF");

        // Prefixes of a three byte sequence, then the complete sequence.
        assert_escapes_to(r"\xE2", b"\xE2");
        assert_escapes_to(r"\xE2\x98", b"\xE2\x98");
        assert_escapes_to(r"☃", b"\xE2\x98\x83");

        // Prefixes of a four byte sequence, then the complete sequence.
        assert_escapes_to(r"\xF0", b"\xF0");
        assert_escapes_to(r"\xF0\x9F", b"\xF0\x9F");
        assert_escapes_to(r"\xF0\x9F\x92", b"\xF0\x9F\x92");
        assert_escapes_to(r"💩", b"\xF0\x9F\x92\xA9");
    }

    #[test]
    fn unescape() {
        // Literal text, an escaped backslash and hex escapes for ASCII.
        assert_unescapes_to(r"a", r"a");
        assert_unescapes_to(r"\x61", r"\\x61");
        assert_unescapes_to(r"a", r"\x61");
        assert_unescapes_to(r"~", r"\x7E");
        assert_unescapes_to(b"\x7F", r"\x7F");

        // The dedicated short escapes.
        assert_unescapes_to(b"\n", r"\n");
        assert_unescapes_to(b"\r", r"\r");
        assert_unescapes_to(b"\t", r"\t");
        assert_unescapes_to(b"\\", r"\\");
        assert_unescapes_to(b"\0", r"\0");
        assert_unescapes_to(b"\0", r"\x00");

        // Hex escapes for bytes that are not valid UTF-8 on their own.
        assert_unescapes_to(b"\x88", r"\x88");
        assert_unescapes_to(b"\x8F", r"\x8F");
        assert_unescapes_to(b"\xF8", r"\xF8");
        assert_unescapes_to(b"\xFF", r"\xFF");

        // Hex escapes that add up to a three byte sequence.
        assert_unescapes_to(b"\xE2", r"\xE2");
        assert_unescapes_to(b"\xE2\x98", r"\xE2\x98");
        assert_unescapes_to("☃", r"\xE2\x98\x83");

        // Lowercase hex digits, adding up to a four byte sequence.
        assert_unescapes_to(b"\xF0", r"\xf0");
        assert_unescapes_to(b"\xF0\x9F", r"\xf0\x9f");
        assert_unescapes_to(b"\xF0\x9F\x92", r"\xf0\x9f\x92");
        assert_unescapes_to("💩", r"\xf0\x9f\x92\xa9");
    }

    #[test]
    fn unescape_weird() {
        // Escape sequences that are cut short.
        assert_unescapes_to(b"\\", r"\");
        assert_unescapes_to(b"\\", r"\\");
        assert_unescapes_to(b"\\x", r"\x");
        assert_unescapes_to(b"\\xA", r"\xA");

        // Escape sequences that are not recognized.
        assert_unescapes_to(b"\\xZ", r"\xZ");
        assert_unescapes_to(b"\\xZZ", r"\xZZ");
        assert_unescapes_to(b"\\i", r"\i");
        assert_unescapes_to(b"\\u", r"\u");
        assert_unescapes_to(b"\\u{2603}", r"\u{2603}");
    }
}