regex/
utf8.rs

1/// A few elementary UTF-8 encoding and decoding functions used by the matching
2/// engines.
3///
4/// In an ideal world, the matching engines operate on `&str` and we can just
5/// lean on the standard library for all our UTF-8 needs. However, to support
6/// byte based regexes (that can match on arbitrary bytes which may contain
7/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`.
8/// The standard library doesn't really recognize this use case, so we have
9/// to build it out ourselves.
10///
11/// Should this be factored out into a separate crate? It seems independently
12/// useful. There are other crates that already exist (e.g., `utf-8`) that have
13/// overlapping use cases. Not sure what to do.
14use std::char;
15
16const TAG_CONT: u8 = 0b1000_0000;
17const TAG_TWO: u8 = 0b1100_0000;
18const TAG_THREE: u8 = 0b1110_0000;
19const TAG_FOUR: u8 = 0b1111_0000;
20
21/// Returns the smallest possible index of the next valid UTF-8 sequence
22/// starting after `i`.
23pub fn next_utf8(text: &[u8], i: usize) -> usize {
24    let b = match text.get(i) {
25        None => return i + 1,
26        Some(&b) => b,
27    };
28    let inc = if b <= 0x7F {
29        1
30    } else if b <= 0b110_11111 {
31        2
32    } else if b <= 0b1110_1111 {
33        3
34    } else {
35        4
36    };
37    i + inc
38}
39
40/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
41///
42/// If no valid UTF-8 sequence could be found, then `None` is returned.
43/// Otherwise, the decoded codepoint and the number of bytes read is returned.
44/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be
45/// 1, 2, 3 or 4.
46///
47/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a
48/// codepoint that is out of range (surrogate codepoints are out of range) or
49/// is not the shortest possible UTF-8 sequence for that codepoint.
50#[inline]
51pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
52    let b0 = match src.get(0) {
53        None => return None,
54        Some(&b) if b <= 0x7F => return Some((b as char, 1)),
55        Some(&b) => b,
56    };
57    match b0 {
58        0b110_00000..=0b110_11111 => {
59            if src.len() < 2 {
60                return None;
61            }
62            let b1 = src[1];
63            if 0b11_000000 & b1 != TAG_CONT {
64                return None;
65            }
66            let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32);
67            match cp {
68                0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
69                _ => None,
70            }
71        }
72        0b1110_0000..=0b1110_1111 => {
73            if src.len() < 3 {
74                return None;
75            }
76            let (b1, b2) = (src[1], src[2]);
77            if 0b11_000000 & b1 != TAG_CONT {
78                return None;
79            }
80            if 0b11_000000 & b2 != TAG_CONT {
81                return None;
82            }
83            let cp = ((b0 & !TAG_THREE) as u32) << 12
84                | ((b1 & !TAG_CONT) as u32) << 6
85                | ((b2 & !TAG_CONT) as u32);
86            match cp {
87                // char::from_u32 will disallow surrogate codepoints.
88                0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
89                _ => None,
90            }
91        }
92        0b11110_000..=0b11110_111 => {
93            if src.len() < 4 {
94                return None;
95            }
96            let (b1, b2, b3) = (src[1], src[2], src[3]);
97            if 0b11_000000 & b1 != TAG_CONT {
98                return None;
99            }
100            if 0b11_000000 & b2 != TAG_CONT {
101                return None;
102            }
103            if 0b11_000000 & b3 != TAG_CONT {
104                return None;
105            }
106            let cp = ((b0 & !TAG_FOUR) as u32) << 18
107                | ((b1 & !TAG_CONT) as u32) << 12
108                | ((b2 & !TAG_CONT) as u32) << 6
109                | ((b3 & !TAG_CONT) as u32);
110            match cp {
111                0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
112                _ => None,
113            }
114        }
115        _ => None,
116    }
117}
118
119/// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead
120/// of the first.
121pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
122    if src.is_empty() {
123        return None;
124    }
125    let mut start = src.len() - 1;
126    if src[start] <= 0x7F {
127        return Some((src[start] as char, 1));
128    }
129    while start > src.len().saturating_sub(4) {
130        start -= 1;
131        if is_start_byte(src[start]) {
132            break;
133        }
134    }
135    match decode_utf8(&src[start..]) {
136        None => None,
137        Some((_, n)) if n < src.len() - start => None,
138        Some((cp, n)) => Some((cp, n)),
139    }
140}
141
142fn is_start_byte(b: u8) -> bool {
143    b & 0b11_000000 != 0b1_0000000
144}
145
146#[cfg(test)]
147mod tests {
148    use std::str;
149
150    use quickcheck::quickcheck;
151
152    use super::{
153        decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO,
154    };
155
156    #[test]
157    fn prop_roundtrip() {
158        fn p(given_cp: char) -> bool {
159            let mut tmp = [0; 4];
160            let encoded_len = given_cp.encode_utf8(&mut tmp).len();
161            let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
162            encoded_len == got_len && given_cp == got_cp
163        }
164        quickcheck(p as fn(char) -> bool)
165    }
166
167    #[test]
168    fn prop_roundtrip_last() {
169        fn p(given_cp: char) -> bool {
170            let mut tmp = [0; 4];
171            let encoded_len = given_cp.encode_utf8(&mut tmp).len();
172            let (got_cp, got_len) =
173                decode_last_utf8(&tmp[..encoded_len]).unwrap();
174            encoded_len == got_len && given_cp == got_cp
175        }
176        quickcheck(p as fn(char) -> bool)
177    }
178
179    #[test]
180    fn prop_encode_matches_std() {
181        fn p(cp: char) -> bool {
182            let mut got = [0; 4];
183            let n = cp.encode_utf8(&mut got).len();
184            let expected = cp.to_string();
185            &got[..n] == expected.as_bytes()
186        }
187        quickcheck(p as fn(char) -> bool)
188    }
189
190    #[test]
191    fn prop_decode_matches_std() {
192        fn p(given_cp: char) -> bool {
193            let mut tmp = [0; 4];
194            let n = given_cp.encode_utf8(&mut tmp).len();
195            let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
196            let expected_cp =
197                str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
198            got_cp == expected_cp
199        }
200        quickcheck(p as fn(char) -> bool)
201    }
202
203    #[test]
204    fn prop_decode_last_matches_std() {
205        fn p(given_cp: char) -> bool {
206            let mut tmp = [0; 4];
207            let n = given_cp.encode_utf8(&mut tmp).len();
208            let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
209            let expected_cp = str::from_utf8(&tmp[..n])
210                .unwrap()
211                .chars()
212                .rev()
213                .next()
214                .unwrap();
215            got_cp == expected_cp
216        }
217        quickcheck(p as fn(char) -> bool)
218    }
219
220    #[test]
221    fn reject_invalid() {
222        // Invalid start byte
223        assert_eq!(decode_utf8(&[0xFF]), None);
224        // Surrogate pair
225        assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
226        // Invalid continuation byte.
227        assert_eq!(decode_utf8(&[0xD4, 0xC2]), None);
228        // Bad lengths
229        assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
230        assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
231        assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
232                                                            // Not a minimal UTF-8 sequence
233        assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
234        assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None);
235        assert_eq!(
236            decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]),
237            None
238        );
239    }
240
241    #[test]
242    fn reject_invalid_last() {
243        // Invalid start byte
244        assert_eq!(decode_last_utf8(&[0xFF]), None);
245        // Surrogate pair
246        assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None);
247        // Bad lengths
248        assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes
249        assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes
250        assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
251                                                                 // Not a minimal UTF-8 sequence
252        assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
253        assert_eq!(
254            decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]),
255            None
256        );
257        assert_eq!(
258            decode_last_utf8(
259                &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]
260            ),
261            None
262        );
263    }
264}