regex/
expand.rs

1use std::str;
2
3use crate::find_byte::find_byte;
4
5use crate::re_bytes;
6use crate::re_unicode;
7
8pub fn expand_str(
9    caps: &re_unicode::Captures<'_>,
10    mut replacement: &str,
11    dst: &mut String,
12) {
13    while !replacement.is_empty() {
14        match find_byte(b'$', replacement.as_bytes()) {
15            None => break,
16            Some(i) => {
17                dst.push_str(&replacement[..i]);
18                replacement = &replacement[i..];
19            }
20        }
21        if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
22            dst.push_str("$");
23            replacement = &replacement[2..];
24            continue;
25        }
26        debug_assert!(!replacement.is_empty());
27        let cap_ref = match find_cap_ref(replacement.as_bytes()) {
28            Some(cap_ref) => cap_ref,
29            None => {
30                dst.push_str("$");
31                replacement = &replacement[1..];
32                continue;
33            }
34        };
35        replacement = &replacement[cap_ref.end..];
36        match cap_ref.cap {
37            Ref::Number(i) => {
38                dst.push_str(caps.get(i).map(|m| m.as_str()).unwrap_or(""));
39            }
40            Ref::Named(name) => {
41                dst.push_str(
42                    caps.name(name).map(|m| m.as_str()).unwrap_or(""),
43                );
44            }
45        }
46    }
47    dst.push_str(replacement);
48}
49
50pub fn expand_bytes(
51    caps: &re_bytes::Captures<'_>,
52    mut replacement: &[u8],
53    dst: &mut Vec<u8>,
54) {
55    while !replacement.is_empty() {
56        match find_byte(b'$', replacement) {
57            None => break,
58            Some(i) => {
59                dst.extend(&replacement[..i]);
60                replacement = &replacement[i..];
61            }
62        }
63        if replacement.get(1).map_or(false, |&b| b == b'$') {
64            dst.push(b'$');
65            replacement = &replacement[2..];
66            continue;
67        }
68        debug_assert!(!replacement.is_empty());
69        let cap_ref = match find_cap_ref(replacement) {
70            Some(cap_ref) => cap_ref,
71            None => {
72                dst.push(b'$');
73                replacement = &replacement[1..];
74                continue;
75            }
76        };
77        replacement = &replacement[cap_ref.end..];
78        match cap_ref.cap {
79            Ref::Number(i) => {
80                dst.extend(caps.get(i).map(|m| m.as_bytes()).unwrap_or(b""));
81            }
82            Ref::Named(name) => {
83                dst.extend(
84                    caps.name(name).map(|m| m.as_bytes()).unwrap_or(b""),
85                );
86            }
87        }
88    }
89    dst.extend(replacement);
90}
91
92/// `CaptureRef` represents a reference to a capture group inside some text.
93/// The reference is either a capture group name or a number.
94///
95/// It is also tagged with the position in the text following the
96/// capture reference.
97#[derive(Clone, Copy, Debug, Eq, PartialEq)]
98struct CaptureRef<'a> {
99    cap: Ref<'a>,
100    end: usize,
101}
102
/// A reference to a capture group in some text.
///
/// e.g., `$2`, `$foo`, `${foo}`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
    /// A reference by group name; the name borrows from the parsed text.
    Named(&'a str),
    /// A reference by group index.
    Number(usize),
}
111
112impl<'a> From<&'a str> for Ref<'a> {
113    fn from(x: &'a str) -> Ref<'a> {
114        Ref::Named(x)
115    }
116}
117
118impl From<usize> for Ref<'static> {
119    fn from(x: usize) -> Ref<'static> {
120        Ref::Number(x)
121    }
122}
123
124/// Parses a possible reference to a capture group name in the given text,
125/// starting at the beginning of `replacement`.
126///
127/// If no such valid reference could be found, None is returned.
128fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
129    let mut i = 0;
130    let rep: &[u8] = replacement;
131    if rep.len() <= 1 || rep[0] != b'$' {
132        return None;
133    }
134    i += 1;
135    if rep[i] == b'{' {
136        return find_cap_ref_braced(rep, i + 1);
137    }
138    let mut cap_end = i;
139    while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
140        cap_end += 1;
141    }
142    if cap_end == i {
143        return None;
144    }
145    // We just verified that the range 0..cap_end is valid ASCII, so it must
146    // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
147    // check via an unchecked conversion or by parsing the number straight from
148    // &[u8].
149    let cap =
150        str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
151    Some(CaptureRef {
152        cap: match cap.parse::<u32>() {
153            Ok(i) => Ref::Number(i as usize),
154            Err(_) => Ref::Named(cap),
155        },
156        end: cap_end,
157    })
158}
159
160fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
161    let start = i;
162    while rep.get(i).map_or(false, |&b| b != b'}') {
163        i += 1;
164    }
165    if !rep.get(i).map_or(false, |&b| b == b'}') {
166        return None;
167    }
168    // When looking at braced names, we don't put any restrictions on the name,
169    // so it's possible it could be invalid UTF-8. But a capture group name
170    // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
171    // safely return None.
172    let cap = match str::from_utf8(&rep[start..i]) {
173        Err(_) => return None,
174        Ok(cap) => cap,
175    };
176    Some(CaptureRef {
177        cap: match cap.parse::<u32>() {
178            Ok(i) => Ref::Number(i as usize),
179            Err(_) => Ref::Named(cap),
180        },
181        end: i + 1,
182    })
183}
184
/// Returns true if and only if the given byte is allowed in a capture name.
///
/// The allowed bytes are the ASCII letters, the ASCII digits and `_`.
fn is_valid_cap_letter(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}
192
#[cfg(test)]
mod tests {
    use super::{find_cap_ref, CaptureRef};

    // Generates one `#[test]` function per invocation. The two-argument form
    // asserts that `find_cap_ref` rejects `$text`; the three-argument form
    // asserts that it returns exactly `$capref`.
    macro_rules! find {
        ($name:ident, $text:expr) => {
            #[test]
            fn $name() {
                assert_eq!(None, find_cap_ref($text.as_bytes()));
            }
        };
        ($name:ident, $text:expr, $capref:expr) => {
            #[test]
            fn $name() {
                assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
            }
        };
    }

    // Builds the expected `CaptureRef`. The first argument is a group name
    // or index, converted through the `From` impls on `Ref`; the second is
    // the expected `end` offset.
    macro_rules! c {
        ($name_or_number:expr, $pos:expr) => {
            CaptureRef { cap: $name_or_number.into(), end: $pos }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    // See https://github.com/rust-lang/regex/pull/585
    // for more on characters following numbers
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
    find!(find_cap_ref18, "${#}", c!("#", 4));
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
}
236    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
237    find!(find_cap_ref18, "${#}", c!("#", 4));
238    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
239}