1use std::mem;
2
3use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
4use memchr::{memchr, memchr2, memchr3, memmem};
5use regex_syntax::hir::literal::{Literal, Literals};
6
7#[derive(Clone, Debug)]
13pub struct LiteralSearcher {
14 complete: bool,
15 lcp: Memmem,
16 lcs: Memmem,
17 matcher: Matcher,
18}
19
20#[derive(Clone, Debug)]
21enum Matcher {
22 Empty,
24 Bytes(SingleByteSet),
26 Memmem(Memmem),
28 AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
30 Packed { s: packed::Searcher, lits: Vec<Literal> },
37}
38
39impl LiteralSearcher {
40 pub fn empty() -> Self {
42 Self::new(Literals::empty(), Matcher::Empty)
43 }
44
45 pub fn prefixes(lits: Literals) -> Self {
47 let matcher = Matcher::prefixes(&lits);
48 Self::new(lits, matcher)
49 }
50
51 pub fn suffixes(lits: Literals) -> Self {
53 let matcher = Matcher::suffixes(&lits);
54 Self::new(lits, matcher)
55 }
56
57 fn new(lits: Literals, matcher: Matcher) -> Self {
58 let complete = lits.all_complete();
59 LiteralSearcher {
60 complete,
61 lcp: Memmem::new(lits.longest_common_prefix()),
62 lcs: Memmem::new(lits.longest_common_suffix()),
63 matcher,
64 }
65 }
66
67 pub fn complete(&self) -> bool {
74 self.complete && !self.is_empty()
75 }
76
77 #[cfg_attr(feature = "perf-inline", inline(always))]
79 pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
80 use self::Matcher::*;
81 match self.matcher {
82 Empty => Some((0, 0)),
83 Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
84 Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
85 AC { ref ac, .. } => {
86 ac.find(haystack).map(|m| (m.start(), m.end()))
87 }
88 Packed { ref s, .. } => {
89 s.find(haystack).map(|m| (m.start(), m.end()))
90 }
91 }
92 }
93
94 pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
96 for lit in self.iter() {
97 if lit.len() > haystack.len() {
98 continue;
99 }
100 if lit == &haystack[0..lit.len()] {
101 return Some((0, lit.len()));
102 }
103 }
104 None
105 }
106
107 pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
109 for lit in self.iter() {
110 if lit.len() > haystack.len() {
111 continue;
112 }
113 if lit == &haystack[haystack.len() - lit.len()..] {
114 return Some((haystack.len() - lit.len(), haystack.len()));
115 }
116 }
117 None
118 }
119
120 pub fn iter(&self) -> LiteralIter<'_> {
122 match self.matcher {
123 Matcher::Empty => LiteralIter::Empty,
124 Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
125 Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
126 Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
127 Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
128 }
129 }
130
131 pub fn lcp(&self) -> &Memmem {
133 &self.lcp
134 }
135
136 pub fn lcs(&self) -> &Memmem {
138 &self.lcs
139 }
140
141 pub fn is_empty(&self) -> bool {
143 self.len() == 0
144 }
145
146 pub fn len(&self) -> usize {
148 use self::Matcher::*;
149 match self.matcher {
150 Empty => 0,
151 Bytes(ref sset) => sset.dense.len(),
152 Memmem(_) => 1,
153 AC { ref ac, .. } => ac.pattern_count(),
154 Packed { ref lits, .. } => lits.len(),
155 }
156 }
157
158 pub fn approximate_size(&self) -> usize {
160 use self::Matcher::*;
161 match self.matcher {
162 Empty => 0,
163 Bytes(ref sset) => sset.approximate_size(),
164 Memmem(ref single) => single.approximate_size(),
165 AC { ref ac, .. } => ac.heap_bytes(),
166 Packed { ref s, .. } => s.heap_bytes(),
167 }
168 }
169}
170
171impl Matcher {
172 fn prefixes(lits: &Literals) -> Self {
173 let sset = SingleByteSet::prefixes(lits);
174 Matcher::new(lits, sset)
175 }
176
177 fn suffixes(lits: &Literals) -> Self {
178 let sset = SingleByteSet::suffixes(lits);
179 Matcher::new(lits, sset)
180 }
181
182 fn new(lits: &Literals, sset: SingleByteSet) -> Self {
183 if lits.literals().is_empty() {
184 return Matcher::Empty;
185 }
186 if sset.dense.len() >= 26 {
187 return Matcher::Empty;
194 }
195 if sset.complete {
196 return Matcher::Bytes(sset);
197 }
198 if lits.literals().len() == 1 {
199 return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
200 }
201
202 let pats = lits.literals().to_owned();
203 let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
204 if lits.literals().len() <= 100 && !is_aho_corasick_fast {
205 let mut builder = packed::Config::new()
206 .match_kind(packed::MatchKind::LeftmostFirst)
207 .builder();
208 if let Some(s) = builder.extend(&pats).build() {
209 return Matcher::Packed { s, lits: pats };
210 }
211 }
212 let ac = AhoCorasickBuilder::new()
213 .match_kind(aho_corasick::MatchKind::LeftmostFirst)
214 .dfa(true)
215 .build_with_size::<u32, _, _>(&pats)
216 .unwrap();
217 Matcher::AC { ac, lits: pats }
218 }
219}
220
221#[derive(Debug)]
222pub enum LiteralIter<'a> {
223 Empty,
224 Bytes(&'a [u8]),
225 Single(&'a [u8]),
226 AC(&'a [Literal]),
227 Packed(&'a [Literal]),
228}
229
230impl<'a> Iterator for LiteralIter<'a> {
231 type Item = &'a [u8];
232
233 fn next(&mut self) -> Option<Self::Item> {
234 match *self {
235 LiteralIter::Empty => None,
236 LiteralIter::Bytes(ref mut many) => {
237 if many.is_empty() {
238 None
239 } else {
240 let next = &many[0..1];
241 *many = &many[1..];
242 Some(next)
243 }
244 }
245 LiteralIter::Single(ref mut one) => {
246 if one.is_empty() {
247 None
248 } else {
249 let next = &one[..];
250 *one = &[];
251 Some(next)
252 }
253 }
254 LiteralIter::AC(ref mut lits) => {
255 if lits.is_empty() {
256 None
257 } else {
258 let next = &lits[0];
259 *lits = &lits[1..];
260 Some(&**next)
261 }
262 }
263 LiteralIter::Packed(ref mut lits) => {
264 if lits.is_empty() {
265 None
266 } else {
267 let next = &lits[0];
268 *lits = &lits[1..];
269 Some(&**next)
270 }
271 }
272 }
273 }
274}
275
276#[derive(Clone, Debug)]
277struct SingleByteSet {
278 sparse: Vec<bool>,
279 dense: Vec<u8>,
280 complete: bool,
281 all_ascii: bool,
282}
283
284impl SingleByteSet {
285 fn new() -> SingleByteSet {
286 SingleByteSet {
287 sparse: vec![false; 256],
288 dense: vec![],
289 complete: true,
290 all_ascii: true,
291 }
292 }
293
294 fn prefixes(lits: &Literals) -> SingleByteSet {
295 let mut sset = SingleByteSet::new();
296 for lit in lits.literals() {
297 sset.complete = sset.complete && lit.len() == 1;
298 if let Some(&b) = lit.get(0) {
299 if !sset.sparse[b as usize] {
300 if b > 0x7F {
301 sset.all_ascii = false;
302 }
303 sset.dense.push(b);
304 sset.sparse[b as usize] = true;
305 }
306 }
307 }
308 sset
309 }
310
311 fn suffixes(lits: &Literals) -> SingleByteSet {
312 let mut sset = SingleByteSet::new();
313 for lit in lits.literals() {
314 sset.complete = sset.complete && lit.len() == 1;
315 if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
316 if !sset.sparse[b as usize] {
317 if b > 0x7F {
318 sset.all_ascii = false;
319 }
320 sset.dense.push(b);
321 sset.sparse[b as usize] = true;
322 }
323 }
324 }
325 sset
326 }
327
328 #[cfg_attr(feature = "perf-inline", inline(always))]
330 fn find(&self, text: &[u8]) -> Option<usize> {
331 match self.dense.len() {
332 0 => None,
333 1 => memchr(self.dense[0], text),
334 2 => memchr2(self.dense[0], self.dense[1], text),
335 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
336 _ => self._find(text),
337 }
338 }
339
340 fn _find(&self, haystack: &[u8]) -> Option<usize> {
342 for (i, &b) in haystack.iter().enumerate() {
343 if self.sparse[b as usize] {
344 return Some(i);
345 }
346 }
347 None
348 }
349
350 fn approximate_size(&self) -> usize {
351 (self.dense.len() * mem::size_of::<u8>())
352 + (self.sparse.len() * mem::size_of::<bool>())
353 }
354}
355
356#[derive(Clone, Debug)]
361pub struct Memmem {
362 finder: memmem::Finder<'static>,
363 char_len: usize,
364}
365
366impl Memmem {
367 fn new(pat: &[u8]) -> Memmem {
368 Memmem {
369 finder: memmem::Finder::new(pat).into_owned(),
370 char_len: char_len_lossy(pat),
371 }
372 }
373
374 #[cfg_attr(feature = "perf-inline", inline(always))]
375 pub fn find(&self, haystack: &[u8]) -> Option<usize> {
376 self.finder.find(haystack)
377 }
378
379 #[cfg_attr(feature = "perf-inline", inline(always))]
380 pub fn is_suffix(&self, text: &[u8]) -> bool {
381 if text.len() < self.len() {
382 return false;
383 }
384 &text[text.len() - self.len()..] == self.finder.needle()
385 }
386
387 pub fn len(&self) -> usize {
388 self.finder.needle().len()
389 }
390
391 pub fn char_len(&self) -> usize {
392 self.char_len
393 }
394
395 fn approximate_size(&self) -> usize {
396 self.finder.needle().len() * mem::size_of::<u8>()
397 }
398}
399
400fn char_len_lossy(bytes: &[u8]) -> usize {
401 String::from_utf8_lossy(bytes).chars().count()
402}