form_urlencoded/
lib.rs

1// Copyright 2013-2016 The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! Parser and serializer for the [`application/x-www-form-urlencoded` syntax](
10//! http://url.spec.whatwg.org/#application/x-www-form-urlencoded),
11//! as used by HTML forms.
12//!
13//! Converts between a string (such as an URL’s query string)
14//! and a sequence of (name, value) pairs.
15
16use percent_encoding::{percent_decode, percent_encode_byte};
17use std::borrow::{Borrow, Cow};
18use std::str;
19
20/// Convert a byte string in the `application/x-www-form-urlencoded` syntax
21/// into a iterator of (name, value) pairs.
22///
23/// Use `parse(input.as_bytes())` to parse a `&str` string.
24///
25/// The names and values are percent-decoded. For instance, `%23first=%25try%25` will be
26/// converted to `[("#first", "%try%")]`.
27#[inline]
28pub fn parse(input: &[u8]) -> Parse<'_> {
29    Parse { input }
30}
/// Iterator over the decoded (name, value) pairs of a byte string;
/// this is what `parse()` returns.
///
/// It only stores a borrowed slice, which is why it can be `Copy`.
#[derive(Clone, Copy)]
pub struct Parse<'a> {
    /// The part of the input that has not been split into pairs yet.
    input: &'a [u8],
}
36
37impl<'a> Iterator for Parse<'a> {
38    type Item = (Cow<'a, str>, Cow<'a, str>);
39
40    fn next(&mut self) -> Option<Self::Item> {
41        loop {
42            if self.input.is_empty() {
43                return None;
44            }
45            let mut split2 = self.input.splitn(2, |&b| b == b'&');
46            let sequence = split2.next().unwrap();
47            self.input = split2.next().unwrap_or(&[][..]);
48            if sequence.is_empty() {
49                continue;
50            }
51            let mut split2 = sequence.splitn(2, |&b| b == b'=');
52            let name = split2.next().unwrap();
53            let value = split2.next().unwrap_or(&[][..]);
54            return Some((decode(name), decode(value)));
55        }
56    }
57}
58
59fn decode(input: &[u8]) -> Cow<'_, str> {
60    let replaced = replace_plus(input);
61    decode_utf8_lossy(match percent_decode(&replaced).into() {
62        Cow::Owned(vec) => Cow::Owned(vec),
63        Cow::Borrowed(_) => replaced,
64    })
65}
66
/// Return `input` with every b'+' replaced by b' '.
///
/// Input that contains no b'+' is returned borrowed, without allocating.
fn replace_plus(input: &[u8]) -> Cow<'_, [u8]> {
    if !input.contains(&b'+') {
        return Cow::Borrowed(input);
    }
    let spaced = input
        .iter()
        .map(|&byte| if byte == b'+' { b' ' } else { byte })
        .collect::<Vec<u8>>();
    Cow::Owned(spaced)
}
83
84impl<'a> Parse<'a> {
85    /// Return a new iterator that yields pairs of `String` instead of pairs of `Cow<str>`.
86    pub fn into_owned(self) -> ParseIntoOwned<'a> {
87        ParseIntoOwned { inner: self }
88    }
89}
90
91/// Like `Parse`, but yields pairs of `String` instead of pairs of `Cow<str>`.
92pub struct ParseIntoOwned<'a> {
93    inner: Parse<'a>,
94}
95
96impl<'a> Iterator for ParseIntoOwned<'a> {
97    type Item = (String, String);
98
99    fn next(&mut self) -> Option<Self::Item> {
100        self.inner
101            .next()
102            .map(|(k, v)| (k.into_owned(), v.into_owned()))
103    }
104}
105
106/// The [`application/x-www-form-urlencoded` byte serializer](
107/// https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer).
108///
109/// Return an iterator of `&str` slices.
110pub fn byte_serialize(input: &[u8]) -> ByteSerialize<'_> {
111    ByteSerialize { bytes: input }
112}
113
/// Iterator returned by `byte_serialize()`.
#[derive(Debug)]
pub struct ByteSerialize<'a> {
    /// The input bytes that are still to be serialized.
    bytes: &'a [u8],
}
119
/// Whether `byte` is copied to the output as-is by the byte serializer:
/// ASCII letters, ASCII digits, and `*`, `-`, `.`, `_`.
fn byte_serialized_unchanged(byte: u8) -> bool {
    byte.is_ascii_alphanumeric() || matches!(byte, b'*' | b'-' | b'.' | b'_')
}
123
124impl<'a> Iterator for ByteSerialize<'a> {
125    type Item = &'a str;
126
127    fn next(&mut self) -> Option<&'a str> {
128        if let Some((&first, tail)) = self.bytes.split_first() {
129            if !byte_serialized_unchanged(first) {
130                self.bytes = tail;
131                return Some(if first == b' ' {
132                    "+"
133                } else {
134                    percent_encode_byte(first)
135                });
136            }
137            let position = tail.iter().position(|&b| !byte_serialized_unchanged(b));
138            let (unchanged_slice, remaining) = match position {
139                // 1 for first_byte + i unchanged in tail
140                Some(i) => self.bytes.split_at(1 + i),
141                None => (self.bytes, &[][..]),
142            };
143            self.bytes = remaining;
144            // This unsafe is appropriate because we have already checked these
145            // bytes in byte_serialized_unchanged, which checks for a subset
146            // of UTF-8. So we know these bytes are valid UTF-8, and doing
147            // another UTF-8 check would be wasteful.
148            Some(unsafe { str::from_utf8_unchecked(unchanged_slice) })
149        } else {
150            None
151        }
152    }
153
154    fn size_hint(&self) -> (usize, Option<usize>) {
155        if self.bytes.is_empty() {
156            (0, Some(0))
157        } else {
158            (1, Some(self.bytes.len()))
159        }
160    }
161}
162
163/// The [`application/x-www-form-urlencoded` serializer](
164/// https://url.spec.whatwg.org/#concept-urlencoded-serializer).
165pub struct Serializer<'a, T: Target> {
166    target: Option<T>,
167    start_position: usize,
168    encoding: EncodingOverride<'a>,
169}
170
/// Something a `Serializer` can write into.
pub trait Target {
    /// What `finish` hands back once serialization is done.
    type Finished;

    /// Give mutable access to the `String` that receives the serialized output.
    fn as_mut_string(&mut self) -> &mut String;

    /// Consume the target and produce the final value.
    fn finish(self) -> Self::Finished;
}
176
177impl Target for String {
178    fn as_mut_string(&mut self) -> &mut String {
179        self
180    }
181    fn finish(self) -> Self {
182        self
183    }
184    type Finished = Self;
185}
186
187impl<'a> Target for &'a mut String {
188    fn as_mut_string(&mut self) -> &mut String {
189        &mut **self
190    }
191    fn finish(self) -> Self {
192        self
193    }
194    type Finished = Self;
195}
196
197impl<'a, T: Target> Serializer<'a, T> {
198    /// Create a new `application/x-www-form-urlencoded` serializer for the given target.
199    ///
200    /// If the target is non-empty,
201    /// its content is assumed to already be in `application/x-www-form-urlencoded` syntax.
202    pub fn new(target: T) -> Self {
203        Self::for_suffix(target, 0)
204    }
205
206    /// Create a new `application/x-www-form-urlencoded` serializer
207    /// for a suffix of the given target.
208    ///
209    /// If that suffix is non-empty,
210    /// its content is assumed to already be in `application/x-www-form-urlencoded` syntax.
211    pub fn for_suffix(mut target: T, start_position: usize) -> Self {
212        if target.as_mut_string().len() < start_position {
213            panic!(
214                "invalid length {} for target of length {}",
215                start_position,
216                target.as_mut_string().len()
217            );
218        }
219
220        Serializer {
221            target: Some(target),
222            start_position,
223            encoding: None,
224        }
225    }
226
227    /// Remove any existing name/value pair.
228    ///
229    /// Panics if called after `.finish()`.
230    pub fn clear(&mut self) -> &mut Self {
231        string(&mut self.target).truncate(self.start_position);
232        self
233    }
234
235    /// Set the character encoding to be used for names and values before percent-encoding.
236    pub fn encoding_override(&mut self, new: EncodingOverride<'a>) -> &mut Self {
237        self.encoding = new;
238        self
239    }
240
241    /// Serialize and append a name/value pair.
242    ///
243    /// Panics if called after `.finish()`.
244    pub fn append_pair(&mut self, name: &str, value: &str) -> &mut Self {
245        append_pair(
246            string(&mut self.target),
247            self.start_position,
248            self.encoding,
249            name,
250            value,
251        );
252        self
253    }
254
255    /// Serialize and append a name of parameter without any value.
256    ///
257    /// Panics if called after `.finish()`.
258    pub fn append_key_only(&mut self, name: &str) -> &mut Self {
259        append_key_only(
260            string(&mut self.target),
261            self.start_position,
262            self.encoding,
263            name,
264        );
265        self
266    }
267
268    /// Serialize and append a number of name/value pairs.
269    ///
270    /// This simply calls `append_pair` repeatedly.
271    /// This can be more convenient, so the user doesn’t need to introduce a block
272    /// to limit the scope of `Serializer`’s borrow of its string.
273    ///
274    /// Panics if called after `.finish()`.
275    pub fn extend_pairs<I, K, V>(&mut self, iter: I) -> &mut Self
276    where
277        I: IntoIterator,
278        I::Item: Borrow<(K, V)>,
279        K: AsRef<str>,
280        V: AsRef<str>,
281    {
282        {
283            let string = string(&mut self.target);
284            for pair in iter {
285                let &(ref k, ref v) = pair.borrow();
286                append_pair(
287                    string,
288                    self.start_position,
289                    self.encoding,
290                    k.as_ref(),
291                    v.as_ref(),
292                );
293            }
294        }
295        self
296    }
297
298    /// Serialize and append a number of names without values.
299    ///
300    /// This simply calls `append_key_only` repeatedly.
301    /// This can be more convenient, so the user doesn’t need to introduce a block
302    /// to limit the scope of `Serializer`’s borrow of its string.
303    ///
304    /// Panics if called after `.finish()`.
305    pub fn extend_keys_only<I, K>(&mut self, iter: I) -> &mut Self
306    where
307        I: IntoIterator,
308        I::Item: Borrow<K>,
309        K: AsRef<str>,
310    {
311        {
312            let string = string(&mut self.target);
313            for key in iter {
314                let k = key.borrow().as_ref();
315                append_key_only(string, self.start_position, self.encoding, k);
316            }
317        }
318        self
319    }
320
321    /// If this serializer was constructed with a string, take and return that string.
322    ///
323    /// ```rust
324    /// use form_urlencoded;
325    /// let encoded: String = form_urlencoded::Serializer::new(String::new())
326    ///     .append_pair("foo", "bar & baz")
327    ///     .append_pair("saison", "Été+hiver")
328    ///     .finish();
329    /// assert_eq!(encoded, "foo=bar+%26+baz&saison=%C3%89t%C3%A9%2Bhiver");
330    /// ```
331    ///
332    /// Panics if called more than once.
333    pub fn finish(&mut self) -> T::Finished {
334        self.target
335            .take()
336            .expect("url::form_urlencoded::Serializer double finish")
337            .finish()
338    }
339}
340
/// Push a `&` separator when something has already been written at or after
/// `start_position`, i.e. when `string` is longer than `start_position`.
fn append_separator_if_needed(string: &mut String, start_position: usize) {
    let has_earlier_output = string.len() > start_position;
    if has_earlier_output {
        string.push('&');
    }
}
346
347fn string<T: Target>(target: &mut Option<T>) -> &mut String {
348    target
349        .as_mut()
350        .expect("url::form_urlencoded::Serializer finished")
351        .as_mut_string()
352}
353
354fn append_pair(
355    string: &mut String,
356    start_position: usize,
357    encoding: EncodingOverride<'_>,
358    name: &str,
359    value: &str,
360) {
361    append_separator_if_needed(string, start_position);
362    append_encoded(name, string, encoding);
363    string.push('=');
364    append_encoded(value, string, encoding);
365}
366
367fn append_key_only(
368    string: &mut String,
369    start_position: usize,
370    encoding: EncodingOverride,
371    name: &str,
372) {
373    append_separator_if_needed(string, start_position);
374    append_encoded(name, string, encoding);
375}
376
377fn append_encoded(s: &str, string: &mut String, encoding: EncodingOverride<'_>) {
378    string.extend(byte_serialize(&encode(encoding, s)))
379}
380
381pub(crate) fn encode<'a>(encoding_override: EncodingOverride<'_>, input: &'a str) -> Cow<'a, [u8]> {
382    if let Some(o) = encoding_override {
383        return o(input);
384    }
385    input.as_bytes().into()
386}
387
/// Convert bytes to a string, replacing invalid UTF-8 sequences with U+FFFD.
///
/// * Borrowed input is passed to `String::from_utf8_lossy`, so valid UTF-8
///   stays borrowed.
/// * Owned input that is valid UTF-8 is turned into a `String` in place,
///   re-using the existing allocation.
/// * Owned input that is not valid UTF-8 is decoded lossily into a new `String`.
pub(crate) fn decode_utf8_lossy(input: Cow<'_, [u8]>) -> Cow<'_, str> {
    // Note: This function is duplicated in `percent_encoding/lib.rs`.
    match input {
        Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes),
        Cow::Owned(bytes) => match String::from_utf8(bytes) {
            // `String::from_utf8` validates the bytes and, on success, takes
            // over the vector's buffer, so no `unsafe` is needed to avoid a copy.
            Ok(text) => Cow::Owned(text),
            // The error still carries the original bytes; fall back to the
            // lossy conversion, which allocates the replacement string.
            Err(error) => Cow::Owned(String::from_utf8_lossy(error.as_bytes()).into_owned()),
        },
    }
}
415
/// Optional custom encoder: a borrowed function that maps a `&str` to the bytes
/// that get byte-serialized. `None` means the string's UTF-8 bytes are used.
pub type EncodingOverride<'a> = Option<&'a dyn Fn(&str) -> Cow<'_, [u8]>>;