idna/
punycode.rs

1// Copyright 2013 The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation.
10//!
11//! Since Punycode fundamentally works on unicode code points,
12//! `encode` and `decode` take and return slices and vectors of `char`.
13//! `encode_str` and `decode_to_string` provide convenience wrappers
14//! that convert from and to Rust’s UTF-8 based `str` and `String` types.
15
16use std::char;
17use std::u32;
18
19// Bootstring parameters for Punycode
20static BASE: u32 = 36;
21static T_MIN: u32 = 1;
22static T_MAX: u32 = 26;
23static SKEW: u32 = 38;
24static DAMP: u32 = 700;
25static INITIAL_BIAS: u32 = 72;
26static INITIAL_N: u32 = 0x80;
27static DELIMITER: char = '-';
28
29#[inline]
30fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
31    delta /= if first_time { DAMP } else { 2 };
32    delta += delta / num_points;
33    let mut k = 0;
34    while delta > ((BASE - T_MIN) * T_MAX) / 2 {
35        delta /= BASE - T_MIN;
36        k += BASE;
37    }
38    k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW))
39}
40
41/// Convert Punycode to an Unicode `String`.
42///
43/// This is a convenience wrapper around `decode`.
44#[inline]
45pub fn decode_to_string(input: &str) -> Option<String> {
46    decode(input).map(|chars| chars.into_iter().collect())
47}
48
49/// Convert Punycode to Unicode.
50///
51/// Return None on malformed input or overflow.
52/// Overflow can only happen on inputs that take more than
53/// 63 encoded bytes, the DNS limit on domain name labels.
54pub fn decode(input: &str) -> Option<Vec<char>> {
55    Some(Decoder::default().decode(input).ok()?.collect())
56}
57
58#[derive(Default)]
59pub(crate) struct Decoder {
60    insertions: Vec<(usize, char)>,
61}
62
63impl Decoder {
64    /// Split the input iterator and return a Vec with insertions of encoded characters
65    pub(crate) fn decode<'a>(&'a mut self, input: &'a str) -> Result<Decode<'a>, ()> {
66        self.insertions.clear();
67        // Handle "basic" (ASCII) code points.
68        // They are encoded as-is before the last delimiter, if any.
69        let (base, input) = match input.rfind(DELIMITER) {
70            None => ("", input),
71            Some(position) => (
72                &input[..position],
73                if position > 0 {
74                    &input[position + 1..]
75                } else {
76                    input
77                },
78            ),
79        };
80
81        if !base.is_ascii() {
82            return Err(());
83        }
84
85        let base_len = base.len();
86        let mut length = base_len as u32;
87        let mut code_point = INITIAL_N;
88        let mut bias = INITIAL_BIAS;
89        let mut i = 0;
90        let mut iter = input.bytes();
91        loop {
92            let previous_i = i;
93            let mut weight = 1;
94            let mut k = BASE;
95            let mut byte = match iter.next() {
96                None => break,
97                Some(byte) => byte,
98            };
99
100            // Decode a generalized variable-length integer into delta,
101            // which gets added to i.
102            loop {
103                let digit = match byte {
104                    byte @ b'0'..=b'9' => byte - b'0' + 26,
105                    byte @ b'A'..=b'Z' => byte - b'A',
106                    byte @ b'a'..=b'z' => byte - b'a',
107                    _ => return Err(()),
108                } as u32;
109                if digit > (u32::MAX - i) / weight {
110                    return Err(()); // Overflow
111                }
112                i += digit * weight;
113                let t = if k <= bias {
114                    T_MIN
115                } else if k >= bias + T_MAX {
116                    T_MAX
117                } else {
118                    k - bias
119                };
120                if digit < t {
121                    break;
122                }
123                if weight > u32::MAX / (BASE - t) {
124                    return Err(()); // Overflow
125                }
126                weight *= BASE - t;
127                k += BASE;
128                byte = match iter.next() {
129                    None => return Err(()), // End of input before the end of this delta
130                    Some(byte) => byte,
131                };
132            }
133
134            bias = adapt(i - previous_i, length + 1, previous_i == 0);
135            if i / (length + 1) > u32::MAX - code_point {
136                return Err(()); // Overflow
137            }
138
139            // i was supposed to wrap around from length+1 to 0,
140            // incrementing code_point each time.
141            code_point += i / (length + 1);
142            i %= length + 1;
143            let c = match char::from_u32(code_point) {
144                Some(c) => c,
145                None => return Err(()),
146            };
147
148            // Move earlier insertions farther out in the string
149            for (idx, _) in &mut self.insertions {
150                if *idx >= i as usize {
151                    *idx += 1;
152                }
153            }
154            self.insertions.push((i as usize, c));
155            length += 1;
156            i += 1;
157        }
158
159        self.insertions.sort_by_key(|(i, _)| *i);
160        Ok(Decode {
161            base: base.chars(),
162            insertions: &self.insertions,
163            inserted: 0,
164            position: 0,
165            len: base_len + self.insertions.len(),
166        })
167    }
168}
169
170pub(crate) struct Decode<'a> {
171    base: std::str::Chars<'a>,
172    pub(crate) insertions: &'a [(usize, char)],
173    inserted: usize,
174    position: usize,
175    len: usize,
176}
177
178impl<'a> Iterator for Decode<'a> {
179    type Item = char;
180
181    fn next(&mut self) -> Option<Self::Item> {
182        loop {
183            match self.insertions.get(self.inserted) {
184                Some((pos, c)) if *pos == self.position => {
185                    self.inserted += 1;
186                    self.position += 1;
187                    return Some(*c);
188                }
189                _ => {}
190            }
191            if let Some(c) = self.base.next() {
192                self.position += 1;
193                return Some(c);
194            } else if self.inserted >= self.insertions.len() {
195                return None;
196            }
197        }
198    }
199
200    fn size_hint(&self) -> (usize, Option<usize>) {
201        let len = self.len - self.position;
202        (len, Some(len))
203    }
204}
205
206impl<'a> ExactSizeIterator for Decode<'a> {
207    fn len(&self) -> usize {
208        self.len - self.position
209    }
210}
211
212/// Convert an Unicode `str` to Punycode.
213///
214/// This is a convenience wrapper around `encode`.
215#[inline]
216pub fn encode_str(input: &str) -> Option<String> {
217    let mut buf = String::with_capacity(input.len());
218    encode_into(input.chars(), &mut buf).ok().map(|()| buf)
219}
220
221/// Convert Unicode to Punycode.
222///
223/// Return None on overflow, which can only happen on inputs that would take more than
224/// 63 encoded bytes, the DNS limit on domain name labels.
225pub fn encode(input: &[char]) -> Option<String> {
226    let mut buf = String::with_capacity(input.len());
227    encode_into(input.iter().copied(), &mut buf)
228        .ok()
229        .map(|()| buf)
230}
231
232pub(crate) fn encode_into<I>(input: I, output: &mut String) -> Result<(), ()>
233where
234    I: Iterator<Item = char> + Clone,
235{
236    // Handle "basic" (ASCII) code points. They are encoded as-is.
237    let (mut input_length, mut basic_length) = (0, 0);
238    for c in input.clone() {
239        input_length += 1;
240        if c.is_ascii() {
241            output.push(c);
242            basic_length += 1;
243        }
244    }
245
246    if basic_length > 0 {
247        output.push('-')
248    }
249    let mut code_point = INITIAL_N;
250    let mut delta = 0;
251    let mut bias = INITIAL_BIAS;
252    let mut processed = basic_length;
253    while processed < input_length {
254        // All code points < code_point have been handled already.
255        // Find the next larger one.
256        let min_code_point = input
257            .clone()
258            .map(|c| c as u32)
259            .filter(|&c| c >= code_point)
260            .min()
261            .unwrap();
262        if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) {
263            return Err(()); // Overflow
264        }
265        // Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0>
266        delta += (min_code_point - code_point) * (processed + 1);
267        code_point = min_code_point;
268        for c in input.clone() {
269            let c = c as u32;
270            if c < code_point {
271                delta += 1;
272                if delta == 0 {
273                    return Err(()); // Overflow
274                }
275            }
276            if c == code_point {
277                // Represent delta as a generalized variable-length integer:
278                let mut q = delta;
279                let mut k = BASE;
280                loop {
281                    let t = if k <= bias {
282                        T_MIN
283                    } else if k >= bias + T_MAX {
284                        T_MAX
285                    } else {
286                        k - bias
287                    };
288                    if q < t {
289                        break;
290                    }
291                    let value = t + ((q - t) % (BASE - t));
292                    output.push(value_to_digit(value));
293                    q = (q - t) / (BASE - t);
294                    k += BASE;
295                }
296                output.push(value_to_digit(q));
297                bias = adapt(delta, processed + 1, processed == basic_length);
298                delta = 0;
299                processed += 1;
300            }
301        }
302        delta += 1;
303        code_point += 1;
304    }
305    Ok(())
306}
307
308#[inline]
309fn value_to_digit(value: u32) -> char {
310    match value {
311        0..=25 => (value as u8 + b'a') as char,       // a..z
312        26..=35 => (value as u8 - 26 + b'0') as char, // 0..9
313        _ => panic!(),
314    }
315}