idna/
uts46.rs

1// Copyright 2013-2014 The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! [*Unicode IDNA Compatibility Processing*
10//! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
11
12use self::Mapping::*;
13use crate::punycode;
14use std::{error::Error as StdError, fmt};
15use unicode_bidi::{bidi_class, BidiClass};
16use unicode_normalization::char::is_combining_mark;
17use unicode_normalization::{is_nfc, UnicodeNormalization};
18
19include!("uts46_mapping_table.rs");
20
/// Prefix that marks a label as Punycode-encoded: it is stripped before a
/// label is decoded and prepended when a non-ASCII label is encoded.
const PUNYCODE_PREFIX: &str = "xn--";
22
/// Location of a replacement string inside `STRING_TABLE`: a 16-bit start
/// offset (split into two bytes) plus a length in bytes.
#[derive(Debug)]
struct StringTableSlice {
    // Store these as separate fields so the structure will have an
    // alignment of 1 and thus pack better into the Mapping enum, below.
    /// Low byte of the start offset into `STRING_TABLE`.
    byte_start_lo: u8,
    /// High byte of the start offset into `STRING_TABLE`.
    byte_start_hi: u8,
    /// Length of the slice, in bytes.
    byte_len: u8,
}
31
32fn decode_slice(slice: &StringTableSlice) -> &'static str {
33    let lo = slice.byte_start_lo as usize;
34    let hi = slice.byte_start_hi as usize;
35    let start = (hi << 8) | lo;
36    let len = slice.byte_len as usize;
37    &STRING_TABLE[start..(start + len)]
38}
39
/// Status of a code point in the UTS #46 mapping table.
///
/// The per-variant notes describe how `Mapper` treats each status.
#[repr(u8)]
#[derive(Debug)]
enum Mapping {
    /// Emitted unchanged.
    Valid,
    /// Dropped from the output.
    Ignored,
    /// Replaced by the referenced string.
    Mapped(StringTableSlice),
    /// Replaced by the referenced string under transitional processing,
    /// otherwise emitted unchanged.
    Deviation(StringTableSlice),
    /// Emitted unchanged, but recorded as a disallowed character.
    Disallowed,
    /// Emitted unchanged; recorded as an error only when STD3 ASCII rules
    /// are enabled.
    DisallowedStd3Valid,
    /// Replaced by the referenced string; recorded as an error only when
    /// STD3 ASCII rules are enabled.
    DisallowedStd3Mapped(StringTableSlice),
    /// Emitted unchanged; recorded as an error only when IDNA 2008 rules
    /// are enabled.
    DisallowedIdna2008,
}
52
53fn find_char(codepoint: char) -> &'static Mapping {
54    let idx = match TABLE.binary_search_by_key(&codepoint, |&val| val.0) {
55        Ok(idx) => idx,
56        Err(idx) => idx - 1,
57    };
58
59    const SINGLE_MARKER: u16 = 1 << 15;
60
61    let (base, x) = TABLE[idx];
62    let single = (x & SINGLE_MARKER) != 0;
63    let offset = !SINGLE_MARKER & x;
64
65    if single {
66        &MAPPING_TABLE[offset as usize]
67    } else {
68        &MAPPING_TABLE[(offset + (codepoint as u16 - base as u16)) as usize]
69    }
70}
71
72struct Mapper<'a> {
73    chars: std::str::Chars<'a>,
74    config: Config,
75    errors: &'a mut Errors,
76    slice: Option<std::str::Chars<'static>>,
77}
78
79impl<'a> Iterator for Mapper<'a> {
80    type Item = char;
81
82    fn next(&mut self) -> Option<Self::Item> {
83        loop {
84            if let Some(s) = &mut self.slice {
85                match s.next() {
86                    Some(c) => return Some(c),
87                    None => {
88                        self.slice = None;
89                    }
90                }
91            }
92
93            let codepoint = self.chars.next()?;
94            if let '.' | '-' | 'a'..='z' | '0'..='9' = codepoint {
95                return Some(codepoint);
96            }
97
98            return Some(match *find_char(codepoint) {
99                Mapping::Valid => codepoint,
100                Mapping::Ignored => continue,
101                Mapping::Mapped(ref slice) => {
102                    self.slice = Some(decode_slice(slice).chars());
103                    continue;
104                }
105                Mapping::Deviation(ref slice) => {
106                    if self.config.transitional_processing {
107                        self.slice = Some(decode_slice(slice).chars());
108                        continue;
109                    } else {
110                        codepoint
111                    }
112                }
113                Mapping::Disallowed => {
114                    self.errors.disallowed_character = true;
115                    codepoint
116                }
117                Mapping::DisallowedStd3Valid => {
118                    if self.config.use_std3_ascii_rules {
119                        self.errors.disallowed_by_std3_ascii_rules = true;
120                    };
121                    codepoint
122                }
123                Mapping::DisallowedStd3Mapped(ref slice) => {
124                    if self.config.use_std3_ascii_rules {
125                        self.errors.disallowed_mapped_in_std3 = true;
126                    };
127                    self.slice = Some(decode_slice(slice).chars());
128                    continue;
129                }
130                Mapping::DisallowedIdna2008 => {
131                    if self.config.use_idna_2008_rules {
132                        self.errors.disallowed_in_idna_2008 = true;
133                    }
134                    codepoint
135                }
136            });
137        }
138    }
139}
140
141// http://tools.ietf.org/html/rfc5893#section-2
142fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
143    // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label.  A label
144    // is RTL if it contains at least one character of bidi class R, AL or AN.
145    if !is_bidi_domain {
146        return true;
147    }
148
149    let mut chars = label.chars();
150    let first_char_class = match chars.next() {
151        Some(c) => bidi_class(c),
152        None => return true, // empty string
153    };
154
155    match first_char_class {
156        // LTR label
157        BidiClass::L => {
158            // Rule 5
159            for c in chars.by_ref() {
160                if !matches!(
161                    bidi_class(c),
162                    BidiClass::L
163                        | BidiClass::EN
164                        | BidiClass::ES
165                        | BidiClass::CS
166                        | BidiClass::ET
167                        | BidiClass::ON
168                        | BidiClass::BN
169                        | BidiClass::NSM
170                ) {
171                    return false;
172                }
173            }
174
175            // Rule 6
176            // must end in L or EN followed by 0 or more NSM
177            let mut rev_chars = label.chars().rev();
178            let mut last_non_nsm = rev_chars.next();
179            loop {
180                match last_non_nsm {
181                    Some(c) if bidi_class(c) == BidiClass::NSM => {
182                        last_non_nsm = rev_chars.next();
183                        continue;
184                    }
185                    _ => {
186                        break;
187                    }
188                }
189            }
190            match last_non_nsm {
191                Some(c) if bidi_class(c) == BidiClass::L || bidi_class(c) == BidiClass::EN => {}
192                Some(_) => {
193                    return false;
194                }
195                _ => {}
196            }
197        }
198
199        // RTL label
200        BidiClass::R | BidiClass::AL => {
201            let mut found_en = false;
202            let mut found_an = false;
203
204            // Rule 2
205            for c in chars {
206                let char_class = bidi_class(c);
207                if char_class == BidiClass::EN {
208                    found_en = true;
209                } else if char_class == BidiClass::AN {
210                    found_an = true;
211                }
212
213                if !matches!(
214                    char_class,
215                    BidiClass::R
216                        | BidiClass::AL
217                        | BidiClass::AN
218                        | BidiClass::EN
219                        | BidiClass::ES
220                        | BidiClass::CS
221                        | BidiClass::ET
222                        | BidiClass::ON
223                        | BidiClass::BN
224                        | BidiClass::NSM
225                ) {
226                    return false;
227                }
228            }
229            // Rule 3
230            let mut rev_chars = label.chars().rev();
231            let mut last = rev_chars.next();
232            loop {
233                // must end in L or EN followed by 0 or more NSM
234                match last {
235                    Some(c) if bidi_class(c) == BidiClass::NSM => {
236                        last = rev_chars.next();
237                        continue;
238                    }
239                    _ => {
240                        break;
241                    }
242                }
243            }
244            match last {
245                Some(c)
246                    if matches!(
247                        bidi_class(c),
248                        BidiClass::R | BidiClass::AL | BidiClass::EN | BidiClass::AN
249                    ) => {}
250                _ => {
251                    return false;
252                }
253            }
254
255            // Rule 4
256            if found_an && found_en {
257                return false;
258            }
259        }
260
261        // Rule 1: Should start with L or R/AL
262        _ => {
263            return false;
264        }
265    }
266
267    true
268}
269
270/// Check the validity criteria for the given label
271///
272/// V1 (NFC) and V8 (Bidi) are checked inside `processing()` to prevent doing duplicate work.
273///
274/// http://www.unicode.org/reports/tr46/#Validity_Criteria
275fn check_validity(label: &str, config: Config, errors: &mut Errors) {
276    let first_char = label.chars().next();
277    if first_char == None {
278        // Empty string, pass
279        return;
280    }
281
282    // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
283    //
284    // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
285    // third and fourth positions. But nobody follows this criteria. See the spec issue below:
286    // https://github.com/whatwg/url/issues/53
287
288    // V3: neither begin nor end with a U+002D HYPHEN-MINUS
289    if config.check_hyphens && (label.starts_with('-') || label.ends_with('-')) {
290        errors.check_hyphens = true;
291        return;
292    }
293
294    // V4: not contain a U+002E FULL STOP
295    //
296    // Here, label can't contain '.' since the input is from .split('.')
297
298    // V5: not begin with a GC=Mark
299    if is_combining_mark(first_char.unwrap()) {
300        errors.start_combining_mark = true;
301        return;
302    }
303
304    // V6: Check against Mapping Table
305    if label.chars().any(|c| match *find_char(c) {
306        Mapping::Valid | Mapping::DisallowedIdna2008 => false,
307        Mapping::Deviation(_) => config.transitional_processing,
308        Mapping::DisallowedStd3Valid => config.use_std3_ascii_rules,
309        _ => true,
310    }) {
311        errors.invalid_mapping = true;
312    }
313
314    // V7: ContextJ rules
315    //
316    // TODO: Implement rules and add *CheckJoiners* flag.
317
318    // V8: Bidi rules are checked inside `processing()`
319}
320
/// Detects simple domains that need no mapping, normalization or validation.
///
/// A domain is simple when it is non-empty, consists only of ASCII lowercase
/// letters, ASCII digits, `-` and `.`, no label starts with the Punycode
/// prefix `xn--`, and no label starts or ends with a hyphen.
fn is_simple(domain: &str) -> bool {
    const PREFIX: [char; 4] = ['x', 'n', '-', '-'];

    if domain.is_empty() {
        return false;
    }

    // Previous character of the current label; `None` at the start of a label.
    let mut prev: Option<char> = None;
    // Number of leading characters of the current label that match `PREFIX`
    // so far; `None` once the label can no longer start with it.
    let mut prefix_pos = Some(0usize);

    for c in domain.chars() {
        if c == '.' {
            // The label that just ended must not end with a hyphen.
            if prev == Some('-') {
                return false;
            }
            prev = None;
            prefix_pos = Some(0);
            continue;
        }

        if c == '-' {
            // A hyphen is fine inside a label but not as its first character.
            if prev.is_none() {
                return false;
            }
        } else if !c.is_ascii_lowercase() && !c.is_ascii_digit() {
            return false;
        }

        if let Some(pos) = prefix_pos {
            prefix_pos = if c == PREFIX[pos] {
                if pos + 1 == PREFIX.len() {
                    // The label starts with "xn--" and needs Punycode decoding.
                    return false;
                }
                Some(pos + 1)
            } else {
                None
            };
        }

        prev = Some(c);
    }

    // The final label has no terminating dot, so check its last character here.
    prev != Some('-')
}
355
356/// http://www.unicode.org/reports/tr46/#Processing
357fn processing(
358    domain: &str,
359    config: Config,
360    normalized: &mut String,
361    output: &mut String,
362) -> Errors {
363    normalized.clear();
364    let mut errors = Errors::default();
365    let offset = output.len();
366
367    let iter = Mapper {
368        chars: domain.chars(),
369        config,
370        errors: &mut errors,
371        slice: None,
372    };
373
374    normalized.extend(iter.nfc());
375
376    let mut decoder = punycode::Decoder::default();
377    let non_transitional = config.transitional_processing(false);
378    let (mut first, mut has_bidi_labels) = (true, false);
379    for label in normalized.split('.') {
380        if !first {
381            output.push('.');
382        }
383        first = false;
384        if let Some(remainder) = label.strip_prefix(PUNYCODE_PREFIX) {
385            match decoder.decode(remainder) {
386                Ok(decode) => {
387                    let start = output.len();
388                    output.extend(decode);
389                    let decoded_label = &output[start..];
390
391                    if !has_bidi_labels {
392                        has_bidi_labels |= is_bidi_domain(decoded_label);
393                    }
394
395                    if !errors.is_err() {
396                        if !is_nfc(decoded_label) {
397                            errors.nfc = true;
398                        } else {
399                            check_validity(decoded_label, non_transitional, &mut errors);
400                        }
401                    }
402                }
403                Err(()) => {
404                    has_bidi_labels = true;
405                    errors.punycode = true;
406                }
407            }
408        } else {
409            if !has_bidi_labels {
410                has_bidi_labels |= is_bidi_domain(label);
411            }
412
413            // `normalized` is already `NFC` so we can skip that check
414            check_validity(label, config, &mut errors);
415            output.push_str(label)
416        }
417    }
418
419    for label in output[offset..].split('.') {
420        // V8: Bidi rules
421        //
422        // TODO: Add *CheckBidi* flag
423        if !passes_bidi(label, has_bidi_labels) {
424            errors.check_bidi = true;
425            break;
426        }
427    }
428
429    errors
430}
431
432#[derive(Default)]
433pub struct Idna {
434    config: Config,
435    normalized: String,
436    output: String,
437}
438
439impl Idna {
440    pub fn new(config: Config) -> Self {
441        Self {
442            config,
443            normalized: String::new(),
444            output: String::new(),
445        }
446    }
447
448    pub fn to_ascii_inner(&mut self, domain: &str, out: &mut String) -> Errors {
449        if is_simple(domain) {
450            out.push_str(domain);
451            return Errors::default();
452        }
453        let mut errors = processing(domain, self.config, &mut self.normalized, out);
454        self.output = std::mem::replace(out, String::with_capacity(out.len()));
455        let mut first = true;
456        for label in self.output.split('.') {
457            if !first {
458                out.push('.');
459            }
460            first = false;
461
462            if label.is_ascii() {
463                out.push_str(label);
464            } else {
465                let offset = out.len();
466                out.push_str(PUNYCODE_PREFIX);
467                if let Err(()) = punycode::encode_into(label.chars(), out) {
468                    errors.punycode = true;
469                    out.truncate(offset);
470                }
471            }
472        }
473        errors
474    }
475
476    /// http://www.unicode.org/reports/tr46/#ToASCII
477    #[allow(clippy::wrong_self_convention)]
478    pub fn to_ascii<'a>(&'a mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
479        let mut errors = self.to_ascii_inner(domain, out);
480
481        if self.config.verify_dns_length {
482            let domain = if out.ends_with('.') {
483                &out[..out.len() - 1]
484            } else {
485                &*out
486            };
487            if domain.is_empty() || domain.split('.').any(|label| label.is_empty()) {
488                errors.too_short_for_dns = true;
489            }
490            if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
491                errors.too_long_for_dns = true;
492            }
493        }
494
495        errors.into()
496    }
497
498    /// http://www.unicode.org/reports/tr46/#ToUnicode
499    #[allow(clippy::wrong_self_convention)]
500    pub fn to_unicode<'a>(&'a mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
501        if is_simple(domain) {
502            out.push_str(domain);
503            return Errors::default().into();
504        }
505        processing(domain, self.config, &mut self.normalized, out).into()
506    }
507}
508
509#[derive(Clone, Copy)]
510pub struct Config {
511    use_std3_ascii_rules: bool,
512    transitional_processing: bool,
513    verify_dns_length: bool,
514    check_hyphens: bool,
515    use_idna_2008_rules: bool,
516}
517
518/// The defaults are that of https://url.spec.whatwg.org/#idna
519impl Default for Config {
520    fn default() -> Self {
521        Config {
522            use_std3_ascii_rules: false,
523            transitional_processing: false,
524            check_hyphens: false,
525            // check_bidi: true,
526            // check_joiners: true,
527
528            // Only use for to_ascii, not to_unicode
529            verify_dns_length: false,
530            use_idna_2008_rules: false,
531        }
532    }
533}
534
535impl Config {
536    #[inline]
537    pub fn use_std3_ascii_rules(mut self, value: bool) -> Self {
538        self.use_std3_ascii_rules = value;
539        self
540    }
541
542    #[inline]
543    pub fn transitional_processing(mut self, value: bool) -> Self {
544        self.transitional_processing = value;
545        self
546    }
547
548    #[inline]
549    pub fn verify_dns_length(mut self, value: bool) -> Self {
550        self.verify_dns_length = value;
551        self
552    }
553
554    #[inline]
555    pub fn check_hyphens(mut self, value: bool) -> Self {
556        self.check_hyphens = value;
557        self
558    }
559
560    #[inline]
561    pub fn use_idna_2008_rules(mut self, value: bool) -> Self {
562        self.use_idna_2008_rules = value;
563        self
564    }
565
566    /// http://www.unicode.org/reports/tr46/#ToASCII
567    pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
568        let mut result = String::with_capacity(domain.len());
569        let mut codec = Idna::new(self);
570        codec.to_ascii(domain, &mut result).map(|()| result)
571    }
572
573    /// http://www.unicode.org/reports/tr46/#ToUnicode
574    pub fn to_unicode(self, domain: &str) -> (String, Result<(), Errors>) {
575        let mut codec = Idna::new(self);
576        let mut out = String::with_capacity(domain.len());
577        let result = codec.to_unicode(domain, &mut out);
578        (out, result)
579    }
580}
581
582fn is_bidi_domain(s: &str) -> bool {
583    for c in s.chars() {
584        if c.is_ascii_graphic() {
585            continue;
586        }
587        match bidi_class(c) {
588            BidiClass::R | BidiClass::AL | BidiClass::AN => return true,
589            _ => {}
590        }
591    }
592    false
593}
594
/// Errors recorded during UTS #46 processing.
///
/// This is opaque for now, indicating what types of errors have been encountered at least once.
/// More details may be exposed in the future.
#[derive(Default)]
pub struct Errors {
    punycode: bool,
    check_hyphens: bool,
    check_bidi: bool,
    start_combining_mark: bool,
    invalid_mapping: bool,
    nfc: bool,
    disallowed_by_std3_ascii_rules: bool,
    disallowed_mapped_in_std3: bool,
    disallowed_character: bool,
    too_long_for_dns: bool,
    too_short_for_dns: bool,
    disallowed_in_idna_2008: bool,
}

impl Errors {
    /// Returns `true` if at least one kind of error has been recorded.
    fn is_err(&self) -> bool {
        // Destructure without `..` so that adding a field to `Errors`
        // without handling it here fails to compile.
        let Errors {
            punycode,
            check_hyphens,
            check_bidi,
            start_combining_mark,
            invalid_mapping,
            nfc,
            disallowed_by_std3_ascii_rules,
            disallowed_mapped_in_std3,
            disallowed_character,
            too_long_for_dns,
            too_short_for_dns,
            disallowed_in_idna_2008,
        } = *self;

        let flags = [
            punycode,
            check_hyphens,
            check_bidi,
            start_combining_mark,
            invalid_mapping,
            nfc,
            disallowed_by_std3_ascii_rules,
            disallowed_mapped_in_std3,
            disallowed_character,
            too_long_for_dns,
            too_short_for_dns,
            disallowed_in_idna_2008,
        ];
        flags.iter().any(|&flag| flag)
    }
}
645
646impl fmt::Debug for Errors {
647    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
648        let Errors {
649            punycode,
650            check_hyphens,
651            check_bidi,
652            start_combining_mark,
653            invalid_mapping,
654            nfc,
655            disallowed_by_std3_ascii_rules,
656            disallowed_mapped_in_std3,
657            disallowed_character,
658            too_long_for_dns,
659            too_short_for_dns,
660            disallowed_in_idna_2008,
661        } = *self;
662
663        let fields = [
664            ("punycode", punycode),
665            ("check_hyphens", check_hyphens),
666            ("check_bidi", check_bidi),
667            ("start_combining_mark", start_combining_mark),
668            ("invalid_mapping", invalid_mapping),
669            ("nfc", nfc),
670            (
671                "disallowed_by_std3_ascii_rules",
672                disallowed_by_std3_ascii_rules,
673            ),
674            ("disallowed_mapped_in_std3", disallowed_mapped_in_std3),
675            ("disallowed_character", disallowed_character),
676            ("too_long_for_dns", too_long_for_dns),
677            ("too_short_for_dns", too_short_for_dns),
678            ("disallowed_in_idna_2008", disallowed_in_idna_2008),
679        ];
680
681        let mut empty = true;
682        f.write_str("Errors { ")?;
683        for (name, val) in &fields {
684            if *val {
685                if !empty {
686                    f.write_str(", ")?;
687                }
688                f.write_str(*name)?;
689                empty = false;
690            }
691        }
692
693        if !empty {
694            f.write_str(" }")
695        } else {
696            f.write_str("}")
697        }
698    }
699}
700
701impl From<Errors> for Result<(), Errors> {
702    fn from(e: Errors) -> Result<(), Errors> {
703        if !e.is_err() {
704            Ok(())
705        } else {
706            Err(e)
707        }
708    }
709}
710
711impl StdError for Errors {}
712
713impl fmt::Display for Errors {
714    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
715        fmt::Debug::fmt(self, f)
716    }
717}
718
#[cfg(test)]
mod tests {
    use super::{find_char, Mapping};

    /// The characters that `Mapper` passes through without consulting the
    /// table must all be `Valid` in the table.
    #[test]
    fn mapping_fast_path() {
        let fast_path = ['-', '.']
            .iter()
            .copied()
            .chain('0'..='9')
            .chain('a'..='z');

        for c in fast_path {
            assert_matches!(find_char(c), &Mapping::Valid);
        }
    }
}