publicsuffix/
lib.rs

1//! Robust domain name parsing using the Public Suffix List
2//!
3//! This library allows you to easily and accurately parse any given domain name.
4//!
5//! ## Examples
6//!
7//! ```rust,no_run
8//! extern crate publicsuffix;
9//!
10//! use publicsuffix::List;
11//! # use publicsuffix::Result;
12//!
13//! # fn examples() -> Result<()> {
14//! // Fetch the list from the official URL,
15//! # #[cfg(feature = "remote_list")]
16//! let list = List::fetch()?;
17//!
18//! // from your own URL
19//! # #[cfg(feature = "remote_list")]
20//! let list = List::from_url("https://example.com/path/to/public_suffix_list.dat")?;
21//!
22//! // or from a local file.
23//! let list = List::from_path("/path/to/public_suffix_list.dat")?;
24//!
25//! // Using the list you can find out the root domain
26//! // or extension of any given domain name
27//! let domain = list.parse_domain("www.example.com")?;
28//! assert_eq!(domain.root(), Some("example.com"));
29//! assert_eq!(domain.suffix(), Some("com"));
30//!
31//! let domain = list.parse_domain("www.食狮.中国")?;
32//! assert_eq!(domain.root(), Some("食狮.中国"));
33//! assert_eq!(domain.suffix(), Some("中国"));
34//!
35//! let domain = list.parse_domain("www.xn--85x722f.xn--55qx5d.cn")?;
36//! assert_eq!(domain.root(), Some("xn--85x722f.xn--55qx5d.cn"));
37//! assert_eq!(domain.suffix(), Some("xn--55qx5d.cn"));
38//!
39//! let domain = list.parse_domain("a.b.example.uk.com")?;
40//! assert_eq!(domain.root(), Some("example.uk.com"));
41//! assert_eq!(domain.suffix(), Some("uk.com"));
42//!
43//! let name = list.parse_dns_name("_tcp.example.com.")?;
44//! assert_eq!(name.domain().and_then(|domain| domain.root()), Some("example.com"));
45//! assert_eq!(name.domain().and_then(|domain| domain.suffix()), Some("com"));
46//!
47//! // You can also find out if this is an ICANN domain
48//! assert!(!domain.is_icann());
49//!
50//! // or a private one
51//! assert!(domain.is_private());
52//!
53//! // In any case if the domain's suffix is in the list
//! // then this is definitely a registrable domain name
55//! assert!(domain.has_known_suffix());
56//! # Ok(())
57//! # }
58//! # fn main() {}
59//! ```
60
61mod matcher;
62
63#[cfg(feature = "remote_list")]
64#[cfg(test)]
65mod tests;
66
67use std::{collections::HashMap, fmt, fs::File, io::Read, net::IpAddr, path::Path, str::FromStr};
68#[cfg(feature = "remote_list")]
69use std::{io::Write, net::TcpStream, time::Duration};
70
71pub mod errors;
72pub use crate::errors::{Error, ErrorKind, Result};
73
74use idna::domain_to_unicode;
75#[cfg(feature = "remote_list")]
76use native_tls::TlsConnector;
77use url::Url;
78
/// The official URL of the list
pub const LIST_URL: &str = "https://publicsuffix.org/list/public_suffix_list.dat";

/// The catch-all `*` rule.
///
/// `List::build` appends it as the default rule once a list has loaded, and
/// `List::all_internal` filters it back out of the public rule listings.
const PREVAILING_STAR_RULE: &str = "*";
83
/// A single rule of the list, as recorded by `List::append`.
///
/// `List` keeps these in its `all` vector to back `all()`, `icann()` and `private()`.
#[derive(Debug, PartialEq, Eq, Hash)]
struct Suffix {
    /// Rule text; a leading `!` (exception marker) has already been stripped.
    rule: String,
    /// Section of the list the rule was read from.
    typ: Type,
}
89
/// Marks a trie node as the end of a rule and carries that rule's metadata.
#[derive(Debug)]
struct ListLeaf {
    /// Section of the list the rule was read from.
    typ: Type,
    /// `true` when the rule was written with a leading `!`.
    is_exception_rule: bool,
}
95
96impl ListLeaf {
97    fn new(typ: Type, is_exception_rule: bool) -> Self {
98        Self {
99            typ,
100            is_exception_rule,
101        }
102    }
103}
104
/// A node of the rule trie.
///
/// Rules are inserted label by label starting from the rightmost label
/// (see `List::append`), so the root's children are top-level labels.
#[derive(Debug)]
struct ListNode {
    /// Child nodes keyed by label text (`"*"` for a wildcard label).
    children: HashMap<String, ListNode>,
    /// Present when a rule ends exactly at this node.
    leaf: Option<ListLeaf>,
}
110
111impl ListNode {
112    fn new() -> Self {
113        Self {
114            children: HashMap::new(),
115            leaf: None,
116        }
117    }
118}
119
/// Stores the public suffix list
///
/// You can use the methods, `fetch`, `from_url` or `from_path` to build the list.
/// If you are using this in a long running server it's recommended you use either
/// `fetch` or `from_url` to download updates at least once a week.
#[derive(Debug)]
pub struct List {
    /// Root of the rule trie walked by `Domain::find_match`.
    root: ListNode,
    /// Every appended rule in insertion order.
    all: Vec<Suffix>, // to support all(), icann(), private()
}
130
/// Section of the list a rule belongs to.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum Type {
    /// Rule read after a line containing `BEGIN ICANN DOMAINS`.
    Icann,
    /// Rule read after a line containing `BEGIN PRIVATE DOMAINS`.
    Private,
}
136
/// Holds information about a particular domain
///
/// This is created by `List::parse_domain`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Domain {
    /// The parsed input with trailing dots removed (`List::parse_dns_name`
    /// overwrites it with the root domain).
    full: String,
    /// Section of the matched rule; `None` when no rule matched or the match
    /// went through a wildcard label.
    typ: Option<Type>,
    /// The public suffix, if any rule matched.
    suffix: Option<String>,
    /// The suffix plus one more label, if the domain has that many labels.
    registrable: Option<String>,
}
147
/// Holds information about a particular host
///
/// This is created by `List::parse_host`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Host {
    /// The host parsed as an IP address.
    Ip(IpAddr),
    /// The host parsed as a domain name.
    Domain(Domain),
}
156
/// Holds information about a particular DNS name
///
/// This is created by `List::parse_dns_name`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct DnsName {
    /// The name converted to ASCII by `Domain::try_to_ascii`.
    name: String,
    /// The root domain found inside the name, when it has valid domain syntax.
    domain: Option<Domain>,
}
165
/// Converts a type into a Url object
pub trait IntoUrl {
    /// Consumes `self` and returns the corresponding `Url`, or an error when
    /// the value is not a valid URL.
    fn into_url(self) -> Result<Url>;
}
170
171impl IntoUrl for Url {
172    fn into_url(self) -> Result<Url> {
173        Ok(self)
174    }
175}
176
177impl<'a> IntoUrl for &'a str {
178    fn into_url(self) -> Result<Url> {
179        Ok(Url::parse(self)?)
180    }
181}
182
183impl<'a> IntoUrl for &'a String {
184    fn into_url(self) -> Result<Url> {
185        Ok(Url::parse(self)?)
186    }
187}
188
189impl IntoUrl for String {
190    fn into_url(self) -> Result<Url> {
191        Ok(Url::parse(&self)?)
192    }
193}
194
/// Downloads `u` with a minimal, blocking HTTP/1.0 `GET` and returns the raw response.
///
/// The returned string is everything the server sent, status line and headers
/// included; no status check or redirect handling is performed.
///
/// # Errors
///
/// Fails when the URL cannot be parsed, has no host or no known port, uses a
/// scheme other than `http`/`https`, or when connecting, the TLS handshake,
/// writing or reading fails. Socket reads and writes use a 2 second timeout.
#[cfg(feature = "remote_list")]
fn request<U: IntoUrl>(u: U) -> Result<String> {
    let url = u.into_url()?;
    let host = match url.host_str() {
        Some(host) => host,
        None => {
            return Err(ErrorKind::NoHost.into());
        }
    };
    let port = match url.port_or_known_default() {
        Some(port) => port,
        None => {
            return Err(ErrorKind::NoPort.into());
        }
    };
    // The request target is the path *and* the query string; sending the path
    // alone would silently drop parameters such as access tokens.
    let target = match url.query() {
        Some(query) => format!("{}?{}", url.path(), query),
        None => url.path().to_owned(),
    };
    let data = format!("GET {} HTTP/1.0\r\nHost: {}\r\n\r\n", target, host);
    let addr = format!("{}:{}", host, port);
    let stream = TcpStream::connect(addr)?;
    let timeout = Duration::from_secs(2);
    stream.set_read_timeout(Some(timeout))?;
    stream.set_write_timeout(Some(timeout))?;

    let mut res = String::new();

    match url.scheme() {
        "https" => {
            let connector = TlsConnector::builder().build()?;
            let mut stream = connector.connect(host, stream)?;
            stream.write_all(data.as_bytes())?;
            stream.read_to_string(&mut res)?;
        }
        "http" => {
            let mut stream = stream;
            stream.write_all(data.as_bytes())?;
            stream.read_to_string(&mut res)?;
        }
        _ => {
            return Err(ErrorKind::UnsupportedScheme.into());
        }
    }

    Ok(res)
}
238
239impl List {
240    fn append(&mut self, mut rule: &str, typ: Type) -> Result<()> {
241        let mut is_exception_rule = false;
242        if rule.starts_with('!') {
243            is_exception_rule = true;
244            rule = &rule[1..];
245        }
246
247        let mut current = &mut self.root;
248        for label in rule.rsplit('.') {
249            if label.is_empty() {
250                return Err(ErrorKind::InvalidRule(rule.into()).into());
251            }
252
253            let cur = current;
254            current = cur
255                .children
256                .entry(label.to_owned())
257                .or_insert_with(ListNode::new);
258        }
259
260        current.leaf = Some(ListLeaf::new(typ, is_exception_rule));
261
262        // to support all(), icann(), private()
263        self.all.push(Suffix {
264            rule: rule.to_owned(),
265            typ,
266        });
267
268        Ok(())
269    }
270
271    fn build(res: &str) -> Result<List> {
272        let mut typ = None;
273        let mut list = List::empty();
274        for line in res.lines() {
275            match line {
276                line if line.contains("BEGIN ICANN DOMAINS") => {
277                    typ = Some(Type::Icann);
278                }
279                line if line.contains("BEGIN PRIVATE DOMAINS") => {
280                    typ = Some(Type::Private);
281                }
282                line if line.starts_with("//") => {
283                    continue;
284                }
285                line => match typ {
286                    Some(typ) => {
287                        let rule = match line.split_whitespace().next() {
288                            Some(rule) => rule,
289                            None => continue,
290                        };
291                        list.append(rule, typ)?;
292                    }
293                    None => {
294                        continue;
295                    }
296                },
297            }
298        }
299        if list.root.children.is_empty() || list.all().is_empty() {
300            return Err(ErrorKind::InvalidList.into());
301        }
302
303        list.append(PREVAILING_STAR_RULE, Type::Icann)?; // add the default rule
304
305        Ok(list)
306    }
307
308    /// Build the list from a string
309    ///
310    /// The list doesn't always have to come from a file. You can maintain your own
311    /// list, say in a DBMS. You can then pull it at runtime and build the list from
312    /// the resulting String.
313    pub fn from_string(string: String) -> Result<List> {
314        Self::from_str(&string)
315    }
316
317    /// Build the list from a str
318    ///
319    /// The list doesn't always have to come from a file. You can maintain your own
320    /// list, say in a DBMS. You can then pull it at runtime and build the list from
321    /// the resulting str.
322    #[allow(clippy::should_implement_trait)]
323    pub fn from_str(string: &str) -> Result<List> {
324        Self::build(string)
325    }
326
327    /// Creates an empty List without any rules
328    ///
329    /// Sometimes all you want is to do syntax checks. If you don't really care whether
330    /// the domain has a known suffix or not you can just create an empty list and use
331    /// that to parse domain names and email addresses.
332    pub fn empty() -> List {
333        List {
334            root: ListNode::new(),
335            all: Vec::new(),
336        }
337    }
338
339    /// Pull the list from a URL
340    #[cfg(feature = "remote_list")]
341    pub fn from_url<U: IntoUrl>(url: U) -> Result<List> {
342        let s = request(url)?;
343        Self::from_str(&s)
344    }
345
346    /// Fetch the list from a local file
347    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<List> {
348        File::open(path)
349            .map_err(|err| ErrorKind::Io(err).into())
350            .and_then(|mut data| {
351                let mut res = String::new();
352                data.read_to_string(&mut res)?;
353                Self::from_str(&res)
354            })
355    }
356
357    /// Build the list from the result of anything that implements `std::io::Read`
358    ///
359    /// If you don't already have your list on the filesystem but want to use your
360    /// own library to fetch the list you can use this method so you don't have to
361    /// save it first.
362    pub fn from_reader<R: Read>(mut reader: R) -> Result<List> {
363        let mut res = String::new();
364        reader.read_to_string(&mut res)?;
365        Self::build(&res)
366    }
367
368    /// Pull the list from the official URL
369    #[cfg(feature = "remote_list")]
370    pub fn fetch() -> Result<List> {
371        let github =
372            "https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat";
373
374        Self::from_url(LIST_URL)
375            // Fallback to the Github repo if the official link
376            // is down for some reason.
377            .or_else(|_| Self::from_url(github))
378    }
379
380    fn find_type(&self, typ: Type) -> Vec<&str> {
381        self.all_internal()
382            .filter(|s| s.typ == typ)
383            .map(|s| s.rule.as_str())
384            .collect()
385    }
386
    /// Gets a list of all ICANN domain suffixes
    ///
    /// The default `*` rule is not part of the result.
    pub fn icann(&self) -> Vec<&str> {
        self.find_type(Type::Icann)
    }
391
    /// Gets a list of all private domain suffixes
    pub fn private(&self) -> Vec<&str> {
        self.find_type(Type::Private)
    }
396
397    /// Gets a list of all domain suffices
398    pub fn all(&self) -> Vec<&str> {
399        self.all_internal().map(|s| s.rule.as_str()).collect()
400    }
401
402    fn all_internal(&self) -> impl Iterator<Item = &Suffix> {
403        self.all
404            .iter()
405            // remove the default rule
406            .filter(|s| s.rule != PREVAILING_STAR_RULE)
407    }
408
    /// Parses a domain using the list
    ///
    /// The domain's syntax is validated first (the `true` argument enables
    /// the syntax check in `Domain::parse`); invalid input yields an error.
    pub fn parse_domain(&self, domain: &str) -> Result<Domain> {
        Domain::parse(domain, self, true)
    }
413
    /// Parses a host using the list
    ///
    /// A host, for the purposes of this library, is either
    /// an IP address or a domain name.
    ///
    /// Returns an error when `host` is neither.
    pub fn parse_host(&self, host: &str) -> Result<Host> {
        Host::parse(host, self)
    }
421
422    /// Extracts Host from a URL
423    pub fn parse_url<U: IntoUrl>(&self, url: U) -> Result<Host> {
424        let url = url.into_url()?;
425        match url.scheme() {
426            "mailto" => match url.host_str() {
427                Some(host) => self.parse_email(&format!("{}@{}", url.username(), host)),
428                None => Err(ErrorKind::InvalidEmail.into()),
429            },
430            _ => match url.host_str() {
431                Some(host) => self.parse_host(host),
432                None => Err(ErrorKind::NoHost.into()),
433            },
434        }
435    }
436
437    /// Extracts Host from an email address
438    ///
439    /// This method can also be used, simply to validate an email address.
440    /// If it returns an error, the email address is not valid.
441    // https://en.wikipedia.org/wiki/Email_address#Syntax
442    // https://en.wikipedia.org/wiki/International_email#Email_addresses
443    // http://girders.org/blog/2013/01/31/dont-rfc-validate-email-addresses/
444    // https://html.spec.whatwg.org/multipage/forms.html#valid-e-mail-address
445    // https://hackernoon.com/the-100-correct-way-to-validate-email-addresses-7c4818f24643#.pgcir4z3e
446    // http://haacked.com/archive/2007/08/21/i-knew-how-to-validate-an-email-address-until-i.aspx/
447    // https://tools.ietf.org/html/rfc6530#section-10.1
448    // http://rumkin.com/software/email/rules.php
449    pub fn parse_email(&self, address: &str) -> Result<Host> {
450        let mut parts = address.rsplitn(2, '@');
451        let host = match parts.next() {
452            Some(host) => host,
453            None => {
454                return Err(ErrorKind::InvalidEmail.into());
455            }
456        };
457        let local = match parts.next() {
458            Some(local) => local,
459            None => {
460                return Err(ErrorKind::InvalidEmail.into());
461            }
462        };
463        if local.chars().count() > 64
464            || address.chars().count() > 254
465            || (!local.starts_with('"') && local.contains(".."))
466            || !matcher::is_email_local(local)
467        {
468            return Err(ErrorKind::InvalidEmail.into());
469        }
470        self.parse_host(host)
471    }
472
473    /// Parses any arbitrary string
474    ///
475    /// Effectively this means that the string is either a URL, an email address or a host.
476    pub fn parse_str(&self, string: &str) -> Result<Host> {
477        if string.contains("://") {
478            self.parse_url(string)
479        } else if string.contains('@') {
480            self.parse_email(string)
481        } else {
482            self.parse_host(string)
483        }
484    }
485
486    /// Parses any arbitrary string that can be used as a key in a DNS database
487    pub fn parse_dns_name(&self, name: &str) -> Result<DnsName> {
488        let mut dns_name = DnsName {
489            name: Domain::try_to_ascii(name).map_err(|_| ErrorKind::InvalidDomain(name.into()))?,
490            domain: None,
491        };
492        if let Ok(mut domain) = Domain::parse(name, self, false) {
493            if let Some(root) = domain.root() {
494                if Domain::has_valid_syntax(&root) {
495                    domain.full = root.to_string();
496                    dns_name.domain = Some(domain);
497                }
498            }
499        }
500        Ok(dns_name)
501    }
502}
503
504impl Host {
505    fn parse(mut host: &str, list: &List) -> Result<Host> {
506        if let Ok(domain) = Domain::parse(host, list, true) {
507            return Ok(Host::Domain(domain));
508        }
509        if host.starts_with('[')
510            && !host.starts_with("[[")
511            && host.ends_with(']')
512            && !host.ends_with("]]")
513        {
514            host = host.trim_start_matches('[').trim_end_matches(']');
515        };
516        if let Ok(ip) = IpAddr::from_str(host) {
517            return Ok(Host::Ip(ip));
518        }
519        Err(ErrorKind::InvalidHost.into())
520    }
521
522    /// A convenient method to simply check if a host is an IP address
523    pub fn is_ip(&self) -> bool {
524        if let Host::Ip(_) = self {
525            return true;
526        }
527        false
528    }
529
530    /// A convenient method to simply check if a host is a domain name
531    pub fn is_domain(&self) -> bool {
532        if let Host::Domain(_) = self {
533            return true;
534        }
535        false
536    }
537}
538
539impl fmt::Display for Host {
540    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
541        match self {
542            Host::Ip(ref ip) => write!(f, "{}", ip),
543            Host::Domain(ref domain) => write!(f, "{}", domain),
544        }
545    }
546}
547
548impl Domain {
549    /// Check if a domain has valid syntax
550    // https://en.wikipedia.org/wiki/Domain_name#Domain_name_syntax
551    // http://blog.sacaluta.com/2011/12/dns-domain-names-253-or-255-bytesoctets.html
552    // https://blogs.msdn.microsoft.com/oldnewthing/20120412-00/?p=7873/
553    pub fn has_valid_syntax(domain: &str) -> bool {
554        // we are explicitly checking for this here before calling `domain_to_ascii`
555        // because `domain_to_ascii` strips of leading dots so we won't be able to
556        // check for this later
557        if domain.starts_with('.') {
558            return false;
559        }
560        // let's convert the domain to ascii early on so we can validate
561        // internationalised domain names as well
562        let domain = match Self::try_to_ascii(domain) {
563            Ok(domain) => domain,
564            Err(_) => {
565                return false;
566            }
567        };
568        let mut labels: Vec<&str> = domain.split('.').collect();
569        // strip of the first dot from a domain to support fully qualified domain names
570        if domain.ends_with('.') {
571            labels.pop();
572        }
573        // a domain must not have more than 127 labels
574        if labels.len() > 127 {
575            return false;
576        }
577        labels.reverse();
578        for (i, label) in labels.iter().enumerate() {
579            // the tld must not be a number
580            if i == 0 && label.parse::<f64>().is_ok() {
581                return false;
582            }
583            // any label must only contain allowed characters
584            if !matcher::is_label(label) {
585                return false;
586            }
587        }
588        true
589    }
590
591    /// Get the full domain
592    pub fn full(&self) -> &str {
593        &self.full
594    }
595
596    fn assemble(input: &str, s_len: usize) -> String {
597        let domain = input.to_lowercase();
598
599        let d_labels: Vec<&str> = domain.trim_end_matches('.').split('.').rev().collect();
600
601        (&d_labels[..s_len])
602            .iter()
603            .rev()
604            .copied()
605            .collect::<Vec<_>>()
606            .join(".")
607    }
608
609    fn find_match(input: &str, domain: &str, list: &List) -> Domain {
610        let mut longest_valid = None;
611
612        let mut current = &list.root;
613        let mut s_labels_len = 0;
614        let mut wildcard_match = false;
615
616        for label in domain.rsplit('.') {
617            if let Some(child) = current.children.get(label) {
618                current = child;
619                s_labels_len += 1;
620            } else if let Some(child) = current.children.get("*") {
621                // wildcard rule
622                current = child;
623                s_labels_len += 1;
624                wildcard_match = true;
625            } else {
626                // no match rules
627                break;
628            }
629
630            if let Some(list_leaf) = &current.leaf {
631                longest_valid = Some((list_leaf, s_labels_len));
632            }
633        }
634
635        match longest_valid {
636            Some((leaf, suffix_len)) => {
637                let typ = if !wildcard_match {
638                    Some(leaf.typ)
639                } else {
640                    None
641                };
642
643                let suffix_len = if leaf.is_exception_rule {
644                    suffix_len - 1
645                } else {
646                    suffix_len
647                };
648
649                let suffix = Some(Self::assemble(input, suffix_len));
650                let d_labels_len = domain.match_indices('.').count() + 1;
651
652                let registrable = if d_labels_len > suffix_len {
653                    Some(Self::assemble(input, suffix_len + 1))
654                } else {
655                    None
656                };
657
658                Domain {
659                    full: input.to_owned(),
660                    typ,
661                    suffix,
662                    registrable,
663                }
664            }
665            None => Domain {
666                full: input.to_owned(),
667                typ: None,
668                suffix: None,
669                registrable: None,
670            },
671        }
672    }
673
674    fn try_to_ascii(domain: &str) -> Result<String> {
675        let result = idna::Config::default()
676            .transitional_processing(true)
677            .verify_dns_length(true)
678            .to_ascii(domain);
679        result.map_err(|error| ErrorKind::Uts46(error).into())
680    }
681
682    fn parse(domain: &str, list: &List, check_syntax: bool) -> Result<Domain> {
683        if check_syntax && !Self::has_valid_syntax(domain) {
684            return Err(ErrorKind::InvalidDomain(domain.into()).into());
685        }
686        let input = domain.trim_end_matches('.');
687        let (domain, res) = domain_to_unicode(input);
688        if let Err(errors) = res {
689            return Err(ErrorKind::Uts46(errors).into());
690        }
691        Ok(Self::find_match(input, &domain, list))
692    }
693
694    /// Gets the root domain portion if any
695    pub fn root(&self) -> Option<&str> {
696        self.registrable.as_ref().map(|x| &x[..])
697    }
698
699    /// Gets the suffix if any
700    pub fn suffix(&self) -> Option<&str> {
701        self.suffix.as_ref().map(|x| &x[..])
702    }
703
704    /// Whether the domain has a private suffix
705    pub fn is_private(&self) -> bool {
706        self.typ.map(|t| t == Type::Private).unwrap_or(false)
707    }
708
709    /// Whether the domain has an ICANN suffix
710    pub fn is_icann(&self) -> bool {
711        self.typ.map(|t| t == Type::Icann).unwrap_or(false)
712    }
713
    /// Whether this domain's suffix is in the list
    ///
    /// If it is, this is definitely a valid domain. If it's not
    /// chances are very high that this isn't a valid domain name,
    /// however, it might simply be because the suffix is new and
    /// it hasn't been added to the list yet.
    ///
    /// If you want to validate a domain name, use this as a quick
    /// check but fall back to a DNS lookup if it returns false.
    pub fn has_known_suffix(&self) -> bool {
        // `typ` is only set when a rule matched without going through a wildcard
        self.typ.is_some()
    }
726}
727
728impl fmt::Display for Domain {
729    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
730        write!(f, "{}", self.full.trim_end_matches('.').to_lowercase())
731    }
732}
733
impl DnsName {
    /// Extracts the root domain from a DNS name, if any
    ///
    /// Returns `None` when no root domain with valid syntax was found while
    /// parsing the name.
    pub fn domain(&self) -> Option<&Domain> {
        self.domain.as_ref()
    }
}
740
impl fmt::Display for DnsName {
    /// Formats the DNS name by delegating to the stored `name` string with the
    /// caller's formatter.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // NOTE(review): presumably resolves to `Display::fmt` on `String` — confirm,
        // no formatting trait is imported by name in the visible `use` lines
        self.name.fmt(f)
    }
}