pest/
lib.rs

1// pest. The Elegant Parser
2// Copyright (c) 2018 Dragoș Tiselice
3//
4// Licensed under the Apache License, Version 2.0
5// <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT
6// license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. All files in the project carrying such notice may not be copied,
8// modified, or distributed except according to those terms.
9#![no_std]
10#![doc(
11    html_logo_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg",
12    html_favicon_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg"
13)]
14#![warn(missing_docs, rust_2018_idioms, unused_qualifications)]
15//! # pest. The Elegant Parser
16//!
17//! pest is a general purpose parser written in Rust with a focus on accessibility, correctness,
18//! and performance. It uses parsing expression grammars (or [PEG]) as input, which are similar in
19//! spirit to regular expressions, but which offer the enhanced expressivity needed to parse
20//! complex languages.
21//!
22//! [PEG]: https://en.wikipedia.org/wiki/Parsing_expression_grammar
23//!
24//! ## Getting started
25//!
26//! The recommended way to start parsing with pest is to read the official [book].
27//!
28//! Other helpful resources:
29//!
30//! * API reference on [docs.rs]
31//! * play with grammars and share them on our [fiddle]
32//! * find previous common questions answered or ask questions on [GitHub Discussions]
33//! * leave feedback, ask questions, or greet us on [Gitter] or [Discord]
34//!
35//! [book]: https://pest.rs/book
36//! [docs.rs]: https://docs.rs/pest
37//! [fiddle]: https://pest.rs/#editor
38//! [Gitter]: https://gitter.im/pest-parser/pest
39//! [Discord]: https://discord.gg/XEGACtWpT2
40//! [GitHub Discussions]: https://github.com/pest-parser/pest/discussions
41//!
42//! ## Usage
43//!
44//! The core of pest is the trait [`Parser`], which provides an interface to the parsing
45//! functionality.
46//!
47//! The accompanying crate `pest_derive` can automatically generate a [`Parser`] from a PEG
48//! grammar. Using `pest_derive` is highly encouraged, but it is also possible to implement
49//! [`Parser`] manually if required.
50//!
51//! ## `.pest` files
52//!
53//! Grammar definitions reside in custom `.pest` files located in the crate `src` directory.
54//! Parsers are automatically generated from these files using `#[derive(Parser)]` and a special
55//! `#[grammar = "..."]` attribute on a dummy struct.
56//!
57//! ```ignore
58//! #[derive(Parser)]
59//! #[grammar = "path/to/my_grammar.pest"] // relative to src
60//! struct MyParser;
61//! ```
62//!
63//! The syntax of `.pest` files is documented in the [`pest_derive` crate].
64//!
65//! ## Inline grammars
66//!
67//! Grammars can also be inlined by using the `#[grammar_inline = "..."]` attribute.
68//!
69//! [`Parser`]: trait.Parser.html
70//! [`pest_derive` crate]: https://docs.rs/pest_derive/
71//!
72//! ## Grammar
73//!
74//! A grammar is a series of rules separated by whitespace, possibly containing comments.
75//!
76//! ### Comments
77//!
78//! Comments start with `//` and end at the end of the line.
79//!
80//! ```text
81//! // a comment
82//! ```
83//!
84//! ### Rules
85//!
86//! Rules have the following form:
87//!
88//! ```ignore
89//! name = optional_modifier { expression }
90//! ```
91//!
92//! The name of the rule is formed from alphanumeric characters or `_`, with the condition that the
93//! first character is not a digit. The name is used to create token pairs: when the rule starts
94//! being parsed, the starting part of the token pair is produced, and the ending part is produced
95//! when the rule finishes parsing.
96//!
97//! The following token pair notation `a(b(), c())` denotes the tokens: start `a`, start `b`, end
98//! `b`, start `c`, end `c`, end `a`.
99//!
100//! #### Modifiers
101//!
102//! Modifiers are optional and can be one of `_`, `@`, `$`, or `!`. These modifiers change the
103//! behavior of the rules.
104//!
105//! 1. Silent (`_`)
106//!
107//!     Silent rules do not create token pairs during parsing, nor are they error-reported.
108//!
109//!     ```ignore
110//!     a = _{ "a" }
111//!     b =  { a ~ "b" }
112//!     ```
113//!
114//!     Parsing `"ab"` produces the token pair `b()`.
115//!
116//! 2. Atomic (`@`)
117//!
118//!     Atomic rules do not accept whitespace or comments within their expressions and have a
119//!     cascading effect on any rule they call. I.e. rules that are not atomic but are called by atomic
120//!     rules behave atomically.
121//!
122//!     Any rules called by atomic rules do not generate token pairs.
123//!
124//!     ```ignore
125//!     a =  { "a" }
126//!     b = @{ a ~ "b" }
127//!
128//!     WHITESPACE = _{ " " }
129//!     ```
130//!
131//!     Parsing `"ab"` produces the token pair `b()`, while `"a   b"` produces an error.
132//!
133//! 3. Compound-atomic (`$`)
134//!
135//!     Compound-atomic rules are identical to atomic rules, except that rules called by them are
136//!     not forbidden from generating token pairs.
137//!
138//!     ```ignore
139//!     a =  { "a" }
140//!     b = ${ a ~ "b" }
141//!
142//!     WHITESPACE = _{ " " }
143//!     ```
144//!
145//!     Parsing `"ab"` produces the token pairs `b(a())`, while `"a   b"` produces an error.
146//!
147//! 4. Non-atomic (`!`)
148//!
149//!     Non-atomic rules are identical to normal rules, except that they stop the cascading effect
150//!     of atomic and compound-atomic rules.
151//!
152//!     ```ignore
153//!     a =  { "a" }
154//!     b = !{ a ~ "b" }
155//!     c = @{ b }
156//!
157//!     WHITESPACE = _{ " " }
158//!     ```
159//!
160//!     Parsing either `"ab"` or `"a   b"` produces the token pairs `c(a())`.
161//!
162//! #### Expressions
163//!
164//! Expressions can be either terminals or non-terminals.
165//!
166//! 1. Terminals
167//!
168//! | Terminal   | Usage                                                          |
169//! |------------|----------------------------------------------------------------|
170//! | `"a"`      | matches the exact string `"a"`                                 |
171//! | `^"a"`     | matches the exact string `"a"` case insensitively (ASCII only) |
172//! | `'a'..'z'` | matches one character between `'a'` and `'z'`                  |
173//! | `a`        | matches rule `a`                                               |
174//!
175//! Strings and characters follow
176//! [Rust's escape mechanisms](https://doc.rust-lang.org/reference/tokens.html#byte-escapes), while
177//! identifiers can contain alphanumeric characters and underscores (`_`), as long as they do not
178//! start with a digit.
179//!
180//! 2. Non-terminals
181//!
182//! | Non-terminal          | Usage                                                      |
183//! |-----------------------|------------------------------------------------------------|
184//! | `(e)`                 | matches `e`                                                |
185//! | `e1 ~ e2`             | matches the sequence `e1` `e2`                             |
186//! | <code>e1 \| e2</code> | matches either `e1` or `e2`                                |
187//! | `e*`                  | matches `e` zero or more times                             |
188//! | `e+`                  | matches `e` one or more times                              |
189//! | `e{n}`                | matches `e` exactly `n` times                              |
190//! | `e{, n}`              | matches `e` at most `n` times                              |
191//! | `e{n,}`               | matches `e` at least `n` times                             |
192//! | `e{m, n}`             | matches `e` between `m` and `n` times inclusively          |
193//! | `e?`                  | optionally matches `e`                                     |
194//! | `&e`                  | matches `e` without making progress                        |
195//! | `!e`                  | matches if `e` doesn't match without making progress       |
196//! | `PUSH(e)`             | matches `e` and pushes its captured string down the stack  |
197//!
198//! where `e`, `e1`, and `e2` are expressions.
199//!
200//! Matching is greedy, without backtracking.  Note the difference in behavior for
201//! these two rules in matching identifiers that don't end in an underscore:
202//!
203//! ```ignore
204//! // input: ab_bb_b
205//!
206//! identifier = @{ "a" ~ ("b"|"_")* ~ "b" }
207//! // matches:      a     b_bb_b       nothing -> error!      
208//!
209//! identifier = @{ "a" ~ ("_"* ~ "b")* }
210//! // matches:      a     b, _bb, _b   in three repetitions
211//! ```
212//!
213//! Expressions can modify the stack only if they match the input. For example,
214//! if `e1` in the compound expression `e1 | e2` does not match the input, then
215//! it does not modify the stack, so `e2` sees the stack in the same state as
216//! `e1` did. Repetitions and optionals (`e*`, `e+`, `e{, n}`, `e{n,}`,
217//! `e{m,n}`, `e?`) can modify the stack each time `e` matches. The `!e` and `&e`
218//! expressions are a special case; they never modify the stack.
219//!
220//! Many languages have "keyword" tokens (e.g. if, for, while) as well as general tokens (e.g.
221//! identifier) that match any word. To match a keyword, you generally need to ensure that it is not
222//! immediately followed by another letter or digit (otherwise it would be matched as an identifier).
223//!
224//! ## Special rules
225//!
226//! Special rules can be called within the grammar. They are:
227//!
228//! * `WHITESPACE` - runs between rules and sub-rules
229//! * `COMMENT` - runs between rules and sub-rules
230//! * `ANY` - matches exactly one `char`
231//! * `SOI` - (start-of-input) matches only when a `Parser` is still at the starting position
232//! * `EOI` - (end-of-input) matches only when a `Parser` has reached its end
233//! * `POP` - pops a string from the stack and matches it
234//! * `POP_ALL` - pops the entire state of the stack and matches it
235//! * `PEEK` - peeks a string from the stack and matches it
236//! * `PEEK[a..b]` - peeks part of the stack and matches it
237//! * `PEEK_ALL` - peeks the entire state of the stack and matches it
238//! * `DROP` - drops the top of the stack (fails to match if the stack is empty)
239//!
240//! `WHITESPACE` and `COMMENT` should be defined manually if needed. All other rules cannot be
241//! overridden.
242//!
243//! ## `WHITESPACE` and `COMMENT`
244//!
245//! When defined, these rules get matched automatically in sequences (`~`) and repetitions
246//! (`*`, `+`) between expressions. Atomic rules and those rules called by atomic rules are exempt
247//! from this behavior.
248//!
249//! These rules should be defined to match only one whitespace character and one comment
250//! respectively, since they are run in repetitions.
251//!
252//! If both `WHITESPACE` and `COMMENT` are defined, this grammar:
253//!
254//! ```ignore
255//! a = { b ~ c }
256//! ```
257//!
258//! is effectively transformed into this one behind the scenes:
259//!
260//! ```ignore
261//! a = { b ~ WHITESPACE* ~ (COMMENT ~ WHITESPACE*)* ~ c }
262//! ```
263//!
264//! ## `PUSH`, `POP`, `DROP`, and `PEEK`
265//!
266//! `PUSH(e)` simply pushes the captured string of the expression `e` down a stack. This stack can
267//! then later be used to match grammar based on its content with `POP` and `PEEK`.
268//!
269//! `PEEK` always matches the string at the top of the stack. So, if the stack contains `["b", "a"]`
270//! (`"a"` being on top), this grammar:
271//!
272//! ```ignore
273//! a = { PEEK }
274//! ```
275//!
276//! is effectively transformed into this one at parse time:
277//!
278//! ```ignore
279//! a = { "a" }
280//! ```
281//!
282//! `POP` works the same way with the exception that it pops the string off of the stack if the
283//! match worked. With the stack from above, if `POP` matches `"a"`, the stack will be mutated
284//! to `["b"]`.
285//!
286//! `DROP` makes it possible to remove the string at the top of the stack
287//! without matching it. If the stack is nonempty, `DROP` drops the top of the
288//! stack. If the stack is empty, then `DROP` fails to match.
289//!
290//! ### Advanced peeking
291//!
292//! `PEEK[start..end]` and `PEEK_ALL` allow peeking deeper into the stack. The syntax works exactly
293//! like Rust’s exclusive slice syntax. Additionally, negative indices can be used to indicate an
294//! offset from the top. If the end lies before or at the start, the expression matches (as does
295//! a `PEEK_ALL` on an empty stack). With the stack `["c", "b", "a"]` (`"a"` on top):
296//!
297//! ```ignore
298//! fill = PUSH("c") ~ PUSH("b") ~ PUSH("a")
299//! v = { PEEK_ALL } = { "a" ~ "b" ~ "c" }  // top to bottom
300//! w = { PEEK[..] } = { "c" ~ "b" ~ "a" }  // bottom to top
301//! x = { PEEK[1..2] } = { PEEK[1..-1] } = { "b" }
302//! y = { PEEK[..-2] } = { PEEK[0..1] } = { "c" }
303//! z = { PEEK[1..] } = { PEEK[-2..3] } = { "b" ~ "a" }
304//! n = { PEEK[2..-2] } = { PEEK[2..1] } = { "" }
305//! ```
306//!
307//! For historical reasons, `PEEK_ALL` matches from top to bottom, while `PEEK[start..end]` matches
308//! from bottom to top. There is currently no syntax to match a slice of the stack top to bottom.
309//!
310//! ## `Rule`
311//!
312//! All rules defined or used in the grammar populate a generated `enum` called `Rule`. This
313//! implements `pest`'s `RuleType` and can be used throughout the API.
314//!
315//! ## Built-in rules
316//!
317//! Pest also comes with a number of built-in rules for convenience. They are:
318//!
319//! * `ASCII_DIGIT` - matches a numeric character from 0..9
320//! * `ASCII_NONZERO_DIGIT` - matches a numeric character from 1..9
321//! * `ASCII_BIN_DIGIT` - matches a numeric character from 0..1
322//! * `ASCII_OCT_DIGIT` - matches a numeric character from 0..7
323//! * `ASCII_HEX_DIGIT` - matches a numeric character from 0..9 or a..f or A..F
324//! * `ASCII_ALPHA_LOWER` - matches a character from a..z
325//! * `ASCII_ALPHA_UPPER` - matches a character from A..Z
326//! * `ASCII_ALPHA` - matches a character from a..z or A..Z
327//! * `ASCII_ALPHANUMERIC` - matches a character from a..z or A..Z or 0..9
328//! * `ASCII` - matches a character from \x00..\x7f
329//! * `NEWLINE` - matches either "\n" or "\r\n" or "\r"
330
331#![doc(html_root_url = "https://docs.rs/pest")]
332
333extern crate alloc;
334#[cfg(feature = "std")]
335extern crate std;
336
337pub use crate::parser::Parser;
338pub use crate::parser_state::{
339    set_call_limit, state, Atomicity, Lookahead, MatchDir, ParseResult, ParserState,
340};
341pub use crate::position::Position;
342pub use crate::span::{merge_spans, Lines, LinesSpan, Span};
343pub use crate::stack::Stack;
344pub use crate::token::Token;
345use core::fmt::Debug;
346use core::hash::Hash;
347
348pub mod error;
349pub mod iterators;
350mod macros;
351mod parser;
352mod parser_state;
353mod position;
354pub mod pratt_parser;
355#[deprecated(
356    since = "2.4.0",
357    note = "Use `pest::pratt_parser` instead (it is an equivalent which also supports unary prefix/suffix operators).
358While prec_climber is going to be kept in 2.x minor and patch releases, it may be removed in a future major release."
359)]
360pub mod prec_climber;
361mod span;
362mod stack;
363mod token;
364
365#[doc(hidden)]
366pub mod unicode;
367
368/// A trait which parser rules must implement.
369///
370/// It declares no items of its own: it only bundles the `Copy + Debug + Eq + Hash + Ord` bounds,
371/// and the blanket implementation below provides it for every type that satisfies all of them.
372///
373/// This is essentially a [trait alias](https://github.com/rust-lang/rfcs/pull/1733). When trait
374/// aliases are implemented, this may be replaced by one.
375pub trait RuleType: Copy + Debug + Eq + Hash + Ord {}
376
377impl<T: Copy + Debug + Eq + Hash + Ord> RuleType for T {}
pest/lib.rs

pest/
lib.rs