lipsum/
lib.rs

1//! Lorem ipsum generator.
2//!
3//! This crate contains functions for generating pseudo-Latin lorem
//! ipsum placeholder text. The traditional lorem ipsum text starts
5//! like this:
6//!
7//! > Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do
8//! > eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut
9//! > enim ad minim veniam, quis nostrud exercitation ullamco laboris
10//! > nisi ut aliquip ex ea commodo consequat. [...]
11//!
12//! This text is in the [`LOREM_IPSUM`] constant. Random text looking
13//! like the above can be generated using the [`lipsum`] function.
14//! This function allows you to generate as much text as desired and
15//! each invocation will generate different text. This is done using a
16//! [Markov chain] based on both the [`LOREM_IPSUM`] and
17//! [`LIBER_PRIMUS`] texts. The latter constant holds the full text of
18//! the first book of a work by Cicero, of which the lorem ipsum text
19//! is a scrambled subset.
20//!
//! The random looking text is generated using a Markov chain of order
22//! two, which simply means that the next word is based on the
23//! previous two words in the input texts. The Markov chain can be
24//! used with other input texts by creating an instance of
25//! [`MarkovChain`] and calling its [`learn`] method.
26//!
27//! [`LOREM_IPSUM`]: constant.LOREM_IPSUM.html
28//! [`LIBER_PRIMUS`]: constant.LIBER_PRIMUS.html
29//! [`lipsum`]: fn.lipsum.html
30//! [`MarkovChain`]: struct.MarkovChain.html
31//! [`learn`]: struct.MarkovChain.html#method.learn
32//! [Markov chain]: https://en.wikipedia.org/wiki/Markov_chain
33
34#![doc(html_root_url = "https://docs.rs/lipsum/0.6.0")]
35#![deny(missing_docs)]
36
37extern crate rand;
38#[cfg(test)]
39extern crate rand_xorshift;
40
41use rand::rngs::ThreadRng;
42use rand::seq::SliceRandom;
43use rand::Rng;
44use std::cell::RefCell;
45use std::collections::HashMap;
46
/// A bigram is simply two consecutive words.
///
/// Both words are string slices that share the lifetime `'a`.
pub type Bigram<'a> = (&'a str, &'a str);
49
50/// Simple order two Markov chain implementation.
51///
52/// The [Markov chain] is a chain of order two, which means that it
53/// will use the previous two words (a bigram) when predicting the
54/// next word. This is normally enough to generate random text that
55/// looks somewhat plausible. The implementation is based on
56/// [Generating arbitrary text with Markov chains in Rust][blog post].
57///
58/// [Markov chain]: https://en.wikipedia.org/wiki/Markov_chain
59/// [blog post]: https://blakewilliams.me/posts/generating-arbitrary-text-with-markov-chains-in-rust
60pub struct MarkovChain<'a, R: Rng> {
61    map: HashMap<Bigram<'a>, Vec<&'a str>>,
62    keys: Vec<Bigram<'a>>,
63    rng: R,
64}
65
66impl<'a> MarkovChain<'a, ThreadRng> {
67    /// Create a new empty Markov chain. It will use a default
68    /// thread-local random number generator.
69    ///
70    /// # Examples
71    ///
72    /// ```
73    /// use lipsum::MarkovChain;
74    ///
75    /// let chain = MarkovChain::new();
76    /// assert!(chain.is_empty());
77    /// ```
78    pub fn new() -> MarkovChain<'a, ThreadRng> {
79        MarkovChain::new_with_rng(rand::thread_rng())
80    }
81}
82
83impl<'a> Default for MarkovChain<'a, ThreadRng> {
84    /// Create a new empty Markov chain. It will use a default
85    /// thread-local random number generator.
86    fn default() -> Self {
87        Self::new()
88    }
89}
90
91impl<'a, R: Rng> MarkovChain<'a, R> {
92    /// Create a new empty Markov chain that uses the given random
93    /// number generator.
94    ///
95    /// # Examples
96    ///
97    /// ```
98    /// extern crate rand;
99    /// extern crate rand_xorshift;
100    /// # extern crate lipsum;
101    ///
102    /// # fn main() {
103    /// use rand::SeedableRng;
104    /// use rand_xorshift::XorShiftRng;
105    /// use lipsum::MarkovChain;
106    ///
107    /// let rng = XorShiftRng::seed_from_u64(0);
108    /// let mut chain = MarkovChain::new_with_rng(rng);
109    /// chain.learn("infra-red red orange yellow green blue indigo x-ray");
110    ///
111    /// // The chain jumps consistently like this:
112    /// assert_eq!(chain.generate(1), "Yellow.");
113    /// assert_eq!(chain.generate(1), "Blue.");
114    /// assert_eq!(chain.generate(1), "Green.");
115    /// # }
116    /// ```
117
118    pub fn new_with_rng(rng: R) -> MarkovChain<'a, R> {
119        MarkovChain {
120            map: HashMap::new(),
121            keys: Vec::new(),
122            rng: rng,
123        }
124    }
125
126    /// Add new text to the Markov chain. This can be called several
127    /// times to build up the chain.
128    ///
129    /// # Examples
130    ///
131    /// ```
132    /// use lipsum::MarkovChain;
133    ///
134    /// let mut chain = MarkovChain::new();
135    /// chain.learn("red green blue");
136    /// assert_eq!(chain.words(("red", "green")), Some(&vec!["blue"]));
137    ///
138    /// chain.learn("red green yellow");
139    /// assert_eq!(chain.words(("red", "green")), Some(&vec!["blue", "yellow"]));
140    /// ```
141    pub fn learn(&mut self, sentence: &'a str) {
142        let words = sentence.split_whitespace().collect::<Vec<&str>>();
143        for window in words.windows(3) {
144            let (a, b, c) = (window[0], window[1], window[2]);
145            self.map.entry((a, b)).or_insert_with(Vec::new).push(c);
146        }
147        // Sync the keys with the current map.
148        self.keys = self.map.keys().cloned().collect();
149        self.keys.sort();
150    }
151
152    /// Returs the number of states in the Markov chain.
153    ///
154    /// # Examples
155    ///
156    /// ```
157    /// use lipsum::MarkovChain;
158    ///
159    /// let mut chain = MarkovChain::new();
160    /// assert_eq!(chain.len(), 0);
161    ///
162    /// chain.learn("red orange yellow green blue indigo");
163    /// assert_eq!(chain.len(), 4);
164    /// ```
165    #[inline]
166    pub fn len(&self) -> usize {
167        self.map.len()
168    }
169
170    /// Returns `true` if the Markov chain has no states.
171    ///
172    /// # Examples
173    ///
174    /// ```
175    /// use lipsum::MarkovChain;
176    ///
177    /// let mut chain = MarkovChain::new();
178    /// assert!(chain.is_empty());
179    ///
180    /// chain.learn("foo bar baz");
181    /// assert!(!chain.is_empty());
182    /// ```
183    pub fn is_empty(&self) -> bool {
184        self.len() == 0
185    }
186
187    /// Get the possible words following the given bigram, or `None`
188    /// if the state is invalid.
189    ///
190    /// # Examples
191    ///
192    /// ```
193    /// use lipsum::MarkovChain;
194    ///
195    /// let mut chain = MarkovChain::new();
196    /// chain.learn("red green blue");
197    /// assert_eq!(chain.words(("red", "green")), Some(&vec!["blue"]));
198    /// assert_eq!(chain.words(("foo", "bar")), None);
199    /// ```
200    pub fn words(&self, state: Bigram<'a>) -> Option<&Vec<&str>> {
201        self.map.get(&state)
202    }
203
204    /// Generate a sentence with `n` words of lorem ipsum text. The
205    /// sentence will start from a random point in the Markov chain
206    /// and a `.` will be added as necessary to form a full sentence.
207    ///
208    /// See [`generate_from`] if you want to control the starting
209    /// point for the generated text and see [`iter`] if you simply
210    /// want a sequence of words.
211    ///
212    /// # Examples
213    ///
214    /// Generating the sounds of a grandfather clock:
215    ///
216    /// ```
217    /// use lipsum::MarkovChain;
218    ///
219    /// let mut chain = MarkovChain::new();
220    /// chain.learn("Tick, Tock, Tick, Tock, Ding! Tick, Tock, Ding! Ding!");
221    /// println!("{}", chain.generate(15));
222    /// ```
223    ///
224    /// The output looks like this:
225    ///
226    /// > Ding! Tick, Tock, Tick, Tock, Ding! Ding! Tock, Ding! Tick,
227    /// > Tock, Tick, Tock, Tick, Tock.
228    ///
229    /// [`generate_from`]: struct.MarkovChain.html#method.generate_from
230    /// [`iter`]: struct.MarkovChain.html#method.iter
231    pub fn generate(&mut self, n: usize) -> String {
232        join_words(self.iter().take(n))
233    }
234
235    /// Generate a sentence with `n` words of lorem ipsum text. The
236    /// sentence will start from the given bigram and a `.` will be
237    /// added as necessary to form a full sentence.
238    ///
239    /// Use [`generate`] if the starting point is not important. See
240    /// [`iter_from`] if you want a sequence of words that you can
241    /// format yourself.
242    ///
243    /// [`generate`]: struct.MarkovChain.html#method.generate
244    /// [`iter_from`]: struct.MarkovChain.html#method.iter_from
245    pub fn generate_from(&mut self, n: usize, from: Bigram<'a>) -> String {
246        join_words(self.iter_from(from).take(n))
247    }
248
249    /// Make a never-ending iterator over the words in the Markov
250    /// chain. The iterator starts at a random point in the chain.
251    pub fn iter(&mut self) -> Words<R> {
252        let state = if self.is_empty() {
253            ("", "")
254        } else {
255            *self.keys.choose(&mut self.rng).unwrap()
256        };
257        Words {
258            map: &self.map,
259            rng: &mut self.rng,
260            keys: &self.keys,
261            state: state,
262        }
263    }
264
265    /// Make a never-ending iterator over the words in the Markov
266    /// chain. The iterator starts at the given bigram.
267    pub fn iter_from(&mut self, from: Bigram<'a>) -> Words<R> {
268        Words {
269            map: &self.map,
270            rng: &mut self.rng,
271            keys: &self.keys,
272            state: from,
273        }
274    }
275}
276
277/// Never-ending iterator over words in the Markov chain.
278///
279/// Generated with the [`iter`] or [`iter_from`] methods.
280///
281/// [`iter`]: struct.MarkovChain.html#method.iter
282/// [`iter_from`]: struct.MarkovChain.html#method.iter_from
283pub struct Words<'a, R: 'a + Rng> {
284    map: &'a HashMap<Bigram<'a>, Vec<&'a str>>,
285    rng: &'a mut R,
286    keys: &'a Vec<Bigram<'a>>,
287    state: Bigram<'a>,
288}
289
290impl<'a, R: Rng> Iterator for Words<'a, R> {
291    type Item = &'a str;
292
293    fn next(&mut self) -> Option<&'a str> {
294        if self.map.is_empty() {
295            return None;
296        }
297
298        let result = Some(self.state.0);
299
300        while !self.map.contains_key(&self.state) {
301            self.state = *self.keys.choose(self.rng).unwrap();
302        }
303        let next_words = &self.map[&self.state];
304        let next = next_words.choose(self.rng).unwrap();
305        self.state = (self.state.1, next);
306        result
307    }
308}
309
/// Check if `c` is an ASCII punctuation character.
///
/// Returns `false` for every other character, including all
/// non-ASCII characters.
fn is_ascii_punctuation(c: char) -> bool {
    // We use the table from the standard library's
    // is_ascii_punctuation function:
    //
    // U+0021 ... U+002F `! " # $ % & ' ( ) * + , - . /`
    // U+003A ... U+0040 `: ; < = > ? @`
    // U+005B ... U+0060 `[ \\ ] ^ _ \``
    // U+007B ... U+007E `{ | } ~`
    //
    // Plain comparisons are used instead of `...` range patterns,
    // which are deprecated and rejected by newer editions.
    (c >= '\x21' && c <= '\x2F')
        || (c >= '\x3A' && c <= '\x40')
        || (c >= '\x5B' && c <= '\x60')
        || (c >= '\x7B' && c <= '\x7E')
}
324
/// Capitalize the first character in a string.
///
/// Only the first character is converted to uppercase; the rest of
/// `word` is copied unchanged. An empty `word` gives an empty string.
fn capitalize<'a>(word: &'a str) -> String {
    let mut rest = word.chars();
    let mut capitalized = String::with_capacity(word.len());
    if let Some(first) = rest.next() {
        // Uppercasing a single character may yield several characters.
        capitalized.extend(first.to_uppercase());
        capitalized.push_str(rest.as_str());
    }
    capitalized
}
337
338/// Join words from an iterator. The first word is always capitalized
339/// and the generated sentence will end with `'.'` if it doesn't
340/// already end with some other ASCII punctuation character.
341fn join_words<'a, I: Iterator<Item = &'a str>>(mut words: I) -> String {
342    match words.next() {
343        None => String::new(),
344        Some(word) => {
345            let mut sentence = capitalize(word);
346
347            // Add remaining words.
348            for word in words {
349                sentence.push(' ');
350                sentence.push_str(word);
351            }
352
353            // Ensure the sentence ends with either one of ".!?".
354            if !sentence.ends_with(|c: char| c == '.' || c == '!' || c == '?') {
355                // Trim all trailing punctuation characters to avoid
356                // adding '.' after a ',' or similar.
357                let idx = sentence.trim_right_matches(is_ascii_punctuation).len();
358                sentence.truncate(idx);
359                sentence.push('.');
360            }
361
362            sentence
363        }
364    }
365}
366
367/// The traditional lorem ipsum text as given in [Wikipedia]. Using
368/// this text alone for a Markov chain of order two doesn't work very
369/// well since each bigram (two consequtive words) is followed by just
370/// one other word. In other words, the Markov chain will always
371/// produce the same output and recreate the lorem ipsum text
372/// precisely. However, combining it with the full text in
373/// [`LIBER_PRIMUS`] works well.
374///
375/// [Wikipedia]: https://en.wikipedia.org/wiki/Lorem_ipsum
376/// [`LIBER_PRIMUS`]: constant.LIBER_PRIMUS.html
377pub const LOREM_IPSUM: &'static str = include_str!("lorem-ipsum.txt");
378
379/// The first book in Cicero's work De finibus bonorum et malorum ("On
380/// the ends of good and evil"). The lorem ipsum text in
381/// [`LOREM_IPSUM`] is derived from part of this text.
382///
383/// [`LOREM_IPSUM`]: constant.LOREM_IPSUM.html
384pub const LIBER_PRIMUS: &'static str = include_str!("liber-primus.txt");
385
386thread_local! {
387    // Markov chain generating lorem ipsum text.
388    static LOREM_IPSUM_CHAIN: RefCell<MarkovChain<'static, ThreadRng>> = {
389        let mut chain = MarkovChain::new();
390        // The cost of learning increases as more and more text is
391        // added, so we start with the smallest text.
392        chain.learn(LOREM_IPSUM);
393        chain.learn(LIBER_PRIMUS);
394        RefCell::new(chain)
395    }
396}
397
398/// Generate `n` words of lorem ipsum text. The output will always
399/// start with "Lorem ipsum".
400///
401/// The text continues with the standard lorem ipsum text from
402/// [`LOREM_IPSUM`] and becomes random if more than 18 words is
403/// requested. See [`lipsum_words`] if fully random text is needed.
404///
405/// # Examples
406///
407/// ```
408/// use lipsum::lipsum;
409///
410/// assert_eq!(lipsum(7), "Lorem ipsum dolor sit amet, consectetur adipiscing.");
411/// ```
412///
413/// [`LOREM_IPSUM`]: constant.LOREM_IPSUM.html
414/// [`lipsum_words`]: fn.lipsum_words.html
415pub fn lipsum(n: usize) -> String {
416    LOREM_IPSUM_CHAIN.with(|cell| {
417        let mut chain = cell.borrow_mut();
418        chain.generate_from(n, ("Lorem", "ipsum"))
419    })
420}
421
422/// Generate `n` words of random lorem ipsum text.
423///
424/// The text starts with a random word from [`LOREM_IPSUM`]. Multiple
425/// sentences may be generated, depending on the punctuation of the
426/// words being random selected.
427///
428/// # Examples
429///
430/// ```
431/// use lipsum::lipsum_words;
432///
433/// println!("{}", lipsum_words(6));
434/// // -> "Propter soliditatem, censet in infinito inani."
435/// ```
436///
437/// [`LOREM_IPSUM`]: constant.LOREM_IPSUM.html
438pub fn lipsum_words(n: usize) -> String {
439    LOREM_IPSUM_CHAIN.with(|cell| {
440        let mut chain = cell.borrow_mut();
441        chain.generate(n)
442    })
443}
444
/// Minimum number of words to include in a title.
const TITLE_MIN_WORDS: usize = 3;
/// Maximum number of words to include in a title.
const TITLE_MAX_WORDS: usize = 8;
/// Words of this length (in bytes) or shorter are not capitalized,
/// unless they are the first word of the title.
const TITLE_SMALL_WORD: usize = 3;
451
452/// Generate a short lorem ipsum text with words in title case.
453///
454/// The words are capitalized and stripped for punctuation characters.
455///
456/// # Examples
457///
458/// ```
459/// use lipsum::lipsum_title;
460///
461/// println!("{}", lipsum_title());
462/// ```
463///
464/// This will generate a string like
465///
466/// > Grate Meminit et Praesentibus
467///
468/// which should be suitable for use in a document title for section
469/// heading.
470pub fn lipsum_title() -> String {
471    LOREM_IPSUM_CHAIN.with(|cell| {
472        let n = rand::thread_rng().gen_range(TITLE_MIN_WORDS, TITLE_MAX_WORDS);
473        let mut chain = cell.borrow_mut();
474        // The average word length with our corpus is 7.6 bytes so
475        // this capacity will avoid most allocations.
476        let mut title = String::with_capacity(8 * n);
477
478        let words = chain
479            .iter()
480            .map(|word| word.trim_matches(is_ascii_punctuation))
481            .filter(|word| !word.is_empty())
482            .take(n);
483
484        for (i, word) in words.enumerate() {
485            if i > 0 {
486                title.push(' ');
487            }
488
489            // Capitalize the first word and all long words.
490            if i == 0 || word.len() > TITLE_SMALL_WORD {
491                title.push_str(&capitalize(word));
492            } else {
493                title.push_str(word);
494            }
495        }
496        title
497    })
498}
499
#[cfg(test)]
mod tests {
    use super::rand::SeedableRng;
    use super::rand_xorshift::XorShiftRng;
    use super::*;

    #[test]
    fn starts_with_lorem_ipsum() {
        assert!(lipsum(10).starts_with("Lorem ipsum"));
    }

    #[test]
    fn generate_zero_words() {
        assert_eq!(lipsum(0).split_whitespace().count(), 0);
    }

    #[test]
    fn generate_one_word() {
        assert_eq!(lipsum(1).split_whitespace().count(), 1);
    }

    #[test]
    fn generate_two_words() {
        assert_eq!(lipsum(2).split_whitespace().count(), 2);
    }

    #[test]
    fn starts_differently() {
        // Check that calls to lipsum_words don't always start with
        // "Lorem ipsum". Several attempts are made so that a single
        // unlucky random start cannot fail the test, and
        // `starts_with` cannot panic on short output the way slicing
        // at a fixed byte index could.
        assert!((0..10).any(|_| !lipsum_words(5).starts_with("Lorem ipsum")));
    }

    #[test]
    fn generate_title() {
        for word in lipsum_title().split_whitespace() {
            assert!(
                !word.starts_with(is_ascii_punctuation) && !word.ends_with(is_ascii_punctuation),
                "Unexpected punctuation: {:?}",
                word
            );
            if word.len() > TITLE_SMALL_WORD {
                assert!(
                    word.starts_with(char::is_uppercase),
                    "Expected long word to be capitalized: {:?}",
                    word
                );
            }
        }
    }

    #[test]
    fn empty_chain() {
        let mut chain = MarkovChain::new();
        assert_eq!(chain.generate(10), "");
    }

    #[test]
    fn generate_from() {
        let mut chain = MarkovChain::new();
        chain.learn("red orange yellow green blue indigo violet");
        assert_eq!(
            chain.generate_from(5, ("orange", "yellow")),
            "Orange yellow green blue indigo."
        );
    }

    #[test]
    fn generate_last_bigram() {
        // The bigram "yyy zzz" will not be present in the Markov
        // chain's map, and so we will not generate "xxx yyy zzz" as
        // one would expect. The chain moves from state "xxx yyy" to
        // "yyy zzz" and yields "yyy", but sees "yyy zzz" as an
        // invalid state and resets itself to "xxx yyy", the only
        // state there is. From there it moves to "yyy zzz" again, so
        // "yyy" is repeated and "zzz" is never produced. With a
        // single state and a single follower no randomness is
        // involved, so the exact output can be checked.
        let mut chain = MarkovChain::new();
        chain.learn("xxx yyy zzz");
        assert_eq!(chain.generate_from(3, ("xxx", "yyy")), "Xxx yyy yyy.");
    }

    #[test]
    fn generate_from_no_panic() {
        // No panic when asked to generate a chain from a starting
        // point that doesn't exist in the chain.
        let mut chain = MarkovChain::new();
        chain.learn("foo bar baz");
        chain.generate_from(3, ("xxx", "yyy"));
    }

    #[test]
    fn chain_map() {
        let mut chain = MarkovChain::new();
        chain.learn("foo bar baz quuz");
        let map = &chain.map;

        assert_eq!(map.len(), 2);
        assert_eq!(map[&("foo", "bar")], vec!["baz"]);
        assert_eq!(map[&("bar", "baz")], vec!["quuz"]);
    }

    #[test]
    fn new_with_rng() {
        let rng = XorShiftRng::seed_from_u64(1234);
        let mut chain = MarkovChain::new_with_rng(rng);
        chain.learn("foo bar x y z");
        chain.learn("foo bar a b c");

        assert_eq!(chain.generate(15), "A b x y y b y bar a b y x y bar a.");
    }
}