bstr/unicode/
word.rs

1use regex_automata::{dfa::Automaton, Anchored, Input};
2
3use crate::{
4    ext_slice::ByteSlice,
5    unicode::fsm::{
6        simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD,
7    },
8    utf8,
9};
10
11/// An iterator over words in a byte string.
12///
13/// This iterator is typically constructed by
14/// [`ByteSlice::words`](trait.ByteSlice.html#method.words).
15///
16/// This is similar to the [`WordsWithBreaks`](struct.WordsWithBreaks.html)
17/// iterator, except it only returns elements that contain a "word" character.
18/// A word character is defined by UTS #18 (Annex C) to be the combination
19/// of the `Alphabetic` and `Join_Control` properties, along with the
20/// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories.
21///
22/// Since words are made up of one or more codepoints, this iterator yields
23/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
24/// are [substituted](index.html#handling-of-invalid-utf-8).
25///
26/// This iterator yields words in accordance with the default word boundary
27/// rules specified in
28/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
29/// In particular, this may not be suitable for Japanese and Chinese scripts
30/// that do not use spaces between words.
31#[derive(Clone, Debug)]
32pub struct Words<'a>(WordsWithBreaks<'a>);
33
34impl<'a> Words<'a> {
35    pub(crate) fn new(bs: &'a [u8]) -> Words<'a> {
36        Words(WordsWithBreaks::new(bs))
37    }
38
39    /// View the underlying data as a subslice of the original data.
40    ///
41    /// The slice returned has the same lifetime as the original slice, and so
42    /// the iterator can continue to be used while this exists.
43    ///
44    /// # Examples
45    ///
46    /// ```
47    /// use bstr::ByteSlice;
48    ///
49    /// let mut it = b"foo bar baz".words();
50    ///
51    /// assert_eq!(b"foo bar baz", it.as_bytes());
52    /// it.next();
53    /// it.next();
54    /// assert_eq!(b" baz", it.as_bytes());
55    /// it.next();
56    /// assert_eq!(b"", it.as_bytes());
57    /// ```
58    #[inline]
59    pub fn as_bytes(&self) -> &'a [u8] {
60        self.0.as_bytes()
61    }
62}
63
64impl<'a> Iterator for Words<'a> {
65    type Item = &'a str;
66
67    #[inline]
68    fn next(&mut self) -> Option<&'a str> {
69        for word in self.0.by_ref() {
70            let input =
71                Input::new(word).anchored(Anchored::Yes).earliest(true);
72            if SIMPLE_WORD_FWD.try_search_fwd(&input).unwrap().is_some() {
73                return Some(word);
74            }
75        }
76        None
77    }
78}
79
80/// An iterator over words in a byte string and their byte index positions.
81///
82/// This iterator is typically constructed by
83/// [`ByteSlice::word_indices`](trait.ByteSlice.html#method.word_indices).
84///
85/// This is similar to the
86/// [`WordsWithBreakIndices`](struct.WordsWithBreakIndices.html) iterator,
87/// except it only returns elements that contain a "word" character. A
88/// word character is defined by UTS #18 (Annex C) to be the combination
89/// of the `Alphabetic` and `Join_Control` properties, along with the
90/// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories.
91///
92/// Since words are made up of one or more codepoints, this iterator
93/// yields `&str` elements (along with their start and end byte offsets).
94/// When invalid UTF-8 is encountered, replacement codepoints are
95/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
96/// indices yielded by this iterator may not correspond to the length of the
97/// word yielded with those indices. For example, when this iterator encounters
98/// `\xFF` in the byte string, then it will yield a pair of indices ranging
99/// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`,
100/// which is three bytes in length. However, when given only valid UTF-8, then
101/// all indices are in exact correspondence with their paired word.
102///
103/// This iterator yields words in accordance with the default word boundary
104/// rules specified in
105/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
106/// In particular, this may not be suitable for Japanese and Chinese scripts
107/// that do not use spaces between words.
108#[derive(Clone, Debug)]
109pub struct WordIndices<'a>(WordsWithBreakIndices<'a>);
110
111impl<'a> WordIndices<'a> {
112    pub(crate) fn new(bs: &'a [u8]) -> WordIndices<'a> {
113        WordIndices(WordsWithBreakIndices::new(bs))
114    }
115
116    /// View the underlying data as a subslice of the original data.
117    ///
118    /// The slice returned has the same lifetime as the original slice, and so
119    /// the iterator can continue to be used while this exists.
120    ///
121    /// # Examples
122    ///
123    /// ```
124    /// use bstr::ByteSlice;
125    ///
126    /// let mut it = b"foo bar baz".word_indices();
127    ///
128    /// assert_eq!(b"foo bar baz", it.as_bytes());
129    /// it.next();
130    /// it.next();
131    /// assert_eq!(b" baz", it.as_bytes());
132    /// it.next();
133    /// it.next();
134    /// assert_eq!(b"", it.as_bytes());
135    /// ```
136    #[inline]
137    pub fn as_bytes(&self) -> &'a [u8] {
138        self.0.as_bytes()
139    }
140}
141
142impl<'a> Iterator for WordIndices<'a> {
143    type Item = (usize, usize, &'a str);
144
145    #[inline]
146    fn next(&mut self) -> Option<(usize, usize, &'a str)> {
147        for (start, end, word) in self.0.by_ref() {
148            let input =
149                Input::new(word).anchored(Anchored::Yes).earliest(true);
150            if SIMPLE_WORD_FWD.try_search_fwd(&input).unwrap().is_some() {
151                return Some((start, end, word));
152            }
153        }
154        None
155    }
156}
157
/// An iterator over every word break in a byte string.
///
/// Values of this type are normally obtained by calling
/// [`ByteSlice::words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks).
///
/// Besides the words themselves, this iterator also yields whatever content
/// sits between them. Concatenating all of the yielded elements therefore
/// reproduces the original string (subject to Unicode replacement codepoint
/// substitutions).
///
/// A word is made up of one or more codepoints, so the items are `&str`
/// values. Whenever invalid UTF-8 is encountered, replacement codepoints are
/// [substituted](index.html#handling-of-invalid-utf-8).
///
/// Word boundaries are determined by the default rules given in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
/// Note that these rules may not be suitable for Japanese and Chinese
/// scripts, which do not put spaces between words.
#[derive(Clone, Debug)]
pub struct WordsWithBreaks<'a> {
    // The bytes that have not been yielded yet.
    bs: &'a [u8],
}
181
182impl<'a> WordsWithBreaks<'a> {
183    pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreaks<'a> {
184        WordsWithBreaks { bs }
185    }
186
187    /// View the underlying data as a subslice of the original data.
188    ///
189    /// The slice returned has the same lifetime as the original slice, and so
190    /// the iterator can continue to be used while this exists.
191    ///
192    /// # Examples
193    ///
194    /// ```
195    /// use bstr::ByteSlice;
196    ///
197    /// let mut it = b"foo bar baz".words_with_breaks();
198    ///
199    /// assert_eq!(b"foo bar baz", it.as_bytes());
200    /// it.next();
201    /// assert_eq!(b" bar baz", it.as_bytes());
202    /// it.next();
203    /// it.next();
204    /// assert_eq!(b" baz", it.as_bytes());
205    /// it.next();
206    /// it.next();
207    /// assert_eq!(b"", it.as_bytes());
208    /// ```
209    #[inline]
210    pub fn as_bytes(&self) -> &'a [u8] {
211        self.bs
212    }
213}
214
215impl<'a> Iterator for WordsWithBreaks<'a> {
216    type Item = &'a str;
217
218    #[inline]
219    fn next(&mut self) -> Option<&'a str> {
220        let (word, size) = decode_word(self.bs);
221        if size == 0 {
222            return None;
223        }
224        self.bs = &self.bs[size..];
225        Some(word)
226    }
227}
228
/// An iterator over every word break in a byte string, paired with byte
/// offsets.
///
/// Values of this type are normally obtained by calling
/// [`ByteSlice::words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices).
///
/// Besides the words themselves, this iterator also yields whatever content
/// sits between them. Concatenating all of the yielded elements therefore
/// reproduces the original string (subject to Unicode replacement codepoint
/// substitutions).
///
/// A word is made up of one or more codepoints, so each item carries a
/// `&str` in addition to the start and end byte offsets. Whenever invalid
/// UTF-8 is encountered, replacement codepoints are
/// [substituted](index.html#handling-of-invalid-utf-8). As a consequence,
/// the distance between the two offsets is not always the length of the
/// accompanying word. For instance, a `\xFF` byte in the input produces
/// offsets that span a single byte while the `&str` is `"\u{FFFD}"`, which
/// is three bytes long. For input that is entirely valid UTF-8, the offsets
/// always correspond exactly to the word they are paired with.
///
/// Word boundaries are determined by the default rules given in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
/// Note that these rules may not be suitable for Japanese and Chinese
/// scripts, which do not put spaces between words.
#[derive(Clone, Debug)]
pub struct WordsWithBreakIndices<'a> {
    // The bytes that have not been yielded yet.
    bs: &'a [u8],
    // Running total of the sizes consumed so far; used as the start offset
    // of the next element.
    forward_index: usize,
}
261
262impl<'a> WordsWithBreakIndices<'a> {
263    pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> {
264        WordsWithBreakIndices { bs, forward_index: 0 }
265    }
266
267    /// View the underlying data as a subslice of the original data.
268    ///
269    /// The slice returned has the same lifetime as the original slice, and so
270    /// the iterator can continue to be used while this exists.
271    ///
272    /// # Examples
273    ///
274    /// ```
275    /// use bstr::ByteSlice;
276    ///
277    /// let mut it = b"foo bar baz".words_with_break_indices();
278    ///
279    /// assert_eq!(b"foo bar baz", it.as_bytes());
280    /// it.next();
281    /// assert_eq!(b" bar baz", it.as_bytes());
282    /// it.next();
283    /// it.next();
284    /// assert_eq!(b" baz", it.as_bytes());
285    /// it.next();
286    /// it.next();
287    /// assert_eq!(b"", it.as_bytes());
288    /// ```
289    #[inline]
290    pub fn as_bytes(&self) -> &'a [u8] {
291        self.bs
292    }
293}
294
295impl<'a> Iterator for WordsWithBreakIndices<'a> {
296    type Item = (usize, usize, &'a str);
297
298    #[inline]
299    fn next(&mut self) -> Option<(usize, usize, &'a str)> {
300        let index = self.forward_index;
301        let (word, size) = decode_word(self.bs);
302        if size == 0 {
303            return None;
304        }
305        self.bs = &self.bs[size..];
306        self.forward_index += size;
307        Some((index, index + size, word))
308    }
309}
310
311fn decode_word(bs: &[u8]) -> (&str, usize) {
312    if bs.is_empty() {
313        ("", 0)
314    } else if let Some(hm) = {
315        let input = Input::new(bs).anchored(Anchored::Yes);
316        WORD_BREAK_FWD.try_search_fwd(&input).unwrap()
317    } {
318        // Safe because a match can only occur for valid UTF-8.
319        let word = unsafe { bs[..hm.offset()].to_str_unchecked() };
320        (word, word.len())
321    } else {
322        const INVALID: &str = "\u{FFFD}";
323        // No match on non-empty bytes implies we found invalid UTF-8.
324        let (_, size) = utf8::decode_lossy(bs);
325        (INVALID, size)
326    }
327}
328
#[cfg(all(test, feature = "std"))]
mod tests {
    use alloc::{vec, vec::Vec};

    #[cfg(not(miri))]
    use ucd_parse::WordBreakTest;

    use crate::ext_slice::ByteSlice;

    /// Collects every element (words and the content between them) of
    /// `bytes`.
    fn words(bytes: &[u8]) -> Vec<&str> {
        bytes.words_with_breaks().collect::<Vec<_>>()
    }

    /// Views each string as raw bytes, for use in failure messages.
    #[cfg(not(miri))]
    fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
        strs.iter().map(|s| s.as_ref().as_bytes()).collect()
    }

    /// Return all of the UCD for word breaks.
    ///
    /// Comment lines and lines mentioning surrogates are skipped.
    #[cfg(not(miri))]
    fn ucdtests() -> Vec<WordBreakTest> {
        const TESTDATA: &str = include_str!("data/WordBreakTest.txt");

        TESTDATA
            .lines()
            .map(str::trim)
            .filter(|line| {
                !line.starts_with("#") && !line.contains("surrogate")
            })
            .map(|line| line.parse::<WordBreakTest>().unwrap())
            .collect()
    }

    /// Runs every UCD word break test through the forward segmenter.
    #[test]
    #[cfg(not(miri))]
    fn forward_ucd() {
        for (i, test) in ucdtests().into_iter().enumerate() {
            let given = test.words.concat();
            let got = words(given.as_bytes());
            assert_eq!(
                test.words,
                got,
                "\n\nword forward break test {} failed:\n\
                 given:    {:?}\n\
                 expected: {:?}\n\
                 got:      {:?}\n",
                i,
                given,
                strs_to_bstrs(&test.words),
                strs_to_bstrs(&got),
            );
        }
    }

    // Extra cases that the UCD tests do not appear to cover.
    //
    // They were found by running this crate's segmenter and ICU's segmenter
    // over the same text and comparing the results.
    #[test]
    fn forward_additional() {
        assert_eq!(vec!["a", ".", "  ", "Y"], words(b"a.  Y"));
        assert_eq!(vec!["r", ".", "  ", "Yo"], words(b"r.  Yo"));
        assert_eq!(
            vec!["whatsoever", ".", "  ", "You", " ", "may"],
            words(b"whatsoever.  You may")
        );
        assert_eq!(
            vec!["21stcentury'syesterday"],
            words(b"21stcentury'syesterday")
        );

        assert_eq!(vec!["Bonta_", "'", "s"], words(b"Bonta_'s"));
        assert_eq!(vec!["_vhat's"], words(b"_vhat's"));
        assert_eq!(vec!["__on'anima"], words(b"__on'anima"));
        assert_eq!(vec!["123_", "'", "4"], words(b"123_'4"));
        assert_eq!(vec!["_123'4"], words(b"_123'4"));
        assert_eq!(vec!["__12'345"], words(b"__12'345"));

        assert_eq!(
            vec!["tomorrowat4", ":", "00", ","],
            words(b"tomorrowat4:00,")
        );
        assert_eq!(vec!["RS1", "'", "s"], words(b"RS1's"));
        assert_eq!(vec!["X38"], words(b"X38"));

        assert_eq!(vec!["4abc", ":", "00", ","], words(b"4abc:00,"));
        assert_eq!(vec!["12S", "'", "1"], words(b"12S'1"));
        assert_eq!(vec!["1XY"], words(b"1XY"));

        assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes()));

        // Vithkuqi was introduced in Unicode 14, so this assertion fails
        // with word break data older than that.
        assert_eq!(
            vec!["\u{10570}\u{10597}"],
            words("\u{10570}\u{10597}".as_bytes())
        );
    }
}
bstr/unicode/word.rs

bstr/unicode/
word.rs