/// Splits `text` into atoms using the default [`delimiter::Delimiters`].
///
/// Thin public entry point over [`delimiter::atomize`].
pub fn segment(text: &str) -> Vec<String> {
    delimiter::atomize(text)
}

pub mod delimiter {
    /// Character classes that drive [`atomize`].
    #[derive(Debug, Clone, PartialEq, Eq)]
    pub struct Delimiters {
        /// Characters that always form an atom of their own.
        pub atomic: Vec<char>,
        /// Characters that form an atom of their own only when they sit at
        /// the edge of a word (see [`atomize`] for the exact rules).
        pub flanking: Vec<char>,
        /// Punctuation characters; each one always forms its own atom.
        pub punctuation: Vec<char>,
        /// Whitespace characters; each one always forms its own atom.
        pub whitespace: Vec<char>,
        /// Characters that are split off as two atoms when they appear
        /// twice in a row.
        pub double: Vec<char>,
    }

    impl Default for Delimiters {
        fn default() -> Delimiters {
            Delimiters {
                atomic: vec!['`', '|', '\\'],
                double: vec!['_', '~'],
                flanking: vec!['_', '*', '~', '(', ')', '[', ']', '\'', '"'],
                punctuation: vec![',', '.', ';', ':', '?', '!'],
                whitespace: vec!['\n', ' '],
            }
        }
    }

    impl Delimiters {
        /// Returns `true` when `c` is an atomic, punctuation or whitespace
        /// character.
        pub fn is_boundary(&self, c: char) -> bool {
            // Check each class in place; no need to clone and concatenate
            // the three vectors for every character.
            self.atomic.contains(&c)
                || self.punctuation.contains(&c)
                || self.whitespace.contains(&c)
        }

        /// Returns `true` when `c` is a boundary or a flanking character.
        pub fn is_delimiter(&self, c: char) -> bool {
            self.is_boundary(c) || self.flanking.contains(&c)
        }

        /// Returns `true` when `s` consists of exactly one character and
        /// that character is a delimiter.
        fn is_str_delimiter(&self, s: &str) -> bool {
            let mut chars = s.chars();
            match (chars.next(), chars.next()) {
                (Some(only), None) => self.is_delimiter(only),
                _ => false,
            }
        }
    }

    /// Splits `text` into a sequence of atoms.
    ///
    /// Every character of the input ends up in exactly one atom, in order,
    /// so concatenating the result reproduces `text`. The rules, applied
    /// per character with [`Delimiters::default`]:
    ///
    /// 1. A boundary character (atomic, punctuation, whitespace) is always
    ///    its own atom.
    /// 2. A `double` character immediately followed by the same character
    ///    yields two single-character atoms.
    /// 3. A flanking character is its own atom when the next character is a
    ///    boundary (or the input ends), or when the previous atom ends in
    ///    whitespace.
    /// 4. Anything else extends the previous atom, unless that atom is a
    ///    single delimiter character (or there is no previous atom), in
    ///    which case it starts a new atom.
    pub fn atomize(text: &str) -> Vec<String> {
        let delimiters = Delimiters::default();
        let mut atomized: Vec<String> = Vec::new();
        let mut chars = text.chars().peekable();

        // `while let` rather than `for`: the loop body peeks at and
        // sometimes consumes the following character.
        while let Some(c) = chars.next() {
            // Rule 1: boundaries always stand alone.
            if delimiters.is_boundary(c) {
                atomized.push(c.to_string());
                continue;
            }

            // Rule 2: a doubled delimiter becomes two separate atoms.
            if delimiters.double.contains(&c) && chars.peek() == Some(&c) {
                chars.next();
                atomized.push(c.to_string());
                atomized.push(c.to_string());
                continue;
            }

            // Rule 3: a flanking delimiter at the edge of a word.
            if delimiters.flanking.contains(&c) {
                let before_boundary = match chars.peek() {
                    Some(&next) => delimiters.is_boundary(next),
                    None => true,
                };
                let after_whitespace = atomized
                    .last()
                    .and_then(|atom| atom.chars().last())
                    .is_some_and(|prev| delimiters.whitespace.contains(&prev));

                if before_boundary || after_whitespace {
                    atomized.push(c.to_string());
                    continue;
                }
            }

            // Rule 4: grow the current word, or start a new one when the
            // previous atom is a lone delimiter or nothing was emitted yet.
            let extends_word = atomized
                .last()
                .is_some_and(|last| !delimiters.is_str_delimiter(last));
            if extends_word {
                if let Some(last) = atomized.last_mut() {
                    last.push(c);
                }
            } else {
                atomized.push(c.to_string());
            }
        }

        atomized
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn atomize_nonflanking_underscore() {
            assert_eq!(atomize("false_dichotomy"), vec!["false_dichotomy"]);
        }

        #[test]
        fn atomize_left_flanking_underscore() {
            assert_eq!(
                atomize("_false_dichotomy"),
                vec!["_", "false_dichotomy"]
            );
        }

        #[test]
        fn atomize_right_flanking_underscore() {
            assert_eq!(
                atomize("false_dichotomy_"),
                vec!["false_dichotomy", "_"]
            );
        }

        #[test]
        fn atomize_dual_flanking_underscore() {
            assert_eq!(
                atomize("_false_dichotomy_"),
                vec!["_", "false_dichotomy", "_"]
            );
        }

        #[test]
        fn atomize_flanking_sentence() {
            assert_eq!(
                atomize(
                    "about_colors: the colors _amber_, _orange_ and _yellow mustard_ to `jane_bishop@mail.com`."
                ),
                vec![
                    "about_colors", ":", " ", "the", " ", "colors", " ", "_", "amber", "_", ",",
                    " ", "_", "orange", "_", " ", "and", " ", "_", "yellow", " ", "mustard", "_",
                    " ", "to", " ", "`", "jane_bishop@mail", ".", "com", "`", "."
                ],
            );
        }

        #[test]
        fn atomize_words() {
            // Runs of spaces are significant: each space is its own atom.
            let actual = atomize(
                "    justification for  the actions   of those  who hold authority   inevitably dwindles  ",
            );
            let expected = vec![
                " ", " ", " ", " ", "justification", " ", "for", " ", " ", "the", " ", "actions",
                " ", " ", " ", "of", " ", "those", " ", " ", "who", " ", "hold", " ", "authority",
                " ", " ", " ", "inevitably", " ", "dwindles", " ", " ",
            ];
            assert_eq!(actual, expected);
        }

        #[test]
        fn atomize_ticks_no_spaces() {
            let s = "a`c`adc`dadcdbd`cdb`dcdb`dc`dad`bdc";
            let actual = atomize(s);
            let expected: Vec<String> = vec![
                "a", "`", "c", "`", "adc", "`", "dadcdbd", "`", "cdb", "`", "dcdb", "`", "dc",
                "`", "dad", "`", "bdc",
            ]
            .into_iter()
            .map(String::from)
            .collect();
            assert_eq!(actual, expected);
        }

        #[test]
        fn atomize_ticks_with_spaces() {
            let s = "a`c`adc`da dcdb d` cdb` dcdb `dc ` d ad ` bdc";
            let actual = atomize(s);
            let expected: Vec<String> = vec![
                "a", "`", "c", "`", "adc", "`", "da", " ", "dcdb", " ", "d", "`", " ", "cdb", "`",
                " ", "dcdb", " ", "`", "dc", " ", "`", " ", "d", " ", "ad", " ", "`", " ", "bdc",
            ]
            .into_iter()
            .map(String::from)
            .collect();
            assert_eq!(actual, expected);
        }

        #[test]
        fn atomize_pipes() {
            let actual = atomize("every other |time| as it was perceived");
            let expected = vec![
                "every", " ", "other", " ", "|", "time", "|", " ", "as", " ", "it", " ", "was",
                " ", "perceived",
            ];
            assert_eq!(actual, expected);
        }

        #[test]
        fn atomize_pipes_and_ticks() {
            let actual = atomize(
                "every other |time| as `it could or |perhaps somehow|then or now| it was` perceived",
            );
            let expected = vec![
                "every", " ", "other", " ", "|", "time", "|", " ", "as", " ", "`", "it", " ",
                "could", " ", "or", " ", "|", "perhaps", " ", "somehow", "|", "then", " ", "or",
                " ", "now", "|", " ", "it", " ", "was", "`", " ", "perceived",
            ];
            assert_eq!(actual, expected);
        }

        #[test]
        fn atomize_newlines() {
            let actual = atomize("a`c`adc`da \ndcdb d` cdb` dc\ndb `dc ` d ad ` bdc");
            let expected: Vec<String> = vec![
                "a", "`", "c", "`", "adc", "`", "da", " ", "\n", "dcdb", " ", "d", "`", " ", "cdb",
                "`", " ", "dc", "\n", "db", " ", "`", "dc", " ", "`", " ", "d", " ", "ad", " ",
                "`", " ", "bdc",
            ]
            .into_iter()
            .map(String::from)
            .collect();
            assert_eq!(actual, expected);
        }
    }
}