// Reconstructed from the patch "Make content syntax segmentation less terse"
// (commit a09ddc35, src/syntax/content/parser/segment.rs): these are the items
// the patch adds inside `mod delimiter`.

/// Character classes consulted while splitting text into atoms.
struct Delimiters {
    /// Characters that always form a one-character atom of their own.
    atomic: Vec<char>,
    /// Characters that form their own atom only when they flank a word.
    flanking: Vec<char>,
    /// Punctuation that counts as a boundary after a flanking delimiter.
    punctuation: Vec<char>,
    /// Brackets and quotes that count as a boundary after a flanking delimiter.
    grouping: Vec<char>,
}

impl Delimiters {
    /// Builds the fixed delimiter tables.
    fn new() -> Self {
        Delimiters {
            atomic: vec!['\n', ' ', '`', '|'],
            flanking: vec!['_', '*'],
            punctuation: vec![',', '.', ':', ';', '?', '!'],
            grouping: vec!['(', ')', '\'', '"'],
        }
    }

    /// Returns `true` when `c` is an atomic, punctuation or grouping character.
    ///
    /// Flanking characters are deliberately not boundaries themselves.
    fn is_boundary(&self, c: char) -> bool {
        self.atomic.contains(&c)
            || self.punctuation.contains(&c)
            || self.grouping.contains(&c)
    }

    /// Returns `true` when `s` is exactly one atomic or flanking character.
    fn is_delimiter(&self, s: &str) -> bool {
        Delimiters::match_str(s, &self.atomic)
            || Delimiters::match_str(s, &self.flanking)
    }

    /// Returns `true` when `s` consists of exactly one character and that
    /// character is listed in `delimiters`. Empty and longer strings never match.
    fn match_str(s: &str, delimiters: &[char]) -> bool {
        let mut chars = s.chars();
        match (chars.next(), chars.next()) {
            (Some(only), None) => delimiters.contains(&only),
            _ => false,
        }
    }
}

/// Splits `text` into atoms: delimiter characters as one-character strings and
/// the runs of ordinary characters between them.
///
/// * Atomic delimiters (newline, space, backtick, `|`) always stand alone.
/// * Flanking delimiters (`_`, `*`) stand alone when they are followed by a
///   boundary character or the end of the input, or preceded by whitespace or
///   the start of the input. Anywhere else (e.g. the underscore in
///   `snake_case`) they are treated as ordinary characters.
/// * Every other character extends the previous atom, unless that atom is a
///   lone delimiter, in which case it starts a new atom.
///
/// No character of `text` is ever discarded: concatenating the returned atoms
/// yields `text` again.
pub fn atomize(text: &str) -> Vec<String> {
    let delimiters = Delimiters::new();
    let mut atomized: Vec<String> = Vec::new();

    // Peekable because a flanking delimiter is classified by what follows it.
    let mut iterator = text.chars().peekable();
    while let Some(c) = iterator.next() {
        let stands_alone = if delimiters.atomic.contains(&c) {
            true
        } else if delimiters.flanking.contains(&c) {
            // Closing position: next char is a boundary, or there is none.
            let closes = iterator
                .peek()
                .map_or(true, |next| delimiters.is_boundary(*next));
            // Opening position: previous char is whitespace, or there is none.
            let opens = atomized
                .last()
                .and_then(|atom| atom.chars().last())
                .map_or(true, char::is_whitespace);
            closes || opens
        } else {
            false
        };

        if stands_alone {
            atomized.push(c.to_string());
        } else if let Some(last) = atomized.last_mut() {
            // Never grow a lone delimiter atom; start a fresh atom after it.
            if delimiters.is_delimiter(last) {
                atomized.push(c.to_string());
            } else {
                last.push(c);
            }
        } else {
            atomized.push(c.to_string());
        }
    }

    atomized
}