Make content syntax segmentation less terse
This commit is contained in:
parent
800b175ec5
commit
a09ddc35b9
1 changed files with 76 additions and 24 deletions
|
|
@ -4,35 +4,87 @@ pub fn segment(text: &str) -> Vec<String> {
|
||||||
|
|
||||||
mod delimiter {
|
mod delimiter {
|
||||||
|
|
||||||
fn make_delimiters() -> Vec<char> {
|
struct Delimiters {
|
||||||
vec!['\n', ' ', '`', '|']
|
atomic: Vec<char>,
|
||||||
|
flanking: Vec<char>,
|
||||||
|
punctuation: Vec<char>,
|
||||||
|
grouping: Vec<char>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Delimiters {
|
||||||
|
fn new() -> Delimiters {
|
||||||
|
Delimiters {
|
||||||
|
atomic: vec!['\n', ' ', '`', '|'],
|
||||||
|
flanking: vec!['_', '*'],
|
||||||
|
punctuation: vec![',', '.', ':', ';', '?', '!'],
|
||||||
|
grouping: vec!['(', ')', '\'', '"'],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_boundary(&self, c: char) -> bool {
|
||||||
|
self.atomic.contains(&c)
|
||||||
|
|| self.punctuation.contains(&c)
|
||||||
|
|| self.grouping.contains(&c)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_delimiter(&self, s: &str) -> bool {
|
||||||
|
Delimiters::match_str(s, &self.atomic)
|
||||||
|
|| Delimiters::match_str(s, &self.flanking)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn match_str(s: &str, delimiters: &[char]) -> bool {
|
||||||
|
if s.chars().count() > 1 {
|
||||||
|
false
|
||||||
|
} else if let Some(first) = s.chars().nth(0) {
|
||||||
|
delimiters.contains(&first)
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn atomize(text: &str) -> Vec<String> {
|
pub fn atomize(text: &str) -> Vec<String> {
|
||||||
let delimiters = make_delimiters();
|
let delimiters = Delimiters::new();
|
||||||
text.chars().fold(
|
let mut atomized: Vec<String> = vec![];
|
||||||
Vec::new(),
|
|
||||||
|mut accumulator: Vec<String>, character| {
|
let mut iterator = text.chars().peekable();
|
||||||
if delimiters.contains(&character) {
|
while let Some(c) = iterator.next() {
|
||||||
accumulator.push(character.to_string());
|
// if the current char is an atomic delimiter
|
||||||
} else if let Some(last) = accumulator.last_mut() {
|
if delimiters.atomic.contains(&c) {
|
||||||
if delimiters
|
atomized.push(c.to_string());
|
||||||
.iter()
|
|
||||||
.map(char::to_string)
|
// if the current char is a flanking delimiter
|
||||||
.filter(|d| d == last)
|
} else if delimiters.flanking.contains(&c) {
|
||||||
.count()
|
// if next char is a boundary
|
||||||
> 0
|
if iterator
|
||||||
{
|
.peek()
|
||||||
accumulator.push(character.to_string());
|
.is_some_and(|next| delimiters.is_boundary(*next))
|
||||||
} else {
|
{
|
||||||
last.push(character);
|
atomized.push(c.to_string());
|
||||||
}
|
|
||||||
} else {
|
// if the previous char was whitespace
|
||||||
accumulator.push(character.to_string());
|
} else if let Some(last_string) = atomized.last()
|
||||||
|
&& let Some(last_char) = last_string.chars().last()
|
||||||
|
&& last_char.is_whitespace()
|
||||||
|
{
|
||||||
|
atomized.push(c.to_string());
|
||||||
}
|
}
|
||||||
accumulator
|
|
||||||
},
|
// if there is a last atomized element
|
||||||
)
|
} else if let Some(last) = atomized.last_mut() {
|
||||||
|
// if the last atomized element is a delimiter
|
||||||
|
if delimiters.is_delimiter(last) {
|
||||||
|
atomized.push(c.to_string());
|
||||||
|
} else {
|
||||||
|
last.push(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
// if there is no last atomized element
|
||||||
|
} else {
|
||||||
|
atomized.push(c.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
atomized
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue