From c53afefb676a058faa98d5a6858ef8c34becb1b5 Mon Sep 17 00:00:00 2001 From: jutty Date: Fri, 9 Jan 2026 19:39:44 -0300 Subject: [PATCH] Add lexeme 'first' field, refactor and add methods --- src/syntax/content/parser.rs | 2 +- src/syntax/content/parser/context/anchor.rs | 12 +- src/syntax/content/parser/context/inline.rs | 2 +- src/syntax/content/parser/lexeme.rs | 116 +++++++++++++------ src/syntax/content/parser/token/checkbox.rs | 6 +- src/syntax/content/parser/token/header.rs | 4 +- src/syntax/content/parser/token/paragraph.rs | 2 +- src/syntax/content/parser/token/strike.rs | 2 +- src/syntax/content/parser/token/underline.rs | 2 +- 9 files changed, 98 insertions(+), 50 deletions(-) diff --git a/src/syntax/content/parser.rs b/src/syntax/content/parser.rs index 0804602..491b292 100644 --- a/src/syntax/content/parser.rs +++ b/src/syntax/content/parser.rs @@ -31,7 +31,7 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec { let mut iterator = lexemes.iter().peekable(); while let Some(lexeme) = iterator.next() { - if lexeme.match_as_char('\\') { + if lexeme.match_char('\\') { if let Some(next) = iterator.next() { tokens.push(Token::Literal(Literal::lex(next))); } diff --git a/src/syntax/content/parser/context/anchor.rs b/src/syntax/content/parser/context/anchor.rs index 1ce26ff..230bad0 100644 --- a/src/syntax/content/parser/context/anchor.rs +++ b/src/syntax/content/parser/context/anchor.rs @@ -53,9 +53,9 @@ pub fn parse( // Conditions in this decision tree should match the destination end // or some intermediary state necessary to finding it - if lexeme.match_as_char('s') + if lexeme.match_char('s') && lexeme.is_next_boundary() - && !lexeme.match_next_as_char('|') + && !lexeme.match_next_char('|') { log!("End: Plural anchor"); candidate.destination = Some(candidate.text.clone()); @@ -65,7 +65,7 @@ pub fn parse( state.context.inline = Inline::None; } return true; - } else if lexeme.match_as_char('|') && lexeme.is_next_delimiter() { + } else if 
lexeme.match_char('|') && lexeme.is_next_delimiter() { log!("End: Pipe followed by delimiter"); if buffer.destination.is_empty() { candidate.destination = Some(candidate.text.clone()); @@ -75,16 +75,16 @@ pub fn parse( tokens.push(Token::Anchor(candidate.clone())); state.context.inline = Inline::None; return true; - } else if lexeme.match_as_char('|') && !candidate.balanced { + } else if lexeme.match_char('|') && !candidate.balanced { log!("State: Found a pipe, but no boundary: destination follows"); candidate.balanced = true; return true; - } else if lexeme.match_as_char(':') { + } else if lexeme.match_char(':') { log!("State: Found a colon, marking anchor as external"); candidate.external = true; buffer.destination.push_str(&lexeme.text()); return true; - } else if lexeme.match_as_char('|') { + } else if lexeme.match_char('|') { log!("End: Explicit end-of-destination pipe"); candidate.destination = Some(buffer.destination.clone()); return true; diff --git a/src/syntax/content/parser/context/inline.rs b/src/syntax/content/parser/context/inline.rs index 10b8b6d..7d5633d 100644 --- a/src/syntax/content/parser/context/inline.rs +++ b/src/syntax/content/parser/context/inline.rs @@ -31,7 +31,7 @@ pub fn parse( state.context.inline = Inline::Anchor; state.buffers.anchor = AnchorBuffer::default(); - if lexeme.match_as_char('|') { + if lexeme.match_char('|') { state.buffers.anchor.candidate.leading = true; } else { state.buffers.anchor.candidate.text = lexeme.text(); diff --git a/src/syntax/content/parser/lexeme.rs b/src/syntax/content/parser/lexeme.rs index 2e5496c..648c6fb 100644 --- a/src/syntax/content/parser/lexeme.rs +++ b/src/syntax/content/parser/lexeme.rs @@ -7,6 +7,7 @@ pub struct Lexeme { text: String, next: String, third: String, + first: bool, last: bool, } @@ -16,6 +17,7 @@ impl Lexeme { text: raw.to_owned(), next: next.to_owned(), third: third.to_owned(), + first: false, last: false, } } @@ -35,6 +37,10 @@ impl Lexeme { self.last } + pub fn first(&self) -> 
bool { + self.first + } + pub fn mutate_text(&mut self, new: &str) { self.text = new.to_string(); } @@ -63,46 +69,58 @@ impl Lexeme { } } - pub fn match_as_char(&self, c: char) -> bool { + pub fn match_char(&self, c: char) -> bool { self.as_char().is_some_and(|as_char| as_char == c) } - pub fn match_next_as_char(&self, c: char) -> bool { + pub fn match_next_char(&self, c: char) -> bool { self.next_as_char().is_some_and(|next| next == c) } - pub fn match_third_as_char(&self, c: char) -> bool { + pub fn match_third_char(&self, c: char) -> bool { self.third_as_char().is_some_and(|third| third == c) } - pub fn match_triple_as_char(&self, t: (char, char, char)) -> bool { - self.match_as_char(t.0) - && self.match_next_as_char(t.1) - && self.match_third_as_char(t.2) + pub fn match_either_char(&self, c1: char, c2: char) -> bool { + self.as_char().is_some_and(|c| c == c1 || c == c2) } - pub fn contains_as_char(&self, slice: &[char]) -> bool { + pub fn match_next_either_char(&self, c1: char, c2: char) -> bool { + self.next_as_char().is_some_and(|c| c == c1 || c == c2) + } + + pub fn match_char_sequence(&self, c1: char, c2: char) -> bool { + self.match_char(c1) && self.match_next_char(c2) + } + + pub fn match_char_triple(&self, c1: char, c2: char, c3: char) -> bool { + self.match_char(c1) + && self.match_next_char(c2) + && self.match_third_char(c3) + } + + pub fn match_char_in(&self, slice: &[char]) -> bool { self.as_char().is_some_and(|c| slice.contains(&c)) } - pub fn contains_next_as_char(&self, slice: &[char]) -> bool { + pub fn match_next_char_in(&self, slice: &[char]) -> bool { self.next_as_char().is_some_and(|c| slice.contains(&c)) } pub fn is_punctuation(&self) -> bool { - self.contains_as_char(&Delimiters::default().punctuation) + self.match_char_in(&Delimiters::default().punctuation) } pub fn is_whitespace(&self) -> bool { - self.contains_as_char(&Delimiters::default().whitespace) + self.match_char_in(&Delimiters::default().whitespace) } pub fn 
is_next_whitespace(&self) -> bool { - self.contains_next_as_char(&Delimiters::default().whitespace) + self.match_next_char_in(&Delimiters::default().whitespace) } pub fn is_next_punctuation(&self) -> bool { - self.contains_next_as_char(&Delimiters::default().punctuation) + self.match_next_char_in(&Delimiters::default().punctuation) } pub fn is_next_boundary(&self) -> bool { @@ -159,54 +177,63 @@ impl Lexeme { vector } - pub fn split_words(self) -> Vec { + pub fn split_segments(self) -> Vec { self.text().split(' ').map(str::to_string).collect() } - pub fn first(self) -> Option { - self.split_words().first().map(String::to_owned) + pub fn first_segment(self) -> Option { + self.split_segments().first().map(String::to_owned) } - pub fn collect(segments: &[String]) -> Vec { - let mut out_vector = Vec::with_capacity(segments.len()); - let mut vec = segments.to_vec(); + pub fn collect(segments_slice: &[String]) -> Vec { + let mut lexemes = Vec::with_capacity(segments_slice.len()); + let mut segments = segments_slice.to_vec(); - let Some(mut third) = vec.pop() else { + let Some(last) = segments.pop() else { return vec![]; }; let last_lexeme = Lexeme { - text: third.clone(), + text: last.clone(), next: String::default(), third: String::default(), + first: segments.is_empty(), last: true, }; - let Some(mut next) = vec.pop() else { + let Some(penultimate) = segments.pop() else { return vec![last_lexeme]; }; let penultimate_lexeme = Lexeme { - text: next.clone(), - next: third.clone(), + text: penultimate.clone(), + next: last.clone(), third: String::default(), + first: segments.is_empty(), last: false, }; - for current in vec.iter().rev() { - out_vector.push(Lexeme { + let mut third = last; + let mut next = penultimate; + + let mut iterator = segments.iter().rev().peekable(); + while let Some(current) = iterator.next() { + let lexeme = Lexeme { text: current.to_owned(), next: next.clone(), third: third.clone(), + first: iterator.peek().is_none(), last: false, - }); + }; + + lexemes.push(lexeme); 
third.clone_from(&next); next.clone_from(current); } - out_vector.reverse(); - out_vector.push(penultimate_lexeme); - out_vector.push(last_lexeme); - out_vector + lexemes.reverse(); + lexemes.push(penultimate_lexeme); + lexemes.push(last_lexeme); + lexemes } } @@ -214,14 +241,24 @@ impl fmt::Display for Lexeme { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use crate::dev::wrap; - let next_display = if self.last() { + let properties = if self.first && self.last { + "[FL] " + } else if self.first { + "[F] " + } else if self.last { + "[L] " + } else { + "" + }; + + let next_display = if self.last { " " } else if self.third.is_empty() { &format!("-> {} -! EOI", wrap(&self.next)) } else { &format!("-> {} -> {}", wrap(&self.next), wrap(&self.third)) }; - write!(f, "{} {}", wrap(&self.text), next_display) + write!(f, "Lx {}{} {}", properties, wrap(&self.text), next_display) } } @@ -261,10 +298,19 @@ mod tests { } #[test] - fn first_word() { + fn first_segment() { let payload = "nhNc fGev QnGW E4hj ExyZ"; let lexeme = Lexeme::new(payload, "", ""); - assert_eq!(lexeme.first(), Some(String::from("nhNc"))); + assert_eq!(lexeme.clone().first_segment(), Some(String::from("nhNc"))); + } + + #[test] + fn first_lexeme() { + let input = ["h015r", "cvYde", "aw1Ui", "ASwew"].map(str::to_string); + let lexemes = Lexeme::collect(&input); + let first = lexemes.first().unwrap(); + assert!(first.clone().first()); + assert_eq!(first.text(), "h015r".to_string()); } #[test] diff --git a/src/syntax/content/parser/token/checkbox.rs b/src/syntax/content/parser/token/checkbox.rs index 23cb994..290432d 100644 --- a/src/syntax/content/parser/token/checkbox.rs +++ b/src/syntax/content/parser/token/checkbox.rs @@ -15,14 +15,14 @@ impl CheckBox { impl Parseable for CheckBox { fn probe(lexeme: &Lexeme) -> bool { - lexeme.match_triple_as_char(('[', ' ', ']')) - || lexeme.match_triple_as_char(('[', 'x', ']')) + lexeme.match_char_triple('[', ' ', ']') + || lexeme.match_char_triple('[', 'x', 
']') } fn lex(lexeme: &Lexeme) -> CheckBox { use crate::prelude::*; log!("Lexing: {lexeme}"); - if lexeme.match_next_as_char('x') { + if lexeme.match_next_char('x') { CheckBox::new(true) } else { CheckBox::new(false) diff --git a/src/syntax/content/parser/token/header.rs b/src/syntax/content/parser/token/header.rs index 390a22a..f4845a6 100644 --- a/src/syntax/content/parser/token/header.rs +++ b/src/syntax/content/parser/token/header.rs @@ -83,7 +83,9 @@ impl Parseable for Header { == 0 { let level = lexeme.text().len(); - lexeme.clone().split_words().len() == 1 && level > 0 && level <= 6 + lexeme.clone().split_segments().len() == 1 + && level > 0 + && level <= 6 } else { false } diff --git a/src/syntax/content/parser/token/paragraph.rs b/src/syntax/content/parser/token/paragraph.rs index 798792b..0713ca3 100644 --- a/src/syntax/content/parser/token/paragraph.rs +++ b/src/syntax/content/parser/token/paragraph.rs @@ -11,7 +11,7 @@ impl Paragraph { } pub fn probe_end(lexeme: &Lexeme) -> bool { - lexeme.match_as_char('\n') && lexeme.match_next_as_char('\n') + lexeme.match_char('\n') && lexeme.match_next_char('\n') } } diff --git a/src/syntax/content/parser/token/strike.rs b/src/syntax/content/parser/token/strike.rs index 69ce9d4..f0e6e94 100644 --- a/src/syntax/content/parser/token/strike.rs +++ b/src/syntax/content/parser/token/strike.rs @@ -15,7 +15,7 @@ impl Strike { impl Parseable for Strike { fn probe(lexeme: &Lexeme) -> bool { - lexeme.match_as_char('~') && lexeme.match_next_as_char('~') + lexeme.match_char('~') && lexeme.match_next_char('~') } fn lex(_lexeme: &Lexeme) -> Strike { diff --git a/src/syntax/content/parser/token/underline.rs b/src/syntax/content/parser/token/underline.rs index a588d9c..539d207 100644 --- a/src/syntax/content/parser/token/underline.rs +++ b/src/syntax/content/parser/token/underline.rs @@ -15,7 +15,7 @@ impl Underline { impl Parseable for Underline { fn probe(lexeme: &Lexeme) -> bool { - lexeme.match_as_char('_') && 
lexeme.match_next_as_char('_') + lexeme.match_char('_') && lexeme.match_next_char('_') } fn lex(_lexeme: &Lexeme) -> Underline {