en/src/syntax/content/parser/lexeme.rs
2026-01-15 12:07:18 -03:00

429 lines
12 KiB
Rust

use std::fmt;
use crate::{syntax::content::parser::segment::delimiter::Delimiters};
/// A unit of parser input together with a two-item look-ahead.
///
/// Each lexeme owns its own text plus copies of the text of the two
/// lexemes that follow it, and records whether it sits at the start
/// and/or the end of the sequence it was collected from.
#[derive(Debug, Default, Clone)]
pub struct Lexeme {
    /// Text of this lexeme.
    text: String,
    /// Text of the lexeme that follows this one; empty when there is none.
    next: String,
    /// Text of the lexeme two positions ahead; empty when there is none.
    third: String,
    /// Whether this lexeme is the first of its sequence.
    first: bool,
    /// Whether this lexeme is the last of its sequence.
    last: bool,
}
impl Lexeme {
    /// Builds a lexeme from its own text and the text of the two lexemes
    /// that follow it.
    ///
    /// The `first` and `last` flags are left `false`; use
    /// [`Lexeme::collect`] to obtain lexemes with positional flags set.
    pub fn new(raw: &str, next: &str, third: &str) -> Lexeme {
        Lexeme {
            text: raw.to_owned(),
            next: next.to_owned(),
            third: third.to_owned(),
            first: false,
            last: false,
        }
    }

    /// Returns `Some(c)` when `s` consists of exactly one character `c`.
    fn single_char(s: &str) -> Option<char> {
        let mut chars = s.chars();
        // Looking at no more than two chars is enough to tell "exactly one"
        // apart from "none" and "several" without scanning the whole string.
        match (chars.next(), chars.next()) {
            (Some(c), None) => Some(c),
            _ => None,
        }
    }

    /// Returns an owned copy of this lexeme's text.
    pub fn text(&self) -> String {
        self.text.clone()
    }

    /// Returns an owned copy of the following lexeme's text.
    pub fn next(&self) -> String {
        self.next.clone()
    }

    /// Returns whether this lexeme is flagged as the last of its sequence.
    pub fn last(&self) -> bool {
        self.last
    }

    /// Returns whether this lexeme is flagged as the first of its sequence.
    pub fn first(&self) -> bool {
        self.first
    }

    /// Replaces this lexeme's text with `new`; look-ahead and flags are untouched.
    pub fn mutate_text(&mut self, new: &str) {
        self.text = new.to_string();
    }

    /// Returns the text as a `char` when it is exactly one character long.
    pub fn as_char(&self) -> Option<char> {
        Self::single_char(&self.text)
    }

    /// Returns the next lexeme's text as a `char` when it is exactly one character long.
    pub fn next_as_char(&self) -> Option<char> {
        Self::single_char(&self.next)
    }

    /// Returns the third lexeme's text as a `char` when it is exactly one character long.
    pub fn third_as_char(&self) -> Option<char> {
        Self::single_char(&self.third)
    }

    /// Returns `true` when this lexeme is exactly the single character `c`.
    pub fn match_char(&self, c: char) -> bool {
        self.as_char() == Some(c)
    }

    /// Returns `true` when the next lexeme is exactly the single character `c`.
    pub fn match_next_char(&self, c: char) -> bool {
        self.next_as_char() == Some(c)
    }

    /// Returns `true` when the third lexeme is exactly the single character `c`.
    pub fn match_third_char(&self, c: char) -> bool {
        self.third_as_char() == Some(c)
    }

    /// Returns `true` when this lexeme is the single character `c1` or `c2`.
    pub fn match_either_char(&self, c1: char, c2: char) -> bool {
        self.as_char().is_some_and(|c| c == c1 || c == c2)
    }

    /// Returns `true` when the next lexeme is the single character `c1` or `c2`.
    pub fn match_next_either_char(&self, c1: char, c2: char) -> bool {
        self.next_as_char().is_some_and(|c| c == c1 || c == c2)
    }

    /// Returns `true` when this lexeme is `c1` and the next one is `c2`.
    pub fn match_char_sequence(&self, c1: char, c2: char) -> bool {
        self.match_char(c1) && self.match_next_char(c2)
    }

    /// Returns `true` when this, the next and the third lexeme are `c1`,
    /// `c2` and `c3` respectively.
    pub fn match_char_triple(&self, c1: char, c2: char, c3: char) -> bool {
        self.match_char(c1) && self.match_next_char(c2) && self.match_third_char(c3)
    }

    /// Returns `true` when this lexeme is a single character contained in `slice`.
    pub fn match_char_in(&self, slice: &[char]) -> bool {
        self.as_char().is_some_and(|c| slice.contains(&c))
    }

    /// Returns `true` when the next lexeme is a single character contained in `slice`.
    pub fn match_next_char_in(&self, slice: &[char]) -> bool {
        self.next_as_char().is_some_and(|c| slice.contains(&c))
    }

    /// Returns `true` when this lexeme is a single character listed in the
    /// default delimiters' `punctuation` set.
    pub fn is_punctuation(&self) -> bool {
        self.match_char_in(&Delimiters::default().punctuation)
    }

    /// Returns `true` when this lexeme is a single character listed in the
    /// default delimiters' `whitespace` set.
    pub fn is_whitespace(&self) -> bool {
        self.match_char_in(&Delimiters::default().whitespace)
    }

    /// Returns `true` when the next lexeme is a single character listed in
    /// the default delimiters' `whitespace` set.
    pub fn is_next_whitespace(&self) -> bool {
        self.match_next_char_in(&Delimiters::default().whitespace)
    }

    /// Returns `true` when the next lexeme is a single character listed in
    /// the default delimiters' `punctuation` set.
    pub fn is_next_punctuation(&self) -> bool {
        self.match_next_char_in(&Delimiters::default().punctuation)
    }

    /// Returns `true` when this is the last lexeme, or when the next lexeme
    /// is a single character that the default delimiters report as a boundary.
    pub fn is_next_boundary(&self) -> bool {
        let delimiters = Delimiters::default();
        self.last
            || self
                .next_as_char()
                .is_some_and(|c| delimiters.is_boundary(c))
    }

    /// Returns `true` when this lexeme is a single character that the
    /// default delimiters report as a delimiter.
    pub fn is_delimiter(&self) -> bool {
        let delimiters = Delimiters::default();
        self.as_char().is_some_and(|c| delimiters.is_delimiter(c))
    }

    /// Returns `true` when this is the last lexeme, or when the next lexeme
    /// is a single character that the default delimiters report as a delimiter.
    pub fn is_next_delimiter(&self) -> bool {
        let delimiters = Delimiters::default();
        self.last
            || self
                .next_as_char()
                .is_some_and(|c| delimiters.is_delimiter(c))
    }

    /// Returns the first character of the next lexeme, whatever its length.
    pub fn next_first_char(&self) -> Option<char> {
        self.next.chars().next()
    }

    /// Returns `true` when this lexeme's text starts with `query`.
    pub fn match_first_char(&self, query: char) -> bool {
        self.text.starts_with(query)
    }

    /// Returns `true` when this lexeme's text ends with `query`.
    pub fn match_last_char(&self, query: char) -> bool {
        self.text.ends_with(query)
    }

    /// Returns `true` when the next lexeme's text starts with `query`.
    pub fn match_next_first_char(&self, query: char) -> bool {
        self.next.starts_with(query)
    }

    /// Counts how many times `c` occurs in this lexeme's text.
    ///
    /// # Panics
    /// Panics if number of chars for a single lexeme exceeds `i32::MAX`
    pub fn count_char(&self, c: char) -> i32 {
        let count = self.text.chars().filter(|&n| n == c).count();
        match i32::try_from(count) {
            Ok(i) => i,
            Err(e) => {
                panic!("Wild char number {count} is a bit much: {e:#?}");
            },
        }
    }

    /// Returns the characters of this lexeme's text, in order.
    pub fn split_chars(&self) -> Vec<char> {
        self.text.chars().collect()
    }

    /// Consumes the lexeme and splits its text on single spaces.
    ///
    /// The result always holds at least one element; an empty text yields
    /// one empty string.
    pub fn split_segments(self) -> Vec<String> {
        self.text.split(' ').map(str::to_string).collect()
    }

    /// Consumes the lexeme and returns the part of its text that precedes
    /// the first space (the whole text when it contains no space).
    pub fn first_segment(self) -> Option<String> {
        // `split` always yields at least one item, so only that item is built.
        self.text.split(' ').next().map(str::to_string)
    }

    /// Turns a slice of segments into lexemes, one per segment, in order.
    ///
    /// Every lexeme receives the text of the next two segments as its
    /// look-ahead (empty strings past the end of the input). The lexeme at
    /// index 0 is flagged `first` and the final one is flagged `last`; a
    /// single-segment input yields one lexeme carrying both flags. An empty
    /// input yields an empty vector.
    pub fn collect(segments_slice: &[String]) -> Vec<Lexeme> {
        let count = segments_slice.len();
        // A missing look-ahead segment falls back to the empty string.
        let text_at = |index: usize| segments_slice.get(index).cloned().unwrap_or_default();
        segments_slice
            .iter()
            .enumerate()
            .map(|(index, text)| Lexeme {
                text: text.clone(),
                next: text_at(index + 1),
                third: text_at(index + 2),
                // Derived from the index so that a two-segment input also
                // gets its opening lexeme flagged.
                first: index == 0,
                last: index + 1 == count,
            })
            .collect()
    }
}
impl fmt::Display for Lexeme {
    /// Writes `Lx `, an optional position tag, the wrapped text and the
    /// look-ahead.
    ///
    /// The tag is `[S] ` when the lexeme is both first and last, `[L] ` when
    /// only last, `[F] ` when only first, and absent otherwise. The
    /// look-ahead is ` <EOI>` for the last lexeme, the wrapped next text
    /// followed by ` <EOI>` when there is no third text, and both wrapped
    /// texts joined by ` -> ` otherwise.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use crate::log::wrap;

        let properties = match (self.first, self.last) {
            (true, true) => "[S] ",
            (false, true) => "[L] ",
            (true, false) => "[F] ",
            (false, false) => "",
        };

        let next_display: String = if self.last {
            String::from(" <EOI>")
        } else if self.third.is_empty() {
            format!(" -> {} <EOI>", wrap(&self.next))
        } else {
            format!(" -> {} -> {}", wrap(&self.next), wrap(&self.third))
        };

        write!(f, "Lx {}{}{}", properties, wrap(&self.text), next_display)
    }
}
/// Unit tests for `Lexeme`: construction, look-ahead matching, delimiter
/// classification, `collect` and the `Display` rendering.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn new_lexeme() {
        let raw = "3PKK4RzfGgUL58rU2NZbAiGN1o5dOfNu";
        let next = "wAcZe8iVEEcZLp20PP9KKf07zJbeZafa";
        let third = "K0QTlujGjL2qxBzs16g8oyiCYSuQaRVE";
        let lexeme = Lexeme::new(raw, next, third);
        assert_eq!(lexeme.text, raw);
        assert_eq!(lexeme.next, next);
        assert_eq!(lexeme.third, third);
    }

    #[test]
    fn next_first_char() {
        let payload = "4IU";
        let lexeme = Lexeme::new("", payload, "");
        assert_eq!(lexeme.next_first_char(), Some('4'));
    }

    #[test]
    fn match_first_char() {
        let payload = "MKY";
        let lexeme = Lexeme::new(payload, "", "");
        assert!(lexeme.match_first_char('M'));
    }

    #[test]
    fn match_absent_first_char() {
        let lexeme = Lexeme::new("", "", "");
        assert!(!lexeme.match_first_char('x'));
    }

    #[test]
    fn first_segment() {
        let payload = "nhNc fGev QnGW E4hj ExyZ";
        let lexeme = Lexeme::new(payload, "", "");
        // `first_segment` consumes the lexeme, which is not needed afterwards.
        assert_eq!(lexeme.first_segment(), Some(String::from("nhNc")));
    }

    #[test]
    fn first_lexeme() {
        let input = ["h015r", "cvYde", "aw1Ui", "ASwew"].map(str::to_string);
        let lexemes = Lexeme::collect(&input);
        let first = lexemes.first().unwrap();
        assert!(first.first());
        assert_eq!(first.text(), "h015r");
    }

    #[test]
    fn collect_empty_input() {
        let input: [String; 0] = [];
        assert!(Lexeme::collect(&input).is_empty());
    }

    #[test]
    fn count_char() {
        let payload = "6Ur3UjnndhENjFNSYWF7bhej2NZKLwdY";
        let lexeme = Lexeme::new(payload, "", "");
        assert_eq!(lexeme.count_char('j'), 3);
    }

    #[test]
    fn mutate_text() {
        let mut lexeme = Lexeme::new("b71Je", "I6y3i", "LC8na");
        lexeme.mutate_text("qkjjK2");
        assert_eq!(lexeme.text(), "qkjjK2");
    }

    #[test]
    fn third_as_char() {
        let lexeme_a = Lexeme::new("1", "2", "3");
        assert_eq!(lexeme_a.third_as_char(), Some('3'));
        let lexeme_c = Lexeme::new("a", "b", "");
        assert!(lexeme_c.third_as_char().is_none());
    }

    #[test]
    fn match_third_char() {
        let lexeme = Lexeme::new("1", "2", "3");
        assert!(lexeme.match_third_char('3'));
    }

    #[test]
    fn match_next_either_char() {
        let lexeme = Lexeme::new("1", "2", "3");
        assert!(lexeme.match_next_either_char('x', '2'));
        assert!(lexeme.match_next_either_char('2', 'x'));
    }

    #[test]
    fn match_triple() {
        let lexeme = Lexeme::new("1", "2", "3");
        assert!(lexeme.match_char_triple('1', '2', '3'));
    }

    #[test]
    fn is_punctuation() {
        let delimiters = Delimiters::default();
        for p in delimiters.punctuation {
            let lexeme = Lexeme::new(&p.to_string(), "", "");
            assert!(lexeme.is_punctuation());
        }
    }

    #[test]
    fn is_next_punctuation() {
        let delimiters = Delimiters::default();
        for p in delimiters.punctuation {
            let lexeme = Lexeme::new("", &p.to_string(), "");
            assert!(lexeme.is_next_punctuation());
        }
    }

    #[test]
    fn match_last_char() {
        let lexeme = Lexeme::new("qYBWuNX", "", "");
        assert!(lexeme.match_last_char('X'));
    }

    // Named after the method it exercises.
    #[test]
    fn match_next_first_char() {
        let lexeme = Lexeme::new("", "teDAqVx", "");
        assert!(lexeme.match_next_first_char('t'));
    }

    #[test]
    fn display() {
        let input = ["pcdA", "o32X", "kz2i", "79Lz"].map(str::to_string);
        let lexemes = Lexeme::collect(&input);
        let first = lexemes.first().unwrap();
        let second = lexemes.get(1).unwrap();
        let third = lexemes.get(2).unwrap();
        let last = lexemes.last().unwrap();
        assert_eq!(
            format!("{first}"),
            String::from("Lx [F] pcdA -> o32X -> kz2i"),
            "first"
        );
        assert_eq!(
            format!("{second}"),
            String::from("Lx o32X -> kz2i -> 79Lz"),
            "second"
        );
        assert_eq!(
            format!("{third}"),
            String::from("Lx kz2i -> 79Lz <EOI>"),
            "third"
        );
        assert_eq!(
            format!("{last}"),
            String::from("Lx [L] 79Lz <EOI>"),
            "last"
        );

        let input_single = ["9fOC"].map(str::to_string);
        let lexemes_single = Lexeme::collect(&input_single);
        // One segment in, one lexeme out.
        assert_eq!(lexemes_single.len(), 1);
        let single = lexemes_single.first().unwrap();
        assert_eq!(format!("{single}"), "Lx [S] 9fOC <EOI>");
    }
}