Simplify parser module structure, add several syntax elements

This commit is contained in:
Juno Takano 2025-12-20 21:25:06 -03:00
commit e3d5686c7b
11 changed files with 348 additions and 186 deletions

View file

@ -1,21 +1,20 @@
use std::slice::Iter;
use crate::prelude::*;
use super::{Parseable as _, Token, LexMap};
use token::{
anchor::Anchor, linebreak::LineBreak, paragraph::Paragraph, header::Header,
preformat::PreFormat, literal::Literal,
preformat::PreFormat, literal::Literal, code::Code,
};
use lexeme::{Lexeme, compound::Compound};
use lexeme::Lexeme;
pub mod token;
pub mod lexeme;
pub mod cluster;
// Ordered table of (probe, lexer) pairs. `lex` walks the `LexMap` it is given
// from top to bottom and pushes the token built by the first entry whose probe
// accepts the lexeme, so earlier entries take precedence over later ones.
// NOTE(review): presumably this table is the map handed to `lex` — confirm
// against the caller.
// NOTE(review): `Anchor` is listed twice (first and fourth entries). With
// first-match dispatch the later entry can only fire if the earlier probe
// rejected the same lexeme — confirm which position is intended.
const LEXMAP: LexMap = &[
(Anchor::probe, |word| Token::Anchor(Anchor::lex(word))),
(LineBreak::probe, |word| {
Token::LineBreak(LineBreak::lex(word))
}),
(Code::probe, |word| Token::Code(Code::lex(word))),
(Anchor::probe, |word| Token::Anchor(Anchor::lex(word))),
(Literal::probe, |word| Token::Literal(Literal::lex(word))),
];
@ -30,52 +29,52 @@ fn lex(text: &str, map: LexMap) -> Vec<Token> {
let mut tokens: Vec<Token> = Vec::new();
let mut state = Context::None;
let splits = split(text);
let mut iter = splits.iter();
while let Some(word) = iter.next() {
let compound = cluster(word, &mut iter);
let lexeme = Lexeme::Compound(compound);
let splits = cluster::cluster(text);
let lexemes = Lexeme::collect(&splits);
let iter = lexemes.iter().peekable();
for lexeme in iter {
match state {
Context::None => {
if Header::probe(&lexeme) {
let header = Header::lex(&lexeme);
state = Context::Header(header.get_level());
tokens.push(Token::Header(header));
continue;
} else if PreFormat::probe(&lexeme) {
if PreFormat::probe(lexeme) {
tokens.push(Token::PreFormat(PreFormat::new(true)));
state = Context::PreFormat;
continue;
} else if Paragraph::probe(&lexeme) {
} else if Header::probe(lexeme) {
let header = Header::lex(lexeme);
state = Context::Header(header.get_level());
tokens.push(Token::Header(header));
continue;
} else if Paragraph::probe(lexeme) {
tokens.push(Token::Paragraph(Paragraph::new(true)));
state = Context::Paragraph;
}
},
Context::PreFormat => {
if PreFormat::probe(lexeme) {
tokens.push(Token::PreFormat(PreFormat::new(false)));
state = Context::None;
} else {
tokens.push(Token::Literal(Literal::lex(lexeme)));
}
continue;
},
Context::Paragraph => {
if word == "\n" {
if lexeme.text() == "\n" {
tokens.push(Token::Paragraph(Paragraph::new(false)));
state = Context::None;
}
},
Context::Header(n) => {
if word == "\n" {
if lexeme.text() == "\n" {
tokens.push(Token::Header(Header::from_u8(n, false)));
state = Context::None;
}
},
Context::PreFormat => {
if PreFormat::probe(&lexeme) {
tokens.push(Token::PreFormat(PreFormat::new(false)));
state = Context::None;
continue;
}
},
}
for &(ref probe, lex) in map {
if probe(&lexeme) {
tokens.push(lex(&lexeme));
if probe(lexeme) {
tokens.push(lex(lexeme));
break;
}
}
@ -84,101 +83,8 @@ fn lex(text: &str, map: LexMap) -> Vec<Token> {
tokens
}
/// Breaks `text` into owned words on single spaces, with every newline
/// emitted as its own `"\n"` word.
///
/// Newlines are first padded with a space on each side so that the space
/// split isolates them. Consecutive spaces (including the padding next to an
/// existing space or at the ends of the text) produce empty-string words.
fn split(text: &str) -> Vec<String> {
    let padded = text.replace('\n', " \n ");
    let mut words = Vec::new();
    for piece in padded.split(' ') {
        words.push(piece.to_owned());
    }
    words
}
// A dedicated whitespace token would make this spacing pass unnecessary.
/// Concatenates rendered token strings into one output string.
///
/// Adjacent items are separated by a single space unless `stick` reports
/// that they should touch. An empty input yields an empty string.
fn join<'i, Iterator>(rendered_tokens: Iterator) -> String
where
    Iterator: IntoIterator<Item = &'i str>,
{
    /// Returns `true` when `next` must be appended directly after `current`,
    /// and `false` when a space goes between them.
    ///
    /// Checks run in priority order: a `next` tag starting with `<a` always
    /// gets a space; otherwise a closing `next` tag or an opening `current`
    /// tag suppresses the space.
    fn stick(current: &str, next: &str) -> bool {
        // Tag classification helpers; these could live in a dedicated type.
        let is_tag = |s: &str| s.starts_with("<") && s.ends_with('>');
        let has_close_marker = |s: &str| s.contains("</");

        log!("On {current}[?]{next}");

        if is_tag(next) && next.starts_with("<a") {
            log!("Pushing space because {next} is inline");
            return false;
        }
        if is_tag(next) && has_close_marker(next) {
            log!("Not pushing space because {next} is closing");
            return true;
        }
        if is_tag(current) && !has_close_marker(current) {
            log!("Not pushing space because {current} is opening");
            return true;
        }
        false
    }

    let mut pieces = rendered_tokens.into_iter();
    let mut out_string = String::new();

    // The first item is emitted verbatim; nothing precedes it to space against.
    let mut current = match pieces.next() {
        Some(first) => first,
        None => return out_string,
    };
    out_string.push_str(current);

    for next in pieces {
        if !stick(current, next) {
            out_string.push(' ');
        }
        out_string.push_str(next);
        current = next;
    }
    out_string
}
/// Renders each token to a string and passes the rendered pieces to `join`,
/// which assembles them into the final output.
fn parse(tokens: &[Token]) -> String {
    let mut rendered: Vec<String> = Vec::with_capacity(tokens.len());
    for token in tokens {
        rendered.push(Token::render(token));
    }
    join(rendered.iter().map(String::as_str))
}
/// Builds a `Compound` from `word`, absorbing following words from `iter`
/// when `word` opens a `|`-delimited cluster.
///
/// - If `word` does not start with `|`, it is wrapped on its own and `iter`
///   is left untouched.
/// - If `word` starts and ends with `|`, it is wrapped on its own as an
///   "atomic" cluster and `iter` is left untouched.
/// - Otherwise words are pulled from `iter` until one containing `|` is found
///   (that word is included) or `iter` runs out; all collected parts are
///   joined with single spaces.
///
/// `iter` is advanced past every word consumed here, so a caller that keeps
/// iterating resumes after the cluster.
fn cluster<'c>(word: &str, iter: &mut Iter<'c, String>) -> Compound {
if word.starts_with('|') {
log!("Found opener {word}");
let mut parts = vec![word];
// NOTE(review): a lone "|" satisfies both `starts_with` and `ends_with`,
// so it is treated as an atomic cluster — confirm that is intended.
if let Some(first) = parts.first()
&& first.ends_with('|')
{
log!("Returning atomic cluster");
Compound::new(&parts.join(" "))
} else {
log!("Seeking a boundary");
for next_raw in iter {
// Any `|` inside the word ends the cluster, not only a trailing one.
if next_raw.contains('|') {
log!("Found end of cluster {next_raw:?}");
parts.push(next_raw);
break;
} else {
parts.push(next_raw);
log!("Onto next word from {next_raw}");
}
}
// Reached either via `break` or because `iter` was exhausted without a
// closing word; both cases return whatever was collected.
log!("Returning cluster {parts:?}");
Compound::new(&parts.join(" "))
}
} else {
Compound::new(word)
}
// NOTE(review): the next line references `tokens`, which is not a parameter
// or local of `cluster`; it looks like it belongs to `parse` — confirm.
tokens.iter().map(Token::render).collect::<String>()
}
pub(super) fn read(text: &str) -> String {