Rework token segmentation

This commit is contained in:
Juno Takano 2025-12-23 21:40:57 -03:00
commit 8b782d6d20
16 changed files with 497 additions and 385 deletions

View file

@ -1,4 +1,4 @@
use std::collections::{HashMap, hash_map::Entry};
use std::collections::{HashMap};
use crate::{formats::populate_graph, types::Config};
@ -11,98 +11,147 @@ use lexeme::Lexeme;
pub mod token;
pub mod lexeme;
pub mod cluster;
pub mod segment;
/// Table of `(probe, constructor)` pairs: each entry couples a token type's
/// `probe` function with a closure that lexes a word via that type's `lex`
/// and wraps the result in the corresponding `Token` variant.
///
/// Entries are listed as `LineBreak`, `Code`, `Anchor`, then `Literal`.
// NOTE(review): presumably entries are tried in order and `Literal` acts as
// the fallback — the consumer of this table is not visible here; confirm.
const LEXMAP: LexMap = &[
(LineBreak::probe, |word| {
Token::LineBreak(LineBreak::lex(word))
}),
(Code::probe, |word| Token::Code(Code::lex(word))),
(Anchor::probe, |word| Token::Anchor(Anchor::lex(word))),
(Literal::probe, |word| Token::Literal(Literal::lex(word))),
];
/// Block the lexer is currently inside while walking the lexemes in `lex`.
// NOTE(review): a `struct Context` with `block`/`inline` fields is also
// declared further down in this text; this enum looks like the pre-change
// side of the diff — confirm which definition is live.
enum Context {
/// No block is open.
None,
/// Inside a paragraph; `lex` closes it on a `"\n"` lexeme.
Paragraph,
/// Inside a header; the `u8` is the header's level, reused to emit the
/// closing header token.
Header(u8),
/// Inside a preformatted block; `lex` emits lexemes as literals until
/// `PreFormat::probe` matches again.
PreFormat,
}
/// Mutable lexer state carried across lexemes in `lex`.
// NOTE(review): a second `State` definition (with an extra `buffers` field)
// appears further down in this text; this one looks like the pre-change side
// of the diff — confirm which definition is live.
struct State {
/// Block the lexer is currently inside.
context: Context,
/// Header DOM ids issued so far, keyed by base id; each value lists the ids
/// already handed out for that base so repeats can receive a numeric suffix.
dom_ids: HashMap<String, Vec<String>>,
}
fn lex(text: &str, map: LexMap) -> Vec<Token> {
let mut tokens: Vec<Token> = Vec::new();
let mut state = State {
context: Context::None,
dom_ids: HashMap::new(),
};
let mut state = State::new();
let config: Config = populate_graph().meta.config;
let splits = cluster::cluster(text);
let lexemes = Lexeme::collect(&splits);
let iter = lexemes.iter().peekable();
for lexeme in iter {
match state.context {
Context::None => {
let segments = segment::segment(text);
let lexemes = Lexeme::collect(&segments);
let mut iterator = lexemes.iter().peekable();
while let Some(lexeme) = iterator.next() {
match state.context.block {
BlockContext::None => {
if PreFormat::probe(lexeme) {
state.context.block = BlockContext::PreFormat;
tokens.push(Token::PreFormat(PreFormat::new(true)));
state.context = Context::PreFormat;
continue;
} else if Header::probe(lexeme) {
let base_id =
if config.ascii_dom_ids && !lexeme.next.is_ascii() {
String::from("h")
} else {
lexeme.next.clone().to_lowercase()
};
let id = match state.dom_ids.entry(base_id.clone()) {
Entry::Occupied(mut occupied) => {
let ids = occupied.get_mut();
let suffix: u8 =
ids.len().try_into().unwrap_or_default();
let id_with_suffix = format!("{base_id}-{suffix}");
ids.push(id_with_suffix.clone());
id_with_suffix
},
Entry::Vacant(vacant) => {
vacant.insert(vec![base_id.clone()]);
base_id
},
};
let mut header = Header::lex(lexeme);
header.dom_id = Some(id);
state.context = Context::Header(header.get_level());
header.dom_id = Some(Header::make_id(
&config,
&mut iterator,
&mut state.dom_ids,
));
state.context.block = BlockContext::Header(header.level());
tokens.push(Token::Header(header));
continue;
} else if Paragraph::probe(lexeme) {
state.context.block = BlockContext::Paragraph;
tokens.push(Token::Paragraph(Paragraph::new(true)));
state.context = Context::Paragraph;
}
},
Context::PreFormat => {
BlockContext::PreFormat => {
if PreFormat::probe(lexeme) {
tokens.push(Token::PreFormat(PreFormat::new(false)));
state.context = Context::None;
state.context.block = BlockContext::None;
} else {
tokens.push(Token::Literal(Literal::lex(lexeme)));
}
continue;
},
Context::Paragraph => {
BlockContext::Paragraph => {
if lexeme.text() == "\n" {
tokens.push(Token::Paragraph(Paragraph::new(false)));
state.context = Context::None;
state.context.block = BlockContext::None;
}
},
Context::Header(n) => {
BlockContext::Header(n) => {
if lexeme.text() == "\n" {
tokens.push(Token::Header(Header::from_u8(n, false, None)));
state.context = Context::None;
state.context.block = BlockContext::None;
}
},
}
match state.context.inline {
InlineContext::None => {
if Code::probe(lexeme) {
state.context.inline = InlineContext::Code;
tokens.push(Token::Code(Code::new(true)));
continue;
} else if Anchor::probe(lexeme) {
state.context.inline = InlineContext::Anchor;
state.buffers.anchor.clear();
if lexeme.match_first_char('|') {
state.buffers.anchor.candidate.leading = true;
} else {
state.buffers.anchor.candidate.text = lexeme.text();
}
continue;
}
},
InlineContext::Code => {
if Code::probe(lexeme) {
state.context.inline = InlineContext::None;
tokens.push(Token::Code(Code::new(false)));
continue;
}
},
InlineContext::Anchor => {
let buffer = &mut state.buffers.anchor;
let candidate = &mut buffer.candidate;
if candidate.text.is_empty() {
if lexeme.next == "|" {
buffer.text.push_str(&lexeme.text());
candidate.text.clone_from(&buffer.text);
} else {
buffer.text.push_str(&lexeme.text());
}
continue;
} else if candidate.destination.is_none() {
// candidate is leading and we found the second pipe
if candidate.leading && lexeme.text() == "|" {
// whitespace after pipe: flanking node anchor
if lexeme.is_next_whitespace() {
candidate.destination =
Some(candidate.text.clone());
let token = Token::Anchor(candidate.clone());
tokens.push(token);
state.context.inline = InlineContext::None;
// non-whitespace after pipe is the destination
} else {
candidate.destination = Some(lexeme.next.clone());
let token = Token::Anchor(candidate.clone());
tokens.push(token);
state.context.inline = InlineContext::None;
// if there is a trailing pipe, consume it
if let Some(next) = iterator.next()
&& next.next == "|"
{
iterator.next();
}
}
// candidate is nonleading and we found a second pipe
} else if !candidate.leading && lexeme.next == "|" {
candidate.destination = Some(lexeme.text());
tokens.push(Token::Anchor(candidate.clone()));
state.context.inline = InlineContext::None;
iterator.next();
// candidate is nonleading and we found whitespace
} else if lexeme.is_next_whitespace() {
candidate.destination = Some(lexeme.text());
let token = Token::Anchor(candidate.clone());
tokens.push(token);
state.context.inline = InlineContext::None;
// candidate is nonleading and we haven't found whitespace
} else {
buffer.destination.push_str(&lexeme.text());
}
continue;
} else {
unreachable!("Anchor is already fully parsed");
}
},
}
@ -118,6 +167,68 @@ fn lex(text: &str, map: LexMap) -> Vec<Token> {
tokens
}
/// Block-level construct the lexer is currently inside, stored in
/// `Context::block` and matched on for every lexeme in `lex`.
enum BlockContext {
/// Inside a paragraph; `lex` emits the closing paragraph token on a `"\n"`
/// lexeme.
Paragraph,
/// Inside a header; the `u8` is the header's level, reused to build the
/// closing header token on a `"\n"` lexeme.
Header(u8),
/// Inside a preformatted block; `lex` pushes lexemes as literals until
/// `PreFormat::probe` matches again.
PreFormat,
/// No block is open; the next lexeme is probed for a block opener.
None,
}
/// Inline construct the lexer is currently inside, stored in
/// `Context::inline` and matched on after the block-level match in `lex`.
enum InlineContext {
/// An anchor is being assembled in `State::buffers.anchor`.
Anchor,
/// Inside a code span; left when `Code::probe` matches again.
Code,
/// No inline construct is open.
None,
}
/// Mutable lexer state carried across lexemes in `lex`.
struct State {
/// Current block-level and inline-level context.
context: Context,
/// Bookkeeping for header DOM ids; `lex` hands it to `Header::make_id`.
// NOTE(review): presumably maps a base id to the ids already issued for it
// — `Header::make_id` is not visible here; confirm.
dom_ids: HashMap<String, Vec<String>>,
/// Scratch buffers for constructs that span several lexemes.
buffers: Buffers,
}
/// Scratch buffers the lexer uses while assembling multi-lexeme tokens.
struct Buffers {
/// Working state for the anchor currently being parsed.
anchor: AnchorBuffer,
}
/// Working state for an anchor that `lex` is assembling across lexemes.
#[derive(Debug)]
struct AnchorBuffer {
/// The anchor under construction; cloned into a `Token::Anchor` once its
/// destination has been determined.
candidate: Anchor,
/// Accumulated anchor text, copied into `candidate.text` when the lexeme
/// following the current one is `"|"`.
text: String,
/// Accumulated destination text.
// NOTE(review): in the visible part of `lex` this field is appended to but
// never read back into `candidate.destination` — confirm whether that is
// intended.
destination: String,
}
impl AnchorBuffer {
    /// Resets the buffer to its pristine state: an empty candidate anchor and
    /// empty `text` and `destination` strings.
    fn clear(&mut self) {
        // Replace the whole value at once instead of resetting field by field.
        *self = Self {
            candidate: Anchor::empty(),
            text: String::new(),
            destination: String::new(),
        };
    }
}
impl State {
    /// Builds the initial lexer state: no block or inline context open, no
    /// DOM ids recorded, and an empty anchor buffer.
    fn new() -> State {
        let context = Context {
            inline: InlineContext::None,
            block: BlockContext::None,
        };
        let dom_ids = HashMap::new();
        let anchor = AnchorBuffer {
            candidate: Anchor::empty(),
            text: String::new(),
            destination: String::new(),
        };

        Self {
            context,
            dom_ids,
            buffers: Buffers { anchor },
        }
    }
}
/// The lexer's position in the document, tracked on two independent levels.
struct Context {
/// Block-level construct currently open (paragraph, header, preformat, or
/// none).
block: BlockContext,
/// Inline construct currently open (anchor, code, or none).
inline: InlineContext,
}
/// Renders every token with `Token::render`, in order, and concatenates the
/// pieces into a single `String`.
///
/// An empty slice yields an empty string.
fn parse(tokens: &[Token]) -> String {
    let mut rendered = String::new();
    rendered.extend(tokens.iter().map(Token::render));
    rendered
}