Extract context-specific parsing to a separate module

This commit is contained in:
Juno Takano 2026-01-02 00:52:20 -03:00
commit 5ed2036e36
3 changed files with 158 additions and 120 deletions

View file

@ -7,10 +7,12 @@ use token::{
preformat::PreFormat, literal::Literal, code::Code, oblique::Oblique, preformat::PreFormat, literal::Literal, code::Code, oblique::Oblique,
}; };
use lexeme::Lexeme; use lexeme::Lexeme;
use context::{Context, Block, Inline};
pub mod token; pub mod token;
pub mod lexeme; pub mod lexeme;
pub mod segment; pub mod segment;
pub mod context;
const LEXMAP: LexMap = &[ const LEXMAP: LexMap = &[
(LineBreak::probe, |word| { (LineBreak::probe, |word| {
@ -29,9 +31,9 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {
let mut iterator = lexemes.iter().peekable(); let mut iterator = lexemes.iter().peekable();
while let Some(lexeme) = iterator.next() { while let Some(lexeme) = iterator.next() {
match state.context.block { match state.context.block {
BlockContext::None => { Block::None => {
if PreFormat::probe(lexeme) { if PreFormat::probe(lexeme) {
state.context.block = BlockContext::PreFormat; state.context.block = Block::PreFormat;
tokens.push(Token::PreFormat(PreFormat::new(true))); tokens.push(Token::PreFormat(PreFormat::new(true)));
continue; continue;
} else if Header::probe(lexeme) { } else if Header::probe(lexeme) {
@ -41,49 +43,49 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {
iterator.peek().map_or(&Lexeme::new("", ""), |l| l), iterator.peek().map_or(&Lexeme::new("", ""), |l| l),
&mut state.dom_ids, &mut state.dom_ids,
)); ));
state.context.block = BlockContext::Header(header.level()); state.context.block = Block::Header(header.level());
tokens.push(Token::Header(header)); tokens.push(Token::Header(header));
continue; continue;
} else if Paragraph::probe(lexeme) { } else if Paragraph::probe(lexeme) {
state.context.block = BlockContext::Paragraph; state.context.block = Block::Paragraph;
tokens.push(Token::Paragraph(Paragraph::new(true))); tokens.push(Token::Paragraph(Paragraph::new(true)));
} }
}, },
BlockContext::PreFormat => { Block::PreFormat => {
if PreFormat::probe(lexeme) { if PreFormat::probe(lexeme) {
tokens.push(Token::PreFormat(PreFormat::new(false))); tokens.push(Token::PreFormat(PreFormat::new(false)));
state.context.block = BlockContext::None; state.context.block = Block::None;
} else { } else {
tokens.push(Token::Literal(Literal::lex(lexeme))); tokens.push(Token::Literal(Literal::lex(lexeme)));
} }
continue; continue;
}, },
BlockContext::Paragraph => { Block::Paragraph => {
if lexeme.text() == "\n" { if lexeme.text() == "\n" {
tokens.push(Token::Paragraph(Paragraph::new(false))); tokens.push(Token::Paragraph(Paragraph::new(false)));
state.context.block = BlockContext::None; state.context.block = Block::None;
} }
}, },
BlockContext::Header(n) => { Block::Header(n) => {
if lexeme.text() == "\n" { if lexeme.text() == "\n" {
tokens.push(Token::Header(Header::from_u8(n, false, None))); tokens.push(Token::Header(Header::from_u8(n, false, None)));
state.context.block = BlockContext::None; state.context.block = Block::None;
} }
}, },
} }
match state.context.inline { match state.context.inline {
InlineContext::None => { Inline::None => {
if Code::probe(lexeme) { if Code::probe(lexeme) {
state.context.inline = InlineContext::Code; state.context.inline = Inline::Code;
tokens.push(Token::Code(Code::new(true))); tokens.push(Token::Code(Code::new(true)));
continue; continue;
} else if Oblique::probe(lexeme) { } else if Oblique::probe(lexeme) {
state.context.inline = InlineContext::Oblique; state.context.inline = Inline::Oblique;
tokens.push(Token::Oblique(Oblique::new(true))); tokens.push(Token::Oblique(Oblique::new(true)));
continue; continue;
} else if Anchor::probe(lexeme) { } else if Anchor::probe(lexeme) {
state.context.inline = InlineContext::Anchor; state.context.inline = Inline::Anchor;
state.buffers.anchor.clear(); state.buffers.anchor.clear();
if lexeme.match_first_char('|') { if lexeme.match_first_char('|') {
@ -94,81 +96,27 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {
continue; continue;
} }
}, },
InlineContext::Code => { Inline::Code => {
if Code::probe(lexeme) { if Code::probe(lexeme) {
state.context.inline = InlineContext::None; state.context.inline = Inline::None;
tokens.push(Token::Code(Code::new(false))); tokens.push(Token::Code(Code::new(false)));
continue; continue;
} }
}, },
InlineContext::Oblique => { Inline::Oblique => {
if Oblique::probe(lexeme) { if Oblique::probe(lexeme) {
state.context.inline = InlineContext::None; state.context.inline = Inline::None;
tokens.push(Token::Oblique(Oblique::new(false))); tokens.push(Token::Oblique(Oblique::new(false)));
continue; continue;
} }
}, },
InlineContext::Anchor => { Inline::Anchor => {
let buffer = &mut state.buffers.anchor; if context::anchor::parse(
let candidate = &mut buffer.candidate; lexeme,
if candidate.text.is_empty() { &mut iterator,
if lexeme.next() == "|" { &mut state,
buffer.text.push_str(&lexeme.text()); &mut tokens,
candidate.text.clone_from(&buffer.text); ) {
} else {
buffer.text.push_str(&lexeme.text());
}
continue;
} else if candidate.destination.is_none() {
// candidate is leading and we found the second pipe
if candidate.leading && lexeme.text() == "|" {
// third pipe immediately after second: forcing flanking
if lexeme.match_next_first_char('|') {
candidate.destination =
Some(candidate.text.clone());
let token = Token::Anchor(candidate.clone());
tokens.push(token);
state.context.inline = InlineContext::None;
iterator.next();
continue;
// whitespace or punctuation after pipe: flanking anchor
} else if lexeme.is_next_whitespace()
|| lexeme.is_next_punctuation()
{
candidate.destination =
Some(candidate.text.clone());
let token = Token::Anchor(candidate.clone());
tokens.push(token);
state.context.inline = InlineContext::None;
// non-whitespace after pipe is the destination
} else {
candidate.destination = Some(lexeme.next().clone());
let token = Token::Anchor(candidate.clone());
tokens.push(token);
state.context.inline = InlineContext::None;
// if there is a trailing pipe, consume it
if let Some(next) = iterator.next()
&& next.next() == "|"
{
iterator.next();
}
}
// candidate is nonleading and we found a second pipe
} else if !candidate.leading && lexeme.next() == "|" {
candidate.destination = Some(lexeme.text());
tokens.push(Token::Anchor(candidate.clone()));
state.context.inline = InlineContext::None;
iterator.next();
// candidate is nonleading and we found whitespace
} else if lexeme.is_next_whitespace() {
candidate.destination = Some(lexeme.text());
let token = Token::Anchor(candidate.clone());
tokens.push(token);
state.context.inline = InlineContext::None;
// candidate is nonleading and we haven't found whitespace
} else {
buffer.destination.push_str(&lexeme.text());
}
continue; continue;
} }
}, },
@ -182,48 +130,16 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {
} }
} }
close(&state, &mut tokens); context::close(&state, &mut tokens);
tokens tokens
} }
fn close(state: &State, tokens: &mut Vec<Token>) { pub struct State {
match state.context.block {
BlockContext::PreFormat => {
tokens.push(Token::PreFormat(PreFormat::new(false)));
},
BlockContext::Paragraph => {
tokens.push(Token::Paragraph(Paragraph::new(false)));
},
BlockContext::Header(_) => panic!("End of file with open header"),
BlockContext::None => (),
}
}
enum BlockContext {
Paragraph,
Header(u8),
PreFormat,
None,
}
enum InlineContext {
Anchor,
Code,
Oblique,
None,
}
struct State {
context: Context, context: Context,
dom_ids: HashMap<String, Vec<String>>, dom_ids: HashMap<String, Vec<String>>,
buffers: Buffers, buffers: Buffers,
} }
struct Context {
block: BlockContext,
inline: InlineContext,
}
struct Buffers { struct Buffers {
anchor: AnchorBuffer, anchor: AnchorBuffer,
} }
@ -247,8 +163,8 @@ impl State {
fn new() -> State { fn new() -> State {
State { State {
context: Context { context: Context {
inline: InlineContext::None, inline: Inline::None,
block: BlockContext::None, block: Block::None,
}, },
dom_ids: HashMap::new(), dom_ids: HashMap::new(),
buffers: Buffers { buffers: Buffers {
@ -325,6 +241,14 @@ mod tests {
); );
} }
// Covers two anchors that resolve to the single-letter node "s" in one
// paragraph: `|letter s|s|` (text followed by an explicit destination and a
// trailing pipe, then an apostrophe) and `|s|` (text only, followed by `!`).
// Both are expected to render as links to `/node/s`.
#[test]
fn anchor_to_node_s() {
assert_eq!(
read_noconfig("The |letter s|s|'s node: |s|!"),
r#"<p>The <a href="/node/s">letter s</a>'s node: <a href="/node/s">s</a>!</p>"#
);
}
#[test] #[test]
fn clear_anchor_buffer() { fn clear_anchor_buffer() {
assert_eq!( assert_eq!(
@ -356,27 +280,27 @@ mod tests {
} }
#[test] #[test]
#[should_panic(expected = "End of file with open header")] #[should_panic(expected = "End of input with open header")]
fn end_with_open_header() { fn end_with_open_header() {
let default_state = State::new(); let default_state = State::new();
let state = State { let state = State {
context: Context { context: Context {
block: BlockContext::Header(1), block: Block::Header(1),
..default_state.context ..default_state.context
}, },
..default_state ..default_state
}; };
close(&state, &mut vec![]); context::close(&state, &mut vec![]);
} }
#[test] #[test]
fn end_with_open_preformat() { fn end_with_open_preformat() {
let mut state = State::new(); let mut state = State::new();
state.context.block = BlockContext::PreFormat; state.context.block = Block::PreFormat;
let mut vec: Vec<Token> = vec![]; let mut vec: Vec<Token> = vec![];
close(&state, &mut vec); context::close(&state, &mut vec);
assert_eq!(vec, vec![Token::PreFormat(PreFormat::new(false))]); assert_eq!(vec, vec![Token::PreFormat(PreFormat::new(false))]);
} }

View file

@ -0,0 +1,40 @@
use crate::syntax::content::parser::{
token::{Token, paragraph::Paragraph, preformat::PreFormat},
State,
};
pub mod anchor;
/// Parsing context: pairs the block-level state with the inline-level state.
pub struct Context {
/// Block-level state; see [`Block`] for the possible values.
pub block: Block,
/// Inline-level state; see [`Inline`] for the possible values.
pub inline: Inline,
}
/// Block-level construct that is currently open.
///
/// The common traits are derived so values can be copied out of a shared
/// state, compared in assertions and printed in diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Block {
    /// A paragraph is open.
    Paragraph,
    /// A header is open; the payload is the header level.
    Header(u8),
    /// A preformatted block is open.
    PreFormat,
    /// No block-level construct is open.
    None,
}
/// Inline-level construct that is currently open.
///
/// The common traits are derived so values can be copied out of a shared
/// state, compared in assertions and printed in diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Inline {
    /// An anchor is being assembled.
    Anchor,
    /// A code span is open.
    Code,
    /// An oblique span is open.
    Oblique,
    /// No inline construct is open.
    None,
}
/// Appends the closing token for the block context recorded in `state`.
///
/// An open preformatted block or paragraph gets its closing token pushed
/// onto `tokens`; when no block is open, `tokens` is left untouched.
///
/// # Panics
/// Panics if there is an open header at end of input.
pub fn close(state: &State, tokens: &mut Vec<Token>) {
    let closing = match state.context.block {
        // Nothing is open, so there is nothing to emit.
        Block::None => return,
        Block::Header(_) => panic!("End of input with open header"),
        Block::Paragraph => Token::Paragraph(Paragraph::new(false)),
        Block::PreFormat => Token::PreFormat(PreFormat::new(false)),
    };
    tokens.push(closing);
}

View file

@ -0,0 +1,74 @@
use std::{iter::Peekable, slice::Iter};
use crate::syntax::content::parser::{
State, context::Inline, lexeme::Lexeme, token::Token,
};
/// Feeds one lexeme to the anchor parser while the inline context is
/// [`Inline::Anchor`].
///
/// The candidate stored in `state.buffers.anchor` is completed in two
/// stages: its text first, then its destination. As soon as a destination is
/// decided, a [`Token::Anchor`] is pushed onto `tokens` and the inline
/// context is reset to [`Inline::None`]. Lexemes that belong to the anchor
/// syntax (a forcing pipe, an explicit destination, a trailing pipe) are
/// consumed from `iterator` along the way.
///
/// Returns `true` when `lexeme` was handled here. Returns `false` only when
/// the candidate already has both a text and a destination.
pub fn parse(
    lexeme: &Lexeme,
    iterator: &mut Peekable<Iter<'_, Lexeme>>,
    state: &mut State,
    tokens: &mut Vec<Token>,
) -> bool {
    let buffer = &mut state.buffers.anchor;
    let candidate = &mut buffer.candidate;

    // Stage one: keep accumulating text; once a pipe follows this lexeme,
    // the accumulated text becomes the candidate's text.
    if candidate.text.is_empty() {
        buffer.text.push_str(&lexeme.text());
        if lexeme.next() == "|" {
            candidate.text.clone_from(&buffer.text);
        }
        return true;
    }

    // Text and destination are both already set: nothing left to do here.
    if candidate.destination.is_some() {
        return false;
    }

    // Stage two: decide the destination, consuming any lexemes that are part
    // of the anchor syntax rather than of the surrounding content.
    let on_second_pipe = candidate.leading && lexeme.text() == "|";
    let destination = if on_second_pipe {
        if lexeme.match_next_first_char('|') {
            // A third pipe right after the second forces a flanking anchor;
            // that extra pipe is swallowed.
            iterator.next();
            candidate.text.clone()
        } else if lexeme.is_next_whitespace() || lexeme.is_next_punctuation() {
            // Whitespace or punctuation after the pipe: flanking anchor, so
            // the text doubles as the destination.
            candidate.text.clone()
        } else {
            // Anything else after the pipe is the explicit destination.
            let explicit = lexeme.next().clone();
            // Skip the destination lexeme, plus the pipe trailing it if any.
            if iterator.next().is_some_and(|following| following.next() == "|") {
                iterator.next();
            }
            explicit
        }
    } else if !candidate.leading && lexeme.next() == "|" {
        // Non-leading candidate with a second pipe right after this lexeme:
        // this lexeme is the destination and the pipe is swallowed.
        iterator.next();
        lexeme.text()
    } else if lexeme.is_next_whitespace() {
        // Whitespace follows: this lexeme is the destination.
        lexeme.text()
    } else {
        // No terminator yet: stash the lexeme and wait for the next one.
        buffer.destination.push_str(&lexeme.text());
        return true;
    };

    candidate.destination = Some(destination);
    tokens.push(Token::Anchor(candidate.clone()));
    state.context.inline = Inline::None;
    true
}