Add a context parser for PreFormat blocks

This commit is contained in:
Juno Takano 2026-06-02 16:00:40 -03:00
commit d0ca4e6cb3
9 changed files with 145 additions and 94 deletions

10
Cargo.lock generated
View file

@ -86,9 +86,9 @@ dependencies = [
[[package]] [[package]]
name = "bitflags" name = "bitflags"
version = "2.11.1" version = "2.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" checksum = "84d7ced0ae9557296835c32bf1b1e02b44c746701f898460fb000d7eaa84f00a"
[[package]] [[package]]
name = "block-buffer" name = "block-buffer"
@ -238,7 +238,7 @@ dependencies = [
[[package]] [[package]]
name = "en" name = "en"
version = "0.4.0-alpha" version = "0.4.1-alpha"
dependencies = [ dependencies = [
"axum", "axum",
"serde", "serde",
@ -531,9 +531,9 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
[[package]] [[package]]
name = "log" name = "log"
version = "0.4.30" version = "0.4.31"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "616ec5685824bcc94416c6d4a7a446eea774a31efd7062c8480ba6fd06d7a6e5" checksum = "113b30b4cd05f7c06868fdb2854f66a7b9fece9a48425351cd532e810d74024f"
[[package]] [[package]]
name = "matchit" name = "matchit"

View file

@ -1,6 +1,6 @@
[package] [package]
name = "en" name = "en"
version = "0.4.0-alpha" version = "0.4.1-alpha"
description = "A non-linear writing instrument." description = "A non-linear writing instrument."
license = "AGPL-3.0-only" license = "AGPL-3.0-only"

View file

@ -1,12 +1,13 @@
use crate::syntax::content::parser::{ use crate::syntax::content::parser::{
State, Token, State, Token,
token::{Header, Paragraph, PreFormat, Verse}, token::{Header, Paragraph, Verse},
}; };
pub mod anchor; pub mod anchor;
pub mod block; pub mod block;
pub mod inline; pub mod inline;
pub mod list; pub mod list;
pub mod preformat;
pub mod quote; pub mod quote;
pub mod table; pub mod table;
@ -38,30 +39,32 @@ pub enum Inline {
} }
/// # Panics /// # Panics
/// Panics if there is an open header or list at end of input. /// Panics if there is an open token at end of input that can't be easily
/// closed by simply adding a matching closing token. This normally is handled
/// by context parsers and probably indicates an error in one of them.
pub fn close(state: &State, tokens: &mut Vec<Token>) { pub fn close(state: &State, tokens: &mut Vec<Token>) {
match state.context.block { match state.context.block {
Block::PreFormat => {
tokens.push(Token::PreFormat(PreFormat::new(false)));
},
Block::Paragraph => { Block::Paragraph => {
tokens.push(Token::Paragraph(Paragraph::new(false))); tokens.push(Token::Paragraph(Paragraph::new(false)));
}, },
Block::List => {
panic!("End of input with open list")
},
Block::Header(level) => { Block::Header(level) => {
tokens.push(Token::Header(Header::from_u8(level, false, None))); tokens.push(Token::Header(Header::from_u8(level, false, None)));
}, },
Block::Quote => {
panic!("End of input with open quote")
},
Block::Table => {
panic!("End of input with open table")
},
Block::Verse => { Block::Verse => {
tokens.push(Token::Verse(Verse::new(false))); tokens.push(Token::Verse(Verse::new(false)));
}, },
Block::PreFormat => {
panic!("End of input with open preformat: {tokens:#?}")
},
Block::List => {
panic!("End of input with open list: {tokens:#?}")
},
Block::Quote => {
panic!("End of input with open quote: {tokens:#?}")
},
Block::Table => {
panic!("End of input with open table: {tokens:#?}")
},
Block::None => (), Block::None => (),
} }
} }

View file

@ -6,15 +6,17 @@ use crate::{
syntax::content::{ syntax::content::{
Parseable as _, Parseable as _,
parser::{ parser::{
Block, Lexeme, State, Token, Block, Lexeme, State, Token, context,
token::{ token::{
Header, LineBreak, List, Literal, Paragraph, PreFormat, Quote, Header, LineBreak, List, Paragraph, PreFormat, Quote, Table,
Table, Verse, Verse,
}, },
}, },
}, },
}; };
/// A return of `true` will trigger a `continue` on the outer parser, causing
/// no more subsequent parsing of the current lexeme.
pub fn parse( pub fn parse(
lexeme: &Lexeme, lexeme: &Lexeme,
state: &mut State, state: &mut State,
@ -27,8 +29,7 @@ pub fn parse(
if PreFormat::probe(lexeme) { if PreFormat::probe(lexeme) {
log!(VERBOSE, "Block Context: None -> PreFormat on {lexeme}"); log!(VERBOSE, "Block Context: None -> PreFormat on {lexeme}");
state.context.block = Block::PreFormat; state.context.block = Block::PreFormat;
tokens.push(Token::PreFormat(PreFormat::new(true))); return true
return true;
} else if Header::probe(lexeme) { } else if Header::probe(lexeme) {
let mut header = Header::lex(lexeme); let mut header = Header::lex(lexeme);
header.dom_id = Some(Header::make_id( header.dom_id = Some(Header::make_id(
@ -44,7 +45,7 @@ pub fn parse(
log!(VERBOSE, "Block Context: None -> List on {lexeme}"); log!(VERBOSE, "Block Context: None -> List on {lexeme}");
state.context.block = Block::List; state.context.block = Block::List;
state.buffers.list.candidate.ordered = lexeme.match_char('+'); state.buffers.list.candidate.ordered = lexeme.match_char('+');
return super::list::parse( return context::list::parse(
lexeme, state, tokens, iterator, graph, lexeme, state, tokens, iterator, graph,
); );
} else if Quote::probe(lexeme) { } else if Quote::probe(lexeme) {
@ -71,14 +72,7 @@ pub fn parse(
} }
}, },
Block::PreFormat => { Block::PreFormat => {
if PreFormat::probe(lexeme) { return context::preformat::parse(lexeme, state, tokens, iterator);
tokens.push(Token::PreFormat(PreFormat::new(false)));
log!(VERBOSE, "Block Context: PreFormat -> None on {lexeme}");
state.context.block = Block::None;
} else {
tokens.push(Token::Literal(Literal::lex(lexeme)));
}
return true;
}, },
Block::Paragraph => { Block::Paragraph => {
if Paragraph::probe_end(lexeme) { if Paragraph::probe_end(lexeme) {
@ -95,13 +89,17 @@ pub fn parse(
} }
}, },
Block::List => { Block::List => {
return super::list::parse(lexeme, state, tokens, iterator, graph); return context::list::parse(lexeme, state, tokens, iterator, graph);
}, },
Block::Quote => { Block::Quote => {
return super::quote::parse(lexeme, state, tokens, iterator, graph); return context::quote::parse(
lexeme, state, tokens, iterator, graph,
);
}, },
Block::Table => { Block::Table => {
return super::table::parse(lexeme, state, tokens, iterator, graph); return context::table::parse(
lexeme, state, tokens, iterator, graph,
);
}, },
Block::Verse => { Block::Verse => {
if Verse::probe_end(lexeme) { if Verse::probe_end(lexeme) {
@ -127,7 +125,7 @@ mod tests {
graph::Graph, graph::Graph,
syntax::content::parser::{ syntax::content::parser::{
self, Block, State, Token, context, self, Block, State, Token, context,
token::{Header, PreFormat, header::Level}, token::{Header, header::Level},
}, },
}; };
@ -161,16 +159,6 @@ mod tests {
assert_eq!(vec, vec![Token::Header(Header::from_u8(1, false, None))]); assert_eq!(vec, vec![Token::Header(Header::from_u8(1, false, None))]);
} }
#[test]
fn end_with_open_preformat() {
let mut state = State::default();
state.context.block = Block::PreFormat;
let mut vec: Vec<Token> = vec![];
context::close(&state, &mut vec);
assert_eq!(vec, vec![Token::PreFormat(PreFormat::new(false))]);
}
#[test] #[test]
fn truncated_header_level() { fn truncated_header_level() {
let u: usize = 999; let u: usize = 999;

View file

@ -0,0 +1,61 @@
use std::{iter::Peekable, slice::Iter};
use crate::{
prelude::*,
syntax::content::{
Parseable as _,
parser::{Lexeme, State, Token, context::Block, token::PreFormat},
},
};
/// Handles open `PreFormat` contexts until a block is fully parsed.
///
/// While the block context is `Block::PreFormat`, each lexeme is either
/// accumulated into the candidate held in `state.buffers.preformat` or, when
/// it probes as a `PreFormat` delimiter, ends the block: the candidate is
/// pushed to `tokens` as a single `Token::PreFormat`, the block context is
/// reset to `Block::None` and the buffer is left holding a default candidate.
///
/// Accumulation rules, checked in this order:
/// - a lexeme starting with `<` has that leading character replaced by
///   `&lt;`;
/// - otherwise, a lexeme ending with `>` has that trailing character
///   replaced by `&gt;`;
/// - a lone backslash appends the text reported by `lexeme.next()` verbatim
///   and advances `iterator` once, so that lexeme is not parsed again;
/// - a closing delimiter ends the block as described above;
/// - anything else is appended unchanged.
///
/// # Returns
/// Always `true`. A return of `true` will trigger a continue in the outer
/// parser, skipping any further parsing of the current lexeme.
///
/// # Panics
/// This parser can handle only the `PreFormat` context, and will panic if
/// called while `state.context.block` is any other context, since it has no
/// knowledge on how to handle them.
pub fn parse(
    lexeme: &Lexeme,
    state: &mut State,
    tokens: &mut Vec<Token>,
    iterator: &mut Peekable<Iter<'_, Lexeme>>,
) -> bool {
    // Borrow only the candidate so `state.context` stays independently
    // accessible below (disjoint field borrows).
    let candidate = &mut state.buffers.preformat.candidate;

    #[expect(clippy::wildcard_enum_match_arm)]
    match state.context.block {
        Block::PreFormat => {
            if lexeme.match_first_char('<') {
                // escape the opening angle bracket, keep the rest as is
                let text = lexeme.text();
                candidate.text.push_str("&lt;");
                candidate
                    .text
                    .push_str(text.strip_prefix('<').unwrap_or(&text));
            } else if lexeme.match_last_char('>') {
                // escape the closing angle bracket, keep the rest as is
                let text = lexeme.text();
                candidate
                    .text
                    .push_str(text.strip_suffix('>').unwrap_or(&text));
                candidate.text.push_str("&gt;");
            } else if lexeme.match_char('\\') {
                // escaped lexeme: take the following text literally and
                // consume it from the iterator so it is not parsed again
                candidate.text.push_str(lexeme.next().as_str());
                iterator.next();
            } else if PreFormat::probe(lexeme) {
                // found end of block, push it and reset state
                log!(VERBOSE, "Accepting preformat candidate {candidate}");
                // `take` moves the candidate out and leaves a default one
                // behind, avoiding a clone of the accumulated text
                tokens.push(Token::PreFormat(std::mem::take(candidate)));
                state.context.block = Block::None;
            } else {
                // anything else is pushed into the candidate preformat's text
                candidate.text.push_str(&lexeme.text());
            }
        },
        _ => {
            panic!("PreFormat context parser called for {:?}", state.context)
        },
    }

    true
}

View file

@ -32,6 +32,8 @@ impl Lexeme {
pub fn mutate_text(&mut self, new: &str) { self.text = new.to_string(); } pub fn mutate_text(&mut self, new: &str) { self.text = new.to_string(); }
/// Returns an Option containing the character if the raw lexeme text
/// is composed of a single character, None if it has multiple characters.
pub fn as_char(&self) -> Option<char> { pub fn as_char(&self) -> Option<char> {
if self.text.chars().count() == 1 { if self.text.chars().count() == 1 {
self.text.chars().nth(0) self.text.chars().nth(0)
@ -56,6 +58,7 @@ impl Lexeme {
} }
} }
/// Returns true if the raw lexeme text is a single matching character.
pub fn match_char(&self, c: char) -> bool { pub fn match_char(&self, c: char) -> bool {
self.as_char().is_some_and(|as_char| as_char == c) self.as_char().is_some_and(|as_char| as_char == c)
} }
@ -86,6 +89,8 @@ impl Lexeme {
&& self.match_third_char(c3) && self.match_third_char(c3)
} }
/// Returns true if the lexeme raw text is composed of a single character
/// and this character is in the provided slice.
pub fn match_char_in(&self, slice: &[char]) -> bool { pub fn match_char_in(&self, slice: &[char]) -> bool {
self.as_char().is_some_and(|c| slice.contains(&c)) self.as_char().is_some_and(|c| slice.contains(&c))
} }

View file

@ -38,7 +38,9 @@ pub(super) fn lex(
let mut iterator = lexemes.iter().peekable(); let mut iterator = lexemes.iter().peekable();
while let Some(lexeme) = iterator.next() { while let Some(lexeme) = iterator.next() {
if lexeme.match_char('\\') { if lexeme.match_char('\\')
&& !matches!(state.context.block, context::Block::PreFormat)
{
if let Some(next) = iterator.next() { if let Some(next) = iterator.next() {
tokens.push(Token::Literal(Literal::lex(next))); tokens.push(Token::Literal(Literal::lex(next)));
} }

View file

@ -3,7 +3,7 @@ use std::collections::HashMap;
use crate::syntax::content::parser::{ use crate::syntax::content::parser::{
Token, Token,
context::Context, context::Context,
token::{Anchor, Item, List, Quote, Table}, token::{Anchor, Item, List, PreFormat, Quote, Table},
}; };
#[derive(Clone, Default, Debug)] #[derive(Clone, Default, Debug)]
@ -29,6 +29,7 @@ pub struct Buffers {
pub list: ListBuffer, pub list: ListBuffer,
pub quote: QuoteBuffer, pub quote: QuoteBuffer,
pub table: TableBuffer, pub table: TableBuffer,
pub preformat: PreFormatBuffer,
} }
#[derive(Default, Clone, Debug)] #[derive(Default, Clone, Debug)]
@ -59,6 +60,11 @@ pub struct TableBuffer {
pub in_header: bool, pub in_header: bool,
} }
#[derive(Default, Clone, Debug)]
pub struct PreFormatBuffer {
pub candidate: PreFormat,
}
impl std::fmt::Display for AnchorBuffer { impl std::fmt::Display for AnchorBuffer {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
let display_text = if self.text.is_empty() { let display_text = if self.text.is_empty() {

View file

@ -1,46 +1,42 @@
use crate::syntax::content::{Lexeme, Parseable}; use crate::syntax::content::{Lexeme, Parseable};
#[derive(Debug, Clone, Eq, PartialEq)] #[derive(Debug, Default, Clone, Eq, PartialEq)]
pub struct PreFormat { pub struct PreFormat {
open: Option<bool>, pub text: String,
} }
impl PreFormat { impl PreFormat {
pub const fn new(open: bool) -> PreFormat { PreFormat { open: Some(open) } } pub fn new(text: &str) -> PreFormat {
PreFormat {
text: String::from(text),
}
}
} }
impl std::fmt::Display for PreFormat { impl std::fmt::Display for PreFormat {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
let display_open_state = if let Some(open_state) = self.open { let character_count = self.text.chars().count();
if open_state { "open" } else { "closed" } let is_whitespace = self.text.trim_ascii().is_empty();
let summary = if is_whitespace {
"empty"
} else { } else {
"unknown" &format!("{character_count} chars")
}; };
write!(f, "PreFormat [{display_open_state}]") write!(f, "PreFormat [{summary}]")
} }
} }
impl Parseable for PreFormat { impl Parseable for PreFormat {
fn probe(lexeme: &Lexeme) -> bool { fn probe(lexeme: &Lexeme) -> bool {
lexeme.match_first_char('`') && (lexeme.next() == "\n" || lexeme.last()) lexeme.match_char('`') && (lexeme.next() == "\n" || lexeme.last())
} }
fn lex(_lexeme: &Lexeme) -> PreFormat { PreFormat { open: None } } fn lex(_lexeme: &Lexeme) -> PreFormat {
panic!("Attempt to lex a preformat directly from a lexeme")
fn render(&self) -> String {
if let Some(o) = self.open {
if o {
"<pre>".to_owned()
} else {
"</pre>".to_owned()
}
} else {
panic!(
"Attempt to render a preformat tag while open state is unknown"
)
}
} }
fn render(&self) -> String { format!("<pre>{}</pre>", self.text) }
fn flatten(&self) -> String { String::default() } fn flatten(&self) -> String { String::default() }
} }
@ -50,49 +46,39 @@ mod tests {
use crate::syntax::content::parser::Token; use crate::syntax::content::parser::Token;
#[test] #[test]
#[should_panic(
expected = "Attempt to lex a preformat directly from a lexeme"
)]
fn lex() { fn lex() {
let from_empty_lexeme = PreFormat::lex(&Lexeme::default()); let lexeme = Lexeme::new("a", "b", "c");
assert!(from_empty_lexeme.open.is_none()); PreFormat::lex(&lexeme);
let from_non_empty_lexeme = PreFormat::lex(&Lexeme::default());
assert!(from_non_empty_lexeme.open.is_none());
}
#[test]
#[should_panic(expected = "Attempt to render a preformat tag while \
open state is unknown")]
fn render() {
let from_empty_lexeme = PreFormat::lex(&Lexeme::default());
from_empty_lexeme.render();
let from_non_empty_lexeme = PreFormat::lex(&Lexeme::default());
from_non_empty_lexeme.render();
} }
#[test] #[test]
fn token_display() { fn token_display() {
let mut preformat = PreFormat::new(true); let mut preformat = PreFormat::new("");
assert_eq!( assert_eq!(
format!("{}", Token::PreFormat(preformat.clone())), format!("{}", Token::PreFormat(preformat.clone())),
"Tk:PreFormat [open]" "Tk:PreFormat [empty]"
); );
preformat.open = Some(false); preformat.text = "\n ".to_string();
assert_eq!( assert_eq!(
format!("{}", Token::PreFormat(preformat.clone())), format!("{}", Token::PreFormat(preformat.clone())),
"Tk:PreFormat [closed]" "Tk:PreFormat [empty]"
); );
preformat.open = None; preformat.text = "text".to_string();
assert_eq!( assert_eq!(
format!("{}", Token::PreFormat(preformat)), format!("{}", Token::PreFormat(preformat)),
"Tk:PreFormat [unknown]" "Tk:PreFormat [4 chars]"
); );
} }
#[test] #[test]
fn flatten() { fn flatten() {
let preformat = PreFormat::new(false); let preformat = PreFormat::new("");
assert_eq!(preformat.flatten(), ""); assert_eq!(preformat.flatten(), "");
let token = Token::PreFormat(preformat); let token = Token::PreFormat(preformat);