From d0ca4e6cb38c1544920b7bd6abea0bbf946e2b21 Mon Sep 17 00:00:00 2001 From: jutty Date: Tue, 2 Jun 2026 16:00:40 -0300 Subject: [PATCH] Add a context parser for PreFormat blocks --- Cargo.lock | 10 +-- Cargo.toml | 2 +- src/syntax/content/parser/context.rs | 31 ++++---- src/syntax/content/parser/context/block.rs | 44 ++++------- .../content/parser/context/preformat.rs | 61 +++++++++++++++ src/syntax/content/parser/lexeme.rs | 5 ++ src/syntax/content/parser/lexer.rs | 4 +- src/syntax/content/parser/state.rs | 8 +- src/syntax/content/parser/token/preformat.rs | 76 ++++++++----------- 9 files changed, 146 insertions(+), 95 deletions(-) create mode 100644 src/syntax/content/parser/context/preformat.rs diff --git a/Cargo.lock b/Cargo.lock index 21ada32..2d09c10 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -86,9 +86,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.11.1" +version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +checksum = "84d7ced0ae9557296835c32bf1b1e02b44c746701f898460fb000d7eaa84f00a" [[package]] name = "block-buffer" @@ -238,7 +238,7 @@ dependencies = [ [[package]] name = "en" -version = "0.4.0-alpha" +version = "0.4.1-alpha" dependencies = [ "axum", "serde", @@ -531,9 +531,9 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" [[package]] name = "log" -version = "0.4.30" +version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "616ec5685824bcc94416c6d4a7a446eea774a31efd7062c8480ba6fd06d7a6e5" +checksum = "113b30b4cd05f7c06868fdb2854f66a7b9fece9a48425351cd532e810d74024f" [[package]] name = "matchit" diff --git a/Cargo.toml b/Cargo.toml index 54a54a0..130e36b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "en" -version = "0.4.0-alpha" +version = "0.4.1-alpha" description = "A non-linear writing instrument." 
license = "AGPL-3.0-only" diff --git a/src/syntax/content/parser/context.rs b/src/syntax/content/parser/context.rs index e5010f6..e1379a5 100644 --- a/src/syntax/content/parser/context.rs +++ b/src/syntax/content/parser/context.rs @@ -1,12 +1,13 @@ use crate::syntax::content::parser::{ State, Token, - token::{Header, Paragraph, PreFormat, Verse}, + token::{Header, Paragraph, Verse}, }; pub mod anchor; pub mod block; pub mod inline; pub mod list; +pub mod preformat; pub mod quote; pub mod table; @@ -38,30 +39,32 @@ pub enum Inline { } /// # Panics -/// Panics if there is an open header or list at end of input. +/// Panics if there is an open token at end of input that can't be easily +/// closed by simply adding a matching closing token. This normally is handled +/// by context parsers and probably indicates an error in one of them. pub fn close(state: &State, tokens: &mut Vec) { match state.context.block { - Block::PreFormat => { - tokens.push(Token::PreFormat(PreFormat::new(false))); - }, Block::Paragraph => { tokens.push(Token::Paragraph(Paragraph::new(false))); }, - Block::List => { - panic!("End of input with open list") - }, Block::Header(level) => { tokens.push(Token::Header(Header::from_u8(level, false, None))); }, - Block::Quote => { - panic!("End of input with open quote") - }, - Block::Table => { - panic!("End of input with open table") - }, Block::Verse => { tokens.push(Token::Verse(Verse::new(false))); }, + Block::PreFormat => { + panic!("End of input with open preformat: {tokens:#?}") + }, + Block::List => { + panic!("End of input with open list: {tokens:#?}") + }, + Block::Quote => { + panic!("End of input with open quote: {tokens:#?}") + }, + Block::Table => { + panic!("End of input with open table: {tokens:#?}") + }, Block::None => (), } } diff --git a/src/syntax/content/parser/context/block.rs b/src/syntax/content/parser/context/block.rs index 3b34724..8048896 100644 --- a/src/syntax/content/parser/context/block.rs +++ 
b/src/syntax/content/parser/context/block.rs @@ -6,15 +6,17 @@ use crate::{ syntax::content::{ Parseable as _, parser::{ - Block, Lexeme, State, Token, + Block, Lexeme, State, Token, context, token::{ - Header, LineBreak, List, Literal, Paragraph, PreFormat, Quote, - Table, Verse, + Header, LineBreak, List, Paragraph, PreFormat, Quote, Table, + Verse, }, }, }, }; +/// A return of `true` will trigger a `continue` on the outer parser, causing +/// no more subsequent parsing of the current lexeme. pub fn parse( lexeme: &Lexeme, state: &mut State, @@ -27,8 +29,7 @@ pub fn parse( if PreFormat::probe(lexeme) { log!(VERBOSE, "Block Context: None -> PreFormat on {lexeme}"); state.context.block = Block::PreFormat; - tokens.push(Token::PreFormat(PreFormat::new(true))); - return true; + return true } else if Header::probe(lexeme) { let mut header = Header::lex(lexeme); header.dom_id = Some(Header::make_id( @@ -44,7 +45,7 @@ pub fn parse( log!(VERBOSE, "Block Context: None -> List on {lexeme}"); state.context.block = Block::List; state.buffers.list.candidate.ordered = lexeme.match_char('+'); - return super::list::parse( + return context::list::parse( lexeme, state, tokens, iterator, graph, ); } else if Quote::probe(lexeme) { @@ -71,14 +72,7 @@ pub fn parse( } }, Block::PreFormat => { - if PreFormat::probe(lexeme) { - tokens.push(Token::PreFormat(PreFormat::new(false))); - log!(VERBOSE, "Block Context: PreFormat -> None on {lexeme}"); - state.context.block = Block::None; - } else { - tokens.push(Token::Literal(Literal::lex(lexeme))); - } - return true; + return context::preformat::parse(lexeme, state, tokens, iterator); }, Block::Paragraph => { if Paragraph::probe_end(lexeme) { @@ -95,13 +89,17 @@ pub fn parse( } }, Block::List => { - return super::list::parse(lexeme, state, tokens, iterator, graph); + return context::list::parse(lexeme, state, tokens, iterator, graph); }, Block::Quote => { - return super::quote::parse(lexeme, state, tokens, iterator, graph); + return 
context::quote::parse( + lexeme, state, tokens, iterator, graph, + ); }, Block::Table => { - return super::table::parse(lexeme, state, tokens, iterator, graph); + return context::table::parse( + lexeme, state, tokens, iterator, graph, + ); }, Block::Verse => { if Verse::probe_end(lexeme) { @@ -127,7 +125,7 @@ mod tests { graph::Graph, syntax::content::parser::{ self, Block, State, Token, context, - token::{Header, PreFormat, header::Level}, + token::{Header, header::Level}, }, }; @@ -161,16 +159,6 @@ mod tests { assert_eq!(vec, vec![Token::Header(Header::from_u8(1, false, None))]); } - #[test] - fn end_with_open_preformat() { - let mut state = State::default(); - state.context.block = Block::PreFormat; - - let mut vec: Vec = vec![]; - context::close(&state, &mut vec); - assert_eq!(vec, vec![Token::PreFormat(PreFormat::new(false))]); - } - #[test] fn truncated_header_level() { let u: usize = 999; diff --git a/src/syntax/content/parser/context/preformat.rs b/src/syntax/content/parser/context/preformat.rs new file mode 100644 index 0000000..ed6763f --- /dev/null +++ b/src/syntax/content/parser/context/preformat.rs @@ -0,0 +1,61 @@ +use std::{iter::Peekable, slice::Iter}; + +use crate::{ + prelude::*, + syntax::content::{ + Parseable as _, + parser::{Lexeme, State, Token, context::Block, token::PreFormat}, + }, +}; + +/// Handles open `PreFormat` contexts until a block is fully parsed. +/// +/// A return of `true` will trigger a continue in the outer parser, +/// skipping any further parsing of the current lexeme. +/// +/// # Panics +/// This parser can handle only the `PreFormat` context, and will panic if passed an +/// unrelated context since it has no knowledge on how to handle them. 
+pub fn parse( + lexeme: &Lexeme, + state: &mut State, + tokens: &mut Vec, + iterator: &mut Peekable>, +) -> bool { + let buffer = &mut state.buffers.preformat; + let candidate = &mut buffer.candidate; + + #[expect(clippy::wildcard_enum_match_arm)] + match state.context.block { + Block::PreFormat => { + if lexeme.match_first_char('<') { + candidate.text.push_str("<"); + candidate.text.push_str( + lexeme.text().strip_prefix('<').unwrap_or(&lexeme.text()), + ); + } else if lexeme.match_last_char('>') { + candidate.text.push_str( + lexeme.text().strip_suffix('>').unwrap_or(&lexeme.text()), + ); + candidate.text.push_str(">"); + } else if lexeme.match_char('\\') { + candidate.text.push_str(lexeme.next().as_str()); + iterator.next(); + return true; + } else if PreFormat::probe(lexeme) { + // found end of block, push it and reset state + log!(VERBOSE, "Accepting preformat candidate {candidate}"); + tokens.push(Token::PreFormat(candidate.clone())); + state.context.block = Block::None; + *candidate = PreFormat::default(); + } else { + // anything else is pushed into the candidate preformat's text + candidate.text.push_str(&lexeme.text()); + } + }, + _ => { + panic!("PreFormat context parser called for {:?}", state.context) + }, + } + true +} diff --git a/src/syntax/content/parser/lexeme.rs b/src/syntax/content/parser/lexeme.rs index 495b27b..94b28d5 100644 --- a/src/syntax/content/parser/lexeme.rs +++ b/src/syntax/content/parser/lexeme.rs @@ -32,6 +32,8 @@ impl Lexeme { pub fn mutate_text(&mut self, new: &str) { self.text = new.to_string(); } + /// Returns an Option containing the character if the raw lexeme text + /// is composed of a single character, None if it has multiple characters. pub fn as_char(&self) -> Option { if self.text.chars().count() == 1 { self.text.chars().nth(0) @@ -56,6 +58,7 @@ impl Lexeme { } } + /// Returns true if the raw lexeme text is a single matching character. 
pub fn match_char(&self, c: char) -> bool { self.as_char().is_some_and(|as_char| as_char == c) } @@ -86,6 +89,8 @@ impl Lexeme { && self.match_third_char(c3) } + /// Returns true if the lexeme raw text is composed of a single character + /// and this character is in the provided slice. pub fn match_char_in(&self, slice: &[char]) -> bool { self.as_char().is_some_and(|c| slice.contains(&c)) } diff --git a/src/syntax/content/parser/lexer.rs b/src/syntax/content/parser/lexer.rs index e9a0d40..e8d3abc 100644 --- a/src/syntax/content/parser/lexer.rs +++ b/src/syntax/content/parser/lexer.rs @@ -38,7 +38,9 @@ pub(super) fn lex( let mut iterator = lexemes.iter().peekable(); while let Some(lexeme) = iterator.next() { - if lexeme.match_char('\\') { + if lexeme.match_char('\\') + && !matches!(state.context.block, context::Block::PreFormat) + { if let Some(next) = iterator.next() { tokens.push(Token::Literal(Literal::lex(next))); } diff --git a/src/syntax/content/parser/state.rs b/src/syntax/content/parser/state.rs index aa42f68..47498a3 100644 --- a/src/syntax/content/parser/state.rs +++ b/src/syntax/content/parser/state.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use crate::syntax::content::parser::{ Token, context::Context, - token::{Anchor, Item, List, Quote, Table}, + token::{Anchor, Item, List, PreFormat, Quote, Table}, }; #[derive(Clone, Default, Debug)] @@ -29,6 +29,7 @@ pub struct Buffers { pub list: ListBuffer, pub quote: QuoteBuffer, pub table: TableBuffer, + pub preformat: PreFormatBuffer, } #[derive(Default, Clone, Debug)] @@ -59,6 +60,11 @@ pub struct TableBuffer { pub in_header: bool, } +#[derive(Default, Clone, Debug)] +pub struct PreFormatBuffer { + pub candidate: PreFormat, +} + impl std::fmt::Display for AnchorBuffer { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let display_text = if self.text.is_empty() { diff --git a/src/syntax/content/parser/token/preformat.rs b/src/syntax/content/parser/token/preformat.rs index d950590..f24bbc4 
100644 --- a/src/syntax/content/parser/token/preformat.rs +++ b/src/syntax/content/parser/token/preformat.rs @@ -1,46 +1,42 @@ use crate::syntax::content::{Lexeme, Parseable}; -#[derive(Debug, Clone, Eq, PartialEq)] +#[derive(Debug, Default, Clone, Eq, PartialEq)] pub struct PreFormat { - open: Option, + pub text: String, } impl PreFormat { - pub const fn new(open: bool) -> PreFormat { PreFormat { open: Some(open) } } + pub fn new(text: &str) -> PreFormat { + PreFormat { + text: String::from(text), + } + } } impl std::fmt::Display for PreFormat { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let display_open_state = if let Some(open_state) = self.open { - if open_state { "open" } else { "closed" } + let character_count = self.text.chars().count(); + let is_whitespace = self.text.trim_ascii().is_empty(); + let summary = if is_whitespace { + "empty" } else { - "unknown" + &format!("{character_count} chars") }; - write!(f, "PreFormat [{display_open_state}]") + write!(f, "PreFormat [{summary}]") } } impl Parseable for PreFormat { fn probe(lexeme: &Lexeme) -> bool { - lexeme.match_first_char('`') && (lexeme.next() == "\n" || lexeme.last()) + lexeme.match_char('`') && (lexeme.next() == "\n" || lexeme.last()) } - fn lex(_lexeme: &Lexeme) -> PreFormat { PreFormat { open: None } } - - fn render(&self) -> String { - if let Some(o) = self.open { - if o { - "
".to_owned()
-            } else {
-                "
".to_owned() - } - } else { - panic!( - "Attempt to render a preformat tag while open state is unknown" - ) - } + fn lex(_lexeme: &Lexeme) -> PreFormat { + panic!("Attempt to lex a preformat directly from a lexeme") } + fn render(&self) -> String { format!("
{}
", self.text) } + fn flatten(&self) -> String { String::default() } } @@ -50,49 +46,39 @@ mod tests { use crate::syntax::content::parser::Token; #[test] + #[should_panic( + expected = "Attempt to lex a preformat directly from a lexeme" + )] fn lex() { - let from_empty_lexeme = PreFormat::lex(&Lexeme::default()); - assert!(from_empty_lexeme.open.is_none()); - - let from_non_empty_lexeme = PreFormat::lex(&Lexeme::default()); - assert!(from_non_empty_lexeme.open.is_none()); - } - - #[test] - #[should_panic(expected = "Attempt to render a preformat tag while \ - open state is unknown")] - fn render() { - let from_empty_lexeme = PreFormat::lex(&Lexeme::default()); - from_empty_lexeme.render(); - - let from_non_empty_lexeme = PreFormat::lex(&Lexeme::default()); - from_non_empty_lexeme.render(); + let lexeme = Lexeme::new("a", "b", "c"); + PreFormat::lex(&lexeme); } #[test] fn token_display() { - let mut preformat = PreFormat::new(true); + let mut preformat = PreFormat::new(""); + assert_eq!( format!("{}", Token::PreFormat(preformat.clone())), - "Tk:PreFormat [open]" + "Tk:PreFormat [empty]" ); - preformat.open = Some(false); + preformat.text = "\n ".to_string(); assert_eq!( format!("{}", Token::PreFormat(preformat.clone())), - "Tk:PreFormat [closed]" + "Tk:PreFormat [empty]" ); - preformat.open = None; + preformat.text = "text".to_string(); assert_eq!( format!("{}", Token::PreFormat(preformat)), - "Tk:PreFormat [unknown]" + "Tk:PreFormat [4 chars]" ); } #[test] fn flatten() { - let preformat = PreFormat::new(false); + let preformat = PreFormat::new(""); assert_eq!(preformat.flatten(), ""); let token = Token::PreFormat(preformat);