From d0ca4e6cb38c1544920b7bd6abea0bbf946e2b21 Mon Sep 17 00:00:00 2001
From: jutty <j@jutty.dev>
Date: Tue, 2 Jun 2026 16:00:40 -0300
Subject: [PATCH] Add a context parser for PreFormat blocks

---
 Cargo.lock                                    | 10 +--
 Cargo.toml                                    |  2 +-
 src/syntax/content/parser/context.rs          | 31 ++++----
 src/syntax/content/parser/context/block.rs    | 44 ++++-------
 .../content/parser/context/preformat.rs       | 61 +++++++++++++++
 src/syntax/content/parser/lexeme.rs           |  5 ++
 src/syntax/content/parser/lexer.rs            |  4 +-
 src/syntax/content/parser/state.rs            |  8 +-
 src/syntax/content/parser/token/preformat.rs  | 76 ++++++++-----------
 9 files changed, 146 insertions(+), 95 deletions(-)
 create mode 100644 src/syntax/content/parser/context/preformat.rs
diff --git a/Cargo.lock b/Cargo.lock
index 21ada32..2d09c10 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -86,9 +86,9 @@ dependencies = [
 
 [[package]]
 name = "bitflags"
-version = "2.11.1"
+version = "2.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
+checksum = "84d7ced0ae9557296835c32bf1b1e02b44c746701f898460fb000d7eaa84f00a"
 
 [[package]]
 name = "block-buffer"
@@ -238,7 +238,7 @@ dependencies = [
 
 [[package]]
 name = "en"
-version = "0.4.0-alpha"
+version = "0.4.1-alpha"
 dependencies = [
  "axum",
  "serde",
@@ -531,9 +531,9 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
 
 [[package]]
 name = "log"
-version = "0.4.30"
+version = "0.4.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "616ec5685824bcc94416c6d4a7a446eea774a31efd7062c8480ba6fd06d7a6e5"
+checksum = "113b30b4cd05f7c06868fdb2854f66a7b9fece9a48425351cd532e810d74024f"
 
 [[package]]
 name = "matchit"
diff --git a/Cargo.toml b/Cargo.toml
index 54a54a0..130e36b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "en"
-version = "0.4.0-alpha"
+version = "0.4.1-alpha"
 description = "A non-linear writing instrument."
 license = "AGPL-3.0-only"
 
diff --git a/src/syntax/content/parser/context.rs b/src/syntax/content/parser/context.rs
index e5010f6..e1379a5 100644
--- a/src/syntax/content/parser/context.rs
+++ b/src/syntax/content/parser/context.rs
@@ -1,12 +1,13 @@
 use crate::syntax::content::parser::{
     State, Token,
-    token::{Header, Paragraph, PreFormat, Verse},
+    token::{Header, Paragraph, Verse},
 };
 
 pub mod anchor;
 pub mod block;
 pub mod inline;
 pub mod list;
+pub mod preformat;
 pub mod quote;
 pub mod table;
 
@@ -38,30 +39,32 @@ pub enum Inline {
 }
 
 /// # Panics
-/// Panics if there is an open header or list at end of input.
+/// Panics if there is an open token at end of input that can't be closed by
+/// simply adding a matching closing token. Such cases are normally handled
+/// by context parsers, so a panic here likely indicates a bug in one of them.
 pub fn close(state: &State, tokens: &mut Vec<Token>) {
     match state.context.block {
-        Block::PreFormat => {
-            tokens.push(Token::PreFormat(PreFormat::new(false)));
-        },
         Block::Paragraph => {
             tokens.push(Token::Paragraph(Paragraph::new(false)));
         },
-        Block::List => {
-            panic!("End of input with open list")
-        },
         Block::Header(level) => {
             tokens.push(Token::Header(Header::from_u8(level, false, None)));
         },
-        Block::Quote => {
-            panic!("End of input with open quote")
-        },
-        Block::Table => {
-            panic!("End of input with open table")
-        },
         Block::Verse => {
             tokens.push(Token::Verse(Verse::new(false)));
         },
+        Block::PreFormat => {
+            panic!("End of input with open preformat: {tokens:#?}")
+        },
+        Block::List => {
+            panic!("End of input with open list: {tokens:#?}")
+        },
+        Block::Quote => {
+            panic!("End of input with open quote: {tokens:#?}")
+        },
+        Block::Table => {
+            panic!("End of input with open table: {tokens:#?}")
+        },
         Block::None => (),
     }
 }
diff --git a/src/syntax/content/parser/context/block.rs b/src/syntax/content/parser/context/block.rs
index 3b34724..8048896 100644
--- a/src/syntax/content/parser/context/block.rs
+++ b/src/syntax/content/parser/context/block.rs
@@ -6,15 +6,17 @@ use crate::{
     syntax::content::{
         Parseable as _,
         parser::{
-            Block, Lexeme, State, Token,
+            Block, Lexeme, State, Token, context,
             token::{
-                Header, LineBreak, List, Literal, Paragraph, PreFormat, Quote,
-                Table, Verse,
+                Header, LineBreak, List, Paragraph, PreFormat, Quote, Table,
+                Verse,
             },
         },
     },
 };
 
+/// A return of `true` will trigger a `continue` in the outer parser,
+/// skipping any further parsing of the current lexeme.
 pub fn parse(
     lexeme: &Lexeme,
     state: &mut State,
@@ -27,8 +29,7 @@ pub fn parse(
             if PreFormat::probe(lexeme) {
                 log!(VERBOSE, "Block Context: None -> PreFormat on {lexeme}");
                 state.context.block = Block::PreFormat;
-                tokens.push(Token::PreFormat(PreFormat::new(true)));
-                return true;
+                return true
             } else if Header::probe(lexeme) {
                 let mut header = Header::lex(lexeme);
                 header.dom_id = Some(Header::make_id(
@@ -44,7 +45,7 @@ pub fn parse(
                 log!(VERBOSE, "Block Context: None -> List on {lexeme}");
                 state.context.block = Block::List;
                 state.buffers.list.candidate.ordered = lexeme.match_char('+');
-                return super::list::parse(
+                return context::list::parse(
                     lexeme, state, tokens, iterator, graph,
                 );
             } else if Quote::probe(lexeme) {
@@ -71,14 +72,7 @@ pub fn parse(
             }
         },
         Block::PreFormat => {
-            if PreFormat::probe(lexeme) {
-                tokens.push(Token::PreFormat(PreFormat::new(false)));
-                log!(VERBOSE, "Block Context: PreFormat -> None on {lexeme}");
-                state.context.block = Block::None;
-            } else {
-                tokens.push(Token::Literal(Literal::lex(lexeme)));
-            }
-            return true;
+            return context::preformat::parse(lexeme, state, tokens, iterator);
         },
         Block::Paragraph => {
             if Paragraph::probe_end(lexeme) {
@@ -95,13 +89,17 @@ pub fn parse(
             }
         },
         Block::List => {
-            return super::list::parse(lexeme, state, tokens, iterator, graph);
+            return context::list::parse(lexeme, state, tokens, iterator, graph);
         },
         Block::Quote => {
-            return super::quote::parse(lexeme, state, tokens, iterator, graph);
+            return context::quote::parse(
+                lexeme, state, tokens, iterator, graph,
+            );
         },
         Block::Table => {
-            return super::table::parse(lexeme, state, tokens, iterator, graph);
+            return context::table::parse(
+                lexeme, state, tokens, iterator, graph,
+            );
         },
         Block::Verse => {
             if Verse::probe_end(lexeme) {
@@ -127,7 +125,7 @@ mod tests {
         graph::Graph,
         syntax::content::parser::{
             self, Block, State, Token, context,
-            token::{Header, PreFormat, header::Level},
+            token::{Header, header::Level},
         },
     };
 
@@ -161,16 +159,6 @@ mod tests {
         assert_eq!(vec, vec![Token::Header(Header::from_u8(1, false, None))]);
     }
 
-    #[test]
-    fn end_with_open_preformat() {
-        let mut state = State::default();
-        state.context.block = Block::PreFormat;
-
-        let mut vec: Vec<Token> = vec![];
-        context::close(&state, &mut vec);
-        assert_eq!(vec, vec![Token::PreFormat(PreFormat::new(false))]);
-    }
-
     #[test]
     fn truncated_header_level() {
         let u: usize = 999;
diff --git a/src/syntax/content/parser/context/preformat.rs b/src/syntax/content/parser/context/preformat.rs
new file mode 100644
index 0000000..ed6763f
--- /dev/null
+++ b/src/syntax/content/parser/context/preformat.rs
@@ -0,0 +1,61 @@
+use std::{iter::Peekable, slice::Iter};
+
+use crate::{
+    prelude::*,
+    syntax::content::{
+        Parseable as _,
+        parser::{Lexeme, State, Token, context::Block, token::PreFormat},
+    },
+};
+
+/// Handles open `PreFormat` contexts until a block is fully parsed.
+///
+/// A return of `true` will trigger a continue in the outer parser,
+/// skipping any further parsing of the current lexeme.
+///
+/// # Panics
+/// This parser can handle only the PreFormat context, and will panic if passed
+/// an unrelated context since it has no knowledge of how to handle them.
+pub fn parse(
+    lexeme: &Lexeme,
+    state: &mut State,
+    tokens: &mut Vec<Token>,
+    iterator: &mut Peekable<Iter<'_, Lexeme>>,
+) -> bool {
+    let buffer = &mut state.buffers.preformat;
+    let candidate = &mut buffer.candidate;
+
+    #[expect(clippy::wildcard_enum_match_arm)]
+    match state.context.block {
+        Block::PreFormat => {
+            if lexeme.match_first_char('<') {
+                candidate.text.push_str("&lt;");
+                candidate.text.push_str(
+                    lexeme.text().strip_prefix('<').unwrap_or(&lexeme.text()),
+                );
+            } else if lexeme.match_last_char('>') {
+                candidate.text.push_str(
+                    lexeme.text().strip_suffix('>').unwrap_or(&lexeme.text()),
+                );
+                candidate.text.push_str("&gt;");
+            } else if lexeme.match_char('\\') {
+                candidate.text.push_str(lexeme.next().as_str());
+                iterator.next();
+                return true;
+            } else if PreFormat::probe(lexeme) {
+                // found end of block, push it and reset state
+                log!(VERBOSE, "Accepting preformat candidate {candidate}");
+                tokens.push(Token::PreFormat(candidate.clone()));
+                state.context.block = Block::None;
+                *candidate = PreFormat::default();
+            } else {
+                // anything else is pushed into the candidate preformat's text
+                candidate.text.push_str(&lexeme.text());
+            }
+        },
+        _ => {
+            panic!("PreFormat context parser called for {:?}", state.context)
+        },
+    }
+    true
+}
diff --git a/src/syntax/content/parser/lexeme.rs b/src/syntax/content/parser/lexeme.rs
index 495b27b..94b28d5 100644
--- a/src/syntax/content/parser/lexeme.rs
+++ b/src/syntax/content/parser/lexeme.rs
@@ -32,6 +32,8 @@ impl Lexeme {
 
     pub fn mutate_text(&mut self, new: &str) { self.text = new.to_string(); }
 
+    /// Returns `Some` with the character if the raw lexeme text is composed
+    /// of exactly one character, `None` if it is empty or has several.
     pub fn as_char(&self) -> Option<char> {
         if self.text.chars().count() == 1 {
             self.text.chars().nth(0)
@@ -56,6 +58,7 @@ impl Lexeme {
         }
     }
 
+    /// Returns true if the raw lexeme text is a single matching character.
     pub fn match_char(&self, c: char) -> bool {
         self.as_char().is_some_and(|as_char| as_char == c)
     }
@@ -86,6 +89,8 @@ impl Lexeme {
             && self.match_third_char(c3)
     }
 
+    /// Returns true if the lexeme raw text is composed of a single character
+    /// and this character is in the provided slice.
     pub fn match_char_in(&self, slice: &[char]) -> bool {
         self.as_char().is_some_and(|c| slice.contains(&c))
     }
diff --git a/src/syntax/content/parser/lexer.rs b/src/syntax/content/parser/lexer.rs
index e9a0d40..e8d3abc 100644
--- a/src/syntax/content/parser/lexer.rs
+++ b/src/syntax/content/parser/lexer.rs
@@ -38,7 +38,9 @@ pub(super) fn lex(
 
     let mut iterator = lexemes.iter().peekable();
     while let Some(lexeme) = iterator.next() {
-        if lexeme.match_char('\\') {
+        if lexeme.match_char('\\')
+            && !matches!(state.context.block, context::Block::PreFormat)
+        {
             if let Some(next) = iterator.next() {
                 tokens.push(Token::Literal(Literal::lex(next)));
             }
diff --git a/src/syntax/content/parser/state.rs b/src/syntax/content/parser/state.rs
index aa42f68..47498a3 100644
--- a/src/syntax/content/parser/state.rs
+++ b/src/syntax/content/parser/state.rs
@@ -3,7 +3,7 @@ use std::collections::HashMap;
 use crate::syntax::content::parser::{
     Token,
     context::Context,
-    token::{Anchor, Item, List, Quote, Table},
+    token::{Anchor, Item, List, PreFormat, Quote, Table},
 };
 
 #[derive(Clone, Default, Debug)]
@@ -29,6 +29,7 @@ pub struct Buffers {
     pub list: ListBuffer,
     pub quote: QuoteBuffer,
     pub table: TableBuffer,
+    pub preformat: PreFormatBuffer,
 }
 
 #[derive(Default, Clone, Debug)]
@@ -59,6 +60,11 @@ pub struct TableBuffer {
     pub in_header: bool,
 }
 
+#[derive(Default, Clone, Debug)]
+pub struct PreFormatBuffer {
+    pub candidate: PreFormat,
+}
+
 impl std::fmt::Display for AnchorBuffer {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
         let display_text = if self.text.is_empty() {
diff --git a/src/syntax/content/parser/token/preformat.rs b/src/syntax/content/parser/token/preformat.rs
index d950590..f24bbc4 100644
--- a/src/syntax/content/parser/token/preformat.rs
+++ b/src/syntax/content/parser/token/preformat.rs
@@ -1,46 +1,42 @@
 use crate::syntax::content::{Lexeme, Parseable};
 
-#[derive(Debug, Clone, Eq, PartialEq)]
+#[derive(Debug, Default, Clone, Eq, PartialEq)]
 pub struct PreFormat {
-    open: Option<bool>,
+    pub text: String,
 }
 
 impl PreFormat {
-    pub const fn new(open: bool) -> PreFormat { PreFormat { open: Some(open) } }
+    pub fn new(text: &str) -> PreFormat {
+        PreFormat {
+            text: String::from(text),
+        }
+    }
 }
 
 impl std::fmt::Display for PreFormat {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        let display_open_state = if let Some(open_state) = self.open {
-            if open_state { "open" } else { "closed" }
+        let character_count = self.text.chars().count();
+        let is_whitespace = self.text.trim_ascii().is_empty();
+        let summary = if is_whitespace {
+            "empty"
         } else {
-            "unknown"
+            &format!("{character_count} chars")
         };
-        write!(f, "PreFormat [{display_open_state}]")
+        write!(f, "PreFormat [{summary}]")
     }
 }
 
 impl Parseable for PreFormat {
     fn probe(lexeme: &Lexeme) -> bool {
-        lexeme.match_first_char('`') && (lexeme.next() == "\n" || lexeme.last())
+        lexeme.match_char('`') && (lexeme.next() == "\n" || lexeme.last())
     }
 
-    fn lex(_lexeme: &Lexeme) -> PreFormat { PreFormat { open: None } }
-
-    fn render(&self) -> String {
-        if let Some(o) = self.open {
-            if o {
-                "<pre>".to_owned()
-            } else {
-                "</pre>".to_owned()
-            }
-        } else {
-            panic!(
-                "Attempt to render a preformat tag while open state is unknown"
-            )
-        }
+    fn lex(_lexeme: &Lexeme) -> PreFormat {
+        panic!("Attempt to lex a preformat directly from a lexeme")
     }
 
+    fn render(&self) -> String { format!("<pre>{}</pre>", self.text) }
+
     fn flatten(&self) -> String { String::default() }
 }
 
@@ -50,49 +46,39 @@ mod tests {
     use crate::syntax::content::parser::Token;
 
     #[test]
+    #[should_panic(
+        expected = "Attempt to lex a preformat directly from a lexeme"
+    )]
     fn lex() {
-        let from_empty_lexeme = PreFormat::lex(&Lexeme::default());
-        assert!(from_empty_lexeme.open.is_none());
-
-        let from_non_empty_lexeme = PreFormat::lex(&Lexeme::default());
-        assert!(from_non_empty_lexeme.open.is_none());
-    }
-
-    #[test]
-    #[should_panic(expected = "Attempt to render a preformat tag while \
-            open state is unknown")]
-    fn render() {
-        let from_empty_lexeme = PreFormat::lex(&Lexeme::default());
-        from_empty_lexeme.render();
-
-        let from_non_empty_lexeme = PreFormat::lex(&Lexeme::default());
-        from_non_empty_lexeme.render();
+        let lexeme = Lexeme::new("a", "b", "c");
+        PreFormat::lex(&lexeme);
     }
 
     #[test]
     fn token_display() {
-        let mut preformat = PreFormat::new(true);
+        let mut preformat = PreFormat::new("");
+
         assert_eq!(
             format!("{}", Token::PreFormat(preformat.clone())),
-            "Tk:PreFormat [open]"
+            "Tk:PreFormat [empty]"
         );
 
-        preformat.open = Some(false);
+        preformat.text = "\n ".to_string();
         assert_eq!(
             format!("{}", Token::PreFormat(preformat.clone())),
-            "Tk:PreFormat [closed]"
+            "Tk:PreFormat [empty]"
         );
 
-        preformat.open = None;
+        preformat.text = "text".to_string();
         assert_eq!(
             format!("{}", Token::PreFormat(preformat)),
-            "Tk:PreFormat [unknown]"
+            "Tk:PreFormat [4 chars]"
         );
     }
 
     #[test]
     fn flatten() {
-        let preformat = PreFormat::new(false);
+        let preformat = PreFormat::new("");
         assert_eq!(preformat.flatten(), "");
 
         let token = Token::PreFormat(preformat);