Add a context parser for PreFormat blocks

2026-06-02 16:00:40 -03:00 · 2026-06-02 16:00:40 -03:00 · d0ca4e6cb3
commit d0ca4e6cb3
parent 29c2beb3ed
9 changed files with 145 additions and 94 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -86,9 +86,9 @@ dependencies = [

 [[package]]
 name = "bitflags"
-version = "2.11.1"
+version = "2.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
+checksum = "84d7ced0ae9557296835c32bf1b1e02b44c746701f898460fb000d7eaa84f00a"

 [[package]]
 name = "block-buffer"
@ -238,7 +238,7 @@ dependencies = [

 [[package]]
 name = "en"
-version = "0.4.0-alpha"
+version = "0.4.1-alpha"
 dependencies = [
 "axum",
 "serde",
@ -531,9 +531,9 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"

 [[package]]
 name = "log"
-version = "0.4.30"
+version = "0.4.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "616ec5685824bcc94416c6d4a7a446eea774a31efd7062c8480ba6fd06d7a6e5"
+checksum = "113b30b4cd05f7c06868fdb2854f66a7b9fece9a48425351cd532e810d74024f"

 [[package]]
 name = "matchit"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "en"
-version = "0.4.0-alpha"
+version = "0.4.1-alpha"
 description = "A non-linear writing instrument."
 license = "AGPL-3.0-only"

--- a/src/syntax/content/parser/context.rs
+++ b/src/syntax/content/parser/context.rs
@ -1,12 +1,13 @@
 use crate::syntax::content::parser::{
    State, Token,
-    token::{Header, Paragraph, PreFormat, Verse},
+    token::{Header, Paragraph, Verse},
 };

 pub mod anchor;
 pub mod block;
 pub mod inline;
 pub mod list;
+pub mod preformat;
 pub mod quote;
 pub mod table;

@ -38,30 +39,32 @@ pub enum Inline {
 }

 /// # Panics
-/// Panics if there is an open header or list at end of input.
+/// Panics if there is an open token at end of input that can't be closed by
+/// simply adding a matching closing token. Such tokens are normally closed by
+/// their context parsers, so this probably indicates an error in one of them.
 pub fn close(state: &State, tokens: &mut Vec<Token>) {
    match state.context.block {
-        Block::PreFormat => {
-            tokens.push(Token::PreFormat(PreFormat::new(false)));
-        },
        Block::Paragraph => {
            tokens.push(Token::Paragraph(Paragraph::new(false)));
        },
-        Block::List => {
-            panic!("End of input with open list")
-        },
        Block::Header(level) => {
            tokens.push(Token::Header(Header::from_u8(level, false, None)));
        },
-        Block::Quote => {
-            panic!("End of input with open quote")
-        },
-        Block::Table => {
-            panic!("End of input with open table")
-        },
        Block::Verse => {
            tokens.push(Token::Verse(Verse::new(false)));
        },
+        Block::PreFormat => {
+            panic!("End of input with open preformat: {tokens:#?}")
+        },
+        Block::List => {
+            panic!("End of input with open list: {tokens:#?}")
+        },
+        Block::Quote => {
+            panic!("End of input with open quote: {tokens:#?}")
+        },
+        Block::Table => {
+            panic!("End of input with open table: {tokens:#?}")
+        },
        Block::None => (),
    }
 }
--- a/src/syntax/content/parser/context/block.rs
+++ b/src/syntax/content/parser/context/block.rs
@ -6,15 +6,17 @@ use crate::{
    syntax::content::{
        Parseable as _,
        parser::{
-            Block, Lexeme, State, Token,
+            Block, Lexeme, State, Token, context,
            token::{
-                Header, LineBreak, List, Literal, Paragraph, PreFormat, Quote,
-                Table, Verse,
+                Header, LineBreak, List, Paragraph, PreFormat, Quote, Table,
+                Verse,
            },
        },
    },
 };

+/// Returning `true` makes the outer parser `continue`, skipping any further
+/// parsing of the current lexeme.
 pub fn parse(
    lexeme: &Lexeme,
    state: &mut State,
@ -27,8 +29,7 @@ pub fn parse(
            if PreFormat::probe(lexeme) {
                log!(VERBOSE, "Block Context: None -> PreFormat on {lexeme}");
                state.context.block = Block::PreFormat;
-                tokens.push(Token::PreFormat(PreFormat::new(true)));
-                return true;
+                return true
            } else if Header::probe(lexeme) {
                let mut header = Header::lex(lexeme);
                header.dom_id = Some(Header::make_id(
@ -44,7 +45,7 @@ pub fn parse(
                log!(VERBOSE, "Block Context: None -> List on {lexeme}");
                state.context.block = Block::List;
                state.buffers.list.candidate.ordered = lexeme.match_char('+');
-                return super::list::parse(
+                return context::list::parse(
                    lexeme, state, tokens, iterator, graph,
                );
            } else if Quote::probe(lexeme) {
@ -71,14 +72,7 @@ pub fn parse(
            }
        },
        Block::PreFormat => {
-            if PreFormat::probe(lexeme) {
-                tokens.push(Token::PreFormat(PreFormat::new(false)));
-                log!(VERBOSE, "Block Context: PreFormat -> None on {lexeme}");
-                state.context.block = Block::None;
-            } else {
-                tokens.push(Token::Literal(Literal::lex(lexeme)));
-            }
-            return true;
+            return context::preformat::parse(lexeme, state, tokens, iterator);
        },
        Block::Paragraph => {
            if Paragraph::probe_end(lexeme) {
@ -95,13 +89,17 @@ pub fn parse(
            }
        },
        Block::List => {
-            return super::list::parse(lexeme, state, tokens, iterator, graph);
+            return context::list::parse(lexeme, state, tokens, iterator, graph);
        },
        Block::Quote => {
-            return super::quote::parse(lexeme, state, tokens, iterator, graph);
+            return context::quote::parse(
+                lexeme, state, tokens, iterator, graph,
+            );
        },
        Block::Table => {
-            return super::table::parse(lexeme, state, tokens, iterator, graph);
+            return context::table::parse(
+                lexeme, state, tokens, iterator, graph,
+            );
        },
        Block::Verse => {
            if Verse::probe_end(lexeme) {
@ -127,7 +125,7 @@ mod tests {
        graph::Graph,
        syntax::content::parser::{
            self, Block, State, Token, context,
-            token::{Header, PreFormat, header::Level},
+            token::{Header, header::Level},
        },
    };

@ -161,16 +159,6 @@ mod tests {
        assert_eq!(vec, vec![Token::Header(Header::from_u8(1, false, None))]);
    }

-    #[test]
-    fn end_with_open_preformat() {
-        let mut state = State::default();
-        state.context.block = Block::PreFormat;
-
-        let mut vec: Vec<Token> = vec![];
-        context::close(&state, &mut vec);
-        assert_eq!(vec, vec![Token::PreFormat(PreFormat::new(false))]);
-    }
-
    #[test]
    fn truncated_header_level() {
        let u: usize = 999;
--- a/src/syntax/content/parser/context/preformat.rs
+++ b/src/syntax/content/parser/context/preformat.rs
@ -0,0 +1,61 @@
+use std::{iter::Peekable, slice::Iter};
+
+use crate::{
+    prelude::*,
+    syntax::content::{
+        Parseable as _,
+        parser::{Lexeme, State, Token, context::Block, token::PreFormat},
+    },
+};
+
+/// Handles open `PreFormat` contexts until a block is fully parsed.
+///
+/// A return of `true` will trigger a continue in the outer parser,
+/// skipping any further parsing of the current lexeme.
+///
+/// # Panics
+/// This parser can handle only the `PreFormat` context, and will panic if
+/// passed an unrelated context since it has no knowledge of how to handle it.
+pub fn parse(
+    lexeme: &Lexeme,
+    state: &mut State,
+    tokens: &mut Vec<Token>,
+    iterator: &mut Peekable<Iter<'_, Lexeme>>,
+) -> bool {
+    let buffer = &mut state.buffers.preformat;
+    let candidate = &mut buffer.candidate;
+
+    #[expect(clippy::wildcard_enum_match_arm)]
+    match state.context.block {
+        Block::PreFormat => {
+            if lexeme.match_first_char('<') {
+                candidate.text.push_str("&lt;");
+                candidate.text.push_str(
+                    lexeme.text().strip_prefix('<').unwrap_or(&lexeme.text()),
+                );
+            } else if lexeme.match_last_char('>') {
+                candidate.text.push_str(
+                    lexeme.text().strip_suffix('>').unwrap_or(&lexeme.text()),
+                );
+                candidate.text.push_str("&gt;");
+            } else if lexeme.match_char('\\') {
+                candidate.text.push_str(lexeme.next().as_str());
+                iterator.next();
+                return true;
+            } else if PreFormat::probe(lexeme) {
+                // found end of block, push it and reset state
+                log!(VERBOSE, "Accepting preformat candidate {candidate}");
+                tokens.push(Token::PreFormat(candidate.clone()));
+                state.context.block = Block::None;
+                *candidate = PreFormat::default();
+            } else {
+                // anything else is pushed into the candidate preformat's text
+                candidate.text.push_str(&lexeme.text());
+            }
+        },
+        _ => {
+            panic!("PreFormat context parser called for {:?}", state.context)
+        },
+    }
+    true
+}
--- a/src/syntax/content/parser/lexeme.rs
+++ b/src/syntax/content/parser/lexeme.rs
@ -32,6 +32,8 @@ impl Lexeme {

    pub fn mutate_text(&mut self, new: &str) { self.text = new.to_string(); }

+    /// Returns `Some` with the character if the raw lexeme text is composed of
+    /// exactly one character, `None` otherwise.
    pub fn as_char(&self) -> Option<char> {
        if self.text.chars().count() == 1 {
            self.text.chars().nth(0)
@ -56,6 +58,7 @@ impl Lexeme {
        }
    }

+    /// Returns true if the raw lexeme text is a single matching character.
    pub fn match_char(&self, c: char) -> bool {
        self.as_char().is_some_and(|as_char| as_char == c)
    }
@ -86,6 +89,8 @@ impl Lexeme {
            && self.match_third_char(c3)
    }

+    /// Returns true if the raw lexeme text is composed of a single character
+    /// and this character is in the provided slice.
    pub fn match_char_in(&self, slice: &[char]) -> bool {
        self.as_char().is_some_and(|c| slice.contains(&c))
    }
--- a/src/syntax/content/parser/lexer.rs
+++ b/src/syntax/content/parser/lexer.rs
@ -38,7 +38,9 @@ pub(super) fn lex(

    let mut iterator = lexemes.iter().peekable();
    while let Some(lexeme) = iterator.next() {
-        if lexeme.match_char('\\') {
+        if lexeme.match_char('\\')
+            && !matches!(state.context.block, context::Block::PreFormat)
+        {
            if let Some(next) = iterator.next() {
                tokens.push(Token::Literal(Literal::lex(next)));
            }
--- a/src/syntax/content/parser/state.rs
+++ b/src/syntax/content/parser/state.rs
@ -3,7 +3,7 @@ use std::collections::HashMap;
 use crate::syntax::content::parser::{
    Token,
    context::Context,
-    token::{Anchor, Item, List, Quote, Table},
+    token::{Anchor, Item, List, PreFormat, Quote, Table},
 };

 #[derive(Clone, Default, Debug)]
@ -29,6 +29,7 @@ pub struct Buffers {
    pub list: ListBuffer,
    pub quote: QuoteBuffer,
    pub table: TableBuffer,
+    pub preformat: PreFormatBuffer,
 }

 #[derive(Default, Clone, Debug)]
@ -59,6 +60,11 @@ pub struct TableBuffer {
    pub in_header: bool,
 }

+#[derive(Default, Clone, Debug)]
+pub struct PreFormatBuffer {
+    pub candidate: PreFormat,
+}
+
 impl std::fmt::Display for AnchorBuffer {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let display_text = if self.text.is_empty() {
--- a/src/syntax/content/parser/token/preformat.rs
+++ b/src/syntax/content/parser/token/preformat.rs
@ -1,46 +1,42 @@
 use crate::syntax::content::{Lexeme, Parseable};

-#[derive(Debug, Clone, Eq, PartialEq)]
+#[derive(Debug, Default, Clone, Eq, PartialEq)]
 pub struct PreFormat {
-    open: Option<bool>,
+    pub text: String,
 }

 impl PreFormat {
-    pub const fn new(open: bool) -> PreFormat { PreFormat { open: Some(open) } }
+    pub fn new(text: &str) -> PreFormat {
+        PreFormat {
+            text: String::from(text),
+        }
+    }
 }

 impl std::fmt::Display for PreFormat {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        let display_open_state = if let Some(open_state) = self.open {
-            if open_state { "open" } else { "closed" }
+        let character_count = self.text.chars().count();
+        let is_whitespace = self.text.trim_ascii().is_empty();
+        let summary = if is_whitespace {
+            "empty"
        } else {
-            "unknown"
+            &format!("{character_count} chars")
        };
-        write!(f, "PreFormat [{display_open_state}]")
+        write!(f, "PreFormat [{summary}]")
    }
 }

 impl Parseable for PreFormat {
    fn probe(lexeme: &Lexeme) -> bool {
-        lexeme.match_first_char('`') && (lexeme.next() == "\n" || lexeme.last())
+        lexeme.match_char('`') && (lexeme.next() == "\n" || lexeme.last())
    }

-    fn lex(_lexeme: &Lexeme) -> PreFormat { PreFormat { open: None } }
-
-    fn render(&self) -> String {
-        if let Some(o) = self.open {
-            if o {
-                "<pre>".to_owned()
-            } else {
-                "</pre>".to_owned()
-            }
-        } else {
-            panic!(
-                "Attempt to render a preformat tag while open state is unknown"
-            )
-        }
+    fn lex(_lexeme: &Lexeme) -> PreFormat {
+        panic!("Attempt to lex a preformat directly from a lexeme")
    }

+    fn render(&self) -> String { format!("<pre>{}</pre>", self.text) }
+
    fn flatten(&self) -> String { String::default() }
 }

@ -50,49 +46,39 @@ mod tests {
    use crate::syntax::content::parser::Token;

    #[test]
+    #[should_panic(
+        expected = "Attempt to lex a preformat directly from a lexeme"
+    )]
    fn lex() {
-        let from_empty_lexeme = PreFormat::lex(&Lexeme::default());
-        assert!(from_empty_lexeme.open.is_none());
-
-        let from_non_empty_lexeme = PreFormat::lex(&Lexeme::default());
-        assert!(from_non_empty_lexeme.open.is_none());
-    }
-
-    #[test]
-    #[should_panic(expected = "Attempt to render a preformat tag while \
-            open state is unknown")]
-    fn render() {
-        let from_empty_lexeme = PreFormat::lex(&Lexeme::default());
-        from_empty_lexeme.render();
-
-        let from_non_empty_lexeme = PreFormat::lex(&Lexeme::default());
-        from_non_empty_lexeme.render();
+        let lexeme = Lexeme::new("a", "b", "c");
+        PreFormat::lex(&lexeme);
    }

    #[test]
    fn token_display() {
-        let mut preformat = PreFormat::new(true);
+        let mut preformat = PreFormat::new("");
+
        assert_eq!(
            format!("{}", Token::PreFormat(preformat.clone())),
-            "Tk:PreFormat [open]"
+            "Tk:PreFormat [empty]"
        );

-        preformat.open = Some(false);
+        preformat.text = "\n ".to_string();
        assert_eq!(
            format!("{}", Token::PreFormat(preformat.clone())),
-            "Tk:PreFormat [closed]"
+            "Tk:PreFormat [empty]"
        );

-        preformat.open = None;
+        preformat.text = "text".to_string();
        assert_eq!(
            format!("{}", Token::PreFormat(preformat)),
-            "Tk:PreFormat [unknown]"
+            "Tk:PreFormat [4 chars]"
        );
    }

    #[test]
    fn flatten() {
-        let preformat = PreFormat::new(false);
+        let preformat = PreFormat::new("");
        assert_eq!(preformat.flatten(), "");

        let token = Token::PreFormat(preformat);