Handle several anchor edge cases

2026-01-03 16:02:11 -03:00 · 2026-01-03 16:02:11 -03:00 · 9f04a4606c
commit 9f04a4606c
parent f9bff6acab
3 changed files with 164 additions and 61 deletions
--- a/src/syntax/content/parser.rs
+++ b/src/syntax/content/parser.rs
@ -15,10 +15,12 @@ pub mod segment;
 pub mod context;

 const LEXMAP: LexMap = &[
-    (LineBreak::probe, |word| {
-        Token::LineBreak(LineBreak::lex(word))
+    (LineBreak::probe, |lexeme| {
+        Token::LineBreak(LineBreak::lex(lexeme))
+    }),
+    (Literal::probe, |lexeme| {
+        Token::Literal(Literal::lex(lexeme))
    }),
-    (Literal::probe, |word| Token::Literal(Literal::lex(word))),
 ];

 fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {
@ -49,6 +51,9 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {
                    tokens.push(Token::Header(header));
                    continue;
                } else if Paragraph::probe(lexeme) {
+                    log!(
+                        "Probed {lexeme:#?} from Block::None -> Block::Paragraph"
+                    );
                    state.context.block = Block::Paragraph;
                    tokens.push(Token::Paragraph(Paragraph::new(true)));
                }
@ -63,7 +68,12 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {
                continue;
            },
            Block::Paragraph => {
-                if lexeme.text() == "\n" {
+                if lexeme.text() == "\n"
+                    && matches!(state.context.inline, Inline::None)
+                {
+                    log!(
+                        "Probed {lexeme:#?} from Block::Paragraph -> Block::None"
+                    );
                    tokens.push(Token::Paragraph(Paragraph::new(false)));
                    state.context.block = Block::None;
                }
@ -122,12 +132,7 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {
                }
            },
            Inline::Anchor => {
-                if context::anchor::parse(
-                    lexeme,
-                    &mut iterator,
-                    &mut state,
-                    &mut tokens,
-                ) {
+                if context::anchor::parse(lexeme, &mut state, &mut tokens) {
                    continue;
                }
            },
@ -135,7 +140,9 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {

        for &(ref probe, lex) in map {
            if probe(lexeme) {
-                tokens.push(lex(lexeme));
+                let token = lex(lexeme);
+                log!("Lexmap lexed {lexeme:?} into {token:?}");
+                tokens.push(token);
                break;
            }
        }
@ -223,23 +230,31 @@ mod tests {
    }

    #[test]
-    fn force_flanking() {
+    fn flanking_with_trailing_comma() {
        assert_eq!(
-            read_noconfig("|Node||"),
+            read_noconfig("|Node|,"),
+            r#"<p><a href="/node/Node">Node</a>,</p>"#
+        );
+    }
+
+    #[test]
+    fn flanking_with_trailing_comma_and_space() {
+        assert_eq!(
+            read_noconfig("|Node|, at"),
+            r#"<p><a href="/node/Node">Node</a>, at</p>"#
+        );
+    }
+
+    #[test]
+    fn flanking_at_eoi() {
+        assert_eq!(
+            read_noconfig("|Node|"),
            r#"<p><a href="/node/Node">Node</a></p>"#
        );
    }

    #[test]
-    fn force_flanking_with_trailing_letter() {
-        assert_eq!(
-            read_noconfig("|Node||s"),
-            r#"<p><a href="/node/Node">Node</a>s</p>"#
-        );
-    }
-
-    #[test]
-    fn flanking_with_trailing_pipe() {
+    fn needless_three_pipe_anchor() {
        assert_eq!(
            read_noconfig("|Node|Destination|"),
            r#"<p><a href="/node/Destination">Node</a></p>"#
@ -278,6 +293,22 @@ mod tests {
        );
    }

+    #[test]
+    fn nonleading_plural_anchor_at_eoi() {
+        assert_eq!(
+            read_noconfig("element|s"),
+            r#"<p><a href="/node/element">elements</a></p>"#
+        );
+    }
+
+    #[test]
+    fn leading_plural_anchor_at_eoi() {
+        assert_eq!(
+            read_noconfig("|element|s"),
+            r#"<p><a href="/node/element">elements</a></p>"#
+        );
+    }
+
    #[test]
    fn http_external_anchor() {
        assert_eq!(
@ -289,26 +320,26 @@ mod tests {
    }

    #[test]
-    fn http_external_anchor_leading_no_third() {
+    fn http_external_anchor_leading_no_third_then_newline() {
        assert_eq!(
-            read_noconfig("|Rust toolchain|https://rustup.rs/ "),
-            r#"<p><a href="https://rustup.rs/">Rust toolchain</a> </p>"#
+            read_noconfig(concat!(
+                "|Rust toolchain|https://rustup.rs/",
+                "\n",
+                "at rustup.rs",
+            )),
+            concat!(
+                r#"<p><a href="https://rustup.rs/">Rust toolchain</a>"#,
+                "\n",
+                "at rustup.rs</p>",
+            )
        );
    }

    #[test]
-    fn http_external_anchor_leading_no_third_then_punctuation_then_space() {
+    fn http_external_anchor_leading_no_third_then_space() {
        assert_eq!(
-            read_noconfig("|Rust toolchain|https://rustup.rs/, "),
-            r#"<p><a href="https://rustup.rs/">Rust toolchain</a>, </p>"#
-        );
-    }
-
-    #[test]
-    fn http_external_anchor_leading_no_third_then_punctuation_then_eoi() {
-        assert_eq!(
-            read_noconfig("|Rust toolchain|https://rustup.rs/,"),
-            r#"<p><a href="https://rustup.rs/">Rust toolchain</a></p>"#
+            read_noconfig("|Rust toolchain|https://rustup.rs/ at rustup.rs"),
+            r#"<p><a href="https://rustup.rs/">Rust toolchain</a> at rustup.rs</p>"#
        );
    }

@ -321,13 +352,40 @@ mod tests {
    }

    #[test]
-    fn clear_anchor_buffer() {
+    fn newline_wrapped_anchor() {
        assert_eq!(
-            read_noconfig("|SomeAnchor|\n|SomeOtherAnchor|"),
+            read_noconfig("\n|SomeAnchor|\n"),
+            concat!(
+                "\n",
+                r#"<p><a href="/node/SomeAnchor">SomeAnchor</a></p>"#,
+                "\n"
+            ),
+        );
+    }
+
+    #[test]
+    fn newline_separated_anchors() {
+        assert_eq!(
+            read_noconfig("|SomeAnchor|\n|SomeOtherAnchor|\n"),
            concat!(
                r#"<p><a href="/node/SomeAnchor">SomeAnchor</a></p>"#,
                "\n",
-                r#"<p><a href="/node/SomeOtherAnchor">SomeOtherAnchor</a></p>"#
+                r#"<p><a href="/node/SomeOtherAnchor">SomeOtherAnchor</a></p>"#,
+                "\n"
+            )
+        );
+    }
+
+    #[test]
+    fn empty_line_separated_anchors() {
+        assert_eq!(
+            read_noconfig("|SomeAnchor|\n\n|SomeOtherAnchor|\n"),
+            concat!(
+                r#"<p><a href="/node/SomeAnchor">SomeAnchor</a></p>"#,
+                "\n",
+                "\n",
+                r#"<p><a href="/node/SomeOtherAnchor">SomeOtherAnchor</a></p>"#,
+                "\n",
            ),
        );
    }
--- a/src/syntax/content/parser/context/anchor.rs
+++ b/src/syntax/content/parser/context/anchor.rs
@ -1,5 +1,3 @@
-use std::{iter::Peekable, slice::Iter};
-
 use crate::{
    prelude::*,
    syntax::content::parser::{
@ -11,61 +9,89 @@ use crate::{
 ///
 /// This function is only called if the current inline context is Anchor.
 ///
-/// A return kind of true will trigger a continue in the outer parser,
+/// A return of `true` will trigger a continue in the outer parser,
 /// skipping any further parsing of the current lexeme.
 ///
 /// # Panics
 /// This function will panic if it can't determine the destination of an anchor.
 pub fn parse(
    lexeme: &Lexeme,
-    iterator: &mut Peekable<Iter<'_, Lexeme>>,
    state: &mut State,
    tokens: &mut Vec<Token>,
 ) -> bool {
-    log!("Resolving open context: {:#?}", state.clone().buffers.anchor);
+    log!(
+        "Resolving open context: {:#?}",
+        state.clone().buffers.anchor
+    );
    let buffer = &mut state.buffers.anchor;
    let candidate = &mut buffer.candidate;

    // This is only true if the anchor is leading, otherwise the outer parser
    // would already have set its text to the word before the first pipe
    if candidate.text.is_empty() {
-        log!("Seeking text at {:#?} -> {:#?}", lexeme.text(), lexeme.next());
+        log!(
+            "Seeking end of text at {:#?} -> {:#?}",
+            lexeme.text(),
+            lexeme.next()
+        );
        if lexeme.next() == "|" {
+            log!("End: Next lexeme is a pipe");
            buffer.text.push_str(&lexeme.text());
            candidate.text.clone_from(&buffer.text);
-            log!("End: {:#?}", lexeme.text());
-            return true;
        } else {
-            log!("Pushing non-terminal {:#?} into buffer {:#?}",
-                lexeme.text(), buffer.text);
+            log!(
+                "Pushing non-terminal {:#?} into buffer {:#?}",
+                lexeme.text(),
+                buffer.text
+            );
            buffer.text.push_str(&lexeme.text());
-            return true;
        }
+        return true;
    }

    if candidate.destination.is_none() {
+        log!(
+            "Seeking end of destination at {:#?} -> {:#?}",
+            lexeme.text(),
+            lexeme.next()
+        );

-        log!("Seeking destination at {:#?} -> {:#?}",
-            lexeme.text(), lexeme.next());
-
-        // Conditions to this decision tree should match the destination end
-        if lexeme.last(){
-            log!("End: no more input");
+        // Conditions in this decision tree should match the destination end
+        if lexeme.match_as_char('s')
+            && lexeme.is_next_boundary()
+            && !lexeme.match_next_as_char('|')
+        {
+            log!("End: Plural anchor");
            candidate.destination = Some(candidate.text.clone());
+            candidate.text.push('s');
+            if lexeme.last() {
+                tokens.push(Token::Anchor(candidate.clone()));
+                state.context.inline = Inline::None;
+            }
+            return true;
        } else if lexeme.match_as_char('|') && lexeme.is_next_boundary() {
-
+            log!("End: Pipe followed by boundary");
            if buffer.destination.is_empty() {
                candidate.destination = Some(candidate.text.clone());
            } else {
                candidate.destination = Some(buffer.destination.clone());
-                return true
            }
-
-        } else if lexeme.match_as_char('|') {
-            log!("Found a pipe, but no boundary: Destination likely follows");
+            tokens.push(Token::Anchor(candidate.clone()));
+            state.context.inline = Inline::None;
            return true;
-        } else if lexeme.is_punctuation() && lexeme.is_next_whitespace() {
-            log!("Found puncutation followed by whitespace");
+        } else if lexeme.match_as_char('|') && !candidate.balanced {
+            log!("Found a pipe, but no boundary: Destination likely follows");
+            candidate.balanced = true;
+            return true;
+        } else if lexeme.match_as_char('|') {
+            log!("End: Explicit end-of-destination pipe");
+            candidate.destination = Some(buffer.destination.clone());
+            return true;
+        } else if !candidate.external
+            && lexeme.is_punctuation()
+            && lexeme.is_next_whitespace()
+        {
+            log!("End: Punctuation followed by whitespace");
            candidate.destination = Some(buffer.destination.clone());
            tokens.push(Token::Anchor(candidate.clone()));
            state.context.inline = Inline::None;
@ -73,16 +99,28 @@ pub fn parse(
        } else if lexeme.is_whitespace() {
            log!("End: Whitespace");
            candidate.destination = Some(buffer.destination.clone());
+            tokens.push(Token::Anchor(candidate.clone()));
+            state.context.inline = Inline::None;
+            return false;

        // This else branch is the 'no end found yet' state and will keep
        // pushing lexemes into the buffer until an end is found above
        } else {
            log!(
                "Pushing non-terminal {:#?} into buffer {:#?}",
-                lexeme.text(), buffer.destination,
+                lexeme.text(),
+                buffer.destination,
            );
+            if lexeme.match_as_char(':') {
+                candidate.external = true;
+            }
            buffer.destination.push_str(&lexeme.text());
-            return true
+            if lexeme.last() {
+                candidate.destination = Some(buffer.destination.clone());
+                tokens.push(Token::Anchor(candidate.clone()));
+                state.context.inline = Inline::None;
+            }
+            return true;
        }
    }

@ -90,7 +128,8 @@ pub fn parse(
    // which would mean there is some case where the end of the destination
    // was never found and we kept filling the buffer endlessly,
    // causing the program to panic anyways when rendering anchors
-    assert!(candidate.destination.is_some(),
+    assert!(
+        candidate.destination.is_some(),
        "Anchor context parsing done but no destination found: {:#?}",
        state.buffers.anchor
    );
--- a/src/syntax/content/parser/lexeme.rs
+++ b/src/syntax/content/parser/lexeme.rs
@ -55,6 +55,10 @@ impl Lexeme {
        self.as_char().is_some_and(|as_char| as_char == c)
    }

+    pub fn match_next_as_char(&self, c: char) -> bool {
+        self.next_as_char().is_some_and(|next| next == c)
+    }
+
    pub fn is_punctuation(&self) -> bool {
        let punctuation = Delimiters::default().punctuation;
        self.as_char().is_some_and(|c| punctuation.contains(&c))
@ -80,7 +84,9 @@ impl Lexeme {

    pub fn is_next_boundary(&self) -> bool {
        let delimiters = Delimiters::default();
-        self.next_as_char()
+        self.last
+            || self
+                .next_as_char()
                .is_some_and(|c| delimiters.is_boundary(c))
    }