From cbefcdcad72052b92d51c0130951f63b713a68c0 Mon Sep 17 00:00:00 2001 From: jutty Date: Fri, 2 Jan 2026 21:25:41 -0300 Subject: [PATCH] Simplify Anchor context parser --- src/syntax/content/parser.rs | 21 +++- src/syntax/content/parser/context/anchor.rs | 114 ++++++++++++-------- src/syntax/content/parser/token/anchor.rs | 5 +- 3 files changed, 90 insertions(+), 50 deletions(-) diff --git a/src/syntax/content/parser.rs b/src/syntax/content/parser.rs index 5bc1639..dbc0e98 100644 --- a/src/syntax/content/parser.rs +++ b/src/syntax/content/parser.rs @@ -1,6 +1,6 @@ use std::collections::{HashMap}; -use crate::types::Config; +use crate::{prelude::*,types::Config}; use super::{Parseable as _, Token, LexMap}; use token::{ anchor::Anchor, linebreak::LineBreak, paragraph::Paragraph, header::Header, @@ -28,6 +28,8 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec { let segments = segment::segment(text); let lexemes = Lexeme::collect(&segments); + log!("Lexing segments: {segments:?}"); + let mut iterator = lexemes.iter().peekable(); while let Some(lexeme) = iterator.next() { match state.context.block { @@ -81,13 +83,22 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec { tokens.push(Token::Code(Code::new(true))); continue; } else if Anchor::probe(lexeme) { + log!("Positively probed anchor: {lexeme:?}"); state.context.inline = Inline::Anchor; state.buffers.anchor.clear(); - if lexeme.match_first_char('|') { + if lexeme.match_as_char('|') { + log!("{:#?} matches as a pipe char", lexeme.text()); state.buffers.anchor.candidate.leading = true; } else { + log!( + "{:#?} not a pipe: assuming it's the anchor text", + lexeme.text(), + ); state.buffers.anchor.candidate.text = lexeme.text(); + // because we probed positively and this is not a pipe, + // the next lexeme must be and so it was now parsed + iterator.next(); } continue; } else if Oblique::probe(lexeme) { @@ -294,7 +305,7 @@ mod tests { } #[test] - fn http_external_anchor_leading_no_third_then_punctuation_then_eof() { + fn http_external_anchor_leading_no_third_then_punctuation_then_eoi() { assert_eq!( read_noconfig("|Rust toolchain|https://rustup.rs/,"), r#"

Rust toolchain

"# @@ -302,7 +313,7 @@ mod tests { } #[test] - fn http_external_anchor_leading_no_third_then_eof() { + fn http_external_anchor_leading_no_third_then_eoi() { assert_eq!( read_noconfig("|Rust toolchain|https://rustup.rs/"), r#"

Rust toolchain

"# @@ -331,7 +342,7 @@ mod tests { } #[test] - fn eof_pre() { + fn eoi_pre() { let payload = "Jp8INpWzsQmk20jpIhBFCfMUXOztxv0w"; assert_eq!( read_noconfig(&format!("`\n{payload}\n`")), diff --git a/src/syntax/content/parser/context/anchor.rs b/src/syntax/content/parser/context/anchor.rs index fc47246..7bb1b76 100644 --- a/src/syntax/content/parser/context/anchor.rs +++ b/src/syntax/content/parser/context/anchor.rs @@ -1,74 +1,100 @@ use std::{iter::Peekable, slice::Iter}; -use crate::syntax::content::parser::{ - State, context::Inline, lexeme::Lexeme, token::Token, +use crate::{ + prelude::*, + syntax::content::parser::{ + State, context::Inline, lexeme::Lexeme, token::Token, + }, }; +/// Handles open anchor contexts until an anchor token is fully parsed. +/// +/// This function is only called if the current inline context is Anchor. +/// +/// A return kind of true will trigger a continue in the outer parser, +/// skipping any further parsing of the current lexeme. +/// +/// # Panics +/// This function will panic if can't determine the destination of an anchor. pub fn parse( lexeme: &Lexeme, iterator: &mut Peekable>, state: &mut State, tokens: &mut Vec, ) -> bool { + log!("Resolving open context: {:#?}", state.clone().buffers.anchor); let buffer = &mut state.buffers.anchor; let candidate = &mut buffer.candidate; + + // This is only true if the anchor is leading, otherwise the outer parser + // would already have set its text to the word before the first pipe if candidate.text.is_empty() { + log!("Seeking text at {:#?} -> {:#?}", lexeme.text(), lexeme.next()); if lexeme.next() == "|" { buffer.text.push_str(&lexeme.text()); candidate.text.clone_from(&buffer.text); + log!("End: {:#?}", lexeme.text()); + return true; } else { + log!("Pushing non-terminal {:#?} into buffer {:#?}", + lexeme.text(), buffer.text); buffer.text.push_str(&lexeme.text()); + return true; } - return true; - } else if candidate.destination.is_none() { - // candidate is leading and we found the second pipe - if candidate.leading && lexeme.text() == "|" { - // third pipe immediately after second: forcing flanking - if lexeme.match_next_first_char('|') { + } + + if candidate.destination.is_none() { + + log!("Seeking destination at {:#?} -> {:#?}", + lexeme.text(), lexeme.next()); + + // Conditions to this decision tree should match the destination end + if lexeme.last(){ + log!("End: no more input"); + candidate.destination = Some(candidate.text.clone()); + } else if lexeme.match_as_char('|') && lexeme.is_next_boundary() { + + if buffer.destination.is_empty() { candidate.destination = Some(candidate.text.clone()); - let token = Token::Anchor(candidate.clone()); - tokens.push(token); - state.context.inline = Inline::None; - iterator.next(); - return true; - // whitespace or punctuation after pipe: flanking anchor - } else if lexeme.is_next_whitespace() - || lexeme.is_next_punctuation() - { - candidate.destination = Some(candidate.text.clone()); - let token = Token::Anchor(candidate.clone()); - tokens.push(token); - state.context.inline = Inline::None; - // non-whitespace after pipe is the destination } else { - candidate.destination = Some(lexeme.next().clone()); - let token = Token::Anchor(candidate.clone()); - tokens.push(token); - state.context.inline = Inline::None; - // if there is a trailing pipe, consume it - if let Some(next) = iterator.next() - && next.next() == "|" - { - iterator.next(); - } + candidate.destination = Some(buffer.destination.clone()); + return true } - // candidate is nonleading and we found a second pipe - } else if !candidate.leading && lexeme.next() == "|" { - candidate.destination = Some(lexeme.text()); + + } else if lexeme.match_as_char('|') { + log!("Found a pipe, but no boundary: Destination likely follows"); + return true; + } else if lexeme.is_punctuation() && lexeme.is_next_whitespace() { + log!("Found puncutation followed by whitespace"); + candidate.destination = Some(buffer.destination.clone()); tokens.push(Token::Anchor(candidate.clone())); state.context.inline = Inline::None; - iterator.next(); - // candidate is nonleading and we found whitespace - } else if lexeme.is_next_whitespace() { - candidate.destination = Some(lexeme.text()); - let token = Token::Anchor(candidate.clone()); - tokens.push(token); - state.context.inline = Inline::None; - // candidate is nonleading and we haven't found whitespace + return false; + } else if lexeme.is_whitespace() { + log!("End: Whitespace"); + candidate.destination = Some(buffer.destination.clone()); + + // This else branch is the 'no end found yet' state and will keep + // pushing lexemes into the buffer until an end is found above } else { + log!( + "Pushing non-terminal {:#?} into buffer {:#?}", + lexeme.text(), buffer.destination, + ); buffer.destination.push_str(&lexeme.text()); + return true } - return true; } + + // This point should never be reached with a still None destination, + // which would mean there is some case where the end of the destination + // was never found and we kept filling the buffer endlessly, + // causing the program to panic anyways when rendering anchors + assert!(candidate.destination.is_some(), + "Anchor context parsing done but no destination found: {:#?}", + state.buffers.anchor + ); + tokens.push(Token::Anchor(candidate.clone())); + state.context.inline = Inline::None; false } diff --git a/src/syntax/content/parser/token/anchor.rs b/src/syntax/content/parser/token/anchor.rs index bacf03f..1f6614d 100644 --- a/src/syntax/content/parser/token/anchor.rs +++ b/src/syntax/content/parser/token/anchor.rs @@ -5,6 +5,7 @@ pub struct Anchor { pub text: String, pub destination: Option, pub leading: bool, + pub balanced: bool, pub external: bool, } @@ -45,12 +46,14 @@ impl Anchor { destination: &str, leading: bool, external: bool, + balanced: bool, ) -> Anchor { Anchor { text: text.to_owned(), destination: Some(Anchor::resolve_destination(destination)), leading, external, + balanced, } } @@ -70,7 +73,7 @@ mod tests { #[test] fn render_anchor() { - let anchor = Anchor::new("AnchorText", "AnchorDest", true, false); + let anchor = Anchor::new("AnchorText", "AnchorDest", true, false, false); assert_eq!( anchor.render(), r#"AnchorText"#