Handle several anchor edge cases

This commit is contained in:
Juno Takano 2026-01-03 16:02:11 -03:00
commit 9f04a4606c
3 changed files with 164 additions and 61 deletions

View file

@ -1,6 +1,6 @@
use std::collections::{HashMap}; use std::collections::{HashMap};
use crate::{prelude::*,types::Config}; use crate::{prelude::*, types::Config};
use super::{Parseable as _, Token, LexMap}; use super::{Parseable as _, Token, LexMap};
use token::{ use token::{
anchor::Anchor, linebreak::LineBreak, paragraph::Paragraph, header::Header, anchor::Anchor, linebreak::LineBreak, paragraph::Paragraph, header::Header,
@ -15,10 +15,12 @@ pub mod segment;
pub mod context; pub mod context;
const LEXMAP: LexMap = &[ const LEXMAP: LexMap = &[
(LineBreak::probe, |word| { (LineBreak::probe, |lexeme| {
Token::LineBreak(LineBreak::lex(word)) Token::LineBreak(LineBreak::lex(lexeme))
}),
(Literal::probe, |lexeme| {
Token::Literal(Literal::lex(lexeme))
}), }),
(Literal::probe, |word| Token::Literal(Literal::lex(word))),
]; ];
fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> { fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {
@ -49,6 +51,9 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {
tokens.push(Token::Header(header)); tokens.push(Token::Header(header));
continue; continue;
} else if Paragraph::probe(lexeme) { } else if Paragraph::probe(lexeme) {
log!(
"Probed {lexeme:#?} from Block::None -> Block::Paragraph"
);
state.context.block = Block::Paragraph; state.context.block = Block::Paragraph;
tokens.push(Token::Paragraph(Paragraph::new(true))); tokens.push(Token::Paragraph(Paragraph::new(true)));
} }
@ -63,7 +68,12 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {
continue; continue;
}, },
Block::Paragraph => { Block::Paragraph => {
if lexeme.text() == "\n" { if lexeme.text() == "\n"
&& matches!(state.context.inline, Inline::None)
{
log!(
"Probed {lexeme:#?} from Block::Paragraph -> Block::None"
);
tokens.push(Token::Paragraph(Paragraph::new(false))); tokens.push(Token::Paragraph(Paragraph::new(false)));
state.context.block = Block::None; state.context.block = Block::None;
} }
@ -122,12 +132,7 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {
} }
}, },
Inline::Anchor => { Inline::Anchor => {
if context::anchor::parse( if context::anchor::parse(lexeme, &mut state, &mut tokens) {
lexeme,
&mut iterator,
&mut state,
&mut tokens,
) {
continue; continue;
} }
}, },
@ -135,7 +140,9 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec<Token> {
for &(ref probe, lex) in map { for &(ref probe, lex) in map {
if probe(lexeme) { if probe(lexeme) {
tokens.push(lex(lexeme)); let token = lex(lexeme);
log!("Lexmap lexed {lexeme:?} into {token:?}");
tokens.push(token);
break; break;
} }
} }
@ -223,23 +230,31 @@ mod tests {
} }
#[test] #[test]
fn force_flanking() { fn flanking_with_trailing_comma() {
assert_eq!( assert_eq!(
read_noconfig("|Node||"), read_noconfig("|Node|,"),
r#"<p><a href="/node/Node">Node</a>,</p>"#
);
}
#[test]
fn flanking_with_trailing_comma_and_space() {
assert_eq!(
read_noconfig("|Node|, at"),
r#"<p><a href="/node/Node">Node</a>, at</p>"#
);
}
#[test]
fn flanking_at_eoi() {
assert_eq!(
read_noconfig("|Node|"),
r#"<p><a href="/node/Node">Node</a></p>"# r#"<p><a href="/node/Node">Node</a></p>"#
); );
} }
#[test] #[test]
fn force_flanking_with_trailing_letter() { fn needless_three_pipe_anchor() {
assert_eq!(
read_noconfig("|Node||s"),
r#"<p><a href="/node/Node">Node</a>s</p>"#
);
}
#[test]
fn flanking_with_trailing_pipe() {
assert_eq!( assert_eq!(
read_noconfig("|Node|Destination|"), read_noconfig("|Node|Destination|"),
r#"<p><a href="/node/Destination">Node</a></p>"# r#"<p><a href="/node/Destination">Node</a></p>"#
@ -278,6 +293,22 @@ mod tests {
); );
} }
#[test]
fn nonleading_plural_anchor_at_eoi() {
assert_eq!(
read_noconfig("element|s"),
r#"<p><a href="/node/element">elements</a></p>"#
);
}
#[test]
fn leading_plural_anchor_at_eoi() {
assert_eq!(
read_noconfig("|element|s"),
r#"<p><a href="/node/element">elements</a></p>"#
);
}
#[test] #[test]
fn http_external_anchor() { fn http_external_anchor() {
assert_eq!( assert_eq!(
@ -289,26 +320,26 @@ mod tests {
} }
#[test] #[test]
fn http_external_anchor_leading_no_third() { fn http_external_anchor_leading_no_third_then_newline() {
assert_eq!( assert_eq!(
read_noconfig("|Rust toolchain|https://rustup.rs/ "), read_noconfig(concat!(
r#"<p><a href="https://rustup.rs/">Rust toolchain</a> </p>"# "|Rust toolchain|https://rustup.rs/",
"\n",
"at rustup.rs",
)),
concat!(
r#"<p><a href="https://rustup.rs/">Rust toolchain</a>"#,
"\n",
"at rustup.rs</p>",
)
); );
} }
#[test] #[test]
fn http_external_anchor_leading_no_third_then_punctuation_then_space() { fn http_external_anchor_leading_no_third_then_space() {
assert_eq!( assert_eq!(
read_noconfig("|Rust toolchain|https://rustup.rs/, "), read_noconfig("|Rust toolchain|https://rustup.rs/ at rustup.rs"),
r#"<p><a href="https://rustup.rs/">Rust toolchain</a>, </p>"# r#"<p><a href="https://rustup.rs/">Rust toolchain</a> at rustup.rs</p>"#
);
}
#[test]
fn http_external_anchor_leading_no_third_then_punctuation_then_eoi() {
assert_eq!(
read_noconfig("|Rust toolchain|https://rustup.rs/,"),
r#"<p><a href="https://rustup.rs/">Rust toolchain</a></p>"#
); );
} }
@ -321,13 +352,40 @@ mod tests {
} }
#[test] #[test]
fn clear_anchor_buffer() { fn newline_wrapped_anchor() {
assert_eq!( assert_eq!(
read_noconfig("|SomeAnchor|\n|SomeOtherAnchor|"), read_noconfig("\n|SomeAnchor|\n"),
concat!(
"\n",
r#"<p><a href="/node/SomeAnchor">SomeAnchor</a></p>"#,
"\n"
),
);
}
#[test]
fn newline_separated_anchors() {
assert_eq!(
read_noconfig("|SomeAnchor|\n|SomeOtherAnchor|\n"),
concat!( concat!(
r#"<p><a href="/node/SomeAnchor">SomeAnchor</a></p>"#, r#"<p><a href="/node/SomeAnchor">SomeAnchor</a></p>"#,
"\n", "\n",
r#"<p><a href="/node/SomeOtherAnchor">SomeOtherAnchor</a></p>"# r#"<p><a href="/node/SomeOtherAnchor">SomeOtherAnchor</a></p>"#,
"\n"
)
);
}
#[test]
fn empty_line_separated_anchors() {
assert_eq!(
read_noconfig("|SomeAnchor|\n\n|SomeOtherAnchor|\n"),
concat!(
r#"<p><a href="/node/SomeAnchor">SomeAnchor</a></p>"#,
"\n",
"\n",
r#"<p><a href="/node/SomeOtherAnchor">SomeOtherAnchor</a></p>"#,
"\n",
), ),
); );
} }

View file

@ -1,5 +1,3 @@
use std::{iter::Peekable, slice::Iter};
use crate::{ use crate::{
prelude::*, prelude::*,
syntax::content::parser::{ syntax::content::parser::{
@ -11,61 +9,89 @@ use crate::{
/// ///
/// This function is only called if the current inline context is Anchor. /// This function is only called if the current inline context is Anchor.
/// ///
/// A return kind of true will trigger a continue in the outer parser, /// A return of `true` will trigger a continue in the outer parser,
/// skipping any further parsing of the current lexeme. /// skipping any further parsing of the current lexeme.
/// ///
/// # Panics /// # Panics
/// This function will panic if can't determine the destination of an anchor. /// This function will panic if can't determine the destination of an anchor.
pub fn parse( pub fn parse(
lexeme: &Lexeme, lexeme: &Lexeme,
iterator: &mut Peekable<Iter<'_, Lexeme>>,
state: &mut State, state: &mut State,
tokens: &mut Vec<Token>, tokens: &mut Vec<Token>,
) -> bool { ) -> bool {
log!("Resolving open context: {:#?}", state.clone().buffers.anchor); log!(
"Resolving open context: {:#?}",
state.clone().buffers.anchor
);
let buffer = &mut state.buffers.anchor; let buffer = &mut state.buffers.anchor;
let candidate = &mut buffer.candidate; let candidate = &mut buffer.candidate;
// This is only true if the anchor is leading, otherwise the outer parser // This is only true if the anchor is leading, otherwise the outer parser
// would already have set its text to the word before the first pipe // would already have set its text to the word before the first pipe
if candidate.text.is_empty() { if candidate.text.is_empty() {
log!("Seeking text at {:#?} -> {:#?}", lexeme.text(), lexeme.next()); log!(
"Seeking end of text at {:#?} -> {:#?}",
lexeme.text(),
lexeme.next()
);
if lexeme.next() == "|" { if lexeme.next() == "|" {
log!("End: Next lexeme is a pipe");
buffer.text.push_str(&lexeme.text()); buffer.text.push_str(&lexeme.text());
candidate.text.clone_from(&buffer.text); candidate.text.clone_from(&buffer.text);
log!("End: {:#?}", lexeme.text());
return true;
} else { } else {
log!("Pushing non-terminal {:#?} into buffer {:#?}", log!(
lexeme.text(), buffer.text); "Pushing non-terminal {:#?} into buffer {:#?}",
lexeme.text(),
buffer.text
);
buffer.text.push_str(&lexeme.text()); buffer.text.push_str(&lexeme.text());
return true;
} }
return true;
} }
if candidate.destination.is_none() { if candidate.destination.is_none() {
log!(
"Seeking end of destination at {:#?} -> {:#?}",
lexeme.text(),
lexeme.next()
);
log!("Seeking destination at {:#?} -> {:#?}", // Conditions in this decision tree should match the destination end
lexeme.text(), lexeme.next()); if lexeme.match_as_char('s')
&& lexeme.is_next_boundary()
// Conditions to this decision tree should match the destination end && !lexeme.match_next_as_char('|')
if lexeme.last(){ {
log!("End: no more input"); log!("End: Plural anchor");
candidate.destination = Some(candidate.text.clone()); candidate.destination = Some(candidate.text.clone());
candidate.text.push('s');
if lexeme.last() {
tokens.push(Token::Anchor(candidate.clone()));
state.context.inline = Inline::None;
}
return true;
} else if lexeme.match_as_char('|') && lexeme.is_next_boundary() { } else if lexeme.match_as_char('|') && lexeme.is_next_boundary() {
log!("End: Pipe followed by boundary");
if buffer.destination.is_empty() { if buffer.destination.is_empty() {
candidate.destination = Some(candidate.text.clone()); candidate.destination = Some(candidate.text.clone());
} else { } else {
candidate.destination = Some(buffer.destination.clone()); candidate.destination = Some(buffer.destination.clone());
return true
} }
tokens.push(Token::Anchor(candidate.clone()));
} else if lexeme.match_as_char('|') { state.context.inline = Inline::None;
log!("Found a pipe, but no boundary: Destination likely follows");
return true; return true;
} else if lexeme.is_punctuation() && lexeme.is_next_whitespace() { } else if lexeme.match_as_char('|') && !candidate.balanced {
log!("Found puncutation followed by whitespace"); log!("Found a pipe, but no boundary: Destination likely follows");
candidate.balanced = true;
return true;
} else if lexeme.match_as_char('|') {
log!("End: Explicit end-of-destination pipe");
candidate.destination = Some(buffer.destination.clone());
return true;
} else if !candidate.external
&& lexeme.is_punctuation()
&& lexeme.is_next_whitespace()
{
log!("End: Punctuation followed by whitespace");
candidate.destination = Some(buffer.destination.clone()); candidate.destination = Some(buffer.destination.clone());
tokens.push(Token::Anchor(candidate.clone())); tokens.push(Token::Anchor(candidate.clone()));
state.context.inline = Inline::None; state.context.inline = Inline::None;
@ -73,16 +99,28 @@ pub fn parse(
} else if lexeme.is_whitespace() { } else if lexeme.is_whitespace() {
log!("End: Whitespace"); log!("End: Whitespace");
candidate.destination = Some(buffer.destination.clone()); candidate.destination = Some(buffer.destination.clone());
tokens.push(Token::Anchor(candidate.clone()));
state.context.inline = Inline::None;
return false;
// This else branch is the 'no end found yet' state and will keep // This else branch is the 'no end found yet' state and will keep
// pushing lexemes into the buffer until an end is found above // pushing lexemes into the buffer until an end is found above
} else { } else {
log!( log!(
"Pushing non-terminal {:#?} into buffer {:#?}", "Pushing non-terminal {:#?} into buffer {:#?}",
lexeme.text(), buffer.destination, lexeme.text(),
buffer.destination,
); );
if lexeme.match_as_char(':') {
candidate.external = true;
}
buffer.destination.push_str(&lexeme.text()); buffer.destination.push_str(&lexeme.text());
return true if lexeme.last() {
candidate.destination = Some(buffer.destination.clone());
tokens.push(Token::Anchor(candidate.clone()));
state.context.inline = Inline::None;
}
return true;
} }
} }
@ -90,7 +128,8 @@ pub fn parse(
// which would mean there is some case where the end of the destination // which would mean there is some case where the end of the destination
// was never found and we kept filling the buffer endlessly, // was never found and we kept filling the buffer endlessly,
// causing the program to panic anyways when rendering anchors // causing the program to panic anyways when rendering anchors
assert!(candidate.destination.is_some(), assert!(
candidate.destination.is_some(),
"Anchor context parsing done but no destination found: {:#?}", "Anchor context parsing done but no destination found: {:#?}",
state.buffers.anchor state.buffers.anchor
); );

View file

@ -55,6 +55,10 @@ impl Lexeme {
self.as_char().is_some_and(|as_char| as_char == c) self.as_char().is_some_and(|as_char| as_char == c)
} }
pub fn match_next_as_char(&self, c: char) -> bool {
self.next_as_char().is_some_and(|next| next == c)
}
pub fn is_punctuation(&self) -> bool { pub fn is_punctuation(&self) -> bool {
let punctuation = Delimiters::default().punctuation; let punctuation = Delimiters::default().punctuation;
self.as_char().is_some_and(|c| punctuation.contains(&c)) self.as_char().is_some_and(|c| punctuation.contains(&c))
@ -80,8 +84,10 @@ impl Lexeme {
pub fn is_next_boundary(&self) -> bool { pub fn is_next_boundary(&self) -> bool {
let delimiters = Delimiters::default(); let delimiters = Delimiters::default();
self.next_as_char() self.last
.is_some_and(|c| delimiters.is_boundary(c)) || self
.next_as_char()
.is_some_and(|c| delimiters.is_boundary(c))
} }
pub fn next_first_char(&self) -> Option<char> { pub fn next_first_char(&self) -> Option<char> {