diff --git a/src/syntax/content/parser.rs b/src/syntax/content/parser.rs index dbc0e98..55fbf68 100644 --- a/src/syntax/content/parser.rs +++ b/src/syntax/content/parser.rs @@ -1,6 +1,6 @@ use std::collections::{HashMap}; -use crate::{prelude::*,types::Config}; +use crate::{prelude::*, types::Config}; use super::{Parseable as _, Token, LexMap}; use token::{ anchor::Anchor, linebreak::LineBreak, paragraph::Paragraph, header::Header, @@ -15,10 +15,12 @@ pub mod segment; pub mod context; const LEXMAP: LexMap = &[ - (LineBreak::probe, |word| { - Token::LineBreak(LineBreak::lex(word)) + (LineBreak::probe, |lexeme| { + Token::LineBreak(LineBreak::lex(lexeme)) + }), + (Literal::probe, |lexeme| { + Token::Literal(Literal::lex(lexeme)) }), - (Literal::probe, |word| Token::Literal(Literal::lex(word))), ]; fn lex(text: &str, map: LexMap, config: &Config) -> Vec { @@ -49,6 +51,9 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec { tokens.push(Token::Header(header)); continue; } else if Paragraph::probe(lexeme) { + log!( + "Probed {lexeme:#?} from Block::None -> Block::Paragraph" + ); state.context.block = Block::Paragraph; tokens.push(Token::Paragraph(Paragraph::new(true))); } @@ -63,7 +68,12 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec { continue; }, Block::Paragraph => { - if lexeme.text() == "\n" { + if lexeme.text() == "\n" + && matches!(state.context.inline, Inline::None) + { + log!( + "Probed {lexeme:#?} from Block::Paragraph -> Block::None" + ); tokens.push(Token::Paragraph(Paragraph::new(false))); state.context.block = Block::None; } @@ -122,12 +132,7 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec { } }, Inline::Anchor => { - if context::anchor::parse( - lexeme, - &mut iterator, - &mut state, - &mut tokens, - ) { + if context::anchor::parse(lexeme, &mut state, &mut tokens) { continue; } }, @@ -135,7 +140,9 @@ fn lex(text: &str, map: LexMap, config: &Config) -> Vec { for &(ref probe, lex) in map { if probe(lexeme) { - tokens.push(lex(lexeme)); + let token = lex(lexeme); + log!("Lexmap lexed {lexeme:?} into {token:?}"); + tokens.push(token); break; } } @@ -223,23 +230,31 @@ mod tests { } #[test] - fn force_flanking() { + fn flanking_with_trailing_comma() { assert_eq!( - read_noconfig("|Node||"), + read_noconfig("|Node|,"), + r#"

Node,

"# + ); + } + + #[test] + fn flanking_with_trailing_comma_and_space() { + assert_eq!( + read_noconfig("|Node|, at"), + r#"

Node, at

"# + ); + } + + #[test] + fn flanking_at_eoi() { + assert_eq!( + read_noconfig("|Node|"), r#"

Node

"# ); } #[test] - fn force_flanking_with_trailing_letter() { - assert_eq!( - read_noconfig("|Node||s"), - r#"

Nodes

"# - ); - } - - #[test] - fn flanking_with_trailing_pipe() { + fn needless_three_pipe_anchor() { assert_eq!( read_noconfig("|Node|Destination|"), r#"

Node

"# @@ -278,6 +293,22 @@ mod tests { ); } + #[test] + fn nonleading_plural_anchor_at_eoi() { + assert_eq!( + read_noconfig("element|s"), + r#"

elements

"# + ); + } + + #[test] + fn leading_plural_anchor_at_eoi() { + assert_eq!( + read_noconfig("|element|s"), + r#"

elements

"# + ); + } + #[test] fn http_external_anchor() { assert_eq!( @@ -289,26 +320,26 @@ mod tests { } #[test] - fn http_external_anchor_leading_no_third() { + fn http_external_anchor_leading_no_third_then_newline() { assert_eq!( - read_noconfig("|Rust toolchain|https://rustup.rs/ "), - r#"

Rust toolchain

"# + read_noconfig(concat!( + "|Rust toolchain|https://rustup.rs/", + "\n", + "at rustup.rs", + )), + concat!( + r#"

Rust toolchain"#, + "\n", + "at rustup.rs

", + ) ); } #[test] - fn http_external_anchor_leading_no_third_then_punctuation_then_space() { + fn http_external_anchor_leading_no_third_then_space() { assert_eq!( - read_noconfig("|Rust toolchain|https://rustup.rs/, "), - r#"

Rust toolchain,

"# - ); - } - - #[test] - fn http_external_anchor_leading_no_third_then_punctuation_then_eoi() { - assert_eq!( - read_noconfig("|Rust toolchain|https://rustup.rs/,"), - r#"

Rust toolchain

"# + read_noconfig("|Rust toolchain|https://rustup.rs/ at rustup.rs"), + r#"

Rust toolchain at rustup.rs

"# ); } @@ -321,13 +352,40 @@ mod tests { } #[test] - fn clear_anchor_buffer() { + fn newline_wrapped_anchor() { assert_eq!( - read_noconfig("|SomeAnchor|\n|SomeOtherAnchor|"), + read_noconfig("\n|SomeAnchor|\n"), + concat!( + "\n", + r#"

SomeAnchor

"#, + "\n" + ), + ); + } + + #[test] + fn newline_separated_anchors() { + assert_eq!( + read_noconfig("|SomeAnchor|\n|SomeOtherAnchor|\n"), concat!( r#"

SomeAnchor

"#, "\n", - r#"

SomeOtherAnchor

"# + r#"

SomeOtherAnchor

"#, + "\n" + ) + ); + } + + #[test] + fn empty_line_separated_anchors() { + assert_eq!( + read_noconfig("|SomeAnchor|\n\n|SomeOtherAnchor|\n"), + concat!( + r#"

SomeAnchor

"#, + "\n", + "\n", + r#"

SomeOtherAnchor

"#, + "\n", ), ); } diff --git a/src/syntax/content/parser/context/anchor.rs b/src/syntax/content/parser/context/anchor.rs index 7bb1b76..2fecad3 100644 --- a/src/syntax/content/parser/context/anchor.rs +++ b/src/syntax/content/parser/context/anchor.rs @@ -1,5 +1,3 @@ -use std::{iter::Peekable, slice::Iter}; - use crate::{ prelude::*, syntax::content::parser::{ @@ -11,61 +9,89 @@ use crate::{ /// /// This function is only called if the current inline context is Anchor. /// -/// A return kind of true will trigger a continue in the outer parser, +/// A return of `true` will trigger a continue in the outer parser, /// skipping any further parsing of the current lexeme. /// /// # Panics /// This function will panic if can't determine the destination of an anchor. pub fn parse( lexeme: &Lexeme, - iterator: &mut Peekable>, state: &mut State, tokens: &mut Vec, ) -> bool { - log!("Resolving open context: {:#?}", state.clone().buffers.anchor); + log!( + "Resolving open context: {:#?}", + state.clone().buffers.anchor + ); let buffer = &mut state.buffers.anchor; let candidate = &mut buffer.candidate; // This is only true if the anchor is leading, otherwise the outer parser // would already have set its text to the word before the first pipe if candidate.text.is_empty() { - log!("Seeking text at {:#?} -> {:#?}", lexeme.text(), lexeme.next()); + log!( + "Seeking end of text at {:#?} -> {:#?}", + lexeme.text(), + lexeme.next() + ); if lexeme.next() == "|" { + log!("End: Next lexeme is a pipe"); buffer.text.push_str(&lexeme.text()); candidate.text.clone_from(&buffer.text); - log!("End: {:#?}", lexeme.text()); - return true; } else { - log!("Pushing non-terminal {:#?} into buffer {:#?}", - lexeme.text(), buffer.text); + log!( + "Pushing non-terminal {:#?} into buffer {:#?}", + lexeme.text(), + buffer.text + ); buffer.text.push_str(&lexeme.text()); - return true; } + return true; } if candidate.destination.is_none() { + log!( + "Seeking end of destination at {:#?} -> {:#?}", + lexeme.text(), + lexeme.next() + ); - log!("Seeking destination at {:#?} -> {:#?}", - lexeme.text(), lexeme.next()); - - // Conditions to this decision tree should match the destination end - if lexeme.last(){ - log!("End: no more input"); + // Conditions in this decision tree should match the destination end + if lexeme.match_as_char('s') + && lexeme.is_next_boundary() + && !lexeme.match_next_as_char('|') + { + log!("End: Plural anchor"); candidate.destination = Some(candidate.text.clone()); + candidate.text.push('s'); + if lexeme.last() { + tokens.push(Token::Anchor(candidate.clone())); + state.context.inline = Inline::None; + } + return true; } else if lexeme.match_as_char('|') && lexeme.is_next_boundary() { - + log!("End: Pipe followed by boundary"); if buffer.destination.is_empty() { candidate.destination = Some(candidate.text.clone()); } else { candidate.destination = Some(buffer.destination.clone()); - return true } - - } else if lexeme.match_as_char('|') { - log!("Found a pipe, but no boundary: Destination likely follows"); + tokens.push(Token::Anchor(candidate.clone())); + state.context.inline = Inline::None; return true; - } else if lexeme.is_punctuation() && lexeme.is_next_whitespace() { - log!("Found puncutation followed by whitespace"); + } else if lexeme.match_as_char('|') && !candidate.balanced { + log!("Found a pipe, but no boundary: Destination likely follows"); + candidate.balanced = true; + return true; + } else if lexeme.match_as_char('|') { + log!("End: Explicit end-of-destination pipe"); + candidate.destination = Some(buffer.destination.clone()); + return true; + } else if !candidate.external + && lexeme.is_punctuation() + && lexeme.is_next_whitespace() + { + log!("End: Punctuation followed by whitespace"); candidate.destination = Some(buffer.destination.clone()); tokens.push(Token::Anchor(candidate.clone())); state.context.inline = Inline::None; @@ -73,16 +99,28 @@ pub fn parse( } else if lexeme.is_whitespace() { log!("End: Whitespace"); candidate.destination = Some(buffer.destination.clone()); + tokens.push(Token::Anchor(candidate.clone())); + state.context.inline = Inline::None; + return false; // This else branch is the 'no end found yet' state and will keep // pushing lexemes into the buffer until an end is found above } else { log!( "Pushing non-terminal {:#?} into buffer {:#?}", - lexeme.text(), buffer.destination, + lexeme.text(), + buffer.destination, ); + if lexeme.match_as_char(':') { + candidate.external = true; + } buffer.destination.push_str(&lexeme.text()); - return true + if lexeme.last() { + candidate.destination = Some(buffer.destination.clone()); + tokens.push(Token::Anchor(candidate.clone())); + state.context.inline = Inline::None; + } + return true; } } @@ -90,7 +128,8 @@ pub fn parse( // which would mean there is some case where the end of the destination // was never found and we kept filling the buffer endlessly, // causing the program to panic anyways when rendering anchors - assert!(candidate.destination.is_some(), + assert!( + candidate.destination.is_some(), "Anchor context parsing done but no destination found: {:#?}", state.buffers.anchor ); diff --git a/src/syntax/content/parser/lexeme.rs b/src/syntax/content/parser/lexeme.rs index d75a850..30c94b7 100644 --- a/src/syntax/content/parser/lexeme.rs +++ b/src/syntax/content/parser/lexeme.rs @@ -55,6 +55,10 @@ impl Lexeme { self.as_char().is_some_and(|as_char| as_char == c) } + pub fn match_next_as_char(&self, c: char) -> bool { + self.next_as_char().is_some_and(|next| next == c) + } + pub fn is_punctuation(&self) -> bool { let punctuation = Delimiters::default().punctuation; self.as_char().is_some_and(|c| punctuation.contains(&c)) @@ -80,8 +84,10 @@ impl Lexeme { pub fn is_next_boundary(&self) -> bool { let delimiters = Delimiters::default(); - self.next_as_char() - .is_some_and(|c| delimiters.is_boundary(c)) + self.last + || self + .next_as_char() + .is_some_and(|c| delimiters.is_boundary(c)) } pub fn next_first_char(&self) -> Option {