From 8b782d6d20c3f7dd64c71b086a4a2021a0297dab Mon Sep 17 00:00:00 2001 From: jutty Date: Tue, 23 Dec 2025 21:40:57 -0300 Subject: [PATCH] Rework token segmentation --- src/formats.rs | 9 - src/main.rs | 9 - src/syntax/content/parser.rs | 225 ++++++++++++++----- src/syntax/content/parser/cluster.rs | 192 ---------------- src/syntax/content/parser/lexeme.rs | 20 ++ src/syntax/content/parser/segment.rs | 199 ++++++++++++++++ src/syntax/content/parser/token.rs | 1 + src/syntax/content/parser/token/anchor.rs | 110 +++------ src/syntax/content/parser/token/code.rs | 41 ++-- src/syntax/content/parser/token/header.rs | 41 +++- src/syntax/content/parser/token/linebreak.rs | 1 + src/syntax/content/parser/token/literal.rs | 8 +- src/syntax/content/parser/token/paragraph.rs | 5 +- src/syntax/content/parser/token/preformat.rs | 1 + src/syntax/content/parser/token/span.rs | 1 + static/graph.toml | 27 +-- 16 files changed, 501 insertions(+), 389 deletions(-) delete mode 100644 src/syntax/content/parser/cluster.rs create mode 100644 src/syntax/content/parser/segment.rs diff --git a/src/formats.rs b/src/formats.rs index 043f98d..4f79aac 100644 --- a/src/formats.rs +++ b/src/formats.rs @@ -136,12 +136,3 @@ pub fn deserialize_graph(in_format: &Format, serial: &str) -> Graph { }, } } - -#[cfg(test)] -mod tests { - #[test] - fn smoke() { - let n = true; - assert!(n); - } -} diff --git a/src/main.rs b/src/main.rs index 9d45675..92d97f2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -49,12 +49,3 @@ async fn main() -> io::Result<()> { Ok(()) } - -#[cfg(test)] -mod tests { - #[test] - fn smoke() { - let e = true; - assert!(e); - } -} diff --git a/src/syntax/content/parser.rs b/src/syntax/content/parser.rs index 47e43bd..2adc02c 100644 --- a/src/syntax/content/parser.rs +++ b/src/syntax/content/parser.rs @@ -1,4 +1,4 @@ -use std::collections::{HashMap, hash_map::Entry}; +use std::collections::{HashMap}; use crate::{formats::populate_graph, types::Config}; @@ -11,98 +11,147 @@ use lexeme::Lexeme; pub mod token; pub mod lexeme; -pub mod cluster; +pub mod segment; const LEXMAP: LexMap = &[ (LineBreak::probe, |word| { Token::LineBreak(LineBreak::lex(word)) }), - (Code::probe, |word| Token::Code(Code::lex(word))), - (Anchor::probe, |word| Token::Anchor(Anchor::lex(word))), (Literal::probe, |word| Token::Literal(Literal::lex(word))), ]; -enum Context { - None, - Paragraph, - Header(u8), - PreFormat, -} - -struct State { - context: Context, - dom_ids: HashMap>, -} - fn lex(text: &str, map: LexMap) -> Vec { let mut tokens: Vec = Vec::new(); - let mut state = State { - context: Context::None, - dom_ids: HashMap::new(), - }; + let mut state = State::new(); let config: Config = populate_graph().meta.config; - let splits = cluster::cluster(text); - let lexemes = Lexeme::collect(&splits); - let iter = lexemes.iter().peekable(); - for lexeme in iter { - match state.context { - Context::None => { + let segments = segment::segment(text); + let lexemes = Lexeme::collect(&segments); + + let mut iterator = lexemes.iter().peekable(); + while let Some(lexeme) = iterator.next() { + match state.context.block { + BlockContext::None => { if PreFormat::probe(lexeme) { + state.context.block = BlockContext::PreFormat; tokens.push(Token::PreFormat(PreFormat::new(true))); - state.context = Context::PreFormat; continue; } else if Header::probe(lexeme) { - let base_id = - if config.ascii_dom_ids && !lexeme.next.is_ascii() { - String::from("h") - } else { - lexeme.next.clone().to_lowercase() - }; - let id = match state.dom_ids.entry(base_id.clone()) { - Entry::Occupied(mut occupied) => { - let ids = occupied.get_mut(); - let suffix: u8 = - ids.len().try_into().unwrap_or_default(); - let id_with_suffix = format!("{base_id}-{suffix}"); - ids.push(id_with_suffix.clone()); - id_with_suffix - }, - Entry::Vacant(vacant) => { - vacant.insert(vec![base_id.clone()]); - base_id - }, - }; - let mut header = Header::lex(lexeme); - header.dom_id = Some(id); - state.context = Context::Header(header.get_level()); + header.dom_id = Some(Header::make_id( + &config, + &mut iterator, + &mut state.dom_ids, + )); + state.context.block = BlockContext::Header(header.level()); tokens.push(Token::Header(header)); continue; } else if Paragraph::probe(lexeme) { + state.context.block = BlockContext::Paragraph; tokens.push(Token::Paragraph(Paragraph::new(true))); - state.context = Context::Paragraph; } }, - Context::PreFormat => { + BlockContext::PreFormat => { if PreFormat::probe(lexeme) { tokens.push(Token::PreFormat(PreFormat::new(false))); - state.context = Context::None; + state.context.block = BlockContext::None; } else { tokens.push(Token::Literal(Literal::lex(lexeme))); } continue; }, - Context::Paragraph => { + BlockContext::Paragraph => { if lexeme.text() == "\n" { tokens.push(Token::Paragraph(Paragraph::new(false))); - state.context = Context::None; + state.context.block = BlockContext::None; } }, - Context::Header(n) => { + BlockContext::Header(n) => { if lexeme.text() == "\n" { tokens.push(Token::Header(Header::from_u8(n, false, None))); - state.context = Context::None; + state.context.block = BlockContext::None; + } + }, + } + + match state.context.inline { + InlineContext::None => { + if Code::probe(lexeme) { + state.context.inline = InlineContext::Code; + tokens.push(Token::Code(Code::new(true))); + continue; + } else if Anchor::probe(lexeme) { + state.context.inline = InlineContext::Anchor; + state.buffers.anchor.clear(); + + if lexeme.match_first_char('|') { + state.buffers.anchor.candidate.leading = true; + } else { + state.buffers.anchor.candidate.text = lexeme.text(); + } + continue; + } + }, + InlineContext::Code => { + if Code::probe(lexeme) { + state.context.inline = InlineContext::None; + tokens.push(Token::Code(Code::new(false))); + continue; + } + }, + InlineContext::Anchor => { + let buffer = &mut state.buffers.anchor; + let candidate = &mut buffer.candidate; + if candidate.text.is_empty() { + if lexeme.next == "|" { + buffer.text.push_str(&lexeme.text()); + candidate.text.clone_from(&buffer.text); + } else { + buffer.text.push_str(&lexeme.text()); + } + continue; + } else if candidate.destination.is_none() { + // candidate is leading and we found the second pipe + if candidate.leading && lexeme.text() == "|" { + // whitespace after pipe: flanking node anchor + if lexeme.is_next_whitespace() { + candidate.destination = + Some(candidate.text.clone()); + let token = Token::Anchor(candidate.clone()); + tokens.push(token); + state.context.inline = InlineContext::None; + // non-whitespace after pipe is the destination + } else { + candidate.destination = Some(lexeme.next.clone()); + let token = Token::Anchor(candidate.clone()); + tokens.push(token); + state.context.inline = InlineContext::None; + // if there is a trailing pipe, consume it + if let Some(next) = iterator.next() + && next.next == "|" + { + iterator.next(); + } + } + // candidate is nonleading and we found a second pipe + } else if !candidate.leading && lexeme.next == "|" { + candidate.destination = Some(lexeme.text()); + tokens.push(Token::Anchor(candidate.clone())); + state.context.inline = InlineContext::None; + iterator.next(); + // candidate is nonleading and we found whitespace + } else if lexeme.is_next_whitespace() { + candidate.destination = Some(lexeme.text()); + let token = Token::Anchor(candidate.clone()); + tokens.push(token); + state.context.inline = InlineContext::None; + // candidate is nonleading and we haven't found whitespace + } else { + buffer.destination.push_str(&lexeme.text()); + } + continue; + } else { + unreachable!("Anchor is already fully parsed"); } }, } @@ -118,6 +167,68 @@ fn lex(text: &str, map: LexMap) -> Vec { tokens } +enum BlockContext { + Paragraph, + Header(u8), + PreFormat, + None, +} + +enum InlineContext { + Anchor, + Code, + None, +} + +struct State { + context: Context, + dom_ids: HashMap>, + buffers: Buffers, +} + +struct Buffers { + anchor: AnchorBuffer, +} + +#[derive(Debug)] +struct AnchorBuffer { + candidate: Anchor, + text: String, + destination: String, +} + +impl AnchorBuffer { + fn clear(&mut self) { + self.candidate = Anchor::empty(); + self.text = String::new(); + self.destination = String::new(); + } +} + +impl State { + fn new() -> State { + State { + context: Context { + inline: InlineContext::None, + block: BlockContext::None, + }, + dom_ids: HashMap::new(), + buffers: Buffers { + anchor: AnchorBuffer { + candidate: Anchor::empty(), + text: String::new(), + destination: String::new(), + }, + }, + } + } +} + +struct Context { + block: BlockContext, + inline: InlineContext, +} + fn parse(tokens: &[Token]) -> String { tokens.iter().map(Token::render).collect::() } diff --git a/src/syntax/content/parser/cluster.rs b/src/syntax/content/parser/cluster.rs deleted file mode 100644 index 8cb6f47..0000000 --- a/src/syntax/content/parser/cluster.rs +++ /dev/null @@ -1,192 +0,0 @@ -use crate::prelude::*; - -pub fn cluster(text: &str) -> Vec { - let words: Vec = text - .replace("\n", " \n ") - .split(' ') - .map(str::to_string) - .collect(); - - let mut clusters: Vec = vec![]; - let mut raw_context = false; - - let mut iterator = words.into_iter().peekable(); - while let Some(word) = iterator.next() { - log!("Iterating: {word:?}"); - - if word == "`" { - raw_context = !raw_context; - log!("Raw context is now {raw_context}"); - } else if raw_context { - log!("Skip: In raw context"); - clusters.push(word); - continue; - } - - let Some(delimiter) = delimiter::match_delimiter(&word) else { - log!("Skip: {word:?} does not have a delimiter"); - clusters.push(word); - continue; - }; - - if !delimiter.leading && !word.starts_with(delimiter.char) { - clusters.push(word); - continue; - } - - if (!delimiter.greedy - && !delimiter.triple - && word.matches(delimiter.char).count() == 2) - || (delimiter.triple - && (2..=3).contains(&word.matches(delimiter.char).count())) - { - log!("Skip: {word:?} is almost atomic, but must be split"); - match word.rsplit_once(delimiter.char) { - Some((head, tail)) => { - log!("Pushing head {head:?}, tail {tail:?} into clusters"); - clusters.push(format!("{head}{}", delimiter.char)); - clusters.push(tail.to_string()); - continue; - }, - None => unreachable!(), - } - } - - if let Some(next) = iterator.peek() - && next == "\n" - && delimiter.greedy - { - log!("Skip: Next {next:?} is a break, delimiter is greedy"); - clusters.push(word); - continue; - } - - if word.starts_with(&delimiter.string) - && word.ends_with(&delimiter.string) - { - log!("Skip: {word:?} is atomically-delimited"); - clusters.push(word); - continue; - } - - log!("Found cluster from {delimiter:?} in {word:?}"); - let mut parts: Vec = vec![word.clone()]; - log!("Seeking from a base of {parts:?}"); - - while let Some(next) = iterator.peek() { - if next.contains(&delimiter.char.to_string()) { - log!("Found end of cluster: {next:?}"); - if delimiter.greedy - && delimiter.triple - && next.matches(delimiter.char).count() > 1 - { - match next.rsplit_once(delimiter.char) { - Some((head, tail)) => { - log!( - "Pushing head {head:?} of greedy triple EOC \ - into parts and tail {tail:?} into clusters" - ); - parts.push(format!("{head}{}", delimiter.char)); - clusters.push(parts.join(" ")); - clusters.push(tail.to_string()); - log!("Breaking past clusters {clusters:?}"); - iterator.next(); - break; - }, - None => unreachable!(), - } - } else if delimiter.greedy { - log!("Pushing end of cluster into parts"); - parts.push( - iterator.next().unwrap_or_else(|| unreachable!()), - ); - log!("Pushing parts {parts:?} into clusters {clusters:?}"); - clusters.push(parts.join(" ")); - log!("Breaking past clusters {clusters:?}"); - break; - } else { - match next.rsplit_once(delimiter.char) { - Some((head, tail)) => { - log!( - "Pushing head {head:?} of humble end of \ - cluster into parts" - ); - parts.push(format!("{head}{}", delimiter.char)); - log!("Pushing parts into clusters"); - clusters.push(parts.join(" ")); - log!("Pushing tail {tail:?} into clusters"); - clusters.push(tail.to_string()); - log!("Breaking past clusters"); - iterator.next(); - break; - }, - // is this one really unreachable? - None => unreachable!(), - } - } - } else { - log!("No delimiter: Pushing {:?} into parts", iterator.peek()); - parts.push(iterator.next().unwrap_or_default()); - log!("Seeking a boundary for parts {parts:?}"); - } - } - } - - log!("Returning clusters"); - clusters -} - -mod delimiter { - - #[derive(Debug, Clone)] - pub struct Delimiter { - pub char: char, - pub string: String, - pub greedy: bool, - pub triple: bool, - pub leading: bool, - } - - fn make_delimiters() -> (Vec, Vec) { - let delimiters = [ - Delimiter { - char: '|', - string: "|".to_string(), - greedy: true, - triple: true, - leading: false, - }, - Delimiter { - char: '`', - string: "`".to_string(), - greedy: false, - triple: false, - leading: true, - }, - ]; - - ( - delimiters.iter().filter(|d| d.leading).cloned().collect(), - delimiters.iter().filter(|d| !d.leading).cloned().collect(), - ) - } - - pub fn match_delimiter(word: &str) -> Option { - let (leading, nonleading) = make_delimiters(); - - let first_char = word.chars().next()?; - - if let Some(leading_match) = - leading.iter().find(|d| d.char == first_char).cloned() - { - Some(leading_match) - } else { - for delimiter in nonleading { - if word.contains(delimiter.char) { - return Some(delimiter); - } - } - None - } - } -} diff --git a/src/syntax/content/parser/lexeme.rs b/src/syntax/content/parser/lexeme.rs index 620ff09..708e05a 100644 --- a/src/syntax/content/parser/lexeme.rs +++ b/src/syntax/content/parser/lexeme.rs @@ -16,6 +16,26 @@ impl Lexeme { self.text.clone() } + pub fn is_whitespace(&self) -> bool { + self.text == " " || self.text == "\n" + } + + pub fn is_next_whitespace(&self) -> bool { + self.next == " " || self.next == "\n" + } + + pub fn match_first_char(&self, query: char) -> bool { + if let Some(first) = self.text.chars().nth(0) { + first == query + } else { + false + } + } + + pub fn next_first_char(&self) -> Option { + self.next.chars().nth(0) + } + /// # Panics /// Panics if number of chars for a single lexeme exceeds `i2::MAX` pub fn count_char(&self, c: char) -> i32 { diff --git a/src/syntax/content/parser/segment.rs b/src/syntax/content/parser/segment.rs new file mode 100644 index 0000000..33b2f04 --- /dev/null +++ b/src/syntax/content/parser/segment.rs @@ -0,0 +1,199 @@ +pub fn segment(text: &str) -> Vec { + delimiter::atomize(text) +} + +mod delimiter { + + fn make_delimiters() -> Vec { + vec!['\n', ' ', '`', '|'] + } + + pub fn atomize(text: &str) -> Vec { + let delimiters = make_delimiters(); + text.chars().fold( + Vec::new(), + |mut accumulator: Vec, character| { + if delimiters.contains(&character) { + accumulator.push(character.to_string()); + } else if let Some(last) = accumulator.last_mut() { + if delimiters + .iter() + .map(char::to_string) + .filter(|d| d == last) + .count() + > 0 + { + accumulator.push(character.to_string()); + } else { + last.push(character); + } + } else { + accumulator.push(character.to_string()); + } + accumulator + }, + ) + } + + #[cfg(test)] + mod tests { + use super::*; + + #[test] + fn atomize_words() { + let words = " justification for the actions of those who hold authority inevitably dwindles "; // 2 + let actual = atomize(words); + let expected = vec![ + " ", + " ", + " ", + " ", + "justification", + " ", + "for", + " ", + " ", + "the", + " ", + "actions", + " ", + " ", + " ", + "of", + " ", + "those", + " ", + " ", + "who", + " ", + "hold", + " ", + "authority", + " ", + " ", + " ", + "inevitably", + " ", + "dwindles", + " ", + " ", + ]; + assert_eq!(actual, expected); + } + + #[test] + fn atomize_ticks_no_spaces() { + let s = "a`c`adc`dadcdbd`cdb`dcdb`dc`dad`bdc"; + let actual = atomize(s); + let expected = vec![ + "a", "`", "c", "`", "adc", "`", "dadcdbd", "`", "cdb", "`", + "dcdb", "`", "dc", "`", "dad", "`", "bdc", + ] + .iter() + .map(std::string::ToString::to_string) + .collect::>(); + + assert_eq!(actual, expected); + } + + #[test] + fn atomize_ticks_with_spaces() { + let s = "a`c`adc`da dcdb d` cdb` dcdb `dc ` d ad ` bdc"; + + let actual = atomize(s); + let expected = vec![ + "a", "`", "c", "`", "adc", "`", "da", " ", "dcdb", " ", "d", + "`", " ", "cdb", "`", " ", "dcdb", " ", "`", "dc", " ", "`", + " ", "d", " ", "ad", " ", "`", " ", "bdc", + ] + .iter() + .map(std::string::ToString::to_string) + .collect::>(); + assert_eq!(actual, expected); + } + + #[test] + fn atomize_pipes() { + let s = "every other |time| as it was perceived"; + let actual = atomize(s); + let expected = vec![ + "every", + " ", + "other", + " ", + "|", + "time", + "|", + " ", + "as", + " ", + "it", + " ", + "was", + " ", + "perceived", + ]; + assert_eq!(actual, expected); + } + + #[test] + fn atomize_pipes_and_ticks() { + let s = "every other |time| as `it could or |perhaps somehow|then or now| it was` perceived"; + let actual = atomize(s); + let expected = vec![ + "every", + " ", + "other", + " ", + "|", + "time", + "|", + " ", + "as", + " ", + "`", + "it", + " ", + "could", + " ", + "or", + " ", + "|", + "perhaps", + " ", + "somehow", + "|", + "then", + " ", + "or", + " ", + "now", + "|", + " ", + "it", + " ", + "was", + "`", + " ", + "perceived", + ]; + assert_eq!(actual, expected); + } + + #[test] + fn atomize_newlines() { + let s = "a`c`adc`da \ndcdb d` cdb` dc\ndb `dc ` d ad ` bdc"; + + let actual = atomize(s); + let expected = vec![ + "a", "`", "c", "`", "adc", "`", "da", " ", "\n", "dcdb", " ", + "d", "`", " ", "cdb", "`", " ", "dc", "\n", "db", " ", "`", + "dc", " ", "`", " ", "d", " ", "ad", " ", "`", " ", "bdc", + ] + .iter() + .map(std::string::ToString::to_string) + .collect::>(); + assert_eq!(actual, expected); + } + } +} diff --git a/src/syntax/content/parser/token.rs b/src/syntax/content/parser/token.rs index 19a27cd..85ed1bf 100644 --- a/src/syntax/content/parser/token.rs +++ b/src/syntax/content/parser/token.rs @@ -9,6 +9,7 @@ pub mod header; pub mod preformat; pub mod code; +#[derive(Debug)] pub enum Token { Anchor(anchor::Anchor), Code(code::Code), diff --git a/src/syntax/content/parser/token/anchor.rs b/src/syntax/content/parser/token/anchor.rs index 3b2d150..059814e 100644 --- a/src/syntax/content/parser/token/anchor.rs +++ b/src/syntax/content/parser/token/anchor.rs @@ -1,98 +1,62 @@ -use crate::prelude::*; - use std::fmt::Display; + use crate::syntax::content::{Parseable, parser::lexeme::Lexeme}; +#[derive(Debug, Clone)] pub struct Anchor { - text: String, - destination: String, - sticky: bool, + pub text: String, + pub destination: Option, + pub leading: bool, } impl Parseable for Anchor { fn probe(lexeme: &Lexeme) -> bool { - let pipe_count = lexeme.count_char('|'); - log!("{lexeme:?} has {pipe_count} pipes"); - - if !(1..=3).contains(&pipe_count) { - log!("Negative: Bad pipe count {pipe_count} in {lexeme:?}"); - return false; - } - if lexeme.text().matches("||").count() > 0 { - log!("Negative: Contiguous pipes in {lexeme:?}"); - return false; - } - - let parts = Anchor::split_parts(lexeme); - if (1..=2).contains(&parts.len()) { - log!("Positive: Parts {parts:?} with length {}", parts.len()); - true - } else { - log!("Negative: {parts:?} have length {}", parts.len()); - false - } + lexeme.text() == "|" || (!lexeme.is_whitespace() && lexeme.next == "|") } - fn lex(lexeme: &Lexeme) -> Anchor { - let parts = Anchor::split_parts(lexeme); - log!("Lexing anchor {parts:?}"); - - let text = parts.first().unwrap_or_else(|| unreachable!()); - - fn try_node_anchor(anchor: &str) -> String { - if anchor.contains(":") || anchor.contains("/") { - anchor.to_owned() - } else { - format!("/node/{anchor}") - } - } - - let destination = match parts.get(1) { - Some(d) => try_node_anchor(d), - None => try_node_anchor(text), - }; - - let sticky = [ - ",", ".", ":", ";", "!", "?", "/", "(", ")", "%", "*", "&", r#"""#, - "'", - ]; - - log!("Lexed anchor: {text} -> {destination}"); - Anchor { - text: text.to_owned(), - destination, - sticky: sticky.contains(&lexeme.next.as_str()), - } + fn lex(_lexeme: &Lexeme) -> Anchor { + panic!("Attempt to lex an anchor directly from a lexeme"); } fn render(&self) -> String { - let space = if self.sticky { - String::new() - } else { - String::from(" ") + let Some(ref destination) = self.destination else { + panic!( + "Attempt to render anchor {self:?} without knowing its destination." + ) }; - format!( - r#"{}{space}"#, - &self.destination, &self.text - ) + + format!(r#"{}"#, destination, &self.text) } } impl Anchor { - fn split_parts(lexeme: &Lexeme) -> Vec { - lexeme - .text() - .trim_start_matches('|') - .trim_end_matches('|') - .split('|') - .filter(|s| !s.is_empty()) - .map(str::to_string) - .collect() + pub fn new(text: &str, destination: &str, spaced: bool) -> Anchor { + Anchor { + text: text.to_owned(), + destination: Some(Anchor::resolve_destination(destination)), + leading: spaced, + } + } + + fn resolve_destination(raw: &str) -> String { + if raw.contains(":") || raw.contains("/") { + raw.to_owned() + } else { + format!("/node/{raw}") + } + } + + pub fn empty() -> Anchor { + Anchor { + text: String::new(), + destination: None, + leading: false, + } } } impl Display for Anchor { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "Anchor: <{}> to <{}>", &self.text, &self.destination) + write!(f, "Anchor: <{}> to <{:?}>", &self.text, &self.destination) } } diff --git a/src/syntax/content/parser/token/code.rs b/src/syntax/content/parser/token/code.rs index cf1d300..549f60e 100644 --- a/src/syntax/content/parser/token/code.rs +++ b/src/syntax/content/parser/token/code.rs @@ -2,42 +2,31 @@ use crate::{ syntax::content::{Parseable, Lexeme}, }; +#[derive(Debug)] pub struct Code { - text: String, - sticky: bool, + open: bool, +} + +impl Code { + pub fn new(open: bool) -> Code { + Code { open } + } } impl Parseable for Code { fn probe(lexeme: &Lexeme) -> bool { - let chars = lexeme.split_chars(); - - if let Some(first_char) = chars.first() - && let Some(last_char) = chars.last() - { - *first_char == '`' && *last_char == '`' - } else { - false - } + lexeme.text() == "`" } - fn lex(lexeme: &Lexeme) -> Code { - let sticky = [ - ",", ".", ":", ";", "!", "?", "/", "(", ")", "%", "*", "&", r#"""#, - "'", - ]; - - Code { - text: lexeme.text().replace("`", ""), - sticky: sticky.contains(&lexeme.next.as_str()), - } + fn lex(_lexeme: &Lexeme) -> Code { + panic!("Attempt to lex a code tag directly from a lexeme") } fn render(&self) -> String { - let space = if self.sticky { - String::new() + if self.open { + String::from("") } else { - String::from(" ") - }; - format!("{}{space}", self.text) + String::from("") + } } } diff --git a/src/syntax/content/parser/token/header.rs b/src/syntax/content/parser/token/header.rs index 612afcf..3a3c6c3 100644 --- a/src/syntax/content/parser/token/header.rs +++ b/src/syntax/content/parser/token/header.rs @@ -1,9 +1,18 @@ +use std::{ + collections::{HashMap, hash_map::Entry}, + iter::Peekable, + slice, +}; + use crate::{ prelude::*, + types::Config, syntax::content::{Parseable, Lexeme}, }; + use std::fmt::Display; +#[derive(Debug)] pub struct Header { open: Option, level: Level, @@ -19,6 +28,35 @@ impl Header { } } + pub fn make_id( + config: &Config, + iterator: &mut Peekable>, + ids: &mut HashMap>, + ) -> String { + let base_id = match iterator.peek() { + Some(next_lexeme) + if !config.ascii_dom_ids || next_lexeme.next.is_ascii() => + { + next_lexeme.next.to_lowercase() + }, + _ => String::from("h"), + }; + + match ids.entry(base_id.clone()) { + Entry::Occupied(mut occupied) => { + let ids_vec = occupied.get_mut(); + let suffix = ids_vec.len(); + let id_with_suffix = format!("{base_id}-{suffix}"); + ids_vec.push(id_with_suffix.clone()); + id_with_suffix + }, + Entry::Vacant(vacant) => { + vacant.insert(vec![base_id.clone()]); + base_id + }, + } + } + pub fn from_u8(level: u8, open: bool, dom_id: Option<&str>) -> Header { Header { level: Level::from_u8(level), @@ -27,7 +65,7 @@ impl Header { } } - pub fn get_level(&self) -> u8 { + pub fn level(&self) -> u8 { match self.level { Level::One => 1, Level::Two => 2, @@ -92,6 +130,7 @@ impl Display for Header { } } +#[derive(Debug)] pub enum Level { One, Two, diff --git a/src/syntax/content/parser/token/linebreak.rs b/src/syntax/content/parser/token/linebreak.rs index 365bbdd..d56b49c 100644 --- a/src/syntax/content/parser/token/linebreak.rs +++ b/src/syntax/content/parser/token/linebreak.rs @@ -3,6 +3,7 @@ use crate::{ syntax::content::{Parseable, parser::lexeme::Lexeme}, }; +#[derive(Debug)] pub struct LineBreak {} impl Parseable for LineBreak { diff --git a/src/syntax/content/parser/token/literal.rs b/src/syntax/content/parser/token/literal.rs index f641579..723b152 100644 --- a/src/syntax/content/parser/token/literal.rs +++ b/src/syntax/content/parser/token/literal.rs @@ -1,6 +1,7 @@ use std::fmt::Display; use crate::syntax::content::{Parseable, parser::lexeme::Lexeme}; +#[derive(Debug)] pub struct Literal { text: String, } @@ -17,12 +18,7 @@ impl Parseable for Literal { } fn render(&self) -> String { - let non_sticky = [" ", "\n"]; - if non_sticky.contains(&self.text.as_str()) { - self.text.clone() - } else { - format!("{} ", self.text.clone()) - } + self.text.clone() } } diff --git a/src/syntax/content/parser/token/paragraph.rs b/src/syntax/content/parser/token/paragraph.rs index 09718d0..2348286 100644 --- a/src/syntax/content/parser/token/paragraph.rs +++ b/src/syntax/content/parser/token/paragraph.rs @@ -1,6 +1,7 @@ use std::fmt::Display; use crate::syntax::content::{Parseable, parser::lexeme::Lexeme}; +#[derive(Debug)] pub struct Paragraph { open: Option, } @@ -14,9 +15,7 @@ impl Paragraph { impl Parseable for Paragraph { fn probe(lexeme: &Lexeme) -> bool { // lexeme for paragraph is any non-whitespace, parser knows the context - let raw = lexeme.text(); - let trimmed = raw.trim(); - !trimmed.is_empty() && trimmed != "\n" + !lexeme.is_whitespace() } fn lex(_lexeme: &Lexeme) -> Paragraph { diff --git a/src/syntax/content/parser/token/preformat.rs b/src/syntax/content/parser/token/preformat.rs index af50fbd..568bd38 100644 --- a/src/syntax/content/parser/token/preformat.rs +++ b/src/syntax/content/parser/token/preformat.rs @@ -2,6 +2,7 @@ use crate::{ syntax::content::{Parseable, Lexeme}, }; +#[derive(Debug)] pub struct PreFormat { open: Option, } diff --git a/src/syntax/content/parser/token/span.rs b/src/syntax/content/parser/token/span.rs index 961e72d..b312a28 100644 --- a/src/syntax/content/parser/token/span.rs +++ b/src/syntax/content/parser/token/span.rs @@ -1,6 +1,7 @@ use std::fmt::Display; use crate::syntax::content::{Parseable, parser::lexeme::Lexeme}; +#[derive(Debug)] pub struct Span { open: Option, } diff --git a/static/graph.toml b/static/graph.toml index 1705369..65ff79d 100644 --- a/static/graph.toml +++ b/static/graph.toml @@ -132,7 +132,7 @@ For example: docs|/node/Documentation ` -If the left side contains spaces, you need a leading `|` character. In this case, the space on the left side is mandatory: +If the left side contains spaces, you need a leading `|` character: ` |en docs|https://en.jutty.dev/node/Documentation @@ -141,12 +141,12 @@ If the left side contains spaces, you need a leading `|` character. In this case If you have a trailing character that you don't want to be considered as part of the destination, you can separate it with a third `|`: ` -This |gem|PreciousStone|, though green, was not an emerald. +This gem|PreciousStone|, though green, was not an emerald. ` Which renders as: -This |gem|PreciousStone|, though green, was not an emerald. +This gem|PreciousStone|, though green, was not an emerald. ### Node anchors @@ -169,14 +169,15 @@ Because en can resolve IDs case insensitively (with priority to case-sensitive m In summary, all of the anchors below are valid and lead to the same page: ` +|en Syntax|https://en.jutty.dev/node/Syntax| |en Syntax|https://en.jutty.dev/node/Syntax Syntax|https://en.jutty.dev/node/Syntax -|en Syntax|/node/Syntax -Syntax|/node/Syntax +Syntax|/node/syntax -Syntax|Syntax -syntax|syntax +|syntax|Syntax +Syntax|syntax +Syntax|syntax| |Syntax| |syntax| @@ -312,23 +313,23 @@ We saw example `docs|/node/Documentation`, but shorter syntax exists. #### Epistēmē #### Epistēmē +|en Syntax|https://en.jutty.dev/node/Syntax| |en Syntax|https://en.jutty.dev/node/Syntax Syntax|https://en.jutty.dev/node/Syntax -|en Syntax|/node/Syntax -Syntax|/node/Syntax +Syntax|/node/syntax -Syntax|Syntax -syntax|syntax +|syntax|Syntax +Syntax|syntax +Syntax|syntax| |Syntax| |syntax| """ [meta.config] +content_language = "en" footer_credits = false footer_text = """ made by jutty|https://jutty.dev • acknowledgements|Acknowledgments • |source code|https://codeberg.org/jutty/en """ - -