diff --git a/src/syntax/content/parser.rs b/src/syntax/content/parser.rs index 45c435f..b8b4531 100644 --- a/src/syntax/content/parser.rs +++ b/src/syntax/content/parser.rs @@ -1,21 +1,20 @@ -use std::slice::Iter; - -use crate::prelude::*; use super::{Parseable as _, Token, LexMap}; use token::{ anchor::Anchor, linebreak::LineBreak, paragraph::Paragraph, header::Header, - preformat::PreFormat, literal::Literal, + preformat::PreFormat, literal::Literal, code::Code, }; -use lexeme::{Lexeme, compound::Compound}; +use lexeme::Lexeme; pub mod token; pub mod lexeme; +pub mod cluster; const LEXMAP: LexMap = &[ - (Anchor::probe, |word| Token::Anchor(Anchor::lex(word))), (LineBreak::probe, |word| { Token::LineBreak(LineBreak::lex(word)) }), + (Code::probe, |word| Token::Code(Code::lex(word))), + (Anchor::probe, |word| Token::Anchor(Anchor::lex(word))), (Literal::probe, |word| Token::Literal(Literal::lex(word))), ]; @@ -30,52 +29,52 @@ fn lex(text: &str, map: LexMap) -> Vec { let mut tokens: Vec = Vec::new(); let mut state = Context::None; - let splits = split(text); - let mut iter = splits.iter(); - while let Some(word) = iter.next() { - let compound = cluster(word, &mut iter); - let lexeme = Lexeme::Compound(compound); - + let splits = cluster::cluster(text); + let lexemes = Lexeme::collect(&splits); + let iter = lexemes.iter().peekable(); + for lexeme in iter { match state { Context::None => { - if Header::probe(&lexeme) { - let header = Header::lex(&lexeme); - state = Context::Header(header.get_level()); - tokens.push(Token::Header(header)); - continue; - } else if PreFormat::probe(&lexeme) { + if PreFormat::probe(lexeme) { tokens.push(Token::PreFormat(PreFormat::new(true))); state = Context::PreFormat; continue; - } else if Paragraph::probe(&lexeme) { + } else if Header::probe(lexeme) { + let header = Header::lex(lexeme); + state = Context::Header(header.get_level()); + tokens.push(Token::Header(header)); + continue; + } else if Paragraph::probe(lexeme) { tokens.push(Token::Paragraph(Paragraph::new(true))); state = Context::Paragraph; } }, + Context::PreFormat => { + if PreFormat::probe(lexeme) { + tokens.push(Token::PreFormat(PreFormat::new(false))); + state = Context::None; + } else { + tokens.push(Token::Literal(Literal::lex(lexeme))); + } + continue; + }, Context::Paragraph => { - if word == "\n" { + if lexeme.text() == "\n" { tokens.push(Token::Paragraph(Paragraph::new(false))); state = Context::None; } }, Context::Header(n) => { - if word == "\n" { + if lexeme.text() == "\n" { tokens.push(Token::Header(Header::from_u8(n, false))); state = Context::None; } }, - Context::PreFormat => { - if PreFormat::probe(&lexeme) { - tokens.push(Token::PreFormat(PreFormat::new(false))); - state = Context::None; - continue; - } - }, } for &(ref probe, lex) in map { - if probe(&lexeme) { - tokens.push(lex(&lexeme)); + if probe(lexeme) { + tokens.push(lex(lexeme)); break; } } @@ -84,101 +83,8 @@ fn lex(text: &str, map: LexMap) -> Vec { tokens } -fn split(text: &str) -> Vec { - text.replace("\n", " \n ") - .split(' ') - .map(str::to_string) - .collect() -} - -// this could be eliminated if space were a token -fn join<'i, Iterator>(rendered_tokens: Iterator) -> String -where - Iterator: IntoIterator, -{ - fn stick(current: &str, next: &str) -> bool { - // this could be in a dedicated type - fn is_tag(s: &str) -> bool { - s.starts_with("<") && s.ends_with('>') - } - fn is_opening(s: &str) -> bool { - is_tag(s) && !s.contains(" bool { - is_tag(s) && s.contains(" bool { - is_tag(s) && s.starts_with(" String { - let rendered: Vec = tokens.iter().map(Token::render).collect(); - - join(rendered.iter().map(String::as_str)) -} - -fn cluster<'c>(word: &str, iter: &mut Iter<'c, String>) -> Compound { - if word.starts_with('|') { - log!("Found opener {word}"); - let mut parts = vec![word]; - - if let Some(first) = parts.first() - && first.ends_with('|') - { - log!("Returning atomic cluster"); - Compound::new(&parts.join(" ")) - } else { - log!("Seeking a boundary"); - for next_raw in iter { - if next_raw.contains('|') { - log!("Found end of cluster {next_raw:?}"); - parts.push(next_raw); - break; - } else { - parts.push(next_raw); - log!("Onto next word from {next_raw}"); - } - } - log!("Returning cluster {parts:?}"); - - Compound::new(&parts.join(" ")) - } - } else { - Compound::new(word) - } + tokens.iter().map(Token::render).collect::() } pub(super) fn read(text: &str) -> String { diff --git a/src/syntax/content/parser/cluster.rs b/src/syntax/content/parser/cluster.rs new file mode 100644 index 0000000..24462eb --- /dev/null +++ b/src/syntax/content/parser/cluster.rs @@ -0,0 +1,167 @@ +use crate::prelude::*; + +pub fn cluster(text: &str) -> Vec { + let words: Vec = text + .replace("\n", " \n ") + .split(' ') + .map(str::to_string) + .collect(); + + let mut clusters: Vec = vec![]; + let mut raw_context = false; + + let mut iterator = words.into_iter().peekable(); + while let Some(word) = iterator.next() { + log!("Iterating: {word:?}"); + + if word == "`" { + raw_context = !raw_context; + log!("Raw context is now {raw_context}"); + } else if raw_context { + log!("Skip: In raw context"); + clusters.push(word); + continue; + } + + let Some(delimiter) = delimiter::match_delimiter(&word) else { + log!("Skip: {word:?} does not start with a delimiter"); + clusters.push(word); + continue; + }; + + if let Some(next) = iterator.peek() + && next == "\n" + && delimiter.greedy + { + log!("Skip: Next {next:?} is a break, delimiter is greedy"); + clusters.push(word); + continue; + } + + if word.starts_with(&delimiter.string) + && word.ends_with(&delimiter.string) + { + log!("Skip: {word:?} is atomically-delimited"); + clusters.push(word); + continue; + } + + if (!delimiter.greedy + && !delimiter.triple + && word.matches(delimiter.char).count() == 2) + || (delimiter.triple && word.matches(delimiter.char).count() == 3) + { + log!("Skip: {word:?} is almost atomic, but must be split"); + match word.rsplit_once(delimiter.char) { + Some((head, tail)) => { + log!("Pushing head {head:?}, tail {tail:?} into clusters"); + clusters.push(format!("{head}{}", delimiter.char)); + clusters.push(tail.to_string()); + continue; + }, + None => unreachable!(), + } + } + + log!("Found cluster from {delimiter:?} in {word:?}"); + let mut parts: Vec = vec![word.clone()]; + log!("Seeking from a base of {parts:?}"); + + while let Some(next) = iterator.peek() { + if next.contains(&delimiter.char.to_string()) { + log!("Found end of cluster: {next:?}"); + if delimiter.greedy + && delimiter.triple + && next.matches(delimiter.char).count() > 1 + { + match next.rsplit_once(delimiter.char) { + Some((head, tail)) => { + log!( + "Pushing head {head:?} of greedy triple EOC \ + into parts and tail {tail:?} into clusters" + ); + parts.push(format!("{head}{}", delimiter.char)); + clusters.push(parts.join(" ")); + clusters.push(tail.to_string()); + log!("Breaking past clusters {clusters:?}"); + iterator.next(); + break; + }, + None => unreachable!(), + } + } else if delimiter.greedy { + log!("Pushing end of cluster into parts"); + parts.push( + iterator.next().unwrap_or_else(|| unreachable!()), + ); + log!("Pushing parts {parts:?} into clusters {clusters:?}"); + clusters.push(parts.join(" ")); + log!("Breaking past clusters {clusters:?}"); + break; + } else { + match next.rsplit_once(delimiter.char) { + Some((head, tail)) => { + log!( + "Pushing head {head:?} of humble end of \ + cluster into parts" + ); + parts.push(format!("{head}{}", delimiter.char)); + log!("Pushing parts into clusters"); + clusters.push(parts.join(" ")); + log!("Pushing tail {tail:?} into clusters"); + clusters.push(tail.to_string()); + log!("Breaking past clusters"); + iterator.next(); + break; + }, + // is this one really unreachable? + None => unreachable!(), + } + } + } else { + log!("No delimiter: Pushing {:?} into parts", iterator.peek()); + parts.push(iterator.next().unwrap_or_default()); + log!("Seeking a boundary for parts {parts:?}"); + } + } + } + + log!("Returning clusters"); + clusters +} + +mod delimiter { + + #[derive(Debug, Clone)] + pub struct Delimiter { + pub char: char, + pub string: String, + pub greedy: bool, + pub triple: bool, + } + + fn make_delimiters() -> Vec { + vec![ + Delimiter { + char: '|', + string: "|".to_string(), + greedy: true, + triple: true, + }, + Delimiter { + char: '`', + string: "`".to_string(), + greedy: false, + triple: false, + }, + ] + } + + pub fn match_delimiter(word: &str) -> Option { + let first_char = word.chars().next()?; + make_delimiters() + .iter() + .find(|d| d.char == first_char) + .cloned() + } +} diff --git a/src/syntax/content/parser/lexeme.rs b/src/syntax/content/parser/lexeme.rs index ef94ea6..620ff09 100644 --- a/src/syntax/content/parser/lexeme.rs +++ b/src/syntax/content/parser/lexeme.rs @@ -1,21 +1,25 @@ -#[derive(Clone)] -pub enum Lexeme { - Compound(compound::Compound), +#[derive(Clone, Debug)] +pub struct Lexeme { + text: String, + pub next: String, } -pub mod compound; - impl Lexeme { - pub fn to_raw(&self) -> String { - match *self { - Lexeme::Compound(ref d) => d.raw.clone(), + pub fn new(raw: &str, next: &str) -> Lexeme { + Lexeme { + text: raw.to_owned(), + next: next.to_owned(), } } + pub fn text(&self) -> String { + self.text.clone() + } + /// # Panics /// Panics if number of chars for a single lexeme exceeds `i2::MAX` pub fn count_char(&self, c: char) -> i32 { - let count = self.to_raw().chars().filter(|&n| n == c).count(); + let count = self.text().chars().filter(|&n| n == c).count(); match i32::try_from(count) { Ok(i) => i, Err(e) => { @@ -25,15 +29,31 @@ impl Lexeme { } pub fn split_chars(&self) -> Vec { - let vector: Vec = self.to_raw().chars().collect(); + let vector: Vec = self.text().chars().collect(); vector } pub fn split_words(self) -> Vec { - self.to_raw().split(' ').map(str::to_string).collect() + self.text().split(' ').map(str::to_string).collect() } pub fn first(self) -> Option { self.split_words().first().map(String::to_owned) } + + pub fn collect(raw_strings: &[String]) -> Vec { + let mut out_vector = Vec::with_capacity(raw_strings.len()); + let mut iterator = raw_strings.iter().peekable(); + + while let Some(raw) = iterator.next() { + let next = + iterator.peek().map(|s| (*s).clone()).unwrap_or_default(); + out_vector.push(Lexeme { + text: raw.to_owned(), + next, + }); + } + + out_vector + } } diff --git a/src/syntax/content/parser/lexeme/compound.rs b/src/syntax/content/parser/lexeme/compound.rs deleted file mode 100644 index 1f30cd1..0000000 --- a/src/syntax/content/parser/lexeme/compound.rs +++ /dev/null @@ -1,12 +0,0 @@ -#[derive(Clone)] -pub struct Compound { - pub raw: String, -} - -impl Compound { - pub fn new(text: &str) -> Compound { - Compound { - raw: text.to_owned(), - } - } -} diff --git a/src/syntax/content/parser/token.rs b/src/syntax/content/parser/token.rs index 897239d..19a27cd 100644 --- a/src/syntax/content/parser/token.rs +++ b/src/syntax/content/parser/token.rs @@ -7,9 +7,11 @@ pub mod paragraph; pub mod span; pub mod header; pub mod preformat; +pub mod code; pub enum Token { Anchor(anchor::Anchor), + Code(code::Code), Header(header::Header), LineBreak(linebreak::LineBreak), Literal(literal::Literal), @@ -22,6 +24,7 @@ impl Token { pub fn render(&self) -> String { match *self { Token::Anchor(ref d) => d.render(), + Token::Code(ref d) => d.render(), Token::Header(ref d) => d.render(), Token::LineBreak(ref d) => d.render(), Token::Literal(ref d) => d.render(), @@ -73,3 +76,9 @@ impl From for Token { Token::PreFormat(d) } } + +impl From for Token { + fn from(d: code::Code) -> Token { + Token::Code(d) + } +} diff --git a/src/syntax/content/parser/token/anchor.rs b/src/syntax/content/parser/token/anchor.rs index 394ac47..3b2d150 100644 --- a/src/syntax/content/parser/token/anchor.rs +++ b/src/syntax/content/parser/token/anchor.rs @@ -1,69 +1,93 @@ +use crate::prelude::*; + use std::fmt::Display; use crate::syntax::content::{Parseable, parser::lexeme::Lexeme}; pub struct Anchor { text: String, destination: String, + sticky: bool, } impl Parseable for Anchor { fn probe(lexeme: &Lexeme) -> bool { let pipe_count = lexeme.count_char('|'); - let chars = lexeme.split_chars(); - let c1 = *match chars.first() { - Some(c) => c, - None => return false, - }; - let cn = *match chars.last() { - Some(c) => c, - None => return false, - }; + log!("{lexeme:?} has {pipe_count} pipes"); - if !(1_i32..=3_i32).contains(&pipe_count) { + if !(1..=3).contains(&pipe_count) { + log!("Negative: Bad pipe count {pipe_count} in {lexeme:?}"); return false; } - if lexeme.to_raw().matches("||").count() > 0 { + if lexeme.text().matches("||").count() > 0 { + log!("Negative: Contiguous pipes in {lexeme:?}"); return false; } - if pipe_count == 1 { - c1 != '|' && cn != '|' - } else if pipe_count == 2 { - c1 == '|' && cn != '|' - } else if pipe_count == 3 { - c1 == '|' && cn == '|' + let parts = Anchor::split_parts(lexeme); + if (1..=2).contains(&parts.len()) { + log!("Positive: Parts {parts:?} with length {}", parts.len()); + true } else { + log!("Negative: {parts:?} have length {}", parts.len()); false } } fn lex(lexeme: &Lexeme) -> Anchor { - let parts: Vec = lexeme - .to_raw() - .split('|') - .filter(|s| !s.is_empty()) - .map(str::to_string) - .collect(); - - assert!(parts.len() == 2, "Parts should always be 2: {parts:?}"); + let parts = Anchor::split_parts(lexeme); + log!("Lexing anchor {parts:?}"); let text = parts.first().unwrap_or_else(|| unreachable!()); - let raw_destination = parts.get(1).unwrap_or_else(|| unreachable!()); - let destination = - if raw_destination.contains(":") || raw_destination.contains("/") { - raw_destination.to_owned() - } else { - format!("/node/{raw_destination}") - }; + fn try_node_anchor(anchor: &str) -> String { + if anchor.contains(":") || anchor.contains("/") { + anchor.to_owned() + } else { + format!("/node/{anchor}") + } + } + + let destination = match parts.get(1) { + Some(d) => try_node_anchor(d), + None => try_node_anchor(text), + }; + + let sticky = [ + ",", ".", ":", ";", "!", "?", "/", "(", ")", "%", "*", "&", r#"""#, + "'", + ]; + + log!("Lexed anchor: {text} -> {destination}"); Anchor { text: text.to_owned(), destination, + sticky: sticky.contains(&lexeme.next.as_str()), } } fn render(&self) -> String { - format!(r#"{}"#, &self.destination, &self.text) + let space = if self.sticky { + String::new() + } else { + String::from(" ") + }; + format!( + r#"{}{space}"#, + &self.destination, &self.text + ) + } +} + +impl Anchor { + fn split_parts(lexeme: &Lexeme) -> Vec { + lexeme + .text() + .trim_start_matches('|') + .trim_end_matches('|') + .split('|') + .filter(|s| !s.is_empty()) + .map(str::to_string) + .collect() } } diff --git a/src/syntax/content/parser/token/code.rs b/src/syntax/content/parser/token/code.rs new file mode 100644 index 0000000..cf1d300 --- /dev/null +++ b/src/syntax/content/parser/token/code.rs @@ -0,0 +1,43 @@ +use crate::{ + syntax::content::{Parseable, Lexeme}, +}; + +pub struct Code { + text: String, + sticky: bool, +} + +impl Parseable for Code { + fn probe(lexeme: &Lexeme) -> bool { + let chars = lexeme.split_chars(); + + if let Some(first_char) = chars.first() + && let Some(last_char) = chars.last() + { + *first_char == '`' && *last_char == '`' + } else { + false + } + } + + fn lex(lexeme: &Lexeme) -> Code { + let sticky = [ + ",", ".", ":", ";", "!", "?", "/", "(", ")", "%", "*", "&", r#"""#, + "'", + ]; + + Code { + text: lexeme.text().replace("`", ""), + sticky: sticky.contains(&lexeme.next.as_str()), + } + } + + fn render(&self) -> String { + let space = if self.sticky { + String::new() + } else { + String::from(" ") + }; + format!("{}{space}", self.text) + } +} diff --git a/src/syntax/content/parser/token/header.rs b/src/syntax/content/parser/token/header.rs index 1c13fbf..2a5d90b 100644 --- a/src/syntax/content/parser/token/header.rs +++ b/src/syntax/content/parser/token/header.rs @@ -45,7 +45,7 @@ impl Parseable for Header { .count() == 0 { - let level = lexeme.to_raw().len(); + let level = lexeme.text().len(); lexeme.clone().split_words().len() == 1 && level > 0 && level <= 6 } else { false @@ -53,7 +53,7 @@ impl Parseable for Header { } fn lex(lexeme: &Lexeme) -> Header { - Header::new(lexeme.to_raw().len().into(), true) + Header::new(lexeme.text().len().into(), true) } fn render(&self) -> String { @@ -116,7 +116,7 @@ impl From for Level { Ok(u) => u, Err(e) => { log!("Truncating header level {z} to 6: {e:?}"); - 6_u8 + 6 }, }; Level::from_u8(u8) diff --git a/src/syntax/content/parser/token/linebreak.rs b/src/syntax/content/parser/token/linebreak.rs index 8fb6d52..365bbdd 100644 --- a/src/syntax/content/parser/token/linebreak.rs +++ b/src/syntax/content/parser/token/linebreak.rs @@ -7,7 +7,7 @@ pub struct LineBreak {} impl Parseable for LineBreak { fn probe(lexeme: &Lexeme) -> bool { - lexeme.to_raw() == "\n" + lexeme.text() == "\n" } fn lex(_lexeme: &Lexeme) -> LineBreak { diff --git a/src/syntax/content/parser/token/literal.rs b/src/syntax/content/parser/token/literal.rs index 131af39..f641579 100644 --- a/src/syntax/content/parser/token/literal.rs +++ b/src/syntax/content/parser/token/literal.rs @@ -12,12 +12,17 @@ impl Parseable for Literal { fn lex(lexeme: &Lexeme) -> Literal { Literal { - text: lexeme.to_raw(), + text: lexeme.text(), } } fn render(&self) -> String { - self.text.clone() + let non_sticky = [" ", "\n"]; + if non_sticky.contains(&self.text.as_str()) { + self.text.clone() + } else { + format!("{} ", self.text.clone()) + } } } diff --git a/src/syntax/content/parser/token/paragraph.rs b/src/syntax/content/parser/token/paragraph.rs index e8c8543..09718d0 100644 --- a/src/syntax/content/parser/token/paragraph.rs +++ b/src/syntax/content/parser/token/paragraph.rs @@ -14,7 +14,7 @@ impl Paragraph { impl Parseable for Paragraph { fn probe(lexeme: &Lexeme) -> bool { // lexeme for paragraph is any non-whitespace, parser knows the context - let raw = lexeme.to_raw(); + let raw = lexeme.text(); let trimmed = raw.trim(); !trimmed.is_empty() && trimmed != "\n" }