Simplify parser module structure, add several syntax elements

2025-12-20 21:25:06 -03:00 · 2025-12-20 21:25:06 -03:00 · e3d5686c7b
commit e3d5686c7b
parent 070b5b7448
11 changed files with 348 additions and 186 deletions
--- a/src/syntax/content/parser.rs
+++ b/src/syntax/content/parser.rs
@ -1,21 +1,20 @@
-use std::slice::Iter;
-
-use crate::prelude::*;
 use super::{Parseable as _, Token, LexMap};
 use token::{
    anchor::Anchor, linebreak::LineBreak, paragraph::Paragraph, header::Header,
-    preformat::PreFormat, literal::Literal,
+    preformat::PreFormat, literal::Literal, code::Code,
 };
-use lexeme::{Lexeme, compound::Compound};
+use lexeme::Lexeme;

 pub mod token;
 pub mod lexeme;
+pub mod cluster;

 const LEXMAP: LexMap = &[
-    (Anchor::probe, |word| Token::Anchor(Anchor::lex(word))),
    (LineBreak::probe, |word| {
        Token::LineBreak(LineBreak::lex(word))
    }),
+    (Code::probe, |word| Token::Code(Code::lex(word))),
+    (Anchor::probe, |word| Token::Anchor(Anchor::lex(word))),
    (Literal::probe, |word| Token::Literal(Literal::lex(word))),
 ];

@ -30,52 +29,52 @@ fn lex(text: &str, map: LexMap) -> Vec<Token> {
    let mut tokens: Vec<Token> = Vec::new();
    let mut state = Context::None;

-    let splits = split(text);
-    let mut iter = splits.iter();
-    while let Some(word) = iter.next() {
-        let compound = cluster(word, &mut iter);
-        let lexeme = Lexeme::Compound(compound);
-
+    let splits = cluster::cluster(text);
+    let lexemes = Lexeme::collect(&splits);
+    let iter = lexemes.iter().peekable();
+    for lexeme in iter {
        match state {
            Context::None => {
-                if Header::probe(&lexeme) {
-                    let header = Header::lex(&lexeme);
-                    state = Context::Header(header.get_level());
-                    tokens.push(Token::Header(header));
-                    continue;
-                } else if PreFormat::probe(&lexeme) {
+                if PreFormat::probe(lexeme) {
                    tokens.push(Token::PreFormat(PreFormat::new(true)));
                    state = Context::PreFormat;
                    continue;
-                } else if Paragraph::probe(&lexeme) {
+                } else if Header::probe(lexeme) {
+                    let header = Header::lex(lexeme);
+                    state = Context::Header(header.get_level());
+                    tokens.push(Token::Header(header));
+                    continue;
+                } else if Paragraph::probe(lexeme) {
                    tokens.push(Token::Paragraph(Paragraph::new(true)));
                    state = Context::Paragraph;
                }
            },
+            Context::PreFormat => {
+                if PreFormat::probe(lexeme) {
+                    tokens.push(Token::PreFormat(PreFormat::new(false)));
+                    state = Context::None;
+                } else {
+                    tokens.push(Token::Literal(Literal::lex(lexeme)));
+                }
+                continue;
+            },
            Context::Paragraph => {
-                if word == "\n" {
+                if lexeme.text() == "\n" {
                    tokens.push(Token::Paragraph(Paragraph::new(false)));
                    state = Context::None;
                }
            },
            Context::Header(n) => {
-                if word == "\n" {
+                if lexeme.text() == "\n" {
                    tokens.push(Token::Header(Header::from_u8(n, false)));
                    state = Context::None;
                }
            },
-            Context::PreFormat => {
-                if PreFormat::probe(&lexeme) {
-                    tokens.push(Token::PreFormat(PreFormat::new(false)));
-                    state = Context::None;
-                    continue;
-                }
-            },
        }

        for &(ref probe, lex) in map {
-            if probe(&lexeme) {
-                tokens.push(lex(&lexeme));
+            if probe(lexeme) {
+                tokens.push(lex(lexeme));
                break;
            }
        }
@ -84,101 +83,8 @@ fn lex(text: &str, map: LexMap) -> Vec<Token> {
    tokens
 }

-fn split(text: &str) -> Vec<String> {
-    text.replace("\n", " \n ")
-        .split(' ')
-        .map(str::to_string)
-        .collect()
-}
-
-// this could be eliminated if space were a token
-fn join<'i, Iterator>(rendered_tokens: Iterator) -> String
-where
-    Iterator: IntoIterator<Item = &'i str>,
-{
-    fn stick(current: &str, next: &str) -> bool {
-        // this could be in a dedicated type
-        fn is_tag(s: &str) -> bool {
-            s.starts_with("<") && s.ends_with('>')
-        }
-        fn is_opening(s: &str) -> bool {
-            is_tag(s) && !s.contains("</")
-        }
-        fn is_closing(s: &str) -> bool {
-            is_tag(s) && s.contains("</")
-        }
-        fn is_inline(s: &str) -> bool {
-            is_tag(s) && s.starts_with("<a")
-        }
-
-        log!("On {current}[?]{next}");
-        if is_inline(next) {
-            log!("Pushing space because {next} is inline");
-            false
-        } else if is_closing(next) {
-            log!("Not pushing space because {next} is closing");
-            true
-        } else if is_opening(current) {
-            log!("Not pushing space because {current} is opening");
-            true
-        } else {
-            false
-        }
-    }
-
-    let mut iterator = rendered_tokens.into_iter();
-    let mut out_string = String::new();
-
-    if let Some(mut current) = iterator.next() {
-        out_string.push_str(current);
-        for next in iterator {
-            if stick(current, next) {
-                out_string.push_str(next);
-            } else {
-                out_string.push(' ');
-                out_string.push_str(next);
-            }
-            current = next;
-        }
-    }
-    out_string
-}
-
 fn parse(tokens: &[Token]) -> String {
-    let rendered: Vec<String> = tokens.iter().map(Token::render).collect();
-
-    join(rendered.iter().map(String::as_str))
-}
-
-fn cluster<'c>(word: &str, iter: &mut Iter<'c, String>) -> Compound {
-    if word.starts_with('|') {
-        log!("Found opener {word}");
-        let mut parts = vec![word];
-
-        if let Some(first) = parts.first()
-            && first.ends_with('|')
-        {
-            log!("Returning atomic cluster");
-            Compound::new(&parts.join(" "))
-        } else {
-            log!("Seeking a boundary");
-            for next_raw in iter {
-                if next_raw.contains('|') {
-                    log!("Found end of cluster {next_raw:?}");
-                    parts.push(next_raw);
-                    break;
-                } else {
-                    parts.push(next_raw);
-                    log!("Onto next word from {next_raw}");
-                }
-            }
-            log!("Returning cluster {parts:?}");
-
-            Compound::new(&parts.join(" "))
-        }
-    } else {
-        Compound::new(word)
-    }
+    tokens.iter().map(Token::render).collect::<String>()
 }

 pub(super) fn read(text: &str) -> String {
--- a/src/syntax/content/parser/cluster.rs
+++ b/src/syntax/content/parser/cluster.rs
@ -0,0 +1,167 @@
+use crate::prelude::*;
+
+/// Groups the whitespace-separated words of `text` into clusters.
+///
+/// Every line break is first isolated as a standalone `"\n"` word, then the
+/// text is split on single spaces. A word that starts with a known delimiter
+/// (see `delimiter::match_delimiter`) opens a cluster: the following words
+/// are joined to it with single spaces until a word containing the delimiter
+/// character closes it. While a lone "`" word has switched the raw context
+/// on, all other words are passed through as they are.
+///
+/// If the input ends before a closing delimiter shows up, the words gathered
+/// so far are emitted one by one rather than discarded.
+pub fn cluster(text: &str) -> Vec<String> {
+    let words: Vec<String> = text
+        .replace("\n", " \n ")
+        .split(' ')
+        .map(str::to_string)
+        .collect();
+
+    let mut clusters: Vec<String> = vec![];
+    let mut raw_context = false;
+
+    let mut iterator = words.into_iter().peekable();
+    while let Some(word) = iterator.next() {
+        log!("Iterating: {word:?}");
+
+        // A lone backtick toggles the raw context and then goes through the
+        // delimiter handling below like any other delimiter word.
+        if word == "`" {
+            raw_context = !raw_context;
+            log!("Raw context is now {raw_context}");
+        } else if raw_context {
+            log!("Skip: In raw context");
+            clusters.push(word);
+            continue;
+        }
+
+        let Some(delimiter) = delimiter::match_delimiter(&word) else {
+            log!("Skip: {word:?} does not start with a delimiter");
+            clusters.push(word);
+            continue;
+        };
+
+        // A greedy delimiter word directly followed by a line break is kept
+        // as a plain word instead of opening a cluster.
+        if let Some(next) = iterator.peek()
+            && next == "\n"
+            && delimiter.greedy
+        {
+            log!("Skip: Next {next:?} is a break, delimiter is greedy");
+            clusters.push(word);
+            continue;
+        }
+
+        if word.starts_with(&delimiter.string)
+            && word.ends_with(&delimiter.string)
+        {
+            log!("Skip: {word:?} is atomically-delimited");
+            clusters.push(word);
+            continue;
+        }
+
+        // The word already holds the whole delimited span followed by some
+        // trailing text: cut it right after the last delimiter.
+        let delimiter_count = word.matches(delimiter.char).count();
+        if (!delimiter.greedy && !delimiter.triple && delimiter_count == 2)
+            || (delimiter.triple && delimiter_count == 3)
+        {
+            log!("Skip: {word:?} is almost atomic, but must be split");
+            // `word` starts with the delimiter, so the split cannot fail.
+            let Some((head, tail)) = word.rsplit_once(delimiter.char) else {
+                unreachable!()
+            };
+            log!("Pushing head {head:?}, tail {tail:?} into clusters");
+            clusters.push(format!("{head}{}", delimiter.char));
+            clusters.push(tail.to_string());
+            continue;
+        }
+
+        log!("Found cluster from {delimiter:?} in {word:?}");
+        let mut parts: Vec<String> = vec![word];
+        log!("Seeking from a base of {parts:?}");
+
+        // Set once a closing word has been consumed and the cluster pushed.
+        let mut closed = false;
+        while let Some(next) = iterator.peek() {
+            if !next.contains(delimiter.char) {
+                log!("No delimiter: Pushing {:?} into parts", iterator.peek());
+                parts.push(iterator.next().unwrap_or_default());
+                log!("Seeking a boundary for parts {parts:?}");
+                continue;
+            }
+
+            log!("Found end of cluster: {next:?}");
+            if delimiter.greedy
+                && delimiter.triple
+                && next.matches(delimiter.char).count() > 1
+            {
+                // `next` contains the delimiter, so the split cannot fail.
+                let Some((head, tail)) = next.rsplit_once(delimiter.char) else {
+                    unreachable!()
+                };
+                log!(
+                    "Pushing head {head:?} of greedy triple EOC \
+                    into parts and tail {tail:?} into clusters"
+                );
+                parts.push(format!("{head}{}", delimiter.char));
+                clusters.push(parts.join(" "));
+                clusters.push(tail.to_string());
+                log!("Breaking past clusters {clusters:?}");
+                iterator.next();
+            } else if delimiter.greedy {
+                // The closing word is taken whole, trailing text included.
+                log!("Pushing end of cluster into parts");
+                parts.push(iterator.next().unwrap_or_else(|| unreachable!()));
+                log!("Pushing parts {parts:?} into clusters {clusters:?}");
+                clusters.push(parts.join(" "));
+                log!("Breaking past clusters {clusters:?}");
+            } else {
+                // unreachable: `next` was just checked to contain the
+                // delimiter, so `rsplit_once` always finds a split point.
+                let Some((head, tail)) = next.rsplit_once(delimiter.char) else {
+                    unreachable!()
+                };
+                log!(
+                    "Pushing head {head:?} of humble end of \
+                    cluster into parts"
+                );
+                parts.push(format!("{head}{}", delimiter.char));
+                log!("Pushing parts into clusters");
+                clusters.push(parts.join(" "));
+                log!("Pushing tail {tail:?} into clusters");
+                clusters.push(tail.to_string());
+                log!("Breaking past clusters");
+                iterator.next();
+            }
+            closed = true;
+            break;
+        }
+
+        if !closed {
+            // The input ran out before the cluster was closed: hand the
+            // gathered words back unjoined so that no text is lost.
+            log!("Unterminated cluster: flushing parts {parts:?}");
+            clusters.extend(parts);
+        }
+    }
+
+    log!("Returning clusters");
+    clusters
+}
+
+mod delimiter {
+
+    /// A delimiter character together with the flags the clustering code
+    /// consults when a word starts with it.
+    #[derive(Debug, Clone)]
+    pub struct Delimiter {
+        /// The delimiter as a character.
+        pub char: char,
+        /// The same delimiter as a one-character string.
+        pub string: String,
+        /// Read by `cluster` when deciding how to treat the opening word
+        /// before a line break and the word that closes a cluster.
+        pub greedy: bool,
+        /// Read by `cluster` when a word may hold up to three delimiter
+        /// characters.
+        pub triple: bool,
+    }
+
+    /// Builds the table of known delimiters: `|` (greedy, triple) and
+    /// "`" (neither).
+    fn make_delimiters() -> Vec<Delimiter> {
+        [('|', true, true), ('`', false, false)]
+            .into_iter()
+            .map(|(symbol, greedy, triple)| Delimiter {
+                char: symbol,
+                string: symbol.to_string(),
+                greedy,
+                triple,
+            })
+            .collect()
+    }
+
+    /// Returns the delimiter whose character is the first character of
+    /// `word`, or `None` when `word` is empty or starts with anything else.
+    pub fn match_delimiter(word: &str) -> Option<Delimiter> {
+        let first_char = word.chars().next()?;
+        make_delimiters()
+            .into_iter()
+            .find(|candidate| candidate.char == first_char)
+    }
+}
--- a/src/syntax/content/parser/lexeme.rs
+++ b/src/syntax/content/parser/lexeme.rs
@ -1,21 +1,25 @@
-#[derive(Clone)]
-pub enum Lexeme {
-    Compound(compound::Compound),
+#[derive(Clone, Debug)]
+pub struct Lexeme {
+    text: String,
+    pub next: String,
 }

-pub mod compound;
-
 impl Lexeme {
-    pub fn to_raw(&self) -> String {
-        match *self {
-            Lexeme::Compound(ref d) => d.raw.clone(),
+    pub fn new(raw: &str, next: &str) -> Lexeme {
+        Lexeme {
+            text: raw.to_owned(),
+            next: next.to_owned(),
        }
    }

+    pub fn text(&self) -> String {
+        self.text.clone()
+    }
+
    /// # Panics
    /// Panics if the number of occurrences of `c` in a single lexeme exceeds `i32::MAX`
    pub fn count_char(&self, c: char) -> i32 {
-        let count = self.to_raw().chars().filter(|&n| n == c).count();
+        let count = self.text().chars().filter(|&n| n == c).count();
        match i32::try_from(count) {
            Ok(i) => i,
            Err(e) => {
@ -25,15 +29,31 @@ impl Lexeme {
    }

    pub fn split_chars(&self) -> Vec<char> {
-        let vector: Vec<char> = self.to_raw().chars().collect();
+        let vector: Vec<char> = self.text().chars().collect();
        vector
    }

    pub fn split_words(self) -> Vec<String> {
-        self.to_raw().split(' ').map(str::to_string).collect()
+        self.text().split(' ').map(str::to_string).collect()
    }

    pub fn first(self) -> Option<String> {
        self.split_words().first().map(String::to_owned)
    }
+
+    pub fn collect(raw_strings: &[String]) -> Vec<Lexeme> {
+        let mut out_vector = Vec::with_capacity(raw_strings.len());
+        let mut iterator = raw_strings.iter().peekable();
+
+        while let Some(raw) = iterator.next() {
+            let next =
+                iterator.peek().map(|s| (*s).clone()).unwrap_or_default();
+            out_vector.push(Lexeme {
+                text: raw.to_owned(),
+                next,
+            });
+        }
+
+        out_vector
+    }
 }
--- a/src/syntax/content/parser/lexeme/compound.rs
+++ b/src/syntax/content/parser/lexeme/compound.rs
@ -1,12 +0,0 @@
-#[derive(Clone)]
-pub struct Compound {
-    pub raw: String,
-}
-
-impl Compound {
-    pub fn new(text: &str) -> Compound {
-        Compound {
-            raw: text.to_owned(),
-        }
-    }
-}
--- a/src/syntax/content/parser/token.rs
+++ b/src/syntax/content/parser/token.rs
@ -7,9 +7,11 @@ pub mod paragraph;
 pub mod span;
 pub mod header;
 pub mod preformat;
+pub mod code;

 pub enum Token {
    Anchor(anchor::Anchor),
+    Code(code::Code),
    Header(header::Header),
    LineBreak(linebreak::LineBreak),
    Literal(literal::Literal),
@ -22,6 +24,7 @@ impl Token {
    pub fn render(&self) -> String {
        match *self {
            Token::Anchor(ref d) => d.render(),
+            Token::Code(ref d) => d.render(),
            Token::Header(ref d) => d.render(),
            Token::LineBreak(ref d) => d.render(),
            Token::Literal(ref d) => d.render(),
@ -73,3 +76,9 @@ impl From<preformat::PreFormat> for Token {
        Token::PreFormat(d)
    }
 }
+
+impl From<code::Code> for Token {
+    fn from(d: code::Code) -> Token {
+        Token::Code(d)
+    }
+}
--- a/src/syntax/content/parser/token/anchor.rs
+++ b/src/syntax/content/parser/token/anchor.rs
@ -1,69 +1,93 @@
+use crate::prelude::*;
+
 use std::fmt::Display;
 use crate::syntax::content::{Parseable, parser::lexeme::Lexeme};

 pub struct Anchor {
    text: String,
    destination: String,
+    sticky: bool,
 }

 impl Parseable for Anchor {
    fn probe(lexeme: &Lexeme) -> bool {
        let pipe_count = lexeme.count_char('|');
-        let chars = lexeme.split_chars();
-        let c1 = *match chars.first() {
-            Some(c) => c,
-            None => return false,
-        };
-        let cn = *match chars.last() {
-            Some(c) => c,
-            None => return false,
-        };
+        log!("{lexeme:?} has {pipe_count} pipes");

-        if !(1_i32..=3_i32).contains(&pipe_count) {
+        if !(1..=3).contains(&pipe_count) {
+            log!("Negative: Bad pipe count {pipe_count} in {lexeme:?}");
            return false;
        }
-        if lexeme.to_raw().matches("||").count() > 0 {
+        if lexeme.text().matches("||").count() > 0 {
+            log!("Negative: Contiguous pipes in {lexeme:?}");
            return false;
        }

-        if pipe_count == 1 {
-            c1 != '|' && cn != '|'
-        } else if pipe_count == 2 {
-            c1 == '|' && cn != '|'
-        } else if pipe_count == 3 {
-            c1 == '|' && cn == '|'
+        let parts = Anchor::split_parts(lexeme);
+        if (1..=2).contains(&parts.len()) {
+            log!("Positive: Parts {parts:?} with length {}", parts.len());
+            true
        } else {
+            log!("Negative: {parts:?} have length {}", parts.len());
            false
        }
    }

    fn lex(lexeme: &Lexeme) -> Anchor {
-        let parts: Vec<String> = lexeme
-            .to_raw()
-            .split('|')
-            .filter(|s| !s.is_empty())
-            .map(str::to_string)
-            .collect();
-
-        assert!(parts.len() == 2, "Parts should always be 2: {parts:?}");
+        let parts = Anchor::split_parts(lexeme);
+        log!("Lexing anchor {parts:?}");

        let text = parts.first().unwrap_or_else(|| unreachable!());
-        let raw_destination = parts.get(1).unwrap_or_else(|| unreachable!());
-        let destination =
-            if raw_destination.contains(":") || raw_destination.contains("/") {
-                raw_destination.to_owned()
-            } else {
-                format!("/node/{raw_destination}")
-            };

+        fn try_node_anchor(anchor: &str) -> String {
+            if anchor.contains(":") || anchor.contains("/") {
+                anchor.to_owned()
+            } else {
+                format!("/node/{anchor}")
+            }
+        }
+
+        let destination = match parts.get(1) {
+            Some(d) => try_node_anchor(d),
+            None => try_node_anchor(text),
+        };
+
+        let sticky = [
+            ",", ".", ":", ";", "!", "?", "/", "(", ")", "%", "*", "&", r#"""#,
+            "'",
+        ];
+
+        log!("Lexed anchor: {text} -> {destination}");
        Anchor {
            text: text.to_owned(),
            destination,
+            sticky: sticky.contains(&lexeme.next.as_str()),
        }
    }

    fn render(&self) -> String {
-        format!(r#"<a href="{}">{}</a>"#, &self.destination, &self.text)
+        let space = if self.sticky {
+            String::new()
+        } else {
+            String::from(" ")
+        };
+        format!(
+            r#"<a href="{}">{}</a>{space}"#,
+            &self.destination, &self.text
+        )
+    }
+}
+
+impl Anchor {
+    fn split_parts(lexeme: &Lexeme) -> Vec<String> {
+        lexeme
+            .text()
+            .trim_start_matches('|')
+            .trim_end_matches('|')
+            .split('|')
+            .filter(|s| !s.is_empty())
+            .map(str::to_string)
+            .collect()
    }
 }

--- a/src/syntax/content/parser/token/code.rs
+++ b/src/syntax/content/parser/token/code.rs
@ -0,0 +1,43 @@
+use crate::{
+    syntax::content::{Parseable, Lexeme},
+};
+
+/// Inline code span, rendered as an HTML `<code>` element.
+pub struct Code {
+    /// Lexeme text with every backtick removed.
+    text: String,
+    /// True when the following lexeme is one of the strings listed in `lex`;
+    /// `render` then leaves out the trailing space.
+    sticky: bool,
+}
+
+impl Parseable for Code {
+    /// Accepts a lexeme whose first and last characters are both backticks
+    /// (a lexeme made of a single backtick qualifies as well).
+    fn probe(lexeme: &Lexeme) -> bool {
+        let chars = lexeme.split_chars();
+        matches!((chars.first(), chars.last()), (Some(&'`'), Some(&'`')))
+    }
+
+    /// Builds the span from the lexeme text and records whether the next
+    /// lexeme is one of the strings that must hug the closing tag.
+    fn lex(lexeme: &Lexeme) -> Code {
+        const GLUE: &[&str] = &[
+            ",", ".", ":", ";", "!", "?", "/", "(", ")", "%", "*", "&", r#"""#,
+            "'",
+        ];
+
+        let text = lexeme.text().replace("`", "");
+        let sticky = GLUE.contains(&lexeme.next.as_str());
+        Code { text, sticky }
+    }
+
+    /// Renders the `<code>` element, followed by one space unless the span
+    /// is sticky.
+    fn render(&self) -> String {
+        let space = if self.sticky { "" } else { " " };
+        format!("<code>{}</code>{space}", self.text)
+    }
+}
--- a/src/syntax/content/parser/token/header.rs
+++ b/src/syntax/content/parser/token/header.rs
@ -45,7 +45,7 @@ impl Parseable for Header {
            .count()
            == 0
        {
-            let level = lexeme.to_raw().len();
+            let level = lexeme.text().len();
            lexeme.clone().split_words().len() == 1 && level > 0 && level <= 6
        } else {
            false
@ -53,7 +53,7 @@ impl Parseable for Header {
    }

    fn lex(lexeme: &Lexeme) -> Header {
-        Header::new(lexeme.to_raw().len().into(), true)
+        Header::new(lexeme.text().len().into(), true)
    }

    fn render(&self) -> String {
@ -116,7 +116,7 @@ impl From<usize> for Level {
            Ok(u) => u,
            Err(e) => {
                log!("Truncating header level {z} to 6: {e:?}");
-                6_u8
+                6
            },
        };
        Level::from_u8(u8)
--- a/src/syntax/content/parser/token/linebreak.rs
+++ b/src/syntax/content/parser/token/linebreak.rs
@ -7,7 +7,7 @@ pub struct LineBreak {}

 impl Parseable for LineBreak {
    fn probe(lexeme: &Lexeme) -> bool {
-        lexeme.to_raw() == "\n"
+        lexeme.text() == "\n"
    }

    fn lex(_lexeme: &Lexeme) -> LineBreak {
--- a/src/syntax/content/parser/token/literal.rs
+++ b/src/syntax/content/parser/token/literal.rs
@ -12,12 +12,17 @@ impl Parseable for Literal {

    fn lex(lexeme: &Lexeme) -> Literal {
        Literal {
-            text: lexeme.to_raw(),
+            text: lexeme.text(),
        }
    }

    fn render(&self) -> String {
-        self.text.clone()
+        let non_sticky = [" ", "\n"];
+        if non_sticky.contains(&self.text.as_str()) {
+            self.text.clone()
+        } else {
+            format!("{} ", self.text.clone())
+        }
    }
 }

--- a/src/syntax/content/parser/token/paragraph.rs
+++ b/src/syntax/content/parser/token/paragraph.rs
@ -14,7 +14,7 @@ impl Paragraph {
 impl Parseable for Paragraph {
    fn probe(lexeme: &Lexeme) -> bool {
        // lexeme for paragraph is any non-whitespace, parser knows the context
-        let raw = lexeme.to_raw();
+        let raw = lexeme.text();
        let trimmed = raw.trim();
        !trimmed.is_empty() && trimmed != "\n"
    }