Rework token segmentation

2025-12-23 21:40:57 -03:00 · 2025-12-23 21:40:57 -03:00 · 8b782d6d20
commit 8b782d6d20
parent a33d9cb1e1
16 changed files with 497 additions and 385 deletions
--- a/src/formats.rs
+++ b/src/formats.rs
@ -136,12 +136,3 @@ pub fn deserialize_graph(in_format: &Format, serial: &str) -> Graph {
        },
    }
 }
-
-#[cfg(test)]
-mod tests {
-    #[test]
-    fn smoke() {
-        let n = true;
-        assert!(n);
-    }
-}
--- a/src/main.rs
+++ b/src/main.rs
@ -49,12 +49,3 @@ async fn main() -> io::Result<()> {

    Ok(())
 }
-
-#[cfg(test)]
-mod tests {
-    #[test]
-    fn smoke() {
-        let e = true;
-        assert!(e);
-    }
-}
--- a/src/syntax/content/parser.rs
+++ b/src/syntax/content/parser.rs
@ -1,4 +1,4 @@
-use std::collections::{HashMap, hash_map::Entry};
+use std::collections::{HashMap};

 use crate::{formats::populate_graph, types::Config};

@ -11,98 +11,147 @@ use lexeme::Lexeme;

 pub mod token;
 pub mod lexeme;
-pub mod cluster;
+pub mod segment;

 const LEXMAP: LexMap = &[
    (LineBreak::probe, |word| {
        Token::LineBreak(LineBreak::lex(word))
    }),
-    (Code::probe, |word| Token::Code(Code::lex(word))),
-    (Anchor::probe, |word| Token::Anchor(Anchor::lex(word))),
    (Literal::probe, |word| Token::Literal(Literal::lex(word))),
 ];

-enum Context {
-    None,
-    Paragraph,
-    Header(u8),
-    PreFormat,
-}
-
-struct State {
-    context: Context,
-    dom_ids: HashMap<String, Vec<String>>,
-}
-
 fn lex(text: &str, map: LexMap) -> Vec<Token> {
    let mut tokens: Vec<Token> = Vec::new();
-    let mut state = State {
-        context: Context::None,
-        dom_ids: HashMap::new(),
-    };
+    let mut state = State::new();
    let config: Config = populate_graph().meta.config;

-    let splits = cluster::cluster(text);
-    let lexemes = Lexeme::collect(&splits);
-    let iter = lexemes.iter().peekable();
-    for lexeme in iter {
-        match state.context {
-            Context::None => {
+    let segments = segment::segment(text);
+    let lexemes = Lexeme::collect(&segments);
+
+    let mut iterator = lexemes.iter().peekable();
+    while let Some(lexeme) = iterator.next() {
+        match state.context.block {
+            BlockContext::None => {
                if PreFormat::probe(lexeme) {
+                    state.context.block = BlockContext::PreFormat;
                    tokens.push(Token::PreFormat(PreFormat::new(true)));
-                    state.context = Context::PreFormat;
                    continue;
                } else if Header::probe(lexeme) {
-                    let base_id =
-                        if config.ascii_dom_ids && !lexeme.next.is_ascii() {
-                            String::from("h")
-                        } else {
-                            lexeme.next.clone().to_lowercase()
-                        };
-                    let id = match state.dom_ids.entry(base_id.clone()) {
-                        Entry::Occupied(mut occupied) => {
-                            let ids = occupied.get_mut();
-                            let suffix: u8 =
-                                ids.len().try_into().unwrap_or_default();
-                            let id_with_suffix = format!("{base_id}-{suffix}");
-                            ids.push(id_with_suffix.clone());
-                            id_with_suffix
-                        },
-                        Entry::Vacant(vacant) => {
-                            vacant.insert(vec![base_id.clone()]);
-                            base_id
-                        },
-                    };
-
                    let mut header = Header::lex(lexeme);
-                    header.dom_id = Some(id);
-                    state.context = Context::Header(header.get_level());
+                    header.dom_id = Some(Header::make_id(
+                        &config,
+                        &mut iterator,
+                        &mut state.dom_ids,
+                    ));
+                    state.context.block = BlockContext::Header(header.level());
                    tokens.push(Token::Header(header));
                    continue;
                } else if Paragraph::probe(lexeme) {
+                    state.context.block = BlockContext::Paragraph;
                    tokens.push(Token::Paragraph(Paragraph::new(true)));
-                    state.context = Context::Paragraph;
                }
            },
-            Context::PreFormat => {
+            BlockContext::PreFormat => {
                if PreFormat::probe(lexeme) {
                    tokens.push(Token::PreFormat(PreFormat::new(false)));
-                    state.context = Context::None;
+                    state.context.block = BlockContext::None;
                } else {
                    tokens.push(Token::Literal(Literal::lex(lexeme)));
                }
                continue;
            },
-            Context::Paragraph => {
+            BlockContext::Paragraph => {
                if lexeme.text() == "\n" {
                    tokens.push(Token::Paragraph(Paragraph::new(false)));
-                    state.context = Context::None;
+                    state.context.block = BlockContext::None;
                }
            },
-            Context::Header(n) => {
+            BlockContext::Header(n) => {
                if lexeme.text() == "\n" {
                    tokens.push(Token::Header(Header::from_u8(n, false, None)));
-                    state.context = Context::None;
+                    state.context.block = BlockContext::None;
+                }
+            },
+        }
+
+        match state.context.inline {
+            InlineContext::None => {
+                if Code::probe(lexeme) {
+                    state.context.inline = InlineContext::Code;
+                    tokens.push(Token::Code(Code::new(true)));
+                    continue;
+                } else if Anchor::probe(lexeme) {
+                    state.context.inline = InlineContext::Anchor;
+                    state.buffers.anchor.clear();
+
+                    if lexeme.match_first_char('|') {
+                        state.buffers.anchor.candidate.leading = true;
+                    } else {
+                        state.buffers.anchor.candidate.text = lexeme.text();
+                    }
+                    continue;
+                }
+            },
+            InlineContext::Code => {
+                if Code::probe(lexeme) {
+                    state.context.inline = InlineContext::None;
+                    tokens.push(Token::Code(Code::new(false)));
+                    continue;
+                }
+            },
+            InlineContext::Anchor => {
+                let buffer = &mut state.buffers.anchor;
+                let candidate = &mut buffer.candidate;
+                if candidate.text.is_empty() {
+                    if lexeme.next == "|" {
+                        buffer.text.push_str(&lexeme.text());
+                        candidate.text.clone_from(&buffer.text);
+                    } else {
+                        buffer.text.push_str(&lexeme.text());
+                    }
+                    continue;
+                } else if candidate.destination.is_none() {
+                    // candidate is leading and we found the second pipe
+                    if candidate.leading && lexeme.text() == "|" {
+                        // whitespace after pipe: flanking node anchor
+                        if lexeme.is_next_whitespace() {
+                            candidate.destination =
+                                Some(candidate.text.clone());
+                            let token = Token::Anchor(candidate.clone());
+                            tokens.push(token);
+                            state.context.inline = InlineContext::None;
+                        // non-whitespace after pipe is the destination
+                        } else {
+                            candidate.destination = Some(lexeme.next.clone());
+                            let token = Token::Anchor(candidate.clone());
+                            tokens.push(token);
+                            state.context.inline = InlineContext::None;
+                            // skip the next lexeme, plus a trailing pipe if any
+                            if let Some(next) = iterator.next()
+                                && next.next == "|"
+                            {
+                                iterator.next();
+                            }
+                        }
+                    // candidate is nonleading and we found a second pipe
+                    } else if !candidate.leading && lexeme.next == "|" {
+                        candidate.destination = Some(lexeme.text());
+                        tokens.push(Token::Anchor(candidate.clone()));
+                        state.context.inline = InlineContext::None;
+                        iterator.next();
+                    // candidate is nonleading and we found whitespace
+                    } else if lexeme.is_next_whitespace() {
+                        candidate.destination = Some(lexeme.text());
+                        let token = Token::Anchor(candidate.clone());
+                        tokens.push(token);
+                        state.context.inline = InlineContext::None;
+                    // candidate is nonleading and we haven't found whitespace
+                    } else {
+                        buffer.destination.push_str(&lexeme.text());
+                    }
+                    continue;
+                } else {
+                    unreachable!("Anchor is already fully parsed");
                }
            },
        }
@ -118,6 +167,68 @@ fn lex(text: &str, map: LexMap) -> Vec<Token> {
    tokens
 }

+enum BlockContext {
+    Paragraph,
+    Header(u8),
+    PreFormat,
+    None,
+}
+
+enum InlineContext {
+    Anchor,
+    Code,
+    None,
+}
+
+struct State {
+    context: Context,
+    dom_ids: HashMap<String, Vec<String>>,
+    buffers: Buffers,
+}
+
+struct Buffers {
+    anchor: AnchorBuffer,
+}
+
+#[derive(Debug)]
+struct AnchorBuffer {
+    candidate: Anchor,
+    text: String,
+    destination: String,
+}
+
+impl AnchorBuffer {
+    fn clear(&mut self) {
+        self.candidate = Anchor::empty();
+        self.text = String::new();
+        self.destination = String::new();
+    }
+}
+
+impl State {
+    fn new() -> State {
+        State {
+            context: Context {
+                inline: InlineContext::None,
+                block: BlockContext::None,
+            },
+            dom_ids: HashMap::new(),
+            buffers: Buffers {
+                anchor: AnchorBuffer {
+                    candidate: Anchor::empty(),
+                    text: String::new(),
+                    destination: String::new(),
+                },
+            },
+        }
+    }
+}
+
+struct Context {
+    block: BlockContext,
+    inline: InlineContext,
+}
+
 fn parse(tokens: &[Token]) -> String {
    tokens.iter().map(Token::render).collect::<String>()
 }
--- a/src/syntax/content/parser/cluster.rs
+++ b/src/syntax/content/parser/cluster.rs
@ -1,192 +0,0 @@
-use crate::prelude::*;
-
-pub fn cluster(text: &str) -> Vec<String> {
-    let words: Vec<String> = text
-        .replace("\n", " \n ")
-        .split(' ')
-        .map(str::to_string)
-        .collect();
-
-    let mut clusters: Vec<String> = vec![];
-    let mut raw_context = false;
-
-    let mut iterator = words.into_iter().peekable();
-    while let Some(word) = iterator.next() {
-        log!("Iterating: {word:?}");
-
-        if word == "`" {
-            raw_context = !raw_context;
-            log!("Raw context is now {raw_context}");
-        } else if raw_context {
-            log!("Skip: In raw context");
-            clusters.push(word);
-            continue;
-        }
-
-        let Some(delimiter) = delimiter::match_delimiter(&word) else {
-            log!("Skip: {word:?} does not have a delimiter");
-            clusters.push(word);
-            continue;
-        };
-
-        if !delimiter.leading && !word.starts_with(delimiter.char) {
-            clusters.push(word);
-            continue;
-        }
-
-        if (!delimiter.greedy
-            && !delimiter.triple
-            && word.matches(delimiter.char).count() == 2)
-            || (delimiter.triple
-                && (2..=3).contains(&word.matches(delimiter.char).count()))
-        {
-            log!("Skip: {word:?} is almost atomic, but must be split");
-            match word.rsplit_once(delimiter.char) {
-                Some((head, tail)) => {
-                    log!("Pushing head {head:?}, tail {tail:?} into clusters");
-                    clusters.push(format!("{head}{}", delimiter.char));
-                    clusters.push(tail.to_string());
-                    continue;
-                },
-                None => unreachable!(),
-            }
-        }
-
-        if let Some(next) = iterator.peek()
-            && next == "\n"
-            && delimiter.greedy
-        {
-            log!("Skip: Next {next:?} is a break, delimiter is greedy");
-            clusters.push(word);
-            continue;
-        }
-
-        if word.starts_with(&delimiter.string)
-            && word.ends_with(&delimiter.string)
-        {
-            log!("Skip: {word:?} is atomically-delimited");
-            clusters.push(word);
-            continue;
-        }
-
-        log!("Found cluster from {delimiter:?} in {word:?}");
-        let mut parts: Vec<String> = vec![word.clone()];
-        log!("Seeking from a base of {parts:?}");
-
-        while let Some(next) = iterator.peek() {
-            if next.contains(&delimiter.char.to_string()) {
-                log!("Found end of cluster: {next:?}");
-                if delimiter.greedy
-                    && delimiter.triple
-                    && next.matches(delimiter.char).count() > 1
-                {
-                    match next.rsplit_once(delimiter.char) {
-                        Some((head, tail)) => {
-                            log!(
-                                "Pushing head {head:?} of greedy triple EOC \
-                                into parts and tail {tail:?} into clusters"
-                            );
-                            parts.push(format!("{head}{}", delimiter.char));
-                            clusters.push(parts.join(" "));
-                            clusters.push(tail.to_string());
-                            log!("Breaking past clusters {clusters:?}");
-                            iterator.next();
-                            break;
-                        },
-                        None => unreachable!(),
-                    }
-                } else if delimiter.greedy {
-                    log!("Pushing end of cluster into parts");
-                    parts.push(
-                        iterator.next().unwrap_or_else(|| unreachable!()),
-                    );
-                    log!("Pushing parts {parts:?} into clusters {clusters:?}");
-                    clusters.push(parts.join(" "));
-                    log!("Breaking past clusters {clusters:?}");
-                    break;
-                } else {
-                    match next.rsplit_once(delimiter.char) {
-                        Some((head, tail)) => {
-                            log!(
-                                "Pushing head {head:?} of humble end of \
-                                cluster into parts"
-                            );
-                            parts.push(format!("{head}{}", delimiter.char));
-                            log!("Pushing parts into clusters");
-                            clusters.push(parts.join(" "));
-                            log!("Pushing tail {tail:?} into clusters");
-                            clusters.push(tail.to_string());
-                            log!("Breaking past clusters");
-                            iterator.next();
-                            break;
-                        },
-                        // is this one really unreachable?
-                        None => unreachable!(),
-                    }
-                }
-            } else {
-                log!("No delimiter: Pushing {:?} into parts", iterator.peek());
-                parts.push(iterator.next().unwrap_or_default());
-                log!("Seeking a boundary for parts {parts:?}");
-            }
-        }
-    }
-
-    log!("Returning clusters");
-    clusters
-}
-
-mod delimiter {
-
-    #[derive(Debug, Clone)]
-    pub struct Delimiter {
-        pub char: char,
-        pub string: String,
-        pub greedy: bool,
-        pub triple: bool,
-        pub leading: bool,
-    }
-
-    fn make_delimiters() -> (Vec<Delimiter>, Vec<Delimiter>) {
-        let delimiters = [
-            Delimiter {
-                char: '|',
-                string: "|".to_string(),
-                greedy: true,
-                triple: true,
-                leading: false,
-            },
-            Delimiter {
-                char: '`',
-                string: "`".to_string(),
-                greedy: false,
-                triple: false,
-                leading: true,
-            },
-        ];
-
-        (
-            delimiters.iter().filter(|d| d.leading).cloned().collect(),
-            delimiters.iter().filter(|d| !d.leading).cloned().collect(),
-        )
-    }
-
-    pub fn match_delimiter(word: &str) -> Option<Delimiter> {
-        let (leading, nonleading) = make_delimiters();
-
-        let first_char = word.chars().next()?;
-
-        if let Some(leading_match) =
-            leading.iter().find(|d| d.char == first_char).cloned()
-        {
-            Some(leading_match)
-        } else {
-            for delimiter in nonleading {
-                if word.contains(delimiter.char) {
-                    return Some(delimiter);
-                }
-            }
-            None
-        }
-    }
-}
--- a/src/syntax/content/parser/lexeme.rs
+++ b/src/syntax/content/parser/lexeme.rs
@ -16,6 +16,26 @@ impl Lexeme {
        self.text.clone()
    }

+    pub fn is_whitespace(&self) -> bool {
+        self.text == " " || self.text == "\n"
+    }
+
+    pub fn is_next_whitespace(&self) -> bool {
+        self.next == " " || self.next == "\n"
+    }
+
+    pub fn match_first_char(&self, query: char) -> bool {
+        if let Some(first) = self.text.chars().nth(0) {
+            first == query
+        } else {
+            false
+        }
+    }
+
+    pub fn next_first_char(&self) -> Option<char> {
+        self.next.chars().nth(0)
+    }
+
    /// # Panics
    /// Panics if the number of chars in a single lexeme exceeds `i32::MAX`
    pub fn count_char(&self, c: char) -> i32 {
--- a/src/syntax/content/parser/segment.rs
+++ b/src/syntax/content/parser/segment.rs
@ -0,0 +1,199 @@
+pub fn segment(text: &str) -> Vec<String> {
+    delimiter::atomize(text)
+}
+
+mod delimiter {
+
+    fn make_delimiters() -> Vec<char> {
+        vec!['\n', ' ', '`', '|']
+    }
+
+    pub fn atomize(text: &str) -> Vec<String> {
+        let delimiters = make_delimiters();
+        text.chars().fold(
+            Vec::new(),
+            |mut accumulator: Vec<String>, character| {
+                if delimiters.contains(&character) {
+                    accumulator.push(character.to_string());
+                } else if let Some(last) = accumulator.last_mut() {
+                    if delimiters
+                        .iter()
+                        .map(char::to_string)
+                        .filter(|d| d == last)
+                        .count()
+                        > 0
+                    {
+                        accumulator.push(character.to_string());
+                    } else {
+                        last.push(character);
+                    }
+                } else {
+                    accumulator.push(character.to_string());
+                }
+                accumulator
+            },
+        )
+    }
+
+    #[cfg(test)]
+    mod tests {
+        use super::*;
+
+        #[test]
+        fn atomize_words() {
+            let words = "    justification for  the actions   of those  who hold authority   inevitably dwindles  "; // 2
+            let actual = atomize(words);
+            let expected = vec![
+                " ",
+                " ",
+                " ",
+                " ",
+                "justification",
+                " ",
+                "for",
+                " ",
+                " ",
+                "the",
+                " ",
+                "actions",
+                " ",
+                " ",
+                " ",
+                "of",
+                " ",
+                "those",
+                " ",
+                " ",
+                "who",
+                " ",
+                "hold",
+                " ",
+                "authority",
+                " ",
+                " ",
+                " ",
+                "inevitably",
+                " ",
+                "dwindles",
+                " ",
+                " ",
+            ];
+            assert_eq!(actual, expected);
+        }
+
+        #[test]
+        fn atomize_ticks_no_spaces() {
+            let s = "a`c`adc`dadcdbd`cdb`dcdb`dc`dad`bdc";
+            let actual = atomize(s);
+            let expected = vec![
+                "a", "`", "c", "`", "adc", "`", "dadcdbd", "`", "cdb", "`",
+                "dcdb", "`", "dc", "`", "dad", "`", "bdc",
+            ]
+            .iter()
+            .map(std::string::ToString::to_string)
+            .collect::<Vec<String>>();
+
+            assert_eq!(actual, expected);
+        }
+
+        #[test]
+        fn atomize_ticks_with_spaces() {
+            let s = "a`c`adc`da dcdb d` cdb` dcdb `dc ` d ad ` bdc";
+
+            let actual = atomize(s);
+            let expected = vec![
+                "a", "`", "c", "`", "adc", "`", "da", " ", "dcdb", " ", "d",
+                "`", " ", "cdb", "`", " ", "dcdb", " ", "`", "dc", " ", "`",
+                " ", "d", " ", "ad", " ", "`", " ", "bdc",
+            ]
+            .iter()
+            .map(std::string::ToString::to_string)
+            .collect::<Vec<String>>();
+            assert_eq!(actual, expected);
+        }
+
+        #[test]
+        fn atomize_pipes() {
+            let s = "every other |time| as it was perceived";
+            let actual = atomize(s);
+            let expected = vec![
+                "every",
+                " ",
+                "other",
+                " ",
+                "|",
+                "time",
+                "|",
+                " ",
+                "as",
+                " ",
+                "it",
+                " ",
+                "was",
+                " ",
+                "perceived",
+            ];
+            assert_eq!(actual, expected);
+        }
+
+        #[test]
+        fn atomize_pipes_and_ticks() {
+            let s = "every other |time| as `it could or |perhaps somehow|then or now| it was` perceived";
+            let actual = atomize(s);
+            let expected = vec![
+                "every",
+                " ",
+                "other",
+                " ",
+                "|",
+                "time",
+                "|",
+                " ",
+                "as",
+                " ",
+                "`",
+                "it",
+                " ",
+                "could",
+                " ",
+                "or",
+                " ",
+                "|",
+                "perhaps",
+                " ",
+                "somehow",
+                "|",
+                "then",
+                " ",
+                "or",
+                " ",
+                "now",
+                "|",
+                " ",
+                "it",
+                " ",
+                "was",
+                "`",
+                " ",
+                "perceived",
+            ];
+            assert_eq!(actual, expected);
+        }
+
+        #[test]
+        fn atomize_newlines() {
+            let s = "a`c`adc`da \ndcdb d` cdb` dc\ndb `dc ` d ad ` bdc";
+
+            let actual = atomize(s);
+            let expected = vec![
+                "a", "`", "c", "`", "adc", "`", "da", " ", "\n", "dcdb", " ",
+                "d", "`", " ", "cdb", "`", " ", "dc", "\n", "db", " ", "`",
+                "dc", " ", "`", " ", "d", " ", "ad", " ", "`", " ", "bdc",
+            ]
+            .iter()
+            .map(std::string::ToString::to_string)
+            .collect::<Vec<String>>();
+            assert_eq!(actual, expected);
+        }
+    }
+}
--- a/src/syntax/content/parser/token.rs
+++ b/src/syntax/content/parser/token.rs
@ -9,6 +9,7 @@ pub mod header;
 pub mod preformat;
 pub mod code;

+#[derive(Debug)]
 pub enum Token {
    Anchor(anchor::Anchor),
    Code(code::Code),
--- a/src/syntax/content/parser/token/anchor.rs
+++ b/src/syntax/content/parser/token/anchor.rs
@ -1,98 +1,62 @@
-use crate::prelude::*;
-
 use std::fmt::Display;
+
 use crate::syntax::content::{Parseable, parser::lexeme::Lexeme};

+#[derive(Debug, Clone)]
 pub struct Anchor {
-    text: String,
-    destination: String,
-    sticky: bool,
+    pub text: String,
+    pub destination: Option<String>,
+    pub leading: bool,
 }

 impl Parseable for Anchor {
    fn probe(lexeme: &Lexeme) -> bool {
-        let pipe_count = lexeme.count_char('|');
-        log!("{lexeme:?} has {pipe_count} pipes");
-
-        if !(1..=3).contains(&pipe_count) {
-            log!("Negative: Bad pipe count {pipe_count} in {lexeme:?}");
-            return false;
-        }
-        if lexeme.text().matches("||").count() > 0 {
-            log!("Negative: Contiguous pipes in {lexeme:?}");
-            return false;
-        }
-
-        let parts = Anchor::split_parts(lexeme);
-        if (1..=2).contains(&parts.len()) {
-            log!("Positive: Parts {parts:?} with length {}", parts.len());
-            true
-        } else {
-            log!("Negative: {parts:?} have length {}", parts.len());
-            false
-        }
+        lexeme.text() == "|" || (!lexeme.is_whitespace() && lexeme.next == "|")
    }

-    fn lex(lexeme: &Lexeme) -> Anchor {
-        let parts = Anchor::split_parts(lexeme);
-        log!("Lexing anchor {parts:?}");
-
-        let text = parts.first().unwrap_or_else(|| unreachable!());
-
-        fn try_node_anchor(anchor: &str) -> String {
-            if anchor.contains(":") || anchor.contains("/") {
-                anchor.to_owned()
-            } else {
-                format!("/node/{anchor}")
-            }
-        }
-
-        let destination = match parts.get(1) {
-            Some(d) => try_node_anchor(d),
-            None => try_node_anchor(text),
-        };
-
-        let sticky = [
-            ",", ".", ":", ";", "!", "?", "/", "(", ")", "%", "*", "&", r#"""#,
-            "'",
-        ];
-
-        log!("Lexed anchor: {text} -> {destination}");
-        Anchor {
-            text: text.to_owned(),
-            destination,
-            sticky: sticky.contains(&lexeme.next.as_str()),
-        }
+    fn lex(_lexeme: &Lexeme) -> Anchor {
+        panic!("Attempt to lex an anchor directly from a lexeme");
    }

    fn render(&self) -> String {
-        let space = if self.sticky {
-            String::new()
-        } else {
-            String::from(" ")
+        let Some(ref destination) = self.destination else {
+            panic!(
+                "Attempt to render anchor {self:?} without knowing its destination."
+            )
        };
-        format!(
-            r#"<a href="{}">{}</a>{space}"#,
-            &self.destination, &self.text
-        )
+
+        format!(r#"<a href="{}">{}</a>"#, destination, &self.text)
    }
 }

 impl Anchor {
-    fn split_parts(lexeme: &Lexeme) -> Vec<String> {
-        lexeme
-            .text()
-            .trim_start_matches('|')
-            .trim_end_matches('|')
-            .split('|')
-            .filter(|s| !s.is_empty())
-            .map(str::to_string)
-            .collect()
+    pub fn new(text: &str, destination: &str, spaced: bool) -> Anchor {
+        Anchor {
+            text: text.to_owned(),
+            destination: Some(Anchor::resolve_destination(destination)),
+            leading: spaced,
+        }
+    }
+
+    fn resolve_destination(raw: &str) -> String {
+        if raw.contains(":") || raw.contains("/") {
+            raw.to_owned()
+        } else {
+            format!("/node/{raw}")
+        }
+    }
+
+    pub fn empty() -> Anchor {
+        Anchor {
+            text: String::new(),
+            destination: None,
+            leading: false,
+        }
    }
 }

 impl Display for Anchor {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(f, "Anchor: <{}> to <{}>", &self.text, &self.destination)
+        write!(f, "Anchor: <{}> to <{:?}>", &self.text, &self.destination)
    }
 }
--- a/src/syntax/content/parser/token/code.rs
+++ b/src/syntax/content/parser/token/code.rs
@ -2,42 +2,31 @@ use crate::{
    syntax::content::{Parseable, Lexeme},
 };

+#[derive(Debug)]
 pub struct Code {
-    text: String,
-    sticky: bool,
+    open: bool,
+}
+
+impl Code {
+    pub fn new(open: bool) -> Code {
+        Code { open }
+    }
 }

 impl Parseable for Code {
    fn probe(lexeme: &Lexeme) -> bool {
-        let chars = lexeme.split_chars();
-
-        if let Some(first_char) = chars.first()
-            && let Some(last_char) = chars.last()
-        {
-            *first_char == '`' && *last_char == '`'
-        } else {
-            false
-        }
+        lexeme.text() == "`"
    }

-    fn lex(lexeme: &Lexeme) -> Code {
-        let sticky = [
-            ",", ".", ":", ";", "!", "?", "/", "(", ")", "%", "*", "&", r#"""#,
-            "'",
-        ];
-
-        Code {
-            text: lexeme.text().replace("`", ""),
-            sticky: sticky.contains(&lexeme.next.as_str()),
-        }
+    fn lex(_lexeme: &Lexeme) -> Code {
+        panic!("Attempt to lex a code tag directly from a lexeme")
    }

    fn render(&self) -> String {
-        let space = if self.sticky {
-            String::new()
+        if self.open {
+            String::from("<code>")
        } else {
-            String::from(" ")
-        };
-        format!("<code>{}</code>{space}", self.text)
+            String::from("</code>")
+        }
    }
 }
--- a/src/syntax/content/parser/token/header.rs
+++ b/src/syntax/content/parser/token/header.rs
@ -1,9 +1,18 @@
+use std::{
+    collections::{HashMap, hash_map::Entry},
+    iter::Peekable,
+    slice,
+};
+
 use crate::{
    prelude::*,
+    types::Config,
    syntax::content::{Parseable, Lexeme},
 };
+
 use std::fmt::Display;

+#[derive(Debug)]
 pub struct Header {
    open: Option<bool>,
    level: Level,
@ -19,6 +28,35 @@ impl Header {
        }
    }

+    pub fn make_id(
+        config: &Config,
+        iterator: &mut Peekable<slice::Iter<'_, Lexeme>>,
+        ids: &mut HashMap<String, Vec<String>>,
+    ) -> String {
+        let base_id = match iterator.peek() {
+            Some(next_lexeme)
+                if !config.ascii_dom_ids || next_lexeme.next.is_ascii() =>
+            {
+                next_lexeme.next.to_lowercase()
+            },
+            _ => String::from("h"),
+        };
+
+        match ids.entry(base_id.clone()) {
+            Entry::Occupied(mut occupied) => {
+                let ids_vec = occupied.get_mut();
+                let suffix = ids_vec.len();
+                let id_with_suffix = format!("{base_id}-{suffix}");
+                ids_vec.push(id_with_suffix.clone());
+                id_with_suffix
+            },
+            Entry::Vacant(vacant) => {
+                vacant.insert(vec![base_id.clone()]);
+                base_id
+            },
+        }
+    }
+
    pub fn from_u8(level: u8, open: bool, dom_id: Option<&str>) -> Header {
        Header {
            level: Level::from_u8(level),
@ -27,7 +65,7 @@ impl Header {
        }
    }

-    pub fn get_level(&self) -> u8 {
+    pub fn level(&self) -> u8 {
        match self.level {
            Level::One => 1,
            Level::Two => 2,
@ -92,6 +130,7 @@ impl Display for Header {
    }
 }

+#[derive(Debug)]
 pub enum Level {
    One,
    Two,
--- a/src/syntax/content/parser/token/linebreak.rs
+++ b/src/syntax/content/parser/token/linebreak.rs
@ -3,6 +3,7 @@ use crate::{
    syntax::content::{Parseable, parser::lexeme::Lexeme},
 };

+#[derive(Debug)]
 pub struct LineBreak {}

 impl Parseable for LineBreak {
--- a/src/syntax/content/parser/token/literal.rs
+++ b/src/syntax/content/parser/token/literal.rs
@ -1,6 +1,7 @@
 use std::fmt::Display;
 use crate::syntax::content::{Parseable, parser::lexeme::Lexeme};

+#[derive(Debug)]
 pub struct Literal {
    text: String,
 }
@ -17,12 +18,7 @@ impl Parseable for Literal {
    }

    fn render(&self) -> String {
-        let non_sticky = [" ", "\n"];
-        if non_sticky.contains(&self.text.as_str()) {
-            self.text.clone()
-        } else {
-            format!("{} ", self.text.clone())
-        }
+        self.text.clone()
    }
 }

--- a/src/syntax/content/parser/token/paragraph.rs
+++ b/src/syntax/content/parser/token/paragraph.rs
@ -1,6 +1,7 @@
 use std::fmt::Display;
 use crate::syntax::content::{Parseable, parser::lexeme::Lexeme};

+#[derive(Debug)]
 pub struct Paragraph {
    open: Option<bool>,
 }
@ -14,9 +15,7 @@ impl Paragraph {
 impl Parseable for Paragraph {
    fn probe(lexeme: &Lexeme) -> bool {
        // lexeme for paragraph is any non-whitespace, parser knows the context
-        let raw = lexeme.text();
-        let trimmed = raw.trim();
-        !trimmed.is_empty() && trimmed != "\n"
+        !lexeme.is_whitespace()
    }

    fn lex(_lexeme: &Lexeme) -> Paragraph {
--- a/src/syntax/content/parser/token/preformat.rs
+++ b/src/syntax/content/parser/token/preformat.rs
@ -2,6 +2,7 @@ use crate::{
    syntax::content::{Parseable, Lexeme},
 };

+#[derive(Debug)]
 pub struct PreFormat {
    open: Option<bool>,
 }
--- a/src/syntax/content/parser/token/span.rs
+++ b/src/syntax/content/parser/token/span.rs
@ -1,6 +1,7 @@
 use std::fmt::Display;
 use crate::syntax::content::{Parseable, parser::lexeme::Lexeme};

+#[derive(Debug)]
 pub struct Span {
    open: Option<bool>,
 }
--- a/static/graph.toml
+++ b/static/graph.toml
@ -132,7 +132,7 @@ For example:
 docs|/node/Documentation
 `

-If the left side contains spaces, you need a leading `|` character. In this case, the space on the left side is mandatory:
+If the left side contains spaces, you need a leading `|` character:

 `
 |en docs|https://en.jutty.dev/node/Documentation
@ -141,12 +141,12 @@ If the left side contains spaces, you need a leading `|` character. In this case
 If you have a trailing character that you don't want to be considered as part of the destination, you can separate it with a third `|`:

 `
-This |gem|PreciousStone|, though green, was not an emerald.
+This gem|PreciousStone|, though green, was not an emerald.
 `

 Which renders as:

-This |gem|PreciousStone|, though green, was not an emerald.
+This gem|PreciousStone|, though green, was not an emerald.

 ### Node anchors

@ -169,14 +169,15 @@ Because en can resolve IDs case insensitively (with priority to case-sensitive m
 In summary, all of the anchors below are valid and lead to the same page:

 `
+|en Syntax|https://en.jutty.dev/node/Syntax|
 |en Syntax|https://en.jutty.dev/node/Syntax
 Syntax|https://en.jutty.dev/node/Syntax

-|en Syntax|/node/Syntax
-Syntax|/node/Syntax
+Syntax|/node/syntax

-Syntax|Syntax
-syntax|syntax
+|syntax|Syntax
+Syntax|syntax
+Syntax|syntax|

 |Syntax|
 |syntax|
@ -312,23 +313,23 @@ We saw example `docs|/node/Documentation`, but shorter syntax exists.
 #### Epistēmē
 #### Epistēmē

+|en Syntax|https://en.jutty.dev/node/Syntax|
 |en Syntax|https://en.jutty.dev/node/Syntax
 Syntax|https://en.jutty.dev/node/Syntax

-|en Syntax|/node/Syntax
-Syntax|/node/Syntax
+Syntax|/node/syntax

-Syntax|Syntax
-syntax|syntax
+|syntax|Syntax
+Syntax|syntax
+Syntax|syntax|

 |Syntax|
 |syntax|
 """

 [meta.config]
+content_language = "en"
 footer_credits = false
 footer_text = """
 made by jutty|https://jutty.dev • acknowledgements|Acknowledgments • |source code|https://codeberg.org/jutty/en
 """
-
-