Rework token segmentation

2025-12-23 21:40:57 -03:00 · 2025-12-23 21:40:57 -03:00 · 8b782d6d20
commit 8b782d6d20
parent a33d9cb1e1
16 changed files with 497 additions and 385 deletions
--- a/src/syntax/content/parser.rs
+++ b/src/syntax/content/parser.rs
@ -1,4 +1,4 @@
-use std::collections::{HashMap, hash_map::Entry};
+use std::collections::{HashMap};

 use crate::{formats::populate_graph, types::Config};

@ -11,98 +11,147 @@ use lexeme::Lexeme;

 pub mod token;
 pub mod lexeme;
-pub mod cluster;
+pub mod segment;

 const LEXMAP: LexMap = &[
    (LineBreak::probe, |word| {
        Token::LineBreak(LineBreak::lex(word))
    }),
-    (Code::probe, |word| Token::Code(Code::lex(word))),
-    (Anchor::probe, |word| Token::Anchor(Anchor::lex(word))),
    (Literal::probe, |word| Token::Literal(Literal::lex(word))),
 ];

-enum Context {
-    None,
-    Paragraph,
-    Header(u8),
-    PreFormat,
-}
-
-struct State {
-    context: Context,
-    dom_ids: HashMap<String, Vec<String>>,
-}
-
 fn lex(text: &str, map: LexMap) -> Vec<Token> {
    let mut tokens: Vec<Token> = Vec::new();
-    let mut state = State {
-        context: Context::None,
-        dom_ids: HashMap::new(),
-    };
+    let mut state = State::new();
    let config: Config = populate_graph().meta.config;

-    let splits = cluster::cluster(text);
-    let lexemes = Lexeme::collect(&splits);
-    let iter = lexemes.iter().peekable();
-    for lexeme in iter {
-        match state.context {
-            Context::None => {
+    let segments = segment::segment(text);
+    let lexemes = Lexeme::collect(&segments);
+
+    let mut iterator = lexemes.iter().peekable();
+    while let Some(lexeme) = iterator.next() {
+        match state.context.block {
+            BlockContext::None => {
                if PreFormat::probe(lexeme) {
+                    state.context.block = BlockContext::PreFormat;
                    tokens.push(Token::PreFormat(PreFormat::new(true)));
-                    state.context = Context::PreFormat;
                    continue;
                } else if Header::probe(lexeme) {
-                    let base_id =
-                        if config.ascii_dom_ids && !lexeme.next.is_ascii() {
-                            String::from("h")
-                        } else {
-                            lexeme.next.clone().to_lowercase()
-                        };
-                    let id = match state.dom_ids.entry(base_id.clone()) {
-                        Entry::Occupied(mut occupied) => {
-                            let ids = occupied.get_mut();
-                            let suffix: u8 =
-                                ids.len().try_into().unwrap_or_default();
-                            let id_with_suffix = format!("{base_id}-{suffix}");
-                            ids.push(id_with_suffix.clone());
-                            id_with_suffix
-                        },
-                        Entry::Vacant(vacant) => {
-                            vacant.insert(vec![base_id.clone()]);
-                            base_id
-                        },
-                    };
-
                    let mut header = Header::lex(lexeme);
-                    header.dom_id = Some(id);
-                    state.context = Context::Header(header.get_level());
+                    header.dom_id = Some(Header::make_id(
+                        &config,
+                        &mut iterator,
+                        &mut state.dom_ids,
+                    ));
+                    state.context.block = BlockContext::Header(header.level());
                    tokens.push(Token::Header(header));
                    continue;
                } else if Paragraph::probe(lexeme) {
+                    state.context.block = BlockContext::Paragraph;
                    tokens.push(Token::Paragraph(Paragraph::new(true)));
-                    state.context = Context::Paragraph;
                }
            },
-            Context::PreFormat => {
+            BlockContext::PreFormat => {
                if PreFormat::probe(lexeme) {
                    tokens.push(Token::PreFormat(PreFormat::new(false)));
-                    state.context = Context::None;
+                    state.context.block = BlockContext::None;
                } else {
                    tokens.push(Token::Literal(Literal::lex(lexeme)));
                }
                continue;
            },
-            Context::Paragraph => {
+            BlockContext::Paragraph => {
                if lexeme.text() == "\n" {
                    tokens.push(Token::Paragraph(Paragraph::new(false)));
-                    state.context = Context::None;
+                    state.context.block = BlockContext::None;
                }
            },
-            Context::Header(n) => {
+            BlockContext::Header(n) => {
                if lexeme.text() == "\n" {
                    tokens.push(Token::Header(Header::from_u8(n, false, None)));
-                    state.context = Context::None;
+                    state.context.block = BlockContext::None;
+                }
+            },
+        }
+
+        match state.context.inline {
+            InlineContext::None => {
+                if Code::probe(lexeme) {
+                    state.context.inline = InlineContext::Code;
+                    tokens.push(Token::Code(Code::new(true)));
+                    continue;
+                } else if Anchor::probe(lexeme) {
+                    state.context.inline = InlineContext::Anchor;
+                    state.buffers.anchor.clear();
+
+                    if lexeme.match_first_char('|') {
+                        state.buffers.anchor.candidate.leading = true;
+                    } else {
+                        state.buffers.anchor.candidate.text = lexeme.text();
+                    }
+                    continue;
+                }
+            },
+            InlineContext::Code => {
+                if Code::probe(lexeme) {
+                    state.context.inline = InlineContext::None;
+                    tokens.push(Token::Code(Code::new(false)));
+                    continue;
+                }
+            },
+            InlineContext::Anchor => {
+                let buffer = &mut state.buffers.anchor;
+                let candidate = &mut buffer.candidate;
+                if candidate.text.is_empty() {
+                    if lexeme.next == "|" {
+                        buffer.text.push_str(&lexeme.text());
+                        candidate.text.clone_from(&buffer.text);
+                    } else {
+                        buffer.text.push_str(&lexeme.text());
+                    }
+                    continue;
+                } else if candidate.destination.is_none() {
+                    // candidate is leading and we found the second pipe
+                    if candidate.leading && lexeme.text() == "|" {
+                        // whitespace after pipe: flanking node anchor
+                        if lexeme.is_next_whitespace() {
+                            candidate.destination =
+                                Some(candidate.text.clone());
+                            let token = Token::Anchor(candidate.clone());
+                            tokens.push(token);
+                            state.context.inline = InlineContext::None;
+                        // non-whitespace after pipe is the destination
+                        } else {
+                            candidate.destination = Some(lexeme.next.clone());
+                            let token = Token::Anchor(candidate.clone());
+                            tokens.push(token);
+                            state.context.inline = InlineContext::None;
+                            // if there is a trailing pipe, consume it
+                            if let Some(next) = iterator.next()
+                                && next.next == "|"
+                            {
+                                iterator.next();
+                            }
+                        }
+                    // candidate is nonleading and we found a second pipe
+                    } else if !candidate.leading && lexeme.next == "|" {
+                        candidate.destination = Some(lexeme.text());
+                        tokens.push(Token::Anchor(candidate.clone()));
+                        state.context.inline = InlineContext::None;
+                        iterator.next();
+                    // candidate is nonleading and we found whitespace
+                    } else if lexeme.is_next_whitespace() {
+                        candidate.destination = Some(lexeme.text());
+                        let token = Token::Anchor(candidate.clone());
+                        tokens.push(token);
+                        state.context.inline = InlineContext::None;
+                    // candidate is nonleading and we haven't found whitespace
+                    } else {
+                        buffer.destination.push_str(&lexeme.text());
+                    }
+                    continue;
+                } else {
+                    unreachable!("Anchor is already fully parsed");
                }
            },
        }
@ -118,6 +167,68 @@ fn lex(text: &str, map: LexMap) -> Vec<Token> {
    tokens
 }

+enum BlockContext { // block-level construct lex() is currently inside
+    Paragraph, // opened by Paragraph::probe; closed by a "\n" lexeme
+    Header(u8), // payload is header.level(), reused for the closing token
+    PreFormat, // entered and left on PreFormat::probe matches
+    None, // no block open; the state State::new() starts in
+}
+
+enum InlineContext { // inline construct lex() is currently inside
+    Anchor, // set on Anchor::probe; lexemes then fill state.buffers.anchor
+    Code, // toggled on and off by Code::probe matches
+    None, // no inline construct open; the initial state
+}
+
+struct State { // mutable state created once per lex() call
+    context: Context, // current block and inline context
+    dom_ids: HashMap<String, Vec<String>>, // passed to Header::make_id
+    buffers: Buffers, // scratch space for tokens spanning several lexemes
+}
+
+struct Buffers { // scratch buffers held in State
+    anchor: AnchorBuffer, // cleared whenever lex() starts a new anchor
+}
+
+#[derive(Debug)]
+struct AnchorBuffer { // working data for the anchor being parsed
+    candidate: Anchor, // anchor under construction; cloned into the token
+    text: String, // lexeme text gathered until candidate.text is set
+    destination: String, // destination text gathered across lexemes
+}
+
+impl AnchorBuffer {
+    fn clear(&mut self) { // reset all fields before parsing a new anchor
+        self.candidate = Anchor::empty();
+        self.text.clear(); // clear() keeps the allocation for reuse
+        self.destination.clear();
+    }
+}
+
+impl State {
+    fn new() -> State { // fresh state: no open context, empty buffers
+        let anchor = AnchorBuffer {
+            candidate: Anchor::empty(),
+            text: String::new(),
+            destination: String::new(),
+        };
+        let context = Context {
+            block: BlockContext::None,
+            inline: InlineContext::None,
+        };
+        State {
+            context,
+            dom_ids: HashMap::new(),
+            buffers: Buffers { anchor },
+        }
+    }
+}
+
+struct Context { // the two context dimensions tracked while lexing
+    block: BlockContext, // matched first in each lex() iteration
+    inline: InlineContext, // matched second, unless the block arm continued
+}
+
 fn parse(tokens: &[Token]) -> String {
    tokens.iter().map(Token::render).collect::<String>()
 }