Rework token segmentation
This commit is contained in:
parent
a33d9cb1e1
commit
8b782d6d20
16 changed files with 497 additions and 385 deletions
|
|
@ -136,12 +136,3 @@ pub fn deserialize_graph(in_format: &Format, serial: &str) -> Graph {
|
|||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
#[test]
|
||||
fn smoke() {
|
||||
let n = true;
|
||||
assert!(n);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -49,12 +49,3 @@ async fn main() -> io::Result<()> {
|
|||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
#[test]
|
||||
fn smoke() {
|
||||
let e = true;
|
||||
assert!(e);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
use std::collections::{HashMap, hash_map::Entry};
|
||||
use std::collections::{HashMap};
|
||||
|
||||
use crate::{formats::populate_graph, types::Config};
|
||||
|
||||
|
|
@ -11,98 +11,147 @@ use lexeme::Lexeme;
|
|||
|
||||
pub mod token;
|
||||
pub mod lexeme;
|
||||
pub mod cluster;
|
||||
pub mod segment;
|
||||
|
||||
const LEXMAP: LexMap = &[
|
||||
(LineBreak::probe, |word| {
|
||||
Token::LineBreak(LineBreak::lex(word))
|
||||
}),
|
||||
(Code::probe, |word| Token::Code(Code::lex(word))),
|
||||
(Anchor::probe, |word| Token::Anchor(Anchor::lex(word))),
|
||||
(Literal::probe, |word| Token::Literal(Literal::lex(word))),
|
||||
];
|
||||
|
||||
enum Context {
|
||||
None,
|
||||
Paragraph,
|
||||
Header(u8),
|
||||
PreFormat,
|
||||
}
|
||||
|
||||
struct State {
|
||||
context: Context,
|
||||
dom_ids: HashMap<String, Vec<String>>,
|
||||
}
|
||||
|
||||
fn lex(text: &str, map: LexMap) -> Vec<Token> {
|
||||
let mut tokens: Vec<Token> = Vec::new();
|
||||
let mut state = State {
|
||||
context: Context::None,
|
||||
dom_ids: HashMap::new(),
|
||||
};
|
||||
let mut state = State::new();
|
||||
let config: Config = populate_graph().meta.config;
|
||||
|
||||
let splits = cluster::cluster(text);
|
||||
let lexemes = Lexeme::collect(&splits);
|
||||
let iter = lexemes.iter().peekable();
|
||||
for lexeme in iter {
|
||||
match state.context {
|
||||
Context::None => {
|
||||
let segments = segment::segment(text);
|
||||
let lexemes = Lexeme::collect(&segments);
|
||||
|
||||
let mut iterator = lexemes.iter().peekable();
|
||||
while let Some(lexeme) = iterator.next() {
|
||||
match state.context.block {
|
||||
BlockContext::None => {
|
||||
if PreFormat::probe(lexeme) {
|
||||
state.context.block = BlockContext::PreFormat;
|
||||
tokens.push(Token::PreFormat(PreFormat::new(true)));
|
||||
state.context = Context::PreFormat;
|
||||
continue;
|
||||
} else if Header::probe(lexeme) {
|
||||
let base_id =
|
||||
if config.ascii_dom_ids && !lexeme.next.is_ascii() {
|
||||
String::from("h")
|
||||
} else {
|
||||
lexeme.next.clone().to_lowercase()
|
||||
};
|
||||
let id = match state.dom_ids.entry(base_id.clone()) {
|
||||
Entry::Occupied(mut occupied) => {
|
||||
let ids = occupied.get_mut();
|
||||
let suffix: u8 =
|
||||
ids.len().try_into().unwrap_or_default();
|
||||
let id_with_suffix = format!("{base_id}-{suffix}");
|
||||
ids.push(id_with_suffix.clone());
|
||||
id_with_suffix
|
||||
},
|
||||
Entry::Vacant(vacant) => {
|
||||
vacant.insert(vec![base_id.clone()]);
|
||||
base_id
|
||||
},
|
||||
};
|
||||
|
||||
let mut header = Header::lex(lexeme);
|
||||
header.dom_id = Some(id);
|
||||
state.context = Context::Header(header.get_level());
|
||||
header.dom_id = Some(Header::make_id(
|
||||
&config,
|
||||
&mut iterator,
|
||||
&mut state.dom_ids,
|
||||
));
|
||||
state.context.block = BlockContext::Header(header.level());
|
||||
tokens.push(Token::Header(header));
|
||||
continue;
|
||||
} else if Paragraph::probe(lexeme) {
|
||||
state.context.block = BlockContext::Paragraph;
|
||||
tokens.push(Token::Paragraph(Paragraph::new(true)));
|
||||
state.context = Context::Paragraph;
|
||||
}
|
||||
},
|
||||
Context::PreFormat => {
|
||||
BlockContext::PreFormat => {
|
||||
if PreFormat::probe(lexeme) {
|
||||
tokens.push(Token::PreFormat(PreFormat::new(false)));
|
||||
state.context = Context::None;
|
||||
state.context.block = BlockContext::None;
|
||||
} else {
|
||||
tokens.push(Token::Literal(Literal::lex(lexeme)));
|
||||
}
|
||||
continue;
|
||||
},
|
||||
Context::Paragraph => {
|
||||
BlockContext::Paragraph => {
|
||||
if lexeme.text() == "\n" {
|
||||
tokens.push(Token::Paragraph(Paragraph::new(false)));
|
||||
state.context = Context::None;
|
||||
state.context.block = BlockContext::None;
|
||||
}
|
||||
},
|
||||
Context::Header(n) => {
|
||||
BlockContext::Header(n) => {
|
||||
if lexeme.text() == "\n" {
|
||||
tokens.push(Token::Header(Header::from_u8(n, false, None)));
|
||||
state.context = Context::None;
|
||||
state.context.block = BlockContext::None;
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
match state.context.inline {
|
||||
InlineContext::None => {
|
||||
if Code::probe(lexeme) {
|
||||
state.context.inline = InlineContext::Code;
|
||||
tokens.push(Token::Code(Code::new(true)));
|
||||
continue;
|
||||
} else if Anchor::probe(lexeme) {
|
||||
state.context.inline = InlineContext::Anchor;
|
||||
state.buffers.anchor.clear();
|
||||
|
||||
if lexeme.match_first_char('|') {
|
||||
state.buffers.anchor.candidate.leading = true;
|
||||
} else {
|
||||
state.buffers.anchor.candidate.text = lexeme.text();
|
||||
}
|
||||
continue;
|
||||
}
|
||||
},
|
||||
InlineContext::Code => {
|
||||
if Code::probe(lexeme) {
|
||||
state.context.inline = InlineContext::None;
|
||||
tokens.push(Token::Code(Code::new(false)));
|
||||
continue;
|
||||
}
|
||||
},
|
||||
InlineContext::Anchor => {
|
||||
let buffer = &mut state.buffers.anchor;
|
||||
let candidate = &mut buffer.candidate;
|
||||
if candidate.text.is_empty() {
|
||||
if lexeme.next == "|" {
|
||||
buffer.text.push_str(&lexeme.text());
|
||||
candidate.text.clone_from(&buffer.text);
|
||||
} else {
|
||||
buffer.text.push_str(&lexeme.text());
|
||||
}
|
||||
continue;
|
||||
} else if candidate.destination.is_none() {
|
||||
// candidate is leading and we found the second pipe
|
||||
if candidate.leading && lexeme.text() == "|" {
|
||||
// whitespace after pipe: flanking node anchor
|
||||
if lexeme.is_next_whitespace() {
|
||||
candidate.destination =
|
||||
Some(candidate.text.clone());
|
||||
let token = Token::Anchor(candidate.clone());
|
||||
tokens.push(token);
|
||||
state.context.inline = InlineContext::None;
|
||||
// non-whitespace after pipe is the destination
|
||||
} else {
|
||||
candidate.destination = Some(lexeme.next.clone());
|
||||
let token = Token::Anchor(candidate.clone());
|
||||
tokens.push(token);
|
||||
state.context.inline = InlineContext::None;
|
||||
// if there is a trailing pipe, consume it
|
||||
if let Some(next) = iterator.next()
|
||||
&& next.next == "|"
|
||||
{
|
||||
iterator.next();
|
||||
}
|
||||
}
|
||||
// candidate is nonleading and we found a second pipe
|
||||
} else if !candidate.leading && lexeme.next == "|" {
|
||||
candidate.destination = Some(lexeme.text());
|
||||
tokens.push(Token::Anchor(candidate.clone()));
|
||||
state.context.inline = InlineContext::None;
|
||||
iterator.next();
|
||||
// candidate is nonleading and we found whitespace
|
||||
} else if lexeme.is_next_whitespace() {
|
||||
candidate.destination = Some(lexeme.text());
|
||||
let token = Token::Anchor(candidate.clone());
|
||||
tokens.push(token);
|
||||
state.context.inline = InlineContext::None;
|
||||
// candidate is nonleading and we haven't found whitespace
|
||||
} else {
|
||||
buffer.destination.push_str(&lexeme.text());
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
unreachable!("Anchor is already fully parsed");
|
||||
}
|
||||
},
|
||||
}
|
||||
|
|
@ -118,6 +167,68 @@ fn lex(text: &str, map: LexMap) -> Vec<Token> {
|
|||
tokens
|
||||
}
|
||||
|
||||
enum BlockContext {
|
||||
Paragraph,
|
||||
Header(u8),
|
||||
PreFormat,
|
||||
None,
|
||||
}
|
||||
|
||||
enum InlineContext {
|
||||
Anchor,
|
||||
Code,
|
||||
None,
|
||||
}
|
||||
|
||||
struct State {
|
||||
context: Context,
|
||||
dom_ids: HashMap<String, Vec<String>>,
|
||||
buffers: Buffers,
|
||||
}
|
||||
|
||||
struct Buffers {
|
||||
anchor: AnchorBuffer,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct AnchorBuffer {
|
||||
candidate: Anchor,
|
||||
text: String,
|
||||
destination: String,
|
||||
}
|
||||
|
||||
impl AnchorBuffer {
|
||||
fn clear(&mut self) {
|
||||
self.candidate = Anchor::empty();
|
||||
self.text = String::new();
|
||||
self.destination = String::new();
|
||||
}
|
||||
}
|
||||
|
||||
impl State {
|
||||
fn new() -> State {
|
||||
State {
|
||||
context: Context {
|
||||
inline: InlineContext::None,
|
||||
block: BlockContext::None,
|
||||
},
|
||||
dom_ids: HashMap::new(),
|
||||
buffers: Buffers {
|
||||
anchor: AnchorBuffer {
|
||||
candidate: Anchor::empty(),
|
||||
text: String::new(),
|
||||
destination: String::new(),
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct Context {
|
||||
block: BlockContext,
|
||||
inline: InlineContext,
|
||||
}
|
||||
|
||||
fn parse(tokens: &[Token]) -> String {
|
||||
tokens.iter().map(Token::render).collect::<String>()
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,192 +0,0 @@
|
|||
use crate::prelude::*;
|
||||
|
||||
pub fn cluster(text: &str) -> Vec<String> {
|
||||
let words: Vec<String> = text
|
||||
.replace("\n", " \n ")
|
||||
.split(' ')
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
|
||||
let mut clusters: Vec<String> = vec![];
|
||||
let mut raw_context = false;
|
||||
|
||||
let mut iterator = words.into_iter().peekable();
|
||||
while let Some(word) = iterator.next() {
|
||||
log!("Iterating: {word:?}");
|
||||
|
||||
if word == "`" {
|
||||
raw_context = !raw_context;
|
||||
log!("Raw context is now {raw_context}");
|
||||
} else if raw_context {
|
||||
log!("Skip: In raw context");
|
||||
clusters.push(word);
|
||||
continue;
|
||||
}
|
||||
|
||||
let Some(delimiter) = delimiter::match_delimiter(&word) else {
|
||||
log!("Skip: {word:?} does not have a delimiter");
|
||||
clusters.push(word);
|
||||
continue;
|
||||
};
|
||||
|
||||
if !delimiter.leading && !word.starts_with(delimiter.char) {
|
||||
clusters.push(word);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!delimiter.greedy
|
||||
&& !delimiter.triple
|
||||
&& word.matches(delimiter.char).count() == 2)
|
||||
|| (delimiter.triple
|
||||
&& (2..=3).contains(&word.matches(delimiter.char).count()))
|
||||
{
|
||||
log!("Skip: {word:?} is almost atomic, but must be split");
|
||||
match word.rsplit_once(delimiter.char) {
|
||||
Some((head, tail)) => {
|
||||
log!("Pushing head {head:?}, tail {tail:?} into clusters");
|
||||
clusters.push(format!("{head}{}", delimiter.char));
|
||||
clusters.push(tail.to_string());
|
||||
continue;
|
||||
},
|
||||
None => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(next) = iterator.peek()
|
||||
&& next == "\n"
|
||||
&& delimiter.greedy
|
||||
{
|
||||
log!("Skip: Next {next:?} is a break, delimiter is greedy");
|
||||
clusters.push(word);
|
||||
continue;
|
||||
}
|
||||
|
||||
if word.starts_with(&delimiter.string)
|
||||
&& word.ends_with(&delimiter.string)
|
||||
{
|
||||
log!("Skip: {word:?} is atomically-delimited");
|
||||
clusters.push(word);
|
||||
continue;
|
||||
}
|
||||
|
||||
log!("Found cluster from {delimiter:?} in {word:?}");
|
||||
let mut parts: Vec<String> = vec![word.clone()];
|
||||
log!("Seeking from a base of {parts:?}");
|
||||
|
||||
while let Some(next) = iterator.peek() {
|
||||
if next.contains(&delimiter.char.to_string()) {
|
||||
log!("Found end of cluster: {next:?}");
|
||||
if delimiter.greedy
|
||||
&& delimiter.triple
|
||||
&& next.matches(delimiter.char).count() > 1
|
||||
{
|
||||
match next.rsplit_once(delimiter.char) {
|
||||
Some((head, tail)) => {
|
||||
log!(
|
||||
"Pushing head {head:?} of greedy triple EOC \
|
||||
into parts and tail {tail:?} into clusters"
|
||||
);
|
||||
parts.push(format!("{head}{}", delimiter.char));
|
||||
clusters.push(parts.join(" "));
|
||||
clusters.push(tail.to_string());
|
||||
log!("Breaking past clusters {clusters:?}");
|
||||
iterator.next();
|
||||
break;
|
||||
},
|
||||
None => unreachable!(),
|
||||
}
|
||||
} else if delimiter.greedy {
|
||||
log!("Pushing end of cluster into parts");
|
||||
parts.push(
|
||||
iterator.next().unwrap_or_else(|| unreachable!()),
|
||||
);
|
||||
log!("Pushing parts {parts:?} into clusters {clusters:?}");
|
||||
clusters.push(parts.join(" "));
|
||||
log!("Breaking past clusters {clusters:?}");
|
||||
break;
|
||||
} else {
|
||||
match next.rsplit_once(delimiter.char) {
|
||||
Some((head, tail)) => {
|
||||
log!(
|
||||
"Pushing head {head:?} of humble end of \
|
||||
cluster into parts"
|
||||
);
|
||||
parts.push(format!("{head}{}", delimiter.char));
|
||||
log!("Pushing parts into clusters");
|
||||
clusters.push(parts.join(" "));
|
||||
log!("Pushing tail {tail:?} into clusters");
|
||||
clusters.push(tail.to_string());
|
||||
log!("Breaking past clusters");
|
||||
iterator.next();
|
||||
break;
|
||||
},
|
||||
// is this one really unreachable?
|
||||
None => unreachable!(),
|
||||
}
|
||||
}
|
||||
} else {
|
||||
log!("No delimiter: Pushing {:?} into parts", iterator.peek());
|
||||
parts.push(iterator.next().unwrap_or_default());
|
||||
log!("Seeking a boundary for parts {parts:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log!("Returning clusters");
|
||||
clusters
|
||||
}
|
||||
|
||||
mod delimiter {
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Delimiter {
|
||||
pub char: char,
|
||||
pub string: String,
|
||||
pub greedy: bool,
|
||||
pub triple: bool,
|
||||
pub leading: bool,
|
||||
}
|
||||
|
||||
fn make_delimiters() -> (Vec<Delimiter>, Vec<Delimiter>) {
|
||||
let delimiters = [
|
||||
Delimiter {
|
||||
char: '|',
|
||||
string: "|".to_string(),
|
||||
greedy: true,
|
||||
triple: true,
|
||||
leading: false,
|
||||
},
|
||||
Delimiter {
|
||||
char: '`',
|
||||
string: "`".to_string(),
|
||||
greedy: false,
|
||||
triple: false,
|
||||
leading: true,
|
||||
},
|
||||
];
|
||||
|
||||
(
|
||||
delimiters.iter().filter(|d| d.leading).cloned().collect(),
|
||||
delimiters.iter().filter(|d| !d.leading).cloned().collect(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn match_delimiter(word: &str) -> Option<Delimiter> {
|
||||
let (leading, nonleading) = make_delimiters();
|
||||
|
||||
let first_char = word.chars().next()?;
|
||||
|
||||
if let Some(leading_match) =
|
||||
leading.iter().find(|d| d.char == first_char).cloned()
|
||||
{
|
||||
Some(leading_match)
|
||||
} else {
|
||||
for delimiter in nonleading {
|
||||
if word.contains(delimiter.char) {
|
||||
return Some(delimiter);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -16,6 +16,26 @@ impl Lexeme {
|
|||
self.text.clone()
|
||||
}
|
||||
|
||||
pub fn is_whitespace(&self) -> bool {
|
||||
self.text == " " || self.text == "\n"
|
||||
}
|
||||
|
||||
pub fn is_next_whitespace(&self) -> bool {
|
||||
self.next == " " || self.next == "\n"
|
||||
}
|
||||
|
||||
pub fn match_first_char(&self, query: char) -> bool {
|
||||
if let Some(first) = self.text.chars().nth(0) {
|
||||
first == query
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
pub fn next_first_char(&self) -> Option<char> {
|
||||
self.next.chars().nth(0)
|
||||
}
|
||||
|
||||
/// # Panics
|
||||
/// Panics if the number of matching chars in a single lexeme exceeds `i32::MAX`
|
||||
pub fn count_char(&self, c: char) -> i32 {
|
||||
|
|
|
|||
199
src/syntax/content/parser/segment.rs
Normal file
199
src/syntax/content/parser/segment.rs
Normal file
|
|
@ -0,0 +1,199 @@
|
|||
pub fn segment(text: &str) -> Vec<String> {
|
||||
delimiter::atomize(text)
|
||||
}
|
||||
|
||||
mod delimiter {
|
||||
|
||||
/// Returns the characters that always form a segment of their own:
/// newline, space, backtick and pipe.
fn make_delimiters() -> Vec<char> {
    vec!['\n', ' ', '`', '|']
}

/// Splits `text` into atomic segments.
///
/// Every delimiter character (see `make_delimiters`) becomes a
/// one-character segment, even when the same delimiter repeats. Each maximal
/// run of non-delimiter characters becomes a single segment. Segments are
/// returned in input order, so concatenating them yields `text` again.
/// Empty input produces an empty vector.
pub fn atomize(text: &str) -> Vec<String> {
    let delimiters = make_delimiters();
    let mut segments: Vec<String> = Vec::new();
    // True while the last pushed segment is a run of non-delimiter
    // characters, i.e. the segment the next plain character must extend.
    // Tracking this avoids re-inspecting (and re-allocating for) the last
    // segment on every character.
    let mut in_run = false;

    for character in text.chars() {
        let is_delimiter = delimiters.contains(&character);

        if in_run && !is_delimiter {
            // `in_run` is only set after a push, so a last segment exists.
            if let Some(run) = segments.last_mut() {
                run.push(character);
            }
        } else {
            segments.push(character.to_string());
        }

        in_run = !is_delimiter;
    }

    segments
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn atomize_words() {
|
||||
let words = " justification for the actions of those who hold authority inevitably dwindles "; // 2
|
||||
let actual = atomize(words);
|
||||
let expected = vec![
|
||||
" ",
|
||||
" ",
|
||||
" ",
|
||||
" ",
|
||||
"justification",
|
||||
" ",
|
||||
"for",
|
||||
" ",
|
||||
" ",
|
||||
"the",
|
||||
" ",
|
||||
"actions",
|
||||
" ",
|
||||
" ",
|
||||
" ",
|
||||
"of",
|
||||
" ",
|
||||
"those",
|
||||
" ",
|
||||
" ",
|
||||
"who",
|
||||
" ",
|
||||
"hold",
|
||||
" ",
|
||||
"authority",
|
||||
" ",
|
||||
" ",
|
||||
" ",
|
||||
"inevitably",
|
||||
" ",
|
||||
"dwindles",
|
||||
" ",
|
||||
" ",
|
||||
];
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn atomize_ticks_no_spaces() {
|
||||
let s = "a`c`adc`dadcdbd`cdb`dcdb`dc`dad`bdc";
|
||||
let actual = atomize(s);
|
||||
let expected = vec![
|
||||
"a", "`", "c", "`", "adc", "`", "dadcdbd", "`", "cdb", "`",
|
||||
"dcdb", "`", "dc", "`", "dad", "`", "bdc",
|
||||
]
|
||||
.iter()
|
||||
.map(std::string::ToString::to_string)
|
||||
.collect::<Vec<String>>();
|
||||
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn atomize_ticks_with_spaces() {
|
||||
let s = "a`c`adc`da dcdb d` cdb` dcdb `dc ` d ad ` bdc";
|
||||
|
||||
let actual = atomize(s);
|
||||
let expected = vec![
|
||||
"a", "`", "c", "`", "adc", "`", "da", " ", "dcdb", " ", "d",
|
||||
"`", " ", "cdb", "`", " ", "dcdb", " ", "`", "dc", " ", "`",
|
||||
" ", "d", " ", "ad", " ", "`", " ", "bdc",
|
||||
]
|
||||
.iter()
|
||||
.map(std::string::ToString::to_string)
|
||||
.collect::<Vec<String>>();
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn atomize_pipes() {
|
||||
let s = "every other |time| as it was perceived";
|
||||
let actual = atomize(s);
|
||||
let expected = vec![
|
||||
"every",
|
||||
" ",
|
||||
"other",
|
||||
" ",
|
||||
"|",
|
||||
"time",
|
||||
"|",
|
||||
" ",
|
||||
"as",
|
||||
" ",
|
||||
"it",
|
||||
" ",
|
||||
"was",
|
||||
" ",
|
||||
"perceived",
|
||||
];
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn atomize_pipes_and_ticks() {
|
||||
let s = "every other |time| as `it could or |perhaps somehow|then or now| it was` perceived";
|
||||
let actual = atomize(s);
|
||||
let expected = vec![
|
||||
"every",
|
||||
" ",
|
||||
"other",
|
||||
" ",
|
||||
"|",
|
||||
"time",
|
||||
"|",
|
||||
" ",
|
||||
"as",
|
||||
" ",
|
||||
"`",
|
||||
"it",
|
||||
" ",
|
||||
"could",
|
||||
" ",
|
||||
"or",
|
||||
" ",
|
||||
"|",
|
||||
"perhaps",
|
||||
" ",
|
||||
"somehow",
|
||||
"|",
|
||||
"then",
|
||||
" ",
|
||||
"or",
|
||||
" ",
|
||||
"now",
|
||||
"|",
|
||||
" ",
|
||||
"it",
|
||||
" ",
|
||||
"was",
|
||||
"`",
|
||||
" ",
|
||||
"perceived",
|
||||
];
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn atomize_newlines() {
|
||||
let s = "a`c`adc`da \ndcdb d` cdb` dc\ndb `dc ` d ad ` bdc";
|
||||
|
||||
let actual = atomize(s);
|
||||
let expected = vec![
|
||||
"a", "`", "c", "`", "adc", "`", "da", " ", "\n", "dcdb", " ",
|
||||
"d", "`", " ", "cdb", "`", " ", "dc", "\n", "db", " ", "`",
|
||||
"dc", " ", "`", " ", "d", " ", "ad", " ", "`", " ", "bdc",
|
||||
]
|
||||
.iter()
|
||||
.map(std::string::ToString::to_string)
|
||||
.collect::<Vec<String>>();
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -9,6 +9,7 @@ pub mod header;
|
|||
pub mod preformat;
|
||||
pub mod code;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Token {
|
||||
Anchor(anchor::Anchor),
|
||||
Code(code::Code),
|
||||
|
|
|
|||
|
|
@ -1,98 +1,62 @@
|
|||
use crate::prelude::*;
|
||||
|
||||
use std::fmt::Display;
|
||||
|
||||
use crate::syntax::content::{Parseable, parser::lexeme::Lexeme};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Anchor {
|
||||
text: String,
|
||||
destination: String,
|
||||
sticky: bool,
|
||||
pub text: String,
|
||||
pub destination: Option<String>,
|
||||
pub leading: bool,
|
||||
}
|
||||
|
||||
impl Parseable for Anchor {
|
||||
fn probe(lexeme: &Lexeme) -> bool {
|
||||
let pipe_count = lexeme.count_char('|');
|
||||
log!("{lexeme:?} has {pipe_count} pipes");
|
||||
|
||||
if !(1..=3).contains(&pipe_count) {
|
||||
log!("Negative: Bad pipe count {pipe_count} in {lexeme:?}");
|
||||
return false;
|
||||
}
|
||||
if lexeme.text().matches("||").count() > 0 {
|
||||
log!("Negative: Contiguous pipes in {lexeme:?}");
|
||||
return false;
|
||||
}
|
||||
|
||||
let parts = Anchor::split_parts(lexeme);
|
||||
if (1..=2).contains(&parts.len()) {
|
||||
log!("Positive: Parts {parts:?} with length {}", parts.len());
|
||||
true
|
||||
} else {
|
||||
log!("Negative: {parts:?} have length {}", parts.len());
|
||||
false
|
||||
}
|
||||
lexeme.text() == "|" || (!lexeme.is_whitespace() && lexeme.next == "|")
|
||||
}
|
||||
|
||||
fn lex(lexeme: &Lexeme) -> Anchor {
|
||||
let parts = Anchor::split_parts(lexeme);
|
||||
log!("Lexing anchor {parts:?}");
|
||||
|
||||
let text = parts.first().unwrap_or_else(|| unreachable!());
|
||||
|
||||
fn try_node_anchor(anchor: &str) -> String {
|
||||
if anchor.contains(":") || anchor.contains("/") {
|
||||
anchor.to_owned()
|
||||
} else {
|
||||
format!("/node/{anchor}")
|
||||
}
|
||||
}
|
||||
|
||||
let destination = match parts.get(1) {
|
||||
Some(d) => try_node_anchor(d),
|
||||
None => try_node_anchor(text),
|
||||
};
|
||||
|
||||
let sticky = [
|
||||
",", ".", ":", ";", "!", "?", "/", "(", ")", "%", "*", "&", r#"""#,
|
||||
"'",
|
||||
];
|
||||
|
||||
log!("Lexed anchor: {text} -> {destination}");
|
||||
Anchor {
|
||||
text: text.to_owned(),
|
||||
destination,
|
||||
sticky: sticky.contains(&lexeme.next.as_str()),
|
||||
}
|
||||
fn lex(_lexeme: &Lexeme) -> Anchor {
|
||||
panic!("Attempt to lex an anchor directly from a lexeme");
|
||||
}
|
||||
|
||||
fn render(&self) -> String {
|
||||
let space = if self.sticky {
|
||||
String::new()
|
||||
} else {
|
||||
String::from(" ")
|
||||
let Some(ref destination) = self.destination else {
|
||||
panic!(
|
||||
"Attempt to render anchor {self:?} without knowing its destination."
|
||||
)
|
||||
};
|
||||
format!(
|
||||
r#"<a href="{}">{}</a>{space}"#,
|
||||
&self.destination, &self.text
|
||||
)
|
||||
|
||||
format!(r#"<a href="{}">{}</a>"#, destination, &self.text)
|
||||
}
|
||||
}
|
||||
|
||||
impl Anchor {
|
||||
fn split_parts(lexeme: &Lexeme) -> Vec<String> {
|
||||
lexeme
|
||||
.text()
|
||||
.trim_start_matches('|')
|
||||
.trim_end_matches('|')
|
||||
.split('|')
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect()
|
||||
pub fn new(text: &str, destination: &str, spaced: bool) -> Anchor {
|
||||
Anchor {
|
||||
text: text.to_owned(),
|
||||
destination: Some(Anchor::resolve_destination(destination)),
|
||||
leading: spaced,
|
||||
}
|
||||
}
|
||||
|
||||
/// Turns a raw anchor destination into the value used as the link target.
///
/// A destination containing `:` or `/` is returned unchanged; anything else
/// is treated as a node name and prefixed with `/node/`.
fn resolve_destination(raw: &str) -> String {
    // A char-array pattern checks both characters in a single scan.
    if raw.contains([':', '/']) {
        raw.to_owned()
    } else {
        format!("/node/{raw}")
    }
}
|
||||
|
||||
pub fn empty() -> Anchor {
|
||||
Anchor {
|
||||
text: String::new(),
|
||||
destination: None,
|
||||
leading: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Anchor {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "Anchor: <{}> to <{}>", &self.text, &self.destination)
|
||||
write!(f, "Anchor: <{}> to <{:?}>", &self.text, &self.destination)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,42 +2,31 @@ use crate::{
|
|||
syntax::content::{Parseable, Lexeme},
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Code {
|
||||
text: String,
|
||||
sticky: bool,
|
||||
open: bool,
|
||||
}
|
||||
|
||||
impl Code {
|
||||
pub fn new(open: bool) -> Code {
|
||||
Code { open }
|
||||
}
|
||||
}
|
||||
|
||||
impl Parseable for Code {
|
||||
fn probe(lexeme: &Lexeme) -> bool {
|
||||
let chars = lexeme.split_chars();
|
||||
|
||||
if let Some(first_char) = chars.first()
|
||||
&& let Some(last_char) = chars.last()
|
||||
{
|
||||
*first_char == '`' && *last_char == '`'
|
||||
} else {
|
||||
false
|
||||
}
|
||||
lexeme.text() == "`"
|
||||
}
|
||||
|
||||
fn lex(lexeme: &Lexeme) -> Code {
|
||||
let sticky = [
|
||||
",", ".", ":", ";", "!", "?", "/", "(", ")", "%", "*", "&", r#"""#,
|
||||
"'",
|
||||
];
|
||||
|
||||
Code {
|
||||
text: lexeme.text().replace("`", ""),
|
||||
sticky: sticky.contains(&lexeme.next.as_str()),
|
||||
}
|
||||
fn lex(_lexeme: &Lexeme) -> Code {
|
||||
panic!("Attempt to lex a code tag directly from a lexeme")
|
||||
}
|
||||
|
||||
fn render(&self) -> String {
|
||||
let space = if self.sticky {
|
||||
String::new()
|
||||
if self.open {
|
||||
String::from("<code>")
|
||||
} else {
|
||||
String::from(" ")
|
||||
};
|
||||
format!("<code>{}</code>{space}", self.text)
|
||||
String::from("</code>")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,9 +1,18 @@
|
|||
use std::{
|
||||
collections::{HashMap, hash_map::Entry},
|
||||
iter::Peekable,
|
||||
slice,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
prelude::*,
|
||||
types::Config,
|
||||
syntax::content::{Parseable, Lexeme},
|
||||
};
|
||||
|
||||
use std::fmt::Display;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Header {
|
||||
open: Option<bool>,
|
||||
level: Level,
|
||||
|
|
@ -19,6 +28,35 @@ impl Header {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn make_id(
|
||||
config: &Config,
|
||||
iterator: &mut Peekable<slice::Iter<'_, Lexeme>>,
|
||||
ids: &mut HashMap<String, Vec<String>>,
|
||||
) -> String {
|
||||
let base_id = match iterator.peek() {
|
||||
Some(next_lexeme)
|
||||
if !config.ascii_dom_ids || next_lexeme.next.is_ascii() =>
|
||||
{
|
||||
next_lexeme.next.to_lowercase()
|
||||
},
|
||||
_ => String::from("h"),
|
||||
};
|
||||
|
||||
match ids.entry(base_id.clone()) {
|
||||
Entry::Occupied(mut occupied) => {
|
||||
let ids_vec = occupied.get_mut();
|
||||
let suffix = ids_vec.len();
|
||||
let id_with_suffix = format!("{base_id}-{suffix}");
|
||||
ids_vec.push(id_with_suffix.clone());
|
||||
id_with_suffix
|
||||
},
|
||||
Entry::Vacant(vacant) => {
|
||||
vacant.insert(vec![base_id.clone()]);
|
||||
base_id
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_u8(level: u8, open: bool, dom_id: Option<&str>) -> Header {
|
||||
Header {
|
||||
level: Level::from_u8(level),
|
||||
|
|
@ -27,7 +65,7 @@ impl Header {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn get_level(&self) -> u8 {
|
||||
pub fn level(&self) -> u8 {
|
||||
match self.level {
|
||||
Level::One => 1,
|
||||
Level::Two => 2,
|
||||
|
|
@ -92,6 +130,7 @@ impl Display for Header {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Level {
|
||||
One,
|
||||
Two,
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ use crate::{
|
|||
syntax::content::{Parseable, parser::lexeme::Lexeme},
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct LineBreak {}
|
||||
|
||||
impl Parseable for LineBreak {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
use std::fmt::Display;
|
||||
use crate::syntax::content::{Parseable, parser::lexeme::Lexeme};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Literal {
|
||||
text: String,
|
||||
}
|
||||
|
|
@ -17,12 +18,7 @@ impl Parseable for Literal {
|
|||
}
|
||||
|
||||
fn render(&self) -> String {
|
||||
let non_sticky = [" ", "\n"];
|
||||
if non_sticky.contains(&self.text.as_str()) {
|
||||
self.text.clone()
|
||||
} else {
|
||||
format!("{} ", self.text.clone())
|
||||
}
|
||||
self.text.clone()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
use std::fmt::Display;
|
||||
use crate::syntax::content::{Parseable, parser::lexeme::Lexeme};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Paragraph {
|
||||
open: Option<bool>,
|
||||
}
|
||||
|
|
@ -14,9 +15,7 @@ impl Paragraph {
|
|||
impl Parseable for Paragraph {
|
||||
fn probe(lexeme: &Lexeme) -> bool {
|
||||
// lexeme for paragraph is any non-whitespace, parser knows the context
|
||||
let raw = lexeme.text();
|
||||
let trimmed = raw.trim();
|
||||
!trimmed.is_empty() && trimmed != "\n"
|
||||
!lexeme.is_whitespace()
|
||||
}
|
||||
|
||||
fn lex(_lexeme: &Lexeme) -> Paragraph {
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ use crate::{
|
|||
syntax::content::{Parseable, Lexeme},
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PreFormat {
|
||||
open: Option<bool>,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
use std::fmt::Display;
|
||||
use crate::syntax::content::{Parseable, parser::lexeme::Lexeme};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Span {
|
||||
open: Option<bool>,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -132,7 +132,7 @@ For example:
|
|||
docs|/node/Documentation
|
||||
`
|
||||
|
||||
If the left side contains spaces, you need a leading `|` character. In this case, the space on the left side is mandatory:
|
||||
If the left side contains spaces, you need a leading `|` character:
|
||||
|
||||
`
|
||||
|en docs|https://en.jutty.dev/node/Documentation
|
||||
|
|
@ -141,12 +141,12 @@ If the left side contains spaces, you need a leading `|` character. In this case
|
|||
If you have a trailing character that you don't want to be considered as part of the destination, you can separate it with a third `|`:
|
||||
|
||||
`
|
||||
This |gem|PreciousStone|, though green, was not an emerald.
|
||||
This gem|PreciousStone|, though green, was not an emerald.
|
||||
`
|
||||
|
||||
Which renders as:
|
||||
|
||||
This |gem|PreciousStone|, though green, was not an emerald.
|
||||
This gem|PreciousStone|, though green, was not an emerald.
|
||||
|
||||
### Node anchors
|
||||
|
||||
|
|
@ -169,14 +169,15 @@ Because en can resolve IDs case insensitively (with priority to case-sensitive m
|
|||
In summary, all of the anchors below are valid and lead to the same page:
|
||||
|
||||
`
|
||||
|en Syntax|https://en.jutty.dev/node/Syntax|
|
||||
|en Syntax|https://en.jutty.dev/node/Syntax
|
||||
Syntax|https://en.jutty.dev/node/Syntax
|
||||
|
||||
|en Syntax|/node/Syntax
|
||||
Syntax|/node/Syntax
|
||||
Syntax|/node/syntax
|
||||
|
||||
Syntax|Syntax
|
||||
syntax|syntax
|
||||
|syntax|Syntax
|
||||
Syntax|syntax
|
||||
Syntax|syntax|
|
||||
|
||||
|Syntax|
|
||||
|syntax|
|
||||
|
|
@ -312,23 +313,23 @@ We saw example `docs|/node/Documentation`, but shorter syntax exists.
|
|||
#### Epistēmē
|
||||
#### Epistēmē
|
||||
|
||||
|en Syntax|https://en.jutty.dev/node/Syntax|
|
||||
|en Syntax|https://en.jutty.dev/node/Syntax
|
||||
Syntax|https://en.jutty.dev/node/Syntax
|
||||
|
||||
|en Syntax|/node/Syntax
|
||||
Syntax|/node/Syntax
|
||||
Syntax|/node/syntax
|
||||
|
||||
Syntax|Syntax
|
||||
syntax|syntax
|
||||
|syntax|Syntax
|
||||
Syntax|syntax
|
||||
Syntax|syntax|
|
||||
|
||||
|Syntax|
|
||||
|syntax|
|
||||
"""
|
||||
|
||||
[meta.config]
|
||||
content_language = "en"
|
||||
footer_credits = false
|
||||
footer_text = """
|
||||
made by jutty|https://jutty.dev • acknowledgements|Acknowledgments • |source code|https://codeberg.org/jutty/en
|
||||
"""
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue