Add word-level parsing

This commit is contained in:
Juno Takano 2025-12-18 02:20:11 -03:00
commit 198bc12507
34 changed files with 743 additions and 446 deletions

View file

@ -0,0 +1,39 @@
/// A piece of raw source text that token types probe and lex.
#[derive(Clone)]
pub enum Lexeme {
    /// Raw text wrapped in a [`compound::Compound`].
    Compound(compound::Compound),
}
pub mod compound;
impl Lexeme {
    /// Borrows the raw text without copying it.
    ///
    /// Private helper so the read-only methods below do not have to clone
    /// the whole string just to look at it.
    fn as_raw(&self) -> &str {
        match self {
            Self::Compound(inner) => inner.raw.as_str(),
        }
    }

    /// Returns an owned copy of the lexeme's raw text.
    pub fn to_raw(&self) -> String {
        self.as_raw().to_owned()
    }

    /// Counts how many times `c` occurs in the raw text.
    ///
    /// # Panics
    /// Panics if the number of occurrences exceeds `i32::MAX`.
    pub fn count_char(&self, c: char) -> i32 {
        let count = self.as_raw().chars().filter(|&n| n == c).count();
        i32::try_from(count)
            .unwrap_or_else(|e| panic!("Wild char number {count} is a bit much: {e:#?}"))
    }

    /// Returns the raw text as a vector of its `char`s, in order.
    pub fn split_chars(&self) -> Vec<char> {
        self.as_raw().chars().collect()
    }

    /// Splits the raw text on single space characters.
    ///
    /// Consecutive spaces produce empty strings in the result, and text
    /// without any space yields a one-element vector.
    pub fn split_words(self) -> Vec<String> {
        self.as_raw().split(' ').map(str::to_owned).collect()
    }

    /// Returns the text that precedes the first space.
    ///
    /// Splitting always yields at least one piece, so this is `Some` even
    /// for empty text (in which case the piece is the empty string).
    pub fn first(self) -> Option<String> {
        // Take the first piece straight from the iterator instead of
        // materialising every word and cloning element zero.
        self.as_raw().split(' ').next().map(str::to_owned)
    }
}

View file

@ -0,0 +1,12 @@
/// Lexeme payload that stores raw text as an owned `String`.
#[derive(Clone)]
pub struct Compound {
    /// The raw text.
    pub raw: String,
}

impl Compound {
    /// Builds a `Compound` holding an owned copy of `text`.
    pub fn new(text: &str) -> Self {
        let raw = String::from(text);
        Self { raw }
    }
}

View file

@ -0,0 +1,75 @@
use crate::syntax::content::Parseable as _;
pub mod literal;
pub mod anchor;
pub mod linebreak;
pub mod paragraph;
pub mod span;
pub mod header;
pub mod preformat;
/// A classified piece of content. Each variant wraps the token type defined
/// in the submodule of the same name.
pub enum Token {
    Anchor(anchor::Anchor),
    Header(header::Header),
    LineBreak(linebreak::LineBreak),
    Literal(literal::Literal),
    Paragraph(paragraph::Paragraph),
    PreFormat(preformat::PreFormat),
    Span(span::Span),
}

impl Token {
    /// Renders the token by delegating to the wrapped value's `render`.
    pub fn render(&self) -> String {
        match self {
            Self::Anchor(inner) => inner.render(),
            Self::Header(inner) => inner.render(),
            Self::LineBreak(inner) => inner.render(),
            Self::Literal(inner) => inner.render(),
            Self::Paragraph(inner) => inner.render(),
            Self::PreFormat(inner) => inner.render(),
            Self::Span(inner) => inner.render(),
        }
    }
}
impl From<paragraph::Paragraph> for Token {
fn from(d: paragraph::Paragraph) -> Token {
Token::Paragraph(d)
}
}
impl From<header::Header> for Token {
fn from(d: header::Header) -> Token {
Token::Header(d)
}
}
impl From<span::Span> for Token {
fn from(d: span::Span) -> Token {
Token::Span(d)
}
}
impl From<literal::Literal> for Token {
fn from(d: literal::Literal) -> Token {
Token::Literal(d)
}
}
impl From<anchor::Anchor> for Token {
fn from(d: anchor::Anchor) -> Token {
Token::Anchor(d)
}
}
impl From<linebreak::LineBreak> for Token {
fn from(d: linebreak::LineBreak) -> Token {
Token::LineBreak(d)
}
}
impl From<preformat::PreFormat> for Token {
fn from(d: preformat::PreFormat) -> Token {
Token::PreFormat(d)
}
}

View file

@ -0,0 +1,68 @@
use std::fmt::Display;
use crate::syntax::content::{Parseable, parser::lexeme::Lexeme};
/// A link token: visible text plus the destination it points to.
pub struct Anchor {
    /// Link text, rendered between the `<a>` tags.
    text: String,
    /// Link target, rendered as the `href` attribute value.
    destination: String,
}

impl Parseable for Anchor {
    /// Reports whether `lexeme` has one of the three accepted pipe layouts:
    /// `text|destination`, `|text|destination` or `|text|destination|`.
    ///
    /// Empty lexemes, lexemes containing `||`, and every other pipe count or
    /// placement are rejected.
    fn probe(lexeme: &Lexeme) -> bool {
        // A single owned copy serves every check below.
        let raw = lexeme.to_raw();
        let mut chars = raw.chars();
        let first = match chars.next() {
            Some(c) => c,
            None => return false,
        };
        // A one-character lexeme starts and ends on the same character.
        let last = chars.next_back().unwrap_or(first);
        // Adjacent pipes would leave an empty segment between them.
        if raw.contains("||") {
            return false;
        }
        // The pipe count decides where pipes may sit; any count outside
        // 1..=3 falls through to `false`.
        match raw.matches('|').count() {
            1 => first != '|' && last != '|',
            2 => first == '|' && last != '|',
            3 => first == '|' && last == '|',
            _ => false,
        }
    }

    /// Builds an `Anchor` from the non-empty segments between pipes: the
    /// first becomes the text, the second the destination.
    ///
    /// # Panics
    /// Panics unless splitting on `|` leaves exactly two non-empty segments.
    /// Every lexeme accepted by `probe` satisfies this.
    fn lex(lexeme: &Lexeme) -> Anchor {
        let raw = lexeme.to_raw();
        // Borrow the segments; only the two that are kept get copied.
        let parts: Vec<&str> = raw.split('|').filter(|s| !s.is_empty()).collect();
        match parts.as_slice() {
            &[text, destination] => Anchor {
                text: text.to_owned(),
                destination: destination.to_owned(),
            },
            _ => panic!("Parts should always be 2: {parts:?}"),
        }
    }

    /// Renders the anchor as an HTML `<a>` element.
    fn render(&self) -> String {
        // NOTE(review): neither field is HTML-escaped here; a `"` inside the
        // destination would end the `href` attribute early. Confirm whether
        // escaping is meant to happen somewhere else.
        format!(r#"<a href="{}">{}</a>"#, self.destination, self.text)
    }
}
/// Human-readable description showing the anchor's text and destination.
impl Display for Anchor {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { text, destination } = self;
        write!(f, "Anchor: <{}> to <{}>", text, destination)
    }
}

View file

@ -0,0 +1,137 @@
use crate::{
prelude::*,
syntax::content::{Parseable, Lexeme},
};
use std::fmt::Display;
/// An opening or closing heading tag of a given [`Level`].
pub struct Header {
    /// `Some(true)` for an opening tag, `Some(false)` for a closing tag.
    /// Rendering panics while this is `None`.
    open: Option<bool>,
    /// Heading level, one through six.
    level: Level,
}

impl Header {
    /// Creates a header of the given `level` whose open state is `open`.
    pub fn new(level: Level, open: bool) -> Header {
        let open = Some(open);
        Header { open, level }
    }

    /// Same as [`Header::new`], but takes the level as a number, converted
    /// with `Level::from_u8`.
    pub fn from_u8(level: u8, open: bool) -> Header {
        Header::new(Level::from_u8(level), open)
    }

    /// Returns the heading level as a number from 1 to 6.
    pub fn get_level(&self) -> u8 {
        match &self.level {
            Level::One => 1,
            Level::Two => 2,
            Level::Three => 3,
            Level::Four => 4,
            Level::Five => 5,
            Level::Six => 6,
        }
    }
}
impl Parseable for Header {
fn probe(lexeme: &Lexeme) -> bool {
if lexeme
.split_chars()
.into_iter()
.filter(|e| *e != '#')
.count()
== 0
{
let level = lexeme.to_raw().len();
lexeme.clone().split_words().len() == 1 && level > 0 && level <= 6
} else {
false
}
}
fn lex(lexeme: &Lexeme) -> Header {
Header::new(lexeme.to_raw().len().into(), true)
}
fn render(&self) -> String {
if let Some(open) = self.open {
if open {
format!("<h{}>", &self.level)
} else {
format!("</h{}>", &self.level)
}
} else {
panic!("Attempt to render a header tag while open state is unknown")
}
}
}
/// Human-readable description of the header's level and open state.
impl Display for Header {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self.open {
            Some(true) => write!(f, "Level {} Open Header", self.level),
            Some(false) => write!(f, "Level {} Closed Header", self.level),
            None => write!(f, "Level {} Header (Unknown open state)", self.level),
        }
    }
}
/// Heading level, matching HTML `<h1>` through `<h6>`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Level {
    One,
    Two,
    Three,
    Four,
    Five,
    Six,
}

impl Level {
    /// Maps a number to a level, clamping out-of-range values: `0` becomes
    /// `One` and anything above `6` becomes `Six`.
    fn from_u8(u: u8) -> Level {
        match u {
            0 | 1 => Level::One,
            2 => Level::Two,
            3 => Level::Three,
            4 => Level::Four,
            5 => Level::Five,
            _ => Level::Six,
        }
    }
}
impl From<usize> for Level {
    /// Converts a count into a level via `Level::from_u8`.
    ///
    /// A value that does not fit in a `u8` is passed to `log!` and becomes
    /// level six. Values from 7 to 255 also end up as level six, but without
    /// going through `log!`.
    fn from(z: usize) -> Level {
        match u8::try_from(z) {
            Ok(narrowed) => Level::from_u8(narrowed),
            Err(e) => {
                log!("Truncating header level {z} to 6: {e:?}");
                Level::from_u8(6_u8)
            },
        }
    }
}
/// Writes the level as its decimal digit, `1` through `6`.
impl Display for Level {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let digit = match self {
            Level::One => "1",
            Level::Two => "2",
            Level::Three => "3",
            Level::Four => "4",
            Level::Five => "5",
            Level::Six => "6",
        };
        f.write_str(digit)
    }
}

View file

@ -0,0 +1,26 @@
use std::fmt::Display;
use crate::{
syntax::content::{Parseable, parser::lexeme::Lexeme},
};
/// Token for a line break; it carries no data.
pub struct LineBreak {}

impl Parseable for LineBreak {
    /// Accepts only a lexeme whose raw text is exactly one newline character.
    fn probe(lexeme: &Lexeme) -> bool {
        matches!(lexeme.to_raw().as_str(), "\n")
    }

    /// Builds a `LineBreak`; the lexeme itself is not inspected.
    fn lex(_lexeme: &Lexeme) -> LineBreak {
        Self {}
    }

    /// Renders as a single newline character.
    fn render(&self) -> String {
        String::from("\n")
    }
}
/// Human-readable name of the token.
impl Display for LineBreak {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("Line Break")
    }
}

View file

@ -0,0 +1,28 @@
use std::fmt::Display;
use crate::syntax::content::{Parseable, parser::lexeme::Lexeme};
/// Token holding text that is rendered exactly as it was read.
pub struct Literal {
    /// The raw text of the lexeme this literal was built from.
    text: String,
}

impl Parseable for Literal {
    /// Accepts every lexeme.
    fn probe(_lexeme: &Lexeme) -> bool {
        true
    }

    /// Builds a `Literal` holding the lexeme's raw text.
    fn lex(lexeme: &Lexeme) -> Literal {
        let text = lexeme.to_raw();
        Literal { text }
    }

    /// Returns a copy of the stored text, unchanged.
    fn render(&self) -> String {
        // NOTE(review): the text is emitted without HTML escaping; confirm
        // that this is intended for the output format.
        self.text.as_str().to_owned()
    }
}
/// Human-readable description showing the literal's text.
impl Display for Literal {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { text } = self;
        write!(f, "Literal: <{}>", text)
    }
}

View file

@ -0,0 +1,53 @@
use std::fmt::Display;
use crate::syntax::content::{Parseable, parser::lexeme::Lexeme};
/// An opening or closing paragraph tag.
pub struct Paragraph {
    /// `Some(true)` for an opening tag, `Some(false)` for a closing tag,
    /// `None` while the state has not been decided.
    open: Option<bool>,
}

impl Paragraph {
    /// Creates a paragraph tag whose open state is `open`.
    pub fn new(open: bool) -> Paragraph {
        let open = Some(open);
        Paragraph { open }
    }
}
impl Parseable for Paragraph {
    /// Accepts any lexeme that contains at least one non-whitespace character.
    fn probe(lexeme: &Lexeme) -> bool {
        // Any non-whitespace lexeme qualifies; the parser knows the context.
        // `trim` already strips newlines, so no separate "\n" comparison is
        // needed after it.
        !lexeme.to_raw().trim().is_empty()
    }

    /// Builds a paragraph tag whose open state is still unknown (`None`).
    fn lex(_lexeme: &Lexeme) -> Paragraph {
        Paragraph { open: None }
    }

    /// Renders `<p>` for an opening tag and `</p>` for a closing one.
    ///
    /// # Panics
    /// Panics if the open state is `None`.
    fn render(&self) -> String {
        match self.open {
            Some(true) => "<p>".to_owned(),
            Some(false) => "</p>".to_owned(),
            None => panic!(
                "Attempt to render a paragraph tag while open state is unknown"
            ),
        }
    }
}
/// Human-readable description of the paragraph tag's open state.
impl Display for Paragraph {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self.open {
            Some(true) => write!(f, "Open Paragraph"),
            Some(false) => write!(f, "Closed Paragraph"),
            // Spelling corrected from "Unitialized".
            None => write!(f, "Uninitialized Paragraph (Unknown open state)"),
        }
    }
}

View file

@ -0,0 +1,43 @@
use crate::{
syntax::content::{Parseable, Lexeme},
};
/// An opening or closing preformatted-text tag.
pub struct PreFormat {
    /// `Some(true)` for an opening tag, `Some(false)` for a closing tag,
    /// `None` while the state has not been decided.
    open: Option<bool>,
}

impl PreFormat {
    /// Creates a preformat tag whose open state is `open`.
    pub fn new(open: bool) -> PreFormat {
        PreFormat { open: Some(open) }
    }
}

/// Human-readable description of the tag's open state, provided so that
/// `PreFormat` can be printed like the other token types in this crate.
impl std::fmt::Display for PreFormat {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self.open {
            Some(true) => write!(f, "Open PreFormat"),
            Some(false) => write!(f, "Closed PreFormat"),
            None => write!(f, "PreFormat (Unknown open state)"),
        }
    }
}
impl Parseable for PreFormat {
    /// Accepts any lexeme whose raw text begins with a backtick.
    fn probe(lexeme: &Lexeme) -> bool {
        // Only the first character matters, so there is no need to collect
        // every character into a vector first.
        lexeme.to_raw().starts_with('`')
    }

    /// Builds a preformat tag whose open state is still unknown (`None`).
    fn lex(_lexeme: &Lexeme) -> PreFormat {
        PreFormat { open: None }
    }

    /// Renders `<pre>` for an opening tag and `</pre>` for a closing one.
    ///
    /// # Panics
    /// Panics if the open state is `None`.
    fn render(&self) -> String {
        match self.open {
            Some(true) => "<pre>".to_owned(),
            Some(false) => "</pre>".to_owned(),
            None => panic!(
                "Attempt to render a preformat tag while open state is unknown"
            ),
        }
    }
}

View file

@ -0,0 +1,49 @@
use std::fmt::Display;
use crate::syntax::content::{Parseable, parser::lexeme::Lexeme};
/// An opening or closing span tag.
pub struct Span {
    /// `Some(true)` for an opening tag, `Some(false)` for a closing tag,
    /// `None` while the state has not been decided.
    open: Option<bool>,
}

impl Span {
    /// Creates a span tag whose open state is `open`.
    pub fn new(open: bool) -> Span {
        let open = Some(open);
        Span { open }
    }
}
impl Parseable for Span {
    /// Always rejects: no lexeme maps to a span.
    fn probe(_lexeme: &Lexeme) -> bool {
        // there is no lexeme for span
        false
    }

    /// Builds a span tag whose open state is still unknown (`None`).
    fn lex(_lexeme: &Lexeme) -> Span {
        Span { open: None }
    }

    /// Renders `<span>` for an opening tag and `</span>` for a closing one.
    ///
    /// # Panics
    /// Panics if the open state is `None`.
    fn render(&self) -> String {
        match self.open {
            Some(true) => "<span>".to_owned(),
            Some(false) => "</span>".to_owned(),
            None => panic!("Attempt to render a span tag while open state is unknown"),
        }
    }
}
/// Human-readable description of the span tag's open state.
impl Display for Span {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self.open {
            Some(true) => write!(f, "Open Span"),
            Some(false) => write!(f, "Closed Span"),
            None => write!(f, "Span (Unknown open state)"),
        }
    }
}