(FEAT): parse_italic implemented.

However, I have since realized that the parser should be structured
differently, using mutual recursion: every "inline" node (except the text
node) should hold child nodes.
This commit is contained in:
Hayden Hargreaves 2025-11-27 11:37:06 -07:00
parent 3c25e6b9e8
commit 79633bd059
4 changed files with 197 additions and 148 deletions

View File

@ -12,6 +12,8 @@ pub enum Node {
// Inline Nodes // Inline Nodes
Text { content: String }, Text { content: String },
// TODO: THESE SHOULD BE STRUCTURE NODES, SO THEY CAN BE CALLED RECURSIVELY
Bold { content: String }, Bold { content: String },
Italic { content: String }, Italic { content: String },
BoldItalic { content: String }, BoldItalic { content: String },

View File

@ -1,32 +1,26 @@
use std::iter::Peekable;
use std::str::Chars;
use crate::node::Node; use crate::node::Node;
#[derive(Debug)] #[derive(Debug)]
pub struct Parser { pub struct Parser<'a> {
content: String, content: &'a str,
position: usize,
} }
impl Parser { impl<'a> Parser<'a> {
/// Create a new parser object with the content attached. This does not take ownership of the // Content should be normalized before being passed into this function. Since we do not take
/// string provided and therefore dies with the string. The input string is normalized to // ownership here, we cannot mutate it.
/// support operation on all operating systems. pub fn new(content: &'a str) -> Self {
pub fn new(content: &str) -> Self {
let normalized = content.replace("\r\n", "\n").replace("\r", "");
Self { Self {
content: normalized, content,
position: 0,
} }
} }
pub fn parse_document(&self) -> Node { pub fn parse_document(&mut self) -> Node {
let mut stream = self.content.chars().peekable();
let mut root = Node::Document { children: vec![] }; let mut root = Node::Document { children: vec![] };
// Same as !IsEOF from the CPP implementation while !self.is_eof() {
while stream.peek().is_some() { let block = self.parse_block();
let block = self.parse_block(&mut stream);
if !block.is_empty() { if !block.is_empty() {
root.add_child(block); root.add_child(block);
} }
@ -35,178 +29,222 @@ impl Parser {
root root
} }
/// BUG: USING CLONES IS FUCKED AS HELL, STOP THIS SHIT // BUG: We should be using optional, not blank nodes
fn parse_block(&self, stream: &mut Peekable<Chars<'_>>) -> Node { fn parse_block(&mut self) -> Node {
self.consume_whitespace(stream); self.consume_whitespace();
// If we are at the end, return an empty node; it will be ignored // If we are at the end, return an empty node, the caller should know to return it.
let Some(&c1) = stream.peek() else { if self.is_eof() {
return Node::Text { content: "".into() }; return Node::Text { content: "".into() };
}; }
// Use a clone to look ahead let c1 = self.peek();
let mut clone = stream.clone(); let c2 = self.peek_nth(1);
clone.next(); let c3 = self.peek_nth(2);
let c2 = clone.next(); // TODO: For now we are just implementing paragraphs. So we can start with inline parsing
let c3 = clone.next();
// Now we can handle numbers from 0 to 99 for ordered lists
match (c1, c2, c3) { match (c1, c2, c3) {
('#', _, _) => self.parse_heading(stream), // (Some('#'), _, _) => self.parse_heading(),
('*' | '-' | '+', Some(' ' | '\t'), _) => self.parse_list(stream, false), _ => self.parse_paragraph(),
(d, Some('.'), _) if d.is_ascii_digit() => self.parse_list(stream, true),
(d1, Some(d2), Some('.')) if d1.is_ascii_digit() && d2.is_ascii_digit() => {
self.parse_list(stream, true)
}
('`', Some('`'), Some('`')) => self.parse_code_block(stream),
('!', Some('['), _) => self.parse_image(stream),
('>', _, _) => self.parse_block_quote(stream),
_ => self.parse_paragraph(stream),
} }
} }
// --- STRUCTURE PARSING --- fn parse_heading(&mut self) -> Node {
/// Parses a paragraph block from `stream`.
///
/// Whatever `parse_inline` produces becomes the paragraph's children.
fn parse_paragraph(&self, stream: &mut Peekable<Chars<'_>>) -> Node {
    Node::Paragraph {
        children: self.parse_inline(stream),
    }
}
fn parse_heading(&self, stream: &mut Peekable<Chars<'_>>) -> Node {
// Consume the hashes to determine the size, then consume the whitespace
let hashes = self.consume_until_char(stream, ' ');
self.consume_whitespace(stream);
let children = self.parse_inline(stream);
dbg!(&children);
Node::Heading { Node::Heading {
level: hashes.len(), level: 1,
children, children: vec![],
} }
} }
fn parse_list(&self, stream: &mut Peekable<Chars<'_>>, ordered: bool) -> Node { fn parse_paragraph(&mut self) -> Node {
Node::Text { content: "".into() } Node::Paragraph {
children: self.parse_inline(),
} }
/// Placeholder for block-quote parsing (`parse_block` dispatches here when a
/// block starts with `>`).
///
/// Not implemented: returns a text node with empty content and neither reads
/// nor advances `stream`.
fn parse_block_quote(&self, stream: &mut Peekable<Chars<'_>>) -> Node {
Node::Text { content: "".into() }
}
/// Placeholder for fenced code-block parsing (`parse_block` dispatches here
/// when a block starts with three backticks).
///
/// Not implemented: returns a text node with empty content and neither reads
/// nor advances `stream`.
fn parse_code_block(&self, stream: &mut Peekable<Chars<'_>>) -> Node {
Node::Text { content: "".into() }
}
fn parse_image(&self, stream: &mut Peekable<Chars<'_>>) -> Node {
Node::Text { content: "".into() }
} }
// --- INLINE PARSING --- // --- INLINE PARSING ---
fn parse_inline(&self, stream: &mut Peekable<Chars<'_>>) -> Vec<Node> { fn parse_inline(&mut self) -> Vec<Node> {
let mut nodes = vec![]; let mut nodes = vec![];
let mut str = String::new(); let mut str = "".to_string();
// use a clone to allow for peeking ahead while !self.is_eof() {
// REMEMBER TO ALSO CONSUME ANYTIME MAIN STREAM IS CONSUMED // c1 stores current char, c2/c3 store future, contextual chars
let mut clone = stream.clone(); let c1 = self.peek();
clone.next(); // Stay one ahead let c2 = self.peek_nth(1);
let c3 = self.peek_nth(2);
while let Some(&c1) = stream.peek() {
let c2 = clone.next();
let c3 = clone.next();
// println!("({}, {}, {})", c1, c2.unwrap_or('~'), c3.unwrap_or('~'));
// TODO: Need to redesign the nodes
match (c1, c2, c3) { match (c1, c2, c3) {
('\n', _, _) => break, (None, _, _) | (Some('\n'), Some('\n'), _) => break,
('!', Some('['), _) => { /* Image */ } (Some('!'), Some('['), _) =>
('[', _, _) => { /* Link */ } /* parse image */
('*', Some('*'), Some('*')) => { /* Bold Italic */ } {
('*', Some('*'), _) => { continue;
nodes.push(Node::Text { content: str }); }
str = "".into(); (Some('['), _, _) =>
let node = self.parse_bold(stream); /* parse link */
{
continue;
}
(Some('*'), Some('*'), Some('*')) =>
/* parse bold italic */
{
continue;
}
(Some('*'), Some('*'), _) =>
/* parse bold */
{
continue;
}
(Some('*'), _, _) => {
nodes.push(Node::Text {
content: str.clone(),
});
str = "".to_string();
let node = self.parse_italic();
if !node.is_empty() { if !node.is_empty() {
nodes.push(node) nodes.push(node);
} }
continue; continue;
} }
('*', _, _) => { /* Italic */ } (Some('`'), _, _) =>
('`', _, _) => { /* Code */ } /* parse code */
{
continue;
}
_ => { (Some(c), _, _) => {
// Should we swap '\n' with ' ' str.push(c);
str.push(c1); self.consume();
stream.next();
clone.next();
} }
} }
} }
// Push final node // TODO: Push text node
if !str.is_empty() {
nodes.push(Node::Text { content: str }); nodes.push(Node::Text { content: str });
}
nodes nodes
} }
/// BUG: THIS FUNCTION SHOULD PARSE UNTIL IT FINDS EITHER THE ** OR AN ENDING OF A BLOCK. FOR fn parse_italic(&mut self) -> Node {
/// EXAMPLE: \n\n IS A NEW BLOCK AND THEN IT SHOULD END. BUT IF IT ENDS ON A NEW BLOCK, IT let mut str = "".to_string();
/// SHOULD RETURN A TEXT NODE, WITH THE ** PREPENDED, SIGNIFYING FAILURE TO COMPLETE THE ENTIRE self.consume(); // Consume the '*'
/// STRONG BLOCK.
fn parse_bold(&self, stream: &mut Peekable<Chars<'_>>) -> Node {
let mut str = String::new();
stream.next(); println!("'{}'", self.content);
stream.next();
let mut clone = stream.clone(); // Use loop instead of 'while !self.is_eof()' so we can make it to the (None, _) case to
clone.next(); // exit
loop {
while let Some(&c1) = stream.peek() { let c1 = self.peek();
let c2 = clone.peek(); let c2 = self.peek_nth(1);
match (c1, c2) { match (c1, c2) {
('\n', Some('\n')) => break, (None, _) | (Some('\n'), None) | (Some('\n'), Some('\n')) => {
('*', Some('*')) => { // In this case, we did not find an ending star, so we should return a normal
stream.next(); // node. But we have to add the star back since we consumed it already
stream.next(); str.insert(0, '*');
return Node::Text { content: str };
}
(Some('*'), _) => {
self.consume();
break; break;
} }
_ => str.push(c1), (Some(c), _) => {
str.push(c);
self.consume();
}
} }
stream.next();
clone.next();
} }
println!("@str '{}'", str); Node::Italic { content: str }
Node::Bold { content: str }
} }
// --- HELPER FUNCTIONS --- // --- HELPERS ---
fn consume_whitespace(&self, stream: &mut Peekable<Chars<'_>>) { fn is_eof(&self) -> bool {
while let Some(&c) = stream.peek() { self.position >= self.content.len()
}
/// Returns the character at the current position without advancing, or
/// `None` when no characters remain.
fn peek(&self) -> Option<char> {
    let remaining = &self.content[self.position..];
    remaining.chars().next()
}
/// Looks ahead `n` characters from the current position without advancing.
///
/// `n == 0` is the current character. Returns `None` when fewer than
/// `n + 1` characters remain. `position` is a byte offset into `content`,
/// so the lookahead walks the remaining slice one character at a time.
fn peek_nth(&self, n: usize) -> Option<char> {
    let remaining = &self.content[self.position..];
    let mut upcoming = remaining.chars();
    upcoming.nth(n)
}
/// Advances past the current character, if there is one.
///
/// The position moves by the character's UTF-8 byte length; when no
/// characters remain this is a no-op.
fn consume(&mut self) {
    let current = self.content[self.position..].chars().next();
    if let Some(ch) = current {
        self.position += ch.len_utf8();
    }
}
/// Advances past up to `n` characters.
///
/// The position moves by the combined UTF-8 byte length of the characters
/// skipped. If fewer than `n` characters remain, it stops at the end of
/// the content.
fn consume_n(&mut self, n: usize) {
    let skipped_bytes: usize = self.content[self.position..]
        .chars()
        .take(n)
        .map(char::len_utf8)
        .sum();
    self.position += skipped_bytes;
}
fn consume_whitespace(&mut self) {
while let Some(c) = self.peek() {
if !c.is_whitespace() { if !c.is_whitespace() {
break; break;
} }
stream.next(); self.consume();
}
} }
} }
/// THIS DOES NOT CONSUME THE TARGET, IT STOPS RIGHT BEFORE IT AND RETURNS THE STRING UNTIL #[cfg(test)]
/// ITSELF mod parser_tests {
fn consume_until_char(&self, stream: &mut Peekable<Chars<'_>>, target: char) -> String { use super::Parser;
let mut out = String::new();
while let Some(&c) = stream.peek() { #[test]
if c == target { fn test_parse_italic() {
break; // This test only tests the `parse_italic` method, so it is expected that the first
// character is a '*', otherwise the first character will be consumed. The `parse_inline`
// tests will be able to handle more specific cases.
{
let s = "*hello world*";
let html = "<em>hello world</em>";
let mut p = Parser::new(s);
let node = p.parse_italic();
assert_eq!(node.to_html(), html);
} }
stream.next(); {
out.push(c); let s = "*hello* world";
let html = "<em>hello</em>";
let mut p = Parser::new(s);
let node = p.parse_italic();
assert_eq!(node.to_html(), html);
}
{
let s = "*hello world";
let html = "*hello world";
let mut p = Parser::new(s);
let node = p.parse_italic();
assert_eq!(node.to_html(), html);
}
{
let s = "*hello world\n";
let html = "*hello world";
let mut p = Parser::new(s);
let node = p.parse_italic();
assert_eq!(node.to_html(), html);
}
{
let s = "*hello world\n\n";
let html = "*hello world";
let mut p = Parser::new(s);
let node = p.parse_italic();
assert_eq!(node.to_html(), html);
}
{
let s = "*hello\n\nworld*";
let html = "*hello";
let mut p = Parser::new(s);
let node = p.parse_italic();
assert_eq!(node.to_html(), html);
} }
out
} }
} }

View File

@ -3,13 +3,16 @@ use transpiler::parser::Parser;
pub fn main() -> Result<(), Box<dyn std::error::Error>> { pub fn main() -> Result<(), Box<dyn std::error::Error>> {
let file = Filesystem::read_file("./test.md"); let file = Filesystem::read_file("./test.md");
let content; let mut content;
match file { match file {
Ok(s) => content = s, Ok(s) => content = s,
Err(err) => panic!("Failed to read file. {}", err), Err(err) => panic!("Failed to read file. {}", err),
} }
let parser = Parser::new(&content); // Normalize char stream
content = content.replace("\r\n", "\n").replace("\r", "");
let mut parser = Parser::new(&content);
let node = parser.parse_document(); let node = parser.parse_document();
match Filesystem::write_file("./output.html", &node.to_html()) { match Filesystem::write_file("./output.html", &node.to_html()) {

View File

@ -1 +1,7 @@
# h1, **this** is sick as fuck hello *world*. This is pre*tty* cool
What about this
This *should have a star