From 79633bd059175e3a20c9456c9d5254dd56b7da04 Mon Sep 17 00:00:00 2001 From: Hayden Hargreaves Date: Thu, 27 Nov 2025 11:37:06 -0700 Subject: [PATCH] (FEAT): parse_italic implemented. But now I have realized that the parser should work differently and with mutual recursion. The "inline" nodes (except the text node) should all have children. --- lib/node.rs | 2 + lib/parser.rs | 328 ++++++++++++++++++++++++++++---------------------- src/main.rs | 7 +- test.md | 8 +- 4 files changed, 197 insertions(+), 148 deletions(-) diff --git a/lib/node.rs b/lib/node.rs index bc989e0..8300d75 100644 --- a/lib/node.rs +++ b/lib/node.rs @@ -12,6 +12,8 @@ pub enum Node { // Inline Nodes Text { content: String }, + + // TODO: THESE SHOULD BE STRUCTURE NODES, SO THEY CAN BE CALLED RECURSIVELY Bold { content: String }, Italic { content: String }, BoldItalic { content: String }, diff --git a/lib/parser.rs b/lib/parser.rs index ed93049..0df320e 100644 --- a/lib/parser.rs +++ b/lib/parser.rs @@ -1,32 +1,26 @@ -use std::iter::Peekable; -use std::str::Chars; - use crate::node::Node; #[derive(Debug)] -pub struct Parser { - content: String, +pub struct Parser<'a> { + content: &'a str, + position: usize, } -impl Parser { - /// Create a new parser object with the content attached. This does not take ownership of the - /// string provided and therefore dies with the string. The input string is normalized to - /// support operation on all operating systems. - pub fn new(content: &str) -> Self { - let normalized = content.replace("\r\n", "\n").replace("\r", ""); +impl<'a> Parser<'a> { + // Content should be normalized before being passed into this function. Since we do not take + // ownership here, we cannot mutate it. + pub fn new(content: &'a str) -> Self { Self { - content: normalized, + content, + position: 0, } } - pub fn parse_document(&self) -> Node { - let mut stream = self.content.chars().peekable(); - + pub fn parse_document(&mut self) -> Node { let mut root = Node::Document { children: vec![] }; - // Same as !IsEOF from the CPP implementation - while stream.peek().is_some() { - let block = self.parse_block(&mut stream); + while !self.is_eof() { + let block = self.parse_block(); if !block.is_empty() { root.add_child(block); } @@ -35,178 +29,222 @@ impl Parser { root } - /// BUG: USING CLONES IS FUCKED AS HELL, STOP THIS SHIT - fn parse_block(&self, stream: &mut Peekable>) -> Node { - self.consume_whitespace(stream); + // BUG: We should be using optional, not blank nodes + fn parse_block(&mut self) -> Node { + self.consume_whitespace(); - // If we are at the end, return an empty node; it will be ignored - let Some(&c1) = stream.peek() else { + // If we are at the end, return an empty node, the caller should know to return it. + if self.is_eof() { return Node::Text { content: "".into() }; - }; + } - // Use a clone to look ahead - let mut clone = stream.clone(); - clone.next(); + let c1 = self.peek(); + let c2 = self.peek_nth(1); + let c3 = self.peek_nth(2); - let c2 = clone.next(); - let c3 = clone.next(); - - // Now we can handle numbers from 0 to 99 for ordered lists + // TODO: For now we are just implementing paragraphs. So we can start with inline parsing match (c1, c2, c3) { - ('#', _, _) => self.parse_heading(stream), - ('*' | '-' | '+', Some(' ' | '\t'), _) => self.parse_list(stream, false), - (d, Some('.'), _) if d.is_ascii_digit() => self.parse_list(stream, true), - (d1, Some(d2), Some('.')) if d1.is_ascii_digit() && d2.is_ascii_digit() => { - self.parse_list(stream, true) - } - ('`', Some('`'), Some('`')) => self.parse_code_block(stream), - ('!', Some('['), _) => self.parse_image(stream), - ('>', _, _) => self.parse_block_quote(stream), - _ => self.parse_paragraph(stream), + // (Some('#'), _, _) => self.parse_heading(), + _ => self.parse_paragraph(), } } - // --- STRUCTURE PARSING --- - fn parse_paragraph(&self, stream: &mut Peekable>) -> Node { - let children = self.parse_inline(stream); - Node::Paragraph { children } - } - - fn parse_heading(&self, stream: &mut Peekable>) -> Node { - // Consume the hashes to determine the size, then consume the whitespace - let hashes = self.consume_until_char(stream, ' '); - self.consume_whitespace(stream); - - let children = self.parse_inline(stream); - dbg!(&children); + fn parse_heading(&mut self) -> Node { Node::Heading { - level: hashes.len(), - children, + level: 1, + children: vec![], } } - fn parse_list(&self, stream: &mut Peekable>, ordered: bool) -> Node { - Node::Text { content: "".into() } - } - - fn parse_block_quote(&self, stream: &mut Peekable>) -> Node { - Node::Text { content: "".into() } - } - - fn parse_code_block(&self, stream: &mut Peekable>) -> Node { - Node::Text { content: "".into() } - } - - fn parse_image(&self, stream: &mut Peekable>) -> Node { - Node::Text { content: "".into() } + fn parse_paragraph(&mut self) -> Node { + Node::Paragraph { + children: self.parse_inline(), + } } // --- INLINE PARSING --- - fn parse_inline(&self, stream: &mut Peekable>) -> Vec { + fn parse_inline(&mut self) -> Vec { let mut nodes = vec![]; - let mut str = String::new(); + let mut str = "".to_string(); - // use a clone to allow for peeking ahead - // REMEMBER TO ALSO CONSUME ANYTIME MAIN STREAM IS CONSUMED - let mut clone = stream.clone(); - clone.next(); // Stay one ahead - - while let Some(&c1) = stream.peek() { - let c2 = clone.next(); - let c3 = clone.next(); - - // println!("({}, {}, {})", c1, c2.unwrap_or('~'), c3.unwrap_or('~')); + while !self.is_eof() { + // c1 stores current char, c2/c3 store future, contextual chars + let c1 = self.peek(); + let c2 = self.peek_nth(1); + let c3 = self.peek_nth(2); + // TODO: Need to redesign the nodes match (c1, c2, c3) { - ('\n', _, _) => break, - ('!', Some('['), _) => { /* Image */ } - ('[', _, _) => { /* Link */ } - ('*', Some('*'), Some('*')) => { /* Bold Italic */ } - ('*', Some('*'), _) => { - nodes.push(Node::Text { content: str }); - str = "".into(); - let node = self.parse_bold(stream); + (None, _, _) | (Some('\n'), Some('\n'), _) => break, + (Some('!'), Some('['), _) => + /* parse image */ + { + continue; + } + (Some('['), _, _) => + /* parse link */ + { + continue; + } + (Some('*'), Some('*'), Some('*')) => + /* parse bold italic */ + { + continue; + } + (Some('*'), Some('*'), _) => + /* parse bold */ + { + continue; + } + (Some('*'), _, _) => { + nodes.push(Node::Text { + content: str.clone(), + }); + str = "".to_string(); + let node = self.parse_italic(); if !node.is_empty() { - nodes.push(node) + nodes.push(node); } continue; } - ('*', _, _) => { /* Italic */ } - ('`', _, _) => { /* Code */ } + (Some('`'), _, _) => + /* parse code */ + { + continue; + } - _ => { - // Should we swap '\n' with ' ' - str.push(c1); - stream.next(); - clone.next(); + (Some(c), _, _) => { + str.push(c); + self.consume(); } } } - // Push final node - if !str.is_empty() { - nodes.push(Node::Text { content: str }); - } + // TODO: Push text node + nodes.push(Node::Text { content: str }); nodes } - /// BUG: THIS FUNCTION SHOULD PARSE UNTIL IT FINDS EITHER THE ** OR AN ENDING OF A BLOCK. FOR - /// EXAMPLE: \n\n IS A NEW BLOCK AND THEN IT SHOULD END. BUT IF IT ENDS ON A NEW BLOCK, IT - /// SHOULD RETURN A TEXT NODE, WITH THE ** PREPENDED, SIGNIFYING FAILURE TO COMPLETE THE ENTIRE - /// STRONG BLOCK. - fn parse_bold(&self, stream: &mut Peekable>) -> Node { - let mut str = String::new(); + fn parse_italic(&mut self) -> Node { + let mut str = "".to_string(); + self.consume(); // Consume the '*' - stream.next(); - stream.next(); + println!("'{}'", self.content); - let mut clone = stream.clone(); - clone.next(); - - while let Some(&c1) = stream.peek() { - let c2 = clone.peek(); + // Use loop instead of 'while !self.is_eof()' so we can make it to the (None, _) case to + // exit + loop { + let c1 = self.peek(); + let c2 = self.peek_nth(1); match (c1, c2) { - ('\n', Some('\n')) => break, - ('*', Some('*')) => { - stream.next(); - stream.next(); + (None, _) | (Some('\n'), None) | (Some('\n'), Some('\n')) => { + // In this case, we did not find an ending star, so we should return a normal + // node. But we have to add the star back since we consumed it already + str.insert(0, '*'); + return Node::Text { content: str }; + } + (Some('*'), _) => { + self.consume(); break; } - _ => str.push(c1), + (Some(c), _) => { + str.push(c); + self.consume(); + } } - stream.next(); - clone.next(); } - println!("@str '{}'", str); - - Node::Bold { content: str } + Node::Italic { content: str } } - // --- HELPER FUNCTIONS --- - fn consume_whitespace(&self, stream: &mut Peekable>) { - while let Some(&c) = stream.peek() { + // --- HELPERS --- + fn is_eof(&self) -> bool { + self.position >= self.content.len() + } + + fn peek(&self) -> Option { + self.peek_nth(0) + } + + fn peek_nth(&self, n: usize) -> Option { + self.content[self.position..].chars().nth(n) + } + + fn consume(&mut self) { + self.consume_n(1) + } + + fn consume_n(&mut self, n: usize) { + for _ in 0..n { + if let Some(c) = self.content[self.position..].chars().next() { + self.position += c.len_utf8(); + } else { + break; + } + } + } + + fn consume_whitespace(&mut self) { + while let Some(c) = self.peek() { if !c.is_whitespace() { break; } - stream.next(); + self.consume(); + } + } +} + +#[cfg(test)] +mod parser_tests { + use super::Parser; + + #[test] + fn test_parse_italic() { + // This test only tests the `parse_italic` method, so it is expected that the first + // character is a '*', otherwise the first character will be consumed. The `parse_inline` + // tests will be able to handle more specific cases. + { + let s = "*hello world*"; + let html = "hello world"; + let mut p = Parser::new(s); + let node = p.parse_italic(); + assert_eq!(node.to_html(), html); + } + { + let s = "*hello* world"; + let html = "hello"; + let mut p = Parser::new(s); + let node = p.parse_italic(); + assert_eq!(node.to_html(), html); + } + { + let s = "*hello world"; + let html = "*hello world"; + let mut p = Parser::new(s); + let node = p.parse_italic(); + assert_eq!(node.to_html(), html); + } + { + let s = "*hello world\n"; + let html = "*hello world"; + let mut p = Parser::new(s); + let node = p.parse_italic(); + assert_eq!(node.to_html(), html); + } + { + let s = "*hello world\n\n"; + let html = "*hello world"; + let mut p = Parser::new(s); + let node = p.parse_italic(); + assert_eq!(node.to_html(), html); + } + { + let s = "*hello\n\nworld*"; + let html = "*hello"; + let mut p = Parser::new(s); + let node = p.parse_italic(); + assert_eq!(node.to_html(), html); } } - - /// THIS DOES NOT CONSUME THE TARGET, IT STOPS RIGHT BEFORE IT AND RETURNS THE STRING UNTIL - /// ITSELF - fn consume_until_char(&self, stream: &mut Peekable>, target: char) -> String { - let mut out = String::new(); - - while let Some(&c) = stream.peek() { - if c == target { - break; - } - stream.next(); - out.push(c); - } - - out - } } diff --git a/src/main.rs b/src/main.rs index c63ad3b..7fb3bcb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,13 +3,16 @@ use transpiler::parser::Parser; pub fn main() -> Result<(), Box> { let file = Filesystem::read_file("./test.md"); - let content; + let mut content; match file { Ok(s) => content = s, Err(err) => panic!("Failed to read file. {}", err), } - let parser = Parser::new(&content); + // Normalize char stream + content = content.replace("\r\n", "\n").replace("\r", ""); + + let mut parser = Parser::new(&content); let node = parser.parse_document(); match Filesystem::write_file("./output.html", &node.to_html()) { diff --git a/test.md b/test.md index 06032f7..d11d815 100644 --- a/test.md +++ b/test.md @@ -1 +1,7 @@ -# h1, **this** is sick as fuck +hello *world*. This is pre*tty* cool + + +What about this + + +This *should have a star