(FEAT): parse_italic implemented.

However, I have since realized that the parser should be structured differently, using mutual recursion: every "inline" node except the text node should hold child nodes.
2025-11-27 11:37:06 -07:00 · 2025-11-27 11:37:06 -07:00 · 79633bd059
commit 79633bd059
parent 3c25e6b9e8
4 changed files with 197 additions and 148 deletions
--- a/lib/node.rs
+++ b/lib/node.rs
@ -12,6 +12,8 @@ pub enum Node {

    // Inline Nodes
    Text { content: String },
+
+    // TODO: THESE SHOULD BE STRUCTURE NODES, SO THEY CAN BE CALLED RECURSIVELY
    Bold { content: String },
    Italic { content: String },
    BoldItalic { content: String },
--- a/lib/parser.rs
+++ b/lib/parser.rs
@ -1,32 +1,26 @@
-use std::iter::Peekable;
-use std::str::Chars;
-
 use crate::node::Node;

 #[derive(Debug)]
-pub struct Parser {
-    content: String,
+pub struct Parser<'a> {
+    content: &'a str,
+    position: usize,
 }

-impl Parser {
-    /// Create a new parser object with the content attached. This does not take ownership of the
-    /// string provided and therefore dies with the string. The input string is normalized to
-    /// support operation on all operating systems.
-    pub fn new(content: &str) -> Self {
-        let normalized = content.replace("\r\n", "\n").replace("\r", "");
+impl<'a> Parser<'a> {
+    // Content should be normalized before being passed into this function. Since we do not take
+    // ownership here, we cannot mutate it.
+    pub fn new(content: &'a str) -> Self {
        Self {
-            content: normalized,
+            content,
+            position: 0,
        }
    }

-    pub fn parse_document(&self) -> Node {
-        let mut stream = self.content.chars().peekable();
-
+    pub fn parse_document(&mut self) -> Node {
        let mut root = Node::Document { children: vec![] };

-        // Same as !IsEOF from the CPP implementation
-        while stream.peek().is_some() {
-            let block = self.parse_block(&mut stream);
+        while !self.is_eof() {
+            let block = self.parse_block();
            if !block.is_empty() {
                root.add_child(block);
            }
@ -35,178 +29,222 @@ impl Parser {
        root
    }

-    /// BUG: USING CLONES IS FUCKED AS HELL, STOP THIS SHIT
-    fn parse_block(&self, stream: &mut Peekable<Chars<'_>>) -> Node {
-        self.consume_whitespace(stream);
+    // BUG: We should be using optional, not blank nodes
+    fn parse_block(&mut self) -> Node {
+        self.consume_whitespace();

-        // If we are at the end, return an empty node; it will be ignored
-        let Some(&c1) = stream.peek() else {
+        // If we are at the end, return an empty node; the caller is expected to ignore it.
+        if self.is_eof() {
            return Node::Text { content: "".into() };
-        };
+        }

-        // Use a clone to look ahead
-        let mut clone = stream.clone();
-        clone.next();
+        let c1 = self.peek();
+        let c2 = self.peek_nth(1);
+        let c3 = self.peek_nth(2);

-        let c2 = clone.next();
-        let c3 = clone.next();
-
-        // Now we can handle numbers from 0 to 99 for ordered lists
+        // TODO: For now we are just implementing paragraphs. So we can start with inline parsing
        match (c1, c2, c3) {
-            ('#', _, _) => self.parse_heading(stream),
-            ('*' | '-' | '+', Some(' ' | '\t'), _) => self.parse_list(stream, false),
-            (d, Some('.'), _) if d.is_ascii_digit() => self.parse_list(stream, true),
-            (d1, Some(d2), Some('.')) if d1.is_ascii_digit() && d2.is_ascii_digit() => {
-                self.parse_list(stream, true)
-            }
-            ('`', Some('`'), Some('`')) => self.parse_code_block(stream),
-            ('!', Some('['), _) => self.parse_image(stream),
-            ('>', _, _) => self.parse_block_quote(stream),
-            _ => self.parse_paragraph(stream),
+            // (Some('#'), _, _) => self.parse_heading(),
+            _ => self.parse_paragraph(),
        }
    }

-    // --- STRUCTURE PARSING ---
-    fn parse_paragraph(&self, stream: &mut Peekable<Chars<'_>>) -> Node {
-        let children = self.parse_inline(stream);
-        Node::Paragraph { children }
-    }
-
-    fn parse_heading(&self, stream: &mut Peekable<Chars<'_>>) -> Node {
-        // Consume the hashes to determine the size, then consume the whitespace
-        let hashes = self.consume_until_char(stream, ' ');
-        self.consume_whitespace(stream);
-
-        let children = self.parse_inline(stream);
-        dbg!(&children);
+    fn parse_heading(&mut self) -> Node {
        Node::Heading {
-            level: hashes.len(),
-            children,
+            level: 1,
+            children: vec![],
        }
    }

-    fn parse_list(&self, stream: &mut Peekable<Chars<'_>>, ordered: bool) -> Node {
-        Node::Text { content: "".into() }
-    }
-
-    fn parse_block_quote(&self, stream: &mut Peekable<Chars<'_>>) -> Node {
-        Node::Text { content: "".into() }
-    }
-
-    fn parse_code_block(&self, stream: &mut Peekable<Chars<'_>>) -> Node {
-        Node::Text { content: "".into() }
-    }
-
-    fn parse_image(&self, stream: &mut Peekable<Chars<'_>>) -> Node {
-        Node::Text { content: "".into() }
+    fn parse_paragraph(&mut self) -> Node {
+        Node::Paragraph {
+            children: self.parse_inline(),
+        }
    }

    // --- INLINE PARSING ---
-    fn parse_inline(&self, stream: &mut Peekable<Chars<'_>>) -> Vec<Node> {
+    fn parse_inline(&mut self) -> Vec<Node> {
        let mut nodes = vec![];
-        let mut str = String::new();
+        let mut str = "".to_string();

-        // use a clone to allow for peeking ahead
-        // REMEMBER TO ALSO CONSUME ANYTIME MAIN STREAM IS CONSUMED
-        let mut clone = stream.clone();
-        clone.next(); // Stay one ahead
-
-        while let Some(&c1) = stream.peek() {
-            let c2 = clone.next();
-            let c3 = clone.next();
-
-            // println!("({}, {}, {})", c1, c2.unwrap_or('~'), c3.unwrap_or('~'));
+        while !self.is_eof() {
+            // c1 stores current char, c2/c3 store future, contextual chars
+            let c1 = self.peek();
+            let c2 = self.peek_nth(1);
+            let c3 = self.peek_nth(2);

+            // TODO: Need to redesign the nodes
            match (c1, c2, c3) {
-                ('\n', _, _) => break,
-                ('!', Some('['), _) => { /* Image */ }
-                ('[', _, _) => { /* Link */ }
-                ('*', Some('*'), Some('*')) => { /* Bold Italic */ }
-                ('*', Some('*'), _) => {
-                    nodes.push(Node::Text { content: str });
-                    str = "".into();
-                    let node = self.parse_bold(stream);
+                (None, _, _) | (Some('\n'), Some('\n'), _) => break,
+                (Some('!'), Some('['), _) =>
+                /* parse image */
+                {
+                    continue;
+                }
+                (Some('['), _, _) =>
+                /* parse link */
+                {
+                    continue;
+                }
+                (Some('*'), Some('*'), Some('*')) =>
+                /* parse bold italic */
+                {
+                    continue;
+                }
+                (Some('*'), Some('*'), _) =>
+                /* parse bold */
+                {
+                    continue;
+                }
+                (Some('*'), _, _) => {
+                    nodes.push(Node::Text {
+                        content: str.clone(),
+                    });
+                    str = "".to_string();
+                    let node = self.parse_italic();
                    if !node.is_empty() {
-                        nodes.push(node)
+                        nodes.push(node);
                    }
                    continue;
                }
-                ('*', _, _) => { /* Italic */ }
-                ('`', _, _) => { /* Code */ }
+                (Some('`'), _, _) =>
+                /* parse code */
+                {
+                    continue;
+                }

-                _ => {
-                    // Should we swap '\n' with ' '
-                    str.push(c1);
-                    stream.next();
-                    clone.next();
+                (Some(c), _, _) => {
+                    str.push(c);
+                    self.consume();
                }
            }
        }

-        // Push final node
-        if !str.is_empty() {
-            nodes.push(Node::Text { content: str });
-        }
+        // TODO: Push text node
+        nodes.push(Node::Text { content: str });
        nodes
    }

-    /// BUG: THIS FUNCTION SHOULD PARSE UNTIL IT FINDS EITHER THE ** OR AN ENDING OF A BLOCK. FOR
-    /// EXAMPLE: \n\n IS A NEW BLOCK AND THEN IT SHOULD END. BUT IF IT ENDS ON A NEW BLOCK, IT
-    /// SHOULD RETURN A TEXT NODE, WITH THE ** PREPENDED, SIGNIFYING FAILURE TO COMPLETE THE ENTIRE
-    /// STRONG BLOCK.
-    fn parse_bold(&self, stream: &mut Peekable<Chars<'_>>) -> Node {
-        let mut str = String::new();
+    fn parse_italic(&mut self) -> Node {
+        let mut str = "".to_string();
+        self.consume(); // Consume the '*'

-        stream.next();
-        stream.next();
+        println!("'{}'", self.content);

-        let mut clone = stream.clone();
-        clone.next();
-
-        while let Some(&c1) = stream.peek() {
-            let c2 = clone.peek();
+        // Use loop instead of 'while !self.is_eof()' so we can make it to the (None, _) case to
+        // exit
+        loop {
+            let c1 = self.peek();
+            let c2 = self.peek_nth(1);

            match (c1, c2) {
-                ('\n', Some('\n')) => break,
-                ('*', Some('*')) => {
-                    stream.next();
-                    stream.next();
+                (None, _) | (Some('\n'), None) | (Some('\n'), Some('\n')) => {
+                    // In this case, we did not find an ending star, so we should return a normal
+                    // node. But we have to add the star back since we consumed it already
+                    str.insert(0, '*');
+                    return Node::Text { content: str };
+                }
+                (Some('*'), _) => {
+                    self.consume();
                    break;
                }
-                _ => str.push(c1),
+                (Some(c), _) => {
+                    str.push(c);
+                    self.consume();
+                }
            }
-            stream.next();
-            clone.next();
        }

-        println!("@str '{}'", str);
-
-        Node::Bold { content: str }
+        Node::Italic { content: str }
    }

-    // --- HELPER FUNCTIONS ---
-    fn consume_whitespace(&self, stream: &mut Peekable<Chars<'_>>) {
-        while let Some(&c) = stream.peek() {
+    // --- HELPERS ---
+    fn is_eof(&self) -> bool {
+        self.position >= self.content.len()
+    }
+
+    fn peek(&self) -> Option<char> {
+        self.peek_nth(0)
+    }
+
+    fn peek_nth(&self, n: usize) -> Option<char> {
+        self.content[self.position..].chars().nth(n)
+    }
+
+    fn consume(&mut self) {
+        self.consume_n(1)
+    }
+
+    fn consume_n(&mut self, n: usize) {
+        for _ in 0..n {
+            if let Some(c) = self.content[self.position..].chars().next() {
+                self.position += c.len_utf8();
+            } else {
+                break;
+            }
+        }
+    }
+
+    fn consume_whitespace(&mut self) {
+        while let Some(c) = self.peek() {
            if !c.is_whitespace() {
                break;
            }
-            stream.next();
+            self.consume();
+        }
+    }
+}
+
+#[cfg(test)]
+mod parser_tests {
+    use super::Parser;
+
+    #[test]
+    fn test_parse_italic() {
+        // This test only tests the `parse_italic` method, so it is expected that the first
+        // character is a '*', otherwise the first character will be consumed. The `parse_inline`
+        // tests will be able to handle more specific cases.
+        {
+            let s = "*hello world*";
+            let html = "<em>hello world</em>";
+            let mut p = Parser::new(s);
+            let node = p.parse_italic();
+            assert_eq!(node.to_html(), html);
+        }
+        {
+            let s = "*hello* world";
+            let html = "<em>hello</em>";
+            let mut p = Parser::new(s);
+            let node = p.parse_italic();
+            assert_eq!(node.to_html(), html);
+        }
+        {
+            let s = "*hello world";
+            let html = "*hello world";
+            let mut p = Parser::new(s);
+            let node = p.parse_italic();
+            assert_eq!(node.to_html(), html);
+        }
+        {
+            let s = "*hello world\n";
+            let html = "*hello world";
+            let mut p = Parser::new(s);
+            let node = p.parse_italic();
+            assert_eq!(node.to_html(), html);
+        }
+        {
+            let s = "*hello world\n\n";
+            let html = "*hello world";
+            let mut p = Parser::new(s);
+            let node = p.parse_italic();
+            assert_eq!(node.to_html(), html);
+        }
+        {
+            let s = "*hello\n\nworld*";
+            let html = "*hello";
+            let mut p = Parser::new(s);
+            let node = p.parse_italic();
+            assert_eq!(node.to_html(), html);
        }
    }
-
-    /// THIS DOES NOT CONSUME THE TARGET, IT STOPS RIGHT BEFORE IT AND RETURNS THE STRING UNTIL
-    /// ITSELF
-    fn consume_until_char(&self, stream: &mut Peekable<Chars<'_>>, target: char) -> String {
-        let mut out = String::new();
-
-        while let Some(&c) = stream.peek() {
-            if c == target {
-                break;
-            }
-            stream.next();
-            out.push(c);
-        }
-
-        out
-    }
 }
--- a/src/main.rs
+++ b/src/main.rs
@ -3,13 +3,16 @@ use transpiler::parser::Parser;

 pub fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = Filesystem::read_file("./test.md");
-    let content;
+    let mut content;
    match file {
        Ok(s) => content = s,
        Err(err) => panic!("Failed to read file. {}", err),
    }

-    let parser = Parser::new(&content);
+    // Normalize char stream
+    content = content.replace("\r\n", "\n").replace("\r", "");
+
+    let mut parser = Parser::new(&content);
    let node = parser.parse_document();

    match Filesystem::write_file("./output.html", &node.to_html()) {
--- a/test.md
+++ b/test.md
@ -1 +1,7 @@
-# h1, **this** is sick as fuck 
+hello *world*. This is pre*tty* cool
+
+
+What about this
+
+
+This *should have a star