use crate::node::Node; #[derive(Debug)] pub struct Parser<'a> { content: &'a str, position: usize, } impl<'a> Parser<'a> { // Content should be normalized before being passed into this function. Since we do not take // ownership here, we cannot mutate it. pub fn new(content: &'a str) -> Self { Self { content, position: 0, } } pub fn parse_document(&mut self) -> Node { let mut root = Node::Document { children: vec![] }; while !self.is_eof() { let block = self.parse_block(); if !block.is_empty() { root.add_child(block); } } root } // BUG: We should be using optional, not blank nodes fn parse_block(&mut self) -> Node { self.consume_whitespace(); // If we are at the end, return an empty node, the caller should know to return it. if self.is_eof() { return Node::Text { content: "".into() }; } let c1 = self.peek(); let c2 = self.peek_nth(1); let c3 = self.peek_nth(2); // TODO: For now we are just implementing paragraphs. So we can start with inline parsing match (c1, c2, c3) { // (Some('#'), _, _) => self.parse_heading(), _ => self.parse_paragraph(), } } fn parse_paragraph(&mut self) -> Node { self.consume_whitespace(); Node::Paragraph { children: self.parse_inline(), } } // --- INLINE PARSING --- fn parse_inline(&mut self) -> Vec { let mut nodes = vec![]; let mut str = String::new(); while !self.is_eof() { let c1 = self.peek(); let c2 = self.peek_nth(1); let c3 = self.peek_nth(2); println!("parse_inline: c1={:?}, c2={:?}, c3={:?}", c1, c2, c3); // DEBUG match (c1, c2, c3) { (None, _, _) | (Some('\n'), Some('\n'), _) => break, // Check for ** (bold) before * (italic) (Some('*'), Some('*'), _) => { println!("Matched bold"); if !str.is_empty() { nodes.push(Node::Text { content: str.clone(), }); str.clear(); } nodes.push(self.parse_bold()); } // Check for * (italic) (Some('*'), _, _) => { println!("Matched italic"); if !str.is_empty() { nodes.push(Node::Text { content: str.clone(), }); str.clear(); } nodes.push(self.parse_italic()); } (Some(c), _, _) => { str.push(c); self.consume(); } } } if !str.is_empty() { nodes.push(Node::Text { content: str }); } nodes } fn parse_bold(&mut self) -> Node { self.consume_n(2); // Consume opening '**' let mut children = vec![]; let mut str = String::new(); while !self.is_eof() { let c1 = self.peek(); let c2 = self.peek_nth(1); println!("parse_bold: c1={:?} c2={:?}", c1, c2); match (c1, c2) { (None, _) | (Some('\n'), Some('\n')) => break, // Found closing '**' (Some('*'), Some('*')) => { println!("parse_bold: matched closing bold"); if !str.is_empty() { children.push(Node::Text { content: str }); } self.consume_n(2); return Node::Bold { children }; } // Single '*' inside bold (italic) (Some('*'), _) => { println!("parse_bold: matched italic"); if !str.is_empty() { children.push(Node::Text { content: str.clone(), }); str.clear(); } children.push(self.parse_italic()); } (Some(c), _) => { str.push(c); self.consume(); } } } // No closing '**' found - return as text with '**' prefix if !str.is_empty() { children.push(Node::Text { content: str }); } let mut text = String::from("**"); for child in children { if let Node::Text { content } = child { text.push_str(&content); } } Node::Text { content: text } } fn parse_italic(&mut self) -> Node { self.consume(); // Consume opening '*' let mut children = vec![]; let mut str = String::new(); while !self.is_eof() { let c1 = self.peek(); let c2 = self.peek_nth(1); println!("parse_italic: c1={:?} c2={:?}", c1, c2); match (c1, c2) { (None, _) | (Some('\n'), Some('\n')) => break, // Check for '**' (bold inside italic) (Some('*'), Some('*')) => { println!("parse_italic: matched bold"); if !str.is_empty() { children.push(Node::Text { content: str.clone(), }); str.clear(); } children.push(self.parse_bold()); } // Single '*' - our closing delimiter (Some('*'), _) => { println!("parse_italic: matched closing italic"); if !str.is_empty() { children.push(Node::Text { content: str }); } self.consume(); return Node::Italic { children }; } (Some(c), _) => { str.push(c); self.consume(); } } } // No closing '*' found - return as text with '*' prefix if !str.is_empty() { children.push(Node::Text { content: str }); } let mut text = String::from("*"); for child in children { if let Node::Text { content } = child { text.push_str(&content); } } Node::Text { content: text } } // --- INLINE PARSING --- // fn parse_inline(&mut self) -> Vec { // let mut nodes = vec![]; // let mut str = String::new(); // // while !self.is_eof() { // let c1 = self.peek(); // let c2 = self.peek_nth(1); // let c3 = self.peek_nth(2); // // match (c1, c2, c3) { // // Stop at double newline (paragraph break) // (None, _, _) | (Some('\n'), Some('\n'), _) => break, // // // Check for *** (bold italic) - LONGEST FIRST // (Some('*'), Some('*'), Some('*')) => { // if !str.is_empty() { // nodes.push(Node::Text { // content: str.clone(), // }); // str.clear(); // } // nodes.push(self.parse_bold_italic()); // } // // // Check for ** (bold) // (Some('*'), Some('*'), _) => { // if !str.is_empty() { // nodes.push(Node::Text { // content: str.clone(), // }); // str.clear(); // } // nodes.push(self.parse_bold()); // } // // // Check for * (italic) // (Some('*'), _, _) => { // if !str.is_empty() { // nodes.push(Node::Text { // content: str.clone(), // }); // str.clear(); // } // nodes.push(self.parse_italic()); // } // // // Regular character // (Some(c), _, _) => { // str.push(c); // self.consume(); // } // } // } // // if !str.is_empty() { // nodes.push(Node::Text { content: str }); // } // // nodes // } // // fn parse_bold_italic(&mut self) -> Node { // self.consume_n(3); // Consume opening '***' // // let mut children = vec![]; // let mut str = String::new(); // // while !self.is_eof() { // let c1 = self.peek(); // let c2 = self.peek_nth(1); // let c3 = self.peek_nth(2); // // match (c1, c2, c3) { // (None, _, _) | (Some('\n'), Some('\n'), _) => break, // // // Found closing '***' // (Some('*'), Some('*'), Some('*')) => { // if !str.is_empty() { // children.push(Node::Text { content: str }); // } // self.consume_n(3); // return Node::BoldItalic { children }; // } // // // Regular character (no nested formatting in bold-italic for simplicity) // (Some(c), _, _) => { // str.push(c); // self.consume(); // } // } // } // // // No closing '***' found - return as text // if !str.is_empty() { // children.push(Node::Text { content: str }); // } // let mut text = String::from("***"); // for child in children { // if let Node::Text { content } = child { // text.push_str(&content); // } // } // Node::Text { content: text } // } // // fn parse_bold(&mut self) -> Node { // self.consume_n(2); // Consume opening '**' // // let mut children = vec![]; // let mut str = String::new(); // // while !self.is_eof() { // let c1 = self.peek(); // let c2 = self.peek_nth(1); // let c3 = self.peek_nth(2); // // match (c1, c2, c3) { // (None, _, _) | (Some('\n'), Some('\n'), _) => break, // // // Check for closing '**' but NOT '***' // (Some('*'), Some('*'), Some('*')) => { // // This is ***, not our closing ** // // Treat as text (or you could support nested bold-italic) // str.push('*'); // self.consume(); // } // // // Found closing '**' // (Some('*'), Some('*'), _) => { // if !str.is_empty() { // children.push(Node::Text { content: str }); // } // self.consume_n(2); // return Node::Bold { children }; // } // // // Single '*' inside bold (italic) // (Some('*'), _, _) => { // if !str.is_empty() { // children.push(Node::Text { // content: str.clone(), // }); // str.clear(); // } // children.push(self.parse_italic()); // } // // (Some(c), _, _) => { // str.push(c); // self.consume(); // } // } // } // // // No closing '**' found - return as text with '**' prefix // if !str.is_empty() { // children.push(Node::Text { content: str }); // } // let mut text = String::from("**"); // for child in children { // if let Node::Text { content } = child { // text.push_str(&content); // } // } // Node::Text { content: text } // } // // fn parse_italic(&mut self) -> Node { // self.consume(); // Consume opening '*' // // let mut children = vec![]; // let mut str = String::new(); // // while !self.is_eof() { // let c1 = self.peek(); // let c2 = self.peek_nth(1); // let c3 = self.peek_nth(2); // // match (c1, c2, c3) { // (None, _, _) | (Some('\n'), Some('\n'), _) => break, // // // Check for '***' - not our closing // (Some('*'), Some('*'), Some('*')) => { // // Treat as text or handle specially // str.push('*'); // self.consume(); // } // // // Check for '**' (bold inside italic) // (Some('*'), Some('*'), _) => { // if !str.is_empty() { // children.push(Node::Text { // content: str.clone(), // }); // str.clear(); // } // children.push(self.parse_bold()); // } // // // Single '*' - our closing delimiter // (Some('*'), _, _) => { // if !str.is_empty() { // children.push(Node::Text { content: str }); // } // self.consume(); // return Node::Italic { children }; // } // // (Some(c), _, _) => { // str.push(c); // self.consume(); // } // } // } // // // No closing '*' found - return as text with '*' prefix // if !str.is_empty() { // children.push(Node::Text { content: str }); // } // let mut text = String::from("*"); // for child in children { // if let Node::Text { content } = child { // text.push_str(&content); // } // } // Node::Text { content: text } // } // // // // // fn parse_inline(&mut self) -> Vec { // self.parse_inline_until(&[]) // } // // fn parse_inline_until(&mut self, chars: &[char]) -> Vec { // let mut nodes = vec![]; // let mut str = String::new(); // // while !self.is_eof() { // let c1 = self.peek(); // let c2 = self.peek_nth(1); // let c3 = self.peek_nth(2); // // match (c1, c2, c3) { // // Default stop conditions for all elements: empty or block break // (None, _, _) | (Some('\n'), Some('\n'), _) => break, // // // Parse bold nodes // (Some('*'), Some('*'), _) => { // if !str.is_empty() { // nodes.push(Node::Text { // content: str.clone(), // }); // str.clear(); // } // nodes.push(self.parse_bold()); // } // // // Check the dynamic stop conditions // (Some(c), _, _) if chars.contains(&c) => break, // // // Parse italic nodes // (Some('*'), _, _) => { // if !str.is_empty() { // nodes.push(Node::Text { // content: str.clone(), // }); // str.clear(); // } // nodes.push(self.parse_italic()); // } // // (Some(c), _, _) => { // str.push(c); // self.consume(); // } // } // } // // // If content remains, push it to the list // if !str.is_empty() { // nodes.push(Node::Text { content: str }); // } // // nodes // } // // fn parse_bold(&mut self) -> Node { // self.consume_n(2); // Consume opening '**' // // // DON'T pass '*' as stop char - we need to look for '**' specifically // let mut children = vec![]; // let mut str = String::new(); // // while !self.is_eof() { // let c1 = self.peek(); // let c2 = self.peek_nth(1); // // match (c1, c2) { // (None, _) | (Some('\n'), Some('\n')) => break, // // // Found closing '**' // (Some('*'), Some('*')) => { // if !str.is_empty() { // children.push(Node::Text { content: str }); // } // self.consume_n(2); // return Node::Bold { children }; // } // // // Single '*' inside bold (could be italic) // (Some('*'), _) => { // if !str.is_empty() { // children.push(Node::Text { // content: str.clone(), // }); // str.clear(); // } // children.push(self.parse_italic()); // } // // (Some(c), _) => { // str.push(c); // self.consume(); // } // } // } // // // No closing '**' found // if !str.is_empty() { // children.push(Node::Text { content: str }); // } // let mut text = String::from("**"); // for child in children { // if let Node::Text { content } = child { // text.push_str(&content); // } // } // Node::Text { content: text } // } // // fn parse_italic(&mut self) -> Node { // self.consume(); // Consume opening '*' // // // Parse inline content until we hit closing '*' or end condition // let children = self.parse_inline_until(&['*', '\n']); // // // Check if we found the closing '*' // if self.peek() == Some('*') { // self.consume(); // Consume closing '*' // Node::Italic { children } // } else { // // No closing '*' found - return as plain text with the '*' prefix // let mut text = String::from("*"); // for child in children { // if let Node::Text { content } = child { // text.push_str(&content); // } // // Note: This is simplified - you'd need to flatten properly // } // Node::Text { content: text } // } // } // --- HELPERS --- fn is_eof(&self) -> bool { self.position >= self.content.len() } fn peek(&self) -> Option { self.peek_nth(0) } fn peek_nth(&self, n: usize) -> Option { self.content[self.position..].chars().nth(n) } fn consume(&mut self) { self.consume_n(1) } fn consume_n(&mut self, n: usize) { for _ in 0..n { if let Some(c) = self.content[self.position..].chars().next() { self.position += c.len_utf8(); } else { break; } } } fn consume_whitespace(&mut self) { while let Some(c) = self.peek() { if !c.is_whitespace() { break; } self.consume(); } } } #[cfg(test)] mod parser_tests { use super::Parser; #[test] fn test_parse_italic() { // This test only tests the `parse_italic` method, so it is expected that the first // character is a '*', otherwise the first character will be consumed. The `parse_inline` // tests will be able to handle more specific cases. { let s = "*hello world*"; let html = "hello world"; let mut p = Parser::new(s); let node = p.parse_italic(); assert_eq!(node.to_html(), html); } { let s = "*hello* world"; let html = "hello"; let mut p = Parser::new(s); let node = p.parse_italic(); assert_eq!(node.to_html(), html); } { let s = "*hello world"; let html = "*hello world"; let mut p = Parser::new(s); let node = p.parse_italic(); assert_eq!(node.to_html(), html); } { let s = "*hello world\n"; let html = "*hello world\n"; let mut p = Parser::new(s); let node = p.parse_italic(); assert_eq!(node.to_html(), html); } { let s = "*hello world\n\n"; let html = "*hello world"; let mut p = Parser::new(s); let node = p.parse_italic(); assert_eq!(node.to_html(), html); } { let s = "*hello\n\nworld*"; let html = "*hello"; let mut p = Parser::new(s); let node = p.parse_italic(); assert_eq!(node.to_html(), html); } } #[test] fn test_nested_bold_and_italics() { { let s = "***a***"; let html = "a"; let mut p = Parser::new(s); let nodes = p.parse_inline(); assert_eq!(nodes.len(), 1); assert_eq!(nodes[0].to_html(), html); } { let s = "*a **b** c*"; let html = "a b c"; let mut p = Parser::new(s); let nodes = p.parse_inline(); assert_eq!(nodes.len(), 1); assert_eq!(nodes[0].to_html(), html); } { let s = "**a *b* c**"; let html = "a b c"; let mut p = Parser::new(s); let nodes = p.parse_inline(); assert_eq!(nodes.len(), 1); assert_eq!(nodes[0].to_html(), html); } { let s = "*a *b* c*"; let html_1 = "a "; let html_2 = "b"; let html_3 = " c"; let mut p = Parser::new(s); let nodes = p.parse_inline(); assert_eq!(nodes.len(), 3); assert_eq!(nodes[0].to_html(), html_1); assert_eq!(nodes[1].to_html(), html_2); assert_eq!(nodes[2].to_html(), html_3); } { let s = "**a **b** c**"; let html_1 = "a "; let html_2 = "b"; let html_3 = " c"; let mut p = Parser::new(s); let nodes = p.parse_inline(); assert_eq!(nodes.len(), 3); assert_eq!(nodes[0].to_html(), html_1); assert_eq!(nodes[1].to_html(), html_2); assert_eq!(nodes[2].to_html(), html_3); } } }