FEATURE: Implemented basic parser rules #18

Merged
shultzp1 merged 3 commits from feature/parser-basic-rules into main 2025-10-16 18:34:35 -07:00
4 changed files with 301 additions and 17 deletions
Showing only changes of commit 2c1e137157 - Show all commits

View File

@ -1,9 +1,17 @@
#include "parser.h"
#include "inlineNode.h"
#include "structureNode.h"
#include "util.h"
#include <algorithm>
#include <cctype>
#include <fstream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
using std::string;
using std::vector;
Parser::Parser(string input_file_path, string output_file_path) {
// NOTE: Remove any white space AROUND the inputs
@ -34,3 +42,248 @@ void Parser::Inspect() {
std::cout << "std::string output_file_path: " << this->output_file_path
<< std::endl;
}
// Normalize line endings in the working content.
//
// Removes every carriage return ('\r') from `content`. This turns each
// Windows-style "\r\n" pair into a plain "\n" and also drops any stray '\r'
// that is not followed by a newline.
void Parser::NormalizeInputStream() {
  // Erasing all '\r' characters already collapses "\r\n" into "\n", so a
  // separate find/replace pass over the pairs is unnecessary. A single
  // erase-remove pass is linear and is a no-op on empty content.
  this->content.erase(
      std::remove(this->content.begin(), this->content.end(), '\r'),
      this->content.end());
}
void Parser::ParseDocument() {
// Open the input file
std::ifstream input_file(this->input_file_path);
if (!input_file.is_open()) {
throw std::runtime_error("Failed to open input file.");
return;
}
// Read the file into a single string
std::stringstream buffer;
buffer << input_file.rdbuf();
this->content = buffer.str();
input_file.close();
// We need document parent
this->DOM = std::make_unique<DocumentNode>();
while (!IsEOF()) {
// std::cout << Peek(); Consume();
auto block = ParseBlock();
if (block != nullptr)
this->DOM->AddChild(std::move(block));
}
std::cout << this->DOM->ToHtml();
}
// Block dispatcher: looks at the character at the current position and hands
// off to the matching block-level sub-parser. It parses nothing itself.
std::unique_ptr<Node> Parser::ParseBlock() {
  // Skip the blank space (' ', '\t', '\n') in front of the block.
  ConsumeWhiteSpace();

  // A leading '#' selects the heading parser; anything else is treated as a
  // paragraph, which is the default case.
  const bool starts_with_hash = (Peek() == '#');
  return starts_with_hash ? ParseHeading() : ParseParagraph();
}
std::unique_ptr<Node> Parser::ParseParagraph() {
auto node = std::make_unique<ParagraphNode>();
// This should call parse inline
auto text_nodes = ParseInline();
for (auto &text_node : text_nodes) {
node->AddChild(std::move(text_node));
}
if (node->GetChilren().size() < 1)
return nullptr;
return node;
}
std::unique_ptr<Node> Parser::ParseHeading() {
// Compute the size of the heading
int i = 0;
char c = Peek();
while (c == '#') {
c = Peek(i++);
}
Consume(i - 1);
auto node = std::make_unique<HeadingNode>(i - 1);
ConsumeWhiteSpace();
std::string str;
while (!IsEOF()) {
c = Peek();
// We can stop as soon as we see a new line. Headings are single line blocks
if (c == '\n')
break;
// If a newline, use a space instead
str += c;
Consume();
}
// BUG: Why do we need to check this?
if (str == "")
return nullptr;
auto text_node = std::make_unique<TextNode>(str);
node->AddChild(std::move(text_node));
return node;
}
// Parse inline content starting at the current position.
//
// Plain characters are accumulated into text runs and emitted as TextNodes
// (a single '\n' inside a run is replaced by a space). A '*' starts an
// emphasis span, which is delegated to ParseBoldItalic ("***"), ParseBold
// ("**") or ParseItalic ("*"). Parsing stops at an empty line ("\n\n") or at
// end of input.
//
// Returns the nodes in the order they were found; may be empty.
vector<std::unique_ptr<Node>> Parser::ParseInline() {
  vector<std::unique_ptr<Node>> nodes;
  string str;

  while (!IsEOF()) {
    const char c = Peek();

    // If this char and the next are both newlines we hit an empty line,
    // which ends the inline run.
    if (c == '\n' && Peek(1) == '\n')
      break;

    if (c == '*') {
      // Flush the pending plain text before the emphasis node so the output
      // order matches the input order.
      PushTextNode(nodes, str);

      // Longest marker wins: "***" before "**" before "*". The sub-parsers
      // return prvalues, so they are passed straight to push_back.
      if (Peek(1) == '*' && Peek(2) == '*') {
        nodes.push_back(ParseBoldItalic());
      } else if (Peek(1) == '*') {
        nodes.push_back(ParseBold());
      } else {
        nodes.push_back(ParseItalic());
      }
      continue;
    }

    // If a newline, use a space instead
    str += (c == '\n' ? ' ' : c);
    Consume();
  }

  // Push the last text run, if it is not empty
  PushTextNode(nodes, str);
  return nodes;
}
std::unique_ptr<Node> Parser::ParseItalic() {
string str;
Consume(1);
while (!IsEOF()) {
char c = Peek();
if (c == '\n' && Peek(1) == '\n')
break;
if (c == '*') {
Consume(1);
break;
}
str += c;
Consume();
}
return std::make_unique<ItalicNode>(str);
}
std::unique_ptr<Node> Parser::ParseBold() {
string str;
Consume(2);
while (!IsEOF()) {
char c = Peek();
if (c == '\n' && Peek(1) == '\n')
break;
if (c == '*' && Peek(1) == '*') {
Consume(2);
break;
}
str += c;
Consume();
}
return std::make_unique<BoldNode>(str);
}
std::unique_ptr<Node> Parser::ParseBoldItalic() {
string str;
Consume(3);
while (!IsEOF()) {
char c = Peek();
if (c == '\n' && Peek(1) == '\n')
break;
if (c == '*' && Peek(1) == '*' && Peek(2) == '*') {
Consume(3);
break;
}
str += c;
Consume();
}
return std::make_unique<BoldItalicNode>(str);
}
// Flush an accumulated text run into the node list.
//
// If `str` is non-empty, a TextNode built from it is appended to `nodes`.
// In every case `str` is empty when this function returns, so the caller can
// keep using it as an accumulator for the next run.
void Parser::PushTextNode(vector<std::unique_ptr<Node>> &nodes, string &str) {
  // Nothing accumulated: no node to emit, and `str` is already empty.
  if (str.empty())
    return;

  // make_unique already yields a temporary, so no std::move is needed.
  nodes.push_back(std::make_unique<TextNode>(str));
  str.clear();
}
// Return the character `offset` positions ahead of the current position
// without consuming anything. Yields '\0' when that index is past the end of
// the content.
char Parser::Peek(size_t offset) {
  const size_t target = this->position + offset;
  const bool past_end = target >= this->content.length();
  return past_end ? '\0' : this->content[target];
}
// Advance the current position by `count` characters. No bounds check is
// done here; Peek() and IsEOF() handle positions past the end.
void Parser::Consume(size_t count) {
  this->position = this->position + count;
}
// True once the current position has reached or passed the end of the
// content.
bool Parser::IsEOF() {
  const size_t total_length = this->content.length();
  return !(this->position < total_length);
}
// Skip over any run of spaces, tabs and newlines at the current position.
// Stops at the first other character, or at end of input (where Peek()
// yields '\0').
void Parser::ConsumeWhiteSpace() {
  // TODO: This can be optimized using an accumulator and then consuming
  for (;;) {
    const char next = Peek();
    const bool is_blank = (next == ' ' || next == '\t' || next == '\n');
    if (!is_blank)
      return;
    Consume();
  }
}

View File

@ -1,11 +1,14 @@
#ifndef PARSER_H
#define PARSER_H
#include "node.h"
#include <iostream>
#include <memory>
#include <stack>
#include <string>
using std::string;
using std::vector;
/**
* @brief Markdown parser class.
@ -48,7 +51,7 @@ public:
*
* @author Hayden Hargreaves (hhargreaves2006@gmail.com)
*/
void ParseDocument(void);
void ParseDocument();
protected:
/**
@ -70,35 +73,57 @@ protected:
*/
string output_file_path;
/**
* @brief Parser generated tree.
*
* This value will store the root, which is expected to be a DocumentNode.
* This node will mark the start of the tree. The parser will populate this
* tree during the parsing process.
*
* @author Hayden Hargreaves (hhargreaves2006@gmail.com)
*/
std::unique_ptr<Node> DOM;
// NOTE: We need a stack, just not sure what goes in it yet
// std::stack<any> stack;
private:
// windows... >:(
void NormalizeInputStream();
/**
* @brief Parse a single line.
* @brief Parse a single block of content
*
* How does this function work...
* This is where the magic happens.
*
* @param line Target line to parse, as string.
* @return DOMNode, once exists
*
* @author Hayden Hargreaves (hhargreaves2006@gmail.com)
*/
void ParseLine(string line);
std::unique_ptr<Node> ParseBlock();
// NOTE: Parser operations, again, abstract, just for brainstorming now
// These should operate on internal state, not lines themselves
void ParseHeader();
void ParseParagraph();
void ParseItalic();
void ParseBold();
void ParseBoldItalic();
// Stores index in the string
size_t position = 0;
// NOTE: Character operations, these are just for brainstorming
char Peek();
void Consume();
bool EndOfLine();
// Working input content
string content;
std::unique_ptr<Node> ParseParagraph();
std::unique_ptr<Node> ParseHeading();
vector<std::unique_ptr<Node>> ParseInline();
void PushTextNode(vector<std::unique_ptr<Node>> &nodes, string &str);
std::unique_ptr<Node> ParseItalic();
std::unique_ptr<Node> ParseBold();
std::unique_ptr<Node> ParseBoldItalic();
char Peek(size_t offset = 0);
void Consume(size_t count = 1);
bool IsEOF();
void ConsumeWhiteSpace();
};
#endif

View File

@ -126,4 +126,4 @@ std::string WatchDog::timePointToString(const fs::file_time_type& timePoint){
std::strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", &localTime);
return std::string(buffer);
}
}

View File

@ -78,4 +78,10 @@ void test_input(int argc, char **argv) {
std::cout << std::endl;
}
int main(int argc, char **argv) { test_nodes(); }
int main(int argc, char **argv) {
Parser p("input.md");
p.ParseDocument();
Parser p2("README.md");
p2.ParseDocument();
}