From 2c1e13715760835234a0e93081f3063ffca59a79 Mon Sep 17 00:00:00 2001 From: Hayden Hargreaves Date: Thu, 16 Oct 2025 17:19:51 -0700 Subject: [PATCH 1/2] (FEAT): Parser is working pretty well. I needed to make this commit so I can test on the windows machine... --- lib/parser.cpp | 253 +++++++++++++++++++++++++++++++++++++++++++++++ lib/parser.h | 55 ++++++++--- lib/watchDog.cpp | 2 +- src/main.cpp | 8 +- 4 files changed, 301 insertions(+), 17 deletions(-) diff --git a/lib/parser.cpp b/lib/parser.cpp index 0f563e0..9519d82 100644 --- a/lib/parser.cpp +++ b/lib/parser.cpp @@ -1,9 +1,17 @@ #include "parser.h" +#include "inlineNode.h" +#include "structureNode.h" #include "util.h" +#include #include +#include +#include +#include #include +#include using std::string; +using std::vector; Parser::Parser(string input_file_path, string output_file_path) { // NOTE: Remove any white space AROUND the inputs @@ -34,3 +42,248 @@ void Parser::Inspect() { std::cout << "std::string output_file_path: " << this->output_file_path << std::endl; } + +// replace '\r\n' with '\n' +void Parser::NormalizeInputStream() { + if (this->content.empty()) + return; + + size_t pos = 0; + while ((pos = content.find("\r\n", pos)) != string::npos) { + this->content.replace(pos, 2, "\n"); + pos++; + } + + // NOTE: Remove all occurrences of '\r' + this->content.erase( + std::remove(this->content.begin(), this->content.end(), '\r'), + this->content.end()); +} + +void Parser::ParseDocument() { + // Open the input file + std::ifstream input_file(this->input_file_path); + + if (!input_file.is_open()) { + throw std::runtime_error("Failed to open input file."); + return; + } + + // Read the file into a single string + std::stringstream buffer; + buffer << input_file.rdbuf(); + this->content = buffer.str(); + + input_file.close(); + + // We need document parent + this->DOM = std::make_unique(); + + while (!IsEOF()) { + // std::cout << Peek(); Consume(); + auto block = ParseBlock(); + if (block != nullptr) + 
this->DOM->AddChild(std::move(block)); + } + + std::cout << this->DOM->ToHtml(); +} + +// All this does is pick which subparser to call +// Identify which block to parse +std::unique_ptr Parser::ParseBlock() { + // Remove whitespace using peek and consume (' ', '\t', '\n') + ConsumeWhiteSpace(); + + // NOTE: Simple example + // std::string ch(1, Peek()); + // std::unique_ptr block = std::make_unique(ch); + // Consume(); + + if (Peek() == '#') { + return ParseHeading(); + } + + // this is the default case + return ParseParagraph(); +} + +std::unique_ptr Parser::ParseParagraph() { + auto node = std::make_unique(); + + // This should call parse inline + auto text_nodes = ParseInline(); + for (auto &text_node : text_nodes) { + node->AddChild(std::move(text_node)); + } + + if (node->GetChilren().size() < 1) + return nullptr; + + return node; +} + +std::unique_ptr Parser::ParseHeading() { + // Compute the size of the heading + int i = 0; + char c = Peek(); + while (c == '#') { + c = Peek(i++); + } + + Consume(i - 1); + auto node = std::make_unique(i - 1); + + ConsumeWhiteSpace(); + + std::string str; + while (!IsEOF()) { + c = Peek(); + // We can stop as soon as we see a new line. Headings are single line blocks + if (c == '\n') + break; + + // If a newline, use a space instead + str += c; + Consume(); + } + + // BUG: Why do we need to check this? + if (str == "") + return nullptr; + + auto text_node = std::make_unique(str); + node->AddChild(std::move(text_node)); + + return node; +} + +vector> Parser::ParseInline() { + vector> nodes; + string str; + + while (!IsEOF()) { + char c = Peek(); + // If this char and next char are both newlines: then we have an empty line, + // we should stop. 
+ if (c == '\n' && Peek(1) == '\n') + break; + + if (c == '*' && Peek(1) == '*' && Peek(2) == '*') { + PushTextNode(nodes, str); + nodes.push_back(std::move(ParseBoldItalic())); + continue; + } else if (c == '*' && Peek(1) == '*') { + PushTextNode(nodes, str); + nodes.push_back(std::move(ParseBold())); + continue; + } else if (c == '*') { + PushTextNode(nodes, str); + nodes.push_back(std::move(ParseItalic())); + continue; + } + + // If a newline, use a space instead + str += (c == '\n' ? ' ' : c); + Consume(); + } + + // Push the last node, if the string is not empty + PushTextNode(nodes, str); + return nodes; +} + +std::unique_ptr Parser::ParseItalic() { + string str; + Consume(1); + + while (!IsEOF()) { + char c = Peek(); + + if (c == '\n' && Peek(1) == '\n') + break; + + if (c == '*') { + Consume(1); + break; + } + + str += c; + Consume(); + } + + return std::make_unique(str); +} + +std::unique_ptr Parser::ParseBold() { + string str; + Consume(2); + + while (!IsEOF()) { + char c = Peek(); + + if (c == '\n' && Peek(1) == '\n') + break; + + if (c == '*' && Peek(1) == '*') { + Consume(2); + break; + } + + str += c; + Consume(); + } + + return std::make_unique(str); +} + +std::unique_ptr Parser::ParseBoldItalic() { + string str; + Consume(3); + + while (!IsEOF()) { + char c = Peek(); + + if (c == '\n' && Peek(1) == '\n') + break; + + if (c == '*' && Peek(1) == '*' && Peek(2) == '*') { + Consume(3); + break; + } + + str += c; + Consume(); + } + + return std::make_unique(str); +} + +void Parser::PushTextNode(vector> &nodes, string &str) { + if (!str.empty()) + nodes.push_back(std::move(std::make_unique(str))); + str = ""; +} + +char Parser::Peek(size_t offset) { + size_t look_ahead_pos = this->position + offset; + + if (look_ahead_pos < this->content.length()) { + return this->content[look_ahead_pos]; + } + + return '\0'; // null if past end +}; + +void Parser::Consume(size_t count) { this->position += count; }; + +bool Parser::IsEOF() { return this->position >= 
this->content.length(); }; + +void Parser::ConsumeWhiteSpace() { + // TODO: This can be optimized using an accumulator and then consuming + char c = Peek(); + while (c == ' ' || c == '\t' || c == '\n') { + Consume(); + c = Peek(); + } +} diff --git a/lib/parser.h b/lib/parser.h index 74ecfab..ce7590d 100644 --- a/lib/parser.h +++ b/lib/parser.h @@ -1,11 +1,14 @@ #ifndef PARSER_H #define PARSER_H +#include "node.h" #include +#include #include #include using std::string; +using std::vector; /** * @brief Markdown parser class. @@ -48,7 +51,7 @@ public: * * @author Hayden Hargreaves (hhargreaves2006@gmail.com) */ - void ParseDocument(void); + void ParseDocument(); protected: /** @@ -70,35 +73,57 @@ protected: */ string output_file_path; + /** + * @brief Parser generated tree. + * + * This value will store the root, which is expected to be a DocumentNode. + * This node will mark the start of the tree. The parser will populate this + * tree during the parsing process. + * + * @author Hayden Hargreaves (hhargreaves2006@gmail.com) + */ + std::unique_ptr DOM; + // NOTE: We need a stack, just not sure what goes in it yet // std::stack stack; private: + // windows... >:( + void NormalizeInputStream(); + /** - * @brief Parse a single line. + * @brief Parse a single block of content * * How does this function work... * This is where the magic happens. * - * @param line Target line to parse, as string. 
* @return DOMNode, once exists * * @author Hayden Hargreaves (hhargreaves2006@gmail.com) */ - void ParseLine(string line); + std::unique_ptr ParseBlock(); - // NOTE: Parser operations, again, abstract, just for brainstorming now - // These should operate on internal state, not lines themselves - void ParseHeader(); - void ParseParagraph(); - void ParseItalic(); - void ParseBold(); - void ParseBoldItalic(); + // Stores index in the string + size_t position = 0; - // NOTE: Character operations, these are just for brainstorming - char Peek(); - void Consume(); - bool EndOfLine(); + // Working input content + string content; + + std::unique_ptr ParseParagraph(); + std::unique_ptr ParseHeading(); + vector> ParseInline(); + + void PushTextNode(vector> &nodes, string &str); + + std::unique_ptr ParseItalic(); + std::unique_ptr ParseBold(); + std::unique_ptr ParseBoldItalic(); + + char Peek(size_t offset = 0); + void Consume(size_t count = 1); + bool IsEOF(); + + void ConsumeWhiteSpace(); }; #endif diff --git a/lib/watchDog.cpp b/lib/watchDog.cpp index e9bec9e..cbc44bd 100644 --- a/lib/watchDog.cpp +++ b/lib/watchDog.cpp @@ -126,4 +126,4 @@ std::string WatchDog::timePointToString(const fs::file_time_type& timePoint){ std::strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", &localTime); return std::string(buffer); -} \ No newline at end of file +} diff --git a/src/main.cpp b/src/main.cpp index 9a67d41..5fc78ed 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -78,4 +78,10 @@ void test_input(int argc, char **argv) { std::cout << std::endl; } -int main(int argc, char **argv) { test_nodes(); } +int main(int argc, char **argv) { + Parser p("input.md"); + p.ParseDocument(); + + Parser p2("README.md"); + p2.ParseDocument(); +} -- 2.47.2 From 4e334638631e2b1045ab7b48e5ad3945036d575f Mon Sep 17 00:00:00 2001 From: Hayden Hargreaves Date: Thu, 16 Oct 2025 17:23:29 -0700 Subject: [PATCH 2/2] (FIX): Forgot to call the method itself. 
But now this will be supported by both windows and linux. --- input.md | 115 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/parser.cpp | 3 ++ 2 files changed, 118 insertions(+) create mode 100644 input.md diff --git a/input.md b/input.md new file mode 100644 index 0000000..77ed442 --- /dev/null +++ b/input.md @@ -0,0 +1,115 @@ +# MarkdownToHtmlCompiler + +### Project Overview + +The goal is to create a program that reads a file containing text formatted in a simple version of +Markdown and converts it into a valid HTML file. The program will need to identify and translate +specific syntax (e.g., `# Heading` to `<h1>Heading</h1>`, `*text*` to `<em>text</em>`). + + +### Implementation Requirements (Generated by Gemini) + +Class Hierarchy: Design a class hierarchy to represent the components of your Markdown document. An +abstract base class, Element, can define common behavior. Derived classes would then represent specific +types of elements, such as Heading, Paragraph, BoldText, and ListItem. This is a perfect example of +inheritance and polymorphism. + +Object Composition: A Document class can be composed of multiple Element objects, representing the +entire file. A Parser class would be composed of helper methods to break down the input string and +build the Document object. This shows how you can build a complex system from smaller, self-contained +objects. + +File I/O and Exceptions: You will need to use ifstream to read the Markdown file and ofstream to write +the generated HTML file. Your code should use exceptions to gracefully handle potential errors, such +as a file not being found. + +Operator Overloading: Overload the << stream insertion operator for your Element and Document classes. +This would allow you to easily print the generated HTML to the console or write it to a file, making +your code cleaner and more readable. + +UML Diagram: The complexity of the class relationships makes a UML diagram an essential part of the +project. It will help you plan your design and will be a key component of your submission. + +Recursive Descent Parser: This is the primary algorithm you'll use. It's a top-down parsing technique +where a set of recursive functions "descend" through the grammar of your simple Markdown language. For +example, a parse_document() function would call parse_line(), which in turn might call parse_bold_text() +or parse_italic_text(). This method is intuitive and easy to implement for a simple grammar. + +Stack: A stack is essential for handling nested elements.
For instance, if you allow bold text inside +italic text (_This is *bold and italic* text_), you can push the _ token onto the stack and then push +the * token. When you encounter the closing *, you check if the top of the stack matches. This ensures +that all tags are correctly opened and closed. Your presentation can visually demonstrate this process +with a stack diagram. + +Hash Map or Map: A hash map (std::unordered_map) or a map (std::map) can be used to efficiently store +and retrieve the HTML equivalent for each Markdown tag. For example, you could map `#` to `<h1>` or `*` +to `<em>`. This provides O(1) average-case lookup time. + + +### Contribution Policy + +###### Branching +When working on this project, please use a feature branch (e.g. `feature/parser`) with a descriptive name. +`feature/a` is not a descriptive name. These branches should be branched off the most recent `main` branch, +we will not make use of a `dev` or `staging` branch since the project is small in scale as well as time. +**However, if the project becomes larger or out-of-control, a dev/staging branch will be implemented.** + +###### Commits + +When working, it is best practice to commit code as much as possible, without being overzealous. For +example, when a feature or bug is complete, it's time to commit. But when you have to make a new function, +that does not mean it's time. Each team member should use their best judgment. + +Commit messages are a little bit more important; when working in a team, it is important to provide strong, +clear and concise commit messages. In this project, the team will use a simple formula: + +**(SUBJECT) Title: textual description** + +e.g. (FIX) Rendering completed: explain what changed in short. + +###### Pushing + +When working in a feature branch, pushing and pulling has no restrictions. Feel free to do as much +(or as little) as possible. However, you **CANNOT** push directly to `main`, the VCS will not allow you +to do so, but do not make that mistake. When you are ready to merge a feature, you will create a PR +and once it has been reviewed and approved it will be automatically merged in. + +###### Pull Requests (PR) + +Once a feature is complete, you will create a pull request. Before a request can be merged into `main`, +one approval is required (which cannot be the author). This practice is to promote team work and encourage +code reviews. Each team member is expected to check in frequently and review as often as they are able to, +however, there is no defined time requirement.
Personal communication is totally acceptable as a means to +request approval, since I am unsure if this platform will notify members. + +###### Issues + +If a bug, issue, or other concern is noticed, the first thing the team member should do is create an +issue. An issue should be descriptive and contain everything another team member needs to understand the +issue and its context. This way, a new team member can tackle the issue without contextual gaps. + +If a member would like to work on the issue themselves, the `assignee` field is where this should be defined. +If a member would like help from another member, they should assign the other team member to the issue, and +leave a comment in the issue itself describing what help is needed. + +**Labels** are important for understanding what type of issues/bugs exist in the application. When a bug is +created, make sure the proper labels are applied. These labels will be abstract, such as: `bug`, `fix` or `feature` +and they will also be specific, such as: `parser`, `i/o` or `processer`. A combination of both styles of labels +allows other team members to understand what is going on. If a member feels a label is missing, they are free +to create new ones, but there is such a thing as **too many labels**; a few per issue is totally fine. They are +not meant to replace the description. + +**Priority** is the final important factor to consider. In this project, priority will be defined using labels +as well. The policy defined above will apply here to priority labels as well. However, these labels are +**mutually exclusive**. + +###### Projects (Sprints) + +The use of the `projects` tab in the VCS will allow the team to remain organized and create notes and action +items that should be completed before one another. These resemble `sprints` from the `AGILE` development life cycle. +A new "project" should be created when a large piece of functionality needs to be created.
Issues can **and should** +be attached to the projects they are related too. This will continue to encourage teamwork and organization. + +Projects should have defined criteria, such as input and outputs, expectations and a semi-defined timeline. +Once a description and is defined, tasks can be added and moved around as needed. The team will use **Kanban** +project types, as they are simple and easy to understand for new team members. diff --git a/lib/parser.cpp b/lib/parser.cpp index 9519d82..51bde62 100644 --- a/lib/parser.cpp +++ b/lib/parser.cpp @@ -76,6 +76,9 @@ void Parser::ParseDocument() { input_file.close(); + // Remove the windows BS + NormalizeInputStream(); + // We need document parent this->DOM = std::make_unique(); -- 2.47.2