// -----------------------------------------------------------------------------
// Parser implementation: turns Markdown-style markup held in `content` into a
// node tree (`DOM`) and writes `DOM->ToHtml()` through `filesystem`.
//
// NOTE(review): this copy of the file appears to have lost its line breaks and
// every piece of text that stood in angle brackets (the targets of the five
// bare #include directives, all template arguments of std::unique_ptr /
// std::make_unique / vector, and part of the string literal that ParseCodeBlock
// appends for a newline). Restore those from the original source before
// building; the node class names cannot be recovered from what is visible here.
//
// Cursor primitives (all operate on `position` within `content`):
//   Peek(offset)         - character at position + offset, or '\0' past the end.
//   Consume(count)       - advances position by count; performs no bounds check.
//   IsEOF()              - true once position >= content.length().
//   ConsumeWhiteSpace()  - skips consecutive ' ', '\t' and '\n' characters.
//
// Document level:
//   Inspect()              - prints position and content.size() to std::cout.
//   NormalizeInputStream() - no-op on empty content; rewrites each CR LF pair
//                            to LF, then erases any remaining CR characters.
//   ParseDocument()        - resets position to 0, loads the input through
//                            filesystem.ReadInputFile() (a std::runtime_error is
//                            logged to std::cerr and aborts the parse),
//                            normalizes line endings, creates the root node and
//                            appends every non-null result of ParseBlock()
//                            until IsEOF().
//   WriteOutput()          - throws std::runtime_error when DOM is nullptr,
//                            otherwise hands DOM->ToHtml() to
//                            filesystem.WriteOutputFile().
//
// Block level (ParseBlock skips whitespace, then dispatches on the next chars):
//   '#'                                   -> ParseHeading()
//   '*', '-' or '+' then space or tab     -> ParseList(false)
//   one digit then '.'                    -> ParseList(true)
//   three backticks                       -> ParseCodeBlock()
//   anything else                         -> ParseParagraph()
//   ParseParagraph()  - children come from ParseInline(); returns nullptr when
//                       the node reports IsEmpty().
//   ParseHeading()    - counts and consumes the leading '#' run, passes the
//                       count to the node constructor, skips whitespace, takes
//                       children from ParseInlineHeading(); nullptr when empty.
//   ParseList(ordered)- per item: consumes the marker (2 chars when ordered,
//                       otherwise 1), skips whitespace and adds the node from
//                       ParseInlineListContent(); repeats while the next two
//                       characters look like a list marker.
//   ParseCodeBlock()  - consumes the opening fence and gathers everything up to
//                       the closing fence (consumed) or EOF into one text node,
//                       substituting a fixed string for each newline.
//
// Inline level:
//   ParseInline()            - collects nodes until a blank line (two
//                              consecutive newlines) or EOF; '[' -> ParseLink,
//                              "***" / "**" / "*" -> ParseBoldItalic /
//                              ParseBold / ParseItalic, backtick -> ParseCode;
//                              a single newline is stored as a space.
//   ParseInlineHeading()     - same dispatch, but stops at the first newline.
//   ParseInlineListContent() - same dispatch; on a single newline it skips
//                              whitespace and stops when a list marker follows,
//                              otherwise it appends a space and carries on.
//                              The collected children are wrapped in one node.
//   ParseItalic / ParseBold / ParseBoldItalic / ParseCode
//                            - consume the opening delimiter (1, 2, 3 asterisks
//                              or 1 backtick) and gather raw characters until
//                              the matching closing delimiter (consumed), a
//                              blank line, or EOF.
//   ParseLink()              - consumes '[', gathers text up to ']', consumes
//                              two characters, gathers the target up to ')'
//                              (consumed) and builds the node from
//                              (link, content).
//   PushTextNode(nodes, str) - appends a node built from str when str is not
//                              empty, and always resets str to "".
//
// NOTE(review), defects visible in this file:
//   * WriteOutput's message names "Parser::ParserDocument"; the method defined
//     here is Parser::ParseDocument.
//   * std::isdigit is called with a plain char; values not representable as
//     unsigned char are undefined behaviour, so cast to unsigned char first.
//   * ParseList always consumes a fixed-width marker and continues on either
//     marker kind regardless of `ordered`, so multi-digit numbers and mixed
//     ordered/unordered items leave part of a marker in the item text.
//   * ParseLink calls Consume(2) unconditionally after the link text, so a ']'
//     that is not followed by '(' (or a missing ']') skips unrelated input.
//   * In ParseInlineHeading and ParseInlineListContent the "c == newline"
//     branch of the final ternary is unreachable: newlines are handled earlier
//     in the loop body.
//   * PushTextNode wraps the std::make_unique result in std::move, which is
//     redundant for a temporary.
//   * The inline dispatch chain is repeated verbatim in ParseInline,
//     ParseInlineHeading and ParseInlineListContent; a shared helper would keep
//     the three in sync.
// -----------------------------------------------------------------------------
#include "parser.h" #include "fileSystem.h" #include "inlineNode.h" #include "structureNode.h" #include #include #include #include #include using std::string; using std::vector; void Parser::Inspect() { std::cout << this->position << std::endl; std::cout << this->content.size() << std::endl; } void Parser::NormalizeInputStream() { if (this->content.empty()) return; size_t pos = 0; while ((pos = content.find("\r\n", pos)) != string::npos) { this->content.replace(pos, 2, "\n"); pos++; } // NOTE: Remove all occurrences of '\r' this->content.erase( std::remove(this->content.begin(), this->content.end(), '\r'), this->content.end()); } void Parser::WriteOutput() { if (this->DOM == nullptr) throw std::runtime_error( "Cannot write output, output DOM tree does not exist. Please run the " "Parser::ParserDocument method first."); this->filesystem.WriteOutputFile(this->DOM->ToHtml()); } void Parser::ParseDocument() { // NOTE:This needs to be set so the parsing can continue this->position = 0; try { this->content = this->filesystem.ReadInputFile(); } catch (const std::runtime_error &e) { std::cerr << "Caught an error: " << e.what() << std::endl; return; } // Remove the windows BS NormalizeInputStream(); // We need document parent this->DOM = std::make_unique(); while (!IsEOF()) { // std::cout << Peek(); Consume(); auto block = ParseBlock(); if (block != nullptr) this->DOM->AddChild(std::move(block)); } } // All this does is pick which subparser to call // Identify which block to parse std::unique_ptr Parser::ParseBlock() { // Remove whitespace using peek and consume (' ', '\t', '\n') ConsumeWhiteSpace(); char c = Peek(); char c_next = Peek(1); // 1. Parse heading if (c == '#') { return ParseHeading(); } // 2. Parser unordered list if (c == '*' || c == '-' || c == '+') { // Next character must be space or tab if (c_next == ' ' || c_next == '\t') { return ParseList(false); } } // 3. Parse ordered list if (std::isdigit(c) && c_next == '.') { return ParseList(true); } // 4. 
Parse code block if (c == '`' && c_next == '`' && Peek(2) == '`') { return ParseCodeBlock(); } // 5. Parser paragraph return ParseParagraph(); } std::unique_ptr Parser::ParseParagraph() { auto node = std::make_unique(); // This should call parse inline auto text_nodes = ParseInline(); for (auto &text_node : text_nodes) { node->AddChild(std::move(text_node)); } if (node->IsEmpty()) return nullptr; return node; } std::unique_ptr Parser::ParseHeading() { // Compute the size of the heading int i = 0; char c = Peek(); while (c == '#') { c = Peek(i++); } Consume(i - 1); auto node = std::make_unique(i - 1); ConsumeWhiteSpace(); // This should call parse inline auto text_nodes = ParseInlineHeading(); for (auto &text_node : text_nodes) { node->AddChild(std::move(text_node)); } if (node->IsEmpty()) return nullptr; return node; } std::unique_ptr Parser::ParseList(bool ordered) { auto node = std::make_unique(ordered); // Consume the required white space and list char ('* ' or '1.') while (true) { Consume(ordered ? 2 : 1); ConsumeWhiteSpace(); // std::unique_ptr Parser::ParseParagraph() { // auto node = std::make_unique(); // // // This should call parse inline // auto text_nodes = ParseInline(); // for (auto &text_node : text_nodes) { // node->AddChild(std::move(text_node)); // } // // if (node->IsEmpty()) // return nullptr; // // return node; // } // Parse until either '\n\n' (exit) or the next list element is found ('* ' // or '1.') If '\n\n', then create a node and exit auto element = ParseInlineListContent(); node->AddChild(std::move(element)); // for (auto &child : children) { // node->AddChild(std::move(child)); // } char c = Peek(); char c_next = Peek(1); // 2. Parser unordered list if (c == '*' || c == '-' || c == '+') { if (c_next == ' ' || c_next == '\t') { continue; } } // 3. 
Parse ordered list // TODO: This only checks a single digit, should check for 'n' digits if (std::isdigit(c) && c_next == '.') { continue; } break; } return node; }; std::unique_ptr Parser::ParseCodeBlock() { auto node = std::make_unique(); string str; // Remove the first three characters, the '```' Consume(3); // Parse text into a single text node until '```' is found, include everything // else while (!IsEOF()) { char c = Peek(); if (c == '`' && Peek(1) == '`' && Peek(2) == '`') { Consume(3); break; } // Swap any '\n' with BR tags, so it will visually break if (c == '\n') str += "\n
\n"; else str += c; Consume(); } auto text_node = std::make_unique(str); node->AddChild(std::move(text_node)); return node; } vector> Parser::ParseInline() { vector> nodes; string str; while (!IsEOF()) { char c = Peek(); // If this char and next char are both newlines: then we have an empty line, // we should stop. if (c == '\n' && Peek(1) == '\n') break; if (c == '[') { PushTextNode(nodes, str); auto node = ParseLink(); if (!node->IsEmpty()) nodes.push_back(std::move(node)); continue; } if (c == '*' && Peek(1) == '*' && Peek(2) == '*') { PushTextNode(nodes, str); auto node = ParseBoldItalic(); if (!node->IsEmpty()) nodes.push_back(std::move(node)); continue; } else if (c == '*' && Peek(1) == '*') { PushTextNode(nodes, str); auto node = ParseBold(); if (!node->IsEmpty()) nodes.push_back(std::move(node)); continue; } else if (c == '*') { PushTextNode(nodes, str); auto node = ParseItalic(); if (!node->IsEmpty()) nodes.push_back(std::move(node)); continue; } if (c == '`') { PushTextNode(nodes, str); auto node = ParseCode(); if (!node->IsEmpty()) nodes.push_back(std::move(node)); continue; } // If a newline, use a space instead str += (c == '\n' ? ' ' : c); Consume(); } // Push the last node, if the string is not empty PushTextNode(nodes, str); return nodes; } vector> Parser::ParseInlineHeading() { vector> nodes; string str; while (!IsEOF()) { char c = Peek(); // We can stop as soon as we see a new line. 
Headings are single line blocks if (c == '\n') break; if (c == '[') { PushTextNode(nodes, str); auto node = ParseLink(); if (!node->IsEmpty()) nodes.push_back(std::move(node)); continue; } if (c == '*' && Peek(1) == '*' && Peek(2) == '*') { PushTextNode(nodes, str); auto node = ParseBoldItalic(); if (!node->IsEmpty()) nodes.push_back(std::move(node)); continue; } else if (c == '*' && Peek(1) == '*') { PushTextNode(nodes, str); auto node = ParseBold(); if (!node->IsEmpty()) nodes.push_back(std::move(node)); continue; } else if (c == '*') { PushTextNode(nodes, str); auto node = ParseItalic(); if (!node->IsEmpty()) nodes.push_back(std::move(node)); continue; } if (c == '`') { PushTextNode(nodes, str); auto node = ParseCode(); if (!node->IsEmpty()) nodes.push_back(std::move(node)); continue; } // If a newline, use a space instead str += (c == '\n' ? ' ' : c); Consume(); } // Push the last node, if the string is not empty PushTextNode(nodes, str); return nodes; } std::unique_ptr Parser::ParseInlineListContent() { vector> children; string str; while (!IsEOF()) { char c = Peek(); // char c_next = Peek(1); // If this char and next char are both newlines: then we have an empty line, // we should stop. 
if (c == '\n' && Peek(1) == '\n') break; // A single newline: We should consume whitespace and check if the next // character is a list item and the following item is a space if (c == '\n') { PushTextNode(children, str); ConsumeWhiteSpace(); char new_c = Peek(); char new_c_next = Peek(1); if ((new_c == '*' || new_c == '-' || new_c == '+') && (new_c_next == ' ' || new_c_next == '\t')) break; if (std::isdigit(new_c) && new_c_next == '.') break; str += ' '; continue; } if (c == '[') { PushTextNode(children, str); auto node = ParseLink(); if (!node->IsEmpty()) children.push_back(std::move(node)); continue; } if (c == '*' && Peek(1) == '*' && Peek(2) == '*') { PushTextNode(children, str); auto node = ParseBoldItalic(); if (!node->IsEmpty()) children.push_back(std::move(node)); continue; } else if (c == '*' && Peek(1) == '*') { PushTextNode(children, str); auto node = ParseBold(); if (!node->IsEmpty()) children.push_back(std::move(node)); continue; } else if (c == '*') { PushTextNode(children, str); auto node = ParseItalic(); if (!node->IsEmpty()) children.push_back(std::move(node)); continue; } if (c == '`') { PushTextNode(children, str); auto node = ParseCode(); if (!node->IsEmpty()) children.push_back(std::move(node)); continue; } // If a newline, use a space instead str += (c == '\n' ? 
' ' : c); Consume(); } // Push the last node, if the string is not empty PushTextNode(children, str); // Create the list node with the children appended auto element = std::make_unique(); for (auto &child : children) { element->AddChild(std::move(child)); } return element; } std::unique_ptr Parser::ParseItalic() { string str; Consume(1); while (!IsEOF()) { char c = Peek(); if (c == '\n' && Peek(1) == '\n') break; if (c == '*') { Consume(1); break; } str += c; Consume(); } return std::make_unique(str); } std::unique_ptr Parser::ParseBold() { string str; Consume(2); while (!IsEOF()) { char c = Peek(); if (c == '\n' && Peek(1) == '\n') break; if (c == '*' && Peek(1) == '*') { Consume(2); break; } str += c; Consume(); } return std::make_unique(str); } std::unique_ptr Parser::ParseBoldItalic() { string str; Consume(3); while (!IsEOF()) { char c = Peek(); if (c == '\n' && Peek(1) == '\n') break; if (c == '*' && Peek(1) == '*' && Peek(2) == '*') { Consume(3); break; } str += c; Consume(); } return std::make_unique(str); } std::unique_ptr Parser::ParseCode() { string str; Consume(1); while (!IsEOF()) { char c = Peek(); if (c == '\n' && Peek(1) == '\n') break; if (c == '`') { Consume(1); break; } str += c; Consume(); } return std::make_unique(str); } std::unique_ptr Parser::ParseLink() { // Consume '[' Consume(); string content; while (!IsEOF()) { char c = Peek(); if (c == ']') break; content += c; Consume(); } // Consume '](' Consume(2); string link; while (!IsEOF()) { char c = Peek(); if (c == ')') { Consume(); break; } link += c; Consume(); } return std::make_unique(link, content); } void Parser::PushTextNode(vector> &nodes, string &str) { if (!str.empty()) nodes.push_back(std::move(std::make_unique(str))); str = ""; } char Parser::Peek(size_t offset) { size_t look_ahead_pos = this->position + offset; if (look_ahead_pos < this->content.length()) { return this->content[look_ahead_pos]; } return '\0'; // null if past end }; void Parser::Consume(size_t count) { 
this->position += count; }; bool Parser::IsEOF() { return this->position >= this->content.length(); }; void Parser::ConsumeWhiteSpace() { char c = Peek(); while (c == ' ' || c == '\t' || c == '\n') { Consume(); c = Peek(); } }