#include #include "gmock/gmock.h" #include "gtest/gtest.h" #include #include #include #include #include #include #include #include using namespace std::string_literals; using BNF = std::map>>; using Terminals = std::set; using ProgramNode = std::deque; using PathElement = std::pair; // Name, Index std::vector split(std::string s) { std::vector result; boost::algorithm::split(result, s, boost::algorithm::is_any_of(s), boost::algorithm::token_compress_on); while (result.size() > 0 && result.back() == ""s) result.pop_back(); return result; } auto Reverse(BNF bnf){ std::map> result; for (const auto& [from, to] : bnf) { for (const auto& list : to) { for (const auto& element : list) { auto i{result.find(element)}; if (i != result.end()) // already present i->second.insert(from); else // new element result.emplace(element, std::set{from}); } } } return result; } using index_t = size_t; struct TreeNode { index_t parent{}; std::vector childs; // fill char by char std::vector child_names; // fill always std::string name; }; class Tree { private: std::map nodes; // index 0 = non existing; index starting at 1 index_t node_num{}; index_t root{}; index_t last{}; public: void clear() { nodes.clear(); root = 0; last = 0; node_num = 0; } // Type of lexical token std::string GetType() { if (node_num > 0 && nodes[root].child_names.size() == 1) return nodes[root].child_names[0]; return ""; } bool Valid(const std::string& Top) const { // A token is non empty if (node_num == 0) return false; // Start symbol on top auto rootNode{nodes.find(root)}; if (rootNode == nodes.end()) throw std::runtime_error("Node not found: "s + std::to_string(root)); if (rootNode->second.name != Top) return false; // All nodes filled (implies all leaves are terminal) for (const auto& [index, node]: nodes) { if (node.childs.size() < node.child_names.size()) return false; // node not filled } return true; } bool AddFirstNode(char c, const BNF& bnf, const std::map>& reverseBNF) { node_num ++; root = node_num; last = node_num; std::string node_name(1, char(c)); auto reverseRule{reverseBNF.find(node_name)}; if (reverseRule == reverseBNF.end()) throw std::runtime_error("Reverse rule not found for "s + node_name); auto rule{bnf.find(node_name)}; if (rule != bnf.end()) { // multiple variants! throw std::runtime_error("BNF rule for terminal symbol "s + node_name + " found."s); } nodes.emplace(root, TreeNode{0, std::vector{}, std::vector{}, node_name}); return true; } std::vector getParentTreeNode(const BNF& bnf, const std::map>& reverseBNF) { std::vector result; // default: empty auto& root_name {nodes[root].name}; auto bnfParents {reverseBNF.find(root_name)}; if (bnfParents == reverseBNF.end()) return result; for (const auto& parent_node_name : bnfParents->second) { auto lists {bnf.at(parent_node_name)}; for (const auto& list : lists) { if (list.size() > 0 && list[0] == root_name) { TreeNode node{0, std::vector{root}, list, parent_node_name}; result.push_back(node); } } } return result; } index_t GetLast() { index_t result {root}; while(result != 0 && nodes[result].childs.size() >= 2) { result = nodes[result].childs[nodes[result].childs.size() - 1]; } return result; } void AddRootNode(const TreeNode& newRootNode) { node_num++; nodes[node_num] = newRootNode; root = node_num; last = node_num; } void RemoveRootNode() { root = nodes[root].childs[0]; nodes.erase(node_num); node_num--; last = GetLast(); } // Path from leaf to root std::vector GetPath(std::string a, std::string b, const BNF& bnf, const std::map>& reverseBNF) { std::vector result; while (a != b) { auto parents {reverseBNF.find(a)}; if (parents == reverseBNF.end()) return {}; bool hit{false}; for (const auto& parent : parents->second) { for (const auto& list : bnf.at(parent)) { if (list.size() > 0 && list[0] == a) { if (!hit) { result.push_back(a); a = parent; hit = true; } else throw std::runtime_error("Double match for "s + parent + "/"s + a); } } } } if (a == b) { result.push_back(a); } return result; } index_t AddNode(const std::string& name, const std::string& child_name, index_t parent_index, const BNF& bnf, const std::map>& reverseBNF) { TreeNode& parent {nodes[parent_index]}; node_num++; index_t index = node_num; parent.childs.push_back(index); std::vector child_names; auto rule {bnf.find(name)}; if (rule != bnf.end()) { for (auto& list : rule->second) { if (list.size() > 0 && list[0] == child_name) child_names = list; } } nodes.emplace(index, TreeNode{parent_index, {}, child_names, name}); //root stays last = GetLast(); return index; } void AddPath(const std::vector& path, index_t current_index, const BNF& bnf, const std::map>& reverseBNF) { for (int i = path.size() - 1; i >= 0; i--) { std::string child_name; if (i > 0) child_name = path[i - 1]; current_index = AddNode(path[i], child_name, current_index, bnf, reverseBNF); } } // try to add character to tree bool Add(char c, const BNF& bnf, const std::map>& reverseBNF) { if (nodes.empty()) { // first node return AddFirstNode(c, bnf, reverseBNF); } else { // at least one character is already present // Traverse tree until partially filled node found // or new node can be added index_t current_index{last}; while (current_index != 0) { TreeNode& node {nodes[current_index]}; if (node.childs.size() < node.child_names.size()) { // partially filled node std::vector list = GetPath(std::string(1, c), node.child_names[node.childs.size()], bnf, reverseBNF); if (list.size() > 0) { AddPath(list, current_index, bnf, reverseBNF); return true; } else { return false; // The path a->b is not available via bnf } } current_index = node.parent; } // Add node at root std::vector parent_nodes = getParentTreeNode(bnf, reverseBNF); if (parent_nodes.size() == 0) throw std::runtime_error("Couldn't add new parent node."); for (const auto &i : parent_nodes) { AddRootNode(i); if (Add(c, bnf, reverseBNF)) return true; RemoveRootNode(); } } return false; } // add path to start symbol void Resolve(const BNF& bnf, const std::map>& reverseBNF) { if (nodes.empty()) // only handle non-empty trees return; while (true) { std::string& old_root_name { nodes[root].name }; // current root node name auto parents {reverseBNF.find(old_root_name)}; if (parents != reverseBNF.end()) { // parents in bnf available bool hit{false}; for (auto& parent : parents->second) { for (const auto& list : bnf.at(parent)) { if (list.size() == 1 && list[0] == old_root_name) { if (!hit) { const std::string& new_root_name {parent}; // Add new TreeNode in the direction to root: // New root with 1 child (old root) nodes.emplace(++node_num, TreeNode{0, // parent std::vector{root}, // child indices std::vector{old_root_name}, // child names new_root_name // name }); nodes[root].parent = node_num; root = node_num; // this->last stays hit = true; } else throw std::runtime_error("Error: Multiple resolve nodes for "s + old_root_name); } } } if (!hit) break; } else break; } } }; struct Location { size_t line; size_t column; }; bool operator==(const Location &a, const Location &b) { return (a.line == b.line && a.column == b.column); } struct Token { std::string type; std::string value; Location location; }; bool operator==(const Token &a, const Token &b) { return (a.type == b.type && a.value == b.value && a.location == b.location); } std::ostream& operator<<(std::ostream& os, const Token& token) { return os << token.type << ": " << token.value << "(" << token.location.line << ":" << token.location.column << ")"; } class Lexer { private: const BNF &bnf; const std::string& Top; Location location{1, 0}; std::map> ReverseBNF; // to be called on token end void FinalizeTree(Tree& tree, std::string& token, std::vector& result) { tree.Resolve(bnf, ReverseBNF); if (tree.Valid(Top)) { result.emplace_back(Token{tree.GetType(), token, Location{location.line, location.column - token.size()}}); token.clear(); } tree.clear(); } public: Lexer(const BNF& bnf, const std::string& Top): bnf(bnf), Top(Top), ReverseBNF{Reverse(bnf)} { } std::vector Lex(const std::string& s) { std::vector result; std::string token; std::string Whitespace{"\t \n\r"}; Tree tree; for (size_t pos{0}; pos < s.size(); pos++) { char c{s[pos]}; if (c == '\n') { location.column = 0; location.line++; } else if (std::isprint(c)) { location.column++; } //std::cout << "Char: |" << c << "|" << std::endl; if (Whitespace.find(c) != std::string::npos) { // found whitespace character // evaluate token up to now and skip whitespace FinalizeTree(tree, token, result); } else { // no whitespace: try to add to tree if (!tree.Add(c, bnf, ReverseBNF)) { FinalizeTree(tree, token, result); if (!tree.Add(c, bnf, ReverseBNF)) throw std::runtime_error("Parse error"); } token.push_back(c); } } // Final evaluation of last token FinalizeTree(tree, token, result); return result; } }; ProgramNode Compile(std::vector Tokens, std::string Top, BNF bnf, Terminals terminals) { std::map> ReverseBNF{ Reverse(bnf)}; if (Tokens.size()){ } else throw std::runtime_error("No tokens!"); return {}; } class Test: public ::testing::Test { protected: Test(){} ~Test() override {} }; TEST_F(Test, BNF) { std::string LexTop{"preprocessing-token"}; BNF LexBNF{ {"preprocessing-token", {{"identifier"}, {"preprocessing-op-or-punc"}, {"pp-number"}}}, {"identifier", {{"identifier-nondigit"}, {"identifier", "identifier-nondigit"}, {"identifier", "digit"}}}, {"digit", {{"0"}, {"1"}, {"2"}, {"3"}, {"4"}, {"5"}, {"6"}, {"7"}, {"8"}, {"9"} }}, {"identifier-nondigit", {{"a"}, {"b"}, {"c"}, {"d"}, {"e"}, {"f"}, {"g"}, {"h"}, {"i"}, {"j"}, {"k"}, {"l"}, {"m"}, {"n"}, {"o"}, {"p"}, {"q"}, {"r"}, {"s"}, {"t"}, {"u"}, {"v"}, {"w"}, {"x"}, {"y"}, {"z"}, {"A"}, {"B"}, {"C"}, {"D"}, {"E"}, {"F"}, {"G"}, {"H"}, {"I"}, {"J"}, {"K"}, {"L"}, {"M"}, {"N"}, {"O"}, {"P"}, {"Q"}, {"R"}, {"S"}, {"T"}, {"U"}, {"V"}, {"W"}, {"X"}, {"Y"}, {"Z"}, {"_"}}}, {"preprocessing-op-or-punc", {{";"}, {"="}}}, {"pp-number", {{"digit"}, {"pp-number", "digit"}}} }; std::string Top{"program"}; BNF bnf{ {"program", {{"statement-list"}}}, {"statement-list", {{"statement", "statement-list"}, {}, }}, {"statement", {{"assigmnent", ";"}}}, {"assignment", {{"identifier", "=", "identifier"}}} }; std::set Terminals{"identifier", "=", ";"}; std::string Code{"a = bc ; c = 123 ; esd = Ff ; 1 = XYZ"}; std::vector tokens_reference{ {"identifier", "a", { 1, 1} }, {"preprocessing-op-or-punc", "=", { 1, 3}}, {"identifier", "bc", { 1, 5}}, {"preprocessing-op-or-punc", ";", { 1, 8}}, {"identifier", "c", { 1, 10}}, {"preprocessing-op-or-punc", "=", { 1, 12}}, {"pp-number", "123", { 1, 14}}, {"preprocessing-op-or-punc", ";", { 1, 18}}, {"identifier", "esd", { 1, 20}}, {"preprocessing-op-or-punc", "=", { 1, 24}}, {"identifier", "Ff", { 1, 26}}, {"preprocessing-op-or-punc", ";", { 1, 29}}, {"pp-number", "1", { 1, 31}}, {"preprocessing-op-or-punc", "=", { 1, 33}}, {"identifier", "XYZ", { 1, 34}}, }; Lexer lexer(LexBNF, LexTop); auto tokens = lexer.Lex(Code); ASSERT_EQ(tokens, tokens_reference); #if 0 for (const auto& i: tokens) { std::cout << i.value << std::endl; } #endif auto Program = Compile(tokens, Top, bnf, Terminals); } int main(int argc, char* argv[]) { ::testing::InitGoogleMock(&argc, argv); return RUN_ALL_TESTS(); }