#include #include "gmock/gmock.h" #include "gtest/gtest.h" #include #include #include #include #include #include using namespace std::string_literals; using BNF = std::map>>; using Terminals = std::set; using ProgramNode = std::deque; using PathElement = std::pair; // Name, Index std::vector split(std::string s) { std::vector result; boost::algorithm::split(result, s, boost::algorithm::is_any_of(s), boost::algorithm::token_compress_on); while (result.size() > 0 && result.back() == ""s) result.pop_back(); return result; } std::vector GetPath(std::string Token, BNF ReverseBNF, std::string Top, Terminals terminals = {}, std::vector PreviousPath = {}) { throw std::runtime_error("Compile error"); return {}; // TODO } auto Reverse(BNF bnf){ std::map> result; for (const auto& [from, to] : bnf) { for (const auto& list : to) { for (const auto& element : list) { auto i{result.find(element)}; if (i != result.end()) // already present i->second.insert(from); else // new element result.emplace(element, std::set{from}); } } } return result; } using index_t = size_t; struct TreeNode { index_t parent; std::vector childs; // fill char by char std::vector child_names; // fill always std::string name; }; class Tree { private: std::map nodes; // index 0 = non existing; index starting at 1 index_t node_num{}; index_t root{}; index_t last{}; public: bool Valid(const std::string& Top) const { // A token is non empty if (node_num == 0) return false; // Start symbol on top auto rootNode{nodes.find(root)}; if (rootNode == nodes.end()) throw std::runtime_error("Node not found: "s + std::to_string(root)); if (rootNode->second.name != Top) return false; // All nodes filled (implies all leaves are terminal) for (const auto& [index, node]: nodes) { if (node.childs.size() < node.child_names.size()) return false; // node not filled } return true; } bool AddFirstNode(char c, const BNF& bnf, const std::map>& reverseBNF) { node_num ++; root = node_num; last = node_num; std::string node_name(1, char(c)); auto reverseRule{reverseBNF.find(node_name)}; if (reverseRule == reverseBNF.end()) throw std::runtime_error("Reverse rule not found: "s + node_name); std::vector children; // default: no children for terminal auto rule{bnf.find(node_name)}; if (rule != bnf.end()) { // multiple variants! bool hit{false}; for (const auto& i : rule->second) { if (i.size() > 0 && i[0] == node_name) { if (hit) throw std::runtime_error("Multiple matching rules found for "s + node_name); children = i; hit = true; } } if (!hit) throw std::runtime_error("No matching rule found for "s + node_name); } nodes.emplace(root, TreeNode{0, std::vector{}, children, node_name}); return true; } // try to add character to tree bool Add(char c, const BNF& bnf, const std::map>& reverseBNF) { if (nodes.empty()) { // first node return AddFirstNode(c, bnf, reverseBNF); } else { // at least one character is already present // Traverse tree until partially filled node found // or new node can be added index_t current_index{last}; while (current_index != 0) { TreeNode& node {nodes[current_index]}; if (node.childs.size() < node.child_names.size()) { // partially filled node throw std::runtime_error("TODO: partially filled node"); } current_index = node.parent; } throw std::runtime_error("TODO: need to add node"); } return false; } // add path to start symbol void Resolve(const BNF& bnf, const std::map>& reverseBNF) { if (nodes.empty()) // only handle non-empty trees return; while (true) { std::string& old_root_name { nodes[root].name }; // current root node name auto parents {reverseBNF.find(old_root_name)}; if (parents != reverseBNF.end()) { // parents in bnf available bool hit{false}; for (auto& parent : parents->second) { for (const auto& list : bnf.at(parent)) { if (list.size() == 1 && list[0] == old_root_name) { if (!hit) { const std::string& new_root_name {parent}; // Add new TreeNode in the direction to root: // New root with 1 child (old root) nodes.emplace(++node_num, TreeNode{0, // parent std::vector{root}, // child indices std::vector{old_root_name}, // child names new_root_name // name }); nodes[root].parent = node_num; root = node_num; // this->last stays hit = true; } else throw std::runtime_error("Error: Multiple resolve nodes for "s + old_root_name); } } } if (!hit) break; } else break; } } }; class Lexer { private: const BNF &bnf; const std::string& Top; std::map> ReverseBNF; // to be called on token end void FinalizeCandidates(std::deque& candidates, std::string& token, std::vector& result) { if (candidates.empty()) { // skip if (!token.empty()) throw std::runtime_error("Expected empty token, got "s + token); } else { // check candidates bool valid{false}; for (auto& ct : candidates) { ct.Resolve(bnf, ReverseBNF); if (ct.Valid(Top)) { if (valid) throw std::runtime_error("Found ambiguous token "s + token); result.push_back(token); token.clear(); valid = true; } } if (!valid) throw std::runtime_error("Invalid token: "s + token); candidates.clear(); } } void AddCandidate(char c, std::deque& candidates) { // new candidate: starting with c auto element {ReverseBNF.find(std::string(1, c))}; if (element != ReverseBNF.end()) { Tree newTree; if (newTree.Add(c, bnf, ReverseBNF)) candidates.push_back(newTree); } } public: Lexer(const BNF& bnf, const std::string& Top): bnf(bnf), Top(Top), ReverseBNF{Reverse(bnf)} { } std::vector Lex(const std::string& s) { std::vector result; std::string token; std::string Whitespace{"\t \n\r"}; std::deque candidates; for (size_t pos{0}; pos < s.size(); pos++) { char c{s[pos]}; std::cout << "Char: |" << c << "|" << std::endl; if (Whitespace.find(c) != std::string::npos) { // found whitespace character // evaluate token up to now and skip whitespace FinalizeCandidates(candidates, token, result); } else { // no whitespace: try to add to tree std::deque EraseList; int i = 0; for (auto ct = candidates.begin(); ct != candidates.end(); ct++, i++) { if (!ct->Add(c, bnf, ReverseBNF)) { // either add char or delete candidate // push to front to get reversed order EraseList.push_front(i); // no candidate anymore } } if (token.empty()) { AddCandidate(c, candidates); } else if (candidates.size() - EraseList.size() > 0) { // added to some candidates: Erase invalidated ones for (const auto& i : EraseList) candidates.erase(candidates.begin() + i); } else { // no candidates left: new tree FinalizeCandidates(candidates, token, result); AddCandidate(c, candidates); } token.push_back(c); } } // Final evaluation of last token FinalizeCandidates(candidates, token, result); return result; } }; ProgramNode Compile(std::vector Tokens, std::string Top, BNF bnf, Terminals terminals) { BNF ReverseBNF;//{ Reverse(bnf)}; if (Tokens.size()){ std::string Token = Tokens[0]; auto Path = GetPath(Token, ReverseBNF, Top, terminals); if (Path.size()) { size_t Index{1}; while (Index < Tokens.size()) { Path = GetPath(Token, ReverseBNF, Top, terminals, Path); Index++; } } else throw std::runtime_error("Invalid token: "s + Token); } else throw std::runtime_error("No tokens!"); return {}; } class Test: public ::testing::Test { protected: Test(){} ~Test() override {} }; TEST_F(Test, BNF) { std::string LexTop{"preprocessing-token"}; BNF LexBNF{ {"preprocessing-token", {{"identifier"}, {"preprocessing-op-or-punc"}, {"pp-number"}}}, {"identifier", {{"identifier-nondigit"}, {"identifier", "identifier-nondigit"}, {"identifier", "digit"}}}, {"digit", {{"0"}, {"1"}, {"2"}, {"3"}, {"4"}, {"5"}, {"6"}, {"7"}, {"8"}, {"9"} }}, {"identifier-nondigit", {{"a"}, {"b"}, {"c"}, {"d"}, {"e"}, {"f"}, {"g"}, {"h"}, {"i"}, {"j"}, {"k"}, {"l"}, {"m"}, {"n"}, {"o"}, {"p"}, {"q"}, {"r"}, {"s"}, {"t"}, {"u"}, {"v"}, {"w"}, {"x"}, {"y"}, {"z"}, {"A"}, {"B"}, {"C"}, {"D"}, {"E"}, {"F"}, {"G"}, {"H"}, {"I"}, {"J"}, {"K"}, {"L"}, {"M"}, {"N"}, {"O"}, {"P"}, {"Q"}, {"R"}, {"S"}, {"T"}, {"U"}, {"V"}, {"W"}, {"X"}, {"Y"}, {"Z"}, {"_"}}}, {"preprocessing-op-or-punc", {{";"}, {"="}}}, {"pp-number", {{"digit"}, {"pp-number", "digit"}}} }; std::string Top{"program"}; BNF bnf{ {"program", {{"statement-list"}}}, {"statement-list", {{"statement", "statement-list"}, {}, }}, {"statement", {{"assigmnent", ";"}}}, {"assignment", {{"identifier", "=", "identifier"}}} }; std::set Terminals{"identifier", "=", ";"}; std::string Code{"a = bc ; c = 123 ; esd = Ff ; 1 = XYZ"}; Lexer lexer(LexBNF, LexTop); auto tokens = lexer.Lex(Code); #if 1 for (const auto& i: tokens) { std::cout << i << std::endl; } #endif //auto Program = Compile(tokens, Top, bnf, Terminals); } int main(int argc, char* argv[]) { ::testing::InitGoogleMock(&argc, argv); return RUN_ALL_TESTS(); }