From 766b70f8b85197de38a9fd629641ca9f70cc2340 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Sat, 18 Jan 2020 15:36:42 +0100 Subject: Lexer (WIP) --- TODO | 1 + minicc.cpp | 209 ++++++++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 137 insertions(+), 73 deletions(-) create mode 100644 TODO diff --git a/TODO b/TODO new file mode 100644 index 0000000..751699a --- /dev/null +++ b/TODO @@ -0,0 +1 @@ +Locations in source: File:line diff --git a/minicc.cpp b/minicc.cpp index b609477..94bae72 100644 --- a/minicc.cpp +++ b/minicc.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -31,115 +32,172 @@ std::vector GetPath(std::string Token, BNF ReverseBNF, std::string return {}; // TODO } -BNF Reverse(BNF bnf){ - return {}; // TODO +auto Reverse(BNF bnf){ + std::map> result; + + for (const auto& [from, to] : bnf) { + for (const auto& list : to) { + for (const auto& element : list) { + auto i{result.find(element)}; + if (i != result.end()) // already present + i->second.push_back(from); + else // new element + result.emplace(element, std::vector{1, from}); + } + } + } + + return result; } using index_t = size_t; struct TreeNode { index_t parent; - std::vector childs; + std::vector childs; // fill char by char + std::vector child_names; // fill always std::string name; }; -struct Tree { +class Tree { +private: std::map nodes; // index 0 = non existing; index starting at 1 index_t node_num{}; index_t root{}; index_t last{}; - bool Valid() const { - // Start symbol? - // All terminal symbols? - return true; // TODO +public: + bool Valid(const std::string& Top) const { + // Start symbol on top + if (node_num == 0) + return false; + + auto rootNode{nodes.find(root)}; + if (rootNode == nodes.end()) + throw std::runtime_error("Node not found: "s + std::to_string(root)); + + if (rootNode->second.name != Top) + return false; + + // All nodes filled (implies all leaves are terminal) + for (const auto& [index, node]: nodes) { + if (node.childs.size() < node.child_names.size()) + return false; // node not filled + } + + return true; } - bool Add(char c) { + // try to add node + bool Add(char c, const BNF& bnf, const std::map>& reverseBNF) { + if (nodes.empty()) { // first node + node_num ++; + root = node_num; + last = node_num; + std::string node_name{1, c}; + auto rule{reverseBNF.find(node_name)}; + if (rule == reverseBNF.end()) + throw std::runtime_error("Rule not found: "s + node_name); + nodes.emplace(root, TreeNode{0, std::vector{}, std::vector{}, node_name}); + return true; + } else { // TODO - //node_num ++; - //nodes.emplace(node_num, {}); - return false; } + return false; + } }; -void CheckCandidates(std::deque& candidates, std::string& token, std::vector& result) +class Lexer { - bool valid{false}; - for (const auto& ct : candidates) { - if (ct.Valid()) { - if (valid) - throw std::runtime_error("Found ambiguous token "s + token); - result.push_back(token); - token.clear(); - valid = true; + +private: + const BNF &bnf; + const std::string& Top; + + std::map> ReverseBNF; + + // to be called on token end + void CheckCandidates(std::deque& candidates, std::string& token, std::vector& result) + { + if (candidates.empty()) { // skip + if (!token.empty()) + throw std::runtime_error("Expected empty token, got "s + token); + } else { // check candidates + bool valid{false}; + for (const auto& ct : candidates) { + if (ct.Valid(Top)) { + if (valid) + throw std::runtime_error("Found ambiguous token "s + token); + result.push_back(token); + token.clear(); + valid = true; + } + } + if (!valid) + throw std::runtime_error("Invalid token: "s + token); + + candidates.clear(); } } - if (!valid) - throw std::runtime_error("Invalid token: "s + token); - candidates.clear(); -} - -std::vector Lex(std::string s, std::string Top, BNF bnf) -{ - std::vector result; - std::string token; +public: + Lexer(const BNF& bnf, const std::string& Top): bnf(bnf), Top(Top), ReverseBNF{Reverse(bnf)} + { + } - BNF ReverseBNF{ Reverse(bnf)}; + std::vector Lex(const std::string& s) + { + std::vector result; + std::string token; - std::string Whitespace{"\t \n\r"}; - std::deque candidates; + std::string Whitespace{"\t \n\r"}; + std::deque candidates; - for (size_t pos{0}; pos < s.size(); pos++) { - char c{s[pos]}; - if (Whitespace.find(c) != std::string::npos) { // found whitespace character - if (candidates.empty()) { // skip - if (!token.empty()) - throw std::runtime_error("Expected empty token, got "s + token); - } else { // check candidates + for (size_t pos{0}; pos < s.size(); pos++) { + char c{s[pos]}; + if (Whitespace.find(c) != std::string::npos) { // found whitespace character + // evaluate token up to now and skip whitespace CheckCandidates(candidates, token, result); - } - } else { // no whitespace: try to add to tree - std::deque EraseList; - int i = 0; - for (auto ct = candidates.begin(); ct != candidates.end(); ct++, i++) { - if (!ct->Add(c)) { - EraseList.push_front(i); // no candidate anymore + } else { // no whitespace: try to add to tree + std::deque EraseList; + int i = 0; + for (auto ct = candidates.begin(); ct != candidates.end(); ct++, i++) { + if (!ct->Add(c, bnf, ReverseBNF)) { // either add char or delete candidate + // push to front to get reversed order + EraseList.push_front(i); // no candidate anymore + } + } + + if (candidates.size() - EraseList.size() > 0) { // Added to some candidates: Continue growing token + token.push_back(c); + for (const auto& i : EraseList) + candidates.erase(candidates.begin() + i); + } else { // no candidates left: new tree + CheckCandidates(candidates, token, result); + } + + // new candidate: starting with c + auto element {ReverseBNF.find(std::string(1, c))}; + if (element != ReverseBNF.end()) { + Tree newTree; + if (newTree.Add(c, bnf, ReverseBNF)) + candidates.push_back(newTree); } - } - // new candidate: starting with c - auto element {ReverseBNF.find(std::string(1, c))}; - if (element != ReverseBNF.end()) { - candidates.emplace_back(); - candidates.back().Add(c); - } - - if (candidates.size() - EraseList.size() > 0) { // Added to some candidates: Continue growing token - token.push_back(c); - for (const auto& i : EraseList) - candidates.erase(candidates.begin() + i); - } else { // no candidates left: new tree - CheckCandidates(candidates, token, result); } } - } - // Final evaluation - if (candidates.empty()) { // skip - if (!token.empty()) - throw std::runtime_error("Expected empty token at end of input, got "s + token); - } else { // check candidates + // Final evaluation of last token CheckCandidates(candidates, token, result); + + return result; } - return result; -} +}; ProgramNode Compile(std::vector Tokens, std::string Top, BNF bnf, Terminals terminals) { - BNF ReverseBNF{ Reverse(bnf)}; + BNF ReverseBNF;//{ Reverse(bnf)}; if (Tokens.size()){ std::string Token = Tokens[0]; @@ -194,10 +252,15 @@ TEST_F(Test, BNF) { std::set Terminals{"identifier", "=", ";"}; - std::string Code{"a = b ; c = d ; e = f ;"}; + std::string Code{"a = bc ; c = 123 ; esd = Ff ; 1 = XYZ"}; - auto tokens = Lex(Code, LexTop, LexBNF); - auto Program = Compile(tokens, Top, bnf, Terminals); + Lexer lexer(LexBNF, LexTop); + auto tokens = lexer.Lex(Code); + + for (const auto& i: tokens) { + std::cout << i << std::endl; + } + //auto Program = Compile(tokens, Top, bnf, Terminals); } int main(int argc, char* argv[]) { -- cgit v1.2.3