diff options
author | Roland Reichwein <mail@reichwein.it> | 2020-03-27 17:39:58 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2020-03-27 17:39:58 +0100 |
commit | 3057729f132d516dd9ed58c6964a495aa1c11c3d (patch) | |
tree | 7eca9a03fd08200868a5ab0c27e33170b9dc917a | |
parent | 5467147d9470ee294ddab938098c3ef172222066 (diff) |
Top-down algo recognizes first C*+ program
-rw-r--r-- | cpp.cpp | 2 | ||||
-rw-r--r-- | cpp.h | 2 | ||||
-rw-r--r-- | grammer.cpp | 117 | ||||
-rw-r--r-- | grammer.h | 21 | ||||
-rw-r--r-- | test-cpp.cpp | 4 |
5 files changed, 114 insertions, 32 deletions
@@ -213,7 +213,7 @@ std::vector<Token> CPP::tokens_from_pptokens(std::vector<Token> pp_tokens) } // Phase 7.b: Grammar Analysis -std::pair<index_t, std::vector<Gram::TreeNode>> CPP::analysis(std::vector<Token> tokens) +std::vector<Gram::TreeNode> CPP::analysis(std::vector<Token> tokens) { auto bnf = SubBNF(CPPBNF::GetCppBNFGram(), "translation-unit"); @@ -22,7 +22,7 @@ void preprocess(); // phase 4 void execution_charset_map(); // phase 5 void concatenate_strings(); // phase 6 std::vector<Token> tokens_from_pptokens(std::vector<Token> pp_tokens); // phase 7.a -std::pair<index_t, std::vector<Gram::TreeNode>> analysis(std::vector<Token>); // phase 7.b +std::vector<Gram::TreeNode> analysis(std::vector<Token>); // phase 7.b void translate(); // phase 7.c void instantiate(); // phase 8 void link(); // phase 9 diff --git a/grammer.cpp b/grammer.cpp index 3cc88fa..cd5b50f 100644 --- a/grammer.cpp +++ b/grammer.cpp @@ -15,8 +15,8 @@ void Debug(std::string s) void Compiler::clear() { + symbol_variants.clear(); nodes.clear(); - begin_pos = NodePosition{}; } std::string Compiler::GetTypeOfNode(index_t node_id) const @@ -26,11 +26,21 @@ std::string Compiler::GetTypeOfNode(index_t node_id) const return nodes[node_id].type; } +bool Gram::ChildIdIsEmpty(int32_t child_id) +{ + return child_id == 0; +} + bool Gram::ChildIdIsToken(int32_t child_id) { return child_id < 0; } +bool Gram::ChildIdIsNode(int32_t child_id) +{ + return child_id > 0; +} + index_t Gram::TokenIdFromChildId(int32_t child_id) { return index_t(-child_id) - 1; @@ -94,33 +104,53 @@ index_t Compiler::AddNode(const std::string& type, index_t variant, NodePosition nodes[pos.node_id].child_ids[pos.child_pos] = node_id; nodes.emplace_back(TreeNode{pos, node_id, type, variant, std::vector<int32_t>(size_t(list.size()), 0)}); - Debug("AddNode(): "s + nodes[pos.node_id].type + "->"s + type + ": "s + std::to_string(node_id)); - DumpTree(); - return node_id; } -void Compiler::RemoveNode() +Compiler::AddNodeGuard::AddNodeGuard(Compiler& compiler, index_t variant): m_compiler(compiler) { - auto& pos{nodes.back().pos}; - - nodes[pos.node_id].child_ids[pos.child_pos] = 0; - nodes.pop_back(); -} - -Compiler::AddNodeGuard::AddNodeGuard(Compiler& compiler, const std::string& type, index_t variant, NodePosition pos): m_compiler(compiler) -{ - m_compiler.AddNode(type, variant, pos); + m_compiler.symbol_variants.push_back(variant); } Compiler::AddNodeGuard::~AddNodeGuard() { - m_compiler.RemoveNode(); + m_compiler.symbol_variants.pop_back(); } void Compiler::IncNodePosition(NodePosition& pos) { - // TODO + if (nodes.size() == 0) // special case: empty tree + return; + if (pos.node_id >= nodes.size()) + throw std::runtime_error("ICE: NodePosition with node_id "s + std::to_string(pos.node_id) + " doesn't exist."s); + if (pos.child_pos >= nodes[pos.node_id].child_ids.size()) + throw std::runtime_error("ICE: NodePosition with child_pos "s + std::to_string(pos.child_pos) + " in node_id "s + std::to_string(pos.node_id) + " doesn't exist."s); + + int32_t child_id{nodes[pos.node_id].child_ids[pos.child_pos]}; + + if (ChildIdIsEmpty(child_id)) + throw std::runtime_error("ICE: NodePosition is empty"); + + // Actually, advance + if (ChildIdIsToken(child_id)) { + pos.child_pos++; + } else { + pos.node_id = child_id; + pos.child_pos = 0; + } + + // Go to parent if child ids completely traversed + while (pos.node_id > 0 && pos.child_pos >= nodes[pos.node_id].child_ids.size()) { + pos = nodes[pos.node_id].pos; + pos.child_pos++; + } + + // Advancing at root node for last child is allowed: Finished + if (pos.child_pos >= nodes[pos.node_id].child_ids.size()) + return; + + if (ChildIdIsNode(nodes[pos.node_id].child_ids[pos.child_pos])) + throw std::runtime_error("ICE: No node expected at "s + std::to_string(pos.node_id) + "/"s + std::to_string(pos.child_pos)); } size_t Compiler::minimumSymbolsNeeded(std::string symbol) @@ -170,7 +200,6 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t while (begin < end && symbol_list.size() > 0 && symbol_list.front() == tokens[begin].type) { begin++; symbol_list.erase(symbol_list.begin()); - IncNodePosition(begin_pos); // TODO: guard? } // match terminal symbols at end @@ -181,8 +210,11 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t // matching of empty lists if (symbol_list.size() == 0) { - if (begin == end) // only match real empty list + if (begin == end) { // only match real empty list + // this is the point of the final match + constructTree(); return true; + } return false; } @@ -192,7 +224,7 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t auto it{bnf.find(symbol_list.front())}; if (it != bnf.end()) { for (size_t i = 0; i < it->second.size(); i++) { // iterate over alternatives - //AddNodeGuard guard(*this, symbol_list.front(), i, begin_pos); + AddNodeGuard guard(*this, i); std::vector<std::string> list {it->second[i]}; list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end()); if (minimumSymbolsNeeded(list) > end - begin) // stop recursion @@ -214,6 +246,42 @@ bool Compiler::match(std::string symbol, size_t begin, size_t end) return match(symbol_list, begin, end); } +void Compiler::constructTree() +{ + symbol_variants_it = symbol_variants.begin(); + + m_symbol_list = {m_top}; + m_symbol_list_pos = 0; + + NodePosition tree_pos; + + while (m_symbol_list_pos < m_symbol_list.size()) { + std::string symbol{m_symbol_list[m_symbol_list_pos]}; + + if (isTerminal(bnf, symbol)) { + // Advance terminal symbol + nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = ChildIdFromTokenId(m_symbol_list_pos); + IncNodePosition(tree_pos); + m_symbol_list_pos++; + } else { + // Replace non-terminal symbol + m_symbol_list.erase(m_symbol_list.begin() + m_symbol_list_pos); + std::vector<std::string> list {bnf[symbol][*symbol_variants_it]}; + m_symbol_list.insert(m_symbol_list.begin() + m_symbol_list_pos, list.begin(), list.end()); + + index_t node_id {AddNode(symbol, *symbol_variants_it, tree_pos)}; + + if (node_id > 0) { + nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = node_id; + IncNodePosition(tree_pos); + } + + symbol_variants_it++; + } + } + +} + Compiler::Compiler(BNF& bnf, const std::string& top): bnf(bnf), m_top(top), ReverseBNF{Reverse(bnf)}, reversedFirst{reverseFirst(bnf)} { //std::cout << "DEBUG: " << m_top << std::endl; @@ -234,7 +302,7 @@ Compiler::Compiler(BNF& bnf, const std::string& top): bnf(bnf), m_top(top), Reve minimumSymbolsNeeded("translation-unit"); } -std::pair<index_t, std::vector<TreeNode>> Compiler::compile(std::vector<Token> p_tokens) +std::vector<TreeNode> Compiler::compile(std::vector<Token> p_tokens) { clear(); tokens = p_tokens; @@ -243,11 +311,16 @@ std::pair<index_t, std::vector<TreeNode>> Compiler::compile(std::vector<Token> p throw std::runtime_error("No tokens!"); // - // top-down algorithm + // top-down algorithm: + // + // 1. Match linear tokens list to bnf, building up list of used variants (symbol_variants) + // 2. Construct Node Tree from symbol_variants // if (!match(m_top, 0, tokens.size())) throw std::runtime_error("Compile error."); - return std::pair<index_t, std::vector<TreeNode>>{0, nodes}; + //DumpTree(); + + return nodes; } @@ -4,6 +4,7 @@ #include "minicc.h" #include <cstdint> +#include <deque> #include <unordered_set> #include <utility> @@ -16,7 +17,7 @@ struct NodePosition { index_t child_pos{}; // 0-based }; -// TreeNodes are only intermediate. Terminal symbols don't get of TreeNodes +// TreeNodes are only intermediate. Terminal symbols don't get TreeNodes // token_id: index into token list // node_id: index into tree node list struct TreeNode { @@ -43,24 +44,24 @@ private: std::unordered_map<std::string, std::set<std::string>> ReverseBNF; // possible parent types of a given type; unused now: remove? std::unordered_map<std::string, std::set<std::string>> reversedFirst; // possible parent types of first childs of a given type + std::deque<index_t> symbol_variants; + decltype(symbol_variants)::iterator symbol_variants_it; + // Tree specific void clear(); // Node specific std::string GetTypeOfNode(index_t node_id) const; index_t AddNode(const std::string& type, index_t variant, NodePosition pos = {}); - void RemoveNode(); - // Adds Node and Removes it on scope exit (RAII) + // Adds actually used Non-Terminal Symbol Removes it on scope exit (RAII) class AddNodeGuard { Compiler& m_compiler; public: - AddNodeGuard(Compiler& compiler, const std::string& type, index_t variant, NodePosition pos); + AddNodeGuard(Compiler& compiler, index_t variant); ~AddNodeGuard(); }; void IncNodePosition(NodePosition& pos); - NodePosition begin_pos; - // top-down algorithm std::unordered_map<std::string, size_t> m_min; // cache size_t minimumSymbolsNeeded(std::string symbol); @@ -68,15 +69,21 @@ private: bool match(std::string symbol, size_t begin, size_t end); bool match(std::vector<std::string> symbol_list, size_t begin, size_t end); + void constructTree(); + std::vector<std::string> m_symbol_list; + index_t m_symbol_list_pos{}; + public: Compiler(BNF& bnf, const std::string& top); - std::pair<index_t, std::vector<TreeNode>> compile(std::vector<Token> p_tokens); + std::vector<TreeNode> compile(std::vector<Token> p_tokens); void DumpTree(); friend class ::GrammerTest; }; +bool ChildIdIsEmpty(int32_t child_id); bool ChildIdIsToken(int32_t child_id); +bool ChildIdIsNode(int32_t child_id); index_t TokenIdFromChildId(int32_t child_id); int32_t ChildIdFromTokenId(index_t token_id); diff --git a/test-cpp.cpp b/test-cpp.cpp index 2a67b38..47a57f5 100644 --- a/test-cpp.cpp +++ b/test-cpp.cpp @@ -46,7 +46,9 @@ TEST_F(CppTest, preprocessing_tokenize) { } #endif - auto result = cpp.analysis(tokens); + auto nodes = cpp.analysis(tokens); + + ASSERT_EQ(nodes.size(), 44); } #endif |