diff options
author | Roland Reichwein <mail@reichwein.it> | 2020-03-31 18:46:49 +0200 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2020-03-31 18:46:49 +0200 |
commit | cd6c436cb70c4323c7d14ebd74f89bb0914649f2 (patch) | |
tree | 1bdb2a8409b2b370bcaa5e1b6cf091c0c8e1f779 | |
parent | bed46a062c442656b1bc16d965e12405297029d3 (diff) |
Optimization: Replace head recursion by tail recursion in matching
-rw-r--r-- | bnf.cpp | 53 | ||||
-rw-r--r-- | bnf.h | 1 | ||||
-rw-r--r-- | grammer.cpp | 58 | ||||
-rw-r--r-- | grammer.h | 10 | ||||
-rw-r--r-- | lexer.cpp | 4 | ||||
-rw-r--r-- | test-cpp.cpp | 2 |
6 files changed, 96 insertions, 32 deletions
@@ -68,6 +68,59 @@ BNF SubBNF(const BNF& bnf, const std::string& top) return result; } +namespace { + + bool isHeadRecursive(const std::vector<std::string>& list, const std::string& symbol) { + if (list.size() > 0 && list[0] == symbol) + return true; + return false; + } + + bool isHeadRecursive(const std::vector<std::vector<std::string>>& lists, const std::string& symbol) { + for (const auto& list: lists) { + if (isHeadRecursive(list, symbol)) + return true; + } + return false; + } + +} // anonymous namespace + +// Change head recursion to tail recursion +BNF removeHeadRecursion(const BNF& bnf) +{ + std::unordered_map<std::string, std::vector<std::vector<std::string>>> result; + + for (const auto& [from, to]: bnf) { + if (isHeadRecursive(to, from)) { // modify rule by adding additional one + std::string from_ext = from + "-ext"; + if (bnf.find(from_ext) != bnf.end()) + throw std::runtime_error("ICE: Symbol "s + from_ext + " already exists in original BNF"); + + std::vector<std::vector<std::string>> to_new; + std::vector<std::vector<std::string>> to_ext{{}}; // starts with empty list as first element + for (const auto& list: to) { + if (isHeadRecursive(list, from)) { + std::vector<std::string> list_new{list.begin() + 1, list.end()}; + list_new.push_back(from_ext); + to_ext.push_back(list_new); + } else { + std::vector<std::string> list_new{list.begin(), list.end()}; + list_new.push_back(from_ext); + to_new.push_back(list_new); + } + } + + result[from] = to_new; + result[from_ext] = to_ext; + } else { // just add + result[from] = to; + } + } + + return result; +} + bool isTerminal(const BNF& bnf, const std::string& symbol) { return bnf.find(symbol) == bnf.end(); @@ -15,6 +15,7 @@ std::unordered_map<std::string, std::unordered_set<std::string>> Reverse(const B std::unordered_map<std::string, std::unordered_set<std::string>> reverseFirst(const BNF& bnf); BNF SubBNF(const BNF& bnf, const std::string& top); +BNF removeHeadRecursion(const BNF& bnf); bool isTerminal(const BNF& bnf, const std::string& symbol); std::unordered_set<std::string> getTerminals(const BNF& bnf); diff --git a/grammer.cpp b/grammer.cpp index 50617ce..d7afaef 100644 --- a/grammer.cpp +++ b/grammer.cpp @@ -92,7 +92,7 @@ void Compiler::DumpTree() index_t Compiler::AddNode(const std::string& type, index_t variant, NodePosition pos) { - auto& list { bnf[type][variant]}; + auto& list { m_bnf[type][variant]}; index_t node_id{nodes.size()}; if (nodes.size() > 0) nodes[pos.node_id].child_ids[pos.child_pos] = node_id; @@ -149,7 +149,7 @@ void Compiler::IncNodePosition(NodePosition& pos) size_t Compiler::minimumSymbolsNeeded(const std::string& symbol) { - if (isTerminal(bnf, symbol)) { + if (isTerminal(m_bnf, symbol)) { return 1; } else { auto it_min{m_min.find(symbol)}; @@ -157,8 +157,8 @@ size_t Compiler::minimumSymbolsNeeded(const std::string& symbol) return it_min->second; m_min[symbol] = std::numeric_limits<size_t>::max(); - auto it{bnf.find(symbol)}; - if (it != bnf.end()) { + auto it{m_bnf.find(symbol)}; + if (it != m_bnf.end()) { size_t minimum{std::numeric_limits<size_t>::max()}; for (const auto& list: it->second) { @@ -196,7 +196,7 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t symbol_list.erase(symbol_list.begin()); } - // matching of empty lists + // matching of empty list in non-terminals if (symbol_list.size() == 0) { if (begin == end) { // only match real empty list // this is the point of the final match @@ -206,6 +206,19 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t return false; } + // matching of empty list in terminals + if (begin == end) { + const auto& symbol{symbol_list.front()}; + auto it{m_empty_lut.find(symbol)}; + if (it == m_empty_lut.end()) // can't match empty tokens list with this nt-symbol + return false; + + AddNodeGuard guard(*this, it->second); + std::vector<std::string> list {m_bnf[symbol][it->second]}; + list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end()); + return match(list, begin, end); + } + // now, symbol_list has size > 0 and contains non-terminal symbols at start and end // resolve first symbol @@ -213,12 +226,11 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t if (it != m_match_lut.end()) { for (size_t i: it->second) { AddNodeGuard guard(*this, i); - std::vector<std::string> list {bnf[symbol_list.front()][i]}; + std::vector<std::string> list {m_bnf[symbol_list.front()][i]}; list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end()); if (minimumSymbolsNeeded(list) > end - begin) // stop recursion continue; - // TODO: recurse last? if (match(list, begin, end)) return true; } @@ -236,7 +248,7 @@ bool Compiler::match(std::string symbol, size_t begin, size_t end) void Compiler::fillStartCache() { - std::unordered_set<std::string> terminals {getTerminals(bnf)}; + std::unordered_set<std::string> terminals {getTerminals(m_bnf)}; { // follow terminal symbols for (const std::string& terminal: terminals) { @@ -273,12 +285,13 @@ void Compiler::fillStartCache() } { // follow empty non-terminal symbols, and combine all found non-terminals with all terminals - std::unordered_set<std::pair<std::string, size_t>, PairHash> current_set {getEmptyPositions(bnf)}; + std::unordered_set<std::pair<std::string, size_t>, PairHash> current_set {getEmptyPositions(m_bnf)}; std::unordered_set<std::pair<std::string, size_t>, PairHash> done_set; std::unordered_set<std::pair<std::string, size_t>, PairHash> next_set; do { for (const auto& current_pos: current_set) { + m_empty_lut[current_pos.first] = current_pos.second; for (const std::string& terminal: terminals) { auto it_lut {m_match_lut.find({current_pos.first, terminal})}; if (it_lut == m_match_lut.end()) { // add new list @@ -286,7 +299,6 @@ void Compiler::fillStartCache() } else { // extend list it_lut->second.emplace_back(current_pos.second); } - } auto it {reversedPosFirst.find(current_pos.first)}; @@ -317,24 +329,24 @@ void Compiler::constructTree() { symbol_variants_it = symbol_variants.begin(); - m_symbol_list = {m_top}; - m_symbol_list_pos = 0; + std::vector<std::string> symbol_list{m_top}; + index_t symbol_list_pos{0}; NodePosition tree_pos; - while (m_symbol_list_pos < m_symbol_list.size()) { - std::string symbol{m_symbol_list[m_symbol_list_pos]}; + while (symbol_list_pos < symbol_list.size()) { + std::string symbol{symbol_list[symbol_list_pos]}; - if (isTerminal(bnf, symbol)) { + if (isTerminal(m_bnf, symbol)) { // Advance terminal symbol - nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = ChildIdFromTokenId(m_symbol_list_pos); + nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = ChildIdFromTokenId(symbol_list_pos); IncNodePosition(tree_pos); - m_symbol_list_pos++; + symbol_list_pos++; } else { // Replace non-terminal symbol - m_symbol_list.erase(m_symbol_list.begin() + m_symbol_list_pos); - std::vector<std::string> list {bnf[symbol][*symbol_variants_it]}; - m_symbol_list.insert(m_symbol_list.begin() + m_symbol_list_pos, list.begin(), list.end()); + symbol_list.erase(symbol_list.begin() + symbol_list_pos); + std::vector<std::string> list {m_bnf[symbol][*symbol_variants_it]}; + symbol_list.insert(symbol_list.begin() + symbol_list_pos, list.begin(), list.end()); index_t node_id {AddNode(symbol, *symbol_variants_it, tree_pos)}; @@ -350,11 +362,9 @@ void Compiler::constructTree() } Compiler::Compiler(BNF& bnf, const std::string& top) - : bnf(bnf) + : m_bnf {removeHeadRecursion(bnf)} , m_top(top) - //, ReverseBNF{Reverse(bnf)} - //, reversedFirst{reverseFirst(bnf)} - , reversedPosFirst{reversePosFirst(bnf)} + , reversedPosFirst{reversePosFirst(m_bnf)} { // // prepare helper cache (TODO: remove this ugly workaround for remaining bad marker elements) @@ -39,11 +39,9 @@ private: // Input std::vector<Token> tokens; - BNF &bnf; // not const for access via operator[] + BNF m_bnf; // not const for access via operator[] const std::string m_top; - //std::unordered_map<std::string, std::unordered_set<std::string>> ReverseBNF; // possible parent types of a given type; unused now: remove? - //std::unordered_map<std::string, std::unordered_set<std::string>> reversedFirst; // possible parent types of first childs of a given type std::unordered_map<std::string, std::unordered_set<std::pair<std::string, index_t>, PairHash>> reversedPosFirst; // possible parent types of first childs of a given type std::deque<index_t> symbol_variants; @@ -81,12 +79,14 @@ private: return h0 ^ (h1 << 1) ^ (h2 << 2); } }; + // map combination of non-terminal+terminal symbol combination to list of + // possible BNF positions for the specified non-terminal symbol std::unordered_map<std::pair<std::string, std::string>, std::vector<size_t>, PairHashSS> m_match_lut; + std::unordered_map<std::string, size_t> m_empty_lut; // list of non-terminal symbols that can lead to empty token list (maps to variant) + void fillStartCache(); void constructTree(); - std::vector<std::string> m_symbol_list; - index_t m_symbol_list_pos{}; public: Compiler(BNF& bnf, const std::string& top); @@ -85,14 +85,14 @@ void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from for (size_t i = list_index_from; i < list_index_to - 1; i++) { std::string symbol{list[i]}; if (symbol == rule_symbol) - throw std::runtime_error("Recursion found but not allowed"); + throw std::runtime_error("Recursion found but not allowed. Only head recursion allowed for lexer."); size_t state{newState()}; addPathOrTransition(previousState, state, symbol); previousState = state; } if (list.back() == rule_symbol) - throw std::runtime_error("Recursion found but not allowed"); + throw std::runtime_error("Tail recursion found but not allowed. Only head recursion allowed for lexer."); // last transition addPathOrTransition(previousState, state1, list.back()); diff --git a/test-cpp.cpp b/test-cpp.cpp index 8a9b7cb..e5b2a1a 100644 --- a/test-cpp.cpp +++ b/test-cpp.cpp @@ -42,7 +42,7 @@ TEST_F(CppTest, preprocessing_tokenize) { auto nodes = cpp.analysis(tokens); - ASSERT_EQ(nodes.size(), 44); + ASSERT_EQ(nodes.size(), 60/*44*/); } TEST_F(CppTest, preprocessing_tokenize_compile_error) { |