From cd6c436cb70c4323c7d14ebd74f89bb0914649f2 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Tue, 31 Mar 2020 18:46:49 +0200 Subject: Optimization: Replace head recursion by tail recursion in matching --- bnf.cpp | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ bnf.h | 1 + grammer.cpp | 58 ++++++++++++++++++++++++++++++++++------------------------ grammer.h | 10 +++++----- lexer.cpp | 4 ++-- test-cpp.cpp | 2 +- 6 files changed, 96 insertions(+), 32 deletions(-) diff --git a/bnf.cpp b/bnf.cpp index 04a6be9..b2d0472 100644 --- a/bnf.cpp +++ b/bnf.cpp @@ -68,6 +68,59 @@ BNF SubBNF(const BNF& bnf, const std::string& top) return result; } +namespace { + + bool isHeadRecursive(const std::vector& list, const std::string& symbol) { + if (list.size() > 0 && list[0] == symbol) + return true; + return false; + } + + bool isHeadRecursive(const std::vector>& lists, const std::string& symbol) { + for (const auto& list: lists) { + if (isHeadRecursive(list, symbol)) + return true; + } + return false; + } + +} // anonymous namespace + +// Change head recursion to tail recursion +BNF removeHeadRecursion(const BNF& bnf) +{ + std::unordered_map>> result; + + for (const auto& [from, to]: bnf) { + if (isHeadRecursive(to, from)) { // modify rule by adding additional one + std::string from_ext = from + "-ext"; + if (bnf.find(from_ext) != bnf.end()) + throw std::runtime_error("ICE: Symbol "s + from_ext + " already exists in original BNF"); + + std::vector> to_new; + std::vector> to_ext{{}}; // starts with empty list as first element + for (const auto& list: to) { + if (isHeadRecursive(list, from)) { + std::vector list_new{list.begin() + 1, list.end()}; + list_new.push_back(from_ext); + to_ext.push_back(list_new); + } else { + std::vector list_new{list.begin(), list.end()}; + list_new.push_back(from_ext); + to_new.push_back(list_new); + } + } + + result[from] = to_new; + result[from_ext] = to_ext; + } else { // just add + result[from] = to; + } + } + + return result; +} + bool isTerminal(const BNF& bnf, const std::string& symbol) { return bnf.find(symbol) == bnf.end(); diff --git a/bnf.h b/bnf.h index e5c62ed..4ad782a 100644 --- a/bnf.h +++ b/bnf.h @@ -15,6 +15,7 @@ std::unordered_map> Reverse(const B std::unordered_map> reverseFirst(const BNF& bnf); BNF SubBNF(const BNF& bnf, const std::string& top); +BNF removeHeadRecursion(const BNF& bnf); bool isTerminal(const BNF& bnf, const std::string& symbol); std::unordered_set getTerminals(const BNF& bnf); diff --git a/grammer.cpp b/grammer.cpp index 50617ce..d7afaef 100644 --- a/grammer.cpp +++ b/grammer.cpp @@ -92,7 +92,7 @@ void Compiler::DumpTree() index_t Compiler::AddNode(const std::string& type, index_t variant, NodePosition pos) { - auto& list { bnf[type][variant]}; + auto& list { m_bnf[type][variant]}; index_t node_id{nodes.size()}; if (nodes.size() > 0) nodes[pos.node_id].child_ids[pos.child_pos] = node_id; @@ -149,7 +149,7 @@ void Compiler::IncNodePosition(NodePosition& pos) size_t Compiler::minimumSymbolsNeeded(const std::string& symbol) { - if (isTerminal(bnf, symbol)) { + if (isTerminal(m_bnf, symbol)) { return 1; } else { auto it_min{m_min.find(symbol)}; @@ -157,8 +157,8 @@ size_t Compiler::minimumSymbolsNeeded(const std::string& symbol) return it_min->second; m_min[symbol] = std::numeric_limits::max(); - auto it{bnf.find(symbol)}; - if (it != bnf.end()) { + auto it{m_bnf.find(symbol)}; + if (it != m_bnf.end()) { size_t minimum{std::numeric_limits::max()}; for (const auto& list: it->second) { @@ -196,7 +196,7 @@ bool Compiler::match(std::vector symbol_list, size_t begin, size_t symbol_list.erase(symbol_list.begin()); } - // matching of empty lists + // matching of empty list in non-terminals if (symbol_list.size() == 0) { if (begin == end) { // only match real empty list // this is the point of the final match @@ -206,6 +206,19 @@ bool Compiler::match(std::vector symbol_list, size_t begin, size_t return false; } + // matching of empty list in terminals + if (begin == end) { + const auto& symbol{symbol_list.front()}; + auto it{m_empty_lut.find(symbol)}; + if (it == m_empty_lut.end()) // can't match empty tokens list with this nt-symbol + return false; + + AddNodeGuard guard(*this, it->second); + std::vector list {m_bnf[symbol][it->second]}; + list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end()); + return match(list, begin, end); + } + // now, symbol_list has size > 0 and contains non-terminal symbols at start and end // resolve first symbol @@ -213,12 +226,11 @@ bool Compiler::match(std::vector symbol_list, size_t begin, size_t if (it != m_match_lut.end()) { for (size_t i: it->second) { AddNodeGuard guard(*this, i); - std::vector list {bnf[symbol_list.front()][i]}; + std::vector list {m_bnf[symbol_list.front()][i]}; list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end()); if (minimumSymbolsNeeded(list) > end - begin) // stop recursion continue; - // TODO: recurse last? if (match(list, begin, end)) return true; } @@ -236,7 +248,7 @@ bool Compiler::match(std::string symbol, size_t begin, size_t end) void Compiler::fillStartCache() { - std::unordered_set terminals {getTerminals(bnf)}; + std::unordered_set terminals {getTerminals(m_bnf)}; { // follow terminal symbols for (const std::string& terminal: terminals) { @@ -273,12 +285,13 @@ void Compiler::fillStartCache() } { // follow empty non-terminal symbols, and combine all found non-terminals with all terminals - std::unordered_set, PairHash> current_set {getEmptyPositions(bnf)}; + std::unordered_set, PairHash> current_set {getEmptyPositions(m_bnf)}; std::unordered_set, PairHash> done_set; std::unordered_set, PairHash> next_set; do { for (const auto& current_pos: current_set) { + m_empty_lut[current_pos.first] = current_pos.second; for (const std::string& terminal: terminals) { auto it_lut {m_match_lut.find({current_pos.first, terminal})}; if (it_lut == m_match_lut.end()) { // add new list @@ -286,7 +299,6 @@ void Compiler::fillStartCache() } else { // extend list it_lut->second.emplace_back(current_pos.second); } - } auto it {reversedPosFirst.find(current_pos.first)}; @@ -317,24 +329,24 @@ void Compiler::constructTree() { symbol_variants_it = symbol_variants.begin(); - m_symbol_list = {m_top}; - m_symbol_list_pos = 0; + std::vector symbol_list{m_top}; + index_t symbol_list_pos{0}; NodePosition tree_pos; - while (m_symbol_list_pos < m_symbol_list.size()) { - std::string symbol{m_symbol_list[m_symbol_list_pos]}; + while (symbol_list_pos < symbol_list.size()) { + std::string symbol{symbol_list[symbol_list_pos]}; - if (isTerminal(bnf, symbol)) { + if (isTerminal(m_bnf, symbol)) { // Advance terminal symbol - nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = ChildIdFromTokenId(m_symbol_list_pos); + nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = ChildIdFromTokenId(symbol_list_pos); IncNodePosition(tree_pos); - m_symbol_list_pos++; + symbol_list_pos++; } else { // Replace non-terminal symbol - m_symbol_list.erase(m_symbol_list.begin() + m_symbol_list_pos); - std::vector list {bnf[symbol][*symbol_variants_it]}; - m_symbol_list.insert(m_symbol_list.begin() + m_symbol_list_pos, list.begin(), list.end()); + symbol_list.erase(symbol_list.begin() + symbol_list_pos); + std::vector list {m_bnf[symbol][*symbol_variants_it]}; + symbol_list.insert(symbol_list.begin() + symbol_list_pos, list.begin(), list.end()); index_t node_id {AddNode(symbol, *symbol_variants_it, tree_pos)}; @@ -350,11 +362,9 @@ void Compiler::constructTree() } Compiler::Compiler(BNF& bnf, const std::string& top) - : bnf(bnf) + : m_bnf {removeHeadRecursion(bnf)} , m_top(top) - //, ReverseBNF{Reverse(bnf)} - //, reversedFirst{reverseFirst(bnf)} - , reversedPosFirst{reversePosFirst(bnf)} + , reversedPosFirst{reversePosFirst(m_bnf)} { // // prepare helper cache (TODO: remove this ugly workaround for remaining bad marker elements) diff --git a/grammer.h b/grammer.h index 9b72d3a..a8a3356 100644 --- a/grammer.h +++ b/grammer.h @@ -39,11 +39,9 @@ private: // Input std::vector tokens; - BNF &bnf; // not const for access via operator[] + BNF m_bnf; // not const for access via operator[] const std::string m_top; - //std::unordered_map> ReverseBNF; // possible parent types of a given type; unused now: remove? - //std::unordered_map> reversedFirst; // possible parent types of first childs of a given type std::unordered_map, PairHash>> reversedPosFirst; // possible parent types of first childs of a given type std::deque symbol_variants; @@ -81,12 +79,14 @@ private: return h0 ^ (h1 << 1) ^ (h2 << 2); } }; + // map combination of non-terminal+terminal symbol combination to list of + // possible BNF positions for the specified non-terminal symbol std::unordered_map, std::vector, PairHashSS> m_match_lut; + std::unordered_map m_empty_lut; // list of non-terminal symbols that can lead to empty token list (maps to variant) + void fillStartCache(); void constructTree(); - std::vector m_symbol_list; - index_t m_symbol_list_pos{}; public: Compiler(BNF& bnf, const std::string& top); diff --git a/lexer.cpp b/lexer.cpp index 0944738..a9cda32 100644 --- a/lexer.cpp +++ b/lexer.cpp @@ -85,14 +85,14 @@ void Lexer::addRule(const std::vector& list, size_t list_index_from for (size_t i = list_index_from; i < list_index_to - 1; i++) { std::string symbol{list[i]}; if (symbol == rule_symbol) - throw std::runtime_error("Recursion found but not allowed"); + throw std::runtime_error("Recursion found but not allowed. Only head recursion allowed for lexer."); size_t state{newState()}; addPathOrTransition(previousState, state, symbol); previousState = state; } if (list.back() == rule_symbol) - throw std::runtime_error("Recursion found but not allowed"); + throw std::runtime_error("Tail recursion found but not allowed. Only head recursion allowed for lexer."); // last transition addPathOrTransition(previousState, state1, list.back()); diff --git a/test-cpp.cpp b/test-cpp.cpp index 8a9b7cb..e5b2a1a 100644 --- a/test-cpp.cpp +++ b/test-cpp.cpp @@ -42,7 +42,7 @@ TEST_F(CppTest, preprocessing_tokenize) { auto nodes = cpp.analysis(tokens); - ASSERT_EQ(nodes.size(), 44); + ASSERT_EQ(nodes.size(), 60/*44*/); } TEST_F(CppTest, preprocessing_tokenize_compile_error) { -- cgit v1.2.3