summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRoland Reichwein <mail@reichwein.it>2020-03-31 18:46:49 +0200
committerRoland Reichwein <mail@reichwein.it>2020-03-31 18:46:49 +0200
commitcd6c436cb70c4323c7d14ebd74f89bb0914649f2 (patch)
tree1bdb2a8409b2b370bcaa5e1b6cf091c0c8e1f779
parentbed46a062c442656b1bc16d965e12405297029d3 (diff)
Optimization: Replace head recursion by tail recursion in matching
-rw-r--r--bnf.cpp53
-rw-r--r--bnf.h1
-rw-r--r--grammer.cpp58
-rw-r--r--grammer.h10
-rw-r--r--lexer.cpp4
-rw-r--r--test-cpp.cpp2
6 files changed, 96 insertions, 32 deletions
diff --git a/bnf.cpp b/bnf.cpp
index 04a6be9..b2d0472 100644
--- a/bnf.cpp
+++ b/bnf.cpp
@@ -68,6 +68,59 @@ BNF SubBNF(const BNF& bnf, const std::string& top)
return result;
}
+namespace {
+
+ bool isHeadRecursive(const std::vector<std::string>& list, const std::string& symbol) {
+ if (list.size() > 0 && list[0] == symbol)
+ return true;
+ return false;
+ }
+
+ bool isHeadRecursive(const std::vector<std::vector<std::string>>& lists, const std::string& symbol) {
+ for (const auto& list: lists) {
+ if (isHeadRecursive(list, symbol))
+ return true;
+ }
+ return false;
+ }
+
+} // anonymous namespace
+
+// Change head recursion to tail recursion
+BNF removeHeadRecursion(const BNF& bnf)
+{
+ std::unordered_map<std::string, std::vector<std::vector<std::string>>> result;
+
+ for (const auto& [from, to]: bnf) {
+ if (isHeadRecursive(to, from)) { // modify rule by adding additional one
+ std::string from_ext = from + "-ext";
+ if (bnf.find(from_ext) != bnf.end())
+ throw std::runtime_error("ICE: Symbol "s + from_ext + " already exists in original BNF");
+
+ std::vector<std::vector<std::string>> to_new;
+ std::vector<std::vector<std::string>> to_ext{{}}; // starts with empty list as first element
+ for (const auto& list: to) {
+ if (isHeadRecursive(list, from)) {
+ std::vector<std::string> list_new{list.begin() + 1, list.end()};
+ list_new.push_back(from_ext);
+ to_ext.push_back(list_new);
+ } else {
+ std::vector<std::string> list_new{list.begin(), list.end()};
+ list_new.push_back(from_ext);
+ to_new.push_back(list_new);
+ }
+ }
+
+ result[from] = to_new;
+ result[from_ext] = to_ext;
+ } else { // just add
+ result[from] = to;
+ }
+ }
+
+ return result;
+}
+
bool isTerminal(const BNF& bnf, const std::string& symbol)
{
return bnf.find(symbol) == bnf.end();
diff --git a/bnf.h b/bnf.h
index e5c62ed..4ad782a 100644
--- a/bnf.h
+++ b/bnf.h
@@ -15,6 +15,7 @@ std::unordered_map<std::string, std::unordered_set<std::string>> Reverse(const B
std::unordered_map<std::string, std::unordered_set<std::string>> reverseFirst(const BNF& bnf);
BNF SubBNF(const BNF& bnf, const std::string& top);
+BNF removeHeadRecursion(const BNF& bnf);
bool isTerminal(const BNF& bnf, const std::string& symbol);
std::unordered_set<std::string> getTerminals(const BNF& bnf);
diff --git a/grammer.cpp b/grammer.cpp
index 50617ce..d7afaef 100644
--- a/grammer.cpp
+++ b/grammer.cpp
@@ -92,7 +92,7 @@ void Compiler::DumpTree()
index_t Compiler::AddNode(const std::string& type, index_t variant, NodePosition pos)
{
- auto& list { bnf[type][variant]};
+ auto& list { m_bnf[type][variant]};
index_t node_id{nodes.size()};
if (nodes.size() > 0)
nodes[pos.node_id].child_ids[pos.child_pos] = node_id;
@@ -149,7 +149,7 @@ void Compiler::IncNodePosition(NodePosition& pos)
size_t Compiler::minimumSymbolsNeeded(const std::string& symbol)
{
- if (isTerminal(bnf, symbol)) {
+ if (isTerminal(m_bnf, symbol)) {
return 1;
} else {
auto it_min{m_min.find(symbol)};
@@ -157,8 +157,8 @@ size_t Compiler::minimumSymbolsNeeded(const std::string& symbol)
return it_min->second;
m_min[symbol] = std::numeric_limits<size_t>::max();
- auto it{bnf.find(symbol)};
- if (it != bnf.end()) {
+ auto it{m_bnf.find(symbol)};
+ if (it != m_bnf.end()) {
size_t minimum{std::numeric_limits<size_t>::max()};
for (const auto& list: it->second) {
@@ -196,7 +196,7 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t
symbol_list.erase(symbol_list.begin());
}
- // matching of empty lists
+ // matching of empty list in non-terminals
if (symbol_list.size() == 0) {
if (begin == end) { // only match real empty list
// this is the point of the final match
@@ -206,6 +206,19 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t
return false;
}
+ // matching of empty list in terminals
+ if (begin == end) {
+ const auto& symbol{symbol_list.front()};
+ auto it{m_empty_lut.find(symbol)};
+ if (it == m_empty_lut.end()) // can't match empty tokens list with this nt-symbol
+ return false;
+
+ AddNodeGuard guard(*this, it->second);
+ std::vector<std::string> list {m_bnf[symbol][it->second]};
+ list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end());
+ return match(list, begin, end);
+ }
+
// now, symbol_list has size > 0 and contains non-terminal symbols at start and end
// resolve first symbol
@@ -213,12 +226,11 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t
if (it != m_match_lut.end()) {
for (size_t i: it->second) {
AddNodeGuard guard(*this, i);
- std::vector<std::string> list {bnf[symbol_list.front()][i]};
+ std::vector<std::string> list {m_bnf[symbol_list.front()][i]};
list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end());
if (minimumSymbolsNeeded(list) > end - begin) // stop recursion
continue;
- // TODO: recurse last?
if (match(list, begin, end))
return true;
}
@@ -236,7 +248,7 @@ bool Compiler::match(std::string symbol, size_t begin, size_t end)
void Compiler::fillStartCache()
{
- std::unordered_set<std::string> terminals {getTerminals(bnf)};
+ std::unordered_set<std::string> terminals {getTerminals(m_bnf)};
{ // follow terminal symbols
for (const std::string& terminal: terminals) {
@@ -273,12 +285,13 @@ void Compiler::fillStartCache()
}
{ // follow empty non-terminal symbols, and combine all found non-terminals with all terminals
- std::unordered_set<std::pair<std::string, size_t>, PairHash> current_set {getEmptyPositions(bnf)};
+ std::unordered_set<std::pair<std::string, size_t>, PairHash> current_set {getEmptyPositions(m_bnf)};
std::unordered_set<std::pair<std::string, size_t>, PairHash> done_set;
std::unordered_set<std::pair<std::string, size_t>, PairHash> next_set;
do {
for (const auto& current_pos: current_set) {
+ m_empty_lut[current_pos.first] = current_pos.second;
for (const std::string& terminal: terminals) {
auto it_lut {m_match_lut.find({current_pos.first, terminal})};
if (it_lut == m_match_lut.end()) { // add new list
@@ -286,7 +299,6 @@ void Compiler::fillStartCache()
} else { // extend list
it_lut->second.emplace_back(current_pos.second);
}
-
}
auto it {reversedPosFirst.find(current_pos.first)};
@@ -317,24 +329,24 @@ void Compiler::constructTree()
{
symbol_variants_it = symbol_variants.begin();
- m_symbol_list = {m_top};
- m_symbol_list_pos = 0;
+ std::vector<std::string> symbol_list{m_top};
+ index_t symbol_list_pos{0};
NodePosition tree_pos;
- while (m_symbol_list_pos < m_symbol_list.size()) {
- std::string symbol{m_symbol_list[m_symbol_list_pos]};
+ while (symbol_list_pos < symbol_list.size()) {
+ std::string symbol{symbol_list[symbol_list_pos]};
- if (isTerminal(bnf, symbol)) {
+ if (isTerminal(m_bnf, symbol)) {
// Advance terminal symbol
- nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = ChildIdFromTokenId(m_symbol_list_pos);
+ nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = ChildIdFromTokenId(symbol_list_pos);
IncNodePosition(tree_pos);
- m_symbol_list_pos++;
+ symbol_list_pos++;
} else {
// Replace non-terminal symbol
- m_symbol_list.erase(m_symbol_list.begin() + m_symbol_list_pos);
- std::vector<std::string> list {bnf[symbol][*symbol_variants_it]};
- m_symbol_list.insert(m_symbol_list.begin() + m_symbol_list_pos, list.begin(), list.end());
+ symbol_list.erase(symbol_list.begin() + symbol_list_pos);
+ std::vector<std::string> list {m_bnf[symbol][*symbol_variants_it]};
+ symbol_list.insert(symbol_list.begin() + symbol_list_pos, list.begin(), list.end());
index_t node_id {AddNode(symbol, *symbol_variants_it, tree_pos)};
@@ -350,11 +362,9 @@ void Compiler::constructTree()
}
Compiler::Compiler(BNF& bnf, const std::string& top)
- : bnf(bnf)
+ : m_bnf {removeHeadRecursion(bnf)}
, m_top(top)
- //, ReverseBNF{Reverse(bnf)}
- //, reversedFirst{reverseFirst(bnf)}
- , reversedPosFirst{reversePosFirst(bnf)}
+ , reversedPosFirst{reversePosFirst(m_bnf)}
{
//
// prepare helper cache (TODO: remove this ugly workaround for remaining bad marker elements)
diff --git a/grammer.h b/grammer.h
index 9b72d3a..a8a3356 100644
--- a/grammer.h
+++ b/grammer.h
@@ -39,11 +39,9 @@ private:
// Input
std::vector<Token> tokens;
- BNF &bnf; // not const for access via operator[]
+ BNF m_bnf; // not const for access via operator[]
const std::string m_top;
- //std::unordered_map<std::string, std::unordered_set<std::string>> ReverseBNF; // possible parent types of a given type; unused now: remove?
- //std::unordered_map<std::string, std::unordered_set<std::string>> reversedFirst; // possible parent types of first childs of a given type
std::unordered_map<std::string, std::unordered_set<std::pair<std::string, index_t>, PairHash>> reversedPosFirst; // possible parent types of first childs of a given type
std::deque<index_t> symbol_variants;
@@ -81,12 +79,14 @@ private:
return h0 ^ (h1 << 1) ^ (h2 << 2);
}
};
+ // map combination of non-terminal+terminal symbol combination to list of
+ // possible BNF positions for the specified non-terminal symbol
std::unordered_map<std::pair<std::string, std::string>, std::vector<size_t>, PairHashSS> m_match_lut;
+ std::unordered_map<std::string, size_t> m_empty_lut; // list of non-terminal symbols that can lead to empty token list (maps to variant)
+
void fillStartCache();
void constructTree();
- std::vector<std::string> m_symbol_list;
- index_t m_symbol_list_pos{};
public:
Compiler(BNF& bnf, const std::string& top);
diff --git a/lexer.cpp b/lexer.cpp
index 0944738..a9cda32 100644
--- a/lexer.cpp
+++ b/lexer.cpp
@@ -85,14 +85,14 @@ void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from
for (size_t i = list_index_from; i < list_index_to - 1; i++) {
std::string symbol{list[i]};
if (symbol == rule_symbol)
- throw std::runtime_error("Recursion found but not allowed");
+ throw std::runtime_error("Recursion found but not allowed. Only head recursion allowed for lexer.");
size_t state{newState()};
addPathOrTransition(previousState, state, symbol);
previousState = state;
}
if (list.back() == rule_symbol)
- throw std::runtime_error("Recursion found but not allowed");
+ throw std::runtime_error("Tail recursion found but not allowed. Only head recursion allowed for lexer.");
// last transition
addPathOrTransition(previousState, state1, list.back());
diff --git a/test-cpp.cpp b/test-cpp.cpp
index 8a9b7cb..e5b2a1a 100644
--- a/test-cpp.cpp
+++ b/test-cpp.cpp
@@ -42,7 +42,7 @@ TEST_F(CppTest, preprocessing_tokenize) {
auto nodes = cpp.analysis(tokens);
- ASSERT_EQ(nodes.size(), 44);
+ ASSERT_EQ(nodes.size(), 60/*44*/);
}
TEST_F(CppTest, preprocessing_tokenize_compile_error) {