summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRoland Reichwein <mail@reichwein.it>2020-03-23 22:02:40 +0100
committerRoland Reichwein <mail@reichwein.it>2020-03-23 22:02:40 +0100
commitfce1c18103f208857af477ca47af9aa908bd69b6 (patch)
tree3d0affeca1328825e81be8950f19e956f39ab213
parent365183e243d164185bca6ad9fa4e0d75664800f4 (diff)
Helper functions for grammer
-rw-r--r--Makefile1
-rw-r--r--cpp.cpp2
-rw-r--r--grammer.cpp136
-rw-r--r--grammer.h19
-rw-r--r--test-cpp.cpp4
-rw-r--r--test-grammer.cpp65
6 files changed, 215 insertions, 12 deletions
diff --git a/Makefile b/Makefile
index b28f356..23b7457 100644
--- a/Makefile
+++ b/Makefile
@@ -49,6 +49,7 @@ SRC=\
cppbnf.cpp \
test-cppbnf.cpp \
grammer.cpp \
+ test-grammer.cpp \
lexer.cpp \
test-lexer.cpp \
minicc.cpp \
diff --git a/cpp.cpp b/cpp.cpp
index 69ca847..f59859d 100644
--- a/cpp.cpp
+++ b/cpp.cpp
@@ -198,7 +198,7 @@ std::vector<Token> CPP::tokens_from_pptokens(std::vector<Token> pp_tokens)
if (pp_types.find(token.type) != pp_types.end()) {
if (token.type == "identifier") {
if (keywords.find(token.value) != keywords.end())
- result.emplace_back(token.value, token.value);
+ result.emplace_back(Token{token.value, token.value});
else
result.emplace_back(Token{"identifier"s, token.value});
}
diff --git a/grammer.cpp b/grammer.cpp
index 5557b12..cc285b9 100644
--- a/grammer.cpp
+++ b/grammer.cpp
@@ -1,6 +1,7 @@
#include "grammer.h"
#include <algorithm>
+#include <limits>
using namespace Gram;
@@ -44,7 +45,7 @@ void Compiler::Validate() const
throw std::runtime_error("Bad root node: "s + std::to_string(root_node_id) + " vs. "s + std::to_string(nodes.size()));
// Start symbol on top
- if (GetTypeOfNode(root_node_id) != Top)
+ if (GetTypeOfNode(root_node_id) != m_top)
throw std::runtime_error("Root node not start symbol!");
// All nodes filled
@@ -56,7 +57,7 @@ void Compiler::Validate() const
bool Compiler::rootIsStartSymbol() const
{
- return GetTypeOfNode(root_node_id) == Top;
+ return GetTypeOfNode(root_node_id) == m_top;
}
bool Gram::ChildIdIsToken(int32_t child_id)
@@ -493,18 +494,136 @@ bool Compiler::FillTree()
return true;
}
-Compiler::Compiler(BNF& bnf, const std::string& Top): bnf(bnf), Top(Top), ReverseBNF{Reverse(bnf)}, reversedFirst{reverseFirst(bnf)}
+size_t Compiler::minimumSymbolsNeeded(std::string symbol)
{
+ if (isTerminal(bnf, symbol)) {
+ return 1;
+ } else {
+ auto it_min{m_min.find(symbol)};
+ if (it_min != m_min.end())
+ return it_min->second;
+ m_min[symbol] = std::numeric_limits<size_t>::max();
+
+ auto it{bnf.find(symbol)};
+ if (it != bnf.end()) {
+ size_t minimum{std::numeric_limits<size_t>::max()};
+
+ for (const auto& list: it->second) {
+ minimum = std::min(minimum, minimumSymbolsNeeded(list));
+ }
+
+ m_min[symbol] = minimum;
+
+ return minimum;
+ } else
+ throw std::runtime_error("ICE: Symbol "s + symbol + " expected in BNF"s);
+ }
+}
+
+size_t Compiler::minimumSymbolsNeeded(std::vector<std::string> symbol_list)
+{
+ size_t result{0};
+
+ for (const auto& symbol: symbol_list) {
+ size_t min { minimumSymbolsNeeded(symbol) };
+ if (min == std::numeric_limits<size_t>::max())
+ return min;
+ result += min;
+ }
+
+ return result;
+}
+
+/// begin, end: indexes in tokens list
+bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t end)
+{
+ // TODO: isTerminal() necessary here?
+
+ std::cout << "DEBUG: Matching:";
+ for (auto symbol: symbol_list) {
+ std::cout << " |" << symbol << "|";
+ }
+ std::cout << std::endl;
+
+ // match terminal symbols at start
+ while (begin < end && isTerminal(bnf, tokens[begin].type) && symbol_list.size() > 0 && symbol_list.front() == tokens[begin].type) {
+ begin++;
+ symbol_list.erase(symbol_list.begin());
+ }
+
+ // match terminal symbols at end
+ while (begin < end && isTerminal(bnf, tokens[end - 1].type) && symbol_list.size() > 0 && symbol_list.back() == tokens[end - 1].type) {
+ end--;
+ symbol_list.erase(symbol_list.end() - 1);
+ }
+
+ // matching of empty lists
+ if (symbol_list.size() == 0) {
+ if (begin == end) // only match real empty list
+ return true;
+ return false;
+ }
+
+ // now, symbol_list[begin .. end - 1] has size > 0 and contains non-terminal symbols at start and end
+
+ // resolve first symbol
+ auto it{bnf.find(symbol_list.front())};
+ if (it != bnf.end()) {
+ for (std::vector<std::string> list: it->second) { // iterate over alternatives
+ std::cout << "ALTERNATIVE for " << symbol_list.front() << " with " << list.size() << std::endl;
+ list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end());
+ std::cout << "Min: " << minimumSymbolsNeeded(list) << ", end-begin: " << (end-begin) << std::endl;
+ if (minimumSymbolsNeeded(list) > end - begin) // stop recursion
+ continue;
+
+ // TODO: recurse last?
+ if (match(list, begin, end))
+ return true;
+ }
+ } else
+ return false; // terminal symbol not found in bnf, non-matching
+
+ return false; // no match found
+}
+
+bool Compiler::match(std::string symbol, size_t begin, size_t end)
+{
+ std::vector<std::string> symbol_list{symbol};
+ return match(symbol_list, begin, end);
+}
+
+Compiler::Compiler(BNF& bnf, const std::string& top): bnf(bnf), m_top(top), ReverseBNF{Reverse(bnf)}, reversedFirst{reverseFirst(bnf)}
+{
+ //std::cout << "DEBUG: " << m_top << std::endl;
+
+ //
+ // prepare helper cache (TODO: remove this ugly workaround for remaining bad marker elements)
+ //
+ minimumSymbolsNeeded("translation-unit");
+ // remove bad marker elements
+ auto it{m_min.begin()};
+ while (it != m_min.end()) {
+ if (it->second == std::numeric_limits<size_t>::max()) {
+ it = m_min.erase(it);
+ } else {
+ ++it;
+ }
+ }
+ minimumSymbolsNeeded("translation-unit");
}
-std::pair<index_t, std::vector<TreeNode>> Compiler::compile(std::vector<Token> Tokens)
+std::pair<index_t, std::vector<TreeNode>> Compiler::compile(std::vector<Token> p_tokens)
{
clear();
- tokens = Tokens;
+ tokens = p_tokens;
if (tokens.size() == 0)
throw std::runtime_error("No tokens!");
+#if 0
+ //
+ // bottom-up algorithm
+ //
while (!treeIsComplete()) {
if (!FillTree()) {
TrackBack();
@@ -514,6 +633,13 @@ std::pair<index_t, std::vector<TreeNode>> Compiler::compile(std::vector<Token> T
TrackBack();
}
}
+#else
+ //
+ // top-down algorithm
+ //
+ if (!match(m_top, 0, tokens.size()))
+ throw std::runtime_error("Compile error.");
+#endif
Validate();
diff --git a/grammer.h b/grammer.h
index ee2897d..e5c2f11 100644
--- a/grammer.h
+++ b/grammer.h
@@ -4,8 +4,11 @@
#include "minicc.h"
#include <cstdint>
+#include <unordered_set>
#include <utility>
+class GrammerTest;
+
namespace Gram {
// TreeNodes are only intermediate. Terminal symbols don't get of TreeNodes
@@ -23,7 +26,6 @@ struct TreeNode {
class Compiler
{
-
private:
// The result
index_t root_node_id{};
@@ -36,7 +38,7 @@ private:
index_t tokens_used{0}; // number of tokens used, this is also the next token index to use
BNF &bnf; // not const for access via operator[]
- const std::string& Top;
+ const std::string m_top;
std::unordered_map<std::string, std::set<std::string>> ReverseBNF; // possible parent types of a given type; unused now: remove?
std::unordered_map<std::string, std::set<std::string>> reversedFirst; // possible parent types of first childs of a given type
@@ -67,10 +69,19 @@ private:
void AddPath(const std::vector<std::string>& path, index_t current_index);
bool FillTree();
+ // top-down algorithm
+ std::unordered_map<std::string, size_t> m_min; // cache
+ size_t minimumSymbolsNeeded(std::string symbol);
+ size_t minimumSymbolsNeeded(std::vector<std::string> symbol_list);
+ bool match(std::string symbol, size_t begin, size_t end);
+ bool match(std::vector<std::string> symbol_list, size_t begin, size_t end);
+
public:
- Compiler(BNF& bnf, const std::string& Top);
- std::pair<index_t, std::vector<TreeNode>> compile(std::vector<Token> Tokens);
+ Compiler(BNF& bnf, const std::string& top);
+ std::pair<index_t, std::vector<TreeNode>> compile(std::vector<Token> p_tokens);
void DumpTree();
+
+ friend class ::GrammerTest;
};
bool ChildIdIsToken(int32_t child_id);
diff --git a/test-cpp.cpp b/test-cpp.cpp
index 2a67b38..6727966 100644
--- a/test-cpp.cpp
+++ b/test-cpp.cpp
@@ -30,7 +30,7 @@ protected:
}
};
-#if 1
+#if 0
TEST_F(CppTest, preprocessing_tokenize) {
CPP cpp;
auto pp_tokens = cpp.preprocessing_tokenize("int main() { return 1; }");
@@ -40,7 +40,7 @@ TEST_F(CppTest, preprocessing_tokenize) {
auto tokens = cpp.tokens_from_pptokens(pp_tokens);
ASSERT_EQ(tokens.size(), 9);
-#if 0
+#if 1
for (auto &i: tokens) {
std::cout << i.type << ": " << i.value << std::endl;
}
diff --git a/test-grammer.cpp b/test-grammer.cpp
new file mode 100644
index 0000000..1734da2
--- /dev/null
+++ b/test-grammer.cpp
@@ -0,0 +1,65 @@
+#include "bnf.h"
+#include "cpp.h"
+#include "cppbnf.h"
+#include "lexer.h"
+#include "grammer.h"
+#include "minicc.h"
+#include "debug.h"
+
+#include <boost/algorithm/string.hpp>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include <algorithm>
+#include <cctype>
+#include <deque>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace std::string_literals;
+
+class GrammerTest: public ::testing::Test
+{
+protected:
+ GrammerTest() {
+ //debug = true;
+ }
+ ~GrammerTest() {
+ }
+
+ // Accessors for friend
+ size_t minimumSymbolsNeeded(Gram::Compiler& compiler, std::vector<std::string> list) {
+ return compiler.minimumSymbolsNeeded(list);
+ }
+ size_t minimumSymbolsNeeded(Gram::Compiler& compiler, std::string s) {
+ return compiler.minimumSymbolsNeeded(s);
+ }
+};
+
+TEST_F(GrammerTest, minimumSymbolsNeeded) {
+ auto bnf = SubBNF(CPPBNF::GetCppBNFGram(), "translation-unit");
+
+ Gram::Compiler compiler(bnf, "translation-unit");
+
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, std::vector<std::string>{}), 0);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "translation-unit"), 0);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "logical-or-expression"), 1);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "assignment-expression"), 1);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "declaration"), 1);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "block-declaration"), 3);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "simple-declaration"), 2);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "asm-declaration"), 5);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "namespace-alias-definition"), 5);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "using-declaration"), 4);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "using-enum-declaration"), 4);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "using-directive"), 4);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "static_assert-declaration"), 5);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "alias-declaration"), 7);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "opaque-enum-declaration"), 3);
+ EXPECT_EQ(minimumSymbolsNeeded(compiler, "function-definition"), 4);
+}
+