From fce1c18103f208857af477ca47af9aa908bd69b6 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Mon, 23 Mar 2020 22:02:40 +0100 Subject: Helper functions for grammer --- Makefile | 1 + cpp.cpp | 2 +- grammer.cpp | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- grammer.h | 19 ++++++-- test-cpp.cpp | 4 +- test-grammer.cpp | 65 ++++++++++++++++++++++++++ 6 files changed, 215 insertions(+), 12 deletions(-) create mode 100644 test-grammer.cpp diff --git a/Makefile b/Makefile index b28f356..23b7457 100644 --- a/Makefile +++ b/Makefile @@ -49,6 +49,7 @@ SRC=\ cppbnf.cpp \ test-cppbnf.cpp \ grammer.cpp \ + test-grammer.cpp \ lexer.cpp \ test-lexer.cpp \ minicc.cpp \ diff --git a/cpp.cpp b/cpp.cpp index 69ca847..f59859d 100644 --- a/cpp.cpp +++ b/cpp.cpp @@ -198,7 +198,7 @@ std::vector CPP::tokens_from_pptokens(std::vector pp_tokens) if (pp_types.find(token.type) != pp_types.end()) { if (token.type == "identifier") { if (keywords.find(token.value) != keywords.end()) - result.emplace_back(token.value, token.value); + result.emplace_back(Token{token.value, token.value}); else result.emplace_back(Token{"identifier"s, token.value}); } diff --git a/grammer.cpp b/grammer.cpp index 5557b12..cc285b9 100644 --- a/grammer.cpp +++ b/grammer.cpp @@ -1,6 +1,7 @@ #include "grammer.h" #include +#include using namespace Gram; @@ -44,7 +45,7 @@ void Compiler::Validate() const throw std::runtime_error("Bad root node: "s + std::to_string(root_node_id) + " vs. "s + std::to_string(nodes.size())); // Start symbol on top - if (GetTypeOfNode(root_node_id) != Top) + if (GetTypeOfNode(root_node_id) != m_top) throw std::runtime_error("Root node not start symbol!"); // All nodes filled @@ -56,7 +57,7 @@ void Compiler::Validate() const bool Compiler::rootIsStartSymbol() const { - return GetTypeOfNode(root_node_id) == Top; + return GetTypeOfNode(root_node_id) == m_top; } bool Gram::ChildIdIsToken(int32_t child_id) @@ -493,18 +494,136 @@ bool Compiler::FillTree() return true; } -Compiler::Compiler(BNF& bnf, const std::string& Top): bnf(bnf), Top(Top), ReverseBNF{Reverse(bnf)}, reversedFirst{reverseFirst(bnf)} +size_t Compiler::minimumSymbolsNeeded(std::string symbol) { + if (isTerminal(bnf, symbol)) { + return 1; + } else { + auto it_min{m_min.find(symbol)}; + if (it_min != m_min.end()) + return it_min->second; + m_min[symbol] = std::numeric_limits::max(); + + auto it{bnf.find(symbol)}; + if (it != bnf.end()) { + size_t minimum{std::numeric_limits::max()}; + + for (const auto& list: it->second) { + minimum = std::min(minimum, minimumSymbolsNeeded(list)); + } + + m_min[symbol] = minimum; + + return minimum; + } else + throw std::runtime_error("ICE: Symbol "s + symbol + " expected in BNF"s); + } +} + +size_t Compiler::minimumSymbolsNeeded(std::vector symbol_list) +{ + size_t result{0}; + + for (const auto& symbol: symbol_list) { + size_t min { minimumSymbolsNeeded(symbol) }; + if (min == std::numeric_limits::max()) + return min; + result += min; + } + + return result; +} + +/// begin, end: indexes in tokens list +bool Compiler::match(std::vector symbol_list, size_t begin, size_t end) +{ + // TODO: isTerminal() necessary here? + + std::cout << "DEBUG: Matching:"; + for (auto symbol: symbol_list) { + std::cout << " |" << symbol << "|"; + } + std::cout << std::endl; + + // match terminal symbols at start + while (begin < end && isTerminal(bnf, tokens[begin].type) && symbol_list.size() > 0 && symbol_list.front() == tokens[begin].type) { + begin++; + symbol_list.erase(symbol_list.begin()); + } + + // match terminal symbols at end + while (begin < end && isTerminal(bnf, tokens[end - 1].type) && symbol_list.size() > 0 && symbol_list.back() == tokens[end - 1].type) { + end--; + symbol_list.erase(symbol_list.end() - 1); + } + + // matching of empty lists + if (symbol_list.size() == 0) { + if (begin == end) // only match real empty list + return true; + return false; + } + + // now, symbol_list[begin .. end - 1] has size > 0 and contains non-terminal symbols at start and end + + // resolve first symbol + auto it{bnf.find(symbol_list.front())}; + if (it != bnf.end()) { + for (std::vector list: it->second) { // iterate over alternatives + std::cout << "ALTERNATIVE for " << symbol_list.front() << " with " << list.size() << std::endl; + list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end()); + std::cout << "Min: " << minimumSymbolsNeeded(list) << ", end-begin: " << (end-begin) << std::endl; + if (minimumSymbolsNeeded(list) > end - begin) // stop recursion + continue; + + // TODO: recurse last? + if (match(list, begin, end)) + return true; + } + } else + return false; // terminal symbol not found in bnf, non-matching + + return false; // no match found +} + +bool Compiler::match(std::string symbol, size_t begin, size_t end) +{ + std::vector symbol_list{symbol}; + return match(symbol_list, begin, end); +} + +Compiler::Compiler(BNF& bnf, const std::string& top): bnf(bnf), m_top(top), ReverseBNF{Reverse(bnf)}, reversedFirst{reverseFirst(bnf)} +{ + //std::cout << "DEBUG: " << m_top << std::endl; + + // + // prepare helper cache (TODO: remove this ugly workaround for remaining bad marker elements) + // + minimumSymbolsNeeded("translation-unit"); + // remove bad marker elements + auto it{m_min.begin()}; + while (it != m_min.end()) { + if (it->second == std::numeric_limits::max()) { + it = m_min.erase(it); + } else { + ++it; + } + } + minimumSymbolsNeeded("translation-unit"); } -std::pair> Compiler::compile(std::vector Tokens) +std::pair> Compiler::compile(std::vector p_tokens) { clear(); - tokens = Tokens; + tokens = p_tokens; if (tokens.size() == 0) throw std::runtime_error("No tokens!"); +#if 0 + // + // bottom-up algorithm + // while (!treeIsComplete()) { if (!FillTree()) { TrackBack(); @@ -514,6 +633,13 @@ std::pair> Compiler::compile(std::vector T TrackBack(); } } +#else + // + // top-down algorithm + // + if (!match(m_top, 0, tokens.size())) + throw std::runtime_error("Compile error."); +#endif Validate(); diff --git a/grammer.h b/grammer.h index ee2897d..e5c2f11 100644 --- a/grammer.h +++ b/grammer.h @@ -4,8 +4,11 @@ #include "minicc.h" #include +#include #include +class GrammerTest; + namespace Gram { // TreeNodes are only intermediate. Terminal symbols don't get of TreeNodes @@ -23,7 +26,6 @@ struct TreeNode { class Compiler { - private: // The result index_t root_node_id{}; @@ -36,7 +38,7 @@ private: index_t tokens_used{0}; // number of tokens used, this is also the next token index to use BNF &bnf; // not const for access via operator[] - const std::string& Top; + const std::string m_top; std::unordered_map> ReverseBNF; // possible parent types of a given type; unused now: remove? std::unordered_map> reversedFirst; // possible parent types of first childs of a given type @@ -67,10 +69,19 @@ private: void AddPath(const std::vector& path, index_t current_index); bool FillTree(); + // top-down algorithm + std::unordered_map m_min; // cache + size_t minimumSymbolsNeeded(std::string symbol); + size_t minimumSymbolsNeeded(std::vector symbol_list); + bool match(std::string symbol, size_t begin, size_t end); + bool match(std::vector symbol_list, size_t begin, size_t end); + public: - Compiler(BNF& bnf, const std::string& Top); - std::pair> compile(std::vector Tokens); + Compiler(BNF& bnf, const std::string& top); + std::pair> compile(std::vector p_tokens); void DumpTree(); + + friend class ::GrammerTest; }; bool ChildIdIsToken(int32_t child_id); diff --git a/test-cpp.cpp b/test-cpp.cpp index 2a67b38..6727966 100644 --- a/test-cpp.cpp +++ b/test-cpp.cpp @@ -30,7 +30,7 @@ protected: } }; -#if 1 +#if 0 TEST_F(CppTest, preprocessing_tokenize) { CPP cpp; auto pp_tokens = cpp.preprocessing_tokenize("int main() { return 1; }"); @@ -40,7 +40,7 @@ TEST_F(CppTest, preprocessing_tokenize) { auto tokens = cpp.tokens_from_pptokens(pp_tokens); ASSERT_EQ(tokens.size(), 9); -#if 0 +#if 1 for (auto &i: tokens) { std::cout << i.type << ": " << i.value << std::endl; } diff --git a/test-grammer.cpp b/test-grammer.cpp new file mode 100644 index 0000000..1734da2 --- /dev/null +++ b/test-grammer.cpp @@ -0,0 +1,65 @@ +#include "bnf.h" +#include "cpp.h" +#include "cppbnf.h" +#include "lexer.h" +#include "grammer.h" +#include "minicc.h" +#include "debug.h" + +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std::string_literals; + +class GrammerTest: public ::testing::Test +{ +protected: + GrammerTest() { + //debug = true; + } + ~GrammerTest() { + } + + // Accessors for friend + size_t minimumSymbolsNeeded(Gram::Compiler& compiler, std::vector list) { + return compiler.minimumSymbolsNeeded(list); + } + size_t minimumSymbolsNeeded(Gram::Compiler& compiler, std::string s) { + return compiler.minimumSymbolsNeeded(s); + } +}; + +TEST_F(GrammerTest, minimumSymbolsNeeded) { + auto bnf = SubBNF(CPPBNF::GetCppBNFGram(), "translation-unit"); + + Gram::Compiler compiler(bnf, "translation-unit"); + + EXPECT_EQ(minimumSymbolsNeeded(compiler, std::vector{}), 0); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "translation-unit"), 0); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "logical-or-expression"), 1); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "assignment-expression"), 1); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "declaration"), 1); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "block-declaration"), 3); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "simple-declaration"), 2); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "asm-declaration"), 5); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "namespace-alias-definition"), 5); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "using-declaration"), 4); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "using-enum-declaration"), 4); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "using-directive"), 4); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "static_assert-declaration"), 5); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "alias-declaration"), 7); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "opaque-enum-declaration"), 3); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "function-definition"), 4); +} + -- cgit v1.2.3