From 2f4118526972f7f3d5147342bc65909fcc82b6c7 Mon Sep 17 00:00:00 2001
From: Roland Reichwein
Date: Sun, 16 Feb 2020 16:33:19 +0100
Subject: clang++-9, integrate cppbnf

---
 Makefile    | 13 +++++++------
 bnf.cpp     | 31 ++++++++++++++++++++++++++++++-
 bnf.h       |  2 ++
 cppbnf.cpp  | 51 ++++++++++++++++++++++++++++-----------------------
 grammer.cpp | 59 +++++++++++++++++++++++++++++++++--------------------------
 5 files changed, 100 insertions(+), 56 deletions(-)

diff --git a/Makefile b/Makefile
index b18defe..1c2db6c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,16 @@
 PROJECTNAME=minicc
 
-CXX=clang++-8
+CXX=clang++-9
 #CXX=g++-8
 
-#CXXFLAGS=-O0 -D_DEBUG
+CXXFLAGS=-O0 -D_DEBUG
 # -fprofile-instr-generate -fcoverage-mapping
 # gcc:--coverage
-CXXFLAGS=-O2 -DNDEBUG
+#CXXFLAGS=-O2 -DNDEBUG
 
-CXXFLAGS+= -Wall -std=c++17 -I. -Ilib
+CXXFLAGS+= -Wall -std=c++17 -I.
 
-ifeq ($(CXX),clang++-8)
+ifeq ($(CXX),clang++-9)
 # currently broken:
 # ld.lld-8: error: undefined symbol: boost::re_detail_106700::cpp_regex_traits_implementation::transform(char const*, char const*) const
 #CXXFLAGS+= -stdlib=libc++
@@ -28,7 +28,7 @@ LIBS=\
     -lboost_regex \
     -lpthread
 
-ifeq ($(CXX),clang++-8)
+ifeq ($(CXX),clang++-9)
 LIBS+= \
     -fuse-ld=lld-8 \
     -lstdc++ \
@@ -48,6 +48,7 @@ SRC=\
     grammer.cpp \
     lexer.cpp \
     minicc.cpp \
+    cppbnf.cpp \
     test-lexer.cpp \
     googletest/src/gtest-all.cpp \
     googlemock/src/gmock-all.cpp
diff --git a/bnf.cpp b/bnf.cpp
index 1efb459..f240f29 100644
--- a/bnf.cpp
+++ b/bnf.cpp
@@ -1,6 +1,7 @@
 #include "bnf.h"
 
-std::map> Reverse(BNF bnf){
+std::map> Reverse(BNF bnf)
+{
     std::map> result;
 
     for (const auto& [from, to] : bnf) {
@@ -18,4 +19,32 @@ std::map> Reverse(BNF bnf){
     return result;
 }
 
+BNF SubBNF(const BNF& bnf, const std::string& top)
+{
+    BNF result;
+
+    std::vector todo{top};
+
+    while (!todo.empty()) {
+        std::string current{todo.back()};
+        todo.pop_back();
+
+        auto it = bnf.find(current);
+        if (it != bnf.end()) {
+            // add current value
+            result[it->first] = it->second;
+
+            // add sub-tree values if not present yet, but available in original bnf
+            for (auto& variant: it->second) {
+                for (auto& child: variant) {
+                    if (result.find(child) == result.end() && bnf.find(child) != bnf.end()) {
+                        todo.push_back(child);
+                    }
+                }
+            }
+        }
+    }
+
+    return result;
+}
diff --git a/bnf.h b/bnf.h
index 959e1f8..56aacb0 100644
--- a/bnf.h
+++ b/bnf.h
@@ -12,3 +12,5 @@ using namespace std::string_literals;
 using BNF = std::map>>;
 
 std::map> Reverse(BNF bnf);
+
+BNF SubBNF(const BNF& bnf, const std::string& top);
diff --git a/cppbnf.cpp b/cppbnf.cpp
index 5266fa0..0b6e58f 100755
--- a/cppbnf.cpp
+++ b/cppbnf.cpp
@@ -21,7 +21,7 @@ namespace {
         return result;
     }
 
-    std::unordered_map> reverseBNF(const T_BNF& bnf)
+    std::unordered_map> reverseBNF(const BNF& bnf)
     {
         std::unordered_map> result;
         for (const auto& [symbol, lists] : bnf) {
@@ -39,12 +39,12 @@ namespace {
         return result;
     }
 
-    bool isTerminal(const std::string& symbol, const T_BNF& bnf)
+    bool isTerminal(const std::string& symbol, const BNF& bnf)
     {
         return bnf.find(symbol) == bnf.end();
     }
 
-    size_t numberOfStartSymbols(const T_BNF& bnf)
+    size_t numberOfStartSymbols(const BNF& bnf)
     {
         // exactly 1 start symbol
         std::vector startSymbols;
@@ -64,7 +64,7 @@ namespace {
         return startSymbols.size();
     }
 
-    bool symbolsValid(const T_BNF& bnf)
+    bool symbolsValid(const BNF& bnf)
     {
         for (const auto& [symbol, lists] : bnf) {
             if (boost::contains(symbol, " ")) {
@@ -74,14 +74,19 @@ namespace {
 
             for (const auto& list : lists) {
                 for (const auto& i : list) {
-                    if (i.size() == 1 && !isTerminal(i, bnf)) {
std::cerr << "Warning: Non-Terminal symbol " << i << " in " << symbol << " is too long." << std::endl; - return false; - } - if (boost::contains(i, " ")) { - std::cerr << "Warning: Symbol " << i << " in " << symbol << " contains space." << std::endl; - return false; + if (!isTerminal(i, bnf)) { + // every non-terminal symbol must be longer that 1 char + if (i.size() == 1) { + std::cerr << "Warning: Non-Terminal symbol " << i << " in " << symbol << " is too short." << std::endl; + return false; + } + + // non-terminal symbols must not contain space + if (boost::contains(i, " ")) { + std::cerr << "Warning: Symbol " << i << " in " << symbol << " contains space." << std::endl; + return false; + } } } } @@ -92,11 +97,11 @@ namespace { // returns 1 if exactly one start symbol and // all nodes size > 1, except terminal symbols - bool valid(const T_BNF& bnf) { + bool valid(const BNF& bnf) { return numberOfStartSymbols(bnf) == 1 && symbolsValid(bnf); } - bool validLex(const T_BNF& bnf) { + bool validLex(const BNF& bnf) { // all terminal symbols exactly one character for (const auto& [symbol, lists] : bnf) { @@ -174,7 +179,7 @@ namespace { } } - T_BNF& normalizeBNF(T_BNF& bnf) + BNF& normalizeBNF(BNF& bnf) { // resolve OPTIONAL symbols for (auto& [symbol, lists] : bnf) { @@ -184,7 +189,7 @@ namespace { return bnf; } - T_BNF& normalizeLexBNF(T_BNF& bnf) + BNF& normalizeLexBNF(BNF& bnf) { normalizeBNF(bnf); @@ -206,9 +211,9 @@ namespace { } // namespace -T_BNF GetCppBNFLex() +BNF GetCppBNFLex() { - T_BNF bnf{ + BNF bnf{ // [gram.lex] {"hex-quad", {{"hexadecimal-digit", "hexadecimal-digit", "hexadecimal-digit", "hexadecimal-digit"}}}, @@ -520,9 +525,9 @@ T_BNF GetCppBNFLex() return normalizeLexBNF(bnf); } -T_BNF GetCppBNFGram() +BNF GetCppBNFGram() { - T_BNF bnf{ + BNF bnf{ // [gram.key] {"typedef-name", {{"identifier"}, {"simple-template-id"}}}, @@ -1919,15 +1924,15 @@ T_BNF GetCppBNFGram() return normalizeBNF(bnf); } -TEST(Lex, Test2) { - auto bnf = GetCppBNFLex(); +TEST(CppBnf, LexicalBnf) { + auto bnf = SubBNF(GetCppBNFLex(), "preprocessing-token"); EXPECT_TRUE(valid(bnf)); EXPECT_TRUE(validLex(bnf)); } -TEST(Gram, Test3) { - auto bnf = GetCppBNFGram(); +TEST(CppBnf, GrammarBnf) { + auto bnf = SubBNF(GetCppBNFGram(), "translation-unit"); EXPECT_TRUE(valid(bnf)); } diff --git a/grammer.cpp b/grammer.cpp index 3c7671b..808c49e 100644 --- a/grammer.cpp +++ b/grammer.cpp @@ -4,7 +4,16 @@ using namespace Gram; -void Compiler::clear() { +static bool debug{false}; + +void Debug(std::string s) +{ + if (debug) + std::cout << s << std::endl; +} + +void Compiler::clear() +{ nodes.clear(); root_node_id = 0; @@ -71,52 +80,51 @@ namespace { void Compiler::DumpTree() { - std::cout << "= Dump =======================================" << std::endl; - std::cout << "nodes.size()=" << nodes.size() << std::endl; + Debug("= Dump ======================================="); + Debug("nodes.size()="s + std::to_string(nodes.size())); if (nodes.size() > 0) { if (0) { - std::cout << "--- Nodes ------------------------------------" << std::endl; - std::cout << "root_node_id=" << root_node_id << std::endl; + Debug("--- Nodes ------------------------------------"); + Debug("root_node_id="s + std::to_string(root_node_id)); for (const auto& node : nodes) { - std::cout << node.type << "(" << node.node_id << "):"; + std::string line{"("s + std::to_string(node.node_id) + "):"s}; for (const auto& child : node.child_ids) { - std::cout << " "; + line += " "s; if (ChildIdIsToken(child)) - std::cout << "t" << TokenIdFromChildId(child); + 
line += "t"s + std::to_string(TokenIdFromChildId(child)); else - std::cout << child; + line += std::to_string(child); } - std::cout << std::endl; + Debug(line); } } - std::cout << "--- Tree -------------------------------------" << std::endl; + Debug("--- Tree -------------------------------------"); std::deque> todo{std::pair{static_cast(root_node_id), 0}}; // id, indent while (!todo.empty()) { auto [current_index, indent] {todo.front()}; todo.pop_front(); - std::cout << std::string(indent, ' '); + std::string line(indent, ' '); if (ChildIdIsToken(current_index)) { index_t token_id {TokenIdFromChildId(current_index)}; - std::cout << "Token(" << token_id << "): " << tokens[token_id].type << "(" << tokens[token_id].value << ")"; + line += "Token("s + std::to_string(token_id) + "): "s + tokens[token_id].type + "("s + tokens[token_id].value + ")"s; } else { auto& node {nodes[current_index]}; - std::cout << "Node(" << current_index << "): " << node.type << "/" << node.variant; + line += "Node("s + std::to_string(current_index) + "): "s + node.type + "/" + std::to_string(node.variant); auto child_ids{node.child_ids}; for (int i = 0; i < child_ids.size(); i++) { todo.insert(todo.begin() + i, std::pair{child_ids[i], indent + 1}); } if (node.alternatives.size()) { - std::cout << ", " << node.alternatives.size() << " alternatives available"; + line += ", "s + std::to_string(node.alternatives.size()) + " alternatives available"s; } } - std::cout << std::endl; - + Debug(line); } } - std::cout << "==============================================" << std::endl; + Debug("=============================================="); } bool Compiler::AllTokensUsed() const @@ -240,7 +248,7 @@ bool Compiler::AddRootNode() return false; // now add! - std::cout << "AddRootNode(): Adding " << node_type << std::endl; + Debug("AddRootNode(): Adding "s + node_type); nodes[old_root_node_id].parent_node_id = new_root_node_id; root_node_id = new_root_node_id; nodes.emplace_back(TreeNode{root_node_id, root_node_id, node_type, node_variant, alternatives, child_ids}); @@ -258,7 +266,6 @@ void Compiler::removeTokensUpTo(index_t token_id) void Compiler::removeTokensUpTo(index_t token_id, index_t node_id) { - // std::cout << "DEBUG " << token_id << " " << tokens_used << std::endl; // token_id should be the new tokens_used if (token_id < tokens_used) { @@ -267,7 +274,7 @@ void Compiler::removeTokensUpTo(index_t token_id, index_t node_id) // remove relevant tokens from end while (token_id < tokens_used && child_ids.size() && ChildIdIsToken(child_ids.back()) && TokenIdFromChildId(child_ids.back()) >= token_id) { - std::cout << "Removing token " << TokenIdFromChildId(child_ids.back()) << std::endl; + Debug("Removing token "s + std::to_string(TokenIdFromChildId(child_ids.back()))); child_ids.pop_back(); if (tokens_used > 0) tokens_used--; @@ -301,7 +308,7 @@ void Compiler::RemoveLastNode() nodes[node.child_ids[0]].parent_node_id = node.child_ids[0]; root_node_id = node.child_ids[0]; } - std::cout << "Removing root node " << nodes.back().type << "(" << nodes.back().node_id << ")" << std::endl; + Debug("Removing root node "s + nodes.back().type + "("s + std::to_string(nodes.back().node_id) + ")"s); nodes.pop_back(); } else { DumpTree(); @@ -313,7 +320,7 @@ void Compiler::RemoveLastNode() if (parent.child_ids.empty() || parent.child_ids.back() != node_id) throw std::runtime_error("ICE: Backtrack: Bad child nodes"); parent.child_ids.pop_back(); - std::cout << "Removing " << nodes.back().type << "(" << nodes.back().node_id << ")" << std::endl; + 
Debug("Removing "s + nodes.back().type + "("s + std::to_string(nodes.back().node_id) + ")"s); nodes.pop_back(); } else if (ChildIdIsToken(node.child_ids.back())) { removeTokensUpTo(TokenIdFromChildId(node.child_ids.back())); @@ -435,7 +442,7 @@ index_t Compiler::AddNode(const std::string& child_type, index_t parent_index) nodes.emplace_back(TreeNode{parent_index, index, child_type, variant, alternatives, std::vector{}}); //root stays, tokens_used stays - std::cout << "AddNode(): " << parent.type << "->" << child_type << ": "<< index << std::endl; + Debug("AddNode(): "s + parent.type + "->"s + child_type + ": "s + std::to_string(index)); DumpTree(); return index; @@ -450,7 +457,7 @@ void Compiler::AddPath(const std::vector& path, index_t current_ind nodes.back().child_ids.emplace_back(ChildIdFromTokenId(tokens_used)); tokens_used++; - std::cout << "AddPath(): token " << tokens.back().type << std::endl; + Debug("AddPath(): token "s + tokens.back().type); DumpTree(); } @@ -470,7 +477,7 @@ bool Compiler::FillTree() if (next_child == tokens[tokens_used].type) { // add token directly node.child_ids.push_back(ChildIdFromTokenId(tokens_used)); tokens_used++; - std::cout << "tokens_used++: " << tokens_used << std::endl; + Debug("tokens_used++: "s + std::to_string(tokens_used)); DumpTree(); } else { // add inner nodes auto list = GetPath(next_child, tokens[tokens_used].type); -- cgit v1.2.3