summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRoland Reichwein <mail@reichwein.it>2020-03-27 17:39:58 +0100
committerRoland Reichwein <mail@reichwein.it>2020-03-27 17:39:58 +0100
commit3057729f132d516dd9ed58c6964a495aa1c11c3d (patch)
tree7eca9a03fd08200868a5ab0c27e33170b9dc917a
parent5467147d9470ee294ddab938098c3ef172222066 (diff)
Top-down algo recognizes first C*+ program
-rw-r--r--cpp.cpp2
-rw-r--r--cpp.h2
-rw-r--r--grammer.cpp117
-rw-r--r--grammer.h21
-rw-r--r--test-cpp.cpp4
5 files changed, 114 insertions, 32 deletions
diff --git a/cpp.cpp b/cpp.cpp
index f59859d..75212a8 100644
--- a/cpp.cpp
+++ b/cpp.cpp
@@ -213,7 +213,7 @@ std::vector<Token> CPP::tokens_from_pptokens(std::vector<Token> pp_tokens)
}
// Phase 7.b: Grammar Analysis
-std::pair<index_t, std::vector<Gram::TreeNode>> CPP::analysis(std::vector<Token> tokens)
+std::vector<Gram::TreeNode> CPP::analysis(std::vector<Token> tokens)
{
auto bnf = SubBNF(CPPBNF::GetCppBNFGram(), "translation-unit");
diff --git a/cpp.h b/cpp.h
index 282c83d..316eaa3 100644
--- a/cpp.h
+++ b/cpp.h
@@ -22,7 +22,7 @@ void preprocess(); // phase 4
void execution_charset_map(); // phase 5
void concatenate_strings(); // phase 6
std::vector<Token> tokens_from_pptokens(std::vector<Token> pp_tokens); // phase 7.a
-std::pair<index_t, std::vector<Gram::TreeNode>> analysis(std::vector<Token>); // phase 7.b
+std::vector<Gram::TreeNode> analysis(std::vector<Token>); // phase 7.b
void translate(); // phase 7.c
void instantiate(); // phase 8
void link(); // phase 9
diff --git a/grammer.cpp b/grammer.cpp
index 3cc88fa..cd5b50f 100644
--- a/grammer.cpp
+++ b/grammer.cpp
@@ -15,8 +15,8 @@ void Debug(std::string s)
void Compiler::clear()
{
+ symbol_variants.clear();
nodes.clear();
- begin_pos = NodePosition{};
}
std::string Compiler::GetTypeOfNode(index_t node_id) const
@@ -26,11 +26,21 @@ std::string Compiler::GetTypeOfNode(index_t node_id) const
return nodes[node_id].type;
}
+bool Gram::ChildIdIsEmpty(int32_t child_id)
+{
+ return child_id == 0;
+}
+
bool Gram::ChildIdIsToken(int32_t child_id)
{
return child_id < 0;
}
+bool Gram::ChildIdIsNode(int32_t child_id)
+{
+ return child_id > 0;
+}
+
index_t Gram::TokenIdFromChildId(int32_t child_id)
{
return index_t(-child_id) - 1;
@@ -94,33 +104,53 @@ index_t Compiler::AddNode(const std::string& type, index_t variant, NodePosition
nodes[pos.node_id].child_ids[pos.child_pos] = node_id;
nodes.emplace_back(TreeNode{pos, node_id, type, variant, std::vector<int32_t>(size_t(list.size()), 0)});
- Debug("AddNode(): "s + nodes[pos.node_id].type + "->"s + type + ": "s + std::to_string(node_id));
- DumpTree();
-
return node_id;
}
-void Compiler::RemoveNode()
+Compiler::AddNodeGuard::AddNodeGuard(Compiler& compiler, index_t variant): m_compiler(compiler)
{
- auto& pos{nodes.back().pos};
-
- nodes[pos.node_id].child_ids[pos.child_pos] = 0;
- nodes.pop_back();
-}
-
-Compiler::AddNodeGuard::AddNodeGuard(Compiler& compiler, const std::string& type, index_t variant, NodePosition pos): m_compiler(compiler)
-{
- m_compiler.AddNode(type, variant, pos);
+ m_compiler.symbol_variants.push_back(variant);
}
Compiler::AddNodeGuard::~AddNodeGuard()
{
- m_compiler.RemoveNode();
+ m_compiler.symbol_variants.pop_back();
}
void Compiler::IncNodePosition(NodePosition& pos)
{
- // TODO
+ if (nodes.size() == 0) // special case: empty tree
+ return;
+ if (pos.node_id >= nodes.size())
+ throw std::runtime_error("ICE: NodePosition with node_id "s + std::to_string(pos.node_id) + " doesn't exist."s);
+ if (pos.child_pos >= nodes[pos.node_id].child_ids.size())
+ throw std::runtime_error("ICE: NodePosition with child_pos "s + std::to_string(pos.child_pos) + " in node_id "s + std::to_string(pos.node_id) + " doesn't exist."s);
+
+ int32_t child_id{nodes[pos.node_id].child_ids[pos.child_pos]};
+
+ if (ChildIdIsEmpty(child_id))
+ throw std::runtime_error("ICE: NodePosition is empty");
+
+ // Actually, advance
+ if (ChildIdIsToken(child_id)) {
+ pos.child_pos++;
+ } else {
+ pos.node_id = child_id;
+ pos.child_pos = 0;
+ }
+
+ // Go to parent if child ids completely traversed
+ while (pos.node_id > 0 && pos.child_pos >= nodes[pos.node_id].child_ids.size()) {
+ pos = nodes[pos.node_id].pos;
+ pos.child_pos++;
+ }
+
+ // Advancing at root node for last child is allowed: Finished
+ if (pos.child_pos >= nodes[pos.node_id].child_ids.size())
+ return;
+
+ if (ChildIdIsNode(nodes[pos.node_id].child_ids[pos.child_pos]))
+ throw std::runtime_error("ICE: No node expected at "s + std::to_string(pos.node_id) + "/"s + std::to_string(pos.child_pos));
}
size_t Compiler::minimumSymbolsNeeded(std::string symbol)
@@ -170,7 +200,6 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t
while (begin < end && symbol_list.size() > 0 && symbol_list.front() == tokens[begin].type) {
begin++;
symbol_list.erase(symbol_list.begin());
- IncNodePosition(begin_pos); // TODO: guard?
}
// match terminal symbols at end
@@ -181,8 +210,11 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t
// matching of empty lists
if (symbol_list.size() == 0) {
- if (begin == end) // only match real empty list
+ if (begin == end) { // only match real empty list
+ // this is the point of the final match
+ constructTree();
return true;
+ }
return false;
}
@@ -192,7 +224,7 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t
auto it{bnf.find(symbol_list.front())};
if (it != bnf.end()) {
for (size_t i = 0; i < it->second.size(); i++) { // iterate over alternatives
- //AddNodeGuard guard(*this, symbol_list.front(), i, begin_pos);
+ AddNodeGuard guard(*this, i);
std::vector<std::string> list {it->second[i]};
list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end());
if (minimumSymbolsNeeded(list) > end - begin) // stop recursion
@@ -214,6 +246,42 @@ bool Compiler::match(std::string symbol, size_t begin, size_t end)
return match(symbol_list, begin, end);
}
+void Compiler::constructTree()
+{
+ symbol_variants_it = symbol_variants.begin();
+
+ m_symbol_list = {m_top};
+ m_symbol_list_pos = 0;
+
+ NodePosition tree_pos;
+
+ while (m_symbol_list_pos < m_symbol_list.size()) {
+ std::string symbol{m_symbol_list[m_symbol_list_pos]};
+
+ if (isTerminal(bnf, symbol)) {
+ // Advance terminal symbol
+ nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = ChildIdFromTokenId(m_symbol_list_pos);
+ IncNodePosition(tree_pos);
+ m_symbol_list_pos++;
+ } else {
+ // Replace non-terminal symbol
+ m_symbol_list.erase(m_symbol_list.begin() + m_symbol_list_pos);
+ std::vector<std::string> list {bnf[symbol][*symbol_variants_it]};
+ m_symbol_list.insert(m_symbol_list.begin() + m_symbol_list_pos, list.begin(), list.end());
+
+ index_t node_id {AddNode(symbol, *symbol_variants_it, tree_pos)};
+
+ if (node_id > 0) {
+ nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = node_id;
+ IncNodePosition(tree_pos);
+ }
+
+ symbol_variants_it++;
+ }
+ }
+
+}
+
Compiler::Compiler(BNF& bnf, const std::string& top): bnf(bnf), m_top(top), ReverseBNF{Reverse(bnf)}, reversedFirst{reverseFirst(bnf)}
{
//std::cout << "DEBUG: " << m_top << std::endl;
@@ -234,7 +302,7 @@ Compiler::Compiler(BNF& bnf, const std::string& top): bnf(bnf), m_top(top), Reve
minimumSymbolsNeeded("translation-unit");
}
-std::pair<index_t, std::vector<TreeNode>> Compiler::compile(std::vector<Token> p_tokens)
+std::vector<TreeNode> Compiler::compile(std::vector<Token> p_tokens)
{
clear();
tokens = p_tokens;
@@ -243,11 +311,16 @@ std::pair<index_t, std::vector<TreeNode>> Compiler::compile(std::vector<Token> p
throw std::runtime_error("No tokens!");
//
- // top-down algorithm
+ // top-down algorithm:
+ //
+ // 1. Match linear tokens list to bnf, building up list of used variants (symbol_variants)
+ // 2. Construct Node Tree from symbol_variants
//
if (!match(m_top, 0, tokens.size()))
throw std::runtime_error("Compile error.");
- return std::pair<index_t, std::vector<TreeNode>>{0, nodes};
+ //DumpTree();
+
+ return nodes;
}
diff --git a/grammer.h b/grammer.h
index 39e6884..88cb7b7 100644
--- a/grammer.h
+++ b/grammer.h
@@ -4,6 +4,7 @@
#include "minicc.h"
#include <cstdint>
+#include <deque>
#include <unordered_set>
#include <utility>
@@ -16,7 +17,7 @@ struct NodePosition {
index_t child_pos{}; // 0-based
};
-// TreeNodes are only intermediate. Terminal symbols don't get of TreeNodes
+// TreeNodes are only intermediate. Terminal symbols don't get TreeNodes
// token_id: index into token list
// node_id: index into tree node list
struct TreeNode {
@@ -43,24 +44,24 @@ private:
std::unordered_map<std::string, std::set<std::string>> ReverseBNF; // possible parent types of a given type; unused now: remove?
std::unordered_map<std::string, std::set<std::string>> reversedFirst; // possible parent types of first childs of a given type
+ std::deque<index_t> symbol_variants;
+ decltype(symbol_variants)::iterator symbol_variants_it;
+
// Tree specific
void clear();
// Node specific
std::string GetTypeOfNode(index_t node_id) const;
index_t AddNode(const std::string& type, index_t variant, NodePosition pos = {});
- void RemoveNode();
- // Adds Node and Removes it on scope exit (RAII)
+ // Adds actually used Non-Terminal Symbol Removes it on scope exit (RAII)
class AddNodeGuard {
Compiler& m_compiler;
public:
- AddNodeGuard(Compiler& compiler, const std::string& type, index_t variant, NodePosition pos);
+ AddNodeGuard(Compiler& compiler, index_t variant);
~AddNodeGuard();
};
void IncNodePosition(NodePosition& pos);
- NodePosition begin_pos;
-
// top-down algorithm
std::unordered_map<std::string, size_t> m_min; // cache
size_t minimumSymbolsNeeded(std::string symbol);
@@ -68,15 +69,21 @@ private:
bool match(std::string symbol, size_t begin, size_t end);
bool match(std::vector<std::string> symbol_list, size_t begin, size_t end);
+ void constructTree();
+ std::vector<std::string> m_symbol_list;
+ index_t m_symbol_list_pos{};
+
public:
Compiler(BNF& bnf, const std::string& top);
- std::pair<index_t, std::vector<TreeNode>> compile(std::vector<Token> p_tokens);
+ std::vector<TreeNode> compile(std::vector<Token> p_tokens);
void DumpTree();
friend class ::GrammerTest;
};
+bool ChildIdIsEmpty(int32_t child_id);
bool ChildIdIsToken(int32_t child_id);
+bool ChildIdIsNode(int32_t child_id);
index_t TokenIdFromChildId(int32_t child_id);
int32_t ChildIdFromTokenId(index_t token_id);
diff --git a/test-cpp.cpp b/test-cpp.cpp
index 2a67b38..47a57f5 100644
--- a/test-cpp.cpp
+++ b/test-cpp.cpp
@@ -46,7 +46,9 @@ TEST_F(CppTest, preprocessing_tokenize) {
}
#endif
- auto result = cpp.analysis(tokens);
+ auto nodes = cpp.analysis(tokens);
+
+ ASSERT_EQ(nodes.size(), 44);
}
#endif