summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRoland Reichwein <mail@reichwein.it>2020-01-29 21:52:35 +0100
committerRoland Reichwein <mail@reichwein.it>2020-01-29 21:52:35 +0100
commit1f3f7c2693686d847bf1a9bb3e47a023fe9d7992 (patch)
tree032ee62d5c19b54f64b6fcb3a1aa941d2dc31c62
parentb97f6b86b85553acd3863ee18a67b8868e0ea7b4 (diff)
New algo
-rw-r--r--TODO6
-rw-r--r--grammer.cpp468
-rw-r--r--grammer.h65
-rw-r--r--test-lexer.cpp2
4 files changed, 328 insertions, 213 deletions
diff --git a/TODO b/TODO
index e69de29..a1d7b33 100644
--- a/TODO
+++ b/TODO
@@ -0,0 +1,6 @@
+
+ Token() = default;
+ Token(const std::string& s) { type = s; } // Assign type via "=" from string
+
+start symbol implicitly from bnf
+validate bnf: no empty types or values, or empty target lists
diff --git a/grammer.cpp b/grammer.cpp
index 25fe837..8f38e3e 100644
--- a/grammer.cpp
+++ b/grammer.cpp
@@ -1,241 +1,344 @@
#include "grammer.h"
+#include <algorithm>
+
using namespace Gram;
-void Tree::clear() {
+void Compiler::clear() {
nodes.clear();
- root = 0;
- last = 0;
- node_num = 0;
+ root_node_id = 0;
+
+ tokens_used = 0;
}
-bool Tree::Valid(const std::string& Top) const {
+std::string Compiler::GetTypeOfNode(index_t node_id) const
+{
+ return nodes[node_id].type;
+}
+
+bool Compiler::IsRootNode(index_t node_id) const
+{
+ auto node& {nodes[node_id]};
+ return node.parent_node_id == node.node_id;
+}
+
+void Compiler::Validate() const {
// A program is non empty
- if (node_num == 0)
- return false;
+ if (nodes.size() == 0)
+ throw std::runtime_error("");
- // Start symbol on top
- auto rootNode{nodes.find(root)};
- if (rootNode == nodes.end())
- throw std::runtime_error("Node not found: "s + std::to_string(root));
+ // Consistency check for nodes
+ if (root_node_id >= nodes.size())
+ throw std::runtime_error("Bad root node: "s + std::to_string(root_node_id) + " vs. "s + std::to_string(nodes.size()));
- if (rootNode->second.token.type != Top)
- return false;
+ // Start symbol on top
+ if (GetTypeOfNode(root_node_id) != Top)
+ throw std::runtime_error("Root node not start symbol!");
+
+ // All nodes filled
+ for (const auto& node: nodes) {
+ if (node.child_ids.size() != bnf[node.type][node.variant].size())
+ throw std::runtime_error("Node not filled: "s + node.type + "["s + std::to_string(node.variant) + "]"s);
+ }
+}
- // All nodes filled (implies all leaves are terminal)
- for (const auto& [index, node]: nodes) {
- if (node.childs.size() < node.child_types.size())
- return false; // node not filled
+void Compiler::DumpTree()
+{
+ std::cout << "=== Tree =====================================" << std::endl;
+ for (const auto& i : nodes) {
+ std::cout << i.type << std::endl;
}
+}
- return true;
+bool RootIsStartSymbol()
+{
+ return GetTypeOfNode(root_node_id) == Top;
}
-bool Tree::AddFirstNode(const Token& token, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) {
- node_num ++;
- root = node_num;
- last = node_num;
+bool ChildIdIsToken(int32_t child_id)
+{
+ return child_i < 0;
+}
- auto reverseRule{reverseBNF.find(token.type)};
- if (reverseRule == reverseBNF.end())
- throw std::runtime_error("Reverse rule not found for "s + token.type);
+index_t TokenIdFromChildId(int32_t child_id)
+{
+ return index_t(-child_id) - 1;
+}
- auto rule{bnf.find(token.type)};
- if (rule != bnf.end()) { // multiple variants!
- throw std::runtime_error("BNF rule for terminal symbol "s + token.type + " found."s);
+int32_t ChildIdFromTokenId(index_t token_id)
+{
+ return -1 - int32_t(token_id);
+}
+
+bool AllTokensUsed()
+{
+ return tokens_used == tokens.size();
+}
+
+bool Compiler::treeIsComplete()
+{
+ return RootIsStartSymbol() && AllTokensUsed();
+}
+
+std::vector<std::string>& Compiler::getNodeExpectedChilds(node_id)
+{
+ std::string& type = nodes[node_id].type;
+ index_t& variant = nodes[node_id].variant;
+ return bnf[type][variant];
+}
+
+// returns true if all childs are filled, recursively. Else false with to_fill as hint to node to fill
+bool Compiler::subTreeIsComplete(index_t node_id, index_t& to_fill)
+{
+ for (const auto& i : nodes[node_id].child_ids) {
+ if (!ChildIdIsToken(i)) { // consider subtrees
+ if (!subTreeIsComplete(i, to_fill))
+ return false; // found incomplete subtree -> return it!
+ }
}
- nodes.emplace(root, TreeNode{0, std::vector<index_t>{}, std::vector<std::string>{}, token});
+
+ if (nodes[node_id].child_ids.size() < getNodeExpectedChilds(node_id).size()) {
+ to_fill = node_id;
+ return false;
+ }
+
return true;
}
-std::vector<TreeNode> Tree::getParentTreeNode(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) {
- std::vector<TreeNode> result; // default: empty
+bool Compiler::StartsWith(const std::vector<Token>& tokens, const std::vector<std::string>& types)
+{
+ if (tokens.size() < types.size())
+ return false;
- auto& root_name {nodes[root].token.type};
- auto bnfParents {reverseBNF.find(root_name)};
- if (bnfParents == reverseBNF.end())
- return result;
+ auto [tokens_it, types_it] = std::mismatch(tokens.begin(), tokens.end(), types.begin(), types.end(), [](const Token& token, const std::string& s){ return token.type == s; });
+ return types_it == types.end(); // no mismatch: tokens (types) start wit specified types list
+}
- for (const auto& parent_node_name : bnfParents->second) {
- auto lists {bnf.at(parent_node_name)};
- for (const auto& list : lists) {
- if (list.size() > 0 && list[0] == root_name) {
- TreeNode node{0, std::vector<index_t>{root}, list, Token{parent_node_name}};
- result.push_back(node);
+void Compiler::AddFirstNode()
+{
+ root_node_id = 0;
+
+ const std::string& child_type = tokens[0].type;
+ auto it = ReverseBNF.find(child_type);
+ if (it == ReverseBNF.end())
+ throw std::runtime_error("Type not found: "s + child_type + " ("s + tokens[0].value + ")"s);
+
+ std::set<std::string>& alternatives_set {it->second};
+
+ std::string node_type;
+ index_t node_variant;
+ std::deque<std::string> alternatives; // only for valid elements from alternatives_set
+ std::vector<index_t> child_ids;
+
+ for (const auto& type : alternatives_set) {
+ const auto& variants{bnf[type]};
+ for (int i = 0; i < variants.size(); i++) {
+ const std::vector<std::string> & variant{variants[i]};
+ if (StartsWith(tokens, variant)) { // match
+ if (node_type == "") {
+ node_type = type;
+ node_variant = i;
+ for (int token_id = 0; token_id < variant.size())
+ child_ids.push_back(ChildIdFromTokenId(token_id));
+ } else
+ alternatives.push_back(type); // duplicates possible: variants of same type!
}
}
}
- return result;
+ if (node_type == "") // no matching type found
+ throw std::runtime_error("No matching first node found.");
+
+ nodes.emplace_back({0, 0, node_type, node_variant, alternatives, child_ids});
+
+ tokens_used = child_ids.size();
}
-index_t Tree::GetLast() {
- index_t result {root};
+bool Compiler::AddRootNode()
+{
+ if (nodes.size() == 0) {
+ AddFirstNode();
+ } else {
+ const std::string& child_type = nodes[root_node_id].type; // starting at old root
+ auto it = ReverseBNF.find(child_type);
+ if (it == ReverseBNF.end()) // this one doesn't have a parent, maybe a start symbol to discard?
+ return false;
+
+ index_t old_root_node_id {root_node_id};
+ index_t new_root_node_id {nodes.size()};
+ nodes[root_node_id].parent_node_id = new_root_node_id;
+
+ std::set<std::string>& alternatives_set {it->second};
+
+ std::string node_type;
+ index_t node_variant;
+ std::deque<std::string> alternatives; // only for valid elements from alternatives_set
+ std::vector<index_t> child_ids{1, old_root_node_id};
+
+ for (const auto& type : alternatives_set) {
+ const auto& variants{bnf[type]};
+ for (int i = 0; i < variants.size(); i++) {
+ const std::vector<std::string> & variant{variants[i]};
+ if (child_type == variant[0]) {
+ if (node_type == "") {
+ node_type = type;
+ node_variant = i;
+ } else
+ alternatives.push_back(type); // duplicates possible: variants of same type
+ }
+ }
+ }
- while(result != 0 && nodes[result].childs.size() >= 2) {
- result = nodes[result].childs[nodes[result].childs.size() - 1];
+ if (node_type == "") // no matching type found
+ return false;
+
+ // now add!
+ root_node_id = new_root_node_id;
+ nodes.emplace_back({root_node_id, root_node_id, node_type, node_variant, alternatives, child_ids});
+ // keep tokens_used as is
}
- return result;
+ return true;
}
-void Tree::AddRootNode(const TreeNode& newRootNode) {
- node_num++;
- nodes[node_num] = newRootNode;
- root = node_num;
- last = node_num;
+void RemoveLastNode()
+{
+ TreeNode& node {nodes.back()};
+ index_t node_id = node.node_id;
+
+ if (node_id == root_node_id) { // No parent -> remove root
+ if (node.child_ids().empty()) { // No children -> now empty
+ clear();
+ } else if (node.child_ids().size() == 1) { // One child: removing possible
+ root_node_id = node.child_ids()[0];
+ nodes.pop_back();
+ } else
+ throw std::runtime_error("Backtrack not possible: Root not empty"); // ICE
+ } else if (node.child_ids().empty()) { // No children -> remove leaf
+ // We have a parent, otherwise we would have taken previous branch
+ TreeNode& parent {nodes[node.parent_node_id]};
+ if (parent.child_ids().empty() || parent.child_ids().last() != node_id)
+ throw std::runtime_error("Backtrack: Bad child nodes"); // ICE
+ parent.childs_ids().pop_back();
+ nodes.pop_back();
+ } else { // In the middle
+ throw std::runtime_error("Backtrack in the middle of the tree."); // ICE
+ }
}
-void Tree::RemoveRootNode() {
- root = nodes[root].childs[0];
- nodes.erase(node_num);
- node_num--;
- last = GetLast();
+// Change type of last node according to alternatives
+void ChangeNodeType()
+{
+ TreeNode& node {nodes.back()};
+ index_t node_id = node.node_id;
+
+ if (node.alternative_types.empty())
+ throw std::runtime_error("No alternatives found during Backtrack"); // ICE
+
+ if (node.alternative_types.front() == node.type) { // Keep type, change variant
+ if (root_node_id == node_id) { // Root node
+ // TODO ...
+ } else if (node.child_ids().empty()) { // leaf node
+ } else
+ throw std::runtime_error("Backtrack: Can't set alternative in the middle of the tree.");
+ } else { // Different type
+ if (root_node_id == node_id) { // Root node
+ } else if (node.child_ids().empty()) { // leaf node
+ } else
+ throw std::runtime_error("Backtrack: Can't set alternative in the middle of the tree.");
+ }
+}
+
+// throws if no further track back is possible: compile error
+void Compiler::TrackBack()
+{
+ // Search backwards for alternatives: last node with alternatives (variant or alt. token)
+ while (!nodes.empty() && nodes.last().alternative_types.empty()) {
+ RemoveLastNode();
+ }
+
+ if (nodes.empty()) {
+ throw std::runtime_error("Compile error: Invalid program.");
+ }
+
+ ChangeNodeType();
}
-// Path from leaf to root
-std::vector<std::string> Tree::GetPath(std::string a, std::string b, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) {
+// returns list from lower (including) to upper (excluding)
+// returns empty list on fail
+std::vector<std::string> Compiler::GetPath(std::string upper, std::string lower) {
std::vector<std::string> result;
- while (a != b) {
- auto parents {reverseBNF.find(a)};
- if (parents == reverseBNF.end())
+ while (lower != upper) {
+ auto parents {ReverseBNF.find(lower)};
+ if (parents == ReverseBNF.end())
return {};
+ std::string new_lower;
bool hit{false};
for (const auto& parent : parents->second) {
for (const auto& list : bnf.at(parent)) {
- if (list.size() > 0 && list[0] == a) {
+ if (list.size() > 0 && list[0] == lower) {
if (!hit) {
- result.push_back(a);
- a = parent;
+ result.push_back(lower);
+ new_lower = parent;
hit = true;
} else
- throw std::runtime_error("Double match for "s + parent + "/"s + a);
+ throw std::runtime_error("Double match for "s + lower + ": "s + parent + ", "s + new_lower); // TODO: also get alternatives at each step
}
}
}
- }
- if (a == b) {
- result.push_back(a);
+ lower = new_lower;
}
return result;
}
-index_t Tree::AddNode(const std::string& name, const std::string& child_name, index_t parent_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF)
+index_t Compiler::AddNode(const std::string& name, const std::string& child_type, index_t parent_index)
{
TreeNode& parent {nodes[parent_index]};
- node_num++;
- index_t index = node_num;
- parent.childs.push_back(index);
- std::vector<std::string> child_types;
- auto rule {bnf.find(name)};
- if (rule != bnf.end()) {
- for (auto& list : rule->second) {
- if (list.size() > 0 && list[0] == child_name)
- child_types = list;
- }
- }
- nodes.emplace(index, TreeNode{parent_index, {}, child_types, Token{name}});
- //root stays
- last = GetLast();
+ index_t index = nodes.size();
+ parent.child_ids.push_back(index);
- return index;
-}
+ index_t variant;
+ std::deque<std::string> alternatives;
-void Tree::AddPath(const std::vector<std::string>& path, index_t current_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) {
- for (int i = path.size() - 1; i >= 0; i--) {
- std::string child_name;
- if (i > 0)
- child_name = path[i - 1];
- current_index = AddNode(path[i], child_name, current_index, bnf, reverseBNF);
+ const auto& lists { bnf[parent.type] };
+ for (int i = 0; i < lists.size(); i++) { // variants
+ if (lists[i].size() > 0 && lists[i][0] == child_type)
+ variant = i;
}
-}
-// try to add Node to tree
-bool Tree::Add(const Token& token, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) {
- if (nodes.empty()) { // first node
- return AddFirstNode(token, bnf, reverseBNF);
- } else { // at least one character is already present
- // Traverse tree until partially filled node found
- // or new node can be added
- index_t current_index{last};
-
- while (current_index != 0) {
- TreeNode& node {nodes[current_index]};
- if (node.childs.size() < node.child_types.size()) { // partially filled node
- std::vector<std::string> list = GetPath(token.type, node.child_types[node.childs.size()], bnf, reverseBNF);
- if (list.size() > 0) {
- AddPath(list, current_index, bnf, reverseBNF);
- return true;
- } else {
- return false; // The path a->b is not available via bnf
- }
- }
- current_index = node.parent;
- }
-
- // Add node at root
-
- std::vector<TreeNode> parent_nodes = getParentTreeNode(bnf, reverseBNF);
- if (parent_nodes.size() == 0)
- throw std::runtime_error("Couldn't add new parent node for "s + nodes[root].token.type + "("s + std::to_string(root) + ")"s);
+ nodes.emplace_back({parent_index, index, child_type, variant, alternatives, {}});
+ //root stays, tokens_used stays
- for (const auto &i : parent_nodes) {
- AddRootNode(i);
- if (Add(token, bnf, reverseBNF))
- return true;
- RemoveRootNode();
- }
-
- }
- return false;
+ return index;
}
-// add path to start symbol
-void Tree::Resolve(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) {
- if (nodes.empty()) // only handle non-empty trees
- return;
-
- while (true) {
- std::string& old_root_name { nodes[root].token.type }; // current root node type
-
- auto parents {reverseBNF.find(old_root_name)};
- if (parents != reverseBNF.end()) { // parents in bnf available
- bool hit{false};
- for (auto& parent : parents->second) {
- for (const auto& list : bnf.at(parent)) {
- if (list.size() == 1 && list[0] == old_root_name) {
- if (!hit) {
- // Add new TreeNode in the direction to root:
- // New root with 1 child (old root)
- nodes.emplace(++node_num,
- TreeNode{0, // parent
- std::vector<index_t>{root}, // child indices
- std::vector<std::string>{old_root_name}, // child types
- Token{parent}
- });
- nodes[root].parent = node_num;
- root = node_num;
- // this->last stays
- hit = true;
- } else
- throw std::runtime_error("Error: Multiple resolve nodes for "s + old_root_name);
- }
- }
- }
- if (!hit)
- break;
- } else
- break;
+void Compiler::AddPath(const std::vector<std::string>& path, index_t current_index) {
+ for (int i = path.size() - 1; i > 0; i--) {
+ std::string child_name = path[i - 1];
+ current_index = AddNode(path[i], child_name, current_index);
}
+
+ nodes.back().child_ids.emplace_back(ChildIdFromTokenId(tokens_used));
+ tokens_used++;
}
-void Tree::Dump()
+bool Compiler::FillTree()
{
- std::cout << "=== Tree =====================================" << std::endl;
- for (const auto& i : nodes) {
- std::cout << i.second.token.type << " (" << i.second.token.value << ")" << std::endl;
+ if (nodes.size() == 0) // ignore empty tree, successfully
+ return true;
+
+ index_t to_fill;
+
+ while (!subTreeIsComplete(root_node_id, to_fill)) {
+ auto list = GetPath(nodes[to_fill].type, tokens[tokens_used]);
+ if (list.size() > 0) {
+ AddPath(list, to_fill);
+ return true;
+ } else {
+ return false;
+ }
}
}
@@ -245,22 +348,23 @@ Compiler::Compiler(const BNF& bnf, const std::string& Top): bnf(bnf), Top(Top),
Tree Compiler::compile(std::vector<Token> Tokens)
{
- if (Tokens.size()){
- } else
- throw std::runtime_error("No tokens!");
+ clear();
+ tokens = Tokens;
- Tree tree;
+ if (tokens.size() == 0)
+ throw std::runtime_error("No tokens!");
- for (const Token& i : Tokens) {
- if (!tree.Add(i, bnf, ReverseBNF)) {
- //tree.Dump();
- throw std::runtime_error("Compile error: Invalid token "s + i.type);
- }
+ while (!treeIsComplete()) {
+ if (!FillTree())
+ TrackBack();
+ else if (!AddRootNode())
+ TrackBack();
+ else if (!FillTree())
+ TrackBack();
}
- tree.Resolve(bnf, ReverseBNF);
- if (!tree.Valid(Top))
- throw std::runtime_error("Compile error: Program incomplete");
+ Validate();
return tree;
}
+
diff --git a/grammer.h b/grammer.h
index 87aa1ad..18719cd 100644
--- a/grammer.h
+++ b/grammer.h
@@ -3,51 +3,58 @@
#include "bnf.h"
#include "minicc.h"
+#include <cstdint>
+
namespace Gram {
+// TreeNodes are only intermediate. Terminal symbols don't get of TreeNodes
+// token_id: index into token list
+// node_id: index into tree node list
struct TreeNode {
- index_t parent{};
- std::vector<index_t> childs; // fill Token by Token
- std::vector<std::string> child_types; // fill always
- Token token;
-};
-
-class Tree {
-private:
- std::map<index_t, TreeNode> nodes; // index 0 = non existing; index starting at 1
- index_t node_num{};
- index_t root{};
- index_t last{};
+ index_t parent_node_id{}; // root if parent_node_id == node_id
+ index_t node_id{};
-public:
- void clear();
- bool Valid(const std::string& Top) const;
- bool AddFirstNode(const Token& token, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF);
- std::vector<TreeNode> getParentTreeNode(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF);
- index_t GetLast();
- void AddRootNode(const TreeNode& newRootNode);
- void RemoveRootNode();
- std::vector<std::string> GetPath(std::string a, std::string b, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF);
- index_t AddNode(const std::string& name, const std::string& child_name, index_t parent_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF);
- void AddPath(const std::vector<std::string>& path, index_t current_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF);
- bool Add(const Token& token, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF);
- void Resolve(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF);
-
- void Dump();
+ std::string type;
+ index_t variant; // bnf[type][variant]
+ std::deque<std::string> alternative_types; // alternatives that we can consider if type fails. Discard after parsing!
+ std::vector<int32_t> child_ids; // < 0: terminal: token_id; >= 0: non-terminal: node_id; fill token by token
};
class Compiler
{
private:
+ // The result
+ index_t root_node_id{};
+ std::vector<TreeNode> nodes;
+
+ // Input
+ std::vector<std::string> tokens;
+
+ // Helper data
+ index_t tokens_used{0}; // number of tokens used, this is also the next token index to use
+
const BNF &bnf;
const std::string& Top;
- std::map<std::string, std::set<std::string>> ReverseBNF;
+ std::map<std::string, std::set<std::string>> ReverseBNF; // possible parent types of a given type
public:
+ // Tree specific
+ void clear();
+
+ // Node specific
+ std::string GetTypeOfNode(index_t node_id) const;
+
+ index_t TrackBack();
+
+ void Validate(const std::string& Top, const BNF& bnf) const;
+
+ void Dump();
+
Compiler(const BNF& bnf, const std::string& Top);
- Tree compile(std::vector<Token> Tokens);
+
+ std::pair<index_t, std::vector<TreeNode>> compile(std::vector<Token> Tokens);
};
} // namespace Gram
diff --git a/test-lexer.cpp b/test-lexer.cpp
index e85fc5c..3a4ed2a 100644
--- a/test-lexer.cpp
+++ b/test-lexer.cpp
@@ -95,8 +95,6 @@ TEST_F(Test, BNF) {
Gram::Compiler compiler(bnf, Top);
auto Tree = compiler.compile(tokens);
- ASSERT_TRUE(Tree.Valid(Top));
-
Tree.Dump();
}