diff options
author | Roland Reichwein <mail@reichwein.it> | 2020-01-29 21:52:35 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2020-01-29 21:52:35 +0100 |
commit | 1f3f7c2693686d847bf1a9bb3e47a023fe9d7992 (patch) | |
tree | 032ee62d5c19b54f64b6fcb3a1aa941d2dc31c62 | |
parent | b97f6b86b85553acd3863ee18a67b8868e0ea7b4 (diff) |
New algo
-rw-r--r-- | TODO | 6 | ||||
-rw-r--r-- | grammer.cpp | 468 | ||||
-rw-r--r-- | grammer.h | 65 | ||||
-rw-r--r-- | test-lexer.cpp | 2 |
4 files changed, 328 insertions, 213 deletions
@@ -0,0 +1,6 @@ + + Token() = default; + Token(const std::string& s) { type = s; } // Assign type via "=" from string + +start symbol implicitly from bnf +validate bnf: no empty types or values, or empty target lists diff --git a/grammer.cpp b/grammer.cpp index 25fe837..8f38e3e 100644 --- a/grammer.cpp +++ b/grammer.cpp @@ -1,241 +1,344 @@ #include "grammer.h" +#include <algorithm> + using namespace Gram; -void Tree::clear() { +void Compiler::clear() { nodes.clear(); - root = 0; - last = 0; - node_num = 0; + root_node_id = 0; + + tokens_used = 0; } -bool Tree::Valid(const std::string& Top) const { +std::string Compiler::GetTypeOfNode(index_t node_id) const +{ + return nodes[node_id].type; +} + +bool Compiler::IsRootNode(index_t node_id) const +{ + auto node& {nodes[node_id]}; + return node.parent_node_id == node.node_id; +} + +void Compiler::Validate() const { // A program is non empty - if (node_num == 0) - return false; + if (nodes.size() == 0) + throw std::runtime_error(""); - // Start symbol on top - auto rootNode{nodes.find(root)}; - if (rootNode == nodes.end()) - throw std::runtime_error("Node not found: "s + std::to_string(root)); + // Consistency check for nodes + if (root_node_id >= nodes.size()) + throw std::runtime_error("Bad root node: "s + std::to_string(root_node_id) + " vs. "s + std::to_string(nodes.size())); - if (rootNode->second.token.type != Top) - return false; + // Start symbol on top + if (GetTypeOfNode(root_node_id) != Top) + throw std::runtime_error("Root node not start symbol!"); + + // All nodes filled + for (const auto& node: nodes) { + if (node.child_ids.size() != bnf[node.type][node.variant].size()) + throw std::runtime_error("Node not filled: "s + node.type + "["s + std::to_string(node.variant) + "]"s); + } +} - // All nodes filled (implies all leaves are terminal) - for (const auto& [index, node]: nodes) { - if (node.childs.size() < node.child_types.size()) - return false; // node not filled +void Compiler::DumpTree() +{ + std::cout << "=== Tree =====================================" << std::endl; + for (const auto& i : nodes) { + std::cout << i.type << std::endl; } +} - return true; +bool RootIsStartSymbol() +{ + return GetTypeOfNode(root_node_id) == Top; } -bool Tree::AddFirstNode(const Token& token, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { - node_num ++; - root = node_num; - last = node_num; +bool ChildIdIsToken(int32_t child_id) +{ + return child_i < 0; +} - auto reverseRule{reverseBNF.find(token.type)}; - if (reverseRule == reverseBNF.end()) - throw std::runtime_error("Reverse rule not found for "s + token.type); +index_t TokenIdFromChildId(int32_t child_id) +{ + return index_t(-child_id) - 1; +} - auto rule{bnf.find(token.type)}; - if (rule != bnf.end()) { // multiple variants! - throw std::runtime_error("BNF rule for terminal symbol "s + token.type + " found."s); +int32_t ChildIdFromTokenId(index_t token_id) +{ + return -1 - int32_t(token_id); +} + +bool AllTokensUsed() +{ + return tokens_used == tokens.size(); +} + +bool Compiler::treeIsComplete() +{ + return RootIsStartSymbol() && AllTokensUsed(); +} + +std::vector<std::string>& Compiler::getNodeExpectedChilds(node_id) +{ + std::string& type = nodes[node_id].type; + index_t& variant = nodes[node_id].variant; + return bnf[type][variant]; +} + +// returns true if all childs are filled, recursively. Else false with to_fill as hint to node to fill +bool Compiler::subTreeIsComplete(index_t node_id, index_t& to_fill) +{ + for (const auto& i : nodes[node_id].child_ids) { + if (!ChildIdIsToken(i)) { // consider subtrees + if (!subTreeIsComplete(i, to_fill)) + return false; // found incomplete subtree -> return it! + } } - nodes.emplace(root, TreeNode{0, std::vector<index_t>{}, std::vector<std::string>{}, token}); + + if (nodes[node_id].child_ids.size() < getNodeExpectedChilds(node_id).size()) { + to_fill = node_id; + return false; + } + return true; } -std::vector<TreeNode> Tree::getParentTreeNode(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { - std::vector<TreeNode> result; // default: empty +bool Compiler::StartsWith(const std::vector<Token>& tokens, const std::vector<std::string>& types) +{ + if (tokens.size() < types.size()) + return false; - auto& root_name {nodes[root].token.type}; - auto bnfParents {reverseBNF.find(root_name)}; - if (bnfParents == reverseBNF.end()) - return result; + auto [tokens_it, types_it] = std::mismatch(tokens.begin(), tokens.end(), types.begin(), types.end(), [](const Token& token, const std::string& s){ return token.type == s; }); + return types_it == types.end(); // no mismatch: tokens (types) start wit specified types list +} - for (const auto& parent_node_name : bnfParents->second) { - auto lists {bnf.at(parent_node_name)}; - for (const auto& list : lists) { - if (list.size() > 0 && list[0] == root_name) { - TreeNode node{0, std::vector<index_t>{root}, list, Token{parent_node_name}}; - result.push_back(node); +void Compiler::AddFirstNode() +{ + root_node_id = 0; + + const std::string& child_type = tokens[0].type; + auto it = ReverseBNF.find(child_type); + if (it == ReverseBNF.end()) + throw std::runtime_error("Type not found: "s + child_type + " ("s + tokens[0].value + ")"s); + + std::set<std::string>& alternatives_set {it->second}; + + std::string node_type; + index_t node_variant; + std::deque<std::string> alternatives; // only for valid elements from alternatives_set + std::vector<index_t> child_ids; + + for (const auto& type : alternatives_set) { + const auto& variants{bnf[type]}; + for (int i = 0; i < variants.size(); i++) { + const std::vector<std::string> & variant{variants[i]}; + if (StartsWith(tokens, variant)) { // match + if (node_type == "") { + node_type = type; + node_variant = i; + for (int token_id = 0; token_id < variant.size()) + child_ids.push_back(ChildIdFromTokenId(token_id)); + } else + alternatives.push_back(type); // duplicates possible: variants of same type! } } } - return result; + if (node_type == "") // no matching type found + throw std::runtime_error("No matching first node found."); + + nodes.emplace_back({0, 0, node_type, node_variant, alternatives, child_ids}); + + tokens_used = child_ids.size(); } -index_t Tree::GetLast() { - index_t result {root}; +bool Compiler::AddRootNode() +{ + if (nodes.size() == 0) { + AddFirstNode(); + } else { + const std::string& child_type = nodes[root_node_id].type; // starting at old root + auto it = ReverseBNF.find(child_type); + if (it == ReverseBNF.end()) // this one doesn't have a parent, maybe a start symbol to discard? + return false; + + index_t old_root_node_id {root_node_id}; + index_t new_root_node_id {nodes.size()}; + nodes[root_node_id].parent_node_id = new_root_node_id; + + std::set<std::string>& alternatives_set {it->second}; + + std::string node_type; + index_t node_variant; + std::deque<std::string> alternatives; // only for valid elements from alternatives_set + std::vector<index_t> child_ids{1, old_root_node_id}; + + for (const auto& type : alternatives_set) { + const auto& variants{bnf[type]}; + for (int i = 0; i < variants.size(); i++) { + const std::vector<std::string> & variant{variants[i]}; + if (child_type == variant[0]) { + if (node_type == "") { + node_type = type; + node_variant = i; + } else + alternatives.push_back(type); // duplicates possible: variants of same type + } + } + } - while(result != 0 && nodes[result].childs.size() >= 2) { - result = nodes[result].childs[nodes[result].childs.size() - 1]; + if (node_type == "") // no matching type found + return false; + + // now add! + root_node_id = new_root_node_id; + nodes.emplace_back({root_node_id, root_node_id, node_type, node_variant, alternatives, child_ids}); + // keep tokens_used as is } - return result; + return true; } -void Tree::AddRootNode(const TreeNode& newRootNode) { - node_num++; - nodes[node_num] = newRootNode; - root = node_num; - last = node_num; +void RemoveLastNode() +{ + TreeNode& node {nodes.back()}; + index_t node_id = node.node_id; + + if (node_id == root_node_id) { // No parent -> remove root + if (node.child_ids().empty()) { // No children -> now empty + clear(); + } else if (node.child_ids().size() == 1) { // One child: removing possible + root_node_id = node.child_ids()[0]; + nodes.pop_back(); + } else + throw std::runtime_error("Backtrack not possible: Root not empty"); // ICE + } else if (node.child_ids().empty()) { // No children -> remove leaf + // We have a parent, otherwise we would have taken previous branch + TreeNode& parent {nodes[node.parent_node_id]}; + if (parent.child_ids().empty() || parent.child_ids().last() != node_id) + throw std::runtime_error("Backtrack: Bad child nodes"); // ICE + parent.childs_ids().pop_back(); + nodes.pop_back(); + } else { // In the middle + throw std::runtime_error("Backtrack in the middle of the tree."); // ICE + } } -void Tree::RemoveRootNode() { - root = nodes[root].childs[0]; - nodes.erase(node_num); - node_num--; - last = GetLast(); +// Change type of last node according to alternatives +void ChangeNodeType() +{ + TreeNode& node {nodes.back()}; + index_t node_id = node.node_id; + + if (node.alternative_types.empty()) + throw std::runtime_error("No alternatives found during Backtrack"); // ICE + + if (node.alternative_types.front() == node.type) { // Keep type, change variant + if (root_node_id == node_id) { // Root node + // TODO ... + } else if (node.child_ids().empty()) { // leaf node + } else + throw std::runtime_error("Backtrack: Can't set alternative in the middle of the tree."); + } else { // Different type + if (root_node_id == node_id) { // Root node + } else if (node.child_ids().empty()) { // leaf node + } else + throw std::runtime_error("Backtrack: Can't set alternative in the middle of the tree."); + } +} + +// throws if no further track back is possible: compile error +void Compiler::TrackBack() +{ + // Search backwards for alternatives: last node with alternatives (variant or alt. token) + while (!nodes.empty() && nodes.last().alternative_types.empty()) { + RemoveLastNode(); + } + + if (nodes.empty()) { + throw std::runtime_error("Compile error: Invalid program."); + } + + ChangeNodeType(); } -// Path from leaf to root -std::vector<std::string> Tree::GetPath(std::string a, std::string b, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { +// returns list from lower (including) to upper (excluding) +// returns empty list on fail +std::vector<std::string> Compiler::GetPath(std::string upper, std::string lower) { std::vector<std::string> result; - while (a != b) { - auto parents {reverseBNF.find(a)}; - if (parents == reverseBNF.end()) + while (lower != upper) { + auto parents {ReverseBNF.find(lower)}; + if (parents == ReverseBNF.end()) return {}; + std::string new_lower; bool hit{false}; for (const auto& parent : parents->second) { for (const auto& list : bnf.at(parent)) { - if (list.size() > 0 && list[0] == a) { + if (list.size() > 0 && list[0] == lower) { if (!hit) { - result.push_back(a); - a = parent; + result.push_back(lower); + new_lower = parent; hit = true; } else - throw std::runtime_error("Double match for "s + parent + "/"s + a); + throw std::runtime_error("Double match for "s + lower + ": "s + parent + ", "s + new_lower); // TODO: also get alternatives at each step } } } - } - if (a == b) { - result.push_back(a); + lower = new_lower; } return result; } -index_t Tree::AddNode(const std::string& name, const std::string& child_name, index_t parent_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) +index_t Compiler::AddNode(const std::string& name, const std::string& child_type, index_t parent_index) { TreeNode& parent {nodes[parent_index]}; - node_num++; - index_t index = node_num; - parent.childs.push_back(index); - std::vector<std::string> child_types; - auto rule {bnf.find(name)}; - if (rule != bnf.end()) { - for (auto& list : rule->second) { - if (list.size() > 0 && list[0] == child_name) - child_types = list; - } - } - nodes.emplace(index, TreeNode{parent_index, {}, child_types, Token{name}}); - //root stays - last = GetLast(); + index_t index = nodes.size(); + parent.child_ids.push_back(index); - return index; -} + index_t variant; + std::deque<std::string> alternatives; -void Tree::AddPath(const std::vector<std::string>& path, index_t current_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { - for (int i = path.size() - 1; i >= 0; i--) { - std::string child_name; - if (i > 0) - child_name = path[i - 1]; - current_index = AddNode(path[i], child_name, current_index, bnf, reverseBNF); + const auto& lists { bnf[parent.type] }; + for (int i = 0; i < lists.size(); i++) { // variants + if (lists[i].size() > 0 && lists[i][0] == child_type) + variant = i; } -} -// try to add Node to tree -bool Tree::Add(const Token& token, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { - if (nodes.empty()) { // first node - return AddFirstNode(token, bnf, reverseBNF); - } else { // at least one character is already present - // Traverse tree until partially filled node found - // or new node can be added - index_t current_index{last}; - - while (current_index != 0) { - TreeNode& node {nodes[current_index]}; - if (node.childs.size() < node.child_types.size()) { // partially filled node - std::vector<std::string> list = GetPath(token.type, node.child_types[node.childs.size()], bnf, reverseBNF); - if (list.size() > 0) { - AddPath(list, current_index, bnf, reverseBNF); - return true; - } else { - return false; // The path a->b is not available via bnf - } - } - current_index = node.parent; - } - - // Add node at root - - std::vector<TreeNode> parent_nodes = getParentTreeNode(bnf, reverseBNF); - if (parent_nodes.size() == 0) - throw std::runtime_error("Couldn't add new parent node for "s + nodes[root].token.type + "("s + std::to_string(root) + ")"s); + nodes.emplace_back({parent_index, index, child_type, variant, alternatives, {}}); + //root stays, tokens_used stays - for (const auto &i : parent_nodes) { - AddRootNode(i); - if (Add(token, bnf, reverseBNF)) - return true; - RemoveRootNode(); - } - - } - return false; + return index; } -// add path to start symbol -void Tree::Resolve(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { - if (nodes.empty()) // only handle non-empty trees - return; - - while (true) { - std::string& old_root_name { nodes[root].token.type }; // current root node type - - auto parents {reverseBNF.find(old_root_name)}; - if (parents != reverseBNF.end()) { // parents in bnf available - bool hit{false}; - for (auto& parent : parents->second) { - for (const auto& list : bnf.at(parent)) { - if (list.size() == 1 && list[0] == old_root_name) { - if (!hit) { - // Add new TreeNode in the direction to root: - // New root with 1 child (old root) - nodes.emplace(++node_num, - TreeNode{0, // parent - std::vector<index_t>{root}, // child indices - std::vector<std::string>{old_root_name}, // child types - Token{parent} - }); - nodes[root].parent = node_num; - root = node_num; - // this->last stays - hit = true; - } else - throw std::runtime_error("Error: Multiple resolve nodes for "s + old_root_name); - } - } - } - if (!hit) - break; - } else - break; +void Compiler::AddPath(const std::vector<std::string>& path, index_t current_index) { + for (int i = path.size() - 1; i > 0; i--) { + std::string child_name = path[i - 1]; + current_index = AddNode(path[i], child_name, current_index); } + + nodes.back().child_ids.emplace_back(ChildIdFromTokenId(tokens_used)); + tokens_used++; } -void Tree::Dump() +bool Compiler::FillTree() { - std::cout << "=== Tree =====================================" << std::endl; - for (const auto& i : nodes) { - std::cout << i.second.token.type << " (" << i.second.token.value << ")" << std::endl; + if (nodes.size() == 0) // ignore empty tree, successfully + return true; + + index_t to_fill; + + while (!subTreeIsComplete(root_node_id, to_fill)) { + auto list = GetPath(nodes[to_fill].type, tokens[tokens_used]); + if (list.size() > 0) { + AddPath(list, to_fill); + return true; + } else { + return false; + } } } @@ -245,22 +348,23 @@ Compiler::Compiler(const BNF& bnf, const std::string& Top): bnf(bnf), Top(Top), Tree Compiler::compile(std::vector<Token> Tokens) { - if (Tokens.size()){ - } else - throw std::runtime_error("No tokens!"); + clear(); + tokens = Tokens; - Tree tree; + if (tokens.size() == 0) + throw std::runtime_error("No tokens!"); - for (const Token& i : Tokens) { - if (!tree.Add(i, bnf, ReverseBNF)) { - //tree.Dump(); - throw std::runtime_error("Compile error: Invalid token "s + i.type); - } + while (!treeIsComplete()) { + if (!FillTree()) + TrackBack(); + else if (!AddRootNode()) + TrackBack(); + else if (!FillTree()) + TrackBack(); } - tree.Resolve(bnf, ReverseBNF); - if (!tree.Valid(Top)) - throw std::runtime_error("Compile error: Program incomplete"); + Validate(); return tree; } + @@ -3,51 +3,58 @@ #include "bnf.h" #include "minicc.h" +#include <cstdint> + namespace Gram { +// TreeNodes are only intermediate. Terminal symbols don't get of TreeNodes +// token_id: index into token list +// node_id: index into tree node list struct TreeNode { - index_t parent{}; - std::vector<index_t> childs; // fill Token by Token - std::vector<std::string> child_types; // fill always - Token token; -}; - -class Tree { -private: - std::map<index_t, TreeNode> nodes; // index 0 = non existing; index starting at 1 - index_t node_num{}; - index_t root{}; - index_t last{}; + index_t parent_node_id{}; // root if parent_node_id == node_id + index_t node_id{}; -public: - void clear(); - bool Valid(const std::string& Top) const; - bool AddFirstNode(const Token& token, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - std::vector<TreeNode> getParentTreeNode(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - index_t GetLast(); - void AddRootNode(const TreeNode& newRootNode); - void RemoveRootNode(); - std::vector<std::string> GetPath(std::string a, std::string b, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - index_t AddNode(const std::string& name, const std::string& child_name, index_t parent_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - void AddPath(const std::vector<std::string>& path, index_t current_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - bool Add(const Token& token, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - void Resolve(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - - void Dump(); + std::string type; + index_t variant; // bnf[type][variant] + std::deque<std::string> alternative_types; // alternatives that we can consider if type fails. Discard after parsing! + std::vector<int32_t> child_ids; // < 0: terminal: token_id; >= 0: non-terminal: node_id; fill token by token }; class Compiler { private: + // The result + index_t root_node_id{}; + std::vector<TreeNode> nodes; + + // Input + std::vector<std::string> tokens; + + // Helper data + index_t tokens_used{0}; // number of tokens used, this is also the next token index to use + const BNF &bnf; const std::string& Top; - std::map<std::string, std::set<std::string>> ReverseBNF; + std::map<std::string, std::set<std::string>> ReverseBNF; // possible parent types of a given type public: + // Tree specific + void clear(); + + // Node specific + std::string GetTypeOfNode(index_t node_id) const; + + index_t TrackBack(); + + void Validate(const std::string& Top, const BNF& bnf) const; + + void Dump(); + Compiler(const BNF& bnf, const std::string& Top); - Tree compile(std::vector<Token> Tokens); + + std::pair<index_t, std::vector<TreeNode>> compile(std::vector<Token> Tokens); }; } // namespace Gram diff --git a/test-lexer.cpp b/test-lexer.cpp index e85fc5c..3a4ed2a 100644 --- a/test-lexer.cpp +++ b/test-lexer.cpp @@ -95,8 +95,6 @@ TEST_F(Test, BNF) { Gram::Compiler compiler(bnf, Top); auto Tree = compiler.compile(tokens); - ASSERT_TRUE(Tree.Valid(Top)); - Tree.Dump(); } |