diff options
-rw-r--r-- | grammer.cpp | 347 | ||||
-rw-r--r-- | grammer.h | 19 |
2 files changed, 3 insertions, 363 deletions
diff --git a/grammer.cpp b/grammer.cpp index 4bcdd4d..97930b1 100644 --- a/grammer.cpp +++ b/grammer.cpp @@ -17,8 +17,6 @@ void Compiler::clear() { nodes.clear(); root_node_id = 0; - - tokens_used = 0; } std::string Compiler::GetTypeOfNode(index_t node_id) const @@ -115,9 +113,6 @@ void Compiler::DumpTree() for (int i = 0; i < child_ids.size(); i++) { todo.insert(todo.begin() + i, std::pair<int32_t, size_t>{child_ids[i], indent + 1}); } - if (node.alternatives.size()) { - line += ", "s + std::to_string(node.alternatives.size()) + " alternatives available"s; - } } Debug(line); } @@ -125,302 +120,9 @@ void Compiler::DumpTree() Debug("=============================================="); } -bool Compiler::AllTokensUsed() const -{ - return tokens_used == tokens.size(); -} - bool Compiler::treeIsComplete() const { - return nodes.size() > 0 && rootIsStartSymbol() && AllTokensUsed(); -} - -std::vector<std::string>& Compiler::getNodeExpectedChilds(index_t node_id) -{ - std::string& type = nodes[node_id].type; - index_t& variant = nodes[node_id].variant; - return bnf[type][variant]; -} - -// returns true if all childs are filled, recursively. Else false with to_fill as hint to node to fill -bool Compiler::subTreeIsComplete(index_t node_id, index_t& to_fill) -{ - // recurse first to get first subtree to_fill - for (const auto& i : nodes[node_id].child_ids) { - if (!ChildIdIsToken(i)) { // consider subtrees - if (!subTreeIsComplete(i, to_fill)) - return false; // found incomplete subtree -> return it! - } - } - - if (nodes[node_id].child_ids.size() < getNodeExpectedChilds(node_id).size()) { - to_fill = node_id; - return false; - } - - return true; -} - -size_t Compiler::CommonPrefix(const std::vector<Token>& tokens, const std::vector<std::string>& types) -{ - auto [tokens_it, types_it] = std::mismatch(tokens.begin(), tokens.end(), types.begin(), types.end(), [](const Token& token, const std::string& s){ return token.type == s; }); - return types_it - types.begin(); // a distance, 0 on no match -} - -void Compiler::AddFirstNode() -{ - root_node_id = 0; - - const std::string& child_type = tokens[0].type; - auto it = reversedFirst.find(child_type); - if (it == reversedFirst.end()) - throw std::runtime_error("Illegal first token: "s + child_type + " ("s + tokens[0].value + ")"s); - - std::set<std::string>& alternatives_set {it->second}; - - std::string node_type; - index_t node_variant; - std::deque<std::pair<std::string, index_t>> alternatives; // only for valid elements from alternatives_set - std::vector<int32_t> child_ids; - - for (const auto& type : alternatives_set) { - const auto& variants{bnf[type]}; - for (int i = 0; i < variants.size(); i++) { - const std::vector<std::string> & variant{variants[i]}; - size_t common{}; - if ((common = CommonPrefix(tokens, variant)) > 0) { // match a common prefix - if (node_type == "") { // no match yet - node_type = type; - node_variant = i; - for (int token_id = 0; token_id < common; token_id++) - child_ids.push_back(ChildIdFromTokenId(token_id)); - } else - alternatives.emplace_back(type, i); - } - } - } - - if (node_type == "") // no matching type found - throw std::runtime_error("Syntax error on first token: "s + child_type + " ("s + tokens[0].value + ")"s); - - nodes.emplace_back(TreeNode{0, 0, node_type, node_variant, alternatives, child_ids}); - - tokens_used = child_ids.size(); -} - -bool Compiler::AddRootNode() -{ - if (nodes.size() == 0) { - AddFirstNode(); - } else { - const std::string& child_type = nodes[root_node_id].type; // starting at old root - auto it = reversedFirst.find(child_type); - if (it == reversedFirst.end()) // this one doesn't have a parent, maybe a start symbol to discard? - return false; - - index_t old_root_node_id {root_node_id}; - index_t new_root_node_id {nodes.size()}; - - std::set<std::string>& alternatives_set {it->second}; - - std::string node_type; - index_t node_variant; - std::deque<std::pair<std::string, index_t>> alternatives; // only for valid elements from alternatives_set - std::vector<int32_t> child_ids{static_cast<int>(old_root_node_id)}; - - for (const auto& type : alternatives_set) { - const auto& variants{bnf[type]}; - for (int i = 0; i < variants.size(); i++) { - const std::vector<std::string> & variant{variants[i]}; - if (variant.size() == 0) - continue; // TODO: Handle case of empty rule (see e.g. C++ "attribute-list") - if (child_type == variant[0]) { - if (node_type == "") { - node_type = type; - node_variant = i; - } else - alternatives.emplace_back(type, i); // duplicates possible: variants of same type - } - } - } - - if (node_type == "") // no matching type found (maybe backtrack possible?) - return false; - - // now add! - Debug("AddRootNode(): Adding "s + node_type); - nodes[old_root_node_id].parent_node_id = new_root_node_id; - root_node_id = new_root_node_id; - nodes.emplace_back(TreeNode{root_node_id, root_node_id, node_type, node_variant, alternatives, child_ids}); - // keep tokens_used as is - } - - DumpTree(); - return true; -} - -void Compiler::removeTokensUpTo(index_t token_id) -{ - removeTokensUpTo(token_id, root_node_id); -} - -// operate on node_id -void Compiler::removeTokensUpTo(index_t token_id, index_t node_id) -{ - // token_id should be the new tokens_used - - if (token_id < tokens_used) { - auto& node{nodes[node_id]}; - auto& child_ids {node.child_ids}; - - // remove relevant tokens from end - while (token_id < tokens_used && child_ids.size() && ChildIdIsToken(child_ids.back()) && TokenIdFromChildId(child_ids.back()) >= token_id) { - Debug("Removing token "s + std::to_string(TokenIdFromChildId(child_ids.back()))); - child_ids.pop_back(); - if (tokens_used > 0) - tokens_used--; - else - throw std::runtime_error("ICE: Removing non-existing token at "s + std::to_string(node_id) + " ("s + node.type + ")"s); - } - - // recurse from back, to remove tokens from end - for (int i = child_ids.size() - 1; token_id < tokens_used && i >= 0; i--) { - if (!ChildIdIsToken(child_ids[i])) { - removeTokensUpTo(token_id, child_ids[i]); - } - } - } -} - -// Go back one step: Remove Node or Token -void Compiler::RemoveLastNode() -{ - TreeNode& node {nodes.back()}; - index_t node_id = node.node_id; - - if (node_id == root_node_id) { // No parent -> remove root - if (node.child_ids.empty()) { // No children -> now tree is empty - clear(); - } else if (ChildIdIsToken(node.child_ids.back())) { // last token child: remove - removeTokensUpTo(TokenIdFromChildId(node.child_ids.back())); - } else if (node.child_ids.size() == 1) { // One child: removing possible - if (!ChildIdIsToken(node.child_ids[0])) { - // node: set new root - nodes[node.child_ids[0]].parent_node_id = node.child_ids[0]; - root_node_id = node.child_ids[0]; - } - Debug("Removing root node "s + nodes.back().type + "("s + std::to_string(nodes.back().node_id) + ")"s); - nodes.pop_back(); - } else { - DumpTree(); - throw std::runtime_error("ICE: Backtrack not possible: Root not empty"); - } - } else if (node.child_ids.empty()) { // No children -> remove leaf - // We have a parent, otherwise we would have taken previous branch - TreeNode& parent {nodes[node.parent_node_id]}; - if (parent.child_ids.empty() || parent.child_ids.back() != node_id) - throw std::runtime_error("ICE: Backtrack: Bad child nodes"); - parent.child_ids.pop_back(); - Debug("Removing "s + nodes.back().type + "("s + std::to_string(nodes.back().node_id) + ")"s); - nodes.pop_back(); - } else if (ChildIdIsToken(node.child_ids.back())) { - removeTokensUpTo(TokenIdFromChildId(node.child_ids.back())); - } else { // In the middle - throw std::runtime_error("ICE: Backtrack in the middle of the tree: "s + std::to_string(node_id) + " ("s + node.type + ")"s); - } - DumpTree(); -} - -// Change type of last node according to alternatives -void Compiler::ChangeNodeType() -{ - TreeNode& node {nodes.back()}; - - if (node.alternatives.empty()) - throw std::runtime_error("ICE: No alternatives found during Backtrack"); - - auto& [type, variant] {node.alternatives.front()}; - - node.type = type; - node.variant = variant; - - node.alternatives.pop_front(); -} - -// throws if no further track back is possible: compile error -void Compiler::TrackBack() -{ - // Search backwards for alternatives: last node with alternatives (variant or alt. token) - while (!nodes.empty() && nodes.back().alternatives.empty()) { - RemoveLastNode(); - } - - if (nodes.empty()) { - throw std::runtime_error("Compile error: Invalid program."); - } - - ChangeNodeType(); -} - -// GetPath() + traverse(): return shortest path with variants -// via first-entry-in-bnf-rule -// excluding lower (already exists) -// including upper (already determined to be included) - -// breadth-first search -// return: node, child -std::unordered_map<std::string, std::string> Compiler::traverse(const std::string& lower, const std::string& upper) -{ - std::unordered_map<std::string, std::string> visited; // node, child - std::deque<std::pair<std::string, std::string>> todo{{lower, ""}}; // node, child - - while (!todo.empty()) { - auto [current_node, current_child] = todo.front(); - std::string& current_node2{current_node}; // workaround for lambda capture below (clang 8) - todo.pop_front(); - - auto it {visited.find(current_node)}; - if (it == visited.end()) { // not visited, yet: visit now - auto parents_it {reversedFirst.find(current_node)}; - - if (parents_it != reversedFirst.end()) { - auto& parents {parents_it->second}; - - visited[current_node] = current_child; - - std::for_each(parents.begin(), parents.end(), [&](const auto&x) { - todo.push_back({x, current_node2}); - }); - } - } - } - - return visited; -} - -// returns list from upper (including) to lower (excluding) -// returns empty list on fail -std::vector<std::string> Compiler::GetPath(std::string upper, std::string lower) -{ - std::vector<std::string> result; - - // traverse bnf from lower to upper - std::unordered_map<std::string, std::string> visited {traverse(lower, upper)}; - - auto current {upper}; - while (current != lower) { - auto it {visited.find(current)}; - if (it != visited.end()) { - auto& child{it->second}; - - result.push_back(current); - - current = child; - } else { - return {}; - } - } - return result; + return nodes.size() > 0 && rootIsStartSymbol(); } index_t Compiler::AddNode(const std::string& child_type, index_t parent_index) @@ -443,8 +145,8 @@ index_t Compiler::AddNode(const std::string& child_type, index_t parent_index) } } - nodes.emplace_back(TreeNode{parent_index, index, child_type, variant, alternatives, std::vector<int32_t>{}}); - //root stays, tokens_used stays + nodes.emplace_back(TreeNode{parent_index, index, child_type, variant, std::vector<int32_t>{}}); + //root stays Debug("AddNode(): "s + nodes[parent_index].type + "->"s + child_type + ": "s + std::to_string(index)); DumpTree(); @@ -452,49 +154,6 @@ index_t Compiler::AddNode(const std::string& child_type, index_t parent_index) return index; } -void Compiler::AddPath(const std::vector<std::string>& path, index_t current_index) -{ - for (const std::string& child_type: path) { - current_index = AddNode(child_type, current_index); - } - - nodes.back().child_ids.emplace_back(ChildIdFromTokenId(tokens_used)); - tokens_used++; - - Debug("AddPath(): token "s + tokens.back().type); - DumpTree(); -} - -bool Compiler::FillTree() -{ - if (nodes.size() == 0) // ignore empty tree, successfully - return true; - - index_t to_fill{}; - - while (!subTreeIsComplete(root_node_id, to_fill)) { - if (tokens_used >= tokens.size()) - return false; // Unexpected end of program? - - auto& node {nodes[to_fill]}; - std::string next_child {bnf[node.type][node.variant][node.child_ids.size()]}; - if (next_child == tokens[tokens_used].type) { // add token directly - node.child_ids.push_back(ChildIdFromTokenId(tokens_used)); - tokens_used++; - Debug("tokens_used++: "s + std::to_string(tokens_used)); - DumpTree(); - } else { // add inner nodes - auto list = GetPath(next_child, tokens[tokens_used].type); - if (list.size() > 0) { - AddPath(list, to_fill); - } else { - return false; - } - } - } - return true; -} - size_t Compiler::minimumSymbolsNeeded(std::string symbol) { if (isTerminal(bnf, symbol)) { @@ -20,7 +20,6 @@ struct TreeNode { std::string type; index_t variant; // bnf[type][variant] - std::deque<std::pair<std::string, index_t>> alternatives; // [type][value] alternatives that we can consider on fail. Discard after parsing! std::vector<int32_t> child_ids; // < 0: terminal: token_id; >= 0: non-terminal: node_id; fill token by token }; @@ -34,9 +33,6 @@ private: // Input std::vector<Token> tokens; - // Helper data - index_t tokens_used{0}; // number of tokens used, this is also the next token index to use - BNF &bnf; // not const for access via operator[] const std::string m_top; @@ -50,24 +46,9 @@ private: std::string GetTypeOfNode(index_t node_id) const; bool IsRootNode(index_t node_id) const; bool rootIsStartSymbol() const; - bool AllTokensUsed() const; bool treeIsComplete() const; - std::vector<std::string>& getNodeExpectedChilds(index_t node_id); - bool subTreeIsComplete(index_t node_id, index_t& to_fill); - size_t CommonPrefix(const std::vector<Token>& tokens, const std::vector<std::string>& types); - void AddFirstNode(); - bool AddRootNode(); - void removeTokensUpTo(index_t token_id); - void removeTokensUpTo(index_t token_id, index_t node_id); - void RemoveLastNode(); - void ChangeNodeType(); - void TrackBack(); void Validate() const; - std::unordered_map<std::string, std::string> traverse(const std::string& lower, const std::string& upper); - std::vector<std::string> GetPath(std::string upper, std::string lower); index_t AddNode(const std::string& child_type, index_t parent_index); - void AddPath(const std::vector<std::string>& path, index_t current_index); - bool FillTree(); // top-down algorithm std::unordered_map<std::string, size_t> m_min; // cache |