#include "grammer.h" #include #include using namespace Gram; bool debug{false}; void Debug(std::string s) { if (debug) std::cout << s << std::endl; } void Compiler::clear() { nodes.clear(); root_node_id = 0; tokens_used = 0; } std::string Compiler::GetTypeOfNode(index_t node_id) const { if (node_id >= nodes.size()) throw std::runtime_error("GetTypeOfNode(): node_id="s + std::to_string(node_id) + ", nodes.size()="s + std::to_string(nodes.size())); return nodes[node_id].type; } bool Compiler::IsRootNode(index_t node_id) const { auto& node {nodes[node_id]}; return node.parent_node_id == node.node_id; } void Compiler::Validate() const { // A program is non empty if (nodes.size() == 0) return; //throw std::runtime_error("Program is empty"); // Consistency check for nodes if (root_node_id >= nodes.size()) throw std::runtime_error("Bad root node: "s + std::to_string(root_node_id) + " vs. "s + std::to_string(nodes.size())); // Start symbol on top if (GetTypeOfNode(root_node_id) != m_top) throw std::runtime_error("Root node not start symbol!"); // All nodes filled for (const auto& node: nodes) { if (node.child_ids.size() != bnf[node.type][node.variant].size()) throw std::runtime_error("Node not filled: "s + node.type + "["s + std::to_string(node.variant) + "]"s); } } bool Compiler::rootIsStartSymbol() const { return GetTypeOfNode(root_node_id) == m_top; } bool Gram::ChildIdIsToken(int32_t child_id) { return child_id < 0; } index_t Gram::TokenIdFromChildId(int32_t child_id) { return index_t(-child_id) - 1; } int32_t Gram::ChildIdFromTokenId(index_t token_id) { return -1 - int32_t(token_id); } void Compiler::DumpTree() { Debug("= Dump ======================================="); Debug("nodes.size()="s + std::to_string(nodes.size())); if (nodes.size() > 0) { if (0) { Debug("--- Nodes ------------------------------------"); Debug("root_node_id="s + std::to_string(root_node_id)); for (const auto& node : nodes) { std::string line{"("s + std::to_string(node.node_id) + "):"s}; for (const auto& child : node.child_ids) { line += " "s; if (ChildIdIsToken(child)) line += "t"s + std::to_string(TokenIdFromChildId(child)); else line += std::to_string(child); } Debug(line); } } Debug("--- Tree -------------------------------------"); std::deque> todo{std::pair{static_cast(root_node_id), 0}}; // id, indent while (!todo.empty()) { auto [current_index, indent] {todo.front()}; todo.pop_front(); std::string line(indent, ' '); if (ChildIdIsToken(current_index)) { index_t token_id {TokenIdFromChildId(current_index)}; line += "Token("s + std::to_string(token_id) + "): "s + tokens[token_id].type + "("s + tokens[token_id].value + ")"s; } else { auto& node {nodes[current_index]}; line += "Node("s + std::to_string(current_index) + "): "s + node.type + "/" + std::to_string(node.variant); auto child_ids{node.child_ids}; for (int i = 0; i < child_ids.size(); i++) { todo.insert(todo.begin() + i, std::pair{child_ids[i], indent + 1}); } if (node.alternatives.size()) { line += ", "s + std::to_string(node.alternatives.size()) + " alternatives available"s; } } Debug(line); } } Debug("=============================================="); } bool Compiler::AllTokensUsed() const { return tokens_used == tokens.size(); } bool Compiler::treeIsComplete() const { return nodes.size() > 0 && rootIsStartSymbol() && AllTokensUsed(); } std::vector& Compiler::getNodeExpectedChilds(index_t node_id) { std::string& type = nodes[node_id].type; index_t& variant = nodes[node_id].variant; return bnf[type][variant]; } // returns true if all childs are filled, recursively. Else false with to_fill as hint to node to fill bool Compiler::subTreeIsComplete(index_t node_id, index_t& to_fill) { // recurse first to get first subtree to_fill for (const auto& i : nodes[node_id].child_ids) { if (!ChildIdIsToken(i)) { // consider subtrees if (!subTreeIsComplete(i, to_fill)) return false; // found incomplete subtree -> return it! } } if (nodes[node_id].child_ids.size() < getNodeExpectedChilds(node_id).size()) { to_fill = node_id; return false; } return true; } size_t Compiler::CommonPrefix(const std::vector& tokens, const std::vector& types) { auto [tokens_it, types_it] = std::mismatch(tokens.begin(), tokens.end(), types.begin(), types.end(), [](const Token& token, const std::string& s){ return token.type == s; }); return types_it - types.begin(); // a distance, 0 on no match } void Compiler::AddFirstNode() { root_node_id = 0; const std::string& child_type = tokens[0].type; auto it = reversedFirst.find(child_type); if (it == reversedFirst.end()) throw std::runtime_error("Illegal first token: "s + child_type + " ("s + tokens[0].value + ")"s); std::set& alternatives_set {it->second}; std::string node_type; index_t node_variant; std::deque> alternatives; // only for valid elements from alternatives_set std::vector child_ids; for (const auto& type : alternatives_set) { const auto& variants{bnf[type]}; for (int i = 0; i < variants.size(); i++) { const std::vector & variant{variants[i]}; size_t common{}; if ((common = CommonPrefix(tokens, variant)) > 0) { // match a common prefix if (node_type == "") { // no match yet node_type = type; node_variant = i; for (int token_id = 0; token_id < common; token_id++) child_ids.push_back(ChildIdFromTokenId(token_id)); } else alternatives.emplace_back(type, i); } } } if (node_type == "") // no matching type found throw std::runtime_error("Syntax error on first token: "s + child_type + " ("s + tokens[0].value + ")"s); nodes.emplace_back(TreeNode{0, 0, node_type, node_variant, alternatives, child_ids}); tokens_used = child_ids.size(); } bool Compiler::AddRootNode() { if (nodes.size() == 0) { AddFirstNode(); } else { const std::string& child_type = nodes[root_node_id].type; // starting at old root auto it = reversedFirst.find(child_type); if (it == reversedFirst.end()) // this one doesn't have a parent, maybe a start symbol to discard? return false; index_t old_root_node_id {root_node_id}; index_t new_root_node_id {nodes.size()}; std::set& alternatives_set {it->second}; std::string node_type; index_t node_variant; std::deque> alternatives; // only for valid elements from alternatives_set std::vector child_ids{static_cast(old_root_node_id)}; for (const auto& type : alternatives_set) { const auto& variants{bnf[type]}; for (int i = 0; i < variants.size(); i++) { const std::vector & variant{variants[i]}; if (variant.size() == 0) continue; // TODO: Handle case of empty rule (see e.g. C++ "attribute-list") if (child_type == variant[0]) { if (node_type == "") { node_type = type; node_variant = i; } else alternatives.emplace_back(type, i); // duplicates possible: variants of same type } } } if (node_type == "") // no matching type found (maybe backtrack possible?) return false; // now add! Debug("AddRootNode(): Adding "s + node_type); nodes[old_root_node_id].parent_node_id = new_root_node_id; root_node_id = new_root_node_id; nodes.emplace_back(TreeNode{root_node_id, root_node_id, node_type, node_variant, alternatives, child_ids}); // keep tokens_used as is } DumpTree(); return true; } void Compiler::removeTokensUpTo(index_t token_id) { removeTokensUpTo(token_id, root_node_id); } // operate on node_id void Compiler::removeTokensUpTo(index_t token_id, index_t node_id) { // token_id should be the new tokens_used if (token_id < tokens_used) { auto& node{nodes[node_id]}; auto& child_ids {node.child_ids}; // remove relevant tokens from end while (token_id < tokens_used && child_ids.size() && ChildIdIsToken(child_ids.back()) && TokenIdFromChildId(child_ids.back()) >= token_id) { Debug("Removing token "s + std::to_string(TokenIdFromChildId(child_ids.back()))); child_ids.pop_back(); if (tokens_used > 0) tokens_used--; else throw std::runtime_error("ICE: Removing non-existing token at "s + std::to_string(node_id) + " ("s + node.type + ")"s); } // recurse from back, to remove tokens from end for (int i = child_ids.size() - 1; token_id < tokens_used && i >= 0; i--) { if (!ChildIdIsToken(child_ids[i])) { removeTokensUpTo(token_id, child_ids[i]); } } } } // Go back one step: Remove Node or Token void Compiler::RemoveLastNode() { TreeNode& node {nodes.back()}; index_t node_id = node.node_id; if (node_id == root_node_id) { // No parent -> remove root if (node.child_ids.empty()) { // No children -> now tree is empty clear(); } else if (ChildIdIsToken(node.child_ids.back())) { // last token child: remove removeTokensUpTo(TokenIdFromChildId(node.child_ids.back())); } else if (node.child_ids.size() == 1) { // One child: removing possible if (!ChildIdIsToken(node.child_ids[0])) { // node: set new root nodes[node.child_ids[0]].parent_node_id = node.child_ids[0]; root_node_id = node.child_ids[0]; } Debug("Removing root node "s + nodes.back().type + "("s + std::to_string(nodes.back().node_id) + ")"s); nodes.pop_back(); } else { DumpTree(); throw std::runtime_error("ICE: Backtrack not possible: Root not empty"); } } else if (node.child_ids.empty()) { // No children -> remove leaf // We have a parent, otherwise we would have taken previous branch TreeNode& parent {nodes[node.parent_node_id]}; if (parent.child_ids.empty() || parent.child_ids.back() != node_id) throw std::runtime_error("ICE: Backtrack: Bad child nodes"); parent.child_ids.pop_back(); Debug("Removing "s + nodes.back().type + "("s + std::to_string(nodes.back().node_id) + ")"s); nodes.pop_back(); } else if (ChildIdIsToken(node.child_ids.back())) { removeTokensUpTo(TokenIdFromChildId(node.child_ids.back())); } else { // In the middle throw std::runtime_error("ICE: Backtrack in the middle of the tree: "s + std::to_string(node_id) + " ("s + node.type + ")"s); } DumpTree(); } // Change type of last node according to alternatives void Compiler::ChangeNodeType() { TreeNode& node {nodes.back()}; if (node.alternatives.empty()) throw std::runtime_error("ICE: No alternatives found during Backtrack"); auto& [type, variant] {node.alternatives.front()}; node.type = type; node.variant = variant; node.alternatives.pop_front(); } // throws if no further track back is possible: compile error void Compiler::TrackBack() { // Search backwards for alternatives: last node with alternatives (variant or alt. token) while (!nodes.empty() && nodes.back().alternatives.empty()) { RemoveLastNode(); } if (nodes.empty()) { throw std::runtime_error("Compile error: Invalid program."); } ChangeNodeType(); } // GetPath() + traverse(): return shortest path with variants // via first-entry-in-bnf-rule // excluding lower (already exists) // including upper (already determined to be included) // breadth-first search // return: node, child std::unordered_map Compiler::traverse(const std::string& lower, const std::string& upper) { std::unordered_map visited; // node, child std::deque> todo{{lower, ""}}; // node, child while (!todo.empty()) { auto [current_node, current_child] = todo.front(); std::string& current_node2{current_node}; // workaround for lambda capture below (clang 8) todo.pop_front(); auto it {visited.find(current_node)}; if (it == visited.end()) { // not visited, yet: visit now auto parents_it {reversedFirst.find(current_node)}; if (parents_it != reversedFirst.end()) { auto& parents {parents_it->second}; visited[current_node] = current_child; std::for_each(parents.begin(), parents.end(), [&](const auto&x) { todo.push_back({x, current_node2}); }); } } } return visited; } // returns list from upper (including) to lower (excluding) // returns empty list on fail std::vector Compiler::GetPath(std::string upper, std::string lower) { std::vector result; // traverse bnf from lower to upper std::unordered_map visited {traverse(lower, upper)}; auto current {upper}; while (current != lower) { auto it {visited.find(current)}; if (it != visited.end()) { auto& child{it->second}; result.push_back(current); current = child; } else { return {}; } } return result; } index_t Compiler::AddNode(const std::string& child_type, index_t parent_index) { TreeNode& parent {nodes[parent_index]}; index_t index = nodes.size(); parent.child_ids.push_back(index); index_t variant{}; std::deque> alternatives; const auto& variants { bnf[child_type] }; bool found{false}; for (int i = 0; i < variants.size(); i++) { // variants if (!found) { // use first match variant = i; found = true; } else { // defer all others alternatives.emplace_back(child_type, i); } } nodes.emplace_back(TreeNode{parent_index, index, child_type, variant, alternatives, std::vector{}}); //root stays, tokens_used stays Debug("AddNode(): "s + nodes[parent_index].type + "->"s + child_type + ": "s + std::to_string(index)); DumpTree(); return index; } void Compiler::AddPath(const std::vector& path, index_t current_index) { for (const std::string& child_type: path) { current_index = AddNode(child_type, current_index); } nodes.back().child_ids.emplace_back(ChildIdFromTokenId(tokens_used)); tokens_used++; Debug("AddPath(): token "s + tokens.back().type); DumpTree(); } bool Compiler::FillTree() { if (nodes.size() == 0) // ignore empty tree, successfully return true; index_t to_fill{}; while (!subTreeIsComplete(root_node_id, to_fill)) { if (tokens_used >= tokens.size()) return false; // Unexpected end of program? auto& node {nodes[to_fill]}; std::string next_child {bnf[node.type][node.variant][node.child_ids.size()]}; if (next_child == tokens[tokens_used].type) { // add token directly node.child_ids.push_back(ChildIdFromTokenId(tokens_used)); tokens_used++; Debug("tokens_used++: "s + std::to_string(tokens_used)); DumpTree(); } else { // add inner nodes auto list = GetPath(next_child, tokens[tokens_used].type); if (list.size() > 0) { AddPath(list, to_fill); } else { return false; } } } return true; } size_t Compiler::minimumSymbolsNeeded(std::string symbol) { if (isTerminal(bnf, symbol)) { return 1; } else { auto it_min{m_min.find(symbol)}; if (it_min != m_min.end()) return it_min->second; m_min[symbol] = std::numeric_limits::max(); auto it{bnf.find(symbol)}; if (it != bnf.end()) { size_t minimum{std::numeric_limits::max()}; for (const auto& list: it->second) { minimum = std::min(minimum, minimumSymbolsNeeded(list)); } m_min[symbol] = minimum; return minimum; } else throw std::runtime_error("ICE: Symbol "s + symbol + " expected in BNF"s); } } size_t Compiler::minimumSymbolsNeeded(std::vector symbol_list) { size_t result{0}; for (const auto& symbol: symbol_list) { size_t min { minimumSymbolsNeeded(symbol) }; if (min == std::numeric_limits::max()) return min; result += min; } return result; } /// begin, end: indexes in tokens list bool Compiler::match(std::vector symbol_list, size_t begin, size_t end) { // TODO: isTerminal() necessary here? // match terminal symbols at start while (begin < end && isTerminal(bnf, tokens[begin].type) && symbol_list.size() > 0 && symbol_list.front() == tokens[begin].type) { begin++; symbol_list.erase(symbol_list.begin()); } // match terminal symbols at end while (begin < end && isTerminal(bnf, tokens[end - 1].type) && symbol_list.size() > 0 && symbol_list.back() == tokens[end - 1].type) { end--; symbol_list.erase(symbol_list.end() - 1); } // matching of empty lists if (symbol_list.size() == 0) { if (begin == end) // only match real empty list return true; return false; } // now, symbol_list[begin .. end - 1] has size > 0 and contains non-terminal symbols at start and end // resolve first symbol auto it{bnf.find(symbol_list.front())}; if (it != bnf.end()) { for (std::vector list: it->second) { // iterate over alternatives list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end()); if (minimumSymbolsNeeded(list) > end - begin) // stop recursion continue; // TODO: recurse last? if (match(list, begin, end)) return true; } } else return false; // terminal symbol not found in bnf, non-matching return false; // no match found } bool Compiler::match(std::string symbol, size_t begin, size_t end) { std::vector symbol_list{symbol}; return match(symbol_list, begin, end); } Compiler::Compiler(BNF& bnf, const std::string& top): bnf(bnf), m_top(top), ReverseBNF{Reverse(bnf)}, reversedFirst{reverseFirst(bnf)} { //std::cout << "DEBUG: " << m_top << std::endl; // // prepare helper cache (TODO: remove this ugly workaround for remaining bad marker elements) // minimumSymbolsNeeded("translation-unit"); // remove bad marker elements auto it{m_min.begin()}; while (it != m_min.end()) { if (it->second == std::numeric_limits::max()) { it = m_min.erase(it); } else { ++it; } } minimumSymbolsNeeded("translation-unit"); } std::pair> Compiler::compile(std::vector p_tokens) { clear(); tokens = p_tokens; if (tokens.size() == 0) throw std::runtime_error("No tokens!"); #if 0 // // bottom-up algorithm // while (!treeIsComplete()) { if (!FillTree()) { TrackBack(); } else if (!AddRootNode()) { TrackBack(); } else if (!FillTree()) { TrackBack(); } } #else // // top-down algorithm // if (!match(m_top, 0, tokens.size())) throw std::runtime_error("Compile error."); #endif Validate(); return std::pair>{root_node_id, nodes}; }