#include "grammer.h" #include "debug.h" #include #include #include using namespace Gram; void Compiler::clear() { symbol_variants.clear(); nodes.clear(); } std::string Compiler::GetTypeOfNode(index_t node_id) const { if (node_id >= nodes.size()) throw std::runtime_error("GetTypeOfNode(): node_id="s + std::to_string(node_id) + ", nodes.size()="s + std::to_string(nodes.size())); return nodes[node_id].type; } bool Gram::ChildIdIsEmpty(int32_t child_id) { return child_id == 0; } bool Gram::ChildIdIsToken(int32_t child_id) { return child_id < 0; } bool Gram::ChildIdIsNode(int32_t child_id) { return child_id > 0; } index_t Gram::TokenIdFromChildId(int32_t child_id) { return index_t(-child_id) - 1; } int32_t Gram::ChildIdFromTokenId(index_t token_id) { return -1 - int32_t(token_id); } void Compiler::DumpTree() { Debug("= Dump ======================================="); Debug("nodes.size()="s + std::to_string(nodes.size())); if (nodes.size() > 0) { if (0) { Debug("--- Nodes ------------------------------------"); for (const auto& node : nodes) { std::string line{"("s + std::to_string(node.node_id) + "):"s}; for (const auto& child : node.child_ids) { line += " "s; if (ChildIdIsToken(child)) line += "t"s + std::to_string(TokenIdFromChildId(child)); else line += std::to_string(child); } Debug(line); } } Debug("--- Tree -------------------------------------"); std::deque> todo{std::pair{static_cast(0), 0}}; // id, indent while (!todo.empty()) { auto [current_index, indent] {todo.front()}; todo.pop_front(); std::string line; for (int i = 0; i < indent; i++) line += "| "; if (ChildIdIsToken(current_index)) { index_t token_id {TokenIdFromChildId(current_index)}; line += "Token("s + std::to_string(token_id) + "): "s + tokens[token_id].type; if (tokens[token_id].value != tokens[token_id].type) line += "("s + tokens[token_id].value + ")"s; } else { auto& node {nodes[current_index]}; line += "Node("s + std::to_string(current_index) + "): "s + node.type + "/" + std::to_string(node.variant); auto child_ids{node.child_ids}; for (int i = 0; i < child_ids.size(); i++) { todo.insert(todo.begin() + i, std::pair{child_ids[i], indent + 1}); } } Debug(line); } } Debug("=============================================="); } index_t Compiler::AddNode(const std::string& type, index_t variant, NodePosition pos) { auto& list { m_bnf[type][variant]}; index_t node_id{nodes.size()}; if (nodes.size() > 0) nodes[pos.node_id].child_ids[pos.child_pos] = node_id; nodes.emplace_back(TreeNode{pos, node_id, type, variant, std::vector(size_t(list.size()), 0)}); return node_id; } Compiler::AddNodeGuard::AddNodeGuard(Compiler& compiler, index_t variant): m_compiler(compiler) { m_compiler.symbol_variants.push_back(variant); } Compiler::AddNodeGuard::~AddNodeGuard() { m_compiler.symbol_variants.pop_back(); } void Compiler::IncNodePosition(NodePosition& pos) { if (nodes.size() == 0) // special case: empty tree return; if (pos.node_id >= nodes.size()) throw std::runtime_error("ICE: NodePosition with node_id "s + std::to_string(pos.node_id) + " doesn't exist."s); if (pos.child_pos >= nodes[pos.node_id].child_ids.size()) throw std::runtime_error("ICE: NodePosition with child_pos "s + std::to_string(pos.child_pos) + " in node_id "s + std::to_string(pos.node_id) + " doesn't exist."s); int32_t child_id{nodes[pos.node_id].child_ids[pos.child_pos]}; if (ChildIdIsEmpty(child_id)) throw std::runtime_error("ICE: NodePosition is empty"); // Actually, advance if (ChildIdIsToken(child_id)) { pos.child_pos++; } else { pos.node_id = child_id; pos.child_pos = 0; } // Go to parent if child ids completely traversed while (pos.node_id > 0 && pos.child_pos >= nodes[pos.node_id].child_ids.size()) { pos = nodes[pos.node_id].pos; pos.child_pos++; } // Advancing at root node for last child is allowed: Finished if (pos.child_pos >= nodes[pos.node_id].child_ids.size()) return; if (ChildIdIsNode(nodes[pos.node_id].child_ids[pos.child_pos])) throw std::runtime_error("ICE: No node expected at "s + std::to_string(pos.node_id) + "/"s + std::to_string(pos.child_pos)); } size_t Compiler::minimumSymbolsNeeded(const std::string& symbol) { if (isTerminal(m_bnf, symbol)) { return 1; } else { auto it_min{m_min.find(symbol)}; if (it_min != m_min.end()) return it_min->second; m_min[symbol] = std::numeric_limits::max(); auto it{m_bnf.find(symbol)}; if (it != m_bnf.end()) { size_t minimum{std::numeric_limits::max()}; for (const auto& list: it->second) { minimum = std::min(minimum, minimumSymbolsNeeded(list)); } m_min[symbol] = minimum; return minimum; } else throw std::runtime_error("ICE: Symbol "s + symbol + " expected in BNF"s); } } size_t Compiler::minimumSymbolsNeeded(const std::vector& symbol_list) { size_t result{0}; for (const auto& symbol: symbol_list) { size_t min { minimumSymbolsNeeded(symbol) }; if (min == std::numeric_limits::max()) return min; result += min; } return result; } /// begin, end: indexes in tokens list bool Compiler::match(std::vector symbol_list, size_t begin, size_t end) { // match terminal symbols at start while (begin < end && symbol_list.size() > 0 && symbol_list.front() == tokens[begin].type) { begin++; symbol_list.erase(symbol_list.begin()); } // matching of empty list in non-terminals if (symbol_list.size() == 0) { if (begin == end) { // only match real empty list // this is the point of the final match constructTree(); return true; } return false; } // matching of empty list in terminals if (begin == end) { const auto& symbol{symbol_list.front()}; auto it{m_empty_lut.find(symbol)}; if (it == m_empty_lut.end()) // can't match empty tokens list with this nt-symbol return false; AddNodeGuard guard(*this, it->second); std::vector list {m_bnf[symbol][it->second]}; list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end()); return match(list, begin, end); } // now, symbol_list has size > 0 and contains non-terminal symbols at start and end // resolve first symbol auto it = m_match_lut.find({symbol_list.front(), tokens[begin].type}); if (it != m_match_lut.end()) { for (size_t i: it->second) { AddNodeGuard guard(*this, i); std::vector list {m_bnf[symbol_list.front()][i]}; list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end()); if (minimumSymbolsNeeded(list) > end - begin) // stop recursion continue; if (match(list, begin, end)) return true; } } else return false; // terminal symbol not found in bnf, non-matching return false; // no match found } bool Compiler::match(std::string symbol, size_t begin, size_t end) { std::vector symbol_list{symbol}; return match(symbol_list, begin, end); } void Compiler::fillStartCache() { std::unordered_set terminals {getTerminals(m_bnf)}; { // follow terminal symbols for (const std::string& terminal: terminals) { std::unordered_set current_set{terminal}; std::unordered_set done_set; std::unordered_set next_set; do { for (const auto& current_symbol: current_set) { auto it {reversedPosFirst.find(current_symbol)}; if (it != reversedPosFirst.end()) { std::unordered_set, PairHash>& positions{it->second}; for (const auto& position: positions) { auto it_lut {m_match_lut.find({position.first, terminal})}; if (it_lut == m_match_lut.end()) { // add new list m_match_lut[std::pair{position.first, terminal}] = std::vector{position.second}; } else { // extend list it_lut->second.emplace_back(position.second); } if (done_set.find(position.first) == done_set.end()) { next_set.insert(position.first); done_set.insert(position.first); } } } } current_set = next_set; next_set.clear(); } while (!current_set.empty()); } } { // follow empty non-terminal symbols, and combine all found non-terminals with all terminals std::unordered_set, PairHash> current_set {getEmptyPositions(m_bnf)}; std::unordered_set, PairHash> done_set; std::unordered_set, PairHash> next_set; do { for (const auto& current_pos: current_set) { m_empty_lut[current_pos.first] = current_pos.second; for (const std::string& terminal: terminals) { auto it_lut {m_match_lut.find({current_pos.first, terminal})}; if (it_lut == m_match_lut.end()) { // add new list m_match_lut[std::pair{current_pos.first, terminal}] = std::vector{current_pos.second}; } else { // extend list it_lut->second.emplace_back(current_pos.second); } } auto it {reversedPosFirst.find(current_pos.first)}; if (it != reversedPosFirst.end()) { std::unordered_set, PairHash>& positions{it->second}; for (const auto& position: positions) { if (done_set.find(position) == done_set.end()) { next_set.insert(position); done_set.insert(position); } } } } current_set = next_set; next_set.clear(); } while (!current_set.empty()); } for (auto& x: m_match_lut) { std::sort(x.second.begin(), x.second.end()); } } void Compiler::constructTree() { symbol_variants_it = symbol_variants.begin(); std::vector symbol_list{m_top}; index_t symbol_list_pos{0}; NodePosition tree_pos; while (symbol_list_pos < symbol_list.size()) { std::string symbol{symbol_list[symbol_list_pos]}; if (isTerminal(m_bnf, symbol)) { // Advance terminal symbol nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = ChildIdFromTokenId(symbol_list_pos); IncNodePosition(tree_pos); symbol_list_pos++; } else { // Replace non-terminal symbol symbol_list.erase(symbol_list.begin() + symbol_list_pos); std::vector list {m_bnf[symbol][*symbol_variants_it]}; symbol_list.insert(symbol_list.begin() + symbol_list_pos, list.begin(), list.end()); index_t node_id {AddNode(symbol, *symbol_variants_it, tree_pos)}; if (node_id > 0) { nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = node_id; IncNodePosition(tree_pos); } symbol_variants_it++; } } } Compiler::Compiler(BNF& bnf, const std::string& top) : m_bnf {removeHeadRecursion(bnf)} , m_top(top) , reversedPosFirst{reversePosFirst(m_bnf)} { // // prepare helper cache (TODO: remove this ugly workaround for remaining bad marker elements) // (void) minimumSymbolsNeeded("translation-unit"); // remove bad marker elements auto it{m_min.begin()}; while (it != m_min.end()) { if (it->second == std::numeric_limits::max()) { it = m_min.erase(it); } else { ++it; } } (void) minimumSymbolsNeeded("translation-unit"); // fill other cache fillStartCache(); } namespace { bool hasExtChild(index_t index, const std::vector& nodes) { if (index >= nodes.size()) throw std::runtime_error("Node out of range at ext node detection: "s + std::to_string(index) + " vs. "s + std::to_string(nodes.size())); if (nodes[index].child_ids.size() < 1) // no childs return false; int32_t child_id {nodes[index].child_ids.back()}; if (ChildIdIsToken(child_id)) // token can't be ext node return false; if (child_id >= nodes.size()) throw std::runtime_error("Child node out of range at ext node detection: "s + std::to_string(child_id) + " vs. "s + std::to_string(nodes.size())); return nodes[child_id].type.ends_with("-EXT"); } // node to add depends on variant: // std::string: node type // returns: index of new node index_t AddNode(std::vector& result, index_t parent, std::string type) { index_t node_pos{result.size()}; NodePosition pos{parent, 0}; if (result.size() > 0) { // i.e. we have a predecessor pos.child_pos = result[parent].child_ids.size(); result[parent].child_ids.push_back(node_pos); } result.emplace_back(TreeNode{pos, node_pos, type, 0, {} }); return node_pos; } // forward declaration void visit(index_t nodes_index, const std::vector& nodes, index_t result_parent, std::vector& result); // copy child nodes, recursively // nodes_index: node whose childs to copy to result // skip: number of elements to skip at end void copy_childs(index_t nodes_index, const std::vector& nodes, index_t result_parent, std::vector& result, int skip = 0) { for (int i = 0; i < int(nodes[nodes_index].child_ids.size()) - skip; i++) { int32_t child_id{nodes[nodes_index].child_ids[i]}; if (ChildIdIsNode(child_id)) visit(child_id, nodes, result_parent, result); else result[result_parent].child_ids.push_back(child_id); } } // returns node to continue with index_t visitExt(index_t nodes_index, const std::vector& nodes, index_t result_parent, std::vector& result) { std::string type{nodes[nodes_index].type}; assert(type.ends_with("-EXT")); type = type.substr(0, type.size() - 4); // remove "-EXT" from type if (hasExtChild(nodes_index, nodes)) { // ignore empty last "-EXT" nodes finally // at this point, we have at least 1 last child ("-EXT"): visit it first result_parent = visitExt(nodes[nodes_index].child_ids.back(), nodes, result_parent, result); } size_t result_node_id{ AddNode(result, result_parent, type)}; // old parent is new child copy_childs(nodes_index, nodes, result_parent, result, 1); return result_node_id; } // visit nodes recursively, adding to result // index: index in nodes void visit(index_t nodes_index, const std::vector& nodes, index_t result_parent, std::vector& result) { std::string type{nodes[nodes_index].type}; if (type.ends_with("-EXT") && nodes[nodes_index].child_ids.empty()) { // ignore end of -EXT chain } else if (hasExtChild(nodes_index, nodes)) { // resolve ext node result_parent = visitExt(nodes[nodes_index].child_ids.back(), nodes, result_parent, result); if (!type.ends_with("-EXT")) // if first-in row, add childs here copy_childs(nodes_index, nodes, result_parent, result, 1); } else { // normal node: just copy, recursively assert(!type.ends_with("-EXT")); result_parent = AddNode(result, result_parent, type); copy_childs(nodes_index, nodes, result_parent, result); } } // At the start of compile() head recursion is removed. This leads to a // different tree of nodes. This functions restores the head recursion in the // nodes tree. // Note: "variant" element of nodes will be all zero since it can't be // restored easily std::vector restoreHeadRecursion(const std::vector& nodes) { // traverse nodes (old), while reversing if tail recursion via -EXT is detected std::vector result; if (nodes.size() > 0) visit(0, nodes, 0, result); return result; } } // namespace std::vector Compiler::compile(std::vector p_tokens) { clear(); tokens = p_tokens; // // top-down algorithm: // // 1. Match linear tokens list to bnf, building up list of used variants (symbol_variants) // 2. Construct Node Tree from symbol_variants // if (!match(m_top, 0, tokens.size())) throw std::runtime_error("Compile error"); nodes = restoreHeadRecursion(nodes); DumpTree(); return nodes; }