diff options
Diffstat (limited to 'lexer.cpp')
-rw-r--r-- | lexer.cpp | 292 |
1 files changed, 292 insertions, 0 deletions
diff --git a/lexer.cpp b/lexer.cpp new file mode 100644 index 0000000..46a38bd --- /dev/null +++ b/lexer.cpp @@ -0,0 +1,292 @@ +#include "lexer.h" + +void Tree::clear() { + nodes.clear(); + root = 0; + last = 0; + node_num = 0; +} + +// Type of lexical token +std::string Tree::GetType() { + if (node_num > 0 && nodes[root].child_names.size() == 1) + return nodes[root].child_names[0]; + return ""; +} + +bool Tree::Valid(const std::string& Top) const { + // A token is non empty + if (node_num == 0) + return false; + + // Start symbol on top + auto rootNode{nodes.find(root)}; + if (rootNode == nodes.end()) + throw std::runtime_error("Node not found: "s + std::to_string(root)); + + if (rootNode->second.name != Top) + return false; + + // All nodes filled (implies all leaves are terminal) + for (const auto& [index, node]: nodes) { + if (node.childs.size() < node.child_names.size()) + return false; // node not filled + } + + return true; +} + +bool Tree::AddFirstNode(char c, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { + node_num ++; + root = node_num; + last = node_num; + std::string node_name(1, char(c)); + + auto reverseRule{reverseBNF.find(node_name)}; + if (reverseRule == reverseBNF.end()) + throw std::runtime_error("Reverse rule not found for "s + node_name); + + auto rule{bnf.find(node_name)}; + if (rule != bnf.end()) { // multiple variants! + throw std::runtime_error("BNF rule for terminal symbol "s + node_name + " found."s); + } + nodes.emplace(root, TreeNode{0, std::vector<index_t>{}, std::vector<std::string>{}, node_name}); + return true; +} + +std::vector<TreeNode> Tree::getParentTreeNode(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { + std::vector<TreeNode> result; // default: empty + + auto& root_name {nodes[root].name}; + auto bnfParents {reverseBNF.find(root_name)}; + if (bnfParents == reverseBNF.end()) + return result; + + for (const auto& parent_node_name : bnfParents->second) { + auto lists {bnf.at(parent_node_name)}; + for (const auto& list : lists) { + if (list.size() > 0 && list[0] == root_name) { + TreeNode node{0, std::vector<index_t>{root}, list, parent_node_name}; + result.push_back(node); + } + } + } + + return result; +} + +index_t Tree::GetLast() { + index_t result {root}; + + while(result != 0 && nodes[result].childs.size() >= 2) { + result = nodes[result].childs[nodes[result].childs.size() - 1]; + } + + return result; +} + +void Tree::AddRootNode(const TreeNode& newRootNode) { + node_num++; + nodes[node_num] = newRootNode; + root = node_num; + last = node_num; +} + +void Tree::RemoveRootNode() { + root = nodes[root].childs[0]; + nodes.erase(node_num); + node_num--; + last = GetLast(); +} + +// Path from leaf to root +std::vector<std::string> Tree::GetPath(std::string a, std::string b, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { + std::vector<std::string> result; + + while (a != b) { + auto parents {reverseBNF.find(a)}; + if (parents == reverseBNF.end()) + return {}; + + bool hit{false}; + for (const auto& parent : parents->second) { + for (const auto& list : bnf.at(parent)) { + if (list.size() > 0 && list[0] == a) { + if (!hit) { + result.push_back(a); + a = parent; + hit = true; + } else + throw std::runtime_error("Double match for "s + parent + "/"s + a); + } + } + } + } + if (a == b) { + result.push_back(a); + } + return result; +} + +index_t Tree::AddNode(const std::string& name, const std::string& child_name, index_t parent_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) +{ + TreeNode& parent {nodes[parent_index]}; + node_num++; + index_t index = node_num; + parent.childs.push_back(index); + std::vector<std::string> child_names; + auto rule {bnf.find(name)}; + if (rule != bnf.end()) { + for (auto& list : rule->second) { + if (list.size() > 0 && list[0] == child_name) + child_names = list; + } + } + nodes.emplace(index, TreeNode{parent_index, {}, child_names, name}); + //root stays + last = GetLast(); + + return index; +} + +void Tree::AddPath(const std::vector<std::string>& path, index_t current_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { + for (int i = path.size() - 1; i >= 0; i--) { + std::string child_name; + if (i > 0) + child_name = path[i - 1]; + current_index = AddNode(path[i], child_name, current_index, bnf, reverseBNF); + } +} + +// try to add character to tree +bool Tree::Add(char c, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { + if (nodes.empty()) { // first node + return AddFirstNode(c, bnf, reverseBNF); + } else { // at least one character is already present + // Traverse tree until partially filled node found + // or new node can be added + index_t current_index{last}; + + while (current_index != 0) { + TreeNode& node {nodes[current_index]}; + if (node.childs.size() < node.child_names.size()) { // partially filled node + std::vector<std::string> list = GetPath(std::string(1, c), node.child_names[node.childs.size()], bnf, reverseBNF); + if (list.size() > 0) { + AddPath(list, current_index, bnf, reverseBNF); + return true; + } else { + return false; // The path a->b is not available via bnf + } + } + current_index = node.parent; + } + + // Add node at root + + std::vector<TreeNode> parent_nodes = getParentTreeNode(bnf, reverseBNF); + if (parent_nodes.size() == 0) + throw std::runtime_error("Couldn't add new parent node."); + + for (const auto &i : parent_nodes) { + AddRootNode(i); + if (Add(c, bnf, reverseBNF)) + return true; + RemoveRootNode(); + } + + } + return false; +} + +// add path to start symbol +void Tree::Resolve(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { + if (nodes.empty()) // only handle non-empty trees + return; + + while (true) { + std::string& old_root_name { nodes[root].name }; // current root node name + + auto parents {reverseBNF.find(old_root_name)}; + if (parents != reverseBNF.end()) { // parents in bnf available + bool hit{false}; + for (auto& parent : parents->second) { + for (const auto& list : bnf.at(parent)) { + if (list.size() == 1 && list[0] == old_root_name) { + if (!hit) { + const std::string& new_root_name {parent}; + // Add new TreeNode in the direction to root: + // New root with 1 child (old root) + nodes.emplace(++node_num, + TreeNode{0, // parent + std::vector<index_t>{root}, // child indices + std::vector<std::string>{old_root_name}, // child names + new_root_name // name + }); + nodes[root].parent = node_num; + root = node_num; + // this->last stays + hit = true; + } else + throw std::runtime_error("Error: Multiple resolve nodes for "s + old_root_name); + } + } + } + if (!hit) + break; + } else + break; + } +} + +void Lexer::FinalizeTree(Tree& tree, std::string& token, std::vector<Token>& result) +{ + tree.Resolve(bnf, ReverseBNF); + if (tree.Valid(Top)) { + result.emplace_back(Token{tree.GetType(), token, Location{location.line, location.column - token.size()}}); + token.clear(); + } + tree.clear(); +} + +Lexer::Lexer(const BNF& bnf, const std::string& Top): bnf(bnf), Top(Top), ReverseBNF{Reverse(bnf)} +{ +} + +std::vector<Token> Lexer::Lex(const std::string& s) +{ + std::vector<Token> result; + std::string token; + + std::string Whitespace{"\t \n\r"}; + Tree tree; + + for (size_t pos{0}; pos < s.size(); pos++) { + char c{s[pos]}; + if (c == '\n') { + location.column = 0; + location.line++; + } else if (std::isprint(c)) { + location.column++; + } + + //std::cout << "Char: |" << c << "|" << std::endl; + if (Whitespace.find(c) != std::string::npos) { // found whitespace character + // evaluate token up to now and skip whitespace + FinalizeTree(tree, token, result); + } else { // no whitespace: try to add to tree + if (!tree.Add(c, bnf, ReverseBNF)) { + FinalizeTree(tree, token, result); + if (!tree.Add(c, bnf, ReverseBNF)) + throw std::runtime_error("Parse error"); + } + + token.push_back(c); + } + } + + // Final evaluation of last token + FinalizeTree(tree, token, result); + + return result; +} + |