From 9f69b006dde3c3fbe19ed3e0275d3b7348f2aa87 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Sun, 15 Mar 2020 18:19:49 +0100 Subject: New lexer --- lexer.cpp | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 165 insertions(+), 1 deletion(-) (limited to 'lexer.cpp') diff --git a/lexer.cpp b/lexer.cpp index 4a458c3..61caae1 100644 --- a/lexer.cpp +++ b/lexer.cpp @@ -2,14 +2,178 @@ using namespace Lex; -Lexer::Lexer(const BNF& bnf, const std::string& Top) +size_t Lexer::newState(std::string state_type) { + m_state_types.push_back(state_type); + return states++; +} + +void Lexer::addTransition(size_t state0, size_t state1, char c) +{ + auto it{transitions.find(state0)}; + + if (it == transitions.end()) { // new entry is a list with 1 entry + transitions[state0] = {{state1, c}}; + } else { // extend list entry + it->second.push_back({state1, c}); + } +} + +std::vector> Lexer::getSuccessors(size_t state) +{ + std::vector> result; + + auto it{transitions.find(state)}; + + if (it != transitions.end()) { // new list entry + for (auto &i: it->second) { + if (i.first == m_endState || i.second != '\0') { // add transition to end state or transition via actual char + result.push_back(i); + } else { // follow empty transition + auto successors { getSuccessors(i.first) }; + result.insert(result.end(), successors.begin(), successors.end()); + } + } + } + + return result; +} + +// Helper function +void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type) +{ + if (isTerminal(m_bnf, symbol)) { // end recursion with new transition + if (symbol.size() != 1) + throw std::runtime_error("Bad sized terminal symbol: "s + symbol); + addTransition(state0, state1, symbol[0]); + } else { // recurse via non-terminal symbol + addPath(state0, state1, symbol, type); + } +} + +// Helper function: add one rule +void Lexer::addRule(const std::vector& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type) +{ + size_t previousState{state0}; + + // add intermediate states with transitions + for (size_t i = list_index_from; i < list_index_to - 1; i++) { + std::string symbol{list[i]}; + if (symbol == rule_symbol) + throw std::runtime_error("Recursion found but not allowed"); + + size_t state{newState(type)}; + addPathOrTransition(previousState, state, symbol, type); + previousState = state; + } + if (list.back() == rule_symbol) + throw std::runtime_error("Recursion found but not allowed"); + + // last transition + addPathOrTransition(previousState, state1, list.back(), type); +} + +// Add paths between state0 and state1, including new states and transitions +void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string type) +{ + if (type == "" && s != "" && s != m_top) + type = s; + + // state0 -> [paths] -> state01 -> state1 + // ^ v + // [recursion paths] + size_t state01{newState(type)}; + auto it {m_bnf.find(s)}; + + if (it == m_bnf.end()) + throw std::runtime_error("Path ("s + std::to_string(state0) + ", "s + std::to_string(state1) + ") not possible."s); + + for (auto& list: it->second) { // for every path between state0 and state1 + size_t list_size{list.size()}; + if (list_size < 1) + throw std::runtime_error("List too small in rule "s + s); + + if (list[0] == s) { // recursion rule + addRule(list, 1, list_size, state01, state01, s, type); + } else { // non-recursion rule + addRule(list, 0, list_size, state0, state01, s, type); + } + } + addTransition(state01, state1, 0); // empty transition to end +} + +Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top) + , m_startState(newState()), m_endState(newState()) +{ + addPath(m_startState, m_endState, m_top, ""); +} + +Token Lexer::getToken(const std::string& s, Location& location) +{ + Location oldLocation{location}; // start of token + + std::vector states{m_startState}; // can be in multiple states at once + std::vector newStates; + + Location found; + std::string state_type; + + // match as much as possible + while (location.pos < s.size() && states.size() > 0) { + char currentChar{s[location.pos]}; + std::cout << "DEBUG: Char: " << currentChar << std::endl; + + for (const auto& state: states) { + std::vector> successors{getSuccessors(state)}; + for (const auto& [nextState, c]: successors) { + if (c == currentChar) { + newStates.push_back(nextState); + if (nextState == m_endState) { // save intermediate result upon match + found = location; + found.advance(); + state_type = m_state_types[state]; + } + } else if (nextState == m_endState) { // save intermediate result w/o match of c + found = location; + state_type = m_state_types[state]; + } + } + } + states = newStates; + newStates.clear(); + location.advance(currentChar == '\n'); + } + + std::string value {s.substr(oldLocation.pos, found.pos - oldLocation.pos)}; + + if (found.pos > 0) + std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl; + else + throw std::runtime_error("Tokenize error at "s + oldLocation.toString()); + + location = found; // reset to end of match + + return {state_type, value, oldLocation}; +} + +void Lexer::skipWhitespace(const std::string& s, Location& location) +{ + while (location.pos < s.size() && std::isspace(s[location.pos])) { + location.advance(s[location.pos] == '\n'); + } } std::vector Lexer::Lex(const std::string& s) { std::vector result; + Location location; + skipWhitespace(s, location); + while (location.pos < s.size()) { + result.emplace_back(getToken(s, location)); + skipWhitespace(s, location); + } + return result; } -- cgit v1.2.3