#include "lexer.h" #include using namespace Lex; size_t Lexer::newState() { return states++; } void Lexer::addTransition(size_t state0, size_t state1, char c) { auto it{transitions.find(state0)}; if (it == transitions.end()) { // new entry is a list with 1 entry transitions[state0] = {{state1, c}}; } else { // extend list entry it->second.push_back({state1, c}); } } void Lexer::removeTransition(size_t state0, size_t state1, char c) { auto it{transitions.find(state0)}; if (it != transitions.end()) { std::pair reference{state1, c}; it->second.erase(std::remove(it->second.begin(), it->second.end(), reference), it->second.end()); } else { throw std::runtime_error("Transition not found: "s + std::to_string(state0) + "->"s + std::to_string(state1)); } } std::vector> Lexer::getSuccessorsViaEmpty(size_t state) { std::vector> result; auto it{transitions.find(state)}; if (it != transitions.end()) { for (auto &i: it->second) { if (i.second == '\0') { // add empty transitions result.push_back(i); // and more, recursively auto successors { getSuccessorsViaEmpty(i.first) }; result.insert(result.end(), successors.begin(), successors.end()); } } } return result; } std::vector> Lexer::getSuccessors(size_t state) { auto it{transitions.find(state)}; if (it != transitions.end()) { return it->second; } return {}; } // Helper function void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol) { if (isTerminal(m_bnf, symbol)) { // end recursion with new transition if (symbol.size() != 1) throw std::runtime_error("Bad sized terminal symbol: "s + symbol); addTransition(state0, state1, symbol[0]); } else { // recurse via non-terminal symbol addPath(state0, state1, symbol); } } // Helper function: add one rule void Lexer::addRule(const std::vector& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol) { size_t previousState{state0}; // add intermediate states with transitions for (size_t i = list_index_from; i < list_index_to - 1; i++) { std::string symbol{list[i]}; if (symbol == rule_symbol) throw std::runtime_error("Recursion found but not allowed. Only head recursion allowed for lexer."); size_t state{newState()}; addPathOrTransition(previousState, state, symbol); previousState = state; } if (list.back() == rule_symbol) throw std::runtime_error("Tail recursion found but not allowed. Only head recursion allowed for lexer."); // last transition addPathOrTransition(previousState, state1, list.back()); } // Add paths between state0 and state1, including new states and transitions void Lexer::addPath(size_t state0, size_t state1, std::string s) { // state0 -> [paths] -> state01 -> state1 // ^ v // [recursion paths] size_t state01{newState()}; auto it {m_bnf.find(s)}; if (it == m_bnf.end()) throw std::runtime_error("Path ("s + std::to_string(state0) + ", "s + std::to_string(state1) + ") not possible."s); for (auto& list: it->second) { // for every path between state0 and state1 size_t list_size{list.size()}; if (list_size < 1) throw std::runtime_error("List too small in rule "s + s); if (list[0] == s) { // recursion rule addRule(list, 1, list_size, state01, state01, s); } else { // non-recursion rule addRule(list, 0, list_size, state0, state01, s); } } addTransition(state01, state1, 0); // empty transition to end } void Lexer::replaceEmptyTransitions() { // iterate over all transitions for (auto& [state0, list]: transitions) { std::vector> list_extension; // which elements must be added to list for (auto& [state1, currentChar]: list) { if (currentChar != '\0') { // for every non-empty transition // add extension via following empty transitions auto successors { getSuccessorsViaEmpty(state1) }; for (auto &[state, dummyChar] : successors) { list_extension.emplace_back(state, currentChar); } } } list.insert(list.end(), list_extension.begin(), list_extension.end()); } } void Lexer::removeEmptyTransitions() { for (auto& [state0, list]: transitions) { list.erase(std::remove_if(list.begin(), list.end(), [](const std::pair& pair){return pair.second == '\0';}), list.end()); } } Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top), m_startState(newState()) { // types are the second level symbols, directly under top auto it {m_bnf.find(m_top)}; if (it == m_bnf.end()) throw std::runtime_error("Start symbol "s + m_top + " not found."s); auto& list {it->second}; for (auto& element : list) { if (element.size() != 1) throw std::runtime_error("Bad type rule in "s + m_top + ": size = "s + std::to_string(element.size())); auto endState{newState()}; std::string type{element[0]}; m_state_types[endState] = type; addPath(m_startState, endState, type); } replaceEmptyTransitions(); removeEmptyTransitions(); } bool Lexer::isEndState(size_t state) { return m_state_types.find(state) != m_state_types.end(); } Token Lexer::getToken(const std::string& s, Location& location) { Location oldLocation{location}; // start of token std::vector states{m_startState}; // can be in multiple states at once std::vector newStates; Location found; std::string state_type; // match as much as possible while (location.pos < s.size() && states.size() > 0) { char currentChar{s[location.pos]}; //std::cout << "DEBUG: Char: " << currentChar << std::endl; for (const auto& state: states) { std::vector> successors{getSuccessors(state)}; for (const auto& [nextState, c]: successors) { if (c == currentChar) { if (isEndState(nextState)) { // save intermediate result upon match found = location; found.advance(); state_type = m_state_types[nextState]; } else { newStates.push_back(nextState); } } } } states = newStates; newStates.clear(); location.advance(currentChar == '\n'); } std::string value {s.substr(oldLocation.pos, found.pos - oldLocation.pos)}; if (found.pos == 0) throw std::runtime_error("Bad Token at "s + oldLocation.toString()); //else std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl; location = found; // reset to end of match return {state_type, value, oldLocation}; } void Lexer::skipWhitespace(const std::string& s, Location& location) { while (location.pos < s.size() && std::isspace(s[location.pos])) { location.advance(s[location.pos] == '\n'); } } std::vector Lexer::Lex(const std::string& s) { std::vector result; Location location; skipWhitespace(s, location); while (location.pos < s.size()) { result.emplace_back(getToken(s, location)); skipWhitespace(s, location); } return result; }