diff options
Diffstat (limited to 'lexer.cpp')
-rw-r--r-- | lexer.cpp | 62 |
1 files changed, 39 insertions, 23 deletions
@@ -4,9 +4,8 @@ using namespace Lex; -size_t Lexer::newState(std::string state_type) +size_t Lexer::newState() { - m_state_types.push_back(state_type); return states++; } @@ -66,19 +65,19 @@ std::vector<std::pair<size_t, char>> Lexer::getSuccessors(size_t state) } // Helper function -void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type) +void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol) { if (isTerminal(m_bnf, symbol)) { // end recursion with new transition if (symbol.size() != 1) throw std::runtime_error("Bad sized terminal symbol: "s + symbol); addTransition(state0, state1, symbol[0]); } else { // recurse via non-terminal symbol - addPath(state0, state1, symbol, type); + addPath(state0, state1, symbol); } } // Helper function: add one rule -void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type) +void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol) { size_t previousState{state0}; @@ -88,27 +87,24 @@ void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from if (symbol == rule_symbol) throw std::runtime_error("Recursion found but not allowed"); - size_t state{newState(type)}; - addPathOrTransition(previousState, state, symbol, type); + size_t state{newState()}; + addPathOrTransition(previousState, state, symbol); previousState = state; } if (list.back() == rule_symbol) throw std::runtime_error("Recursion found but not allowed"); // last transition - addPathOrTransition(previousState, state1, list.back(), type); + addPathOrTransition(previousState, state1, list.back()); } // Add paths between state0 and state1, including new states and transitions -void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string type) +void Lexer::addPath(size_t state0, size_t state1, std::string s) { - if (type == "" && s != "" && s != m_top) - type = s; - // state0 -> [paths] -> state01 -> state1 // ^ v // [recursion paths] - size_t state01{newState(type)}; + size_t state01{newState()}; auto it {m_bnf.find(s)}; if (it == m_bnf.end()) @@ -120,9 +116,9 @@ void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string typ throw std::runtime_error("List too small in rule "s + s); if (list[0] == s) { // recursion rule - addRule(list, 1, list_size, state01, state01, s, type); + addRule(list, 1, list_size, state01, state01, s); } else { // non-recursion rule - addRule(list, 0, list_size, state0, state01, s, type); + addRule(list, 0, list_size, state0, state01, s); } } addTransition(state01, state1, 0); // empty transition to end @@ -154,14 +150,35 @@ void Lexer::removeEmptyTransitions() } } -Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top), m_startState(newState()), m_endState(newState()) +Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top), m_startState(newState()) { - addPath(m_startState, m_endState, m_top, ""); + // types are the second level symbols, directly under top + auto it {m_bnf.find(m_top)}; + if (it == m_bnf.end()) + throw std::runtime_error("Start symbol "s + m_top + " not found."s); + + auto& list {it->second}; + + for (auto& element : list) { + if (element.size() != 1) + throw std::runtime_error("Bad type rule in "s + m_top + ": size = "s + std::to_string(element.size())); + + auto endState{newState()}; + std::string type{element[0]}; + m_state_types[endState] = type; + + addPath(m_startState, endState, type); + } replaceEmptyTransitions(); removeEmptyTransitions(); } +bool Lexer::isEndState(size_t state) +{ + return m_state_types.find(state) != m_state_types.end(); +} + Token Lexer::getToken(const std::string& s, Location& location) { Location oldLocation{location}; // start of token @@ -175,16 +192,16 @@ Token Lexer::getToken(const std::string& s, Location& location) // match as much as possible while (location.pos < s.size() && states.size() > 0) { char currentChar{s[location.pos]}; - std::cout << "DEBUG: Char: " << currentChar << std::endl; + //std::cout << "DEBUG: Char: " << currentChar << std::endl; for (const auto& state: states) { std::vector<std::pair<size_t, char>> successors{getSuccessors(state)}; for (const auto& [nextState, c]: successors) { if (c == currentChar) { - if (nextState == m_endState) { // save intermediate result upon match + if (isEndState(nextState)) { // save intermediate result upon match found = location; found.advance(); - state_type = m_state_types[state]; + state_type = m_state_types[nextState]; } else { newStates.push_back(nextState); } @@ -198,10 +215,9 @@ Token Lexer::getToken(const std::string& s, Location& location) std::string value {s.substr(oldLocation.pos, found.pos - oldLocation.pos)}; - if (found.pos > 0) - std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl; - else + if (found.pos == 0) throw std::runtime_error("Bad Token at "s + oldLocation.toString()); + //else std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl; location = found; // reset to end of match |