1 files changed, 165 insertions, 1 deletions
diff --git a/lexer.cpp b/lexer.cpp
index 4a458c3..61caae1 100644
--- a/lexer.cpp
+++ b/lexer.cpp
@@ -2,14 +2,178 @@
 
 using namespace Lex;
 
-Lexer::Lexer(const BNF& bnf, const std::string& Top)
+size_t Lexer::newState(std::string state_type)
 {
+ m_state_types.push_back(state_type);
+ return states++;
+}
+
+void Lexer::addTransition(size_t state0, size_t state1, char c)
+{
+ auto it{transitions.find(state0)};
+
+ if (it == transitions.end()) { // new entry is a list with 1 entry
+  transitions[state0] = {{state1, c}};
+ } else { // extend list entry
+  it->second.push_back({state1, c});
+ }
+}
+
+std::vector<std::pair<size_t, char>> Lexer::getSuccessors(size_t state)
+{
+ std::vector<std::pair<size_t, char>> result;
+
+ auto it{transitions.find(state)};
+
+ if (it != transitions.end()) { // new list entry
+  for (auto &i: it->second) {
+   if (i.first == m_endState || i.second != '\0') { // add transition to end state or transition via actual char
+    result.push_back(i);
+   } else { // follow empty transition
+    auto successors { getSuccessors(i.first) };
+    result.insert(result.end(), successors.begin(), successors.end());
+   }
+  }
+ }
+
+ return result;
+}
+
+// Helper function
+void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type)
+{
+ if (isTerminal(m_bnf, symbol)) { // end recursion with new transition
+  if (symbol.size() != 1)
+   throw std::runtime_error("Bad sized terminal symbol: "s + symbol);
+  addTransition(state0, state1, symbol[0]);
+ } else { // recurse via non-terminal symbol
+  addPath(state0, state1, symbol, type);
+ }
+}
+
+// Helper function: add one rule
+void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type)
+{
+ size_t previousState{state0};
+ 
+ // add intermediate states with transitions
+ for (size_t i = list_index_from; i < list_index_to - 1; i++) {
+  std::string symbol{list[i]};
+  if (symbol == rule_symbol)
+   throw std::runtime_error("Recursion found but not allowed");
+
+  size_t state{newState(type)};
+  addPathOrTransition(previousState, state, symbol, type);
+  previousState = state;
+ }
+ if (list.back() == rule_symbol)
+  throw std::runtime_error("Recursion found but not allowed");
+
+ // last transition
+ addPathOrTransition(previousState, state1, list.back(), type);
+}
+
+// Add paths between state0 and state1, including new states and transitions
+void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string type)
+{
+ if (type == "" && s != "" && s != m_top)
+  type = s;
+
+ // state0 -> [paths] -> state01 -> state1
+ //                      ^     v
+ //                 [recursion paths]
+ size_t state01{newState(type)};
+ auto it {m_bnf.find(s)};
+
+ if (it == m_bnf.end())
+  throw std::runtime_error("Path ("s + std::to_string(state0) + ", "s + std::to_string(state1) + ") not possible."s);
+
+ for (auto& list: it->second) { // for every path between state0 and state1
+  size_t list_size{list.size()};
+  if (list_size < 1)
+   throw std::runtime_error("List too small in rule "s + s);
+
+  if (list[0] == s) { // recursion rule
+   addRule(list, 1, list_size, state01, state01, s, type);
+  } else { // non-recursion rule
+   addRule(list, 0, list_size, state0, state01, s, type);
+  }
+ }
+ addTransition(state01, state1, 0); // empty transition to end
+}
+
+Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top)
+                                                      , m_startState(newState()), m_endState(newState())
+{
+ addPath(m_startState, m_endState, m_top, "");
+}
+
+Token Lexer::getToken(const std::string& s, Location& location)
+{
+ Location oldLocation{location}; // start of token
+
+ std::vector<size_t> states{m_startState}; // can be in multiple states at once
+ std::vector<size_t> newStates;
+
+ Location found;
+ std::string state_type;
+
+ // match as much as possible
+ while (location.pos < s.size() && states.size() > 0) {
+  char currentChar{s[location.pos]};
+  std::cout << "DEBUG: Char: " << currentChar << std::endl;
+
+  for (const auto& state: states) {
+   std::vector<std::pair<size_t, char>> successors{getSuccessors(state)};
+   for (const auto& [nextState, c]: successors) {
+    if (c == currentChar) {
+     newStates.push_back(nextState);
+     if (nextState == m_endState) { // save intermediate result upon match
+      found = location;
+      found.advance();
+      state_type = m_state_types[state];
+     }
+    } else if (nextState == m_endState) { // save intermediate result w/o match of c
+     found = location;
+     state_type = m_state_types[state];
+    }
+   }
+  }
+  states = newStates;
+  newStates.clear();
+  location.advance(currentChar == '\n');
+ }
+
+ std::string value {s.substr(oldLocation.pos, found.pos - oldLocation.pos)};
+
+ if (found.pos > 0)
+  std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl;
+ else
+  throw std::runtime_error("Tokenize error at "s + oldLocation.toString());
+
+ location = found; // reset to end of match
+
+ return {state_type, value, oldLocation};
+}
+
+void Lexer::skipWhitespace(const std::string& s, Location& location)
+{
+ while (location.pos < s.size() && std::isspace(s[location.pos])) {
+  location.advance(s[location.pos] == '\n');
+ }
 }
 
 std::vector<Token> Lexer::Lex(const std::string& s)
 {
  std::vector<Token> result;
 
+ Location location;
+ skipWhitespace(s, location);
+ while (location.pos < s.size()) {
+  result.emplace_back(getToken(s, location));
+  skipWhitespace(s, location);
+ }
+
  return result;
 }