summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRoland Reichwein <mail@reichwein.it>2020-03-15 18:19:49 +0100
committerRoland Reichwein <mail@reichwein.it>2020-03-15 18:19:49 +0100
commit9f69b006dde3c3fbe19ed3e0275d3b7348f2aa87 (patch)
tree6ac42793568339463f913cf39474794c8613d0b6
parent3a7006fcf5f8ecffd852fbba6d8ee03ce8a35dce (diff)
New lexer
-rw-r--r--bnf.cpp5
-rw-r--r--bnf.h2
-rw-r--r--cpp.cpp2
-rwxr-xr-xcppbnf.cpp11
-rw-r--r--lexer.cpp166
-rw-r--r--lexer.h32
-rw-r--r--minicc.cpp17
-rw-r--r--minicc.h8
-rw-r--r--test-lexer.cpp17
9 files changed, 241 insertions, 19 deletions
diff --git a/bnf.cpp b/bnf.cpp
index 4dd896e..32e1175 100644
--- a/bnf.cpp
+++ b/bnf.cpp
@@ -68,3 +68,8 @@ BNF SubBNF(const BNF& bnf, const std::string& top)
return result;
}
+bool isTerminal(const BNF& bnf, const std::string& symbol)
+{
+ return bnf.find(symbol) == bnf.end();
+}
+
diff --git a/bnf.h b/bnf.h
index 13337bf..9244b4d 100644
--- a/bnf.h
+++ b/bnf.h
@@ -15,3 +15,5 @@ std::unordered_map<std::string, std::set<std::string>> Reverse(BNF bnf); // unus
std::unordered_map<std::string, std::set<std::string>> reverseFirst(BNF bnf);
BNF SubBNF(const BNF& bnf, const std::string& top);
+
+bool isTerminal(const BNF& bnf, const std::string& symbol);
diff --git a/cpp.cpp b/cpp.cpp
index 714045b..3d8c20d 100644
--- a/cpp.cpp
+++ b/cpp.cpp
@@ -247,12 +247,14 @@ TEST_F(CppTest, preprocessing_tokenize) {
}
#endif
+#if 0
TEST_F(CppTest, preprocessing_tokenize2) {
CPP cpp;
auto ppTree = cpp.preprocessing_tokenize("in ma");
cpp.tokens_from_pptokens(ppTree);
}
+#endif
#if 0
TEST(Cpp, translate) {
diff --git a/cppbnf.cpp b/cppbnf.cpp
index 0b6e58f..4818fc3 100755
--- a/cppbnf.cpp
+++ b/cppbnf.cpp
@@ -39,11 +39,6 @@ namespace {
return result;
}
- bool isTerminal(const std::string& symbol, const BNF& bnf)
- {
- return bnf.find(symbol) == bnf.end();
- }
-
size_t numberOfStartSymbols(const BNF& bnf)
{
// exactly 1 start symbol
@@ -75,7 +70,7 @@ namespace {
for (const auto& list : lists) {
for (const auto& i : list) {
- if (!isTerminal(i, bnf)) {
+ if (!isTerminal(bnf, i)) {
// every non-terminal symbol must be longer that 1 char
if (i.size() == 1) {
std::cerr << "Warning: Non-Terminal symbol " << i << " in " << symbol << " is too short." << std::endl;
@@ -107,7 +102,7 @@ namespace {
for (const auto& [symbol, lists] : bnf) {
for (const auto& list : lists) {
for (const auto& i : list) {
- if (i.size() != 1 && isTerminal(i, bnf)) {
+ if (i.size() != 1 && isTerminal(bnf, i)) {
std::cerr << "Warning: Terminal symbol in " << symbol << " is too long: "s << i << std::endl;
return false;
}
@@ -197,7 +192,7 @@ namespace {
for (auto& [symbol, lists] : bnf) {
for (auto& list : lists) {
for (int i = 0; i < list.size(); i++) {
- if (list[i].size() > 1 && isTerminal(list[i], bnf)) {
+ if (list[i].size() > 1 && isTerminal(bnf, list[i])) {
auto newList = vectorize(list[i]);
list.erase(list.begin() + i);
list.insert(list.begin() + i, newList.begin(), newList.end());
diff --git a/lexer.cpp b/lexer.cpp
index 4a458c3..61caae1 100644
--- a/lexer.cpp
+++ b/lexer.cpp
@@ -2,14 +2,178 @@
using namespace Lex;
-Lexer::Lexer(const BNF& bnf, const std::string& Top)
+size_t Lexer::newState(std::string state_type)
{
+ m_state_types.push_back(state_type);
+ return states++;
+}
+
+void Lexer::addTransition(size_t state0, size_t state1, char c)
+{
+ auto it{transitions.find(state0)};
+
+ if (it == transitions.end()) { // new entry is a list with 1 entry
+ transitions[state0] = {{state1, c}};
+ } else { // extend list entry
+ it->second.push_back({state1, c});
+ }
+}
+
+std::vector<std::pair<size_t, char>> Lexer::getSuccessors(size_t state)
+{
+ std::vector<std::pair<size_t, char>> result;
+
+ auto it{transitions.find(state)};
+
+ if (it != transitions.end()) { // new list entry
+ for (auto &i: it->second) {
+ if (i.first == m_endState || i.second != '\0') { // add transition to end state or transition via actual char
+ result.push_back(i);
+ } else { // follow empty transition
+ auto successors { getSuccessors(i.first) };
+ result.insert(result.end(), successors.begin(), successors.end());
+ }
+ }
+ }
+
+ return result;
+}
+
+// Helper function
+void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type)
+{
+ if (isTerminal(m_bnf, symbol)) { // end recursion with new transition
+ if (symbol.size() != 1)
+ throw std::runtime_error("Bad sized terminal symbol: "s + symbol);
+ addTransition(state0, state1, symbol[0]);
+ } else { // recurse via non-terminal symbol
+ addPath(state0, state1, symbol, type);
+ }
+}
+
+// Helper function: add one rule
+void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type)
+{
+ size_t previousState{state0};
+
+ // add intermediate states with transitions
+ for (size_t i = list_index_from; i < list_index_to - 1; i++) {
+ std::string symbol{list[i]};
+ if (symbol == rule_symbol)
+ throw std::runtime_error("Recursion found but not allowed");
+
+ size_t state{newState(type)};
+ addPathOrTransition(previousState, state, symbol, type);
+ previousState = state;
+ }
+ if (list.back() == rule_symbol)
+ throw std::runtime_error("Recursion found but not allowed");
+
+ // last transition
+ addPathOrTransition(previousState, state1, list.back(), type);
+}
+
+// Add paths between state0 and state1, including new states and transitions
+void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string type)
+{
+ if (type == "" && s != "" && s != m_top)
+ type = s;
+
+ // state0 -> [paths] -> state01 -> state1
+ // ^ v
+ // [recursion paths]
+ size_t state01{newState(type)};
+ auto it {m_bnf.find(s)};
+
+ if (it == m_bnf.end())
+ throw std::runtime_error("Path ("s + std::to_string(state0) + ", "s + std::to_string(state1) + ") not possible."s);
+
+ for (auto& list: it->second) { // for every path between state0 and state1
+ size_t list_size{list.size()};
+ if (list_size < 1)
+ throw std::runtime_error("List too small in rule "s + s);
+
+ if (list[0] == s) { // recursion rule
+ addRule(list, 1, list_size, state01, state01, s, type);
+ } else { // non-recursion rule
+ addRule(list, 0, list_size, state0, state01, s, type);
+ }
+ }
+ addTransition(state01, state1, 0); // empty transition to end
+}
+
+Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top)
+ , m_startState(newState()), m_endState(newState())
+{
+ addPath(m_startState, m_endState, m_top, "");
+}
+
+Token Lexer::getToken(const std::string& s, Location& location)
+{
+ Location oldLocation{location}; // start of token
+
+ std::vector<size_t> states{m_startState}; // can be in multiple states at once
+ std::vector<size_t> newStates;
+
+ Location found;
+ std::string state_type;
+
+ // match as much as possible
+ while (location.pos < s.size() && states.size() > 0) {
+ char currentChar{s[location.pos]};
+ std::cout << "DEBUG: Char: " << currentChar << std::endl;
+
+ for (const auto& state: states) {
+ std::vector<std::pair<size_t, char>> successors{getSuccessors(state)};
+ for (const auto& [nextState, c]: successors) {
+ if (c == currentChar) {
+ newStates.push_back(nextState);
+ if (nextState == m_endState) { // save intermediate result upon match
+ found = location;
+ found.advance();
+ state_type = m_state_types[state];
+ }
+ } else if (nextState == m_endState) { // save intermediate result w/o match of c
+ found = location;
+ state_type = m_state_types[state];
+ }
+ }
+ }
+ states = newStates;
+ newStates.clear();
+ location.advance(currentChar == '\n');
+ }
+
+ std::string value {s.substr(oldLocation.pos, found.pos - oldLocation.pos)};
+
+ if (found.pos > 0)
+ std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl;
+ else
+ throw std::runtime_error("Tokenize error at "s + oldLocation.toString());
+
+ location = found; // reset to end of match
+
+ return {state_type, value, oldLocation};
+}
+
+void Lexer::skipWhitespace(const std::string& s, Location& location)
+{
+ while (location.pos < s.size() && std::isspace(s[location.pos])) {
+ location.advance(s[location.pos] == '\n');
+ }
}
std::vector<Token> Lexer::Lex(const std::string& s)
{
std::vector<Token> result;
+ Location location;
+ skipWhitespace(s, location);
+ while (location.pos < s.size()) {
+ result.emplace_back(getToken(s, location));
+ skipWhitespace(s, location);
+ }
+
return result;
}
diff --git a/lexer.h b/lexer.h
index f9363ef..eb41e3d 100644
--- a/lexer.h
+++ b/lexer.h
@@ -3,16 +3,42 @@
#include "minicc.h"
#include "bnf.h"
+#include <string>
+#include <unordered_map>
+#include <vector>
+
namespace Lex {
class Lexer
{
+ // constructor input
+ const BNF& m_bnf;
+ const std::string& m_top;
+
+ // Graph
+
+ size_t states{}; // start, ...
+ std::unordered_map<size_t, std::vector<std::pair<size_t, char>>> transitions; //transitions: state -> {state,character}, ...; empty transition is marked by \0
+ std::vector<std::string> m_state_types;
+ size_t m_startState;
+ size_t m_endState;
+
+ // Graph manipulation
+
+ size_t newState(std::string state_type = "");
+ void addTransition(size_t state0, size_t state1, char c);
+ std::vector<std::pair<size_t, char>> getSuccessors(size_t state);
+
+ // Build up automaton, recursively
+ void addPath(size_t state0, size_t state1, std::string s, std::string type);
+ void addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type);
+ void addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type);
- //states; // start, ...
- //transitions; // state, state, character
+ Token getToken(const std::string& s, Location& location);
+ void skipWhitespace(const std::string& s, Location& location);
public:
- Lexer(const BNF& bnf, const std::string& Top);
+ Lexer(const BNF& bnf, const std::string& top);
std::vector<Token> Lex(const std::string& s);
};
diff --git a/minicc.cpp b/minicc.cpp
index 48d2a15..d180517 100644
--- a/minicc.cpp
+++ b/minicc.cpp
@@ -14,6 +14,8 @@
#include <utility>
#include <vector>
+using namespace std::string_literals;
+
std::vector<std::string> split(std::string s)
{
std::vector<std::string> result;
@@ -37,3 +39,18 @@ std::ostream& operator<<(std::ostream& os, const Token& token) {
return os << token.type << ": " << token.value << "(" << token.location.line << ":" << token.location.column << ")";
}
+void Location::advance(bool newline)
+{
+ pos++;
+ if (newline) {
+ line++;
+ column = 1;
+ } else {
+ column++;
+ }
+}
+
+std::string Location::toString()
+{
+ return std::to_string(line) + ":"s + std::to_string(column);
+}
diff --git a/minicc.h b/minicc.h
index 9bc8945..861a30a 100644
--- a/minicc.h
+++ b/minicc.h
@@ -10,8 +10,12 @@ using index_t = size_t;
std::vector<std::string> split(std::string s);
struct Location {
- size_t line;
- size_t column;
+ size_t line{1};
+ size_t column{1};
+ size_t pos{0};
+
+ void advance(bool newline = false);
+ std::string toString();
};
bool operator==(const Location &a, const Location &b);
diff --git a/test-lexer.cpp b/test-lexer.cpp
index a94e550..9f1cb77 100644
--- a/test-lexer.cpp
+++ b/test-lexer.cpp
@@ -1,5 +1,6 @@
#include "bnf.h"
#include "cpp.h"
+#include "cppbnf.h"
#include "lexer.h"
#include "grammer.h"
#include "minicc.h"
@@ -19,14 +20,20 @@
#include <utility>
#include <vector>
-class Test: public ::testing::Test {
+class LexerTest: public ::testing::Test {
protected:
- Test(){
+ LexerTest(){
debug = false;
}
- ~Test() override {}
+ ~LexerTest() override {}
};
-TEST_F(Test, BNF) {
-}
+TEST_F(LexerTest, Lex) {
+ auto bnf{SubBNF(GetCppBNFLex(), "preprocessing-token")};
+
+ Lex::Lexer lexer(bnf, "preprocessing-token");
+ std::vector<Token> tokens{lexer.Lex("int main() { return 1; }")};
+
+ ASSERT_EQ(tokens.size(), 9);
+}