#include "lexer.h" #include #include #include #include #include "utils/types.h" const char *TOKEN_STRINGS[] = { "TOKEN_NONE", "TOKEN_IDENTIFIER", "TOKEN_NUMBER", "TOKEN_STRING", "TOKEN_OPERATOR", "TOKEN_OPERATOR_PARENTHESES_OPEN", "TOKEN_OPERATOR_PARENTHESES_CLOSE", "TOKEN_OPERATOR_CURLY_BRACKET_OPEN", "TOKEN_OPERATOR_CURLY_BRACKET_CLOSE", "TOKEN_OPERATOR_ASSIGN", "TOKEN_OPERATOR_EQUAL", "TOKEN_OPERATOR_COLON", "TOKEN_OPERATOR_COMMA", "TOKEN_OPERATOR_EOL", "TOKEN_OPERATOR_FUNCTION", "TOKEN_SYMBOL", "TOKEN_KEYWORD_STRUCT", "TOKEN_KEYWORD_EXTERNAL", "TOKEN_KEYWORD_IMPORT", "TOKEN_PARSED", }; static const char *KEYWORDS_STRINGS[] = { "struct", "external", "import", }; static const Token KEYWORDS_TOKENS[] = { TOKEN_KEYWORD_STRUCT, TOKEN_KEYWORD_EXTERNAL, TOKEN_KEYWORD_IMPORT, }; static const size_t KEYWORDS_SIZE = sizeof(KEYWORDS_STRINGS) / sizeof(char *); static const char *OPERATORS_STRINGS[] = { "(", ")", "{", "}", "=", "==", ":", ",", ";", "->", }; static const Token OPERATORS_TOKENS[] = { TOKEN_OPERATOR_PARENTHESES_OPEN, TOKEN_OPERATOR_PARENTHESES_CLOSE, TOKEN_OPERATOR_CURLY_BRACKET_OPEN, TOKEN_OPERATOR_CURLY_BRACKET_CLOSE, TOKEN_OPERATOR_ASSIGN, TOKEN_OPERATOR_EQUAL, TOKEN_OPERATOR_COLON, TOKEN_OPERATOR_COMMA, TOKEN_OPERATOR_EOL, TOKEN_OPERATOR_FUNCTION, }; static const size_t OPERATORS_SIZE = sizeof(OPERATORS_STRINGS) / sizeof(char *); void printNodes(Nodes nodes) { for (size_t i = 0; i < nodes.size; ++i) { const Node node = nodes.nodes[i]; printf("{'%.*s',%s}\n", (int)(node.strEnd - node.strBegin), node.strBegin, TOKEN_STRINGS[node.token]); } } void deleteNodes(Nodes nodes) { free(nodes.nodes); } Nodes lexer(char const *const restrict code) { size_t nodes_size = 10; Node *nodes = a404m_malloc(nodes_size * sizeof(Node)); size_t nodes_inserted = 0; Node node = { .strBegin = code, .strEnd = code, .token = TOKEN_NONE, }; for (int i = 0;; ++i) { const char c = code[i]; if (c == '\0') { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_NONE); break; } else if (c == '/') { const char follow = code[i + 1]; if (follow == '/') { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_NONE); for (i += 2; code[i] != '\0' && code[i] != '\n'; ++i); push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_NONE); if (code[i] == '\0') { goto RETURN_SUCCESS; } continue; } else if (follow == '*') { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_NONE); int in = 1; for (i += 2;; ++i) { switch (code[i]) { case '\0': printError("Expected multi line comment to end", code, node.strBegin, code + i); goto RETURN_ERROR; case '*': ++i; if (code[i] == '/') { --in; if (in == 0) { goto END_OF_BLOCK_COMMENT_LOOP; } } else if (code[i] == '\0') { printError("Expected multi line comment to end", code, node.strBegin, code + i); goto RETURN_ERROR; } break; case '/': ++i; if (code[i] == '*') { ++in; } else if (code[i] == '\0') { printError("Expected multi line comment to end", code, node.strBegin, code + i); goto RETURN_ERROR; } break; } } END_OF_BLOCK_COMMENT_LOOP: push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_NONE); if (code[i] == '\0') { goto RETURN_SUCCESS; } continue; } } if (isSpace(c)) { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_NONE); } else if (isIdentifier(c)) { if (node.token != TOKEN_IDENTIFIER && node.token != TOKEN_SYMBOL) { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_IDENTIFIER); } } else if (isIdentifierSymbol(c)) { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_IDENTIFIER); for (++i;; ++i) { const char current = code[i]; if (current == c) { break; } else if (current == '\0') { printError("Expected %c to end", code, node.strBegin, code + i, c); goto RETURN_ERROR; } } ++node.strBegin; push_clear_without_check(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_NONE); } else if (isNumber(c)) { if (node.token != TOKEN_NUMBER && node.token != TOKEN_IDENTIFIER && node.token != TOKEN_SYMBOL) { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_NUMBER); } } else if (isString(c)) { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_STRING); for (++i;; ++i) { const char current = code[i]; if (current == c) { break; } else if (current == '\\') { ++i; } else if (current == '\0') { printError("Expected %c to end", code, node.strBegin, code + i, c); goto RETURN_ERROR; } } ++i; push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_NONE); --i; } else if (isOperator(c)) { if (node.token == TOKEN_OPERATOR) { const Token token = getOperator(node.strBegin, code + i + 1); if (token != TOKEN_NONE) { continue; } else { node.token = getOperator(node.strBegin, code + i); if (node.token == TOKEN_NONE) { printError("Unknown operator '%.*s'", code, node.strBegin, node.strEnd, (int)(code + i - node.strBegin), node.strBegin); goto RETURN_ERROR; } push_clear_without_check(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_OPERATOR); } } else { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_OPERATOR); } } else if (isSymbol(c)) { if (node.token != TOKEN_SYMBOL) { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, code, i, TOKEN_SYMBOL); } } else { printError("Unexpected char '%c'", code, code + i, code + i + 1, c); goto RETURN_ERROR; } } RETURN_SUCCESS: Nodes result = { .nodes = a404m_realloc(nodes, nodes_inserted * sizeof(Node)), .size = nodes_inserted, }; return result; RETURN_ERROR: free(nodes); Nodes error = { .nodes = NULL, .size = ERROR_SIZE, }; return error; } void push_if_not_empty(Node **restrict nodes, size_t *restrict nodes_size, size_t *restrict nodes_inserted, Node *restrict node, char const *restrict str, int i, Token token) { if (node->token != TOKEN_NONE) { if (*nodes_size == *nodes_inserted) { *nodes_size += *nodes_size / 2 + 1; *nodes = a404m_realloc(*nodes, *nodes_size * sizeof(Node)); } node->strEnd = str + i; if (node->token == TOKEN_IDENTIFIER) { const Token foundToken = getKeyword(node->strBegin, node->strEnd); if (foundToken != TOKEN_NONE) { node->token = foundToken; } } else if (node->token == TOKEN_OPERATOR) { const Token foundToken = getOperator(node->strBegin, node->strEnd); if (foundToken != TOKEN_NONE) { node->token = foundToken; } } (*nodes)[*nodes_inserted] = *node; ++*nodes_inserted; } node->strBegin = str + i; node->token = token; } void push_clear_without_check(Node **restrict nodes, size_t *restrict nodes_size, size_t *restrict nodes_inserted, Node *node, char const *restrict str, int i, Token token) { if (*nodes_size == *nodes_inserted) { *nodes_size += *nodes_size / 2 + 1; *nodes = a404m_realloc(*nodes, *nodes_size * sizeof(Node)); } node->strEnd = str + i; (*nodes)[*nodes_inserted] = *node; ++*nodes_inserted; node->strBegin = str + i; node->token = token; } bool isSpace(char c) { return isspace(c); } bool isIdentifier(char c) { return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || c == '_'; } bool isIdentifierSymbol(char c) { return c == '`'; } bool isNumber(char c) { return '0' <= c && c <= '9'; } bool isString(char c) { return c == '"' || c == '\''; } bool isOperator(char c) { switch (c) { case '(': case ')': case '[': case ']': case '{': case '}': case '*': case '/': case '%': case '+': case '-': case ':': case '!': case '=': case '<': case '>': case '&': case '|': case '.': case ',': case ';': return true; } return false; } bool isSymbol(char c) { return c == '#'; } Token getTokenInStrings(char const *strBegin, char const *strEnd, const char *strings[], const Token tokens[], size_t size) { const size_t strSize = strEnd - strBegin; for (size_t i = 0; i < size; ++i) { const char *search = strings[i]; // faster than strlen+strncmp for (size_t j = 0;; ++j) { const char searchChar = search[j]; if (j == strSize) { if (searchChar == '\0') { return tokens[i]; } else { break; } } else if (searchChar == '\0') { break; } else if (searchChar != strBegin[j]) { break; } } } return TOKEN_NONE; } Token getKeyword(char const *strBegin, char const *strEnd) { return getTokenInStrings(strBegin, strEnd, KEYWORDS_STRINGS, KEYWORDS_TOKENS, KEYWORDS_SIZE); } Token getOperator(char const *strBegin, char const *strEnd) { return getTokenInStrings(strBegin, strEnd, OPERATORS_STRINGS, OPERATORS_TOKENS, OPERATORS_SIZE); }