From d6ba30b94a24607bce5db5e706eb20cc051a98f0 Mon Sep 17 00:00:00 2001 From: A404M Date: Wed, 18 Sep 2024 19:46:38 +0330 Subject: initial commit --- src/compiler/lexer/lexer.c | 307 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 307 insertions(+) create mode 100644 src/compiler/lexer/lexer.c (limited to 'src/compiler/lexer/lexer.c') diff --git a/src/compiler/lexer/lexer.c b/src/compiler/lexer/lexer.c new file mode 100644 index 0000000..35631f5 --- /dev/null +++ b/src/compiler/lexer/lexer.c @@ -0,0 +1,307 @@ +#include "lexer.h" + +#include +#include +#include +#include +#include + +const char *TOKEN_STRINGS[] = { + "TOKEN_NONE", + "TOKEN_IDENTIFIER", + "TOKEN_KEYWORD_PRINT", + "TOKEN_NUMBER", + "TOKEN_STRING", + "TOKEN_OPERATOR", + "TOKEN_OPERATOR_PARENTHESES_OPEN", + "TOKEN_OPERATOR_PARENTHESES_CLOSE", + "TOKEN_OPERATOR_CURLY_BRACKET_OPEN", + "TOKEN_OPERATOR_CURLY_BRACKET_CLOSE", + "TOKEN_OPERATOR_ASSIGN", + "TOKEN_OPERATOR_EQUAL", + "TOKEN_OPERATOR_COLON", + "TOKEN_OPERATOR_EOL", + "TOKEN_SYMBOL", + "TOKEN_PARSED", +}; + +static const char *KEYWORDS_STRINGS[] = { + "print", +}; +static const Token KEYWORDS_TOKENS[] = { + TOKEN_KEYWORD_PRINT, +}; +static const size_t KEYWORDS_SIZE = sizeof(KEYWORDS_STRINGS) / sizeof(char *); + +static const char *OPERATORS_STRINGS[] = { + "(", ")", "{", "}", "=", "==", ":", ";", +}; +static const Token OPERATORS_TOKENS[] = { + TOKEN_OPERATOR_PARENTHESES_OPEN, + TOKEN_OPERATOR_PARENTHESES_CLOSE, + TOKEN_OPERATOR_CURLY_BRACKET_OPEN, + TOKEN_OPERATOR_CURLY_BRACKET_CLOSE, + TOKEN_OPERATOR_ASSIGN, + TOKEN_OPERATOR_EQUAL, + TOKEN_OPERATOR_COLON, + TOKEN_OPERATOR_EOL, +}; +static const size_t OPERATOR_SIZE = sizeof(OPERATORS_STRINGS) / sizeof(char *); + +void printNodes(Nodes nodes) { + for (size_t i = 0; i < nodes.size; ++i) { + const Node node = nodes.nodes[i]; + printf("{'%.*s',%s}\n", (int)(node.strEnd - node.strBegin), node.strBegin, + TOKEN_STRINGS[node.token]); + } +} + +void deleteNodes(Nodes nodes) { free(nodes.nodes); } + +Nodes lexer(char const *restrict str) { + size_t nodes_size = 10; + Node *nodes = a404m_malloc(nodes_size * sizeof(Node)); + size_t nodes_inserted = 0; + + Node node = { + .strBegin = str, + .strEnd = str, + .token = TOKEN_NONE, + }; + + for (int i = 0;; ++i) { + const char c = str[i]; + if (c == '\0') { + push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, + TOKEN_NONE); + break; + } + + if (c == '/') { + const char follow = str[i + 1]; + if (follow == '/') { + push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, + TOKEN_NONE); + for (i += 2; str[i] != '\0' && str[i] != '\n'; ++i); + push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, + TOKEN_NONE); + if (str[i] == '\0') { + goto RETURN_SUCCESS; + } + continue; + } else if (follow == '*') { + push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, + TOKEN_NONE); + for (i += 2; str[i] != '\0' && str[i + 1] != '\0' && + (str[i] != '*' || str[i + 1] != '/'); + ++i); + if (str[i] == '\0' || str[i + 1] == '\0') { + perror("expected multi line comment to end\n"); + exit(1); + } + i += 1; + push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, + TOKEN_NONE); + if (str[i] == '\0') { + goto RETURN_SUCCESS; + } + continue; + } + } + if (isSpace(c)) { + push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, + TOKEN_NONE); + } else if (isIdentifier(c)) { + if (node.token != TOKEN_IDENTIFIER && node.token != TOKEN_SYMBOL) { + push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, + TOKEN_IDENTIFIER); + } + } else if (isIdentifierSymbol(c)) { + push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, + TOKEN_IDENTIFIER); + for (++i;; ++i) { + const char current = str[i]; + if (current == c) { + break; + } else if (current == '\0') { + fprintf(stderr, "expected %c to end\n", c); + exit(1); + } + } + ++node.strBegin; + push_clear_without_check(&nodes, &nodes_size, &nodes_inserted, &node, str, + i, TOKEN_NONE); + } else if (isNumber(c)) { + if (node.token != TOKEN_NUMBER && node.token != TOKEN_IDENTIFIER && + node.token != TOKEN_SYMBOL) { + push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, + TOKEN_NUMBER); + } + } else if (isString(c)) { + push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, + TOKEN_STRING); + + for (++i;; ++i) { + const char current = str[i]; + if (current == c) { + break; + } else if (current == '\0') { + fprintf(stderr, "expected %c to end\n", c); + exit(1); + } + } + + ++i; + push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, + TOKEN_NONE); + --i; + } else if (isOperator(c)) { + if (node.token == TOKEN_OPERATOR) { + const Token token = getOperator(node.strBegin, str + i + 1); + if (token != TOKEN_NONE) { + continue; + } else { + node.token = getOperator(node.strBegin, str + i); + if (node.token == TOKEN_NONE) { + fprintf(stderr, "unknown operator '%.*s'\n", + (int)(str + i - node.strBegin), node.strBegin); + exit(1); + } + push_clear_without_check(&nodes, &nodes_size, &nodes_inserted, &node, + str, i, TOKEN_OPERATOR); + } + } else { + push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, + TOKEN_OPERATOR); + } + } else if (isSymbol(c)) { + if (node.token != TOKEN_SYMBOL) { + push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, + TOKEN_SYMBOL); + } + } else { + fprintf(stderr, "unexpected char '%c'\n", c); + exit(1); + } + } + +RETURN_SUCCESS: + Nodes result = { + .nodes = a404m_realloc(nodes, nodes_inserted * sizeof(Node)), + .size = nodes_inserted, + }; + + return result; +} + +void push_if_not_empty(Node **restrict nodes, size_t *restrict nodes_size, + size_t *restrict nodes_inserted, Node *restrict node, + char const *restrict str, int i, Token token) { + if (node->token != TOKEN_NONE) { + if (*nodes_size == *nodes_inserted) { + *nodes_size += *nodes_size / 2 + 1; + *nodes = a404m_realloc(*nodes, *nodes_size * sizeof(Node)); + } + node->strEnd = str + i; + if (node->token == TOKEN_IDENTIFIER) { + const Token foundToken = getKeyword(node->strBegin, node->strEnd); + if (foundToken != TOKEN_NONE) { + node->token = foundToken; + } + } else if (node->token == TOKEN_OPERATOR) { + const Token foundToken = getOperator(node->strBegin, node->strEnd); + if (foundToken != TOKEN_NONE) { + node->token = foundToken; + } + } + + (*nodes)[*nodes_inserted] = *node; + ++*nodes_inserted; + } + node->strBegin = str + i; + node->token = token; +} + +void push_clear_without_check(Node **restrict nodes, + size_t *restrict nodes_size, + size_t *restrict nodes_inserted, Node *node, + char const *restrict str, int i, Token token) { + if (*nodes_size == *nodes_inserted) { + *nodes_size += *nodes_size / 2 + 1; + *nodes = a404m_realloc(*nodes, *nodes_size * sizeof(Node)); + } + node->strEnd = str + i; + (*nodes)[*nodes_inserted] = *node; + ++*nodes_inserted; + node->strBegin = str + i; + node->token = token; +} + +bool isSpace(char c) { return isspace(c); } + +bool isIdentifier(char c) { + return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || c == '_'; +} + +bool isIdentifierSymbol(char c) { return c == '`'; } + +bool isNumber(char c) { return '0' <= c && c <= '9'; } + +bool isString(char c) { return c == '"' || c == '\''; } + +bool isOperator(char c) { + switch (c) { + case '(': + case ')': + case '[': + case ']': + case '{': + case '}': + case '*': + case '/': + case '%': + case '+': + case '-': + case ':': + case '!': + case '=': + case '<': + case '>': + case '&': + case '|': + case '.': + case ',': + case ';': + return true; + default: + return false; + } +} + +bool isSymbol(char c) { return c == '#'; } + + +Token getKeyword(char const *strBegin, char const *strEnd) { + const size_t strSize = strEnd - strBegin; + + for (size_t i = 0; i < KEYWORDS_SIZE; ++i) { + const char *search = KEYWORDS_STRINGS[i]; + if (strlen(search) == strSize && strncmp(search, strBegin, strSize) == 0) { + return KEYWORDS_TOKENS[i]; + } + } + + return TOKEN_NONE; +} +Token getOperator(char const *strBegin, char const *strEnd) { + const size_t strSize = strEnd - strBegin; + + for (size_t i = 0; i < OPERATOR_SIZE; ++i) { + const char *search = OPERATORS_STRINGS[i]; + if (strlen(search) == strSize && strncmp(search, strBegin, strSize) == 0) { + return OPERATORS_TOKENS[i]; + } + } + + return TOKEN_NONE; +} -- cgit v1.2.3