aboutsummaryrefslogtreecommitdiff
path: root/src/compiler/lexer
diff options
context:
space:
mode:
authorA404M <ahmadmahmoudiprogrammer@gmail.com>2024-09-18 19:46:38 +0330
committerA404M <ahmadmahmoudiprogrammer@gmail.com>2024-09-18 19:57:20 +0330
commitd6ba30b94a24607bce5db5e706eb20cc051a98f0 (patch)
tree146e74b0bc2e1636451257015210e3c7d1a0ecab /src/compiler/lexer
initial commit
Diffstat (limited to 'src/compiler/lexer')
-rw-r--r--src/compiler/lexer/lexer.c307
-rw-r--r--src/compiler/lexer/lexer.h66
2 files changed, 373 insertions, 0 deletions
diff --git a/src/compiler/lexer/lexer.c b/src/compiler/lexer/lexer.c
new file mode 100644
index 0000000..35631f5
--- /dev/null
+++ b/src/compiler/lexer/lexer.c
@@ -0,0 +1,307 @@
+#include "lexer.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <utils/memory/memory.h>
+
+const char *TOKEN_STRINGS[] = {
+ "TOKEN_NONE",
+ "TOKEN_IDENTIFIER",
+ "TOKEN_KEYWORD_PRINT",
+ "TOKEN_NUMBER",
+ "TOKEN_STRING",
+ "TOKEN_OPERATOR",
+ "TOKEN_OPERATOR_PARENTHESES_OPEN",
+ "TOKEN_OPERATOR_PARENTHESES_CLOSE",
+ "TOKEN_OPERATOR_CURLY_BRACKET_OPEN",
+ "TOKEN_OPERATOR_CURLY_BRACKET_CLOSE",
+ "TOKEN_OPERATOR_ASSIGN",
+ "TOKEN_OPERATOR_EQUAL",
+ "TOKEN_OPERATOR_COLON",
+ "TOKEN_OPERATOR_EOL",
+ "TOKEN_SYMBOL",
+ "TOKEN_PARSED",
+};
+
+static const char *KEYWORDS_STRINGS[] = {
+ "print",
+};
+static const Token KEYWORDS_TOKENS[] = {
+ TOKEN_KEYWORD_PRINT,
+};
+static const size_t KEYWORDS_SIZE = sizeof(KEYWORDS_STRINGS) / sizeof(char *);
+
+static const char *OPERATORS_STRINGS[] = {
+ "(", ")", "{", "}", "=", "==", ":", ";",
+};
+static const Token OPERATORS_TOKENS[] = {
+ TOKEN_OPERATOR_PARENTHESES_OPEN,
+ TOKEN_OPERATOR_PARENTHESES_CLOSE,
+ TOKEN_OPERATOR_CURLY_BRACKET_OPEN,
+ TOKEN_OPERATOR_CURLY_BRACKET_CLOSE,
+ TOKEN_OPERATOR_ASSIGN,
+ TOKEN_OPERATOR_EQUAL,
+ TOKEN_OPERATOR_COLON,
+ TOKEN_OPERATOR_EOL,
+};
+static const size_t OPERATOR_SIZE = sizeof(OPERATORS_STRINGS) / sizeof(char *);
+
+void printNodes(Nodes nodes) {
+ for (size_t i = 0; i < nodes.size; ++i) {
+ const Node node = nodes.nodes[i];
+ printf("{'%.*s',%s}\n", (int)(node.strEnd - node.strBegin), node.strBegin,
+ TOKEN_STRINGS[node.token]);
+ }
+}
+
+void deleteNodes(Nodes nodes) { free(nodes.nodes); }
+
+Nodes lexer(char const *restrict str) {
+ size_t nodes_size = 10;
+ Node *nodes = a404m_malloc(nodes_size * sizeof(Node));
+ size_t nodes_inserted = 0;
+
+ Node node = {
+ .strBegin = str,
+ .strEnd = str,
+ .token = TOKEN_NONE,
+ };
+
+ for (int i = 0;; ++i) {
+ const char c = str[i];
+ if (c == '\0') {
+ push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i,
+ TOKEN_NONE);
+ break;
+ }
+
+ if (c == '/') {
+ const char follow = str[i + 1];
+ if (follow == '/') {
+ push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i,
+ TOKEN_NONE);
+ for (i += 2; str[i] != '\0' && str[i] != '\n'; ++i);
+ push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i,
+ TOKEN_NONE);
+ if (str[i] == '\0') {
+ goto RETURN_SUCCESS;
+ }
+ continue;
+ } else if (follow == '*') {
+ push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i,
+ TOKEN_NONE);
+ for (i += 2; str[i] != '\0' && str[i + 1] != '\0' &&
+ (str[i] != '*' || str[i + 1] != '/');
+ ++i);
+ if (str[i] == '\0' || str[i + 1] == '\0') {
+ perror("expected multi line comment to end\n");
+ exit(1);
+ }
+ i += 1;
+ push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i,
+ TOKEN_NONE);
+ if (str[i] == '\0') {
+ goto RETURN_SUCCESS;
+ }
+ continue;
+ }
+ }
+ if (isSpace(c)) {
+ push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i,
+ TOKEN_NONE);
+ } else if (isIdentifier(c)) {
+ if (node.token != TOKEN_IDENTIFIER && node.token != TOKEN_SYMBOL) {
+ push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i,
+ TOKEN_IDENTIFIER);
+ }
+ } else if (isIdentifierSymbol(c)) {
+ push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i,
+ TOKEN_IDENTIFIER);
+ for (++i;; ++i) {
+ const char current = str[i];
+ if (current == c) {
+ break;
+ } else if (current == '\0') {
+ fprintf(stderr, "expected %c to end\n", c);
+ exit(1);
+ }
+ }
+ ++node.strBegin;
+ push_clear_without_check(&nodes, &nodes_size, &nodes_inserted, &node, str,
+ i, TOKEN_NONE);
+ } else if (isNumber(c)) {
+ if (node.token != TOKEN_NUMBER && node.token != TOKEN_IDENTIFIER &&
+ node.token != TOKEN_SYMBOL) {
+ push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i,
+ TOKEN_NUMBER);
+ }
+ } else if (isString(c)) {
+ push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i,
+ TOKEN_STRING);
+
+ for (++i;; ++i) {
+ const char current = str[i];
+ if (current == c) {
+ break;
+ } else if (current == '\0') {
+ fprintf(stderr, "expected %c to end\n", c);
+ exit(1);
+ }
+ }
+
+ ++i;
+ push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i,
+ TOKEN_NONE);
+ --i;
+ } else if (isOperator(c)) {
+ if (node.token == TOKEN_OPERATOR) {
+ const Token token = getOperator(node.strBegin, str + i + 1);
+ if (token != TOKEN_NONE) {
+ continue;
+ } else {
+ node.token = getOperator(node.strBegin, str + i);
+ if (node.token == TOKEN_NONE) {
+ fprintf(stderr, "unknown operator '%.*s'\n",
+ (int)(str + i - node.strBegin), node.strBegin);
+ exit(1);
+ }
+ push_clear_without_check(&nodes, &nodes_size, &nodes_inserted, &node,
+ str, i, TOKEN_OPERATOR);
+ }
+ } else {
+ push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i,
+ TOKEN_OPERATOR);
+ }
+ } else if (isSymbol(c)) {
+ if (node.token != TOKEN_SYMBOL) {
+ push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i,
+ TOKEN_SYMBOL);
+ }
+ } else {
+ fprintf(stderr, "unexpected char '%c'\n", c);
+ exit(1);
+ }
+ }
+
+RETURN_SUCCESS:
+ Nodes result = {
+ .nodes = a404m_realloc(nodes, nodes_inserted * sizeof(Node)),
+ .size = nodes_inserted,
+ };
+
+ return result;
+}
+
+void push_if_not_empty(Node **restrict nodes, size_t *restrict nodes_size,
+ size_t *restrict nodes_inserted, Node *restrict node,
+ char const *restrict str, int i, Token token) {
+ if (node->token != TOKEN_NONE) {
+ if (*nodes_size == *nodes_inserted) {
+ *nodes_size += *nodes_size / 2 + 1;
+ *nodes = a404m_realloc(*nodes, *nodes_size * sizeof(Node));
+ }
+ node->strEnd = str + i;
+ if (node->token == TOKEN_IDENTIFIER) {
+ const Token foundToken = getKeyword(node->strBegin, node->strEnd);
+ if (foundToken != TOKEN_NONE) {
+ node->token = foundToken;
+ }
+ } else if (node->token == TOKEN_OPERATOR) {
+ const Token foundToken = getOperator(node->strBegin, node->strEnd);
+ if (foundToken != TOKEN_NONE) {
+ node->token = foundToken;
+ }
+ }
+
+ (*nodes)[*nodes_inserted] = *node;
+ ++*nodes_inserted;
+ }
+ node->strBegin = str + i;
+ node->token = token;
+}
+
+void push_clear_without_check(Node **restrict nodes,
+ size_t *restrict nodes_size,
+ size_t *restrict nodes_inserted, Node *node,
+ char const *restrict str, int i, Token token) {
+ if (*nodes_size == *nodes_inserted) {
+ *nodes_size += *nodes_size / 2 + 1;
+ *nodes = a404m_realloc(*nodes, *nodes_size * sizeof(Node));
+ }
+ node->strEnd = str + i;
+ (*nodes)[*nodes_inserted] = *node;
+ ++*nodes_inserted;
+ node->strBegin = str + i;
+ node->token = token;
+}
+
+bool isSpace(char c) { return isspace(c); }
+
+bool isIdentifier(char c) {
+ return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || c == '_';
+}
+
+bool isIdentifierSymbol(char c) { return c == '`'; }
+
+bool isNumber(char c) { return '0' <= c && c <= '9'; }
+
+bool isString(char c) { return c == '"' || c == '\''; }
+
+bool isOperator(char c) {
+ switch (c) {
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case '{':
+ case '}':
+ case '*':
+ case '/':
+ case '%':
+ case '+':
+ case '-':
+ case ':':
+ case '!':
+ case '=':
+ case '<':
+ case '>':
+ case '&':
+ case '|':
+ case '.':
+ case ',':
+ case ';':
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool isSymbol(char c) { return c == '#'; }
+
+
+Token getKeyword(char const *strBegin, char const *strEnd) {
+ const size_t strSize = strEnd - strBegin;
+
+ for (size_t i = 0; i < KEYWORDS_SIZE; ++i) {
+ const char *search = KEYWORDS_STRINGS[i];
+ if (strlen(search) == strSize && strncmp(search, strBegin, strSize) == 0) {
+ return KEYWORDS_TOKENS[i];
+ }
+ }
+
+ return TOKEN_NONE;
+}
+Token getOperator(char const *strBegin, char const *strEnd) {
+ const size_t strSize = strEnd - strBegin;
+
+ for (size_t i = 0; i < OPERATOR_SIZE; ++i) {
+ const char *search = OPERATORS_STRINGS[i];
+ if (strlen(search) == strSize && strncmp(search, strBegin, strSize) == 0) {
+ return OPERATORS_TOKENS[i];
+ }
+ }
+
+ return TOKEN_NONE;
+}
diff --git a/src/compiler/lexer/lexer.h b/src/compiler/lexer/lexer.h
new file mode 100644
index 0000000..7d60101
--- /dev/null
+++ b/src/compiler/lexer/lexer.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include <stddef.h>
+#include <utils/memory/memory.h>
+#include <utils/types.h>
+
+typedef enum Token {
+ TOKEN_NONE = 0,
+ TOKEN_IDENTIFIER,
+ TOKEN_KEYWORD_PRINT,
+ TOKEN_NUMBER,
+ TOKEN_STRING,
+ TOKEN_OPERATOR,
+ TOKEN_OPERATOR_PARENTHESES_OPEN,
+ TOKEN_OPERATOR_PARENTHESES_CLOSE,
+ TOKEN_OPERATOR_CURLY_BRACKET_OPEN,
+ TOKEN_OPERATOR_CURLY_BRACKET_CLOSE,
+ TOKEN_OPERATOR_ASSIGN,
+ TOKEN_OPERATOR_EQUAL,
+ TOKEN_OPERATOR_COLON,
+ TOKEN_OPERATOR_EOL,
+ TOKEN_SYMBOL,
+ TOKEN_PARSED,
+} Token;
+
+extern const char *TOKEN_STRINGS[];
+
+typedef struct Node {
+ char const *strBegin;
+ char const *strEnd;
+ Token token;
+ void *parsedNode;
+} Node;
+
+typedef struct Nodes {
+ Node *nodes;
+ size_t size;
+} Nodes;
+
+extern void printNodes(Nodes nodes);
+
+extern void deleteNodes(Nodes nodes);
+
+extern Nodes lexer(char const *restrict str);
+
+extern void push_if_not_empty(Node **restrict nodes,
+ size_t *restrict nodes_size,
+ size_t *restrict nodes_inserted,
+ Node *restrict node, char const *restrict str,
+ int i, Token token);
+extern void push_clear_without_check(Node **restrict nodes,
+ size_t *restrict nodes_size,
+ size_t *restrict nodes_inserted,
+ Node *node, char const *restrict str,
+ int i, Token token);
+
+extern bool isSpace(char c);
+extern bool isIdentifier(char c);
+extern bool isIdentifierSymbol(char c);
+extern bool isNumber(char c);
+extern bool isString(char c);
+extern bool isOperator(char c);
+extern bool isSymbol(char c);
+
+extern Token getKeyword(char const *strBegin, char const *strEnd);
+extern Token getOperator(char const *strBegin, char const *strEnd);