#include "lexer.h" #include #include #include #include #include const char *TOKEN_STRINGS[] = { "TOKEN_NONE", "TOKEN_IDENTIFIER", "TOKEN_KEYWORD_PRINT", "TOKEN_NUMBER", "TOKEN_STRING", "TOKEN_OPERATOR", "TOKEN_OPERATOR_PARENTHESES_OPEN", "TOKEN_OPERATOR_PARENTHESES_CLOSE", "TOKEN_OPERATOR_CURLY_BRACKET_OPEN", "TOKEN_OPERATOR_CURLY_BRACKET_CLOSE", "TOKEN_OPERATOR_ASSIGN", "TOKEN_OPERATOR_EQUAL", "TOKEN_OPERATOR_COLON", "TOKEN_OPERATOR_EOL", "TOKEN_SYMBOL", "TOKEN_PARSED", }; static const char *KEYWORDS_STRINGS[] = { "print", }; static const Token KEYWORDS_TOKENS[] = { TOKEN_KEYWORD_PRINT, }; static const size_t KEYWORDS_SIZE = sizeof(KEYWORDS_STRINGS) / sizeof(char *); static const char *OPERATORS_STRINGS[] = { "(", ")", "{", "}", "=", "==", ":", ";", }; static const Token OPERATORS_TOKENS[] = { TOKEN_OPERATOR_PARENTHESES_OPEN, TOKEN_OPERATOR_PARENTHESES_CLOSE, TOKEN_OPERATOR_CURLY_BRACKET_OPEN, TOKEN_OPERATOR_CURLY_BRACKET_CLOSE, TOKEN_OPERATOR_ASSIGN, TOKEN_OPERATOR_EQUAL, TOKEN_OPERATOR_COLON, TOKEN_OPERATOR_EOL, }; static const size_t OPERATOR_SIZE = sizeof(OPERATORS_STRINGS) / sizeof(char *); void printNodes(Nodes nodes) { for (size_t i = 0; i < nodes.size; ++i) { const Node node = nodes.nodes[i]; printf("{'%.*s',%s}\n", (int)(node.strEnd - node.strBegin), node.strBegin, TOKEN_STRINGS[node.token]); } } void deleteNodes(Nodes nodes) { free(nodes.nodes); } Nodes lexer(char const *restrict str) { size_t nodes_size = 10; Node *nodes = a404m_malloc(nodes_size * sizeof(Node)); size_t nodes_inserted = 0; Node node = { .strBegin = str, .strEnd = str, .token = TOKEN_NONE, }; for (int i = 0;; ++i) { const char c = str[i]; if (c == '\0') { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_NONE); break; } else if (c == '/') { const char follow = str[i + 1]; if (follow == '/') { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_NONE); for (i += 2; str[i] != '\0' && str[i] != '\n'; ++i); push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_NONE); if (str[i] == '\0') { goto RETURN_SUCCESS; } continue; } else if (follow == '*') { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_NONE); int in = 1; for (i += 2;; ++i) { switch (str[i]) { case '\0': fprintf(stderr, "expected multi line comment to end at compiler line %d " "and in=%d\n", __LINE__, in); exit(1); case '*': ++i; if (str[i] == '/') { --in; if (in == 0) { goto END_OF_BLOCK_COMMENT_LOOP; } } else if (str[i] == '\0') { fprintf(stderr, "expected multi line comment to end at compiler line " "%d and in=%d\n", __LINE__, in); exit(1); } break; case '/': ++i; if (str[i] == '*') { ++in; } else if (str[i] == '\0') { fprintf(stderr, "expected multi line comment to end at compiler line " "%d and in=%d\n", __LINE__, in); exit(1); } break; } } END_OF_BLOCK_COMMENT_LOOP: push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_NONE); if (str[i] == '\0') { goto RETURN_SUCCESS; } continue; } } if (isSpace(c)) { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_NONE); } else if (isIdentifier(c)) { if (node.token != TOKEN_IDENTIFIER && node.token != TOKEN_SYMBOL) { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_IDENTIFIER); } } else if (isIdentifierSymbol(c)) { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_IDENTIFIER); for (++i;; ++i) { const char current = str[i]; if (current == c) { break; } else if (current == '\0') { fprintf(stderr, "expected %c to end\n", c); exit(1); } } ++node.strBegin; push_clear_without_check(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_NONE); } else if (isNumber(c)) { if (node.token != TOKEN_NUMBER && node.token != TOKEN_IDENTIFIER && node.token != TOKEN_SYMBOL) { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_NUMBER); } } else if (isString(c)) { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_STRING); for (++i;; ++i) { const char current = str[i]; if (current == c) { break; }else if(current == '\\'){ ++i; } else if (current == '\0') { fprintf(stderr, "expected %c to end\n", c); exit(1); } } ++i; push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_NONE); --i; } else if (isOperator(c)) { if (node.token == TOKEN_OPERATOR) { const Token token = getOperator(node.strBegin, str + i + 1); if (token != TOKEN_NONE) { continue; } else { node.token = getOperator(node.strBegin, str + i); if (node.token == TOKEN_NONE) { fprintf(stderr, "unknown operator '%.*s'\n", (int)(str + i - node.strBegin), node.strBegin); exit(1); } push_clear_without_check(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_OPERATOR); } } else { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_OPERATOR); } } else if (isSymbol(c)) { if (node.token != TOKEN_SYMBOL) { push_if_not_empty(&nodes, &nodes_size, &nodes_inserted, &node, str, i, TOKEN_SYMBOL); } } else { fprintf(stderr, "unexpected char '%c'\n", c); exit(1); } } RETURN_SUCCESS: Nodes result = { .nodes = a404m_realloc(nodes, nodes_inserted * sizeof(Node)), .size = nodes_inserted, }; return result; } void push_if_not_empty(Node **restrict nodes, size_t *restrict nodes_size, size_t *restrict nodes_inserted, Node *restrict node, char const *restrict str, int i, Token token) { if (node->token != TOKEN_NONE) { if (*nodes_size == *nodes_inserted) { *nodes_size += *nodes_size / 2 + 1; *nodes = a404m_realloc(*nodes, *nodes_size * sizeof(Node)); } node->strEnd = str + i; if (node->token == TOKEN_IDENTIFIER) { const Token foundToken = getKeyword(node->strBegin, node->strEnd); if (foundToken != TOKEN_NONE) { node->token = foundToken; } } else if (node->token == TOKEN_OPERATOR) { const Token foundToken = getOperator(node->strBegin, node->strEnd); if (foundToken != TOKEN_NONE) { node->token = foundToken; } } (*nodes)[*nodes_inserted] = *node; ++*nodes_inserted; } node->strBegin = str + i; node->token = token; } void push_clear_without_check(Node **restrict nodes, size_t *restrict nodes_size, size_t *restrict nodes_inserted, Node *node, char const *restrict str, int i, Token token) { if (*nodes_size == *nodes_inserted) { *nodes_size += *nodes_size / 2 + 1; *nodes = a404m_realloc(*nodes, *nodes_size * sizeof(Node)); } node->strEnd = str + i; (*nodes)[*nodes_inserted] = *node; ++*nodes_inserted; node->strBegin = str + i; node->token = token; } bool isSpace(char c) { return isspace(c); } bool isIdentifier(char c) { return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || c == '_'; } bool isIdentifierSymbol(char c) { return c == '`'; } bool isNumber(char c) { return '0' <= c && c <= '9'; } bool isString(char c) { return c == '"' || c == '\''; } bool isOperator(char c) { switch (c) { case '(': case ')': case '[': case ']': case '{': case '}': case '*': case '/': case '%': case '+': case '-': case ':': case '!': case '=': case '<': case '>': case '&': case '|': case '.': case ',': case ';': return true; default: return false; } } bool isSymbol(char c) { return c == '#'; } Token getKeyword(char const *strBegin, char const *strEnd) { const size_t strSize = strEnd - strBegin; for (size_t i = 0; i < KEYWORDS_SIZE; ++i) { const char *search = KEYWORDS_STRINGS[i]; if (strlen(search) == strSize && strncmp(search, strBegin, strSize) == 0) { return KEYWORDS_TOKENS[i]; } } return TOKEN_NONE; } Token getOperator(char const *strBegin, char const *strEnd) { const size_t strSize = strEnd - strBegin; for (size_t i = 0; i < OPERATOR_SIZE; ++i) { const char *search = OPERATORS_STRINGS[i]; if (strlen(search) == strSize && strncmp(search, strBegin, strSize) == 0) { return OPERATORS_TOKENS[i]; } } return TOKEN_NONE; }