#include "lexer.h"

#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// NOTE(review): the angle-bracket include targets of this file were lost (the
// directives had no header names left). The standard headers above are the
// ones this file's code demonstrably needs. The project header(s) that declare
// a404m_malloc / a404m_realloc / a404m_malloc_usable_size, printError and
// ERROR_SIZE must be restored here unless "lexer.h" already provides them.

/* Mnemonic of every token. fasmLexerTokenFromIdentifier() casts an index of
 * this table to FasmToken and fasmLinePrint() indexes it with a FasmToken, so
 * the order has to match the FasmToken enumeration. */
const char *FASM_TOKEN_STRINGS[] = {
    "NOOP",
    "PUSH8",
    "PUSH16",
    "PUSH32",
    "PUSH64",
    "LOAD8",
    "LOAD16",
    "LOAD32",
    "LOAD64",
    "POP8",
    "POP16",
    "POP32",
    "POP64",
    "DUP8",
    "DUP16",
    "DUP32",
    "DUP64",
    "SWAP8",
    "SWAP16",
    "SWAP32",
    "SWAP64",
    "DROP8",
    "DROP16",
    "DROP32",
    "DROP64",
    "ADD_I8",
    "ADD_I16",
    "ADD_I32",
    "ADD_I64",
    "ADD_F32",
    "ADD_F64",
    "SUB_I8",
    "SUB_I16",
    "SUB_I32",
    "SUB_I64",
    "SUB_F32",
    "SUB_F64",
    "NEG_I8",
    "NEG_I16",
    "NEG_I32",
    "NEG_I64",
    "NEG_F32",
    "NEG_F64",
    "MUL_I8",
    "MUL_I16",
    "MUL_I32",
    "MUL_I64",
    "MUL_U8",
    "MUL_U16",
    "MUL_U32",
    "MUL_U64",
    "MUL_F32",
    "MUL_F64",
    "DIV_I8",
    "DIV_I16",
    "DIV_I32",
    "DIV_I64",
    "DIV_U8",
    "DIV_U16",
    "DIV_U32",
    "DIV_U64",
    "DIV_F32",
    "DIV_F64",
    "REM_I8",
    "REM_I16",
    "REM_I32",
    "REM_I64",
    "REM_U8",
    "REM_U16",
    "REM_U32",
    "REM_U64",
    "CAST_I8_I64",
    "CAST_I16_I64",
    "CAST_I32_I64",
    "CAST_I64_I8",
    "CAST_I64_I16",
    "CAST_I64_I32",
    "CAST_F64_I64",
    "CAST_I64_F64",
    "CAST_U8_U64",
    "CAST_U16_U64",
    "CAST_U32_U64",
    "CAST_U64_U8",
    "CAST_U64_U16",
    "CAST_U64_U32",
    "CAST_F64_U64",
    "CAST_U64_F64",
    "CAST_F32_F64",
    "CAST_F64_F32",
    "JUMP",
    "JZ_I8",
    "JNZ_I8",
    "JN_I8",
    "JNN_I8",
    "JP_I8",
    "JNP_I8",
    "JZ_I16",
    "JNZ_I16",
    "JN_I16",
    "JNN_I16",
    "JP_I16",
    "JNP_I16",
    "JZ_I32",
    "JNZ_I32",
    "JN_I32",
    "JNN_I32",
    "JP_I32",
    "JNP_I32",
    "JZ_I64",
    "JNZ_I64",
    "JN_I64",
    "JNN_I64",
    "JP_I64",
    "JNP_I64",
    "JZ_F32",
    "JNZ_F32",
    "JN_F32",
    "JNN_F32",
    "JP_F32",
    "JNP_F32",
    "JZ_F64",
    "JNZ_F64",
    "JN_F64",
    "JNN_F64",
    "JP_F64",
    "JNP_F64",
    "ALLOC_HEAP",
    "ALLOC_STACK",
    "FREE_HEAP",
    "GET_STACK_ADDRESS",
    "GET_GLOBAL_ADDRESS",
    "CALL",
    "RET",
    "SYSCALL",
    "DEFINE_BYTE",
    "DEFINE_WORD",
    "DEFINE_DWORD",
    "DEFINE_QWORD",
    "NONE",
};

/* Printable names of the lexer states; nothing in this file reads the table
 * (it is kept for debugging output). */
const char *FASM_LINE_LOOKING_FOR_STRINGS[] = {
    "FASM_LINE_LOOKING_FOR_LABEL_OR_INSTRUCTION",
    "FASM_LINE_LOOKING_FOR_INSTRUCTION",
    "FASM_LINE_LOOKING_FOR_OPERAND",
    "FASM_LINE_LOOKING_FOR_OPERAND_OR_END",
    "FASM_LINE_LOOKING_FOR_COMMA_OR_END",
};

/* Number of entries in FASM_TOKEN_STRINGS. */
const size_t FASM_TOKEN_STRINGS_SIZE =
    sizeof(FASM_TOKEN_STRINGS) / sizeof(*FASM_TOKEN_STRINGS);

/**
 * Prints one lexed line (label, instruction mnemonic and operands) to stdout.
 *
 * A line without a label (labelBegin == NULL) is printed with an empty label.
 */
void fasmLinePrint(FasmLine line) {
  // Do not subtract or print NULL pointers: substitute an empty string.
  const bool hasLabel = line.labelBegin != NULL;
  const char *label = hasLabel ? line.labelBegin : "";
  const int labelLength =
      hasLabel ? (int)(line.labelEnd - line.labelBegin) : 0;

  printf("{label='%.*s',instruction='%s',operands=[\n", labelLength, label,
         FASM_TOKEN_STRINGS[line.instruction]);
  for (size_t i = 0; i < line.operands_size; ++i) {
    const FasmOperand operand = line.operands[i];
    printf(" {'%.*s'},\n", (int)(operand.end - operand.begin), operand.begin);
  }
  printf("]}\n");
}

/**
 * Prints every line of the code section and then of the data section to
 * stdout, each group preceded by a "section ..." header.
 */
void fasmLinesPrint(FasmLines lines) {
  printf("section code\n");
  for (size_t i = 0; i < lines.lines_size; ++i) {
    fasmLinePrint(lines.lines[i]);
  }
  printf("section data\n");
  for (size_t i = 0; i < lines.data_size; ++i) {
    fasmLinePrint(lines.data[i]);
  }
}

/** Frees the operand array owned by `line`. */
void fasmLineDeleteInner(FasmLine line) { free(line.operands); }

/**
 * Frees everything owned by `lines`: the operand array of every code and data
 * line, then the two line arrays themselves.
 */
void fasmLinesDeleteInner(FasmLines lines) {
  for (size_t i = 0; i < lines.lines_size; ++i) {
    fasmLineDeleteInner(lines.lines[i]);
  }
  for (size_t i = 0; i < lines.data_size; ++i) {
    fasmLineDeleteInner(lines.data[i]);
  }
  free(lines.lines);
  free(lines.data);
}

/**
 * Lexes every code unit of `sourceCode`.
 *
 * @return a newly allocated array with sourceCode->size elements (one
 *         FasmLines per code unit), or NULL when any unit fails to lex. On
 *         failure everything lexed so far is released.
 */
FasmLines *fasmLexer(SourceCode *sourceCode) {
  FasmLines *lines = a404m_malloc(sourceCode->size * sizeof(*lines));

  for (size_t i = 0; i < sourceCode->size; ++i) {
    lines[i] = fasmLexerCode(sourceCode->codes[i], sourceCode);
    if (lines[i].lines_size == ERROR_SIZE) {
      // The failed unit owns nothing; release the units lexed before it so
      // their line and operand arrays are not leaked.
      for (size_t j = 0; j < i; ++j) {
        fasmLinesDeleteInner(lines[j]);
      }
      free(lines);
      return NULL;
    }
  }

  return lines;
}

/**
 * Lexes one code unit into its code-section and data-section lines.
 *
 * The text is scanned character by character with a small state machine
 * (`lookingFor`). A line is `[label:] instruction [operand {, operand}]` and
 * ends at '\n' or at the terminating '\0'. A '.' at the start of a line
 * introduces a section switch (".code" or ".data").
 *
 * @return the lexed lines. On any error (reported through printError) every
 *         allocation made here is released and a FasmLines whose lines_size is
 *         ERROR_SIZE is returned, with all other members zeroed.
 */
FasmLines fasmLexerCode(Code *code, SourceCode *sourceCode) {
  FasmLineLookingFor lookingFor = FASM_LINE_LOOKING_FOR_LABEL_OR_INSTRUCTION;
  // Scratch line that is filled while scanning and copied into `lines` by
  // fasmLexerPushLine() when the line is complete.
  FasmLine line = {
      .begin = code->code,
      .end = code->code,
      .labelBegin = NULL,
      .labelEnd = NULL,
      .instruction = FASM_TOKEN_NONE,
      .operands = a404m_malloc(0),
      .operands_size = 0,
  };
  FasmLines lines = {
      .lines = a404m_malloc(0),
      .lines_size = 0,
      .data = a404m_malloc(0),
      .data_size = 0,
  };
  FasmSection section = FASM_SECTION_NONE;

  for (char *iter = code->code;; ++iter) {
  LOOP_BEGIN:;  // re-examine *iter without advancing
    const char c = *iter;
    if (c == '\0') {
      switch (lookingFor) {
        case FASM_LINE_LOOKING_FOR_INSTRUCTION:
          printError("Expected instruction", sourceCode, line.begin, iter);
          goto RETURN_ERROR;
        case FASM_LINE_LOOKING_FOR_OPERAND:
          printError("Expected operand", sourceCode, line.begin, iter);
          goto RETURN_ERROR;
        case FASM_LINE_LOOKING_FOR_OPERAND_OR_END:
        case FASM_LINE_LOOKING_FOR_COMMA_OR_END:
          // The last line has no trailing '\n': finish it here.
          if (!fasmLexerPushLine(&lines, &line, iter, section, sourceCode)) {
            goto RETURN_ERROR;
          }
          goto RETURN_SUCCESS;
        case FASM_LINE_LOOKING_FOR_LABEL_OR_INSTRUCTION:
          goto RETURN_SUCCESS;
      }
    } else if (fasmLexerIsSpace(c)) {
      continue;
    }

    switch (lookingFor) {
      case FASM_LINE_LOOKING_FOR_LABEL_OR_INSTRUCTION:
        if (fasmLexerIsLineSeparator(c)) {
          continue;  // empty line
        } else if (fasmLexerIsSectionIndicator(c)) {
          static const char *const SECTIONS_STRINGS[] = {
              "code",
              "data",
          };
          static const FasmSection SECTIONS[] = {
              FASM_SECTION_CODE,
              FASM_SECTION_DATA,
          };
          static const size_t SECTIONS_SIZE =
              sizeof(SECTIONS_STRINGS) / sizeof(*SECTIONS_STRINGS);

          ++iter;  // skip the section indicator
          for (size_t i = 0; i < SECTIONS_SIZE; ++i) {
            const char *const sectionStr = SECTIONS_STRINGS[i];
            for (size_t j = 0;; ++j) {
              const char c0 = sectionStr[j];
              const char c1 = iter[j];
              if (c0 == '\0') {
                // Whole name matched; it must not be a prefix of a longer
                // word.
                if (c1 == '\0' || isspace((unsigned char)c1)) {
                  iter += j;
                  section = SECTIONS[i];
                  goto LOOP_BEGIN;
                } else {
                  break;
                }
              } else if (c0 != c1) {
                // Also covers c1 == '\0', because c0 != '\0' here.
                break;
              }
            }
          }
          printError("Invalid section", sourceCode, iter - 1, iter);
          goto RETURN_ERROR;
        } else if (fasmLexerIsWord(c)) {
          char *begin = iter;
          char *end = iter = fasmLexerGetNextWord(iter);
          line.begin = begin;
          line.end = end;
          if (fasmLexerIsLabel(*iter)) {
            ++iter;  // skip the label terminator
            line.labelBegin = begin;
            line.labelEnd = end;
            lookingFor = FASM_LINE_LOOKING_FOR_INSTRUCTION;
          } else {
            if ((line.instruction = fasmLexerTokenFromIdentifier(
                     begin, end)) == FASM_TOKEN_NONE) {
              printError("Unknown instruction", sourceCode, begin, end);
              goto RETURN_ERROR;
            }
            lookingFor = FASM_LINE_LOOKING_FOR_OPERAND_OR_END;
          }
          goto LOOP_BEGIN;
        } else {
        UNEXPECTED:
          fasmLinePrint(line);
          printError("Unexpected character", sourceCode, iter, iter + 1);
          goto RETURN_ERROR;
        }
        break;
      case FASM_LINE_LOOKING_FOR_INSTRUCTION:
        if (fasmLexerIsWord(c)) {
          char *begin = iter;
          char *end = iter = fasmLexerGetNextWord(iter);
          line.end = end;
          if ((line.instruction = fasmLexerTokenFromIdentifier(begin, end)) ==
              FASM_TOKEN_NONE) {
            printError("Unknown instruction", sourceCode, begin, end);
            goto RETURN_ERROR;
          }
          lookingFor = FASM_LINE_LOOKING_FOR_OPERAND_OR_END;
          goto LOOP_BEGIN;
        } else {
          fprintf(stderr, "a404m %s:%d\n", __FILE_NAME__, __LINE__);
          goto UNEXPECTED;
        }
        break;
      case FASM_LINE_LOOKING_FOR_OPERAND:
        if (fasmLexerIsWord(c)) {
        LEX_OPERAND:;
          char *begin = iter;
          char *end = iter = fasmLexerGetNextWord(iter);
          // The capacity is derived from the allocation itself; grow by
          // roughly 1.5x when it is full.
          const size_t size = a404m_malloc_usable_size(line.operands) /
                              sizeof(*line.operands);
          if (line.operands_size == size) {
            line.operands = a404m_realloc(
                line.operands,
                (line.operands_size + line.operands_size / 2 + 1) *
                    sizeof(*line.operands));
          }
          line.operands[line.operands_size].begin = begin;
          line.operands[line.operands_size].end = end;
          line.operands_size += 1;
          line.end = end;
          lookingFor = FASM_LINE_LOOKING_FOR_COMMA_OR_END;
          goto LOOP_BEGIN;
        } else {
          fprintf(stderr, "a404m %s:%d\n", __FILE_NAME__, __LINE__);
          goto UNEXPECTED;
        }
        break;
      case FASM_LINE_LOOKING_FOR_OPERAND_OR_END:
        if (fasmLexerIsWord(c)) {
          goto LEX_OPERAND;
        } else if (fasmLexerIsLineSeparator(c)) {
          goto LEX_END;
        } else {
          fprintf(stderr, "a404m %s:%d\n", __FILE_NAME__, __LINE__);
          goto UNEXPECTED;
        }
        break;
      case FASM_LINE_LOOKING_FOR_COMMA_OR_END:
        if (fasmLexerIsLineSeparator(c)) {
        LEX_END:
          // A rejected line has already been reported; stop instead of
          // silently dropping it and reporting success.
          if (!fasmLexerPushLine(&lines, &line, iter, section, sourceCode)) {
            goto RETURN_ERROR;
          }
          lookingFor = FASM_LINE_LOOKING_FOR_LABEL_OR_INSTRUCTION;
        } else if (fasmLexerIsOperandSeparator(c)) {
          lookingFor = FASM_LINE_LOOKING_FOR_OPERAND;
        } else {
          fprintf(stderr, "a404m %s:%d\n", __FILE_NAME__, __LINE__);
          goto UNEXPECTED;
        }
        break;
    }
  }

RETURN_SUCCESS:
  // The scratch operand array is never shared with a pushed line (pushing
  // replaces it with a fresh allocation), so it must be released here.
  free(line.operands);
  return lines;

RETURN_ERROR:
  free(line.operands);
  fasmLinesDeleteInner(lines);
  const FasmLines error = {
      .lines_size = ERROR_SIZE,
  };
  return error;
}

/**
 * Validates the finished `line` and appends it to the array of `lines` that
 * belongs to `section`.
 *
 * On success the operand array is shrunk to its exact size, ownership of it
 * moves into `lines`, and `*line` is reset for the next line.
 *
 * @return true when the line was appended; false when it was rejected (the
 *         reason has been reported) and `*line` still owns its operands.
 */
bool fasmLexerPushLine(FasmLines *lines, FasmLine *line, char const *iter,
                       FasmSection section, SourceCode *sourceCode) {
  if (!fasmLexerIsAllowed(*line, section)) {
    printError("Instruction is not allowed here", sourceCode, line->begin,
               line->end);
    return false;
  }

  line->operands = a404m_realloc(
      line->operands, line->operands_size * sizeof(*line->operands));

  switch (section) {
    case FASM_SECTION_NONE:
      printError("Instruction is in no section", sourceCode, line->begin,
                 line->end);
      // Nothing was appended, so this is a failure like any other.
      return false;
    case FASM_SECTION_CODE:
      _fasmLexerPushLine(&lines->lines, &lines->lines_size, line, iter);
      return true;
    case FASM_SECTION_DATA:
      _fasmLexerPushLine(&lines->data, &lines->data_size, line, iter);
      return true;
  }

  fprintf(stderr, "Bad section '%d'\n", section);
  return false;
}

/**
 * Appends a copy of `*line` to the growable array `*lines` (growing it when
 * full) and resets `*line` with a fresh, empty operand array and no label.
 *
 * `iter` is unused.
 */
void _fasmLexerPushLine(FasmLine **lines, size_t *lines_size, FasmLine *line,
                        char const *iter) {
  (void)iter;

  const size_t size = a404m_malloc_usable_size(*lines) / sizeof(**lines);
  if (size == *lines_size) {
    *lines = a404m_realloc(*lines, (size * 2 + 1) * sizeof(**lines));
  }

  (*lines)[*lines_size] = *line;
  *lines_size += 1;

  // begin, end and instruction are deliberately left untouched: the lexer
  // overwrites them when it starts the next line.
  line->operands = a404m_malloc(0);
  line->operands_size = 0;
  line->labelBegin = NULL;
  line->labelEnd = NULL;
}

/**
 * Tells whether the instruction of `line` may appear in `section`.
 *
 * DEFINE_* tokens are only allowed in the data section, every other real
 * instruction only in the code section, and FASM_TOKEN_NONE nowhere. A value
 * outside the handled tokens is reported on stderr and terminates the process.
 */
bool fasmLexerIsAllowed(FasmLine line, FasmSection section) {
  switch (line.instruction) {
    case FASM_TOKEN_NOOP:
    case FASM_TOKEN_PUSH8:
    case FASM_TOKEN_PUSH16:
    case FASM_TOKEN_PUSH32:
    case FASM_TOKEN_PUSH64:
    case FASM_TOKEN_LOAD8:
    case FASM_TOKEN_LOAD16:
    case FASM_TOKEN_LOAD32:
    case FASM_TOKEN_LOAD64:
    case FASM_TOKEN_POP8:
    case FASM_TOKEN_POP16:
    case FASM_TOKEN_POP32:
    case FASM_TOKEN_POP64:
    case FASM_TOKEN_DUP8:
    case FASM_TOKEN_DUP16:
    case FASM_TOKEN_DUP32:
    case FASM_TOKEN_DUP64:
    case FASM_TOKEN_SWAP8:
    case FASM_TOKEN_SWAP16:
    case FASM_TOKEN_SWAP32:
    case FASM_TOKEN_SWAP64:
    case FASM_TOKEN_DROP8:
    case FASM_TOKEN_DROP16:
    case FASM_TOKEN_DROP32:
    case FASM_TOKEN_DROP64:
    case FASM_TOKEN_ADD_I8:
    case FASM_TOKEN_ADD_I16:
    case FASM_TOKEN_ADD_I32:
    case FASM_TOKEN_ADD_I64:
    case FASM_TOKEN_ADD_F32:
    case FASM_TOKEN_ADD_F64:
    case FASM_TOKEN_SUB_I8:
    case FASM_TOKEN_SUB_I16:
    case FASM_TOKEN_SUB_I32:
    case FASM_TOKEN_SUB_I64:
    case FASM_TOKEN_SUB_F32:
    case FASM_TOKEN_SUB_F64:
    case FASM_TOKEN_NEG_I8:
    case FASM_TOKEN_NEG_I16:
    case FASM_TOKEN_NEG_I32:
    case FASM_TOKEN_NEG_I64:
    case FASM_TOKEN_NEG_F32:
    case FASM_TOKEN_NEG_F64:
    case FASM_TOKEN_MUL_I8:
    case FASM_TOKEN_MUL_I16:
    case FASM_TOKEN_MUL_I32:
    case FASM_TOKEN_MUL_I64:
    case FASM_TOKEN_MUL_U8:
    case FASM_TOKEN_MUL_U16:
    case FASM_TOKEN_MUL_U32:
    case FASM_TOKEN_MUL_U64:
    case FASM_TOKEN_MUL_F32:
    case FASM_TOKEN_MUL_F64:
    case FASM_TOKEN_DIV_I8:
    case FASM_TOKEN_DIV_I16:
    case FASM_TOKEN_DIV_I32:
    case FASM_TOKEN_DIV_I64:
    case FASM_TOKEN_DIV_U8:
    case FASM_TOKEN_DIV_U16:
    case FASM_TOKEN_DIV_U32:
    case FASM_TOKEN_DIV_U64:
    case FASM_TOKEN_DIV_F32:
    case FASM_TOKEN_DIV_F64:
    case FASM_TOKEN_REM_I8:
    case FASM_TOKEN_REM_I16:
    case FASM_TOKEN_REM_I32:
    case FASM_TOKEN_REM_I64:
    case FASM_TOKEN_REM_U8:
    case FASM_TOKEN_REM_U16:
    case FASM_TOKEN_REM_U32:
    case FASM_TOKEN_REM_U64:
    case FASM_TOKEN_CAST_I8_I64:
    case FASM_TOKEN_CAST_I16_I64:
    case FASM_TOKEN_CAST_I32_I64:
    case FASM_TOKEN_CAST_I64_I8:
    case FASM_TOKEN_CAST_I64_I16:
    case FASM_TOKEN_CAST_I64_I32:
    case FASM_TOKEN_CAST_F64_I64:
    case FASM_TOKEN_CAST_I64_F64:
    case FASM_TOKEN_CAST_U8_U64:
    case FASM_TOKEN_CAST_U16_U64:
    case FASM_TOKEN_CAST_U32_U64:
    case FASM_TOKEN_CAST_U64_U8:
    case FASM_TOKEN_CAST_U64_U16:
    case FASM_TOKEN_CAST_U64_U32:
    case FASM_TOKEN_CAST_F64_U64:
    case FASM_TOKEN_CAST_U64_F64:
    case FASM_TOKEN_CAST_F32_F64:
    case FASM_TOKEN_CAST_F64_F32:
    case FASM_TOKEN_JUMP:
    case FASM_TOKEN_JZ_I8:
    case FASM_TOKEN_JNZ_I8:
    case FASM_TOKEN_JN_I8:
    case FASM_TOKEN_JNN_I8:
    case FASM_TOKEN_JP_I8:
    case FASM_TOKEN_JNP_I8:
    case FASM_TOKEN_JZ_I16:
    case FASM_TOKEN_JNZ_I16:
    case FASM_TOKEN_JN_I16:
    case FASM_TOKEN_JNN_I16:
    case FASM_TOKEN_JP_I16:
    case FASM_TOKEN_JNP_I16:
    case FASM_TOKEN_JZ_I32:
    case FASM_TOKEN_JNZ_I32:
    case FASM_TOKEN_JN_I32:
    case FASM_TOKEN_JNN_I32:
    case FASM_TOKEN_JP_I32:
    case FASM_TOKEN_JNP_I32:
    case FASM_TOKEN_JZ_I64:
    case FASM_TOKEN_JNZ_I64:
    case FASM_TOKEN_JN_I64:
    case FASM_TOKEN_JNN_I64:
    case FASM_TOKEN_JP_I64:
    case FASM_TOKEN_JNP_I64:
    case FASM_TOKEN_JZ_F32:
    case FASM_TOKEN_JNZ_F32:
    case FASM_TOKEN_JN_F32:
    case FASM_TOKEN_JNN_F32:
    case FASM_TOKEN_JP_F32:
    case FASM_TOKEN_JNP_F32:
    case FASM_TOKEN_JZ_F64:
    case FASM_TOKEN_JNZ_F64:
    case FASM_TOKEN_JN_F64:
    case FASM_TOKEN_JNN_F64:
    case FASM_TOKEN_JP_F64:
    case FASM_TOKEN_JNP_F64:
    case FASM_TOKEN_ALLOC_HEAP:
    case FASM_TOKEN_ALLOC_STACK:
    case FASM_TOKEN_FREE_HEAP:
    case FASM_TOKEN_GET_STACK_ADDRESS:
    case FASM_TOKEN_GET_GLOBAL_ADDRESS:
    case FASM_TOKEN_CALL:
    case FASM_TOKEN_RET:
    case FASM_TOKEN_SYSCALL:
      return section == FASM_SECTION_CODE;
    case FASM_TOKEN_DEFINE_BYTE:
    case FASM_TOKEN_DEFINE_WORD:
    case FASM_TOKEN_DEFINE_DWORD:
    case FASM_TOKEN_DEFINE_QWORD:
      return section == FASM_SECTION_DATA;
    case FASM_TOKEN_NONE:
      return false;
  }
  fprintf(stderr, "Bad token %d at %s:%d\n", line.instruction, __FILE_NAME__,
          __LINE__);
  exit(1);
}

/**
 * Returns a pointer just past the word that starts at `iter`.
 *
 * When the word starts with a quote character it is a string and extends up to
 * and including the next occurrence of the same quote; an unterminated string
 * is reported on stderr and terminates the process. Otherwise the word extends
 * over the following run of word characters.
 */
char *fasmLexerGetNextWord(char *iter) {
  if (fasmLexerIsString(*iter)) {
    const char begin = *iter;
    for (++iter; *iter != begin; ++iter) {
      if (*iter == '\0') {
        fprintf(stderr, "No ending for string at %s:%d\n", __FILE_NAME__,
                __LINE__);
        exit(1);
      }
    }
    ++iter;  // include the closing quote
  } else {
    for (++iter; *iter != '\0' && fasmLexerIsWord(*iter); ++iter) {
    }
  }
  return iter;
}

/**
 * Looks up the identifier [begin, end) in FASM_TOKEN_STRINGS, ignoring case.
 *
 * @return the matching token, or FASM_TOKEN_NONE when there is no match.
 */
FasmToken fasmLexerTokenFromIdentifier(char *begin, char *end) {
  const size_t size = (size_t)(end - begin);
  char *uppered = a404m_malloc((size + 1) * sizeof(*uppered));
  for (size_t i = 0; i < size; ++i) {
    // <ctype.h> functions require a value representable as unsigned char.
    uppered[i] = (char)toupper((unsigned char)begin[i]);
  }
  uppered[size] = '\0';

  FasmToken token = FASM_TOKEN_NONE;
  for (size_t i = 0; i < FASM_TOKEN_STRINGS_SIZE; ++i) {
    if (strcmp(uppered, FASM_TOKEN_STRINGS[i]) == 0) {
      token = (FasmToken)i;
      break;
    }
  }

  free(uppered);
  return token;
}

/**
 * Tells whether `c` is white space other than '\n' (which separates lines and
 * is therefore significant).
 */
bool fasmLexerIsSpace(char c) {
  // <ctype.h> functions require a value representable as unsigned char.
  return c != '\n' && isspace((unsigned char)c);
}
/** Tells whether `c` introduces a section switch ('.'). */
bool fasmLexerIsSectionIndicator(char c) {
  if (c == '.') {
    return true;
  }
  return false;
}

/** Tells whether `c` terminates a label (':'). */
bool fasmLexerIsLabel(char c) {
  if (c == ':') {
    return true;
  }
  return false;
}

/**
 * Tells whether `c` can be part of a word: an ASCII letter, a decimal digit,
 * an underscore or a quote character.
 */
bool fasmLexerIsWord(char c) {
  if (c >= 'A' && c <= 'Z') {
    return true;
  }
  if (c >= 'a' && c <= 'z') {
    return true;
  }
  if (c >= '0' && c <= '9') {
    return true;
  }
  if (c == '_') {
    return true;
  }
  return fasmLexerIsString(c);
}

/** Tells whether `c` is the identifier symbol ('`'). */
extern bool fasmLexerIsIdentifierSymbol(char c) {
  if (c == '`') {
    return true;
  }
  return false;
}

/** Tells whether `c` is a quote character (single or double). */
bool fasmLexerIsString(char c) {
  switch (c) {
    case '\'':
    case '\"':
      return true;
    default:
      return false;
  }
}

/** Tells whether `c` separates two operands (','). */
bool fasmLexerIsOperandSeparator(char c) {
  if (c == ',') {
    return true;
  }
  return false;
}

/** Tells whether `c` ends a line ('\n'). */
bool fasmLexerIsLineSeparator(char c) {
  if (c == '\n') {
    return true;
  }
  return false;
}