Writing a Simple Compiler: Lexer to Assembly

CompilersLow-Level

Compilers seem like magic, but they’re just programs that transform text. Let’s build a tiny compiler that turns simple math expressions into x86 assembly.

The Pipeline

Source Code → Lexer → Parser → Code Generator → Assembly
"2 + 3 * 4"   Tokens   AST      Instructions     "mov eax, 14"

Step 1: Lexer (Tokenization)

typedef enum {
    TOKEN_NUMBER,
    TOKEN_PLUS,
    TOKEN_MINUS,
    TOKEN_STAR,
    TOKEN_SLASH,
    TOKEN_EOF
} TokenType;

typedef struct {
    TokenType type;
    int value;  // For TOKEN_NUMBER
} Token;

Token next_token(const char** input) {
    while (**input == ' ') (*input)++;  // Skip whitespace
    
    if (isdigit(**input)) {
        Token tok = {TOKEN_NUMBER, 0};
        while (isdigit(**input)) {
            tok.value = tok.value * 10 + (**input - '0');
            (*input)++;
        }
        return tok;
    }
    
    switch (**input) {
        case '+': (*input)++; return (Token){TOKEN_PLUS, 0};
        case '-': (*input)++; return (Token){TOKEN_MINUS, 0};
        case '*': (*input)++; return (Token){TOKEN_STAR, 0};
        case '/': (*input)++; return (Token){TOKEN_SLASH, 0};
        case '\0': return (Token){TOKEN_EOF, 0};
        default: fprintf(stderr, "Unknown character\n"); exit(1);
    }
}

Step 2: Parser (Build AST)

typedef enum { AST_NUMBER, AST_BINOP } ASTType;

typedef struct AST {
    ASTType type;
    union {
        int number;
        struct {
            TokenType op;
            struct AST *left, *right;
        } binop;
    };
} AST;

// Recursive descent parser
AST* parse_expression();
AST* parse_term();
AST* parse_factor();

AST* parse_factor() {
    Token tok = next_token(&input);
    if (tok.type == TOKEN_NUMBER) {
        AST* node = malloc(sizeof(AST));
        node->type = AST_NUMBER;
        node->number = tok.value;
        return node;
    }
    error("Expected number");
}

AST* parse_term() {
    AST* left = parse_factor();
    
    while (peek() == TOKEN_STAR || peek() == TOKEN_SLASH) {
        Token op = next_token(&input);
        AST* right = parse_factor();
        
        AST* node = malloc(sizeof(AST));
        node->type = AST_BINOP;
        node->binop.op = op.type;
        node->binop.left = left;
        node->binop.right = right;
        left = node;
    }
    
    return left;
}

AST* parse_expression() {
    AST* left = parse_term();
    
    while (peek() == TOKEN_PLUS || peek() == TOKEN_MINUS) {
        Token op = next_token(&input);
        AST* right = parse_term();
        
        AST* node = malloc(sizeof(AST));
        node->type = AST_BINOP;
        node->binop.op = op.type;
        node->binop.left = left;
        node->binop.right = right;
        left = node;
    }
    
    return left;
}

Step 3: Code Generation

void generate(AST* node) {
    if (node->type == AST_NUMBER) {
        printf("  mov rax, %d\n", node->number);
        return;
    }
    
    // Generate left subtree
    generate(node->binop.left);
    printf("  push rax\n");  // Save result
    
    // Generate right subtree
    generate(node->binop.right);
    printf("  pop rbx\n");   // Restore left result
    
    // Perform operation
    switch (node->binop.op) {
        case TOKEN_PLUS:
            printf("  add rax, rbx\n");
            break;
        case TOKEN_MINUS:
            printf("  sub rbx, rax\n");
            printf("  mov rax, rbx\n");
            break;
        case TOKEN_STAR:
            printf("  imul rax, rbx\n");
            break;
        case TOKEN_SLASH:
            printf("  mov rdx, 0\n");
            printf("  mov rcx, rax\n");
            printf("  mov rax, rbx\n");
            printf("  idiv rcx\n");
            break;
    }
}

Complete Example

Input: 2 + 3 * 4

Generated Assembly:

  mov rax, 2
  push rax
  mov rax, 3
  push rax
  mov rax, 4
  pop rbx
  imul rax, rbx
  pop rbx
  add rax, rbx

Result: rax = 14

Testing

int main() {
    const char* input = "2 + 3 * 4";
    AST* ast = parse_expression();
    
    printf(".global _start\n");
    printf("_start:\n");
    generate(ast);
    printf("  mov rdi, rax\n");
    printf("  mov rax, 60\n");  // exit syscall
    printf("  syscall\n");
    
    return 0;
}

Compile and run:

./compiler > output.s
as output.s -o output.o
ld output.o -o output
./output
echo $?  # Prints 14

Conclusion

We built a compiler in ~200 lines of C that:

  • Tokenizes input
  • Parses expressions with correct precedence
  • Generates x86-64 assembly

Next Steps:

  • Add variables and assignment
  • Implement control flow (if/while)
  • Add functions and stack frames

Have you written a compiler? What language did you target?