add tokenizer

This commit is contained in:
AcerecA 2024-08-10 15:18:09 +02:00
parent 78b6dfbcee
commit 6110b248d1
2 changed files with 109 additions and 18 deletions

View File

@@ -1,24 +1,18 @@

New contents of src/main.zig (replaces the default `zig init` scaffold — the
hello-world stderr/stdout prints, the buffered-writer example, and the
`test "simple test"` block were all removed):

const std = @import("std");
const tkz = @import("tokenize.zig");

pub fn main() !void {
    var file = try std.fs.cwd().openFile("data/example.il", .{});
    defer file.close();
    const content = try file.readToEndAlloc(std.heap.page_allocator, 4096 * ((1 << 10) << 10));

    const toks = try tkz.tokenizeContent(content);
    for (toks.items) |tok| {
        std.debug.print("{}:{} `{s}`\n", .{
            tok.line,
            tok.char,
            tok.value,
        });
    }
}

97
src/tokenize.zig Normal file
View File

@@ -0,0 +1,97 @@
const std = @import("std");
/// A single lexical token sliced out of the source buffer.
const Token = struct {
    /// 0-based byte index of token start in the whole file.
    start: usize,
    /// 1-based line number the token starts at.
    line: usize,
    /// 1-based char number the token starts at in its line.
    char: usize,
    /// Slice aliasing the original content buffer (not owned by the token).
    value: []const u8,
};

/// Returned when an `@`-keyword other than `@keys` / `@rest` is encountered.
const TokenizationError = error{InvalidKeyword};

/// Splits `content` into tokens: identifiers, numbers, string literals,
/// `;`-to-end-of-line comments (kept as tokens), one/two-char operators and
/// `@`-keywords. Bytes not matching any class (whitespace etc.) are skipped.
/// Token `value` slices alias `content`, so the buffer must outlive the
/// returned list. Caller owns the returned list and must `deinit()` it.
/// Returns `error.InvalidKeyword` for an `@`-word other than `@keys`/`@rest`.
pub fn tokenizeContent(content: []const u8) !std.ArrayList(Token) {
    var toks = std.ArrayList(Token).init(std.heap.page_allocator);
    errdefer toks.deinit();
    // Byte indices of every '\n' seen so far; used to derive line/char.
    var lines = std.ArrayList(usize).init(std.heap.page_allocator);
    defer lines.deinit(); // scratch data — must not leak past this call

    var index: usize = 0;
    while (index < content.len) {
        var l: usize = 1; // token length in bytes
        const char = content[index];
        switch (char) {
            '\n' => {
                try lines.append(index);
                index += l;
                continue;
            },
            // Comment: everything up to (not including) end of line / EOF.
            ';' => {
                while (index + l < content.len and content[index + l] != '\n') l += 1;
            },
            // String literal: runs to the next unescaped '"'.
            // NOTE(review): a `\\` escaped-backslash before the closing quote
            // still reads as an escape here — confirm against the language spec.
            '"' => {
                while (index + l < content.len and
                    (content[index + l] != '"' or content[index + l - 1] == '\\')) l += 1;
                if (index + l < content.len) l += 1; // include the closing quote
            },
            // Identifier: letter/underscore, then letters/digits/underscores.
            'a'...'z', 'A'...'Z', '_' => {
                while (index + l < content.len) : (l += 1) {
                    switch (content[index + l]) {
                        'a'...'z', 'A'...'Z', '0'...'9', '_' => {},
                        else => break,
                    }
                }
            },
            // Number: digits, '.' and 'e' (exponent) accepted after the first digit.
            '0'...'9' => {
                while (index + l < content.len) : (l += 1) {
                    switch (content[index + l]) {
                        '0'...'9', '.', 'e' => {},
                        else => break,
                    }
                }
            },
            // Operator: prefer a known two-char form, else the single char.
            '+', '-', '~', '*', '/', '%', '<', '>', '=', '?', '|', '&', '(', ')', '\'' => {
                if (index + 2 <= content.len) {
                    for ([_]*const [2]u8{ "->", "~>", "||", "&&", "/=", "*=", "+=", "-=", "'(" }) |op| {
                        if (std.mem.eql(u8, op, content[index .. index + 2])) {
                            l = 2;
                            break;
                        }
                    }
                }
            },
            // @-keyword: only `@keys` and `@rest` are valid.
            '@' => {
                while (index + l < content.len) : (l += 1) {
                    switch (content[index + l]) {
                        'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                        else => break,
                    }
                }
                const word = content[index .. index + l];
                if (!std.mem.eql(u8, "@keys", word) and !std.mem.eql(u8, "@rest", word)) {
                    std.debug.print("invalid keyword `{s}` at line={d}, char={d}\n", .{
                        word,
                        lines.items.len + 1,
                        charInLine(index, lines.items),
                    });
                    return TokenizationError.InvalidKeyword;
                }
            },
            // Anything else (whitespace, unknown bytes) produces no token.
            else => {
                index += l;
                continue;
            },
        }
        try toks.append(.{
            .start = index,
            .value = content[index .. index + l],
            .line = lines.items.len + 1,
            .char = charInLine(index, lines.items),
        });
        index += l;
    }
    return toks;
}

/// 1-based character position of byte `index` within its line, given the
/// byte indices of all newlines preceding it.
fn charInLine(index: usize, newlines: []const usize) usize {
    return switch (newlines.len) {
        // First line: convert the 0-based byte index to a 1-based column.
        0 => index + 1,
        // Later lines: distance from the previous '\n' is already 1-based.
        else => index - newlines[newlines.len - 1],
    };
}