diff --git a/src/main.zig b/src/main.zig
index c8a3f67..e5ba2df 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,24 +1,23 @@
 const std = @import("std");
+const tkz = @import("tokenize.zig");
 
+/// Tokenizes `data/example.il` and prints every token to stderr as
+/// line:char followed by the token text.
 pub fn main() !void {
-    // Prints to stderr (it's a shortcut based on `std.io.getStdErr()`)
-    std.debug.print("All your {s} are belong to us.\n", .{"codebase"});
-
-    // stdout is for the actual output of your application, for example if you
-    // are implementing gzip, then only the compressed bytes should be sent to
-    // stdout, not any debugging messages.
-    const stdout_file = std.io.getStdOut().writer();
-    var bw = std.io.bufferedWriter(stdout_file);
-    const stdout = bw.writer();
-
-    try stdout.print("Run `zig build test` to run the tests.\n", .{});
-
-    try bw.flush(); // don't forget to flush!
-}
-
-test "simple test" {
-    var list = std.ArrayList(i32).init(std.testing.allocator);
-    defer list.deinit(); // try commenting this out and see if zig detects the memory leak!
-    try list.append(42);
-    try std.testing.expectEqual(@as(i32, 42), list.pop());
+    const allocator = std.heap.page_allocator;
+
+    var file = try std.fs.cwd().openFile("data/example.il", .{});
+    defer file.close();
+
+    // The tokens are slices into `content`, so the buffer is released only
+    // after the token list has been torn down (defers run in reverse order).
+    const content = try file.readToEndAlloc(allocator, 4096 * ((1 << 10) << 10));
+    defer allocator.free(content);
+
+    const toks = try tkz.tokenizeContent(content);
+    defer toks.deinit();
+
+    for (toks.items) |tok| {
+        std.debug.print("{}:{} `{s}`\n", .{ tok.line, tok.char, tok.value });
+    }
 }
diff --git a/src/tokenize.zig b/src/tokenize.zig
new file mode 100644
index 0000000..8749053
--- /dev/null
+++ b/src/tokenize.zig
@@ -0,0 +1,132 @@
+const std = @import("std");
+
+/// A lexeme together with where it was found in the source text.
+pub const Token = struct {
+    /// 0-based index of token start in whole file
+    start: usize,
+    /// 1-based line number token starts at
+    line: usize,
+    /// 1-based char number token starts at in line
+    char: usize,
+
+    /// Token text; a slice into the buffer given to `tokenizeContent`.
+    value: []const u8,
+};
+
+/// NOTE(review): declared but never returned; unknown `@` keywords are only
+/// reported on stderr for now. Confirm whether they should be rejected.
+const TokenizationError = error{InvalidKeyword};
+
+/// Operators that are two bytes long; every other operator byte stands alone.
+const two_byte_operators = [_][]const u8{ "->", "~>", "||", "&&", "/=", "*=", "+=", "-=", "'(" };
+
+/// Bytes that may continue an identifier or an `@` keyword.
+fn isIdentifierByte(byte: u8) bool {
+    return switch (byte) {
+        'a'...'z', 'A'...'Z', '0'...'9', '_' => true,
+        else => false,
+    };
+}
+
+/// Bytes that may continue a number literal.
+fn isNumberByte(byte: u8) bool {
+    return switch (byte) {
+        '0'...'9', '.', 'e' => true,
+        else => false,
+    };
+}
+
+/// Bytes that may continue a `;` comment, i.e. anything but a line break.
+fn isCommentByte(byte: u8) bool {
+    return byte != '\n';
+}
+
+/// Counts the leading bytes of `bytes` accepted by `accepts`, stopping at the
+/// end of the slice instead of reading past it.
+fn runLength(bytes: []const u8, comptime accepts: fn (u8) bool) usize {
+    var len: usize = 0;
+    while (len < bytes.len and accepts(bytes[len])) : (len += 1) {}
+    return len;
+}
+
+/// Length of the string literal whose opening quote is `text[0]`, closing quote
+/// included. A backslash escapes the byte after it, so `\"` does not close the
+/// literal while `\\` followed by `"` does. An unterminated literal runs to the
+/// end of `text`.
+fn stringLength(text: []const u8) usize {
+    var len: usize = 1;
+    while (len < text.len) {
+        switch (text[len]) {
+            '"' => return len + 1,
+            '\\' => len += 2,
+            else => len += 1,
+        }
+    }
+    return text.len;
+}
+
+/// Length (1 or 2) of the operator starting at `text[0]`.
+fn operatorLength(text: []const u8) usize {
+    for (two_byte_operators) |op| {
+        if (std.mem.startsWith(u8, text, op)) return 2;
+    }
+    return 1;
+}
+
+/// Splits `content` into tokens: `;` comments, string literals, identifiers,
+/// numbers, operators and `@` keywords. Line breaks and any other byte are
+/// skipped without producing a token.
+///
+/// Token values are slices into `content`, which must outlive the result. The
+/// list is backed by `std.heap.page_allocator`; the caller releases it with
+/// `deinit`. Fails only if growing the list fails.
+pub fn tokenizeContent(content: []const u8) !std.ArrayList(Token) {
+    var toks = std.ArrayList(Token).init(std.heap.page_allocator);
+    errdefer toks.deinit();
+
+    var line: usize = 1;
+    // Index of the first byte of the current line.
+    var line_start: usize = 0;
+
+    var index: usize = 0;
+    while (index < content.len) {
+        const rest = content[index..];
+        const len: usize = switch (rest[0]) {
+            '\n' => {
+                index += 1;
+                line += 1;
+                line_start = index;
+                continue;
+            },
+            ';' => 1 + runLength(rest[1..], isCommentByte),
+            '"' => stringLength(rest),
+            'a'...'z', 'A'...'Z', '_' => 1 + runLength(rest[1..], isIdentifierByte),
+            '0'...'9' => 1 + runLength(rest[1..], isNumberByte),
+            '+', '-', '~', '*', '/', '%', '<', '>', '=', '?', '|', '&', '(', ')', '\'' => operatorLength(rest),
+            '@' => blk: {
+                const keyword_len = 1 + runLength(rest[1..], isIdentifierByte);
+                const keyword = rest[0..keyword_len];
+                const known = std.mem.eql(u8, keyword, "@keys") or std.mem.eql(u8, keyword, "@rest");
+                if (!known) {
+                    // Unknown keywords are reported but still emitted as tokens.
+                    std.debug.print("line={d}, char={d}\n", .{ line, index - line_start + 1 });
+                }
+                break :blk keyword_len;
+            },
+            else => {
+                index += 1;
+                continue;
+            },
+        };
+
+        try toks.append(.{
+            .start = index,
+            .line = line,
+            .char = index - line_start + 1,
+            .value = rest[0..len],
+        });
+        index += len;
+    }
+
+    return toks;
+}