add tokenizer

This commit is contained in:
AcerecA 2024-08-10 15:18:09 +02:00
parent 78b6dfbcee
commit 6110b248d1
2 changed files with 109 additions and 18 deletions

View File

@@ -1,24 +1,18 @@

New contents of src/main.zig (replaces the default `zig init` scaffold — the
hello-world stderr/stdout prints, the buffered-writer example, and the
`test "simple test"` block were all removed):

const std = @import("std");
const tkz = @import("tokenize.zig");

pub fn main() !void {
    var file = try std.fs.cwd().openFile("data/example.il", .{});
    defer file.close();
    const content = try file.readToEndAlloc(std.heap.page_allocator, 4096 * ((1 << 10) << 10));

    const toks = try tkz.tokenizeContent(content);
    for (toks.items) |tok| {
        std.debug.print("{}:{} `{s}`\n", .{
            tok.line,
            tok.char,
            tok.value,
        });
    }
}

97
src/tokenize.zig Normal file
View File

@@ -0,0 +1,97 @@
const std = @import("std");
/// A single lexical token sliced out of the source buffer.
const Token = struct {
    /// 0-based byte index of token start in the whole file.
    start: usize,
    /// 1-based line number the token starts at.
    line: usize,
    /// 1-based char number the token starts at in its line.
    char: usize,
    /// Slice aliasing the original content buffer (not owned by the token).
    value: []const u8,
};

/// Returned when an `@`-keyword other than `@keys` / `@rest` is encountered.
const TokenizationError = error{InvalidKeyword};

/// Splits `content` into tokens: identifiers, numbers, string literals,
/// `;`-to-end-of-line comments (kept as tokens), one/two-char operators and
/// `@`-keywords. Bytes not matching any class (whitespace etc.) are skipped.
/// Token `value` slices alias `content`, so the buffer must outlive the
/// returned list. Caller owns the returned list and must `deinit()` it.
/// Returns `error.InvalidKeyword` for an `@`-word other than `@keys`/`@rest`.
pub fn tokenizeContent(content: []const u8) !std.ArrayList(Token) {
    var toks = std.ArrayList(Token).init(std.heap.page_allocator);
    errdefer toks.deinit();
    // Byte indices of every '\n' seen so far; used to derive line/char.
    var lines = std.ArrayList(usize).init(std.heap.page_allocator);
    defer lines.deinit(); // scratch data — must not leak past this call

    var index: usize = 0;
    while (index < content.len) {
        var l: usize = 1; // token length in bytes
        const char = content[index];
        switch (char) {
            '\n' => {
                try lines.append(index);
                index += l;
                continue;
            },
            // Comment: everything up to (not including) end of line / EOF.
            ';' => {
                while (index + l < content.len and content[index + l] != '\n') l += 1;
            },
            // String literal: runs to the next unescaped '"'.
            // NOTE(review): a `\\` escaped-backslash before the closing quote
            // still reads as an escape here — confirm against the language spec.
            '"' => {
                while (index + l < content.len and
                    (content[index + l] != '"' or content[index + l - 1] == '\\')) l += 1;
                if (index + l < content.len) l += 1; // include the closing quote
            },
            // Identifier: letter/underscore, then letters/digits/underscores.
            'a'...'z', 'A'...'Z', '_' => {
                while (index + l < content.len) : (l += 1) {
                    switch (content[index + l]) {
                        'a'...'z', 'A'...'Z', '0'...'9', '_' => {},
                        else => break,
                    }
                }
            },
            // Number: digits, '.' and 'e' (exponent) accepted after the first digit.
            '0'...'9' => {
                while (index + l < content.len) : (l += 1) {
                    switch (content[index + l]) {
                        '0'...'9', '.', 'e' => {},
                        else => break,
                    }
                }
            },
            // Operator: prefer a known two-char form, else the single char.
            '+', '-', '~', '*', '/', '%', '<', '>', '=', '?', '|', '&', '(', ')', '\'' => {
                if (index + 2 <= content.len) {
                    for ([_]*const [2]u8{ "->", "~>", "||", "&&", "/=", "*=", "+=", "-=", "'(" }) |op| {
                        if (std.mem.eql(u8, op, content[index .. index + 2])) {
                            l = 2;
                            break;
                        }
                    }
                }
            },
            // @-keyword: only `@keys` and `@rest` are valid.
            '@' => {
                while (index + l < content.len) : (l += 1) {
                    switch (content[index + l]) {
                        'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                        else => break,
                    }
                }
                const word = content[index .. index + l];
                if (!std.mem.eql(u8, "@keys", word) and !std.mem.eql(u8, "@rest", word)) {
                    std.debug.print("invalid keyword `{s}` at line={d}, char={d}\n", .{
                        word,
                        lines.items.len + 1,
                        charInLine(index, lines.items),
                    });
                    return TokenizationError.InvalidKeyword;
                }
            },
            // Anything else (whitespace, unknown bytes) produces no token.
            else => {
                index += l;
                continue;
            },
        }
        try toks.append(.{
            .start = index,
            .value = content[index .. index + l],
            .line = lines.items.len + 1,
            .char = charInLine(index, lines.items),
        });
        index += l;
    }
    return toks;
}

/// 1-based character position of byte `index` within its line, given the
/// byte indices of all newlines preceding it.
fn charInLine(index: usize, newlines: []const usize) usize {
    return switch (newlines.len) {
        // First line: convert the 0-based byte index to a 1-based column.
        0 => index + 1,
        // Later lines: distance from the previous '\n' is already 1-based.
        else => index - newlines[newlines.len - 1],
    };
}