//! Lexer for the *unified diff* format: tokenizes input sources accordingly.

pub const Token = struct {
    tag: Tag,
    loc: Location,

    pub const Location = struct {
        idx: usize,
        len: usize,
    };

    pub const Tag = enum(u8) {
        /// File information; contains the content of:
        /// ```
        /// --- a/xxx
        /// +++ b/xxx
        /// ```
        /// *NOTE*: includes the trailing newline character
        file,
        /// Hunk header information; contains the content of:
        /// ```@@ -x,y +z,y @@```
        header,
        /// May be diff content or filler content of the tool's output
        content,
        /// Invalid contents that could not be parsed correctly
        invalid,
        /// End of file
        eof,

        pub fn lexeme(tag: Tag) ?[]const u8 {
            return switch (tag) {
                .header => "@@ -x,y +z,y @@",
                .content => "..",
                .file => "diff --git a/xxx b/xxx",
                // these tags have no representative lexeme
                .invalid, .eof => null,
            };
        }

        pub fn symbol(tag: Tag) []const u8 {
            return tag.lexeme() orelse switch (tag) {
                .eof => "EOF",
                .invalid => "invalid",
                else => unreachable,
            };
        }
    };
};
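// A hedged usage sketch for `Tag.lexeme`/`Tag.symbol`: `symbol` falls back to
// a fixed name for tags that have no representative lexeme. The expected
// strings below simply mirror the tables defined above.
test "Tag.symbol falls back for tags without a lexeme" {
    try testing.expectEqualStrings("@@ -x,y +z,y @@", Token.Tag.header.symbol());
    try testing.expectEqualStrings("EOF", Token.Tag.eof.symbol());
    try testing.expectEqualStrings("invalid", Token.Tag.invalid.symbol());
}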
pub const Tokenizer = struct {
    buffer: [:0]const u8,
    index: usize,

    /// For debugging purposes
    pub fn dump(self: *const Tokenizer, token: *const Token) void {
        print(".{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.idx .. token.loc.idx + token.loc.len] });
    }

    pub fn init(buffer: [:0]const u8) Tokenizer {
        return .{
            .buffer = buffer,
            // skip the UTF-8 BOM if present
            .index = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
    }

    const State = enum {
        default,
        invalid,
        at_sign,
        minus,
        header,
        plus,
        file,
    };

    /// State machine (FSM) describing the syntax of `nf`.
    /// TODO: draw a diagram of all the possible states for tokenization!
    /// -> for that, create test cases!
    /// -> detect valid and invalid syntax uses! this is, however, the job of the parser?
    ///
    /// TODO points to improve on:
    /// -> reduce duplicated code sections
    /// -> make tags more explicit (i.e. remove unnecessary newlines, whitespace, etc.)
    /// -> streamline catching the common cases for tokens
    /// -> reduce the state machine
    /// -> do not group tokens; instead this should be done by the parser when deriving the AST from the token stream,
    ///    so the parser can identify missing parts and even point to the corresponding location in the file!
    pub fn next(this: *Tokenizer) Token {
        const token = this.next_token();
        this.index = token.loc.idx + token.loc.len;
        return token;
    }

    fn next_token(this: *const Tokenizer) Token {
        var index = this.index;
        var result: Token = .{
            .tag = undefined,
            .loc = .{
                .idx = this.index,
                .len = undefined,
            },
        };
        state: switch (State.default) {
            .default => switch (this.buffer[index]) {
                0 => if (index == this.buffer.len) {
                    if (result.loc.idx != index) {
                        result.tag = .content;
                    } else {
                        return .{
                            .tag = .eof,
                            .loc = .{
                                .idx = index,
                                .len = 0,
                            },
                        };
                    }
                } else {
                    // embedded NUL byte before the end of the buffer
                    continue :state .invalid;
                },
                '@' => continue :state .at_sign,
                '-' => continue :state .minus,
                else => {
                    index += 1;
                    continue :state .default;
                },
            },
            .invalid => {
                switch (this.buffer[index]) {
                    0 => result.tag = .invalid,
                    else => {
                        index += 1;
                        result.tag = .invalid;
                    },
                }
            },
            .at_sign => {
                index += 1;
                switch (this.buffer[index]) {
                    '@' => if (result.loc.idx != index - 1) {
                        // emit the pending content first; the next call
                        // restarts at the `@@`
                        index -= 1;
                        result.tag = .content;
                    } else continue :state .header,
                    else => continue :state .default,
                }
            },
            .header => {
                index += 1;
                switch (this.buffer[index]) {
                    '@' => if (this.buffer[index + 1] == '@') {
                        result.tag = .header;
                        index += 2;
                    } else continue :state .invalid,
                    0 => continue :state .invalid,
                    else => continue :state .header,
                }
            },
            .minus => {
                index += 1;
                switch (this.buffer[index]) {
                    // assuming that we start with a minus!
                    '-' => if (this.buffer[index + 1] == '-') {
                        if (result.loc.idx != index - 1) {
                            // emit the pending content first; the next call
                            // restarts at the `---`
                            index -= 1;
                            result.tag = .content;
                        } else {
                            index += 1;
                            continue :state .file;
                        }
                    } else continue :state .default,
                    0 => continue :state .invalid,
                    else => continue :state .default,
                }
            },
            .file => {
                // std.log.err(".file: {s}", .{this.buffer[index - 2 .. @min(index + 3, this.buffer.len)]});
                index += 1;
                switch (this.buffer[index]) {
                    '+' => if (this.buffer[index + 1] == '+' and this.buffer[index + 2] == '+') {
                        index += 2;
                        continue :state .plus;
                    } else continue :state .file,
                    0 => continue :state .invalid,
                    else => continue :state .file,
                }
            },
            .plus => {
                // std.log.err(".plus", .{});
                index += 1;
                switch (this.buffer[index]) {
                    '\n' => {
                        index += 1; // include newline
                        result.tag = .file;
                    },
                    0 => continue :state .invalid,
                    else => continue :state .plus,
                }
            },
        }
        result.loc.len = index - result.loc.idx;
        return result;
    }
};

const std = @import("std");
const mem = std.mem;
const debug = std.debug;
const testing = std.testing;
const assert = debug.assert;
const print = debug.print;

test "individual change" {
    try testTokenize(
        \\diff --git a/build.zig.zon b/build.zig.zon
        \\index 99bede4..a039487 100644
        \\--- a/build.zig.zon
        \\+++ b/build.zig.zon
        \\@@ -3,8 +3,8 @@
        \\ .version = "0.0.1",
        \\ .dependencies = .{
        \\ .zterm = .{
        \\- .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#855594a8c836723f0230bfd6ad24f47613a147b1",
        \\- .hash = "zterm-0.3.0-1xmmEM8eHAB0cA7KLXGC7C8Nt7YEJcyoTme4domF1Yty",
        \\+ .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#e972a2ea0f7a9f8caffd439ef206474b46475f91",
        \\+ .hash = "zterm-0.3.0-1xmmENkhHAB2rmNJFH-9rRqiRLnT673xwuMrqLwOnlT_",
        \\ },
        \\ },
        \\ .minimum_zig_version = "0.16.0-dev.1254+bf15c791f",
    , &.{ .content, .file, .header, .content, .eof });
}

test "individual changes in the different files" {
    try testTokenize(
        \\diff --git a/build.zig.zon b/build.zig.zon
        \\index 99bede4..a039487 100644
        \\--- a/build.zig.zon
        \\+++ b/build.zig.zon
        \\@@ -3,8 +3,8 @@
        \\ .version = "0.0.1",
        \\ .dependencies = .{
        \\ .zterm = .{
        \\- .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#855594a8c836723f0230bfd6ad24f47613a147b1",
        \\- .hash = "zterm-0.3.0-1xmmEM8eHAB0cA7KLXGC7C8Nt7YEJcyoTme4domF1Yty",
        \\+ .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#e972a2ea0f7a9f8caffd439ef206474b46475f91",
        \\+ .hash = "zterm-0.3.0-1xmmENkhHAB2rmNJFH-9rRqiRLnT673xwuMrqLwOnlT_",
        \\ },
        \\ },
        \\ .minimum_zig_version = "0.16.0-dev.1254+bf15c791f",
        \\diff --git a/src/model.zig b/src/model.zig
        \\index b402c51..defd874 100644
        \\--- a/src/model.zig
        \\+++ b/src/model.zig
        \\@@ -30,3 +30,9 @@ pub const Change = struct {
        \\
        \\ const Model = @This();
        \\ const std = @import("std");
        \\+const lexer = @import("lexer.zig");
        \\+
        \\+test {
        \\+ std.testing.refAllDeclsRecursive(@This());
        \\+ _ = @import("lexer.zig");
        \\+}
    , &.{ .content, .file, .header, .content, .file, .header, .content, .eof });
}
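// A minimal sketch of the `.file` token shape, using the made-up file name
// `x`: the token spans the `---`/`+++` pair and, per the `Tag.file` doc
// comment, includes the trailing newline.
test "file token spans both marker lines" {
    const source = "--- a/x\n+++ b/x\n";
    try testTokenize(source, &.{ .file, .eof });
}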
test "multiple changes in same file" {
    try testTokenize(
        \\diff --git a/src/queue.zig b/src/queue.zig
        \\index aae7ddf..2591b0a 100644
        \\--- a/src/queue.zig
        \\+++ b/src/queue.zig
        \\@@ -215,7 +215,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
        \\ // still full and the push in the other thread is still blocked
        \\ // waiting for space.
        \\ try Thread.yield();
        \\- std.Thread.sleep(std.time.ns_per_s);
        \\+ // std.Thread.sleep(std.time.ns_per_s);
        \\ // Finally, let that other thread go.
        \\ try testing.expectEqual(1, q.pop());
        \\
        \\@@ -225,7 +225,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
        \\ try Thread.yield();
        \\ // But we want to ensure that there's a second push waiting, so
        \\ // here's another sleep.
        \\- std.Thread.sleep(std.time.ns_per_s / 2);
        \\+ // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\ // Another spurious wake...
        \\ q.not_full.signal();
        \\@@ -233,7 +233,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
        \\ // And another chance for the other thread to see that it's
        \\ // spurious and go back to sleep.
        \\ try Thread.yield();
        \\- std.Thread.sleep(std.time.ns_per_s / 2);
        \\+ // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\ // Pop that thing and we're done.
        \\ try testing.expectEqual(2, q.pop());
        \\@@ -250,13 +250,13 @@ test "Fill, block, fill, block" {
        \\ const thread = try Thread.spawn(cfg, sleepyPop, .{&queue});
        \\ queue.push(1);
        \\ queue.push(2);
        \\- const now = std.time.milliTimestamp();
        \\+ // const now = std.time.milliTimestamp();
        \\ queue.push(3); // This one should block.
        \\- const then = std.time.milliTimestamp();
        \\+ // const then = std.time.milliTimestamp();
        \\
        \\ // Just to make sure the sleeps are yielding to this thread, make
        \\ // sure it took at least 900ms to do the push.
        \\- try testing.expect(then - now > 900);
        \\+ // try testing.expect(then - now > 900);
        \\
        \\ // This should block again, waiting for the other thread.
        \\ queue.push(4);
        \\@@ -270,14 +270,14 @@ test "Fill, block, fill, block" {
        \\ fn sleepyPush(q: *Queue(u8, 1)) !void {
        \\ // Try to ensure the other thread has already started trying to pop.
        \\ try Thread.yield();
        \\- std.Thread.sleep(std.time.ns_per_s / 2);
        \\+ // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\ // Spurious wake
        \\ q.not_full.signal();
        \\ q.not_empty.signal();
        \\
        \\ try Thread.yield();
        \\- std.Thread.sleep(std.time.ns_per_s / 2);
        \\+ // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\ // Stick something in the queue so it can be popped.
        \\ q.push(1);
        \\@@ -286,7 +286,7 @@ fn sleepyPush(q: *Queue(u8, 1)) !void {
        \\ try Thread.yield();
        \\ // Give the other thread time to block again.
        \\ try Thread.yield();
        \\- std.Thread.sleep(std.time.ns_per_s / 2);
        \\+ // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\ // Spurious wake
        \\ q.not_full.signal();
        \\@@ -317,7 +317,7 @@ test "2 readers" {
        \\ const t1 = try Thread.spawn(cfg, readerThread, .{&queue});
        \\ const t2 = try Thread.spawn(cfg, readerThread, .{&queue});
        \\ try Thread.yield();
        \\- std.Thread.sleep(std.time.ns_per_s / 2);
        \\+ // std.Thread.sleep(std.time.ns_per_s / 2);
        \\ queue.push(1);
        \\ queue.push(1);
        \\ t1.join();
        \\ );
    , &.{
        .content, .file,    .header,  .content, .header, .content,
        .header,  .content, .header,  .content, .header, .content,
        .header,  .content, .header,  .content, .eof,
    });
}
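// A hedged sketch of the current error behavior: an unterminated
// `@@ -x,y +z,y @@` header runs into the end of the input and is reported as a
// single `.invalid` token rather than as content. The hunk range here is a
// made-up minimal example.
test "unterminated hunk header is invalid" {
    try testTokenize("@@ -1,1 +1,1", &.{ .invalid, .eof });
}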
/// Tests the tokenizer's iterator output for the provided source against the
/// expected token tags. After `expected_token_tags` is exhausted, this
/// function explicitly checks for the final `.eof` tag (with its corresponding
/// location information); since the tokenizer keeps returning `.eof` once the
/// input is consumed, a trailing `.eof` in `expected_token_tags` is allowed
/// but may be omitted.
fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    var tokenizer = Tokenizer.init(source);
    for (0.., expected_token_tags) |i, expected| {
        const token = tokenizer.next();
        testing.expectEqual(expected, token.tag) catch |err| {
            print("Got token: ", .{});
            tokenizer.dump(&token);
            print("Expected .{s} at index {d}\n", .{ @tagName(expected), i });
            return err;
        };
    }
    const last_token = tokenizer.next();
    testing.expectEqual(Token.Tag.eof, last_token.tag) catch |err| {
        print("Got token: ", .{});
        tokenizer.dump(&last_token);
        print("Expected .{s}\n", .{@tagName(Token.Tag.eof)});
        return err;
    };
    try testing.expectEqual(source.len, last_token.loc.idx);
    try testing.expectEqual(0, last_token.loc.len);
}
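// A property sketch rather than a definitive spec: `next` always resumes at
// `loc.idx + loc.len` of the previous token, so the tokens up to `.eof` should
// cover the input contiguously and without gaps. The sample input is a
// made-up minimal diff.
test "tokens cover the input contiguously" {
    const source = "text\n--- a/x\n+++ b/x\n@@ -1,1 +1,1 @@\n body\n";
    var tokenizer = Tokenizer.init(source);
    var covered: usize = 0;
    while (true) {
        const token = tokenizer.next();
        if (token.tag == .eof) break;
        try testing.expectEqual(covered, token.loc.idx);
        covered += token.loc.len;
    }
    try testing.expectEqual(source.len, covered);
}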