WIP: first working version

It still has a minor memory leak and has at least two hacks implemented that I would like to improve on.
2025-11-29 12:19:07 +01:00
parent 1485385735
commit 8c4b8643af
6 changed files with 748 additions and 23 deletions
--- a/src/lexer.zig
+++ b/src/lexer.zig
@@ -0,0 +1,395 @@
+//! Lexer for the *unified diff* format to tokenize input sources accordingly.
+/// A single lexeme of a unified diff together with its position in the source.
+pub const Token = struct {
+    /// Kind of lexeme this token represents.
+    tag: Tag,
+    /// Byte range of the lexeme inside the tokenized buffer.
+    loc: Location,
+
+    /// Byte range `[idx, idx + len)` into the tokenized buffer.
+    pub const Location = struct {
+        /// Offset of the first byte of the lexeme.
+        idx: usize,
+        /// Number of bytes the lexeme spans.
+        len: usize,
+    };
+
+    pub const Tag = enum(u8) {
+        /// File information; contains the content of:
+        /// ```
+        /// --- a/xxx
+        /// +++ b/xxx
+        /// ```
+        /// *NOTE* includes trailing newline character
+        file,
+        /// Hunk header information; contains content of:
+        /// ```@@ -x,y +z,y @@```
+        header,
+        /// may be diff content or filler content of the tools output
+        content,
+        /// invalid contents that could not be parsed correctly
+        invalid,
+        /// End of file
+        eof,
+
+        /// Returns a sample of the text a token with this tag stands for, or
+        /// `null` for the tags that have no textual form (`.invalid`, `.eof`).
+        pub fn lexeme(tag: Tag) ?[]const u8 {
+            return switch (tag) {
+                .header => "@@ -x,y +z,y @@",
+                .content => "..",
+                // NOTE(review): this sample shows the `diff --git` line while the
+                // tag documentation describes the `---`/`+++` lines - confirm
+                // which one is intended.
+                .file => "diff --git a/xxx b/xxx",
+                // Listed explicitly to keep the switch exhaustive; `symbol`
+                // supplies the fallback names for these tags.
+                .invalid, .eof => null,
+            };
+        }
+
+        /// Returns a printable name for the tag: its lexeme if it has one,
+        /// otherwise a fixed label (`"EOF"` or `"invalid"`).
+        pub fn symbol(tag: Tag) []const u8 {
+            return tag.lexeme() orelse switch (tag) {
+                .eof => "EOF",
+                .invalid => "invalid",
+                // every other tag has a lexeme and never reaches this switch
+                else => unreachable,
+            };
+        }
+    };
+};
+
+/// Splits a unified diff into `Token`s, one token per call to `next`.
+pub const Tokenizer = struct {
+    /// Source text; the 0 sentinel at `buffer.len` marks the end of the input.
+    buffer: [:0]const u8,
+    /// Byte offset at which the next token starts.
+    index: usize,
+
+    /// For debugging purposes: prints the tag of `token` and the source text it covers.
+    pub fn dump(self: *const Tokenizer, token: *const Token) void {
+        print(".{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.idx .. token.loc.idx + token.loc.len] });
+    }
+
+    /// Creates a tokenizer positioned at the start of `buffer`.
+    pub fn init(buffer: [:0]const u8) Tokenizer {
+        return .{
+            .buffer = buffer,
+            // skip the UTF-8 BOM if present
+            .index = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
+        };
+    }
+
+    /// States of the scanner implemented in `next_token`.
+    const State = enum {
+        /// scanning content / deciding how a token starts
+        default,
+        /// the current token cannot be completed
+        invalid,
+        /// saw a single `@`
+        at_sign,
+        /// saw a single `-`
+        minus,
+        /// inside a hunk header, looking for the closing `@@`
+        header,
+        /// inside the `+++` line, looking for its newline
+        plus,
+        /// inside the `---` line, looking for the following `+++`
+        file,
+    };
+
+    /// Returns the next token and advances the tokenizer past it.
+    ///
+    /// At the end of the buffer a zero-length `.eof` token is returned; as it
+    /// does not move the position, every further call returns it again.
+    ///
+    /// TODO draw the state machine for all the possible tokenization states
+    /// -> derive test cases from it
+    /// -> detect valid and invalid syntax uses (or is this the job of the parser?)
+    ///
+    /// TODO points to improve on:
+    /// -> reduce duplicated code sections
+    /// -> make tags more explicit (i.e. remove unnecessary newlines, whitespaces, etc.)
+    /// -> streamline catching the common cases for tokens
+    /// -> reduce state machine
+    /// -> do not group tokens, instead this should be done by the parser when deriving the ast from the token stream
+    ///    then the parser can identify missing parts and even point to the corresponding location in the file!
+    pub fn next(this: *Tokenizer) Token {
+        const token = this.next_token();
+        this.index = token.loc.idx + token.loc.len;
+        return token;
+    }
+
+    /// Scans one token starting at `this.index` without modifying the tokenizer.
+    fn next_token(this: *const Tokenizer) Token {
+        var index = this.index;
+        var result: Token = .{
+            .tag = undefined,
+            .loc = .{
+                .idx = this.index,
+                .len = undefined,
+            },
+        };
+        state: switch (State.default) {
+            .default => switch (this.buffer[index]) {
+                // A 0 byte is the end of input only at `buffer.len`; anywhere
+                // else it is an embedded NUL and therefore invalid.
+                0 => if (index == this.buffer.len) {
+                    if (result.loc.idx != index) {
+                        // pending content runs up to the end of the input
+                        result.tag = .content;
+                    } else {
+                        return .{
+                            .tag = .eof,
+                            .loc = .{
+                                .idx = index,
+                                .len = 0,
+                            },
+                        };
+                    }
+                } else {
+                    continue :state .invalid;
+                },
+                '@' => continue :state .at_sign,
+                '-' => continue :state .minus,
+                else => {
+                    index += 1;
+                    continue :state .default;
+                },
+            },
+            .invalid => {
+                // Consume the offending byte unless it is the terminating
+                // sentinel. This includes an embedded NUL byte: leaving it
+                // unconsumed would produce a zero-length token and `next`
+                // could never advance past it.
+                if (index != this.buffer.len) index += 1;
+                result.tag = .invalid;
+            },
+            .at_sign => {
+                index += 1;
+                switch (this.buffer[index]) {
+                    // `@@` starts a header; if content was collected before it,
+                    // emit that content first and leave the `@@` for the next call.
+                    '@' => if (result.loc.idx != index - 1) {
+                        index -= 1;
+                        result.tag = .content;
+                    } else continue :state .header,
+                    else => continue :state .default,
+                }
+            },
+            .header => {
+                index += 1;
+                switch (this.buffer[index]) {
+                    // the header ends with the closing `@@` (included in the token)
+                    '@' => if (this.buffer[index + 1] == '@') {
+                        result.tag = .header;
+                        index += 2;
+                    } else continue :state .invalid,
+                    0 => continue :state .invalid,
+                    else => continue :state .header,
+                }
+            },
+            .minus => {
+                index += 1;
+                switch (this.buffer[index]) {
+                    // `---` starts the file information; if content was
+                    // collected before it, emit that content first and leave
+                    // the `---` for the next call.
+                    '-' => if (this.buffer[index + 1] == '-') {
+                        if (result.loc.idx != index - 1) {
+                            index -= 1;
+                            result.tag = .content;
+                        } else {
+                            index += 1;
+                            continue :state .file;
+                        }
+                    } else continue :state .default,
+                    // Anything else - including the end of the input - is
+                    // handled by `.default`, so a trailing `-` is content just
+                    // like a trailing `@`.
+                    else => continue :state .default,
+                }
+            },
+            .file => {
+                index += 1;
+                switch (this.buffer[index]) {
+                    // skip everything up to the `+++` that introduces the new file name
+                    '+' => if (this.buffer[index + 1] == '+' and this.buffer[index + 2] == '+') {
+                        index += 2;
+                        continue :state .plus;
+                    } else continue :state .file,
+                    0 => continue :state .invalid,
+                    else => continue :state .file,
+                }
+            },
+            .plus => {
+                index += 1;
+                switch (this.buffer[index]) {
+                    '\n' => {
+                        index += 1; // include newline
+                        result.tag = .file;
+                    },
+                    0 => continue :state .invalid,
+                    else => continue :state .plus,
+                }
+            },
+        }
+
+        result.loc.len = index - result.loc.idx;
+        return result;
+    }
+};
+
+const std = @import("std");
+const mem = std.mem;
+const debug = std.debug;
+const testing = std.testing;
+const assert = debug.assert;
+const print = debug.print;
+
+test "individual change" {
+    // One file with one hunk: the leading `diff`/`index` lines are content,
+    // followed by the file information, the hunk header and the hunk body.
+    // The terminating `.eof` is verified by `testTokenize` itself and is
+    // therefore not part of the expected tags.
+    try testTokenize(
+        \\diff --git a/build.zig.zon b/build.zig.zon
+        \\index 99bede4..a039487 100644
+        \\--- a/build.zig.zon
+        \\+++ b/build.zig.zon
+        \\@@ -3,8 +3,8 @@
+        \\     .version = "0.0.1",
+        \\     .dependencies = .{
+        \\         .zterm = .{
+        \\-            .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#855594a8c836723f0230bfd6ad24f47613a147b1",
+        \\-            .hash = "zterm-0.3.0-1xmmEM8eHAB0cA7KLXGC7C8Nt7YEJcyoTme4domF1Yty",
+        \\+            .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#e972a2ea0f7a9f8caffd439ef206474b46475f91",
+        \\+            .hash = "zterm-0.3.0-1xmmENkhHAB2rmNJFH-9rRqiRLnT673xwuMrqLwOnlT_",
+        \\         },
+        \\     },
+        \\     .minimum_zig_version = "0.16.0-dev.1254+bf15c791f",
+    , &.{ .content, .file, .header, .content });
+}
+
+test "individual changes in the different files" {
+    // Two files with one hunk each. The `diff`/`index` lines of the second
+    // file belong to the content token that follows the first hunk header.
+    // The terminating `.eof` is verified by `testTokenize` itself and is
+    // therefore not part of the expected tags.
+    try testTokenize(
+        \\diff --git a/build.zig.zon b/build.zig.zon
+        \\index 99bede4..a039487 100644
+        \\--- a/build.zig.zon
+        \\+++ b/build.zig.zon
+        \\@@ -3,8 +3,8 @@
+        \\     .version = "0.0.1",
+        \\     .dependencies = .{
+        \\         .zterm = .{
+        \\-            .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#855594a8c836723f0230bfd6ad24f47613a147b1",
+        \\-            .hash = "zterm-0.3.0-1xmmEM8eHAB0cA7KLXGC7C8Nt7YEJcyoTme4domF1Yty",
+        \\+            .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#e972a2ea0f7a9f8caffd439ef206474b46475f91",
+        \\+            .hash = "zterm-0.3.0-1xmmENkhHAB2rmNJFH-9rRqiRLnT673xwuMrqLwOnlT_",
+        \\         },
+        \\     },
+        \\     .minimum_zig_version = "0.16.0-dev.1254+bf15c791f",
+        \\diff --git a/src/model.zig b/src/model.zig
+        \\index b402c51..defd874 100644
+        \\--- a/src/model.zig
+        \\+++ b/src/model.zig
+        \\@@ -30,3 +30,9 @@ pub const Change = struct {
+        \\
+        \\ const Model = @This();
+        \\ const std = @import("std");
+        \\+const lexer = @import("lexer.zig");
+        \\+
+        \\+test {
+        \\+    std.testing.refAllDeclsRecursive(@This());
+        \\+    _ = @import("lexer.zig");
+        \\+}
+    , &.{ .content, .file, .header, .content, .file, .header, .content });
+}
+
+test "multiple changes in same file" {
+    // One file with seven hunks: after the leading content and the file
+    // information every hunk yields a header token followed by a content token.
+    // The terminating `.eof` is verified by `testTokenize` itself and is
+    // therefore not part of the expected tags.
+    try testTokenize(
+        \\diff --git a/src/queue.zig b/src/queue.zig
+        \\index aae7ddf..2591b0a 100644
+        \\--- a/src/queue.zig
+        \\+++ b/src/queue.zig
+        \\@@ -215,7 +215,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
+        \\     // still full and the push in the other thread is still blocked
+        \\     // waiting for space.
+        \\     try Thread.yield();
+        \\-    std.Thread.sleep(std.time.ns_per_s);
+        \\+    // std.Thread.sleep(std.time.ns_per_s);
+        \\     // Finally, let that other thread go.
+        \\     try testing.expectEqual(1, q.pop());
+        \\
+        \\@@ -225,7 +225,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
+        \\         try Thread.yield();
+        \\     // But we want to ensure that there's a second push waiting, so
+        \\     // here's another sleep.
+        \\-    std.Thread.sleep(std.time.ns_per_s / 2);
+        \\+    // std.Thread.sleep(std.time.ns_per_s / 2);
+        \\
+        \\     // Another spurious wake...
+        \\     q.not_full.signal();
+        \\@@ -233,7 +233,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
+        \\     // And another chance for the other thread to see that it's
+        \\     // spurious and go back to sleep.
+        \\     try Thread.yield();
+        \\-    std.Thread.sleep(std.time.ns_per_s / 2);
+        \\+    // std.Thread.sleep(std.time.ns_per_s / 2);
+        \\
+        \\     // Pop that thing and we're done.
+        \\     try testing.expectEqual(2, q.pop());
+        \\@@ -250,13 +250,13 @@ test "Fill, block, fill, block" {
+        \\     const thread = try Thread.spawn(cfg, sleepyPop, .{&queue});
+        \\     queue.push(1);
+        \\     queue.push(2);
+        \\-    const now = std.time.milliTimestamp();
+        \\+    // const now = std.time.milliTimestamp();
+        \\     queue.push(3); // This one should block.
+        \\-    const then = std.time.milliTimestamp();
+        \\+    // const then = std.time.milliTimestamp();
+        \\
+        \\     // Just to make sure the sleeps are yielding to this thread, make
+        \\     // sure it took at least 900ms to do the push.
+        \\-    try testing.expect(then - now > 900);
+        \\+    // try testing.expect(then - now > 900);
+        \\
+        \\     // This should block again, waiting for the other thread.
+        \\     queue.push(4);
+        \\@@ -270,14 +270,14 @@ test "Fill, block, fill, block" {
+        \\ fn sleepyPush(q: *Queue(u8, 1)) !void {
+        \\     // Try to ensure the other thread has already started trying to pop.
+        \\     try Thread.yield();
+        \\-    std.Thread.sleep(std.time.ns_per_s / 2);
+        \\+    // std.Thread.sleep(std.time.ns_per_s / 2);
+        \\
+        \\     // Spurious wake
+        \\     q.not_full.signal();
+        \\     q.not_empty.signal();
+        \\
+        \\     try Thread.yield();
+        \\-    std.Thread.sleep(std.time.ns_per_s / 2);
+        \\+    // std.Thread.sleep(std.time.ns_per_s / 2);
+        \\
+        \\     // Stick something in the queue so it can be popped.
+        \\     q.push(1);
+        \\@@ -286,7 +286,7 @@ fn sleepyPush(q: *Queue(u8, 1)) !void {
+        \\         try Thread.yield();
+        \\     // Give the other thread time to block again.
+        \\     try Thread.yield();
+        \\-    std.Thread.sleep(std.time.ns_per_s / 2);
+        \\+    // std.Thread.sleep(std.time.ns_per_s / 2);
+        \\
+        \\     // Spurious wake
+        \\     q.not_full.signal();
+        \\@@ -317,7 +317,7 @@ test "2 readers" {
+        \\     const t1 = try Thread.spawn(cfg, readerThread, .{&queue});
+        \\     const t2 = try Thread.spawn(cfg, readerThread, .{&queue});
+        \\     try Thread.yield();
+        \\-    std.Thread.sleep(std.time.ns_per_s / 2);
+        \\+    // std.Thread.sleep(std.time.ns_per_s / 2);
+        \\     queue.push(1);
+        \\     queue.push(1);
+        \\     t1.join();
+        \\    );
+    , &.{
+        .content,
+        .file,
+        .header,
+        .content,
+        .header,
+        .content,
+        .header,
+        .content,
+        .header,
+        .content,
+        .header,
+        .content,
+        .header,
+        .content,
+        .header,
+        .content,
+    });
+}
+
+/// Tokenizes `source` and checks that the produced tags equal
+/// `expected_token_tags`, in order. The terminating `.eof` tag shall be
+/// omitted from `expected_token_tags`: it is verified separately here,
+/// together with its location (`source.len`) and its length (zero).
+/// On a tag mismatch the offending token is dumped before the error is returned.
+fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
+    var scanner = Tokenizer.init(source);
+
+    for (expected_token_tags, 0..) |want, position| {
+        const got = scanner.next();
+        testing.expectEqual(want, got.tag) catch |mismatch| {
+            print("Got token: ", .{});
+            scanner.dump(&got);
+            print("Expected .{s} at index {d}\n", .{ @tagName(want), position });
+            return mismatch;
+        };
+    }
+
+    // whatever follows the expected tags has to be the end of the input
+    const eof_tag: Token.Tag = .eof;
+    const final = scanner.next();
+    testing.expectEqual(eof_tag, final.tag) catch |mismatch| {
+        print("Got token: ", .{});
+        scanner.dump(&final);
+        print("Expected .{s}\n", .{@tagName(eof_tag)});
+        return mismatch;
+    };
+
+    const final_loc = final.loc;
+    try testing.expectEqual(source.len, final_loc.idx);
+    try testing.expectEqual(0, final_loc.len);
+}