tui-diff/src/lexer.zig

//! Lexer for the *unified diff* format; tokenizes input sources accordingly.
/// A single lexical unit of a unified diff: its kind (`tag`) together with
/// the byte range (`loc`) it covers inside the tokenized buffer.
pub const Token = struct {
    tag: Tag,
    loc: Location,

    /// Byte range of a token inside the source buffer.
    pub const Location = struct {
        /// offset of the first byte of the token
        idx: usize,
        /// number of bytes covered by the token
        len: usize,
    };

    pub const Tag = enum(u8) {
        /// File information; contains the content of:
        /// ```
        /// --- a/xxx
        /// +++ b/xxx
        /// ```
        /// *NOTE* includes trailing newline character
        file,
        /// Hunk header information; contains content of:
        /// ```@@ -x,y +z,y @@```
        header,
        /// may be diff content or filler content of the tools output
        content,
        /// invalid contents that could not be parsed correctly
        invalid,
        /// End of file
        eof,

        /// Representative source text for `tag`, or `null` for tags that
        /// have no textual representation (`.invalid` and `.eof`).
        pub fn lexeme(tag: Tag) ?[]const u8 {
            return switch (tag) {
                .header => "@@ -x,y +z,y @@",
                .content => "..",
                .file => "diff --git a/xxx b/xxx",
                // no textual form; `symbol` provides a name for these
                .invalid, .eof => null,
            };
        }

        /// Human readable name for `tag`: its lexeme when it has one,
        /// otherwise a fixed description.
        pub fn symbol(tag: Tag) []const u8 {
            return tag.lexeme() orelse switch (tag) {
                .eof => "EOF",
                .invalid => "invalid",
                // every other tag has a lexeme and never reaches this switch
                else => unreachable,
            };
        }
    };
};

/// Splits a sentinel-terminated buffer holding a unified diff into `Token`s.
pub const Tokenizer = struct {
    /// complete input; the 0 sentinel at `buffer.len` marks the end of input
    buffer: [:0]const u8,
    /// offset of the next byte that has not been tokenized yet
    index: usize,

    /// For debugging purposes
    pub fn dump(self: *const Tokenizer, token: *const Token) void {
        print(".{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.idx .. token.loc.idx + token.loc.len] });
    }

    /// Create a tokenizer positioned at the start of `buffer`.
    pub fn init(buffer: [:0]const u8) Tokenizer {
        return .{
            .buffer = buffer,
            // skip the UTF-8 BOM if present
            .index = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
    }

    /// States of the tokenizer's state machine.
    const State = enum {
        /// scanning plain content, looking for the start of a marker
        default,
        /// malformed input was detected
        invalid,
        /// a single `@` was seen
        at_sign,
        /// a single `-` was seen
        minus,
        /// inside a hunk header, looking for the closing `@@`
        header,
        /// after the `+++` marker, looking for the end of the line
        plus,
        /// after the `---` marker, looking for the `+++` marker
        file,
    };

    /// Return the next token and advance the tokenizer past it. Once the end
    /// of the buffer is reached every further call returns `.eof` again.
    ///
    /// state fsm (finite state machine) describing the syntax of the
    /// *unified diff* format
    /// TODO I need to draw one for all the possible states for tokenization!
    /// -> for that I can create test cases!
    /// -> detect valid and invalid syntax uses! this is however the job of the parser?
    ///
    /// TODO points to improve on:
    /// -> reduce duplicated code sections
    /// -> make tags more explicit (i.e. remove unnecessary newlines, whitespaces, etc.)
    /// -> streamline catching the common cases for tokens
    /// -> reduce state machine
    /// -> do not group tokens, instead this should be done by the parser when deriving the ast from the token stream
    ///    then the parser can identify missing parts and even point to the corresponding location in the file!
    pub fn next(this: *Tokenizer) Token {
        const token = this.next_token();
        this.index = token.loc.idx + token.loc.len;
        return token;
    }

    /// Scan one token starting at `this.index` without modifying the
    /// tokenizer. Every token except `.eof` covers at least one byte, so that
    /// repeatedly calling `next` always makes progress.
    fn next_token(this: *const Tokenizer) Token {
        var index = this.index;
        var result: Token = .{
            .tag = undefined,
            .loc = .{
                .idx = this.index,
                .len = undefined,
            },
        };
        state: switch (State.default) {
            .default => switch (this.buffer[index]) {
                0 => if (index == this.buffer.len) {
                    // reached the sentinel: flush pending content first, the
                    // `.eof` token is reported by the following call
                    if (result.loc.idx != index) {
                        result.tag = .content;
                    } else {
                        return .{
                            .tag = .eof,
                            .loc = .{
                                .idx = index,
                                .len = 0,
                            },
                        };
                    }
                } else {
                    // embedded NUL byte before the end of the buffer
                    continue :state .invalid;
                },
                '@' => continue :state .at_sign,
                '-' => continue :state .minus,
                else => {
                    index += 1;
                    continue :state .default;
                },
            },
            .invalid => {
                // Consume the offending byte unless it is the terminating
                // sentinel. This also covers an embedded NUL byte, which
                // would otherwise result in a zero-length `.invalid` token
                // and leave `next` stuck at the same position forever.
                if (index < this.buffer.len) index += 1;
                result.tag = .invalid;
            },
            .at_sign => {
                index += 1;
                switch (this.buffer[index]) {
                    '@' => if (result.loc.idx != index - 1) {
                        // `@@` found after pending content: end the content
                        // token right before the marker
                        index -= 1;
                        result.tag = .content;
                    } else continue :state .header,
                    else => continue :state .default,
                }
            },
            .header => {
                index += 1;
                switch (this.buffer[index]) {
                    '@' => if (this.buffer[index + 1] == '@') {
                        result.tag = .header;
                        index += 2; // include the closing `@@`
                    } else continue :state .invalid,
                    0 => continue :state .invalid,
                    else => continue :state .header,
                }
            },
            .minus => {
                index += 1;
                switch (this.buffer[index]) {
                    // assuming that we start with a minus!
                    '-' => if (this.buffer[index + 1] == '-') {
                        if (result.loc.idx != index - 1) {
                            // `---` found after pending content: end the
                            // content token right before the marker
                            index -= 1;
                            result.tag = .content;
                        } else {
                            index += 1;
                            continue :state .file;
                        }
                    } else continue :state .default,
                    0 => continue :state .invalid,
                    else => continue :state .default,
                }
            },
            .file => {
                index += 1;
                switch (this.buffer[index]) {
                    // `and` short-circuits, hence `index + 2` is only read
                    // if the byte before it is not the sentinel
                    '+' => if (this.buffer[index + 1] == '+' and this.buffer[index + 2] == '+') {
                        index += 2;
                        continue :state .plus;
                    } else continue :state .file,
                    0 => continue :state .invalid,
                    else => continue :state .file,
                }
            },
            .plus => {
                index += 1;
                switch (this.buffer[index]) {
                    '\n' => {
                        index += 1; // include newline
                        result.tag = .file;
                    },
                    0 => continue :state .invalid,
                    else => continue :state .plus,
                }
            },
        }

        result.loc.len = index - result.loc.idx;
        return result;
    }
};

const std = @import("std");
const mem = std.mem;
const debug = std.debug;
const testing = std.testing;
const assert = debug.assert;
const print = debug.print;

test "individual change" {
    // A single file with a single hunk: the `diff`/`index` preamble is
    // content, followed by the file marker, the hunk header and the hunk body.
    // The terminating `.eof` is not listed, `testTokenize` checks it itself.
    try testTokenize(
        \\diff --git a/build.zig.zon b/build.zig.zon
        \\index 99bede4..a039487 100644
        \\--- a/build.zig.zon
        \\+++ b/build.zig.zon
        \\@@ -3,8 +3,8 @@
        \\     .version = "0.0.1",
        \\     .dependencies = .{
        \\         .zterm = .{
        \\-            .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#855594a8c836723f0230bfd6ad24f47613a147b1",
        \\-            .hash = "zterm-0.3.0-1xmmEM8eHAB0cA7KLXGC7C8Nt7YEJcyoTme4domF1Yty",
        \\+            .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#e972a2ea0f7a9f8caffd439ef206474b46475f91",
        \\+            .hash = "zterm-0.3.0-1xmmENkhHAB2rmNJFH-9rRqiRLnT673xwuMrqLwOnlT_",
        \\         },
        \\     },
        \\     .minimum_zig_version = "0.16.0-dev.1254+bf15c791f",
    , &.{ .content, .file, .header, .content });
}

test "individual changes in the different files" {
    // Two files with one hunk each: the preamble of the second file is part
    // of the content token that follows the first hunk header.
    // The terminating `.eof` is not listed, `testTokenize` checks it itself.
    try testTokenize(
        \\diff --git a/build.zig.zon b/build.zig.zon
        \\index 99bede4..a039487 100644
        \\--- a/build.zig.zon
        \\+++ b/build.zig.zon
        \\@@ -3,8 +3,8 @@
        \\     .version = "0.0.1",
        \\     .dependencies = .{
        \\         .zterm = .{
        \\-            .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#855594a8c836723f0230bfd6ad24f47613a147b1",
        \\-            .hash = "zterm-0.3.0-1xmmEM8eHAB0cA7KLXGC7C8Nt7YEJcyoTme4domF1Yty",
        \\+            .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#e972a2ea0f7a9f8caffd439ef206474b46475f91",
        \\+            .hash = "zterm-0.3.0-1xmmENkhHAB2rmNJFH-9rRqiRLnT673xwuMrqLwOnlT_",
        \\         },
        \\     },
        \\     .minimum_zig_version = "0.16.0-dev.1254+bf15c791f",
        \\diff --git a/src/model.zig b/src/model.zig
        \\index b402c51..defd874 100644
        \\--- a/src/model.zig
        \\+++ b/src/model.zig
        \\@@ -30,3 +30,9 @@ pub const Change = struct {
        \\
        \\ const Model = @This();
        \\ const std = @import("std");
        \\+const lexer = @import("lexer.zig");
        \\+
        \\+test {
        \\+    std.testing.refAllDeclsRecursive(@This());
        \\+    _ = @import("lexer.zig");
        \\+}
    , &.{ .content, .file, .header, .content, .file, .header, .content });
}

test "multiple changes in same file" {
    // One file with several hunks: every hunk header is followed by a
    // content token holding the remainder of the header line and the hunk body.
    // The terminating `.eof` is not listed, `testTokenize` checks it itself.
    try testTokenize(
        \\diff --git a/src/queue.zig b/src/queue.zig
        \\index aae7ddf..2591b0a 100644
        \\--- a/src/queue.zig
        \\+++ b/src/queue.zig
        \\@@ -215,7 +215,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
        \\     // still full and the push in the other thread is still blocked
        \\     // waiting for space.
        \\     try Thread.yield();
        \\-    std.Thread.sleep(std.time.ns_per_s);
        \\+    // std.Thread.sleep(std.time.ns_per_s);
        \\     // Finally, let that other thread go.
        \\     try testing.expectEqual(1, q.pop());
        \\
        \\@@ -225,7 +225,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
        \\         try Thread.yield();
        \\     // But we want to ensure that there's a second push waiting, so
        \\     // here's another sleep.
        \\-    std.Thread.sleep(std.time.ns_per_s / 2);
        \\+    // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\     // Another spurious wake...
        \\     q.not_full.signal();
        \\@@ -233,7 +233,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
        \\     // And another chance for the other thread to see that it's
        \\     // spurious and go back to sleep.
        \\     try Thread.yield();
        \\-    std.Thread.sleep(std.time.ns_per_s / 2);
        \\+    // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\     // Pop that thing and we're done.
        \\     try testing.expectEqual(2, q.pop());
        \\@@ -250,13 +250,13 @@ test "Fill, block, fill, block" {
        \\     const thread = try Thread.spawn(cfg, sleepyPop, .{&queue});
        \\     queue.push(1);
        \\     queue.push(2);
        \\-    const now = std.time.milliTimestamp();
        \\+    // const now = std.time.milliTimestamp();
        \\     queue.push(3); // This one should block.
        \\-    const then = std.time.milliTimestamp();
        \\+    // const then = std.time.milliTimestamp();
        \\
        \\     // Just to make sure the sleeps are yielding to this thread, make
        \\     // sure it took at least 900ms to do the push.
        \\-    try testing.expect(then - now > 900);
        \\+    // try testing.expect(then - now > 900);
        \\
        \\     // This should block again, waiting for the other thread.
        \\     queue.push(4);
        \\@@ -270,14 +270,14 @@ test "Fill, block, fill, block" {
        \\ fn sleepyPush(q: *Queue(u8, 1)) !void {
        \\     // Try to ensure the other thread has already started trying to pop.
        \\     try Thread.yield();
        \\-    std.Thread.sleep(std.time.ns_per_s / 2);
        \\+    // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\     // Spurious wake
        \\     q.not_full.signal();
        \\     q.not_empty.signal();
        \\
        \\     try Thread.yield();
        \\-    std.Thread.sleep(std.time.ns_per_s / 2);
        \\+    // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\     // Stick something in the queue so it can be popped.
        \\     q.push(1);
        \\@@ -286,7 +286,7 @@ fn sleepyPush(q: *Queue(u8, 1)) !void {
        \\         try Thread.yield();
        \\     // Give the other thread time to block again.
        \\     try Thread.yield();
        \\-    std.Thread.sleep(std.time.ns_per_s / 2);
        \\+    // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\     // Spurious wake
        \\     q.not_full.signal();
        \\@@ -317,7 +317,7 @@ test "2 readers" {
        \\     const t1 = try Thread.spawn(cfg, readerThread, .{&queue});
        \\     const t2 = try Thread.spawn(cfg, readerThread, .{&queue});
        \\     try Thread.yield();
        \\-    std.Thread.sleep(std.time.ns_per_s / 2);
        \\+    // std.Thread.sleep(std.time.ns_per_s / 2);
        \\     queue.push(1);
        \\     queue.push(1);
        \\     t1.join();
        \\    );
    , &.{
        .content,
        .file,
        .header,
        .content,
        .header,
        .content,
        .header,
        .content,
        .header,
        .content,
        .header,
        .content,
        .header,
        .content,
        .header,
        .content,
    });
}

/// Run a fresh `Tokenizer` over `source` and compare the tag of every token
/// it yields, in order, against `expected_token_tags`. The terminating `.eof`
/// token is verified by this function itself (its tag, its position at
/// `source.len` and its length of zero), hence it shall be left out of
/// `expected_token_tags`. On a tag mismatch the offending token is dumped
/// before the error is returned.
fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    var lexer = Tokenizer.init(source);

    for (expected_token_tags, 0..) |want, position| {
        const got = lexer.next();
        testing.expectEqual(want, got.tag) catch |err| {
            print("Got token: ", .{});
            lexer.dump(&got);
            print("Expected .{s} at index {d}\n", .{ @tagName(want), position });
            return err;
        };
    }

    // whatever follows the expected tokens has to be the end of the input
    const eof_tag: Token.Tag = .eof;
    const closing = lexer.next();
    testing.expectEqual(eof_tag, closing.tag) catch |err| {
        print("Got token: ", .{});
        lexer.dump(&closing);
        print("Expected .{s}\n", .{@tagName(eof_tag)});
        return err;
    };

    // the eof token sits exactly at the end of the source and is empty
    try testing.expectEqual(source.len, closing.loc.idx);
    try testing.expectEqual(0, closing.loc.len);
}