WIP: first working version
Some checks failed
Zig Project Action / Lint, Spell-check and test zig project (push) Failing after 55s

It still has a minor memory leak and has at least two hacks
implemented that I would like to improve on.
This commit is contained in:
2025-11-29 12:19:07 +01:00
parent 1485385735
commit 8c4b8643af
6 changed files with 748 additions and 23 deletions

395
src/lexer.zig Normal file
View File

@@ -0,0 +1,395 @@
//! Lexer for the *unified diff* format; tokenizes input sources accordingly.
/// A single lexical token: its kind (`tag`) and the byte range (`loc`) it
/// covers in the tokenized source buffer.
pub const Token = struct {
    /// Kind of the token.
    tag: Tag,
    /// Location of the token inside the source buffer.
    loc: Location,

    /// Byte range `[idx, idx + len)` inside the source buffer.
    pub const Location = struct {
        /// Offset of the first byte of the token.
        idx: usize,
        /// Number of bytes covered by the token.
        len: usize,
    };

    pub const Tag = enum(u8) {
        /// File information; contains the content of:
        /// ```
        /// --- a/xxx
        /// +++ b/xxx
        /// ```
        /// *NOTE* includes trailing newline character
        file,
        /// Hunk header information; contains content of:
        /// ```@@ -x,y +z,y @@```
        header,
        /// may be diff content or filler content of the tools output
        content,
        /// invalid contents that could not be parsed correctly
        invalid,
        /// End of file
        eof,

        /// Returns an example text for tags that have a textual form, or
        /// `null` for tags without one (`.invalid` and `.eof`).
        pub fn lexeme(tag: Tag) ?[]const u8 {
            return switch (tag) {
                .header => "@@ -x,y +z,y @@",
                .content => "..",
                .file => "diff --git a/xxx b/xxx",
                // The switch has to be exhaustive; these tags have no textual
                // form, `symbol` provides a readable name for them instead.
                .invalid, .eof => null,
            };
        }

        /// Returns a human readable name for `tag`: its lexeme if it has one,
        /// otherwise a fixed name (`"EOF"` / `"invalid"`).
        pub fn symbol(tag: Tag) []const u8 {
            return tag.lexeme() orelse switch (tag) {
                .eof => "EOF",
                .invalid => "invalid",
                // every other tag has a lexeme and never reaches this switch
                else => unreachable,
            };
        }
    };
};
/// Splits a null-terminated source buffer into `Token`s, one token per call
/// to `next`.
pub const Tokenizer = struct {
    /// Source text being tokenized.
    buffer: [:0]const u8,
    /// Offset into `buffer` at which the next token starts.
    index: usize,

    /// For debugging purposes: prints the tag name of `token` and the part of
    /// the buffer it covers.
    pub fn dump(self: *const Tokenizer, token: *const Token) void {
        print(".{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.idx .. token.loc.idx + token.loc.len] });
    }

    /// Creates a tokenizer positioned at the start of `buffer`.
    pub fn init(buffer: [:0]const u8) Tokenizer {
        return .{
            .buffer = buffer,
            // skip the UTF-8 BOM if present
            .index = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
    }

    /// States of the state machine implemented by `next_token`.
    const State = enum {
        /// Scanning content, looking for the start of a marker (`@` or `-`).
        default,
        /// Emitting an `.invalid` token.
        invalid,
        /// Saw an `@`; checks whether a second `@` follows.
        at_sign,
        /// Saw a `-`; checks whether two more `-` follow.
        minus,
        /// After an opening `@@`; scanning for the closing `@@`.
        header,
        /// After `+++`; scanning for the newline that ends the line.
        plus,
        /// After `---`; scanning for the `+++` marker.
        file,
    };

    /// Returns the next token and advances the tokenizer past it. Once the
    /// end of the buffer is reached every further call returns `.eof`.
    ///
    /// state fsm (finite state machine) describing the syntax of `nf`
    /// TODO I need to draw one for all the possible states for tokenization!
    ///      -> for that I can create test cases!
    ///      -> detect valid and invalid syntax uses! this is however the job of the parser?
    ///
    /// TODO points to improve on:
    ///      -> reduce duplicated code sections
    ///      -> make tags more explicit (i.e. remove unnecessary newlines, whitespaces, etc.)
    ///      -> streamline catching the common cases for tokens
    ///      -> reduce state machine
    ///      -> do not group tokens, instead this should be done by the parser when deriving the ast from the token stream
    ///         then the parser can identify missing parts and even point to the corresponding location in the file!
    pub fn next(this: *Tokenizer) Token {
        const token = this.next_token();
        this.index = token.loc.idx + token.loc.len;
        return token;
    }

    /// Scans one token starting at `this.index` without modifying the
    /// tokenizer. Every token except `.eof` covers at least one byte, so
    /// `next` always makes progress.
    fn next_token(this: *const Tokenizer) Token {
        var index = this.index;
        var result: Token = .{
            .tag = undefined,
            .loc = .{
                .idx = this.index,
                .len = undefined,
            },
        };
        state: switch (State.default) {
            .default => switch (this.buffer[index]) {
                0 => if (index == this.buffer.len) {
                    // terminating sentinel: flush pending content, otherwise report eof
                    if (result.loc.idx != index) {
                        result.tag = .content;
                    } else {
                        return .{
                            .tag = .eof,
                            .loc = .{
                                .idx = index,
                                .len = 0,
                            },
                        };
                    }
                } else {
                    // NUL byte in the middle of the buffer
                    continue :state .invalid;
                },
                '@' => continue :state .at_sign,
                '-' => continue :state .minus,
                else => {
                    index += 1;
                    continue :state .default;
                },
            },
            .invalid => {
                // Consume the offending byte unless it is the terminating
                // sentinel. This also covers an embedded NUL byte: leaving it
                // unconsumed would produce a zero-length `.invalid` token and
                // `next` would return that same token forever.
                if (index < this.buffer.len) index += 1;
                result.tag = .invalid;
            },
            .at_sign => {
                index += 1;
                switch (this.buffer[index]) {
                    '@' => if (result.loc.idx != index - 1) {
                        // `@@` after pending content: end the content token
                        // right before the marker
                        index -= 1;
                        result.tag = .content;
                    } else continue :state .header,
                    else => continue :state .default,
                }
            },
            .header => {
                index += 1;
                switch (this.buffer[index]) {
                    '@' => if (this.buffer[index + 1] == '@') {
                        // closing `@@` is part of the header token
                        result.tag = .header;
                        index += 2;
                    } else continue :state .invalid,
                    0 => continue :state .invalid,
                    else => continue :state .header,
                }
            },
            .minus => {
                index += 1;
                switch (this.buffer[index]) {
                    // assuming that we start with a minus!
                    '-' => if (this.buffer[index + 1] == '-') {
                        if (result.loc.idx != index - 1) {
                            // `---` after pending content: end the content
                            // token right before the marker
                            index -= 1;
                            result.tag = .content;
                        } else {
                            index += 1;
                            continue :state .file;
                        }
                    } else continue :state .default,
                    // NOTE(review): a `-` directly followed by the end of the
                    // buffer turns the whole pending span into `.invalid`
                    // instead of `.content` -- confirm this is intended.
                    0 => continue :state .invalid,
                    else => continue :state .default,
                }
            },
            .file => {
                index += 1;
                switch (this.buffer[index]) {
                    '+' => if (this.buffer[index + 1] == '+' and this.buffer[index + 2] == '+') {
                        index += 2;
                        continue :state .plus;
                    } else continue :state .file,
                    0 => continue :state .invalid,
                    else => continue :state .file,
                }
            },
            .plus => {
                index += 1;
                switch (this.buffer[index]) {
                    '\n' => {
                        index += 1; // include newline
                        result.tag = .file;
                    },
                    0 => continue :state .invalid,
                    else => continue :state .plus,
                }
            },
        }
        result.loc.len = index - result.loc.idx;
        return result;
    }
};
const std = @import("std");
const mem = std.mem;
const debug = std.debug;
const testing = std.testing;
const assert = debug.assert;
const print = debug.print;
// Single file with a single hunk. The trailing `.eof` is not listed in the
// expected tags because `testTokenize` checks for it explicitly.
test "individual change" {
    try testTokenize(
        \\diff --git a/build.zig.zon b/build.zig.zon
        \\index 99bede4..a039487 100644
        \\--- a/build.zig.zon
        \\+++ b/build.zig.zon
        \\@@ -3,8 +3,8 @@
        \\ .version = "0.0.1",
        \\ .dependencies = .{
        \\ .zterm = .{
        \\- .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#855594a8c836723f0230bfd6ad24f47613a147b1",
        \\- .hash = "zterm-0.3.0-1xmmEM8eHAB0cA7KLXGC7C8Nt7YEJcyoTme4domF1Yty",
        \\+ .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#e972a2ea0f7a9f8caffd439ef206474b46475f91",
        \\+ .hash = "zterm-0.3.0-1xmmENkhHAB2rmNJFH-9rRqiRLnT673xwuMrqLwOnlT_",
        \\ },
        \\ },
        \\ .minimum_zig_version = "0.16.0-dev.1254+bf15c791f",
    , &.{ .content, .file, .header, .content });
}
// Two files with one hunk each. The trailing `.eof` is not listed in the
// expected tags because `testTokenize` checks for it explicitly.
test "individual changes in the different files" {
    try testTokenize(
        \\diff --git a/build.zig.zon b/build.zig.zon
        \\index 99bede4..a039487 100644
        \\--- a/build.zig.zon
        \\+++ b/build.zig.zon
        \\@@ -3,8 +3,8 @@
        \\ .version = "0.0.1",
        \\ .dependencies = .{
        \\ .zterm = .{
        \\- .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#855594a8c836723f0230bfd6ad24f47613a147b1",
        \\- .hash = "zterm-0.3.0-1xmmEM8eHAB0cA7KLXGC7C8Nt7YEJcyoTme4domF1Yty",
        \\+ .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#e972a2ea0f7a9f8caffd439ef206474b46475f91",
        \\+ .hash = "zterm-0.3.0-1xmmENkhHAB2rmNJFH-9rRqiRLnT673xwuMrqLwOnlT_",
        \\ },
        \\ },
        \\ .minimum_zig_version = "0.16.0-dev.1254+bf15c791f",
        \\diff --git a/src/model.zig b/src/model.zig
        \\index b402c51..defd874 100644
        \\--- a/src/model.zig
        \\+++ b/src/model.zig
        \\@@ -30,3 +30,9 @@ pub const Change = struct {
        \\
        \\ const Model = @This();
        \\ const std = @import("std");
        \\+const lexer = @import("lexer.zig");
        \\+
        \\+test {
        \\+ std.testing.refAllDeclsRecursive(@This());
        \\+ _ = @import("lexer.zig");
        \\+}
    , &.{ .content, .file, .header, .content, .file, .header, .content });
}
// One file with several hunks. The trailing `.eof` is not listed in the
// expected tags because `testTokenize` checks for it explicitly.
test "multiple changes in same file" {
    try testTokenize(
        \\diff --git a/src/queue.zig b/src/queue.zig
        \\index aae7ddf..2591b0a 100644
        \\--- a/src/queue.zig
        \\+++ b/src/queue.zig
        \\@@ -215,7 +215,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
        \\ // still full and the push in the other thread is still blocked
        \\ // waiting for space.
        \\ try Thread.yield();
        \\- std.Thread.sleep(std.time.ns_per_s);
        \\+ // std.Thread.sleep(std.time.ns_per_s);
        \\ // Finally, let that other thread go.
        \\ try testing.expectEqual(1, q.pop());
        \\
        \\@@ -225,7 +225,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
        \\ try Thread.yield();
        \\ // But we want to ensure that there's a second push waiting, so
        \\ // here's another sleep.
        \\- std.Thread.sleep(std.time.ns_per_s / 2);
        \\+ // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\ // Another spurious wake...
        \\ q.not_full.signal();
        \\@@ -233,7 +233,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
        \\ // And another chance for the other thread to see that it's
        \\ // spurious and go back to sleep.
        \\ try Thread.yield();
        \\- std.Thread.sleep(std.time.ns_per_s / 2);
        \\+ // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\ // Pop that thing and we're done.
        \\ try testing.expectEqual(2, q.pop());
        \\@@ -250,13 +250,13 @@ test "Fill, block, fill, block" {
        \\ const thread = try Thread.spawn(cfg, sleepyPop, .{&queue});
        \\ queue.push(1);
        \\ queue.push(2);
        \\- const now = std.time.milliTimestamp();
        \\+ // const now = std.time.milliTimestamp();
        \\ queue.push(3); // This one should block.
        \\- const then = std.time.milliTimestamp();
        \\+ // const then = std.time.milliTimestamp();
        \\
        \\ // Just to make sure the sleeps are yielding to this thread, make
        \\ // sure it took at least 900ms to do the push.
        \\- try testing.expect(then - now > 900);
        \\+ // try testing.expect(then - now > 900);
        \\
        \\ // This should block again, waiting for the other thread.
        \\ queue.push(4);
        \\@@ -270,14 +270,14 @@ test "Fill, block, fill, block" {
        \\ fn sleepyPush(q: *Queue(u8, 1)) !void {
        \\ // Try to ensure the other thread has already started trying to pop.
        \\ try Thread.yield();
        \\- std.Thread.sleep(std.time.ns_per_s / 2);
        \\+ // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\ // Spurious wake
        \\ q.not_full.signal();
        \\ q.not_empty.signal();
        \\
        \\ try Thread.yield();
        \\- std.Thread.sleep(std.time.ns_per_s / 2);
        \\+ // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\ // Stick something in the queue so it can be popped.
        \\ q.push(1);
        \\@@ -286,7 +286,7 @@ fn sleepyPush(q: *Queue(u8, 1)) !void {
        \\ try Thread.yield();
        \\ // Give the other thread time to block again.
        \\ try Thread.yield();
        \\- std.Thread.sleep(std.time.ns_per_s / 2);
        \\+ // std.Thread.sleep(std.time.ns_per_s / 2);
        \\
        \\ // Spurious wake
        \\ q.not_full.signal();
        \\@@ -317,7 +317,7 @@ test "2 readers" {
        \\ const t1 = try Thread.spawn(cfg, readerThread, .{&queue});
        \\ const t2 = try Thread.spawn(cfg, readerThread, .{&queue});
        \\ try Thread.yield();
        \\- std.Thread.sleep(std.time.ns_per_s / 2);
        \\+ // std.Thread.sleep(std.time.ns_per_s / 2);
        \\ queue.push(1);
        \\ queue.push(1);
        \\ t1.join();
        \\ );
    , &.{
        .content,
        .file,
        .header,
        .content,
        .header,
        .content,
        .header,
        .content,
        .header,
        .content,
        .header,
        .content,
        .header,
        .content,
        .header,
        .content,
    });
}
/// Tokenizes `source` and checks that the produced tags equal
/// `expected_token_tags`, in order. The closing `.eof` token is verified by
/// this function itself (tag, position at the end of `source` and zero
/// length), so it shall be left out of `expected_token_tags`.
/// On a mismatch the offending token is dumped before the error is returned.
fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    var lexer = Tokenizer.init(source);

    for (expected_token_tags, 0..) |want, position| {
        const got = lexer.next();
        testing.expectEqual(want, got.tag) catch |mismatch| {
            print("Got token: ", .{});
            lexer.dump(&got);
            print("Expected .{s} at index {d}\n", .{ @tagName(want), position });
            return mismatch;
        };
    }

    // everything expected was consumed, only the end of file may remain
    const trailing = lexer.next();
    testing.expectEqual(Token.Tag.eof, trailing.tag) catch |mismatch| {
        print("Got token: ", .{});
        lexer.dump(&trailing);
        print("Expected .{s}\n", .{@tagName(Token.Tag.eof)});
        return mismatch;
    };
    try testing.expectEqual(source.len, trailing.loc.idx);
    try testing.expectEqual(0, trailing.loc.len);
}