WIP: first working version
Some checks failed
Zig Project Action / Lint, Spell-check and test zig project (push) Failing after 55s
It still has a minor memory leak and at least two hacks that I would like to improve on.
395
src/lexer.zig
Normal file
@@ -0,0 +1,395 @@
//! Lexer for the *unified diff* format to tokenize input sources accordingly.
pub const Token = struct {
    tag: Tag,
    loc: Location,

    pub const Location = struct {
        idx: usize,
        len: usize,
    };

    pub const Tag = enum(u8) {
        /// File information; contains the content of:
        /// ```
        /// --- a/xxx
        /// +++ b/xxx
        /// ```
        /// *NOTE* includes the trailing newline character
        file,
        /// Hunk header information; contains the content of:
        /// ```@@ -x,y +z,y @@```
        header,
        /// May be diff content or filler content of the tool's output
        content,
        /// Invalid contents that could not be parsed correctly
        invalid,
        /// End of file
        eof,

        pub fn lexeme(tag: Tag) ?[]const u8 {
            return switch (tag) {
                .header => "@@ -x,y +z,y @@",
                .content => "..",
                .file => "diff --git a/xxx b/xxx",
                // .invalid and .eof have no fixed lexeme
                .invalid, .eof => null,
            };
        }

        pub fn symbol(tag: Tag) []const u8 {
            return tag.lexeme() orelse switch (tag) {
                .eof => "EOF",
                .invalid => "invalid",
                else => unreachable,
            };
        }
    };
};
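
// For orientation: in a typical `git diff` output, the leading `diff --git`
// and `index` lines lex as .content, a `---`/`+++` pair as a single .file
// token, each `@@ ... @@` line as .header, and hunk bodies as .content,
// ending with .eof (see the expectations in the tests below).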
pub const Tokenizer = struct {
    buffer: [:0]const u8,
    index: usize,

    /// For debugging purposes
    pub fn dump(self: *const Tokenizer, token: *const Token) void {
        print(".{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.idx .. token.loc.idx + token.loc.len] });
    }

    pub fn init(buffer: [:0]const u8) Tokenizer {
        return .{
            .buffer = buffer,
            // skip the UTF-8 BOM if present
            .index = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
    }

    const State = enum {
        default,
        invalid,
        at_sign,
        minus,
        header,
        plus,
        file,
    };

    /// State machine (FSM) describing the syntax of `nf`.
    /// TODO I need to draw one for all the possible states of the tokenization!
    /// -> for that I can create test cases!
    /// -> detect valid and invalid syntax uses! this is however the job of the parser?
    ///
    /// TODO points to improve on:
    /// -> reduce duplicated code sections
    /// -> make tags more explicit (i.e. remove unnecessary newlines, whitespace, etc.)
    /// -> streamline catching the common cases for tokens
    /// -> reduce the state machine
    /// -> do not group tokens; instead this should be done by the parser when
    ///    deriving the AST from the token stream, so the parser can identify
    ///    missing parts and even point to the corresponding location in the file!
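    ///
    /// Rough transition sketch, read off the state switch below (illustrative):
    ///   default --'@'--> at_sign --'@'--> header --"@@"--> emit .header
    ///   default --'-'--> minus --"--"--> file --"+++"--> plus --'\n'--> emit .file
    ///   anything else loops in default and is eventually emitted as .content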
    pub fn next(this: *Tokenizer) Token {
        const token = this.next_token();
        this.index = token.loc.idx + token.loc.len;
        return token;
    }

    fn next_token(this: *const Tokenizer) Token {
        var index = this.index;
        var result: Token = .{
            .tag = undefined,
            .loc = .{
                .idx = this.index,
                .len = undefined,
            },
        };
        state: switch (State.default) {
            .default => switch (this.buffer[index]) {
                0 => if (index == this.buffer.len) {
                    if (result.loc.idx != index) {
                        result.tag = .content;
                    } else {
                        return .{
                            .tag = .eof,
                            .loc = .{
                                .idx = index,
                                .len = 0,
                            },
                        };
                    }
                } else {
                    continue :state .invalid;
                },
                '@' => continue :state .at_sign,
                '-' => continue :state .minus,
                else => {
                    index += 1;
                    continue :state .default;
                },
            },
            .invalid => {
                switch (this.buffer[index]) {
                    0 => result.tag = .invalid,
                    else => {
                        index += 1;
                        result.tag = .invalid;
                    },
                }
            },
            .at_sign => {
                index += 1;
                switch (this.buffer[index]) {
                    '@' => if (result.loc.idx != index - 1) {
                        index -= 1;
                        result.tag = .content;
                    } else continue :state .header,
                    else => continue :state .default,
                }
            },
            .header => {
                index += 1;
                switch (this.buffer[index]) {
                    '@' => if (this.buffer[index + 1] == '@') {
                        result.tag = .header;
                        index += 2;
                    } else continue :state .invalid,
                    0 => continue :state .invalid,
                    else => continue :state .header,
                }
            },
            .minus => {
                index += 1;
                switch (this.buffer[index]) {
                    // assuming that we start with a minus!
                    '-' => if (this.buffer[index + 1] == '-') {
                        if (result.loc.idx != index - 1) {
                            index -= 1;
                            result.tag = .content;
                        } else {
                            index += 1;
                            continue :state .file;
                        }
                    } else continue :state .default,
                    0 => continue :state .invalid,
                    else => continue :state .default,
                }
            },
            .file => {
                // std.log.err(".file: {s}", .{this.buffer[index - 2 .. @min(index + 3, this.buffer.len)]});
                index += 1;
                switch (this.buffer[index]) {
                    '+' => if (this.buffer[index + 1] == '+' and this.buffer[index + 2] == '+') {
                        index += 2;
                        continue :state .plus;
                    } else continue :state .file,
                    0 => continue :state .invalid,
                    else => continue :state .file,
                }
            },
            .plus => {
                // std.log.err(".plus", .{});
                index += 1;
                switch (this.buffer[index]) {
                    '\n' => {
                        index += 1; // include newline
                        result.tag = .file;
                    },
                    0 => continue :state .invalid,
                    else => continue :state .plus,
                }
            },
        }

        result.loc.len = index - result.loc.idx;
        return result;
    }
};

const std = @import("std");
const mem = std.mem;
const debug = std.debug;
const testing = std.testing;
const assert = debug.assert;
const print = debug.print;
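
// Minimal usage sketch: drive the tokenizer by hand until .eof and slice each
// token's text out of the source buffer, the same way `dump` above does.
test "usage sketch: iterate until eof" {
    const source = "@@ -1,1 +1,1 @@\n content\n";
    var tokenizer = Tokenizer.init(source);
    var count: usize = 0;
    while (true) {
        const token = tokenizer.next();
        if (token.tag == .eof) break;
        // a token's text is a slice of the original buffer
        const text = source[token.loc.idx .. token.loc.idx + token.loc.len];
        try testing.expectEqual(token.loc.len, text.len);
        count += 1;
    }
    // .header for the hunk line, .content for the body line
    try testing.expectEqual(2, count);
}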
test "individual change" {
|
||||
try testTokenize(
|
||||
\\diff --git a/build.zig.zon b/build.zig.zon
|
||||
\\index 99bede4..a039487 100644
|
||||
\\--- a/build.zig.zon
|
||||
\\+++ b/build.zig.zon
|
||||
\\@@ -3,8 +3,8 @@
|
||||
\\ .version = "0.0.1",
|
||||
\\ .dependencies = .{
|
||||
\\ .zterm = .{
|
||||
\\- .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#855594a8c836723f0230bfd6ad24f47613a147b1",
|
||||
\\- .hash = "zterm-0.3.0-1xmmEM8eHAB0cA7KLXGC7C8Nt7YEJcyoTme4domF1Yty",
|
||||
\\+ .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#e972a2ea0f7a9f8caffd439ef206474b46475f91",
|
||||
\\+ .hash = "zterm-0.3.0-1xmmENkhHAB2rmNJFH-9rRqiRLnT673xwuMrqLwOnlT_",
|
||||
\\ },
|
||||
\\ },
|
||||
\\ .minimum_zig_version = "0.16.0-dev.1254+bf15c791f",
|
||||
, &.{ .content, .file, .header, .content, .eof });
|
||||
}
|
||||
|
||||
test "individual changes in the different files" {
|
||||
try testTokenize(
|
||||
\\diff --git a/build.zig.zon b/build.zig.zon
|
||||
\\index 99bede4..a039487 100644
|
||||
\\--- a/build.zig.zon
|
||||
\\+++ b/build.zig.zon
|
||||
\\@@ -3,8 +3,8 @@
|
||||
\\ .version = "0.0.1",
|
||||
\\ .dependencies = .{
|
||||
\\ .zterm = .{
|
||||
\\- .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#855594a8c836723f0230bfd6ad24f47613a147b1",
|
||||
\\- .hash = "zterm-0.3.0-1xmmEM8eHAB0cA7KLXGC7C8Nt7YEJcyoTme4domF1Yty",
|
||||
\\+ .url = "git+https://gitea.yves-biener.de/yves-biener/zterm#e972a2ea0f7a9f8caffd439ef206474b46475f91",
|
||||
\\+ .hash = "zterm-0.3.0-1xmmENkhHAB2rmNJFH-9rRqiRLnT673xwuMrqLwOnlT_",
|
||||
\\ },
|
||||
\\ },
|
||||
\\ .minimum_zig_version = "0.16.0-dev.1254+bf15c791f",
|
||||
\\diff --git a/src/model.zig b/src/model.zig
|
||||
\\index b402c51..defd874 100644
|
||||
\\--- a/src/model.zig
|
||||
\\+++ b/src/model.zig
|
||||
\\@@ -30,3 +30,9 @@ pub const Change = struct {
|
||||
\\
|
||||
\\ const Model = @This();
|
||||
\\ const std = @import("std");
|
||||
\\+const lexer = @import("lexer.zig");
|
||||
\\+
|
||||
\\+test {
|
||||
\\+ std.testing.refAllDeclsRecursive(@This());
|
||||
\\+ _ = @import("lexer.zig");
|
||||
\\+}
|
||||
, &.{ .content, .file, .header, .content, .file, .header, .content, .eof });
|
||||
}
|
||||
|
||||
test "multiple changes in same file" {
|
||||
try testTokenize(
|
||||
\\diff --git a/src/queue.zig b/src/queue.zig
|
||||
\\index aae7ddf..2591b0a 100644
|
||||
\\--- a/src/queue.zig
|
||||
\\+++ b/src/queue.zig
|
||||
\\@@ -215,7 +215,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
|
||||
\\ // still full and the push in the other thread is still blocked
|
||||
\\ // waiting for space.
|
||||
\\ try Thread.yield();
|
||||
\\- std.Thread.sleep(std.time.ns_per_s);
|
||||
\\+ // std.Thread.sleep(std.time.ns_per_s);
|
||||
\\ // Finally, let that other thread go.
|
||||
\\ try testing.expectEqual(1, q.pop());
|
||||
\\
|
||||
\\@@ -225,7 +225,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
|
||||
\\ try Thread.yield();
|
||||
\\ // But we want to ensure that there's a second push waiting, so
|
||||
\\ // here's another sleep.
|
||||
\\- std.Thread.sleep(std.time.ns_per_s / 2);
|
||||
\\+ // std.Thread.sleep(std.time.ns_per_s / 2);
|
||||
\\
|
||||
\\ // Another spurious wake...
|
||||
\\ q.not_full.signal();
|
||||
\\@@ -233,7 +233,7 @@ fn sleepyPop(q: *Queue(u8, 2)) !void {
|
||||
\\ // And another chance for the other thread to see that it's
|
||||
\\ // spurious and go back to sleep.
|
||||
\\ try Thread.yield();
|
||||
\\- std.Thread.sleep(std.time.ns_per_s / 2);
|
||||
\\+ // std.Thread.sleep(std.time.ns_per_s / 2);
|
||||
\\
|
||||
\\ // Pop that thing and we're done.
|
||||
\\ try testing.expectEqual(2, q.pop());
|
||||
\\@@ -250,13 +250,13 @@ test "Fill, block, fill, block" {
|
||||
\\ const thread = try Thread.spawn(cfg, sleepyPop, .{&queue});
|
||||
\\ queue.push(1);
|
||||
\\ queue.push(2);
|
||||
\\- const now = std.time.milliTimestamp();
|
||||
\\+ // const now = std.time.milliTimestamp();
|
||||
\\ queue.push(3); // This one should block.
|
||||
\\- const then = std.time.milliTimestamp();
|
||||
\\+ // const then = std.time.milliTimestamp();
|
||||
\\
|
||||
\\ // Just to make sure the sleeps are yielding to this thread, make
|
||||
\\ // sure it took at least 900ms to do the push.
|
||||
\\- try testing.expect(then - now > 900);
|
||||
\\+ // try testing.expect(then - now > 900);
|
||||
\\
|
||||
\\ // This should block again, waiting for the other thread.
|
||||
\\ queue.push(4);
|
||||
\\@@ -270,14 +270,14 @@ test "Fill, block, fill, block" {
|
||||
\\ fn sleepyPush(q: *Queue(u8, 1)) !void {
|
||||
\\ // Try to ensure the other thread has already started trying to pop.
|
||||
\\ try Thread.yield();
|
||||
\\- std.Thread.sleep(std.time.ns_per_s / 2);
|
||||
\\+ // std.Thread.sleep(std.time.ns_per_s / 2);
|
||||
\\
|
||||
\\ // Spurious wake
|
||||
\\ q.not_full.signal();
|
||||
\\ q.not_empty.signal();
|
||||
\\
|
||||
\\ try Thread.yield();
|
||||
\\- std.Thread.sleep(std.time.ns_per_s / 2);
|
||||
\\+ // std.Thread.sleep(std.time.ns_per_s / 2);
|
||||
\\
|
||||
\\ // Stick something in the queue so it can be popped.
|
||||
\\ q.push(1);
|
||||
\\@@ -286,7 +286,7 @@ fn sleepyPush(q: *Queue(u8, 1)) !void {
|
||||
\\ try Thread.yield();
|
||||
\\ // Give the other thread time to block again.
|
||||
\\ try Thread.yield();
|
||||
\\- std.Thread.sleep(std.time.ns_per_s / 2);
|
||||
\\+ // std.Thread.sleep(std.time.ns_per_s / 2);
|
||||
\\
|
||||
\\ // Spurious wake
|
||||
\\ q.not_full.signal();
|
||||
\\@@ -317,7 +317,7 @@ test "2 readers" {
|
||||
\\ const t1 = try Thread.spawn(cfg, readerThread, .{&queue});
|
||||
\\ const t2 = try Thread.spawn(cfg, readerThread, .{&queue});
|
||||
\\ try Thread.yield();
|
||||
\\- std.Thread.sleep(std.time.ns_per_s / 2);
|
||||
\\+ // std.Thread.sleep(std.time.ns_per_s / 2);
|
||||
\\ queue.push(1);
|
||||
\\ queue.push(1);
|
||||
\\ t1.join();
|
||||
\\ );
|
||||
, &.{
|
||||
.content,
|
||||
.file,
|
||||
.header,
|
||||
.content,
|
||||
.header,
|
||||
.content,
|
||||
.header,
|
||||
.content,
|
||||
.header,
|
||||
.content,
|
||||
.header,
|
||||
.content,
|
||||
.header,
|
||||
.content,
|
||||
.header,
|
||||
.content,
|
||||
.eof,
|
||||
});
|
||||
}
|
||||
|
||||

/// Test the tokenizer's iterator output for the provided source. Each token's
/// tag must match the corresponding entry in expected_token_tags. The function
/// then explicitly checks that the tokenizer ends on an .eof token (with the
/// corresponding location information); since the tokenizer keeps returning
/// .eof at the end of input, a trailing .eof in expected_token_tags is
/// allowed but not required.
fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    var tokenizer = Tokenizer.init(source);
    for (0.., expected_token_tags) |i, expected| {
        const token = tokenizer.next();
        testing.expectEqual(expected, token.tag) catch |err| {
            print("Got token: ", .{});
            tokenizer.dump(&token);
            print("Expected .{s} at index {d}\n", .{ @tagName(expected), i });
            return err;
        };
    }
    const last_token = tokenizer.next();
    testing.expectEqual(Token.Tag.eof, last_token.tag) catch |err| {
        print("Got token: ", .{});
        tokenizer.dump(&last_token);
        print("Expected .{s}\n", .{@tagName(Token.Tag.eof)});
        return err;
    };
    try testing.expectEqual(source.len, last_token.loc.idx);
    try testing.expectEqual(0, last_token.loc.len);
}