diff --git a/src/Ast.zig b/src/Ast.zig
new file mode 100644
index 0000000..229c0e3
--- /dev/null
+++ b/src/Ast.zig
@@ -0,0 +1,282 @@
+//! Abstract syntax tree implementation for `nf` file format.
+nodes: std.MultiArrayList(Node),
+tokens: std.MultiArrayList(Token),
+
+// TODO what would I need from the `Ast`?
+// - parser checks if `Ast` is _correct_ when constructing from source using lexer / tokenizer
+//   -> use that for reporting through the lsp / syntax checker?
+// - Ast holds representation for formatting
+// - Ast holds information about contents and locations for lsp implementation
+
+// TODO do I need more specific ast nodes? or do I even need some kind of interface?
+// -> i.e. blocks should also tell me what kind of block it is
+// -> i.e. headers should also tell me what level (and their corresponding children)
+// -> i.e. links should also tell me what kind of link (file, web, note)
+// -> i.e. table should also tell me details like (columns, rows, headers)
+// -> i.e. topic should also tell me what level (and their corresponding child tags)
+// -> i.e. paragraphs should also tell me what contents (their corresponding children, which are .text or .styled_text)
+// -> have only specific children allowed for each kind?
+
+pub const Tag = enum(u8) {
+    anchor,
+    block,
+    comment,
+    header,
+    link,
+    list,
+    paragraph,
+    reference,
+    table,
+    topic,
+    text,
+    styled_text,
+    /// invalid type as the `Ast` may be not complete to derive the correct type yet
+    invalid,
+};
+
+pub const Type = union(Tag) {
+    anchor: Anchor,
+    block: Block,
+    comment: Comment,
+    header: Header,
+    link: Link,
+    list: List,
+    paragraph: Paragraph,
+    reference: Reference,
+    table: Table,
+    topic: Topic,
+    text: Text,
+    styled_text: Text,
+    invalid,
+
+    /// Resolve the payload's stored index back to its `Node` in `ast`.
+    /// For `.invalid` a zero-located placeholder node is returned.
+    pub fn getNode(this: @This(), ast: Ast) Node {
+        return switch (this) {
+            .anchor => |anchor| ast.nodes.get(anchor.idx),
+            .block => |block| ast.nodes.get(block.idx),
+            .comment => |comment| ast.nodes.get(comment.idx),
+            .header => |header| ast.nodes.get(header.idx),
+            .link => |link| ast.nodes.get(link.idx),
+            .list => |list| ast.nodes.get(list.idx),
+            .paragraph => |paragraph| ast.nodes.get(paragraph.idx),
+            .reference => |reference| ast.nodes.get(reference.idx),
+            .table => |table| ast.nodes.get(table.idx),
+            .topic => |topic| ast.nodes.get(topic.idx),
+            .text => |text| ast.nodes.get(text.idx),
+            .styled_text => |text| ast.nodes.get(text.idx),
+            .invalid => .{
+                .tag = .invalid,
+                .loc = .{
+                    .start = 0,
+                    .end = 0,
+                },
+                .parent = .{
+                    .start = 0,
+                    .end = 0,
+                },
+            },
+        };
+    }
+
+    /// Print the payload-specific details of the active variant (for debugging).
+    pub fn dump(this: @This()) void {
+        switch (this) {
+            .anchor => |anchor| anchor.dump(),
+            .block => |block| block.dump(),
+            .comment => |comment| comment.dump(),
+            .header => |header| header.dump(),
+            .link => |link| link.dump(),
+            .list => |list| list.dump(),
+            .paragraph => |paragraph| paragraph.dump(),
+            .reference => |reference| reference.dump(),
+            .table => |table| table.dump(),
+            .topic => |topic| topic.dump(),
+            .text => |text| text.dump(),
+            .styled_text => |text| text.dump(),
+            .invalid => {},
+        }
+    }
+};
+
+pub const Node = struct {
+    tag: Type = .invalid,
+    loc: Token.Location,
+    // FIX: was `= undefined` — reading an undefined parent (e.g. in getParent) is UB.
+    // 0..0 is a safe "no parent" sentinel: addNode rejects zero-length locations,
+    // so no real node can ever match it.
+    parent: Token.Location = .{ .start = 0, .end = 0 },
+
+    /// Find the node whose location equals `this.parent`; returns an `.invalid`
+    /// placeholder node when no parent is found.
+    pub fn getParent(this: @This(), ast: Ast) Node {
+        // FIX: compare the location fields explicitly instead of `loc == this.parent` —
+        // `==` on struct values is not portable across Zig versions; field comparison is.
+        for (0.., ast.nodes.items(.loc)) |idx, loc| if (loc.start == this.parent.start and loc.end == this.parent.end) return ast.nodes.get(idx);
+
+        return .{
+            .tag = .invalid,
+            .loc = .{
+                .start = 0,
+                .end = 0,
+            },
+            .parent = .{
+                .start = 0,
+                .end = 0,
+            },
+        };
+    }
+
+    /// Print this node and its source span (for debugging). Asserts a non-empty span.
+    pub fn dump(this: @This(), source: [:0]const u8) void {
+        assert(this.loc.start < this.loc.end);
+        print("{s} ", .{@tagName(this.tag)});
+        this.tag.dump();
+        print(": '{s}'\n", .{source[this.loc.start..this.loc.end]});
+    }
+};
+
+pub const Anchor = struct {
+    idx: usize = undefined,
+    target: Token.Location,
+
+    pub fn dump(this: @This()) void {
+        print(".target: {any} ", .{this.target});
+    }
+};
+
+pub const Block = struct {
+    idx: usize = undefined,
+    kind: enum(u8) {
+        tldr,
+        info,
+        warn,
+        quote,
+        math,
+        @"fn",
+        code, // if not matched with one above 'code' is assumed
+    },
+
+    pub fn dump(this: @This()) void {
+        print(".kind: {any} ", .{this.kind});
+    }
+};
+
+pub const Comment = struct {
+    idx: usize = undefined,
+
+    pub fn dump(this: @This()) void {
+        _ = this;
+    }
+};
+
+pub const Header = struct {
+    idx: usize = undefined,
+    level: u8,
+
+    pub fn dump(this: @This()) void {
+        print(".level: {any} ", .{this.level});
+    }
+};
+
+pub const Link = struct {
+    idx: usize = undefined,
+    kind: enum(u2) { note, file, web },
+
+    pub fn dump(this: @This()) void {
+        print(".kind: .{s} ", .{@tagName(this.kind)});
+    }
+};
+
+pub const List = struct {
+    idx: usize = undefined,
+    level: u8,
+    ordered: bool,
+
+    pub fn dump(this: @This()) void {
+        print(".ordered = {any}, .level: {d} ", .{ this.ordered, this.level });
+    }
+};
+
+pub const Paragraph = struct {
+    idx: usize = undefined,
+
+    pub fn dump(this: @This()) void {
+        _ = this;
+    }
+};
+
+pub const Reference = struct {
+    idx: usize = undefined,
+
+    pub fn dump(this: @This()) void {
+        _ = this;
+    }
+};
+
+pub const Table = struct {
+    idx: usize = undefined,
+    cols: u8,
+    rows: u8,
+
+    pub fn dump(this: @This()) void {
+        _ = this;
+    }
+};
+
+pub const Topic = struct {
+    idx: usize = undefined,
+    topics: []const u8, // order here is important!
+
+    pub fn dump(this: @This()) void {
+        _ = this;
+    }
+};
+
+pub const Text = struct {
+    idx: usize = undefined,
+    styled: bool,
+
+    pub fn dump(this: @This()) void {
+        _ = this;
+    }
+};
+
+pub const init: Ast = .{
+    .nodes = .empty,
+    .tokens = .empty,
+};
+
+pub fn deinit(this: *Ast, gpa: Allocator) void {
+    this.nodes.deinit(gpa);
+    this.tokens.deinit(gpa);
+}
+
+/// Append `node`, stamping the payload's `idx` with its position in `nodes`.
+/// Note: zero-length nodes (including `.invalid` placeholders) are rejected by the assert.
+pub fn addNode(this: *Ast, gpa: Allocator, node: Node) !void {
+    assert(node.loc.start < node.loc.end);
+    const idx = this.nodes.len;
+    try this.nodes.ensureTotalCapacity(gpa, this.nodes.len + 1);
+    errdefer comptime unreachable; // no more are possible
+
+    var n = node;
+    switch (n.tag) {
+        .anchor => |*anchor| anchor.idx = idx,
+        .block => |*block| block.idx = idx,
+        .comment => |*comment| comment.idx = idx,
+        .header => |*header| header.idx = idx,
+        .link => |*link| link.idx = idx,
+        .list => |*list| list.idx = idx,
+        .paragraph => |*paragraph| paragraph.idx = idx,
+        .reference => |*reference| reference.idx = idx,
+        .table => |*table| table.idx = idx,
+        .topic => |*topic| topic.idx = idx,
+        .text => |*text| text.idx = idx,
+        .styled_text => |*text| text.idx = idx,
+        .invalid => {},
+    }
+    this.nodes.appendAssumeCapacity(n);
+}
+
+pub fn addToken(this: *Ast, gpa: Allocator, token: Token) !void {
+    try this.tokens.append(gpa, token);
+}
+
+const std = @import("std");
+const debug = std.debug;
+const mem = std.mem;
+const assert = debug.assert;
+const print = debug.print;
+const Allocator = mem.Allocator;
+const lexer = @import("lexer.zig");
+const Token = lexer.Token;
+
+const Ast = @This();
diff --git a/src/ast.zig b/src/ast.zig
deleted file mode 100644
index e69de29..0000000
diff --git a/src/lexer.zig b/src/lexer.zig
index e69de29..00eeb1d 100644
--- a/src/lexer.zig
+++ b/src/lexer.zig
@@ -0,0 +1,1019 @@
+//! Lexer for `nf` file format to tokenize input sources accordingly.
+pub const Token = struct {
+    tag: Tag,
+    loc: Location,
+
+    pub const Location = packed struct {
+        start: u32,
+        end: u32,
+    };
+
+    pub const Tag = enum(u8) {
+        anchor,
+        at_sign,
+        block,
+        colon,
+        comment,
+        eof,
+        equal,
+        hashtag,
+        invalid,
+        l_angle_bracket,
+        r_angle_bracket,
+        l_bracket,
+        l_bracket_colon,
+        l_bracket_minus,
+        l_bracket_slash,
+        r_bracket,
+        minus,
+        newline,
+        reference,
+        pipe,
+        pipe_equal,
+        plus,
+        text,
+        underscore,
+
+        /// Literal source text for fixed-lexeme tags; null for variable-content tags.
+        pub fn lexeme(tag: Tag) ?[]const u8 {
+            return switch (tag) {
+                .at_sign => "@",
+                .colon => ":",
+                .equal => "=",
+                .hashtag => "#",
+                .l_angle_bracket => "<",
+                .r_angle_bracket => ">",
+                .l_bracket => "[",
+                .l_bracket_colon => "[:",
+                .l_bracket_minus => "[-",
+                .l_bracket_slash => "[/",
+                .r_bracket => "]",
+                .minus => "-",
+                .pipe => "|",
+                .pipe_equal => "|=",
+                .plus => "+",
+                .underscore => "_",
+                else => null,
+            };
+        }
+
+        /// Human-readable name for any tag (lexeme when fixed, description otherwise).
+        pub fn symbol(tag: Tag) []const u8 {
+            return tag.lexeme() orelse switch (tag) {
+                .anchor => "anchor",
+                .block => "block",
+                .comment => "comment",
+                .eof => "EOF",
+                .invalid => "invalid token",
+                .newline => "\n",
+                .reference => "reference",
+                .text => "text",
+                else => unreachable,
+            };
+        }
+    };
+};
+
+pub const Tokenizer = struct {
+    buffer: [:0]const u8,
+    index: u32,
+
+    /// For debugging purposes
+    pub fn dump(self: *Tokenizer, token: *const Token) void {
+        // FIX: `<=` (was `<`) so zero-length tokens (.eof) can be dumped too —
+        // testTokenize's failure path dumps the final .eof token.
+        assert(token.loc.start <= token.loc.end);
+        print(".{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.start..token.loc.end] });
+    }
+
+    pub fn init(buffer: [:0]const u8) Tokenizer {
+        return .{
+            .buffer = buffer,
+            // skip the UTF-8 BOM if present
+            .index = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
+        };
+    }
+
+    const State = enum {
+        default,
+        invalid,
+        at_sign,
+        equal,
+        l_angle_bracket,
+        comment,
+        block,
+        slash,
+        pipe,
+        l_bracket,
+    };
+
+    /// state fsm (finite state machine) describing the syntax of `nf`
+    /// TODO I need to draw one for all the possible states for tokenization!
+    ///     -> for that I can create test cases!
+    ///     -> detect valid and invalid syntax uses! this is however the job of the parser?
+    ///
+    /// TODO points to improve on:
+    ///     -> reduce duplicated code sections
+    ///     -> make tags more explicit (i.e. remove unnecessary newlines, whitespaces, etc.)
+    ///     -> streamline catching the common cases for tokens
+    ///     -> reduce state machine
+    ///     -> do not group tokens, instead this should be done by the parser when deriving the ast from the token stream
+    ///        then the parser can identify missing parts and even point to the corresponding location in the file!
+    pub fn next(this: *Tokenizer) Token {
+        var result: Token = .{
+            .tag = undefined,
+            .loc = .{
+                .start = this.index,
+                .end = undefined,
+            },
+        };
+        state: switch (State.default) {
+            .default => switch (this.buffer[this.index]) {
+                0 => if (this.index == this.buffer.len) {
+                    if (result.loc.start != this.index) {
+                        result.tag = .text;
+                    } else {
+                        return .{
+                            .tag = .eof,
+                            .loc = .{
+                                .start = this.index,
+                                .end = this.index,
+                            },
+                        };
+                    }
+                } else {
+                    continue :state .invalid;
+                },
+                '=' => if (result.loc.start != this.index) {
+                    result.tag = .text;
+                } else {
+                    this.index += 1;
+                    result.tag = .equal;
+                },
+                '@' => if (result.loc.start != this.index) {
+                    result.tag = .text;
+                } else {
+                    continue :state .at_sign;
+                },
+                '|' => if (result.loc.start != this.index) {
+                    result.tag = .text;
+                } else {
+                    continue :state .pipe;
+                },
+                '/' => continue :state .slash,
+                '`' => {
+                    // inline code: swallow everything up to the closing backtick as text
+                    var i: u32 = 1;
+                    while (this.buffer[this.index + i] != '`') : (i += 1) {
+                        if (this.index + i >= this.buffer.len) {
+                            this.index += 1;
+                            continue :state .default;
+                        }
+                    } else {
+                        this.index += i;
+                        continue :state .default;
+                    }
+                },
+                '_' => if (result.loc.start != this.index) {
+                    result.tag = .text;
+                } else {
+                    this.index += 1;
+                    result.tag = .underscore;
+                },
+                '#' => if (result.loc.start != this.index) {
+                    result.tag = .text;
+                } else {
+                    this.index += 1;
+                    switch (this.buffer[this.index]) {
+                        '#' => continue :state .block,
+                        else => continue :state .default,
+                    }
+                },
+                ':' => if (result.loc.start != this.index) {
+                    result.tag = .text;
+                } else {
+                    this.index += 1;
+                    result.tag = .colon;
+                },
+                '[' => if (result.loc.start != this.index) {
+                    result.tag = .text;
+                } else {
+                    continue :state .l_bracket;
+                },
+                ']' => if (result.loc.start != this.index) {
+                    result.tag = .text;
+                } else {
+                    this.index += 1;
+                    result.tag = .r_bracket;
+                },
+                '<' => if (result.loc.start != this.index) {
+                    result.tag = .text;
+                } else {
+                    continue :state .l_angle_bracket;
+                },
+                '>' => if (result.loc.start != this.index) {
+                    result.tag = .text;
+                } else {
+                    this.index += 1;
+                    result.tag = .r_angle_bracket;
+                },
+                '+' => if (result.loc.start != this.index) {
+                    result.tag = .text;
+                } else {
+                    this.index += 1;
+                    result.tag = .plus;
+                },
+                '-' => if (result.loc.start != this.index) {
+                    result.tag = .text;
+                } else {
+                    this.index += 1;
+                    result.tag = .minus;
+                },
+                '\n' => if (result.loc.start != this.index) {
+                    result.tag = .text;
+                } else {
+                    this.index += 1;
+                    result.tag = .newline;
+                },
+                else => {
+                    this.index += 1;
+                    continue :state .default;
+                },
+            },
+            .invalid => {
+                this.index += 1;
+                switch (this.buffer[this.index]) {
+                    0 => if (this.index == this.buffer.len) {
+                        result.tag = .invalid;
+                    } else {
+                        continue :state .invalid;
+                    },
+                    else => continue :state .invalid,
+                }
+            },
+            // referencing
+            .at_sign => {
+                this.index += 1;
+                switch (this.buffer[this.index]) {
+                    'a'...'z', 'A'...'Z', '0'...'9', '-', '.', ':', '/' => continue :state .at_sign,
+                    ' ', '\n', '\t', '\r', ']', '|', 0 => result.tag = .reference,
+                    else => continue :state .invalid,
+                }
+            },
+            .l_angle_bracket => {
+                this.index += 1;
+                switch (this.buffer[this.index]) {
+                    'a'...'z', 'A'...'Z', '0'...'9', '-', '.', ':', '/' => continue :state .l_angle_bracket,
+                    '>' => {
+                        this.index += 1;
+                        result.tag = .anchor;
+                    },
+                    else => continue :state .invalid,
+                }
+            },
+            // links
+            .l_bracket => {
+                this.index += 1;
+                switch (this.buffer[this.index]) {
+                    ':' => {
+                        this.index += 1;
+                        result.tag = .l_bracket_colon;
+                    },
+                    '-' => {
+                        this.index += 1;
+                        result.tag = .l_bracket_minus;
+                    },
+                    '/' => {
+                        this.index += 1;
+                        result.tag = .l_bracket_slash;
+                    },
+                    else => {
+                        this.index -= 1;
+                        result.tag = .l_bracket;
+                    },
+                }
+            },
+            .pipe => {
+                this.index += 1;
+                switch (this.buffer[this.index]) {
+                    '=' => {
+                        this.index += 1;
+                        result.tag = .pipe_equal;
+                    },
+                    else => result.tag = .pipe,
+                }
+            },
+            .slash => {
+                this.index += 1;
+                switch (this.buffer[this.index]) {
+                    // `//` starts a comment only at the start of the token, otherwise it is text
+                    '/' => if (result.loc.start != this.index - 1) {
+                        result.tag = .text;
+                        this.index -= 1;
+                    } else continue :state .comment,
+                    else => continue :state .default,
+                }
+            },
+            .comment => {
+                this.index += 1;
+                switch (this.buffer[this.index]) {
+                    0, '\n' => result.tag = .comment,
+                    else => continue :state .comment,
+                }
+            },
+            .block => {
+                this.index += 1;
+                switch (this.buffer[this.index]) {
+                    0 => result.tag = .invalid,
+                    '#' => if (this.buffer[this.index + 1] == '#') {
+                        this.index += 2;
+                        result.tag = .block;
+                    } else {
+                        continue :state .block;
+                    },
+                    else => continue :state .block,
+                }
+            },
+            else => {
+                print("Not yet implemented at {d}: '{s}'", .{ this.index, this.buffer[result.loc.start..this.index] });
+                unreachable;
+            }, // not yet implemented
+        }
+
+        result.loc.end = this.index;
+        return result;
+    }
+
+    /// Peek the next `Token` which would be seen by the Tokenizer after n calls to `next` without changing the internal state of the iterator.
+    /// This allows look ahead parsing of the Token stream.
+    pub fn peek(this: Tokenizer, n: u32) Token {
+        assert(n > 0);
+        var index = this.index;
+        var result: Token = undefined;
+        for (0..n) |_| {
+            result = .{
+                .tag = undefined,
+                .loc = .{
+                    .start = index,
+                    .end = undefined,
+                },
+            };
+            state: switch (State.default) {
+                .default => switch (this.buffer[index]) {
+                    0 => if (index == this.buffer.len) {
+                        if (result.loc.start != index) {
+                            result.tag = .text;
+                        } else {
+                            return .{
+                                .tag = .eof,
+                                .loc = .{
+                                    .start = index,
+                                    .end = index,
+                                },
+                            };
+                        }
+                    } else {
+                        continue :state .invalid;
+                    },
+                    '=' => if (result.loc.start != index) {
+                        result.tag = .text;
+                    } else {
+                        index += 1;
+                        result.tag = .equal;
+                    },
+                    '@' => if (result.loc.start != index) {
+                        result.tag = .text;
+                    } else {
+                        continue :state .at_sign;
+                    },
+                    '|' => if (result.loc.start != index) {
+                        result.tag = .text;
+                    } else {
+                        continue :state .pipe;
+                    },
+                    '/' => continue :state .slash,
+                    '`' => {
+                        var i: u32 = 1;
+                        while (this.buffer[index + i] != '`') : (i += 1) {
+                            if (index + i >= this.buffer.len) {
+                                index += 1;
+                                continue :state .default;
+                            }
+                        } else {
+                            index += i;
+                            continue :state .default;
+                        }
+                    },
+                    '_' => if (result.loc.start != index) {
+                        result.tag = .text;
+                    } else {
+                        index += 1;
+                        result.tag = .underscore;
+                    },
+                    '#' => if (result.loc.start != index) {
+                        result.tag = .text;
+                    } else {
+                        index += 1;
+                        switch (this.buffer[index]) {
+                            '#' => continue :state .block,
+                            else => continue :state .default,
+                        }
+                    },
+                    ':' => if (result.loc.start != index) {
+                        result.tag = .text;
+                    } else {
+                        index += 1;
+                        result.tag = .colon;
+                    },
+                    '[' => if (result.loc.start != index) {
+                        result.tag = .text;
+                    } else {
+                        continue :state .l_bracket;
+                    },
+                    ']' => if (result.loc.start != index) {
+                        result.tag = .text;
+                    } else {
+                        index += 1;
+                        result.tag = .r_bracket;
+                    },
+                    '<' => if (result.loc.start != index) {
+                        result.tag = .text;
+                    } else {
+                        continue :state .l_angle_bracket;
+                    },
+                    '>' => if (result.loc.start != index) {
+                        result.tag = .text;
+                    } else {
+                        index += 1;
+                        result.tag = .r_angle_bracket;
+                    },
+                    '+' => if (result.loc.start != index) {
+                        result.tag = .text;
+                    } else {
+                        index += 1;
+                        result.tag = .plus;
+                    },
+                    '-' => if (result.loc.start != index) {
+                        result.tag = .text;
+                    } else {
+                        index += 1;
+                        result.tag = .minus;
+                    },
+                    '\n' => if (result.loc.start != index) {
+                        result.tag = .text;
+                    } else {
+                        index += 1;
+                        result.tag = .newline;
+                    },
+                    else => {
+                        index += 1;
+                        continue :state .default;
+                    },
+                },
+                .invalid => {
+                    index += 1;
+                    switch (this.buffer[index]) {
+                        0 => if (index == this.buffer.len) {
+                            result.tag = .invalid;
+                        } else {
+                            continue :state .invalid;
+                        },
+                        else => continue :state .invalid,
+                    }
+                },
+                // referencing
+                .at_sign => {
+                    index += 1;
+                    switch (this.buffer[index]) {
+                        'a'...'z', 'A'...'Z', '0'...'9', '-', '.', ':', '/' => continue :state .at_sign,
+                        ' ', '\n', '\t', '\r', ']', '|', 0 => result.tag = .reference,
+                        else => continue :state .invalid,
+                    }
+                },
+                .l_angle_bracket => {
+                    index += 1;
+                    switch (this.buffer[index]) {
+                        'a'...'z', 'A'...'Z', '0'...'9', '-', '.', ':', '/' => continue :state .l_angle_bracket,
+                        '>' => {
+                            index += 1;
+                            result.tag = .anchor;
+                        },
+                        else => continue :state .invalid,
+                    }
+                },
+                // links
+                .l_bracket => {
+                    index += 1;
+                    switch (this.buffer[index]) {
+                        ':' => {
+                            index += 1;
+                            result.tag = .l_bracket_colon;
+                        },
+                        '-' => {
+                            index += 1;
+                            result.tag = .l_bracket_minus;
+                        },
+                        '/' => {
+                            index += 1;
+                            result.tag = .l_bracket_slash;
+                        },
+                        else => {
+                            index -= 1;
+                            result.tag = .l_bracket;
+                        },
+                    }
+                },
+                .pipe => {
+                    index += 1;
+                    switch (this.buffer[index]) {
+                        '=' => {
+                            index += 1;
+                            result.tag = .pipe_equal;
+                        },
+                        else => result.tag = .pipe,
+                    }
+                },
+                .slash => {
+                    index += 1;
+                    switch (this.buffer[index]) {
+                        // FIX: compare against `index - 1` (was `index`) to mirror `next`'s
+                        // .slash state — otherwise a line-leading `//` is misreported as an
+                        // empty .text token instead of entering the comment state.
+                        '/' => if (result.loc.start != index - 1) {
+                            result.tag = .text;
+                            index -= 1;
+                        } else continue :state .comment,
+                        else => continue :state .default,
+                    }
+                },
+                .comment => {
+                    index += 1;
+                    switch (this.buffer[index]) {
+                        0, '\n' => result.tag = .comment,
+                        else => continue :state .comment,
+                    }
+                },
+                .block => {
+                    index += 1;
+                    switch (this.buffer[index]) {
+                        0 => result.tag = .invalid,
+                        '#' => if (this.buffer[index + 1] == '#') {
+                            index += 2;
+                            result.tag = .block;
+                        } else {
+                            continue :state .block;
+                        },
+                        else => continue :state .block,
+                    }
+                },
+                else => {
+                    print("Not yet implemented at {d}: '{s}'", .{ index, this.buffer[result.loc.start..index] });
+                    unreachable;
+                }, // not yet implemented
+            }
+        }
+        result.loc.end = index;
+        return result;
+    }
+};
+
+const std = @import("std");
+const mem = std.mem;
+const debug = std.debug;
+const testing = std.testing;
+const assert = debug.assert;
+const print = debug.print;
+
+test "paragraphs" {
+    try testTokenize(
+        \\This is the first paragraph.
+    , &.{.text});
+    try testTokenize(
+        \\This is the first paragraph.
+        \\Another line.
+    , &.{ .text, .newline, .text });
+    try testTokenize(
+        \\This is the first paragraph.
+        \\This is still part of the paragraph.
+        \\
+    , &.{ .text, .newline, .text, .newline });
+    try testTokenize(
+        \\This is the first paragraph.
+        \\This is still part of the paragraph.
+        \\
+        \\This is the second paragraph. // comment
+        \\
+        \\This is the third paragraph.
+    , &.{ .text, .newline, .text, .newline, .newline, .text, .comment, .newline, .newline, .text });
+    try testTokenize(
+        \\This is the first paragraph.
+        \\This is still part of the paragraph.
+        \\
+        \\// comment before a newline
+        \\This is the second paragraph.
+        \\// comment after a newline
+        \\
+        \\This is the third paragraph.
+    , &.{ .text, .newline, .text, .newline, .newline, .comment, .newline, .text, .newline, .comment, .newline, .newline, .text });
+    try testTokenize(
+        \\This is the first paragraph.
+        \\This is still part of the paragraph.
+        \\
+        \\// comment before a newline
+        \\
+        \\This is the second paragraph.
+        \\// comment after a newline
+        \\
+        \\This is the third paragraph.
+ , &.{ .text, .newline, .text, .newline, .newline, .comment, .newline, .newline, .text, .newline, .comment, .newline, .newline, .text }); + try testTokenize( + \\This is the first paragraph. + \\This is still part of the paragraph. + \\ + \\// comment before a newline + \\This is the second paragraph. + \\--- + \\This is the third paragraph. + \\// comment after a newline + \\ + \\This is the last paragraph. + , &.{ .text, .newline, .text, .newline, .newline, .comment, .newline, .text, .newline, .minus, .minus, .minus, .newline, .text, .newline, .comment, .newline, .newline, .text }); +} + +test "styling" { + try testTokenize("*Test with _same_ more text*", &.{ .text, .underscore, .text, .underscore, .text }); + try testTokenize("*~_test_~*", &.{ .text, .underscore, .text, .underscore, .text }); + try testTokenize("*`~_test_~`*", &.{.text}); +} + +test "valid tracing" { + // target + try testTokenize("", &.{.anchor}); + try testTokenize("", &.{.anchor}); + try testTokenize("", &.{.anchor}); + try testTokenize("", &.{.anchor}); + try testTokenize("", &.{.anchor}); + // reference + try testTokenize("@anchor1", &.{.reference}); + try testTokenize("@a.n.c.h.o.r.", &.{.reference}); + try testTokenize("@a/n/c/h/o/r", &.{.reference}); + try testTokenize("@a:n:c:h:o:r", &.{.reference}); + try testTokenize("@a-n-c-h-o-r", &.{.reference}); + // usage in link context + try testTokenize("##math x^2## ", &.{ .block, .text, .anchor }); + try testTokenize("@a|some text", &.{ .reference, .pipe, .text }); // see complete example below + try testTokenize("[:ul1p.n@builtin-functions|Builtin functions]", &.{ .l_bracket_colon, .text, .reference, .pipe, .text, .r_bracket }); +} + +test "invalid anchors" { + // target + try testTokenize("", &.{.invalid}); + try testTokenize("", &.{.invalid}); + // reference + try testTokenize("@a_n_c_h_o_r", &.{.invalid}); +} + +test "blocks" { + try testTokenize("##zig fn main() void {}##", &.{.block}); + try testTokenize("This is some text ##zig fn 
main() void {}## with inline code.", &.{ .text, .block, .text }); + try testTokenize("##zig fn main() void {}## with inline code.", &.{ .block, .text }); + try testTokenize( + \\##quote + \\Forty hour work weeks are a relic of the Industrial Age. Knowledge works function like athletes - train and sprint, then rest and reassess. + \\-- Naval Ravikant + \\## + , &.{.block}); + try testTokenize( + \\##quote + \\Forty hour work weeks are a relic of the Industrial Age. Knowledge works function like athletes - train and sprint, then rest and reassess. + \\-- Naval Ravikant + \\## With some text afterwards + , &.{ .block, .text }); + try testTokenize("##math x^2## Math", &.{ .block, .text }); + try testTokenize( + \\##math + \\sum(i)^(n)_(i = 0) = (n^2 + n) / 2 + \\## + , &.{.block}); +} + +test "comment" { + try testTokenize("// This is a comment", &.{.comment}); + try testTokenize( + \\// This is a comment + \\// Another comment (which is not treated the same as the one above) + , + &.{ .comment, .newline, .comment }, + ); + try testTokenize( + \\This is simple paragraph with + \\// comments in between + \\followed by some more normal paragraph // with a comment afterwards + \\// Ending with another comment in a new line + , + &.{ .text, .newline, .comment, .newline, .text, .comment, .newline, .comment }, + ); +} + +test "slash with paragraph" { + try testTokenize("/ with some paragraph", &.{.text}); + try testTokenize("they / them", &.{.text}); + try testTokenize( + \\/ with some paragraph + \\followed by some / more paragraph + , + &.{ .text, .newline, .text }, + ); +} + +test "heading" { + try testTokenize("= Heading 1", &.{ .equal, .text }); + try testTokenize("== Heading 2", &.{ .equal, .equal, .text }); + try testTokenize("=== Heading 3", &.{ .equal, .equal, .equal, .text }); + try testTokenize("==== Heading 4", &.{ .equal, .equal, .equal, .equal, .text }); + try testTokenize("===== Heading 5", &.{ .equal, .equal, .equal, .equal, .equal, .text }); + try 
testTokenize("====== Heading 6", &.{ .equal, .equal, .equal, .equal, .equal, .equal, .text }); + try testTokenize("=| test", &.{ .equal, .pipe, .text }); + try testTokenize( + \\= Heading With some paragraph // with a comment + \\ Followed with some more paragraph for that heading. + , &.{ .equal, .text, .comment, .newline, .text }); + try testTokenize( + \\= Heading With some paragraph + \\ Followed with some more paragraph for that heading. + , &.{ .equal, .text, .anchor, .newline, .text }); + try testTokenize( + \\= Heading With some paragraph // With an additional comment + \\ Followed with some more paragraph for that heading. + , &.{ .equal, .text, .anchor, .text, .comment, .newline, .text }); + try testTokenize( + \\= Heading With some paragraph // With an additional comment + \\ Followed with some more paragraph for that heading. + , &.{ .equal, .text, .comment, .newline, .text }); + try testTokenize("Some paragraph with some equal signs=Which is not a heading.", &.{ .text, .equal, .text }); + try testTokenize("Some paragraph with some equal signs =Which is not a heading.", &.{ .text, .equal, .text }); + try testTokenize("Some paragraph with some equal signs = Which is not a heading.", &.{ .text, .equal, .text }); +} + +test "lists" { + try testTokenize( + \\- First level + \\-- Second level + \\-- Second level + \\- First level + \\-- Second level + \\--- Third level + , &.{ .minus, .text, .newline, .minus, .minus, .text, .newline, .minus, .minus, .text, .newline, .minus, .text, .newline, .minus, .minus, .text, .newline, .minus, .minus, .minus, .text }); + try testTokenize( + \\+ First level + \\++ Second level + \\++ Second level + \\+ First level + \\++ Second level + \\+++ Third level + , &.{ .plus, .text, .newline, .plus, .plus, .text, .newline, .plus, .plus, .text, .newline, .plus, .text, .newline, .plus, .plus, .text, .newline, .plus, .plus, .plus, .text }); +} + +test "tables" { + try testTokenize( + \\|= Build Mode |= Runtime Safety |= 
Optimizations | + \\| Debug (default) | Yes | No | + \\| ReleaseSafe | Yes | Yes, Speed | + \\| ReleaseSmall | No | Yes, Size | + \\| ReleaseFast | No | Yes, Speed | + , &.{ + .pipe_equal, .text, .pipe_equal, .text, .pipe_equal, .text, .pipe, .newline, + .pipe, .text, .pipe, .text, .pipe, .text, .pipe, .newline, + .pipe, .text, .pipe, .text, .pipe, .text, .pipe, .newline, + .pipe, .text, .pipe, .text, .pipe, .text, .pipe, .newline, + .pipe, .text, .pipe, .text, .pipe, .text, .pipe, + }); +} + +test "Example note with table" { + try testTokenize( + \\:code:zig: + \\ + \\= Build modes + \\Zig provides different kind of build modes for different purposes. + \\ + \\|= Build Mode |= Runtime Safety |= Optimizations | + \\| Debug (default) | Yes | No | + \\| ReleaseSafe | Yes | Yes, Speed | + \\| ReleaseSmall | No | Yes, Size | + \\| ReleaseFast | No | Yes, Speed | + \\ + \\ + \\With runtime safety checks enabled the compiler asserts code to enable the detection of illegal behaviour during runtime. If such a check fails a call to `@panic` ([:ul1p.n@builtin-functions|Builtin functions]) will be emitted. 
+ \\ + \\##fn [/https://ziglang.org/documentation/master/#Build-Mode]## + , &.{ + .colon, + .text, + .colon, + .text, + .colon, + .newline, + .newline, + .equal, + .text, + .anchor, + .newline, + .text, + .newline, + .newline, + .pipe_equal, + .text, + .pipe_equal, + .text, + .pipe_equal, + .text, + .pipe, + .newline, + .pipe, + .text, + .pipe, + .text, + .pipe, + .text, + .pipe, + .newline, + .pipe, + .text, + .pipe, + .text, + .pipe, + .text, + .pipe, + .newline, + .pipe, + .text, + .pipe, + .text, + .pipe, + .text, + .pipe, + .newline, + .pipe, + .text, + .pipe, + .text, + .pipe, + .text, + .pipe, + .newline, + .anchor, + .newline, + .newline, + .text, + .l_bracket_colon, + .text, + .reference, + .pipe, + .text, + .r_bracket, + .text, + .newline, + .newline, + .block, // footnote + .text, // whitespace + .anchor, + }); +} + +test "Example note with code snippets" { + try testTokenize( + \\:code:zig: + \\ + \\= Conditional Code + \\Controlling not just the control flow of the code, but also which parts of the code base are actually compiled and used when shipping the application is very crucial and often done via condiationally enabling / disabling code. They are usually controlled via _feature toggles_ and can be implemented in zig via [:ly9j.n|comptime] (pre-processor statements in C/C++, etc.). + \\ + \\[:ly9j.n@comptime] even allows mixing build and runtime checks, see the following example: + \\ + \\##zig + \\ fn myFunction() void { + \\ if (hasFeature()) { + \\ // Feature-specific code + \\ } else { + \\ // Default code + \\ } + \\ } + \\ + \\ inline fn hasFeature() bool { + \\ return (comptime comptimeCheck()) and runtimeCheck(); + \\ } + \\## + \\ + \\Both the [:g0ic.n@inline] and [:ly9j.n@comptime] keywords are required, such that the `hasFeature` function call in `myFunction` will be [:msev.n|correctly] evaluated during build-time. 
+ \\ + \\Most commonly such conditional code is used to provide _platform specific_ implementations: + \\ + \\##zig + \\ const builtin = @import("builtin"); + \\ + \\ fn myFunction() void { + \\ if (builtin.os.tag == .macos) { + \\ // This code will only be included if the target OS is macOS. + \\ return; + \\ } + \\ + \\ // This code will be included for all other operating systems. + \\ } + \\## + \\ + \\##fn [/https://mitchellh.com/writing/zig-comptime-conditional-disable|Conditionally Disabling Code with comptime in Zig - Mitchell Hashimoto]## + , &.{ + .colon, + .text, + .colon, + .text, + .colon, + .newline, + .newline, + .equal, + .text, + .anchor, + .newline, + .text, + .underscore, + .text, + .underscore, + .text, + .l_bracket_colon, + .text, + .pipe, + .text, + .r_bracket, + .text, + .minus, + .text, + .plus, + .plus, + .text, + .newline, + .newline, + .l_bracket_colon, + .text, + .reference, + .r_bracket, + .text, + .colon, + .newline, + .newline, + .block, + .newline, + .newline, + .text, + .l_bracket_colon, + .text, + .reference, + .r_bracket, + .text, + .l_bracket_colon, + .text, + .reference, + .r_bracket, + .text, + .l_bracket_colon, + .text, + .pipe, + .text, + .r_bracket, + .text, + .minus, + .text, + .newline, + .newline, + .text, + .underscore, + .text, + .underscore, + .text, + .colon, + .newline, + .newline, + .block, + .newline, + .newline, + .block, // footnote + .text, // whitespace + .anchor, + }); +} + +/// Test tokenizer's iterator outputs for the provided source. It should +/// match the expected token tags, except the very last .eof tag which shall +/// be omitted from the argument of expected_token_tags, as this function +/// explicitly tests for the .eof tag (with corresponding location information). 
+fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void { + var tokenizer = Tokenizer.init(source); + for (0.., expected_token_tags) |i, expected| { + const token = tokenizer.next(); + testing.expectEqual(expected, token.tag) catch |err| { + print("Got token: ", .{}); + tokenizer.dump(&token); + print("Expected .{s} at index {d}\n", .{ @tagName(expected), i }); + return err; + }; + } + const last_token = tokenizer.next(); + testing.expectEqual(Token.Tag.eof, last_token.tag) catch |err| { + print("Got token: ", .{}); + tokenizer.dump(&last_token); + print("Expected .{s}\n", .{@tagName(Token.Tag.eof)}); + return err; + }; + try testing.expectEqual(source.len, last_token.loc.start); + try testing.expectEqual(source.len, last_token.loc.end); +} diff --git a/src/parser.zig b/src/parser.zig index a65a6dc..7d276c2 100644 --- a/src/parser.zig +++ b/src/parser.zig @@ -1 +1,376 @@ -pub const lexer = @import("lexer.zig"); +fn parse(allocator: Allocator, content: [:0]const u8) !Ast { + var ast: Ast = .init; + var tokenizer = Tokenizer.init(content); + var token = tokenizer.next(); + var last_loc: Location = .{ .start = 0, .end = 0 }; + + while (token.tag != .eof) : (token = tokenizer.next()) { + // token + try ast.addToken(allocator, token); + // node + var idx: u32 = 0; // keep track on how much we peeked ahead + // TODO keep track of the parent position too! 
+        const node: Ast.Node = tag: switch (token.tag) {
+            // single-token nodes (original note: "tracing"): emitted as-is at the token's location
+            .anchor => break :tag .{
+                .tag = .{
+                    .anchor = .{ .target = undefined }, // NOTE(review): target left undefined — confirm where it is resolved
+                },
+                .loc = token.loc,
+                .parent = last_loc,
+            },
+            .reference => break :tag .{
+                .tag = .{ .reference = .{} },
+                .loc = token.loc,
+                .parent = last_loc,
+            },
+            .at_sign => break :tag .{ // a lone '@' is recorded as plain (unstyled) text
+                .tag = .{ .text = .{ .styled = false } },
+                .loc = token.loc,
+                .parent = last_loc,
+            },
+            // TODO determine kind of block
+            // -> run lexer on block contents if necessary and inject results into this tree accordingly
+            // -> recursive lexer instance needs to update the found corresponding index with the starting index of the block contents!
+            .block => break :tag .{
+                .tag = .{
+                    .block = .{
+                        .kind = undefined,
+                    },
+                },
+                .loc = token.loc,
+                .parent = last_loc,
+            },
+            .colon => { // topic line, e.g. ":a:b:" — peek over alternating ':' / text tokens to find its extent
+                var loc: Location = token.loc;
+                idx += 1;
+                token = tokenizer.peek(idx);
+                colon: switch (token.tag) {
+                    .colon => {
+                        idx += 1;
+                        token = tokenizer.peek(idx);
+                        continue :colon token.tag;
+                    },
+                    .text => {
+                        idx += 1;
+                        const end = tokenizer.peek(idx);
+                        if (end.tag == .colon) {
+                            loc.end = end.loc.end;
+                            idx += 1;
+                            token = tokenizer.peek(idx);
+                            continue :colon token.tag;
+                        }
+                    },
+                    .newline => if (loc.start + 1 == loc.end) {
+                        // a bare ':' directly followed by newline is not a topic tag; emit no node
+                        continue;
+                    },
+                    else => continue :tag token.tag,
+                }
+                for (0..idx) |_| _ = tokenizer.next(); // consume everything we peeked over
+                break :tag .{
+                    .tag = .{
+                        .topic = .{ .topics = undefined },
+                    },
+                    .loc = loc,
+                    .parent = last_loc,
+                };
+            },
+            .comment => break :tag .{
+                .tag = .{
+                    .comment = .{},
+                },
+                .loc = token.loc,
+                .parent = last_loc,
+            },
+            .newline => { // skip layout-only tokens and restart the dispatch on the next token
+                idx = 0;
+                token = tokenizer.next();
+                continue :tag token.tag;
+            },
+            .l_angle_bracket, .r_angle_bracket, .l_bracket, .r_bracket, .hashtag => {
+                idx = 0;
+                token = tokenizer.next();
+                continue :tag token.tag;
+            },
+            // header: '=' must be followed by a text token
+            .equal => {
+                var loc: Location = token.loc;
+                idx += 1;
+                const next = tokenizer.peek(idx);
+                if (next.tag != .text) {
+                    // 
invalid
+                    continue :tag .invalid;
+                } else {
+                    // FIX this will fail if the entire heading is not only a text, but contains a minus, slash, etc.
+                    loc.end = next.loc.end;
+                    break :tag .{
+                        .tag = .{
+                            .header = .{ .level = undefined }, // NOTE(review): level left undefined — confirm where it is derived
+                        },
+                        .loc = loc,
+                        .parent = last_loc,
+                    };
+                }
+            },
+            // links: "[:" note, "[-" file, "[/" web — each scans forward to the closing ']'
+            .l_bracket_colon => {
+                var loc: Location = token.loc;
+                idx += 1;
+                var next = tokenizer.peek(idx);
+                link: switch (next.tag) {
+                    .text, .reference, .pipe => {
+                        idx += 1;
+                        next = tokenizer.peek(idx);
+                        continue :link next.tag;
+                    },
+                    .r_bracket => {
+                        loc.end = next.loc.end;
+                        idx += 1;
+                        break :tag .{
+                            .tag = .{ .link = .{ .kind = .note } },
+                            .loc = loc,
+                            .parent = last_loc,
+                        };
+                    },
+                    else => {
+                        // invalid link body; emit no node and let the outer loop advance
+                        continue;
+                    },
+                }
+            },
+            .l_bracket_minus => {
+                var loc: Location = token.loc;
+                idx += 1;
+                var next = tokenizer.peek(idx);
+                link: switch (next.tag) {
+                    .text, .pipe => {
+                        idx += 1;
+                        next = tokenizer.peek(idx);
+                        continue :link next.tag;
+                    },
+                    .r_bracket => {
+                        loc.end = next.loc.end;
+                        idx += 1;
+                        break :tag .{
+                            .tag = .{ .link = .{ .kind = .file } },
+                            .loc = loc,
+                            .parent = last_loc,
+                        };
+                    },
+                    else => {
+                        // invalid link body; emit no node and let the outer loop advance
+                        continue;
+                    },
+                }
+            },
+            .l_bracket_slash => {
+                var loc: Location = token.loc;
+                idx += 1;
+                var next = tokenizer.peek(idx);
+                link: switch (next.tag) {
+                    .text, .reference, .pipe => {
+                        idx += 1;
+                        next = tokenizer.peek(idx);
+                        continue :link next.tag;
+                    },
+                    .r_bracket => {
+                        loc.end = next.loc.end;
+                        idx += 1;
+                        break :tag .{
+                            .tag = .{ .link = .{ .kind = .web } },
+                            .loc = loc,
+                            .parent = last_loc,
+                        };
+                    },
+                    else => {
+                        // invalid link body; emit no node and let the outer loop advance
+                        continue;
+                    },
+                }
+            },
+            // list markers: '-' and '+' currently degrade to unstyled text nodes
+            .minus => break :tag .{
+                .tag = .{
+                    .text = .{ .styled = false },
+                },
+                .loc = token.loc,
+                .parent = last_loc,
+            },
+            .plus => break :tag .{
+                .tag = .{
+                    .text = .{ .styled = false },
+                },
+                .loc = token.loc,
+                .parent = last_loc,
+            },
+            // table: '|' currently degrades to an unstyled text node
+            .pipe => break :tag .{
+                .tag = .{
+                    .text = .{ .styled = false },
+                },
+                .loc = token.loc,
+                .parent 
= last_loc,
+            },
+            .pipe_equal => unreachable, // NOTE(review): a '|=' in real input would be UB in release builds — confirm the tokenizer can never emit .pipe_equal here, otherwise emit .invalid instead
+            // plain text: coalesce a run of text-like tokens into one node
+            .text => {
+                var loc: Location = token.loc;
+                idx += 1;
+                var next = tokenizer.peek(idx);
+                text: switch (next.tag) {
+                    .plus, .minus, .text, .colon => {
+                        loc.end = next.loc.end;
+                        idx += 2; // FIXME(review): every sibling peek loop advances idx by 1; += 2 skips a token — confirm intended
+                        next = tokenizer.peek(idx);
+                        continue :text next.tag;
+                    },
+                    else => break :tag .{
+                        .tag = .{
+                            .text = .{ .styled = false },
+                        },
+                        .loc = loc,
+                        .parent = last_loc,
+                    },
+                }
+            },
+            .underscore => { // styled text: '_' text '_'
+                var loc: Location = token.loc;
+                idx += 1;
+                var next = tokenizer.peek(idx);
+                if (next.tag != .text) {
+                    // invalid: '_' not followed by text
+                    loc.end = token.loc.end;
+                    continue :tag .invalid;
+                } else {
+                    loc = next.loc;
+                    idx += 1;
+                    next = tokenizer.peek(idx);
+
+                    if (next.tag == .underscore) {
+                        for (0..idx) |_| _ = tokenizer.next(); // consume '_', text, '_'
+                        break :tag .{
+                            .tag = .{
+                                .text = .{ .styled = true },
+                            },
+                            .loc = loc,
+                            .parent = last_loc,
+                        };
+                    } else {
+                        // invalid: opening '_' without a closing '_'
+                        loc.end = next.loc.end;
+                        break :tag .{
+                            .tag = .invalid,
+                            .loc = loc,
+                            .parent = last_loc,
+                        };
+                    }
+                }
+            },
+            .eof => return ast,
+            .invalid => {
+                idx += 1;
+                token = tokenizer.peek(idx);
+                break :tag .{
+                    .tag = .invalid,
+                    .loc = token.loc,
+                    .parent = last_loc,
+                };
+            },
+        };
+        try ast.addNode(allocator, node);
+        // TODO improve the parent node's location information!
+        switch (node.tag) {
+            .header, .link, .list, .table, .block => last_loc = node.loc, // structural nodes become the parent location for what follows
+            else => {},
+        }
+    }
+
+    return ast;
+}
+
+pub fn parse_reader(allocator: Allocator, reader: Reader) !Ast { // NOTE(review): zig convention would name this parseReader
+    const contents = try reader.readAllAlloc(allocator, std.math.maxInt(usize));
+    defer allocator.free(contents);
+    return parse(allocator, contents[0..contents.len :0]); // FIXME(review): readAllAlloc does not 0-terminate, so this sentinel slice checks contents[len] out of bounds — read with a sentinel (e.g. allocator.dupeZ) instead
+}
+
+const std = @import("std");
+const testing = std.testing;
+const Allocator = std.mem.Allocator;
+const Reader = std.io.AnyReader;
+const lexer = @import("lexer.zig");
+const Tokenizer = lexer.Tokenizer;
+const Location = lexer.Token.Location;
+const Ast = @import("Ast.zig");
+
+test "Example note with code snippets" {
+    const content: [:0]const u8 =
+        \\:code:zig:
+        \\:test:
+        \\:test:another:
+        \\
+        \\= Conditional Code
+        \\Controlling not just the control flow of the code, but also which parts of the code base are actually compiled and used when shipping the application is very crucial and often done via condiationally enabling / disabling code. They are usually controlled via _feature toggles_ and can be implemented in zig via [:ly9j.n|comptime] (pre-processor statements in C/C++, etc.).
+        \\
+        \\[:ly9j.n@comptime] even allows mixing build and runtime checks, see the following example:
+        \\
+        \\##zig
+        \\    fn myFunction() void {
+        \\        if (hasFeature()) {
+        \\            // Feature-specific code
+        \\        } else {
+        \\            // Default code
+        \\        }
+        \\    }
+        \\
+        \\    inline fn hasFeature() bool {
+        \\        return (comptime comptimeCheck()) and runtimeCheck();
+        \\    }
+        \\##
+        \\
+        \\Both the [:g0ic.n@inline] and [:ly9j.n@comptime] keywords are required, such that the `hasFeature` function call in `myFunction` will be [:msev.n|correctly] evaluated during build-time.
+        \\
+        \\Most commonly such conditional code is used to provide _platform specific_ implementations:
+        \\
+        \\##zig
+        \\    const builtin = @import("builtin");
+        \\
+        \\    fn myFunction() void {
+        \\        if (builtin.os.tag == .macos) {
+        \\            // This code will only be included if the target OS is macOS.
+        \\            return;
+        \\        }
+        \\
+        \\        // This code will be included for all other operating systems.
+        \\    }
+        \\##
+        \\
+        \\##fn [/https://mitchellh.com/writing/zig-comptime-conditional-disable|Conditionally Disabling Code with comptime in Zig - Mitchell Hashimoto]##
+    ;
+    var ast = try parse(testing.allocator, content);
+    defer ast.deinit(testing.allocator);
+
+    var idx: usize = 0;
+    while (idx < ast.nodes.len) : (idx += 1) ast.nodes.get(idx).dump(content); // dump every node for manual inspection
+
+    // tag-first access: iterate the tag column, then fetch the full node (and its parent) for each anchor
+    for (ast.nodes.items(.tag)) |tag| switch (tag) {
+        .anchor => |anchor| {
+            std.debug.print("found anchor {any}\n", .{anchor});
+            const node = ast.nodes.get(anchor.idx);
+            std.debug.print("\tassociated node: {any}: ", .{node});
+            node.dump(content);
+            const parent = node.getParent(ast);
+            std.debug.print("\tassociated parent: {any}: ", .{parent});
+            parent.dump(content);
+        },
+        // .reference => |reference| {
+        //     std.debug.print("found reference {any}\n", .{reference});
+        //     const node = ast.nodes.get(reference.idx);
+        //     std.debug.print("associated node: {any}: ", .{node});
+        //     node.dump(content);
+        // },
+        else => {},
+    };
+}
diff --git a/src/root.zig b/src/root.zig
index 325b051..5da6f52 100644
--- a/src/root.zig
+++ b/src/root.zig
@@ -2,5 +2,10 @@
 ///! emits an AST of valid nf file contents. In case of invalid files,
 ///! corresponding errors are returned. For detailed error messages refer to
 ///! `errorMessage()`
-pub const ast = @import("ast.zig");
+pub const Ast = @import("Ast.zig");
 pub const parser = @import("parser.zig");
+pub const lexer = @import("lexer.zig");
+
+test {
+    @import("std").testing.refAllDeclsRecursive(@This());
+}