feat: lexing and parsing implementation
Some checks failed
Zig Project Action / Lint, Spell-check and test zig project (push) Failing after 1m33s

This commit is contained in:
2025-06-11 19:03:03 +02:00
parent 1fb3ed0496
commit 896f68248f
5 changed files with 1683 additions and 2 deletions

282
src/Ast.zig Normal file
View File

@@ -0,0 +1,282 @@
//! Abstract syntax tree implementation for `nf` file format.
/// Flat node storage; nodes reference one another via indices and source locations.
nodes: std.MultiArrayList(Node),
/// Raw token stream kept alongside the nodes (useful for tooling such as an lsp or formatter).
tokens: std.MultiArrayList(Token),
// TODO what would I need from the `Ast`?
// - parser checks if `Ast` is _correct_ when constructing from source using lexer / tokenizer
// -> use that for reporting through the lsp / syntax checker?
// - Ast holds representation for formatting
// - Ast holds information about contents and locations for lsp implementation
// TODO do I need more specific ast nodes? or do I even need some kind of interface?
// -> i.e. blocks should also tell me what kind of block it is
// -> i.e. headers should also tell me what level (and their corresponding children)
// -> i.e. links should also tell me what kind of link (file, web, note)
// -> i.e. table should also tell me details like (columns, rows, headers)
// -> i.e. topic should also tell me what level (and their corresponding child tags)
// -> i.e. paragraphs should also tell me what contents (their corresponding children, which are .text or .styled_text)
// -> have only specific children allowed for each kind?
/// Discriminator for every node kind the parser can emit.
/// Backed by `u8`; declaration order fixes the integer values, so do not reorder.
pub const Tag = enum(u8) {
anchor,
block,
comment,
header,
link,
list,
paragraph,
reference,
table,
topic,
text,
styled_text,
/// invalid type as the `Ast` may be not complete to derive the correct type yet
invalid,
};
/// Payload union keyed by `Tag`; every payload type carries an `idx`
/// back-reference into `Ast.nodes` except the `void` member `.invalid`.
pub const Type = union(Tag) {
    anchor: Anchor,
    block: Block,
    comment: Comment,
    header: Header,
    link: Link,
    list: List,
    paragraph: Paragraph,
    reference: Reference,
    table: Table,
    topic: Topic,
    text: Text,
    styled_text: Text,
    invalid,

    /// Resolves the node this payload points at via its stored `idx`.
    /// `.invalid` has no backing node, so a zeroed placeholder is returned.
    pub fn getNode(this: @This(), ast: Ast) Node {
        return switch (this) {
            .invalid => .{
                .tag = .invalid,
                .loc = .{ .start = 0, .end = 0 },
                .parent = .{ .start = 0, .end = 0 },
            },
            // Every non-void payload has an `idx` field; `inline else`
            // generates one prong per member instead of twelve copies.
            inline else => |payload| ast.nodes.get(payload.idx),
        };
    }

    /// Forwards to the payload's `dump`; `.invalid` has nothing to print.
    pub fn dump(this: @This()) void {
        switch (this) {
            .invalid => {},
            inline else => |payload| payload.dump(),
        }
    }
};
pub const Node = struct {
    /// Kind and kind-specific payload of this node.
    tag: Type = .invalid,
    /// Byte range this node covers in the source.
    loc: Token.Location,
    /// Byte range of the parent node; used as the lookup key in `getParent`.
    /// NOTE(review): defaults to `undefined` — callers must set it before lookup.
    parent: Token.Location = undefined,

    /// Finds the node whose location matches `this.parent`.
    /// Returns a zeroed `.invalid` node when no parent exists.
    pub fn getParent(this: @This(), ast: Ast) Node {
        for (0.., ast.nodes.items(.loc)) |idx, loc| {
            // `==` is not defined for struct types in Zig; the original
            // `loc == this.parent` would not compile. Compare field-wise.
            if (loc.start == this.parent.start and loc.end == this.parent.end)
                return ast.nodes.get(idx);
        }
        return .{
            .tag = .invalid,
            .loc = .{ .start = 0, .end = 0 },
            .parent = .{ .start = 0, .end = 0 },
        };
    }

    /// Prints the tag name, payload details, and the covered source slice.
    /// Asserts a non-empty location, so `.invalid` placeholders must not be dumped.
    pub fn dump(this: @This(), source: [:0]const u8) void {
        assert(this.loc.start < this.loc.end);
        print("{s} ", .{@tagName(this.tag)});
        this.tag.dump();
        print(": '{s}'\n", .{source[this.loc.start..this.loc.end]});
    }
};
/// Anchor payload pointing at a target location in the source.
pub const Anchor = struct {
    idx: usize = undefined, // back-reference into `Ast.nodes`, stamped by `addNode`
    /// Location of the anchor's target (NOTE(review): `parse` currently
    /// leaves this `undefined` — confirm it gets filled in later).
    target: Token.Location,

    /// Writes the target location to stderr, no trailing newline.
    pub fn dump(self: @This()) void {
        print(".target: {any} ", .{self.target});
    }
};
/// Fenced-block payload; the concrete kind is stored in the inline enum.
pub const Block = struct {
    idx: usize = undefined, // back-reference into `Ast.nodes`, stamped by `addNode`
    kind: enum(u8) {
        tldr,
        info,
        warn,
        quote,
        math,
        @"fn",
        code, // if not matched with one above 'code' is assumed
    },

    /// Writes the block kind to stderr, no trailing newline.
    pub fn dump(self: @This()) void {
        print(".kind: {any} ", .{self.kind});
    }
};
/// Comment payload; carries no data besides its node back-reference.
pub const Comment = struct {
    idx: usize = undefined, // back-reference into `Ast.nodes`, stamped by `addNode`

    /// Nothing to print; the parameter is discarded by name.
    pub fn dump(_: @This()) void {}
};
/// Header payload holding the heading level.
pub const Header = struct {
    idx: usize = undefined, // back-reference into `Ast.nodes`, stamped by `addNode`
    level: u8,

    /// Writes the heading level to stderr, no trailing newline.
    pub fn dump(self: @This()) void {
        print(".level: {any} ", .{self.level});
    }
};
/// Link payload; `kind` distinguishes note, file, and web targets.
pub const Link = struct {
    idx: usize = undefined, // back-reference into `Ast.nodes`, stamped by `addNode`
    kind: enum(u2) { note, file, web },

    /// Writes the link kind's tag name to stderr, no trailing newline.
    pub fn dump(self: @This()) void {
        print(".kind: .{s} ", .{@tagName(self.kind)});
    }
};
/// List payload; `ordered` distinguishes `+` from `-` lists, `level` the nesting depth.
pub const List = struct {
    idx: usize = undefined, // back-reference into `Ast.nodes`, stamped by `addNode`
    level: u8,
    ordered: bool,

    /// Writes orderedness and nesting level to stderr, no trailing newline.
    pub fn dump(self: @This()) void {
        print(".ordered = {any}, .level: {d} ", .{ self.ordered, self.level });
    }
};
/// Paragraph payload; carries no data besides its node back-reference.
pub const Paragraph = struct {
    idx: usize = undefined, // back-reference into `Ast.nodes`, stamped by `addNode`

    /// Nothing to print; the parameter is discarded by name.
    pub fn dump(_: @This()) void {}
};
/// Reference payload; carries no data besides its node back-reference.
pub const Reference = struct {
    idx: usize = undefined, // back-reference into `Ast.nodes`, stamped by `addNode`

    /// Nothing to print; the parameter is discarded by name.
    pub fn dump(_: @This()) void {}
};
/// Table payload holding the column and row counts.
pub const Table = struct {
    idx: usize = undefined, // back-reference into `Ast.nodes`, stamped by `addNode`
    cols: u8,
    rows: u8,

    /// Writes the table dimensions to stderr, no trailing newline.
    /// Previously discarded its fields; now consistent with the other
    /// payload dumps (Block, Header, Link, List all print their data).
    pub fn dump(this: @This()) void {
        print(".cols: {d}, .rows: {d} ", .{ this.cols, this.rows });
    }
};
/// Topic payload; `topics` is a source slice whose ordering is significant.
pub const Topic = struct {
    idx: usize = undefined, // back-reference into `Ast.nodes`, stamped by `addNode`
    topics: []const u8, // order here is important!

    /// Nothing printed yet; the parameter is discarded by name.
    pub fn dump(_: @This()) void {}
};
/// Text payload; `styled` mirrors the `.styled_text` tag choice.
pub const Text = struct {
    idx: usize = undefined, // back-reference into `Ast.nodes`, stamped by `addNode`
    styled: bool,

    /// Nothing printed; styling is visible via the node tag instead.
    pub fn dump(_: @This()) void {}
};
/// Canonical empty `Ast`; both lists allocate lazily on first append.
pub const init: Ast = .{
.nodes = .empty,
.tokens = .empty,
};
/// Frees all node and token storage. The tree must not be used afterwards.
pub fn deinit(this: *Ast, gpa: Allocator) void {
    this.tokens.deinit(gpa);
    this.nodes.deinit(gpa);
}
/// Appends `node` to the tree, stamping the payload's `idx` back-reference
/// with the node's position in `nodes`. Asserts a non-empty location.
/// Returns `error.OutOfMemory` only from the capacity reservation.
pub fn addNode(this: *Ast, gpa: Allocator, node: Node) !void {
    assert(node.loc.start < node.loc.end);
    const idx = this.nodes.len;
    // idiomatic form of `ensureTotalCapacity(gpa, len + 1)`
    try this.nodes.ensureUnusedCapacity(gpa, 1);
    errdefer comptime unreachable; // no more errors are possible past this point
    var n = node;
    switch (n.tag) {
        .invalid => {},
        // every non-void payload carries an `idx` field; one inline prong
        // replaces twelve identical arms
        inline else => |*payload| payload.idx = idx,
    }
    this.nodes.appendAssumeCapacity(n);
}
/// Appends a raw lexer token to the token list, growing with `gpa` as needed.
pub fn addToken(this: *Ast, gpa: Allocator, token: Token) !void {
try this.tokens.append(gpa, token);
}
const std = @import("std");
const debug = std.debug;
const mem = std.mem;
const assert = debug.assert;
const print = debug.print;
const Allocator = mem.Allocator;
const lexer = @import("lexer.zig");
const Token = lexer.Token;
const Ast = @This();

View File

File diff suppressed because it is too large Load Diff

View File

@@ -1 +1,376 @@
// NOTE(review): the leading `pub const lexer = ...;` on this line is residue
// from the web diff render fusing the old file's single line onto the new
// `parse` signature — verify against the actual file on disk.
//
// Builds an `Ast` from `content` by walking the token stream once, using
// `tokenizer.peek(idx)` for lookahead and committing peeked tokens with
// `tokenizer.next()` only when a multi-token construct is accepted.
pub const lexer = @import("lexer.zig"); fn parse(allocator: Allocator, content: [:0]const u8) !Ast {
var ast: Ast = .init;
var tokenizer = Tokenizer.init(content);
var token = tokenizer.next();
// location of the most recent container-like node (header/link/list/table/block);
// stamped as `.parent` on every node emitted afterwards
var last_loc: Location = .{ .start = 0, .end = 0 };
while (token.tag != .eof) : (token = tokenizer.next()) {
// token
try ast.addToken(allocator, token);
// node
var idx: u32 = 0; // keep track on how much we peeked ahead
// TODO keep track of the parent position too!
// labeled switch: arms re-dispatch via `continue :tag` and deliver the
// finished node via `break :tag`
const node: Ast.Node = tag: switch (token.tag) {
// tracing
.anchor => break :tag .{
.tag = .{
.anchor = .{ .target = undefined },
},
.loc = token.loc,
.parent = last_loc,
},
.reference => break :tag .{
.tag = .{ .reference = .{} },
.loc = token.loc,
.parent = last_loc,
},
.at_sign => break :tag .{
.tag = .{ .text = .{ .styled = false } },
.loc = token.loc,
.parent = last_loc,
},
// TODO determine kind of block
// -> run lexer on block contents if necessary and inject results into this tree accordingly
// -> recursive lexer instance needs to update the found corresponding index with the starting index of the block contents!
.block => break :tag .{
.tag = .{
.block = .{
.kind = undefined,
},
},
.loc = token.loc,
.parent = last_loc,
},
// topic lines: alternating `:`/text pairs are folded into one `.topic`
.colon => {
var loc: Location = token.loc;
idx += 1;
token = tokenizer.peek(idx);
colon: switch (token.tag) {
.colon => {
idx += 1;
token = tokenizer.peek(idx);
continue :colon token.tag;
},
.text => {
idx += 1;
const end = tokenizer.peek(idx);
if (end.tag == .colon) {
loc.end = end.loc.end;
idx += 1;
token = tokenizer.peek(idx);
continue :colon token.tag;
}
},
.newline => if (loc.start + 1 == loc.end) {
// only have ':\n' which is not a .tag
continue;
},
else => continue :tag token.tag,
}
// commit the peeked tokens before emitting the topic node
for (0..idx) |_| _ = tokenizer.next();
break :tag .{
.tag = .{
.topic = .{ .topics = undefined },
},
.loc = loc,
.parent = last_loc,
};
},
.comment => break :tag .{
.tag = .{
.comment = .{},
},
.loc = token.loc,
.parent = last_loc,
},
// structural noise: skip and re-dispatch on the following token
.newline => {
idx = 0;
token = tokenizer.next();
continue :tag token.tag;
},
.l_angle_bracket, .r_angle_bracket, .l_bracket, .r_bracket, .hashtag => {
idx = 0;
token = tokenizer.next();
continue :tag token.tag;
},
// header
.equal => {
var loc: Location = token.loc;
idx += 1;
const next = tokenizer.peek(idx);
if (next.tag != .text) {
// invalid
continue :tag .invalid;
} else {
// FIX this will fail if the entire heading is not only a text, but contains a minus, slash, etc.
loc.end = next.loc.end;
break :tag .{
.tag = .{
.header = .{ .level = undefined },
},
.loc = loc,
.parent = last_loc,
};
}
},
// link
.l_bracket_colon => {
var loc: Location = token.loc;
idx += 1;
var next = tokenizer.peek(idx);
link: switch (next.tag) {
.text, .reference, .pipe => {
idx += 1;
next = tokenizer.peek(idx);
continue :link next.tag;
},
.r_bracket => {
loc.end = next.loc.end;
idx += 1;
break :tag .{
.tag = .{ .link = .{ .kind = .note } },
.loc = loc,
.parent = last_loc,
};
},
else => {
// invalid
continue;
},
}
},
.l_bracket_minus => {
var loc: Location = token.loc;
idx += 1;
var next = tokenizer.peek(idx);
link: switch (next.tag) {
.text, .pipe => {
idx += 1;
next = tokenizer.peek(idx);
continue :link next.tag;
},
.r_bracket => {
loc.end = next.loc.end;
idx += 1;
break :tag .{
.tag = .{ .link = .{ .kind = .file } },
.loc = loc,
.parent = last_loc,
};
},
else => {
// invalid
continue;
},
}
},
.l_bracket_slash => {
var loc: Location = token.loc;
idx += 1;
var next = tokenizer.peek(idx);
link: switch (next.tag) {
.text, .reference, .pipe => {
idx += 1;
next = tokenizer.peek(idx);
continue :link next.tag;
},
.r_bracket => {
loc.end = next.loc.end;
idx += 1;
break :tag .{
.tag = .{ .link = .{ .kind = .web } },
.loc = loc,
.parent = last_loc,
};
},
else => {
// invalid
continue;
},
}
},
// list
// NOTE(review): list markers currently fall back to plain text nodes;
// presumably a proper `.list` payload is planned — confirm
.minus => break :tag .{
.tag = .{
.text = .{ .styled = false },
},
.loc = token.loc,
.parent = last_loc,
},
.plus => break :tag .{
.tag = .{
.text = .{ .styled = false },
},
.loc = token.loc,
.parent = last_loc,
},
// table
.pipe => break :tag .{
.tag = .{
.text = .{ .styled = false },
},
.loc = token.loc,
.parent = last_loc,
},
.pipe_equal => unreachable,
// text
.text => {
var loc: Location = token.loc;
idx += 1;
var next = tokenizer.peek(idx);
// greedily merge adjacent word-like tokens into one text node
text: switch (next.tag) {
.plus, .minus, .text, .colon => {
loc.end = next.loc.end;
// NOTE(review): increments by 2 while every other arm steps by 1 —
// verify this intentionally skips a token and is not an off-by-one
idx += 2;
next = tokenizer.peek(idx);
continue :text next.tag;
},
else => break :tag .{
.tag = .{
.text = .{ .styled = false },
},
.loc = loc,
.parent = last_loc,
},
}
},
// `_text_` styled-text span
.underscore => {
var loc: Location = token.loc;
idx += 1;
var next = tokenizer.peek(idx);
if (next.tag != .text) {
// invalid
loc.end = token.loc.end;
continue :tag .invalid;
} else {
loc = next.loc;
idx += 1;
next = tokenizer.peek(idx);
if (next.tag == .underscore) {
// commit the peeked tokens of the closed span
for (0..idx) |_| _ = tokenizer.next();
break :tag .{
.tag = .{
.text = .{ .styled = true },
},
.loc = loc,
.parent = last_loc,
};
} else {
// invalid
loc.end = next.loc.end;
break :tag .{
.tag = .invalid,
.loc = loc,
.parent = last_loc,
};
}
}
},
.eof => return ast,
.invalid => {
idx += 1;
token = tokenizer.peek(idx);
break :tag .{
.tag = .invalid,
.loc = token.loc,
.parent = last_loc,
};
},
};
try ast.addNode(allocator, node);
// TODO improve the parent node's location information!
switch (node.tag) {
.header, .link, .list, .table, .block => last_loc = node.loc,
else => {},
}
}
return ast;
}
/// Reads the entire stream into memory and parses it into an `Ast`.
/// All temporary buffers are freed before returning; the caller owns the tree.
pub fn parse_reader(allocator: Allocator, reader: Reader) !Ast {
    const contents = try reader.readAllAlloc(allocator, std.math.maxInt(usize));
    defer allocator.free(contents);
    // BUG FIX: `readAllAlloc` does not null-terminate, so the original
    // `contents[0..contents.len :0]` sentinel re-slice checked one byte past
    // the allocation (out-of-bounds read). Duplicate with an explicit 0
    // sentinel instead.
    const contents_z = try allocator.dupeZ(u8, contents);
    defer allocator.free(contents_z);
    return parse(allocator, contents_z);
}
const std = @import("std");
const testing = std.testing;
const Allocator = std.mem.Allocator;
const Reader = std.io.AnyReader;
const lexer = @import("lexer.zig");
const Tokenizer = lexer.Tokenizer;
const Location = lexer.Token.Location;
const Ast = @import("Ast.zig");
// End-to-end smoke test: lexes and parses a realistic `nf` note (topics,
// headers, links of all three kinds, styled text, and fenced code blocks),
// then dumps every node and walks the anchors back to their parents.
test "Example note with code snippets" {
const content: [:0]const u8 =
\\:code:zig:
\\:test:
\\:test:another:
\\
\\= Conditional Code <conditional-code>
\\Controlling not just the control flow of the code, but also which parts of the code base are actually compiled and used when shipping the application is very crucial and often done via condiationally enabling / disabling code. They are usually controlled via _feature toggles_ and can be implemented in zig via [:ly9j.n|comptime] (pre-processor statements in C/C++, etc.).
\\
\\[:ly9j.n@comptime] even allows mixing build and runtime checks, see the following example:
\\
\\##zig
\\    fn myFunction() void {
\\        if (hasFeature()) {
\\            // Feature-specific code
\\        } else {
\\            // Default code
\\        }
\\    }
\\
\\    inline fn hasFeature() bool {
\\        return (comptime comptimeCheck()) and runtimeCheck();
\\    }
\\##
\\
\\Both the [:g0ic.n@inline] and [:ly9j.n@comptime] keywords are required, such that the `hasFeature` function call in `myFunction` will be [:msev.n|correctly] evaluated during build-time.
\\
\\Most commonly such conditional code is used to provide _platform specific_ implementations:
\\
\\##zig
\\    const builtin = @import("builtin");
\\
\\    fn myFunction() void {
\\        if (builtin.os.tag == .macos) {
\\            // This code will only be included if the target OS is macOS.
\\            return;
\\        }
\\
\\        // This code will be included for all other operating systems.
\\    }
\\##
\\
\\##fn [/https://mitchellh.com/writing/zig-comptime-conditional-disable|Conditionally Disabling Code with comptime in Zig - Mitchell Hashimoto]## <fn-1>
;
var ast = try parse(testing.allocator, content);
defer ast.deinit(testing.allocator);
// dump every node against the original source for visual inspection
var idx: usize = 0;
while (idx < ast.nodes.len) : (idx += 1) ast.nodes.get(idx).dump(content);
// access specific tags and then their associated node
for (ast.nodes.items(.tag)) |tag| switch (tag) {
.anchor => |anchor| {
std.debug.print("found anchor {any}\n", .{anchor});
const node = ast.nodes.get(anchor.idx);
std.debug.print("\tassociated node: {any}: ", .{node});
node.dump(content);
const parent = node.getParent(ast);
std.debug.print("\tassociated parent: {any}: ", .{parent});
parent.dump(content);
},
// .reference => |reference| {
//     std.debug.print("found reference {any}\n", .{reference});
//     const node = ast.nodes.get(reference.idx);
//     std.debug.print("associated node: {any}: ", .{node});
//     node.dump(content);
// },
else => {},
};
}

View File

@@ -2,5 +2,10 @@
///! emits an AST of valid nf file contents. In case of invalid files, ///! emits an AST of valid nf file contents. In case of invalid files,
///! corresponding errors are returned. For detailed error messages refer to ///! corresponding errors are returned. For detailed error messages refer to
///! `errorMessage()` ///! `errorMessage()`
pub const ast = @import("ast.zig"); pub const Ast = @import("Ast.zig");
pub const parser = @import("parser.zig"); pub const parser = @import("parser.zig");
pub const lexer = @import("lexer.zig");
// Reference every public declaration recursively so that nested `test`
// blocks in the imported modules are compiled and run by `zig test`.
test {
@import("std").testing.refAllDeclsRecursive(@This());
}