initial commit based on existing implementation of another project

Added documentation comments with example snippets and a complete example showcasing how to use the library for matching.
2025-11-25 19:32:30 +01:00
parent eb7cc9c2dc
commit 04f082d801
5 changed files with 325 additions and 0 deletions
--- a/src/root.zig
+++ b/src/root.zig
@@ -0,0 +1,280 @@
+//! `Fuzzig` the fuzzy search library. Matching algorithm implementation is based on
+//! [ms-edit](https://github.com/microsoft/edit/blob/main/src/fuzzy.rs) MIT-Licensed.
+
+/// Outcome of scoring one candidate string. A `Result` owns the memory behind
+/// `positions`; call `deinit` once the `Result` is no longer required.
+///
+/// # Example
+///
+/// ```zig
+/// const item = haystack[result.index];
+/// var match_highlights: []u8 = try gpa.alloc(u8, item.len);
+/// defer gpa.free(match_highlights);
+/// @memset(match_highlights, ' ');
+/// // highlight what caused this search result
+/// for (result.positions.items) |pos| match_highlights[pos] = '^';
+/// ```
+///
+/// For `haystack[result.index]` = "Hello, World!" and the query "world", the
+/// snippet above produces the following highlight line below the item:
+///
+/// ```
+/// Hello, World!
+///        ^^^^^
+/// ```
+pub const Result = struct {
+    /// Score calculated for the candidate.
+    score: usize,
+    /// Indices of the candidate's characters that were matched during the
+    /// fuzzy scoring. The indices are stored in reverse order.
+    positions: std.ArrayList(usize),
+    /// Index of the candidate this `Result` relates to; use it to look the
+    /// candidate up again in the collection that was searched.
+    index: usize,
+
+    /// Bundle the given values into a `Result`. The `positions` list is stored
+    /// as-is (no copy is made) and is released later by `deinit`.
+    pub fn init(score: usize, positions: std.ArrayList(usize), index: usize) @This() {
+        const result: @This() = .{
+            .index = index,
+            .positions = positions,
+            .score = score,
+        };
+        return result;
+    }
+
+    /// Free the memory held by `positions`. `gpa` has to be the allocator the
+    /// list was filled with.
+    pub fn deinit(self: *@This(), gpa: Allocator) void {
+        self.positions.deinit(gpa);
+    }
+};
+
+/// Comparison function for `std.sort.heap` that orders `Result`s by score,
+/// highest score first.
+///
+/// # Example
+///
+/// The following snippet shows how to pass `greaterThan` to the heap sort
+/// algorithm of the standard library.
+///
+/// ```zig
+/// var results: std.ArrayList(Result) = .empty;
+/// // ..
+/// std.sort.heap(fuzzig.Result, results.items, {}, fuzzig.greaterThan);
+/// // act on sorted scores:
+/// for (results.items) |result| {}
+/// ```
+pub fn greaterThan(_: void, a: Result, b: Result) bool {
+    // `a` sorts before `b` exactly when its score is strictly larger.
+    const lhs = a.score;
+    const rhs = b.score;
+    return rhs < lhs;
+}
+
+/// Calculate the matching score for the provided query against the target string. The index is used as
+/// a reference for the target string and is passed through to the returned `Result`.
+///
+/// Returns `null` when matching cannot be attempted at all, i.e. the target or
+/// the query is empty, or the query is longer than the target. A target that
+/// was scored but did not match the query is still returned as a `Result`,
+/// with a `score` of `0` and no `positions`.
+///
+/// The returned `Result` owns its `positions` (allocated with `gpa`) and has
+/// to be released with `Result.deinit(gpa)`. All other memory used while
+/// scoring is freed before the function returns. Allocation failures are
+/// returned as errors; nothing is leaked in that case.
+///
+/// # Example
+///
+/// Given a list of file names, you can match a given string as follows:
+///
+/// ```zig
+/// var results: std.ArrayList(Result) = .empty;
+/// defer {
+///     for (results.items) |*result| result.deinit(gpa);
+///     results.deinit(gpa);
+/// }
+/// // ..
+/// // create fuzzy score for each file entry
+/// for (0.., files.items) |idx, entry| {
+///     const result = try fuzzig.match(gpa, entry, search, idx) orelse continue;
+///     try results.append(gpa, result);
+/// }
+/// ```
+pub fn match(gpa: Allocator, target: []const u8, query: []const u8, index: usize) !?Result {
+    if (target.len == 0 or query.len == 0) return null;
+    if (target.len < query.len) return null;
+
+    // temporary arena allocator to free all scratch memory at the end of the function
+    var scratch_arena: heap.ArenaAllocator = .init(gpa);
+    defer scratch_arena.deinit();
+
+    const allocator = scratch_arena.allocator();
+    const target_lower = try fold_case(allocator, target);
+    const query_lower = try fold_case(allocator, query);
+
+    // Two `query.len` x `target.len` tables stored row by row (one row per query character):
+    // - `scores[row * target.len + col]` is the best score found so far for that cell
+    // - `matches[row * target.len + col]` is the length of the run of consecutive
+    //   matches ending in that cell (0 when the cell is not a match)
+    const area = target.len * query.len;
+    const scores = try allocator.alloc(usize, area);
+    const matches = try allocator.alloc(usize, area);
+
+    for (0..query.len) |idx| {
+        const offset = idx * target.len;
+        const prev_offset = if (idx > 0) (idx - 1) * target.len else 0;
+
+        for (0..target.len) |target_index| {
+            const current_idx = offset + target_index;
+            const diag_idx = if (idx > 0 and target_index > 0) prev_offset + target_index - 1 else 0;
+            const left_score = if (target_index > 0) scores[current_idx - 1] else 0;
+            const diag_score = if (idx > 0 and target_index > 0) scores[diag_idx] else 0;
+            const matches_sequence_len = if (idx > 0 and target_index > 0) matches[diag_idx] else 0;
+
+            // Every query character after the first can only score when the previous
+            // query character already scored somewhere before this target character.
+            const score = if (diag_score == 0 and idx != 0) 0 else compute_char_score(
+                query[idx],
+                query_lower[idx],
+                if (target_index != 0) target[target_index - 1] else null,
+                target[target_index],
+                target_lower[target_index],
+                matches_sequence_len,
+            );
+
+            // Take the match when it is at least as good as carrying over the score from
+            // the left neighbour; otherwise carry the left score and reset the run length.
+            if (score != 0 and diag_score + score >= left_score) {
+                matches[current_idx] = matches_sequence_len + 1;
+                scores[current_idx] = diag_score + score;
+            } else {
+                matches[current_idx] = 0;
+                scores[current_idx] = left_score;
+            }
+        }
+    }
+
+    // `positions` lives in the caller's allocator (not in the scratch arena) as it is
+    // handed out with the `Result`; release it if collecting the positions fails.
+    var positions: std.ArrayList(usize) = .empty;
+    errdefer positions.deinit(gpa);
+
+    // Walk back from the last cell to collect the matched target indices. As the walk
+    // runs from the end towards the start, the positions end up in reverse order.
+    // Both lengths are known to be non-zero here (see the early returns above).
+    var query_idx = query.len - 1;
+    var target_idx = target.len - 1;
+
+    while (true) {
+        const current_idx = query_idx * target.len + target_idx;
+        if (matches[current_idx] == 0) {
+            if (target_idx == 0) break;
+            target_idx -= 1;
+        } else {
+            try positions.append(gpa, target_idx);
+            if (query_idx == 0 or target_idx == 0) break;
+            query_idx -= 1;
+            target_idx -= 1;
+        }
+    }
+
+    // the last cell holds the overall score of the query against the whole target
+    return Result.init(scores[area - 1], positions, index);
+}
+
+/// Compute the score of a single query character against a single target
+/// character, taking into account the character preceding the target character
+/// and the length of the match sequence that directly precedes this pair.
+///
+/// Two characters are considered equal if their case-folded forms are equal, or
+/// if both of them are path separators (`/` or `\`), so that a query using one
+/// kind of separator also matches a target using the other.
+///
+/// Returns `0` when the characters are not considered equal. Otherwise the
+/// score is composed of:
+/// - `1` for the match itself
+/// - `5` for every character of the directly preceding match sequence
+/// - `1` if the characters are also equal without case folding
+/// - `8` if the target character is the first character of the target,
+///   otherwise the separator bonus for the preceding target character plus `2`
+///   if the target character differs from its case-folded form while not
+///   continuing a match sequence
+fn compute_char_score(query: u8, query_lower: u8, target_prev: ?u8, target_curr: u8, target_curr_lower: u8, matches_sequence_len: usize) usize {
+    // NOTE `and` binds tighter than `or`, hence the separator checks are named and
+    // grouped explicitly: both sides have to be a path separator to be treated as equal.
+    const query_is_path_separator = query_lower == '/' or query_lower == '\\';
+    const target_is_path_separator = target_curr_lower == '/' or target_curr_lower == '\\';
+    const considered_equal = query_lower == target_curr_lower or
+        (query_is_path_separator and target_is_path_separator);
+    if (!considered_equal) return 0;
+
+    // base score for the match itself
+    var score: usize = 1;
+
+    // reward continuing an already matching sequence
+    if (matches_sequence_len > 0) score += matches_sequence_len * 5;
+
+    // reward an exact (case sensitive) match
+    if (query == target_curr) score += 1;
+
+    if (target_prev) |prev| {
+        // reward matches directly after a separator
+        score += score_separator_at_pos(prev);
+        // reward a target character that is changed by case folding when it starts a new sequence
+        if (target_curr != target_curr_lower and matches_sequence_len == 0) score += 2;
+    } else {
+        // reward matching the very first character of the target
+        score += 8;
+    }
+
+    return score;
+}
+
+/// Bonus for a match that directly follows the separator character `prev`.
+/// Path separators (`/`, `\`) are slightly preferred (5) over the other
+/// separators `_`, `-`, `.`, space, `'`, `"` and `:` (4). Any other character
+/// yields no bonus (0).
+fn score_separator_at_pos(prev: u8) u32 {
+    // prefer path separators...
+    if (prev == '/' or prev == '\\') return 5;
+
+    // ...over other separators
+    const other_separators = "_-. '\":";
+    for (other_separators) |separator| {
+        if (prev == separator) return 4;
+    }
+
+    return 0;
+}
+
+/// Fold the case of the provided string: the ASCII letters `A` to `Z` are
+/// replaced by their lowercase form, every other byte is copied unchanged.
+///
+/// The returned slice has the same length as `s`, is owned by the caller and
+/// has to be freed using the provided `Allocator`. Fails only if the
+/// allocation of the returned slice fails.
+fn fold_case(gpa: Allocator, s: []const u8) ![]const u8 {
+    // A single exact-size allocation: there is no intermediate list that could
+    // be leaked if handing over its memory fails.
+    const folded = try gpa.alloc(u8, s.len);
+    for (folded, s) |*dst, c| dst.* = std.ascii.toLower(c);
+    return folded;
+}
+
+const std = @import("std");
+const heap = std.heap;
+const testing = std.testing;
+const Allocator = std.mem.Allocator;
+
+// Complete usage example: walks the current working directory, fuzzy matches
+// every file path against the query "s", sorts the results by score and
+// (optionally) renders a highlight line per matched path.
+// NOTE the expected number of scored entries depends on the files that are
+// present in the working directory the test is run from.
+test "matching `s` on local files" {
+    const gpa = testing.allocator;
+
+    // paths to fuzzy match against (every entry is an owned copy)
+    var files: std.ArrayList([]const u8) = .empty;
+    defer {
+        for (files.items) |owned_path| gpa.free(owned_path);
+        files.deinit(gpa);
+    }
+
+    // one `Result` per path that was scored
+    var results: std.ArrayList(Result) = .empty;
+    defer {
+        for (results.items) |*scored| scored.deinit(gpa);
+        results.deinit(gpa);
+    }
+
+    // arrange
+    var dir = try std.fs.cwd().openDir(".", .{ .iterate = true });
+    defer dir.close();
+
+    var walker = try dir.walk(gpa);
+    defer walker.deinit();
+
+    while (try walker.next()) |entry| {
+        // only regular files outside of the git and zig cache directories are of interest
+        if (entry.kind != .file) continue;
+        if (std.mem.startsWith(u8, entry.path, ".git/")) continue;
+        if (std.mem.startsWith(u8, entry.path, ".zig-cache")) continue;
+
+        // the walker reuses the memory of `entry.path`, hence keep a copy
+        const owned_path = try gpa.dupe(u8, entry.path);
+        try files.append(gpa, owned_path);
+    }
+    try results.ensureTotalCapacity(gpa, files.items.len);
+
+    // act
+    const search = "s";
+
+    // create fuzzy score for each file entry
+    for (files.items, 0..) |path, idx| {
+        const scored = try match(gpa, path, search, idx) orelse continue;
+        try results.append(gpa, scored);
+    }
+    // sort results by their score, highest first
+    std.sort.heap(Result, results.items, {}, greaterThan);
+
+    var buf: [128]u8 = undefined;
+    var buffer = std.fs.File.stderr().writer(&buf);
+    const writer = &buffer.interface;
+    defer writer.flush() catch unreachable;
+
+    std.debug.lockStdErr();
+    defer std.debug.unlockStdErr();
+
+    // assert
+    var scored_entries: usize = 0;
+    var unscored_entries: usize = 0;
+    for (results.items) |result| {
+        if (result.score == 0) {
+            // unmatched results are counted but not rendered
+            unscored_entries += 1;
+            continue;
+        }
+        scored_entries += 1;
+
+        const item = files.items[result.index];
+        const match_highlights: []u8 = try gpa.alloc(u8, item.len);
+        defer gpa.free(match_highlights);
+
+        @memset(match_highlights, ' ');
+        // highlight what caused this search result
+        for (result.positions.items) |pos| match_highlights[pos] = '^';
+        // print item and its highlighted positions
+        // NOTE uncomment the print for the writer to show matches and their highlights of what matched
+        // -> as the writer prints to *stderr* writing will cause the test to fail, hence it is commented out by default
+        // try writer.print("{s}\n{s}\n", .{ item, match_highlights });
+    }
+    try testing.expectEqual(5, scored_entries);
+    try testing.expectEqual(results.items.len - 5, unscored_entries);
+}