diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3389c86 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.zig-cache/ +zig-out/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d711c08 --- /dev/null +++ b/LICENSE @@ -0,0 +1,9 @@ +MIT License + +Copyright (c) 2025 Yves Biener + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/build.zig b/build.zig
new file mode 100644
index 0000000..427218f
--- /dev/null
+++ b/build.zig
@@ -0,0 +1,20 @@
+const std = @import("std");
+
+pub fn build(b: *std.Build) void {
+    const target = b.standardTargetOptions(.{});
+    const optimize = b.standardOptimizeOption(.{});
+
+    const mod = b.addModule("fuzzig", .{
+        .root_source_file = b.path("src/root.zig"),
+        .target = target,
+        .optimize = optimize,
+    });
+
+    const mod_tests = b.addTest(.{
+        .root_module = mod,
+    });
+
+    const run_mod_tests = b.addRunArtifact(mod_tests);
+    const test_step = b.step("test", "Run tests");
+    test_step.dependOn(&run_mod_tests.step);
+}
diff --git a/build.zig.zon b/build.zig.zon
new file mode 100644
index 0000000..cd16041
--- /dev/null
+++ b/build.zig.zon
@@ -0,0 +1,14 @@
+.{
+    .name = .fuzzig,
+    // This is a [Semantic Version](https://semver.org/).
+    .version = "0.0.0",
+    .fingerprint = 0x6450ab302d40f9a8, // Changing this has security and trust implications.
+    .minimum_zig_version = "0.16.0-dev.1254+bf15c791f",
+    .dependencies = .{},
+    .paths = .{
+        "build.zig",
+        "build.zig.zon",
+        "src",
+        "LICENSE",
+    },
+}
diff --git a/src/root.zig b/src/root.zig
new file mode 100644
index 0000000..572f34e
--- /dev/null
+++ b/src/root.zig
@@ -0,0 +1,345 @@
+//! `Fuzzig` the fuzzy search library. Matching algorithm implementation is based on
+//! [ms-edit](https://github.com/microsoft/edit/blob/main/src/fuzzy.rs) MIT-Licensed.
+
+/// Result of the calculated score. The `Result` holds memory that needs to be
+/// freed once the `Result` is no longer required.
+///
+/// # Example
+///
+/// ```zig
+/// const item = haystack[result.index];
+/// var match_highlights: []u8 = try gpa.alloc(u8, item.len);
+/// defer gpa.free(match_highlights);
+/// @memset(match_highlights, ' ');
+/// // highlight what caused this search result
+/// for (result.positions.items) |pos| match_highlights[pos] = '^';
+/// ```
+///
+/// For the example above (`haystack[i]` = "Hello, World!", `match` = "world")
+/// this renders the match together with its highlight as:
+///
+/// ```
+/// Hello, World!
+///        ^^^^^
+/// ```
+pub const Result = struct {
+    /// Accumulated score of the match; `0` means the query did not match.
+    score: usize,
+    /// Indices into the target of the characters that were matched during the
+    /// fuzzy scoring, in reverse order (last matched character first).
+    positions: std.ArrayList(usize),
+    /// Index that was passed to `match`. Using this index the associated
+    /// entry (e.g. a file) can be determined.
+    index: usize,
+
+    /// Bundle a computed score with its match positions. The returned
+    /// `Result` takes ownership of `positions`.
+    pub fn init(score: usize, positions: std.ArrayList(usize), index: usize) @This() {
+        return .{
+            .score = score,
+            .positions = positions,
+            .index = index,
+        };
+    }
+
+    /// Free the memory held by `positions`. `gpa` has to be the allocator
+    /// the positions were allocated with (the one passed to `match`).
+    pub fn deinit(this: *@This(), gpa: Allocator) void {
+        this.positions.deinit(gpa);
+    }
+};
+
+/// Compare function for `std.sort.heap`. Sorts items by descending score, so
+/// that the best match comes first.
+///
+/// # Example
+///
+/// The following example shows the usage of the `greaterThan` function when
+/// applying the heap sort algorithm of the standard library.
+///
+/// ```zig
+/// var results: std.ArrayList(Result) = .empty;
+/// // ..
+/// std.sort.heap(fuzzig.Result, results.items, {}, fuzzig.greaterThan);
+/// // act on sorted scores:
+/// for (results.items) |result| {}
+/// ```
+pub fn greaterThan(_: void, a: Result, b: Result) bool {
+    return a.score > b.score;
+}
+
+/// Calculate the matching score for the provided query against the target string. The index is used as
+/// a reference for the target string and is passed through to the returned `Result`.
+///
+/// Matching ignores the case of ASCII letters and treats `/` and `\` as the
+/// same character. `null` is returned if no match is possible at all (i.e.
+/// the target or the query is empty, or the query is longer than the target).
+/// A query that does not match yields a `Result` with a score of `0` and no
+/// positions.
+///
+/// The caller owns the returned `Result` and has to call `Result.deinit` with
+/// the same `gpa`. Fails with `error.OutOfMemory` if an allocation fails.
+///
+/// # Example
+///
+/// Given a list of file names, you can match a given string as follows:
+///
+/// ```zig
+/// var results: std.ArrayList(Result) = .empty;
+/// defer {
+///     for (results.items) |*result| result.deinit(gpa);
+///     results.deinit(gpa);
+/// }
+/// // ..
+/// // create fuzzy score for each file entry
+/// for (0.., files.items) |idx, entry| {
+///     const result = try fuzzig.match(gpa, entry, search, idx) orelse continue;
+///     try results.append(gpa, result);
+/// }
+/// ```
+pub fn match(gpa: Allocator, target: []const u8, query: []const u8, index: usize) !?Result {
+    if (target.len == 0 or query.len == 0) return null;
+    if (target.len < query.len) return null;
+
+    // temporary arena allocator to free all scratch memory at the end of the function
+    var scratch_arena: heap.ArenaAllocator = .init(gpa);
+    defer scratch_arena.deinit();
+
+    const allocator = scratch_arena.allocator();
+    const target_lower = try fold_case(allocator, target);
+    const query_lower = try fold_case(allocator, query);
+
+    // Two tables with one row per query character and one column per target
+    // character: `scores` holds the best score found so far, `matches` the
+    // length of the consecutive match ending in that cell (0 if none).
+    const area = std.math.mul(usize, target.len, query.len) catch return error.OutOfMemory;
+    const scores = try allocator.alloc(usize, area);
+    const matches = try allocator.alloc(usize, area);
+
+    for (0..query.len) |idx| {
+        const offset = idx * target.len;
+        const prev_offset = if (idx > 0) (idx - 1) * target.len else 0;
+
+        for (0..target.len) |target_index| {
+            const current_idx = offset + target_index;
+            const has_diag = idx > 0 and target_index > 0;
+            const diag_idx = if (has_diag) prev_offset + target_index - 1 else 0;
+            const left_score = if (target_index > 0) scores[current_idx - 1] else 0;
+            const diag_score = if (has_diag) scores[diag_idx] else 0;
+            const matches_sequence_len = if (has_diag) matches[diag_idx] else 0;
+
+            // all query characters but the first have to continue a previous match
+            const score = if (diag_score == 0 and idx != 0) 0 else compute_char_score(
+                query[idx],
+                query_lower[idx],
+                if (target_index != 0) target[target_index - 1] else null,
+                target[target_index],
+                target_lower[target_index],
+                matches_sequence_len,
+            );
+
+            if (score != 0 and diag_score + score >= left_score) {
+                matches[current_idx] = matches_sequence_len + 1;
+                scores[current_idx] = diag_score + score;
+            } else {
+                matches[current_idx] = 0;
+                scores[current_idx] = left_score;
+            }
+        }
+    }
+
+    var positions: std.ArrayList(usize) = .empty;
+    // the positions are allocated with `gpa`, not the arena: free them if an append fails
+    errdefer positions.deinit(gpa);
+
+    // walk back from the last cell; both lengths are known to be non-zero here
+    var query_idx = query.len - 1;
+    var target_idx = target.len - 1;
+    while (true) {
+        const current_idx = query_idx * target.len + target_idx;
+        if (matches[current_idx] == 0) {
+            if (target_idx == 0) break;
+            target_idx -= 1;
+        } else {
+            try positions.append(gpa, target_idx);
+            if (query_idx == 0 or target_idx == 0) break;
+            query_idx -= 1;
+            target_idx -= 1;
+        }
+    }
+
+    return .init(scores[area - 1], positions, index);
+}
+
+/// Compute the score for a given character, taking into account the previous
+/// target character and the length of the already matching (sub-)sequence.
+///
+/// Returns `0` if the characters do not match. Two characters match if their
+/// case-folded forms are equal or if both are path separators (`/` or `\`).
+fn compute_char_score(query: u8, query_lower: u8, target_prev: ?u8, target_curr: u8, target_curr_lower: u8, matches_sequence_len: usize) usize {
+    const query_is_separator = query_lower == '/' or query_lower == '\\';
+    const target_is_separator = target_curr_lower == '/' or target_curr_lower == '\\';
+    // grouped explicitly: `and` binds tighter than `or`
+    const is_match = query_lower == target_curr_lower or (query_is_separator and target_is_separator);
+    if (!is_match) return 0;
+
+    // character match bonus
+    var score: usize = 1;
+
+    // consecutive match bonus
+    score += matches_sequence_len * 5;
+
+    // same case bonus
+    if (query == target_curr) score += 1;
+
+    if (target_prev) |prev| {
+        // bonus for following a separator
+        score += score_separator_at_pos(prev);
+        // upper case bonus, only if this character starts a new sequence
+        if (target_curr != target_curr_lower and matches_sequence_len == 0) score += 2;
+    } else {
+        // bonus for matching the very first character of the target
+        score += 8;
+    }
+
+    return score;
+}
+
+/// Scoring for separator characters. Slightly preferring path separators over other separators.
+fn score_separator_at_pos(prev: u8) usize {
+    return switch (prev) {
+        '/', '\\' => 5, // prefer path separators...
+        '_', '-', '.', ' ', '\'', '"', ':' => 4, // ...over other separators
+        else => 0,
+    };
+}
+
+/// Fold the case of the provided string: ASCII upper case letters are replaced
+/// by their lower case counterpart, all other bytes are kept. Returned slice
+/// is owned by the caller and has to be freed using the provided `Allocator`.
+fn fold_case(gpa: Allocator, s: []const u8) ![]const u8 {
+    return try std.ascii.allocLowerString(gpa, s);
+}
+
+const std = @import("std");
+const heap = std.heap;
+const testing = std.testing;
+const Allocator = std.mem.Allocator;
+
+test "match reports score and reversed positions" {
+    const gpa = testing.allocator;
+
+    var result = (try match(gpa, "Hello, World!", "world", 3)).?;
+    defer result.deinit(gpa);
+
+    try testing.expectEqual(65, result.score);
+    try testing.expectEqual(3, result.index);
+    try testing.expectEqualSlices(usize, &.{ 11, 10, 9, 8, 7 }, result.positions.items);
+}
+
+test "path separators only match path separators" {
+    const gpa = testing.allocator;
+
+    // a separator in the query does not match arbitrary characters
+    var unmatched = (try match(gpa, "abc", "/", 0)).?;
+    defer unmatched.deinit(gpa);
+    try testing.expectEqual(0, unmatched.score);
+    try testing.expectEqual(0, unmatched.positions.items.len);
+
+    // a separator in the target does not match arbitrary characters
+    try testing.expectEqual(0, compute_char_score('a', 'a', null, '\\', '\\', 0));
+
+    // `/` and `\` are interchangeable
+    var matched = (try match(gpa, "src\\root.zig", "src/root", 0)).?;
+    defer matched.deinit(gpa);
+    try testing.expect(matched.score > 0);
+    try testing.expectEqual(8, matched.positions.items.len);
+}
+
+test "matching `s` on local files" {
+    const gpa = testing.allocator;
+
+    // files to fuzzy match against
+    var files: std.ArrayList([]const u8) = .empty;
+    defer {
+        for (files.items) |file| gpa.free(file);
+        files.deinit(gpa);
+    }
+
+    // fuzzy matching results (containing only the scores)
+    var results: std.ArrayList(Result) = .empty;
+    defer {
+        for (results.items) |*result| result.deinit(gpa);
+        results.deinit(gpa);
+    }
+
+    // arrange
+    var dir = try std.fs.cwd().openDir(".", .{ .iterate = true });
+    defer dir.close();
+
+    var iter = try dir.walk(gpa);
+    defer iter.deinit();
+
+    while (try iter.next()) |entry| {
+        switch (entry.kind) {
+            .file => {
+                if (std.mem.startsWith(u8, entry.path, ".git/")) continue;
+                if (std.mem.startsWith(u8, entry.path, ".zig-cache")) continue;
+                const path = try gpa.dupe(u8, entry.path[0..entry.path.len]);
+                // not yet owned by `files`: free the copy if the append fails
+                errdefer gpa.free(path);
+                try files.append(gpa, path);
+            },
+            else => continue,
+        }
+    }
+    try results.ensureTotalCapacity(gpa, files.items.len);
+
+    // act
+    const search = "s";
+
+    // create fuzzy score for each file entry
+    for (0.., files.items) |idx, entry| {
+        const result = try match(gpa, entry, search, idx) orelse continue;
+        // capacity for one result per file has been reserved above
+        results.appendAssumeCapacity(result);
+    }
+    // sort scores by their received score descending
+    std.sort.heap(Result, results.items, {}, greaterThan);
+
+    // take the lock first, such that it is still held while flushing below
+    std.debug.lockStdErr();
+    defer std.debug.unlockStdErr();
+
+    var buf: [128]u8 = undefined;
+    var buffer = std.fs.File.stderr().writer(&buf);
+    const writer = &buffer.interface;
+
+    // assert
+    var scored_entries: usize = 0;
+    var unscored_entries: usize = 0;
+    for (results.items) |result| {
+        if (result.score == 0) {
+            unscored_entries += 1;
+            continue; // do not print results that are unmatched
+        }
+        scored_entries += 1;
+
+        const item = files.items[result.index];
+        const match_highlights = try gpa.alloc(u8, item.len);
+        defer gpa.free(match_highlights);
+
+        @memset(match_highlights, ' ');
+        // highlight what caused this search result
+        for (result.positions.items) |pos| match_highlights[pos] = '^';
+        // print item and its highlighted positions
+        // NOTE uncomment the print for the writer to show matches and their highlights of what matched
+        // -> as the writer prints to *stderr* writing will cause the test to fail, hence it is commented out by default
+        // try writer.print("{s}\n{s}\n", .{ item, match_highlights });
+    }
+    try writer.flush();
+
+    // NOTE(review): both counts depend on the files found in the working directory of the test run
+    try testing.expectEqual(5, scored_entries);
+    try testing.expectEqual(results.items.len - 5, unscored_entries);
+}