initial commit based on existing implementation of another project
Some checks failed
Zig Project Action / Lint, Spell-check and test zig project (push) Failing after 1m37s

Added documentation comments with example snippets and a complete
example showcasing how to use the library for matching.
This commit is contained in:
2025-11-25 19:32:30 +01:00
parent eb7cc9c2dc
commit 04f082d801
5 changed files with 325 additions and 0 deletions

280
src/root.zig Normal file
View File

@@ -0,0 +1,280 @@
//! `Fuzzig`, a fuzzy search library. The matching algorithm is based on the MIT-licensed
//! [ms-edit](https://github.com/microsoft/edit/blob/main/src/fuzzy.rs) implementation.
/// Outcome of scoring one target string against a query. A `Result` owns the
/// memory of its `positions` list; release it with `deinit` once the `Result`
/// is no longer needed.
///
/// # Example
///
/// ```zig
/// const item = haystack[result.index];
/// var match_highlights: []u8 = try gpa.alloc(u8, item.len);
/// defer gpa.free(match_highlights);
/// @memset(match_highlights, ' ');
/// // highlight what caused this search result
/// for (result.positions.items) |pos| match_highlights[pos] = '^';
/// ```
///
/// For the example above this yields the following match and highlight
/// (`haystack[i]` = "Hello, World!", `match` = "world"):
///
/// ```
/// Hello, World!
///        ^^^^^
/// ```
pub const Result = struct {
    /// Score calculated for the target string.
    score: usize,
    /// Indices (into the target string) of the characters that were matched
    /// during the fuzzy scoring. The indices are stored in reversed order.
    positions: std.ArrayList(usize),
    /// Index of the entry (e.g. a file) this `Result` relates to; use it to look
    /// up the associated entry in the searched collection.
    index: usize,

    /// Bundle the given values into a `Result`. The `positions` list is stored
    /// as passed in (no copy is made) and is released by `deinit`.
    pub fn init(score: usize, positions: std.ArrayList(usize), index: usize) Result {
        return Result{ .score = score, .positions = positions, .index = index };
    }

    /// Free the memory held by `positions`. `gpa` is handed to the list's own
    /// `deinit`, so it has to be the allocator the list was allocated with.
    pub fn deinit(this: *Result, gpa: Allocator) void {
        this.positions.deinit(gpa);
    }
};
/// Ordering predicate for `std.sort.heap` that places the `Result` with the
/// higher score first, i.e. sorts results by score in descending order.
///
/// # Example
///
/// The following snippet shows how to pass `greaterThan` to the heap sort
/// algorithm of the standard library.
///
/// ```zig
/// var results: std.ArrayList(Result) = .empty;
/// // ..
/// std.sort.heap(fuzzig.Result, results.items, {}, fuzzig.greaterThan);
/// // act on sorted scores:
/// for (results.items) |result| {}
/// ```
pub fn greaterThan(_: void, a: Result, b: Result) bool {
    // `a` sorts before `b` exactly when `b` scored strictly lower.
    return b.score < a.score;
}
/// Calculate the matching score for the provided query against the target string. The index is used as
/// a reference for the target string and is passed through to the returned `Result`.
///
/// Returns `null` when no scoring is possible: the target or the query is
/// empty, or the query is longer than the target. Otherwise a `Result` is
/// returned; its `score` is `0` when nothing matched.
///
/// The `positions` of the returned `Result` are allocated with `gpa` and hold
/// the matched target indices in descending order (they are collected while
/// walking the score matrix backwards). The caller owns the `Result` and has
/// to call `Result.deinit` with the same allocator. All other memory used
/// during scoring is released before returning.
///
/// An error is returned if an allocation fails; no memory is leaked in that case.
///
/// # Example
///
/// Given a list of file names, you can match a given string as follows:
///
/// ```zig
/// var results: std.ArrayList(Result) = .empty;
/// defer {
///     for (results.items) |*result| result.deinit(gpa);
///     results.deinit(gpa);
/// }
/// // ..
/// // create fuzzy score for each file entry
/// for (0.., files.items) |idx, entry| {
///     const result = try fuzzig.match(gpa, entry, search, idx) orelse continue;
///     try results.append(gpa, result);
/// }
/// ```
pub fn match(gpa: Allocator, target: []const u8, query: []const u8, index: usize) !?Result {
    if (target.len == 0 or query.len == 0) return null;
    if (target.len < query.len) return null;

    // temporary arena allocator to free all scratch memory at the end of the function
    var scratch_arena: heap.ArenaAllocator = .init(gpa);
    defer scratch_arena.deinit();
    const allocator = scratch_arena.allocator();

    const target_lower = try fold_case(allocator, target);
    const query_lower = try fold_case(allocator, query);

    // Two row-major `query.len` x `target.len` matrices:
    // - `scores[q * target.len + t]`: best score found for `query[0..q + 1]` within `target[0..t + 1]`
    // - `matches[q * target.len + t]`: length of the consecutive match run ending at that cell (0 = no match)
    const area = target.len * query.len;
    const scores = try allocator.alloc(usize, area);
    const matches = try allocator.alloc(usize, area);

    for (0..query.len) |idx| {
        const offset = idx * target.len;
        const prev_offset = if (idx > 0) (idx - 1) * target.len else 0;
        for (0..target.len) |target_index| {
            const current_idx = offset + target_index;
            const has_diag = idx > 0 and target_index > 0;
            const diag_idx = if (has_diag) prev_offset + target_index - 1 else 0;
            const left_score = if (target_index > 0) scores[current_idx - 1] else 0;
            const diag_score = if (has_diag) scores[diag_idx] else 0;
            const matches_sequence_len = if (has_diag) matches[diag_idx] else 0;

            // Any query character after the first only scores when the previous
            // query characters already matched earlier in the target.
            const score = if (diag_score == 0 and idx != 0) 0 else compute_char_score(
                query[idx],
                query_lower[idx],
                if (target_index != 0) target[target_index - 1] else null,
                target[target_index],
                target_lower[target_index],
                matches_sequence_len,
            );

            if (score != 0 and diag_score + score >= left_score) {
                // matching here is at least as good as skipping this target character
                matches[current_idx] = matches_sequence_len + 1;
                scores[current_idx] = diag_score + score;
            } else {
                matches[current_idx] = 0;
                scores[current_idx] = left_score;
            }
        }
    }

    // The positions outlive this function, hence they use `gpa` instead of the
    // scratch arena and must be released if a later append fails.
    var positions: std.ArrayList(usize) = .empty;
    errdefer positions.deinit(gpa);

    // Restore the matched positions, starting at the bottom right of the matrix.
    // Both lengths are known to be non-zero due to the early returns above.
    var query_idx = query.len - 1;
    var target_idx = target.len - 1;
    while (true) {
        const current_idx = query_idx * target.len + target_idx;
        if (matches[current_idx] == 0) {
            if (target_idx == 0) break;
            target_idx -= 1; // go left
        } else {
            try positions.append(gpa, target_idx);
            if (query_idx == 0 or target_idx == 0) break;
            // go up and left
            query_idx -= 1;
            target_idx -= 1;
        }
    }

    return .init(scores[area - 1], positions, index);
}
/// Compute the score for a single query character matched against a single
/// target character, taking into account the previous target character and the
/// length of the already matching (sub-)sequence.
///
/// Two characters are considered equal when their lowercase forms are
/// identical, or when both are path separators (`/` or `\`), so that platform
/// differences in paths are ignored. If they are not equal `0` is returned.
///
/// Otherwise the score is the sum of:
/// - `1` for the match itself
/// - `5 * matches_sequence_len` for continuing a consecutive match
/// - `1` if the characters also agree in case (`query == target_curr`)
/// - `8` if there is no previous target character (`target_prev == null`)
/// - the separator bonus of `score_separator_at_pos` for the previous target character
/// - `2` if there is a previous target character, the target character is upper
///   case and no consecutive match is being continued
fn compute_char_score(query: u8, query_lower: u8, target_prev: ?u8, target_curr: u8, target_curr_lower: u8, matches_sequence_len: usize) usize {
    const query_is_path_separator = query_lower == '/' or query_lower == '\\';
    const target_is_path_separator = target_curr_lower == '/' or target_curr_lower == '\\';
    // NOTE `and` binds tighter than `or`, hence the separator checks are named
    // and grouped explicitly.
    const considered_equal = query_lower == target_curr_lower or
        (query_is_path_separator and target_is_path_separator);
    if (!considered_equal) return 0;

    // character match bonus
    var score: usize = 1;
    // consecutive match bonus (adds nothing for a sequence length of zero)
    score += matches_sequence_len * 5;
    // same case bonus
    if (query == target_curr) score += 1;
    if (target_prev) |prev| {
        // after separator bonus
        score += score_separator_at_pos(prev);
        // upper case bonus, only given outside of a consecutive sequence
        if (target_curr != target_curr_lower and matches_sequence_len == 0) score += 2;
    } else {
        // start of target bonus
        score += 8;
    }
    return score;
}
/// Scoring for separator characters. Slightly preferring path separators over other separators.
/// Returns `0` if `prev` is not a separator.
fn score_separator_at_pos(prev: u8) u32 {
    return switch (prev) {
        '/', '\\' => 5, // prefer path separators...
        '_', '-', '.', ' ', '\'', '"', ':' => 4, // ...over other separators
        else => 0,
    };
}
/// Fold the case of the provided string: ASCII upper case letters (`A`-`Z`)
/// are converted to lower case, every other byte is copied unchanged.
///
/// The returned slice is newly allocated, owned by the caller and has to be
/// freed using the provided `Allocator`. Returns an error if the allocation fails.
fn fold_case(gpa: Allocator, s: []const u8) ![]const u8 {
    // single allocation of exactly `s.len` bytes; nothing to clean up on failure
    const lowered = try std.ascii.allocLowerString(gpa, s);
    return lowered;
}
const std = @import("std");
const heap = std.heap;
const testing = std.testing;
const Allocator = std.mem.Allocator;
// Walks the current working directory, fuzzy matches every file path against
// the query "s" and checks how many entries received a score.
// NOTE(review): the expected count of scored entries is hard-coded and hence
// depends on the files present in the working directory the test is run from.
test "matching `s` on local files" {
    const gpa = testing.allocator;

    // files to fuzzy match against
    var files: std.ArrayList([]const u8) = .empty;
    defer {
        for (files.items) |file| gpa.free(file);
        files.deinit(gpa);
    }

    // fuzzy matching results (containing only the scores)
    var results: std.ArrayList(Result) = .empty;
    defer {
        for (results.items) |*result| result.deinit(gpa);
        results.deinit(gpa);
    }

    // arrange
    var dir = try std.fs.cwd().openDir(".", .{ .iterate = true });
    defer dir.close();
    var iter = try dir.walk(gpa);
    defer iter.deinit();
    while (try iter.next()) |entry| {
        if (entry.kind != .file) continue;
        if (std.mem.startsWith(u8, entry.path, ".git/")) continue;
        if (std.mem.startsWith(u8, entry.path, ".zig-cache")) continue;
        // the walker's path is copied because the list keeps it beyond this iteration
        const path = try gpa.dupe(u8, entry.path);
        errdefer gpa.free(path); // not yet owned by `files` if the append fails
        try files.append(gpa, path);
    }
    try results.ensureTotalCapacity(gpa, files.items.len);

    // act
    const search = "s";
    // create fuzzy score for each file entry
    for (0.., files.items) |idx, entry| {
        const result = try match(gpa, entry, search, idx) orelse continue;
        // capacity for one result per file was reserved above
        results.appendAssumeCapacity(result);
    }
    // sort results by their received score descending
    std.sort.heap(Result, results.items, {}, greaterThan);

    var buf: [128]u8 = undefined;
    var buffer = std.fs.File.stderr().writer(&buf);
    const writer = &buffer.interface;
    defer writer.flush() catch |err| std.log.warn("unable to flush stderr writer: {s}", .{@errorName(err)});
    std.debug.lockStdErr();
    defer std.debug.unlockStdErr();

    // assert
    var scored_entries: usize = 0;
    var unscored_entries: usize = 0;
    for (results.items) |result| {
        if (result.score == 0) {
            // do not print results that are unmatched
            unscored_entries += 1;
            continue;
        }
        scored_entries += 1;

        const item = files.items[result.index];
        const match_highlights = try gpa.alloc(u8, item.len);
        defer gpa.free(match_highlights);
        @memset(match_highlights, ' ');
        // highlight what caused this search result
        for (result.positions.items) |pos| match_highlights[pos] = '^';
        // print item and its highlighted positions
        // NOTE uncomment the print for the writer to show matches and their highlights of what matched
        //      -> as the writer prints to *stderr* writing will cause the test to fail, hence it is commented out by default
        // try writer.print("{s}\n{s}\n", .{ item, match_highlights });
    }
    try testing.expectEqual(5, scored_entries);
    try testing.expectEqual(results.items.len - 5, unscored_entries);
}