initial commit based on existing implementation of another project
Some checks failed
Zig Project Action / Lint, Spell-check and test zig project (push) Failing after 1m37s
Some checks failed
Zig Project Action / Lint, Spell-check and test zig project (push) Failing after 1m37s
Added documentation comments with example snippets and a complete example showcasing how to use the library for matching.
This commit is contained in:
280
src/root.zig
Normal file
280
src/root.zig
Normal file
@@ -0,0 +1,280 @@
|
||||
//! `Fuzzig` the fuzzy search library. Matching algorithm implementation is based on
|
||||
//! [ms-edit](https://github.com/microsoft/edit/blob/main/src/fuzzy.rs) MIT-Licensed.
|
||||
|
||||
/// Result of the calculated score. The `Result` holds memory that needs to be
|
||||
/// freed once the `Result` is no longer required.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```zig
|
||||
/// const item = haystack[result.index];
|
||||
/// var match_highlights: []u8 = try gpa.alloc(u8, item.len);
|
||||
/// defer gpa.free(match_highlights);
|
||||
/// @memset(match_highlights, ' ');
|
||||
/// // highlight what caused this search result
|
||||
/// for (result.positions.items) |pos| match_highlights[pos] = '^';
|
||||
/// ```
|
||||
///
|
||||
/// Results in a match (with highlight) as for in the above example
|
||||
/// (`haystack[i]` = "Hello, World!", `match` = "world"):
|
||||
///
|
||||
/// ```
|
||||
/// Hello, World!
|
||||
/// ^^^^^
|
||||
/// ```
|
||||
pub const Result = struct {
|
||||
score: usize,
|
||||
// positions are reversed and contain the indices of the characters that were matched during the fuzzy scoring.
|
||||
positions: std.ArrayList(usize),
|
||||
// index to the file this `Result` relates to. Using this index the associated file can be determined.
|
||||
index: usize,
|
||||
|
||||
pub fn init(score: usize, positions: std.ArrayList(usize), index: usize) @This() {
|
||||
return .{
|
||||
.score = score,
|
||||
.positions = positions,
|
||||
.index = index,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn deinit(this: *@This(), gpa: Allocator) void {
|
||||
this.positions.deinit(gpa);
|
||||
}
|
||||
};
|
||||
|
||||
/// Compare function for `std.sort.heap`. Comparator sort items by their highest
|
||||
/// score.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// The following examples shows a short example on the usage of the `greaterThan`
|
||||
/// function when applying to the heap sort algorithm of the standard library.
|
||||
///
|
||||
/// ```zig
|
||||
/// var results: std.ArrayList(Result) = .empty;
|
||||
/// // ..
|
||||
/// std.sort.heap(fuzzig.Result, results.items, {}, fuzzig.greaterThan);
|
||||
/// // act on sorted scores:
|
||||
/// for (results.items) |result| {}
|
||||
/// ```
|
||||
pub fn greaterThan(_: void, a: Result, b: Result) bool {
|
||||
return a.score > b.score;
|
||||
}
|
||||
|
||||
/// Calculate the matching score for the provided query against the target string. The index is used as
|
||||
/// a reference for the target string and is passed through to the returned `Result`.
|
||||
///
|
||||
/// In case there are no possible matches `null` is returned (i.e. the target
|
||||
/// string is empty or the query is longer than the target).
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// Given a list of file names, you can match a given string as follows:
|
||||
///
|
||||
/// ```zig
|
||||
/// var results: std.ArrayList(Result) = .empty;
|
||||
/// defer {
|
||||
/// for (results.items) |*result| result.deinit(gpa);
|
||||
/// results.deinit(gpa);
|
||||
/// }
|
||||
/// // ..
|
||||
/// // create fuzzy score for each file entry
|
||||
/// for (0.., files.items) |idx, entry| {
|
||||
/// const result = try fuzzig.match(gpa, entry, search, idx) orelse continue;
|
||||
/// try results.append(gpa, result);
|
||||
/// }
|
||||
/// ```
|
||||
pub fn match(gpa: Allocator, target: []const u8, query: []const u8, index: usize) !?Result {
|
||||
if (target.len == 0 or query.len == 0) return null;
|
||||
if (target.len < query.len) return null;
|
||||
|
||||
// temporary arena allocator to free all allocated memory at the end of the function
|
||||
var scratch_arena: heap.ArenaAllocator = .init(gpa);
|
||||
defer scratch_arena.deinit();
|
||||
|
||||
const allocator = scratch_arena.allocator();
|
||||
const target_lower = try fold_case(allocator, target);
|
||||
const query_lower = try fold_case(allocator, query);
|
||||
|
||||
const area = target.len * query.len;
|
||||
var scores = try allocator.alloc(usize, area);
|
||||
var matches = try allocator.alloc(usize, area);
|
||||
|
||||
for (0..query.len) |idx| {
|
||||
const offset = idx * target.len;
|
||||
const prev_offset = if (idx > 0) (idx - 1) * target.len else 0;
|
||||
|
||||
for (0..target.len) |target_index| {
|
||||
const current_idx = offset + target_index;
|
||||
const diag_idx = if (idx > 0 and target_index > 0) prev_offset + target_index - 1 else 0;
|
||||
const left_score = if (target_index > 0) scores[current_idx - 1] else 0;
|
||||
const diag_score = if (idx > 0 and target_index > 0) scores[diag_idx] else 0;
|
||||
const matches_sequence_len = if (idx > 0 and target_index > 0) matches[diag_idx] else 0;
|
||||
|
||||
const score = if (diag_score == 0 and idx != 0) 0 else compute_char_score(
|
||||
query[idx],
|
||||
query_lower[idx],
|
||||
if (target_index != 0) target[target_index - 1] else null,
|
||||
target[target_index],
|
||||
target_lower[target_index],
|
||||
matches_sequence_len,
|
||||
);
|
||||
|
||||
if (score != 0 and diag_score + score >= left_score) {
|
||||
matches[current_idx] = matches_sequence_len + 1;
|
||||
scores[current_idx] = diag_score + score;
|
||||
} else {
|
||||
matches[current_idx] = 0;
|
||||
scores[current_idx] = left_score;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var positions: std.ArrayList(usize) = .empty;
|
||||
|
||||
if (query.len != 0 and target.len != 0) {
|
||||
var query_idx = query.len - 1;
|
||||
var target_idx = target.len - 1;
|
||||
|
||||
while (true) {
|
||||
const current_idx = query_idx * target.len + target_idx;
|
||||
if (matches[current_idx] == 0) {
|
||||
if (target_idx == 0) break;
|
||||
target_idx -= 1;
|
||||
} else {
|
||||
try positions.append(gpa, target_idx);
|
||||
if (query_idx == 0 or target_idx == 0) break;
|
||||
query_idx -= 1;
|
||||
target_idx -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return .init(scores[area - 1], positions, index);
|
||||
}
|
||||
|
||||
/// Compute the score for a given character, while taking into account the previous character, the already matching length of a (sub-)sequence.
|
||||
fn compute_char_score(query: u8, query_lower: u8, target_prev: ?u8, target_curr: u8, target_curr_lower: u8, matches_sequence_len: usize) usize {
|
||||
var score: usize = 0;
|
||||
if (!(query_lower == target_curr_lower or query_lower == '/' or query_lower == '\\' and target_curr_lower == '/' or target_curr_lower == '\\')) return score;
|
||||
|
||||
score += 1;
|
||||
|
||||
if (matches_sequence_len > 0) score += matches_sequence_len * 5;
|
||||
|
||||
if (query == target_curr) score += 1;
|
||||
|
||||
if (target_prev) |prev| {
|
||||
score += score_separator_at_pos(prev);
|
||||
if (target_curr != target_curr_lower and matches_sequence_len == 0) score += 2;
|
||||
} else {
|
||||
score += 8;
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
/// Scoring for separator characters. Slightly prefering path separators over other separators.
|
||||
fn score_separator_at_pos(prev: u8) u32 {
|
||||
return switch (prev) {
|
||||
'/', '\\' => 5, // prefer path separators...
|
||||
'_', '-', '.', ' ', '\'', '"', ':' => 4, // ...over other separators
|
||||
else => 0,
|
||||
};
|
||||
}
|
||||
|
||||
/// Fold the case of the provided string (of ascii characters). Returned slice
|
||||
/// is owned by the caller and has to be freed using the provided `Allocator`.
|
||||
fn fold_case(gpa: Allocator, s: []const u8) ![]const u8 {
|
||||
var vector: std.ArrayList(u8) = try .initCapacity(gpa, s.len);
|
||||
for (s) |c| {
|
||||
var target = c;
|
||||
if (target > 64 and target < 91) target += 32;
|
||||
vector.appendAssumeCapacity(target);
|
||||
}
|
||||
return try vector.toOwnedSlice(gpa);
|
||||
}
|
||||
|
||||
const std = @import("std");
|
||||
const heap = std.heap;
|
||||
const testing = std.testing;
|
||||
const Allocator = std.mem.Allocator;
|
||||
|
||||
test "matching `s` on local files" {
|
||||
var gpa = testing.allocator;
|
||||
|
||||
// files to fuzzy match against
|
||||
var files: std.ArrayList([]const u8) = .empty;
|
||||
defer {
|
||||
for (files.items) |file| gpa.free(file);
|
||||
files.deinit(gpa);
|
||||
}
|
||||
|
||||
// fuzzy matching results (containing only the scores)
|
||||
var results: std.ArrayList(Result) = .empty;
|
||||
defer {
|
||||
for (results.items) |*result| result.deinit(gpa);
|
||||
results.deinit(gpa);
|
||||
}
|
||||
|
||||
// arrange
|
||||
var dir = try std.fs.cwd().openDir(".", .{ .iterate = true });
|
||||
defer dir.close();
|
||||
|
||||
var iter = try dir.walk(gpa);
|
||||
defer iter.deinit();
|
||||
|
||||
while (try iter.next()) |entry| {
|
||||
switch (entry.kind) {
|
||||
.file => {
|
||||
if (std.mem.startsWith(u8, entry.path, ".git/")) continue;
|
||||
if (std.mem.startsWith(u8, entry.path, ".zig-cache")) continue;
|
||||
const path = try gpa.dupe(u8, entry.path[0..entry.path.len]);
|
||||
try files.append(gpa, path);
|
||||
},
|
||||
else => continue,
|
||||
}
|
||||
}
|
||||
try results.ensureTotalCapacity(gpa, files.items.len);
|
||||
|
||||
// act
|
||||
const search = "s";
|
||||
|
||||
// create fuzzy score for each file entry
|
||||
for (0.., files.items) |idx, entry| {
|
||||
const result = try match(gpa, entry, search, idx) orelse continue;
|
||||
try results.append(gpa, result);
|
||||
}
|
||||
// sort scores by their received score descending
|
||||
std.sort.heap(Result, results.items, {}, greaterThan);
|
||||
|
||||
var buf: [128]u8 = undefined;
|
||||
var buffer = std.fs.File.stderr().writer(&buf);
|
||||
var writer = &buffer.interface;
|
||||
defer writer.flush() catch unreachable;
|
||||
|
||||
std.debug.lockStdErr();
|
||||
defer std.debug.unlockStdErr();
|
||||
|
||||
// assert
|
||||
var scored_entries: usize = 0;
|
||||
var unscored_entries: usize = 0;
|
||||
for (results.items) |result| {
|
||||
if (result.score > 0) scored_entries += 1 else unscored_entries += 1;
|
||||
if (result.score == 0) continue; // do not print results that are unmatched
|
||||
|
||||
const item = files.items[result.index];
|
||||
var match_highlights: []u8 = try gpa.alloc(u8, item.len);
|
||||
defer gpa.free(match_highlights);
|
||||
|
||||
@memset(match_highlights, ' ');
|
||||
// highlight what caused this search result
|
||||
for (result.positions.items) |pos| match_highlights[pos] = '^';
|
||||
// print item and its highlighted positions
|
||||
// NOTE uncomment the print for the writer to show matches and their highlights of what matched
|
||||
// -> as the writer prints to *stderr* writing will cause the test to fail, hence it is commented out by default
|
||||
// try writer.print("{s}\n{s}\n", .{ item, match_highlights });
|
||||
}
|
||||
try testing.expectEqual(5, scored_entries);
|
||||
try testing.expectEqual(results.items.len - 5, unscored_entries);
|
||||
}
|
||||
Reference in New Issue
Block a user