Skip to content

Commit 5b11f21

Browse files
committed
feat(tools): add no_ignore mode and parallel search to grep
Add `no_ignore` boolean parameter that bypasses .gitignore rules while still excluding hardcoded directories (.git, node_modules, etc.).

Parallelize file searching with Task.async_stream when >= 4 files, using ordered results and max_concurrency based on scheduler count. Sequential path preserved for small file sets to avoid overhead.

- Thread no_ignore through execute → do_search → collect_files → walk_dir
- Skip .gitignore loading and checks when no_ignore is true
- Add trim_matches/2 for capping parallel results at max_results
- Add 7 new tests (4 no_ignore + 3 parallel search)
- Update docs/tools/grep.md with no_ignore parameter
1 parent a154f2d commit 5b11f21

3 files changed

Lines changed: 228 additions & 15 deletions

File tree

docs/tools/grep.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ Returns matching lines in the same `N:hash|content` format as `read_file`, so re
1313
| `include` | string | no | Glob to filter filenames (e.g. `*.ex`, `*.{ex,exs}`) |
1414
| `context_lines` | integer | no | Lines of surrounding context per match (default: 2) |
1515
| `max_results` | integer | no | Cap on total matches returned (default: 50, max: 500) |
16+
| `no_ignore` | boolean | no | When `true`, search files even if excluded by `.gitignore` (default: `false`) |
1617

1718
## Output Format
1819

@@ -34,7 +35,7 @@ When results are capped, a `[Results capped]` hint is appended.
3435

3536
## Exclusions
3637

37-
The following directories are always skipped:
38+
The following directories are always skipped (even with `no_ignore`):
3839

3940
| Category | Directories |
4041
|----------|-------------|
@@ -43,6 +44,8 @@ The following directories are always skipped:
4344
| Editor metadata | `.elixir_ls`, `.vscode`, `.idea` |
4445
| Caches | `__pycache__`, `.mypy_cache`, `tmp` |
4546

47+
`.gitignore` patterns are respected by default. Set `no_ignore: true` to bypass `.gitignore` rules and search all non-binary files outside the always-skipped directories listed above.
48+
4649
Binary files (containing null bytes in the first 8 KB) are silently skipped.
4750

4851
## Cross-Platform

opal/lib/opal/tool/grep.ex

Lines changed: 108 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,16 @@ defmodule Opal.Tool.Grep do
2929
parsed and applied during directory traversal, so ignored files and
3030
directories are skipped automatically.
3131
32+
Set `no_ignore: true` to bypass `.gitignore` rules and search all
33+
non-binary files (hardcoded skip directories like `.git` are still
34+
excluded).
35+
3236
Binary files (containing null bytes) are silently skipped.
3337
"""
3438

3539
@behaviour Opal.Tool
3640

37-
@dialyzer {:no_opaque, [do_walk_dir: 5, walk_dir: 5]}
41+
@dialyzer {:no_opaque, [do_walk_dir: 6, walk_dir: 6]}
3842

3943
alias Opal.Tool.Encoding
4044
alias Opal.Tool.FileHelper
@@ -57,6 +61,10 @@ defmodule Opal.Tool.Grep do
5761
@max_output_bytes 50 * 1024
5862
@max_depth 25
5963

64+
# Parallelism: only fan out when there are enough files to justify it.
65+
@parallel_threshold 4
66+
@max_concurrency System.schedulers_online()
67+
6068
@impl true
6169
@spec name() :: String.t()
6270
def name, do: "grep"
@@ -100,6 +108,11 @@ defmodule Opal.Tool.Grep do
100108
"type" => "integer",
101109
"description" =>
102110
"Maximum number of matching lines returned across all files (default: 50)"
111+
},
112+
"no_ignore" => %{
113+
"type" => "boolean",
114+
"description" =>
115+
"When true, search files even if they are excluded by .gitignore rules (default: false)"
103116
}
104117
},
105118
"required" => ["pattern"]
@@ -115,12 +128,13 @@ defmodule Opal.Tool.Grep do
115128
include = Map.get(args, "include")
116129
ctx_lines = Map.get(args, "context_lines", @max_context_default) |> max(0) |> min(10)
117130
max_results = Map.get(args, "max_results", @max_results_default) |> max(1) |> min(500)
131+
no_ignore = Map.get(args, "no_ignore", false)
118132

119133
allow_bases = FileHelper.allowed_bases(context)
120134

121135
case FileHelper.resolve_path(search_path, working_dir, allow_bases: allow_bases) do
122136
{:ok, resolved} ->
123-
do_search(resolved, regex, include, ctx_lines, max_results, working_dir)
137+
do_search(resolved, regex, include, ctx_lines, max_results, working_dir, no_ignore)
124138

125139
{:error, reason} ->
126140
{:error, reason}
@@ -136,10 +150,10 @@ defmodule Opal.Tool.Grep do
136150

137151
# -- Search implementation --------------------------------------------------
138152

139-
defp do_search(resolved, regex, include, ctx_lines, max_results, working_dir) do
153+
defp do_search(resolved, regex, include, ctx_lines, max_results, working_dir, no_ignore) do
140154
glob = Opal.Platform.compile_glob(include)
141155

142-
files = collect_files(resolved, glob)
156+
files = collect_files(resolved, glob, no_ignore)
143157

144158
{results, total_matches, capped?} =
145159
search_files(files, regex, ctx_lines, max_results, working_dir)
@@ -154,27 +168,28 @@ defmodule Opal.Tool.Grep do
154168

155169
# -- File collection --------------------------------------------------------
156170

157-
defp collect_files(path, glob) do
171+
defp collect_files(path, glob, no_ignore) do
158172
if File.regular?(path) do
159173
if Opal.Platform.matches_glob?(Path.basename(path), glob), do: [path], else: []
160174
else
161-
gitignore = Gitignore.load(path)
162-
walk_dir(path, glob, 0, MapSet.new(), gitignore)
175+
gitignore = if no_ignore, do: %Gitignore{root: path}, else: Gitignore.load(path)
176+
walk_dir(path, glob, 0, MapSet.new(), gitignore, no_ignore)
163177
end
164178
end
165179

166180
# Walks directories with depth limiting and symlink-loop protection.
167181
# `visited` tracks real (resolved) directory paths to break cycles.
168182
# `gitignore` accumulates rules from nested .gitignore files.
169-
defp walk_dir(dir, glob, depth, visited, gitignore) do
183+
# `no_ignore` bypasses .gitignore rules when true.
184+
defp walk_dir(dir, glob, depth, visited, gitignore, no_ignore) do
170185
if depth > @max_depth do
171186
[]
172187
else
173-
do_walk_dir(dir, glob, depth, visited, gitignore)
188+
do_walk_dir(dir, glob, depth, visited, gitignore, no_ignore)
174189
end
175190
end
176191

177-
defp do_walk_dir(dir, glob, depth, visited, gitignore) do
192+
defp do_walk_dir(dir, glob, depth, visited, gitignore, no_ignore) do
178193
# Resolve symlinks to detect cycles on all platforms
179194
real_dir = Path.expand(dir)
180195

@@ -185,9 +200,9 @@ defmodule Opal.Tool.Grep do
185200

186201
# Merge nested .gitignore when descending into subdirectories.
187202
# The root .gitignore is already loaded in collect_files, so skip
188-
# re-reading it at depth 0.
203+
# re-reading it at depth 0. Skip entirely when no_ignore is set.
189204
gitignore =
190-
if depth > 0 do
205+
if not no_ignore and depth > 0 do
191206
case File.read(Path.join(dir, ".gitignore")) do
192207
{:ok, content} ->
193208
child = Gitignore.parse(content, gitignore.root)
@@ -213,11 +228,11 @@ defmodule Opal.Tool.Grep do
213228
skip_dir?(entry) ->
214229
[]
215230

216-
Gitignore.ignored?(gitignore, rel, is_dir) ->
231+
not no_ignore and Gitignore.ignored?(gitignore, rel, is_dir) ->
217232
[]
218233

219234
is_dir ->
220-
walk_dir(full, glob, depth + 1, visited, gitignore)
235+
walk_dir(full, glob, depth + 1, visited, gitignore, no_ignore)
221236

222237
Opal.Platform.matches_glob?(entry, glob) ->
223238
[full]
@@ -241,8 +256,21 @@ defmodule Opal.Tool.Grep do
241256
defp skip_dir?(name), do: MapSet.member?(@skip_dirs, name)
242257

243258
# -- Search across files ----------------------------------------------------
259+
#
260+
# Files are searched in parallel when there are enough to justify the
261+
# overhead. Each task is fully independent (read → regex → hashline),
262+
# so there is no shared mutable state. Results stream back in file
263+
# order via `ordered: true`, then we apply the max_results cap.
244264

245265
defp search_files(files, regex, ctx_lines, max_results, working_dir) do
266+
if length(files) < @parallel_threshold do
267+
search_files_sequential(files, regex, ctx_lines, max_results, working_dir)
268+
else
269+
search_files_parallel(files, regex, ctx_lines, max_results, working_dir)
270+
end
271+
end
272+
273+
defp search_files_sequential(files, regex, ctx_lines, max_results, working_dir) do
246274
Enum.reduce_while(files, {[], 0, false}, fn file, {acc, count, _capped?} ->
247275
case search_file(file, regex, ctx_lines, max_results - count, working_dir) do
248276
{:ok, matches, match_count} when match_count > 0 ->
@@ -261,6 +289,72 @@ defmodule Opal.Tool.Grep do
261289
end)
262290
end
263291

292+
defp search_files_parallel(files, regex, ctx_lines, max_results, working_dir) do
293+
# Each task searches with the full max_results cap. We trim after
294+
# collecting, so individual tasks may do slightly more work than
295+
# strictly necessary — but each one is bounded and the fan-out
296+
# across schedulers more than compensates.
297+
files
298+
|> Task.async_stream(
299+
fn file -> {file, search_file(file, regex, ctx_lines, max_results, working_dir)} end,
300+
ordered: true,
301+
max_concurrency: @max_concurrency
302+
)
303+
|> Enum.reduce_while({[], 0, false}, fn {:ok, {file, result}}, {acc, count, _capped?} ->
304+
case result do
305+
{:ok, matches, match_count} when match_count > 0 ->
306+
# Trim this file's matches if adding all would exceed the cap.
307+
trimmed_count = min(match_count, max_results - count)
308+
309+
matches =
310+
if trimmed_count < match_count do
311+
trim_matches(matches, trimmed_count)
312+
else
313+
matches
314+
end
315+
316+
new_count = count + trimmed_count
317+
new_acc = acc ++ [{file, matches}]
318+
319+
if new_count >= max_results do
320+
{:halt, {new_acc, new_count, true}}
321+
else
322+
{:cont, {new_acc, new_count, false}}
323+
end
324+
325+
_ ->
326+
{:cont, {acc, count, false}}
327+
end
328+
end)
329+
end
330+
331+
# Rebuild the tagged output keeping only the first `keep` match lines.
332+
# Context lines around kept matches are preserved.
333+
defp trim_matches({rel_path, groups}, keep) do
334+
{trimmed_groups, _remaining} =
335+
Enum.reduce_while(groups, {[], keep}, fn group, {acc, remaining} ->
336+
match_lines_in_group = Enum.count(group, fn {_tagged, is_match} -> is_match end)
337+
338+
if match_lines_in_group <= remaining do
339+
{:cont, {acc ++ [group], remaining - match_lines_in_group}}
340+
else
341+
# Partial group: keep only enough match lines.
342+
{partial, _} =
343+
Enum.reduce(group, {[], remaining}, fn {tagged, is_match} = entry, {kept, rem} ->
344+
cond do
345+
not is_match -> {kept ++ [entry], rem}
346+
rem > 0 -> {kept ++ [{tagged, true}], rem - 1}
347+
true -> {kept, 0}
348+
end
349+
end)
350+
351+
{:halt, {acc ++ [partial], 0}}
352+
end
353+
end)
354+
355+
{rel_path, trimmed_groups}
356+
end
357+
264358
defp search_file(file, regex, ctx_lines, remaining, working_dir) do
265359
with {:ok, raw} <- File.read(file),
266360
true <- String.valid?(raw),

opal/test/opal/tool/grep_test.exs

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,4 +375,120 @@ defmodule Opal.Tool.GrepTest do
375375
refute result =~ "deep.txt"
376376
end
377377
end
378+
379+
describe "execute/2 — no_ignore" do
380+
test "gitignored files are skipped by default", %{tmp_dir: tmp_dir} do
381+
File.write!(Path.join(tmp_dir, ".gitignore"), "secret.log\n")
382+
File.write!(Path.join(tmp_dir, "secret.log"), "password123")
383+
File.write!(Path.join(tmp_dir, "visible.txt"), "password123")
384+
385+
{:ok, result} =
386+
Grep.execute(%{"pattern" => "password"}, %{working_dir: tmp_dir})
387+
388+
assert result =~ "visible.txt"
389+
refute result =~ "secret.log"
390+
end
391+
392+
test "no_ignore includes gitignored files", %{tmp_dir: tmp_dir} do
393+
File.write!(Path.join(tmp_dir, ".gitignore"), "secret.log\n")
394+
File.write!(Path.join(tmp_dir, "secret.log"), "password123")
395+
File.write!(Path.join(tmp_dir, "visible.txt"), "password123")
396+
397+
{:ok, result} =
398+
Grep.execute(
399+
%{"pattern" => "password", "no_ignore" => true},
400+
%{working_dir: tmp_dir}
401+
)
402+
403+
assert result =~ "visible.txt"
404+
assert result =~ "secret.log"
405+
end
406+
407+
test "no_ignore still skips hardcoded dirs like .git", %{tmp_dir: tmp_dir} do
408+
git_dir = Path.join(tmp_dir, ".git")
409+
File.mkdir_p!(git_dir)
410+
File.write!(Path.join(git_dir, "HEAD"), "ref: refs/heads/main")
411+
File.write!(Path.join(tmp_dir, "real.txt"), "ref: refs/heads/main")
412+
413+
{:ok, result} =
414+
Grep.execute(
415+
%{"pattern" => "ref:", "no_ignore" => true},
416+
%{working_dir: tmp_dir}
417+
)
418+
419+
assert result =~ "real.txt"
420+
refute result =~ ".git"
421+
end
422+
423+
test "no_ignore includes files in gitignored subdirectories", %{tmp_dir: tmp_dir} do
424+
File.write!(Path.join(tmp_dir, ".gitignore"), "build/\n")
425+
build_dir = Path.join(tmp_dir, "build")
426+
File.mkdir_p!(build_dir)
427+
File.write!(Path.join(build_dir, "output.txt"), "compiled result")
428+
429+
{:ok, result} =
430+
Grep.execute(
431+
%{"pattern" => "compiled", "no_ignore" => true},
432+
%{working_dir: tmp_dir}
433+
)
434+
435+
assert result =~ "output.txt"
436+
end
437+
end
438+
439+
describe "execute/2 — parallel search" do
440+
test "produces correct results with many files (parallel path)", %{tmp_dir: tmp_dir} do
441+
# Create enough files to exceed @parallel_threshold (4)
442+
for i <- 1..10 do
443+
File.write!(Path.join(tmp_dir, "file_#{i}.txt"), "needle_#{i}\nhaystack")
444+
end
445+
446+
{:ok, result} =
447+
Grep.execute(%{"pattern" => "needle", "context_lines" => 0}, %{working_dir: tmp_dir})
448+
449+
# All 10 files should appear
450+
for i <- 1..10 do
451+
assert result =~ "file_#{i}.txt", "Expected file_#{i}.txt in results"
452+
end
453+
454+
assert result =~ "10 matches found."
455+
end
456+
457+
test "parallel search respects max_results cap", %{tmp_dir: tmp_dir} do
458+
# Each file has 5 matches, 10 files = 50 total possible matches
459+
for i <- 1..10 do
460+
content = Enum.map_join(1..5, "\n", &"match_#{i}_#{&1}")
461+
File.write!(Path.join(tmp_dir, "multi_#{i}.txt"), content)
462+
end
463+
464+
{:ok, result} =
465+
Grep.execute(
466+
%{"pattern" => "match_", "max_results" => 7, "context_lines" => 0},
467+
%{working_dir: tmp_dir}
468+
)
469+
470+
assert result =~ "capped"
471+
end
472+
473+
test "parallel search preserves file ordering", %{tmp_dir: tmp_dir} do
474+
for name <- ~w(alpha.txt beta.txt gamma.txt delta.txt epsilon.txt) do
475+
File.write!(Path.join(tmp_dir, name), "findme")
476+
end
477+
478+
{:ok, result} =
479+
Grep.execute(%{"pattern" => "findme", "context_lines" => 0}, %{working_dir: tmp_dir})
480+
481+
# Files should appear in sorted order (walk_dir sorts entries)
482+
positions =
483+
~w(alpha.txt beta.txt delta.txt epsilon.txt gamma.txt)
484+
|> Enum.map(fn name ->
485+
case :binary.match(result, name) do
486+
{pos, _} -> pos
487+
:nomatch -> flunk("Expected #{name} in results")
488+
end
489+
end)
490+
491+
assert positions == Enum.sort(positions), "Files should appear in sorted order"
492+
end
493+
end
378494
end

0 commit comments

Comments
 (0)