Skip to content

Commit 6e82690

Browse files
authored
Output the summary of skipped and excluded files (#137)
1 parent 9b0e381 commit 6e82690

9 files changed

Lines changed: 244 additions & 151 deletions

File tree

src/hooks/cli.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import anyio
22
import argparse
33
import sys
4+
import time
45

56

67
from typing import List, Optional
@@ -71,6 +72,8 @@ def parse_args(argv):
7172

7273

7374
async def main_async(argv: Optional[List[str]] = None):
75+
hook_run_time = time.time()
76+
7477
args = parse_args(argv)
7578

7679
init_logger(args.verbose)
@@ -96,6 +99,9 @@ async def main_async(argv: Optional[List[str]] = None):
9699
run_result = await hook.run()
97100
logger.info("%s", run_result.run_summary())
98101

102+
hook_run_time = time.time() - hook_run_time
103+
logger.debug("Hook took %s seconds", hook_run_time)
104+
99105
if not run_result.run_success():
100106
logger.info("Hook '%s' did not successfully run.", hook)
101107
return 1

src/hooks/presidio/path_filter.py

Lines changed: 15 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,28 @@
1-
import git
21
import re
32

43
from anyio import open_file, Path
5-
4+
from enum import Enum
65
from typing import List
76

87

98
from src.hooks.config import (
109
DEFAULT_FILE_TYPES,
1110
LOGGER,
12-
PRESIDIO_EXCLUSIONS_FILE_PATH,
1311
)
1412

1513
logger = LOGGER
1614

1715

16+
class PathScanStatus(Enum):
17+
SKIPPED = 1
18+
EXCLUDED = 2
19+
PASSED = 3
20+
FAILED = 4
21+
22+
1823
class PathFilter:
1924
LINE_BY_LINE_FILE_EXTENSIONS = [".csv"]
2025

21-
def __init__(
22-
self,
23-
verbose: bool = False,
24-
) -> None:
25-
self.verbose = verbose
26-
2726
def _is_path_excluded(self, path: str, exclusions: List[re.Pattern[str]]):
2827
for exclusion in exclusions:
2928
match = exclusion.search(path)
@@ -34,17 +33,17 @@ def _is_path_excluded(self, path: str, exclusions: List[re.Pattern[str]]):
3433
logger.debug("The path %s was not found in any exclusion regexes", path)
3534
return False
3635

37-
async def _should_scan_path(self, path: str, exclusions: List[re.Pattern[str]]):
36+
async def _check_is_path_invalid(self, path: str, exclusions: List[re.Pattern[str]]):
3837
if self._is_path_excluded(path, exclusions):
39-
return False
38+
return PathScanStatus.EXCLUDED
4039

4140
if not await Path(path).exists():
4241
logger.debug("Path %s does not exist", path)
43-
return False
42+
return PathScanStatus.SKIPPED
4443

4544
if not await Path(path).is_file():
4645
logger.debug("Path %s is a directory, presidio can only scan files", path)
47-
return False
46+
return PathScanStatus.SKIPPED
4847

4948
file_extension = Path(path).suffix
5049
if file_extension not in DEFAULT_FILE_TYPES:
@@ -53,15 +52,15 @@ async def _should_scan_path(self, path: str, exclusions: List[re.Pattern[str]]):
5352
path,
5453
DEFAULT_FILE_TYPES,
5554
)
56-
return False
55+
return PathScanStatus.SKIPPED
5756

5857
logger.debug(
5958
"Path %s is valid and should be scanned",
6059
path,
6160
)
62-
return True
61+
return None
6362

64-
async def _get_exclusions(self, exclusions_file: str):
63+
async def _get_exclusions(self, exclusions_file: str) -> List[re.Pattern[str]]:
6564
exclusions = []
6665

6766
if not await Path(exclusions_file).exists():
@@ -80,20 +79,3 @@ async def _get_exclusions(self, exclusions_file: str):
8079
)
8180
raise
8281
return exclusions
83-
84-
async def get_paths_to_scan(
85-
self,
86-
paths: List[str],
87-
github_action: bool = False,
88-
):
89-
if github_action:
90-
repo = git.Repo(paths[0])
91-
logger.debug("Scanning files in git repository %s", repo)
92-
paths = [entry.abspath for entry in repo.tree().traverse()]
93-
94-
exclusions = await self._get_exclusions(exclusions_file=PRESIDIO_EXCLUSIONS_FILE_PATH)
95-
logger.debug("Exclusions file loaded with exclusions %s", exclusions)
96-
97-
for path in paths:
98-
if await self._should_scan_path(path, exclusions):
99-
yield path

src/hooks/presidio/scanner.py

Lines changed: 68 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1+
import asyncio
2+
import json
3+
import re
4+
15
from io import StringIO
26
from anyio import open_file
3-
import json
47
from pathlib import Path
58
from typing import List
69

@@ -12,10 +15,11 @@
1215
DEFAULT_LANGUAGE_CODE,
1316
LOGGER,
1417
NLP_CONFIG_FILE,
18+
PRESIDIO_EXCLUSIONS_FILE_PATH,
1519
RECOGNIZER_CONFIG_FILE,
1620
)
1721
from src.hooks.presidio.spacy_post_processing_recognizer import SpacyPostProcessingRecognizer
18-
from src.hooks.presidio.path_filter import PathFilter
22+
from src.hooks.presidio.path_filter import PathFilter, PathScanStatus
1923

2024
logger = LOGGER
2125

@@ -30,38 +34,65 @@ def __repr__(self) -> str:
3034

3135

3236
class PathScanResult:
33-
def __init__(self, path: str, results: List[PersonalDataDetection]) -> None:
37+
def __init__(self, path: str, status: PathScanStatus, results: List[PersonalDataDetection] = []) -> None:
3438
self.path = path
39+
self.status = status
3540
self.results = results
3641

3742

3843
class PresidioScanResult:
39-
def __init__(
40-
self,
41-
) -> None:
42-
self.valid_path_scans: List[PathScanResult] = []
43-
self.invalid_path_scans: List[PathScanResult] = []
44+
def __init__(self, results: List[PathScanResult] = []) -> None:
45+
self.paths_without_personal_data: List[PathScanResult] = []
46+
self.paths_containing_personal_data: List[PathScanResult] = []
47+
self.paths_skipped: List[PathScanResult] = []
48+
self.paths_excluded: List[PathScanResult] = []
49+
self.add_path_scan_results(results)
50+
51+
def add_path_scan_results(self, scan_results: List[PathScanResult]):
52+
for scan_result in scan_results:
53+
self.add_path_scan_result(scan_result)
54+
55+
def add_path_scan_result(self, scan_result: PathScanResult):
56+
if scan_result.status == PathScanStatus.EXCLUDED:
57+
self.paths_excluded.append(scan_result)
4458

45-
def add_scan_result(self, scan_result: PathScanResult):
46-
if not scan_result.results or len(scan_result.results) == 0:
47-
self.valid_path_scans.append(scan_result)
48-
else:
49-
self.invalid_path_scans.append(scan_result)
59+
if scan_result.status == PathScanStatus.FAILED:
60+
self.paths_containing_personal_data.append(scan_result)
61+
62+
if scan_result.status == PathScanStatus.PASSED:
63+
self.paths_without_personal_data.append(scan_result)
64+
65+
if scan_result.status == PathScanStatus.SKIPPED:
66+
self.paths_skipped.append(scan_result)
5067

5168
def __str__(self) -> str:
5269
with StringIO() as output_buffer:
5370
output_buffer.write("--------PERSONAL DATA SCAN SUMMARY--------")
54-
if self.valid_path_scans:
71+
if self.paths_excluded:
72+
output_buffer.write("\n\nFILES EXCLUDED\n")
73+
excluded_paths_table = PrettyTable(["Path"])
74+
for excluded_path in self.paths_excluded:
75+
excluded_paths_table.add_row([excluded_path.path])
76+
output_buffer.write(str(excluded_paths_table))
77+
78+
if self.paths_skipped:
79+
output_buffer.write("\n\nFILES SKIPPED\n")
80+
skipped_paths_table = PrettyTable(["Path"])
81+
for skipped_path in self.paths_skipped:
82+
skipped_paths_table.add_row([skipped_path.path])
83+
output_buffer.write(str(skipped_paths_table))
84+
85+
if self.paths_without_personal_data:
5586
output_buffer.write("\n\nFILES WITHOUT PERSONAL DATA\n")
5687
paths_without_issues_table = PrettyTable(["Path"])
57-
for valid_path in self.valid_path_scans:
88+
for valid_path in self.paths_without_personal_data:
5889
paths_without_issues_table.add_row([valid_path.path])
5990
output_buffer.write(str(paths_without_issues_table))
6091

61-
if self.invalid_path_scans:
92+
if self.paths_containing_personal_data:
6293
output_buffer.write("\n\nFILES CONTAINING PERSONAL DATA\n")
6394

64-
for invalid_path_scan in self.invalid_path_scans:
95+
for invalid_path_scan in self.paths_containing_personal_data:
6596
output_buffer.write(f"\n{invalid_path_scan.path}\n")
6697
table = PrettyTable(["Type", "Value", "Score"])
6798
for invalid_path in invalid_path_scan.results:
@@ -117,11 +148,14 @@ def _scan_content(self, analyzer: AnalyzerEngine, entities: List[str], content:
117148
return [PersonalDataDetection(result, content[result.start : result.end]) for result in results]
118149

119150
async def _scan_path(
120-
self,
121-
analyzer: AnalyzerEngine,
122-
entities: List[str],
123-
file_path: str,
151+
self, analyzer: AnalyzerEngine, entities: List[str], file_path: str, exclusions: List[re.Pattern[str]]
124152
) -> PathScanResult:
153+
sources = PathFilter()
154+
155+
invalid_check_result = await sources._check_is_path_invalid(file_path, exclusions)
156+
if invalid_check_result is not None:
157+
return PathScanResult(file_path, invalid_check_result)
158+
125159
file_extension = Path(file_path).suffix.lower()
126160
async with await open_file(file_path, "r", encoding="utf-8") as fs:
127161
results: List[PersonalDataDetection] = []
@@ -133,23 +167,30 @@ async def _scan_path(
133167
contents = await fs.read()
134168
logger.debug("Scanning file %s by reading all contents", file_path)
135169
results.extend(self._scan_content(analyzer, entities, contents))
170+
136171
return PathScanResult(
137172
file_path,
173+
status=PathScanStatus.PASSED if len(results) == 0 else PathScanStatus.FAILED,
138174
results=results,
139175
)
140176

141177
async def scan(
142178
self,
143-
github_action: bool = False,
144179
) -> PresidioScanResult:
145-
sources = PathFilter(self.verbose)
180+
sources = PathFilter()
146181

147182
analyzer = self._get_analyzer()
148183
entities = analyzer.get_supported_entities()
149184

150-
scan_result = PresidioScanResult()
185+
exclusions = await sources._get_exclusions(exclusions_file=PRESIDIO_EXCLUSIONS_FILE_PATH)
186+
logger.debug("Personal data exclusions file loaded with exclusions %s", exclusions)
187+
188+
tasks: list[asyncio.Task] = []
189+
async with asyncio.TaskGroup() as tg:
190+
for path in self.paths:
191+
tasks.append(
192+
tg.create_task(self._scan_path(analyzer, entities, path, exclusions)),
193+
)
194+
scan_result = PresidioScanResult(results=[task.result() for task in tasks])
151195

152-
async for path in sources.get_paths_to_scan(self.paths, github_action):
153-
path_scan_result = await self._scan_path(analyzer, entities, path)
154-
scan_result.add_scan_result(path_scan_result)
155196
return scan_result

src/hooks/run_security_scan.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import asyncio
22
import aiohttp
3+
import git
34

45
from pathlib import Path
56
from typing import List
67

8+
79
from src.hooks.config import (
810
LOGGER,
911
PERSONAL_DATA_SCAN,
@@ -34,7 +36,10 @@ def run_success(self) -> bool:
3436
if self.trufflehog_scan_result.detected_keys is not None:
3537
is_success = False
3638
if self.presidio_scan_result:
37-
if self.presidio_scan_result.invalid_path_scans and len(self.presidio_scan_result.invalid_path_scans) > 0:
39+
if (
40+
self.presidio_scan_result.paths_containing_personal_data
41+
and len(self.presidio_scan_result.paths_containing_personal_data) > 0
42+
):
3843
is_success = False
3944
return is_success
4045

@@ -135,26 +140,30 @@ async def run_security_scan(self) -> TrufflehogScanResult:
135140
)
136141

137142
async def run_personal_scan(self) -> PresidioScanResult:
143+
paths_to_scan = self.paths
144+
if self.github_action:
145+
repo = git.Repo(self.paths[0])
146+
logger.debug("Scanning files in git repository %s", repo)
147+
paths_to_scan = [entry.abspath for entry in repo.tree().traverse()]
148+
138149
return await PresidioScanner(
139150
self.verbose,
140-
self.paths,
141-
).scan(self.github_action)
142-
143-
# TODO
144-
# File skipped due to file extension
145-
# File excluded from scan
151+
paths_to_scan,
152+
).scan()
146153

147154
async def run(self) -> RunSecurityScanResult:
148155
security_scan_task = None
149156
personal_data_scan_task = None
150157

151158
async with asyncio.TaskGroup() as tg:
152159
if SECURITY_SCAN not in self.excluded_scans:
160+
logger.debug("Running security scan")
153161
security_scan_task = tg.create_task(self.run_security_scan())
154162
else:
155163
logger.debug("Security scan is excluded")
156164

157165
if PERSONAL_DATA_SCAN not in self.excluded_scans:
166+
logger.debug("Running personal data scan")
158167
personal_data_scan_task = tg.create_task(self.run_personal_scan())
159168
else:
160169
logger.debug("Personal data scan is excluded")
@@ -163,5 +172,6 @@ async def run(self) -> RunSecurityScanResult:
163172
personal_data_scan_result = personal_data_scan_task.result() if personal_data_scan_task else None
164173

165174
return RunSecurityScanResult(
166-
trufflehog_scan_result=security_scan_result, presidio_scan_result=personal_data_scan_result
175+
trufflehog_scan_result=security_scan_result,
176+
presidio_scan_result=personal_data_scan_result,
167177
)

src/hooks/trufflehog/scanner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ async def _get_args(
7777
trufflehog_cmd_args.append("--since-commit=main")
7878

7979
if await Path(TRUFFLEHOG_EXCLUSIONS_FILE_PATH).exists():
80-
logger.debug("This repo has an exclusions file, adding this file to the trufflehog runner")
80+
logger.debug("Security scanner exclusions file loaded")
8181
trufflehog_cmd_args.append(f"--exclude-paths={TRUFFLEHOG_EXCLUSIONS_FILE_PATH}")
8282

8383
trufflehog_detectors = ",".join(allowed_vendor_codes)

0 commit comments

Comments
 (0)