1+ import asyncio
2+ import json
3+ import re
4+
15from io import StringIO
26from anyio import open_file
3- import json
47from pathlib import Path
58from typing import List
69
1215 DEFAULT_LANGUAGE_CODE ,
1316 LOGGER ,
1417 NLP_CONFIG_FILE ,
18+ PRESIDIO_EXCLUSIONS_FILE_PATH ,
1519 RECOGNIZER_CONFIG_FILE ,
1620)
1721from src .hooks .presidio .spacy_post_processing_recognizer import SpacyPostProcessingRecognizer
18- from src .hooks .presidio .path_filter import PathFilter
22+ from src .hooks .presidio .path_filter import PathFilter , PathScanStatus
1923
2024logger = LOGGER
2125
@@ -30,38 +34,65 @@ def __repr__(self) -> str:
3034
3135
class PathScanResult:
    """Outcome of scanning a single path: its status and any detections."""

    def __init__(
        self,
        path: str,
        status: PathScanStatus,
        results: List[PersonalDataDetection] | None = None,
    ) -> None:
        """Store the scan outcome for one path.

        Args:
            path: The path this result refers to.
            status: The status recorded for the path.
            results: Detections found in the path. When omitted, the
                instance gets its own new empty list.
        """
        self.path = path
        self.status = status
        # A `[]` default would be created once and shared by every instance
        # built without detections, so appending to one would leak into all.
        self.results = [] if results is None else results
3641
3742
3843class PresidioScanResult :
def __init__(self, results: List[PathScanResult] | None = None) -> None:
    """Create the four empty result buckets, then classify ``results``.

    Args:
        results: Path scan results to file into the buckets immediately.
            When omitted, all buckets start empty.
    """
    self.paths_without_personal_data: List[PathScanResult] = []
    self.paths_containing_personal_data: List[PathScanResult] = []
    self.paths_skipped: List[PathScanResult] = []
    self.paths_excluded: List[PathScanResult] = []
    # `None` replaces the former `[]` default so no list object is shared
    # between calls; an empty list is still forwarded to keep behaviour.
    self.add_path_scan_results([] if results is None else results)
50+
def add_path_scan_results(self, scan_results: List[PathScanResult]):
    """Classify each entry of ``scan_results``, in order, one at a time."""
    for entry in scan_results:
        self.add_path_scan_result(entry)
54+
def add_path_scan_result(self, scan_result: PathScanResult):
    """Append ``scan_result`` to the bucket that matches its status.

    A result whose status equals none of EXCLUDED, FAILED, PASSED or
    SKIPPED is not stored anywhere.
    """
    buckets = (
        (PathScanStatus.EXCLUDED, self.paths_excluded),
        (PathScanStatus.FAILED, self.paths_containing_personal_data),
        (PathScanStatus.PASSED, self.paths_without_personal_data),
        (PathScanStatus.SKIPPED, self.paths_skipped),
    )
    # Every status is checked independently (no early exit), in the same
    # order as the bucket table above.
    for status, bucket in buckets:
        if scan_result.status == status:
            bucket.append(scan_result)
5067
5168 def __str__ (self ) -> str :
5269 with StringIO () as output_buffer :
5370 output_buffer .write ("--------PERSONAL DATA SCAN SUMMARY--------" )
54- if self .valid_path_scans :
71+ if self .paths_excluded :
72+ output_buffer .write ("\n \n FILES EXCLUDED\n " )
73+ excluded_paths_table = PrettyTable (["Path" ])
74+ for excluded_path in self .paths_excluded :
75+ excluded_paths_table .add_row ([excluded_path .path ])
76+ output_buffer .write (str (excluded_paths_table ))
77+
78+ if self .paths_skipped :
79+ output_buffer .write ("\n \n FILES SKIPPED\n " )
80+ skipped_paths_table = PrettyTable (["Path" ])
81+ for skipped_path in self .paths_skipped :
82+ skipped_paths_table .add_row ([skipped_path .path ])
83+ output_buffer .write (str (skipped_paths_table ))
84+
85+ if self .paths_without_personal_data :
5586 output_buffer .write ("\n \n FILES WITHOUT PERSONAL DATA\n " )
5687 paths_without_issues_table = PrettyTable (["Path" ])
57- for valid_path in self .valid_path_scans :
88+ for valid_path in self .paths_without_personal_data :
5889 paths_without_issues_table .add_row ([valid_path .path ])
5990 output_buffer .write (str (paths_without_issues_table ))
6091
61- if self .invalid_path_scans :
92+ if self .paths_containing_personal_data :
6293 output_buffer .write ("\n \n FILES CONTAINING PERSONAL DATA\n " )
6394
64- for invalid_path_scan in self .invalid_path_scans :
95+ for invalid_path_scan in self .paths_containing_personal_data :
6596 output_buffer .write (f"\n { invalid_path_scan .path } \n " )
6697 table = PrettyTable (["Type" , "Value" , "Score" ])
6798 for invalid_path in invalid_path_scan .results :
@@ -117,11 +148,14 @@ def _scan_content(self, analyzer: AnalyzerEngine, entities: List[str], content:
117148 return [PersonalDataDetection (result , content [result .start : result .end ]) for result in results ]
118149
119150 async def _scan_path (
120- self ,
121- analyzer : AnalyzerEngine ,
122- entities : List [str ],
123- file_path : str ,
151+ self , analyzer : AnalyzerEngine , entities : List [str ], file_path : str , exclusions : List [re .Pattern [str ]]
124152 ) -> PathScanResult :
153+ sources = PathFilter ()
154+
155+ invalid_check_result = await sources ._check_is_path_invalid (file_path , exclusions )
156+ if invalid_check_result is not None :
157+ return PathScanResult (file_path , invalid_check_result )
158+
125159 file_extension = Path (file_path ).suffix .lower ()
126160 async with await open_file (file_path , "r" , encoding = "utf-8" ) as fs :
127161 results : List [PersonalDataDetection ] = []
@@ -133,23 +167,30 @@ async def _scan_path(
133167 contents = await fs .read ()
134168 logger .debug ("Scanning file %s by reading all contents" , file_path )
135169 results .extend (self ._scan_content (analyzer , entities , contents ))
170+
136171 return PathScanResult (
137172 file_path ,
173+ status = PathScanStatus .PASSED if len (results ) == 0 else PathScanStatus .FAILED ,
138174 results = results ,
139175 )
140176
async def scan(
    self,
) -> PresidioScanResult:
    """Scan every path in ``self.paths`` and collect the outcomes.

    Exclusion patterns are loaded once from PRESIDIO_EXCLUSIONS_FILE_PATH
    and passed to each per-path scan. One task per path is created inside
    a task group; the results are gathered in ``self.paths`` order after
    the group has exited.

    Returns:
        A PresidioScanResult built from the result of every path task.
    """
    path_filter = PathFilter()

    analyzer = self._get_analyzer()
    entities = analyzer.get_supported_entities()

    exclusions = await path_filter._get_exclusions(exclusions_file=PRESIDIO_EXCLUSIONS_FILE_PATH)
    logger.debug("Personal data exclusions file loaded with exclusions %s", exclusions)

    async with asyncio.TaskGroup() as group:
        pending: list[asyncio.Task] = [
            group.create_task(self._scan_path(analyzer, entities, path, exclusions))
            for path in self.paths
        ]

    # Leaving the task group means every task has finished, so result()
    # can be read directly.
    return PresidioScanResult(results=[task.result() for task in pending])
0 commit comments