Skip to content

Commit f974dca

Browse files
authored
Unify parse_rules_file and parse_unicode_file into shared helper (#512)
1 parent aa04318 commit f974dca

2 files changed

Lines changed: 112 additions & 52 deletions

File tree

PythonScripts/audit_translations/parsers.py

Lines changed: 32 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -100,81 +100,61 @@ def iter_field_matches(node: Any) -> Iterator[tuple[str, Any, Any]]:
100100
yield key, match.value, parent
101101

102102

103-
def parse_rules_file(content: str, data: Any) -> list[RuleInfo]:
104-
"""Parse a standard rules file with name/tag entries"""
103+
def _extract_item_fields(item: Any, is_unicode: bool) -> tuple[str, str | None, str | None, Any] | None:
104+
if is_unicode:
105+
if isinstance(item, dict) and len(item) == 1:
106+
char_key = str(next(iter(item.keys())))
107+
return char_key, None, None, item[char_key]
108+
else:
109+
if isinstance(item, dict) and "name" in item:
110+
rule_name = str(item.get("name"))
111+
tag = format_tag(item.get("tag"))
112+
return f"{rule_name}|{tag or 'unknown'}", rule_name, tag, item
113+
return None
114+
115+
116+
def _build_rule_items(content: str, data: Any, is_unicode_file: bool) -> list[RuleInfo]:
    """Turn the top-level YAML sequence *data* into a list of ``RuleInfo``.

    Args:
        content: Full text of the YAML file; split into lines to recover the
            raw text block of every accepted item.
        data: Parsed YAML document. Anything that is not a list yields ``[]``.
        is_unicode_file: Forwarded to ``_extract_item_fields`` to choose
            between unicode-entry and rule-entry recognition.

    Returns:
        One ``RuleInfo`` per item accepted by ``_extract_item_fields``, in
        document order. Rejected items are skipped.
    """
    if not isinstance(data, list):
        return []

    positions: list[int] = []
    accepted: list[tuple[str, str | None, str | None, Any]] = []
    for index, entry in enumerate(data):
        record = _extract_item_fields(entry, is_unicode_file)
        if record is None:
            continue
        # Use the loader's line info when the container carries it; fall back to 0.
        positions.append(data.lc.item(index)[0] if hasattr(data, "lc") else 0)
        accepted.append(record)

    blocks = build_raw_blocks(content.splitlines(), positions)

    return [
        RuleInfo(
            name=name,
            tag=tag,
            key=key,
            line_number=start + 1,  # stored start lines are 0-based
            raw_content=raw,
            data=payload,
            untranslated_entries=find_untranslated_text_entries(payload),
            line_map=build_line_map(payload),
            audit_ignore=has_audit_ignore(raw),
        )
        for (key, name, tag, payload), raw, start in zip(accepted, blocks, positions)
    ]
140148

141149

142-
def parse_unicode_file(content: str, data: Any) -> list[RuleInfo]:
143-
"""Parse a unicode file with character/range keys"""
144-
if not isinstance(data, list):
145-
return []
146-
147-
rules: list[RuleInfo] = []
148-
lines = content.splitlines()
149-
150-
start_lines: list[int] = []
151-
entries: list[tuple[str, Any]] = []
152-
for idx, item in enumerate(data):
153-
if isinstance(item, dict) and len(item) == 1:
154-
key = next(iter(item.keys()))
155-
value = item[key]
156-
line = data.lc.item(idx)[0] if hasattr(data, "lc") else 0
157-
start_lines.append(line)
158-
entries.append((str(key), value))
150+
def parse_rules_file(content: str, data: Any) -> list[RuleInfo]:
    """Parse a standard rules file, whose entries are identified by name/tag."""
    rules = _build_rule_items(content, data, is_unicode_file=False)
    return rules
159153

160-
raw_blocks = build_raw_blocks(lines, start_lines)
161154

162-
for (char_key, value), raw_content, line_idx in zip(entries, raw_blocks, start_lines):
163-
rules.append(
164-
RuleInfo(
165-
name=None,
166-
tag=None,
167-
key=char_key,
168-
line_number=line_idx + 1,
169-
raw_content=raw_content,
170-
data=value,
171-
untranslated_entries=find_untranslated_text_entries(value),
172-
line_map=build_line_map(value),
173-
audit_ignore=has_audit_ignore(raw_content),
174-
)
175-
)
176-
177-
return rules
155+
def parse_unicode_file(content: str, data: Any) -> list[RuleInfo]:
    """Parse a unicode file, whose entries are keyed by character or range."""
    rules = _build_rule_items(content, data, is_unicode_file=True)
    return rules
178158

179159

180160
def has_audit_ignore(content: str) -> bool:

PythonScripts/audit_translations/tests/test_parsers.py

Lines changed: 80 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -199,6 +199,45 @@ def test_sorts_tag_lists(self):
199199
rules = parse_rules_file(content, data)
200200
assert rules[0].tag == "[mo, mtext]"
201201

202+
def test_returns_empty_for_non_list_data(self):
    """A top-level mapping (not a sequence) produces no rules."""
    assert parse_rules_file("key: value", {"key": "value"}) == []
206+
207+
def test_skips_items_without_name(self):
208+
"""Items like '- include: file' that lack a 'name' key are skipped."""
209+
content = """- include: shared.yaml
210+
- name: real-rule
211+
tag: mo
212+
match: "."
213+
"""
214+
yaml = YAML()
215+
data = yaml.load(content)
216+
rules = parse_rules_file(content, data)
217+
assert len(rules) == 1
218+
assert rules[0].name == "real-rule"
219+
220+
def test_mixed_valid_and_skipped_items(self):
221+
"""Valid rules interspersed with non-rule items keep correct line numbers."""
222+
content = """- name: first
223+
tag: mo
224+
match: "."
225+
226+
- include: other.yaml
227+
228+
- name: second
229+
tag: mi
230+
match: "x"
231+
"""
232+
yaml = YAML()
233+
data = yaml.load(content)
234+
rules = parse_rules_file(content, data)
235+
assert len(rules) == 2
236+
assert rules[0].name == "first"
237+
assert rules[0].line_number == 1
238+
assert rules[1].name == "second"
239+
assert rules[1].line_number == 7
240+
202241
def test_parse_yaml_file_handles_tabs(self, tmp_path):
203242
"""Ensure parse yaml file handles tabs."""
204243
content = """- name: tabbed
@@ -268,6 +307,47 @@ def test_parses_multiple_entries(self):
268307
assert len(rules) == 2
269308

270309

310+
def test_returns_empty_for_non_list_data(self):
    """A top-level mapping (not a sequence) produces no unicode entries."""
    assert parse_unicode_file("key: value", {"key": "value"}) == []
314+
315+
def test_skips_multi_key_dicts(self):
316+
"""Dicts with more than one key are not valid unicode entries and are skipped."""
317+
content = """- "a":
318+
- t: "a"
319+
- "b":
320+
- t: "b"
321+
"c":
322+
- t: "c"
323+
"""
324+
yaml = YAML()
325+
data = yaml.load(content)
326+
rules = parse_unicode_file(content, data)
327+
assert len(rules) == 1
328+
assert rules[0].key == "a"
329+
330+
def test_mixed_valid_and_skipped_items(self):
331+
"""Valid entries interspersed with non-entry items keep correct line numbers."""
332+
content = """- "a":
333+
- t: "alpha"
334+
335+
- not: a unicode entry
336+
extra: key
337+
338+
- "b":
339+
- t: "bravo"
340+
"""
341+
yaml = YAML()
342+
data = yaml.load(content)
343+
rules = parse_unicode_file(content, data)
344+
assert len(rules) == 2
345+
assert rules[0].key == "a"
346+
assert rules[0].line_number == 1
347+
assert rules[1].key == "b"
348+
assert rules[1].line_number == 7
349+
350+
271351
class TestExtractMatchPattern:
272352
def test_extracts_inline_match(self):
273353
"""Ensure extracts inline match."""

0 commit comments

Comments
 (0)