Skip to content

Commit 2f0c4af

Browse files
committed
rotation
1 parent ba3455f commit 2f0c4af

2 files changed

Lines changed: 37 additions & 4 deletions

File tree

batchalign/pipelines/analysis/compare.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,30 @@ def _find_best_segment(gold_tokens, main_tokens, mfn):
216216
return best
217217

218218

219+
def _best_rotation(window_tokens, gold_tokens, mfn):
    """Pick the cyclic shift of *window_tokens* that aligns best with gold.

    Every offset ``r`` in ``range(len(window_tokens))`` is tried: the window
    is rotated to ``window_tokens[r:] + window_tokens[:r]``, aligned against
    *gold_tokens* via ``align(rotated, gold_tokens, False, mfn)``, and scored
    by the number of ``Match`` items in the resulting alignment.

    Returns the offset with the highest score; when several offsets tie,
    the smallest one wins. A window of zero or one token returns ``0``
    without running any alignment.
    """
    size = len(window_tokens)
    if size <= 1:
        # Rotating an empty or single-token window cannot change anything.
        return 0

    def _score(offset):
        # Count Match items for the window rotated left by `offset`.
        shifted = window_tokens[offset:] + window_tokens[:offset]
        aligned = align(shifted, gold_tokens, False, mfn)
        return sum(isinstance(entry, Match) for entry in aligned)

    # max() keeps the first maximal element it sees, so ties resolve to the
    # lowest offset (offset 0, i.e. "no rotation", is preferred when equal).
    return max(range(size), key=_score)
241+
242+
219243
def _get_pos(form):
220244
"""Extract uppercased POS from a Form's morphology, or '?' if absent."""
221245
if form is not None and form.morphology:
@@ -362,8 +386,13 @@ def process(self, doc, **kwargs):
362386
utt_main_forms[utt_idx].append(m_form)
363387
utt_main_speakers[utt_idx].append(m_utt_idx)
364388

365-
# Align the chosen window against this gold utterance
389+
# Align the chosen window against this gold utterance,
390+
# trying cyclic rotations to avoid spurious del/ins pairs.
366391
window_main = conformed_main[abs_start:abs_end]
392+
window_len = len(window_main)
393+
rotation = _best_rotation(window_main, g_tokens, match_fn)
394+
if rotation > 0:
395+
window_main = window_main[rotation:] + window_main[:rotation]
367396
utt_alignment = align(window_main, g_tokens, False, match_fn)
368397

369398
local_main_cursor = 0
@@ -372,7 +401,7 @@ def process(self, doc, **kwargs):
372401

373402
for item in utt_alignment:
374403
if isinstance(item, Match):
375-
global_main_idx = abs_start + local_main_cursor
404+
global_main_idx = abs_start + (local_main_cursor + rotation) % window_len
376405
orig_main_idx = main_map[global_main_idx]
377406
main_form = main_info[orig_main_idx][2]
378407
orig_gold_idx = g_maps[local_gold_cursor]
@@ -403,7 +432,7 @@ def process(self, doc, **kwargs):
403432
local_gold_cursor += 1
404433

405434
else:
406-
global_main_idx = abs_start + local_main_cursor
435+
global_main_idx = abs_start + (local_main_cursor + rotation) % window_len
407436
orig_main_idx = main_map[global_main_idx]
408437
main_form = main_info[orig_main_idx][2]
409438

@@ -464,6 +493,10 @@ def process(self, doc, **kwargs):
464493
utt.time = (timed_forms[0].time[0], timed_forms[-1].time[1])
465494
utt.text = None
466495

496+
# Copy @Media header from the main doc if it has one
497+
if doc.media is not None:
498+
gold.media = doc.media
499+
467500
return gold
468501

469502

batchalign/version

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
0.8.2-post.9
1+
0.8.2-post.10
22
April 02 2026
33
segment

0 commit comments

Comments
 (0)