@@ -216,6 +216,30 @@ def _find_best_segment(gold_tokens, main_tokens, mfn):
216216 return best
217217
218218
def _best_rotation(window_tokens, gold_tokens, mfn):
    """Pick the cyclic shift of *window_tokens* that aligns best to gold.

    Every offset ``r`` in ``range(len(window_tokens))`` is tried: the window
    is rotated to ``window_tokens[r:] + window_tokens[:r]``, aligned against
    *gold_tokens* with ``align(..., False, mfn)``, and scored by the number
    of ``Match`` items in the resulting alignment.

    Returns the offset with the highest score.  When several offsets tie,
    the smallest one wins, so an unrotated window (offset 0) is kept unless
    a rotation is strictly better.  Windows with at most one token return 0
    without running any alignment.
    """
    size = len(window_tokens)
    if size <= 1:
        # Rotating zero or one token cannot change anything.
        return 0

    def _match_count(offset):
        # Score one candidate rotation by how many alignment items are matches.
        shifted = window_tokens[offset:] + window_tokens[:offset]
        steps = align(shifted, gold_tokens, False, mfn)
        return sum(isinstance(step, Match) for step in steps)

    # max() evaluates offsets in ascending order and keeps the first maximum,
    # so ties resolve to the smallest offset.
    return max(range(size), key=_match_count)
241+
242+
219243def _get_pos (form ):
220244 """Extract uppercased POS from a Form's morphology, or '?' if absent."""
221245 if form is not None and form .morphology :
@@ -362,8 +386,13 @@ def process(self, doc, **kwargs):
362386 utt_main_forms [utt_idx ].append (m_form )
363387 utt_main_speakers [utt_idx ].append (m_utt_idx )
364388
365- # Align the chosen window against this gold utterance
389+ # Align the chosen window against this gold utterance,
390+ # trying cyclic rotations to avoid spurious del/ins pairs.
366391 window_main = conformed_main [abs_start :abs_end ]
392+ window_len = len (window_main )
393+ rotation = _best_rotation (window_main , g_tokens , match_fn )
394+ if rotation > 0 :
395+ window_main = window_main [rotation :] + window_main [:rotation ]
367396 utt_alignment = align (window_main , g_tokens , False , match_fn )
368397
369398 local_main_cursor = 0
@@ -372,7 +401,7 @@ def process(self, doc, **kwargs):
372401
373402 for item in utt_alignment :
374403 if isinstance (item , Match ):
375- global_main_idx = abs_start + local_main_cursor
404+ global_main_idx = abs_start + ( local_main_cursor + rotation ) % window_len
376405 orig_main_idx = main_map [global_main_idx ]
377406 main_form = main_info [orig_main_idx ][2 ]
378407 orig_gold_idx = g_maps [local_gold_cursor ]
@@ -403,7 +432,7 @@ def process(self, doc, **kwargs):
403432 local_gold_cursor += 1
404433
405434 else :
406- global_main_idx = abs_start + local_main_cursor
435+ global_main_idx = abs_start + ( local_main_cursor + rotation ) % window_len
407436 orig_main_idx = main_map [global_main_idx ]
408437 main_form = main_info [orig_main_idx ][2 ]
409438
@@ -464,6 +493,10 @@ def process(self, doc, **kwargs):
464493 utt .time = (timed_forms [0 ].time [0 ], timed_forms [- 1 ].time [1 ])
465494 utt .text = None
466495
496+ # Copy @Media header from the main doc if it has one
497+ if doc .media is not None :
498+ gold .media = doc .media
499+
467500 return gold
468501
469502
0 commit comments