diff --git a/src/canonicalize.rs b/src/canonicalize.rs index 03eccde7..e686b0b9 100644 --- a/src/canonicalize.rs +++ b/src/canonicalize.rs @@ -838,7 +838,7 @@ impl CanonicalizeContext { }, "mi" => { let text = as_text(mathml); - if !text.trim().is_empty() && is_roman_number_match(text) && is_roman_numeral_number_context(mathml) { + if should_canonicalize_mi_as_roman_numeral(mathml, text) { // people tend to set them in a non-italic font and software makes that 'mtext' CanonicalizeContext::make_roman_numeral(mathml); return Some(mathml); @@ -1824,6 +1824,19 @@ impl CanonicalizeContext { return UPPER_ROMAN_NUMERAL.is_match(text) || LOWER_ROMAN_NUMERAL.is_match(text); } + /// Returns true when `` looks like a Roman numeral, appears in a numeric context, + /// and is either multi-letter or explicitly marked with Roman-numeral intent. + fn should_canonicalize_mi_as_roman_numeral(mathml: Element, text: &str) -> bool { + let trimmed = text.trim(); + if trimmed.is_empty() || !is_roman_number_match(text) || !is_roman_numeral_number_context(mathml) { + return false; + } + + return trimmed.len() > 1 || + mathml.attribute_value(INTENT_ATTR) + .is_some_and(|s| s.contains("roman-numeral")); + } + /// Return true if 'element' (which is syntactically a roman numeral) is only inside mrows and /// if its length is < 3 chars, then there is another roman numeral near it (separated by an operator). /// We want to rule out something like 'm' or 'cm' being a roman numeral. @@ -5987,6 +6000,45 @@ mod canonicalize_tests { are_strs_canonically_equal_result(test_str, target_str, &[]) } + #[test] + fn roman_numeral_multi_letter_mi() -> Result<()> { + let test_str = " + IX + + + VIII + = + XVII + "; + let target_str = " + + IX + + + VIII + + = + XVII + "; + are_strs_canonically_equal_result(test_str, target_str, &[]) + } + + #[test] + fn roman_like_single_letter_mi_is_not_number() -> Result<()> { + // Regression test for https://github.com/daisy/MathCAT/issues/528 + let test_str = " + C + = + D + "; + let target_str = " + + C + = + D + + "; + are_strs_canonically_equal_result(test_str, target_str, &[]) + } + // #[test] // fn roman_numeral_context() { // let test_str = "vi-i=v"; diff --git a/src/chemistry.rs b/src/chemistry.rs index f538bbe8..4b03c97d 100644 --- a/src/chemistry.rs +++ b/src/chemistry.rs @@ -68,6 +68,8 @@ static CHEM_ELEMENT: &str = "data-chem-element"; static CHEM_FORMULA_OPERATOR: &str = "data-chem-formula-op"; static CHEM_EQUATION_OPERATOR: &str = "data-chem-equation-op"; static CHEM_STATE: &str = "data-chem-state"; +/// Temporary storage for superscript Roman numerals, e.g. oxidation states, until chemistry is confirmed. +static CHEM_ROMAN_NUMERAL_NUMBER: &str = "data-chem-roman-numeral-number"; /// mark a new chem element that happened due to splitting a leaf pub static SPLIT_TOKEN: &str = "data-split"; @@ -483,6 +485,11 @@ fn get_marked_value(mathml: Element) -> Option { /// Recurse through all the children that have MAYBE_CHEMISTRY set fn set_marked_chemistry_attr(mathml: Element, chem: &str) { let tag_name = name(mathml); + if let Some(number) = mathml.attribute_value(CHEM_ROMAN_NUMERAL_NUMBER) { + let number = number.to_string(); + mathml.set_attribute_value("data-number", &number); + mathml.remove_attribute(CHEM_ROMAN_NUMERAL_NUMBER); + } if let Some(maybe_attr) = mathml.attribute(MAYBE_CHEMISTRY) { maybe_attr.remove_from_parent(); @@ -554,6 +561,7 @@ fn is_changed_after_unmarking_chemistry(mathml: Element) -> bool { mathml.remove_attribute(MAYBE_CHEMISTRY); if is_leaf(mathml) { // don't bother testing for the attr -- just remove and nothing bad happens if they aren't there + mathml.remove_attribute(CHEM_ROMAN_NUMERAL_NUMBER); mathml.remove_attribute(CHEM_FORMULA_OPERATOR); mathml.remove_attribute(CHEM_EQUATION_OPERATOR); mathml.remove_attribute(CHEMICAL_BOND); @@ -624,7 +632,7 @@ fn is_changed_after_unmarking_chemistry(mathml: Element) -> bool { script_base.remove_attribute(MAYBE_CHEMISTRY); script_base.remove_attribute(SPLIT_TOKEN); mathml.replace_children(script_children); - + first_element_of_split.remove_from_parent(); return true; } @@ -1007,7 +1015,7 @@ fn likely_chem_superscript(sup: Element) -> isize { } return if as_text(sup).len()==1 {1} else {2}; } else if (sup_name == "mi" || sup_name == "mn" || sup_name=="mtext") && SMALL_UPPER_ROMAN_NUMERAL.is_match(as_text(sup)){ - sup.set_attribute_value("data-number", small_roman_to_number(as_text(sup))); + sup.set_attribute_value(CHEM_ROMAN_NUMERAL_NUMBER, small_roman_to_number(as_text(sup))); sup.set_attribute_value(MAYBE_CHEMISTRY, "2"); return 2; } else if sup_name == "mrow" { diff --git a/tests/Languages/en/shared.rs b/tests/Languages/en/shared.rs index 37fd4565..93c3a9cd 100644 --- a/tests/Languages/en/shared.rs +++ b/tests/Languages/en/shared.rs @@ -209,6 +209,39 @@ fn presentation_mathml_in_semantics() -> Result<()> { } +#[test] +fn roman_like_superscript_identifier_is_not_chemistry() -> Result<()> { + // Regression test for https://github.com/daisy/MathCAT/issues/528 + let expr = " + I + = + + b + r + + + + z + I + + "; + test("en", "ClearSpeak", expr, "cap i is equal to, negative b r, plus z to the cap i-th power")?; + Ok(()) +} + +#[test] +fn roman_like_identifier_sequence_is_not_number() -> Result<()> { + // Regression test for https://github.com/daisy/MathCAT/issues/528 + let expr = " + C + + + I + + + X + "; + test("en", "ClearSpeak", expr, "cap c, plus cap i, plus cap x")?; + Ok(()) +} + #[test] fn ignore_period() -> Result<()> { // from https://en.wikipedia.org/wiki/Probability