Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 53 additions & 1 deletion src/canonicalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -838,7 +838,7 @@ impl CanonicalizeContext {
},
"mi" => {
let text = as_text(mathml);
if !text.trim().is_empty() && is_roman_number_match(text) && is_roman_numeral_number_context(mathml) {
if should_canonicalize_mi_as_roman_numeral(mathml, text) {
// people tend to set them in a non-italic font and software makes that 'mtext'
CanonicalizeContext::make_roman_numeral(mathml);
return Some(mathml);
Expand Down Expand Up @@ -1824,6 +1824,19 @@ impl CanonicalizeContext {
return UPPER_ROMAN_NUMERAL.is_match(text) || LOWER_ROMAN_NUMERAL.is_match(text);
}

/// Returns true when `<mi>` looks like a Roman numeral, appears in a numeric context,
/// and is either multi-letter or explicitly marked with Roman-numeral intent.
fn should_canonicalize_mi_as_roman_numeral(mathml: Element, text: &str) -> bool {
let trimmed = text.trim();
if trimmed.is_empty() || !is_roman_number_match(text) || !is_roman_numeral_number_context(mathml) {
return false;
}

return trimmed.len() > 1 ||
mathml.attribute_value(INTENT_ATTR)
.is_some_and(|s| s.contains("roman-numeral"));
}

/// Return true if 'element' (which is syntactically a roman numeral) is only inside mrows and
/// if its length is < 3 chars, then there is another roman numeral near it (separated by an operator).
/// We want to rule out something like 'm' or 'cm' being a roman numeral.
Expand Down Expand Up @@ -5987,6 +6000,45 @@ mod canonicalize_tests {
are_strs_canonically_equal_result(test_str, target_str, &[])
}

#[test]
fn roman_numeral_multi_letter_mi() -> Result<()> {
let test_str = "<math>
<mi>IX</mi>
<mo>+</mo>
<mi>VIII</mi>
<mo>=</mo>
<mi>XVII</mi>
</math>";
let target_str = "<math><mrow data-changed='added'>
<mrow data-changed='added'>
<mn data-roman-numeral='true' data-number='9'>IX</mn>
<mo>+</mo>
<mn data-roman-numeral='true' data-number='8'>VIII</mn>
</mrow>
<mo>=</mo>
<mn data-roman-numeral='true' data-number='17'>XVII</mn>
</mrow></math>";
are_strs_canonically_equal_result(test_str, target_str, &[])
}

#[test]
fn roman_like_single_letter_mi_is_not_number() -> Result<()> {
// Regression test for https://github.com/daisy/MathCAT/issues/528
let test_str = "<math>
<mi>C</mi>
<mo>=</mo>
<mi>D</mi>
</math>";
let target_str = "<math>
<mrow data-changed='added'>
<mi>C</mi>
<mo>=</mo>
<mi>D</mi>
</mrow>
</math>";
are_strs_canonically_equal_result(test_str, target_str, &[])
}

// #[test]
// fn roman_numeral_context() {
// let test_str = "<math><mi>vi</mi><mo>-</mo><mi mathvariant='normal'>i</mi><mo>=</mo><mtext>v</mtext></math>";
Expand Down
12 changes: 10 additions & 2 deletions src/chemistry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ static CHEM_ELEMENT: &str = "data-chem-element";
static CHEM_FORMULA_OPERATOR: &str = "data-chem-formula-op";
static CHEM_EQUATION_OPERATOR: &str = "data-chem-equation-op";
static CHEM_STATE: &str = "data-chem-state";
/// Temporary storage for superscript Roman numerals, e.g. oxidation states, until chemistry is confirmed.
static CHEM_ROMAN_NUMERAL_NUMBER: &str = "data-chem-roman-numeral-number";

/// mark a new chem element that happened due to splitting a leaf
pub static SPLIT_TOKEN: &str = "data-split";
Expand Down Expand Up @@ -483,6 +485,11 @@ fn get_marked_value(mathml: Element) -> Option<isize> {
/// Recurse through all the children that have MAYBE_CHEMISTRY set
fn set_marked_chemistry_attr(mathml: Element, chem: &str) {
let tag_name = name(mathml);
if let Some(number) = mathml.attribute_value(CHEM_ROMAN_NUMERAL_NUMBER) {
let number = number.to_string();
mathml.set_attribute_value("data-number", &number);
mathml.remove_attribute(CHEM_ROMAN_NUMERAL_NUMBER);
}
if let Some(maybe_attr) = mathml.attribute(MAYBE_CHEMISTRY) {
maybe_attr.remove_from_parent();

Expand Down Expand Up @@ -554,6 +561,7 @@ fn is_changed_after_unmarking_chemistry(mathml: Element) -> bool {
mathml.remove_attribute(MAYBE_CHEMISTRY);
if is_leaf(mathml) {
// don't bother testing for the attr -- just remove and nothing bad happens if they aren't there
mathml.remove_attribute(CHEM_ROMAN_NUMERAL_NUMBER);
mathml.remove_attribute(CHEM_FORMULA_OPERATOR);
mathml.remove_attribute(CHEM_EQUATION_OPERATOR);
mathml.remove_attribute(CHEMICAL_BOND);
Expand Down Expand Up @@ -624,7 +632,7 @@ fn is_changed_after_unmarking_chemistry(mathml: Element) -> bool {
script_base.remove_attribute(MAYBE_CHEMISTRY);
script_base.remove_attribute(SPLIT_TOKEN);
mathml.replace_children(script_children);

first_element_of_split.remove_from_parent();
return true;
}
Expand Down Expand Up @@ -1007,7 +1015,7 @@ fn likely_chem_superscript(sup: Element) -> isize {
}
return if as_text(sup).len()==1 {1} else {2};
} else if (sup_name == "mi" || sup_name == "mn" || sup_name=="mtext") && SMALL_UPPER_ROMAN_NUMERAL.is_match(as_text(sup)){
sup.set_attribute_value("data-number", small_roman_to_number(as_text(sup)));
sup.set_attribute_value(CHEM_ROMAN_NUMERAL_NUMBER, small_roman_to_number(as_text(sup)));
sup.set_attribute_value(MAYBE_CHEMISTRY, "2");
return 2;
} else if sup_name == "mrow" {
Expand Down
33 changes: 33 additions & 0 deletions tests/Languages/en/shared.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,39 @@ fn presentation_mathml_in_semantics() -> Result<()> {

}

#[test]
fn roman_like_superscript_identifier_is_not_chemistry() -> Result<()> {
// Regression test for https://github.com/daisy/MathCAT/issues/528
let expr = "<math>
<mi>I</mi>
<mo>=</mo>
<mo>−</mo>
<mi>b</mi>
<mi>r</mi>
<mo>+</mo>
<msup>
<mi>z</mi>
<mi>I</mi>
</msup>
</math>";
test("en", "ClearSpeak", expr, "cap i is equal to, negative b r, plus z to the cap i-th power")?;
Ok(())
}

#[test]
fn roman_like_identifier_sequence_is_not_number() -> Result<()> {
// Regression test for https://github.com/daisy/MathCAT/issues/528
let expr = "<math>
<mi>C</mi>
<mo>+</mo>
<mi>I</mi>
<mo>+</mo>
<mi>X</mi>
</math>";
test("en", "ClearSpeak", expr, "cap c, plus cap i, plus cap x")?;
Ok(())
}

#[test]
fn ignore_period() -> Result<()> {
// from https://en.wikipedia.org/wiki/Probability
Expand Down
Loading