diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 23455d162..4aeecd625 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -19,7 +19,7 @@ updates: groups: github-actions: patterns: - - "*"kly" + - "*" - package-ecosystem: "docker" directory: "/docker/" diff --git a/CLAUDE.md b/CLAUDE.md index d80512740..271937b7e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -23,10 +23,10 @@ mvn package site # Build without running tests (useful when tests fail with extra logging) mvn clean package -DskipTests -# Run tests only +# Run full test suite (always use this to verify changes) mvn test -# Run tests excluding ReferenceIntegrityTest with specific Cucumber tags +# Run a targeted subset during development (NOT for verifying changes — excludes ReferenceIntegrityTest) mvn test -Dtest=\!ReferenceIntegrityTest* -Dcucumber.filter.tags='@v3.7.x' # Run a single Cucumber scenario by name (issue number and subtest, e.g. #1458-1) @@ -172,7 +172,7 @@ Alternatively, use `--verbose 1` flag for INFO level output in reports (default - Source/Target: Java 17 (pom.xml:172-173) ### Main Dependencies -- **pds4-jparser** (version 2.11.0) - PDS4 label parsing and validation +- **pds4-jparser** (version 3.2.0-SNAPSHOT) - PDS4 label parsing and validation; **source is at `../pds4-jparser`** — check there first when tracing errors from pds4-jparser classes (e.g. `FieldValueValidator`, `ProblemType`, parser internals) - **pds3-product-tools** (version 4.4.2) - PDS3 validation - **Saxon-HE** (version 12.9) - XSLT and XPath processing - **registry-common** (version 2.1.0-SNAPSHOT) - Registry API integration diff --git a/scripts/benchmark_bundle.py b/scripts/benchmark_bundle.py new file mode 100644 index 000000000..126f5893c --- /dev/null +++ b/scripts/benchmark_bundle.py @@ -0,0 +1,515 @@ +#!/usr/bin/env python3 +"""Generate a synthetic PDS4 bundle and time validate against it. + +Exercises the referential integrity path (additionalReferentialIntegrityChecks, +collectAllContextReferences) which is the target of the #1568 caching optimization. + +Each product label includes Investigation_Area, Observing_System, and +Target_Identification Internal_References so all three cached identifier types +(logicalIdentifiers, lidOrLidVidReferences, contextAreaRefs) are exercised. + +Usage +----- + # Smoke test — 10 products, keep output dir for inspection: + python3 scripts/benchmark_bundle.py --keep + + # Timing run: + python3 scripts/benchmark_bundle.py --products 1000 + + # Two-run comparison (warm schema cache on first run): + python3 scripts/benchmark_bundle.py --products 500 --runs 2 + + # Custom validate binary: + python3 scripts/benchmark_bundle.py --products 500 \\ + --validate ./validate-4.2.0-SNAPSHOT/bin/validate +""" + +import argparse +import hashlib +import os +import shutil +import subprocess +import sys +import tempfile +import time + +# --------------------------------------------------------------------------- +# PDS4 constants +# --------------------------------------------------------------------------- +BUNDLE_NS = "http://pds.nasa.gov/pds4/pds/v1" +SCHEMA_URL = "https://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1D00.xsd" +SCH_URL = "https://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1D00.sch" +IM_VERSION = "1.13.0.0" + +BUNDLE_LID = "urn:nasa:pds:benchmark.bundle" +COLL_LID = f"{BUNDLE_LID}:data.benchmark" +COLL_LIDVID = f"{COLL_LID}::1.0" +INV_LID = "urn:nasa:pds:context:investigation:mission.cassini-huygens" +IH_LID = "urn:nasa:pds:context:instrument_host:spacecraft.co" +INSTR_LID = "urn:nasa:pds:context:instrument:rss.co" +TGT_LID = "urn:nasa:pds:context:target:planet.saturn" + +# --------------------------------------------------------------------------- +# XML generation helpers +# --------------------------------------------------------------------------- + +def _header(): + return ( + '\n' + f'\n' + ) + + +def _ns_attrs(): + return ( + f'xmlns="{BUNDLE_NS}"\n' + ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' + f' xsi:schemaLocation="{BUNDLE_NS} {SCHEMA_URL}"' + ) + + +def prod_lid(i): + return f"{COLL_LID}:product_{i:06d}" + + +def prod_lidvid(i): + return f"{prod_lid(i)}::1.0" + + +def bundle_xml(): + return f"""{_header()} + + {BUNDLE_LID} + 1.0 + Benchmark Bundle + {IM_VERSION} + Product_Bundle + + Benchmark Generator + 2024 + Synthetic bundle for benchmarking validate caching performance. + + + + + 2005-05-02T00:00:00Z + 2005-05-03T00:00:00Z + + + Science + Raw + + + Cassini-Huygens + Mission + + {INV_LID} + bundle_to_investigation + + + + + Cassini Orbiter + Spacecraft + + {IH_LID} + is_instrument_host + + + + Radio Science Subsystem + Instrument + + {INSTR_LID} + is_instrument + + + + + Saturn + Planet + + {TGT_LID} + bundle_to_target + + + + + Archive + + + {COLL_LIDVID} + Primary + bundle_has_data_collection + + +""" + + +def collection_xml(n, csv_name, csv_size, csv_md5): + return f"""{_header()} + + {COLL_LID} + 1.0 + Benchmark Data Collection + {IM_VERSION} + Product_Collection + + Benchmark Generator + 2024 + Synthetic data collection for benchmarking validate caching performance. + + + + + 2005-05-02T00:00:00Z + 2005-05-03T00:00:00Z + + + Science + Raw + + + Cassini-Huygens + Mission + + {INV_LID} + collection_to_investigation + + + + + Cassini Orbiter + Spacecraft + + {IH_LID} + is_instrument_host + + + + Radio Science Subsystem + Instrument + + {INSTR_LID} + is_instrument + + + + + Saturn + Planet + + {TGT_LID} + collection_to_target + + + + + Data + + + + {csv_name} + {csv_size} + {csv_md5} + + + 0 + PDS DSV 1 + {n} + Carriage-Return Line-Feed + Comma + + 2 + 0 + 257 + + Member Status + 1 + ASCII_String + 1 + + + LIDVID_LID + 2 + ASCII_LIDVID_LID + 255 + + + inventory_has_member_product + + + +""" + + +def product_xml(i): + lid = prod_lid(i) + dat = f"product_{i:06d}.dat" + return f"""{_header()} + + {lid} + 1.0 + Benchmark Product {i:06d} + {IM_VERSION} + Product_Observational + + + + 2005-05-02T00:00:00Z + 2005-05-03T00:00:00Z + + + Science + Raw + + + Cassini-Huygens + Mission + + {INV_LID} + data_to_investigation + + + + + Cassini Orbiter + Spacecraft + + {IH_LID} + is_instrument_host + + + + + Saturn + Planet + + {TGT_LID} + data_to_target + + + + + + {dat} + 0 + +
+ 0 + 1 + UTF-8 Text +
+
+
+""" + + +# --------------------------------------------------------------------------- +# Bundle generation +# --------------------------------------------------------------------------- + +def generate(outdir, n): + """Write bundle/ collection / products to outdir. Returns path to bundle label.""" + data_dir = os.path.join(outdir, "data") + os.makedirs(data_dir, exist_ok=True) + + # --- inventory CSV (CRLF line endings as required by PDS4) --- + csv_name = "collection_benchmark.csv" + csv_path = os.path.join(outdir, csv_name) + csv_lines = b"".join( + f"P,{prod_lidvid(i)}\r\n".encode() for i in range(1, n + 1) + ) + with open(csv_path, "wb") as f: + f.write(csv_lines) + csv_size = len(csv_lines) + csv_md5 = hashlib.md5(csv_lines).hexdigest() + + # --- collection label --- + coll_path = os.path.join(outdir, "collection_benchmark.xml") + with open(coll_path, "w", encoding="utf-8") as f: + f.write(collection_xml(n, csv_name, csv_size, csv_md5)) + + # --- bundle label --- + bundle_path = os.path.join(outdir, "bundle_benchmark.xml") + with open(bundle_path, "w", encoding="utf-8") as f: + f.write(bundle_xml()) + + # --- product labels + 0-byte stub data files --- + print(f" Generating {n} product labels...", end="", flush=True) + t0 = time.time() + for i in range(1, n + 1): + ppath = os.path.join(data_dir, f"product_{i:06d}.xml") + with open(ppath, "w", encoding="utf-8") as f: + f.write(product_xml(i)) + # 0-byte stub satisfies file-existence check; content skipped via --skip-content-validation + open(os.path.join(data_dir, f"product_{i:06d}.dat"), "wb").close() + if i % max(1, n // 20) == 0: + print(".", end="", flush=True) + elapsed = time.time() - t0 + print(f" done ({elapsed:.1f}s)") + + size_mb = sum( + os.path.getsize(os.path.join(dp, fn)) + for dp, _, fns in os.walk(outdir) + for fn in fns + ) / 1_048_576 + print(f" Bundle size: {size_mb:.1f} MB ({n + 2} labels + 1 CSV)") + return bundle_path + + +# --------------------------------------------------------------------------- +# Validate runner +# --------------------------------------------------------------------------- + +def find_validate(): + """Return path to the validate binary. + + Preference order: + 1. Freshly built distribution zip in target/ (if newer than snapshot) — auto-extracted to /tmp + 2. Pre-built snapshot in validate-4.2.0-SNAPSHOT/ + 3. validate on PATH + """ + repo_root = os.path.normpath(os.path.join(os.path.dirname(__file__), "..")) + snapshot_bin = os.path.join(repo_root, "validate-4.2.0-SNAPSHOT", "bin", "validate") + dist_zip = os.path.join(repo_root, "target", "validate-4.2.0-SNAPSHOT-bin.zip") + + # Auto-extract freshly built zip when it's newer than the snapshot binary + if os.path.isfile(dist_zip) and ( + not os.path.isfile(snapshot_bin) + or os.path.getmtime(dist_zip) > os.path.getmtime(snapshot_bin) + ): + extract_dir = "/tmp/validate_benchmark_bin" + extracted_bin = os.path.join(extract_dir, "validate-4.2.0-SNAPSHOT", "bin", "validate") + if not os.path.isfile(extracted_bin) or os.path.getmtime(dist_zip) > os.path.getmtime(extracted_bin): + import zipfile + print(f" Extracting fresh build from {dist_zip}...") + shutil.rmtree(extract_dir, ignore_errors=True) + os.makedirs(extract_dir, exist_ok=True) + with zipfile.ZipFile(dist_zip) as z: + z.extractall(extract_dir) + os.chmod(extracted_bin, 0o755) + print(f" Extracted to {extract_dir}") + return extracted_bin + + if os.path.isfile(snapshot_bin): + return snapshot_bin + return shutil.which("validate") + + +def run_validate(validate_cmd, bundle_path, extra_args): + """Run validate --rule pds4.bundle --skip-content-validation on bundle_path. + Returns (returncode, elapsed_seconds, stdout+stderr). + """ + bundle_dir = os.path.dirname(bundle_path) + cmd = [ + validate_cmd, + "--rule", "pds4.bundle", + "--skip-content-validation", + bundle_dir, + ] + extra_args + + print(f" $ {' '.join(cmd)}") + t0 = time.time() + result = subprocess.run(cmd, capture_output=True, text=True) + elapsed = time.time() - t0 + return result.returncode, elapsed, result.stdout + result.stderr + + +def summarise(output): + """Extract the final Summary block from validate output.""" + # Find the last "Summary:" heading (not per-product lines) + idx = output.rfind("\nSummary:") + if idx == -1: + return "(no summary found — see full output)" + return output[idx:].strip() + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser( + description="Generate a synthetic PDS4 bundle and time validate.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument("--products", type=int, default=10, + help="Number of product labels to generate (default: 10)") + parser.add_argument("--output-dir", default=None, + help="Directory to write bundle into (default: temp dir)") + parser.add_argument("--validate", default=None, + help="Path to validate binary (default: auto-detect)") + parser.add_argument("--runs", type=int, default=1, + help="How many timed validate runs to perform (default: 1)") + parser.add_argument("--keep", action="store_true", + help="Keep output directory after run (useful for smoke tests)") + parser.add_argument("--validate-args", default="", + help="Extra args passed through to validate (quoted string)") + args = parser.parse_args() + + # --- locate validate --- + validate_cmd = args.validate or find_validate() + if not validate_cmd: + sys.exit( + "ERROR: cannot find validate binary.\n" + "Run `mvn package -DskipTests` first, or pass --validate ." + ) + validate_cmd = os.path.abspath(validate_cmd) + print(f"validate: {validate_cmd}") + + # --- output directory --- + tmp_owned = False + if args.output_dir: + outdir = os.path.abspath(args.output_dir) + os.makedirs(outdir, exist_ok=True) + else: + outdir = tempfile.mkdtemp(prefix="pds4_benchmark_") + tmp_owned = True + + print(f"output: {outdir}") + print(f"products: {args.products}") + print() + + try: + # --- generate --- + print("=== Generating bundle ===") + bundle_path = generate(outdir, args.products) + print() + + # --- run validate --- + extra = args.validate_args.split() if args.validate_args else [] + timings = [] + for run_n in range(1, args.runs + 1): + label = f"Run {run_n}/{args.runs}" if args.runs > 1 else "Validate" + print(f"=== {label} ===") + rc, elapsed, output = run_validate(validate_cmd, bundle_path, extra) + timings.append(elapsed) + print(f" Exit code : {rc}") + print(f" Elapsed : {elapsed:.2f}s") + print(f" Summary :") + for line in summarise(output).splitlines(): + print(f" {line}") + print() + + # --- timing summary --- + if args.runs > 1: + print("=== Timing summary ===") + for i, t in enumerate(timings, 1): + print(f" Run {i}: {t:.2f}s") + print(f" Best : {min(timings):.2f}s") + print(f" Mean : {sum(timings)/len(timings):.2f}s") + print() + + if args.keep or args.output_dir: + print(f"Bundle kept at: {outdir}") + + finally: + if tmp_owned and not args.keep: + shutil.rmtree(outdir, ignore_errors=True) + + +if __name__ == "__main__": + main() diff --git a/src/main/java/gov/nasa/pds/tools/util/LabelCacheEntry.java b/src/main/java/gov/nasa/pds/tools/util/LabelCacheEntry.java new file mode 100644 index 000000000..fb920467a --- /dev/null +++ b/src/main/java/gov/nasa/pds/tools/util/LabelCacheEntry.java @@ -0,0 +1,32 @@ +package gov.nasa.pds.tools.util; + +import java.util.ArrayList; + +/** + * Immutable cache of identifier data extracted from a parsed label, used to avoid re-parsing + * during the referential integrity phase. + */ +public class LabelCacheEntry { + private final ArrayList logicalIdentifiers; + private final ArrayList lidOrLidVidReferences; + private final ArrayList contextAreaRefs; + + public LabelCacheEntry(ArrayList logicalIdentifiers, + ArrayList lidOrLidVidReferences, ArrayList contextAreaRefs) { + this.logicalIdentifiers = logicalIdentifiers; + this.lidOrLidVidReferences = lidOrLidVidReferences; + this.contextAreaRefs = contextAreaRefs; + } + + public ArrayList getLogicalIdentifiers() { + return logicalIdentifiers; + } + + public ArrayList getLidOrLidVidReferences() { + return lidOrLidVidReferences; + } + + public ArrayList getContextAreaRefs() { + return contextAreaRefs; + } +} diff --git a/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java b/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java index f59486b55..a7698a0f1 100644 --- a/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java +++ b/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java @@ -365,6 +365,11 @@ public static synchronized ArrayList getIdentifiersCommon(DOMSource sour * @return lidOrLidVidReference The LID or LIDVID referenced in this label. */ public static ArrayList getLidVidReferences(DOMSource source, URL context) { + return getLidVidReferences(source, context, true); + } + + public static ArrayList getLidVidReferences(DOMSource source, URL context, + boolean reportCarriageReturns) { LOG.debug("getLidVidReferences:MY_SOURCE[{}]", source); String[] tagsList = new String[2]; @@ -372,7 +377,8 @@ public static ArrayList getLidVidReferences(DOMSource source, URL contex tagsList[1] = LID_REFERENCE; ArrayList lidOrLidVidReferences = - LabelUtil.getIdentifiersCommon(source, context, tagsList, INTERNAL_REFERENCE_AREA); + LabelUtil.getIdentifiersCommon(source, context, tagsList, INTERNAL_REFERENCE_AREA, + reportCarriageReturns); LOG.debug("getLidVidReferences:context,lidOrLidVidReferences {},{}", context, lidOrLidVidReferences); @@ -387,13 +393,19 @@ public static ArrayList getLidVidReferences(DOMSource source, URL contex * @return logicalIdentifiers A list of logical identifiers in this label. */ public static ArrayList getLogicalIdentifiers(DOMSource source, URL context) { + return getLogicalIdentifiers(source, context, true); + } + + public static ArrayList getLogicalIdentifiers(DOMSource source, URL context, + boolean reportCarriageReturns) { LOG.debug("getLogicalIdentifiers:MY_SOURCE[{}]", source); String[] tagsList = new String[1]; tagsList[0] = LOGICAL_IDENTIFIER_TAG; ArrayList logicalIdentifiers = - LabelUtil.getIdentifiersCommon(source, context, tagsList, IDENTIFICATION_AREA); + LabelUtil.getIdentifiersCommon(source, context, tagsList, IDENTIFICATION_AREA, + reportCarriageReturns); LOG.debug("getLogicalIdentifiers:context,logicalIdentifiers {},{}", context, logicalIdentifiers); diff --git a/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java b/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java index 030b2ab01..e00ff09d9 100644 --- a/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java +++ b/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java @@ -142,6 +142,7 @@ public class ReferentialIntegrityUtil { private static String[] tagsList = new String[2]; private static HashSet reportedErrorsReferenceSet = new HashSet<>(); private static URL parentBundleURL = null; + private static HashMap labelIdentifierCache = new HashMap<>(); /** * Initialize this class to ready for doing referential checks. @@ -163,6 +164,14 @@ public static void initialize(String referenceType, URL target, ProblemListener ReferentialIntegrityUtil.tagsList[1] = LabelUtil.LID_REFERENCE; } + public static void cacheLabelIdentifiers(URL url, LabelCacheEntry entry) { + labelIdentifierCache.put(url, entry); + } + + public static LabelCacheEntry getCachedLabelIdentifiers(URL url) { + return labelIdentifierCache.get(url); + } + /** * Reset this class to its initial state in case running from regression tests. * @@ -188,6 +197,7 @@ public static void reset() { ReferentialIntegrityUtil.urlsParsedCumulative.clear(); ReferentialIntegrityUtil.reportedErrorsReferenceSet.clear(); ReferentialIntegrityUtil.parentBundleURL = null; + ReferentialIntegrityUtil.labelIdentifierCache.clear(); } /** @@ -577,6 +587,30 @@ private static void addUniqueReferencesToMap(HashMap allContextAreaRefs, + ArrayList logicalIdentifiers, ArrayList lidOrLidVidReferences, + boolean labelIsBundleFlag, boolean labelIsCollectionFlag, URL url) { + if ((logicalIdentifiers == null) || logicalIdentifiers.isEmpty()) { + return; + } + if (labelIsCollectionFlag || labelIsBundleFlag) { + if (labelIsBundleFlag) { + ReferentialIntegrityUtil.addUniqueReferencesToMap(ReferentialIntegrityUtil.bundleReferenceMap, + allContextAreaRefs, url, logicalIdentifiers.get(0)); + } else { + ReferentialIntegrityUtil.addUniqueReferencesToMap( + ReferentialIntegrityUtil.collectionReferenceMap, allContextAreaRefs, url, + logicalIdentifiers.get(0)); + } + } else { + ReferentialIntegrityUtil.addUniqueReferencesToMap( + ReferentialIntegrityUtil.contextReferencesCumulative, allContextAreaRefs, url, + logicalIdentifiers.get(0)); + } + LOG.debug("collectAllContextReferences:url,contextReferencesCumulative {},{},{}", url, + contextReferencesCumulative, contextReferencesCumulative.size()); + } + private static void collectAllContextReferences(DOMSource domSource, ArrayList logicalIdentifiers, ArrayList lidOrLidVidReferences, boolean labelIsBundleFlag, boolean labelIsCollectionFlag, URL url) { @@ -786,16 +820,29 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun if (TargetExaminer.isTargetCollectionType (child.getUrl())) { labelIsCollectionFlag = true; } - xml = db.parse(url.openStream()); - domSource = new DOMSource(xml); - // Note that the function getLidVidReferences() collects all Internal_Reference - // elements in the PDS4 core namespace (including those in Reference_List, - // Context_Area, discipline LDD areas, and any other location in the label). - // so the lidOrLidVidReferencesCumulative will be a cumulative collection of all - // references collected in lidOrLidVidReferences for each label. - - ArrayList lidOrLidVidReferences = LabelUtil.getLidVidReferences(domSource, url); - ArrayList logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, url); + LabelCacheEntry cached = ReferentialIntegrityUtil.getCachedLabelIdentifiers(url); + ArrayList lidOrLidVidReferences; + ArrayList logicalIdentifiers; + if (cached != null) { + logicalIdentifiers = cached.getLogicalIdentifiers(); + lidOrLidVidReferences = cached.getLidOrLidVidReferences(); + // Re-parse to report any lid_reference \n errors, matching the uncached path + // where getLidVidReferences(true) reports them. cacheIdentifiers() uses false + // to avoid reporting these in single-label validation (backwards compatibility). + xml = db.parse(url.openStream()); + domSource = new DOMSource(xml); + LabelUtil.getLidVidReferences(domSource, url, true); + } else { + xml = db.parse(url.openStream()); + domSource = new DOMSource(xml); + // Note that the function getLidVidReferences() collects all Internal_Reference + // elements in the PDS4 core namespace (including those in Reference_List, + // Context_Area, discipline LDD areas, and any other location in the label). + // so the lidOrLidVidReferencesCumulative will be a cumulative collection of all + // references collected in lidOrLidVidReferences for each label. + lidOrLidVidReferences = LabelUtil.getLidVidReferences(domSource, url); + logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, url); + } LOG.debug("additionalReferentialIntegrityChecks:url,lidOrLidVidReferences {},{}", url, lidOrLidVidReferences.size()); LOG.debug("additionalReferentialIntegrityChecks:url,logicalIdentifiers {},{}", url, @@ -871,8 +918,14 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun // Collect all the context references defined for each label under the // "Context_Area" tag. if (ReferentialIntegrityUtil.contextReferenceCheck) { - ReferentialIntegrityUtil.collectAllContextReferences(domSource, logicalIdentifiers, - lidOrLidVidReferences, labelIsBundleFlag, labelIsCollectionFlag, url); + if (cached != null) { + ReferentialIntegrityUtil.collectAllContextReferences(cached.getContextAreaRefs(), + logicalIdentifiers, lidOrLidVidReferences, labelIsBundleFlag, + labelIsCollectionFlag, url); + } else { + ReferentialIntegrityUtil.collectAllContextReferences(domSource, logicalIdentifiers, + lidOrLidVidReferences, labelIsBundleFlag, labelIsCollectionFlag, url); + } } } else { diff --git a/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java b/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java index 9832e102f..fe37e2004 100644 --- a/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java +++ b/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.net.URISyntaxException; import java.nio.file.Path; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -11,7 +12,9 @@ import javax.xml.transform.dom.DOMSource; import org.w3c.dom.Document; import org.xml.sax.SAXException; +import gov.nasa.pds.tools.util.LabelCacheEntry; import gov.nasa.pds.tools.util.LabelUtil; +import gov.nasa.pds.tools.util.ReferentialIntegrityUtil; public class CrossLabelFileAreaReferenceChecker { final private static HashMap isObservational = new HashMap(); @@ -26,24 +29,25 @@ private static String resolve (String name, ValidationTarget target) throws URIS * @param name - the file being referenced by the file area * @param target - the label being validated * @return true if name and target are unique and only known references and false otherwise - * All of these exceptions should not happen because they are getting the LID from the target. - * Would not have made it this far if that could not have happened already, So, just pass them - * back to the called and let them handle it with their own generic exception handler/message. - * @throws IOException - * @throws ParserConfigurationException - * @throws SAXException * @throws URISyntaxException */ - public static boolean add (String name, ValidationTarget target, boolean isObs) + public static boolean add(String name, ValidationTarget target, boolean isObs) throws IOException, ParserConfigurationException, SAXException, URISyntaxException { boolean success = false; - DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); - Document xml = dbf.newDocumentBuilder().parse(target.getUrl().openStream()); - DOMSource domSource = new DOMSource(xml); String full_name = resolve(name, target); isObservational.put(full_name, isObs || (isObservational.containsKey(full_name) ? isObservational.get(full_name) : false)); - for (String lid : LabelUtil.getLogicalIdentifiers (domSource, target.getUrl())) { + ArrayList logicalIdentifiers; + LabelCacheEntry cached = ReferentialIntegrityUtil.getCachedLabelIdentifiers(target.getUrl()); + if (cached != null) { + logicalIdentifiers = cached.getLogicalIdentifiers(); + } else { + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + Document xml = dbf.newDocumentBuilder().parse(target.getUrl().openStream()); + DOMSource domSource = new DOMSource(xml); + logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, target.getUrl()); + } + for (String lid : logicalIdentifiers) { if (lid.contains("::")) { lid = lid.substring (0, lid.indexOf("::")); } @@ -64,5 +68,6 @@ public static String getOtherFilename (String name, ValidationTarget target) thr } public static void reset() { knownRefs.clear(); + isObservational.clear(); } } diff --git a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java index 27cc86c8a..1660e0e33 100644 --- a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java +++ b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java @@ -331,7 +331,7 @@ private boolean validate(NodeInfo xml) { ancestor = ancestor.getParent(); isObservational |= OBS_DATA_TAGS.contains(ancestor.getLocalPart()); } - if (!CrossLabelFileAreaReferenceChecker.add (fullName, target, isObservational)) { + if (!CrossLabelFileAreaReferenceChecker.add(fullName, target, isObservational)) { this.getListener().addProblem( new ValidationProblem( new ProblemDefinition(ExceptionType.ERROR, ProblemType.DUPLICATED_FILE_AREA_REFERENCE, diff --git a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java index fb323f6c6..b3e1cbb68 100644 --- a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java +++ b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java @@ -26,6 +26,7 @@ import java.util.StringTokenizer; import java.util.regex.Matcher; import java.util.regex.Pattern; +import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; @@ -59,6 +60,10 @@ import gov.nasa.pds.validate.constants.Constants; import net.sf.saxon.trans.XPathException; import net.sf.saxon.tree.tiny.TinyNodeImpl; +import javax.xml.transform.dom.DOMSource; +import gov.nasa.pds.tools.util.LabelCacheEntry; +import gov.nasa.pds.tools.util.LabelUtil; +import gov.nasa.pds.tools.util.ReferentialIntegrityUtil; /** * Implements a validation chain that validates PDS4 bundles. It is applicable if there is a bundle @@ -318,6 +323,7 @@ public void validateLabel() { if (document != null) { getContext().put(PDS4Context.LABEL_DOCUMENT, document); labelIsValidFlag = true; // A non-null document signified that the label is valid. + cacheIdentifiers(getTarget()); } } catch (SAXException | IOException | ParserConfigurationException | TransformerException | MissingLabelSchemaException e) { @@ -337,6 +343,34 @@ public void validateLabel() { LOG.debug("validateLabel:target,labelIsValidFlag {},{}", target, labelIsValidFlag); } + private static void cacheIdentifiers(URL targetUrl) { + // Re-parse using plain DocumentBuilderFactory so whitespace in text content (e.g. \n in + // logical_identifier or lid_reference) is preserved exactly as authored. + // Saxon's parseAndValidate() normalizes xs:token values, stripping \n before we can detect them. + try { + Document rawDoc = + DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(targetUrl.openStream()); + DOMSource domSource = new DOMSource(rawDoc); + String[] tagsArr = {LabelUtil.LIDVID_REFERENCE, LabelUtil.LID_REFERENCE}; + ArrayList logicalIds = LabelUtil.getLogicalIdentifiers(domSource, targetUrl, true); + // false: lid_reference \n errors are only reported during bundle/collection validation + // (via additionalReferentialIntegrityChecks), not during single-label validation. + // Using true here would be a backwards-incompatible behavior change. + ArrayList lidVidRefs = LabelUtil.getLidVidReferences(domSource, targetUrl, false); + ArrayList contextRefs = new ArrayList<>(); + contextRefs.addAll(LabelUtil.getIdentifiersCommon(domSource, targetUrl, tagsArr, + LabelUtil.CONTEXT_AREA_INVESTIGATION_AREA_REFERENCE, false)); + contextRefs.addAll(LabelUtil.getIdentifiersCommon(domSource, targetUrl, tagsArr, + LabelUtil.CONTEXT_AREA_OBSERVATION_SYSTEM_COMPONENT_REFERENCE, false)); + contextRefs.addAll(LabelUtil.getIdentifiersCommon(domSource, targetUrl, tagsArr, + LabelUtil.CONTEXT_AREA_TARGET_IDENTIFICATION_REFERENCE, false)); + ReferentialIntegrityUtil.cacheLabelIdentifiers(targetUrl, + new LabelCacheEntry(logicalIds, lidVidRefs, contextRefs)); + } catch (Exception e) { + LOG.error("cacheIdentifiers: failed to parse {}: {}", targetUrl, e.getMessage()); + } + } + private boolean resolveSingleSchema(URL label, Map.Entry schemaLocation, URL schemaUrl, ProblemContainer container, ProblemContainer labelProblems, XMLCatalogResolver resolver) { diff --git a/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java b/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java index 30c3034fa..7b513c5bf 100644 --- a/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java +++ b/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java @@ -104,6 +104,7 @@ import gov.nasa.pds.tools.util.ReferentialIntegrityUtil; import gov.nasa.pds.tools.util.XMLExtractor; import gov.nasa.pds.tools.validate.ContentProblem; +import gov.nasa.pds.tools.validate.CrossLabelFileAreaReferenceChecker; import gov.nasa.pds.tools.validate.InMemoryRegistrar; import gov.nasa.pds.tools.validate.ProblemContainer; import gov.nasa.pds.tools.validate.ProblemDefinition; @@ -1643,6 +1644,7 @@ public boolean doValidation(Map checksumManifest) throws Exception // Due to the util class ReferentialIntegrityUtil being static, it need to be // reset() if running a regression test. ReferentialIntegrityUtil.reset(); + CrossLabelFileAreaReferenceChecker.reset(); return success; } diff --git a/src/test/resources/github15/test_check-no-pass_context_products.xml b/src/test/resources/github15/test_check-no-pass_context_products.xml index 8bb6d668e..b24fcfcc5 100644 --- a/src/test/resources/github15/test_check-no-pass_context_products.xml +++ b/src/test/resources/github15/test_check-no-pass_context_products.xml @@ -28,8 +28,7 @@ Test Mission - urn:nasa:pds:context:investigation:mission.mars_exploration_rover - + urn:nasa:pds:context:investigation:mission.mars_exploration_rover data_to_investigation @@ -39,8 +38,7 @@ FOO Spacecraft - urn:nasa:pds:context:instrument_host:spacecraft.FOO - + urn:nasa:pds:context:instrument_host:spacecraft.FOO is_instrument_host @@ -48,8 +46,7 @@ MER2 Spacecraft - urn:nasa:pds:context:instrument_host:spacecraft.mer2 - + urn:nasa:pds:context:instrument_host:spacecraft.mer2 is_instrument_host @@ -57,8 +54,7 @@ Pancam Instrument - urn:nasa:pds:context:instrument:pancam.mer2 - + urn:nasa:pds:context:instrument:pancam.mer2 is_instrument @@ -67,8 +63,7 @@ Mars Planet - urn:nasa:pds:context:target:planet.mars - + urn:nasa:pds:context:target:planet.mars data_to_target