diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 23455d162..4aeecd625 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -19,7 +19,7 @@ updates:
groups:
github-actions:
patterns:
- - "*"kly"
+ - "*"
- package-ecosystem: "docker"
directory: "/docker/"
diff --git a/CLAUDE.md b/CLAUDE.md
index d80512740..271937b7e 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -23,10 +23,10 @@ mvn package site
# Build without running tests (useful when tests fail with extra logging)
mvn clean package -DskipTests
-# Run tests only
+# Run full test suite (always use this to verify changes)
mvn test
-# Run tests excluding ReferenceIntegrityTest with specific Cucumber tags
+# Run a targeted subset during development (NOT for verifying changes — excludes ReferenceIntegrityTest)
mvn test -Dtest=\!ReferenceIntegrityTest* -Dcucumber.filter.tags='@v3.7.x'
# Run a single Cucumber scenario by name (issue number and subtest, e.g. #1458-1)
@@ -172,7 +172,7 @@ Alternatively, use `--verbose 1` flag for INFO level output in reports (default
- Source/Target: Java 17 (pom.xml:172-173)
### Main Dependencies
-- **pds4-jparser** (version 2.11.0) - PDS4 label parsing and validation
+- **pds4-jparser** (version 3.2.0-SNAPSHOT) - PDS4 label parsing and validation; **source is at `../pds4-jparser`** — check there first when tracing errors from pds4-jparser classes (e.g. `FieldValueValidator`, `ProblemType`, parser internals)
- **pds3-product-tools** (version 4.4.2) - PDS3 validation
- **Saxon-HE** (version 12.9) - XSLT and XPath processing
- **registry-common** (version 2.1.0-SNAPSHOT) - Registry API integration
diff --git a/scripts/benchmark_bundle.py b/scripts/benchmark_bundle.py
new file mode 100644
index 000000000..126f5893c
--- /dev/null
+++ b/scripts/benchmark_bundle.py
@@ -0,0 +1,515 @@
+#!/usr/bin/env python3
+"""Generate a synthetic PDS4 bundle and time validate against it.
+
+Exercises the referential integrity path (additionalReferentialIntegrityChecks,
+collectAllContextReferences) which is the target of the #1568 caching optimization.
+
+Each product label includes Investigation_Area, Observing_System, and
+Target_Identification Internal_References so all three cached identifier types
+(logicalIdentifiers, lidOrLidVidReferences, contextAreaRefs) are exercised.
+
+Usage
+-----
+ # Smoke test — 10 products, keep output dir for inspection:
+ python3 scripts/benchmark_bundle.py --keep
+
+ # Timing run:
+ python3 scripts/benchmark_bundle.py --products 1000
+
+ # Two-run comparison (warm schema cache on first run):
+ python3 scripts/benchmark_bundle.py --products 500 --runs 2
+
+ # Custom validate binary:
+ python3 scripts/benchmark_bundle.py --products 500 \\
+ --validate ./validate-4.2.0-SNAPSHOT/bin/validate
+"""
+
+import argparse
+import hashlib
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+
+# ---------------------------------------------------------------------------
+# PDS4 constants
+# ---------------------------------------------------------------------------
+BUNDLE_NS = "http://pds.nasa.gov/pds4/pds/v1"
+SCHEMA_URL = "https://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1D00.xsd"
+SCH_URL = "https://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1D00.sch"
+IM_VERSION = "1.13.0.0"
+
+BUNDLE_LID = "urn:nasa:pds:benchmark.bundle"
+COLL_LID = f"{BUNDLE_LID}:data.benchmark"
+COLL_LIDVID = f"{COLL_LID}::1.0"
+INV_LID = "urn:nasa:pds:context:investigation:mission.cassini-huygens"
+IH_LID = "urn:nasa:pds:context:instrument_host:spacecraft.co"
+INSTR_LID = "urn:nasa:pds:context:instrument:rss.co"
+TGT_LID = "urn:nasa:pds:context:target:planet.saturn"
+
+# ---------------------------------------------------------------------------
+# XML generation helpers
+# ---------------------------------------------------------------------------
+
+def _header():
+ return (
+ '\n'
+ f'\n'
+ )
+
+
+def _ns_attrs():
+ return (
+ f'xmlns="{BUNDLE_NS}"\n'
+ ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n'
+ f' xsi:schemaLocation="{BUNDLE_NS} {SCHEMA_URL}"'
+ )
+
+
+def prod_lid(i):
+ return f"{COLL_LID}:product_{i:06d}"
+
+
+def prod_lidvid(i):
+ return f"{prod_lid(i)}::1.0"
+
+
+def bundle_xml():
+ return f"""{_header()}
+
+ {BUNDLE_LID}
+ 1.0
+ Benchmark Bundle
+ {IM_VERSION}
+ Product_Bundle
+
+ Benchmark Generator
+ 2024
+ Synthetic bundle for benchmarking validate caching performance.
+
+
+
+
+ 2005-05-02T00:00:00Z
+ 2005-05-03T00:00:00Z
+
+
+ Science
+ Raw
+
+
+ Cassini-Huygens
+ Mission
+
+ {INV_LID}
+ bundle_to_investigation
+
+
+
+
+ Cassini Orbiter
+ Spacecraft
+
+ {IH_LID}
+ is_instrument_host
+
+
+
+ Radio Science Subsystem
+ Instrument
+
+ {INSTR_LID}
+ is_instrument
+
+
+
+
+ Saturn
+ Planet
+
+ {TGT_LID}
+ bundle_to_target
+
+
+
+
+ Archive
+
+
+ {COLL_LIDVID}
+ Primary
+ bundle_has_data_collection
+
+
+"""
+
+
+def collection_xml(n, csv_name, csv_size, csv_md5):
+ return f"""{_header()}
+
+ {COLL_LID}
+ 1.0
+ Benchmark Data Collection
+ {IM_VERSION}
+ Product_Collection
+
+ Benchmark Generator
+ 2024
+ Synthetic data collection for benchmarking validate caching performance.
+
+
+
+
+ 2005-05-02T00:00:00Z
+ 2005-05-03T00:00:00Z
+
+
+ Science
+ Raw
+
+
+ Cassini-Huygens
+ Mission
+
+ {INV_LID}
+ collection_to_investigation
+
+
+
+
+ Cassini Orbiter
+ Spacecraft
+
+ {IH_LID}
+ is_instrument_host
+
+
+
+ Radio Science Subsystem
+ Instrument
+
+ {INSTR_LID}
+ is_instrument
+
+
+
+
+ Saturn
+ Planet
+
+ {TGT_LID}
+ collection_to_target
+
+
+
+
+ Data
+
+
+
+ {csv_name}
+ {csv_size}
+ {csv_md5}
+
+
+ 0
+ PDS DSV 1
+ {n}
+ Carriage-Return Line-Feed
+ Comma
+
+ 2
+ 0
+ 257
+
+ Member Status
+ 1
+ ASCII_String
+ 1
+
+
+ LIDVID_LID
+ 2
+ ASCII_LIDVID_LID
+ 255
+
+
+ inventory_has_member_product
+
+
+
+"""
+
+
+def product_xml(i):
+ lid = prod_lid(i)
+ dat = f"product_{i:06d}.dat"
+ return f"""{_header()}
+
+ {lid}
+ 1.0
+ Benchmark Product {i:06d}
+ {IM_VERSION}
+ Product_Observational
+
+
+
+ 2005-05-02T00:00:00Z
+ 2005-05-03T00:00:00Z
+
+
+ Science
+ Raw
+
+
+ Cassini-Huygens
+ Mission
+
+ {INV_LID}
+ data_to_investigation
+
+
+
+
+ Cassini Orbiter
+ Spacecraft
+
+ {IH_LID}
+ is_instrument_host
+
+
+
+
+ Saturn
+ Planet
+
+ {TGT_LID}
+ data_to_target
+
+
+
+
+
+ {dat}
+ 0
+
+
+
+
+"""
+
+
+# ---------------------------------------------------------------------------
+# Bundle generation
+# ---------------------------------------------------------------------------
+
+def generate(outdir, n):
+ """Write bundle/ collection / products to outdir. Returns path to bundle label."""
+ data_dir = os.path.join(outdir, "data")
+ os.makedirs(data_dir, exist_ok=True)
+
+ # --- inventory CSV (CRLF line endings as required by PDS4) ---
+ csv_name = "collection_benchmark.csv"
+ csv_path = os.path.join(outdir, csv_name)
+ csv_lines = b"".join(
+ f"P,{prod_lidvid(i)}\r\n".encode() for i in range(1, n + 1)
+ )
+ with open(csv_path, "wb") as f:
+ f.write(csv_lines)
+ csv_size = len(csv_lines)
+ csv_md5 = hashlib.md5(csv_lines).hexdigest()
+
+ # --- collection label ---
+ coll_path = os.path.join(outdir, "collection_benchmark.xml")
+ with open(coll_path, "w", encoding="utf-8") as f:
+ f.write(collection_xml(n, csv_name, csv_size, csv_md5))
+
+ # --- bundle label ---
+ bundle_path = os.path.join(outdir, "bundle_benchmark.xml")
+ with open(bundle_path, "w", encoding="utf-8") as f:
+ f.write(bundle_xml())
+
+ # --- product labels + 0-byte stub data files ---
+ print(f" Generating {n} product labels...", end="", flush=True)
+ t0 = time.time()
+ for i in range(1, n + 1):
+ ppath = os.path.join(data_dir, f"product_{i:06d}.xml")
+ with open(ppath, "w", encoding="utf-8") as f:
+ f.write(product_xml(i))
+ # 0-byte stub satisfies file-existence check; content skipped via --skip-content-validation
+ open(os.path.join(data_dir, f"product_{i:06d}.dat"), "wb").close()
+ if i % max(1, n // 20) == 0:
+ print(".", end="", flush=True)
+ elapsed = time.time() - t0
+ print(f" done ({elapsed:.1f}s)")
+
+ size_mb = sum(
+ os.path.getsize(os.path.join(dp, fn))
+ for dp, _, fns in os.walk(outdir)
+ for fn in fns
+ ) / 1_048_576
+ print(f" Bundle size: {size_mb:.1f} MB ({n + 2} labels + 1 CSV)")
+ return bundle_path
+
+
+# ---------------------------------------------------------------------------
+# Validate runner
+# ---------------------------------------------------------------------------
+
+def find_validate():
+ """Return path to the validate binary.
+
+ Preference order:
+ 1. Freshly built distribution zip in target/ (if newer than snapshot) — auto-extracted to /tmp
+ 2. Pre-built snapshot in validate-4.2.0-SNAPSHOT/
+ 3. validate on PATH
+ """
+ repo_root = os.path.normpath(os.path.join(os.path.dirname(__file__), ".."))
+ snapshot_bin = os.path.join(repo_root, "validate-4.2.0-SNAPSHOT", "bin", "validate")
+ dist_zip = os.path.join(repo_root, "target", "validate-4.2.0-SNAPSHOT-bin.zip")
+
+ # Auto-extract freshly built zip when it's newer than the snapshot binary
+ if os.path.isfile(dist_zip) and (
+ not os.path.isfile(snapshot_bin)
+ or os.path.getmtime(dist_zip) > os.path.getmtime(snapshot_bin)
+ ):
+ extract_dir = "/tmp/validate_benchmark_bin"
+ extracted_bin = os.path.join(extract_dir, "validate-4.2.0-SNAPSHOT", "bin", "validate")
+ if not os.path.isfile(extracted_bin) or os.path.getmtime(dist_zip) > os.path.getmtime(extracted_bin):
+ import zipfile
+ print(f" Extracting fresh build from {dist_zip}...")
+ shutil.rmtree(extract_dir, ignore_errors=True)
+ os.makedirs(extract_dir, exist_ok=True)
+ with zipfile.ZipFile(dist_zip) as z:
+ z.extractall(extract_dir)
+ os.chmod(extracted_bin, 0o755)
+ print(f" Extracted to {extract_dir}")
+ return extracted_bin
+
+ if os.path.isfile(snapshot_bin):
+ return snapshot_bin
+ return shutil.which("validate")
+
+
+def run_validate(validate_cmd, bundle_path, extra_args):
+ """Run validate --rule pds4.bundle --skip-content-validation on bundle_path.
+ Returns (returncode, elapsed_seconds, stdout+stderr).
+ """
+ bundle_dir = os.path.dirname(bundle_path)
+ cmd = [
+ validate_cmd,
+ "--rule", "pds4.bundle",
+ "--skip-content-validation",
+ bundle_dir,
+ ] + extra_args
+
+ print(f" $ {' '.join(cmd)}")
+ t0 = time.time()
+ result = subprocess.run(cmd, capture_output=True, text=True)
+ elapsed = time.time() - t0
+ return result.returncode, elapsed, result.stdout + result.stderr
+
+
+def summarise(output):
+ """Extract the final Summary block from validate output."""
+ # Find the last "Summary:" heading (not per-product lines)
+ idx = output.rfind("\nSummary:")
+ if idx == -1:
+ return "(no summary found — see full output)"
+ return output[idx:].strip()
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Generate a synthetic PDS4 bundle and time validate.",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog=__doc__,
+ )
+ parser.add_argument("--products", type=int, default=10,
+ help="Number of product labels to generate (default: 10)")
+ parser.add_argument("--output-dir", default=None,
+ help="Directory to write bundle into (default: temp dir)")
+ parser.add_argument("--validate", default=None,
+ help="Path to validate binary (default: auto-detect)")
+ parser.add_argument("--runs", type=int, default=1,
+ help="How many timed validate runs to perform (default: 1)")
+ parser.add_argument("--keep", action="store_true",
+ help="Keep output directory after run (useful for smoke tests)")
+ parser.add_argument("--validate-args", default="",
+ help="Extra args passed through to validate (quoted string)")
+ args = parser.parse_args()
+
+ # --- locate validate ---
+ validate_cmd = args.validate or find_validate()
+ if not validate_cmd:
+ sys.exit(
+ "ERROR: cannot find validate binary.\n"
+ "Run `mvn package -DskipTests` first, or pass --validate ."
+ )
+ validate_cmd = os.path.abspath(validate_cmd)
+ print(f"validate: {validate_cmd}")
+
+ # --- output directory ---
+ tmp_owned = False
+ if args.output_dir:
+ outdir = os.path.abspath(args.output_dir)
+ os.makedirs(outdir, exist_ok=True)
+ else:
+ outdir = tempfile.mkdtemp(prefix="pds4_benchmark_")
+ tmp_owned = True
+
+ print(f"output: {outdir}")
+ print(f"products: {args.products}")
+ print()
+
+ try:
+ # --- generate ---
+ print("=== Generating bundle ===")
+ bundle_path = generate(outdir, args.products)
+ print()
+
+ # --- run validate ---
+ extra = args.validate_args.split() if args.validate_args else []
+ timings = []
+ for run_n in range(1, args.runs + 1):
+ label = f"Run {run_n}/{args.runs}" if args.runs > 1 else "Validate"
+ print(f"=== {label} ===")
+ rc, elapsed, output = run_validate(validate_cmd, bundle_path, extra)
+ timings.append(elapsed)
+ print(f" Exit code : {rc}")
+ print(f" Elapsed : {elapsed:.2f}s")
+ print(f" Summary :")
+ for line in summarise(output).splitlines():
+ print(f" {line}")
+ print()
+
+ # --- timing summary ---
+ if args.runs > 1:
+ print("=== Timing summary ===")
+ for i, t in enumerate(timings, 1):
+ print(f" Run {i}: {t:.2f}s")
+ print(f" Best : {min(timings):.2f}s")
+ print(f" Mean : {sum(timings)/len(timings):.2f}s")
+ print()
+
+ if args.keep or args.output_dir:
+ print(f"Bundle kept at: {outdir}")
+
+ finally:
+ if tmp_owned and not args.keep:
+ shutil.rmtree(outdir, ignore_errors=True)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/main/java/gov/nasa/pds/tools/util/LabelCacheEntry.java b/src/main/java/gov/nasa/pds/tools/util/LabelCacheEntry.java
new file mode 100644
index 000000000..fb920467a
--- /dev/null
+++ b/src/main/java/gov/nasa/pds/tools/util/LabelCacheEntry.java
@@ -0,0 +1,32 @@
+package gov.nasa.pds.tools.util;
+
+import java.util.ArrayList;
+
+/**
+ * Immutable cache of identifier data extracted from a parsed label, used to avoid re-parsing
+ * during the referential integrity phase.
+ */
+public class LabelCacheEntry {
+ private final ArrayList logicalIdentifiers;
+ private final ArrayList lidOrLidVidReferences;
+ private final ArrayList contextAreaRefs;
+
+ public LabelCacheEntry(ArrayList logicalIdentifiers,
+ ArrayList lidOrLidVidReferences, ArrayList contextAreaRefs) {
+ this.logicalIdentifiers = logicalIdentifiers;
+ this.lidOrLidVidReferences = lidOrLidVidReferences;
+ this.contextAreaRefs = contextAreaRefs;
+ }
+
+ public ArrayList getLogicalIdentifiers() {
+ return logicalIdentifiers;
+ }
+
+ public ArrayList getLidOrLidVidReferences() {
+ return lidOrLidVidReferences;
+ }
+
+ public ArrayList getContextAreaRefs() {
+ return contextAreaRefs;
+ }
+}
diff --git a/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java b/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java
index f59486b55..a7698a0f1 100644
--- a/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java
+++ b/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java
@@ -365,6 +365,11 @@ public static synchronized ArrayList getIdentifiersCommon(DOMSource sour
* @return lidOrLidVidReference The LID or LIDVID referenced in this label.
*/
public static ArrayList getLidVidReferences(DOMSource source, URL context) {
+ return getLidVidReferences(source, context, true);
+ }
+
+ public static ArrayList getLidVidReferences(DOMSource source, URL context,
+ boolean reportCarriageReturns) {
LOG.debug("getLidVidReferences:MY_SOURCE[{}]", source);
String[] tagsList = new String[2];
@@ -372,7 +377,8 @@ public static ArrayList getLidVidReferences(DOMSource source, URL contex
tagsList[1] = LID_REFERENCE;
ArrayList lidOrLidVidReferences =
- LabelUtil.getIdentifiersCommon(source, context, tagsList, INTERNAL_REFERENCE_AREA);
+ LabelUtil.getIdentifiersCommon(source, context, tagsList, INTERNAL_REFERENCE_AREA,
+ reportCarriageReturns);
LOG.debug("getLidVidReferences:context,lidOrLidVidReferences {},{}", context,
lidOrLidVidReferences);
@@ -387,13 +393,19 @@ public static ArrayList getLidVidReferences(DOMSource source, URL contex
* @return logicalIdentifiers A list of logical identifiers in this label.
*/
public static ArrayList getLogicalIdentifiers(DOMSource source, URL context) {
+ return getLogicalIdentifiers(source, context, true);
+ }
+
+ public static ArrayList getLogicalIdentifiers(DOMSource source, URL context,
+ boolean reportCarriageReturns) {
LOG.debug("getLogicalIdentifiers:MY_SOURCE[{}]", source);
String[] tagsList = new String[1];
tagsList[0] = LOGICAL_IDENTIFIER_TAG;
ArrayList logicalIdentifiers =
- LabelUtil.getIdentifiersCommon(source, context, tagsList, IDENTIFICATION_AREA);
+ LabelUtil.getIdentifiersCommon(source, context, tagsList, IDENTIFICATION_AREA,
+ reportCarriageReturns);
LOG.debug("getLogicalIdentifiers:context,logicalIdentifiers {},{}", context,
logicalIdentifiers);
diff --git a/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java b/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java
index 030b2ab01..e00ff09d9 100644
--- a/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java
+++ b/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java
@@ -142,6 +142,7 @@ public class ReferentialIntegrityUtil {
private static String[] tagsList = new String[2];
private static HashSet reportedErrorsReferenceSet = new HashSet<>();
private static URL parentBundleURL = null;
+ private static HashMap labelIdentifierCache = new HashMap<>();
/**
* Initialize this class to ready for doing referential checks.
@@ -163,6 +164,14 @@ public static void initialize(String referenceType, URL target, ProblemListener
ReferentialIntegrityUtil.tagsList[1] = LabelUtil.LID_REFERENCE;
}
+ public static void cacheLabelIdentifiers(URL url, LabelCacheEntry entry) {
+ labelIdentifierCache.put(url, entry);
+ }
+
+ public static LabelCacheEntry getCachedLabelIdentifiers(URL url) {
+ return labelIdentifierCache.get(url);
+ }
+
/**
* Reset this class to its initial state in case running from regression tests.
*
@@ -188,6 +197,7 @@ public static void reset() {
ReferentialIntegrityUtil.urlsParsedCumulative.clear();
ReferentialIntegrityUtil.reportedErrorsReferenceSet.clear();
ReferentialIntegrityUtil.parentBundleURL = null;
+ ReferentialIntegrityUtil.labelIdentifierCache.clear();
}
/**
@@ -577,6 +587,30 @@ private static void addUniqueReferencesToMap(HashMap allContextAreaRefs,
+ ArrayList logicalIdentifiers, ArrayList lidOrLidVidReferences,
+ boolean labelIsBundleFlag, boolean labelIsCollectionFlag, URL url) {
+ if ((logicalIdentifiers == null) || logicalIdentifiers.isEmpty()) {
+ return;
+ }
+ if (labelIsCollectionFlag || labelIsBundleFlag) {
+ if (labelIsBundleFlag) {
+ ReferentialIntegrityUtil.addUniqueReferencesToMap(ReferentialIntegrityUtil.bundleReferenceMap,
+ allContextAreaRefs, url, logicalIdentifiers.get(0));
+ } else {
+ ReferentialIntegrityUtil.addUniqueReferencesToMap(
+ ReferentialIntegrityUtil.collectionReferenceMap, allContextAreaRefs, url,
+ logicalIdentifiers.get(0));
+ }
+ } else {
+ ReferentialIntegrityUtil.addUniqueReferencesToMap(
+ ReferentialIntegrityUtil.contextReferencesCumulative, allContextAreaRefs, url,
+ logicalIdentifiers.get(0));
+ }
+ LOG.debug("collectAllContextReferences:url,contextReferencesCumulative {},{},{}", url,
+ contextReferencesCumulative, contextReferencesCumulative.size());
+ }
+
private static void collectAllContextReferences(DOMSource domSource,
ArrayList logicalIdentifiers, ArrayList lidOrLidVidReferences,
boolean labelIsBundleFlag, boolean labelIsCollectionFlag, URL url) {
@@ -786,16 +820,29 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun
if (TargetExaminer.isTargetCollectionType (child.getUrl())) {
labelIsCollectionFlag = true;
}
- xml = db.parse(url.openStream());
- domSource = new DOMSource(xml);
- // Note that the function getLidVidReferences() collects all Internal_Reference
- // elements in the PDS4 core namespace (including those in Reference_List,
- // Context_Area, discipline LDD areas, and any other location in the label).
- // so the lidOrLidVidReferencesCumulative will be a cumulative collection of all
- // references collected in lidOrLidVidReferences for each label.
-
- ArrayList lidOrLidVidReferences = LabelUtil.getLidVidReferences(domSource, url);
- ArrayList logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, url);
+ LabelCacheEntry cached = ReferentialIntegrityUtil.getCachedLabelIdentifiers(url);
+ ArrayList lidOrLidVidReferences;
+ ArrayList logicalIdentifiers;
+ if (cached != null) {
+ logicalIdentifiers = cached.getLogicalIdentifiers();
+ lidOrLidVidReferences = cached.getLidOrLidVidReferences();
+ // Re-parse to report any lid_reference \n errors, matching the uncached path
+ // where getLidVidReferences(true) reports them. cacheIdentifiers() uses false
+ // to avoid reporting these in single-label validation (backwards compatibility).
+ xml = db.parse(url.openStream());
+ domSource = new DOMSource(xml);
+ LabelUtil.getLidVidReferences(domSource, url, true);
+ } else {
+ xml = db.parse(url.openStream());
+ domSource = new DOMSource(xml);
+ // Note that the function getLidVidReferences() collects all Internal_Reference
+ // elements in the PDS4 core namespace (including those in Reference_List,
+ // Context_Area, discipline LDD areas, and any other location in the label).
+ // so the lidOrLidVidReferencesCumulative will be a cumulative collection of all
+ // references collected in lidOrLidVidReferences for each label.
+ lidOrLidVidReferences = LabelUtil.getLidVidReferences(domSource, url);
+ logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, url);
+ }
LOG.debug("additionalReferentialIntegrityChecks:url,lidOrLidVidReferences {},{}", url,
lidOrLidVidReferences.size());
LOG.debug("additionalReferentialIntegrityChecks:url,logicalIdentifiers {},{}", url,
@@ -871,8 +918,14 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun
// Collect all the context references defined for each label under the
// "Context_Area" tag.
if (ReferentialIntegrityUtil.contextReferenceCheck) {
- ReferentialIntegrityUtil.collectAllContextReferences(domSource, logicalIdentifiers,
- lidOrLidVidReferences, labelIsBundleFlag, labelIsCollectionFlag, url);
+ if (cached != null) {
+ ReferentialIntegrityUtil.collectAllContextReferences(cached.getContextAreaRefs(),
+ logicalIdentifiers, lidOrLidVidReferences, labelIsBundleFlag,
+ labelIsCollectionFlag, url);
+ } else {
+ ReferentialIntegrityUtil.collectAllContextReferences(domSource, logicalIdentifiers,
+ lidOrLidVidReferences, labelIsBundleFlag, labelIsCollectionFlag, url);
+ }
}
} else {
diff --git a/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java b/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java
index 9832e102f..fe37e2004 100644
--- a/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java
+++ b/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java
@@ -3,6 +3,7 @@
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Path;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
@@ -11,7 +12,9 @@
import javax.xml.transform.dom.DOMSource;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
+import gov.nasa.pds.tools.util.LabelCacheEntry;
import gov.nasa.pds.tools.util.LabelUtil;
+import gov.nasa.pds.tools.util.ReferentialIntegrityUtil;
public class CrossLabelFileAreaReferenceChecker {
final private static HashMap isObservational = new HashMap();
@@ -26,24 +29,25 @@ private static String resolve (String name, ValidationTarget target) throws URIS
* @param name - the file being referenced by the file area
* @param target - the label being validated
* @return true if name and target are unique and only known references and false otherwise
- * All of these exceptions should not happen because they are getting the LID from the target.
- * Would not have made it this far if that could not have happened already, So, just pass them
- * back to the called and let them handle it with their own generic exception handler/message.
- * @throws IOException
- * @throws ParserConfigurationException
- * @throws SAXException
* @throws URISyntaxException
*/
- public static boolean add (String name, ValidationTarget target, boolean isObs)
+ public static boolean add(String name, ValidationTarget target, boolean isObs)
throws IOException, ParserConfigurationException, SAXException, URISyntaxException {
boolean success = false;
- DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
- Document xml = dbf.newDocumentBuilder().parse(target.getUrl().openStream());
- DOMSource domSource = new DOMSource(xml);
String full_name = resolve(name, target);
isObservational.put(full_name,
isObs || (isObservational.containsKey(full_name) ? isObservational.get(full_name) : false));
- for (String lid : LabelUtil.getLogicalIdentifiers (domSource, target.getUrl())) {
+ ArrayList logicalIdentifiers;
+ LabelCacheEntry cached = ReferentialIntegrityUtil.getCachedLabelIdentifiers(target.getUrl());
+ if (cached != null) {
+ logicalIdentifiers = cached.getLogicalIdentifiers();
+ } else {
+ DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+ Document xml = dbf.newDocumentBuilder().parse(target.getUrl().openStream());
+ DOMSource domSource = new DOMSource(xml);
+ logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, target.getUrl());
+ }
+ for (String lid : logicalIdentifiers) {
if (lid.contains("::")) {
lid = lid.substring (0, lid.indexOf("::"));
}
@@ -64,5 +68,6 @@ public static String getOtherFilename (String name, ValidationTarget target) thr
}
public static void reset() {
knownRefs.clear();
+ isObservational.clear();
}
}
diff --git a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java
index 27cc86c8a..1660e0e33 100644
--- a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java
+++ b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java
@@ -331,7 +331,7 @@ private boolean validate(NodeInfo xml) {
ancestor = ancestor.getParent();
isObservational |= OBS_DATA_TAGS.contains(ancestor.getLocalPart());
}
- if (!CrossLabelFileAreaReferenceChecker.add (fullName, target, isObservational)) {
+ if (!CrossLabelFileAreaReferenceChecker.add(fullName, target, isObservational)) {
this.getListener().addProblem(
new ValidationProblem(
new ProblemDefinition(ExceptionType.ERROR, ProblemType.DUPLICATED_FILE_AREA_REFERENCE,
diff --git a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java
index fb323f6c6..b3e1cbb68 100644
--- a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java
+++ b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java
@@ -26,6 +26,7 @@
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
@@ -59,6 +60,10 @@
import gov.nasa.pds.validate.constants.Constants;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.tree.tiny.TinyNodeImpl;
+import javax.xml.transform.dom.DOMSource;
+import gov.nasa.pds.tools.util.LabelCacheEntry;
+import gov.nasa.pds.tools.util.LabelUtil;
+import gov.nasa.pds.tools.util.ReferentialIntegrityUtil;
/**
* Implements a validation chain that validates PDS4 bundles. It is applicable if there is a bundle
@@ -318,6 +323,7 @@ public void validateLabel() {
if (document != null) {
getContext().put(PDS4Context.LABEL_DOCUMENT, document);
labelIsValidFlag = true; // A non-null document signified that the label is valid.
+ cacheIdentifiers(getTarget());
}
} catch (SAXException | IOException | ParserConfigurationException | TransformerException
| MissingLabelSchemaException e) {
@@ -337,6 +343,34 @@ public void validateLabel() {
LOG.debug("validateLabel:target,labelIsValidFlag {},{}", target, labelIsValidFlag);
}
+ private static void cacheIdentifiers(URL targetUrl) {
+ // Re-parse using plain DocumentBuilderFactory so whitespace in text content (e.g. \n in
+ // logical_identifier or lid_reference) is preserved exactly as authored.
+ // Saxon's parseAndValidate() normalizes xs:token values, stripping \n before we can detect them.
+ try {
+ Document rawDoc =
+ DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(targetUrl.openStream());
+ DOMSource domSource = new DOMSource(rawDoc);
+ String[] tagsArr = {LabelUtil.LIDVID_REFERENCE, LabelUtil.LID_REFERENCE};
+ ArrayList logicalIds = LabelUtil.getLogicalIdentifiers(domSource, targetUrl, true);
+ // false: lid_reference \n errors are only reported during bundle/collection validation
+ // (via additionalReferentialIntegrityChecks), not during single-label validation.
+ // Using true here would be a backwards-incompatible behavior change.
+ ArrayList lidVidRefs = LabelUtil.getLidVidReferences(domSource, targetUrl, false);
+ ArrayList contextRefs = new ArrayList<>();
+ contextRefs.addAll(LabelUtil.getIdentifiersCommon(domSource, targetUrl, tagsArr,
+ LabelUtil.CONTEXT_AREA_INVESTIGATION_AREA_REFERENCE, false));
+ contextRefs.addAll(LabelUtil.getIdentifiersCommon(domSource, targetUrl, tagsArr,
+ LabelUtil.CONTEXT_AREA_OBSERVATION_SYSTEM_COMPONENT_REFERENCE, false));
+ contextRefs.addAll(LabelUtil.getIdentifiersCommon(domSource, targetUrl, tagsArr,
+ LabelUtil.CONTEXT_AREA_TARGET_IDENTIFICATION_REFERENCE, false));
+ ReferentialIntegrityUtil.cacheLabelIdentifiers(targetUrl,
+ new LabelCacheEntry(logicalIds, lidVidRefs, contextRefs));
+ } catch (Exception e) {
+ LOG.error("cacheIdentifiers: failed to parse {}: {}", targetUrl, e.getMessage());
+ }
+ }
+
private boolean resolveSingleSchema(URL label, Map.Entry schemaLocation,
URL schemaUrl, ProblemContainer container, ProblemContainer labelProblems,
XMLCatalogResolver resolver) {
diff --git a/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java b/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java
index 30c3034fa..7b513c5bf 100644
--- a/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java
+++ b/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java
@@ -104,6 +104,7 @@
import gov.nasa.pds.tools.util.ReferentialIntegrityUtil;
import gov.nasa.pds.tools.util.XMLExtractor;
import gov.nasa.pds.tools.validate.ContentProblem;
+import gov.nasa.pds.tools.validate.CrossLabelFileAreaReferenceChecker;
import gov.nasa.pds.tools.validate.InMemoryRegistrar;
import gov.nasa.pds.tools.validate.ProblemContainer;
import gov.nasa.pds.tools.validate.ProblemDefinition;
@@ -1643,6 +1644,7 @@ public boolean doValidation(Map checksumManifest) throws Exception
// Due to the util class ReferentialIntegrityUtil being static, it need to be
// reset() if running a regression test.
ReferentialIntegrityUtil.reset();
+ CrossLabelFileAreaReferenceChecker.reset();
return success;
}
diff --git a/src/test/resources/github15/test_check-no-pass_context_products.xml b/src/test/resources/github15/test_check-no-pass_context_products.xml
index 8bb6d668e..b24fcfcc5 100644
--- a/src/test/resources/github15/test_check-no-pass_context_products.xml
+++ b/src/test/resources/github15/test_check-no-pass_context_products.xml
@@ -28,8 +28,7 @@
Test
Mission
- urn:nasa:pds:context:investigation:mission.mars_exploration_rover
-
+ urn:nasa:pds:context:investigation:mission.mars_exploration_rover
data_to_investigation
@@ -39,8 +38,7 @@
FOO
Spacecraft
- urn:nasa:pds:context:instrument_host:spacecraft.FOO
-
+ urn:nasa:pds:context:instrument_host:spacecraft.FOO
is_instrument_host
@@ -48,8 +46,7 @@
MER2
Spacecraft
- urn:nasa:pds:context:instrument_host:spacecraft.mer2
-
+ urn:nasa:pds:context:instrument_host:spacecraft.mer2
is_instrument_host
@@ -57,8 +54,7 @@
Pancam
Instrument
- urn:nasa:pds:context:instrument:pancam.mer2
-
+ urn:nasa:pds:context:instrument:pancam.mer2
is_instrument
@@ -67,8 +63,7 @@
Mars
Planet
- urn:nasa:pds:context:target:planet.mars
-
+ urn:nasa:pds:context:target:planet.mars
data_to_target