Wang-Bioinformatics-Lab · XianghuWang-287 · May 21, 2026 · May 21, 2026 · May 21, 2026 · Jun 12, 2026
diff --git a/bin/scripts/convert_falcon_to_mscluster_format.py b/bin/scripts/convert_falcon_to_mscluster_format.py
@@ -6,16 +6,120 @@
 import pandas as pd
 
 
+def _read_mzml_precursor_intensities(path):
+    """(scan -> precursor intensity) for MS2 spectra in an mzML file.
+
+    Reads pyteomics directly so a missing optional CV term on one spectrum
+    (e.g. 'collision energy') does not abort parsing of the whole file.
+    """
+    from pyteomics import mzml as _pmzml
+    out = {}
+    for spectrum in _pmzml.read(path):
+        if spectrum.get("ms level") != 2:
+            continue
+        scan = -1
+        for tok in spectrum.get("id", "").split():
+            if tok.startswith("scan="):
+                try:
+                    scan = int(tok.split("=", 1)[1])
+                except ValueError:
+                    pass
+            elif tok.startswith("scanId="):
+                try:
+                    scan = int(tok.split("=", 1)[1])
+                except ValueError:
+                    pass
+        if scan < 0:
+            continue
+        intensity = 0.0
+        try:
+            selected = (spectrum["precursorList"]["precursor"][0]
+                                ["selectedIonList"]["selectedIon"][0])
+            intensity = float(selected.get("peak intensity", 0.0))
+        except (KeyError, IndexError, TypeError, ValueError):
+            intensity = 0.0
+        out[scan] = intensity
+    return out
+
+
+def _read_mzxml_precursor_intensities(path):
+    """(scan -> precursor intensity) for MS2 spectra in an mzXML file."""
+    from pyteomics import mzxml as _pmzxml
+    out = {}
+    for spectrum in _pmzxml.read(path):
+        if int(spectrum.get("msLevel", 0)) != 2:
+            continue
+        try:
+            scan = int(spectrum.get("num", -1))
+        except (ValueError, TypeError):
+            scan = -1
+        if scan < 0:
+            continue
+        intensity = 0.0
+        try:
+            prec_list = spectrum.get("precursorMz") or []
+            if prec_list:
+                intensity = float(prec_list[0].get("precursorIntensity", 0.0))
+        except (KeyError, IndexError, TypeError, ValueError):
+            intensity = 0.0
+        out[scan] = intensity
+    return out
+
+
+def build_precursor_intensity_lookup(input_spectra_folder, needed_basenames):
+    """Map (basename, scan) -> precursor intensity by re-reading the input spectra.
+
+    Falcon's CSV doesn't carry per-spectrum precursor intensity, so without
+    this lookup #PrecIntensity stays 0 and every cell of the
+    precursor-intensity feature quant table is 0. MSCluster's binary reads the
+    value from the input mzML directly; we do the same here, extracting the
+    'peak intensity' CV term of the selected precursor ion (or
+    @precursorIntensity for mzXML). MGF inputs are skipped because standard
+    MGF does not carry precursor intensity.
+    """
+    intensity_lookup = {}
+    if not input_spectra_folder or not os.path.isdir(input_spectra_folder):
+        print(f"WARNING: input_spectra_folder '{input_spectra_folder}' is not a directory; "
+              f"precursor intensities will default to 0")
+        return intensity_lookup
+
+    for basename in needed_basenames:
+        spectra_path = os.path.join(input_spectra_folder, basename)
+        if not os.path.isfile(spectra_path):
+            print(f"WARNING: input spectrum file not found: {spectra_path}; "
+                  f"precursor intensities for this file will default to 0")
+            continue
+
+        ext = os.path.splitext(basename)[1].lower()
+        try:
+            if ext == ".mzml":
+                per_file = _read_mzml_precursor_intensities(spectra_path)
+            elif ext == ".mzxml":
+                per_file = _read_mzxml_precursor_intensities(spectra_path)
+            else:
+                # MGF and unknown extensions: no per-spectrum precursor
+                # intensity available, default to 0.
+                continue
+        except Exception as e:
+            print(f"WARNING: failed to read precursor intensities from {spectra_path}: {e}")
+            continue
+
+        for scan_int, intensity in per_file.items():
+            intensity_lookup[(basename, scan_int)] = float(intensity)
+
+    return intensity_lookup
+
 
 def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_clusterinfo, output_clustersummary, min_cluster_size=2):
     """
     Convert falcon output to mscluster format.
-    
+
     Falcon format: cluster, filename, scan, precursor_mz, retention_time, new_batch
     MSCluster format: #ClusterIdx, #Filename, #SpecIdx, #Scan, #ParentMass, #Charge, #RetTime, #PrecIntensity
-
-    Note: If falcon doesn't have certain fields (like charge, intensity), we use default values (0)
-    instead of fetching from MGF files.
+
+    Precursor intensity is not present in falcon's CSV, so it is looked up from
+    the original input spectra files (mzML/mzXML) using ming_spectrum_library.
+    Charge falls back to falcon's columns if available, otherwise defaults to 0.
     """
     # Load falcon CSV
     clusterinfo_df = pd.read_csv(falcon_csv, sep=',', comment='#')
@@ -46,11 +150,20 @@ def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_
 
     if min_cluster_size > 1:
         clusterinfo_df = clusterinfo_df[clusterinfo_df['cluster'] != -1]
-
+
+    # Build (basename, scan) -> precursor intensity lookup from the original
+    # input spectra files. Falcon does not write precursor intensities into its
+    # CSV, so without this every value in the precursor-intensity feature quant
+    # table ends up as 0.
+    needed_basenames = sorted({os.path.basename(str(fn)) for fn in clusterinfo_df['filename'].unique()})
+    precursor_intensity_lookup = build_precursor_intensity_lookup(input_spectra_folder, needed_basenames)
+    print(f"Loaded precursor intensities for {len(precursor_intensity_lookup)} spectra "
+          f"across {len(needed_basenames)} input files")
+
     # Convert to mscluster format
     mscluster_rows = []
     spec_idx_counter = 0
-    
+
     for idx, row in clusterinfo_df.iterrows():
         cluster_idx = int(row['cluster'])
 
@@ -93,7 +206,12 @@ def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_
                 precursor_intensity = float(row['precursor_intensity'])
             except (ValueError, TypeError):
                 precursor_intensity = 0.0
-
+        else:
+            precursor_intensity = precursor_intensity_lookup.get(
+                (os.path.basename(filename), scan), 0.0
+            )
+
+
         # Convert retention time to seconds if it's in minutes
         # Falcon typically outputs RT in minutes, mscluster uses seconds
         if retention_time > 0 and retention_time < 1000:  # Likely in minutes