Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 125 additions & 7 deletions bin/scripts/convert_falcon_to_mscluster_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,120 @@
import pandas as pd


def _read_mzml_precursor_intensities(path):
"""(scan -> precursor intensity) for MS2 spectra in an mzML file.

Reads pyteomics directly so a missing optional CV term on one spectrum
(e.g. 'collision energy') does not abort parsing of the whole file.
"""
from pyteomics import mzml as _pmzml
out = {}
for spectrum in _pmzml.read(path):
if spectrum.get("ms level") != 2:
continue
scan = -1
for tok in spectrum.get("id", "").split():
if tok.startswith("scan="):
try:
scan = int(tok.split("=", 1)[1])
except ValueError:
pass
elif tok.startswith("scanId="):
try:
scan = int(tok.split("=", 1)[1])
except ValueError:
pass
if scan < 0:
continue
intensity = 0.0
try:
selected = (spectrum["precursorList"]["precursor"][0]
["selectedIonList"]["selectedIon"][0])
intensity = float(selected.get("peak intensity", 0.0))
except (KeyError, IndexError, TypeError, ValueError):
intensity = 0.0
out[scan] = intensity
return out


def _read_mzxml_precursor_intensities(path):
"""(scan -> precursor intensity) for MS2 spectra in an mzXML file."""
from pyteomics import mzxml as _pmzxml
out = {}
for spectrum in _pmzxml.read(path):
if int(spectrum.get("msLevel", 0)) != 2:
continue
try:
scan = int(spectrum.get("num", -1))
except (ValueError, TypeError):
scan = -1
if scan < 0:
continue
intensity = 0.0
try:
prec_list = spectrum.get("precursorMz") or []
if prec_list:
intensity = float(prec_list[0].get("precursorIntensity", 0.0))
except (KeyError, IndexError, TypeError, ValueError):
intensity = 0.0
out[scan] = intensity
return out


def build_precursor_intensity_lookup(input_spectra_folder, needed_basenames):
"""Map (basename, scan) -> precursor intensity by re-reading the input spectra.

Falcon's CSV doesn't carry per-spectrum precursor intensity, so without
this lookup #PrecIntensity stays 0 and every cell of the
precursor-intensity feature quant table is 0. MSCluster's binary reads the
value from the input mzML directly; we do the same here, extracting the
'peak intensity' CV term of the selected precursor ion (or
@precursorIntensity for mzXML). MGF inputs are skipped because standard
MGF does not carry precursor intensity.
"""
intensity_lookup = {}
if not input_spectra_folder or not os.path.isdir(input_spectra_folder):
print(f"WARNING: input_spectra_folder '{input_spectra_folder}' is not a directory; "
f"precursor intensities will default to 0")
return intensity_lookup

for basename in needed_basenames:
spectra_path = os.path.join(input_spectra_folder, basename)
if not os.path.isfile(spectra_path):
print(f"WARNING: input spectrum file not found: {spectra_path}; "
f"precursor intensities for this file will default to 0")
continue

ext = os.path.splitext(basename)[1].lower()
try:
if ext == ".mzml":
per_file = _read_mzml_precursor_intensities(spectra_path)
elif ext == ".mzxml":
per_file = _read_mzxml_precursor_intensities(spectra_path)
else:
# MGF and unknown extensions: no per-spectrum precursor
# intensity available, default to 0.
continue
except Exception as e:
print(f"WARNING: failed to read precursor intensities from {spectra_path}: {e}")
continue

for scan_int, intensity in per_file.items():
intensity_lookup[(basename, scan_int)] = float(intensity)

return intensity_lookup


def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_clusterinfo, output_clustersummary, min_cluster_size=2):
"""
Convert falcon output to mscluster format.

Falcon format: cluster, filename, scan, precursor_mz, retention_time, new_batch
MSCluster format: #ClusterIdx, #Filename, #SpecIdx, #Scan, #ParentMass, #Charge, #RetTime, #PrecIntensity

Note: If falcon doesn't have certain fields (like charge, intensity), we use default values (0)
instead of fetching from MGF files.

Precursor intensity is not present in falcon's CSV, so it is looked up from
the original input spectra files (mzML/mzXML) using ming_spectrum_library.
Charge falls back to falcon's columns if available, otherwise defaults to 0.
"""
# Load falcon CSV
clusterinfo_df = pd.read_csv(falcon_csv, sep=',', comment='#')
Expand Down Expand Up @@ -46,11 +150,20 @@ def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_

if min_cluster_size > 1:
clusterinfo_df = clusterinfo_df[clusterinfo_df['cluster'] != -1]


# Build (basename, scan) -> precursor intensity lookup from the original
# input spectra files. Falcon does not write precursor intensities into its
# CSV, so without this every value in the precursor-intensity feature quant
# table ends up as 0.
needed_basenames = sorted({os.path.basename(str(fn)) for fn in clusterinfo_df['filename'].unique()})
precursor_intensity_lookup = build_precursor_intensity_lookup(input_spectra_folder, needed_basenames)
print(f"Loaded precursor intensities for {len(precursor_intensity_lookup)} spectra "
f"across {len(needed_basenames)} input files")

# Convert to mscluster format
mscluster_rows = []
spec_idx_counter = 0

for idx, row in clusterinfo_df.iterrows():
cluster_idx = int(row['cluster'])

Expand Down Expand Up @@ -93,7 +206,12 @@ def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_
precursor_intensity = float(row['precursor_intensity'])
except (ValueError, TypeError):
precursor_intensity = 0.0

else:
precursor_intensity = precursor_intensity_lookup.get(
(os.path.basename(filename), scan), 0.0
)


# Convert retention time to seconds if it's in minutes
# Falcon typically outputs RT in minutes, mscluster uses seconds
if retention_time > 0 and retention_time < 1000: # Likely in minutes
Expand Down