diff --git a/bin/scripts/convert_falcon_to_mscluster_format.py b/bin/scripts/convert_falcon_to_mscluster_format.py index e1310f7..e6f5848 100755 --- a/bin/scripts/convert_falcon_to_mscluster_format.py +++ b/bin/scripts/convert_falcon_to_mscluster_format.py @@ -6,16 +6,120 @@ import pandas as pd +def _read_mzml_precursor_intensities(path): + """(scan -> precursor intensity) for MS2 spectra in an mzML file. + + Reads pyteomics directly so a missing optional CV term on one spectrum + (e.g. 'collision energy') does not abort parsing of the whole file. + """ + from pyteomics import mzml as _pmzml + out = {} + for spectrum in _pmzml.read(path): + if spectrum.get("ms level") != 2: + continue + scan = -1 + for tok in spectrum.get("id", "").split(): + if tok.startswith("scan="): + try: + scan = int(tok.split("=", 1)[1]) + except ValueError: + pass + elif tok.startswith("scanId="): + try: + scan = int(tok.split("=", 1)[1]) + except ValueError: + pass + if scan < 0: + continue + intensity = 0.0 + try: + selected = (spectrum["precursorList"]["precursor"][0] + ["selectedIonList"]["selectedIon"][0]) + intensity = float(selected.get("peak intensity", 0.0)) + except (KeyError, IndexError, TypeError, ValueError): + intensity = 0.0 + out[scan] = intensity + return out + + +def _read_mzxml_precursor_intensities(path): + """(scan -> precursor intensity) for MS2 spectra in an mzXML file.""" + from pyteomics import mzxml as _pmzxml + out = {} + for spectrum in _pmzxml.read(path): + if int(spectrum.get("msLevel", 0)) != 2: + continue + try: + scan = int(spectrum.get("num", -1)) + except (ValueError, TypeError): + scan = -1 + if scan < 0: + continue + intensity = 0.0 + try: + prec_list = spectrum.get("precursorMz") or [] + if prec_list: + intensity = float(prec_list[0].get("precursorIntensity", 0.0)) + except (KeyError, IndexError, TypeError, ValueError): + intensity = 0.0 + out[scan] = intensity + return out + + +def build_precursor_intensity_lookup(input_spectra_folder, needed_basenames): + """Map (basename, scan) -> precursor intensity by re-reading the input spectra. + + Falcon's CSV doesn't carry per-spectrum precursor intensity, so without + this lookup #PrecIntensity stays 0 and every cell of the + precursor-intensity feature quant table is 0. MSCluster's binary reads the + value from the input mzML directly; we do the same here, extracting the + 'peak intensity' CV term of the selected precursor ion (or + @precursorIntensity for mzXML). MGF inputs are skipped because standard + MGF does not carry precursor intensity. + """ + intensity_lookup = {} + if not input_spectra_folder or not os.path.isdir(input_spectra_folder): + print(f"WARNING: input_spectra_folder '{input_spectra_folder}' is not a directory; " + f"precursor intensities will default to 0") + return intensity_lookup + + for basename in needed_basenames: + spectra_path = os.path.join(input_spectra_folder, basename) + if not os.path.isfile(spectra_path): + print(f"WARNING: input spectrum file not found: {spectra_path}; " + f"precursor intensities for this file will default to 0") + continue + + ext = os.path.splitext(basename)[1].lower() + try: + if ext == ".mzml": + per_file = _read_mzml_precursor_intensities(spectra_path) + elif ext == ".mzxml": + per_file = _read_mzxml_precursor_intensities(spectra_path) + else: + # MGF and unknown extensions: no per-spectrum precursor + # intensity available, default to 0. + continue + except Exception as e: + print(f"WARNING: failed to read precursor intensities from {spectra_path}: {e}") + continue + + for scan_int, intensity in per_file.items(): + intensity_lookup[(basename, scan_int)] = float(intensity) + + return intensity_lookup + def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_clusterinfo, output_clustersummary, min_cluster_size=2): """ Convert falcon output to mscluster format. - + Falcon format: cluster, filename, scan, precursor_mz, retention_time, new_batch MSCluster format: #ClusterIdx, #Filename, #SpecIdx, #Scan, #ParentMass, #Charge, #RetTime, #PrecIntensity - - Note: If falcon doesn't have certain fields (like charge, intensity), we use default values (0) - instead of fetching from MGF files. + + Precursor intensity is not present in falcon's CSV, so it is looked up from + the original input spectra files (mzML/mzXML) using ming_spectrum_library. + Charge falls back to falcon's columns if available, otherwise defaults to 0. """ # Load falcon CSV clusterinfo_df = pd.read_csv(falcon_csv, sep=',', comment='#') @@ -46,11 +150,20 @@ def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_ if min_cluster_size > 1: clusterinfo_df = clusterinfo_df[clusterinfo_df['cluster'] != -1] - + + # Build (basename, scan) -> precursor intensity lookup from the original + # input spectra files. Falcon does not write precursor intensities into its + # CSV, so without this every value in the precursor-intensity feature quant + # table ends up as 0. + needed_basenames = sorted({os.path.basename(str(fn)) for fn in clusterinfo_df['filename'].unique()}) + precursor_intensity_lookup = build_precursor_intensity_lookup(input_spectra_folder, needed_basenames) + print(f"Loaded precursor intensities for {len(precursor_intensity_lookup)} spectra " + f"across {len(needed_basenames)} input files") + # Convert to mscluster format mscluster_rows = [] spec_idx_counter = 0 - + for idx, row in clusterinfo_df.iterrows(): cluster_idx = int(row['cluster']) @@ -93,7 +206,12 @@ def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_ precursor_intensity = float(row['precursor_intensity']) except (ValueError, TypeError): precursor_intensity = 0.0 - + else: + precursor_intensity = precursor_intensity_lookup.get( + (os.path.basename(filename), scan), 0.0 + ) + + # Convert retention time to seconds if it's in minutes # Falcon typically outputs RT in minutes, mscluster uses seconds if retention_time > 0 and retention_time < 1000: # Likely in minutes