From 71f32f8dea03142eecd737afe33416dc60006c18 Mon Sep 17 00:00:00 2001 From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com> Date: Tue, 4 Feb 2025 10:04:12 +0100 Subject: [PATCH 01/10] fix/refactoring: fix the AIRR code; add checks for column names; fix the formatting and argument types where possible; highlight redundant code with ToDos --- panpipes/funcs/io.py | 689 +++++++++++++++++++++++++++---------------- 1 file changed, 441 insertions(+), 248 deletions(-) diff --git a/panpipes/funcs/io.py b/panpipes/funcs/io.py index 77d74d2b..b5ab4419 100644 --- a/panpipes/funcs/io.py +++ b/panpipes/funcs/io.py @@ -1,24 +1,30 @@ -from random import sample -import warnings -import re +import gzip +import logging import os -import pandas as pd +import re import sys -import h5py -from typing import Optional, Literal -from scanpy import read_10x_mtx, read_10x_h5, read_h5ad, read_text, read_csv, read_hdf +import warnings +from os import PathLike +from pathlib import Path +from typing import Literal, Optional +import h5py import muon as mu -import logging -import scirpy as ir -from itertools import chain -from .processing import concat_adata_list, intersection -from anndata import AnnData import numpy as np -from os import PathLike +import pandas as pd +import scirpy as ir +import yaml +from anndata import AnnData +from scanpy import read_10x_h5, read_10x_mtx, read_csv, read_h5ad, read_hdf, read_text +from scipy.io import mmwrite + +from .processing import intersection -def update_cellranger_col(path, raw=False, method="count", sample_id=""): - if method =="count": + +def update_cellranger_col( + path: str | Path, raw: bool = False, method: str = "count", sample_id: str = "" +): + if method == "count": # this assumes that we get a cellranger count base path and we want file path for sc.read_10x_h5 or similar if raw: append_str = "raw_feature_bc_matrix" @@ -28,321 +34,472 @@ def update_cellranger_col(path, raw=False, method="count", sample_id=""): if raw: append_str = os.path.join("multi", "count", "raw_feature_bc_matrix") else: - append_str = os.path.join("per_sample_outs", sample_id, "count", "sample_filtered_feature_bc_matrix") + append_str = os.path.join( + "per_sample_outs", + sample_id, + "count", + "sample_filtered_feature_bc_matrix", + ) else: sys.exit("specify cellranger filetype as cellranger_count or cellranger_multi") # is there a h5 file available - if os.path.exists(os.path.join(path, append_str)) == False: + if not os.path.exists(os.path.join(path, append_str)): sys.exit("cellranger path not found: %s" % os.path.join(path, append_str)) - if os.path.exists(os.path.join(path, append_str + '.h5')): + + if os.path.exists(os.path.join(path, append_str + ".h5")): append_str = append_str + ".h5" filetype = "10X_h5" else: filetype = "cellranger" + path = os.path.join(path, append_str) + return path, filetype -def gen_load_anndata_jobs(caf, load_raw=False, mode_dictionary = {}, load_prot_from_raw=False): +def gen_load_anndata_jobs( + caf: pd.DataFrame, + load_raw: bool = False, + mode_dictionary: Optional[dict] = None, + load_prot_from_raw: bool = False, +): + if mode_dictionary is None: + mode_dictionary = {} # not the best, dirty hack to make it compatible with code + """ Generate a load_adatas job for each line in submission.txt """ for nn in range(0, caf.shape[0]): - if pd.isna(caf['rna_path'][nn]): - rna_path= None - rna_filetype=None - elif caf['rna_filetype'][nn]=="cellranger" and mode_dictionary["rna"]: - rna_path, rna_filetype = update_cellranger_col(caf['rna_path'][nn], raw=load_raw, method="count") - elif caf['rna_filetype'][nn]=="cellranger_multi" and mode_dictionary["rna"]: - rna_path, rna_filetype = update_cellranger_col(caf['rna_path'][nn], raw=load_raw, method="multi", - sample_id=caf['sample_id'][nn]) + if pd.isna(caf["rna_path"][nn]): + rna_path = None + rna_filetype = None + elif caf["rna_filetype"][nn] == "cellranger" and mode_dictionary["rna"]: + rna_path, rna_filetype = update_cellranger_col( + caf["rna_path"][nn], raw=load_raw, method="count" + ) + elif caf["rna_filetype"][nn] == "cellranger_multi" and mode_dictionary["rna"]: + rna_path, rna_filetype = update_cellranger_col( + caf["rna_path"][nn], + raw=load_raw, + method="multi", + sample_id=caf["sample_id"][nn], + ) else: - rna_path, rna_filetype = caf[['rna_path', "rna_filetype"]].iloc[nn] + rna_path, rna_filetype = caf[["rna_path", "rna_filetype"]].iloc[nn] if load_raw: rna_path = re.sub("filtered", "raw", rna_path) + # manage the prot paths - if ('prot_path' in caf.columns and mode_dictionary["prot"]): + if "prot_path" in caf.columns and mode_dictionary["prot"]: # check if its the same as the rna path (data in the same file) - if pd.isna(caf['prot_path'][nn]): - prot_path= None - prot_filetype=None - elif caf['prot_path'][nn] == caf['rna_path'][nn]: + if pd.isna(caf["prot_path"][nn]): + prot_path = None + prot_filetype = None + elif caf["prot_path"][nn] == caf["rna_path"][nn]: prot_path, prot_filetype = rna_path, rna_filetype - elif caf['prot_filetype'][nn]=="cellranger": - # we might want to load the raw here because we want to then subset by good gex (rna) barcodes, + elif caf["prot_filetype"][nn] == "cellranger": + # we might want to load the raw here because we want to then subset by good gex (rna) barcodes, # this is why the load_prot_from_raw argument exists - prot_path, prot_filetype = update_cellranger_col(caf['prot_path'][nn], raw=load_prot_from_raw) - elif caf['prot_filetype'][nn]=="cellranger_multi": + prot_path, prot_filetype = update_cellranger_col( + caf["prot_path"][nn], raw=load_prot_from_raw + ) + elif caf["prot_filetype"][nn] == "cellranger_multi": # celranger multi has the same prot and gex (rna) barcodes - prot_path, prot_filetype = update_cellranger_col(caf['prot_path'][nn], raw=load_raw, method="multi", - sample_id=caf['sample_id'][nn]) + prot_path, prot_filetype = update_cellranger_col( + caf["prot_path"][nn], + raw=load_raw, + method="multi", + sample_id=caf["sample_id"][nn], + ) else: - prot_path, prot_filetype = caf[['prot_path', "prot_filetype"]].iloc[nn] + prot_path, prot_filetype = caf[["prot_path", "prot_filetype"]].iloc[nn] if load_prot_from_raw or load_raw: prot_path = re.sub("filtered", "raw", prot_path) else: - prot_path= None - prot_filetype=None + prot_path = None + prot_filetype = None + # load tcr_path - if 'tcr_path' in caf.columns and mode_dictionary["tcr"] and pd.notna(caf['tcr_path'][nn]): - tcr_path = caf['tcr_path'][nn] - tcr_filetype = caf['tcr_filetype'][nn] + if ( + "tcr_path" in caf.columns + and mode_dictionary["tcr"] + and pd.notna(caf["tcr_path"][nn]) + ): + tcr_path = caf["tcr_path"][nn] + tcr_filetype = caf["tcr_filetype"][nn] else: - tcr_path= None - tcr_filetype=None - if 'bcr_path' in caf.columns and mode_dictionary["bcr"] and pd.notna(caf['bcr_path'][nn]): - bcr_path = caf['bcr_path'][nn] - bcr_filetype = caf['bcr_filetype'][nn] + tcr_path = None + tcr_filetype = None + + # load bcr_path + if ( + "bcr_path" in caf.columns + and mode_dictionary["bcr"] + and pd.notna(caf["bcr_path"][nn]) + ): + bcr_path = caf["bcr_path"][nn] + bcr_filetype = caf["bcr_filetype"][nn] else: - bcr_path= None - bcr_filetype=None - if ('atac_path' in caf.columns and mode_dictionary["atac"]): + bcr_path = None + bcr_filetype = None + + # load atac_path + if "atac_path" in caf.columns and mode_dictionary["atac"]: if caf.shape[0] > 1: - sys.exit("You can only submit one atac/multiome file at a time. To aggregate, see cellranger aggr.") - if caf['atac_filetype'][nn]=="cellranger" : - atac_path, atac_filetype = update_cellranger_col(caf['atac_path'][nn], raw=load_raw, method="count") + sys.exit( + "You can only submit one atac/multiome file at a time. To aggregate, see cellranger aggr." + ) + if caf["atac_filetype"][nn] == "cellranger": + atac_path, atac_filetype = update_cellranger_col( + caf["atac_path"][nn], raw=load_raw, method="count" + ) else: - atac_path = caf['atac_path'][nn] - atac_filetype = caf['atac_filetype'][nn] - if 'fragments_file' in caf.columns and pd.notna(caf['fragments_file'][nn]): - fragments_file = caf['fragments_file'][nn] + atac_path = caf["atac_path"][nn] + atac_filetype = caf["atac_filetype"][nn] + if "fragments_file" in caf.columns and pd.notna(caf["fragments_file"][nn]): + fragments_file = caf["fragments_file"][nn] else: fragments_file = None - if 'per_barcode_metrics_file' in caf.columns and pd.notna(caf['per_barcode_metrics_file'][nn]): - per_barcode_metrics_file = caf['per_barcode_metrics_file'][nn] + if "per_barcode_metrics_file" in caf.columns and pd.notna( + caf["per_barcode_metrics_file"][nn] + ): + per_barcode_metrics_file = caf["per_barcode_metrics_file"][nn] else: per_barcode_metrics_file = None - if 'peak_annotation_file' in caf.columns and pd.notna(caf['peak_annotation_file'][nn]): - peak_annotation_file = caf['peak_annotation_file'][nn] + if "peak_annotation_file" in caf.columns and pd.notna( + caf["peak_annotation_file"][nn] + ): + peak_annotation_file = caf["peak_annotation_file"][nn] else: peak_annotation_file = None else: - atac_path= None - atac_filetype=None + atac_path = None + atac_filetype = None fragments_file = None peak_annotation_file = None per_barcode_metrics_file = None - - if 'barcode_mtd_path' in caf.columns: - cell_mtd_path = caf['barcode_mtd_path'][nn] + + if "barcode_mtd_path" in caf.columns: + cell_mtd_path = caf["barcode_mtd_path"][nn] else: cell_mtd_path = None - # create the output file - outfile = "./tmp/" + caf['sample_id'][nn] + + # create the output file + outfile = "./tmp/" + caf["sample_id"][nn] if load_raw: outfile = outfile + "_raw.h5mu" else: outfile = outfile + ".h5mu" - sample_id = caf['sample_id'][nn] - yield rna_path, outfile, \ - sample_id, \ - rna_filetype, \ - prot_path, prot_filetype, \ - tcr_path, tcr_filetype, \ - bcr_path, bcr_filetype, \ - atac_path, atac_filetype, \ - fragments_file, per_barcode_metrics_file, peak_annotation_file, \ - cell_mtd_path - - -def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): + + sample_id = caf["sample_id"][nn] + + yield ( + rna_path, + outfile, + sample_id, + rna_filetype, + prot_path, + prot_filetype, + tcr_path, + tcr_filetype, + bcr_path, + bcr_filetype, + atac_path, + atac_filetype, + fragments_file, + per_barcode_metrics_file, + peak_annotation_file, + cell_mtd_path, + ) + + +def gen_load_spatial_jobs( + caf: pd.DataFrame, mode_dictionary: Optional[dict] = None, load_raw: bool = True +): """ Generate a load_spatial job for each line in submission.txt """ + + if mode_dictionary is None: + mode_dictionary = {} + for nn in range(0, caf.shape[0]): if "spatial_path" in caf.columns and mode_dictionary["spatial"]: if pd.isna(caf["spatial_path"][nn]): - spatial_path= None + spatial_path = None spatial_filetype = None else: spatial_path = caf["spatial_path"][nn] - if caf['spatial_filetype'][nn]=="vizgen": - spatial_filetype = caf['spatial_filetype'][nn] - #path, counts and metadata are mandatory + if caf["spatial_filetype"][nn] == "vizgen": + spatial_filetype = caf["spatial_filetype"][nn] + + # path, counts and metadata are mandatory if pd.notna(caf["spatial_counts"][nn]): - spatial_counts= caf["spatial_counts"][nn] + spatial_counts = caf["spatial_counts"][nn] else: spatial_counts = None + if pd.notna(caf["spatial_metadata"][nn]): spatial_metadata = caf["spatial_metadata"][nn] - else: + else: spatial_metadata = None - #transformation is optional + + # transformation is optional if pd.notna(caf["spatial_transformation"][nn]): spatial_transformation = caf["spatial_transformation"][nn] else: spatial_transformation = None - elif caf['spatial_filetype'][nn]=="visium": - spatial_metadata= None + + elif caf["spatial_filetype"][nn] == "visium": + spatial_metadata = None spatial_transformation = None - spatial_filetype = caf['spatial_filetype'][nn] + spatial_filetype = caf["spatial_filetype"][nn] + if pd.notna(caf["spatial_counts"][nn]): - spatial_counts= caf["spatial_counts"][nn] + spatial_counts = caf["spatial_counts"][nn] else: - spatial_counts = None + spatial_counts = None else: - spatial_path= None + spatial_path = None spatial_filetype = None spatial_counts = None spatial_metadata = None spatial_transformation = None - - if 'barcode_mtd_path' in caf.columns: - cell_mtd_path = caf['barcode_mtd_path'][nn] #not yielding this right now! + + # ToDo: cell_mtd_path is not used in this function below + if "barcode_mtd_path" in caf.columns: + cell_mtd_path = caf["barcode_mtd_path"][nn] # not yielding this right now! else: cell_mtd_path = None - # create the output file - outfile = "./tmp/" + caf['sample_id'][nn] + + # create the output file + outfile = "./tmp/" + caf["sample_id"][nn] + if load_raw: - outfile = outfile + "_raw.h5mu" + outfile = outfile + "_raw.h5mu" else: outfile = outfile + ".h5mu" - sample_id = caf['sample_id'][nn] - yield spatial_path, outfile, \ - sample_id, spatial_filetype, spatial_counts, spatial_metadata, spatial_transformation + + sample_id = caf["sample_id"][nn] + + yield ( + spatial_path, + outfile, + sample_id, + spatial_filetype, + spatial_counts, + spatial_metadata, + spatial_transformation, + ) def read_anndata( fname: Optional[str] = None, - use_muon: Optional[bool] = False, - modality: Literal['all', 'rna', 'prot','atac','rep'] = 'all'): + use_muon: Optional[bool] = False, + modality: Literal["all", "rna", "prot", "atac", "rep"] = "all", +): logging.info("reading %s" % fname) + + if fname is None: + sys.exit("anndata file not found; `fname` has not been passed/") + # check fname file exists try: os.path.isfile(fname) except FileNotFoundError: - sys.exit("anndata file not found") + sys.exit(f"anndata file not found: {fname}") + if use_muon is False: return read_h5ad(fname) else: logging.info("reading %s modality" % modality) - if modality=="all": + + if modality == "all": # note this just loads the rna part of the muon object return mu.read(fname) - if modality=="rna": + + if modality == "rna": # note this just loads the rna part of the muon object return mu.read(fname + "/rna") - elif modality=="prot": + elif modality == "prot": return mu.read(fname + "/prot") - elif modality=="atac": + elif modality == "atac": return mu.read(fname + "/atac") - elif modality=="rep": + elif modality == "rep": return mu.read(fname + "/rep") - + else: - sys.exit("modality not found, must be one of 'all', 'rna', 'prot','atac', 'rep' ") + sys.exit( + f"modality {modality} not found, must be one of 'all', 'rna', 'prot','atac', 'rep' " + ) -def write_anndata(adata, - fname, - use_muon=False, - modality: Literal['all', 'rna', 'prot'] = 'all'): +def write_anndata( + adata: AnnData, + fname: str, + use_muon: bool = False, + modality: Literal["all", "rna", "prot"] = "all", +): if use_muon is False: adata.write(fname) else: - if modality=="rna": + if modality == "rna": # note this just saves the rna part of the muon object return mu.write(fname + "/rna", adata) - if modality=="prot": + if modality == "prot": # note this just save the prot part of the muon object return mu.write(fname + "/prot", adata) - elif modality=="all": + elif modality == "all": return mu.write(fname, adata) else: sys.exit("modality not found, must be one of 'all', 'rna', 'prot'") + # --- loading raw dat -def write_obs(mdata, output_prefix="./", output_suffix="_filtered_cell_metadata.tsv"): - metafile = mdata.obs - metafile["cellbarcode"] = mdata.obs.index - savename= output_prefix + output_suffix - metafile.to_csv(savename, sep='\t', index=True) -def check_submission_file(caf): +def write_obs( + mdata: AnnData, + output_prefix: str = "./", + output_suffix: str = "_filtered_cell_metadata.tsv", +): + metafile = mdata.obs + metafile["cellbarcode"] = mdata.obs.index + savename = output_prefix + output_suffix + metafile.to_csv(savename, sep="\t", index=True) + + +def check_submission_file(caf: pd.DataFrame): + """ + Fixing the column names in the input data frame in case formatting is off. + """ + old_columns = caf.columns + caf.rename( + columns={col_name: col_name.strip() for col_name in caf.columns}, inplace=True + ) + caf.replace("^ +| +$", "", regex=True, inplace=True) + logging.info( + f"Formatted columns to remove spaces. Changes: {old_columns} -> {caf.columns}", + ) + if "filetype" in caf.columns or "path" in caf.columns: - raise ValueError("you appear to be using the old notation for the sample submission file, \ - please update to use rna_path instead of path and rna_filetype instead of filetype") + raise ValueError( + "you appear to be using the old notation for the sample submission file, \ + please update to use rna_path instead of path and rna_filetype instead of filetype" + ) + # check for required cols - req_cols = ['sample_id', 'rna_path', 'rna_filetype'] + req_cols = ["sample_id", "rna_path", "rna_filetype"] for rc in req_cols: if rc not in caf.columns: raise ValueError("required column %s missing from submission file" % rc) - -def check_filetype(path, filetype): +def check_filetype( + path: str, + filetype: Literal[ + "cellranger", + "h5ad", + "csv_matrix", + "txt_matrix", + "10X_h5", + "hdf", + "cellranger_vdj", + "vizgen", + ], +): logging.debug(path) logging.debug(filetype) - if filetype == "cellranger" : + + if filetype == "cellranger": if os.path.exists(path + "/matrix.mtx.gz"): logging.info("cellranger matrix found") elif os.path.exists(path + "/matrix.mtx"): logging.info("cellranger matrix found") else: - sys.exit("cellranger matrix file not found, have you specified the path correctly") + sys.exit( + "cellranger matrix file not found, have you specified the path correctly" + ) else: ftype_checks = { - "h5ad":r".h5ad$", + "h5ad": r".h5ad$", "csv_matrix": ".csv", - "txt_matrix" : ".txt", + "txt_matrix": ".txt", "10X_h5": ".h5", "hdf": ".h5", "cellranger_vdj": r".json|.csv", - "vizgen": r".txt|.csv|.tsv" #suboptimal for now but roll with it + "vizgen": r".txt|.csv|.tsv", # suboptimal for now but roll with it } if filetype not in ftype_checks.keys(): - sys.exit("unknown filetype %s, please specify one of: %s" % filetype, ftype_checks.keys().join(", ")) + sys.exit( + f"unknown filetype {filetype}, please specify one of: {list(ftype_checks.keys())}" + ) elif re.search(ftype_checks[filetype], path) is None: - sys.exit("filetype does not match file suffix, check it is specified correctly \ - (with %s as suffix (gzipped files ok for csv and tsv))" % ftype_checks[filetype]) - elif os.path.exists(path)==False: + sys.exit( + "filetype does not match file suffix, check it is specified correctly \ + (with %s as suffix (gzipped files ok for csv and tsv))" + % ftype_checks[filetype] + ) + elif not os.path.exists(path): sys.exit("file %s does not exist" % path) else: logging.info("%s file exists" % path) def read_scirpy( - fname, - filetype: Literal['cellranger_vdj', 'tracer', 'bracer', 'airr' ] = 'cellranger_vdj'): + fname: str, + filetype: Literal["cellranger_vdj", "tracer", "bracer", "airr"] = "cellranger_vdj", +): """ wrapper around all the scirpy io functions """ # check fname file exists logging.info("loading repertoire using scirpy, filetype %s" % filetype) + try: os.path.isfile(fname) except FileNotFoundError: sys.exit("vdj file not found: %s" % fname) - if filetype == "cellranger_vdj": + + if filetype == "cellranger_vdj": vdj = ir.io.read_10x_vdj(fname) elif filetype == "tracer": vdj = ir.io.read_tracer(fname) - elif filetype == "bracer": + elif filetype == "bracer": vdj = ir.io.read_bracer(fname) - elif filetype == "airr": + elif filetype == "airr": vdj = ir.io.read_airr(fname) else: - sys.exit("fileype not valid, must be one of 'cellranger_vdj', 'tracer', 'bracer', 'airr'") + sys.exit( + f"filetype {filetype} not valid, must be one of 'cellranger_vdj', 'tracer', 'bracer', 'airr'" + ) return vdj -def scp_read_10x_h5(filename: PathLike, library_keep=None, extended= False, *args, **kwargs) -> AnnData: +def scp_read_10x_h5( + filename: PathLike, library_keep=None, extended: bool = False, *args, **kwargs +) -> AnnData: """ - expanded sc.read_10x_h5 to filter for the library + expanded sc.read_10x_h5 to filter for the library adapted from https://github.com/scverse/muon/blob/master/muon/_prot/io.py """ adata = read_10x_h5(filename, gex_only=False, *args, **kwargs) + logging.debug(adata) logging.debug("filtering cellranger outputs to %s" % library_keep) + if extended is not None and extended is True: import shutil - logging.debug("copying %s" % filename ) + + logging.debug("copying %s" % filename) + h5file = h5py.File(filename, "r") + logging.debug("reading intervals") + if "interval" in h5file["matrix"]["features"]: intervals = np.array(h5file["matrix"]["features"]["interval"]).astype(str) - logging.debug("intervals %s" %intervals) + logging.debug("intervals %s" % intervals) adata.var["interval"] = intervals logging.debug(f"Added `interval` annotation for features from {filename}") logging.debug("updated adata %s" % adata) @@ -353,130 +510,156 @@ def scp_read_10x_h5(filename: PathLike, library_keep=None, extended= False, *arg h5file.close() else: logging.debug("skipping extended arg") + if library_keep is not None: - adata = adata[:, list(map(lambda x: x == library_keep, adata.var["feature_types"]))].copy() + adata = adata[ + :, list(map(lambda x: x == library_keep, adata.var["feature_types"])) + ].copy() + logging.debug("filtered anndata size %s" % adata) - - return adata + return adata def scp_read_10x_mtx(filename: PathLike, library_keep=None, *args, **kwargs) -> AnnData: """ - expanded sc.read_10x_mtx to filter for the library + expanded sc.read_10x_mtx to filter for the library adapted from https://github.com/scverse/muon/blob/master/muon/_prot/io.py """ - adata = read_10x_mtx(filename, gex_only=False, *args, **kwargs) #need to leave gex as this is scanpy's code + adata = read_10x_mtx( + filename, gex_only=False, *args, **kwargs + ) # need to leave gex as this is scanpy's code + logging.debug("filtering cellranger outputs to %s" % library_keep) + if library_keep is not None: adata = adata[ :, list(map(lambda x: x == library_keep, adata.var["feature_types"])) ].copy() + return adata -#---------------- + +# ---------------- # temp loading functions for spatial data # load_Spatial_in # load_spatial_from_multiple_files # ->>>how to write? -#---------------- - +# ---------------- - - -def load_adata_in(path, filetype, gex_only=True, var_names="gene_symbols", library=None, extended=False): +def load_adata_in( + path: str, + filetype: str, + gex_only: bool = True, # ToDo: it not used anywhere in the function + var_names: str = "gene_symbols", + library=None, + extended: bool = False, +): """ load in any format supported by the load_functions_dict expected to be called by load_mdata_from_multiple_files """ # all the supported loading functions here load_functions_dict = { - "cellranger": lambda x: scp_read_10x_mtx(x, - library_keep=library, - var_names=var_names, - cache=False), - "h5ad":read_h5ad, - "csv_matrix":read_csv, - "txt_matrix": read_text, - "10X_h5": lambda x: scp_read_10x_h5(x, library_keep=library, extended=extended), - "hd5": read_hdf, - # vdj functions - "cellranger_vdj": lambda x: read_scirpy(x, filetype="cellranger_vdj"), - "tracer":lambda x: read_scirpy(x, filetype="tracer"), - "bracer":lambda x: read_scirpy(x, filetype="bracer"), - "airr":lambda x: read_scirpy(x, filetype="airr"), - } - + "cellranger": lambda x: scp_read_10x_mtx( + x, library_keep=library, var_names=var_names, cache=False + ), + "h5ad": read_h5ad, + "csv_matrix": read_csv, + "txt_matrix": read_text, + "10X_h5": lambda x: scp_read_10x_h5(x, library_keep=library, extended=extended), + "hd5": read_hdf, + # vdj functions + "cellranger_vdj": lambda x: read_scirpy(x, filetype="cellranger_vdj"), + "tracer": lambda x: read_scirpy(x, filetype="tracer"), + "bracer": lambda x: read_scirpy(x, filetype="bracer"), + "airr": lambda x: read_scirpy(x, filetype="airr"), + } # the var_names argument decides which column of the cellranger features files ends up as the index. - if var_names=="gene_symbols": - col_fill="gene_ids" - elif var_names=="gene_ids": - col_fill="gene_symbols" + if var_names == "gene_symbols": + col_fill = "gene_ids" + elif var_names == "gene_ids": + col_fill = "gene_symbols" else: - col_fill="index" + col_fill = "index" # try: logging.debug("loading %s " % filetype) + adata = load_functions_dict[filetype](path) + logging.debug("this is the anndata now: %s" % adata) + # in some cases you need to update the var index col. if var_names in adata.var.columns: logging.info("resetting var.index") # we need to reset the index to be the expected one. - adata.var = adata.var.reset_index().rename(columns={"index":col_fill}).set_index(var_names) + adata.var = ( + adata.var.reset_index() + .rename(columns={"index": col_fill}) + .set_index(var_names) + ) + return adata # except: - # raise ValueError("unknown filetype %s, please specify one of: %s" % filetype, ", ".join(list(load_functions_dict.keys()))) - + # raise ValueError("unknown filetype %s, please specify one of: %s" % filetype, ", ".join(list(load_functions_dict.keys()))) -def update_intersecting_feature_names(adata1, adata2, prefix): +def update_intersecting_feature_names(adata1: AnnData, adata2: AnnData, prefix: str): # check for intersecting names - if (len(adata1.var_names) > 0 and len(adata2.var_names) > 0 ): - intersect_features = intersection(adata1.var_names.tolist(), adata2.var_names.tolist()) + if len(adata1.var_names) > 0 and len(adata2.var_names) > 0: + intersect_features = intersection( + adata1.var_names.tolist(), adata2.var_names.tolist() + ) if len(intersect_features) > 0: - # this will be a problem downstream, so we prefix the - warnings.warn("intersecting gene ids and other modalities are problematic downstream, \ - prefixing with %s" % prefix) - adata2.var[ prefix + 'id'] = adata2.var_names + # this will be a problem downstream, so we prefix the + warnings.warn( + "intersecting gene ids and other modalities are problematic downstream, \ + prefixing with %s" + % prefix + ) + adata2.var[prefix + "id"] = adata2.var_names adata2.var_names = [prefix + x for x in adata2.var_names.tolist()] + return adata2 def merge_tcr_bcr_into_one_anndata(tcr, bcr): logging.info("merging tcr and bcr into one rep modality") intersect_obs = list(set(bcr.obs_names).union(set(tcr.obs_names))) - adata = AnnData(X=np.empty(shape=(len(intersect_obs), 0)), - obs=pd.DataFrame(index=intersect_obs)) - # merge in the tcr and bcr with the rna - ir.pp.merge_airr_chains(adata , tcr) - ir.pp.merge_airr_chains(adata , bcr) + adata = AnnData( + X=np.empty(shape=(len(intersect_obs), 0)), obs=pd.DataFrame(index=intersect_obs) + ) + + # merge in the tcr and bcr with the rna + ir.pp.merge_airr(adata, tcr) + ir.pp.merge_airr(adata, bcr) ir.tl.chain_qc(adata) return adata - -def _make_one_rep_modality(data_dict): +def _make_one_rep_modality(data_dict: dict[AnnData]): """ takes the data dict from load_mdata_from_multiple_files, and collapses down the bcr and/or tcr into one modality called rep """ - if ("tcr" in data_dict.keys() and "bcr" in data_dict.keys()): + if "tcr" in data_dict.keys() and "bcr" in data_dict.keys(): # get the intersecting obs - data_dict['rep'] = merge_tcr_bcr_into_one_anndata(data_dict['tcr'], data_dict['bcr']) + data_dict["rep"] = merge_tcr_bcr_into_one_anndata( + data_dict["tcr"], data_dict["bcr"] + ) # remove separate assays - data_dict.pop('tcr', None) - data_dict.pop('bcr', None) - elif("tcr" in data_dict.keys()): + data_dict.pop("tcr", None) + data_dict.pop("bcr", None) + elif "tcr" in data_dict.keys(): logging.info("storing tcr in rep modality") - data_dict['rep'] = data_dict['tcr'] - data_dict.pop('tcr', None) - elif("bcr" in data_dict.keys()): + data_dict["rep"] = data_dict["tcr"] + data_dict.pop("tcr", None) + elif "bcr" in data_dict.keys(): logging.info("storing bcr in rep modality") - data_dict['rep'] = data_dict['bcr'] - data_dict.pop('bcr', None) - + data_dict["rep"] = data_dict["bcr"] + data_dict.pop("bcr", None) def load_mdata_from_multiple_files(all_files_dict): @@ -488,11 +671,11 @@ def load_mdata_from_multiple_files(all_files_dict): dictionary containing one key per assay out of [RNA, PROT, TCR, BCR, ATAC] and the values for each key is a list of file path and fie type e.g. {"RNA": [filepath, "filetype"], - "PROT: [file path, "filetype"]} + "PROT: [file path, "filetype"]} Filetypes supported for RNA/prot: ["cellranger", "h5ad", "csv_matrix", "txt_matrix", "10X_h5"], Filetypes supported for atac (multiome preferred is 10X_h5) ["10X_h5","cellranger","h5ad"] - Filetypes supported for rep: ["cellranger_vdj", "airr", "tracer", "bracer" ] - See scirpy documentation for more information of repertoire input formats + Filetypes supported for rep: ["cellranger_vdj", "airr", "tracer", "bracer" ] + See scirpy documentation for more information of repertoire input formats https://scverse.org/scirpy/latest/api.html#module-scirpy.io """ # convert names to match mudata conventions @@ -505,47 +688,50 @@ def load_mdata_from_multiple_files(all_files_dict): data_dict = {} for nm, x in all_files_dict.items(): logging.info("loading %s" % nm) - extra_args={} + extra_args = {} if nm == "rna": - extra_args['library'] = "Gene Expression" - if 'atac' in all_files_dict.keys(): - extra_args["extended"]= True + extra_args["library"] = "Gene Expression" + if "atac" in all_files_dict.keys(): + extra_args["extended"] = True else: - extra_args["extended"]= False + extra_args["extended"] = False if nm == "prot": extra_args["gex_only"] = False extra_args["var_names"] = "gene_ids" - extra_args['library'] = "Antibody Capture" - extra_args["extended"]= False + extra_args["library"] = "Antibody Capture" + extra_args["extended"] = False if nm == "atac": extra_args["gex_only"] = False - extra_args['library'] = "Peaks" - extra_args['extended'] = False + extra_args["library"] = "Peaks" + extra_args["extended"] = False if nm == "spatial": - extra_args["gex_only"] = True # check this for techs other than merfish and visium H&E - #extra_args["counts_file"] = - extra_args['extended'] = False + extra_args["gex_only"] = ( + True # check this for techs other than merfish and visium H&E + ) + # extra_args["counts_file"] = + extra_args["extended"] = False logging.debug("extra args") logging.debug(extra_args) - data_dict[nm] = load_adata_in(x[0], x[1], **extra_args) #** - #x[0] is the path, x[1] is the filetype + data_dict[nm] = load_adata_in(x[0], x[1], **extra_args) # ** + # x[0] is the path, x[1] is the filetype logging.debug(data_dict["rna"]) logging.debug(data_dict.keys()) # we want unique var names for each assay if "rna" in data_dict.keys(): logging.debug("rna adata %s " % data_dict["rna"]) - data_dict['rna'].var_names_make_unique() + data_dict["rna"].var_names_make_unique() for mod in data_dict.keys(): - if mod =="rna": + if mod == "rna": # skip since we want to update all the ones that are not rna pass else: logging.debug("second adata %s " % data_dict[mod]) logging.debug("updating intersection for %s" % mod) - data_dict[mod] = update_intersecting_feature_names(adata1=data_dict['rna'], adata2=data_dict[mod], - prefix=mod + "_") - #make repertoire data into one assay + data_dict[mod] = update_intersecting_feature_names( + adata1=data_dict["rna"], adata2=data_dict[mod], prefix=mod + "_" + ) + # make repertoire data into one assay _make_one_rep_modality(data_dict) logging.debug(data_dict) @@ -556,9 +742,7 @@ def load_mdata_from_multiple_files(all_files_dict): return mdata_ -from scipy.io import mmwrite -import gzip -def write_10x_counts(adata, path, layer=None): +def write_10x_counts(adata: AnnData, path: str, layer=None): if layer is None: arr = adata.X else: @@ -570,12 +754,17 @@ def write_10x_counts(adata, path, layer=None): arr = arr.astype(int) # write matrix.mtx mmwrite(os.path.join(path, "matrix.mtx"), arr.T) - with open(os.path.join(path, "matrix.mtx"), 'rb') as f_in, gzip.open(os.path.join(path, "matrix.mtx.gz"), 'wb') as f_out: + with ( + open(os.path.join(path, "matrix.mtx"), "rb") as f_in, + gzip.open(os.path.join(path, "matrix.mtx.gz"), "wb") as f_out, + ): f_out.writelines(f_in) os.remove(os.path.join(path, "matrix.mtx")) # write barcodes file barcodes = adata.obs_names.to_frame() - barcodes.to_csv(os.path.join(path, "barcodes.tsv.gz"), sep='\t', index=None, header=None) + barcodes.to_csv( + os.path.join(path, "barcodes.tsv.gz"), sep="\t", index=None, header=None + ) # write features file (trying to make sure the correct columns are stored) if "gene_ids" in adata.var.columns: # we assume the index is gene_ids @@ -583,9 +772,14 @@ def write_10x_counts(adata, path, layer=None): elif "gene_symbols" in adata.var.columns: features = adata.var.reset_index().rename(columns={"index": "gene_ids"}) if "prot_id" in features.columns: - features = features.drop(columns=['gene_symbols']).rename(columns={"prot_id" : "gene_symbols"}) - features = features[['gene_ids','gene_symbols', 'feature_types']] - features.to_csv(os.path.join(path, "features.tsv.gz"), sep='\t', index=None, header=None) + features = features.drop(columns=["gene_symbols"]).rename( + columns={"prot_id": "gene_symbols"} + ) + features = features[["gene_ids", "gene_symbols", "feature_types"]] + features.to_csv( + os.path.join(path, "features.tsv.gz"), sep="\t", index=None, header=None + ) + # https://stackoverflow.com/questions/33529312/remove-empty-dicts-in-nested-dictionary-with-recursive-function def dictionary_stripper(data): @@ -612,10 +806,10 @@ def replace_string_nones(d): elif v == "None": d[k] = None -import yaml -def read_yaml(fname): + +def read_yaml(fname: str | dict): if type(fname) is dict: - params=fname + params = fname elif os.path.exists(fname): with open(fname, "r") as stream: try: @@ -626,7 +820,7 @@ def read_yaml(fname): # if the yaml is being passed as a string try: params = yaml.safe_load(fname) - + except yaml.YAMLError as exc: print(exc) replace_string_nones(params) @@ -641,4 +835,3 @@ def read_yaml(fname): # "scanvi": lambda x: scvi.model.SCANVI.load_query_data, # } - From bb5ae89c565db6b67c53f449ac6cf09bf7cb2e32 Mon Sep 17 00:00:00 2001 From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com> Date: Tue, 4 Feb 2025 10:04:50 +0100 Subject: [PATCH 02/10] add scirpy version for pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2dc39d84..918cc2e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,7 @@ dependencies = [ "scib", "scib-metrics", "scikit-misc", - "scirpy", + "scirpy>=0.19.0", "scrublet", "scvi-tools", "sqlalchemy", @@ -82,9 +82,9 @@ refmap_old = [ panpipes = "panpipes:entry.main" [tool.setuptools] +packages = ["panpipes"] package-data = {"panpipes" = ["panpipes/*/*.yml", "R_scripts/*.R", "resources/*"]} - [build-system] requires = ["setuptools>=43.0.0", "wheel"] build-backend = "setuptools.build_meta" From 8c229bea5dda71bc6e539edf2c1774a5c3c1ce5d Mon Sep 17 00:00:00 2001 From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com> Date: Tue, 4 Feb 2025 10:17:52 +0100 Subject: [PATCH 03/10] refactoring: style + more verbose and detailed errros in case of file issues --- panpipes/python_scripts/run_scanpyQC_rna.py | 223 ++++++++++++-------- 1 file changed, 138 insertions(+), 85 deletions(-) diff --git a/panpipes/python_scripts/run_scanpyQC_rna.py b/panpipes/python_scripts/run_scanpyQC_rna.py index b1fca50d..7e063844 100644 --- a/panpipes/python_scripts/run_scanpyQC_rna.py +++ b/panpipes/python_scripts/run_scanpyQC_rna.py @@ -1,64 +1,67 @@ -''' +""" scanpy QC script RNA order of QC: - RNA - PROT - Repertoire - ATAC -''' -import sys -import logging -L = logging.getLogger() -L.setLevel(logging.INFO) -log_handler = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s') -log_handler.setFormatter(formatter) -L.addHandler(log_handler) +""" +import argparse +import logging +import os +import sys import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) +import muon as mu import pandas as pd -import os -import argparse import scanpy as sc -import muon as mu from panpipes.funcs.io import write_obs +L = logging.getLogger() +L.setLevel(logging.INFO) +log_handler = logging.StreamHandler(sys.stdout) +formatter = logging.Formatter("%(asctime)s: %(levelname)s - %(message)s") +log_handler.setFormatter(formatter) +L.addHandler(log_handler) + +warnings.simplefilter(action="ignore", category=FutureWarning) parser = argparse.ArgumentParser() # required option -parser.add_argument("--sampleprefix", - default="", - help="prefix to prepend when saving the metadata file") -parser.add_argument("--input_anndata", - default="adata_unfilt.h5ad", - help="") -parser.add_argument("--outfile", - default="adata_unfilt.h5ad", - help="") -parser.add_argument("--figdir", - default="./figures/", - help="path to save the figures to") -parser.add_argument("--figure_suffix", - default="_qc-plot.png", - help="figures filename suffix to be appended to figures/umap") -parser.add_argument("--scrubletdir", - default=None, - help="path to save the figures to") -parser.add_argument("--ccgenes", - default=None, - help="path to file containing cell cycle genes") -parser.add_argument("--customgenesfile", - default=None, - help="path to file containing list of genes to quantify") -parser.add_argument("--calc_proportions", - default="mitochondrial,ribosomal", - help="which list of genes to use to calc proportion of mapped reads over total,per cell?") -parser.add_argument("--score_genes", - default="MarkersNeutro", - help="which list of genes to use to scanpy.tl.score_genes per cell?") +parser.add_argument( + "--sampleprefix", default="", help="prefix to prepend when saving the metadata file" +) +parser.add_argument("--input_anndata", default="adata_unfilt.h5ad", help="") +parser.add_argument("--outfile", default="adata_unfilt.h5ad", help="") +parser.add_argument( + "--figdir", default="./figures/", help="path to save the figures to" +) +parser.add_argument( + "--figure_suffix", + default="_qc-plot.png", + help="figures filename suffix to be appended to figures/umap", +) +parser.add_argument("--scrubletdir", default=None, help="path to save the figures to") +parser.add_argument( + "--ccgenes", default=None, help="path to file containing cell cycle genes" +) +parser.add_argument( + "--customgenesfile", + default=None, + help="path to file containing list of genes to quantify", +) +parser.add_argument( + "--calc_proportions", + default="mitochondrial,ribosomal", + help="which list of genes to use to calc proportion of mapped reads over total,per cell?", +) +parser.add_argument( + "--score_genes", + default="MarkersNeutro", + help="which list of genes to use to scanpy.tl.score_genes per cell?", +) args, opt = parser.parse_known_args() L.info("Running with params: %s", args) @@ -70,31 +73,47 @@ os.mkdir(figdir) sc.settings.figdir = figdir -sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5)) +sc.set_figure_params( + scanpy=True, fontsize=14, dpi=300, facecolor="white", figsize=(5, 5) +) L.info("Reading in MuData from '%s'" % args.input_anndata) mdata = mu.read(args.input_anndata) -rna = mdata['rna'] +rna = mdata["rna"] # load the scrublet scores into the anndata (if they have been run) if args.scrubletdir is not None: scrub_dir = args.scrubletdir L.info("Merging in the scrublet scores from directory '%s'" % args.scrubletdir) - sample_ids = rna.obs[['sample_id']].drop_duplicates() - [scrub_dir + '/' + ss + '_scrublet_scores.txt' for ss in sample_ids.sample_id] - doubletscores = [pd.read_csv(scrub_dir + '/' + ss + '_scrublet_scores.txt', sep="\t", header=0) for ss in sample_ids.sample_id] - doubletscores = pd.concat(doubletscores, keys=sample_ids.sample_id).reset_index(level="sample_id") + sample_ids = rna.obs[["sample_id"]].drop_duplicates() + + # [scrub_dir + "/" + ss + "_scrublet_scores.txt" for ss in sample_ids.sample_id] # ToDo: it was uncommented – why is this here? + + doubletscores = [ + pd.read_csv(scrub_dir + "/" + ss + "_scrublet_scores.txt", sep="\t", header=0) + for ss in sample_ids.sample_id + ] + + doubletscores = pd.concat(doubletscores, keys=sample_ids.sample_id).reset_index( + level="sample_id" + ) # rename the barcodes to match up qwith what the rna.obs barcodes are if len(sample_ids) > 1: - doubletscores['barcode'] = doubletscores['barcode'].astype(str) + '-' + doubletscores['sample_id'].astype(str) - doubletscores = doubletscores.set_index('barcode').drop('sample_id', axis=1) + doubletscores["barcode"] = ( + doubletscores["barcode"].astype(str) + + "-" + + doubletscores["sample_id"].astype(str) + ) + doubletscores = doubletscores.set_index("barcode").drop("sample_id", axis=1) # merge with rna.obs - rna.obs = rna.obs.merge(doubletscores, how="left", left_index=True, right_index=True) + rna.obs = rna.obs.merge( + doubletscores, how="left", left_index=True, right_index=True + ) -# Cell-wise QC based on .var lists of genes +# Cell-wise QC based on .var lists of genes qc_vars = [] @@ -103,61 +122,93 @@ cat_dic = {} L.info("Reading in custom genes csv file from '%s'" % args.customgenesfile) customgenes = pd.read_csv(args.customgenesfile) - if not {'group', 'feature'}.issubset(customgenes.columns): - L.error("The custom genes file needs to have both columns, 'group' and 'feature'.") - sys.exit("The custom genes file needs to have both columns, 'group' and 'feature'.") - custom_cat = list(set(customgenes['group'].tolist())) + if not {"group", "feature"}.issubset(customgenes.columns): + L.error( + "The custom genes file needs to have both columns, 'group' and 'feature'." + ) + sys.exit( + "The custom genes file needs to have both columns, 'group' and 'feature'." + ) + custom_cat = list(set(customgenes["group"].tolist())) + for cc in custom_cat: - cat_dic[cc] = customgenes.loc[customgenes["group"] == cc,"feature"].tolist() + cat_dic[cc] = customgenes.loc[ + customgenes["group"] == cc, "feature" + ].tolist() + if args.calc_proportions is not None: calc_proportions = args.calc_proportions.split(",") calc_proportions = [a.strip() for a in calc_proportions] else: - L.warning("No genes to calculate proportions for") - + L.warning( + "No genes passed to `args.calc_proportions` to calculate proportions for" + ) + if args.score_genes is not None: score_genes = args.score_genes.split(",") score_genes = [a.strip() for a in score_genes] else: - L.warning("No genes to calculate scores for") - + L.warning("No genes passed to `args.score_genes` to calculate scores for") + else: - L.error("You have not provided a list of custom genes to use for QC purposes") - sys.exit("You have not provided a list of custom genes to use for QC purposes") + L.error( + f"File with custom genes for QC purposes `{args.customgenesfile}` does not exist." + ) + sys.exit( + f"File with custom genes for QC purposes `{args.customgenesfile}` does not exist." + ) +else: + L.error( + "You have not provided a file with a list of custom genes to use for QC purposes – `args.customgenesfile`." + ) + sys.exit( + "You have not provided a file with a list of custom genes to use for QC purposes – `args.customgenesfile`." + ) for kk in calc_proportions: - xname= kk + xname = kk gene_list = cat_dic[kk] - rna.var[xname] = [x in gene_list for x in rna.var_names] # annotate the group of hb genes as 'hb' + rna.var[xname] = [ + x in gene_list for x in rna.var_names + ] # annotate the group of hb genes as 'hb' qc_vars.append(xname) qc_info = "" if qc_vars != []: qc_info = " and calculating proportions for '%s'" % qc_vars L.info("Calculating QC metrics with scanpy.pp.calculate_qc_metrics()" + qc_info) -sc.pp.calculate_qc_metrics(rna, qc_vars=qc_vars, percent_top=None,log1p=True, inplace=True) +sc.pp.calculate_qc_metrics( + rna, qc_vars=qc_vars, percent_top=None, log1p=True, inplace=True +) if args.score_genes is not None: for kk in score_genes: L.info("Computing gene scores for '%s'" % kk) - xname= kk + xname = kk gene_list = cat_dic[kk] - sc.tl.score_genes(rna, gene_list , - ctrl_size=min(len(gene_list), 50), - gene_pool=None, - n_bins=25, - score_name=kk + '_score', - random_state=0, - copy=False, - use_raw=None) + sc.tl.score_genes( + rna, + gene_list, + ctrl_size=min(len(gene_list), 50), + gene_pool=None, + n_bins=25, + score_name=kk + "_score", + random_state=0, + copy=False, + use_raw=None, + ) if args.ccgenes is not None: L.info("Reading in cell cycle genes tsv file from '%s'" % args.ccgenes) - ccgenes = pd.read_csv(args.ccgenes, sep='\t') - if not {'cc_phase', 'gene_name'}.issubset(ccgenes.columns): - L.error("The cell cycle genes file needs to have both columns, 'cc_phase' and 'gene_name'.") - sys.exit("The cell cycle genes file needs to have both columns, 'cc_phase' and 'gene_name'.") + ccgenes = pd.read_csv(args.ccgenes, sep="\t") + if not {"cc_phase", "gene_name"}.issubset(ccgenes.columns): + L.error( + "The cell cycle genes file needs to have both columns, 'cc_phase' and 'gene_name'." + ) + sys.exit( + "The cell cycle genes file needs to have both columns, 'cc_phase' and 'gene_name'." + ) sgenes = ccgenes[ccgenes["cc_phase"] == "s"]["gene_name"].tolist() g2mgenes = ccgenes[ccgenes["cc_phase"] == "g2m"]["gene_name"].tolist() L.info("Calculating cell cycle scores") @@ -166,11 +217,13 @@ mdata.update() -L.info("Saving updated obs in a metadata tsv file to ./" + args.sampleprefix + "_cell_metadata.tsv") -write_obs(mdata, output_prefix=args.sampleprefix, - output_suffix="_cell_metadata.tsv") +L.info( + "Saving updated obs in a metadata tsv file to ./" + + args.sampleprefix + + "_cell_metadata.tsv" +) +write_obs(mdata, output_prefix=args.sampleprefix, output_suffix="_cell_metadata.tsv") L.info("Saving updated MuData to '%s'" % args.outfile) mdata.write(args.outfile) L.info("Done") - From d5e97c6c5e6dbedfaddffbaddf56cc45b6055539 Mon Sep 17 00:00:00 2001 From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com> Date: Tue, 4 Feb 2025 10:20:33 +0100 Subject: [PATCH 04/10] feat+refactoring: adapt scirpy to 0.19+; some refactoring with imports and style --- panpipes/python_scripts/run_scanpyQC_rep.py | 141 +++++++++++--------- 1 file changed, 78 insertions(+), 63 deletions(-) diff --git a/panpipes/python_scripts/run_scanpyQC_rep.py b/panpipes/python_scripts/run_scanpyQC_rep.py index 315f32e9..3a19879a 100644 --- a/panpipes/python_scripts/run_scanpyQC_rep.py +++ b/panpipes/python_scripts/run_scanpyQC_rep.py @@ -1,61 +1,67 @@ -''' +""" scanpy QC script RNA order of QC: - RNA - PROT - Repertoire - ATAC -''' -import sys +""" + +import argparse import logging +import sys +import warnings + +import matplotlib.pyplot as plt +import muon as mu +import scirpy as ir +import yaml + +from panpipes.funcs.io import write_obs + L = logging.getLogger() L.setLevel(logging.INFO) log_handler = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s') +formatter = logging.Formatter("%(asctime)s: %(levelname)s - %(message)s") log_handler.setFormatter(formatter) L.addHandler(log_handler) -import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) -import argparse -import pandas as pd -import muon as mu -import scirpy as ir -import matplotlib.pyplot as plt +warnings.simplefilter(action="ignore", category=FutureWarning) + plt.ioff() -from panpipes.funcs.io import write_obs -import yaml parser = argparse.ArgumentParser() # required option -parser.add_argument("--sampleprefix", - default="", - help="prefix to prepend when saving the metadata file") -parser.add_argument("--input_mudata", - default="mdata_unfilt.h5mu", - help="") -parser.add_argument("--output_mudata", - default="mdata_unfilt.h5mu", - help="") -parser.add_argument("--figdir", - default="./figures/", - help="path to save the figures to") - -parser.add_argument("--distance_metrics", - default=None, type=yaml.safe_load, - help="what metrics to calculate sequence distance metric? \ - any arguments from scirpy.pp.ir_dist as a dict in format'{arg: value}' ") -parser.add_argument("--clonotype_metrics", - default=None,type=yaml.safe_load, - help="what metrics to calculate sequence distance metric? \ - any arguments from scirpy.pp.define_clonotypes as a dict in format'{arg: value}' ") +parser.add_argument( + "--sampleprefix", default="", help="prefix to prepend when saving the metadata file" +) +parser.add_argument("--input_mudata", default="mdata_unfilt.h5mu", help="") +parser.add_argument("--output_mudata", default="mdata_unfilt.h5mu", help="") +parser.add_argument( + "--figdir", default="./figures/", help="path to save the figures to" +) + +parser.add_argument( + "--distance_metrics", + default=None, + type=yaml.safe_load, + help="what metrics to calculate sequence distance metric? \ + any arguments from scirpy.pp.ir_dist as a dict in format'{arg: value}' ", +) +parser.add_argument( + "--clonotype_metrics", + default=None, + type=yaml.safe_load, + help="what metrics to calculate sequence distance metric? \ + any arguments from scirpy.pp.define_clonotypes as a dict in format'{arg: value}' ", +) args, opt = parser.parse_known_args() L.info("Running with params: %s", args) L.info("Reading in MuData from '%s'" % args.input_mudata) mdata = mu.read(args.input_mudata) -rep = mdata['rep'] +rep = mdata["rep"] # chain qc L.info("Running scirpy.tl.chain_qc()") ir.tl.chain_qc(rep) @@ -72,78 +78,87 @@ L.info("Computing sequence-distance metric") ir.pp.ir_dist(rep, **dist_args) -# pull function arguments from args. +# pull function arguments from args. if args.clonotype_metrics is not None: # remove nones, so defaults are used clonotype_args = {k: v for k, v in args.clonotype_metrics.items() if v != "None"} clonotype_args = {k: v for k, v in clonotype_args.items() if v} else: - clonotype_args={} + clonotype_args = {} # define clonotypes + L.info("Clonotypes args: %s" % clonotype_args) L.info("Defining clonotypes") ir.tl.define_clonotypes(rep, **clonotype_args) + L.info("Adding column to obs recording which clonotypes are expanded") ir.tl.clonal_expansion(rep) - -category_cols = ['has_ir'] -for cc in category_cols: - if pd.api.types.infer_dtype(rep.obs[cc]) != "categorical": - rep.obs[cc] = rep.obs[cc].astype('string').astype('category') - -if "BCR" in rep.obs['receptor_type'].values: - bcr = rep[rep.obs.receptor_type =="BCR", :].copy() -else: - bcr=None - +tcr = None if "TCR" in rep.obs.receptor_type.values: - tcr = rep[rep.obs.receptor_type =="TCR", :].copy() -else: - tcr=None + tcr = rep[rep.obs.receptor_type == "TCR", :].copy() + +bcr = None +if "BCR" in rep.obs["receptor_type"].values: + bcr = rep[rep.obs.receptor_type == "BCR", :].copy() + # plot group abundances L.info("Plotting group abundances") fig, ax = plt.subplots() ax = ir.pl.group_abundance(rep, groupby="receptor_type", ax=ax) -fig.savefig(args.figdir +"/barplot_group_abundance_receptor_type.png", bbox_inches="tight") +fig.savefig( + args.figdir + "/barplot_group_abundance_receptor_type.png", bbox_inches="tight" +) if bcr is not None: fig, ax = plt.subplots() ax = ir.pl.group_abundance(bcr, groupby="chain_pairing", ax=ax) fig.tight_layout() - fig.savefig(args.figdir +"/barplot_group_abundance_bcr_receptor_subtype.png", bbox_inches="tight") + fig.savefig( + args.figdir + "/barplot_group_abundance_bcr_receptor_subtype.png", + bbox_inches="tight", + ) if tcr is not None: fig, ax = plt.subplots() - ax = ir.pl.group_abundance(tcr, groupby="chain_pairing", ax=ax) + ax = ir.pl.group_abundance(tcr, groupby="chain_pairing", ax=ax) fig.tight_layout() - fig.savefig(args.figdir +"/barplot_group_abundance_tcr_receptor_subtype.png", bbox_inches="tight") + fig.savefig( + args.figdir + "/barplot_group_abundance_tcr_receptor_subtype.png", + bbox_inches="tight", + ) # clonal expansion L.info("Plotting clonal expansion") -clone_counts = rep.obs.groupby("receptor_type").clone_id.value_counts().to_frame("cell_counts").reset_index() +clone_counts = ( + rep.obs.groupby("receptor_type") + .clone_id.value_counts() + .to_frame("cell_counts") + .reset_index() +) if bcr is not None: ax = ir.pl.clonal_expansion(bcr, groupby="sample_id") ax.set_title("bcr clonal expansion") - plt.savefig(args.figdir +"/barplot_clonal_expansion_bcr.png", bbox_inches="tight" ) + plt.savefig(args.figdir + "/barplot_clonal_expansion_bcr.png", bbox_inches="tight") if tcr is not None: ax = ir.pl.clonal_expansion(tcr, groupby="sample_id") ax.set_title("tcr clonal expansion") - plt.savefig(args.figdir +"/barplot_clonal_expansion_tcr.png", bbox_inches="tight") - + plt.savefig(args.figdir + "/barplot_clonal_expansion_tcr.png", bbox_inches="tight") mdata.update() -L.info("Saving updated obs in a metadata tsv file to ./" + args.sampleprefix + "_cell_metadata.tsv") -write_obs(mdata, output_prefix=args.sampleprefix, - output_suffix="_cell_metadata.tsv") +L.info( + "Saving updated obs in a metadata tsv file to ./" + + args.sampleprefix + + "_cell_metadata.tsv" +) +write_obs(mdata, output_prefix=args.sampleprefix, output_suffix="_cell_metadata.tsv") L.info("Saving updated MuData to '%s'" % args.output_mudata) mdata.write(args.output_mudata) L.info("Done") - From 98b72db9a674259ce70edc0b373ea957a6741759 Mon Sep 17 00:00:00 2001 From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com> Date: Tue, 4 Feb 2025 10:42:47 +0100 Subject: [PATCH 05/10] refactoring: imports, style, redundant code --- panpipes/panpipes/pipeline_ingest.py | 506 ++++++++++++++++----------- 1 file changed, 306 insertions(+), 200 deletions(-) diff --git a/panpipes/panpipes/pipeline_ingest.py b/panpipes/panpipes/pipeline_ingest.py index 12be6897..a5f74107 100644 --- a/panpipes/panpipes/pipeline_ingest.py +++ b/panpipes/panpipes/pipeline_ingest.py @@ -1,71 +1,74 @@ #! /usr/bin/env python -from ruffus import * -import sys +import logging import os -import re -from cgatcore import pipeline as P -import pandas as pd -import cgatcore.iotools as IOTools -from panpipes.funcs.io import check_submission_file, gen_load_anndata_jobs -# from scpipelines.funcs.processing import intersection +import sys import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) -warnings.filterwarnings("ignore", category=DeprecationWarning) -import warnings +import cgatcore.iotools as IOTools +import pandas as pd +from cgatcore import pipeline as P +from ruffus import ( + active_if, + follows, + originate, + mkdir, + files, + collate, + transform, + regex, + formatter, +) -# warnings.simplefilter('ignore', category=NumbaDeprecationWarning) -# warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning) +from panpipes.funcs.io import gen_load_anndata_jobs -import yaml +warnings.simplefilter(action="ignore", category=FutureWarning) +warnings.filterwarnings("ignore", category=DeprecationWarning) -import logging -# L = logging.getLogger('panpipes') -# L.setLevel(logging.INFO) -# log_handler = logging.StreamHandler(sys.stdout) -# log_formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s') -# log_handler.setFormatter(log_formatter) -# L.addHandler(log_handler) def get_logger(): return logging.getLogger("cgatcore.pipeline") PARAMS = P.get_parameters( - ["%s/pipeline.yml" % os.path.splitext(__file__)[0], - "pipeline.yml"]) - -PARAMS['py_path'] = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'python_scripts') -PARAMS['r_path'] = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'R_scripts') -PARAMS['resources_path'] = os.path.join(os.path.dirname(os.path.dirname(__file__)), "resources") + ["%s/pipeline.yml" % os.path.splitext(__file__)[0], "pipeline.yml"] +) + +PARAMS["py_path"] = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "python_scripts" +) +PARAMS["r_path"] = os.path.join(os.path.dirname(os.path.dirname(__file__)), "R_scripts") +PARAMS["resources_path"] = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "resources" +) job_kwargs = {} -if PARAMS['condaenv'] is not None: - job_kwargs["job_condaenv"] =PARAMS['condaenv'] +if PARAMS["condaenv"] is not None: + job_kwargs["job_condaenv"] = PARAMS["condaenv"] -#------------------------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------------------------ ## Create a dictionary of modalities -#------------------------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------------------------ mode_dictionary = PARAMS["modalities"] -#{'rna': True, 'prot': False, 'Repertoire': False, 'atac': False} +# {'rna': True, 'prot': False, 'Repertoire': False, 'atac': False} # ----------------------------------------------------------------------------------------------- ## 10X metrics plotting # ----------------------------------------------------------------------------------------------- + # TODO: update this to check for each modality csv file # TODO: check this works with current submission file -@active_if(PARAMS['plot_10X_metrics']) -@follows(mkdir('figures')) -@follows(mkdir('figures/tenx_metrics')) +@active_if(PARAMS["plot_10X_metrics"]) +@follows(mkdir("figures")) +@follows(mkdir("figures/tenx_metrics")) @follows(mkdir("logs")) @originate("logs/0_tenx_metrics_multi_aggregate.log") def aggregate_tenx_metrics_multi(outfile): """this is to aggregate all the cellranger multi metric_summary files it also does some plotting - """ + """ cmd = """ python %(py_path)s/aggregate_cellranger_summary_metrics.py --pipe_df %(submission_file)s @@ -73,64 +76,87 @@ def aggregate_tenx_metrics_multi(outfile): --cellranger_column_conversion_df %(resources_path)s/metrics_summary_col_conversion.tsv --output_file 10x_metrics.csv > %(outfile)s """ - job_kwargs["job_threads"] = PARAMS['resources_threads_low'] - log_msg = f"TASK: 'aggregate_tenx_metrics_multi'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{outfile}' FOR MORE INFORMATION." - get_logger().info(log_msg) + job_kwargs["job_threads"] = PARAMS["resources_threads_low"] + log_msg = ( + "TASK: 'aggregate_tenx_metrics_multi'" + + f" IN CASE OF ERROR, PLEASE REFER TO : '{outfile}' FOR MORE INFORMATION." + ) + get_logger().info(log_msg) P.run(cmd, **job_kwargs) @follows(aggregate_tenx_metrics_multi) def process_all_tenx_metrics(): pass + + # ----------------------------------------------------------------------------------------------- ## Creating h5mu from filtered data files # ----------------------------------------------------------------------------------------------- + def unfilt_file(): - sprefix = PARAMS['sample_prefix'] + sprefix = PARAMS["sample_prefix"] unfilt_file = sprefix + "_unfilt.h5mu" return unfilt_file - def gen_load_filtered_anndata_jobs(): - caf = pd.read_csv(PARAMS['submission_file'], sep='\t') - return gen_load_anndata_jobs(caf, load_raw=False, mode_dictionary=PARAMS["modalities"], load_prot_from_raw=PARAMS['load_prot_from_raw']) + caf = pd.read_csv(PARAMS["submission_file"], sep="\t") + + return gen_load_anndata_jobs( + caf, + load_raw=False, + mode_dictionary=PARAMS["modalities"], + load_prot_from_raw=PARAMS["load_prot_from_raw"], + ) - @follows(mkdir("logs")) @follows(mkdir("tmp")) @active_if(PARAMS["use_existing_h5mu"] is False) @files(gen_load_filtered_anndata_jobs) -def load_mudatas(rna_path, outfile, - sample_id, - rna_filetype, - prot_path, prot_filetype, - tcr_path, tcr_filetype, - bcr_path, bcr_filetype, - atac_path, atac_filetype, - fragments_file, per_barcode_metrics_file, peak_annotation_file, - cell_mtd_path): - - path_dict = {'rna':rna_path, - 'prot': prot_path, - 'bcr': bcr_path, - 'tcr' : tcr_path, - 'atac':atac_path} +def load_mudatas( + rna_path, + outfile, + sample_id, + rna_filetype, + prot_path, + prot_filetype, + tcr_path, + tcr_filetype, + bcr_path, + bcr_filetype, + atac_path, + atac_filetype, + fragments_file, + per_barcode_metrics_file, + peak_annotation_file, + cell_mtd_path, +): + path_dict = { + "rna": rna_path, + "prot": prot_path, + "bcr": bcr_path, + "tcr": tcr_path, + "atac": atac_path, + } print(path_dict) - - modality_dict = {k:True if path_dict[k] is not None else False for k,v in PARAMS['modalities'].items() } + + modality_dict = { + k: True if path_dict[k] is not None else False + for k, v in PARAMS["modalities"].items() + } print(modality_dict) - + cmd = """ python %(py_path)s/make_adata_from_csv.py --mode_dictionary "%(modality_dict)s" --sample_id %(sample_id)s --output_file %(outfile)s """ - - cmd += " --use_muon True" #this is still here? + + cmd += " --use_muon True" # this is still here? if rna_path is not None and pd.notna(rna_path): cmd += " --rna_infile %(rna_path)s" cmd += " --rna_filetype %(rna_filetype)s" @@ -147,7 +173,7 @@ def load_mudatas(rna_path, outfile, cmd += " --fragments_file %(fragments_file)s" if peak_annotation_file is not None and pd.notna(peak_annotation_file): cmd += " --peak_annotation_file %(peak_annotation_file)s" - # ~ means this tests "is not nan" + # ~ means this tests "is not nan" if tcr_path is not None and pd.notna(tcr_path): cmd += " --tcr_filtered_contigs %(tcr_path)s" cmd += " --tcr_filetype %(tcr_filetype)s" @@ -157,21 +183,22 @@ def load_mudatas(rna_path, outfile, logfile = f"logs/1_load_mudatas_{sample_id}.log" cmd += f" > {logfile}" # print(cmd) - job_kwargs["job_threads"] = PARAMS['resources_threads_medium'] - log_msg = f"TASK: 'load_mudatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION." + job_kwargs["job_threads"] = PARAMS["resources_threads_medium"] + log_msg = ( + "TASK: 'load_mudatas'" + + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION." + ) get_logger().info(log_msg) P.run(cmd, **job_kwargs) - @active_if(PARAMS["use_existing_h5mu"] is False) -@collate(load_mudatas, - formatter(""), - unfilt_file()) +@collate(load_mudatas, formatter(""), unfilt_file()) def concat_filtered_mudatas(infiles, outfile): # print(infiles) # print(outfile) - infiles_str = ','.join(infiles) + infiles_str = ",".join(infiles) # TODO: not used anywhere + cmd = """ python %(py_path)s/concat_adata.py --input_files_str %(infiles_str)s @@ -180,19 +207,22 @@ def concat_filtered_mudatas(infiles, outfile): --sampleprefix %(sample_prefix)s --join_type %(concat_join_type)s """ - if PARAMS['metadatacols'] is not None and PARAMS['metadatacols'] != "": + if PARAMS["metadatacols"] is not None and PARAMS["metadatacols"] != "": cmd += " --metadatacols %(metadatacols)s" if PARAMS["barcode_mtd_include"] is True: cmd += " --barcode_mtd_df %(barcode_mtd_path)s" cmd += " --barcode_mtd_metadatacols %(barcode_mtd_metadatacols)s" - if PARAMS['protein_metadata_table'] is not None: + if PARAMS["protein_metadata_table"] is not None: cmd += " --protein_var_table %(protein_metadata_table)s" - if PARAMS['index_col_choice'] is not None: + if PARAMS["index_col_choice"] is not None: cmd += " --protein_new_index_col %(index_col_choice)s" logfile = "logs/1_concat_filtered_mudatas.log" cmd += f" > {logfile}" - job_kwargs["job_threads"] = PARAMS['resources_threads_high'] - log_msg = f"TASK: 'concat_filtered_mudatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION." + job_kwargs["job_threads"] = PARAMS["resources_threads_high"] + log_msg = ( + "TASK: 'concat_filtered_mudatas'" + + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION." + ) get_logger().info(log_msg) P.run(cmd, **job_kwargs) # P.run("rm tmp/*", job_threads=PARAMS['resources_threads_low']) @@ -202,14 +232,16 @@ def concat_filtered_mudatas(infiles, outfile): ## Creating h5mu from bg data files # ----------------------------------------------------------------------------------------------- -if PARAMS['assess_background'] or (PARAMS["modalities"]["prot"] and "dsb" in PARAMS['normalisation_methods']): - PARAMS['bg_required'] = True +if PARAMS["assess_background"] or ( + PARAMS["modalities"]["prot"] and "dsb" in PARAMS["normalisation_methods"] +): + PARAMS["bg_required"] = True else: - PARAMS['bg_required'] = False - + PARAMS["bg_required"] = False + def bg_file(): - sprefix = PARAMS['sample_prefix'] + sprefix = PARAMS["sample_prefix"] use_muon = True if use_muon: bg_file = sprefix + "_bg.h5mu" @@ -219,36 +251,55 @@ def bg_file(): def gen_load_bg_anndata_jobs(): - caf = pd.read_csv(PARAMS['submission_file'], sep='\t') - return gen_load_anndata_jobs(caf, load_raw=True, mode_dictionary=PARAMS["modalities"], load_prot_from_raw=True) - + caf = pd.read_csv(PARAMS["submission_file"], sep="\t") + return gen_load_anndata_jobs( + caf, + load_raw=True, + mode_dictionary=PARAMS["modalities"], + load_prot_from_raw=True, + ) + @active_if(PARAMS["bg_required"]) @follows(mkdir("logs")) @follows(mkdir("tmp")) @active_if(PARAMS["use_existing_h5mu"] is False) @files(gen_load_bg_anndata_jobs) -def load_bg_mudatas(rna_path, outfile, - sample_id, - rna_filetype, - prot_path, prot_filetype, - tcr_path, tcr_filetype, - bcr_path, bcr_filetype, - atac_path, atac_filetype, - fragments_file, per_barcode_metrics_file, peak_annotation_file, - cell_mtd_path): - path_dict = {'rna':rna_path, - 'prot': prot_path, - 'bcr': bcr_path, - 'tcr' : tcr_path, - 'atac':atac_path} +def load_bg_mudatas( + rna_path, + outfile, + sample_id, + rna_filetype, + prot_path, + prot_filetype, + tcr_path, + tcr_filetype, + bcr_path, + bcr_filetype, + atac_path, + atac_filetype, + fragments_file, + per_barcode_metrics_file, + peak_annotation_file, + cell_mtd_path, +): + path_dict = { + "rna": rna_path, + "prot": prot_path, + "bcr": bcr_path, + "tcr": tcr_path, + "atac": atac_path, + } print(path_dict) - + # need to remove unnessacry modality - modality_dict = {k:True if path_dict[k] is not None else False for k,v in PARAMS['modalities'].items() } + modality_dict = { + k: True if path_dict[k] is not None else False + for k, v in PARAMS["modalities"].items() + } - modality_dict['tcr'] = False - modality_dict['bcr'] = False + modality_dict["tcr"] = False + modality_dict["bcr"] = False cmd = """ python %(py_path)s/make_adata_from_csv.py --sample_id %(sample_id)s @@ -269,24 +320,27 @@ def load_bg_mudatas(rna_path, outfile, cmd += " --fragments_file %(fragments_file)s" if peak_annotation_file is not None and pd.notna(peak_annotation_file): cmd += " --peak_annotation_file %(peak_annotation_file)s" - if PARAMS['protein_metadata_table'] is not None: - cmd += " --protein_var_table %(protein_metadata_table)s" #check which of these 2 needs to stay!!! - if PARAMS['index_col_choice'] is not None: + if PARAMS["protein_metadata_table"] is not None: + cmd += " --protein_var_table %(protein_metadata_table)s" # check which of these 2 needs to stay!!! + if PARAMS["index_col_choice"] is not None: cmd += " --protein_new_index_col %(index_col_choice)s" logfile = f"logs/1_load_bg_mudatas_%(sample_id)s.log" cmd += " > {logfile}" - job_kwargs["job_threads"] = PARAMS['resources_threads_medium'] - log_msg = f"TASK: 'load_bg_mudatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION." + job_kwargs["job_threads"] = PARAMS["resources_threads_medium"] + log_msg = ( + "TASK: 'load_bg_mudatas'" + + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION." + ) get_logger().info(log_msg) P.run(cmd, **job_kwargs) @active_if(PARAMS["bg_required"]) -@active_if(PARAMS['downsample_background']) +@active_if(PARAMS["downsample_background"]) @follows(load_bg_mudatas) -@transform(load_bg_mudatas, - regex("./tmp/(.*)_raw.h5(.*)"), - r"./logs/1_\1_bg_downsampled.log") +@transform( + load_bg_mudatas, regex("./tmp/(.*)_raw.h5(.*)"), r"./logs/1_\1_bg_downsampled.log" +) def downsample_bg_mudatas(infile, outfile): cmd = """ python %(py_path)s/downsample.py @@ -294,7 +348,7 @@ def downsample_bg_mudatas(infile, outfile): --output_mudata "%(infile)s" --downsample_value 20000 > %(outfile)s """ - job_kwargs["job_threads"] = PARAMS['resources_threads_medium'] + job_kwargs["job_threads"] = PARAMS["resources_threads_medium"] # TODO: add log file? P.run(cmd, **job_kwargs) @@ -304,13 +358,13 @@ def downsample_bg_mudatas(infile, outfile): @active_if(PARAMS["use_existing_h5mu"] is False) @follows(load_bg_mudatas) @follows(downsample_bg_mudatas) -@collate(load_bg_mudatas, - formatter(""), - bg_file()) +@collate(load_bg_mudatas, formatter(""), bg_file()) def concat_bg_mudatas(infiles, outfile): # print(infiles) # print(outfile) - infiles_str = ','.join(infiles) + + infiles_str = ",".join(infiles) # TODO: not used anywhere + cmd = """ python %(py_path)s/concat_adata.py --input_files_str %(infiles_str)s @@ -319,76 +373,92 @@ def concat_bg_mudatas(infiles, outfile): --sampleprefix %(sample_prefix)s --join_type %(concat_join_type)s """ - if PARAMS['metadatacols'] is not None and PARAMS['metadatacols'] != "": + if PARAMS["metadatacols"] is not None and PARAMS["metadatacols"] != "": cmd += " --metadatacols %(metadatacols)s" - # if PARAMS["barcode_mtd_include"] is True: - # cmd += " --barcode_mtd_df %(barcode_mtd_path)s" + # if PARAMS["barcode_mtd_include"] is True: + # cmd += " --barcode_mtd_df %(barcode_mtd_path)s" # cmd += " --barcode_mtd_metadatacols %(barcode_mtd_metadatacols)s" logfile = "logs/1_concat_bg_mudatas.log" cmd += " > {logfile}" - job_kwargs["job_threads"] = PARAMS['resources_threads_high'] - log_msg = f"TASK: 'concat_bg_mudatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION." + job_kwargs["job_threads"] = PARAMS["resources_threads_high"] + log_msg = ( + "TASK: 'concat_bg_mudatas'" + + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION." + ) get_logger().info(log_msg) P.run(cmd, **job_kwargs) # P.run("rm tmp/*", job_threads=PARAMS['resources_threads_low']) + # ----------------------------------------------------------------------------------------------- -## rna QC +## rna QC # ----------------------------------------------------------------------------------------------- def orfile(): - return PARAMS['sample_prefix'] + "_cell_metadata.tsv" + return PARAMS["sample_prefix"] + "_cell_metadata.tsv" + + @active_if(PARAMS["scr_run"]) -@active_if(PARAMS['modalities_rna']) +@active_if(PARAMS["modalities_rna"]) @active_if(PARAMS["use_existing_h5mu"] is False) @follows(mkdir("scrublet")) -@transform(load_mudatas, regex(r"./tmp/(.*).h5(.*)"), - r"scrublet/\1_scrublet_scores.txt", r"\1") +@transform( + load_mudatas, regex(r"./tmp/(.*).h5(.*)"), r"scrublet/\1_scrublet_scores.txt", r"\1" +) def run_scrublet(infile, outfile, sample_id): outdir = "./scrublet" + cmd = """python %(py_path)s/run_scrublet_scores.py --sample_id %(sample_id)s --inputpath %(infile)s --outdir %(outdir)s """ - if PARAMS['scr_expected_doublet_rate'] is not None: + if PARAMS["scr_expected_doublet_rate"] is not None: cmd += " --expected_doublet_rate %(scr_expected_doublet_rate)s" - if PARAMS['scr_sim_doublet_ratio'] is not None: + if PARAMS["scr_sim_doublet_ratio"] is not None: cmd += " --sim_doublet_ratio %(scr_sim_doublet_ratio)s" - if PARAMS['scr_n_neighbours'] is not None: + if PARAMS["scr_n_neighbours"] is not None: cmd += " --n_neighbors %(scr_n_neighbours)s" - if PARAMS['scr_min_counts'] is not None: + if PARAMS["scr_min_counts"] is not None: cmd += " --min_counts %(scr_min_counts)s" - if PARAMS['scr_min_cells'] is not None: + if PARAMS["scr_min_cells"] is not None: cmd += " --min_cells %(scr_min_cells)s" - if PARAMS['scr_min_gene_variability_pctl'] is not None: + if PARAMS["scr_min_gene_variability_pctl"] is not None: cmd += " --min_gene_variability_pctl %(scr_min_gene_variability_pctl)s" - if PARAMS['scr_n_prin_comps'] is not None: + if PARAMS["scr_n_prin_comps"] is not None: cmd += " --n_prin_comps %(scr_n_prin_comps)s" - if PARAMS['scr_use_thr'] is not None: + if PARAMS["scr_use_thr"] is not None: cmd += " --use_thr %(scr_use_thr)s" - if PARAMS['scr_call_doublets_thr'] is not None: + if PARAMS["scr_call_doublets_thr"] is not None: cmd += " --call_doublets_thr %(scr_call_doublets_thr)s" logfile = "logs/2_run_scrublet_" + sample_id + ".log" cmd += f" > {logfile}" - job_kwargs["job_threads"] = PARAMS['resources_threads_medium'] - log_msg = f"TASK: 'run_scrublet'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION." + job_kwargs["job_threads"] = PARAMS["resources_threads_medium"] + log_msg = ( + "TASK: 'run_scrublet'" + + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION." + ) get_logger().info(log_msg) - P.run(cmd,**job_kwargs) + P.run(cmd, **job_kwargs) IOTools.touch_file(outfile) -@active_if(PARAMS['modalities_rna']) +@active_if(PARAMS["modalities_rna"]) @follows(mkdir("figures")) @follows(mkdir("figures/rna")) @follows(concat_filtered_mudatas, run_scrublet) @originate("logs/3_run_scanpy_qc_rna.log", orfile(), unfilt_file()) def run_rna_qc(log_file, outfile, unfilt_file): # infile = submission file - resources_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "resources") + + resources_path = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "resources" + ) + customgenesfile = PARAMS["custom_genes_file"] - calcproportions= PARAMS["calc_proportions"] + calcproportions = PARAMS["calc_proportions"] + cmd = """ python %(py_path)s/run_scanpyQC_rna.py --sampleprefix %(sample_prefix)s @@ -398,20 +468,22 @@ def run_rna_qc(log_file, outfile, unfilt_file): --calc_proportions %(calcproportions)s --figdir figures """ - if len(os.listdir('scrublet')) != 0: + if len(os.listdir("scrublet")) != 0: cmd += " --scrubletdir scrublet" + # if params is specified otherwise default values will be used. - # if params is specified otherwise default values will be used. - if PARAMS['score_genes'] is not None: - scoregenes = PARAMS['score_genes'] + if PARAMS["score_genes"] is not None: + scoregenes = PARAMS["score_genes"] cmd += " --score_genes %(scoregenes)s" - if PARAMS['ccgenes'] is not None: - if PARAMS['ccgenes'] == "default": + + if PARAMS["ccgenes"] is not None: + if PARAMS["ccgenes"] == "default": ccgenesfile = resources_path + "/cell_cycle_genes.tsv" else: - ccgenesfile = PARAMS['ccgenes'] + ccgenesfile = PARAMS["ccgenes"] cmd += " --ccgenes %(ccgenesfile)s" - cmd += " --channel_col %(channel_col)s" # is this needed? + + cmd += " --channel_col %(channel_col)s" # is this needed? # if PARAMS['isotype_upper_quantile'] is not None: # cmd += " --isotype_upper_quantile %(isotype_upper_quantile)s" @@ -419,29 +491,37 @@ def run_rna_qc(log_file, outfile, unfilt_file): # cmd += " --isotype_n_pass %(isotype_n_pass)s" # add log file cmd += " > %(log_file)s" - job_kwargs["job_threads"] = PARAMS['resources_threads_high'] - log_msg = f"TASK: 'run_rna_qc'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION." + job_kwargs["job_threads"] = PARAMS["resources_threads_high"] + + log_msg = ( + "TASK: 'run_rna_qc'" + + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION." + ) get_logger().info(log_msg) + P.run(cmd, **job_kwargs) + if os.path.exists("cache"): P.run("rm -r cache") - #IOTools.touch_file(metadata) + # IOTools.touch_file(metadata) + -#TO DO ADD the other modalities in this order: +# TO DO ADD the other modalities in this order: # prot run_scanpy_qc_prot.log # ----------------------------------------------------------------------------------------------- -## prot QC +## prot QC # ----------------------------------------------------------------------------------------------- - + # this is because we don't want to try and load the file during the `config` stage + # def prot_cell_mtd(): # return PARAMS['sample_prefix'] + "_prot_cell_metadata.tsv" -@active_if(PARAMS['modalities_prot']) -@follows(mkdir('figures/prot')) +@active_if(PARAMS["modalities_prot"]) +@follows(mkdir("figures/prot")) @follows(concat_filtered_mudatas, run_rna_qc) @originate("logs/3_run_scanpy_qc_prot.log", orfile(), unfilt_file()) def run_scanpy_prot_qc(log_file, outfile, unfilt_file): @@ -453,27 +533,31 @@ def run_scanpy_prot_qc(log_file, outfile, unfilt_file): --outfile %(unfilt_file)s --figdir figures/prot/ """ - if PARAMS['channel_col'] is not None: + if PARAMS["channel_col"] is not None: cmd += " --channel_col %(channel_col)s" else: cmd += " --channel_col sample_id" - if PARAMS['plotqc_prot_metrics']: + if PARAMS["plotqc_prot_metrics"]: cmd += " --per_cell_metrics %(plotqc_prot_metrics)s" - if PARAMS['plot_metrics_per_prot']: + if PARAMS["plot_metrics_per_prot"]: cmd += " --per_prot_metrics %(plot_metrics_per_prot)s" - if PARAMS['identify_isotype_outliers']: + if PARAMS["identify_isotype_outliers"]: cmd += " --identify_isotype_outliers %(identify_isotype_outliers)s" cmd += " --isotype_upper_quantile %(isotype_upper_quantile)s" cmd += " --isotype_n_pass %(isotype_n_pass)s" # add log file cmd += " > %(log_file)s" - job_kwargs["job_threads"] = PARAMS['resources_threads_high'] - log_msg = f"TASK: 'run_scanpy_prot_qc'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION." + job_kwargs["job_threads"] = PARAMS["resources_threads_high"] + log_msg = ( + "TASK: 'run_scanpy_prot_qc'" + + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION." + ) get_logger().info(log_msg) P.run(cmd, **job_kwargs) pass -@active_if(PARAMS['modalities_prot']) + +@active_if(PARAMS["modalities_prot"]) @follows(run_scanpy_prot_qc, concat_filtered_mudatas, concat_bg_mudatas) @originate("logs/3_run_dsb_clr.log", unfilt_file(), bg_file()) def run_dsb_clr(outfile, unfilt_file, bg_file): @@ -485,25 +569,29 @@ def run_dsb_clr(outfile, unfilt_file, bg_file): --filtered_mudata %(unfilt_file)s --figpath ./figures/prot """ - if PARAMS['channel_col'] is not None: + if PARAMS["channel_col"] is not None: cmd += " --channel_col %(channel_col)s" - if PARAMS['normalisation_methods'] is not None: + if PARAMS["normalisation_methods"] is not None: cmd += " --normalisation_methods %(normalisation_methods)s" - if PARAMS['quantile_clipping'] is not None: + if PARAMS["quantile_clipping"] is not None: cmd += " --quantile_clipping %(quantile_clipping)s" - if PARAMS['clr_margin'] is not None: + if PARAMS["clr_margin"] is not None: cmd += " --clr_margin %(clr_margin)s" - if PARAMS['save_norm_prot_mtx'] is True: + if PARAMS["save_norm_prot_mtx"] is True: cmd += " --save_mtx True" # find the bg data if available if os.path.exists(bg_file): cmd += " --bg_mudata %(bg_file)s" cmd += " > %(outfile)s" - job_kwargs["job_threads"] = PARAMS['resources_threads_high'] - log_msg = f"TASK: 'run_dsb_clr'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{outfile}' FOR MORE INFORMATION." + job_kwargs["job_threads"] = PARAMS["resources_threads_high"] + log_msg = ( + "TASK: 'run_dsb_clr'" + + f" IN CASE OF ERROR, PLEASE REFER TO : '{outfile}' FOR MORE INFORMATION." + ) get_logger().info(log_msg) P.run(cmd, **job_kwargs) + @follows(run_scanpy_prot_qc, run_dsb_clr) def run_prot_qc(): pass @@ -513,8 +601,8 @@ def run_prot_qc(): ## Repertoire QC # Repertoire run_scanpy_qc_rep.log # ----------------------------------------------------------------------------------------------- # toggle_rep_qc = () -@active_if(PARAMS['modalities_bcr'] or PARAMS['modalities_tcr'] ) -@follows(mkdir('figures/rep')) +@active_if(PARAMS["modalities_bcr"] or PARAMS["modalities_tcr"]) +@follows(mkdir("figures/rep")) @follows(run_rna_qc, run_prot_qc) @originate("logs/3_run_scanpy_qc_rep.log", unfilt_file()) def run_repertoire_qc(logfile, unfilt_file): @@ -527,27 +615,30 @@ def run_repertoire_qc(logfile, unfilt_file): --clonotype_metrics "%(clonotype_definition)s" """ cmd += " > %(logfile)s" - job_kwargs["job_threads"] = PARAMS['resources_threads_low'] - log_msg = f"TASK: 'run_repertoire_qc'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION." + job_kwargs["job_threads"] = PARAMS["resources_threads_low"] + log_msg = ( + "TASK: 'run_repertoire_qc'" + + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION." + ) get_logger().info(log_msg) P.run(cmd, **job_kwargs) + # ----------------------------------------------------------------------------------------------- ## atac QC # run_scanpy_qc_atac.log # ----------------------------------------------------------------------------------------------- -@active_if(PARAMS['modalities_atac']) -@mkdir('figures/atac') +@active_if(PARAMS["modalities_atac"]) +@mkdir("figures/atac") @follows(run_rna_qc, run_prot_qc, run_repertoire_qc) @originate("logs/3_run_scanpy_qc_atac.log", orfile(), unfilt_file()) def run_atac_qc(log_file, outfile, unfilt_file): # if this is a multiple samples project - # they should be run together upfront + # they should be run together upfront # for independent atac run cellranger-atac count, then cellranger-atac aggr # https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/using/aggr # for multiome run cellranger-arc count then cellranger-arc aggr # https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/aggr - # - + # plot_qc_atac_metrics = PARAMS["plotqc_atac_metrics"] cmd = """ @@ -564,21 +655,25 @@ def run_atac_qc(log_file, outfile, unfilt_file): cmd += " --is_paired False" if PARAMS["features_tss"] is not None: cmd += " --is_paired False" - feat_file=PARAMS["features_tss"] + feat_file = PARAMS["features_tss"] cmd += " --tss_coordinates %(feat_file)s" if PARAMS["partner_rna"] is not None: cmd += " --is_paired False" prna_file = PARAMS["partner_rna"] cmd += " --paired_rna %(prna_file)s" - + cmd += " --use_muon True" cmd += " > %(log_file)s" - job_kwargs["job_threads"] = PARAMS['resources_threads_low'] - log_msg = f"TASK: 'run_atac_qc'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION." + job_kwargs["job_threads"] = PARAMS["resources_threads_low"] + log_msg = ( + f"TASK: 'run_atac_qc'" + + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION." + ) get_logger().info(log_msg) P.run(cmd, **job_kwargs) + @follows(run_rna_qc, run_prot_qc, run_repertoire_qc, run_atac_qc) def run_qc(): pass @@ -588,10 +683,11 @@ def run_qc(): ## QC plotting # ----------------------------------------------------------------------------------------------- + @follows(run_qc) @originate("logs/4_plot_qc.log", orfile()) def plot_qc(log_file, cell_file): - qcmetrics = PARAMS['plotqc_rna_metrics'] + qcmetrics = PARAMS["plotqc_rna_metrics"] cmd = """ Rscript %(r_path)s/plotQC.R --prefilter TRUE @@ -600,19 +696,22 @@ def plot_qc(log_file, cell_file): --groupingvar %(plotqc_grouping_var)s """ cmd += " --scanpy_or_muon muon" - if PARAMS["modalities"]['rna'] and PARAMS['plotqc_rna_metrics'] is not None: + if PARAMS["modalities"]["rna"] and PARAMS["plotqc_rna_metrics"] is not None: cmd += " --rna_qc_metrics %(plotqc_rna_metrics)s" - if PARAMS["modalities"]['prot'] and PARAMS['plotqc_prot_metrics'] is not None: + if PARAMS["modalities"]["prot"] and PARAMS["plotqc_prot_metrics"] is not None: cmd += " --prot_qc_metrics %(plotqc_prot_metrics)s" - if PARAMS["modalities"]['atac'] and PARAMS['plotqc_atac_metrics'] is not None: + if PARAMS["modalities"]["atac"] and PARAMS["plotqc_atac_metrics"] is not None: cmd += " --atac_qc_metrics %(plotqc_atac_metrics)s" - if PARAMS['modalities_bcr'] or PARAMS['modalities_tcr']: - if PARAMS['plotqc_rep_metrics'] is not None: - pqrm = ','.join(['plotqc_rep_metrics']) + if PARAMS["modalities_bcr"] or PARAMS["modalities_tcr"]: + if PARAMS["plotqc_rep_metrics"] is not None: + pqrm = ",".join(["plotqc_rep_metrics"]) cmd += " --rep_qc_metrics %(pqrm)s" cmd += " > %(log_file)s" - job_kwargs["job_threads"] = PARAMS['resources_threads_low'] - log_msg = f"TASK: 'plot_qc'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION." + job_kwargs["job_threads"] = PARAMS["resources_threads_low"] + log_msg = ( + f"TASK: 'plot_qc'" + + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION." + ) get_logger().info(log_msg) P.run(cmd, **job_kwargs) @@ -621,7 +720,8 @@ def plot_qc(log_file, cell_file): # assess background # ------- -@active_if(PARAMS['assess_background']) + +@active_if(PARAMS["assess_background"]) @follows(mkdir("figures/background")) @follows(run_qc) @follows(concat_bg_mudatas) @@ -635,13 +735,15 @@ def run_assess_background(log_file, unfilt_file, bg_file): --figpath "./figures/background/" """ cmd += " > %(log_file)s" - job_kwargs["job_threads"] = PARAMS['resources_threads_high'] - log_msg = f"TASK: 'run_assess_background'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION." + job_kwargs["job_threads"] = PARAMS["resources_threads_high"] + log_msg = ( + f"TASK: 'run_assess_background'" + + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION." + ) get_logger().info(log_msg) P.run(cmd, **job_kwargs) - # # ------------ # TO DO change the decorator to follow all qc plots? @follows(plot_qc) @@ -649,14 +751,16 @@ def run_assess_background(log_file, unfilt_file, bg_file): def all_rna_qc(): pass + @follows(run_dsb_clr, plot_qc) def all_prot_qc(): pass + @follows(process_all_tenx_metrics) @follows(all_prot_qc) @follows(all_rna_qc) -#@originate("ruffus.check") +# @originate("ruffus.check") def full(): """ All cgat pipelines should end with a full() function which updates, @@ -664,9 +768,10 @@ def full(): The @follows statement should ensure that all functions are covered, either directly or as prerequisites. """ - #IOTools.touch_file(file) + # IOTools.touch_file(file) pass + @originate("cleanup_done.txt") def cleanup(file): # remove any ctmp fails @@ -676,6 +781,7 @@ def cleanup(file): # delete empty dirs P.run("find ./ -empty -type d -delete", without_cluster=True) + def main(argv=None): if argv is None: argv = sys.argv From d8ec8071e62cbef9240383966317a81d71fba44d Mon Sep 17 00:00:00 2001 From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com> Date: Tue, 4 Feb 2025 10:58:27 +0100 Subject: [PATCH 06/10] refactoring: style, types, structure --- .../aggregate_cellranger_summary_metrics.py | 362 +++++++++++------- 1 file changed, 225 insertions(+), 137 deletions(-) diff --git a/panpipes/python_scripts/aggregate_cellranger_summary_metrics.py b/panpipes/python_scripts/aggregate_cellranger_summary_metrics.py index a08960a1..5386c65e 100644 --- a/panpipes/python_scripts/aggregate_cellranger_summary_metrics.py +++ b/panpipes/python_scripts/aggregate_cellranger_summary_metrics.py @@ -4,8 +4,8 @@ # takes cellranger multi outputs and cellranger count outputs, and makes them consistent for plotting. +from typing import Optional import pandas as pd -import glob import os import re import argparse @@ -16,32 +16,26 @@ import sys import logging + L = logging.getLogger() L.setLevel(logging.INFO) log_handler = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s') +formatter = logging.Formatter("%(asctime)s: %(levelname)s - %(message)s") log_handler.setFormatter(formatter) L.addHandler(log_handler) # parse arguments parser = argparse.ArgumentParser() -parser.add_argument('--pipe_df', - default='', - help='') -parser.add_argument('--output_file', - default='', - help='') -parser.add_argument('--figdir', - default='./figures', - help='') -parser.add_argument('--cellranger_column_conversion_df', - default='', - help='') +parser.add_argument("--pipe_df", default="", help="") +parser.add_argument("--output_file", default="", help="") +parser.add_argument("--figdir", default="./figures", help="") +parser.add_argument("--cellranger_column_conversion_df", default="", help="") parser.set_defaults(verbose=True) args = parser.parse_args() L.info("Running with params: %s", args) + def check_path(path): if os.path.isfile(path): return os.path.dirname(path) @@ -50,41 +44,53 @@ def check_path(path): else: return f"{path} does not exist or is an unrecognized type." -def get_metrics_summary_path(path,sample_id=None): - """ infers the path to metrics_summary.csv based on what cellranger algorithm is used + +def get_metrics_summary_path(path: str, sample_id: Optional[str] = None): + """infers the path to metrics_summary.csv based on what cellranger algorithm is used and checks the file exists Args: path (str):path to the folder called 'outs' sample_id (str): Required if path is to cellranger multi outputs, defaults to None - """ + """ # subset path to only go up to 'outs' - if 'outs' not in path: - L.warning("You are parsing a cellranger output but your path to raw data doesn't end with the outs folder") + if "outs" not in path: + L.warning( + "You are parsing a cellranger output but your path to raw data doesn't end with the outs folder" + ) path = check_path(path) else: path = path.split("outs")[0] + "outs" - outpath=None + + outpath = None + # use the path to cellranger count or vdj outputs as default - if os.path.exists(os.path.join(path, 'metrics_summary.csv') ): - outpath = os.path.join(path, 'metrics_summary.csv') - elif os.path.exists(os.path.join(path, 'summary.csv') ): - outpath = os.path.join(path, 'summary.csv') - elif sample_id is not None and os.path.exists(os.path.join(path, 'per_sample_outs', sample_id, 'metrics_summary.csv') ): - outpath = os.path.join(path, 'per_sample_outs', sample_id, 'metrics_summary.csv') - elif sample_id is None and os.path.exists(os.path.join(path, 'per_sample_outs')): - L.warning('Input folder appears to be from cellranger multi but no sample_id is given') + if os.path.exists(os.path.join(path, "metrics_summary.csv")): + outpath = os.path.join(path, "metrics_summary.csv") + elif os.path.exists(os.path.join(path, "summary.csv")): + outpath = os.path.join(path, "summary.csv") + elif sample_id is not None and os.path.exists( + os.path.join(path, "per_sample_outs", sample_id, "metrics_summary.csv") + ): + outpath = os.path.join( + path, "per_sample_outs", sample_id, "metrics_summary.csv" + ) + elif sample_id is None and os.path.exists(os.path.join(path, "per_sample_outs")): + L.warning( + "Input folder appears to be from cellranger multi but no sample_id is given" + ) else: # use the alternative path from cellranger_multi outputs - L.warning('Path not found') + L.warning("Path not found") + return outpath -def detect_cellranger_algorithm(pth): - if 'per_sample_outs' in pth: - filetype='multi' +def detect_cellranger_algorithm(pth: list[str]): + if "per_sample_outs" in pth: + filetype = "multi" else: - filetype='count' + filetype = "count" return filetype @@ -96,203 +102,285 @@ def get_all_unique_paths(pipe_df): Returns: pd.DataFrame: stacked dataframe of paths - """ - pipe_df = pipe_df.set_index('sample_id') - pipe_df = pipe_df.loc[:,pipe_df.columns.str.endswith('path')] + """ + pipe_df = pipe_df.set_index("sample_id") + pipe_df = pipe_df.loc[:, pipe_df.columns.str.endswith("path")] all_paths = pipe_df.stack().reset_index() - all_paths.columns = ['sample_id', 'path_type', 'path'] + all_paths.columns = ["sample_id", "path_type", "path"] all_paths = all_paths.drop_duplicates() # rename path_type to match cellranger terminolgy - recode_dict = {'rna_path': "Gene Expression", - 'prot_path': "Antibody Capture", - 'tcr_path': 'VDJ T', - 'bcr_path': 'VDJ B', - 'atac_path': 'ATAC' } - all_paths['path_type'] = all_paths['path_type'].replace(recode_dict) + recode_dict = { + "rna_path": "Gene Expression", + "prot_path": "Antibody Capture", + "tcr_path": "VDJ T", + "bcr_path": "VDJ B", + "atac_path": "ATAC", + } + all_paths["path_type"] = all_paths["path_type"].replace(recode_dict) return all_paths -def parse_10x_cellranger_multi(path_df,path_col='metrics_summary_path'): - """takes the metrics_summary.csv from cellranger multi for each sample +def parse_10x_cellranger_multi(path_df, path_col="metrics_summary_path"): + """takes the metrics_summary.csv from cellranger multi for each sample and concantenates into on pd.DataFrame (long format) Args: - path_df (pd.DataFrame): pandas dataframe with two columns sample_id and and + path_df (pd.DataFrame): pandas dataframe with two columns sample_id and and path which contains the path to the outs folder. path_col (str): colname for the columns containing the paths Returns: - pd.DataFrame: 7 columns ['sample_id', 'category', 'library_type', 'grouped_by', + pd.DataFrame: 7 columns ['sample_id', 'category', 'library_type', 'grouped_by', 'group_name', metric_name', 'metric_value'] - """ + """ + # subset to unique rows to save loading in duplicate data - path_df = path_df[['sample_id', path_col]].drop_duplicates() + path_df = path_df[["sample_id", path_col]].drop_duplicates() + # read and concat - msums = pd.concat([pd.read_csv(f) for f in path_df[path_col]], - keys=path_df['sample_id'], - names=['sample_id']).reset_index().drop(columns='level_1') + msums = ( + pd.concat( + [pd.read_csv(f) for f in path_df[path_col]], + keys=path_df["sample_id"], + names=["sample_id"], + ) + .reset_index() + .drop(columns="level_1") + ) + # remove spaces and capitils from columns names - msums.columns = [re.sub(' ', '_', x.lower()) for x in msums.columns] + msums.columns = [re.sub(" ", "_", x.lower()) for x in msums.columns] + # add a fix for version < cellranger 7 - ranger = {"library_or_sample":"category"} - msums.rename(columns =ranger,inplace=True) + ranger = {"library_or_sample": "category"} + msums.rename(columns=ranger, inplace=True) + # convert percentages from string to numeric - msums['metric_value'] = [re.sub(",|%", "", x) for x in msums['metric_value']] - msums['metric_value'] = msums['metric_value'].astype(float) + msums["metric_value"] = [re.sub(",|%", "", x) for x in msums["metric_value"]] + msums["metric_value"] = msums["metric_value"].astype(float) + # sort outputs - msums = msums.sort_values(['metric_name', 'sample_id']) + msums = msums.sort_values(["metric_name", "sample_id"]) + return msums -def parse_10x_cellranger_count(path_df, convert_df, path_col='metrics_summary_path'): - """takes the metrics_summary.csv from cellranger count output for each sample +def parse_10x_cellranger_count(path_df, convert_df, path_col="metrics_summary_path"): + """takes the metrics_summary.csv from cellranger count output for each sample and concantenates into on pd.DataFrame (long format) Args: - path_df (pd.DataFrame): pandas dataframe with three columns sample_id, path - path which contains the path to the outs folder.and + path_df (pd.DataFrame): pandas dataframe with three columns sample_id, path + path which contains the path to the outs folder.and path_type which contains the type of library (one of ['Gene Expression', 'Antibody Capture', 'VDJ T', 'VDJ B']) path_col (str): colname for the columns containing the paths Returns: - pd.DataFrame: 7 columns ['sample_id', 'category', 'library_type', 'grouped_by', + pd.DataFrame: 7 columns ['sample_id', 'category', 'library_type', 'grouped_by', 'group_name', metric_name', 'metric_value'] - """ - + """ + # read and concat - msums = pd.concat([pd.read_csv(f) for f in path_df[path_col]], - keys=[(x, y) for x, y in zip(path_df['sample_id'], - path_df['path_type'])], #count - names=['sample_id','path_type']) - msums = msums.stack().reset_index().drop(columns='level_2') + msums = pd.concat( + [pd.read_csv(f) for f in path_df[path_col]], + keys=[ + (x, y) for x, y in zip(path_df["sample_id"], path_df["path_type"]) + ], # count + names=["sample_id", "path_type"], + ) + + msums = msums.stack().reset_index().drop(columns="level_2") + # make the columns match the conversion df - msums.columns = ['sample_id', 'library_type', 'count_metric_name', 'metric_value'] + msums.columns = ["sample_id", "library_type", "count_metric_name", "metric_value"] + # update the columns names - msums = msums.merge(convert_df).drop(columns=['count_metric_name']) - msums['group_name'] = msums['sample_id'] - msums = msums[['sample_id','category', 'library_type', 'grouped_by', 'group_name', 'metric_name', 'metric_value']] + msums = msums.merge(convert_df).drop(columns=["count_metric_name"]) + msums["group_name"] = msums["sample_id"] + + msums = msums[ + [ + "sample_id", + "category", + "library_type", + "grouped_by", + "group_name", + "metric_name", + "metric_value", + ] + ] + # convert percentages from string to numeric - msums['metric_value'] = [re.sub(",|%", "", str(x)) for x in msums['metric_value']] - msums['metric_value'] = msums['metric_value'].astype(float) - msums = msums.sort_values(['library_type', 'metric_name', 'sample_id']) + msums["metric_value"] = [re.sub(",|%", "", str(x)) for x in msums["metric_value"]] + msums["metric_value"] = msums["metric_value"].astype(float) + + msums = msums.sort_values(["library_type", "metric_name", "sample_id"]) + return msums - + L.info("Reading in tsv file '%s'" % args.cellranger_column_conversion_df) -convert_df = pd.read_csv(args.cellranger_column_conversion_df, sep='\t') +convert_df = pd.read_csv(args.cellranger_column_conversion_df, sep="\t") # get the paths -pipe_df = pd.read_csv(args.pipe_df, sep='\t') -L.info('Searching for all cellranger paths') +pipe_df = pd.read_csv(args.pipe_df, sep="\t") +L.info("Searching for all cellranger paths") all_paths_df = get_all_unique_paths(pipe_df) -all_paths_df['metrics_summary_path'] = all_paths_df.apply(lambda x: get_metrics_summary_path(path=x.path, sample_id=x.sample_id), axis=1) +all_paths_df["metrics_summary_path"] = all_paths_df.apply( + lambda x: get_metrics_summary_path(path=x.path, sample_id=x.sample_id), axis=1 +) + +L.warning(f"pipe_df columns\n{pipe_df.columns}") +L.warning(f"pipe df\n{pipe_df.to_markdown()}") +L.warning(f"all_paths_df\n{all_paths_df.to_markdown()}") +x = all_paths_df["metrics_summary_path"] +L.warning(f"metrics summary path of all_paths_df\n{x.to_markdown()}") + # all_paths_df_uniq = all_paths_df.drop(columns=['path', 'path_type']).drop_duplicates().reset_index(drop=True) -all_paths_df['cellranger_type'] = [detect_cellranger_algorithm(x) for x in all_paths_df['metrics_summary_path']] +all_paths_df["cellranger_type"] = [ + detect_cellranger_algorithm(x) for x in all_paths_df["metrics_summary_path"] +] L.info("Cellranger metrics_summary files found: %s" % all_paths_df) # parse and concatenate the tenx x metrics summary files into long format L.info("Parsing and concatenating the 10X metrics summary files into long format") tenx_metrics = [] -if any(all_paths_df['cellranger_type']=='multi'): - tenx_metrics.append(parse_10x_cellranger_multi(all_paths_df[all_paths_df['cellranger_type']=='multi'])) - -if any(all_paths_df['cellranger_type']=='count'): - tenx_metrics.append(parse_10x_cellranger_count(all_paths_df[all_paths_df['cellranger_type']=='count'], convert_df)) +if any(all_paths_df["cellranger_type"] == "multi"): + tenx_metrics.append( + parse_10x_cellranger_multi( + all_paths_df[all_paths_df["cellranger_type"] == "multi"] + ) + ) + +if any(all_paths_df["cellranger_type"] == "count"): + tenx_metrics.append( + parse_10x_cellranger_count( + all_paths_df[all_paths_df["cellranger_type"] == "count"], convert_df + ) + ) tenx_metrics_full = pd.concat(tenx_metrics) -tenx_metrics_full = tenx_metrics_full.sort_values(['library_type', 'metric_name']) +tenx_metrics_full = tenx_metrics_full.sort_values(["library_type", "metric_name"]) L.info("Done. Saving to '%s'" % args.output_file) tenx_metrics_full.to_csv(args.output_file, index=False) # tenx_metrics['metric_value'] = [re.sub(",|%", "", x) for x in tenx_metrics['metric_value']] # tenx_metrics['metric_value'] = tenx_metrics['metric_value'].astype(float) -# split by library_type and metric_name +# split by library_type and metric_name L.info("Plotting metrics for each library_type and metric_name") -for idx, row in tenx_metrics_full[['library_type','metric_name']].drop_duplicates().iterrows(): - mn = row['metric_name'] - lt = row['library_type'] - plt_df = tenx_metrics_full[(tenx_metrics_full['library_type'] == lt) & (tenx_metrics_full['metric_name'] == mn)] +for idx, row in ( + tenx_metrics_full[["library_type", "metric_name"]].drop_duplicates().iterrows() +): + mn = row["metric_name"] + lt = row["library_type"] + plt_df = tenx_metrics_full[ + (tenx_metrics_full["library_type"] == lt) + & (tenx_metrics_full["metric_name"] == mn) + ] if len(plt_df.category.unique()) > 1: # if multiple categories i.e. Cells and Libraries, these are likely duplicate rows - plt_df = plt_df[['sample_id', 'metric_name', 'library_type' , 'metric_value']].drop_duplicates() - if len(plt_df['sample_id']) == len(plt_df['sample_id'].unique()): + plt_df = plt_df[ + ["sample_id", "metric_name", "library_type", "metric_value"] + ].drop_duplicates() + if len(plt_df["sample_id"]) == len(plt_df["sample_id"].unique()): L.info("Plotting barplot for library_type %s and metric_name %s" % (lt, mn)) - fig = sns.barplot(plt_df, x='sample_id', y='metric_value', color='grey') + fig = sns.barplot(plt_df, x="sample_id", y="metric_value", color="grey") fig.set_xticklabels(fig.get_xticklabels(), rotation=90) - fig.set_title(lt + ':' + mn) - plt.savefig(os.path.join(args.figdir, lt + '-' + mn + '.png'), bbox_inches='tight') + fig.set_title(lt + ":" + mn) + plt.savefig( + os.path.join(args.figdir, lt + "-" + mn + ".png"), bbox_inches="tight" + ) else: # do a boxplot instead L.info("Plotting boxplot for library_type %s and metric_name %s" % (lt, mn)) - fig = sns.boxplot(plt_df, x='sample_id', y='metric_value', color='grey') + fig = sns.boxplot(plt_df, x="sample_id", y="metric_value", color="grey") fig.set_xticklabels(fig.get_xticklabels(), rotation=90) - fig.set_title(lt + ':' + mn) - plt.savefig(os.path.join(args.figdir, lt + '-' + mn + '.png'), bbox_inches='tight') + fig.set_title(lt + ":" + mn) + plt.savefig( + os.path.join(args.figdir, lt + "-" + mn + ".png"), bbox_inches="tight" + ) plt.clf() ## gene expression extra plots ----------------------------- -rna_tenx_metrics = tenx_metrics_full[tenx_metrics_full['library_type'] == 'Gene Expression'] +rna_tenx_metrics = tenx_metrics_full[ + tenx_metrics_full["library_type"] == "Gene Expression" +] # sequencing_saturaion plot scatter plot -plot_metrics = ['Sequencing saturation', 'Mean reads per cell', 'Estimated number of cells', 'Number of reads', 'Median UMI counts per cell'] +plot_metrics = [ + "Sequencing saturation", + "Mean reads per cell", + "Estimated number of cells", + "Number of reads", + "Median UMI counts per cell", +] plt_df = rna_tenx_metrics[rna_tenx_metrics.metric_name.isin(plot_metrics)] -plt_df = plt_df[['sample_id', 'metric_name', 'metric_value']] -plt_tab = plt_df.pivot_table(index='sample_id', columns='metric_name', values='metric_value', aggfunc=sum) -# sns.scatterplot(data=plt_tab, -# x='Sequencing saturation', +plt_df = plt_df[["sample_id", "metric_name", "metric_value"]] +plt_tab = plt_df.pivot_table( + index="sample_id", columns="metric_name", values="metric_value", aggfunc=sum +) +# sns.scatterplot(data=plt_tab, +# x='Sequencing saturation', # y='Number of reads', size='Mean reads per cell',sizes=(20, 200)) -if 'Sequencing saturation' in plt_df.metric_name.unique(): +if "Sequencing saturation" in plt_df.metric_name.unique(): L.info("Plotting scatter plot of sequencing saturation against number of reads") f, ax = plt.subplots() - points = ax.scatter(x=plt_tab['Sequencing saturation'], - y=plt_tab['Number of reads'], - c=plt_tab['Estimated number of cells']) + points = ax.scatter( + x=plt_tab["Sequencing saturation"], + y=plt_tab["Number of reads"], + c=plt_tab["Estimated number of cells"], + ) # define labels - plt.xlabel('Sequencing saturation') - plt.ylabel('Number of reads') + plt.xlabel("Sequencing saturation") + plt.ylabel("Number of reads") # define color bar cbar = f.colorbar(points) cbar.ax.get_yaxis().labelpad = 15 - cbar.ax.set_ylabel('Estimated # of cells', rotation=270) + cbar.ax.set_ylabel("Estimated # of cells", rotation=270) # put it on a grid - plt.grid(axis='both', color='0.95') + plt.grid(axis="both", color="0.95") # save - plt.savefig(os.path.join(args.figdir,'10x_sequencing_saturation_summary.png')) + plt.savefig(os.path.join(args.figdir, "10x_sequencing_saturation_summary.png")) plt.clf() # plot 2 - cells vs UMI counts per cell -L.info("Plotting scatter plot of estimated number of cells against median UMI counts per cell") +L.info( + "Plotting scatter plot of estimated number of cells against median UMI counts per cell" +) fig, ax = plt.subplots() -x=list(np.log10(plt_tab['Estimated number of cells'])) -y=list(np.log10(plt_tab['Median UMI counts per cell'])) -ax.scatter(x=x, - y=y) +x = list(np.log10(plt_tab["Estimated number of cells"])) +y = list(np.log10(plt_tab["Median UMI counts per cell"])) +ax.scatter(x=x, y=y) # annotate each point for i, txt in enumerate(plt_tab.index): - ax.annotate(txt, (x[i], y[i]), size=8, ha='center') -plt.xlabel('log10(Estimated number of cells)') -plt.ylabel('log10(Median UMI counts per cell)') + ax.annotate(txt, (x[i], y[i]), size=8, ha="center") +plt.xlabel("log10(Estimated number of cells)") +plt.ylabel("log10(Median UMI counts per cell)") # put a boundary box around the good cells -plt.vlines(x=(np.log10(500), np.log10(20000)), - ymin=min(1.5, min(y)), - ymax=max(5, max(y)), - colors='black', linestyles='dotted' - ) -plt.hlines(y=(3, 4), - xmin=min(1.5, min(x)), - xmax=max(5, max(x)), - colors='black', linestyles='dotted' +plt.vlines( + x=(np.log10(500), np.log10(20000)), + ymin=min(1.5, min(y)), + ymax=max(5, max(y)), + colors="black", + linestyles="dotted", +) +plt.hlines( + y=(3, 4), + xmin=min(1.5, min(x)), + xmax=max(5, max(x)), + colors="black", + linestyles="dotted", ) # put it on a grid -plt.grid(axis='both', color='0.95') +plt.grid(axis="both", color="0.95") # save -plt.savefig(os.path.join(args.figdir,'10x_cells_by_UMIs.png')) +plt.savefig(os.path.join(args.figdir, "10x_cells_by_UMIs.png")) plt.clf() -L.info("Done") \ No newline at end of file +L.info("Done") From 4ce06ac0c471813f1dd30d6bc3fa49b921d1e205 Mon Sep 17 00:00:00 2001 From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com> Date: Tue, 4 Feb 2025 12:13:08 +0100 Subject: [PATCH 07/10] refactoring: style and error messages --- panpipes/python_scripts/concat_adata.py | 184 +++++++++++++----------- 1 file changed, 98 insertions(+), 86 deletions(-) diff --git a/panpipes/python_scripts/concat_adata.py b/panpipes/python_scripts/concat_adata.py index 0fab949d..13846df1 100644 --- a/panpipes/python_scripts/concat_adata.py +++ b/panpipes/python_scripts/concat_adata.py @@ -1,64 +1,57 @@ -import scanpy as sc -import glob -import pandas as pd import argparse +import logging import re -from muon import MuData +import sys +import warnings + import muon as mu +import pandas as pd -import warnings -# warnings.filterwarnings("ignore", category=UserWarning) -# warnings.filterwarnings("ignore", category=FutureWarning) +from panpipes.funcs.processing import concat_mdatas -from panpipes.funcs.processing import concat_mdatas +pd.set_option("display.max_rows", None) -pd.set_option('display.max_rows', None) -# import os -# os.chdir("../../data/run_qc_general") -import sys -import logging L = logging.getLogger() L.setLevel(logging.INFO) log_handler = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s') +formatter = logging.Formatter("%(asctime)s: %(levelname)s - %(message)s") log_handler.setFormatter(formatter) L.addHandler(log_handler) - parser = argparse.ArgumentParser() -parser.add_argument('--input_files_str', - default='', - help='') -parser.add_argument('--output_file', - default='', - help='mdata.h5mu') -parser.add_argument("--submissionfile", - default="caf_example", - help="this file has all the samples of the experiment. it has at least 3 columns: \ +parser.add_argument("--input_files_str", default="", help="") +parser.add_argument("--output_file", default="", help="mdata.h5mu") +parser.add_argument( + "--submissionfile", + default="caf_example", + help="this file has all the samples of the experiment. it has at least 3 columns: \ sample_id,path and type. \ - it can contain other anonymous columns which will be renamed using the metadatacols information") -parser.add_argument("--sampleprefix", - default="", - help="prefix to prepend when saving the metadata file") -parser.add_argument("--metadatacols", - default=None, - help="names of the metadata to rename the columns of the sample submission file ") -parser.add_argument("--join_type", - default="inner", - help="whether to concat anndata on the 'inner' (default) or 'outer'") -parser.add_argument('--barcode_mtd_df', - default=None, - help='csv file contining barcode level metadata') -parser.add_argument('--barcode_mtd_metadatacols', - default=None, - help='comma separated strings listing the column you want to keep in barcode_mtd_df') -parser.add_argument('--protein_var_table', - default=None, - help='') -parser.add_argument('--protein_new_index_col', - default=None, - help='') + it can contain other anonymous columns which will be renamed using the metadatacols information", +) +parser.add_argument( + "--sampleprefix", default="", help="prefix to prepend when saving the metadata file" +) +parser.add_argument( + "--metadatacols", + default=None, + help="names of the metadata to rename the columns of the sample submission file ", +) +parser.add_argument( + "--join_type", + default="inner", + help="whether to concat anndata on the 'inner' (default) or 'outer'", +) +parser.add_argument( + "--barcode_mtd_df", default=None, help="csv file contining barcode level metadata" +) +parser.add_argument( + "--barcode_mtd_metadatacols", + default=None, + help="comma separated strings listing the column you want to keep in barcode_mtd_df", +) +parser.add_argument("--protein_var_table", default=None, help="") +parser.add_argument("--protein_new_index_col", default=None, help="") parser.set_defaults(verbose=True) args, opt = parser.parse_known_args() @@ -67,10 +60,10 @@ if len(args.input_files_str) == 0: L.error("No input files detected") sys.exit("No input files detected") - -lf = re.split(',', args.input_files_str) -sfile=args.submissionfile +lf = re.split(",", args.input_files_str) + +sfile = args.submissionfile # lf = glob.glob("./tmp/*raw.h5mu") # sfile= "CARTC_bonemarrow_samples.tsv" @@ -83,7 +76,7 @@ # [x.var_names_make_unique() for x in mdatas] -if len(mdatas)==1: +if len(mdatas) == 1: # if mdatas length is 0 then no concatenation required mdata = mdatas[0] else: @@ -91,15 +84,13 @@ # with the atac, should it just error out to say # that atac shouldn't be concatenated? temp = mdatas[0] - if 'atac' in temp.mod.keys(): - mdata=mdatas[0] + if "atac" in temp.mod.keys(): + mdata = mdatas[0] del temp - elif 'prot' in temp.mod.keys() or 'rna' in temp.mod.keys(): + elif "prot" in temp.mod.keys() or "rna" in temp.mod.keys(): ## IF RNA and PROT is ok to concatenate ---------- L.info("Concatenating RNA and prot") - mdata = concat_mdatas(mdatas, - batch_key="sample_id", - join_type=args.join_type) + mdata = concat_mdatas(mdatas, batch_key="sample_id", join_type=args.join_type) # remove to avoid mem issues del mdatas @@ -113,20 +104,22 @@ metadatacols = args.metadatacols # metadatacols="mdata_cols" # check sample_id is in metadatacols as a minimum requirement -if metadatacols == None : +if metadatacols is None: pass -elif metadatacols == 'None': +elif metadatacols.lower() == "none": pass else: - metadatacols = metadatacols.split(',') - if 'sample_id' not in metadatacols: - metadatacols = ['sample_id'] + metadatacols + metadatacols = metadatacols.split(",") + + if "sample_id" not in metadatacols: + metadatacols = ["sample_id"] + metadatacols + # check all metadatacols are in the caf if all([x in caf.columns for x in metadatacols]): - # if mdata is actually a mudata it will be helpful - # (although annoying to store this in rna and protein as + # if mdata is actually a mudata it will be helpful + # (although annoying to store this in rna and protein as # well since the batch columns etc will be here) - # hopefullt his whole bit will become unnesacry because of + # hopefullt his whole bit will become unnesacry because of # https://github.com/scverse/mudata/issues/2 for mod in mdata.mod.keys(): L.debug(mod) @@ -134,21 +127,36 @@ assay = mdata[mod] # make sure there is no index name because rep might have one as default assay.obs.index.name = None - assay.obs = assay.obs.reset_index().merge(caf[metadatacols], on="sample_id", how="left").set_index('index').copy() + assay.obs = ( + assay.obs.reset_index() + .merge(caf[metadatacols], on="sample_id", how="left") + .set_index("index") + .copy() + ) # remove the index name introduced with the set index. assay.obs.index_name = None L.debug(assay.obs.head()) + mdata.update_obs() + # merge metadata # obs = mdata.obs - # obs['cellbarcode'] = obs.index - mdata.obs=mdata.obs.reset_index().merge(caf[metadatacols], on="sample_id", how="left").set_index('index') + # obs['cellbarcode'] = obs.index + mdata.obs = ( + mdata.obs.reset_index() + .merge(caf[metadatacols], on="sample_id", how="left") + .set_index("index") + ) + mdata.obs.index.name = None - mdata.obs['cellbarcode'] = mdata.obs.index + + mdata.obs["cellbarcode"] = mdata.obs.index + mdata.update_obs() else: # which column is missing? - missingcols= " ".join([x for x in metadatacols if x not in caf.columns]) + missingcols = " ".join([x for x in metadatacols if x not in caf.columns]) + sys.exit(caf.columns) L.error("Required columns missing form samples file: %s" % missingcols) sys.exit("Required columns missing form samples file: %s" % missingcols) @@ -157,22 +165,26 @@ # col_update = mdata['rep'].obs.columns[mdata['rep'].obs.columns.str.contains("count")] # mdata['rep'].obs[col_update] = mdata['rep'].obs[col_update].apply(pd.to_numeric) # mdata.update() - + # add in cell level metadata -if args.barcode_mtd_df is not None : +if args.barcode_mtd_df is not None: L.info("Adding barcode level metadata") # check demult_metadatacols exists and contains antibody - barcode_metadatacols = args.barcode_mtd_metadatacols.split(',') + barcode_metadatacols = args.barcode_mtd_metadatacols.split(",") # load the demultiplexing data barcode_mtd_df = pd.read_csv(args.barcode_mtd_df, index_col=None) - if len(barcode_mtd_df['sample_id'].unique().tolist()) > 1: - barcode_mtd_df.index = barcode_mtd_df[['barcode_id', 'sample_id']].apply(lambda row: "-".join(row.values.astype(str)), axis=1) + if len(barcode_mtd_df["sample_id"].unique().tolist()) > 1: + barcode_mtd_df.index = barcode_mtd_df[["barcode_id", "sample_id"]].apply( + lambda row: "-".join(row.values.astype(str)), axis=1 + ) else: - barcode_mtd_df.index = barcode_mtd_df['barcode_id'] - barcode_mtd_df= barcode_mtd_df.drop(columns=['sample_id']) + barcode_mtd_df.index = barcode_mtd_df["barcode_id"] + barcode_mtd_df = barcode_mtd_df.drop(columns=["sample_id"]) # add for each modality for mod in mdata.mod.keys(): - mdata[mod].obs = mdata[mod].obs.merge(barcode_mtd_df, left_index=True, right_index=True) + mdata[mod].obs = mdata[mod].obs.merge( + barcode_mtd_df, left_index=True, right_index=True + ) mdata.update() # add at top level mdata.obs = mdata.obs.merge(barcode_mtd_df, left_index=True, right_index=True) @@ -184,28 +196,28 @@ if args.protein_var_table is not None: try: L.info("Reading in protein var table from '%s'" % args.protein_var_table) - df = pd.read_csv(args.protein_var_table, sep='\t', index_col=0) + df = pd.read_csv(args.protein_var_table, sep="\t", index_col=0) L.info("Merging protein table with var") # add_var_mtd(mdata['prot'], df) - var_df = mdata['prot'].var.merge(df, left_index=True, right_index=True) + var_df = mdata["prot"].var.merge(df, left_index=True, right_index=True) if args.protein_new_index_col is not None: L.info("Updating prot.var index") # update_var_index(mdata['prot'], args.protein_new_index_col) - var_df =var_df.reset_index().set_index(args.protein_new_index_col) - var_df = var_df.rename(columns={'index':'orig_id'}) + var_df = var_df.reset_index().set_index(args.protein_new_index_col) + var_df = var_df.rename(columns={"index": "orig_id"}) var_df.index.name = None - mdata['prot'].var = var_df + mdata["prot"].var = var_df mdata.update_var() mdata.update() # we might want to split hashing antibodies into a separate modalities # we assume this has been inidicated in a "hashing_ab" column in the protein metadata file - if "hashing_ab" in mdata['prot'].var.columns: + if "hashing_ab" in mdata["prot"].var.columns: # create new modality for hashing - #mdata.mod["hashing_ab"]=mdata["prot"][:, mdata["prot"].var["hashing_ab"]] + # mdata.mod["hashing_ab"]=mdata["prot"][:, mdata["prot"].var["hashing_ab"]] # subset old modality to remove hashing - mdata.mod['prot'] = mdata["prot"][:, ~mdata["prot"].var["hashing_ab"]] + mdata.mod["prot"] = mdata["prot"][:, ~mdata["prot"].var["hashing_ab"]] except FileNotFoundError: - warnings.warn("Protein metadata table not found") + warnings.warn(f"Protein metadata table not found {args.protein_var_table}") mdata.update() # tidy up metadata # move sample_id to the front From a21a9de7c787cefbf36d9f51160a3f34aa59cc7a Mon Sep 17 00:00:00 2001 From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com> Date: Tue, 4 Feb 2025 12:18:06 +0100 Subject: [PATCH 08/10] fix & refactoring: double group_by; style; replace prints with cats --- panpipes/R_scripts/plotQC.R | 756 ++++++++++++++++++++---------------- 1 file changed, 422 insertions(+), 334 deletions(-) diff --git a/panpipes/R_scripts/plotQC.R b/panpipes/R_scripts/plotQC.R index ecff7b14..32a8ce9b 100644 --- a/panpipes/R_scripts/plotQC.R +++ b/panpipes/R_scripts/plotQC.R @@ -1,139 +1,179 @@ -# plotting utilities for qc +# plotting utilities for qc suppressPackageStartupMessages({ library(tidyverse) library(optparse) }) -do_scatter_plot <- function(df, x, y, facet=NULL, hue=NULL){ - g <- df %>% - ggplot(aes_string(x=x,y=y, color=hue)) + - geom_point(size=0.5) - if (!is.null(facet)){ - g <- g + facet_wrap(as.formula(paste0("~", sc)), ncol=6) +do_scatter_plot <- function(df, x, y, facet = NULL, hue = NULL) { + g <- df %>% + ggplot(aes_string(x = x, y = y, color = hue)) + + geom_point(size = 0.5) + if (!is.null(facet)) { + g <- g + facet_wrap(as.formula(paste0("~", sc)), ncol = 6) } - - if(is.numeric(df[hue])){ + + if (is.numeric(df[hue])) { g <- g + scale_color_viridis_c() } - g <- g+ theme_bw()+ - theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5), - strip.text.x = element_text(size=6), - legend.key.size = unit(0.2, 'cm')) + g <- g + theme_bw() + + theme( + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), + strip.text.x = element_text(size = 6), + legend.key.size = unit(0.2, "cm") + ) return(g) } -do_violin_plot <- function(df, qc, group){ - g <- df %>% drop_na() %>% - ggplot(aes_string(x="sample_id",y=qc)) + - geom_violin(aes_string(fill=group)) + - {if(qc=="doublet_scores") geom_hline(yintercept=0.25, color="grey50", linetype="dashed") }+ - {if("pct_counts" %in% qc) geom_hline(yintercept=c(5, 10, 20 ,70), color="grey50", linetype="dashed")}+ - {if("pct_counts" %in% qc) coord_cartesian(ylim=c(0,100))}+ - theme_bw()+ - theme(axis.text.x=element_text(size=8, angle=90), - axis.text.y=element_text(size=8)) - if(length(unique(df[[group]])) > 10){ - print(paste(sc, "has too many categories, removing legend")) - g <- g + theme(legend.position="none") +do_violin_plot <- function(df, qc, group) { + g <- df %>% + drop_na() %>% + ggplot(aes_string(x = "sample_id", y = qc)) + + geom_violin(aes_string(fill = group)) + + { + if (qc == "doublet_scores") geom_hline(yintercept = 0.25, color = "grey50", linetype = "dashed") + } + + { + if ("pct_counts" %in% qc) geom_hline(yintercept = c(5, 10, 20, 70), color = "grey50", linetype = "dashed") + } + + { + if ("pct_counts" %in% qc) coord_cartesian(ylim = c(0, 100)) + } + + theme_bw() + + theme( + axis.text.x = element_text(size = 8, angle = 90), + axis.text.y = element_text(size = 8) + ) + if (length(unique(df[[group]])) > 10) { + cat(sc, "has too many categories, removing legend\n") + g <- g + theme(legend.position = "none") } - + return(g) } -do_bar_plot <- function(df, qc, group){ +do_bar_plot <- function(df, qc, group) { g <- df %>% - ggplot(aes_string(x=group, fill=qc)) + - geom_bar(aes_string()) + - theme_bw()+ - theme(axis.text.x=element_text(size=8, angle=90), - axis.text.y=element_text(size=8)) -# if(length(unique(df[[group]])) > 10){ -# message(paste0(sc, "has too many categories, removing legend")) -# g <- g + theme(legend.position="none") -# } - + ggplot(aes_string(x = group, fill = qc)) + + geom_bar(aes_string()) + + theme_bw() + + theme( + axis.text.x = element_text(size = 8, angle = 90), + axis.text.y = element_text(size = 8) + ) + # if(length(unique(df[[group]])) > 10){ + # message(paste0(sc, "has too many categories, removing legend")) + # g <- g + theme(legend.position="none") + # } + return(g) } options(stringsAsFactors = F) -options(bitmaptype="cairo") +options(bitmaptype = "cairo") option_list <- list( - make_option(c("--cell_metadata"), default=NULL, - help="the path to the anndata object"), - make_option(c("--groupingvar"), default="sample_id,tissue,patient,channel", - help="names of grouping variables"), - make_option(c("--rna_qc_metrics"), default=NULL, - help="the qc_metrics to plot"), - make_option(c("--prot_qc_metrics"), default=NULL, - help="the qc_metrics to plot"), - make_option(c("--rep_qc_metrics"), default=NULL, - help="the qc_metrics to plot"), - make_option(c("--atac_qc_metrics"), default=NULL, - help="the qc_metrics to plot"), - make_option(c("--outdir"), default="./figures/", - help="the name of the output folder"), - make_option(c("--prefilter"), default=TRUE, - help="am i parsing data before or after filtering?"), - make_option(c("--sampleprefix"), default="", - help="the prefix to prepend to save summary filtering plots"), - make_option(c("--scanpy_or_muon"), default="scanpy", - help="was the input file written out from the obs of scanpy or muon") + make_option(c("--cell_metadata"), + default = NULL, + help = "the path to the anndata object" + ), + make_option(c("--groupingvar"), + default = "sample_id,tissue,patient,channel", + help = "names of grouping variables" + ), + make_option(c("--rna_qc_metrics"), + default = NULL, + help = "the qc_metrics to plot" + ), + make_option(c("--prot_qc_metrics"), + default = NULL, + help = "the qc_metrics to plot" + ), + make_option(c("--rep_qc_metrics"), + default = NULL, + help = "the qc_metrics to plot" + ), + make_option(c("--atac_qc_metrics"), + default = NULL, + help = "the qc_metrics to plot" + ), + make_option(c("--outdir"), + default = "./figures/", + help = "the name of the output folder" + ), + make_option(c("--prefilter"), + default = TRUE, + help = "am i parsing data before or after filtering?" + ), + make_option(c("--sampleprefix"), + default = "", + help = "the prefix to prepend to save summary filtering plots" + ), + make_option(c("--scanpy_or_muon"), + default = "scanpy", + help = "was the input file written out from the obs of scanpy or muon" + ) ) -opt <- parse_args(OptionParser(option_list=option_list)) +opt <- parse_args(OptionParser(option_list = option_list)) -if(is.null(opt$outdir)) { opt$outdir <- paste0(getwd(),"/")} -if(!grepl("\\/$", opt$outdir)){opt$outdir <- paste(opt$outdir, "/", sep = "")} -if(!file.exists(opt$outdir)){dir.create(opt$outdir)} +if (is.null(opt$outdir)) { + opt$outdir <- paste0(getwd(), "/") +} +if (!grepl("\\/$", opt$outdir)) { + opt$outdir <- paste(opt$outdir, "/", sep = "") +} +if (!file.exists(opt$outdir)) { + dir.create(opt$outdir) +} -run<- opt$outdir -opt[which(opt=="NULL")] <- NULL -opt[which(opt=="None")] <- NULL +run <- opt$outdir +opt[which(opt == "NULL")] <- NULL +opt[which(opt == "None")] <- NULL opt$prefilter <- as.logical(opt$prefilter) # load_data -------------------------------------------------------------------- -data_plot = read.delim(opt$cell_metadata) +data_plot <- read.delim(opt$cell_metadata) # define source facet for all plots -if (!is.null(opt$groupingvar)){ - source_facet <- strsplit(opt$groupingvar,",")[[1]] - check_excl <- source_facet[!source_facet %in% colnames(data_plot)] +if (!is.null(opt$groupingvar)) { + source_facet <- strsplit(opt$groupingvar, ",")[[1]] + check_excl <- source_facet[!source_facet %in% colnames(data_plot)] keep_source <- NULL - for (cc in check_excl){ - for (mod in c("rna","prot","atac","rep")){ - id <- paste(mod,cc,sep=".") - if(id %in% colnames(data_plot)){ - keep_source <- c(keep_source,id) + for (cc in check_excl) { + for (mod in c("rna", "prot", "atac", "rep")) { + id <- paste(mod, cc, sep = ".") + if (id %in% colnames(data_plot)) { + keep_source <- c(keep_source, id) } } } source_facet <- source_facet[source_facet %in% colnames(data_plot)] - source_facet <- unique(c(source_facet, keep_source) ) - if(length(source_facet)>0){ + + source_facet <- unique(c(source_facet, keep_source)) + if (length(source_facet) > 0) { # add sample_id as a minimum requirement if it's not there already - source_facet = unique(c("sample_id", source_facet)) - print(paste("Facet plotting with",source_facet)) + source_facet <- unique(c("sample_id", source_facet)) + cat("Facet plotting with:", source_facet, "\n") } -}else{ +} else { stop("Need the minimum variable _sampleid_ to facet on, will stop here") } # RNA plots -------------------------------------------------------------------- -print("RNA plots") +cat("RNA plots\n") -if(opt$scanpy_or_muon=="scanpy"){ +if (opt$scanpy_or_muon == "scanpy") { rna_data_plot <- data_plot -}else{ - rna_data_plot <- data_plot[,grep("^rna\\.",colnames(data_plot))] +} else { + rna_data_plot <- data_plot[, grep("^rna\\.", colnames(data_plot))] colnames(rna_data_plot) <- gsub("^rna\\.", "", colnames(rna_data_plot)) } -outpath = file.path(run, "rna") +outpath <- file.path(run, "rna") if (!dir.exists(outpath)) dir.create(outpath) @@ -143,68 +183,80 @@ if (!dir.exists(outpath)) dir.create(outpath) # "log1p_total_counts", # "n_genes_by_counts", # "log1p_n_genes_by_counts", -# "doublet_scores", -# "pct_counts_mt", -# "pct_counts_rp", +# "doublet_scores", +# "pct_counts_mt", +# "pct_counts_rp", # "pct_counts_ig", # "pct_counts_hb") # check these qc metrics are in the file if (!is.null(opt$rna_qc_metrics)) { - qcmetrics <- strsplit(opt$rna_qc_metrics,",")[[1]] + qcmetrics <- strsplit(opt$rna_qc_metrics, ",")[[1]] } qcmetrics <- qcmetrics[qcmetrics %in% colnames(rna_data_plot)] uniq_sample_id <- nrow(unique(rna_data_plot["sample_id"])) -rna_source_facet <- gsub("^rna\\.", "",grep("^rna.", source_facet, value = TRUE)) +rna_source_facet <- gsub("^rna\\.", "", grep("^rna.", source_facet, value = TRUE)) rna_source_facet <- unique(c(rna_source_facet, source_facet[!grepl("^rna.", source_facet)])) rna_source_facet <- rna_source_facet[rna_source_facet %in% colnames(rna_data_plot)] -for (qc in qcmetrics){ - print(paste("Plotting violin plots of", qc)) - for (sc in rna_source_facet){ #add gsub temp here +for (qc in qcmetrics) { + cat("Plotting violin plots of", qc, "\n") + for (sc in rna_source_facet) { # add gsub temp here g <- do_violin_plot(rna_data_plot, qc, sc) - if (uniq_sample_id > 50){width=12}else{width=6} - ggsave(g, filename=file.path(outpath, paste0("violin_", sc, "_rna-", qc,".png")), type="cairo", width= width, height=6) - + if (uniq_sample_id > 50) { + width <- 12 + } else { + width <- 6 + } + ggsave(g, filename = file.path(outpath, paste0("violin_", sc, "_rna-", qc, ".png")), type = "cairo", width = width, height = 6) } } -for (sc in rna_source_facet){ +for (sc in rna_source_facet) { uniq_source <- nrow(unique(rna_data_plot[sc])) - if(uniq_source >6){ - ncols=6 - nrows=ceiling(uniq_source/6) - }else{ - ncols=uniq_source - nrows=1 + if (uniq_source > 6) { + ncols <- 6 + nrows <- ceiling(uniq_source / 6) + } else { + ncols <- uniq_source + nrows <- 1 } # plot once per source facet - if (all(c("total_counts","n_genes_by_counts")%in% colnames(rna_data_plot))){ - print("Plotting scatter plot of total_counts and n_genes_by_counts") - g <- do_scatter_plot(rna_data_plot,x="total_counts",y="n_genes_by_counts", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-nUMI_v_rna-genes.png")), type="cairo", - width= 3*ncols, height=3*nrows, dpi=200) + if (all(c("total_counts", "n_genes_by_counts") %in% colnames(rna_data_plot))) { + cat("Plotting scatter plot of total_counts and n_genes_by_counts\n") + g <- do_scatter_plot(rna_data_plot, x = "total_counts", y = "n_genes_by_counts", facet = sc) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "_rna-nUMI_v_rna-genes.png")), type = "cairo", + width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } - - if (all(c("log1p_total_counts","log1p_n_genes_by_counts")%in% colnames(rna_data_plot))){ - print("Plotting scatter plot of log1p_total_counts and n_genes_by_counts") - g <- do_scatter_plot(rna_data_plot,x="log1p_total_counts",y="log1p_n_genes_by_counts", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-log1p_nUMI_v_rna-log1p_genes.png")), type="cairo", - width= 3*ncols, height=3*nrows, dpi=200) + + if (all(c("log1p_total_counts", "log1p_n_genes_by_counts") %in% colnames(rna_data_plot))) { + cat("Plotting scatter plot of log1p_total_counts and n_genes_by_counts\n") + g <- do_scatter_plot(rna_data_plot, x = "log1p_total_counts", y = "log1p_n_genes_by_counts", facet = sc) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "_rna-log1p_nUMI_v_rna-log1p_genes.png")), type = "cairo", + width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } - if (all(c("total_counts","pct_counts_mt")%in% colnames(rna_data_plot))){ - print("Plotting scatter plot of total_counts and pct_counts_mt") - g <- do_scatter_plot(rna_data_plot,x="total_counts",y="pct_counts_mt", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-nUMI_v_rna-pct_mt.png")), type="cairo", - width= 3*ncols, height=3*nrows, dpi=200) + if (all(c("total_counts", "pct_counts_mt") %in% colnames(rna_data_plot))) { + cat("Plotting scatter plot of total_counts and pct_counts_mt\n") + g <- do_scatter_plot(rna_data_plot, x = "total_counts", y = "pct_counts_mt", facet = sc) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "_rna-nUMI_v_rna-pct_mt.png")), type = "cairo", + width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } - if (all(c("n_genes_by_counts","doublet_scores","total_counts")%in% colnames(rna_data_plot))){ - print("Plotting scatter plot of n_genes_by_counts and doublet_scores") - g <- do_scatter_plot(rna_data_plot,x="n_genes_by_counts",y="doublet_scores", - hue="total_counts", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-genes_rna-doublet_scores_rna-numi.png")), - type="cairo", width= 3*ncols, height=3*nrows, dpi=200) + if (all(c("n_genes_by_counts", "doublet_scores", "total_counts") %in% colnames(rna_data_plot))) { + cat("Plotting scatter plot of n_genes_by_counts and doublet_scores\n") + g <- do_scatter_plot(rna_data_plot, + x = "n_genes_by_counts", y = "doublet_scores", + hue = "total_counts", facet = sc + ) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "_rna-genes_rna-doublet_scores_rna-numi.png")), + type = "cairo", width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } - } @@ -213,300 +265,336 @@ for (sc in rna_source_facet){ -if(!is.null(opt$prot_qc_metrics)){ - print("Protein plots") +if (!is.null(opt$prot_qc_metrics)) { + cat("Protein plots\n") - qcmetrics <- strsplit(opt$prot_qc_metrics,",")[[1]] - prot_data_plot <- data_plot[,grep("^prot\\.",colnames(data_plot))] + qcmetrics <- strsplit(opt$prot_qc_metrics, ",")[[1]] + prot_data_plot <- data_plot[, grep("^prot\\.", colnames(data_plot))] colnames(prot_data_plot) <- gsub("^prot\\.", "", colnames(prot_data_plot)) - prot_source_facet <- gsub("^prot\\.", "",grep("^prot.", source_facet, value = TRUE)) + prot_source_facet <- gsub("^prot\\.", "", grep("^prot.", source_facet, value = TRUE)) prot_source_facet <- unique(c(prot_source_facet, source_facet[!grepl("^prot.", source_facet)])) prot_source_facet <- prot_source_facet[prot_source_facet %in% colnames(prot_data_plot)] - outpath = file.path(run, "prot") - if (!dir.exists(outpath)) { + outpath <- file.path(run, "prot") + if (!dir.exists(outpath)) { dir.create(outpath) - } + } uniq_sample_id <- nrow(unique(prot_data_plot["sample_id"])) # check these qc metrics are in the file qcmetrics <- qcmetrics[qcmetrics %in% colnames(prot_data_plot)] - for (qc in qcmetrics){ - print(paste("Plotting violin plots of", qc)) - for (sc in prot_source_facet){ + for (qc in qcmetrics) { + cat("Plotting violin plots of", qc, "\n") + for (sc in prot_source_facet) { g <- do_violin_plot(prot_data_plot, qc, sc) - if (uniq_sample_id > 50){width=12}else{width=6} - ggsave(g, filename=file.path(outpath, paste0("violin_", sc, "_prot-", qc,".png")), type="cairo", width= width, height=6) + if (uniq_sample_id > 50) { + width <- 12 + } else { + width <- 6 + } + ggsave(g, filename = file.path(outpath, paste0("violin_", sc, "_prot-", qc, ".png")), type = "cairo", width = width, height = 6) } } - + # do the following plots: # - prot:total_counts vs prot_nprot_by_counts # - prot:total_counts vs prot:isotype_counts # - prot:log1p_total_counts vs prot:log1p_isotype_counts # - prot:total_counts vs prot:pct_isotype_counts - - - for (sc in prot_source_facet){ + + + for (sc in prot_source_facet) { uniq_source <- nrow(unique(prot_data_plot[sc])) - if(uniq_source >6){ - ncols=6 - nrows=ceiling(uniq_source/6) - }else{ - ncols=uniq_source - nrows=1 + if (uniq_source > 6) { + ncols <- 6 + nrows <- ceiling(uniq_source / 6) + } else { + ncols <- uniq_source + nrows <- 1 } # plot once per source facet - if (all(c("total_counts","n_prot_by_counts")%in% colnames(prot_data_plot))){ - print("Plotting scatter plot of total_counts and n_prot_by_counts") - g <- do_scatter_plot(prot_data_plot,x="total_counts",y="n_prot_by_counts", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-prot.png")), type="cairo", - width= 3*ncols, height=3*nrows, dpi=200) + if (all(c("total_counts", "n_prot_by_counts") %in% colnames(prot_data_plot))) { + cat("Plotting scatter plot of total_counts and n_prot_by_counts\n") + g <- do_scatter_plot(prot_data_plot, x = "total_counts", y = "n_prot_by_counts", facet = sc) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-prot.png")), type = "cairo", + width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } - if (all(c("log1p_total_counts","log1p_n_prot_by_counts")%in% colnames(prot_data_plot))){ - print("Plotting scatter plot of log1p_total_counts and log1p_n_prot_by_counts") - g <- do_scatter_plot(prot_data_plot,x="log1p_total_counts",y="log1p_n_prot_by_counts", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-log1p_nUMI_v_prot-log1p_prot.png")), type="cairo", - width= 3*ncols, height=3*nrows, dpi=200) + if (all(c("log1p_total_counts", "log1p_n_prot_by_counts") %in% colnames(prot_data_plot))) { + cat("Plotting scatter plot of log1p_total_counts and log1p_n_prot_by_counts\n") + g <- do_scatter_plot(prot_data_plot, x = "log1p_total_counts", y = "log1p_n_prot_by_counts", facet = sc) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "_prot-log1p_nUMI_v_prot-log1p_prot.png")), type = "cairo", + width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } - if (all(c("total_counts","pct_counts_isotype")%in% colnames(prot_data_plot))){ - print("Plotting scatter plot of total_counts and pct_counts_isotype") - g <- do_scatter_plot(prot_data_plot,x="total_counts",y="pct_counts_isotype", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-pct_isotype.png")), type="cairo", - width= 3*ncols, height=3*nrows, dpi=200) + if (all(c("total_counts", "pct_counts_isotype") %in% colnames(prot_data_plot))) { + cat("Plotting scatter plot of total_counts and pct_counts_isotype\n") + g <- do_scatter_plot(prot_data_plot, x = "total_counts", y = "pct_counts_isotype", facet = sc) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-pct_isotype.png")), type = "cairo", + width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } - if (all(c("total_counts","total_counts_isotype")%in% colnames(prot_data_plot))){ - print("Plotting scatter plot of total_counts and total_counts_isotype") - g <- do_scatter_plot(prot_data_plot,x="total_counts",y="total_counts_isotype", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-total_counts_isotype.png")), type="cairo", - width= 3*ncols, height=3*nrows, dpi=200) + if (all(c("total_counts", "total_counts_isotype") %in% colnames(prot_data_plot))) { + cat("Plotting scatter plot of total_counts and total_counts_isotype\n") + g <- do_scatter_plot(prot_data_plot, x = "total_counts", y = "total_counts_isotype", facet = sc) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-total_counts_isotype.png")), type = "cairo", + width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } - if (all(c("log1p_total_counts","log1p_total_counts_isotype", "isotype_exclude")%in% colnames(prot_data_plot))){ - print("Plotting scatter plot of log1p_total_counts and log1p_total_counts_isotype") - g <- do_scatter_plot(prot_data_plot,x="log1p_total_counts",y="log1p_total_counts_isotype", hue="isotype_exclude", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-log1p_nUMI_v_prot-log1p_total_counts_isotype.png")), type="cairo", - width= 3*ncols, height=3*nrows, dpi=200) + if (all(c("log1p_total_counts", "log1p_total_counts_isotype", "isotype_exclude") %in% colnames(prot_data_plot))) { + cat("Plotting scatter plot of log1p_total_counts and log1p_total_counts_isotype\n") + g <- do_scatter_plot(prot_data_plot, x = "log1p_total_counts", y = "log1p_total_counts_isotype", hue = "isotype_exclude", facet = sc) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "_prot-log1p_nUMI_v_prot-log1p_total_counts_isotype.png")), type = "cairo", + width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } } - } # Atac plots ---------------------------------------------------------------- -if(!is.null(opt$atac_qc_metrics)){ - print("ATAC plots") - atac_data_plot <- data_plot[,grep("^atac\\.",colnames(data_plot))] +if (!is.null(opt$atac_qc_metrics)) { + cat("ATAC plots\n") + atac_data_plot <- data_plot[, grep("^atac\\.", colnames(data_plot))] colnames(atac_data_plot) <- gsub("^atac\\.", "", colnames(atac_data_plot)) - atac_source_facet <- gsub("^atac\\.", "",grep("^atac.", source_facet, value = TRUE)) + atac_source_facet <- gsub("^atac\\.", "", grep("^atac.", source_facet, value = TRUE)) atac_source_facet <- unique(c(atac_source_facet, source_facet[!grepl("^atac.", source_facet)])) atac_source_facet <- atac_source_facet[atac_source_facet %in% colnames(atac_data_plot)] - outpath = file.path(run, "atac") + outpath <- file.path(run, "atac") if (!dir.exists(outpath)) dir.create(outpath) - + uniq_sample_id <- nrow(unique(atac_data_plot["sample_id"])) # check these qc metrics are in the file qcmetrics <- qcmetrics[qcmetrics %in% colnames(atac_data_plot)] - for (qc in qcmetrics){ + for (qc in qcmetrics) { print(paste("Plotting violin plots of", qc)) - for (sc in atac_source_facet){ + for (sc in atac_source_facet) { g <- do_violin_plot(atac_data_plot, qc, sc) - if (uniq_sample_id > 50){width=12}else{width=6} - ggsave(g, filename=file.path(outpath, paste0("violin_", sc, "_atac-", qc,".png")), type="cairo", width= width, height=6) + if (uniq_sample_id > 50) { + width <- 12 + } else { + width <- 6 + } + ggsave(g, filename = file.path(outpath, paste0("violin_", sc, "_atac-", qc, ".png")), type = "cairo", width = width, height = 6) } } } - + # Rep plots ---------------------------------------------------------------- if (!is.null(opt$rep_qc_metrics)) { - print("Repertoire plots") - qcmetrics <- strsplit(opt$rep_qc_metrics,",")[[1]] + cat("Repertoire plots\n") + qcmetrics <- strsplit(opt$rep_qc_metrics, ",")[[1]] qcmetrics <- gsub("rep:", "", qcmetrics) - rep_data_plot <- data_plot[,grep("^rep\\.",colnames(data_plot))] + rep_data_plot <- data_plot[, grep("^rep\\.", colnames(data_plot))] colnames(rep_data_plot) <- gsub("^rep\\.", "", colnames(rep_data_plot)) - rep_data_plot = rep_data_plot %>% filter(sample_id!="") - - rep_source_facet <- gsub("^rep\\.", "",grep("^rep.", source_facet, value = TRUE)) + rep_data_plot <- rep_data_plot %>% filter(sample_id != "") + + rep_source_facet <- gsub("^rep\\.", "", grep("^rep.", source_facet, value = TRUE)) rep_source_facet <- unique(c(rep_source_facet, source_facet[!grepl("^rep.", source_facet)])) rep_source_facet <- rep_source_facet[rep_source_facet %in% colnames(rep_data_plot)] - - outpath = file.path(run, "rep") + + outpath <- file.path(run, "rep") if (!dir.exists(outpath)) dir.create(outpath) - - + + uniq_sample_id <- nrow(unique(rep_data_plot["sample_id"])) # check these qc metrics are in the file qcmetrics <- qcmetrics[qcmetrics %in% colnames(rep_data_plot)] - for (qc in qcmetrics){ - print(paste("Plotting bar plots of", qc)) - for (sc in rep_source_facet){ + for (qc in qcmetrics) { + cat("Plotting bar plots of", qc. "\n") + for (sc in rep_source_facet) { g <- do_bar_plot(rep_data_plot, qc, sc) - if (uniq_sample_id > 50){width=12}else{width=6} - ggsave(g, filename=file.path(outpath, paste0("bar_", sc, "_rep-", qc,".png")), type="cairo", width= width, height=6) + if (uniq_sample_id > 50) { + width <- 12 + } else { + width <- 6 + } + ggsave(g, filename = file.path(outpath, paste0("bar_", sc, "_rep-", qc, ".png")), type = "cairo", width = width, height = 6) - if (!(qc %in% c('has_ir', "receptor_type"))){ + if (!(qc %in% c("has_ir", "receptor_type"))) { g <- do_bar_plot(rep_data_plot, qc, sc) + facet_grid(~receptor_type) - ggsave(g, filename=file.path(outpath, paste0("bar_facet_", sc, "_rep-", qc,".png")), type="cairo", width= width*4, height=6) - + ggsave(g, filename = file.path(outpath, paste0("bar_facet_", sc, "_rep-", qc, ".png")), type = "cairo", width = width * 4, height = 6) } } } } -if(!is.null(opt$prot_qc_metrics)){ -print("RNA vs. Protein plots") +if (!is.null(opt$prot_qc_metrics)) { + cat("RNA vs. Protein plots\n") -# rna vs prot plots ---------------------------------------------------------------- - + # rna vs prot plots ---------------------------------------------------------------- - outpath = file.path(run, "rna_v_prot") - if (!dir.exists(outpath)) dir.create(outpath) - - for (sc in source_facet){ + outpath <- file.path(run, "rna_v_prot") + if (!dir.exists(outpath)) dir.create(outpath) + + + for (sc in source_facet) { uniq_source <- nrow(unique(data_plot[sc])) - if(uniq_source >6){ - ncols=6 - nrows=ceiling(uniq_source/6) - }else{ - ncols=uniq_source - nrows=1 + if (uniq_source > 6) { + ncols <- 6 + nrows <- ceiling(uniq_source / 6) + } else { + ncols <- uniq_source + nrows <- 1 + } + if (all(c("rna.total_counts", "prot.total_counts") %in% colnames(data_plot))) { + cat("Plotting scatter plot of rna.total_counts and prot.total_counts\n") + g <- do_scatter_plot(data_plot, x = "rna.total_counts", y = "prot.total_counts", facet = sc) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "-nUMI_v_rna-nUMI.png")), type = "cairo", + width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } - if (all(c("rna.total_counts","prot.total_counts")%in% colnames(data_plot))){ - print("Plotting scatter plot of rna.total_counts and prot.total_counts") - g <- do_scatter_plot(data_plot,x="rna.total_counts",y="prot.total_counts", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-nUMI_v_rna-nUMI.png")), type="cairo", - width= 3*ncols, height=3*nrows, dpi=200) + if (all(c("rna.log1p_total_counts", "prot.log1p_total_counts") %in% colnames(data_plot))) { + cat("Plotting scatter plot of rna.log1p_total_counts and prot.log1p_total_counts\n") + g <- do_scatter_plot(data_plot, x = "rna.log1p_total_counts", y = "prot.log1p_total_counts", facet = sc) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "-log1p_nUMI_v_rna-log1p_nUMI.png")), type = "cairo", + width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } - if (all(c("rna.log1p_total_counts","prot.log1p_total_counts")%in% colnames(data_plot))){ - print("Plotting scatter plot of rna.log1p_total_counts and prot.log1p_total_counts") - g <- do_scatter_plot(data_plot,x="rna.log1p_total_counts",y="prot.log1p_total_counts", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-log1p_nUMI_v_rna-log1p_nUMI.png")), type="cairo", - width= 3*ncols, height=3*nrows, dpi=200) + if (all(c("rna.total_counts", "prot.total_counts_isotype") %in% colnames(data_plot))) { + cat("Plotting scatter plot of rna.total_counts and prot.total_counts_isotype\n") + g <- do_scatter_plot(data_plot, x = "rna.total_counts", y = "prot.total_counts_isotype", facet = sc) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "-nUMI_v_prot-counts_isotype.png")), type = "cairo", + width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } - if (all(c("rna.total_counts","prot.total_counts_isotype")%in% colnames(data_plot))){ - print("Plotting scatter plot of rna.total_counts and prot.total_counts_isotype") - g <- do_scatter_plot(data_plot,x="rna.total_counts",y="prot.total_counts_isotype", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-nUMI_v_prot-counts_isotype.png")), type="cairo", - width= 3*ncols, height=3*nrows, dpi=200) - } - if (all(c("rna.log1p_total_counts","prot.log1p_total_counts_isotype") %in% colnames(data_plot))){ - print("Plotting scatter plot of rna.log1p_total_counts and prot.log1p_total_counts_isotype") - g <- do_scatter_plot(data_plot,x="rna.log1p_total_counts",y="prot.log1p_total_counts_isotype", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-log1p_nUMI_v_prot-log1p_counts_isotype.png")), type="cairo", - width= 3*ncols, height=3*nrows, dpi=200) + if (all(c("rna.log1p_total_counts", "prot.log1p_total_counts_isotype") %in% colnames(data_plot))) { + cat("Plotting scatter plot of rna.log1p_total_counts and prot.log1p_total_counts_isotype\n") + g <- do_scatter_plot(data_plot, x = "rna.log1p_total_counts", y = "prot.log1p_total_counts_isotype", facet = sc) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "-log1p_nUMI_v_prot-log1p_counts_isotype.png")), type = "cairo", + width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } - if (all(c("rna.doublet_scores","prot.log1p_total_counts") %in% colnames(data_plot))){ - print("Plotting scatter plot of rna.doublet_scores and prot.log1p_total_counts") - g <- do_scatter_plot(data_plot,x="rna.doublet_scores",y="prot.log1p_total_counts", facet=sc) - ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-doublet_scores_v_prot-log1p_nUMI.png")), type="cairo", - width= 3*ncols, height=3*nrows, dpi=200) - + if (all(c("rna.doublet_scores", "prot.log1p_total_counts") %in% colnames(data_plot))) { + cat("Plotting scatter plot of rna.doublet_scores and prot.log1p_total_counts\n") + g <- do_scatter_plot(data_plot, x = "rna.doublet_scores", y = "prot.log1p_total_counts", facet = sc) + ggsave(g, + filename = file.path(outpath, paste0("scatter_", sc, "-doublet_scores_v_prot-log1p_nUMI.png")), type = "cairo", + width = 3 * ncols, height = 3 * nrows, dpi = 200 + ) } - - } - } sprefix <- opt$sampleprefix -if(opt$prefilter){ - print("Saving counts tables for references") - if(all(c("pct_counts_mt", "pct_counts_hb", "n_genes_by_counts", "doublet_scores") %in% colnames(rna_data_plot))){ - f1 <- rna_data_plot %>% - dplyr::filter(pct_counts_mt<=20 & pct_counts_hb<=70 &n_genes_by_counts>=100 & doublet_scores<=0.25) %>% - group_by_at(.vars=c(rna_source_facet)) %>% - group_by_at(.vars=c(rna_source_facet)) %>% - summarise(cell.count= n()) %>% - group_by_at(.vars="sample_id") %>% - rename(cell.count_f1=cell.count) - - f2 <- rna_data_plot %>% - dplyr::filter(pct_counts_mt<=10 & pct_counts_hb<=50 &n_genes_by_counts>=100 & doublet_scores<=0.25) %>% - group_by_at(.vars=c(rna_source_facet)) %>% - summarise(cell.count= n()) %>% - group_by_at(.vars="sample_id") %>% - rename(cell.count_f2=cell.count) - - f3 <- rna_data_plot %>% - dplyr::filter(pct_counts_mt<=5 & pct_counts_hb<=50 &n_genes_by_counts>=100 & n_genes_by_counts<=3000) %>% - group_by_at(.vars=c(rna_source_facet)) %>% - summarise(cell.count= n()) %>% - group_by_at(.vars="sample_id") %>% - rename(cell.count_f3=cell.count) - - baseline <- rna_data_plot %>% - group_by_at(.vars=c(rna_source_facet)) %>% - summarise(cell.count= n()) %>% - group_by_at(.vars="sample_id") %>% - rename(baseline.counts=cell.count) - - info <- merge(merge(merge(f1,f2,by=rna_source_facet,all=TRUE),f3, by=rna_source_facet, all=TRUE), baseline, by=rna_source_facet, all=T) %>% - mutate(percent_retain_f1 = 100*cell.count_f1/baseline.counts, - percent_retain_f2 = 100*cell.count_f2/baseline.counts, - percent_retain_f3 = 100*cell.count_f3/baseline.counts) - - - - write.table(info, file=paste0( sprefix,"_threshold_filter.tsv"), col.names=T, row.names=F, sep="\t", quote=F) - - explain <- data.frame(qcmetric=c("pct_counts_mt_max","pct_counts_hb_max","n_genes_by_counts_min","doublet_scores_max","n_genes_by_counts_max"), - f1=c(20,70,100,0.25,NA), - f2=c(10,50,100,0.25,NA), - f3=c(5,50,100,NA,3000)) - - write.table(explain, file=paste0(sprefix,"_threshold_filter_explained.tsv"), col.names=T, row.names=F, sep="\t", quote=F) - - lab <- c("%Mt <=10 & %HB<70 & minGenes>=100 & scrublet<=0.25", - "%Mt <=5 & %HB<50 & minGenes>=100 & scrublet<=0.25", - "%Mt <=5 & %HB<50 & minGenes>=100 & maxGenes<=3000") - names(lab) <- c("percent_retain_f1","percent_retain_f2","percent_retain_f3") - - g <- info %>% - pivot_longer(cols=starts_with("percent"), names_to="filterclass", values_to="percent")%>% - ggplot(aes(sample_id,percent, fill=filterclass)) + - geom_bar(stat="identity", position="dodge", color="black") + - facet_wrap(~filterclass, ncol=1, labeller = labeller(filterclass=lab)) + - theme_bw()+ - theme(axis.text.x=element_text(size=8,angle=45, hjust=1, vjust=1), - axis.text.y=element_text(size=13)) + - scale_fill_manual(values = c("red", "yellow","blue"), - breaks=c("percent_retain_f1","percent_retain_f2","percent_retain_f3"), - limits=c("percent_retain_f1","percent_retain_f2","percent_retain_f3")) + - coord_cartesian(ylim=c(0,100)) - - ggsave(g, file = paste0(run,"barplot_cellcounts_thresholds_filter.png"), type="cairo", width=9, height=9) +if (opt$prefilter) { + cat("Saving counts tables for references\n") + if (all(c("pct_counts_mt", "pct_counts_hb", "n_genes_by_counts", "doublet_scores") %in% colnames(rna_data_plot))) { + + f1 <- rna_data_plot %>% + dplyr::filter(pct_counts_mt <= 20 & pct_counts_hb <= 70 & n_genes_by_counts >= 100 & doublet_scores <= 0.25) %>% + group_by_at(.vars = c(rna_source_facet)) %>% + summarise(cell.count = n()) %>% + group_by_at(.vars = "sample_id") %>% + rename(cell.count_f1 = cell.count) + + f2 <- rna_data_plot %>% + dplyr::filter(pct_counts_mt <= 10 & pct_counts_hb <= 50 & n_genes_by_counts >= 100 & doublet_scores <= 0.25) %>% + group_by_at(.vars = c(rna_source_facet)) %>% + summarise(cell.count = n()) %>% + group_by_at(.vars = "sample_id") %>% + rename(cell.count_f2 = cell.count) + + f3 <- rna_data_plot %>% + dplyr::filter(pct_counts_mt <= 5 & pct_counts_hb <= 50 & n_genes_by_counts >= 100 & n_genes_by_counts <= 3000) %>% + group_by_at(.vars = c(rna_source_facet)) %>% + summarise(cell.count = n()) %>% + group_by_at(.vars = "sample_id") %>% + rename(cell.count_f3 = cell.count) + + baseline <- rna_data_plot %>% + group_by_at(.vars = c(rna_source_facet)) %>% + summarise(cell.count = n()) %>% + group_by_at(.vars = "sample_id") %>% + rename(baseline.counts = cell.count) + + info <- merge(merge(merge(f1, f2, by = rna_source_facet, all = TRUE), f3, by = rna_source_facet, all = TRUE), baseline, by = rna_source_facet, all = T) %>% + mutate( + percent_retain_f1 = 100 * cell.count_f1 / baseline.counts, + percent_retain_f2 = 100 * cell.count_f2 / baseline.counts, + percent_retain_f3 = 100 * cell.count_f3 / baseline.counts + ) + + + + write.table(info, file = paste0(sprefix, "_threshold_filter.tsv"), col.names = T, row.names = F, sep = "\t", quote = F) + + explain <- data.frame( + qcmetric = c("pct_counts_mt_max", "pct_counts_hb_max", "n_genes_by_counts_min", "doublet_scores_max", "n_genes_by_counts_max"), + f1 = c(20, 70, 100, 0.25, NA), + f2 = c(10, 50, 100, 0.25, NA), + f3 = c(5, 50, 100, NA, 3000) + ) + + write.table(explain, file = paste0(sprefix, "_threshold_filter_explained.tsv"), col.names = T, row.names = F, sep = "\t", quote = F) + + lab <- c( + "%Mt <=10 & %HB<70 & minGenes>=100 & scrublet<=0.25", + "%Mt <=5 & %HB<50 & minGenes>=100 & scrublet<=0.25", + "%Mt <=5 & %HB<50 & minGenes>=100 & maxGenes<=3000" + ) + names(lab) <- c("percent_retain_f1", "percent_retain_f2", "percent_retain_f3") + + g <- info %>% + pivot_longer(cols = starts_with("percent"), names_to = "filterclass", values_to = "percent") %>% + ggplot(aes(sample_id, percent, fill = filterclass)) + + geom_bar(stat = "identity", position = "dodge", color = "black") + + facet_wrap(~filterclass, ncol = 1, labeller = labeller(filterclass = lab)) + + theme_bw() + + theme( + axis.text.x = element_text(size = 8, angle = 45, hjust = 1, vjust = 1), + axis.text.y = element_text(size = 13) + ) + + scale_fill_manual( + values = c("red", "yellow", "blue"), + breaks = c("percent_retain_f1", "percent_retain_f2", "percent_retain_f3"), + limits = c("percent_retain_f1", "percent_retain_f2", "percent_retain_f3") + ) + + coord_cartesian(ylim = c(0, 100)) + + ggsave(g, file = paste0(run, "barplot_cellcounts_thresholds_filter.png"), type = "cairo", width = 9, height = 9) } - -}else{ - print("Producing files with final counts for cells after filtering") - - baseline <- rna_data_plot %>% - group_by_at(.vars=c(rna_source_facet)) %>% - summarise(cell.count= n()) %>% - group_by_at(.vars="sample_id") - - write.table(baseline, file=paste0( sprefix,"_filtered_data.tsv"), col.names=T, row.names=F, sep="\t", quote=F) - - g <- baseline %>% - ggplot(aes_string(x="sample_id", y="cell.count")) + - geom_bar(stat="identity", position="dodge", color="black") + - theme_bw()+ - theme(axis.text.x=element_text(size=8,angle=45, hjust=1.05, vjust=0.95), - axis.text.y=element_text(size=13)) - ggsave(g, file = file.path(run, "rna","barplot_cellcounts_filtered_data.png"), type="cairo", width=9, height=6) - +} else { + cat("Producing files with final counts for cells after filtering\n") + + baseline <- rna_data_plot %>% + group_by_at(.vars = c(rna_source_facet)) %>% + summarise(cell.count = n()) %>% + group_by_at(.vars = "sample_id") + + write.table(baseline, file = paste0(sprefix, "_filtered_data.tsv"), col.names = T, row.names = F, sep = "\t", quote = F) + + g <- baseline %>% + ggplot(aes_string(x = "sample_id", y = "cell.count")) + + geom_bar(stat = "identity", position = "dodge", color = "black") + + theme_bw() + + theme( + axis.text.x = element_text(size = 8, angle = 45, hjust = 1.05, vjust = 0.95), + axis.text.y = element_text(size = 13) + ) + ggsave(g, file = file.path(run, "rna", "barplot_cellcounts_filtered_data.png"), type = "cairo", width = 9, height = 6) } -print("Done") +cat("Done\n") From b76bfa978884201d6e99b5365a522cb276e94d3e Mon Sep 17 00:00:00 2001 From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com> Date: Tue, 4 Feb 2025 21:32:10 +0100 Subject: [PATCH 09/10] add some files to gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ca736369..a7b75651 100644 --- a/.gitignore +++ b/.gitignore @@ -165,6 +165,8 @@ cython_debug/ # panpipes specific .history -panpipes/.DS_Store .DS_Store *.code-workspace +*/.DS_Store +1_ingest/* +data/* From a148e99cd02806c1dc74a9b1abe3ec43e77ba645 Mon Sep 17 00:00:00 2001 From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com> Date: Thu, 6 Mar 2025 14:40:19 +0100 Subject: [PATCH 10/10] fix: fix an error in plotQC.R --- panpipes/R_scripts/plotQC.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/panpipes/R_scripts/plotQC.R b/panpipes/R_scripts/plotQC.R index 32a8ce9b..3f7427b1 100644 --- a/panpipes/R_scripts/plotQC.R +++ b/panpipes/R_scripts/plotQC.R @@ -414,7 +414,7 @@ if (!is.null(opt$rep_qc_metrics)) { # check these qc metrics are in the file qcmetrics <- qcmetrics[qcmetrics %in% colnames(rep_data_plot)] for (qc in qcmetrics) { - cat("Plotting bar plots of", qc. "\n") + cat("Plotting bar plots of", qc, "\n") for (sc in rep_source_facet) { g <- do_bar_plot(rep_data_plot, qc, sc) if (uniq_sample_id > 50) { @@ -501,7 +501,6 @@ sprefix <- opt$sampleprefix if (opt$prefilter) { cat("Saving counts tables for references\n") if (all(c("pct_counts_mt", "pct_counts_hb", "n_genes_by_counts", "doublet_scores") %in% colnames(rna_data_plot))) { - f1 <- rna_data_plot %>% dplyr::filter(pct_counts_mt <= 20 & pct_counts_hb <= 70 & n_genes_by_counts >= 100 & doublet_scores <= 0.25) %>% group_by_at(.vars = c(rna_source_facet)) %>%