From 71f32f8dea03142eecd737afe33416dc60006c18 Mon Sep 17 00:00:00 2001
From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com>
Date: Tue, 4 Feb 2025 10:04:12 +0100
Subject: [PATCH 01/10] fix/refactoring: fix the AIRR code; add checks for
 column names; fix the formatting and argument types where possible; highlight
 redundant code with ToDos

---
 panpipes/funcs/io.py | 689 +++++++++++++++++++++++++++----------------
 1 file changed, 441 insertions(+), 248 deletions(-)

diff --git a/panpipes/funcs/io.py b/panpipes/funcs/io.py
index 77d74d2b..b5ab4419 100644
--- a/panpipes/funcs/io.py
+++ b/panpipes/funcs/io.py
@@ -1,24 +1,30 @@
-from random import sample
-import warnings
-import re
+import gzip
+import logging
 import os
-import pandas as pd
+import re
 import sys
-import h5py
-from typing import Optional, Literal
-from scanpy import read_10x_mtx, read_10x_h5, read_h5ad, read_text, read_csv, read_hdf
+import warnings
+from os import PathLike
+from pathlib import Path
+from typing import Literal, Optional
 
+import h5py
 import muon as mu
-import logging
-import scirpy as ir 
-from itertools import chain
-from .processing import concat_adata_list, intersection
-from anndata import AnnData
 import numpy as np
-from os import PathLike
+import pandas as pd
+import scirpy as ir
+import yaml
+from anndata import AnnData
+from scanpy import read_10x_h5, read_10x_mtx, read_csv, read_h5ad, read_hdf, read_text
+from scipy.io import mmwrite
+
+from .processing import intersection
 
-def update_cellranger_col(path,  raw=False, method="count", sample_id=""):
-    if method =="count":
+
+def update_cellranger_col(
+    path: str | Path, raw: bool = False, method: str = "count", sample_id: str = ""
+):
+    if method == "count":
         # this assumes that we get a cellranger count base path and we want file path for sc.read_10x_h5 or similar
         if raw:
             append_str = "raw_feature_bc_matrix"
@@ -28,321 +34,472 @@ def update_cellranger_col(path,  raw=False, method="count", sample_id=""):
         if raw:
             append_str = os.path.join("multi", "count", "raw_feature_bc_matrix")
         else:
-            append_str = os.path.join("per_sample_outs", sample_id, "count", "sample_filtered_feature_bc_matrix")
+            append_str = os.path.join(
+                "per_sample_outs",
+                sample_id,
+                "count",
+                "sample_filtered_feature_bc_matrix",
+            )
     else:
         sys.exit("specify cellranger filetype as cellranger_count or cellranger_multi")
     # is there a h5 file available
-    if os.path.exists(os.path.join(path, append_str)) == False:
+    if not os.path.exists(os.path.join(path, append_str)):
         sys.exit("cellranger path not found: %s" % os.path.join(path, append_str))
-    if os.path.exists(os.path.join(path, append_str + '.h5')):
+
+    if os.path.exists(os.path.join(path, append_str + ".h5")):
         append_str = append_str + ".h5"
         filetype = "10X_h5"
     else:
         filetype = "cellranger"
+
     path = os.path.join(path, append_str)
+
     return path, filetype
 
 
-def gen_load_anndata_jobs(caf, load_raw=False, mode_dictionary = {}, load_prot_from_raw=False):
+def gen_load_anndata_jobs(
+    caf: pd.DataFrame,
+    load_raw: bool = False,
+    mode_dictionary: Optional[dict] = None,
+    load_prot_from_raw: bool = False,
+):
+    if mode_dictionary is None:
+        mode_dictionary = {}  # not the best, dirty hack to make it compatible with code
+
     """
     Generate a load_adatas job for each line in submission.txt
     """
     for nn in range(0, caf.shape[0]):
-        if pd.isna(caf['rna_path'][nn]):
-                rna_path= None
-                rna_filetype=None
-        elif caf['rna_filetype'][nn]=="cellranger" and mode_dictionary["rna"]:
-            rna_path, rna_filetype = update_cellranger_col(caf['rna_path'][nn], raw=load_raw, method="count")
-        elif caf['rna_filetype'][nn]=="cellranger_multi" and mode_dictionary["rna"]:
-            rna_path, rna_filetype = update_cellranger_col(caf['rna_path'][nn], raw=load_raw, method="multi", 
-                                                            sample_id=caf['sample_id'][nn])
+        if pd.isna(caf["rna_path"][nn]):
+            rna_path = None
+            rna_filetype = None
+        elif caf["rna_filetype"][nn] == "cellranger" and mode_dictionary["rna"]:
+            rna_path, rna_filetype = update_cellranger_col(
+                caf["rna_path"][nn], raw=load_raw, method="count"
+            )
+        elif caf["rna_filetype"][nn] == "cellranger_multi" and mode_dictionary["rna"]:
+            rna_path, rna_filetype = update_cellranger_col(
+                caf["rna_path"][nn],
+                raw=load_raw,
+                method="multi",
+                sample_id=caf["sample_id"][nn],
+            )
         else:
-            rna_path, rna_filetype = caf[['rna_path', "rna_filetype"]].iloc[nn]
+            rna_path, rna_filetype = caf[["rna_path", "rna_filetype"]].iloc[nn]
             if load_raw:
                 rna_path = re.sub("filtered", "raw", rna_path)
+
         # manage the prot paths
-        if ('prot_path' in caf.columns and mode_dictionary["prot"]):
+        if "prot_path" in caf.columns and mode_dictionary["prot"]:
             # check if its the same as the rna path (data in the same file)
-            if pd.isna(caf['prot_path'][nn]):
-                prot_path= None
-                prot_filetype=None
-            elif caf['prot_path'][nn] == caf['rna_path'][nn]:
+            if pd.isna(caf["prot_path"][nn]):
+                prot_path = None
+                prot_filetype = None
+            elif caf["prot_path"][nn] == caf["rna_path"][nn]:
                 prot_path, prot_filetype = rna_path, rna_filetype
-            elif caf['prot_filetype'][nn]=="cellranger":
-                # we might want to load the raw here because we want to then subset by good gex (rna) barcodes, 
+            elif caf["prot_filetype"][nn] == "cellranger":
+                # we might want to load the raw here because we want to then subset by good gex (rna) barcodes,
                 # this is why the load_prot_from_raw argument exists
-                prot_path, prot_filetype = update_cellranger_col(caf['prot_path'][nn], raw=load_prot_from_raw)
-            elif caf['prot_filetype'][nn]=="cellranger_multi":
+                prot_path, prot_filetype = update_cellranger_col(
+                    caf["prot_path"][nn], raw=load_prot_from_raw
+                )
+            elif caf["prot_filetype"][nn] == "cellranger_multi":
                 # celranger multi has the same prot and gex (rna) barcodes
-                prot_path, prot_filetype = update_cellranger_col(caf['prot_path'][nn], raw=load_raw, method="multi", 
-                                                                sample_id=caf['sample_id'][nn])
+                prot_path, prot_filetype = update_cellranger_col(
+                    caf["prot_path"][nn],
+                    raw=load_raw,
+                    method="multi",
+                    sample_id=caf["sample_id"][nn],
+                )
             else:
-                prot_path, prot_filetype = caf[['prot_path', "prot_filetype"]].iloc[nn]
+                prot_path, prot_filetype = caf[["prot_path", "prot_filetype"]].iloc[nn]
                 if load_prot_from_raw or load_raw:
                     prot_path = re.sub("filtered", "raw", prot_path)
         else:
-            prot_path= None
-            prot_filetype=None
+            prot_path = None
+            prot_filetype = None
+
         # load tcr_path
-        if 'tcr_path' in caf.columns and mode_dictionary["tcr"] and pd.notna(caf['tcr_path'][nn]):
-            tcr_path = caf['tcr_path'][nn]
-            tcr_filetype = caf['tcr_filetype'][nn]
+        if (
+            "tcr_path" in caf.columns
+            and mode_dictionary["tcr"]
+            and pd.notna(caf["tcr_path"][nn])
+        ):
+            tcr_path = caf["tcr_path"][nn]
+            tcr_filetype = caf["tcr_filetype"][nn]
         else:
-            tcr_path= None
-            tcr_filetype=None
-        if 'bcr_path' in caf.columns and mode_dictionary["bcr"] and pd.notna(caf['bcr_path'][nn]):
-            bcr_path = caf['bcr_path'][nn]
-            bcr_filetype = caf['bcr_filetype'][nn]
+            tcr_path = None
+            tcr_filetype = None
+
+        # load bcr_path
+        if (
+            "bcr_path" in caf.columns
+            and mode_dictionary["bcr"]
+            and pd.notna(caf["bcr_path"][nn])
+        ):
+            bcr_path = caf["bcr_path"][nn]
+            bcr_filetype = caf["bcr_filetype"][nn]
         else:
-            bcr_path= None
-            bcr_filetype=None
-        if ('atac_path' in caf.columns and mode_dictionary["atac"]):
+            bcr_path = None
+            bcr_filetype = None
+
+        # load atac_path
+        if "atac_path" in caf.columns and mode_dictionary["atac"]:
             if caf.shape[0] > 1:
-                sys.exit("You can only submit one atac/multiome file at a time. To aggregate, see cellranger aggr.")
-            if caf['atac_filetype'][nn]=="cellranger" :
-                atac_path, atac_filetype = update_cellranger_col(caf['atac_path'][nn], raw=load_raw, method="count")
+                sys.exit(
+                    "You can only submit one atac/multiome file at a time. To aggregate, see cellranger aggr."
+                )
+            if caf["atac_filetype"][nn] == "cellranger":
+                atac_path, atac_filetype = update_cellranger_col(
+                    caf["atac_path"][nn], raw=load_raw, method="count"
+                )
             else:
-                atac_path = caf['atac_path'][nn]
-                atac_filetype = caf['atac_filetype'][nn]
-            if 'fragments_file' in caf.columns and pd.notna(caf['fragments_file'][nn]):
-                fragments_file = caf['fragments_file'][nn]
+                atac_path = caf["atac_path"][nn]
+                atac_filetype = caf["atac_filetype"][nn]
+            if "fragments_file" in caf.columns and pd.notna(caf["fragments_file"][nn]):
+                fragments_file = caf["fragments_file"][nn]
             else:
                 fragments_file = None
-            if 'per_barcode_metrics_file' in caf.columns and pd.notna(caf['per_barcode_metrics_file'][nn]):
-                per_barcode_metrics_file = caf['per_barcode_metrics_file'][nn]
+            if "per_barcode_metrics_file" in caf.columns and pd.notna(
+                caf["per_barcode_metrics_file"][nn]
+            ):
+                per_barcode_metrics_file = caf["per_barcode_metrics_file"][nn]
             else:
                 per_barcode_metrics_file = None
-            if 'peak_annotation_file' in caf.columns and pd.notna(caf['peak_annotation_file'][nn]):    
-                peak_annotation_file = caf['peak_annotation_file'][nn]
+            if "peak_annotation_file" in caf.columns and pd.notna(
+                caf["peak_annotation_file"][nn]
+            ):
+                peak_annotation_file = caf["peak_annotation_file"][nn]
             else:
                 peak_annotation_file = None
         else:
-            atac_path= None
-            atac_filetype=None
+            atac_path = None
+            atac_filetype = None
             fragments_file = None
             peak_annotation_file = None
             per_barcode_metrics_file = None
-            
-        if 'barcode_mtd_path' in caf.columns:
-            cell_mtd_path = caf['barcode_mtd_path'][nn]
+
+        if "barcode_mtd_path" in caf.columns:
+            cell_mtd_path = caf["barcode_mtd_path"][nn]
         else:
             cell_mtd_path = None
-        # create the output file 
-        outfile = "./tmp/" + caf['sample_id'][nn]
+
+        # create the output file
+        outfile = "./tmp/" + caf["sample_id"][nn]
         if load_raw:
             outfile = outfile + "_raw.h5mu"
         else:
             outfile = outfile + ".h5mu"
-        sample_id = caf['sample_id'][nn]
-        yield rna_path, outfile, \
-              sample_id, \
-              rna_filetype,  \
-              prot_path, prot_filetype, \
-              tcr_path, tcr_filetype,  \
-              bcr_path, bcr_filetype, \
-              atac_path, atac_filetype, \
-              fragments_file, per_barcode_metrics_file, peak_annotation_file, \
-              cell_mtd_path
-        
-
-def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True):
+
+        sample_id = caf["sample_id"][nn]
+
+        yield (
+            rna_path,
+            outfile,
+            sample_id,
+            rna_filetype,
+            prot_path,
+            prot_filetype,
+            tcr_path,
+            tcr_filetype,
+            bcr_path,
+            bcr_filetype,
+            atac_path,
+            atac_filetype,
+            fragments_file,
+            per_barcode_metrics_file,
+            peak_annotation_file,
+            cell_mtd_path,
+        )
+
+
+def gen_load_spatial_jobs(
+    caf: pd.DataFrame, mode_dictionary: Optional[dict] = None, load_raw: bool = True
+):
     """
     Generate a load_spatial job for each line in submission.txt
     """
+
+    if mode_dictionary is None:
+        mode_dictionary = {}
+
     for nn in range(0, caf.shape[0]):
         if "spatial_path" in caf.columns and mode_dictionary["spatial"]:
             if pd.isna(caf["spatial_path"][nn]):
-                spatial_path= None
+                spatial_path = None
                 spatial_filetype = None
             else:
                 spatial_path = caf["spatial_path"][nn]
-            if caf['spatial_filetype'][nn]=="vizgen":
-                spatial_filetype = caf['spatial_filetype'][nn]
-                #path, counts and metadata are mandatory
+            if caf["spatial_filetype"][nn] == "vizgen":
+                spatial_filetype = caf["spatial_filetype"][nn]
+
+                # path, counts and metadata are mandatory
                 if pd.notna(caf["spatial_counts"][nn]):
-                    spatial_counts= caf["spatial_counts"][nn]
+                    spatial_counts = caf["spatial_counts"][nn]
                 else:
                     spatial_counts = None
+
                 if pd.notna(caf["spatial_metadata"][nn]):
                     spatial_metadata = caf["spatial_metadata"][nn]
-                else: 
+                else:
                     spatial_metadata = None
-                #transformation is optional
+
+                # transformation is optional
                 if pd.notna(caf["spatial_transformation"][nn]):
                     spatial_transformation = caf["spatial_transformation"][nn]
                 else:
                     spatial_transformation = None
-            elif caf['spatial_filetype'][nn]=="visium":
-                spatial_metadata= None
+
+            elif caf["spatial_filetype"][nn] == "visium":
+                spatial_metadata = None
                 spatial_transformation = None
-                spatial_filetype = caf['spatial_filetype'][nn]
+                spatial_filetype = caf["spatial_filetype"][nn]
+
                 if pd.notna(caf["spatial_counts"][nn]):
-                    spatial_counts= caf["spatial_counts"][nn]
+                    spatial_counts = caf["spatial_counts"][nn]
                 else:
-                    spatial_counts = None  
+                    spatial_counts = None
         else:
-            spatial_path= None
+            spatial_path = None
             spatial_filetype = None
             spatial_counts = None
             spatial_metadata = None
             spatial_transformation = None
-            
-        if 'barcode_mtd_path' in caf.columns:
-            cell_mtd_path = caf['barcode_mtd_path'][nn] #not yielding this right now!
+
+        # ToDo: cell_mtd_path is not used in this function below
+        if "barcode_mtd_path" in caf.columns:
+            cell_mtd_path = caf["barcode_mtd_path"][nn]  # not yielding this right now!
         else:
             cell_mtd_path = None
-        # create the output file 
-        outfile = "./tmp/" + caf['sample_id'][nn]
+
+        # create the output file
+        outfile = "./tmp/" + caf["sample_id"][nn]
+
         if load_raw:
-            outfile = outfile + "_raw.h5mu" 
+            outfile = outfile + "_raw.h5mu"
         else:
             outfile = outfile + ".h5mu"
-        sample_id = caf['sample_id'][nn]
-        yield spatial_path,  outfile, \
-              sample_id, spatial_filetype, spatial_counts, spatial_metadata, spatial_transformation
+
+        sample_id = caf["sample_id"][nn]
+
+        yield (
+            spatial_path,
+            outfile,
+            sample_id,
+            spatial_filetype,
+            spatial_counts,
+            spatial_metadata,
+            spatial_transformation,
+        )
 
 
 def read_anndata(
     fname: Optional[str] = None,
-    use_muon: Optional[bool] = False, 
-    modality: Literal['all', 'rna', 'prot','atac','rep'] = 'all'):
+    use_muon: Optional[bool] = False,
+    modality: Literal["all", "rna", "prot", "atac", "rep"] = "all",
+):
     logging.info("reading %s" % fname)
+
+    if fname is None:
+        sys.exit("anndata file not found; `fname` has not been passed/")
+
     # check fname file exists
     try:
         os.path.isfile(fname)
     except FileNotFoundError:
-        sys.exit("anndata file not found")
+        sys.exit(f"anndata file not found: {fname}")
+
     if use_muon is False:
         return read_h5ad(fname)
     else:
         logging.info("reading %s modality" % modality)
-        if modality=="all":
+
+        if modality == "all":
             # note this just loads the rna part of the muon object
             return mu.read(fname)
-        if modality=="rna":
+
+        if modality == "rna":
             # note this just loads the rna part of the muon object
             return mu.read(fname + "/rna")
-        elif modality=="prot":
+        elif modality == "prot":
             return mu.read(fname + "/prot")
-        elif modality=="atac":
+        elif modality == "atac":
             return mu.read(fname + "/atac")
-        elif modality=="rep":
+        elif modality == "rep":
             return mu.read(fname + "/rep")
-        
+
         else:
-            sys.exit("modality not found, must be one of 'all', 'rna', 'prot','atac', 'rep' ")
+            sys.exit(
+                f"modality {modality} not found, must be one of 'all', 'rna', 'prot','atac', 'rep' "
+            )
 
 
-def write_anndata(adata,
-                    fname, 
-                    use_muon=False, 
-                    modality: Literal['all', 'rna', 'prot'] = 'all'):
+def write_anndata(
+    adata: AnnData,
+    fname: str,
+    use_muon: bool = False,
+    modality: Literal["all", "rna", "prot"] = "all",
+):
     if use_muon is False:
         adata.write(fname)
     else:
-        if modality=="rna":
+        if modality == "rna":
             # note this just saves the rna part of the muon object
             return mu.write(fname + "/rna", adata)
-        if modality=="prot":
+        if modality == "prot":
             # note this just save the prot part of the muon object
             return mu.write(fname + "/prot", adata)
-        elif modality=="all":
+        elif modality == "all":
             return mu.write(fname, adata)
         else:
             sys.exit("modality not found, must be one of 'all', 'rna', 'prot'")
 
+
 # --- loading raw dat
 
-def write_obs(mdata, output_prefix="./", output_suffix="_filtered_cell_metadata.tsv"):
-        metafile = mdata.obs
-        metafile["cellbarcode"] = mdata.obs.index
-        savename= output_prefix + output_suffix
-        metafile.to_csv(savename, sep='\t', index=True)
 
-def check_submission_file(caf):
+def write_obs(
+    mdata: AnnData,
+    output_prefix: str = "./",
+    output_suffix: str = "_filtered_cell_metadata.tsv",
+):
+    metafile = mdata.obs
+    metafile["cellbarcode"] = mdata.obs.index
+    savename = output_prefix + output_suffix
+    metafile.to_csv(savename, sep="\t", index=True)
+
+
+def check_submission_file(caf: pd.DataFrame):
+    """
+    Fixing the column names in the input data frame in case formatting is off.
+    """
+    old_columns = caf.columns
+    caf.rename(
+        columns={col_name: col_name.strip() for col_name in caf.columns}, inplace=True
+    )
+    caf.replace("^ +| +$", "", regex=True, inplace=True)
+    logging.info(
+        f"Formatted columns to remove spaces. Changes: {old_columns} -> {caf.columns}",
+    )
+
     if "filetype" in caf.columns or "path" in caf.columns:
-        raise ValueError("you appear to be using the old notation for the sample submission file, \
-        please update to use rna_path instead of path and rna_filetype instead of filetype")
+        raise ValueError(
+            "you appear to be using the old notation for the sample submission file, \
+        please update to use rna_path instead of path and rna_filetype instead of filetype"
+        )
+
     # check for required cols
-    req_cols = ['sample_id', 'rna_path', 'rna_filetype']
+    req_cols = ["sample_id", "rna_path", "rna_filetype"]
     for rc in req_cols:
         if rc not in caf.columns:
             raise ValueError("required column %s missing from submission file" % rc)
 
 
-
-def check_filetype(path, filetype):
+def check_filetype(
+    path: str,
+    filetype: Literal[
+        "cellranger",
+        "h5ad",
+        "csv_matrix",
+        "txt_matrix",
+        "10X_h5",
+        "hdf",
+        "cellranger_vdj",
+        "vizgen",
+    ],
+):
     logging.debug(path)
     logging.debug(filetype)
-    if filetype == "cellranger" :
+
+    if filetype == "cellranger":
         if os.path.exists(path + "/matrix.mtx.gz"):
             logging.info("cellranger matrix found")
         elif os.path.exists(path + "/matrix.mtx"):
             logging.info("cellranger matrix found")
         else:
-            sys.exit("cellranger matrix file not found, have you specified the path correctly")
+            sys.exit(
+                "cellranger matrix file not found, have you specified the path correctly"
+            )
     else:
         ftype_checks = {
-            "h5ad":r".h5ad$",
+            "h5ad": r".h5ad$",
             "csv_matrix": ".csv",
-            "txt_matrix" : ".txt",
+            "txt_matrix": ".txt",
             "10X_h5": ".h5",
             "hdf": ".h5",
             "cellranger_vdj": r".json|.csv",
-            "vizgen": r".txt|.csv|.tsv" #suboptimal for now but roll with it
+            "vizgen": r".txt|.csv|.tsv",  # suboptimal for now but roll with it
         }
         if filetype not in ftype_checks.keys():
-            sys.exit("unknown filetype %s, please specify one of: %s" % filetype, ftype_checks.keys().join(", "))
+            sys.exit(
+                f"unknown filetype {filetype}, please specify one of: {list(ftype_checks.keys())}"
+            )
         elif re.search(ftype_checks[filetype], path) is None:
-            sys.exit("filetype does not match file suffix, check it is specified correctly \
-                    (with %s as suffix (gzipped files ok for csv and tsv))" % ftype_checks[filetype])
-        elif os.path.exists(path)==False:
+            sys.exit(
+                "filetype does not match file suffix, check it is specified correctly \
+                    (with %s as suffix (gzipped files ok for csv and tsv))"
+                % ftype_checks[filetype]
+            )
+        elif not os.path.exists(path):
             sys.exit("file %s does not exist" % path)
         else:
             logging.info("%s file exists" % path)
 
 
 def read_scirpy(
-    fname, 
-    filetype: Literal['cellranger_vdj', 'tracer', 'bracer', 'airr' ] = 'cellranger_vdj'):
+    fname: str,
+    filetype: Literal["cellranger_vdj", "tracer", "bracer", "airr"] = "cellranger_vdj",
+):
     """
     wrapper around all the scirpy io functions
     """
     # check fname file exists
     logging.info("loading repertoire using scirpy, filetype %s" % filetype)
+
     try:
         os.path.isfile(fname)
     except FileNotFoundError:
         sys.exit("vdj file not found: %s" % fname)
-    if filetype ==  "cellranger_vdj":
+
+    if filetype == "cellranger_vdj":
         vdj = ir.io.read_10x_vdj(fname)
     elif filetype == "tracer":
         vdj = ir.io.read_tracer(fname)
-    elif filetype ==  "bracer":
+    elif filetype == "bracer":
         vdj = ir.io.read_bracer(fname)
-    elif filetype ==  "airr":
+    elif filetype == "airr":
         vdj = ir.io.read_airr(fname)
     else:
-        sys.exit("fileype not valid, must be one of 'cellranger_vdj', 'tracer', 'bracer', 'airr'")
+        sys.exit(
+            f"filetype {filetype} not valid, must be one of 'cellranger_vdj', 'tracer', 'bracer', 'airr'"
+        )
     return vdj
 
 
-def scp_read_10x_h5(filename: PathLike, library_keep=None, extended= False, *args, **kwargs) -> AnnData:
+def scp_read_10x_h5(
+    filename: PathLike, library_keep=None, extended: bool = False, *args, **kwargs
+) -> AnnData:
     """
-    expanded sc.read_10x_h5 to filter for the library 
+    expanded sc.read_10x_h5 to filter for the library
     adapted from https://github.com/scverse/muon/blob/master/muon/_prot/io.py
     """
     adata = read_10x_h5(filename, gex_only=False, *args, **kwargs)
+
     logging.debug(adata)
     logging.debug("filtering cellranger outputs to %s" % library_keep)
+
     if extended is not None and extended is True:
         import shutil
-        logging.debug("copying %s" % filename )
+
+        logging.debug("copying %s" % filename)
+
         h5file = h5py.File(filename, "r")
+
         logging.debug("reading intervals")
+
         if "interval" in h5file["matrix"]["features"]:
             intervals = np.array(h5file["matrix"]["features"]["interval"]).astype(str)
-            logging.debug("intervals %s" %intervals)
+            logging.debug("intervals %s" % intervals)
             adata.var["interval"] = intervals
             logging.debug(f"Added `interval` annotation for features from {filename}")
             logging.debug("updated adata %s" % adata)
@@ -353,130 +510,156 @@ def scp_read_10x_h5(filename: PathLike, library_keep=None, extended= False, *arg
             h5file.close()
     else:
         logging.debug("skipping extended arg")
+
     if library_keep is not None:
-        adata = adata[:, list(map(lambda x: x == library_keep, adata.var["feature_types"]))].copy()
+        adata = adata[
+            :, list(map(lambda x: x == library_keep, adata.var["feature_types"]))
+        ].copy()
+
         logging.debug("filtered anndata size %s" % adata)
-    
-    return adata
 
+    return adata
 
 
 def scp_read_10x_mtx(filename: PathLike, library_keep=None, *args, **kwargs) -> AnnData:
     """
-    expanded sc.read_10x_mtx to filter for the library 
+    expanded sc.read_10x_mtx to filter for the library
     adapted from https://github.com/scverse/muon/blob/master/muon/_prot/io.py
     """
-    adata = read_10x_mtx(filename, gex_only=False, *args, **kwargs) #need to leave gex as this is scanpy's code
+    adata = read_10x_mtx(
+        filename, gex_only=False, *args, **kwargs
+    )  # need to leave gex as this is scanpy's code
+
     logging.debug("filtering cellranger outputs to %s" % library_keep)
+
     if library_keep is not None:
         adata = adata[
             :, list(map(lambda x: x == library_keep, adata.var["feature_types"]))
         ].copy()
+
     return adata
 
-#----------------
+
+# ----------------
 # temp loading functions for spatial data
 # load_Spatial_in
 # load_spatial_from_multiple_files
 # ->>>how to write?
-#----------------
-
+# ----------------
 
 
-
-
-def load_adata_in(path, filetype, gex_only=True, var_names="gene_symbols", library=None, extended=False):
+def load_adata_in(
+    path: str,
+    filetype: str,
+    gex_only: bool = True,  # ToDo: it not used anywhere in the function
+    var_names: str = "gene_symbols",
+    library=None,
+    extended: bool = False,
+):
     """
     load in any format supported by the load_functions_dict
     expected to be called by load_mdata_from_multiple_files
     """
     # all the supported loading functions here
     load_functions_dict = {
-    "cellranger": lambda x: scp_read_10x_mtx(x, 
-                                           library_keep=library,
-                                           var_names=var_names,
-                                           cache=False),
-     "h5ad":read_h5ad,
-     "csv_matrix":read_csv,
-     "txt_matrix": read_text,
-     "10X_h5": lambda x: scp_read_10x_h5(x, library_keep=library, extended=extended),
-     "hd5": read_hdf,
-      # vdj functions
-     "cellranger_vdj": lambda x: read_scirpy(x, filetype="cellranger_vdj"),
-     "tracer":lambda x: read_scirpy(x, filetype="tracer"),
-     "bracer":lambda x: read_scirpy(x, filetype="bracer"),
-     "airr":lambda x: read_scirpy(x, filetype="airr"),             
-     }
-    
+        "cellranger": lambda x: scp_read_10x_mtx(
+            x, library_keep=library, var_names=var_names, cache=False
+        ),
+        "h5ad": read_h5ad,
+        "csv_matrix": read_csv,
+        "txt_matrix": read_text,
+        "10X_h5": lambda x: scp_read_10x_h5(x, library_keep=library, extended=extended),
+        "hd5": read_hdf,
+        # vdj functions
+        "cellranger_vdj": lambda x: read_scirpy(x, filetype="cellranger_vdj"),
+        "tracer": lambda x: read_scirpy(x, filetype="tracer"),
+        "bracer": lambda x: read_scirpy(x, filetype="bracer"),
+        "airr": lambda x: read_scirpy(x, filetype="airr"),
+    }
 
     # the var_names argument decides which column of the cellranger features files ends up as the index.
-    if var_names=="gene_symbols":
-        col_fill="gene_ids"
-    elif var_names=="gene_ids":
-        col_fill="gene_symbols"
+    if var_names == "gene_symbols":
+        col_fill = "gene_ids"
+    elif var_names == "gene_ids":
+        col_fill = "gene_symbols"
     else:
-        col_fill="index"
+        col_fill = "index"
 
     # try:
     logging.debug("loading %s " % filetype)
+
     adata = load_functions_dict[filetype](path)
+
     logging.debug("this is the anndata now: %s" % adata)
+
     # in some cases you need to update the var index col.
     if var_names in adata.var.columns:
         logging.info("resetting var.index")
         # we need to reset the index to be the expected one.
-        adata.var = adata.var.reset_index().rename(columns={"index":col_fill}).set_index(var_names)
+        adata.var = (
+            adata.var.reset_index()
+            .rename(columns={"index": col_fill})
+            .set_index(var_names)
+        )
+
     return adata
     # except:
-    #     raise ValueError("unknown filetype %s, please specify one of: %s" % filetype, ", ".join(list(load_functions_dict.keys()))) 
-    
+    #     raise ValueError("unknown filetype %s, please specify one of: %s" % filetype, ", ".join(list(load_functions_dict.keys())))
 
 
-def update_intersecting_feature_names(adata1, adata2, prefix):
+def update_intersecting_feature_names(adata1: AnnData, adata2: AnnData, prefix: str):
     # check for intersecting names
-    if (len(adata1.var_names) > 0 and len(adata2.var_names) > 0 ):
-        intersect_features = intersection(adata1.var_names.tolist(), adata2.var_names.tolist())
+    if len(adata1.var_names) > 0 and len(adata2.var_names) > 0:
+        intersect_features = intersection(
+            adata1.var_names.tolist(), adata2.var_names.tolist()
+        )
         if len(intersect_features) > 0:
-            # this will be a problem downstream, so we prefix the 
-            warnings.warn("intersecting gene ids and other modalities are problematic downstream, \
-                prefixing  with %s" % prefix)
-            adata2.var[ prefix + 'id'] = adata2.var_names
+            # this will be a problem downstream, so we prefix the
+            warnings.warn(
+                "intersecting gene ids and other modalities are problematic downstream, \
+                prefixing  with %s"
+                % prefix
+            )
+            adata2.var[prefix + "id"] = adata2.var_names
             adata2.var_names = [prefix + x for x in adata2.var_names.tolist()]
+
     return adata2
 
 
 def merge_tcr_bcr_into_one_anndata(tcr, bcr):
     logging.info("merging tcr and bcr into one rep modality")
     intersect_obs = list(set(bcr.obs_names).union(set(tcr.obs_names)))
-    adata = AnnData(X=np.empty(shape=(len(intersect_obs), 0)),
-                       obs=pd.DataFrame(index=intersect_obs))
-    # merge in the tcr and bcr with the rna 
-    ir.pp.merge_airr_chains(adata , tcr)
-    ir.pp.merge_airr_chains(adata , bcr)
+    adata = AnnData(
+        X=np.empty(shape=(len(intersect_obs), 0)), obs=pd.DataFrame(index=intersect_obs)
+    )
+
+    # merge in the tcr and bcr with the rna
+    ir.pp.merge_airr(adata, tcr)
+    ir.pp.merge_airr(adata, bcr)
     ir.tl.chain_qc(adata)
     return adata
-    
 
 
-def _make_one_rep_modality(data_dict):
+def _make_one_rep_modality(data_dict: dict[AnnData]):
     """
     takes the data dict from load_mdata_from_multiple_files, and collapses down the bcr and/or tcr into one modality called rep
     """
-    if ("tcr" in data_dict.keys() and "bcr" in data_dict.keys()):
+    if "tcr" in data_dict.keys() and "bcr" in data_dict.keys():
         # get the intersecting obs
-        data_dict['rep'] = merge_tcr_bcr_into_one_anndata(data_dict['tcr'], data_dict['bcr'])
+        data_dict["rep"] = merge_tcr_bcr_into_one_anndata(
+            data_dict["tcr"], data_dict["bcr"]
+        )
         # remove separate assays
-        data_dict.pop('tcr', None)
-        data_dict.pop('bcr', None)
-    elif("tcr" in data_dict.keys()):
+        data_dict.pop("tcr", None)
+        data_dict.pop("bcr", None)
+    elif "tcr" in data_dict.keys():
         logging.info("storing tcr in rep modality")
-        data_dict['rep'] = data_dict['tcr']
-        data_dict.pop('tcr', None)
-    elif("bcr" in data_dict.keys()):
+        data_dict["rep"] = data_dict["tcr"]
+        data_dict.pop("tcr", None)
+    elif "bcr" in data_dict.keys():
         logging.info("storing bcr in rep modality")
-        data_dict['rep'] = data_dict['bcr']
-        data_dict.pop('bcr', None)
-    
+        data_dict["rep"] = data_dict["bcr"]
+        data_dict.pop("bcr", None)
 
 
 def load_mdata_from_multiple_files(all_files_dict):
@@ -488,11 +671,11 @@ def load_mdata_from_multiple_files(all_files_dict):
         dictionary containing one key per assay out of [RNA, PROT, TCR, BCR, ATAC]
         and the values for each key is a list of file path and fie type
         e.g.  {"RNA": [filepath, "filetype"],
-               "PROT: [file path, "filetype"]} 
+               "PROT: [file path, "filetype"]}
         Filetypes supported for RNA/prot: ["cellranger", "h5ad", "csv_matrix", "txt_matrix", "10X_h5"],
         Filetypes supported for atac (multiome preferred is 10X_h5) ["10X_h5","cellranger","h5ad"]
-        Filetypes supported for rep: ["cellranger_vdj", "airr", "tracer", "bracer"  ] 
-        See scirpy documentation for more information of repertoire input formats 
+        Filetypes supported for rep: ["cellranger_vdj", "airr", "tracer", "bracer"  ]
+        See scirpy documentation for more information of repertoire input formats
         https://scverse.org/scirpy/latest/api.html#module-scirpy.io
     """
     # convert names to match mudata conventions
@@ -505,47 +688,50 @@ def load_mdata_from_multiple_files(all_files_dict):
     data_dict = {}
     for nm, x in all_files_dict.items():
         logging.info("loading %s" % nm)
-        extra_args={}
+        extra_args = {}
         if nm == "rna":
-            extra_args['library'] = "Gene Expression"
-            if 'atac' in all_files_dict.keys():
-                extra_args["extended"]= True
+            extra_args["library"] = "Gene Expression"
+            if "atac" in all_files_dict.keys():
+                extra_args["extended"] = True
             else:
-                extra_args["extended"]= False
+                extra_args["extended"] = False
         if nm == "prot":
             extra_args["gex_only"] = False
             extra_args["var_names"] = "gene_ids"
-            extra_args['library'] = "Antibody Capture"
-            extra_args["extended"]= False
+            extra_args["library"] = "Antibody Capture"
+            extra_args["extended"] = False
         if nm == "atac":
             extra_args["gex_only"] = False
-            extra_args['library'] = "Peaks"
-            extra_args['extended'] = False
+            extra_args["library"] = "Peaks"
+            extra_args["extended"] = False
         if nm == "spatial":
-            extra_args["gex_only"] = True # check this for techs other than merfish and visium H&E
-            #extra_args["counts_file"] =
-            extra_args['extended'] = False
+            extra_args["gex_only"] = (
+                True  # check this for techs other than merfish and visium H&E
+            )
+            # extra_args["counts_file"] =
+            extra_args["extended"] = False
 
         logging.debug("extra args")
         logging.debug(extra_args)
-        data_dict[nm] = load_adata_in(x[0], x[1], **extra_args) #** 
-        #x[0] is the path, x[1] is the filetype
+        data_dict[nm] = load_adata_in(x[0], x[1], **extra_args)  # **
+        # x[0] is the path, x[1] is the filetype
     logging.debug(data_dict["rna"])
     logging.debug(data_dict.keys())
     # we want unique var names for each assay
     if "rna" in data_dict.keys():
         logging.debug("rna adata %s " % data_dict["rna"])
-        data_dict['rna'].var_names_make_unique()
+        data_dict["rna"].var_names_make_unique()
         for mod in data_dict.keys():
-            if mod =="rna":
+            if mod == "rna":
                 # skip since we want to update all the ones that are not rna
                 pass
             else:
                 logging.debug("second adata %s " % data_dict[mod])
                 logging.debug("updating intersection for %s" % mod)
-                data_dict[mod] = update_intersecting_feature_names(adata1=data_dict['rna'], adata2=data_dict[mod], 
-                prefix=mod + "_")
-    #make repertoire data into one assay
+                data_dict[mod] = update_intersecting_feature_names(
+                    adata1=data_dict["rna"], adata2=data_dict[mod], prefix=mod + "_"
+                )
+    # make repertoire data into one assay
     _make_one_rep_modality(data_dict)
     logging.debug(data_dict)
 
@@ -556,9 +742,7 @@ def load_mdata_from_multiple_files(all_files_dict):
     return mdata_
 
 
-from scipy.io import mmwrite
-import gzip
-def write_10x_counts(adata, path, layer=None):
+def write_10x_counts(adata: AnnData, path: str, layer=None):
     if layer is None:
         arr = adata.X
     else:
@@ -570,12 +754,17 @@ def write_10x_counts(adata, path, layer=None):
         arr = arr.astype(int)
     # write matrix.mtx
     mmwrite(os.path.join(path, "matrix.mtx"), arr.T)
-    with open(os.path.join(path, "matrix.mtx"), 'rb') as f_in, gzip.open(os.path.join(path, "matrix.mtx.gz"), 'wb') as f_out:
+    with (
+        open(os.path.join(path, "matrix.mtx"), "rb") as f_in,
+        gzip.open(os.path.join(path, "matrix.mtx.gz"), "wb") as f_out,
+    ):
         f_out.writelines(f_in)
     os.remove(os.path.join(path, "matrix.mtx"))
     # write barcodes file
     barcodes = adata.obs_names.to_frame()
-    barcodes.to_csv(os.path.join(path, "barcodes.tsv.gz"), sep='\t', index=None, header=None)
+    barcodes.to_csv(
+        os.path.join(path, "barcodes.tsv.gz"), sep="\t", index=None, header=None
+    )
     # write features file (trying to make sure the correct columns are stored)
     if "gene_ids" in adata.var.columns:
         # we assume the index is gene_ids
@@ -583,9 +772,14 @@ def write_10x_counts(adata, path, layer=None):
     elif "gene_symbols" in adata.var.columns:
         features = adata.var.reset_index().rename(columns={"index": "gene_ids"})
         if "prot_id" in features.columns:
-            features = features.drop(columns=['gene_symbols']).rename(columns={"prot_id" : "gene_symbols"})
-    features = features[['gene_ids','gene_symbols', 'feature_types']] 
-    features.to_csv(os.path.join(path, "features.tsv.gz"), sep='\t', index=None, header=None)
+            features = features.drop(columns=["gene_symbols"]).rename(
+                columns={"prot_id": "gene_symbols"}
+            )
+    features = features[["gene_ids", "gene_symbols", "feature_types"]]
+    features.to_csv(
+        os.path.join(path, "features.tsv.gz"), sep="\t", index=None, header=None
+    )
+
 
 # https://stackoverflow.com/questions/33529312/remove-empty-dicts-in-nested-dictionary-with-recursive-function
 def dictionary_stripper(data):
@@ -612,10 +806,10 @@ def replace_string_nones(d):
         elif v == "None":
             d[k] = None
 
-import yaml
-def read_yaml(fname):
+
+def read_yaml(fname: str | dict):
     if type(fname) is dict:
-        params=fname
+        params = fname
     elif os.path.exists(fname):
         with open(fname, "r") as stream:
             try:
@@ -626,7 +820,7 @@ def read_yaml(fname):
         # if the yaml is being passed as a string
         try:
             params = yaml.safe_load(fname)
-            
+
         except yaml.YAMLError as exc:
             print(exc)
     replace_string_nones(params)
@@ -641,4 +835,3 @@ def read_yaml(fname):
 #     "scanvi": lambda x: scvi.model.SCANVI.load_query_data,
 
 #     }
-    

From bb5ae89c565db6b67c53f449ac6cf09bf7cb2e32 Mon Sep 17 00:00:00 2001
From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com>
Date: Tue, 4 Feb 2025 10:04:50 +0100
Subject: [PATCH 02/10] add scirpy version for pyproject.toml

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2dc39d84..918cc2e2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,7 @@ dependencies = [
     "scib",
     "scib-metrics",
     "scikit-misc",
-    "scirpy",
+    "scirpy>=0.19.0",
     "scrublet",
     "scvi-tools",
     "sqlalchemy",
@@ -82,9 +82,9 @@ refmap_old = [
 panpipes = "panpipes:entry.main"
 
 [tool.setuptools]
+packages = ["panpipes"]
 package-data = {"panpipes" = ["panpipes/*/*.yml", "R_scripts/*.R", "resources/*"]}
 
-
 [build-system]
 requires = ["setuptools>=43.0.0", "wheel"]
 build-backend = "setuptools.build_meta"

From 8c229bea5dda71bc6e539edf2c1774a5c3c1ce5d Mon Sep 17 00:00:00 2001
From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com>
Date: Tue, 4 Feb 2025 10:17:52 +0100
Subject: [PATCH 03/10] refactoring: style + more verbose and detailed errros
 in case of file issues

---
 panpipes/python_scripts/run_scanpyQC_rna.py | 223 ++++++++++++--------
 1 file changed, 138 insertions(+), 85 deletions(-)

diff --git a/panpipes/python_scripts/run_scanpyQC_rna.py b/panpipes/python_scripts/run_scanpyQC_rna.py
index b1fca50d..7e063844 100644
--- a/panpipes/python_scripts/run_scanpyQC_rna.py
+++ b/panpipes/python_scripts/run_scanpyQC_rna.py
@@ -1,64 +1,67 @@
-'''
+"""
 scanpy QC script RNA
 order of QC:
 - RNA
 - PROT
 - Repertoire
 - ATAC
-'''
-import sys
-import logging
-L = logging.getLogger()
-L.setLevel(logging.INFO)
-log_handler = logging.StreamHandler(sys.stdout)
-formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s')
-log_handler.setFormatter(formatter)
-L.addHandler(log_handler)
+"""
 
+import argparse
+import logging
+import os
+import sys
 import warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
 
+import muon as mu
 import pandas as pd
-import os
-import argparse
 import scanpy as sc
-import muon as mu
 
 from panpipes.funcs.io import write_obs
 
+L = logging.getLogger()
+L.setLevel(logging.INFO)
+log_handler = logging.StreamHandler(sys.stdout)
+formatter = logging.Formatter("%(asctime)s: %(levelname)s - %(message)s")
+log_handler.setFormatter(formatter)
+L.addHandler(log_handler)
+
+warnings.simplefilter(action="ignore", category=FutureWarning)
 
 parser = argparse.ArgumentParser()
 # required option
-parser.add_argument("--sampleprefix",
-                    default="",
-                    help="prefix to prepend when saving the metadata file")
-parser.add_argument("--input_anndata",
-                    default="adata_unfilt.h5ad",
-                    help="")
-parser.add_argument("--outfile",
-                    default="adata_unfilt.h5ad",
-                    help="")
-parser.add_argument("--figdir",
-                    default="./figures/",
-                    help="path to save the figures to")
-parser.add_argument("--figure_suffix",
-                    default="_qc-plot.png",
-                    help="figures filename suffix to be appended to figures/umap")
-parser.add_argument("--scrubletdir",
-                    default=None,
-                    help="path to save the figures to")
-parser.add_argument("--ccgenes",
-                    default=None,
-                    help="path to file containing cell cycle genes")
-parser.add_argument("--customgenesfile",
-                    default=None,
-                    help="path to file containing list of genes to quantify")
-parser.add_argument("--calc_proportions",
-                    default="mitochondrial,ribosomal",
-                    help="which list of genes to use to calc proportion of mapped reads over total,per cell?")
-parser.add_argument("--score_genes",
-                    default="MarkersNeutro",
-                    help="which list of genes to use to scanpy.tl.score_genes per cell?")
+parser.add_argument(
+    "--sampleprefix", default="", help="prefix to prepend when saving the metadata file"
+)
+parser.add_argument("--input_anndata", default="adata_unfilt.h5ad", help="")
+parser.add_argument("--outfile", default="adata_unfilt.h5ad", help="")
+parser.add_argument(
+    "--figdir", default="./figures/", help="path to save the figures to"
+)
+parser.add_argument(
+    "--figure_suffix",
+    default="_qc-plot.png",
+    help="figures filename suffix to be appended to figures/umap",
+)
+parser.add_argument("--scrubletdir", default=None, help="path to save the figures to")
+parser.add_argument(
+    "--ccgenes", default=None, help="path to file containing cell cycle genes"
+)
+parser.add_argument(
+    "--customgenesfile",
+    default=None,
+    help="path to file containing list of genes to quantify",
+)
+parser.add_argument(
+    "--calc_proportions",
+    default="mitochondrial,ribosomal",
+    help="which list of genes to use to calc proportion of mapped reads over total,per cell?",
+)
+parser.add_argument(
+    "--score_genes",
+    default="MarkersNeutro",
+    help="which list of genes to use to scanpy.tl.score_genes per cell?",
+)
 
 args, opt = parser.parse_known_args()
 L.info("Running with params: %s", args)
@@ -70,31 +73,47 @@
     os.mkdir(figdir)
 
 sc.settings.figdir = figdir
-sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5))
+sc.set_figure_params(
+    scanpy=True, fontsize=14, dpi=300, facecolor="white", figsize=(5, 5)
+)
 
 
 L.info("Reading in MuData from '%s'" % args.input_anndata)
 mdata = mu.read(args.input_anndata)
-rna = mdata['rna']
+rna = mdata["rna"]
 
 
 # load the scrublet scores into the anndata (if they have been run)
 if args.scrubletdir is not None:
     scrub_dir = args.scrubletdir
     L.info("Merging in the scrublet scores from directory '%s'" % args.scrubletdir)
-    sample_ids = rna.obs[['sample_id']].drop_duplicates()
-    [scrub_dir + '/' + ss + '_scrublet_scores.txt' for ss in sample_ids.sample_id]
-    doubletscores = [pd.read_csv(scrub_dir + '/' + ss + '_scrublet_scores.txt', sep="\t", header=0) for ss in sample_ids.sample_id]
-    doubletscores = pd.concat(doubletscores, keys=sample_ids.sample_id).reset_index(level="sample_id")
+    sample_ids = rna.obs[["sample_id"]].drop_duplicates()
+
+    # [scrub_dir + "/" + ss + "_scrublet_scores.txt" for ss in sample_ids.sample_id] # ToDo: it was uncommented – why is this here?
+
+    doubletscores = [
+        pd.read_csv(scrub_dir + "/" + ss + "_scrublet_scores.txt", sep="\t", header=0)
+        for ss in sample_ids.sample_id
+    ]
+
+    doubletscores = pd.concat(doubletscores, keys=sample_ids.sample_id).reset_index(
+        level="sample_id"
+    )
     # rename the barcodes to match up qwith what the rna.obs barcodes are
     if len(sample_ids) > 1:
-        doubletscores['barcode'] = doubletscores['barcode'].astype(str) + '-' + doubletscores['sample_id'].astype(str)
-    doubletscores = doubletscores.set_index('barcode').drop('sample_id', axis=1)
+        doubletscores["barcode"] = (
+            doubletscores["barcode"].astype(str)
+            + "-"
+            + doubletscores["sample_id"].astype(str)
+        )
+    doubletscores = doubletscores.set_index("barcode").drop("sample_id", axis=1)
     # merge with rna.obs
-    rna.obs = rna.obs.merge(doubletscores, how="left", left_index=True, right_index=True)
+    rna.obs = rna.obs.merge(
+        doubletscores, how="left", left_index=True, right_index=True
+    )
 
 
-# Cell-wise QC based on .var lists of genes 
+# Cell-wise QC based on .var lists of genes
 
 qc_vars = []
 
@@ -103,61 +122,93 @@
         cat_dic = {}
         L.info("Reading in custom genes csv file from '%s'" % args.customgenesfile)
         customgenes = pd.read_csv(args.customgenesfile)
-        if not {'group', 'feature'}.issubset(customgenes.columns):
-            L.error("The custom genes file needs to have both columns, 'group' and 'feature'.")
-            sys.exit("The custom genes file needs to have both columns, 'group' and 'feature'.")
-        custom_cat = list(set(customgenes['group'].tolist()))
+        if not {"group", "feature"}.issubset(customgenes.columns):
+            L.error(
+                "The custom genes file needs to have both columns, 'group' and 'feature'."
+            )
+            sys.exit(
+                "The custom genes file needs to have both columns, 'group' and 'feature'."
+            )
+        custom_cat = list(set(customgenes["group"].tolist()))
+
         for cc in custom_cat:
-            cat_dic[cc] = customgenes.loc[customgenes["group"] == cc,"feature"].tolist()
+            cat_dic[cc] = customgenes.loc[
+                customgenes["group"] == cc, "feature"
+            ].tolist()
+
         if args.calc_proportions is not None:
             calc_proportions = args.calc_proportions.split(",")
             calc_proportions = [a.strip() for a in calc_proportions]
         else:
-            L.warning("No genes to calculate proportions for")
-            
+            L.warning(
+                "No genes passed to `args.calc_proportions` to calculate proportions for"
+            )
+
         if args.score_genes is not None:
             score_genes = args.score_genes.split(",")
             score_genes = [a.strip() for a in score_genes]
         else:
-            L.warning("No genes to calculate scores for")    
-        
+            L.warning("No genes passed to `args.score_genes` to calculate scores for")
+
     else:
-        L.error("You have not provided a list of custom genes to use for QC purposes")
-        sys.exit("You have not provided a list of custom genes to use for QC purposes")
+        L.error(
+            f"File with custom genes for QC purposes `{args.customgenesfile}` does not exist."
+        )
+        sys.exit(
+            f"File with custom genes for QC purposes `{args.customgenesfile}` does not exist."
+        )
+else:
+    L.error(
+        "You have not provided a file with a list of custom genes to use for QC purposes – `args.customgenesfile`."
+    )
+    sys.exit(
+        "You have not provided a file with a list of custom genes to use for QC purposes – `args.customgenesfile`."
+    )
 
 for kk in calc_proportions:
-    xname= kk
+    xname = kk
     gene_list = cat_dic[kk]
-    rna.var[xname] = [x in gene_list for x in rna.var_names] # annotate the group of hb genes as 'hb'
+    rna.var[xname] = [
+        x in gene_list for x in rna.var_names
+    ]  # annotate the group of hb genes as 'hb'
     qc_vars.append(xname)
 
 qc_info = ""
 if qc_vars != []:
     qc_info = " and calculating proportions for '%s'" % qc_vars
 L.info("Calculating QC metrics with scanpy.pp.calculate_qc_metrics()" + qc_info)
-sc.pp.calculate_qc_metrics(rna, qc_vars=qc_vars, percent_top=None,log1p=True, inplace=True)
+sc.pp.calculate_qc_metrics(
+    rna, qc_vars=qc_vars, percent_top=None, log1p=True, inplace=True
+)
 
 if args.score_genes is not None:
     for kk in score_genes:
         L.info("Computing gene scores for '%s'" % kk)
-        xname= kk
+        xname = kk
         gene_list = cat_dic[kk]
-        sc.tl.score_genes(rna, gene_list , 
-                            ctrl_size=min(len(gene_list), 50), 
-                            gene_pool=None, 
-                            n_bins=25, 
-                            score_name=kk + '_score', 
-                            random_state=0, 
-                            copy=False, 
-                            use_raw=None)
+        sc.tl.score_genes(
+            rna,
+            gene_list,
+            ctrl_size=min(len(gene_list), 50),
+            gene_pool=None,
+            n_bins=25,
+            score_name=kk + "_score",
+            random_state=0,
+            copy=False,
+            use_raw=None,
+        )
 
 
 if args.ccgenes is not None:
     L.info("Reading in cell cycle genes tsv file from '%s'" % args.ccgenes)
-    ccgenes = pd.read_csv(args.ccgenes, sep='\t')
-    if not {'cc_phase', 'gene_name'}.issubset(ccgenes.columns):
-        L.error("The cell cycle genes file needs to have both columns, 'cc_phase' and 'gene_name'.")
-        sys.exit("The cell cycle genes file needs to have both columns, 'cc_phase' and 'gene_name'.")
+    ccgenes = pd.read_csv(args.ccgenes, sep="\t")
+    if not {"cc_phase", "gene_name"}.issubset(ccgenes.columns):
+        L.error(
+            "The cell cycle genes file needs to have both columns, 'cc_phase' and 'gene_name'."
+        )
+        sys.exit(
+            "The cell cycle genes file needs to have both columns, 'cc_phase' and 'gene_name'."
+        )
     sgenes = ccgenes[ccgenes["cc_phase"] == "s"]["gene_name"].tolist()
     g2mgenes = ccgenes[ccgenes["cc_phase"] == "g2m"]["gene_name"].tolist()
     L.info("Calculating cell cycle scores")
@@ -166,11 +217,13 @@
 
 mdata.update()
 
-L.info("Saving updated obs in a metadata tsv file to ./" + args.sampleprefix + "_cell_metadata.tsv")
-write_obs(mdata, output_prefix=args.sampleprefix, 
-        output_suffix="_cell_metadata.tsv")
+L.info(
+    "Saving updated obs in a metadata tsv file to ./"
+    + args.sampleprefix
+    + "_cell_metadata.tsv"
+)
+write_obs(mdata, output_prefix=args.sampleprefix, output_suffix="_cell_metadata.tsv")
 L.info("Saving updated MuData to '%s'" % args.outfile)
 mdata.write(args.outfile)
 
 L.info("Done")
-

From d5e97c6c5e6dbedfaddffbaddf56cc45b6055539 Mon Sep 17 00:00:00 2001
From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com>
Date: Tue, 4 Feb 2025 10:20:33 +0100
Subject: [PATCH 04/10] feat+refactoring: adapt scirpy to 0.19+; some
 refactoring with imports and style

---
 panpipes/python_scripts/run_scanpyQC_rep.py | 141 +++++++++++---------
 1 file changed, 78 insertions(+), 63 deletions(-)

diff --git a/panpipes/python_scripts/run_scanpyQC_rep.py b/panpipes/python_scripts/run_scanpyQC_rep.py
index 315f32e9..3a19879a 100644
--- a/panpipes/python_scripts/run_scanpyQC_rep.py
+++ b/panpipes/python_scripts/run_scanpyQC_rep.py
@@ -1,61 +1,67 @@
-'''
+"""
 scanpy QC script RNA
 order of QC:
 - RNA
 - PROT
 - Repertoire
 - ATAC
-'''
-import sys
+"""
+
+import argparse
 import logging
+import sys
+import warnings
+
+import matplotlib.pyplot as plt
+import muon as mu
+import scirpy as ir
+import yaml
+
+from panpipes.funcs.io import write_obs
+
 L = logging.getLogger()
 L.setLevel(logging.INFO)
 log_handler = logging.StreamHandler(sys.stdout)
-formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s')
+formatter = logging.Formatter("%(asctime)s: %(levelname)s - %(message)s")
 log_handler.setFormatter(formatter)
 L.addHandler(log_handler)
 
-import warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
 
-import argparse
-import pandas as pd
-import muon as mu
-import scirpy as ir
-import matplotlib.pyplot as plt
+warnings.simplefilter(action="ignore", category=FutureWarning)
+
 plt.ioff()
-from panpipes.funcs.io import write_obs
-import yaml
 
 parser = argparse.ArgumentParser()
 # required option
-parser.add_argument("--sampleprefix",
-                    default="",
-                    help="prefix to prepend when saving the metadata file")
-parser.add_argument("--input_mudata",
-                    default="mdata_unfilt.h5mu",
-                    help="")
-parser.add_argument("--output_mudata",
-                    default="mdata_unfilt.h5mu",
-                    help="")
-parser.add_argument("--figdir",
-                    default="./figures/",
-                    help="path to save the figures to")
-
-parser.add_argument("--distance_metrics",
-                    default=None, type=yaml.safe_load,
-                    help="what metrics to calculate sequence distance metric? \
-                    any arguments from scirpy.pp.ir_dist as a dict in format'{arg: value}' ")
-parser.add_argument("--clonotype_metrics",
-                    default=None,type=yaml.safe_load,
-                    help="what metrics to calculate sequence distance metric? \
-                    any arguments from scirpy.pp.define_clonotypes as a dict in format'{arg: value}' ")
+parser.add_argument(
+    "--sampleprefix", default="", help="prefix to prepend when saving the metadata file"
+)
+parser.add_argument("--input_mudata", default="mdata_unfilt.h5mu", help="")
+parser.add_argument("--output_mudata", default="mdata_unfilt.h5mu", help="")
+parser.add_argument(
+    "--figdir", default="./figures/", help="path to save the figures to"
+)
+
+parser.add_argument(
+    "--distance_metrics",
+    default=None,
+    type=yaml.safe_load,
+    help="what metrics to calculate sequence distance metric? \
+                    any arguments from scirpy.pp.ir_dist as a dict in format'{arg: value}' ",
+)
+parser.add_argument(
+    "--clonotype_metrics",
+    default=None,
+    type=yaml.safe_load,
+    help="what metrics to calculate sequence distance metric? \
+                    any arguments from scirpy.pp.define_clonotypes as a dict in format'{arg: value}' ",
+)
 args, opt = parser.parse_known_args()
 L.info("Running with params: %s", args)
 
 L.info("Reading in MuData from '%s'" % args.input_mudata)
 mdata = mu.read(args.input_mudata)
-rep = mdata['rep']
+rep = mdata["rep"]
 # chain qc
 L.info("Running scirpy.tl.chain_qc()")
 ir.tl.chain_qc(rep)
@@ -72,78 +78,87 @@
 L.info("Computing sequence-distance metric")
 ir.pp.ir_dist(rep, **dist_args)
 
-# pull function arguments from args. 
+# pull function arguments from args.
 if args.clonotype_metrics is not None:
     # remove nones, so defaults are used
     clonotype_args = {k: v for k, v in args.clonotype_metrics.items() if v != "None"}
     clonotype_args = {k: v for k, v in clonotype_args.items() if v}
 else:
-    clonotype_args={}
+    clonotype_args = {}
 # define clonotypes
+
 L.info("Clonotypes args: %s" % clonotype_args)
 L.info("Defining clonotypes")
 ir.tl.define_clonotypes(rep, **clonotype_args)
+
 L.info("Adding column to obs recording which clonotypes are expanded")
 ir.tl.clonal_expansion(rep)
 
-    
-category_cols = ['has_ir']
-for cc in category_cols:
-    if pd.api.types.infer_dtype(rep.obs[cc]) != "categorical":
-        rep.obs[cc] = rep.obs[cc].astype('string').astype('category')
 
-
-if "BCR" in rep.obs['receptor_type'].values:
-    bcr = rep[rep.obs.receptor_type =="BCR", :].copy()
-else:
-    bcr=None
-    
+tcr = None
 if "TCR" in rep.obs.receptor_type.values:
-    tcr = rep[rep.obs.receptor_type =="TCR", :].copy()
-else:
-    tcr=None
+    tcr = rep[rep.obs.receptor_type == "TCR", :].copy()
+
+bcr = None
+if "BCR" in rep.obs["receptor_type"].values:
+    bcr = rep[rep.obs.receptor_type == "BCR", :].copy()
+
 
 # plot group abundances
 L.info("Plotting group abundances")
 fig, ax = plt.subplots()
 ax = ir.pl.group_abundance(rep, groupby="receptor_type", ax=ax)
-fig.savefig(args.figdir +"/barplot_group_abundance_receptor_type.png", bbox_inches="tight")
+fig.savefig(
+    args.figdir + "/barplot_group_abundance_receptor_type.png", bbox_inches="tight"
+)
 
 if bcr is not None:
     fig, ax = plt.subplots()
     ax = ir.pl.group_abundance(bcr, groupby="chain_pairing", ax=ax)
     fig.tight_layout()
-    fig.savefig(args.figdir +"/barplot_group_abundance_bcr_receptor_subtype.png", bbox_inches="tight")
+    fig.savefig(
+        args.figdir + "/barplot_group_abundance_bcr_receptor_subtype.png",
+        bbox_inches="tight",
+    )
 
 if tcr is not None:
     fig, ax = plt.subplots()
-    ax = ir.pl.group_abundance(tcr, groupby="chain_pairing",  ax=ax)
+    ax = ir.pl.group_abundance(tcr, groupby="chain_pairing", ax=ax)
     fig.tight_layout()
-    fig.savefig(args.figdir +"/barplot_group_abundance_tcr_receptor_subtype.png", bbox_inches="tight")
+    fig.savefig(
+        args.figdir + "/barplot_group_abundance_tcr_receptor_subtype.png",
+        bbox_inches="tight",
+    )
 
 # clonal expansion
 
 L.info("Plotting clonal expansion")
-clone_counts = rep.obs.groupby("receptor_type").clone_id.value_counts().to_frame("cell_counts").reset_index()
+clone_counts = (
+    rep.obs.groupby("receptor_type")
+    .clone_id.value_counts()
+    .to_frame("cell_counts")
+    .reset_index()
+)
 
 if bcr is not None:
     ax = ir.pl.clonal_expansion(bcr, groupby="sample_id")
     ax.set_title("bcr clonal expansion")
-    plt.savefig(args.figdir +"/barplot_clonal_expansion_bcr.png", bbox_inches="tight" )
+    plt.savefig(args.figdir + "/barplot_clonal_expansion_bcr.png", bbox_inches="tight")
 
 if tcr is not None:
     ax = ir.pl.clonal_expansion(tcr, groupby="sample_id")
     ax.set_title("tcr clonal expansion")
-    plt.savefig(args.figdir +"/barplot_clonal_expansion_tcr.png", bbox_inches="tight")
-
+    plt.savefig(args.figdir + "/barplot_clonal_expansion_tcr.png", bbox_inches="tight")
 
 
 mdata.update()
-L.info("Saving updated obs in a metadata tsv file to ./" + args.sampleprefix + "_cell_metadata.tsv")
-write_obs(mdata, output_prefix=args.sampleprefix, 
-    output_suffix="_cell_metadata.tsv")
+L.info(
+    "Saving updated obs in a metadata tsv file to ./"
+    + args.sampleprefix
+    + "_cell_metadata.tsv"
+)
+write_obs(mdata, output_prefix=args.sampleprefix, output_suffix="_cell_metadata.tsv")
 L.info("Saving updated MuData to '%s'" % args.output_mudata)
 mdata.write(args.output_mudata)
 
 L.info("Done")
-

From 98b72db9a674259ce70edc0b373ea957a6741759 Mon Sep 17 00:00:00 2001
From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com>
Date: Tue, 4 Feb 2025 10:42:47 +0100
Subject: [PATCH 05/10] refactoring: imports, style, redundant code

---
 panpipes/panpipes/pipeline_ingest.py | 506 ++++++++++++++++-----------
 1 file changed, 306 insertions(+), 200 deletions(-)

diff --git a/panpipes/panpipes/pipeline_ingest.py b/panpipes/panpipes/pipeline_ingest.py
index 12be6897..a5f74107 100644
--- a/panpipes/panpipes/pipeline_ingest.py
+++ b/panpipes/panpipes/pipeline_ingest.py
@@ -1,71 +1,74 @@
 #! /usr/bin/env python
 
-from ruffus import *
-import sys
+import logging
 import os
-import re
-from cgatcore import pipeline as P
-import pandas as pd
-import cgatcore.iotools as IOTools
-from panpipes.funcs.io import check_submission_file, gen_load_anndata_jobs
-# from scpipelines.funcs.processing import intersection
+import sys
 import warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
-warnings.filterwarnings("ignore", category=DeprecationWarning) 
 
-import warnings
+import cgatcore.iotools as IOTools
+import pandas as pd
+from cgatcore import pipeline as P
+from ruffus import (
+    active_if,
+    follows,
+    originate,
+    mkdir,
+    files,
+    collate,
+    transform,
+    regex,
+    formatter,
+)
 
-# warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
-# warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning)
+from panpipes.funcs.io import gen_load_anndata_jobs
 
-import yaml
+warnings.simplefilter(action="ignore", category=FutureWarning)
+warnings.filterwarnings("ignore", category=DeprecationWarning)
 
-import logging
-# L = logging.getLogger('panpipes')
-# L.setLevel(logging.INFO)
-# log_handler = logging.StreamHandler(sys.stdout)
-# log_formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s')
-# log_handler.setFormatter(log_formatter)
-# L.addHandler(log_handler)
 
 def get_logger():
     return logging.getLogger("cgatcore.pipeline")
 
 
 PARAMS = P.get_parameters(
-    ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
-     "pipeline.yml"])
-
-PARAMS['py_path'] =  os.path.join(os.path.dirname(os.path.dirname(__file__)), 'python_scripts')
-PARAMS['r_path'] = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'R_scripts')
-PARAMS['resources_path'] = os.path.join(os.path.dirname(os.path.dirname(__file__)), "resources")
+    ["%s/pipeline.yml" % os.path.splitext(__file__)[0], "pipeline.yml"]
+)
+
+PARAMS["py_path"] = os.path.join(
+    os.path.dirname(os.path.dirname(__file__)), "python_scripts"
+)
+PARAMS["r_path"] = os.path.join(os.path.dirname(os.path.dirname(__file__)), "R_scripts")
+PARAMS["resources_path"] = os.path.join(
+    os.path.dirname(os.path.dirname(__file__)), "resources"
+)
 job_kwargs = {}
 
-if PARAMS['condaenv'] is not None:
-    job_kwargs["job_condaenv"] =PARAMS['condaenv']
+if PARAMS["condaenv"] is not None:
+    job_kwargs["job_condaenv"] = PARAMS["condaenv"]
 
 
-#------------------------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------------------------
 ## Create a dictionary of modalities
-#------------------------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------------------------
 mode_dictionary = PARAMS["modalities"]
-#{'rna': True, 'prot': False, 'Repertoire': False, 'atac': False}
+# {'rna': True, 'prot': False, 'Repertoire': False, 'atac': False}
 
 # -----------------------------------------------------------------------------------------------
 ## 10X metrics plotting
 # -----------------------------------------------------------------------------------------------
 
+
 # TODO: update this to check for each modality csv file
 # TODO: check this works with current submission file
-@active_if(PARAMS['plot_10X_metrics'])
-@follows(mkdir('figures'))
-@follows(mkdir('figures/tenx_metrics'))
+@active_if(PARAMS["plot_10X_metrics"])
+@follows(mkdir("figures"))
+@follows(mkdir("figures/tenx_metrics"))
 @follows(mkdir("logs"))
 @originate("logs/0_tenx_metrics_multi_aggregate.log")
 def aggregate_tenx_metrics_multi(outfile):
     """this is to aggregate all the cellranger multi metric_summary files
     it also does some plotting
-    """    
+    """
     cmd = """
         python %(py_path)s/aggregate_cellranger_summary_metrics.py
             --pipe_df %(submission_file)s
@@ -73,64 +76,87 @@ def aggregate_tenx_metrics_multi(outfile):
             --cellranger_column_conversion_df %(resources_path)s/metrics_summary_col_conversion.tsv
             --output_file 10x_metrics.csv > %(outfile)s
             """
-    job_kwargs["job_threads"] = PARAMS['resources_threads_low']
-    log_msg = f"TASK: 'aggregate_tenx_metrics_multi'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{outfile}' FOR MORE INFORMATION."
-    get_logger().info(log_msg)   
+    job_kwargs["job_threads"] = PARAMS["resources_threads_low"]
+    log_msg = (
+        "TASK: 'aggregate_tenx_metrics_multi'"
+        + f" IN CASE OF ERROR, PLEASE REFER TO : '{outfile}' FOR MORE INFORMATION."
+    )
+    get_logger().info(log_msg)
     P.run(cmd, **job_kwargs)
 
 
 @follows(aggregate_tenx_metrics_multi)
 def process_all_tenx_metrics():
     pass
+
+
 # -----------------------------------------------------------------------------------------------
 ## Creating h5mu from filtered data files
 # -----------------------------------------------------------------------------------------------
 
+
 def unfilt_file():
-    sprefix = PARAMS['sample_prefix']
+    sprefix = PARAMS["sample_prefix"]
     unfilt_file = sprefix + "_unfilt.h5mu"
     return unfilt_file
 
 
-
 def gen_load_filtered_anndata_jobs():
-    caf = pd.read_csv(PARAMS['submission_file'], sep='\t')
-    return gen_load_anndata_jobs(caf, load_raw=False, mode_dictionary=PARAMS["modalities"], load_prot_from_raw=PARAMS['load_prot_from_raw'])
+    caf = pd.read_csv(PARAMS["submission_file"], sep="\t")
+
+    return gen_load_anndata_jobs(
+        caf,
+        load_raw=False,
+        mode_dictionary=PARAMS["modalities"],
+        load_prot_from_raw=PARAMS["load_prot_from_raw"],
+    )
 
-    
 
 @follows(mkdir("logs"))
 @follows(mkdir("tmp"))
 @active_if(PARAMS["use_existing_h5mu"] is False)
 @files(gen_load_filtered_anndata_jobs)
-def load_mudatas(rna_path, outfile, 
-              sample_id, 
-              rna_filetype,  
-              prot_path, prot_filetype, 
-              tcr_path, tcr_filetype,  
-              bcr_path, bcr_filetype, 
-              atac_path, atac_filetype, 
-              fragments_file, per_barcode_metrics_file, peak_annotation_file, 
-              cell_mtd_path):
-    
-    path_dict = {'rna':rna_path,
-                 'prot': prot_path,
-                 'bcr': bcr_path,
-                 'tcr' : tcr_path,
-                 'atac':atac_path}
+def load_mudatas(
+    rna_path,
+    outfile,
+    sample_id,
+    rna_filetype,
+    prot_path,
+    prot_filetype,
+    tcr_path,
+    tcr_filetype,
+    bcr_path,
+    bcr_filetype,
+    atac_path,
+    atac_filetype,
+    fragments_file,
+    per_barcode_metrics_file,
+    peak_annotation_file,
+    cell_mtd_path,
+):
+    path_dict = {
+        "rna": rna_path,
+        "prot": prot_path,
+        "bcr": bcr_path,
+        "tcr": tcr_path,
+        "atac": atac_path,
+    }
     print(path_dict)
-    
-    modality_dict = {k:True if path_dict[k] is not None else False for k,v in PARAMS['modalities'].items() }
+
+    modality_dict = {
+        k: True if path_dict[k] is not None else False
+        for k, v in PARAMS["modalities"].items()
+    }
     print(modality_dict)
-    
+
     cmd = """
         python %(py_path)s/make_adata_from_csv.py 
         --mode_dictionary "%(modality_dict)s"
         --sample_id %(sample_id)s
         --output_file %(outfile)s 
     """
-    
-    cmd += " --use_muon True" #this is still here?
+
+    cmd += " --use_muon True"  # this is still here?
     if rna_path is not None and pd.notna(rna_path):
         cmd += " --rna_infile %(rna_path)s"
         cmd += " --rna_filetype %(rna_filetype)s"
@@ -147,7 +173,7 @@ def load_mudatas(rna_path, outfile,
         cmd += " --fragments_file %(fragments_file)s"
     if peak_annotation_file is not None and pd.notna(peak_annotation_file):
         cmd += " --peak_annotation_file %(peak_annotation_file)s"
-      # ~ means this tests "is not nan"
+    # ~ means this tests "is not nan"
     if tcr_path is not None and pd.notna(tcr_path):
         cmd += " --tcr_filtered_contigs %(tcr_path)s"
         cmd += " --tcr_filetype %(tcr_filetype)s"
@@ -157,21 +183,22 @@ def load_mudatas(rna_path, outfile,
     logfile = f"logs/1_load_mudatas_{sample_id}.log"
     cmd += f" > {logfile}"
     # print(cmd)
-    job_kwargs["job_threads"] = PARAMS['resources_threads_medium']
-    log_msg = f"TASK: 'load_mudatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION."
+    job_kwargs["job_threads"] = PARAMS["resources_threads_medium"]
+    log_msg = (
+        "TASK: 'load_mudatas'"
+        + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION."
+    )
     get_logger().info(log_msg)
     P.run(cmd, **job_kwargs)
 
 
-
 @active_if(PARAMS["use_existing_h5mu"] is False)
-@collate(load_mudatas,
-         formatter(""),
-         unfilt_file())
+@collate(load_mudatas, formatter(""), unfilt_file())
 def concat_filtered_mudatas(infiles, outfile):
     # print(infiles)
     # print(outfile)
-    infiles_str = ','.join(infiles)
+    infiles_str = ",".join(infiles)  # TODO: not used anywhere
+
     cmd = """
     python %(py_path)s/concat_adata.py 
         --input_files_str %(infiles_str)s 
@@ -180,19 +207,22 @@ def concat_filtered_mudatas(infiles, outfile):
         --sampleprefix %(sample_prefix)s
         --join_type %(concat_join_type)s
         """
-    if PARAMS['metadatacols'] is not None and PARAMS['metadatacols'] != "":
+    if PARAMS["metadatacols"] is not None and PARAMS["metadatacols"] != "":
         cmd += " --metadatacols  %(metadatacols)s"
     if PARAMS["barcode_mtd_include"] is True:
         cmd += " --barcode_mtd_df %(barcode_mtd_path)s"
         cmd += " --barcode_mtd_metadatacols %(barcode_mtd_metadatacols)s"
-    if PARAMS['protein_metadata_table'] is not None:
+    if PARAMS["protein_metadata_table"] is not None:
         cmd += " --protein_var_table %(protein_metadata_table)s"
-    if PARAMS['index_col_choice'] is not None:
+    if PARAMS["index_col_choice"] is not None:
         cmd += " --protein_new_index_col %(index_col_choice)s"
     logfile = "logs/1_concat_filtered_mudatas.log"
     cmd += f" > {logfile}"
-    job_kwargs["job_threads"] = PARAMS['resources_threads_high']
-    log_msg = f"TASK: 'concat_filtered_mudatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION."
+    job_kwargs["job_threads"] = PARAMS["resources_threads_high"]
+    log_msg = (
+        "TASK: 'concat_filtered_mudatas'"
+        + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION."
+    )
     get_logger().info(log_msg)
     P.run(cmd, **job_kwargs)
     # P.run("rm tmp/*", job_threads=PARAMS['resources_threads_low'])
@@ -202,14 +232,16 @@ def concat_filtered_mudatas(infiles, outfile):
 ## Creating h5mu from bg data files
 # -----------------------------------------------------------------------------------------------
 
-if PARAMS['assess_background'] or (PARAMS["modalities"]["prot"] and "dsb" in PARAMS['normalisation_methods']):
-    PARAMS['bg_required'] = True
+if PARAMS["assess_background"] or (
+    PARAMS["modalities"]["prot"] and "dsb" in PARAMS["normalisation_methods"]
+):
+    PARAMS["bg_required"] = True
 else:
-    PARAMS['bg_required'] = False
-    
+    PARAMS["bg_required"] = False
+
 
 def bg_file():
-    sprefix = PARAMS['sample_prefix']
+    sprefix = PARAMS["sample_prefix"]
     use_muon = True
     if use_muon:
         bg_file = sprefix + "_bg.h5mu"
@@ -219,36 +251,55 @@ def bg_file():
 
 
 def gen_load_bg_anndata_jobs():
-    caf = pd.read_csv(PARAMS['submission_file'], sep='\t')
-    return gen_load_anndata_jobs(caf, load_raw=True, mode_dictionary=PARAMS["modalities"], load_prot_from_raw=True)
-    
+    caf = pd.read_csv(PARAMS["submission_file"], sep="\t")
+    return gen_load_anndata_jobs(
+        caf,
+        load_raw=True,
+        mode_dictionary=PARAMS["modalities"],
+        load_prot_from_raw=True,
+    )
+
 
 @active_if(PARAMS["bg_required"])
 @follows(mkdir("logs"))
 @follows(mkdir("tmp"))
 @active_if(PARAMS["use_existing_h5mu"] is False)
 @files(gen_load_bg_anndata_jobs)
-def load_bg_mudatas(rna_path, outfile, 
-              sample_id, 
-              rna_filetype,  
-              prot_path, prot_filetype, 
-              tcr_path, tcr_filetype,  
-              bcr_path, bcr_filetype, 
-              atac_path, atac_filetype, 
-              fragments_file, per_barcode_metrics_file, peak_annotation_file, 
-              cell_mtd_path):
-    path_dict = {'rna':rna_path,
-                 'prot': prot_path,
-                 'bcr': bcr_path,
-                 'tcr' : tcr_path,
-                 'atac':atac_path}
+def load_bg_mudatas(
+    rna_path,
+    outfile,
+    sample_id,
+    rna_filetype,
+    prot_path,
+    prot_filetype,
+    tcr_path,
+    tcr_filetype,
+    bcr_path,
+    bcr_filetype,
+    atac_path,
+    atac_filetype,
+    fragments_file,
+    per_barcode_metrics_file,
+    peak_annotation_file,
+    cell_mtd_path,
+):
+    path_dict = {
+        "rna": rna_path,
+        "prot": prot_path,
+        "bcr": bcr_path,
+        "tcr": tcr_path,
+        "atac": atac_path,
+    }
     print(path_dict)
-    
+
     # need to remove unnessacry modality
-    modality_dict = {k:True if path_dict[k] is not None else False for k,v in PARAMS['modalities'].items() }
+    modality_dict = {
+        k: True if path_dict[k] is not None else False
+        for k, v in PARAMS["modalities"].items()
+    }
 
-    modality_dict['tcr'] = False
-    modality_dict['bcr'] = False
+    modality_dict["tcr"] = False
+    modality_dict["bcr"] = False
     cmd = """
     python %(py_path)s/make_adata_from_csv.py 
     --sample_id %(sample_id)s
@@ -269,24 +320,27 @@ def load_bg_mudatas(rna_path, outfile,
         cmd += " --fragments_file %(fragments_file)s"
     if peak_annotation_file is not None and pd.notna(peak_annotation_file):
         cmd += " --peak_annotation_file %(peak_annotation_file)s"
-    if PARAMS['protein_metadata_table'] is not None:
-        cmd += " --protein_var_table %(protein_metadata_table)s"  #check which of these 2 needs to stay!!!
-    if PARAMS['index_col_choice'] is not None:
+    if PARAMS["protein_metadata_table"] is not None:
+        cmd += " --protein_var_table %(protein_metadata_table)s"  # check which of these 2 needs to stay!!!
+    if PARAMS["index_col_choice"] is not None:
         cmd += " --protein_new_index_col %(index_col_choice)s"
     logfile = f"logs/1_load_bg_mudatas_%(sample_id)s.log"
     cmd += " > {logfile}"
-    job_kwargs["job_threads"] = PARAMS['resources_threads_medium']
-    log_msg = f"TASK: 'load_bg_mudatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION."
+    job_kwargs["job_threads"] = PARAMS["resources_threads_medium"]
+    log_msg = (
+        "TASK: 'load_bg_mudatas'"
+        + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION."
+    )
     get_logger().info(log_msg)
     P.run(cmd, **job_kwargs)
 
 
 @active_if(PARAMS["bg_required"])
-@active_if(PARAMS['downsample_background'])
+@active_if(PARAMS["downsample_background"])
 @follows(load_bg_mudatas)
-@transform(load_bg_mudatas, 
-           regex("./tmp/(.*)_raw.h5(.*)"), 
-           r"./logs/1_\1_bg_downsampled.log")
+@transform(
+    load_bg_mudatas, regex("./tmp/(.*)_raw.h5(.*)"), r"./logs/1_\1_bg_downsampled.log"
+)
 def downsample_bg_mudatas(infile, outfile):
     cmd = """
     python %(py_path)s/downsample.py 
@@ -294,7 +348,7 @@ def downsample_bg_mudatas(infile, outfile):
     --output_mudata "%(infile)s" 
     --downsample_value 20000  > %(outfile)s
     """
-    job_kwargs["job_threads"] = PARAMS['resources_threads_medium']
+    job_kwargs["job_threads"] = PARAMS["resources_threads_medium"]
     # TODO: add log file?
     P.run(cmd, **job_kwargs)
 
@@ -304,13 +358,13 @@ def downsample_bg_mudatas(infile, outfile):
 @active_if(PARAMS["use_existing_h5mu"] is False)
 @follows(load_bg_mudatas)
 @follows(downsample_bg_mudatas)
-@collate(load_bg_mudatas,
-         formatter(""),
-         bg_file())
+@collate(load_bg_mudatas, formatter(""), bg_file())
 def concat_bg_mudatas(infiles, outfile):
     # print(infiles)
     # print(outfile)
-    infiles_str = ','.join(infiles)
+
+    infiles_str = ",".join(infiles)  # TODO: not used anywhere
+
     cmd = """
     python %(py_path)s/concat_adata.py 
         --input_files_str %(infiles_str)s 
@@ -319,76 +373,92 @@ def concat_bg_mudatas(infiles, outfile):
         --sampleprefix %(sample_prefix)s
         --join_type %(concat_join_type)s
         """
-    if PARAMS['metadatacols'] is not None and PARAMS['metadatacols'] != "":
+    if PARAMS["metadatacols"] is not None and PARAMS["metadatacols"] != "":
         cmd += " --metadatacols  %(metadatacols)s"
-  #  if PARAMS["barcode_mtd_include"] is True:
-   #     cmd += " --barcode_mtd_df %(barcode_mtd_path)s"
+    #  if PARAMS["barcode_mtd_include"] is True:
+    #     cmd += " --barcode_mtd_df %(barcode_mtd_path)s"
     #    cmd += " --barcode_mtd_metadatacols %(barcode_mtd_metadatacols)s"
     logfile = "logs/1_concat_bg_mudatas.log"
     cmd += " > {logfile}"
-    job_kwargs["job_threads"] = PARAMS['resources_threads_high']
-    log_msg = f"TASK: 'concat_bg_mudatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION."
+    job_kwargs["job_threads"] = PARAMS["resources_threads_high"]
+    log_msg = (
+        "TASK: 'concat_bg_mudatas'"
+        + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION."
+    )
     get_logger().info(log_msg)
     P.run(cmd, **job_kwargs)
     # P.run("rm tmp/*", job_threads=PARAMS['resources_threads_low'])
 
+
 # -----------------------------------------------------------------------------------------------
-## rna QC 
+## rna QC
 # -----------------------------------------------------------------------------------------------
 
 
 def orfile():
-    return PARAMS['sample_prefix'] + "_cell_metadata.tsv"
+    return PARAMS["sample_prefix"] + "_cell_metadata.tsv"
+
+
 @active_if(PARAMS["scr_run"])
-@active_if(PARAMS['modalities_rna'])
+@active_if(PARAMS["modalities_rna"])
 @active_if(PARAMS["use_existing_h5mu"] is False)
 @follows(mkdir("scrublet"))
-@transform(load_mudatas, regex(r"./tmp/(.*).h5(.*)"),
-           r"scrublet/\1_scrublet_scores.txt", r"\1")
+@transform(
+    load_mudatas, regex(r"./tmp/(.*).h5(.*)"), r"scrublet/\1_scrublet_scores.txt", r"\1"
+)
 def run_scrublet(infile, outfile, sample_id):
     outdir = "./scrublet"
+
     cmd = """python %(py_path)s/run_scrublet_scores.py
         --sample_id %(sample_id)s
         --inputpath %(infile)s
         --outdir %(outdir)s
         """
-    if PARAMS['scr_expected_doublet_rate'] is not None:
+    if PARAMS["scr_expected_doublet_rate"] is not None:
         cmd += " --expected_doublet_rate %(scr_expected_doublet_rate)s"
-    if PARAMS['scr_sim_doublet_ratio'] is not None:
+    if PARAMS["scr_sim_doublet_ratio"] is not None:
         cmd += " --sim_doublet_ratio %(scr_sim_doublet_ratio)s"
-    if PARAMS['scr_n_neighbours'] is not None:
+    if PARAMS["scr_n_neighbours"] is not None:
         cmd += " --n_neighbors %(scr_n_neighbours)s"
-    if PARAMS['scr_min_counts'] is not None:
+    if PARAMS["scr_min_counts"] is not None:
         cmd += " --min_counts %(scr_min_counts)s"
-    if PARAMS['scr_min_cells'] is not None:
+    if PARAMS["scr_min_cells"] is not None:
         cmd += " --min_cells %(scr_min_cells)s"
-    if PARAMS['scr_min_gene_variability_pctl'] is not None:
+    if PARAMS["scr_min_gene_variability_pctl"] is not None:
         cmd += " --min_gene_variability_pctl %(scr_min_gene_variability_pctl)s"
-    if PARAMS['scr_n_prin_comps'] is not None:
+    if PARAMS["scr_n_prin_comps"] is not None:
         cmd += " --n_prin_comps %(scr_n_prin_comps)s"
-    if PARAMS['scr_use_thr'] is not None:
+    if PARAMS["scr_use_thr"] is not None:
         cmd += " --use_thr %(scr_use_thr)s"
-    if PARAMS['scr_call_doublets_thr'] is not None:
+    if PARAMS["scr_call_doublets_thr"] is not None:
         cmd += " --call_doublets_thr %(scr_call_doublets_thr)s"
     logfile = "logs/2_run_scrublet_" + sample_id + ".log"
     cmd += f" > {logfile}"
-    job_kwargs["job_threads"] = PARAMS['resources_threads_medium']
-    log_msg = f"TASK: 'run_scrublet'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION."
+    job_kwargs["job_threads"] = PARAMS["resources_threads_medium"]
+    log_msg = (
+        "TASK: 'run_scrublet'"
+        + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION."
+    )
     get_logger().info(log_msg)
-    P.run(cmd,**job_kwargs)
+    P.run(cmd, **job_kwargs)
     IOTools.touch_file(outfile)
 
 
-@active_if(PARAMS['modalities_rna'])
+@active_if(PARAMS["modalities_rna"])
 @follows(mkdir("figures"))
 @follows(mkdir("figures/rna"))
 @follows(concat_filtered_mudatas, run_scrublet)
 @originate("logs/3_run_scanpy_qc_rna.log", orfile(), unfilt_file())
 def run_rna_qc(log_file, outfile, unfilt_file):
     # infile = submission file
-    resources_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "resources")
+
+    resources_path = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "resources"
+    )
+
     customgenesfile = PARAMS["custom_genes_file"]
-    calcproportions= PARAMS["calc_proportions"]
+    calcproportions = PARAMS["calc_proportions"]
+
     cmd = """
         python %(py_path)s/run_scanpyQC_rna.py
           --sampleprefix %(sample_prefix)s
@@ -398,20 +468,22 @@ def run_rna_qc(log_file, outfile, unfilt_file):
           --calc_proportions %(calcproportions)s
           --figdir figures
           """
-    if len(os.listdir('scrublet')) != 0:
+    if len(os.listdir("scrublet")) != 0:
         cmd += " --scrubletdir scrublet"
+
     # if params is specified otherwise default values will be used.
-    # if params is specified otherwise default values will be used.
-    if PARAMS['score_genes'] is not None:
-        scoregenes = PARAMS['score_genes']
+    if PARAMS["score_genes"] is not None:
+        scoregenes = PARAMS["score_genes"]
         cmd += " --score_genes %(scoregenes)s"
-    if PARAMS['ccgenes'] is not None:
-        if PARAMS['ccgenes'] == "default":
+
+    if PARAMS["ccgenes"] is not None:
+        if PARAMS["ccgenes"] == "default":
             ccgenesfile = resources_path + "/cell_cycle_genes.tsv"
         else:
-            ccgenesfile = PARAMS['ccgenes']
+            ccgenesfile = PARAMS["ccgenes"]
         cmd += " --ccgenes %(ccgenesfile)s"
-    cmd += " --channel_col %(channel_col)s" # is this needed?
+
+    cmd += " --channel_col %(channel_col)s"  # is this needed?
 
     # if PARAMS['isotype_upper_quantile'] is not None:
     #     cmd += " --isotype_upper_quantile %(isotype_upper_quantile)s"
@@ -419,29 +491,37 @@ def run_rna_qc(log_file, outfile, unfilt_file):
     #     cmd += " --isotype_n_pass %(isotype_n_pass)s"
     # add log file
     cmd += " > %(log_file)s"
-    job_kwargs["job_threads"] = PARAMS['resources_threads_high']
-    log_msg = f"TASK: 'run_rna_qc'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION."
+    job_kwargs["job_threads"] = PARAMS["resources_threads_high"]
+
+    log_msg = (
+        "TASK: 'run_rna_qc'"
+        + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION."
+    )
     get_logger().info(log_msg)
+
     P.run(cmd, **job_kwargs)
+
     if os.path.exists("cache"):
         P.run("rm -r cache")
-    #IOTools.touch_file(metadata)
+    # IOTools.touch_file(metadata)
+
 
-#TO DO ADD the other modalities in this order:
+# TO DO ADD the other modalities in this order:
 
 # prot run_scanpy_qc_prot.log
 
 
 # -----------------------------------------------------------------------------------------------
-## prot QC 
+## prot QC
 # -----------------------------------------------------------------------------------------------
-    
+
 # this is because we don't want to try and load the file during the `config` stage
 
+
 # def prot_cell_mtd():
 #     return PARAMS['sample_prefix'] + "_prot_cell_metadata.tsv"
-@active_if(PARAMS['modalities_prot'])
-@follows(mkdir('figures/prot'))
+@active_if(PARAMS["modalities_prot"])
+@follows(mkdir("figures/prot"))
 @follows(concat_filtered_mudatas, run_rna_qc)
 @originate("logs/3_run_scanpy_qc_prot.log", orfile(), unfilt_file())
 def run_scanpy_prot_qc(log_file, outfile, unfilt_file):
@@ -453,27 +533,31 @@ def run_scanpy_prot_qc(log_file, outfile, unfilt_file):
           --outfile %(unfilt_file)s
           --figdir figures/prot/
           """
-    if PARAMS['channel_col'] is not None:
+    if PARAMS["channel_col"] is not None:
         cmd += " --channel_col %(channel_col)s"
     else:
         cmd += " --channel_col sample_id"
-    if PARAMS['plotqc_prot_metrics']:
+    if PARAMS["plotqc_prot_metrics"]:
         cmd += " --per_cell_metrics %(plotqc_prot_metrics)s"
-    if PARAMS['plot_metrics_per_prot']:
+    if PARAMS["plot_metrics_per_prot"]:
         cmd += " --per_prot_metrics %(plot_metrics_per_prot)s"
-    if PARAMS['identify_isotype_outliers']:
+    if PARAMS["identify_isotype_outliers"]:
         cmd += " --identify_isotype_outliers %(identify_isotype_outliers)s"
         cmd += " --isotype_upper_quantile %(isotype_upper_quantile)s"
         cmd += " --isotype_n_pass %(isotype_n_pass)s"
     # add log file
     cmd += " > %(log_file)s"
-    job_kwargs["job_threads"] = PARAMS['resources_threads_high']
-    log_msg = f"TASK: 'run_scanpy_prot_qc'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION."
+    job_kwargs["job_threads"] = PARAMS["resources_threads_high"]
+    log_msg = (
+        "TASK: 'run_scanpy_prot_qc'"
+        + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION."
+    )
     get_logger().info(log_msg)
     P.run(cmd, **job_kwargs)
     pass
 
-@active_if(PARAMS['modalities_prot'])
+
+@active_if(PARAMS["modalities_prot"])
 @follows(run_scanpy_prot_qc, concat_filtered_mudatas, concat_bg_mudatas)
 @originate("logs/3_run_dsb_clr.log", unfilt_file(), bg_file())
 def run_dsb_clr(outfile, unfilt_file, bg_file):
@@ -485,25 +569,29 @@ def run_dsb_clr(outfile, unfilt_file, bg_file):
         --filtered_mudata %(unfilt_file)s
         --figpath ./figures/prot
         """
-    if PARAMS['channel_col'] is not None:
+    if PARAMS["channel_col"] is not None:
         cmd += " --channel_col %(channel_col)s"
-    if PARAMS['normalisation_methods'] is not None:
+    if PARAMS["normalisation_methods"] is not None:
         cmd += " --normalisation_methods %(normalisation_methods)s"
-    if PARAMS['quantile_clipping'] is not None:
+    if PARAMS["quantile_clipping"] is not None:
         cmd += " --quantile_clipping %(quantile_clipping)s"
-    if PARAMS['clr_margin'] is not None:
+    if PARAMS["clr_margin"] is not None:
         cmd += " --clr_margin %(clr_margin)s"
-    if PARAMS['save_norm_prot_mtx'] is True:
+    if PARAMS["save_norm_prot_mtx"] is True:
         cmd += " --save_mtx True"
     # find the bg data if available
     if os.path.exists(bg_file):
         cmd += " --bg_mudata %(bg_file)s"
     cmd += " > %(outfile)s"
-    job_kwargs["job_threads"] = PARAMS['resources_threads_high']
-    log_msg = f"TASK: 'run_dsb_clr'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{outfile}' FOR MORE INFORMATION."
+    job_kwargs["job_threads"] = PARAMS["resources_threads_high"]
+    log_msg = (
+        "TASK: 'run_dsb_clr'"
+        + f" IN CASE OF ERROR, PLEASE REFER TO : '{outfile}' FOR MORE INFORMATION."
+    )
     get_logger().info(log_msg)
     P.run(cmd, **job_kwargs)
 
+
 @follows(run_scanpy_prot_qc, run_dsb_clr)
 def run_prot_qc():
     pass
@@ -513,8 +601,8 @@ def run_prot_qc():
 ## Repertoire QC  # Repertoire run_scanpy_qc_rep.log
 # -----------------------------------------------------------------------------------------------
 # toggle_rep_qc = ()
-@active_if(PARAMS['modalities_bcr'] or PARAMS['modalities_tcr']  )
-@follows(mkdir('figures/rep'))
+@active_if(PARAMS["modalities_bcr"] or PARAMS["modalities_tcr"])
+@follows(mkdir("figures/rep"))
 @follows(run_rna_qc, run_prot_qc)
 @originate("logs/3_run_scanpy_qc_rep.log", unfilt_file())
 def run_repertoire_qc(logfile, unfilt_file):
@@ -527,27 +615,30 @@ def run_repertoire_qc(logfile, unfilt_file):
           --clonotype_metrics "%(clonotype_definition)s"
           """
     cmd += " > %(logfile)s"
-    job_kwargs["job_threads"] = PARAMS['resources_threads_low']
-    log_msg = f"TASK: 'run_repertoire_qc'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION."
+    job_kwargs["job_threads"] = PARAMS["resources_threads_low"]
+    log_msg = (
+        "TASK: 'run_repertoire_qc'"
+        + f" IN CASE OF ERROR, PLEASE REFER TO : '{logfile}' FOR MORE INFORMATION."
+    )
     get_logger().info(log_msg)
     P.run(cmd, **job_kwargs)
 
+
 # -----------------------------------------------------------------------------------------------
 ## atac QC # run_scanpy_qc_atac.log
 # -----------------------------------------------------------------------------------------------
-@active_if(PARAMS['modalities_atac'])
-@mkdir('figures/atac')
+@active_if(PARAMS["modalities_atac"])
+@mkdir("figures/atac")
 @follows(run_rna_qc, run_prot_qc, run_repertoire_qc)
 @originate("logs/3_run_scanpy_qc_atac.log", orfile(), unfilt_file())
 def run_atac_qc(log_file, outfile, unfilt_file):
     # if this is a multiple samples project
-    # they should be run together upfront 
+    # they should be run together upfront
     # for independent atac run cellranger-atac count, then cellranger-atac aggr
     # https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/using/aggr
     # for multiome run cellranger-arc count then cellranger-arc aggr
     # https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/aggr
-    # 
-    
+    #
 
     plot_qc_atac_metrics = PARAMS["plotqc_atac_metrics"]
     cmd = """
@@ -564,21 +655,25 @@ def run_atac_qc(log_file, outfile, unfilt_file):
         cmd += " --is_paired False"
     if PARAMS["features_tss"] is not None:
         cmd += " --is_paired False"
-        feat_file=PARAMS["features_tss"]
+        feat_file = PARAMS["features_tss"]
         cmd += " --tss_coordinates %(feat_file)s"
     if PARAMS["partner_rna"] is not None:
         cmd += " --is_paired False"
         prna_file = PARAMS["partner_rna"]
         cmd += " --paired_rna %(prna_file)s"
-    
+
     cmd += " --use_muon True"
 
     cmd += " > %(log_file)s"
-    job_kwargs["job_threads"] = PARAMS['resources_threads_low']
-    log_msg = f"TASK: 'run_atac_qc'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION."
+    job_kwargs["job_threads"] = PARAMS["resources_threads_low"]
+    log_msg = (
+        f"TASK: 'run_atac_qc'"
+        + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION."
+    )
     get_logger().info(log_msg)
     P.run(cmd, **job_kwargs)
 
+
 @follows(run_rna_qc, run_prot_qc, run_repertoire_qc, run_atac_qc)
 def run_qc():
     pass
@@ -588,10 +683,11 @@ def run_qc():
 ## QC plotting
 # -----------------------------------------------------------------------------------------------
 
+
 @follows(run_qc)
 @originate("logs/4_plot_qc.log", orfile())
 def plot_qc(log_file, cell_file):
-    qcmetrics = PARAMS['plotqc_rna_metrics']
+    qcmetrics = PARAMS["plotqc_rna_metrics"]
     cmd = """
     Rscript %(r_path)s/plotQC.R 
     --prefilter TRUE
@@ -600,19 +696,22 @@ def plot_qc(log_file, cell_file):
     --groupingvar %(plotqc_grouping_var)s
     """
     cmd += " --scanpy_or_muon muon"
-    if PARAMS["modalities"]['rna'] and PARAMS['plotqc_rna_metrics'] is not None:
+    if PARAMS["modalities"]["rna"] and PARAMS["plotqc_rna_metrics"] is not None:
         cmd += " --rna_qc_metrics %(plotqc_rna_metrics)s"
-    if PARAMS["modalities"]['prot'] and PARAMS['plotqc_prot_metrics'] is not None:
+    if PARAMS["modalities"]["prot"] and PARAMS["plotqc_prot_metrics"] is not None:
         cmd += " --prot_qc_metrics %(plotqc_prot_metrics)s"
-    if PARAMS["modalities"]['atac'] and PARAMS['plotqc_atac_metrics'] is not None:
+    if PARAMS["modalities"]["atac"] and PARAMS["plotqc_atac_metrics"] is not None:
         cmd += " --atac_qc_metrics %(plotqc_atac_metrics)s"
-    if PARAMS['modalities_bcr'] or PARAMS['modalities_tcr']:
-        if PARAMS['plotqc_rep_metrics'] is not None:
-            pqrm = ','.join(['plotqc_rep_metrics'])
+    if PARAMS["modalities_bcr"] or PARAMS["modalities_tcr"]:
+        if PARAMS["plotqc_rep_metrics"] is not None:
+            pqrm = ",".join(["plotqc_rep_metrics"])
             cmd += " --rep_qc_metrics %(pqrm)s"
     cmd += " > %(log_file)s"
-    job_kwargs["job_threads"] = PARAMS['resources_threads_low']
-    log_msg = f"TASK: 'plot_qc'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION."
+    job_kwargs["job_threads"] = PARAMS["resources_threads_low"]
+    log_msg = (
+        f"TASK: 'plot_qc'"
+        + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION."
+    )
     get_logger().info(log_msg)
     P.run(cmd, **job_kwargs)
 
@@ -621,7 +720,8 @@ def plot_qc(log_file, cell_file):
 # assess background
 # -------
 
-@active_if(PARAMS['assess_background'])
+
+@active_if(PARAMS["assess_background"])
 @follows(mkdir("figures/background"))
 @follows(run_qc)
 @follows(concat_bg_mudatas)
@@ -635,13 +735,15 @@ def run_assess_background(log_file, unfilt_file, bg_file):
         --figpath "./figures/background/"
     """
     cmd += " > %(log_file)s"
-    job_kwargs["job_threads"] = PARAMS['resources_threads_high']
-    log_msg = f"TASK: 'run_assess_background'" + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION."
+    job_kwargs["job_threads"] = PARAMS["resources_threads_high"]
+    log_msg = (
+        f"TASK: 'run_assess_background'"
+        + f" IN CASE OF ERROR, PLEASE REFER TO : '{log_file}' FOR MORE INFORMATION."
+    )
     get_logger().info(log_msg)
     P.run(cmd, **job_kwargs)
 
 
-
 # # ------------
 # TO DO change the decorator to follow all qc plots?
 @follows(plot_qc)
@@ -649,14 +751,16 @@ def run_assess_background(log_file, unfilt_file, bg_file):
 def all_rna_qc():
     pass
 
+
 @follows(run_dsb_clr, plot_qc)
 def all_prot_qc():
     pass
 
+
 @follows(process_all_tenx_metrics)
 @follows(all_prot_qc)
 @follows(all_rna_qc)
-#@originate("ruffus.check")
+# @originate("ruffus.check")
 def full():
     """
     All cgat pipelines should end with a full() function which updates,
@@ -664,9 +768,10 @@ def full():
     The @follows statement should ensure that all functions are covered,
     either directly or as prerequisites.
     """
-    #IOTools.touch_file(file)
+    # IOTools.touch_file(file)
     pass
 
+
 @originate("cleanup_done.txt")
 def cleanup(file):
     # remove any ctmp fails
@@ -676,6 +781,7 @@ def cleanup(file):
     # delete empty dirs
     P.run("find ./ -empty -type d -delete", without_cluster=True)
 
+
 def main(argv=None):
     if argv is None:
         argv = sys.argv

From d8ec8071e62cbef9240383966317a81d71fba44d Mon Sep 17 00:00:00 2001
From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com>
Date: Tue, 4 Feb 2025 10:58:27 +0100
Subject: [PATCH 06/10] refactoring: style, types, structure

---
 .../aggregate_cellranger_summary_metrics.py   | 362 +++++++++++-------
 1 file changed, 225 insertions(+), 137 deletions(-)

diff --git a/panpipes/python_scripts/aggregate_cellranger_summary_metrics.py b/panpipes/python_scripts/aggregate_cellranger_summary_metrics.py
index a08960a1..5386c65e 100644
--- a/panpipes/python_scripts/aggregate_cellranger_summary_metrics.py
+++ b/panpipes/python_scripts/aggregate_cellranger_summary_metrics.py
@@ -4,8 +4,8 @@
 # takes cellranger multi outputs and cellranger count outputs, and makes them consistent for plotting.
 
 
+from typing import Optional
 import pandas as pd
-import glob
 import os
 import re
 import argparse
@@ -16,32 +16,26 @@
 
 import sys
 import logging
+
 L = logging.getLogger()
 L.setLevel(logging.INFO)
 log_handler = logging.StreamHandler(sys.stdout)
-formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s')
+formatter = logging.Formatter("%(asctime)s: %(levelname)s - %(message)s")
 log_handler.setFormatter(formatter)
 L.addHandler(log_handler)
 
 # parse arguments
 parser = argparse.ArgumentParser()
-parser.add_argument('--pipe_df',
-                    default='',
-                    help='')
-parser.add_argument('--output_file',
-                    default='',
-                    help='')
-parser.add_argument('--figdir',
-                    default='./figures',
-                    help='')
-parser.add_argument('--cellranger_column_conversion_df',
-                    default='',
-                    help='')
+parser.add_argument("--pipe_df", default="", help="")
+parser.add_argument("--output_file", default="", help="")
+parser.add_argument("--figdir", default="./figures", help="")
+parser.add_argument("--cellranger_column_conversion_df", default="", help="")
 parser.set_defaults(verbose=True)
 args = parser.parse_args()
 
 L.info("Running with params: %s", args)
 
+
 def check_path(path):
     if os.path.isfile(path):
         return os.path.dirname(path)
@@ -50,41 +44,53 @@ def check_path(path):
     else:
         return f"{path} does not exist or is an unrecognized type."
 
-def get_metrics_summary_path(path,sample_id=None):
-    """ infers the path to metrics_summary.csv based on what cellranger algorithm is used 
+
+def get_metrics_summary_path(path: str, sample_id: Optional[str] = None):
+    """infers the path to metrics_summary.csv based on what cellranger algorithm is used
     and checks the file exists
 
     Args:
         path (str):path to the folder called 'outs'
         sample_id (str): Required if path is to cellranger multi outputs, defaults to None
-    """    
+    """
     # subset path to only go up to 'outs'
-    if 'outs' not in path:
-        L.warning("You are parsing a cellranger output but your path to raw data doesn't end with the outs folder")
+    if "outs" not in path:
+        L.warning(
+            "You are parsing a cellranger output but your path to raw data doesn't end with the outs folder"
+        )
         path = check_path(path)
     else:
         path = path.split("outs")[0] + "outs"
-    outpath=None
+
+    outpath = None
+
     # use the path to cellranger count or vdj outputs as default
-    if os.path.exists(os.path.join(path, 'metrics_summary.csv') ):
-        outpath = os.path.join(path, 'metrics_summary.csv') 
-    elif os.path.exists(os.path.join(path, 'summary.csv') ):
-        outpath = os.path.join(path, 'summary.csv') 
-    elif sample_id is not None and os.path.exists(os.path.join(path, 'per_sample_outs', sample_id, 'metrics_summary.csv') ):
-        outpath = os.path.join(path, 'per_sample_outs', sample_id, 'metrics_summary.csv')
-    elif sample_id is None and os.path.exists(os.path.join(path, 'per_sample_outs')):
-        L.warning('Input folder appears to be from cellranger multi but no sample_id is given')
+    if os.path.exists(os.path.join(path, "metrics_summary.csv")):
+        outpath = os.path.join(path, "metrics_summary.csv")
+    elif os.path.exists(os.path.join(path, "summary.csv")):
+        outpath = os.path.join(path, "summary.csv")
+    elif sample_id is not None and os.path.exists(
+        os.path.join(path, "per_sample_outs", sample_id, "metrics_summary.csv")
+    ):
+        outpath = os.path.join(
+            path, "per_sample_outs", sample_id, "metrics_summary.csv"
+        )
+    elif sample_id is None and os.path.exists(os.path.join(path, "per_sample_outs")):
+        L.warning(
+            "Input folder appears to be from cellranger multi but no sample_id is given"
+        )
     else:
         # use the alternative path from cellranger_multi outputs
-        L.warning('Path not found')
+        L.warning("Path not found")
+
     return outpath
 
 
-def detect_cellranger_algorithm(pth):
-    if 'per_sample_outs' in pth:
-        filetype='multi'
+def detect_cellranger_algorithm(pth: list[str]):
+    if "per_sample_outs" in pth:
+        filetype = "multi"
     else:
-        filetype='count'
+        filetype = "count"
     return filetype
 
 
@@ -96,203 +102,285 @@ def get_all_unique_paths(pipe_df):
 
     Returns:
         pd.DataFrame: stacked dataframe of paths
-    """    
-    pipe_df = pipe_df.set_index('sample_id')
-    pipe_df = pipe_df.loc[:,pipe_df.columns.str.endswith('path')]
+    """
+    pipe_df = pipe_df.set_index("sample_id")
+    pipe_df = pipe_df.loc[:, pipe_df.columns.str.endswith("path")]
     all_paths = pipe_df.stack().reset_index()
-    all_paths.columns = ['sample_id', 'path_type', 'path']
+    all_paths.columns = ["sample_id", "path_type", "path"]
     all_paths = all_paths.drop_duplicates()
     # rename path_type to match cellranger terminolgy
-    recode_dict = {'rna_path': "Gene Expression", 
-                   'prot_path': "Antibody Capture", 
-                   'tcr_path': 'VDJ T', 
-                   'bcr_path': 'VDJ B',
-                   'atac_path': 'ATAC' }
-    all_paths['path_type'] = all_paths['path_type'].replace(recode_dict)
+    recode_dict = {
+        "rna_path": "Gene Expression",
+        "prot_path": "Antibody Capture",
+        "tcr_path": "VDJ T",
+        "bcr_path": "VDJ B",
+        "atac_path": "ATAC",
+    }
+    all_paths["path_type"] = all_paths["path_type"].replace(recode_dict)
     return all_paths
 
 
-def parse_10x_cellranger_multi(path_df,path_col='metrics_summary_path'):
-    """takes the metrics_summary.csv from cellranger multi for each sample 
+def parse_10x_cellranger_multi(path_df, path_col="metrics_summary_path"):
+    """takes the metrics_summary.csv from cellranger multi for each sample
     and concantenates into on pd.DataFrame (long format)
 
     Args:
-        path_df (pd.DataFrame): pandas dataframe with two columns sample_id and and 
+        path_df (pd.DataFrame): pandas dataframe with two columns sample_id and and
                             path which contains the path to the outs folder.
         path_col (str): colname for the columns containing the paths
 
     Returns:
-        pd.DataFrame: 7 columns ['sample_id', 'category', 'library_type', 'grouped_by', 
+        pd.DataFrame: 7 columns ['sample_id', 'category', 'library_type', 'grouped_by',
         'group_name', metric_name', 'metric_value']
-    """   
+    """
+
     # subset to unique rows to save loading in duplicate data
-    path_df = path_df[['sample_id', path_col]].drop_duplicates()
+    path_df = path_df[["sample_id", path_col]].drop_duplicates()
+
     # read and concat
-    msums = pd.concat([pd.read_csv(f) for f in path_df[path_col]], 
-                    keys=path_df['sample_id'], 
-                    names=['sample_id']).reset_index().drop(columns='level_1')
+    msums = (
+        pd.concat(
+            [pd.read_csv(f) for f in path_df[path_col]],
+            keys=path_df["sample_id"],
+            names=["sample_id"],
+        )
+        .reset_index()
+        .drop(columns="level_1")
+    )
+
     # remove spaces and capitils from columns names
-    msums.columns = [re.sub(' ', '_', x.lower()) for x in msums.columns]
+    msums.columns = [re.sub(" ", "_", x.lower()) for x in msums.columns]
+
     # add a fix for version < cellranger 7
-    ranger = {"library_or_sample":"category"}
-    msums.rename(columns =ranger,inplace=True)
+    ranger = {"library_or_sample": "category"}
+    msums.rename(columns=ranger, inplace=True)
+
     # convert percentages from string to numeric
-    msums['metric_value'] = [re.sub(",|%", "", x) for x in msums['metric_value']]
-    msums['metric_value'] = msums['metric_value'].astype(float)
+    msums["metric_value"] = [re.sub(",|%", "", x) for x in msums["metric_value"]]
+    msums["metric_value"] = msums["metric_value"].astype(float)
+
     # sort outputs
-    msums = msums.sort_values(['metric_name', 'sample_id'])
+    msums = msums.sort_values(["metric_name", "sample_id"])
+
     return msums
 
 
-def parse_10x_cellranger_count(path_df, convert_df,  path_col='metrics_summary_path'):
-    """takes the metrics_summary.csv from cellranger count output for each sample 
+def parse_10x_cellranger_count(path_df, convert_df, path_col="metrics_summary_path"):
+    """takes the metrics_summary.csv from cellranger count output for each sample
     and concantenates into on pd.DataFrame (long format)
 
     Args:
-        path_df (pd.DataFrame): pandas dataframe with three columns sample_id, path 
-                                path which contains the path to the outs folder.and 
+        path_df (pd.DataFrame): pandas dataframe with three columns sample_id, path
+                                path which contains the path to the outs folder.and
                                 path_type which contains the type of library (one of ['Gene Expression', 'Antibody Capture', 'VDJ T', 'VDJ B'])
         path_col (str): colname for the columns containing the paths
 
     Returns:
-        pd.DataFrame: 7 columns ['sample_id', 'category', 'library_type', 'grouped_by', 
+        pd.DataFrame: 7 columns ['sample_id', 'category', 'library_type', 'grouped_by',
         'group_name', metric_name', 'metric_value']
-    """   
-    
+    """
+
     # read and concat
-    msums = pd.concat([pd.read_csv(f) for f in path_df[path_col]], 
-                    keys=[(x, y) for x, y in zip(path_df['sample_id'], 
-                                                 path_df['path_type'])], #count
-                    names=['sample_id','path_type'])
-    msums = msums.stack().reset_index().drop(columns='level_2')
+    msums = pd.concat(
+        [pd.read_csv(f) for f in path_df[path_col]],
+        keys=[
+            (x, y) for x, y in zip(path_df["sample_id"], path_df["path_type"])
+        ],  # count
+        names=["sample_id", "path_type"],
+    )
+
+    msums = msums.stack().reset_index().drop(columns="level_2")
+
     # make the columns match the conversion df
-    msums.columns = ['sample_id', 'library_type', 'count_metric_name', 'metric_value']
+    msums.columns = ["sample_id", "library_type", "count_metric_name", "metric_value"]
+
     # update the columns names
-    msums = msums.merge(convert_df).drop(columns=['count_metric_name'])
-    msums['group_name'] = msums['sample_id']
-    msums = msums[['sample_id','category', 'library_type', 'grouped_by', 'group_name', 'metric_name', 'metric_value']]
+    msums = msums.merge(convert_df).drop(columns=["count_metric_name"])
+    msums["group_name"] = msums["sample_id"]
+
+    msums = msums[
+        [
+            "sample_id",
+            "category",
+            "library_type",
+            "grouped_by",
+            "group_name",
+            "metric_name",
+            "metric_value",
+        ]
+    ]
+
     # convert percentages from string to numeric
-    msums['metric_value'] = [re.sub(",|%", "", str(x)) for x in msums['metric_value']]
-    msums['metric_value'] = msums['metric_value'].astype(float)
-    msums = msums.sort_values(['library_type', 'metric_name', 'sample_id'])
+    msums["metric_value"] = [re.sub(",|%", "", str(x)) for x in msums["metric_value"]]
+    msums["metric_value"] = msums["metric_value"].astype(float)
+
+    msums = msums.sort_values(["library_type", "metric_name", "sample_id"])
+
     return msums
-    
+
 
 L.info("Reading in tsv file '%s'" % args.cellranger_column_conversion_df)
-convert_df = pd.read_csv(args.cellranger_column_conversion_df, sep='\t')   
+convert_df = pd.read_csv(args.cellranger_column_conversion_df, sep="\t")
 
 # get the paths
-pipe_df = pd.read_csv(args.pipe_df, sep='\t')
-L.info('Searching for all cellranger paths')
+pipe_df = pd.read_csv(args.pipe_df, sep="\t")
+L.info("Searching for all cellranger paths")
 all_paths_df = get_all_unique_paths(pipe_df)
-all_paths_df['metrics_summary_path'] = all_paths_df.apply(lambda x: get_metrics_summary_path(path=x.path, sample_id=x.sample_id), axis=1)
+all_paths_df["metrics_summary_path"] = all_paths_df.apply(
+    lambda x: get_metrics_summary_path(path=x.path, sample_id=x.sample_id), axis=1
+)
+
+L.warning(f"pipe_df columns\n{pipe_df.columns}")
+L.warning(f"pipe df\n{pipe_df.to_markdown()}")
+L.warning(f"all_paths_df\n{all_paths_df.to_markdown()}")
+x = all_paths_df["metrics_summary_path"]
+L.warning(f"metrics summary path of all_paths_df\n{x.to_markdown()}")
+
 # all_paths_df_uniq = all_paths_df.drop(columns=['path', 'path_type']).drop_duplicates().reset_index(drop=True)
-all_paths_df['cellranger_type'] = [detect_cellranger_algorithm(x) for x in all_paths_df['metrics_summary_path']]
+all_paths_df["cellranger_type"] = [
+    detect_cellranger_algorithm(x) for x in all_paths_df["metrics_summary_path"]
+]
 L.info("Cellranger metrics_summary files found: %s" % all_paths_df)
 
 # parse and concatenate the tenx x metrics summary files into long format
 L.info("Parsing and concatenating the 10X metrics summary files into long format")
 tenx_metrics = []
-if any(all_paths_df['cellranger_type']=='multi'):
-    tenx_metrics.append(parse_10x_cellranger_multi(all_paths_df[all_paths_df['cellranger_type']=='multi']))
-
-if any(all_paths_df['cellranger_type']=='count'):
-    tenx_metrics.append(parse_10x_cellranger_count(all_paths_df[all_paths_df['cellranger_type']=='count'], convert_df))
+if any(all_paths_df["cellranger_type"] == "multi"):
+    tenx_metrics.append(
+        parse_10x_cellranger_multi(
+            all_paths_df[all_paths_df["cellranger_type"] == "multi"]
+        )
+    )
+
+if any(all_paths_df["cellranger_type"] == "count"):
+    tenx_metrics.append(
+        parse_10x_cellranger_count(
+            all_paths_df[all_paths_df["cellranger_type"] == "count"], convert_df
+        )
+    )
 
 tenx_metrics_full = pd.concat(tenx_metrics)
-tenx_metrics_full = tenx_metrics_full.sort_values(['library_type', 'metric_name'])
+tenx_metrics_full = tenx_metrics_full.sort_values(["library_type", "metric_name"])
 L.info("Done. Saving to '%s'" % args.output_file)
 tenx_metrics_full.to_csv(args.output_file, index=False)
 
 
 # tenx_metrics['metric_value'] = [re.sub(",|%", "", x) for x in tenx_metrics['metric_value']]
 # tenx_metrics['metric_value'] = tenx_metrics['metric_value'].astype(float)
-# split by library_type and metric_name 
+# split by library_type and metric_name
 L.info("Plotting metrics for each library_type and metric_name")
-for idx, row in tenx_metrics_full[['library_type','metric_name']].drop_duplicates().iterrows():
-    mn = row['metric_name']
-    lt = row['library_type']
-    plt_df = tenx_metrics_full[(tenx_metrics_full['library_type'] == lt) & (tenx_metrics_full['metric_name'] == mn)]  
+for idx, row in (
+    tenx_metrics_full[["library_type", "metric_name"]].drop_duplicates().iterrows()
+):
+    mn = row["metric_name"]
+    lt = row["library_type"]
+    plt_df = tenx_metrics_full[
+        (tenx_metrics_full["library_type"] == lt)
+        & (tenx_metrics_full["metric_name"] == mn)
+    ]
     if len(plt_df.category.unique()) > 1:
         # if multiple categories i.e. Cells and Libraries, these are likely duplicate rows
-        plt_df = plt_df[['sample_id', 'metric_name', 'library_type' , 'metric_value']].drop_duplicates()
-    if len(plt_df['sample_id']) == len(plt_df['sample_id'].unique()):
+        plt_df = plt_df[
+            ["sample_id", "metric_name", "library_type", "metric_value"]
+        ].drop_duplicates()
+    if len(plt_df["sample_id"]) == len(plt_df["sample_id"].unique()):
         L.info("Plotting barplot for library_type %s and metric_name %s" % (lt, mn))
-        fig = sns.barplot(plt_df, x='sample_id', y='metric_value', color='grey')
+        fig = sns.barplot(plt_df, x="sample_id", y="metric_value", color="grey")
         fig.set_xticklabels(fig.get_xticklabels(), rotation=90)
-        fig.set_title(lt + ':' + mn)
-        plt.savefig(os.path.join(args.figdir, lt + '-' + mn + '.png'), bbox_inches='tight')
+        fig.set_title(lt + ":" + mn)
+        plt.savefig(
+            os.path.join(args.figdir, lt + "-" + mn + ".png"), bbox_inches="tight"
+        )
     else:
         # do a boxplot instead
         L.info("Plotting boxplot for library_type %s and metric_name %s" % (lt, mn))
-        fig = sns.boxplot(plt_df, x='sample_id', y='metric_value', color='grey')
+        fig = sns.boxplot(plt_df, x="sample_id", y="metric_value", color="grey")
         fig.set_xticklabels(fig.get_xticklabels(), rotation=90)
-        fig.set_title(lt + ':' + mn)
-        plt.savefig(os.path.join(args.figdir, lt + '-' + mn + '.png'), bbox_inches='tight')
+        fig.set_title(lt + ":" + mn)
+        plt.savefig(
+            os.path.join(args.figdir, lt + "-" + mn + ".png"), bbox_inches="tight"
+        )
     plt.clf()
 
 
 ## gene expression  extra plots -----------------------------
 
-rna_tenx_metrics = tenx_metrics_full[tenx_metrics_full['library_type'] == 'Gene Expression']
+rna_tenx_metrics = tenx_metrics_full[
+    tenx_metrics_full["library_type"] == "Gene Expression"
+]
 
 # sequencing_saturaion plot scatter plot
-plot_metrics = ['Sequencing saturation', 'Mean reads per cell', 'Estimated number of cells', 'Number of reads', 'Median UMI counts per cell']
+plot_metrics = [
+    "Sequencing saturation",
+    "Mean reads per cell",
+    "Estimated number of cells",
+    "Number of reads",
+    "Median UMI counts per cell",
+]
 plt_df = rna_tenx_metrics[rna_tenx_metrics.metric_name.isin(plot_metrics)]
-plt_df = plt_df[['sample_id', 'metric_name', 'metric_value']]
-plt_tab = plt_df.pivot_table(index='sample_id', columns='metric_name', values='metric_value', aggfunc=sum)
-# sns.scatterplot(data=plt_tab, 
-#                 x='Sequencing saturation', 
+plt_df = plt_df[["sample_id", "metric_name", "metric_value"]]
+plt_tab = plt_df.pivot_table(
+    index="sample_id", columns="metric_name", values="metric_value", aggfunc=sum
+)
+# sns.scatterplot(data=plt_tab,
+#                 x='Sequencing saturation',
 #                 y='Number of reads', size='Mean reads per cell',sizes=(20, 200))
 
-if 'Sequencing saturation' in plt_df.metric_name.unique(): 
+if "Sequencing saturation" in plt_df.metric_name.unique():
     L.info("Plotting scatter plot of sequencing saturation against number of reads")
     f, ax = plt.subplots()
-    points = ax.scatter(x=plt_tab['Sequencing saturation'], 
-                y=plt_tab['Number of reads'], 
-                c=plt_tab['Estimated number of cells'])
+    points = ax.scatter(
+        x=plt_tab["Sequencing saturation"],
+        y=plt_tab["Number of reads"],
+        c=plt_tab["Estimated number of cells"],
+    )
     # define labels
-    plt.xlabel('Sequencing saturation')
-    plt.ylabel('Number of reads')
+    plt.xlabel("Sequencing saturation")
+    plt.ylabel("Number of reads")
     # define color bar
     cbar = f.colorbar(points)
     cbar.ax.get_yaxis().labelpad = 15
-    cbar.ax.set_ylabel('Estimated # of cells', rotation=270)
+    cbar.ax.set_ylabel("Estimated # of cells", rotation=270)
     # put it on a grid
-    plt.grid(axis='both', color='0.95')
+    plt.grid(axis="both", color="0.95")
     # save
-    plt.savefig(os.path.join(args.figdir,'10x_sequencing_saturation_summary.png'))
+    plt.savefig(os.path.join(args.figdir, "10x_sequencing_saturation_summary.png"))
     plt.clf()
 
 # plot 2 -  cells vs UMI counts per cell
 
-L.info("Plotting scatter plot of estimated number of cells against median UMI counts per cell")
+L.info(
+    "Plotting scatter plot of estimated number of cells against median UMI counts per cell"
+)
 fig, ax = plt.subplots()
-x=list(np.log10(plt_tab['Estimated number of cells']))
-y=list(np.log10(plt_tab['Median UMI counts per cell']))
-ax.scatter(x=x, 
-            y=y)
+x = list(np.log10(plt_tab["Estimated number of cells"]))
+y = list(np.log10(plt_tab["Median UMI counts per cell"]))
+ax.scatter(x=x, y=y)
 # annotate each point
 for i, txt in enumerate(plt_tab.index):
-    ax.annotate(txt, (x[i], y[i]), size=8, ha='center')
-plt.xlabel('log10(Estimated number of cells)')
-plt.ylabel('log10(Median UMI counts per cell)')
+    ax.annotate(txt, (x[i], y[i]), size=8, ha="center")
+plt.xlabel("log10(Estimated number of cells)")
+plt.ylabel("log10(Median UMI counts per cell)")
 # put a boundary box around the good cells
-plt.vlines(x=(np.log10(500), np.log10(20000)), 
-           ymin=min(1.5, min(y)),
-           ymax=max(5, max(y)),
-           colors='black', linestyles='dotted'
-           )
-plt.hlines(y=(3, 4), 
-            xmin=min(1.5, min(x)),
-           xmax=max(5, max(x)),
-           colors='black', linestyles='dotted'
+plt.vlines(
+    x=(np.log10(500), np.log10(20000)),
+    ymin=min(1.5, min(y)),
+    ymax=max(5, max(y)),
+    colors="black",
+    linestyles="dotted",
+)
+plt.hlines(
+    y=(3, 4),
+    xmin=min(1.5, min(x)),
+    xmax=max(5, max(x)),
+    colors="black",
+    linestyles="dotted",
 )
 
 # put it on a grid
-plt.grid(axis='both', color='0.95')
+plt.grid(axis="both", color="0.95")
 # save
-plt.savefig(os.path.join(args.figdir,'10x_cells_by_UMIs.png'))
+plt.savefig(os.path.join(args.figdir, "10x_cells_by_UMIs.png"))
 plt.clf()
 
-L.info("Done")
\ No newline at end of file
+L.info("Done")

From 4ce06ac0c471813f1dd30d6bc3fa49b921d1e205 Mon Sep 17 00:00:00 2001
From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com>
Date: Tue, 4 Feb 2025 12:13:08 +0100
Subject: [PATCH 07/10] refactoring: style and error messages

---
 panpipes/python_scripts/concat_adata.py | 184 +++++++++++++-----------
 1 file changed, 98 insertions(+), 86 deletions(-)

diff --git a/panpipes/python_scripts/concat_adata.py b/panpipes/python_scripts/concat_adata.py
index 0fab949d..13846df1 100644
--- a/panpipes/python_scripts/concat_adata.py
+++ b/panpipes/python_scripts/concat_adata.py
@@ -1,64 +1,57 @@
-import scanpy as sc
-import glob
-import pandas as pd
 import argparse
+import logging
 import re
-from  muon import MuData
+import sys
+import warnings
+
 import muon as mu
+import pandas as pd
 
-import warnings
-# warnings.filterwarnings("ignore", category=UserWarning)
-# warnings.filterwarnings("ignore", category=FutureWarning)
+from panpipes.funcs.processing import concat_mdatas
 
-from panpipes.funcs.processing import  concat_mdatas
+pd.set_option("display.max_rows", None)
 
-pd.set_option('display.max_rows', None)
-# import os
-# os.chdir("../../data/run_qc_general")
-import sys
-import logging
 L = logging.getLogger()
 L.setLevel(logging.INFO)
 log_handler = logging.StreamHandler(sys.stdout)
-formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s')
+formatter = logging.Formatter("%(asctime)s: %(levelname)s - %(message)s")
 log_handler.setFormatter(formatter)
 L.addHandler(log_handler)
 
 
-
 parser = argparse.ArgumentParser()
-parser.add_argument('--input_files_str',
-                    default='',
-                    help='')
-parser.add_argument('--output_file',
-                    default='',
-                    help='mdata.h5mu')
-parser.add_argument("--submissionfile",
-                    default="caf_example",
-                    help="this file has all the samples of the experiment. it has at least 3 columns: \
+parser.add_argument("--input_files_str", default="", help="")
+parser.add_argument("--output_file", default="", help="mdata.h5mu")
+parser.add_argument(
+    "--submissionfile",
+    default="caf_example",
+    help="this file has all the samples of the experiment. it has at least 3 columns: \
                     sample_id,path and type. \
-                    it can contain other anonymous columns which will be renamed using the metadatacols information")
-parser.add_argument("--sampleprefix",
-                    default="",
-                    help="prefix to prepend when saving the metadata file")
-parser.add_argument("--metadatacols",
-                    default=None,
-                    help="names of the metadata to rename the columns of the sample submission file ")
-parser.add_argument("--join_type",
-                    default="inner",
-                    help="whether to concat anndata on the 'inner' (default) or 'outer'")
-parser.add_argument('--barcode_mtd_df',
-                    default=None,
-                    help='csv file contining barcode level metadata')
-parser.add_argument('--barcode_mtd_metadatacols',
-                    default=None,
-                    help='comma separated strings listing the column you want to keep in barcode_mtd_df')            
-parser.add_argument('--protein_var_table',
-                    default=None,
-                    help='')
-parser.add_argument('--protein_new_index_col',
-                    default=None,
-                    help='')        
+                    it can contain other anonymous columns which will be renamed using the metadatacols information",
+)
+parser.add_argument(
+    "--sampleprefix", default="", help="prefix to prepend when saving the metadata file"
+)
+parser.add_argument(
+    "--metadatacols",
+    default=None,
+    help="names of the metadata to rename the columns of the sample submission file ",
+)
+parser.add_argument(
+    "--join_type",
+    default="inner",
+    help="whether to concat anndata on the 'inner' (default) or 'outer'",
+)
+parser.add_argument(
+    "--barcode_mtd_df", default=None, help="csv file contining barcode level metadata"
+)
+parser.add_argument(
+    "--barcode_mtd_metadatacols",
+    default=None,
+    help="comma separated strings listing the column you want to keep in barcode_mtd_df",
+)
+parser.add_argument("--protein_var_table", default=None, help="")
+parser.add_argument("--protein_new_index_col", default=None, help="")
 
 parser.set_defaults(verbose=True)
 args, opt = parser.parse_known_args()
@@ -67,10 +60,10 @@
 if len(args.input_files_str) == 0:
     L.error("No input files detected")
     sys.exit("No input files detected")
-    
-lf = re.split(',', args.input_files_str)
 
-sfile=args.submissionfile
+lf = re.split(",", args.input_files_str)
+
+sfile = args.submissionfile
 
 #  lf = glob.glob("./tmp/*raw.h5mu")
 # sfile= "CARTC_bonemarrow_samples.tsv"
@@ -83,7 +76,7 @@
 # [x.var_names_make_unique() for x in mdatas]
 
 
-if len(mdatas)==1:
+if len(mdatas) == 1:
     # if mdatas length is 0 then no concatenation required
     mdata = mdatas[0]
 else:
@@ -91,15 +84,13 @@
     # with the atac, should it just error out to say
     # that atac shouldn't be concatenated?
     temp = mdatas[0]
-    if 'atac' in temp.mod.keys():
-        mdata=mdatas[0]
+    if "atac" in temp.mod.keys():
+        mdata = mdatas[0]
         del temp
-    elif 'prot' in temp.mod.keys() or 'rna' in temp.mod.keys():
+    elif "prot" in temp.mod.keys() or "rna" in temp.mod.keys():
         ## IF RNA and PROT is ok to concatenate ----------
         L.info("Concatenating RNA and prot")
-        mdata = concat_mdatas(mdatas,
-            batch_key="sample_id", 
-            join_type=args.join_type)
+        mdata = concat_mdatas(mdatas, batch_key="sample_id", join_type=args.join_type)
 
 # remove to avoid mem issues
 del mdatas
@@ -113,20 +104,22 @@
 metadatacols = args.metadatacols
 # metadatacols="mdata_cols"
 # check sample_id is in metadatacols as a minimum requirement
-if metadatacols == None :
+if metadatacols is None:
     pass
-elif metadatacols == 'None':
+elif metadatacols.lower() == "none":
     pass
 else:
-    metadatacols = metadatacols.split(',')
-    if 'sample_id' not in metadatacols:
-        metadatacols = ['sample_id'] + metadatacols
+    metadatacols = metadatacols.split(",")
+
+    if "sample_id" not in metadatacols:
+        metadatacols = ["sample_id"] + metadatacols
+
     # check all metadatacols are in the caf
     if all([x in caf.columns for x in metadatacols]):
-        # if mdata is actually a mudata it will be helpful 
-        # (although annoying to store this in rna and protein as 
+        # if mdata is actually a mudata it will be helpful
+        # (although annoying to store this in rna and protein as
         # well since the batch columns etc will be here)
-        # hopefullt his whole bit will become unnesacry because of 
+        # hopefullt his whole bit will become unnesacry because of
         # https://github.com/scverse/mudata/issues/2
         for mod in mdata.mod.keys():
             L.debug(mod)
@@ -134,21 +127,36 @@
             assay = mdata[mod]
             # make sure there is no index name because rep might have one as default
             assay.obs.index.name = None
-            assay.obs = assay.obs.reset_index().merge(caf[metadatacols], on="sample_id", how="left").set_index('index').copy()
+            assay.obs = (
+                assay.obs.reset_index()
+                .merge(caf[metadatacols], on="sample_id", how="left")
+                .set_index("index")
+                .copy()
+            )
             # remove the index name introduced with the set index.
             assay.obs.index_name = None
             L.debug(assay.obs.head())
+
         mdata.update_obs()
+
         # merge metadata
         # obs = mdata.obs
-        # obs['cellbarcode'] = obs.index  
-        mdata.obs=mdata.obs.reset_index().merge(caf[metadatacols], on="sample_id", how="left").set_index('index')
+        # obs['cellbarcode'] = obs.index
+        mdata.obs = (
+            mdata.obs.reset_index()
+            .merge(caf[metadatacols], on="sample_id", how="left")
+            .set_index("index")
+        )
+
         mdata.obs.index.name = None
-        mdata.obs['cellbarcode'] = mdata.obs.index
+
+        mdata.obs["cellbarcode"] = mdata.obs.index
+
         mdata.update_obs()
     else:
         # which column is missing?
-        missingcols= " ".join([x for x in metadatacols if x not in caf.columns])
+        missingcols = " ".join([x for x in metadatacols if x not in caf.columns])
+        sys.exit(caf.columns)
         L.error("Required columns missing form samples file: %s" % missingcols)
         sys.exit("Required columns missing form samples file: %s" % missingcols)
 
@@ -157,22 +165,26 @@
 #     col_update = mdata['rep'].obs.columns[mdata['rep'].obs.columns.str.contains("count")]
 #     mdata['rep'].obs[col_update] = mdata['rep'].obs[col_update].apply(pd.to_numeric)
 #     mdata.update()
-    
+
 # add in cell level metadata
-if args.barcode_mtd_df is not None :
+if args.barcode_mtd_df is not None:
     L.info("Adding barcode level metadata")
     # check demult_metadatacols exists and contains antibody
-    barcode_metadatacols = args.barcode_mtd_metadatacols.split(',')
+    barcode_metadatacols = args.barcode_mtd_metadatacols.split(",")
     # load the demultiplexing data
     barcode_mtd_df = pd.read_csv(args.barcode_mtd_df, index_col=None)
-    if len(barcode_mtd_df['sample_id'].unique().tolist()) > 1:
-        barcode_mtd_df.index = barcode_mtd_df[['barcode_id', 'sample_id']].apply(lambda row: "-".join(row.values.astype(str)), axis=1)
+    if len(barcode_mtd_df["sample_id"].unique().tolist()) > 1:
+        barcode_mtd_df.index = barcode_mtd_df[["barcode_id", "sample_id"]].apply(
+            lambda row: "-".join(row.values.astype(str)), axis=1
+        )
     else:
-        barcode_mtd_df.index = barcode_mtd_df['barcode_id']
-    barcode_mtd_df= barcode_mtd_df.drop(columns=['sample_id'])
+        barcode_mtd_df.index = barcode_mtd_df["barcode_id"]
+    barcode_mtd_df = barcode_mtd_df.drop(columns=["sample_id"])
     # add for each modality
     for mod in mdata.mod.keys():
-        mdata[mod].obs = mdata[mod].obs.merge(barcode_mtd_df, left_index=True, right_index=True)
+        mdata[mod].obs = mdata[mod].obs.merge(
+            barcode_mtd_df, left_index=True, right_index=True
+        )
         mdata.update()
     # add at top level
     mdata.obs = mdata.obs.merge(barcode_mtd_df, left_index=True, right_index=True)
@@ -184,28 +196,28 @@
 if args.protein_var_table is not None:
     try:
         L.info("Reading in protein var table from '%s'" % args.protein_var_table)
-        df = pd.read_csv(args.protein_var_table, sep='\t', index_col=0)
+        df = pd.read_csv(args.protein_var_table, sep="\t", index_col=0)
         L.info("Merging protein table with var")
         # add_var_mtd(mdata['prot'], df)
-        var_df = mdata['prot'].var.merge(df, left_index=True, right_index=True)
+        var_df = mdata["prot"].var.merge(df, left_index=True, right_index=True)
         if args.protein_new_index_col is not None:
             L.info("Updating prot.var index")
             # update_var_index(mdata['prot'], args.protein_new_index_col)
-            var_df =var_df.reset_index().set_index(args.protein_new_index_col)
-            var_df = var_df.rename(columns={'index':'orig_id'})
+            var_df = var_df.reset_index().set_index(args.protein_new_index_col)
+            var_df = var_df.rename(columns={"index": "orig_id"})
             var_df.index.name = None
-        mdata['prot'].var = var_df
+        mdata["prot"].var = var_df
         mdata.update_var()
         mdata.update()
         # we might want to split hashing antibodies into a separate modalities
         # we assume this has been inidicated in a "hashing_ab" column in the protein metadata file
-        if "hashing_ab" in mdata['prot'].var.columns:
+        if "hashing_ab" in mdata["prot"].var.columns:
             # create new modality for hashing
-            #mdata.mod["hashing_ab"]=mdata["prot"][:, mdata["prot"].var["hashing_ab"]]
+            # mdata.mod["hashing_ab"]=mdata["prot"][:, mdata["prot"].var["hashing_ab"]]
             # subset old modality to remove hashing
-            mdata.mod['prot'] = mdata["prot"][:, ~mdata["prot"].var["hashing_ab"]]
+            mdata.mod["prot"] = mdata["prot"][:, ~mdata["prot"].var["hashing_ab"]]
     except FileNotFoundError:
-        warnings.warn("Protein metadata table not found")
+        warnings.warn(f"Protein metadata table not found {args.protein_var_table}")
 mdata.update()
 # tidy up metadata
 # move sample_id to the front

From a21a9de7c787cefbf36d9f51160a3f34aa59cc7a Mon Sep 17 00:00:00 2001
From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com>
Date: Tue, 4 Feb 2025 12:18:06 +0100
Subject: [PATCH 08/10] fix & refactoring: double group_by; style; replace
 prints with cats

---
 panpipes/R_scripts/plotQC.R | 756 ++++++++++++++++++++----------------
 1 file changed, 422 insertions(+), 334 deletions(-)

diff --git a/panpipes/R_scripts/plotQC.R b/panpipes/R_scripts/plotQC.R
index ecff7b14..32a8ce9b 100644
--- a/panpipes/R_scripts/plotQC.R
+++ b/panpipes/R_scripts/plotQC.R
@@ -1,139 +1,179 @@
-# plotting utilities for qc 
+# plotting utilities for qc
 suppressPackageStartupMessages({
   library(tidyverse)
   library(optparse)
 })
 
-do_scatter_plot <- function(df, x, y, facet=NULL, hue=NULL){
-  g <- df %>% 
-    ggplot(aes_string(x=x,y=y, color=hue)) + 
-    geom_point(size=0.5) 
-  if (!is.null(facet)){
-    g <- g + facet_wrap(as.formula(paste0("~", sc)), ncol=6) 
+do_scatter_plot <- function(df, x, y, facet = NULL, hue = NULL) {
+  g <- df %>%
+    ggplot(aes_string(x = x, y = y, color = hue)) +
+    geom_point(size = 0.5)
+  if (!is.null(facet)) {
+    g <- g + facet_wrap(as.formula(paste0("~", sc)), ncol = 6)
   }
-  
-  if(is.numeric(df[hue])){
+
+  if (is.numeric(df[hue])) {
     g <- g + scale_color_viridis_c()
   }
-  g <- g+ theme_bw()+
-    theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5),
-          strip.text.x = element_text(size=6),
-          legend.key.size = unit(0.2, 'cm')) 
+  g <- g + theme_bw() +
+    theme(
+      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
+      strip.text.x = element_text(size = 6),
+      legend.key.size = unit(0.2, "cm")
+    )
   return(g)
 }
 
-do_violin_plot <- function(df, qc, group){
-  g <- df %>% drop_na() %>%
-    ggplot(aes_string(x="sample_id",y=qc)) +
-    geom_violin(aes_string(fill=group)) + 
-    {if(qc=="doublet_scores") geom_hline(yintercept=0.25, color="grey50", linetype="dashed") }+
-    {if("pct_counts" %in% qc) geom_hline(yintercept=c(5, 10, 20 ,70), color="grey50", linetype="dashed")}+
-    {if("pct_counts" %in% qc) coord_cartesian(ylim=c(0,100))}+
-    theme_bw()+
-    theme(axis.text.x=element_text(size=8, angle=90),
-          axis.text.y=element_text(size=8))
-  if(length(unique(df[[group]])) > 10){
-    print(paste(sc, "has too many categories, removing legend"))
-    g <- g + theme(legend.position="none")
+do_violin_plot <- function(df, qc, group) {
+  g <- df %>%
+    drop_na() %>%
+    ggplot(aes_string(x = "sample_id", y = qc)) +
+    geom_violin(aes_string(fill = group)) +
+    {
+      if (qc == "doublet_scores") geom_hline(yintercept = 0.25, color = "grey50", linetype = "dashed")
+    } +
+    {
+      if ("pct_counts" %in% qc) geom_hline(yintercept = c(5, 10, 20, 70), color = "grey50", linetype = "dashed")
+    } +
+    {
+      if ("pct_counts" %in% qc) coord_cartesian(ylim = c(0, 100))
+    } +
+    theme_bw() +
+    theme(
+      axis.text.x = element_text(size = 8, angle = 90),
+      axis.text.y = element_text(size = 8)
+    )
+  if (length(unique(df[[group]])) > 10) {
+    cat(sc, "has too many categories, removing legend\n")
+    g <- g + theme(legend.position = "none")
   }
-  
+
   return(g)
 }
 
-do_bar_plot <- function(df, qc, group){
+do_bar_plot <- function(df, qc, group) {
   g <- df %>%
-    ggplot(aes_string(x=group, fill=qc)) +
-    geom_bar(aes_string()) + 
-    theme_bw()+
-    theme(axis.text.x=element_text(size=8, angle=90),
-          axis.text.y=element_text(size=8))
-#   if(length(unique(df[[group]])) > 10){
-#     message(paste0(sc, "has too many categories, removing legend"))
-#     g <- g + theme(legend.position="none")
-#   }
-  
+    ggplot(aes_string(x = group, fill = qc)) +
+    geom_bar(aes_string()) +
+    theme_bw() +
+    theme(
+      axis.text.x = element_text(size = 8, angle = 90),
+      axis.text.y = element_text(size = 8)
+    )
+  #   if(length(unique(df[[group]])) > 10){
+  #     message(paste0(sc, "has too many categories, removing legend"))
+  #     g <- g + theme(legend.position="none")
+  #   }
+
   return(g)
 }
 
 options(stringsAsFactors = F)
-options(bitmaptype="cairo")
+options(bitmaptype = "cairo")
 
 option_list <- list(
-  make_option(c("--cell_metadata"), default=NULL,
-              help="the path to the anndata object"),
-  make_option(c("--groupingvar"), default="sample_id,tissue,patient,channel",
-              help="names of grouping variables"),
-  make_option(c("--rna_qc_metrics"), default=NULL, 
-              help="the qc_metrics to plot"),
-  make_option(c("--prot_qc_metrics"), default=NULL, 
-              help="the qc_metrics to plot"),
-  make_option(c("--rep_qc_metrics"), default=NULL, 
-              help="the qc_metrics to plot"),
-  make_option(c("--atac_qc_metrics"), default=NULL, 
-              help="the qc_metrics to plot"),
-  make_option(c("--outdir"), default="./figures/",
-              help="the name of the output folder"),
-  make_option(c("--prefilter"), default=TRUE,
-              help="am i parsing data before or after filtering?"),
-  make_option(c("--sampleprefix"), default="",
-              help="the prefix to prepend to save summary filtering plots"),
-  make_option(c("--scanpy_or_muon"), default="scanpy", 
-              help="was the input file written out from the obs of scanpy or muon")
+  make_option(c("--cell_metadata"),
+    default = NULL,
+    help = "the path to the anndata object"
+  ),
+  make_option(c("--groupingvar"),
+    default = "sample_id,tissue,patient,channel",
+    help = "names of grouping variables"
+  ),
+  make_option(c("--rna_qc_metrics"),
+    default = NULL,
+    help = "the qc_metrics to plot"
+  ),
+  make_option(c("--prot_qc_metrics"),
+    default = NULL,
+    help = "the qc_metrics to plot"
+  ),
+  make_option(c("--rep_qc_metrics"),
+    default = NULL,
+    help = "the qc_metrics to plot"
+  ),
+  make_option(c("--atac_qc_metrics"),
+    default = NULL,
+    help = "the qc_metrics to plot"
+  ),
+  make_option(c("--outdir"),
+    default = "./figures/",
+    help = "the name of the output folder"
+  ),
+  make_option(c("--prefilter"),
+    default = TRUE,
+    help = "am i parsing data before or after filtering?"
+  ),
+  make_option(c("--sampleprefix"),
+    default = "",
+    help = "the prefix to prepend to save summary filtering plots"
+  ),
+  make_option(c("--scanpy_or_muon"),
+    default = "scanpy",
+    help = "was the input file written out from the obs of scanpy or muon"
+  )
 )
 
 
-opt <- parse_args(OptionParser(option_list=option_list))
+opt <- parse_args(OptionParser(option_list = option_list))
 
-if(is.null(opt$outdir)) { opt$outdir <- paste0(getwd(),"/")}
-if(!grepl("\\/$", opt$outdir)){opt$outdir <- paste(opt$outdir, "/", sep = "")}
-if(!file.exists(opt$outdir)){dir.create(opt$outdir)}
+if (is.null(opt$outdir)) {
+  opt$outdir <- paste0(getwd(), "/")
+}
+if (!grepl("\\/$", opt$outdir)) {
+  opt$outdir <- paste(opt$outdir, "/", sep = "")
+}
+if (!file.exists(opt$outdir)) {
+  dir.create(opt$outdir)
+}
 
 
-run<- opt$outdir
-opt[which(opt=="NULL")] <- NULL
-opt[which(opt=="None")] <- NULL
+run <- opt$outdir
+opt[which(opt == "NULL")] <- NULL
+opt[which(opt == "None")] <- NULL
 opt$prefilter <- as.logical(opt$prefilter)
 
 # load_data --------------------------------------------------------------------
 
-data_plot = read.delim(opt$cell_metadata)
+data_plot <- read.delim(opt$cell_metadata)
 
 # define source facet for all plots
-if (!is.null(opt$groupingvar)){
-  source_facet <- strsplit(opt$groupingvar,",")[[1]]
-  check_excl <- source_facet[!source_facet %in% colnames(data_plot)] 
+if (!is.null(opt$groupingvar)) {
+  source_facet <- strsplit(opt$groupingvar, ",")[[1]]
+  check_excl <- source_facet[!source_facet %in% colnames(data_plot)]
   keep_source <- NULL
-  for (cc in check_excl){
-    for (mod in c("rna","prot","atac","rep")){
-      id <- paste(mod,cc,sep=".")
-      if(id %in% colnames(data_plot)){
-        keep_source <- c(keep_source,id)
+  for (cc in check_excl) {
+    for (mod in c("rna", "prot", "atac", "rep")) {
+      id <- paste(mod, cc, sep = ".")
+      if (id %in% colnames(data_plot)) {
+        keep_source <- c(keep_source, id)
       }
     }
   }
 
   source_facet <- source_facet[source_facet %in% colnames(data_plot)]
-  source_facet <- unique(c(source_facet, keep_source) )
-  if(length(source_facet)>0){
+
+  source_facet <- unique(c(source_facet, keep_source))
+  if (length(source_facet) > 0) {
     # add sample_id as a minimum requirement if it's not there already
-    source_facet = unique(c("sample_id", source_facet))
-    print(paste("Facet plotting with",source_facet))
+    source_facet <- unique(c("sample_id", source_facet))
+    cat("Facet plotting with:", source_facet, "\n")
   }
-}else{
+} else {
   stop("Need the minimum variable _sampleid_ to facet on, will stop here")
 }
 # RNA plots --------------------------------------------------------------------
 
-print("RNA plots")
+cat("RNA plots\n")
 
-if(opt$scanpy_or_muon=="scanpy"){
+if (opt$scanpy_or_muon == "scanpy") {
   rna_data_plot <- data_plot
-}else{
-  rna_data_plot <- data_plot[,grep("^rna\\.",colnames(data_plot))]
+} else {
+  rna_data_plot <- data_plot[, grep("^rna\\.", colnames(data_plot))]
   colnames(rna_data_plot) <- gsub("^rna\\.", "", colnames(rna_data_plot))
 }
 
-outpath = file.path(run, "rna")
+outpath <- file.path(run, "rna")
 if (!dir.exists(outpath)) dir.create(outpath)
 
 
@@ -143,68 +183,80 @@ if (!dir.exists(outpath)) dir.create(outpath)
 #             "log1p_total_counts",
 #             "n_genes_by_counts",
 #             "log1p_n_genes_by_counts",
-#             "doublet_scores", 
-#             "pct_counts_mt", 
-#             "pct_counts_rp", 
+#             "doublet_scores",
+#             "pct_counts_mt",
+#             "pct_counts_rp",
 #             "pct_counts_ig",
 #             "pct_counts_hb")
 
 # check these qc metrics are in the file
 if (!is.null(opt$rna_qc_metrics)) {
-  qcmetrics <- strsplit(opt$rna_qc_metrics,",")[[1]]
+  qcmetrics <- strsplit(opt$rna_qc_metrics, ",")[[1]]
 }
 qcmetrics <- qcmetrics[qcmetrics %in% colnames(rna_data_plot)]
 uniq_sample_id <- nrow(unique(rna_data_plot["sample_id"]))
-rna_source_facet <- gsub("^rna\\.", "",grep("^rna.", source_facet, value = TRUE))
+rna_source_facet <- gsub("^rna\\.", "", grep("^rna.", source_facet, value = TRUE))
 rna_source_facet <- unique(c(rna_source_facet, source_facet[!grepl("^rna.", source_facet)]))
 rna_source_facet <- rna_source_facet[rna_source_facet %in% colnames(rna_data_plot)]
-for (qc in qcmetrics){
-  print(paste("Plotting violin plots of", qc))
-  for (sc in rna_source_facet){ #add gsub temp here
+for (qc in qcmetrics) {
+  cat("Plotting violin plots of", qc, "\n")
+  for (sc in rna_source_facet) { # add gsub temp here
     g <- do_violin_plot(rna_data_plot, qc, sc)
-    if (uniq_sample_id  > 50){width=12}else{width=6}
-    ggsave(g, filename=file.path(outpath, paste0("violin_", sc, "_rna-", qc,".png")), type="cairo", width= width, height=6)
-    
+    if (uniq_sample_id > 50) {
+      width <- 12
+    } else {
+      width <- 6
+    }
+    ggsave(g, filename = file.path(outpath, paste0("violin_", sc, "_rna-", qc, ".png")), type = "cairo", width = width, height = 6)
   }
 }
 
-for (sc in rna_source_facet){
+for (sc in rna_source_facet) {
   uniq_source <- nrow(unique(rna_data_plot[sc]))
-  if(uniq_source >6){
-    ncols=6
-    nrows=ceiling(uniq_source/6)
-  }else{
-    ncols=uniq_source
-    nrows=1
+  if (uniq_source > 6) {
+    ncols <- 6
+    nrows <- ceiling(uniq_source / 6)
+  } else {
+    ncols <- uniq_source
+    nrows <- 1
   }
   # plot once per source facet
-  if (all(c("total_counts","n_genes_by_counts")%in% colnames(rna_data_plot))){
-    print("Plotting scatter plot of total_counts and n_genes_by_counts")
-    g <- do_scatter_plot(rna_data_plot,x="total_counts",y="n_genes_by_counts", facet=sc)
-    ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-nUMI_v_rna-genes.png")), type="cairo",
-           width= 3*ncols, height=3*nrows, dpi=200)
+  if (all(c("total_counts", "n_genes_by_counts") %in% colnames(rna_data_plot))) {
+    cat("Plotting scatter plot of total_counts and n_genes_by_counts\n")
+    g <- do_scatter_plot(rna_data_plot, x = "total_counts", y = "n_genes_by_counts", facet = sc)
+    ggsave(g,
+      filename = file.path(outpath, paste0("scatter_", sc, "_rna-nUMI_v_rna-genes.png")), type = "cairo",
+      width = 3 * ncols, height = 3 * nrows, dpi = 200
+    )
   }
-  
-  if (all(c("log1p_total_counts","log1p_n_genes_by_counts")%in% colnames(rna_data_plot))){
-    print("Plotting scatter plot of log1p_total_counts and n_genes_by_counts")
-    g <- do_scatter_plot(rna_data_plot,x="log1p_total_counts",y="log1p_n_genes_by_counts", facet=sc)
-    ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-log1p_nUMI_v_rna-log1p_genes.png")), type="cairo",
-           width= 3*ncols, height=3*nrows, dpi=200)
+
+  if (all(c("log1p_total_counts", "log1p_n_genes_by_counts") %in% colnames(rna_data_plot))) {
+    cat("Plotting scatter plot of log1p_total_counts and n_genes_by_counts\n")
+    g <- do_scatter_plot(rna_data_plot, x = "log1p_total_counts", y = "log1p_n_genes_by_counts", facet = sc)
+    ggsave(g,
+      filename = file.path(outpath, paste0("scatter_", sc, "_rna-log1p_nUMI_v_rna-log1p_genes.png")), type = "cairo",
+      width = 3 * ncols, height = 3 * nrows, dpi = 200
+    )
   }
-  if (all(c("total_counts","pct_counts_mt")%in% colnames(rna_data_plot))){
-    print("Plotting scatter plot of total_counts and pct_counts_mt")
-    g <- do_scatter_plot(rna_data_plot,x="total_counts",y="pct_counts_mt", facet=sc)
-    ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-nUMI_v_rna-pct_mt.png")), type="cairo",
-           width= 3*ncols, height=3*nrows, dpi=200)
+  if (all(c("total_counts", "pct_counts_mt") %in% colnames(rna_data_plot))) {
+    cat("Plotting scatter plot of total_counts and pct_counts_mt\n")
+    g <- do_scatter_plot(rna_data_plot, x = "total_counts", y = "pct_counts_mt", facet = sc)
+    ggsave(g,
+      filename = file.path(outpath, paste0("scatter_", sc, "_rna-nUMI_v_rna-pct_mt.png")), type = "cairo",
+      width = 3 * ncols, height = 3 * nrows, dpi = 200
+    )
   }
-  if (all(c("n_genes_by_counts","doublet_scores","total_counts")%in% colnames(rna_data_plot))){
-    print("Plotting scatter plot of n_genes_by_counts and doublet_scores")
-    g <- do_scatter_plot(rna_data_plot,x="n_genes_by_counts",y="doublet_scores",
-                         hue="total_counts", facet=sc)
-    ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-genes_rna-doublet_scores_rna-numi.png")),    
-           type="cairo", width= 3*ncols, height=3*nrows, dpi=200)
+  if (all(c("n_genes_by_counts", "doublet_scores", "total_counts") %in% colnames(rna_data_plot))) {
+    cat("Plotting scatter plot of n_genes_by_counts and doublet_scores\n")
+    g <- do_scatter_plot(rna_data_plot,
+      x = "n_genes_by_counts", y = "doublet_scores",
+      hue = "total_counts", facet = sc
+    )
+    ggsave(g,
+      filename = file.path(outpath, paste0("scatter_", sc, "_rna-genes_rna-doublet_scores_rna-numi.png")),
+      type = "cairo", width = 3 * ncols, height = 3 * nrows, dpi = 200
+    )
   }
-  
 }
 
 
@@ -213,300 +265,336 @@ for (sc in rna_source_facet){
 
 
 
-if(!is.null(opt$prot_qc_metrics)){
-  print("Protein plots")
+if (!is.null(opt$prot_qc_metrics)) {
+  cat("Protein plots\n")
 
-  qcmetrics <- strsplit(opt$prot_qc_metrics,",")[[1]]
-  prot_data_plot <- data_plot[,grep("^prot\\.",colnames(data_plot))]
+  qcmetrics <- strsplit(opt$prot_qc_metrics, ",")[[1]]
+  prot_data_plot <- data_plot[, grep("^prot\\.", colnames(data_plot))]
   colnames(prot_data_plot) <- gsub("^prot\\.", "", colnames(prot_data_plot))
-  prot_source_facet <- gsub("^prot\\.", "",grep("^prot.", source_facet, value = TRUE))
+  prot_source_facet <- gsub("^prot\\.", "", grep("^prot.", source_facet, value = TRUE))
   prot_source_facet <- unique(c(prot_source_facet, source_facet[!grepl("^prot.", source_facet)]))
   prot_source_facet <- prot_source_facet[prot_source_facet %in% colnames(prot_data_plot)]
 
-  outpath = file.path(run, "prot")
-  if (!dir.exists(outpath)) { 
+  outpath <- file.path(run, "prot")
+  if (!dir.exists(outpath)) {
     dir.create(outpath)
-    }
+  }
   uniq_sample_id <- nrow(unique(prot_data_plot["sample_id"]))
 
   # check these qc metrics are in the file
   qcmetrics <- qcmetrics[qcmetrics %in% colnames(prot_data_plot)]
-  for (qc in qcmetrics){
-    print(paste("Plotting violin plots of", qc))
-    for (sc in prot_source_facet){
+  for (qc in qcmetrics) {
+    cat("Plotting violin plots of", qc, "\n")
+    for (sc in prot_source_facet) {
       g <- do_violin_plot(prot_data_plot, qc, sc)
-      if (uniq_sample_id  > 50){width=12}else{width=6}
-      ggsave(g, filename=file.path(outpath, paste0("violin_", sc, "_prot-", qc,".png")), type="cairo", width= width, height=6)
+      if (uniq_sample_id > 50) {
+        width <- 12
+      } else {
+        width <- 6
+      }
+      ggsave(g, filename = file.path(outpath, paste0("violin_", sc, "_prot-", qc, ".png")), type = "cairo", width = width, height = 6)
     }
   }
-  
+
   # do the following plots:
   # - prot:total_counts vs prot_nprot_by_counts
   # - prot:total_counts vs prot:isotype_counts
   # - prot:log1p_total_counts vs prot:log1p_isotype_counts
   # - prot:total_counts vs prot:pct_isotype_counts
-  
-  
-  for (sc in prot_source_facet){
+
+
+  for (sc in prot_source_facet) {
     uniq_source <- nrow(unique(prot_data_plot[sc]))
-    if(uniq_source >6){
-      ncols=6
-      nrows=ceiling(uniq_source/6)
-    }else{
-      ncols=uniq_source
-      nrows=1
+    if (uniq_source > 6) {
+      ncols <- 6
+      nrows <- ceiling(uniq_source / 6)
+    } else {
+      ncols <- uniq_source
+      nrows <- 1
     }
     # plot once per source facet
 
-    if (all(c("total_counts","n_prot_by_counts")%in% colnames(prot_data_plot))){
-      print("Plotting scatter plot of total_counts and n_prot_by_counts")
-      g <- do_scatter_plot(prot_data_plot,x="total_counts",y="n_prot_by_counts", facet=sc)
-      ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-prot.png")), type="cairo",
-             width= 3*ncols, height=3*nrows, dpi=200)
+    if (all(c("total_counts", "n_prot_by_counts") %in% colnames(prot_data_plot))) {
+      cat("Plotting scatter plot of total_counts and n_prot_by_counts\n")
+      g <- do_scatter_plot(prot_data_plot, x = "total_counts", y = "n_prot_by_counts", facet = sc)
+      ggsave(g,
+        filename = file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-prot.png")), type = "cairo",
+        width = 3 * ncols, height = 3 * nrows, dpi = 200
+      )
     }
-    if (all(c("log1p_total_counts","log1p_n_prot_by_counts")%in% colnames(prot_data_plot))){
-      print("Plotting scatter plot of log1p_total_counts and log1p_n_prot_by_counts")
-      g <- do_scatter_plot(prot_data_plot,x="log1p_total_counts",y="log1p_n_prot_by_counts", facet=sc)
-      ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-log1p_nUMI_v_prot-log1p_prot.png")), type="cairo",
-             width= 3*ncols, height=3*nrows, dpi=200)
+    if (all(c("log1p_total_counts", "log1p_n_prot_by_counts") %in% colnames(prot_data_plot))) {
+      cat("Plotting scatter plot of log1p_total_counts and log1p_n_prot_by_counts\n")
+      g <- do_scatter_plot(prot_data_plot, x = "log1p_total_counts", y = "log1p_n_prot_by_counts", facet = sc)
+      ggsave(g,
+        filename = file.path(outpath, paste0("scatter_", sc, "_prot-log1p_nUMI_v_prot-log1p_prot.png")), type = "cairo",
+        width = 3 * ncols, height = 3 * nrows, dpi = 200
+      )
     }
-    if (all(c("total_counts","pct_counts_isotype")%in% colnames(prot_data_plot))){
-      print("Plotting scatter plot of total_counts and pct_counts_isotype")
-      g <- do_scatter_plot(prot_data_plot,x="total_counts",y="pct_counts_isotype", facet=sc)
-      ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-pct_isotype.png")), type="cairo",
-             width= 3*ncols, height=3*nrows, dpi=200)
+    if (all(c("total_counts", "pct_counts_isotype") %in% colnames(prot_data_plot))) {
+      cat("Plotting scatter plot of total_counts and pct_counts_isotype\n")
+      g <- do_scatter_plot(prot_data_plot, x = "total_counts", y = "pct_counts_isotype", facet = sc)
+      ggsave(g,
+        filename = file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-pct_isotype.png")), type = "cairo",
+        width = 3 * ncols, height = 3 * nrows, dpi = 200
+      )
     }
-    if (all(c("total_counts","total_counts_isotype")%in% colnames(prot_data_plot))){
-      print("Plotting scatter plot of total_counts and total_counts_isotype")
-      g <- do_scatter_plot(prot_data_plot,x="total_counts",y="total_counts_isotype", facet=sc)
-      ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-total_counts_isotype.png")), type="cairo",
-             width= 3*ncols, height=3*nrows, dpi=200)
+    if (all(c("total_counts", "total_counts_isotype") %in% colnames(prot_data_plot))) {
+      cat("Plotting scatter plot of total_counts and total_counts_isotype\n")
+      g <- do_scatter_plot(prot_data_plot, x = "total_counts", y = "total_counts_isotype", facet = sc)
+      ggsave(g,
+        filename = file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-total_counts_isotype.png")), type = "cairo",
+        width = 3 * ncols, height = 3 * nrows, dpi = 200
+      )
     }
-    if (all(c("log1p_total_counts","log1p_total_counts_isotype", "isotype_exclude")%in% colnames(prot_data_plot))){
-      print("Plotting scatter plot of log1p_total_counts and log1p_total_counts_isotype")
-      g <- do_scatter_plot(prot_data_plot,x="log1p_total_counts",y="log1p_total_counts_isotype", hue="isotype_exclude", facet=sc)
-      ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-log1p_nUMI_v_prot-log1p_total_counts_isotype.png")), type="cairo",
-             width= 3*ncols, height=3*nrows, dpi=200)
+    if (all(c("log1p_total_counts", "log1p_total_counts_isotype", "isotype_exclude") %in% colnames(prot_data_plot))) {
+      cat("Plotting scatter plot of log1p_total_counts and log1p_total_counts_isotype\n")
+      g <- do_scatter_plot(prot_data_plot, x = "log1p_total_counts", y = "log1p_total_counts_isotype", hue = "isotype_exclude", facet = sc)
+      ggsave(g,
+        filename = file.path(outpath, paste0("scatter_", sc, "_prot-log1p_nUMI_v_prot-log1p_total_counts_isotype.png")), type = "cairo",
+        width = 3 * ncols, height = 3 * nrows, dpi = 200
+      )
     }
   }
-
 }
 # Atac plots ----------------------------------------------------------------
 
 
-if(!is.null(opt$atac_qc_metrics)){
-  print("ATAC plots")
-  atac_data_plot <- data_plot[,grep("^atac\\.",colnames(data_plot))]
+if (!is.null(opt$atac_qc_metrics)) {
+  cat("ATAC plots\n")
+  atac_data_plot <- data_plot[, grep("^atac\\.", colnames(data_plot))]
   colnames(atac_data_plot) <- gsub("^atac\\.", "", colnames(atac_data_plot))
-  atac_source_facet <- gsub("^atac\\.", "",grep("^atac.", source_facet, value = TRUE))
+  atac_source_facet <- gsub("^atac\\.", "", grep("^atac.", source_facet, value = TRUE))
   atac_source_facet <- unique(c(atac_source_facet, source_facet[!grepl("^atac.", source_facet)]))
   atac_source_facet <- atac_source_facet[atac_source_facet %in% colnames(atac_data_plot)]
 
 
-  outpath = file.path(run, "atac")
+  outpath <- file.path(run, "atac")
   if (!dir.exists(outpath)) dir.create(outpath)
-  
+
   uniq_sample_id <- nrow(unique(atac_data_plot["sample_id"]))
 
   # check these qc metrics are in the file
   qcmetrics <- qcmetrics[qcmetrics %in% colnames(atac_data_plot)]
-  for (qc in qcmetrics){
+  for (qc in qcmetrics) {
     print(paste("Plotting violin plots of", qc))
-    for (sc in atac_source_facet){
+    for (sc in atac_source_facet) {
       g <- do_violin_plot(atac_data_plot, qc, sc)
-      if (uniq_sample_id  > 50){width=12}else{width=6}
-      ggsave(g, filename=file.path(outpath, paste0("violin_", sc, "_atac-", qc,".png")), type="cairo", width= width, height=6)
+      if (uniq_sample_id > 50) {
+        width <- 12
+      } else {
+        width <- 6
+      }
+      ggsave(g, filename = file.path(outpath, paste0("violin_", sc, "_atac-", qc, ".png")), type = "cairo", width = width, height = 6)
     }
   }
 }
-  
+
 # Rep plots ----------------------------------------------------------------
 
 
 if (!is.null(opt$rep_qc_metrics)) {
-  print("Repertoire plots")
-  qcmetrics <- strsplit(opt$rep_qc_metrics,",")[[1]]
+  cat("Repertoire plots\n")
+  qcmetrics <- strsplit(opt$rep_qc_metrics, ",")[[1]]
   qcmetrics <- gsub("rep:", "", qcmetrics)
-  rep_data_plot <- data_plot[,grep("^rep\\.",colnames(data_plot))]
+  rep_data_plot <- data_plot[, grep("^rep\\.", colnames(data_plot))]
   colnames(rep_data_plot) <- gsub("^rep\\.", "", colnames(rep_data_plot))
-  rep_data_plot = rep_data_plot %>% filter(sample_id!="")
-  
-  rep_source_facet <- gsub("^rep\\.", "",grep("^rep.", source_facet, value = TRUE))
+  rep_data_plot <- rep_data_plot %>% filter(sample_id != "")
+
+  rep_source_facet <- gsub("^rep\\.", "", grep("^rep.", source_facet, value = TRUE))
   rep_source_facet <- unique(c(rep_source_facet, source_facet[!grepl("^rep.", source_facet)]))
   rep_source_facet <- rep_source_facet[rep_source_facet %in% colnames(rep_data_plot)]
 
-  
-  outpath = file.path(run, "rep")
+
+  outpath <- file.path(run, "rep")
   if (!dir.exists(outpath)) dir.create(outpath)
-  
-  
+
+
   uniq_sample_id <- nrow(unique(rep_data_plot["sample_id"]))
 
   # check these qc metrics are in the file
   qcmetrics <- qcmetrics[qcmetrics %in% colnames(rep_data_plot)]
-  for (qc in qcmetrics){
-    print(paste("Plotting bar plots of", qc))
-    for (sc in rep_source_facet){
+  for (qc in qcmetrics) {
+    cat("Plotting bar plots of", qc. "\n")
+    for (sc in rep_source_facet) {
       g <- do_bar_plot(rep_data_plot, qc, sc)
-      if (uniq_sample_id  > 50){width=12}else{width=6}
-        ggsave(g, filename=file.path(outpath, paste0("bar_", sc, "_rep-", qc,".png")), type="cairo", width= width, height=6)
+      if (uniq_sample_id > 50) {
+        width <- 12
+      } else {
+        width <- 6
+      }
+      ggsave(g, filename = file.path(outpath, paste0("bar_", sc, "_rep-", qc, ".png")), type = "cairo", width = width, height = 6)
 
-      if (!(qc %in% c('has_ir', "receptor_type"))){
+      if (!(qc %in% c("has_ir", "receptor_type"))) {
         g <- do_bar_plot(rep_data_plot, qc, sc) + facet_grid(~receptor_type)
-        ggsave(g, filename=file.path(outpath, paste0("bar_facet_", sc, "_rep-", qc,".png")), type="cairo", width= width*4, height=6)
-
+        ggsave(g, filename = file.path(outpath, paste0("bar_facet_", sc, "_rep-", qc, ".png")), type = "cairo", width = width * 4, height = 6)
       }
     }
   }
 }
 
 
-if(!is.null(opt$prot_qc_metrics)){
-print("RNA vs. Protein plots")
+if (!is.null(opt$prot_qc_metrics)) {
+  cat("RNA vs. Protein plots\n")
 
-# rna vs prot plots ----------------------------------------------------------------
-  
+  # rna vs prot plots ----------------------------------------------------------------
 
-  outpath = file.path(run, "rna_v_prot")
-  if (!dir.exists(outpath)) dir.create(outpath)    
-  
 
-  for (sc in source_facet){
+  outpath <- file.path(run, "rna_v_prot")
+  if (!dir.exists(outpath)) dir.create(outpath)
+
+
+  for (sc in source_facet) {
     uniq_source <- nrow(unique(data_plot[sc]))
-    if(uniq_source >6){
-      ncols=6
-      nrows=ceiling(uniq_source/6)
-    }else{
-      ncols=uniq_source
-      nrows=1
+    if (uniq_source > 6) {
+      ncols <- 6
+      nrows <- ceiling(uniq_source / 6)
+    } else {
+      ncols <- uniq_source
+      nrows <- 1
+    }
+    if (all(c("rna.total_counts", "prot.total_counts") %in% colnames(data_plot))) {
+      cat("Plotting scatter plot of rna.total_counts and prot.total_counts\n")
+      g <- do_scatter_plot(data_plot, x = "rna.total_counts", y = "prot.total_counts", facet = sc)
+      ggsave(g,
+        filename = file.path(outpath, paste0("scatter_", sc, "-nUMI_v_rna-nUMI.png")), type = "cairo",
+        width = 3 * ncols, height = 3 * nrows, dpi = 200
+      )
     }
-    if (all(c("rna.total_counts","prot.total_counts")%in% colnames(data_plot))){
-      print("Plotting scatter plot of rna.total_counts and prot.total_counts")
-      g <- do_scatter_plot(data_plot,x="rna.total_counts",y="prot.total_counts", facet=sc)
-      ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-nUMI_v_rna-nUMI.png")), type="cairo",
-              width= 3*ncols, height=3*nrows, dpi=200)
+    if (all(c("rna.log1p_total_counts", "prot.log1p_total_counts") %in% colnames(data_plot))) {
+      cat("Plotting scatter plot of rna.log1p_total_counts and prot.log1p_total_counts\n")
+      g <- do_scatter_plot(data_plot, x = "rna.log1p_total_counts", y = "prot.log1p_total_counts", facet = sc)
+      ggsave(g,
+        filename = file.path(outpath, paste0("scatter_", sc, "-log1p_nUMI_v_rna-log1p_nUMI.png")), type = "cairo",
+        width = 3 * ncols, height = 3 * nrows, dpi = 200
+      )
     }
-    if (all(c("rna.log1p_total_counts","prot.log1p_total_counts")%in% colnames(data_plot))){
-      print("Plotting scatter plot of rna.log1p_total_counts and prot.log1p_total_counts")
-      g <- do_scatter_plot(data_plot,x="rna.log1p_total_counts",y="prot.log1p_total_counts", facet=sc)
-      ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-log1p_nUMI_v_rna-log1p_nUMI.png")), type="cairo",
-              width= 3*ncols, height=3*nrows, dpi=200)
+    if (all(c("rna.total_counts", "prot.total_counts_isotype") %in% colnames(data_plot))) {
+      cat("Plotting scatter plot of rna.total_counts and prot.total_counts_isotype\n")
+      g <- do_scatter_plot(data_plot, x = "rna.total_counts", y = "prot.total_counts_isotype", facet = sc)
+      ggsave(g,
+        filename = file.path(outpath, paste0("scatter_", sc, "-nUMI_v_prot-counts_isotype.png")), type = "cairo",
+        width = 3 * ncols, height = 3 * nrows, dpi = 200
+      )
     }
-    if (all(c("rna.total_counts","prot.total_counts_isotype")%in% colnames(data_plot))){
-      print("Plotting scatter plot of rna.total_counts and prot.total_counts_isotype")
-      g <-  do_scatter_plot(data_plot,x="rna.total_counts",y="prot.total_counts_isotype", facet=sc)
-      ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-nUMI_v_prot-counts_isotype.png")), type="cairo",
-              width= 3*ncols, height=3*nrows, dpi=200)
-    }    
-    if (all(c("rna.log1p_total_counts","prot.log1p_total_counts_isotype") %in% colnames(data_plot))){
-      print("Plotting scatter plot of rna.log1p_total_counts and prot.log1p_total_counts_isotype")
-      g <-  do_scatter_plot(data_plot,x="rna.log1p_total_counts",y="prot.log1p_total_counts_isotype", facet=sc)
-      ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-log1p_nUMI_v_prot-log1p_counts_isotype.png")), type="cairo",
-              width= 3*ncols, height=3*nrows, dpi=200)
+    if (all(c("rna.log1p_total_counts", "prot.log1p_total_counts_isotype") %in% colnames(data_plot))) {
+      cat("Plotting scatter plot of rna.log1p_total_counts and prot.log1p_total_counts_isotype\n")
+      g <- do_scatter_plot(data_plot, x = "rna.log1p_total_counts", y = "prot.log1p_total_counts_isotype", facet = sc)
+      ggsave(g,
+        filename = file.path(outpath, paste0("scatter_", sc, "-log1p_nUMI_v_prot-log1p_counts_isotype.png")), type = "cairo",
+        width = 3 * ncols, height = 3 * nrows, dpi = 200
+      )
     }
-    if (all(c("rna.doublet_scores","prot.log1p_total_counts") %in% colnames(data_plot))){
-      print("Plotting scatter plot of rna.doublet_scores and prot.log1p_total_counts")
-      g <-  do_scatter_plot(data_plot,x="rna.doublet_scores",y="prot.log1p_total_counts", facet=sc)
-      ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-doublet_scores_v_prot-log1p_nUMI.png")), type="cairo",
-              width= 3*ncols, height=3*nrows, dpi=200)
-      
+    if (all(c("rna.doublet_scores", "prot.log1p_total_counts") %in% colnames(data_plot))) {
+      cat("Plotting scatter plot of rna.doublet_scores and prot.log1p_total_counts\n")
+      g <- do_scatter_plot(data_plot, x = "rna.doublet_scores", y = "prot.log1p_total_counts", facet = sc)
+      ggsave(g,
+        filename = file.path(outpath, paste0("scatter_", sc, "-doublet_scores_v_prot-log1p_nUMI.png")), type = "cairo",
+        width = 3 * ncols, height = 3 * nrows, dpi = 200
+      )
     }
-    
-    
   }
-  
 }
 
 
 sprefix <- opt$sampleprefix
 
-if(opt$prefilter){
-  print("Saving counts tables for references")
-  if(all(c("pct_counts_mt", "pct_counts_hb", "n_genes_by_counts", "doublet_scores") %in% colnames(rna_data_plot))){
-    f1 <- rna_data_plot %>% 
-      dplyr::filter(pct_counts_mt<=20 & pct_counts_hb<=70 &n_genes_by_counts>=100 & doublet_scores<=0.25) %>%
-      group_by_at(.vars=c(rna_source_facet)) %>%
-      group_by_at(.vars=c(rna_source_facet)) %>%
-      summarise(cell.count= n()) %>%
-      group_by_at(.vars="sample_id") %>% 
-      rename(cell.count_f1=cell.count) 
-    
-    f2 <- rna_data_plot %>% 
-      dplyr::filter(pct_counts_mt<=10 & pct_counts_hb<=50 &n_genes_by_counts>=100 & doublet_scores<=0.25) %>%
-      group_by_at(.vars=c(rna_source_facet)) %>%
-      summarise(cell.count= n()) %>% 
-      group_by_at(.vars="sample_id") %>% 
-      rename(cell.count_f2=cell.count) 
-    
-    f3 <- rna_data_plot %>% 
-      dplyr::filter(pct_counts_mt<=5 & pct_counts_hb<=50 &n_genes_by_counts>=100 & n_genes_by_counts<=3000) %>%
-      group_by_at(.vars=c(rna_source_facet)) %>%
-      summarise(cell.count= n()) %>% 
-      group_by_at(.vars="sample_id") %>% 
-      rename(cell.count_f3=cell.count) 
-    
-    baseline <- rna_data_plot %>% 
-      group_by_at(.vars=c(rna_source_facet)) %>%
-      summarise(cell.count= n()) %>% 
-      group_by_at(.vars="sample_id") %>% 
-      rename(baseline.counts=cell.count)
-    
-    info <- merge(merge(merge(f1,f2,by=rna_source_facet,all=TRUE),f3, by=rna_source_facet, all=TRUE), baseline, by=rna_source_facet, all=T) %>%
-      mutate(percent_retain_f1 = 100*cell.count_f1/baseline.counts,
-             percent_retain_f2 = 100*cell.count_f2/baseline.counts,
-             percent_retain_f3 = 100*cell.count_f3/baseline.counts) 
-    
-    
-    
-    write.table(info, file=paste0( sprefix,"_threshold_filter.tsv"), col.names=T, row.names=F, sep="\t", quote=F)
-    
-    explain <- data.frame(qcmetric=c("pct_counts_mt_max","pct_counts_hb_max","n_genes_by_counts_min","doublet_scores_max","n_genes_by_counts_max"),
-                          f1=c(20,70,100,0.25,NA),
-                          f2=c(10,50,100,0.25,NA),
-                          f3=c(5,50,100,NA,3000)) 
-    
-    write.table(explain, file=paste0(sprefix,"_threshold_filter_explained.tsv"), col.names=T, row.names=F, sep="\t", quote=F)
-    
-    lab <- c("%Mt <=10 & %HB<70 & minGenes>=100 & scrublet<=0.25",
-             "%Mt <=5 & %HB<50 & minGenes>=100 & scrublet<=0.25",
-             "%Mt <=5 & %HB<50 & minGenes>=100 & maxGenes<=3000")
-    names(lab) <- c("percent_retain_f1","percent_retain_f2","percent_retain_f3") 
-    
-    g <- info %>% 
-      pivot_longer(cols=starts_with("percent"), names_to="filterclass", values_to="percent")%>%
-      ggplot(aes(sample_id,percent, fill=filterclass)) +
-      geom_bar(stat="identity", position="dodge", color="black") +
-      facet_wrap(~filterclass, ncol=1, labeller = labeller(filterclass=lab)) +
-      theme_bw()+
-      theme(axis.text.x=element_text(size=8,angle=45, hjust=1, vjust=1),
-            axis.text.y=element_text(size=13)) +
-      scale_fill_manual(values = c("red", "yellow","blue"), 
-                        breaks=c("percent_retain_f1","percent_retain_f2","percent_retain_f3"),
-                        limits=c("percent_retain_f1","percent_retain_f2","percent_retain_f3")) + 
-      coord_cartesian(ylim=c(0,100)) 
-    
-    ggsave(g, file = paste0(run,"barplot_cellcounts_thresholds_filter.png"), type="cairo", width=9, height=9)
+if (opt$prefilter) {
+  cat("Saving counts tables for references\n")
+  if (all(c("pct_counts_mt", "pct_counts_hb", "n_genes_by_counts", "doublet_scores") %in% colnames(rna_data_plot))) {
+
+    f1 <- rna_data_plot %>%
+      dplyr::filter(pct_counts_mt <= 20 & pct_counts_hb <= 70 & n_genes_by_counts >= 100 & doublet_scores <= 0.25) %>%
+      group_by_at(.vars = c(rna_source_facet)) %>%
+      summarise(cell.count = n()) %>%
+      group_by_at(.vars = "sample_id") %>%
+      rename(cell.count_f1 = cell.count)
+
+    f2 <- rna_data_plot %>%
+      dplyr::filter(pct_counts_mt <= 10 & pct_counts_hb <= 50 & n_genes_by_counts >= 100 & doublet_scores <= 0.25) %>%
+      group_by_at(.vars = c(rna_source_facet)) %>%
+      summarise(cell.count = n()) %>%
+      group_by_at(.vars = "sample_id") %>%
+      rename(cell.count_f2 = cell.count)
+
+    f3 <- rna_data_plot %>%
+      dplyr::filter(pct_counts_mt <= 5 & pct_counts_hb <= 50 & n_genes_by_counts >= 100 & n_genes_by_counts <= 3000) %>%
+      group_by_at(.vars = c(rna_source_facet)) %>%
+      summarise(cell.count = n()) %>%
+      group_by_at(.vars = "sample_id") %>%
+      rename(cell.count_f3 = cell.count)
+
+    baseline <- rna_data_plot %>%
+      group_by_at(.vars = c(rna_source_facet)) %>%
+      summarise(cell.count = n()) %>%
+      group_by_at(.vars = "sample_id") %>%
+      rename(baseline.counts = cell.count)
+
+    info <- merge(merge(merge(f1, f2, by = rna_source_facet, all = TRUE), f3, by = rna_source_facet, all = TRUE), baseline, by = rna_source_facet, all = T) %>%
+      mutate(
+        percent_retain_f1 = 100 * cell.count_f1 / baseline.counts,
+        percent_retain_f2 = 100 * cell.count_f2 / baseline.counts,
+        percent_retain_f3 = 100 * cell.count_f3 / baseline.counts
+      )
+
+
+
+    write.table(info, file = paste0(sprefix, "_threshold_filter.tsv"), col.names = T, row.names = F, sep = "\t", quote = F)
+
+    explain <- data.frame(
+      qcmetric = c("pct_counts_mt_max", "pct_counts_hb_max", "n_genes_by_counts_min", "doublet_scores_max", "n_genes_by_counts_max"),
+      f1 = c(20, 70, 100, 0.25, NA),
+      f2 = c(10, 50, 100, 0.25, NA),
+      f3 = c(5, 50, 100, NA, 3000)
+    )
+
+    write.table(explain, file = paste0(sprefix, "_threshold_filter_explained.tsv"), col.names = T, row.names = F, sep = "\t", quote = F)
+
+    lab <- c(
+      "%Mt <=10 & %HB<70 & minGenes>=100 & scrublet<=0.25",
+      "%Mt <=5 & %HB<50 & minGenes>=100 & scrublet<=0.25",
+      "%Mt <=5 & %HB<50 & minGenes>=100 & maxGenes<=3000"
+    )
+    names(lab) <- c("percent_retain_f1", "percent_retain_f2", "percent_retain_f3")
+
+    g <- info %>%
+      pivot_longer(cols = starts_with("percent"), names_to = "filterclass", values_to = "percent") %>%
+      ggplot(aes(sample_id, percent, fill = filterclass)) +
+      geom_bar(stat = "identity", position = "dodge", color = "black") +
+      facet_wrap(~filterclass, ncol = 1, labeller = labeller(filterclass = lab)) +
+      theme_bw() +
+      theme(
+        axis.text.x = element_text(size = 8, angle = 45, hjust = 1, vjust = 1),
+        axis.text.y = element_text(size = 13)
+      ) +
+      scale_fill_manual(
+        values = c("red", "yellow", "blue"),
+        breaks = c("percent_retain_f1", "percent_retain_f2", "percent_retain_f3"),
+        limits = c("percent_retain_f1", "percent_retain_f2", "percent_retain_f3")
+      ) +
+      coord_cartesian(ylim = c(0, 100))
+
+    ggsave(g, file = paste0(run, "barplot_cellcounts_thresholds_filter.png"), type = "cairo", width = 9, height = 9)
   }
-  
-}else{
-  print("Producing files with final counts for cells after filtering")
-  
-  baseline <- rna_data_plot %>% 
-    group_by_at(.vars=c(rna_source_facet)) %>% 
-    summarise(cell.count= n()) %>% 
-    group_by_at(.vars="sample_id") 
-  
-  write.table(baseline, file=paste0( sprefix,"_filtered_data.tsv"), col.names=T, row.names=F, sep="\t", quote=F)
-  
-  g <- baseline %>% 
-    ggplot(aes_string(x="sample_id", y="cell.count")) +
-    geom_bar(stat="identity", position="dodge", color="black") +
-    theme_bw()+
-    theme(axis.text.x=element_text(size=8,angle=45, hjust=1.05, vjust=0.95),
-          axis.text.y=element_text(size=13)) 
-  ggsave(g, file = file.path(run, "rna","barplot_cellcounts_filtered_data.png"), type="cairo", width=9, height=6)
-  
+} else {
+  cat("Producing files with final counts for cells after filtering\n")
+
+  baseline <- rna_data_plot %>%
+    group_by_at(.vars = c(rna_source_facet)) %>%
+    summarise(cell.count = n()) %>%
+    group_by_at(.vars = "sample_id")
+
+  write.table(baseline, file = paste0(sprefix, "_filtered_data.tsv"), col.names = T, row.names = F, sep = "\t", quote = F)
+
+  g <- baseline %>%
+    ggplot(aes_string(x = "sample_id", y = "cell.count")) +
+    geom_bar(stat = "identity", position = "dodge", color = "black") +
+    theme_bw() +
+    theme(
+      axis.text.x = element_text(size = 8, angle = 45, hjust = 1.05, vjust = 0.95),
+      axis.text.y = element_text(size = 13)
+    )
+  ggsave(g, file = file.path(run, "rna", "barplot_cellcounts_filtered_data.png"), type = "cairo", width = 9, height = 6)
 }
 
 
-print("Done")
+cat("Done\n")

From b76bfa978884201d6e99b5365a522cb276e94d3e Mon Sep 17 00:00:00 2001
From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com>
Date: Tue, 4 Feb 2025 21:32:10 +0100
Subject: [PATCH 09/10] add some files to gitignore

---
 .gitignore | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index ca736369..a7b75651 100644
--- a/.gitignore
+++ b/.gitignore
@@ -165,6 +165,8 @@ cython_debug/
 
 # panpipes specific
 .history
-panpipes/.DS_Store
 .DS_Store
 *.code-workspace
+*/.DS_Store
+1_ingest/*
+data/*

From a148e99cd02806c1dc74a9b1abe3ec43e77ba645 Mon Sep 17 00:00:00 2001
From: Vadim Nazarov <2979713+vadimnazarov@users.noreply.github.com>
Date: Thu, 6 Mar 2025 14:40:19 +0100
Subject: [PATCH 10/10] fix: fix an error in plotQC.R

---
 panpipes/R_scripts/plotQC.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/panpipes/R_scripts/plotQC.R b/panpipes/R_scripts/plotQC.R
index 32a8ce9b..3f7427b1 100644
--- a/panpipes/R_scripts/plotQC.R
+++ b/panpipes/R_scripts/plotQC.R
@@ -414,7 +414,7 @@ if (!is.null(opt$rep_qc_metrics)) {
   # check these qc metrics are in the file
   qcmetrics <- qcmetrics[qcmetrics %in% colnames(rep_data_plot)]
   for (qc in qcmetrics) {
-    cat("Plotting bar plots of", qc. "\n")
+    cat("Plotting bar plots of", qc, "\n")
     for (sc in rep_source_facet) {
       g <- do_bar_plot(rep_data_plot, qc, sc)
       if (uniq_sample_id > 50) {
@@ -501,7 +501,6 @@ sprefix <- opt$sampleprefix
 if (opt$prefilter) {
   cat("Saving counts tables for references\n")
   if (all(c("pct_counts_mt", "pct_counts_hb", "n_genes_by_counts", "doublet_scores") %in% colnames(rna_data_plot))) {
-
     f1 <- rna_data_plot %>%
       dplyr::filter(pct_counts_mt <= 20 & pct_counts_hb <= 70 & n_genes_by_counts >= 100 & doublet_scores <= 0.25) %>%
       group_by_at(.vars = c(rna_source_facet)) %>%