Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18,993 changes: 18,993 additions & 0 deletions app/data/gmt/GO biological process/GO:BP_2025_background

Large diffs are not rendered by default.

19,870 changes: 19,870 additions & 0 deletions app/data/gmt/GO cellular component/GO:CC_2025_background

Large diffs are not rendered by default.

17,923 changes: 17,923 additions & 0 deletions app/data/gmt/GO molecular function/GO:MF_2025_background

Large diffs are not rendered by default.

11,881 changes: 11,881 additions & 0 deletions app/data/gmt/Reactome/ReactomePathways_2025_background

Large diffs are not rendered by default.

4,919 changes: 4,919 additions & 0 deletions app/data/gmt/Target class/ChEMBL Target Class_background

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from app.routers.pathways import router as pathways_router
from app.routers.umap_router import router
from app.routers import gsea
from app.scripts.prepare_gene_lists import generate_all_library_gene_lists

import logging

Expand Down Expand Up @@ -46,6 +47,18 @@
# Mount static files for the React app
app.mount("/assets", StaticFiles(directory="./ui/dist/assets"), name="assets")

# Prepare per-library gene lists from GMTs on startup (idempotent and fast if up-to-date)
@app.on_event("startup")
async def prepare_gene_lists_startup() -> None:
    """Regenerate per-library background gene lists when the app boots.

    The generator is idempotent and mtime-guarded, so this is a cheap no-op
    when everything is already current.  Any failure is logged and swallowed
    so that it never blocks application startup.
    """
    try:
        refreshed = generate_all_library_gene_lists()
        if refreshed:
            joined = ", ".join(str(p) for p in refreshed)
            logger.info("Prepared gene lists for libraries: %s", joined)
        else:
            logger.info("Gene lists already up-to-date; no changes.")
    except Exception as exc:  # noqa: BLE001
        logger.exception("Failed to prepare gene lists on startup: %s", exc)

@app.get("/")
async def root():
return {"message": f"Welcome to {config.APP_NAME}"}
Expand Down
5 changes: 5 additions & 0 deletions app/routers/gsea.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import tempfile
import pandas as pd
import os
import numpy as np

router = APIRouter()

Expand Down Expand Up @@ -43,4 +44,8 @@ async def gsea_endpoint(
# Clean up temp file
os.unlink(tsv_path)

# Replace NaN/Inf with JSON-safe values
res_df = res_df.replace([np.inf, -np.inf], None)
res_df = res_df.where(pd.notna(res_df), None)

return res_df.to_dict(orient="records")
2 changes: 2 additions & 0 deletions app/scripts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Makes `app.scripts` a regular package

80 changes: 80 additions & 0 deletions app/scripts/prepare_gene_lists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from pathlib import Path
import logging


BASE_DIR = Path(__file__).resolve().parents[1] # app/
DATA_DIR = BASE_DIR / "data"
GMT_DIR = DATA_DIR / "gmt"
MIN_GENE_COL_IDX = 2
LOGGER = logging.getLogger(__name__)


def _collect_genes_from_gmt_file(gmt_path: Path) -> set[str]:
genes: set[str] = set()
with gmt_path.open("r") as f:
for line in f:
parts = line.rstrip("\n").split("\t")
if len(parts) > MIN_GENE_COL_IDX:
# All tokens after the second column are gene symbols
for token in parts[MIN_GENE_COL_IDX:]:
stripped = token.strip()
if stripped:
genes.add(stripped)
return genes

def generate_background_for_gmt(gmt_path: Path, suffix: str = "_background") -> Path | None:
"""
Generate a deduplicated, sorted list of all genes present in the given .gmt
file and write them to an output file named `<input_stem>{suffix}` in the
same directory (no header). For example, `ReactomePathways_2025.gmt` ->
`ReactomePathways_2025_background`.

Skips work if the output exists and is newer than the .gmt file.
Returns the output path if written/updated, or None if skipped.
"""
if not gmt_path.exists() or gmt_path.suffix.lower() != ".gmt":
return None

out_path = gmt_path.with_name(f"{gmt_path.stem}{suffix}")

# Skip if up-to-date
if out_path.exists() and out_path.stat().st_mtime >= gmt_path.stat().st_mtime:
return None

genes = _collect_genes_from_gmt_file(gmt_path)

# Write without header, one gene per line, sorted for determinism
with out_path.open("w") as f:
for gene in sorted(genes):
f.write(f"{gene}\n")

return out_path


def generate_all_library_gene_lists(suffix: str = "_background") -> list[Path]:
    """
    Generate a background gene list for every .gmt file found one level
    below app/data/gmt (one subdirectory per library).

    Returns only the paths that were actually written or refreshed;
    libraries whose backgrounds were already up-to-date are omitted.
    """
    if not GMT_DIR.exists():
        return []

    library_dirs = sorted(child for child in GMT_DIR.iterdir() if child.is_dir())
    return [
        result
        for library_dir in library_dirs
        for gmt_file in sorted(library_dir.glob("*.gmt"))
        if (result := generate_background_for_gmt(gmt_file, suffix=suffix)) is not None
    ]


if __name__ == "__main__":
    # CLI entry point: regenerate every library background list and report
    # what changed (or that nothing needed regenerating).
    logging.basicConfig(level=logging.INFO)
    refreshed = generate_all_library_gene_lists()
    if not refreshed:
        LOGGER.info("All background gene lists are up-to-date.")
    else:
        LOGGER.info("Generated/updated gene lists:")
        for path in refreshed:
            LOGGER.info("%s", path)
121 changes: 102 additions & 19 deletions app/services/gsea.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,42 @@
import os
import pandas as pd
import blitzgsea as blitz
from pathlib import Path
import numpy as np
import blitzgsea as blitz
import gcsfs
import pandas as pd

BASE_DIR = Path(__file__).resolve().parents[1] # app/
DATA_DIR = BASE_DIR / "data"
GMT_DIR = DATA_DIR / "gmt"
MIN_GENE_COL_IDX = 2

def get_approved_symbols_from_gcs(release: str = "25.09"):
    """
    Read the `approvedSymbol` column from the Open Targets target parquet
    dataset in GCS and return the unique symbols as a set of strings.

    Parameters
    ----------
    release:
        Open Targets data release to read; defaults to "25.09" (the value
        previously hard-coded), so existing callers are unaffected.  Used to
        build the path `open-targets-pre-data-releases/<release>/output/target/`.

    Notes
    -----
    Requires network access and GCS credentials resolvable by gcsfs.
    """
    # Initialize the GCS filesystem (anonymous/default credentials as
    # configured in the environment).
    fs = gcsfs.GCSFileSystem()

    # GCS path to the target directory for the requested release.
    gcs_path = f"open-targets-pre-data-releases/{release}/output/target/"

    # Reading the whole directory as one dataset, restricted to the single
    # column we need, is much more efficient than downloading individual
    # files and parsing them one by one.
    df = pd.read_parquet(gcs_path, filesystem=fs, columns=["approvedSymbol"])

    # NaN entries carry no symbol; drop them before building the set.
    return set(df["approvedSymbol"].dropna().astype(str))

def load_custom_gmt(path):
    """
    Parse a .gmt file into a mapping of term name -> list of gene symbols.

    Each line is tab-separated: term name, description, then gene symbols.
    Lines with fewer than three columns (i.e. no genes) are skipped.
    """
    # NOTE(review): the diff for this function interleaved the old and new
    # versions; this is the reconstructed post-change implementation.
    p = Path(path)
    with p.open("r") as f:
        return {
            parts[0]: parts[MIN_GENE_COL_IDX:]
            for line in f
            if (parts := line.strip().split("\t")) and len(parts) > MIN_GENE_COL_IDX
        }

def available_gmt_files():
Expand All @@ -34,7 +58,7 @@ def available_gmt_files():
hierarchy_file = txt_files[0] if txt_files else None
libraries[f"{folder.name}/{gmt_file.stem}"] = {
"gmt": gmt_file,
"hierarchy": hierarchy_file
"hierarchy": hierarchy_file,
}
return libraries

Expand All @@ -44,11 +68,12 @@ def run_gsea(input_tsv=None, gmt_name=None, processes=4):
Pathway size is computed from the GMT (total genes in the pathway).
Ensure 'Number of input genes' and 'Pathway size' are integers (no .0).
"""
input_tsv = Path(input_tsv) if input_tsv else DEFAULT_TEST_INPUT
input_tsv = Path(input_tsv)

gmt_files = available_gmt_files()
if not gmt_name or gmt_name not in gmt_files:
raise ValueError(f"Invalid gmt_name. Choose from: {list(gmt_files.keys())}")
msg = "Invalid gmt_name. Choose from: " + str(list(gmt_files.keys()))
raise ValueError(msg)

gmt_file = gmt_files[gmt_name]["gmt"]
hierarchy_file = gmt_files[gmt_name]["hierarchy"]
Expand All @@ -58,20 +83,20 @@ def run_gsea(input_tsv=None, gmt_name=None, processes=4):

# --- Check if GMT file contains IDs in braces {ID} ---
contains_braces = False
with open(gmt_file, 'r') as f:
with gmt_file.open("r") as f:
for line in f:
if "{" in line and "}" in line:
contains_braces = True
break

# Build ID -> genes mapping from the GMT file
id_to_genes = {}
with open(gmt_file, 'r') as f:
with gmt_file.open("r") as f:
for line in f:
parts = line.rstrip("\n").split("\t")
if len(parts) > 2:
if len(parts) > MIN_GENE_COL_IDX:
term = parts[0]
genes = parts[2:]
genes = parts[MIN_GENE_COL_IDX:]
if contains_braces and "{" in term and "}" in term:
start = term.find("{") + 1
end = term.find("}", start)
Expand All @@ -85,17 +110,56 @@ def run_gsea(input_tsv=None, gmt_name=None, processes=4):
# --- Load input file safely ---
df = pd.read_csv(input_tsv, sep="\t")

# Allow unnamed columns (0,1) and rename to expected headers
if set(df.columns) == set(range(len(df.columns))):
df = df.rename(columns={0: "symbol", 1: "globalScore"})

if not {"symbol", "globalScore"}.issubset(df.columns):
raise ValueError("Input file must contain 'symbol' and 'globalScore' columns.")
msg = "Input file must contain 'symbol' and 'globalScore' columns."
raise ValueError(msg)

# Keep only required columns
df = df[["symbol", "globalScore"]].copy()

# --- Merge background genes for the selected library (no duplicates) ---
# Prefer pre-generated background file next to the GMT; fallback to union from GMT
background_path = gmt_file.with_name(f"{gmt_file.stem}_background")
if background_path.exists():
with background_path.open("r") as f:
background_genes = {line.strip() for line in f if line.strip()}
else:
# Fallback: union of all genes from the GMT mapping
background_genes = set()
for genes in library_sets.values():
background_genes.update(g for g in genes if g)

existing_genes = set(df["symbol"].astype(str))
missing_genes = sorted(background_genes - existing_genes)
if missing_genes:
background_df = pd.DataFrame({
"symbol": missing_genes,
"globalScore": 0,
})
df = pd.concat([df, background_df], ignore_index=True)

# --- Filter genes to only include those in Open Targets approved symbols ---
approved_symbols = get_approved_symbols_from_gcs()
df = df[df["symbol"].astype(str).isin(approved_symbols)].copy()

# # After gene filtering
# print(f"Genes after filtering: {len(df)}")
# print(f"Sample of filtered genes: {df['symbol'].head()}")

# Sort by score desc and drop duplicate symbols keeping highest score (originals win over zeros)
df = df.sort_values("globalScore", ascending=False)
df = df.drop_duplicates(subset=["symbol"], keep="first")

res_df = blitz.gsea(df, library_sets, processes=processes).reset_index(names="Term")

# # After GSEA calculation
# print(f"GSEA results shape: {res_df.shape}")
# print(f"Columns with NaN: {res_df.isnull().sum()}")

# --- Extract IDs and clean terms ---
if contains_braces:
term_series = res_df["Term"]
Expand Down Expand Up @@ -138,7 +202,7 @@ def run_gsea(input_tsv=None, gmt_name=None, processes=4):
if hierarchy_file and hierarchy_file.exists():
hierarchy_df = pd.read_csv(
hierarchy_file, sep="\t", header=None,
names=["Parent pathway", "Child pathway"]
names=["Parent pathway", "Child pathway"],
)
res_df = res_df.merge(
hierarchy_df, left_on="ID", right_on="Child pathway", how="left"
Expand All @@ -147,9 +211,9 @@ def run_gsea(input_tsv=None, gmt_name=None, processes=4):
res_df.groupby(
[
"ID", "Link", "Pathway", "ES", "NES", "FDR", "p-value",
"Sidak's p-value", "Number of input genes", "Leading edge genes", "Pathway size"
"Sidak's p-value", "Number of input genes", "Leading edge genes", "Pathway size",
],
dropna=False
dropna=False,
)["Parent pathway"]
.apply(lambda x: ",".join(sorted(set(x.dropna()))))
.reset_index()
Expand All @@ -164,11 +228,30 @@ def safe_int_col(df_, col_name):
"""
if col_name in df_.columns:
s = df_[col_name].astype(str).str.replace(",", "", regex=False).str.strip()
s = s.replace({'': None, 'nan': None})
df_[col_name] = pd.to_numeric(s, errors='coerce').fillna(0).astype(int)
s = s.replace({"": None, "nan": None})
df_[col_name] = pd.to_numeric(s, errors="coerce").fillna(0).astype(int)

safe_int_col(res_df, "Number of input genes")
safe_int_col(res_df, "Pathway size")

# Handle NaN values for JSON serialization
res_df = res_df.replace([np.inf, -np.inf], np.nan)
res_df = res_df.fillna({
'ES': 0.0,
'NES': 0.0,
'FDR': 1.0,
'p-value': 1.0,
"Sidak's p-value": 1.0,
'Number of input genes': 0,
'Pathway size': 0
})

# Ensure string columns are properly handled
string_columns = ['Leading edge genes', 'Parent pathway']
for col in string_columns:
if col in res_df.columns:
res_df[col] = res_df[col].astype(str).replace('nan', '')


return res_df

5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ dependencies = [
"pandas>=2.2.3",
"umap>=0.1.1",
"hdbscan>=0.8.40",
"numpy>=1.26.4"
"numpy>=1.26.4",
"google-cloud-storage>=3.4.1",
"pyarrow>=22.0.0",
"gcsfs>=2025.9.0",
]

[build-system]
Expand Down
3 changes: 2 additions & 1 deletion ui/src/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@ import Pathways from "./pages/Pathways";
import { theme } from "./theme";

function App() {
const basename = import.meta.env.DEV ? "/" : "/ui";
return (
<ApolloProvider client={client}>
<ThemeProvider theme={theme}>
<CssBaseline />
<Router basename="/ui">
<Router basename={basename}>
<div
style={{
minHeight: "100vh",
Expand Down
2 changes: 1 addition & 1 deletion ui/src/lib/api.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
const API_BASE_URL = 'http://localhost:8000';
const API_BASE_URL = import.meta.env.DEV ? '' : 'http://localhost:8000';

export type Pathway = Record<string, any>;

Expand Down
8 changes: 1 addition & 7 deletions ui/src/utils/colorPalettes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,7 @@
* Used in flame graphs and other visualizations that need to show prioritization
*/
export const PRIORITISATION_COLORS = [
"#a01813", // -1 (red)
"#bc3a19",
"#d65a1f",
"#e08145",
"#e3a772",
"#e6ca9c",
"#eceada", // 0 (neutral)
"#eceada", // 0 (neutral beige)
"#c5d2c1",
"#9ebaa8",
"#78a290",
Expand Down
8 changes: 8 additions & 0 deletions ui/vite.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,12 @@ import react from '@vitejs/plugin-react'
// https://vite.dev/config/
export default defineConfig({
  plugins: [react()],
  server: {
    // Pin the dev-server port so the proxy targets below stay predictable.
    port: 5173,
    // Forward backend routes to the FastAPI server during development so
    // the UI can use relative URLs (no CORS, no hard-coded host).
    proxy: {
      '/api': 'http://localhost:8000',
      '/pathways': 'http://localhost:8000',
      '/health': 'http://localhost:8000',
    },
  },
})
Loading