Skip to content
Merged
18,993 changes: 18,993 additions & 0 deletions app/data/gmt/GO biological process/GO:BP_2025_background

Large diffs are not rendered by default.

19,870 changes: 19,870 additions & 0 deletions app/data/gmt/GO cellular component/GO:CC_2025_background

Large diffs are not rendered by default.

17,923 changes: 17,923 additions & 0 deletions app/data/gmt/GO molecular function/GO:MF_2025_background

Large diffs are not rendered by default.

11,881 changes: 11,881 additions & 0 deletions app/data/gmt/Reactome/ReactomePathways_2025_background

Large diffs are not rendered by default.

4,919 changes: 4,919 additions & 0 deletions app/data/gmt/Target class/ChEMBL Target Class_background

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,15 @@
from app.routers.pathways import router as pathways_router
from app.routers.umap_router import router
from app.routers import gsea
from app.scripts.prepare_gene_lists import generate_all_library_gene_lists

import logging

# Configure logging
# Root-logger setup for the whole app: INFO level, with a
# timestamp / logger-name / level prefix on every record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)  # module-level logger for this entry point
config = get_config()  # application settings (APP_NAME, etc.) loaded once at import time

Expand Down Expand Up @@ -46,6 +52,18 @@
# Mount static files for the React app
app.mount("/assets", StaticFiles(directory="./ui/dist/assets"), name="assets")

# Prepare per-library gene lists from GMTs on startup (idempotent and fast if up-to-date)
@app.on_event("startup")
async def prepare_gene_lists_startup() -> None:
    """Regenerate background gene lists for every GMT library at boot.

    Safe to run on every startup: generation skips libraries whose output
    is already newer than the source .gmt. Any failure is logged but never
    blocks application startup.

    NOTE(review): ``@app.on_event`` is deprecated in recent FastAPI in
    favor of lifespan handlers — worth migrating when touching app setup.
    """
    try:
        refreshed = generate_all_library_gene_lists()
        if not refreshed:
            logger.info("Gene lists already up-to-date; no changes.")
        else:
            names = ", ".join(str(path) for path in refreshed)
            logger.info("Prepared gene lists for libraries: %s", names)
    except Exception as exc:  # noqa: BLE001 — startup must not fail on this
        logger.exception("Failed to prepare gene lists on startup: %s", exc)

@app.get("/")
async def root():
    """Landing endpoint: returns a JSON welcome message for the service."""
    greeting = f"Welcome to {config.APP_NAME}"
    return {"message": greeting}
Expand Down
25 changes: 22 additions & 3 deletions app/routers/gsea.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from fastapi import APIRouter, UploadFile, File, Query, HTTPException
from typing import Literal
from app.services.gsea import run_gsea, available_gmt_files
import tempfile
import pandas as pd
import os
import numpy as np

router = APIRouter()

Expand All @@ -15,7 +17,11 @@ async def list_gmt_files():
@router.post("/gsea")
async def gsea_endpoint(
tsv_file: UploadFile = File(..., description="TSV file containing at least 2 columns: 'symbol' and 'globalScore'"),
gmt_name: str = Query(..., description="GMT library name (without .gmt extension)")
gmt_name: str = Query(..., description="GMT library name (without .gmt extension)"),
analysis_direction: Literal["one_sided_positive", "one_sided_negative", "two_sided"] = Query(
default="one_sided_positive",
description="Analysis direction: 'one_sided_positive' filters NES > 0, 'one_sided_negative' filters NES < 0, 'two_sided' returns all results"
)
):
# Validate file extension
if not tsv_file.filename.endswith(".tsv"):
Expand All @@ -38,9 +44,22 @@ async def gsea_endpoint(

# Run GSEA
try:
res_df = run_gsea(input_tsv=tsv_path, gmt_name=gmt_name)
res_df, missing_stats = run_gsea(input_tsv=tsv_path, gmt_name=gmt_name)
finally:
# Clean up temp file
os.unlink(tsv_path)

return res_df.to_dict(orient="records")
# Filter by NES based on analysis direction
if analysis_direction == "one_sided_positive":
res_df = res_df[res_df["NES"] > 0].copy()
elif analysis_direction == "one_sided_negative":
res_df = res_df[res_df["NES"] < 0].copy()

# Replace NaN/Inf with JSON-safe values
res_df = res_df.replace([np.inf, -np.inf], None)
res_df = res_df.where(pd.notna(res_df), None)

return {
"results": res_df.to_dict(orient="records"),
"input_overlap": missing_stats,
}
17 changes: 12 additions & 5 deletions app/routers/pathways.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from typing import Any, Optional
from fastapi import APIRouter, HTTPException, Query, UploadFile, File, Form
import pandas as pd
from typing import Any
from fastapi import APIRouter, HTTPException, UploadFile, File, Form
from pathlib import Path

from app.services.gsea import available_gmt_files, run_gsea
Expand Down Expand Up @@ -29,13 +28,17 @@ async def run_gsea_endpoint(
"""
Run GSEA on an uploaded input file using a chosen GMT library.
"""
tmp_path = Path(f"/tmp/{file.filename}")
try:
# Save uploaded file to a temporary path
tmp_path = Path(f"/tmp/{file.filename}")
with tmp_path.open("wb") as f:
f.write(await file.read())

res_df = run_gsea(input_tsv=tmp_path, gmt_name=gmt_name, processes=processes)
res_df, _missing_stats = run_gsea(
input_tsv=tmp_path,
gmt_name=gmt_name,
processes=processes,
)

# Return as JSON records
return res_df.to_dict(orient="records")
Expand All @@ -44,3 +47,7 @@ async def run_gsea_endpoint(
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
finally:
# Clean up temp file
if tmp_path.exists():
tmp_path.unlink()
2 changes: 2 additions & 0 deletions app/scripts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Makes `app.scripts` a regular package

80 changes: 80 additions & 0 deletions app/scripts/prepare_gene_lists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from pathlib import Path
import logging


BASE_DIR = Path(__file__).resolve().parents[1]  # app/
DATA_DIR = BASE_DIR / "data"
GMT_DIR = DATA_DIR / "gmt"  # root directory: one subdirectory per GMT library
# GMT format: columns 0-1 are the set name/description; genes start at column 2.
MIN_GENE_COL_IDX = 2
LOGGER = logging.getLogger(__name__)


def _collect_genes_from_gmt_file(gmt_path: Path) -> set[str]:
    """Return the set of unique, non-empty gene symbols found in a .gmt file.

    Each GMT line is tab-separated: columns 0 and 1 hold the gene-set name
    and description; every token from column ``MIN_GENE_COL_IDX`` onward is
    treated as a gene symbol. Blank/whitespace-only tokens are discarded.
    """
    with gmt_path.open("r") as handle:
        # Slicing past the end of a short line yields an empty list, so
        # lines with no gene columns contribute nothing.
        return {
            token.strip()
            for line in handle
            for token in line.rstrip("\n").split("\t")[MIN_GENE_COL_IDX:]
            if token.strip()
        }

def generate_background_for_gmt(gmt_path: Path, suffix: str = "_background") -> Path | None:
    """
    Generate a deduplicated, sorted list of all genes present in the given .gmt
    file and write them to an output file named `<input_stem>{suffix}` in the
    same directory (no header). For example, `ReactomePathways_2025.gmt` ->
    `ReactomePathways_2025_background`.

    Skips work if the output exists and is newer than the .gmt file.
    Returns the output path if written/updated, or None if skipped
    (also None when `gmt_path` is missing or is not a .gmt file).
    """
    if not gmt_path.exists() or gmt_path.suffix.lower() != ".gmt":
        return None

    out_path = gmt_path.with_name(f"{gmt_path.stem}{suffix}")

    # Skip if up-to-date (output at least as new as the source .gmt)
    if out_path.exists() and out_path.stat().st_mtime >= gmt_path.stat().st_mtime:
        return None

    genes = _collect_genes_from_gmt_file(gmt_path)

    # Write atomically: a crash mid-write must not leave a partial output
    # file that is newer than the .gmt — the staleness check above would
    # then skip regeneration forever. Write to a sibling temp file and
    # rename it into place (atomic on POSIX).
    tmp_path = out_path.with_name(out_path.name + ".tmp")
    try:
        with tmp_path.open("w") as f:
            # One gene per line, no header, sorted for determinism.
            f.writelines(f"{gene}\n" for gene in sorted(genes))
        tmp_path.replace(out_path)
    finally:
        # No-op on success (temp file was renamed away); cleans up on failure.
        tmp_path.unlink(missing_ok=True)

    return out_path


def generate_all_library_gene_lists(suffix: str = "_background") -> list[Path]:
    """
    For each .gmt file under each subdirectory in app/data/gmt, generate a
    background gene list named `<input_stem>{suffix}` in the same directory.
    Returns the list of paths that were written/updated (up-to-date files
    are skipped and not reported).
    """
    if not GMT_DIR.exists():
        return []

    library_dirs = sorted(d for d in GMT_DIR.iterdir() if d.is_dir())
    candidates = (
        generate_background_for_gmt(gmt_file, suffix=suffix)
        for library_dir in library_dirs
        for gmt_file in sorted(library_dir.glob("*.gmt"))
    )
    # generate_background_for_gmt returns None when nothing was written.
    return [path for path in candidates if path is not None]


if __name__ == "__main__":
    # Standalone entry point: regenerate all background lists and report
    # what (if anything) changed.
    logging.basicConfig(level=logging.INFO)
    refreshed = generate_all_library_gene_lists()
    if not refreshed:
        LOGGER.info("All background gene lists are up-to-date.")
    else:
        LOGGER.info("Generated/updated gene lists:")
        for path in refreshed:
            LOGGER.info("%s", path)
Loading