Skip to content
Merged
18,993 changes: 18,993 additions & 0 deletions app/data/gmt/GO biological process/GO:BP_2025_background

Large diffs are not rendered by default.

19,870 changes: 19,870 additions & 0 deletions app/data/gmt/GO cellular component/GO:CC_2025_background

Large diffs are not rendered by default.

17,923 changes: 17,923 additions & 0 deletions app/data/gmt/GO molecular function/GO:MF_2025_background

Large diffs are not rendered by default.

11,881 changes: 11,881 additions & 0 deletions app/data/gmt/Reactome/ReactomePathways_2025_background

Large diffs are not rendered by default.

4,919 changes: 4,919 additions & 0 deletions app/data/gmt/Target class/ChEMBL Target Class_background

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,15 @@
from app.routers.pathways import router as pathways_router
from app.routers.umap_router import router
from app.routers import gsea
from app.scripts.prepare_gene_lists import generate_all_library_gene_lists

import logging

# Configure logging
# Root-logger setup for the whole app: INFO level, with a
# timestamp / logger-name / level prefix on every record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)  # module-level logger for this entry point
config = get_config()  # application settings (APP_NAME, etc.) loaded once at import time

Expand Down Expand Up @@ -46,6 +52,18 @@
# Mount static files for the React app
app.mount("/assets", StaticFiles(directory="./ui/dist/assets"), name="assets")

# Prepare per-library gene lists from GMTs on startup (idempotent and fast if up-to-date)
@app.on_event("startup")
async def prepare_gene_lists_startup() -> None:
    """Regenerate background gene lists for every GMT library at boot.

    Safe to run on every startup: generation skips libraries whose output
    is already newer than the source .gmt. Any failure is logged but never
    blocks application startup.

    NOTE(review): ``@app.on_event`` is deprecated in recent FastAPI in
    favor of lifespan handlers — worth migrating when touching app setup.
    """
    try:
        refreshed = generate_all_library_gene_lists()
        if not refreshed:
            logger.info("Gene lists already up-to-date; no changes.")
        else:
            names = ", ".join(str(path) for path in refreshed)
            logger.info("Prepared gene lists for libraries: %s", names)
    except Exception as exc:  # noqa: BLE001 — startup must not fail on this
        logger.exception("Failed to prepare gene lists on startup: %s", exc)

@app.get("/")
async def root():
    """Landing endpoint: returns a JSON welcome message for the service."""
    greeting = f"Welcome to {config.APP_NAME}"
    return {"message": greeting}
Expand Down
25 changes: 22 additions & 3 deletions app/routers/gsea.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from fastapi import APIRouter, UploadFile, File, Query, HTTPException
from typing import Literal
from app.services.gsea import run_gsea, available_gmt_files
import tempfile
import pandas as pd
import os
import numpy as np

router = APIRouter()

Expand All @@ -15,7 +17,11 @@ async def list_gmt_files():
@router.post("/gsea")
async def gsea_endpoint(
tsv_file: UploadFile = File(..., description="TSV file containing at least 2 columns: 'symbol' and 'globalScore'"),
gmt_name: str = Query(..., description="GMT library name (without .gmt extension)")
gmt_name: str = Query(..., description="GMT library name (without .gmt extension)"),
analysis_direction: Literal["one_sided_positive", "one_sided_negative", "two_sided"] = Query(
default="one_sided_positive",
description="Analysis direction: 'one_sided_positive' filters NES > 0, 'one_sided_negative' filters NES < 0, 'two_sided' returns all results"
)
):
# Validate file extension
if not tsv_file.filename.endswith(".tsv"):
Expand All @@ -38,9 +44,22 @@ async def gsea_endpoint(

# Run GSEA
try:
res_df = run_gsea(input_tsv=tsv_path, gmt_name=gmt_name)
res_df, missing_stats = run_gsea(input_tsv=tsv_path, gmt_name=gmt_name)
finally:
# Clean up temp file
os.unlink(tsv_path)

return res_df.to_dict(orient="records")
# Filter by NES based on analysis direction
if analysis_direction == "one_sided_positive":
res_df = res_df[res_df["NES"] > 0].copy()
elif analysis_direction == "one_sided_negative":
res_df = res_df[res_df["NES"] < 0].copy()

# Replace NaN/Inf with JSON-safe values
res_df = res_df.replace([np.inf, -np.inf], None)
res_df = res_df.where(pd.notna(res_df), None)

return {
"results": res_df.to_dict(orient="records"),
"input_overlap": missing_stats,
}
17 changes: 12 additions & 5 deletions app/routers/pathways.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from typing import Any, Optional
from fastapi import APIRouter, HTTPException, Query, UploadFile, File, Form
import pandas as pd
from typing import Any
from fastapi import APIRouter, HTTPException, UploadFile, File, Form
from pathlib import Path

from app.services.gsea import available_gmt_files, run_gsea
Expand Down Expand Up @@ -29,13 +28,17 @@ async def run_gsea_endpoint(
"""
Run GSEA on an uploaded input file using a chosen GMT library.
"""
tmp_path = Path(f"/tmp/{file.filename}")
try:
# Save uploaded file to a temporary path
tmp_path = Path(f"/tmp/{file.filename}")
with tmp_path.open("wb") as f:
f.write(await file.read())

res_df = run_gsea(input_tsv=tmp_path, gmt_name=gmt_name, processes=processes)
res_df, _missing_stats = run_gsea(
input_tsv=tmp_path,
gmt_name=gmt_name,
processes=processes,
)

# Return as JSON records
return res_df.to_dict(orient="records")
Expand All @@ -44,3 +47,7 @@ async def run_gsea_endpoint(
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
finally:
# Clean up temp file
if tmp_path.exists():
tmp_path.unlink()
2 changes: 2 additions & 0 deletions app/scripts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Makes `app.scripts` a regular package

80 changes: 80 additions & 0 deletions app/scripts/prepare_gene_lists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from pathlib import Path
import logging


BASE_DIR = Path(__file__).resolve().parents[1]  # app/
DATA_DIR = BASE_DIR / "data"
GMT_DIR = DATA_DIR / "gmt"  # root directory: one subdirectory per GMT library
# GMT format: columns 0-1 are the set name/description; genes start at column 2.
MIN_GENE_COL_IDX = 2
LOGGER = logging.getLogger(__name__)


def _collect_genes_from_gmt_file(gmt_path: Path) -> set[str]:
    """Return the set of unique, non-empty gene symbols found in a .gmt file.

    Each GMT line is tab-separated: columns 0 and 1 hold the gene-set name
    and description; every token from column ``MIN_GENE_COL_IDX`` onward is
    treated as a gene symbol. Blank/whitespace-only tokens are discarded.
    """
    with gmt_path.open("r") as handle:
        # Slicing past the end of a short line yields an empty list, so
        # lines with no gene columns contribute nothing.
        return {
            token.strip()
            for line in handle
            for token in line.rstrip("\n").split("\t")[MIN_GENE_COL_IDX:]
            if token.strip()
        }

def generate_background_for_gmt(gmt_path: Path, suffix: str = "_background") -> Path | None:
    """
    Generate a deduplicated, sorted list of all genes present in the given .gmt
    file and write them to an output file named `<input_stem>{suffix}` in the
    same directory (no header). For example, `ReactomePathways_2025.gmt` ->
    `ReactomePathways_2025_background`.

    Skips work if the output exists and is newer than the .gmt file.
    Returns the output path if written/updated, or None if skipped
    (also None when `gmt_path` is missing or is not a .gmt file).
    """
    if not gmt_path.exists() or gmt_path.suffix.lower() != ".gmt":
        return None

    out_path = gmt_path.with_name(f"{gmt_path.stem}{suffix}")

    # Skip if up-to-date (output at least as new as the source .gmt)
    if out_path.exists() and out_path.stat().st_mtime >= gmt_path.stat().st_mtime:
        return None

    genes = _collect_genes_from_gmt_file(gmt_path)

    # Write atomically: a crash mid-write must not leave a partial output
    # file that is newer than the .gmt — the staleness check above would
    # then skip regeneration forever. Write to a sibling temp file and
    # rename it into place (atomic on POSIX).
    tmp_path = out_path.with_name(out_path.name + ".tmp")
    try:
        with tmp_path.open("w") as f:
            # One gene per line, no header, sorted for determinism.
            f.writelines(f"{gene}\n" for gene in sorted(genes))
        tmp_path.replace(out_path)
    finally:
        # No-op on success (temp file was renamed away); cleans up on failure.
        tmp_path.unlink(missing_ok=True)

    return out_path


def generate_all_library_gene_lists(suffix: str = "_background") -> list[Path]:
    """
    For each .gmt file under each subdirectory in app/data/gmt, generate a
    background gene list named `<input_stem>{suffix}` in the same directory.
    Returns the list of paths that were written/updated (up-to-date files
    are skipped and not reported).
    """
    if not GMT_DIR.exists():
        return []

    library_dirs = sorted(d for d in GMT_DIR.iterdir() if d.is_dir())
    candidates = (
        generate_background_for_gmt(gmt_file, suffix=suffix)
        for library_dir in library_dirs
        for gmt_file in sorted(library_dir.glob("*.gmt"))
    )
    # generate_background_for_gmt returns None when nothing was written.
    return [path for path in candidates if path is not None]


if __name__ == "__main__":
    # Standalone entry point: regenerate all background lists and report
    # what (if anything) changed.
    logging.basicConfig(level=logging.INFO)
    refreshed = generate_all_library_gene_lists()
    if not refreshed:
        LOGGER.info("All background gene lists are up-to-date.")
    else:
        LOGGER.info("Generated/updated gene lists:")
        for path in refreshed:
            LOGGER.info("%s", path)
Loading