Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18,993 changes: 18,993 additions & 0 deletions app/data/gmt/GO biological process/GO:BP_2025_background

Large diffs are not rendered by default.

19,870 changes: 19,870 additions & 0 deletions app/data/gmt/GO cellular component/GO:CC_2025_background

Large diffs are not rendered by default.

17,923 changes: 17,923 additions & 0 deletions app/data/gmt/GO molecular function/GO:MF_2025_background

Large diffs are not rendered by default.

11,881 changes: 11,881 additions & 0 deletions app/data/gmt/Reactome/ReactomePathways_2025_background

Large diffs are not rendered by default.

4,919 changes: 4,919 additions & 0 deletions app/data/gmt/Target class/ChEMBL Target Class_background

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from app.routers.pathways import router as pathways_router
from app.routers.umap_router import router
from app.routers import gsea
from app.scripts.prepare_gene_lists import generate_all_library_gene_lists

import logging

Expand Down Expand Up @@ -46,6 +47,18 @@
# Mount static files for the React app
app.mount("/assets", StaticFiles(directory="./ui/dist/assets"), name="assets")

# Prepare per-library gene lists from GMTs on startup (idempotent and fast if up-to-date)
@app.on_event("startup")
async def prepare_gene_lists_startup() -> None:
    """Regenerate per-library background gene lists when the app boots.

    The generator is idempotent and mtime-guarded, so this is a cheap no-op
    when everything is already current.  Any failure is logged and swallowed
    so that it never blocks application startup.
    """
    try:
        refreshed = generate_all_library_gene_lists()
        if refreshed:
            joined = ", ".join(str(p) for p in refreshed)
            logger.info("Prepared gene lists for libraries: %s", joined)
        else:
            logger.info("Gene lists already up-to-date; no changes.")
    except Exception as exc:  # noqa: BLE001
        logger.exception("Failed to prepare gene lists on startup: %s", exc)

@app.get("/")
async def root():
return {"message": f"Welcome to {config.APP_NAME}"}
Expand Down
5 changes: 5 additions & 0 deletions app/routers/gsea.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import tempfile
import pandas as pd
import os
import numpy as np

router = APIRouter()

Expand Down Expand Up @@ -43,4 +44,8 @@ async def gsea_endpoint(
# Clean up temp file
os.unlink(tsv_path)

# Replace NaN/Inf with JSON-safe values
res_df = res_df.replace([np.inf, -np.inf], None)
res_df = res_df.where(pd.notna(res_df), None)

return res_df.to_dict(orient="records")
2 changes: 2 additions & 0 deletions app/scripts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Makes `app.scripts` a regular package

80 changes: 80 additions & 0 deletions app/scripts/prepare_gene_lists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from pathlib import Path
import logging


BASE_DIR = Path(__file__).resolve().parents[1] # app/
DATA_DIR = BASE_DIR / "data"
GMT_DIR = DATA_DIR / "gmt"
MIN_GENE_COL_IDX = 2
LOGGER = logging.getLogger(__name__)


def _collect_genes_from_gmt_file(gmt_path: Path) -> set[str]:
genes: set[str] = set()
with gmt_path.open("r") as f:
for line in f:
parts = line.rstrip("\n").split("\t")
if len(parts) > MIN_GENE_COL_IDX:
# All tokens after the second column are gene symbols
for token in parts[MIN_GENE_COL_IDX:]:
stripped = token.strip()
if stripped:
genes.add(stripped)
return genes

def generate_background_for_gmt(gmt_path: Path, suffix: str = "_background") -> Path | None:
"""
Generate a deduplicated, sorted list of all genes present in the given .gmt
file and write them to an output file named `<input_stem>{suffix}` in the
same directory (no header). For example, `ReactomePathways_2025.gmt` ->
`ReactomePathways_2025_background`.

Skips work if the output exists and is newer than the .gmt file.
Returns the output path if written/updated, or None if skipped.
"""
if not gmt_path.exists() or gmt_path.suffix.lower() != ".gmt":
return None

out_path = gmt_path.with_name(f"{gmt_path.stem}{suffix}")

# Skip if up-to-date
if out_path.exists() and out_path.stat().st_mtime >= gmt_path.stat().st_mtime:
return None

genes = _collect_genes_from_gmt_file(gmt_path)

# Write without header, one gene per line, sorted for determinism
with out_path.open("w") as f:
for gene in sorted(genes):
f.write(f"{gene}\n")

return out_path


def generate_all_library_gene_lists(suffix: str = "_background") -> list[Path]:
    """
    Generate a background gene list for every .gmt file found one level
    below app/data/gmt (one subdirectory per library).

    Returns only the paths that were actually written or refreshed;
    libraries whose backgrounds were already up-to-date are omitted.
    """
    if not GMT_DIR.exists():
        return []

    library_dirs = sorted(child for child in GMT_DIR.iterdir() if child.is_dir())
    return [
        result
        for library_dir in library_dirs
        for gmt_file in sorted(library_dir.glob("*.gmt"))
        if (result := generate_background_for_gmt(gmt_file, suffix=suffix)) is not None
    ]


if __name__ == "__main__":
    # CLI entry point: regenerate every library background list and report
    # what changed (or that nothing needed regenerating).
    logging.basicConfig(level=logging.INFO)
    refreshed = generate_all_library_gene_lists()
    if not refreshed:
        LOGGER.info("All background gene lists are up-to-date.")
    else:
        LOGGER.info("Generated/updated gene lists:")
        for path in refreshed:
            LOGGER.info("%s", path)
121 changes: 102 additions & 19 deletions app/services/gsea.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,42 @@
import os
import pandas as pd
import blitzgsea as blitz
from pathlib import Path
import numpy as np
import blitzgsea as blitz
import gcsfs
import pandas as pd

BASE_DIR = Path(__file__).resolve().parents[1] # app/
DATA_DIR = BASE_DIR / "data"
GMT_DIR = DATA_DIR / "gmt"
MIN_GENE_COL_IDX = 2

def get_approved_symbols_from_gcs(release: str = "25.09"):
    """
    Read the `approvedSymbol` column from the Open Targets target parquet
    dataset in GCS and return the unique symbols as a set of strings.

    Parameters
    ----------
    release:
        Open Targets data release to read; defaults to "25.09" (the value
        previously hard-coded), so existing callers are unaffected.  Used to
        build the path `open-targets-pre-data-releases/<release>/output/target/`.

    Notes
    -----
    Requires network access and GCS credentials resolvable by gcsfs.
    """
    # Initialize the GCS filesystem (anonymous/default credentials as
    # configured in the environment).
    fs = gcsfs.GCSFileSystem()

    # GCS path to the target directory for the requested release.
    gcs_path = f"open-targets-pre-data-releases/{release}/output/target/"

    # Reading the whole directory as one dataset, restricted to the single
    # column we need, is much more efficient than downloading individual
    # files and parsing them one by one.
    df = pd.read_parquet(gcs_path, filesystem=fs, columns=["approvedSymbol"])

    # NaN entries carry no symbol; drop them before building the set.
    return set(df["approvedSymbol"].dropna().astype(str))

def load_custom_gmt(path):
    """
    Parse a .gmt file into a mapping of term name -> list of gene symbols.

    Each line is tab-separated: term name, description, then gene symbols.
    Lines with fewer than three columns (i.e. no genes) are skipped.
    """
    # NOTE(review): the diff for this function interleaved the old and new
    # versions; this is the reconstructed post-change implementation.
    p = Path(path)
    with p.open("r") as f:
        return {
            parts[0]: parts[MIN_GENE_COL_IDX:]
            for line in f
            if (parts := line.strip().split("\t")) and len(parts) > MIN_GENE_COL_IDX
        }

def available_gmt_files():
Expand All @@ -34,7 +58,7 @@ def available_gmt_files():
hierarchy_file = txt_files[0] if txt_files else None
libraries[f"{folder.name}/{gmt_file.stem}"] = {
"gmt": gmt_file,
"hierarchy": hierarchy_file
"hierarchy": hierarchy_file,
}
return libraries

Expand All @@ -44,11 +68,12 @@ def run_gsea(input_tsv=None, gmt_name=None, processes=4):
Pathway size is computed from the GMT (total genes in the pathway).
Ensure 'Number of input genes' and 'Pathway size' are integers (no .0).
"""
input_tsv = Path(input_tsv) if input_tsv else DEFAULT_TEST_INPUT
input_tsv = Path(input_tsv)

gmt_files = available_gmt_files()
if not gmt_name or gmt_name not in gmt_files:
raise ValueError(f"Invalid gmt_name. Choose from: {list(gmt_files.keys())}")
msg = "Invalid gmt_name. Choose from: " + str(list(gmt_files.keys()))
raise ValueError(msg)

gmt_file = gmt_files[gmt_name]["gmt"]
hierarchy_file = gmt_files[gmt_name]["hierarchy"]
Expand All @@ -58,20 +83,20 @@ def run_gsea(input_tsv=None, gmt_name=None, processes=4):

# --- Check if GMT file contains IDs in braces {ID} ---
contains_braces = False
with open(gmt_file, 'r') as f:
with gmt_file.open("r") as f:
for line in f:
if "{" in line and "}" in line:
contains_braces = True
break

# Build ID -> genes mapping from the GMT file
id_to_genes = {}
with open(gmt_file, 'r') as f:
with gmt_file.open("r") as f:
for line in f:
parts = line.rstrip("\n").split("\t")
if len(parts) > 2:
if len(parts) > MIN_GENE_COL_IDX:
term = parts[0]
genes = parts[2:]
genes = parts[MIN_GENE_COL_IDX:]
if contains_braces and "{" in term and "}" in term:
start = term.find("{") + 1
end = term.find("}", start)
Expand All @@ -85,17 +110,56 @@ def run_gsea(input_tsv=None, gmt_name=None, processes=4):
# --- Load input file safely ---
df = pd.read_csv(input_tsv, sep="\t")

# Allow unnamed columns (0,1) and rename to expected headers
if set(df.columns) == set(range(len(df.columns))):
df = df.rename(columns={0: "symbol", 1: "globalScore"})

if not {"symbol", "globalScore"}.issubset(df.columns):
raise ValueError("Input file must contain 'symbol' and 'globalScore' columns.")
msg = "Input file must contain 'symbol' and 'globalScore' columns."
raise ValueError(msg)

# Keep only required columns
df = df[["symbol", "globalScore"]].copy()

# --- Merge background genes for the selected library (no duplicates) ---
# Prefer pre-generated background file next to the GMT; fallback to union from GMT
background_path = gmt_file.with_name(f"{gmt_file.stem}_background")
if background_path.exists():
with background_path.open("r") as f:
background_genes = {line.strip() for line in f if line.strip()}
else:
# Fallback: union of all genes from the GMT mapping
background_genes = set()
for genes in library_sets.values():
background_genes.update(g for g in genes if g)

existing_genes = set(df["symbol"].astype(str))
missing_genes = sorted(background_genes - existing_genes)
if missing_genes:
background_df = pd.DataFrame({
"symbol": missing_genes,
"globalScore": 0,
})
df = pd.concat([df, background_df], ignore_index=True)

# --- Filter genes to only include those in Open Targets approved symbols ---
approved_symbols = get_approved_symbols_from_gcs()
df = df[df["symbol"].astype(str).isin(approved_symbols)].copy()

# # After gene filtering
# print(f"Genes after filtering: {len(df)}")
# print(f"Sample of filtered genes: {df['symbol'].head()}")

# Sort by score desc and drop duplicate symbols keeping highest score (originals win over zeros)
df = df.sort_values("globalScore", ascending=False)
df = df.drop_duplicates(subset=["symbol"], keep="first")

res_df = blitz.gsea(df, library_sets, processes=processes).reset_index(names="Term")

# # After GSEA calculation
# print(f"GSEA results shape: {res_df.shape}")
# print(f"Columns with NaN: {res_df.isnull().sum()}")

# --- Extract IDs and clean terms ---
if contains_braces:
term_series = res_df["Term"]
Expand Down Expand Up @@ -138,7 +202,7 @@ def run_gsea(input_tsv=None, gmt_name=None, processes=4):
if hierarchy_file and hierarchy_file.exists():
hierarchy_df = pd.read_csv(
hierarchy_file, sep="\t", header=None,
names=["Parent pathway", "Child pathway"]
names=["Parent pathway", "Child pathway"],
)
res_df = res_df.merge(
hierarchy_df, left_on="ID", right_on="Child pathway", how="left"
Expand All @@ -147,9 +211,9 @@ def run_gsea(input_tsv=None, gmt_name=None, processes=4):
res_df.groupby(
[
"ID", "Link", "Pathway", "ES", "NES", "FDR", "p-value",
"Sidak's p-value", "Number of input genes", "Leading edge genes", "Pathway size"
"Sidak's p-value", "Number of input genes", "Leading edge genes", "Pathway size",
],
dropna=False
dropna=False,
)["Parent pathway"]
.apply(lambda x: ",".join(sorted(set(x.dropna()))))
.reset_index()
Expand All @@ -164,11 +228,30 @@ def safe_int_col(df_, col_name):
"""
if col_name in df_.columns:
s = df_[col_name].astype(str).str.replace(",", "", regex=False).str.strip()
s = s.replace({'': None, 'nan': None})
df_[col_name] = pd.to_numeric(s, errors='coerce').fillna(0).astype(int)
s = s.replace({"": None, "nan": None})
df_[col_name] = pd.to_numeric(s, errors="coerce").fillna(0).astype(int)

safe_int_col(res_df, "Number of input genes")
safe_int_col(res_df, "Pathway size")

# Handle NaN values for JSON serialization
res_df = res_df.replace([np.inf, -np.inf], np.nan)
res_df = res_df.fillna({
'ES': 0.0,
'NES': 0.0,
'FDR': 1.0,
'p-value': 1.0,
"Sidak's p-value": 1.0,
'Number of input genes': 0,
'Pathway size': 0
})

# Ensure string columns are properly handled
string_columns = ['Leading edge genes', 'Parent pathway']
for col in string_columns:
if col in res_df.columns:
res_df[col] = res_df[col].astype(str).replace('nan', '')


return res_df

5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ dependencies = [
"pandas>=2.2.3",
"umap>=0.1.1",
"hdbscan>=0.8.40",
"numpy>=1.26.4"
"numpy>=1.26.4",
"google-cloud-storage>=3.4.1",
"pyarrow>=22.0.0",
"gcsfs>=2025.9.0",
]

[build-system]
Expand Down
3 changes: 2 additions & 1 deletion ui/src/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@ import Pathways from "./pages/Pathways";
import { theme } from "./theme";

function App() {
const basename = import.meta.env.DEV ? "/" : "/ui";
return (
<ApolloProvider client={client}>
<ThemeProvider theme={theme}>
<CssBaseline />
<Router basename="/ui">
<Router basename={basename}>
<div
style={{
minHeight: "100vh",
Expand Down
2 changes: 1 addition & 1 deletion ui/src/lib/api.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
const API_BASE_URL = 'http://localhost:8000';
const API_BASE_URL = import.meta.env.DEV ? '' : 'http://localhost:8000';

export type Pathway = Record<string, any>;

Expand Down
8 changes: 1 addition & 7 deletions ui/src/utils/colorPalettes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,7 @@
* Used in flame graphs and other visualizations that need to show prioritization
*/
export const PRIORITISATION_COLORS = [
"#a01813", // -1 (red)
"#bc3a19",
"#d65a1f",
"#e08145",
"#e3a772",
"#e6ca9c",
"#eceada", // 0 (neutral)
"#eceada", // 0 (neutral beige)
"#c5d2c1",
"#9ebaa8",
"#78a290",
Expand Down
8 changes: 8 additions & 0 deletions ui/vite.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,12 @@ import react from '@vitejs/plugin-react'
// https://vite.dev/config/
export default defineConfig({
  plugins: [react()],
  server: {
    // Pin the dev-server port so the proxy targets below stay predictable.
    port: 5173,
    // Forward backend routes to the FastAPI server during development so
    // the UI can use relative URLs (no CORS, no hard-coded host).
    proxy: {
      '/api': 'http://localhost:8000',
      '/pathways': 'http://localhost:8000',
      '/health': 'http://localhost:8000',
    },
  },
})
Loading