diff --git a/Dockerfile b/Dockerfile
index 8253cef..92a16b0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,12 +27,19 @@ COPY app/ ./app/
# Copy built frontend from GitHub Actions build
COPY ui/dist ./ui/dist
-# Expose port
-EXPOSE 8000
+# Create startup script that properly handles runtime PORT variable
+RUN printf '%s\n' '#!/bin/sh' \
+    'PORT="${PORT:-8080}"' \
+    'echo "Starting server on port $PORT"' \
+    'exec /app/.venv/bin/uvicorn app.main:app --host 0.0.0.0 --port "$PORT"' \
+    > /app/start.sh && chmod +x /app/start.sh
+
+# Expose port 8080 (Cloud Run default)
+EXPOSE 8080
# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
- CMD curl -f http://localhost:8000/ || exit 1
+ CMD curl -f http://localhost:${PORT:-8080}/ || exit 1
# Run the application
-CMD ["/app/.venv/bin/uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
+CMD ["/app/start.sh"]
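Note on the runtime port: EXPOSE is documentation only; the actual binding happens in start.sh when Cloud Run injects PORT. A minimal Python sketch of the same fallback plus a health probe — illustrative only, since the service resolves the port in start.sh, and this assumes a container is already running locally:

    import os
    import urllib.request

    # Mirrors the "${PORT:-8080}" fallback applied by start.sh.
    port = os.environ.get("PORT", "8080")

    # Probe the endpoint the HEALTHCHECK curls.
    with urllib.request.urlopen(f"http://localhost:{port}/") as resp:
        print(resp.status)  # expect 200 when the app is healthy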
diff --git a/app/config.py b/app/config.py
index 583ab31..3fd8866 100644
--- a/app/config.py
+++ b/app/config.py
@@ -12,18 +12,32 @@ class DevelopmentConfig(BaseConfig):
CORS_ORIGINS = [
"http://localhost:3000",
"http://localhost:5173", # Vite dev server
+ "http://localhost:5174", # Vite dev server
"http://127.0.0.1:5173", # Vite dev server alternative
+ "http://127.0.0.1:5174", # Vite dev server alternative
"http://localhost:4173", # Vite preview server
"http://127.0.0.1:4173", # Vite preview server alternative
+ "http://localhost:8080", # Docker local
+ "http://localhost:8000", # Alternative local
]
class ProductionConfig(BaseConfig):
DEBUG = False
- CORS_ORIGINS = [""]
+    # For Cloud Run, set the CORS_ORIGINS env var to a comma-separated list
+    # of allowed origins; falls back to allowing all origins when unset.
+    CORS_ORIGINS = (
+        [origin.strip() for origin in os.getenv("CORS_ORIGINS", "").split(",")]
+        if os.getenv("CORS_ORIGINS")
+        else ["*"]
+    )
def get_config():
+ # Check for DEBUG env var first, then APP_ENV
+ if os.getenv("DEBUG", "").lower() == "true":
+ return DevelopmentConfig
+
env = os.getenv("APP_ENV", "development")
if env == "production":
return ProductionConfig
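The DEBUG check gives a single switch that wins over APP_ENV. A quick sketch of the expected precedence (env values here are illustrative):

    import os
    from app.config import DevelopmentConfig, ProductionConfig, get_config

    os.environ["APP_ENV"] = "production"
    os.environ["DEBUG"] = "true"
    assert get_config() is DevelopmentConfig  # DEBUG=true overrides APP_ENV

    os.environ["DEBUG"] = "false"
    assert get_config() is ProductionConfig  # falls through to APP_ENV

Note that ProductionConfig.CORS_ORIGINS is evaluated at import time, so CORS_ORIGINS must be set in the environment before the app starts.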
diff --git a/app/controllers/hello.py b/app/controllers/hello.py
deleted file mode 100644
index e69de29..0000000
diff --git a/app/main.py b/app/main.py
index a9751f6..62fb1a8 100644
--- a/app/main.py
+++ b/app/main.py
@@ -6,8 +6,6 @@
from starlette.exceptions import HTTPException as StarletteHTTPException
from starlette.responses import JSONResponse
from app.config import get_config
-from app.routers.pathways import router as pathways_router
-from app.routers.umap_router import router
from app.routers import gsea
from app.scripts.prepare_gene_lists import generate_all_library_gene_lists
@@ -45,8 +43,6 @@
# Include routers
app.include_router(gsea.router, prefix="/api", tags=["GSEA"])
-app.include_router(pathways_router)
-app.include_router(router, prefix="/umap")
# Mount static files for the React app
diff --git a/app/models/__init__.py b/app/models/__init__.py
index e69de29..4df7914 100644
--- a/app/models/__init__.py
+++ b/app/models/__init__.py
@@ -0,0 +1,3 @@
+from app.models.gsea import Gene, GseaJsonRequest
+
+__all__ = ["Gene", "GseaJsonRequest"]
diff --git a/app/models/gsea.py b/app/models/gsea.py
new file mode 100644
index 0000000..e8ec94c
--- /dev/null
+++ b/app/models/gsea.py
@@ -0,0 +1,24 @@
+from pydantic import BaseModel, Field, field_validator
+from typing import List
+
+
+class Gene(BaseModel):
+ """Individual gene with symbol and score."""
+
+ symbol: str = Field(..., description="Gene symbol (e.g., 'BRCA1', 'TP53')")
+ globalScore: float = Field(..., description="Gene score for ranking")
+
+
+class GseaJsonRequest(BaseModel):
+ """Request model for JSON-based GSEA endpoint."""
+
+ genes: List[Gene] = Field(
+ ..., min_length=1, description="List of genes with symbols and scores"
+ )
+
+ @field_validator("genes")
+ @classmethod
+ def validate_genes_not_empty(cls, v):
+        if not v:
+ raise ValueError("Genes list cannot be empty")
+ return v
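For reference, a sketch of how the request model behaves (gene values are illustrative; pydantic v2 API, as implied by field_validator):

    from pydantic import ValidationError
    from app.models.gsea import Gene, GseaJsonRequest

    # Valid payload: at least one gene with a symbol and score.
    req = GseaJsonRequest(genes=[Gene(symbol="BRCA1", globalScore=0.95)])
    print(req.model_dump())

    # min_length=1 (and the validator) reject an empty list.
    try:
        GseaJsonRequest(genes=[])
    except ValidationError as exc:
        print(exc.error_count(), "validation error(s)")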
diff --git a/app/routers/__init__.py b/app/routers/__init__.py
index e69de29..3c10ace 100644
--- a/app/routers/__init__.py
+++ b/app/routers/__init__.py
@@ -0,0 +1,5 @@
+"""API routers for the Pathways API."""
+
+from app.routers import gsea
+
+__all__ = ["gsea"]
diff --git a/app/routers/gsea.py b/app/routers/gsea.py
index 8c73437..611ae4e 100644
--- a/app/routers/gsea.py
+++ b/app/routers/gsea.py
@@ -1,6 +1,9 @@
from fastapi import APIRouter, UploadFile, File, Query, HTTPException
from typing import Literal
-from app.services.gsea import run_gsea, available_gmt_files
+from app.services.gsea import run_gsea_from_dataframe, available_gmt_files
+from app.models.gsea import GseaJsonRequest
+from app.utils.gsea_utils import validate_gsea_dataframe, handle_gsea_error
import tempfile
import pandas as pd
import os
+import numpy as np
@@ -8,46 +11,119 @@
router = APIRouter()
+
@router.get("/gsea/libraries")
async def list_gmt_files():
"""List available GMT libraries."""
return list(available_gmt_files().keys())
-@router.post("/gsea")
-async def gsea_endpoint(
- tsv_file: UploadFile = File(..., description="TSV file containing at least 2 columns: 'symbol' and 'globalScore'"),
+@router.post("/gsea/analyze/file")
+async def analyze_gsea_from_file(
+ tsv_file: UploadFile = File(
+ ...,
+ description="TSV file containing at least 2 columns: 'symbol' and 'globalScore'",
+ ),
gmt_name: str = Query(..., description="GMT library name (without .gmt extension)"),
analysis_direction: Literal["one_sided_positive", "one_sided_negative", "two_sided"] = Query(
default="one_sided_positive",
description="Analysis direction: 'one_sided_positive' filters NES > 0, 'one_sided_negative' filters NES < 0, 'two_sided' returns all results"
- )
+ ),
):
+ """
+ Run GSEA analysis from uploaded TSV file.
+
+ Upload a TSV file with gene symbols and scores to perform Gene Set Enrichment Analysis.
+
+ Example:
+ POST /api/gsea/analyze/file?gmt_name=Reactome/ReactomePathways_2025
+ Content-Type: multipart/form-data
+        Body: tsv_file=your_data.tsv
+ """
# Validate file extension
if not tsv_file.filename.endswith(".tsv"):
raise HTTPException(status_code=400, detail="File must be .tsv format")
- # Save to temp file
+ # Read and validate file
with tempfile.NamedTemporaryFile(delete=False, suffix=".tsv") as tmp:
content = await tsv_file.read()
tmp.write(content)
tsv_path = tmp.name
- # Validate TSV structure
try:
- df = pd.read_csv(tsv_path, sep="\t", nrows=1) # read first row
- if not {"symbol", "globalScore"}.issubset(df.columns) and not {0, 1}.issubset(df.columns):
- raise ValueError("TSV must contain 'symbol' and 'globalScore' columns (or two unnamed columns).")
- except Exception as e:
- os.unlink(tsv_path)
- raise HTTPException(status_code=400, detail=f"Invalid TSV format: {str(e)}")
+ # Load and validate DataFrame
+ df = pd.read_csv(tsv_path, sep="\t")
+ df = validate_gsea_dataframe(df)
- # Run GSEA
- try:
- res_df, missing_stats = run_gsea(input_tsv=tsv_path, gmt_name=gmt_name)
+ # Run GSEA
+ res_df, input_overlap = run_gsea_from_dataframe(df, gmt_name)
+
+ except HTTPException:
+ raise
+ except Exception as e:
+ raise handle_gsea_error(e)
finally:
# Clean up temp file
- os.unlink(tsv_path)
+ if os.path.exists(tsv_path):
+ os.unlink(tsv_path)
+
+ # Filter by NES based on analysis direction
+ if analysis_direction == "one_sided_positive":
+ res_df = res_df[res_df["NES"] > 0].copy()
+ elif analysis_direction == "one_sided_negative":
+ res_df = res_df[res_df["NES"] < 0].copy()
+
+ # Replace NaN/Inf with JSON-safe values
+    res_df = res_df.replace([np.inf, -np.inf], np.nan)
+ res_df = res_df.where(pd.notna(res_df), None)
+
+ return {
+ "results": res_df.to_dict(orient="records"),
+ "input_overlap": input_overlap,
+ }
+
+
+@router.post("/gsea/analyze/json")
+async def analyze_gsea_from_json(
+ request: GseaJsonRequest,
+ gmt_name: str = Query(..., description="GMT library name (without .gmt extension)"),
+ analysis_direction: Literal["one_sided_positive", "one_sided_negative", "two_sided"] = Query(
+ default="one_sided_positive",
+ description="Analysis direction: 'one_sided_positive' filters NES > 0, 'one_sided_negative' filters NES < 0, 'two_sided' returns all results"
+ ),
+):
+ """
+ Run GSEA analysis from JSON payload.
+
+ Send gene data as JSON to perform Gene Set Enrichment Analysis.
+
+ Example:
+ POST /api/gsea/analyze/json?gmt_name=Reactome/ReactomePathways_2025
+ Content-Type: application/json
+ Body: {
+ "genes": [
+ {"symbol": "BRCA1", "globalScore": 0.95},
+ {"symbol": "TP53", "globalScore": 0.87}
+ ]
+ }
+ """
+ try:
+ # Convert request to DataFrame
+ genes_data = [
+ {"symbol": g.symbol, "globalScore": g.globalScore} for g in request.genes
+ ]
+ df = pd.DataFrame(genes_data)
+
+ # Validate DataFrame (should already be valid via Pydantic, but double-check)
+ df = validate_gsea_dataframe(df)
+
+ # Run GSEA directly (no file I/O needed!)
+ res_df, input_overlap = run_gsea_from_dataframe(df, gmt_name)
+
+ except HTTPException:
+ raise
+ except Exception as e:
+ raise handle_gsea_error(e)
# Filter by NES based on analysis direction
if analysis_direction == "one_sided_positive":
@@ -61,5 +137,5 @@ async def gsea_endpoint(
return {
"results": res_df.to_dict(orient="records"),
- "input_overlap": missing_stats,
+ "input_overlap": input_overlap,
}
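To exercise the new JSON endpoint end to end, something like the following should work once the server is up (host/port and library name are assumptions taken from the docstring examples; httpx is not a project dependency, any HTTP client works):

    import httpx

    payload = {
        "genes": [
            {"symbol": "BRCA1", "globalScore": 0.95},
            {"symbol": "TP53", "globalScore": 0.87},
        ]
    }
    resp = httpx.post(
        "http://localhost:8080/api/gsea/analyze/json",
        params={
            "gmt_name": "Reactome/ReactomePathways_2025",
            "analysis_direction": "two_sided",
        },
        json=payload,
        timeout=120.0,  # GSEA can take a while on large libraries
    )
    resp.raise_for_status()
    body = resp.json()
    print(len(body["results"]), body["input_overlap"])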
diff --git a/app/routers/pathways.py b/app/routers/pathways.py
deleted file mode 100644
index f9d241c..0000000
--- a/app/routers/pathways.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from typing import Any
-from fastapi import APIRouter, HTTPException, UploadFile, File, Form
-from pathlib import Path
-
-from app.services.gsea import available_gmt_files, run_gsea
-
-router = APIRouter(prefix="/pathways", tags=["pathways"])
-
-
-@router.get("/available", response_model=list[str])
-async def list_libraries():
- """
- Return available GMT libraries under data/gmt (subfolders).
- Example: ["Reactome/reactome2022", "GO/go2022"]
- """
- try:
- return list(available_gmt_files().keys())
- except Exception as e:
- raise HTTPException(status_code=500, detail=str(e))
-
-
-@router.post("/gsea", response_model=list[dict[str, Any]])
-async def run_gsea_endpoint(
- gmt_name: str = Form(..., description="Library name, e.g. Reactome/reactome2022"),
- processes: int = Form(4, description="Number of CPU processes"),
- file: UploadFile = File(..., description="TSV file with 'symbol' and 'globalScore'")
-):
- """
- Run GSEA on an uploaded input file using a chosen GMT library.
- """
- tmp_path = Path(f"/tmp/{file.filename}")
- try:
- # Save uploaded file to a temporary path
- with tmp_path.open("wb") as f:
- f.write(await file.read())
-
- res_df, _missing_stats = run_gsea(
- input_tsv=tmp_path,
- gmt_name=gmt_name,
- processes=processes,
- )
-
- # Return as JSON records
- return res_df.to_dict(orient="records")
-
- except ValueError as e:
- raise HTTPException(status_code=400, detail=str(e))
- except Exception as e:
- raise HTTPException(status_code=500, detail=str(e))
- finally:
- # Clean up temp file
- if tmp_path.exists():
- tmp_path.unlink()
diff --git a/app/routers/umap_router.py b/app/routers/umap_router.py
deleted file mode 100644
index 160998a..0000000
--- a/app/routers/umap_router.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import os
-import pandas as pd
-from fastapi import APIRouter, Query, HTTPException
-from fastapi.responses import HTMLResponse
-from app.services.umap_service import perform_umap_clustering_api
-
-router = APIRouter()
-
-@router.get("/run", response_class=HTMLResponse)
-def run_umap(
- disease_id: str = Query(..., description="The disease ID (e.g., 'EFO_0000094')"),
- library: str = Query(..., description="The library name (e.g., 'open_targets')"),
- n_neighbors: int = Query(10),
- min_dist: float = Query(0.5),
- min_cluster_size: int = Query(12),
- umap_dimensions: int = Query(2)
-):
- try:
- # Run UMAP & clustering
- output_file = perform_umap_clustering_api(
- disease_id=disease_id,
- library=library,
- n_neighbors=n_neighbors,
- min_dist=min_dist,
- min_cluster_size=min_cluster_size,
- umap_dimensions=umap_dimensions
- )
-
- # Read TSV into a DataFrame
- df = pd.read_csv(output_file, sep='\t')
-
- # Convert to HTML table (with some basic styling)
- html = df.to_html(index=False, classes="table table-bordered table-striped")
-
- # Wrap in a basic HTML template
-        html_template = f"""
-        <html>
-          <head>
-            <title>UMAP Clustering Results</title>
-          </head>
-          <body>
-            <h1>UMAP Clustering Results for {disease_id}</h1>
-            {html}
-          </body>
-        </html>
-        """
- return HTMLResponse(content=html_template)
-
- except Exception as e:
- raise HTTPException(status_code=500, detail=str(e))
diff --git a/app/services/__init__.py b/app/services/__init__.py
index e69de29..943a13f 100644
--- a/app/services/__init__.py
+++ b/app/services/__init__.py
@@ -0,0 +1,15 @@
+"""Service layer for the Pathways API."""
+
+from app.services.gsea import (
+ available_gmt_files,
+ run_gsea,
+ run_gsea_from_dataframe,
+ load_custom_gmt,
+)
+
+__all__ = [
+ "available_gmt_files",
+ "run_gsea",
+ "run_gsea_from_dataframe",
+ "load_custom_gmt",
+]
diff --git a/app/services/gsea.py b/app/services/gsea.py
index 9ba2c9e..66e51b4 100644
--- a/app/services/gsea.py
+++ b/app/services/gsea.py
@@ -16,20 +16,21 @@ def get_approved_symbols_from_gcs():
"""
# Initialize GCS filesystem
fs = gcsfs.GCSFileSystem()
-
+
# Define the GCS path to the target directory
gcs_path = "open-targets-pre-data-releases/25.09/output/target/"
-
+
# Read all parquet files in the directory as a single dataset
# This is much more efficient than downloading individual files
df = pd.read_parquet(gcs_path, filesystem=fs, columns=["approvedSymbol"])
-
+
# Extract unique approved symbols (excluding NaN values)
approved_symbols = set(df["approvedSymbol"].dropna().astype(str))
-
+
return approved_symbols
+
def load_custom_gmt(path):
p = Path(path)
with p.open("r") as f:
@@ -39,6 +40,7 @@ def load_custom_gmt(path):
if (parts := line.strip().split("\t")) and len(parts) > MIN_GENE_COL_IDX
}
+
def available_gmt_files():
"""
Return available GMT libraries as:
@@ -62,14 +64,24 @@ def available_gmt_files():
}
return libraries
-def run_gsea(input_tsv=None, gmt_name=None, processes=4):
- """
- Run GSEA using a chosen GMT library and its hierarchy (if present).
- Pathway size is computed from the GMT (total genes in the pathway).
- Ensure 'Number of input genes' and 'Pathway size' are integers (no .0).
+
+def run_gsea_from_dataframe(
+ df: pd.DataFrame, gmt_name: str, processes: int = 4
+) -> tuple[pd.DataFrame, dict]:
"""
- input_tsv = Path(input_tsv)
+ Run GSEA using a DataFrame directly (no file required).
+
+ Args:
+ df: DataFrame with 'symbol' and 'globalScore' columns, already validated
+ gmt_name: Name of GMT library to use
+ processes: Number of CPU processes
+ Returns:
+ Tuple of (DataFrame with GSEA results, overlap_stats dict)
+
+ Raises:
+ ValueError: If gmt_name is invalid or DataFrame is missing required columns
+ """
gmt_files = available_gmt_files()
if not gmt_name or gmt_name not in gmt_files:
msg = "Invalid gmt_name. Choose from: " + str(list(gmt_files.keys()))
@@ -107,16 +119,9 @@ def run_gsea(input_tsv=None, gmt_name=None, processes=4):
# if no braces, map Term itself as ID
id_to_genes[term] = genes
- # --- Load input file safely ---
- df = pd.read_csv(input_tsv, sep="\t")
-
- # Allow unnamed columns (0,1) and rename to expected headers
- if set(df.columns) == set(range(len(df.columns))):
- df = df.rename(columns={0: "symbol", 1: "globalScore"})
-
+ # Ensure DataFrame is properly formatted
if not {"symbol", "globalScore"}.issubset(df.columns):
- msg = "Input file must contain 'symbol' and 'globalScore' columns."
- raise ValueError(msg)
+ raise ValueError("DataFrame must contain 'symbol' and 'globalScore' columns.")
# Keep only required columns
df = df[["symbol", "globalScore"]].copy()
@@ -170,7 +175,9 @@ def run_gsea(input_tsv=None, gmt_name=None, processes=4):
if contains_braces:
term_series = res_df["Term"]
res_df["ID"] = term_series.str.extract(r"\{([^}]+)\}", expand=False).fillna("")
- res_df["Term"] = term_series.str.replace(r"\s*\{[^}]+\}", "", regex=True).str.strip()
+ res_df["Term"] = term_series.str.replace(
+ r"\s*\{[^}]+\}", "", regex=True
+ ).str.strip()
else:
res_df["ID"] = res_df["Term"] # use Term as ID directly
@@ -261,3 +268,33 @@ def safe_int_col(df_, col_name):
return res_df, overlap_stats
+
+def run_gsea(input_tsv=None, gmt_name=None, processes=4):
+ """
+ Run GSEA from a TSV file path (backward compatible).
+
+ Args:
+ input_tsv: Path to TSV file with 'symbol' and 'globalScore' columns
+ gmt_name: Name of GMT library to use
+ processes: Number of CPU processes
+
+ Returns:
+ Tuple of (DataFrame with GSEA results, overlap_stats dict)
+
+ Raises:
+ ValueError: If gmt_name is invalid or file is missing required columns
+ """
+ if not input_tsv:
+ raise ValueError("input_tsv parameter is required")
+
+ input_tsv = Path(input_tsv)
+
+ # Load input file
+ df = pd.read_csv(input_tsv, sep="\t")
+
+ # Handle unnamed columns (legacy support)
+ if set(df.columns) == set(range(len(df.columns))):
+ df = df.rename(columns={0: "symbol", 1: "globalScore"})
+
+ # Validate and run GSEA using the DataFrame-based function
+ return run_gsea_from_dataframe(df, gmt_name, processes)
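The DataFrame entry point is also handy in tests or notebooks without spinning up the API (scores below are made up; the library is whichever one is installed first):

    import pandas as pd
    from app.services.gsea import available_gmt_files, run_gsea_from_dataframe

    df = pd.DataFrame(
        {"symbol": ["BRCA1", "TP53", "EGFR"], "globalScore": [0.95, 0.87, 0.40]}
    )
    gmt_name = next(iter(available_gmt_files()))  # pick any available library
    res_df, overlap_stats = run_gsea_from_dataframe(df, gmt_name, processes=2)
    print(res_df.head(), overlap_stats)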
diff --git a/app/services/pathways_service.py b/app/services/pathways_service.py
deleted file mode 100644
index dcc025e..0000000
--- a/app/services/pathways_service.py
+++ /dev/null
@@ -1,41 +0,0 @@
-from typing import Any, List, Optional
-
-from pyspark.sql import DataFrame, Row
-from pyspark.sql.functions import col
-
-from app.utils.spark import get_spark_session
-
-BASE_PATH = "data/table_view"
-
-def fetch_pathways(
- disease_id: str,
- library: str,
- fdr_lt: Optional[float] = None,
- hide_leading_edge: bool = False,
-) -> List[dict[str, Any]]:
- """
- Load the parquet partition for `library`/`disease_id`,
- apply optional filters, drop `diseaseId` and optional columns,
- and return as list of dicts.
- """
- spark = get_spark_session()
- path = f"{BASE_PATH}/{library}/diseaseId={disease_id}"
-
- # Read the partition
- df: DataFrame = spark.read.parquet(path)
-
- # --- ALWAYS DROP the partition column so it doesn't appear in output ---
- if "diseaseId" in df.columns:
- df = df.drop("diseaseId")
-
- # Optional filtering
- if fdr_lt is not None:
- df = df.filter(col("FDR") < fdr_lt)
-
- # Optional column drop
- if hide_leading_edge and "Leading edge genes" in df.columns:
- df = df.drop("Leading edge genes")
-
- # Collect and convert each Row to a native dict
- rows: list[Row] = df.collect()
- return [row.asDict(recursive=True) for row in rows]
diff --git a/app/services/umap_service.py b/app/services/umap_service.py
deleted file mode 100644
index 0256b64..0000000
--- a/app/services/umap_service.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import os
-import pandas as pd
-import numpy as np
-import umap
-import hdbscan
-from scipy.spatial.distance import pdist, squareform
-from fastapi.responses import FileResponse # optional
-
-
-def poincare_dist(u, v):
- norm_u = np.linalg.norm(u)
- norm_v = np.linalg.norm(v)
- norm_diff = np.linalg.norm(u - v)
- denom = (1 - norm_u ** 2) * (1 - norm_v ** 2)
- if denom <= 0:
- return float('inf')
- argument = 1 + 2 * (norm_diff ** 2) / denom
- argument = max(argument, 1.0)
- return np.arccosh(argument)
-
-
-def compute_poincare_distance_matrix(embedding_matrix):
- return squareform(pdist(embedding_matrix, metric=poincare_dist))
-
-
-def perform_umap_clustering_api(
- disease_id: str,
- library: str,
- n_neighbors: int = 10,
- min_dist: float = 0.5,
- min_cluster_size: int = 12,
- umap_dimensions: int = 2,
- base_data_dir: str = "data/umap_dynamic"
-) -> str:
- """
-    Perform UMAP and HDBSCAN clustering on embeddings and metadata for a given disease_id and library.
- Returns path to the resulting TSV file.
- """
-
- # Ensure folder uses 'diseaseId=' prefix
- folder_name = disease_id
- if not disease_id.startswith("diseaseId="):
- folder_name = f"diseaseId={disease_id}"
-
- # Updated paths include the library directory
- coordinates_parquet_dir = os.path.join(base_data_dir, "target_embeddings", library, folder_name)
- metadata_parquet_dir = os.path.join(base_data_dir, "target_metadata", library, folder_name)
- output_dir = os.path.join(base_data_dir, "output", library, folder_name)
-
- # Load parquet datasets from directories
- metadata = pd.read_parquet(metadata_parquet_dir).query("geneticScore.notnull()")
- coords_df = pd.read_parquet(coordinates_parquet_dir)
-
- # Prepare coordinates
- coords_df = coords_df.rename(columns={coords_df.columns[0]: 'approvedSymbol'})
- coord_columns = coords_df.columns[1:]
- coords_df[coord_columns] = coords_df[coord_columns].astype(float)
-
- # Merge embeddings with metadata
- merged_df = pd.merge(metadata, coords_df, on='approvedSymbol', how='inner')
-
- # Validate embedding norms (Poincaré ball condition)
- embedding_matrix = merged_df[coord_columns].values
- norms = np.linalg.norm(embedding_matrix, axis=1)
- if np.any(norms >= 1):
- raise ValueError("Some embeddings lie outside the Poincaré ball (norm >= 1).")
-
- # Compute Poincaré distances
- distance_matrix = compute_poincare_distance_matrix(embedding_matrix)
-
- # UMAP dimensionality reduction
- reducer = umap.UMAP(
- n_neighbors=n_neighbors,
- min_dist=min_dist,
- n_components=umap_dimensions,
- metric='precomputed',
- random_state=42
- )
- embedding_umap = reducer.fit_transform(distance_matrix)
-
- # HDBSCAN clustering
- clusterer = hdbscan.HDBSCAN(
- min_cluster_size=min_cluster_size,
- min_samples=1,
- metric='precomputed'
- )
- cluster_labels = clusterer.fit_predict(distance_matrix)
-
- # Append UMAP and cluster labels
- for dim in range(umap_dimensions):
- merged_df[f'UMAP {dim+1}'] = embedding_umap[:, dim]
- merged_df['cluster'] = cluster_labels
-
- # Drop original embedding columns
- output_df = merged_df.drop(columns=coord_columns)
-
- # Save result to TSV
- os.makedirs(output_dir, exist_ok=True)
- output_file = os.path.join(output_dir, f"{disease_id}_clusters.tsv")
- output_df.to_csv(output_file, sep='\t', index=False)
-
- return output_file
\ No newline at end of file
diff --git a/app/utils/__init__.py b/app/utils/__init__.py
index e69de29..1245e14 100644
--- a/app/utils/__init__.py
+++ b/app/utils/__init__.py
@@ -0,0 +1,5 @@
+"""Utility functions for the Pathways API."""
+
+from app.utils.gsea_utils import validate_gsea_dataframe, handle_gsea_error
+
+__all__ = ["validate_gsea_dataframe", "handle_gsea_error"]
diff --git a/app/utils/gsea_utils.py b/app/utils/gsea_utils.py
new file mode 100644
index 0000000..0de35f4
--- /dev/null
+++ b/app/utils/gsea_utils.py
@@ -0,0 +1,67 @@
+from fastapi import HTTPException
+import pandas as pd
+
+
+def validate_gsea_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+ """
+ Validate and normalize a DataFrame for GSEA analysis.
+
+ Args:
+ df: Input DataFrame to validate
+
+ Returns:
+ Normalized DataFrame with 'symbol' and 'globalScore' columns, sorted by score
+
+ Raises:
+ HTTPException: If validation fails
+ """
+ # Handle unnamed columns (legacy support)
+ if set(df.columns) == set(range(len(df.columns))):
+ df = df.rename(columns={0: "symbol", 1: "globalScore"})
+
+ # Validate required columns
+ if not {"symbol", "globalScore"}.issubset(df.columns):
+ raise HTTPException(
+ status_code=400,
+ detail="Input must contain 'symbol' and 'globalScore' columns",
+ )
+
+ # Extract only required columns and sort
+ df = df[["symbol", "globalScore"]].copy()
+ df = df.sort_values("globalScore", ascending=False)
+
+ return df
+
+
+def handle_gsea_error(error: Exception) -> HTTPException:
+ """
+ Convert GSEA analysis errors into user-friendly HTTP exceptions.
+
+ Args:
+ error: The exception that occurred during GSEA analysis
+
+ Returns:
+ HTTPException with appropriate status code and message
+ """
+ error_str = str(error).lower()
+
+ if isinstance(error, ValueError):
+ if any(keyword in error_str for keyword in ["nan", "solver cannot continue"]):
+ return HTTPException(
+ status_code=400,
+ detail=(
+ "GSEA analysis failed. This usually means the gene symbols in your input "
+ "don't match any pathways in the selected GMT library. Please verify that "
+ "your input contains valid gene symbols (e.g., 'BRCA1', 'TP53', 'EGFR') "
+ "rather than disease names or other identifiers."
+ ),
+ )
+ else:
+ return HTTPException(
+ status_code=400, detail=f"GSEA analysis error: {str(error)}"
+ )
+ else:
+ return HTTPException(
+ status_code=500,
+ detail=f"Unexpected error during GSEA analysis: {str(error)}",
+ )
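A short sketch of the validator's legacy-column handling and sorting (data values are illustrative):

    import pandas as pd
    from app.utils.gsea_utils import validate_gsea_dataframe

    # Headerless two-column input: pandas names the columns 0 and 1.
    raw = pd.DataFrame([["TP53", 0.87], ["BRCA1", 0.95]])
    df = validate_gsea_dataframe(raw)

    assert list(df.columns) == ["symbol", "globalScore"]
    assert df.iloc[0]["symbol"] == "BRCA1"  # sorted by score, descending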
diff --git a/app/utils/spark.py b/app/utils/spark.py
deleted file mode 100644
index 8f27533..0000000
--- a/app/utils/spark.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from pyspark.sql import SparkSession
-
-_spark = None
-
-def get_spark_session() -> SparkSession:
- global _spark
- if _spark is None:
- _spark = (
- SparkSession
- .builder
- .appName("PathwaysAPI")
- .config("spark.ui.showConsoleProgress", "false")
- .getOrCreate()
- )
- return _spark
diff --git a/app/utils/xml_parser.py b/app/utils/xml_parser.py
deleted file mode 100644
index a15b82e..0000000
--- a/app/utils/xml_parser.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import xml.etree.ElementTree as ET
-
-
-def xml_to_plain_text(xml_string):
- try:
- root = ET.fromstring(xml_string)
- plain_text = "".join(root.itertext())
- return plain_text.strip()
- except ET.ParseError as e:
-        print(f"Error parsing the XML: {e}")
- return None
diff --git a/dev.sh b/dev.sh
new file mode 100644
index 0000000..241c68c
--- /dev/null
+++ b/dev.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Local development script for pathways-api
+
+echo "🚀 Starting Pathways API local development environment..."
+
+# Function to cleanup on exit
+cleanup() {
+    echo "🛑 Stopping services..."
+    # Stop local processes if they were started, then tear down Docker
+    [ -n "$BACKEND_PID" ] && kill "$BACKEND_PID" 2>/dev/null
+    [ -n "$FRONTEND_PID" ] && kill "$FRONTEND_PID" 2>/dev/null
+    docker-compose down
+    exit
+}
+
+# Set trap to cleanup on script exit
+trap cleanup EXIT INT TERM
+
+# Check if running in Docker mode or local mode
+if [ "$1" = "docker" ]; then
+ echo "📦 Running in Docker mode..."
+
+ # Build and run with docker-compose
+ docker-compose build
+ docker-compose up -d
+
+ echo "✅ API running at http://localhost:8080"
+ echo "📝 Logs: docker-compose logs -f pathways-api"
+
+ # Wait for services
+ docker-compose logs -f
+
+elif [ "$1" = "local" ]; then
+ echo "💻 Running in local development mode..."
+
+ # Start backend
+ echo "🔧 Starting backend..."
+    cd "$(dirname "$0")"
+ PORT=8080 uvicorn app.main:app --reload --host 0.0.0.0 --port 8080 &
+ BACKEND_PID=$!
+
+ # Start frontend
+ echo "🎨 Starting frontend..."
+ cd ui
+ npm run dev &
+ FRONTEND_PID=$!
+
+ echo "✅ Backend running at http://localhost:8080"
+ echo "✅ Frontend running at http://localhost:5173"
+
+ # Wait for both processes
+ wait $BACKEND_PID $FRONTEND_PID
+
+else
+ echo "Usage: $0 [docker|local]"
+ echo " docker - Run everything in Docker"
+ echo " local - Run backend and frontend locally"
+ exit 1
+fi
diff --git a/docker-compose.yml b/docker-compose.yml
index f1ab0e1..7d6547c 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -6,16 +6,18 @@ services:
context: .
dockerfile: Dockerfile
ports:
- - "8000:8000"
+ - "8080:8080"
environment:
- - DEBUG=false
- - CORS_ORIGINS=http://localhost:3000,http://localhost:8000
+ - PORT=8080
+ - DEBUG=true
+ - APP_ENV=development
+ - CORS_ORIGINS=http://localhost:3000,http://localhost:5173,http://127.0.0.1:5173,http://localhost:8080
volumes:
# Mount data directory for development
- ./app/data:/app/app/data:ro
restart: unless-stopped
healthcheck:
- test: [ "CMD", "curl", "-f", "http://localhost:8000/" ]
+ test: [ "CMD", "curl", "-f", "http://localhost:8080/" ]
interval: 30s
timeout: 10s
retries: 3
diff --git a/pyproject.toml b/pyproject.toml
index c9477ff..c0e9357 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,19 +1,10 @@
[project]
name = "pathways-api"
version = "0.1.0"
-description = "Pathways API for Open Targets - A comprehensive API for pathway analysis and visualization"
-readme = "README.md"
+description = "FastAPI service for pathway analysis (GSEA) for Open Targets"
authors = [{ name = "Open Targets", email = "contact@opentargets.org" }]
-license = { text = "Apache-2.0" }
-keywords = ["pathways", "bioinformatics", "api", "open-targets", "gsea", "umap"]
-classifiers = [
- "Development Status :: 4 - Beta",
- "Intended Audience :: Science/Research",
- "License :: OSI Approved :: Apache Software License",
- "Programming Language :: Python :: 3",
- "Programming Language :: Python :: 3.12",
- "Topic :: Scientific/Engineering :: Bio-Informatics",
-]
+readme = "README.md"
+
requires-python = ">=3.12"
dependencies = [
@@ -24,34 +15,11 @@ dependencies = [
"uvicorn>=0.32.0",
"blitzgsea>=1.3.54",
"pandas>=2.2.3",
- "umap>=0.1.1",
- "hdbscan>=0.8.40",
"numpy>=1.26.4",
"google-cloud-storage>=3.4.1",
"pyarrow>=22.0.0",
"gcsfs>=2025.9.0",
]
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
-
-[tool.hatch.build.targets.wheel]
-packages = ["app"]
-
[dependency-groups]
-lint = ["ruff>=0.7.1"]
-dev = [
- "pytest>=7.0.0",
- "pytest-asyncio>=0.21.0",
- "httpx>=0.24.0",
-]
-
-[tool.ruff]
-target-version = "py312"
-line-length = 88
-select = ["E", "F", "W", "C90", "I", "N", "UP", "YTT", "S", "BLE", "FBT", "B", "A", "COM", "C4", "DTZ", "T10", "EM", "EXE", "FA", "ISC", "ICN", "G", "INP", "PIE", "T20", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SIM", "TID", "TCH", "ARG", "PTH", "ERA", "PD", "PGH", "PL", "TRY", "FLY", "NPY", "AIR", "PERF", "FURB", "LOG", "RUF"]
-ignore = ["E501", "S101", "S104", "S105", "S106", "S107"]
-
-[tool.ruff.per-file-ignores]
-"__init__.py" = ["F401"]
+lint = ["ruff>=0.7.1"]
\ No newline at end of file
diff --git a/ui/.env.development b/ui/.env.development
new file mode 100644
index 0000000..e8b2023
--- /dev/null
+++ b/ui/.env.development
@@ -0,0 +1,2 @@
+# API base URL for local development
+VITE_API_URL=http://localhost:8000
diff --git a/ui/.env.production b/ui/.env.production
new file mode 100644
index 0000000..460e575
--- /dev/null
+++ b/ui/.env.production
@@ -0,0 +1 @@
+VITE_API_URL=http://localhost:8080
diff --git a/ui/index.html b/ui/index.html
index e4b78ea..53b3037 100644
--- a/ui/index.html
+++ b/ui/index.html
@@ -1,13 +1,16 @@
[ui/index.html hunk garbled in extraction: the HTML tags were stripped, leaving only the title text. Both sides are the stock Vite "Vite + React + TS" template (doctype, html/head/body tags, charset and viewport meta, favicon link, title, the #root div, and the /src/main.tsx module script); per the hunk header, the rewrite grows the file from 13 to 16 lines.]