diff --git a/.env.template b/.env.template index 011d722..3d15f56 100644 --- a/.env.template +++ b/.env.template @@ -1,10 +1,6 @@ # NOTE!!!: Don't copy and edit this file yourself. Instead, use ./scripts/generate-env.sh to generate a new one APP_HOSTNAME=dhis2-127-0-0-1.nip.io -LETSENCRYPT_ACME_EMAIL= -# ACME CA server - use staging for testing to avoid rate limits -# Uncomment the line below to use Let's Encrypt staging -# LETSENCRYPT_ACME_CASERVER=https://acme-staging-v02.api.letsencrypt.org/directory # Set this to the exact version you want to use. Example: 42.3.1 DHIS2_VERSION=42 @@ -12,7 +8,8 @@ DHIS2_VERSION=42 DHIS2_ADMIN_USERNAME=admin DHIS2_ADMIN_PASSWORD= -# This user is automatically created during startup +# These credentials are copied from stacks/monitoring/.env by generate-env.sh. +# They must match across all instances so the shared Prometheus can scrape metrics. DHIS2_MONITOR_USERNAME=monitor DHIS2_MONITOR_PASSWORD= @@ -24,10 +21,4 @@ POSTGRES_DB_PASSWORD= POSTGRES_METRICS_USERNAME=metrics POSTGRES_METRICS_PASSWORD= -GRAFANA_VERSION=10.0.0 -GRAFANA_ADMIN_PASSWORD= -PROMETHEUS_VERSION=v2.45.0 -PROMETHEUS_RETENTION_TIME=15d -LOKI_VERSION=2.9.0 -LOKI_RETENTION_PERIOD=744h POSTGRES_EXPORTER_VERSION=v0.17.1 diff --git a/.gitignore b/.gitignore index 3eff324..0c63f15 100644 --- a/.gitignore +++ b/.gitignore @@ -17,5 +17,14 @@ Thumbs.db .ansible/ -traefik/acme.json backups/ + +# Standalone stacks — env files contain passwords +stacks/traefik/.env +stacks/monitoring/.env + +# Generated route and target files (created by make launch-instance / launch-monitoring) +stacks/traefik/conf.d/*.yml +!stacks/traefik/conf.d/middlewares.yml +stacks/monitoring/targets/dhis2/*.json +stacks/monitoring/targets/postgres/*.json diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 15232e1..a1da7ae 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,6 +29,7 @@ repos: hooks: - id: shellcheck exclude: .envrc + args: [ -x ] - repo: 
https://github.com/scop/pre-commit-shfmt rev: v3.12.0-2 diff --git a/Makefile b/Makefile index c3eb592..cfd9c66 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,12 @@ PRE_COMMIT_VERSION ?= 4.3.0 -.PHONY: init playwright test reinit check backup-database backup-file-storage backup restore-database restore-file-storage restore docs launch clean clean-all config get-backup-timestamp +# PROJECT_NAME: unique name for this instance (e.g. dev, test, prod). +# Env file is always instances/$(PROJECT_NAME).env — generated by scripts/generate-env.sh. +PROJECT_NAME ?= $(notdir $(CURDIR)) +ENV_FILE = instances/$(PROJECT_NAME).env +BACKUP_DIR ?= ./backups/$(PROJECT_NAME) + +.PHONY: init playwright test reinit check backup-database backup-file-storage backup restore-database restore-file-storage restore docs generate-stack-envs create-instance list-instances start-postgres start-instance start-traefik start-monitoring ensure-networks stop-instance delete-instance clean clean-all config get-backup-timestamp init: @test -d .venv || python3 -m venv .venv @@ -21,6 +27,14 @@ reinit: rm -rf .venv $(MAKE) init +# Generate .env files for stacks/traefik/ and stacks/monitoring/ (run once per server). +# Requires: GEN_LETSENCRYPT_ACME_EMAIL and GEN_GRAFANA_HOSTNAME to be set. 
+# Example: GEN_LETSENCRYPT_ACME_EMAIL=ops@example.com GEN_GRAFANA_HOSTNAME=grafana.example.com make generate-stack-envs +generate-stack-envs: + GEN_LETSENCRYPT_ACME_EMAIL=$(GEN_LETSENCRYPT_ACME_EMAIL) \ + GEN_GRAFANA_HOSTNAME=$(GEN_GRAFANA_HOSTNAME) \ + ./scripts/generate-stack-envs.sh + install-loki-driver: docker plugin ls --format '{{.Name}}' | grep -q 'loki:latest' || ./scripts/install-loki-driver.sh docker plugin ls @@ -30,44 +44,140 @@ check: BACKUP_TIMESTAMP ?= $(shell date -u +%Y-%m-%d_%H-%M-%S_%Z) +POSTGRES_COMPOSE_CMD = docker compose \ + --project-name $(PROJECT_NAME) \ + --env-file $(ENV_FILE) \ + -f stacks/postgres/docker-compose.yml + +BACKUP_COMPOSE_CMD = BACKUP_DIR=$(BACKUP_DIR) docker compose \ + --project-name $(PROJECT_NAME) \ + --env-file $(ENV_FILE) \ + -f docker-compose.yml \ + -f stacks/backup/docker-compose.yml + get-backup-timestamp: @echo $(BACKUP_TIMESTAMP) backup-database: - mkdir -p ./backups - docker compose run -e BACKUP_TIMESTAMP=$(BACKUP_TIMESTAMP) --rm backup-database + mkdir -p $(BACKUP_DIR) + $(BACKUP_COMPOSE_CMD) run -e BACKUP_TIMESTAMP=$(BACKUP_TIMESTAMP) --rm backup-database backup-file-storage: - mkdir -p ./backups - docker compose run -e BACKUP_TIMESTAMP=$(BACKUP_TIMESTAMP) --rm backup-file-storage + mkdir -p $(BACKUP_DIR) + $(BACKUP_COMPOSE_CMD) run -e BACKUP_TIMESTAMP=$(BACKUP_TIMESTAMP) --rm backup-file-storage backup: backup-database backup-file-storage restore-database: - docker compose stop app - docker compose run --rm restore-database - docker compose start app + $(BACKUP_COMPOSE_CMD) stop app + $(BACKUP_COMPOSE_CMD) run --rm restore-database + $(BACKUP_COMPOSE_CMD) start app restore-file-storage: - docker compose stop app - docker compose run --rm restore-file-storage - docker compose start app + $(BACKUP_COMPOSE_CMD) stop app + $(BACKUP_COMPOSE_CMD) run --rm restore-file-storage + $(BACKUP_COMPOSE_CMD) start app restore: - docker compose stop app - docker compose run --rm restore-database - docker compose run --rm 
restore-file-storage - docker compose start app + $(BACKUP_COMPOSE_CMD) stop app + $(BACKUP_COMPOSE_CMD) run --rm restore-database + $(BACKUP_COMPOSE_CMD) run --rm restore-file-storage + $(BACKUP_COMPOSE_CMD) start app docs: mkdir -p ./docs - docker compose run --rm compose-docs > docs/environment-variables.md - -COMPOSE_CMD = docker compose -f docker-compose.yml -f overlays/traefik-dashboard/docker-compose.yml -f overlays/monitoring/docker-compose.yml -f overlays/profiling/docker-compose.yml -f overlays/glowroot/docker-compose.yml + docker compose -f stacks/docs/docker-compose.yml run --rm compose-docs > docs/environment-variables.md + +COMPOSE_CMD = docker compose \ + --project-name $(PROJECT_NAME) \ + --env-file $(ENV_FILE) \ + -f docker-compose.yml \ + -f overlays/profiling/docker-compose.yml \ + -f overlays/glowroot/docker-compose.yml + +# Create the shared Docker networks required by the multi-instance workflow. +# Safe to run multiple times (ignores errors if networks already exist). +ensure-networks: + docker network create proxy 2>/dev/null || true + docker network create monitoring 2>/dev/null || true + +# Start the standalone Traefik gateway (run once; watches stacks/traefik/conf.d/ for route changes) +start-traefik: ensure-networks + docker compose -f stacks/traefik/docker-compose.yml --env-file stacks/traefik/.env up $(COMPOSE_OPTS) + +# Start the standalone monitoring stack (run once; watches stacks/monitoring/targets/ for new instances) +start-monitoring: ensure-networks + GRAFANA_HOSTNAME=$$(grep -E '^GRAFANA_HOSTNAME=' stacks/monitoring/.env | cut -d= -f2-) \ + envsubst < stacks/traefik/conf.d/monitoring.yml.template > stacks/traefik/conf.d/monitoring.yml + docker compose -f stacks/monitoring/docker-compose.yml --env-file stacks/monitoring/.env up $(COMPOSE_OPTS) + +# Generate the env file for a new instance. 
+# Example: APP_HOSTNAME=dhis2.example.com PROJECT_NAME=prod make create-instance +create-instance: + @test -n "$(APP_HOSTNAME)" || (echo "Error: APP_HOSTNAME must be set" >&2; exit 1) + GEN_PROJECT_NAME=$(PROJECT_NAME) GEN_APP_HOSTNAME=$(APP_HOSTNAME) ./scripts/generate-env.sh + +# List all configured instances with their hostname and running container count. +list-instances: + @envs=$$(ls instances/*.env 2>/dev/null); \ + if [ -z "$$envs" ]; then \ + echo "No instances found in instances/"; \ + else \ + printf "%-20s %-40s %s\n" "INSTANCE" "HOSTNAME" "CONTAINERS"; \ + for env in $$envs; do \ + name=$$(basename $$env .env); \ + hostname=$$(grep -E '^APP_HOSTNAME=' $$env | cut -d= -f2-); \ + running=$$(docker compose --project-name $$name -f docker-compose.yml ps -q 2>/dev/null | wc -l | tr -d ' '); \ + printf "%-20s %-40s %s running\n" "$$name" "$$hostname" "$$running"; \ + done; \ + fi -launch: install-loki-driver +# Start the PostgreSQL stack for a named instance. +# Creates a per-instance db network (PROJECT_NAME-db) and waits until healthy. +start-postgres: + docker network create $(PROJECT_NAME)-db 2>/dev/null || true + $(POSTGRES_COMPOSE_CMD) up --wait -d + +# Start a named DHIS2 instance connected to the standalone Traefik, monitoring, and postgres stacks. +# Requires: PROJECT_NAME to be set (ENV_FILE is derived from it), and start-traefik and +# start-monitoring to have been run first. 
+# Example: PROJECT_NAME=dev make start-instance +start-instance: ensure-networks install-loki-driver start-postgres + PROJECT_NAME=$(PROJECT_NAME) APP_HOSTNAME=$$(grep -E '^APP_HOSTNAME=' $(ENV_FILE) | cut -d= -f2-) \ + envsubst < stacks/traefik/conf.d/instance.yml.template > stacks/traefik/conf.d/$(PROJECT_NAME).yml + PROJECT_NAME=$(PROJECT_NAME) \ + envsubst < stacks/monitoring/targets/dhis2/instance.json.template > stacks/monitoring/targets/dhis2/$(PROJECT_NAME).json + PROJECT_NAME=$(PROJECT_NAME) \ + envsubst < stacks/monitoring/targets/postgres/instance.json.template > stacks/monitoring/targets/postgres/$(PROJECT_NAME).json $(COMPOSE_CMD) up $(COMPOSE_OPTS) +# Stop a named DHIS2 instance and remove its Traefik routes and Prometheus targets (volumes and env file are kept). +# Example: PROJECT_NAME=dev make stop-instance +stop-instance: + $(COMPOSE_CMD) down --remove-orphans + $(POSTGRES_COMPOSE_CMD) down + docker network rm $(PROJECT_NAME)-db 2>/dev/null || true + rm -f stacks/traefik/conf.d/$(PROJECT_NAME).yml + rm -f stacks/monitoring/targets/dhis2/$(PROJECT_NAME).json + rm -f stacks/monitoring/targets/postgres/$(PROJECT_NAME).json + +# Delete a named DHIS2 instance: stop containers, remove volumes, and delete env file. +# WARNING: This permanently destroys all data for the instance; the confirmation prompt is skipped when stdin is not a terminal. +# Example: PROJECT_NAME=dev make delete-instance +delete-instance: + @if [ -t 0 ]; then \ + echo "WARNING: This will permanently destroy all data for instance '$(PROJECT_NAME)' (database, file storage, env file)."; \ + echo "This action is irreversible."; \ + printf "Are you sure? [y/N] "; read -r confirm && [ "$$confirm" = "y" ] || (echo "Aborted." 
&& exit 1); \ + fi + $(COMPOSE_CMD) down --remove-orphans --volumes + $(POSTGRES_COMPOSE_CMD) down --volumes + docker network rm $(PROJECT_NAME)-db 2>/dev/null || true + rm -f stacks/traefik/conf.d/$(PROJECT_NAME).yml + rm -f stacks/monitoring/targets/dhis2/$(PROJECT_NAME).json + rm -f stacks/monitoring/targets/postgres/$(PROJECT_NAME).json + rm -f instances/$(PROJECT_NAME).env + clean: $(COMPOSE_CMD) down --remove-orphans diff --git a/README.md b/README.md index 8c2b0a2..e69e17f 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,6 @@ > Key points: > > - The primary limitation is overall maturity — more real-world testing and validation are required. -> - Does not yet support multiple DHIS2 environments on the same server out of the box. > - Direct database access and advanced operations require technical knowledge. > - Tuning and optimisation (postgresql, DHIS2 and container resource allocation) is needed per deployment > @@ -16,7 +15,56 @@ ## Overview -This repository provides a Docker-based deployment for the DHIS2 application, designed for both local development/testing and secure production implementations. It leverages Docker Compose to orchestrate DHIS2, PostgreSQL, Traefik (as a reverse proxy), and an optional monitoring stack. Facilities are also provided for backup and restore of the database and file storage. +This repository provides a Docker-based deployment for the DHIS2 application, designed for both local development/testing and secure production implementations. It supports running multiple named DHIS2 instances on the same host, each with its own PostgreSQL database, isolated Docker networks, and automatically registered Traefik routes and Prometheus scrape targets. + +The stack is orchestrated through `make` targets using Docker Compose. Traefik (reverse proxy) and a monitoring stack (Grafana, Prometheus, Loki) are launched once per host and shared across all instances. 
+ +```mermaid +flowchart LR + Browser["browser"] + + subgraph server["Your server"] + direction TB + Traefik["Traefik
HTTPS routing & SSL certificates"] + + subgraph inst3["DHIS2 instance: other..."] + direction TB + App3["DHIS2"] + DB3[("PostgreSQL")] + App3 --> DB3 + end + + subgraph inst2["DHIS2 instance: dev"] + direction TB + App2["DHIS2"] + DB2[("PostgreSQL")] + App2 --> DB2 + end + + subgraph inst1["DHIS2 instance: prod"] + direction TB + App1["DHIS2"] + DB1[("PostgreSQL")] + App1 --> DB1 + end + + Mon["Monitoring
Grafana · Prometheus · Loki"] + end + + Browser -->|"prod.your-domain.com"| Traefik + Browser -->|"dev.your-domain.com"| Traefik + Browser -->|"other.your-domain.com"| Traefik + Browser -->|"grafana.your-domain.com [VPN]"| Mon + Traefik --> App1 + Traefik --> App2 + Traefik --> App3 + Mon -. "metrics & logs" .-> App1 + Mon -. "metrics & logs" .-> App2 + Mon -. "metrics & logs" .-> App3 +``` + +> [!WARNING] +> If you are upgrading from a previous version of this tool and have a `.env` file in the root of the repository, please remove it! Otherwise it may interfere with the new project-based deployment mechanism. ## Table of contents @@ -26,8 +74,11 @@ This repository provides a Docker-based deployment for the DHIS2 application, de - [Quick Start](#quick-start) - [Deployment For Production](#deployment-for-production) - [Deployment Prerequisites](#deployment-prerequisites) - - [Configure Environment](#configure-environment) - - [Launch the application](#launch-the-application) + - [One-time server setup](#one-time-server-setup) + - [Create an instance](#create-an-instance) + - [Start an instance](#start-an-instance) + - [Manage multiple instances](#manage-multiple-instances) + - [Stop an instance](#stop-an-instance) - [Advanced Usage](#advanced-usage) - [PostgreSQL Configuration](#postgresql-configuration) - [Additional Services (Overlays)](#additional-services-overlays) @@ -60,25 +111,56 @@ This repository provides a Docker-based deployment for the DHIS2 application, de This section is for users who want to quickly set up and test the DHIS2 application on their local machine. +```mermaid +%%{init: {"themeVariables": {"fontSize": "13px"}}}%% +flowchart TD + + subgraph per["② For each DHIS2 instance"] + D["PROJECT_NAME=<name> APP_HOSTNAME=<hostname>
make create-instance
"] + E["Review instances/<name>.env
Passwords and settings are generated for you
"] + F["PROJECT_NAME=<name> make start-instance"] + D --> E --> F + end + + F --> G(["DHIS2 is live at <hostname> ✓"]) + + A(["Start"]) --> B + + subgraph once["① One-time host setup"] + B["make generate-stack-envs
Set your email and Grafana hostname"] + C["make start-traefik
make start-monitoring"] + B --> C + end + +``` + ```shell -# The first two lines will check out the repository in your current folder +# Clone the repository git clone https://github.com/dhis2/docker-deployment.git && \ cd docker-deployment -# the next three lines will set the environment to run the compose file (start here if you are alread in the repository folder). - export GEN_APP_HOSTNAME=dhis2-127-0-0-1.nip.io && \ - export GEN_LETSENCRYPT_ACME_EMAIL=whatever@dhis2.org && \ - ./scripts/generate-env.sh -# the last line will launch DHIS2 and services (start here if you've already set the environment. - make launch + +# One-time host setup: configure and launch Traefik and the monitoring stack +GEN_LETSENCRYPT_ACME_EMAIL=whatever@dhis2.org \ +GEN_GRAFANA_HOSTNAME=grafana.127-0-0-1.nip.io \ + make generate-stack-envs + +make start-traefik & +make start-monitoring & + +# Create and launch a DHIS2 instance (called "prod") +APP_HOSTNAME=dhis2.127-0-0-1.nip.io PROJECT_NAME=prod make create-instance +PROJECT_NAME=prod make start-instance ``` -Open [http://dhis2-127-0-0-1.nip.io](http://dhis2-127-0-0-1.nip.io) in your favorite browser. +Open [http://dhis2.127-0-0-1.nip.io](http://dhis2.127-0-0-1.nip.io) in your favorite browser. > [!NOTE] > Your browser will warn you that the certificate is not trusted. This is expected, as it is a self-signed certificate. +> +> For local testing without real DNS, [nip.io](https://nip.io) provides free wildcard DNS that resolves to an embedded IP address — for example, `dhis2.127-0-0-1.nip.io` resolves to `127.0.0.1` with no configuration required. > [!NOTE] -> The default admin credentials are available in the `.env` file. +> The default DHIS2 admin credentials are available in `instances/prod.env`. ## Deployment For Production @@ -89,41 +171,105 @@ This section is for users planning to deploy DHIS2 in a production environment. 
Before deploying to production, ensure you have: - A dedicated host or virtual machine with Docker and Docker Compose installed. -- A fully qualified domain name (FQDN) for your DHIS2 instance. +- A fully qualified domain name (FQDN) for each DHIS2 instance you plan to run. - A valid email address for Let's Encrypt certificate management. - Appropriate firewall rules configured for ports 80 and 443. -### Configure Environment +> [!NOTE] +> A wildcard DNS record (`*.your-domain.com`) pointing to your server is a convenient way to cover all instances with a single DNS entry — each instance then gets its own subdomain (e.g. `prod.your-domain.com`, `dev.your-domain.com`). +> +> You can also namespace instances under a shared subdomain: add an A record for `dhis2.your-domain.com` and a wildcard `*.dhis2.your-domain.com`, then host `prod` at `dhis2.your-domain.com` and additional instances at `dev.dhis2.your-domain.com`, `test.dhis2.your-domain.com`, etc. Note that a wildcard does not match the bare subdomain it is rooted at, so the explicit A record for `dhis2.your-domain.com` is required alongside the wildcard. +> +> For local testing without real DNS, [nip.io](https://nip.io) can also be used, as mentioned earlier. -The following environment variables are required to configure the application. +### One-time server setup + +Run these commands once per host before creating any instances. They generate environment files for the shared Traefik and monitoring stacks, then start both. ```shell -# Provide the FQDN for your DHIS2 instance. -export GEN_APP_HOSTNAME= -# A valid email address is required for Let's Encrypt certificate management. 
-export GEN_LETSENCRYPT_ACME_EMAIL=your@email.com +GEN_LETSENCRYPT_ACME_EMAIL=your@email.com \ +GEN_GRAFANA_HOSTNAME=grafana.your-domain.com \ + make generate-stack-envs + +COMPOSE_OPTS=-d make start-traefik +COMPOSE_OPTS=-d make start-monitoring ``` -Generate a new `.env` file by executing the following command: +`COMPOSE_OPTS=-d` runs both stacks in detached mode. Traefik watches `stacks/traefik/conf.d/` for route changes; Prometheus watches `stacks/monitoring/targets/` for new scrape targets — both pick up new instances automatically without a restart. + +### Create an instance + +Generate the environment file for a named instance. `PROJECT_NAME` is a short identifier (e.g. `prod`, `dev`, `test`, `one`, `two`,...) that is used as the Docker Compose project name and must be unique on the host. ```shell -./scripts/generate-env.sh +APP_HOSTNAME=<name>.<your-domain> PROJECT_NAME=<name> make create-instance ``` -For production, carefully review and configure all environment variables in your `.env` file. Refer to the comprehensive [environment variables documentation](docs/environment-variables.md) for details on each variable. It is recommended not to change the generated values of the password variables unless you need to do so to align with your organization's security policies, or existing components. +This writes `instances/<name>.env` with generated passwords and the supplied hostname. Review and adjust that file before launching — see the [environment variables documentation](docs/environment-variables.md) for details on each variable. -### Launch the application +You can create multiple instances in this way, by simply using different names for each. -Once the environment is configured, launch the application using Docker Compose: +You can list your instances at any time: ```shell -docker compose up +make list-instances ``` -Open `https://` in your favorite browser. 
+### Start an instance + +Start an instance by targeting that named instance with the `PROJECT_NAME` variable: + +```shell +PROJECT_NAME=<name> make start-instance +``` + +This will: + +1. Create a dedicated `<name>-db` Docker network for database isolation. +2. Start a PostgreSQL container for the instance and wait until it is healthy. +3. Register the instance's hostname with Traefik by writing `stacks/traefik/conf.d/<name>.yml`. +4. Register Prometheus scrape targets for the app and database. +5. Start the DHIS2 application container. > [!NOTE] -> The first time you launch the application, it will initialise with a blank database. *The default admin credentials are available in the `.env` file.* If you have an existing database, you can restore it following the [Backup and Restore](#backup-and-restore) section, under Advanced Usage, below. +> The first time you launch an instance it will initialise with a blank database. If you have an existing database, you can restore it following the [Backup and Restore](#backup-and-restore) section below. + +### Manage multiple instances + +Additional instances can be created and launched independently. Each instance is fully isolated with its own database, network, and monitoring targets. + +```shell +# Add a second instance +APP_HOSTNAME=dev.your-domain.com PROJECT_NAME=dev make create-instance +PROJECT_NAME=dev make start-instance +``` + +To see all configured instances and their running container counts: + +```shell +make list-instances +``` + +### Stop an instance + +Stopping an instance brings down its containers and removes its Traefik routes and Prometheus targets. The `instances/<name>.env` file is retained so the instance can be relaunched later. + +```shell +PROJECT_NAME=<name> make stop-instance +``` + +The diagram below summarises all possible states for an instance and the commands that move between them: + +```mermaid +stateDiagram-v2 + direction LR + [*] --> **Configured** : **make create-instance**
generates instances/name.env + + **Configured** --> **Running** : **make start-instance**
starts database + app, registers routes + + **Running** --> **Configured** : **make stop-instance**
stops containers, removes routes
config file is kept + +``` ## Advanced Usage @@ -141,7 +287,7 @@ SELECT pg_reload_conf(); ### Additional Services (Overlays) -Deployments can benefit from additional services provided by compose overlays. +Deployments can benefit from additional services provided by compose overlays. `make start-instance` already includes the profiling and glowroot overlays; any other overlay must be added with `-f` directly in the compose command, because `COMPOSE_OPTS` is appended after `up` and cannot carry `-f` flags. #### Traefik Dashboard @@ -164,10 +310,10 @@ docker compose -f docker-compose.yml -f overlays/glowroot/docker-compose.yml up The profiling overlay adds distributed tracing capabilities using Grafana Tempo and OpenTelemetry. This allows you to trace requests through the DHIS2 application, providing insights into performance bottlenecks and request flows. > [!NOTE] -> The profiling overlay requires the monitoring overlay to be enabled first. +> The profiling overlay requires the monitoring stack to be running first (`make start-monitoring`). ```shell -docker compose -f docker-compose.yml -f overlays/monitoring/docker-compose.yml -f overlays/profiling/docker-compose.yml up +PROJECT_NAME=<name> make start-instance ``` For detailed configuration and usage, see the [Profiling Overlay README](overlays/profiling/README.md). @@ -176,12 +322,14 @@ For detailed configuration and usage, see the [Profiling Overlay README](overlay Robust backup and restore procedures are essential for production. Backups are stored in the `./backups` directory. We support backup and restore of both the database and the file storage. +All backup and restore commands require `PROJECT_NAME` to target the correct instance; each instance's backups are written to `./backups/<name>` by default. + #### Backup A complete backup of both the database and file storage can be created by executing: ```shell -make backup +PROJECT_NAME=<name> make backup ``` This command will create two files in the `./backups` directory: one for the database and one for the file storage. 
@@ -189,7 +337,7 @@ This command will create two files in the `./backups` directory: one for the dat - **Backup Database**: The database can be backed up in `custom` (default) or `plain` format, controlled by the `POSTGRES_BACKUP_FORMAT` environment variable. ```shell - make backup-database + PROJECT_NAME=<name> make backup-database ``` This creates a file in `./backups` named `$TIMESTAMP.pgc` (custom) or `$TIMESTAMP.sql.gz` (plain). Consult the [PostgreSQL documentation](https://www.postgresql.org/docs/current/app-pgdump.html) for more details. @@ -197,7 +345,7 @@ This command will create two files in the `./backups` directory: one for the dat - **Backup File Storage**: ```shell - make backup-file-storage + PROJECT_NAME=<name> make backup-file-storage ``` #### Backup Timestamp @@ -205,7 +353,7 @@ This command will create two files in the `./backups` directory: one for the dat By default, backups are automatically named with a timestamp in the format `YYYY-MM-DD_HH-MM-SS_UTC`. You can override this by setting the `BACKUP_TIMESTAMP` environment variable when running backup commands: ```shell -BACKUP_TIMESTAMP= make backup +PROJECT_NAME=<name> BACKUP_TIMESTAMP=<timestamp> make backup ``` #### Restore @@ -215,19 +363,19 @@ The restore process relies on the `DB_RESTORE_FILE` and `FILE_STORAGE_RESTORE_SO A complete restore of both database and file storage can be done by executing: ```shell -make restore +PROJECT_NAME=<name> make restore ``` - **Restore Database**: Set the `DB_RESTORE_FILE` environment variable to the backup file name. ```shell - make restore-database + PROJECT_NAME=<name> make restore-database ``` - **Restore File Storage**: Set the `FILE_STORAGE_RESTORE_SOURCE_DIR` environment variable to the backup directory name. ```shell - make restore-file-storage + PROJECT_NAME=<name> make restore-file-storage ``` ### Let's Encrypt Certificate Management @@ -237,7 +385,7 @@ make restore - **Production (default):** trusted certificates with standard Let's Encrypt rate limits. 
- **Staging:** untrusted test certificates with much higher rate limits for validation and CI/testing workflows. -To use staging, set this in `.env`: +To use staging, set this in `stacks/traefik/.env`: ```dotenv LETSENCRYPT_ACME_CASERVER=https://acme-staging-v02.api.letsencrypt.org/directory @@ -245,11 +393,11 @@ LETSENCRYPT_ACME_CASERVER=https://acme-staging-v02.api.letsencrypt.org/directory ### Monitoring -The monitoring stack is crucial for understanding the health and performance of your production DHIS2 deployment. It includes Grafana, Loki, and Prometheus for logs and metrics collection. +The monitoring stack provides visibility into the health and performance of all running DHIS2 instances. It includes Grafana, Loki, and Prometheus for logs and metrics collection and is launched once per host, shared across all instances. #### Prerequisites -The Docker Loki Driver plugin is required to forward container logs to Loki. Install it using: +The Docker Loki Driver plugin is required to forward container logs to Loki. It is installed automatically when launching an instance, but can also be installed manually: ```shell ./scripts/install-loki-driver.sh @@ -257,16 +405,16 @@ The Docker Loki Driver plugin is required to forward container logs to Loki. Ins #### Monitoring Deployment -Deploy the monitoring stack using: +The monitoring stack is launched as part of the one-time server setup: ```shell -docker compose -f docker-compose.yml -f overlays/monitoring/docker-compose.yml up +COMPOSE_OPTS=-d make start-monitoring ``` This deploys: - **Grafana**: A web-based monitoring and visualization platform with preloaded dashboards for Traefik, PostgreSQL, and server/host data. -- **Prometheus**: Collects metrics from the DHIS2 application (`/api/metrics`), Postgres Exporter, Traefik, Node Exporter, cAdvisor, and Prometheus itself. Data is stored locally for 15 days (default). 
+- **Prometheus**: Collects metrics from each DHIS2 instance (`/api/metrics`) and its Postgres Exporter. New instances are picked up automatically via file-based service discovery in `stacks/monitoring/targets/`. - **Loki**: Aggregates all container logs (DHIS2, PostgreSQL, Traefik) via the Docker Loki Driver plugin. Logs are indexed by labels for efficiency. #### DHIS2 Monitoring @@ -275,15 +423,15 @@ DHIS2's built-in monitoring API is enabled, exposing health and performance metr #### Accessing Monitoring Services -1. Start services with the monitoring overlay (as shown above). -2. Open `https://grafana.{APP_HOSTNAME}` in your browser (where `{APP_HOSTNAME}` is from your `.env` file). +1. Ensure the monitoring stack is running (`make start-monitoring`). +2. Open `https://` in your browser (the hostname configured during server setup). 3. Login with: - Username: `admin` - - Password: Check your `.env` file for `GRAFANA_ADMIN_PASSWORD`. + - Password: Check `stacks/monitoring/.env` for `GRAFANA_ADMIN_PASSWORD`. #### Configuration -Monitoring settings can be configured via environment variables in your `.env` file: +Monitoring settings can be configured via environment variables in `stacks/monitoring/.env`: - `GRAFANA_ADMIN_PASSWORD`: Grafana admin password (auto-generated). - `PROMETHEUS_RETENTION_TIME`: Prometheus data retention (default: `15d`). 
@@ -307,18 +455,32 @@ make init ### Start all services -To start all services for development: +To start all services for development, follow the same flow as production using local `nip.io` hostnames: ```shell -make launch +GEN_LETSENCRYPT_ACME_EMAIL=dev@dhis2.org \ +GEN_GRAFANA_HOSTNAME=grafana.127-0-0-1.nip.io \ + make generate-stack-envs + +make start-traefik & +make start-monitoring & + +APP_HOSTNAME=dhis2.127-0-0-1.nip.io PROJECT_NAME=dev make create-instance +PROJECT_NAME=dev make start-instance ``` ### Clean all services -To stop and remove all services and their associated data: +To stop a development instance and remove its containers, network, route, and scrape targets (the instance env file is retained): + +```shell +PROJECT_NAME=dev make stop-instance +``` + +To destroy all Docker volumes (database, file storage, monitoring data) for a full reset: ```shell -make clean +PROJECT_NAME=dev make clean-all ``` ### Run end-to-end tests @@ -330,13 +492,14 @@ make test Note that the environment needs to be "fresh" for the end-to-end tests' expectations to succeed, so it's advised to clean the environment beforehand. 
```shell -make clean && make test +PROJECT_NAME=dev make stop-instance && make test ``` ## Further Documentation For more in-depth information, please refer to the following: +- [Instance Management](docs/instance-management.md) - [Environment Variables](docs/environment-variables.md) - [PostgreSQL Documentation](https://www.postgresql.org/docs/current/app-pgdump.html) diff --git a/docker-compose.yml b/docker-compose.yml index 5ad1e6a..c7142c0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,15 +1,23 @@ x-database-image: &database-image image: postgis/postgis:${POSTGRES_VERSION:-16-master} -x-file-storage-image: &file-storage-image - image: rclone/rclone:${RCLONE_VERSION:-1.68} +x-healthcheck-options: &healthcheck-options + interval: 10s + timeout: 3s + retries: 3 + start_period: 30s + +x-loki-logging: &loki-logging + driver: loki + options: + loki-url: "http://localhost:3100/loki/api/v1/push" + loki-timeout: "1s" + loki-retries: "2" + loki-external-labels: "instance=${COMPOSE_PROJECT_NAME}" services: app: image: dhis2/core:${DHIS2_VERSION:-42} - depends_on: - database: - condition: service_healthy volumes: - dhis2:/opt/dhis2/ #- ./config/dhis2/log4j2.xml:/opt/dhis2/log4j2.xml:ro @@ -39,10 +47,21 @@ services: POSTGRES_DB_PASSWORD: ${POSTGRES_DB_PASSWORD} # -- Enable system audit logging SYSTEM_AUDIT_ENABLED: ${SYSTEM_AUDIT_ENABLED:-off} + MONITORING_API_ENABLED: on + MONITORING_JVM_ENABLED: on + MONITORING_DBPOOL_ENABLED: on + MONITORING_HIBERNATE_ENABLED: on + MONITORING_UPTIME_ENABLED: on + MONITORING_CPU_ENABLED: on networks: - - frontend - - application - - database + application: + database: + proxy: + aliases: + - "${COMPOSE_PROJECT_NAME}-app" + monitoring: + aliases: + - "${COMPOSE_PROJECT_NAME}-app" restart: unless-stopped healthcheck: test: [ "CMD", "curl", "-f", "http://127.0.0.1:8080/dhis-web-login/" ] @@ -50,6 +69,7 @@ services: timeout: 3s retries: 3 start_period: 120s + logging: *loki-logging user: 65534:65534 cap_drop: - ALL @@ -78,238 
+98,74 @@ services: - database entrypoint: [ "/bin/sh", "/update-admin-password.sh" ] working_dir: /root + logging: *loki-logging <<: *database-image cap_drop: - ALL security_opt: - no-new-privileges:true - database: + # Creates the DHIS2 monitoring user used by Prometheus to scrape /api/metrics. + # Runs once per instance after the admin password is set. + create-monitoring-user: + image: alpine:3.22 + depends_on: + update-admin-password: + condition: service_completed_successfully volumes: - - postgres:/var/lib/postgresql/data - - ./init-scripts:/docker-entrypoint-initdb.d:ro - - ./config/postgresql/postgresql.conf:/etc/postgresql/postgresql.conf:ro - - ./config/postgresql/conf.d:/etc/postgresql/conf.d:ro + - ./scripts/create-monitoring-user.sh:/create-monitoring-user.sh:ro environment: - # -- Postgres user password - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - # -- Name of the database - POSTGRES_DB: ${POSTGRES_DB:-dhis} - # -- Database username - POSTGRES_DB_USERNAME: ${POSTGRES_DB_USERNAME} - # -- Database password - POSTGRES_DB_PASSWORD: ${POSTGRES_DB_PASSWORD} - # -- Initdb arguments - POSTGRES_INITDB_ARGS: "--auth-host=scram-sha-256 --auth-local=scram-sha-256" - # -- Metrics username - POSTGRES_METRICS_USERNAME: ${POSTGRES_METRICS_USERNAME} - # -- Metrics user password - POSTGRES_METRICS_PASSWORD: ${POSTGRES_METRICS_PASSWORD} + DHIS2_HOSTNAME: ${DHIS2_HOSTNAME:-http://app:8080} + DHIS2_ADMIN_USERNAME: ${DHIS2_ADMIN_USERNAME} + DHIS2_ADMIN_PASSWORD: ${DHIS2_ADMIN_PASSWORD} + DHIS2_MONITOR_USERNAME: ${DHIS2_MONITOR_USERNAME} + DHIS2_MONITOR_PASSWORD: ${DHIS2_MONITOR_PASSWORD} networks: - - database - command: > - postgres - -c config_file=/etc/postgresql/postgresql.conf - -c hba_file=/var/lib/postgresql/data/pg_hba.conf - restart: unless-stopped - healthcheck: - test: [ "CMD-SHELL", "pg_isready -U postgres -d ${POSTGRES_DB}" ] - interval: 10s - timeout: 5s - retries: 3 - user: postgres - <<: *database-image + - application + entrypoint: [ "/bin/sh", 
"/create-monitoring-user.sh" ] + logging: *loki-logging cap_drop: - ALL - read_only: true security_opt: - no-new-privileges:true - tmpfs: - - /var/run/postgresql - - traefik-init: - image: alpine:3.22 - volumes: - - cert:/cert:rw - - ./scripts/init-cert.sh:/init-cert.sh:ro - command: [ "/init-cert.sh" ] - traefik: - image: traefik:v3.5 + # Exports PostgreSQL metrics for Prometheus scraping. + # Attached to monitoring with a predictable alias for the shared Prometheus. + postgres-exporter: + image: quay.io/prometheuscommunity/postgres-exporter:${POSTGRES_EXPORTER_VERSION:-v0.17.1} depends_on: - traefik-init: + create-monitoring-user: condition: service_completed_successfully - # Prevent Traefik from starting if the DHIS2 administrator password isn't updated! - update-admin-password: - condition: service_completed_successfully - volumes: - - ./traefik/dynamic.yml:/etc/traefik/dynamic.yml:ro - - cert:/cert:rw environment: - # -- Log level - TRAEFIK_LOG_LEVEL: ${LOG_LEVEL:-INFO} - # -- Enable access logs - TRAEFIK_ACCESSLOG: ${LOG_ACCESS:-true} - # -- Access log format - TRAEFIK_ACCESSLOG_FORMAT: ${LOG_FORMAT:-json} - # -- Allow ping - TRAEFIK_PING: true - # -- Default entrypoint port - TRAEFIK_ENTRYPOINTS_WEB_ADDRESS: :80 - # -- Redirect to https - TRAEFIK_ENTRYPOINTS_WEB_HTTP_REDIRECTIONS_ENTRYPOINT_TO: websecure - # -- Redirect scheme - TRAEFIK_ENTRYPOINTS_WEB_HTTP_REDIRECTIONS_ENTRYPOINT_SCHEME: https - # -- Default secure entrypoint port - TRAEFIK_ENTRYPOINTS_WEBSECURE_ADDRESS: :443 - # -- Provider file - TRAEFIK_PROVIDERS_FILE_FILENAME: /etc/traefik/dynamic.yml - # -- Watch the provider file for changes - TRAEFIK_PROVIDERS_FILE_WATCH: false - # -- Enable API - TRAEFIK_API: false - # -- Allow insecure API access - TRAEFIK_API_INSECURE: false - # -- Enable Prometheus metrics - TRAEFIK_METRICS_PROMETHEUS: true - # -- ACME email - TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_EMAIL: ${LETSENCRYPT_ACME_EMAIL} - # -- ACME storage file - 
TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_STORAGE: /cert/acme.json - # -- ACME TLS challenge - TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_TLSCHALLENGE: true - # -- ACME CA server - TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_CASERVER: ${LETSENCRYPT_ACME_CASERVER:-https://acme-v02.api.letsencrypt.org/directory} - # -- Hostname - APP_HOSTNAME: ${APP_HOSTNAME} - ports: - - "0.0.0.0:80:80" - - "0.0.0.0:443:443" + DATA_SOURCE_NAME: "postgresql://${POSTGRES_METRICS_USERNAME}:${POSTGRES_METRICS_PASSWORD}@database:5432/${POSTGRES_DB}?sslmode=disable" networks: - - frontend + database: + monitoring: + aliases: + - "${COMPOSE_PROJECT_NAME}-postgres-exporter" restart: unless-stopped healthcheck: - test: [ "CMD", "traefik", "healthcheck" ] - interval: 10s - timeout: 3s - retries: 3 - start_period: 10s + test: [ "CMD", "wget", "--no-verbose", "--tries=1", "--quiet", "--output-document=/dev/null", "http://localhost:9187/metrics" ] + <<: *healthcheck-options + logging: *loki-logging user: nobody:nobody cap_drop: - ALL read_only: true security_opt: - no-new-privileges:true - tmpfs: - - /tmp - - backup-database: - depends_on: - database: - condition: service_healthy - volumes: - - ./backups:/backups - - ./scripts/backup-database.sh:/backup-database.sh:ro - environment: - # -- Database hostname - POSTGRES_HOST: database - # -- Database username - POSTGRES_USER: ${POSTGRES_DB_USERNAME} - # -- Database password - POSTGRES_PASSWORD: ${POSTGRES_DB_PASSWORD} - # -- Database name - POSTGRES_DB: ${POSTGRES_DB:-dhis} - # -- Database backup format - POSTGRES_BACKUP_FORMAT: ${POSTGRES_BACKUP_FORMAT:-custom} - # -- The `PGPASSWORD` environment variable is used by the `pg_dump` command` - PGPASSWORD: ${POSTGRES_DB_PASSWORD} - networks: - - database - entrypoint: [ "/bin/bash", "/backup-database.sh" ] - <<: *database-image - profiles: - - backup - - backup-file-storage: - volumes: - - dhis2:/opt/dhis2:ro - - ./backups:/backups - - 
./scripts/backup-file-storage.sh:/backup-file-storage.sh:ro - environment: - # -- Backup timestamp. Used to name the backup directory and the backup file. Since those are created by different containers, we need to ensure the backup timestamp is the same for both containers. - BACKUP_TIMESTAMP: ${BACKUP_TIMESTAMP} - # -- Directory to back up - BACKUP_SOURCE_PATH: ${BACKUP_SOURCE_PATH:-/opt/dhis2/files} - networks: - - application - entrypoint: [ "/bin/sh", "/backup-file-storage.sh" ] - <<: *file-storage-image - profiles: - - backup - - restore-database: - depends_on: - database: - condition: service_healthy - volumes: - - ./backups:/backups:ro - - ./scripts/restore-database.sh:/restore-database.sh:ro - - ./scripts/fix-ownership.sh:/fix-ownership.sh:ro - environment: - # -- Database hostname - POSTGRES_HOST: database - # -- Database username - POSTGRES_USER: ${POSTGRES_DB_USERNAME} - # -- Database password - POSTGRES_PASSWORD: ${POSTGRES_DB_PASSWORD} - # -- Database name - POSTGRES_DB: ${POSTGRES_DB:-dhis} - # -- The `PGPASSWORD` environment variable is used by the `pg_dump` command` - PGPASSWORD: ${POSTGRES_PASSWORD} - # -- Database restore file - DB_RESTORE_FILE: ${DB_RESTORE_FILE} - # -- Number of parallel jobs for pg_restore - DB_RESTORE_NUMBER_OF_JOBS: ${DB_RESTORE_NUMBER_OF_JOBS:-4} - networks: - - database - entrypoint: [ "/bin/bash", "/restore-database.sh" ] - <<: *database-image - profiles: - - restore - - restore-file-storage: - volumes: - - dhis2:/opt/dhis2 - - ./backups:/backups:ro - - ./scripts/restore-file-storage.sh:/restore-file-storage.sh:ro - environment: - # -- Directory to restore from - FILE_STORAGE_RESTORE_SOURCE_DIR: ${FILE_STORAGE_RESTORE_SOURCE_DIR} - # -- Directory to restore to - RESTORE_DESTINATION_PATH: ${RESTORE_DESTINATION_PATH:-/opt/dhis2/files} - networks: - - application - entrypoint: [ "/bin/sh", "/restore-file-storage.sh" ] - <<: *file-storage-image - profiles: - - restore - - compose-docs: - image: tons/docker-compose-docs:2.1.0 
- volumes: - - .:/src:ro - environment: - DOCKER_COMPOSE_FILE_GLOBS: /src/docker-compose.yml;/src/overlays/*/docker-compose.yml - profiles: - - docs networks: - frontend: application: database: + name: ${COMPOSE_PROJECT_NAME}-db + external: true + proxy: + name: proxy + external: true monitoring: + name: monitoring + external: true volumes: dhis2: {} - postgres: {} - cert: {} diff --git a/docs/architecture.d2 b/docs/architecture.d2 new file mode 100644 index 0000000..470bd6f --- /dev/null +++ b/docs/architecture.d2 @@ -0,0 +1,137 @@ +# Docker deployment — network and service architecture +# +# Generate SVG: +# ~/.local/bin/d2 docs/architecture.d2 docs/architecture.svg +# +# NOTE: the DHIS2 CLI is also named "d2" and shadows the diagram tool in PATH. +# Use the full path ~/.local/bin/d2 or alias it: +# alias d2diagram=~/.local/bin/d2 +# +# Watch mode (re-renders on save): +# ~/.local/bin/d2 --watch docs/architecture.d2 docs/architecture.svg + +vars: { + d2-config: { + layout-engine: elk + } +} + +classes: { + network_box: { + style: { + fill: "#f8fafc" + stroke: "#94a3b8" + stroke-dash: 4 + border-radius: 8 + font-size: 13 + } + } + app_node: { + style: { + fill: "#dbeafe" + stroke: "#3b82f6" + border-radius: 4 + } + } + monitoring_node: { + style: { + fill: "#dcfce7" + stroke: "#16a34a" + border-radius: 4 + } + } + db_node: { + style: { + fill: "#fef9c3" + stroke: "#ca8a04" + border-radius: 4 + } + } +} + +direction: down + +# ── External ────────────────────────────────────────────────── + +internet: Internet {shape: cloud} + +# ── Shared: Traefik (proxy network) ────────────────────────── + +proxy_network: proxy network { + class: network_box + + traefik: Traefik { + class: app_node + label: "Traefik\nwatches conf.d/*.yml" + } +} + +# ── Shared: Monitoring (monitoring_net) ─────────────────────── + +monitoring_net: monitoring_net { + class: network_box + + prometheus: Prometheus {class: monitoring_node} + grafana: Grafana {class: monitoring_node} + loki: Loki 
{class: monitoring_node} + + prometheus -> grafana: query + loki -> grafana: query +} + +# ── Instance: one ───────────────────────────────────────────── + +one: Instance: one { + class: network_box + + app: one-app { + class: app_node + label: "one-app\nproxy · monitoring_net · one-db" + } + db: one-postgres { + shape: cylinder + class: db_node + } + exporter: postgres-exporter { + class: db_node + } + + app -> db: JDBC + db -> exporter +} + +# ── Instance: two ───────────────────────────────────────────── + +two: Instance: two { + class: network_box + + app: two-app { + class: app_node + label: "two-app\nproxy · monitoring_net · two-db" + } + db: two-postgres { + shape: cylinder + class: db_node + } + exporter: postgres-exporter { + class: db_node + } + + app -> db: JDBC + db -> exporter +} + +# ── Connections ─────────────────────────────────────────────── + +internet -> proxy_network.traefik: HTTPS :443 + +proxy_network.traefik -> one.app: HTTP :8080 +proxy_network.traefik -> two.app: HTTP :8080 + +one.app -> monitoring_net.loki: logs +two.app -> monitoring_net.loki: logs + +monitoring_net.prometheus -> one.app: scrape :8080 +monitoring_net.prometheus -> two.app: scrape :8080 +monitoring_net.prometheus -> one.exporter: scrape :9187 +monitoring_net.prometheus -> two.exporter: scrape :9187 diff --git a/docs/architecture.svg b/docs/architecture.svg new file mode 100644 index 0000000..ea16f48 --- /dev/null +++ b/docs/architecture.svg @@ -0,0 +1,121 @@ +Internetproxy networkInstance: oneInstance: twomonitoring_netTraefikwatches conf.d/*.ymlone-appproxy · monitoring_net · one-dbone-postgrespostgres-exportertwo-appproxy · monitoring_net · two-dbtwo-postgrespostgres-exporterPrometheusGrafanaLoki JDBCJDBCqueryqueryHTTPS :443HTTP :8080HTTP :8080logslogsscrape :8080scrape :8080scrape :9187scrape :9187 + + + + + + + + + + + + + + + diff --git a/docs/instance-management.md b/docs/instance-management.md new file mode 100644 index 0000000..5bd0917 --- /dev/null +++ 
b/docs/instance-management.md @@ -0,0 +1,109 @@ +# Instance Management Workflows + +A collection of useful process and architectural reference diagrams. + +## One-time server setup + +Run once per host before creating any instances. + +```mermaid +%%{init: {"themeVariables": {"fontSize": "13px"}}}%% +flowchart TD + A([Start]) --> B["make generate-stack-envs
GEN_LETSENCRYPT_ACME_EMAIL=...
GEN_GRAFANA_HOSTNAME=..."] + B --> C["stacks/traefik/.env
stacks/monitoring/.env written"] + C --> D["make start-traefik"] + C --> E["make start-monitoring"] + D --> F["Creates: proxy network
Creates: monitoring network
Starts: Traefik container
Watches: stacks/traefik/conf.d/"] + E --> G["Writes: conf.d/monitoring.yml
Starts: Grafana, Prometheus,
Loki, etc.
Watches: monitoring/targets/"] +``` + +## Instance lifecycle (sequence) + +```mermaid +sequenceDiagram + actor Operator + participant Make + participant FS as instances/ + participant Docker + participant Traefik + participant Monitoring + + Note over Operator,FS: make create-instance + Operator->>Make: APP_HOSTNAME=… PROJECT_NAME=NAME + Make->>FS: generate instances/NAME.env + Note right of FS: passwords + hostname set + Make-->>Operator: instances/NAME.env created + + Note over Operator,Monitoring: make start-instance + Operator->>Make: PROJECT_NAME=NAME + Make->>Docker: ensure proxy + monitoring networks + Make->>Docker: create NAME-db network + Make->>Docker: compose up postgres --wait + Docker-->>Make: postgres healthy + Make->>Traefik: write conf.d/NAME.yml + Note right of Traefik: hot-reloads route immediately + Make->>Monitoring: write targets/dhis2/NAME.json + Make->>Monitoring: write targets/postgres/NAME.json + Note right of Monitoring: picks up new scrape targets + Make->>Docker: compose up app + Docker-->>Operator: Running at NAME hostname + + + Note over Operator,Monitoring: make stop-instance + Operator->>Make: PROJECT_NAME=NAME + Make->>Docker: compose down app + overlays + Make->>Docker: postgres compose down + Make->>Docker: remove NAME-db network + Make->>Traefik: remove conf.d/NAME.yml + Note right of Traefik: deregisters route + Make->>Monitoring: remove targets/dhis2/NAME.json and targets/postgres/NAME.json + Note right of Monitoring: stops scraping + Make-->>Operator: Instance stopped (env file retained) + + Note over Operator,FS: make delete-instance (planned) + Operator->>Make: PROJECT_NAME=NAME + Make->>FS: remove instances/NAME.env + Make-->>Operator: Instance fully removed + +``` + +## Network architecture + +### Docker network membership + +![Architecture](./architecture.svg) + +| Service | proxy | monitoring | one-db | two-db | +|---|:---:|:---:|:---:|:---:| +| Traefik | ✓ | | | | +| one-app | ✓ | ✓ | ✓ | | +| two-app | ✓ | ✓ | | ✓ | +| one-postgres + exporter | | ✓ | ✓ | 
| +| two-postgres + exporter | | ✓ | | ✓ | +| Prometheus | | ✓ | | | +| Grafana | | ✓ | | | +| Loki | | ✓ | | | + +### Generating the architecture SVG + +The D2 source is at [`docs/architecture.d2`](./architecture.d2). It uses the [ELK](https://eclipse.dev/elk/) layout engine for cleaner routing of dense graphs. + +#### Install D2 + +```bash +# macOS +brew install d2 + +# Linux / WSL +curl -fsSL https://d2lang.com/install.sh | sh +``` + +#### Generate the SVG + +> **Note:** the DHIS2 CLI is also named `d2` and shadows the diagram tool in `$PATH`. +> Use the full path, or add an alias: `alias d2diagram=~/.local/bin/d2` + +```bash +~/.local/bin/d2 docs/architecture.d2 docs/architecture.svg + +``` diff --git a/instances/.gitignore b/instances/.gitignore new file mode 100644 index 0000000..ec8bc68 --- /dev/null +++ b/instances/.gitignore @@ -0,0 +1,2 @@ +# Instance-specific env files contain passwords — never commit them +*.env diff --git a/overlays/monitoring/config/grafana/dashboards/README.md b/overlays/monitoring/config/grafana/dashboards/README.md deleted file mode 100644 index 210b65f..0000000 --- a/overlays/monitoring/config/grafana/dashboards/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# Introduction - -Dashboards can be added by simply downloading the json specification into this folder. 
- -As of this writing the following dashboards are available - -* https://grafana.com/grafana/dashboards/17346-traefik-official-standalone-dashboard/ -* https://grafana.com/grafana/dashboards/9628-postgresql-database/ -* https://grafana.com/grafana/dashboards/1860-node-exporter-full/ -* https://grafana.com/grafana/dashboards/19792-cadvisor-dashboard/ diff --git a/overlays/profiling/docker-compose.yml b/overlays/profiling/docker-compose.yml index 5a4c683..73394ed 100644 --- a/overlays/profiling/docker-compose.yml +++ b/overlays/profiling/docker-compose.yml @@ -21,10 +21,6 @@ services: JAVA_TOOL_OPTIONS: >- -Dlog4j2.configurationFile=/opt/dhis2/log4j2.xml -javaagent:/otel/opentelemetry-javaagent.jar -Dotel.service.name=dhis2 -Dotel.traces.exporter=otlp -Dotel.exporter.otlp.endpoint=http://tempo:4318 -Dotel.metrics.exporter=none -Dotel.logs.exporter=none -Dotel.instrumentation.jdbc.enabled=true -Dotel.instrumentation.hibernate.enabled=true - grafana: - volumes: - - ./overlays/profiling/config/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro - # This is needed to avoid permission issues with the tempo container when running as non-root user tempo-init: image: busybox:1.37.0 @@ -81,6 +77,11 @@ services: security_opt: - no-new-privileges:true +networks: + monitoring: + name: monitoring + external: true + volumes: tempo: {} otel: {} diff --git a/overlays/traefik-dashboard/docker-compose.yml b/overlays/traefik-dashboard/docker-compose.yml deleted file mode 100644 index 91a44c4..0000000 --- a/overlays/traefik-dashboard/docker-compose.yml +++ /dev/null @@ -1,9 +0,0 @@ -services: - traefik: - environment: - # -- Enable dashboard - TRAEFIK_API_DASHBOARD: true - # -- Allow insecure access to the dashboard - TRAEFIK_API_INSECURE: true - ports: - - "127.0.0.1:8080:8080" diff --git a/scripts/generate-env.sh b/scripts/generate-env.sh index 9f5e92b..15df7ff 100755 --- a/scripts/generate-env.sh +++ b/scripts/generate-env.sh @@ -1,27 +1,31 @@ #!/usr/bin/env 
bash +set -euo pipefail -REQUIRED_COMMANDS=("tr" "head" "fold" "shuf" "sed" "chmod" "cp") -MISSING_COMMANDS=() +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=scripts/lib/env-utils.sh +source "$SCRIPT_DIR/lib/env-utils.sh" -for cmd in "${REQUIRED_COMMANDS[@]}"; do - if ! command -v "$cmd" >/dev/null 2>&1; then - MISSING_COMMANDS+=("$cmd") - fi -done +# shellcheck disable=SC2119 +check_required_commands -if [ ${#MISSING_COMMANDS[@]} -ne 0 ]; then - echo "Error: The following required commands are not available:" >&2 - printf " - %s\n" "${MISSING_COMMANDS[@]}" >&2 - echo "" >&2 - echo "Please install the missing commands and try again." >&2 +MONITORING_ENV="stacks/monitoring/.env" +if [ ! -f "$MONITORING_ENV" ]; then + echo "Error: '$MONITORING_ENV' not found." >&2 + echo "Run 'make generate-stack-envs' first to set up the shared stacks." >&2 exit 1 fi -OUTPUT_FILE=".env" +# If GEN_PROJECT_NAME is set, write to instances/<GEN_PROJECT_NAME>.env instead of .env. +if [ -n "${GEN_PROJECT_NAME:-}" ]; then + mkdir -p instances + OUTPUT_FILE="instances/${GEN_PROJECT_NAME}.env" +else + OUTPUT_FILE=".env" +fi TEMPLATE_FILE=".env.template" if [ -f "$OUTPUT_FILE" ]; then - echo "Error: An '$OUTPUT_FILE' file already exists." >&2 + echo "Error: '$OUTPUT_FILE' already exists." >&2 exit 1 fi @@ -30,57 +34,30 @@ if [ ! 
-f "$TEMPLATE_FILE" ]; then exit 1 fi -LENGTH=32 -CHARSET='A-Za-z0-9_=.-' - -generate_password() { - local password="" - password+=$(LC_ALL=C tr -dc '[:upper:]' < /dev/urandom | head -c 1) - password+=$(LC_ALL=C tr -dc '[:lower:]' < /dev/urandom | head -c 1) - password+=$(LC_ALL=C tr -dc '0-9' < /dev/urandom | head -c 1) - password+=$(LC_ALL=C tr -dc '_=.-' < /dev/urandom | head -c 1) - local remaining=$((LENGTH - 4)) - password+=$(LC_ALL=C tr -dc "$CHARSET" < /dev/urandom | head -c "$remaining") - echo "$password" | fold -w1 | shuf | tr -d '\n' -} - -# Validate required inputs for ungeneratable values +# Validate required inputs : "${GEN_APP_HOSTNAME:?Environment variable GEN_APP_HOSTNAME must be set}" -: "${GEN_LETSENCRYPT_ACME_EMAIL:?Environment variable GEN_LETSENCRYPT_ACME_EMAIL must be set}" DHIS2_ADMIN_PASSWORD=$(generate_password) POSTGRES_PASSWORD=$(generate_password) POSTGRES_DB_PASSWORD=$(generate_password) POSTGRES_METRICS_PASSWORD=$(generate_password) -GRAFANA_ADMIN_PASSWORD=$(generate_password) -DHIS2_MONITOR_PASSWORD=$(generate_password) -# Detect GNU vs BSD sed -if sed --version >/dev/null 2>&1; then - SED_FLAGS=(-i) -else - SED_FLAGS=(-i '') -fi +# Read shared monitoring credentials from the monitoring stack env so all instances match. 
+DHIS2_MONITOR_PASSWORD=$(grep -E '^DHIS2_MONITOR_PASSWORD=' "$MONITORING_ENV" | cut -d= -f2-) +DHIS2_MONITOR_USERNAME=$(grep -E '^DHIS2_MONITOR_USERNAME=' "$MONITORING_ENV" | cut -d= -f2-) cp "$TEMPLATE_FILE" "$OUTPUT_FILE" # Remove the first line beginning with "# NOTE!!!:" and any leading blank lines sed "${SED_FLAGS[@]}" -e '/^# NOTE!!!:/d' -e '/./,$!d' "$OUTPUT_FILE" -update_env_var() { - local key="$1" - local value="$2" - sed "${SED_FLAGS[@]}" "s|^${key}=.*|${key}=${value}|" "$OUTPUT_FILE" -} - -update_env_var "DHIS2_ADMIN_PASSWORD" "$DHIS2_ADMIN_PASSWORD" -update_env_var "POSTGRES_PASSWORD" "$POSTGRES_PASSWORD" -update_env_var "POSTGRES_DB_PASSWORD" "$POSTGRES_DB_PASSWORD" -update_env_var "POSTGRES_METRICS_PASSWORD" "$POSTGRES_METRICS_PASSWORD" -update_env_var "GRAFANA_ADMIN_PASSWORD" "$GRAFANA_ADMIN_PASSWORD" -update_env_var "DHIS2_MONITOR_PASSWORD" "$DHIS2_MONITOR_PASSWORD" -update_env_var "APP_HOSTNAME" "$GEN_APP_HOSTNAME" -update_env_var "LETSENCRYPT_ACME_EMAIL" "$GEN_LETSENCRYPT_ACME_EMAIL" +update_env_var "$OUTPUT_FILE" "DHIS2_ADMIN_PASSWORD" "$DHIS2_ADMIN_PASSWORD" +update_env_var "$OUTPUT_FILE" "DHIS2_MONITOR_USERNAME" "$DHIS2_MONITOR_USERNAME" +update_env_var "$OUTPUT_FILE" "DHIS2_MONITOR_PASSWORD" "$DHIS2_MONITOR_PASSWORD" +update_env_var "$OUTPUT_FILE" "POSTGRES_PASSWORD" "$POSTGRES_PASSWORD" +update_env_var "$OUTPUT_FILE" "POSTGRES_DB_PASSWORD" "$POSTGRES_DB_PASSWORD" +update_env_var "$OUTPUT_FILE" "POSTGRES_METRICS_PASSWORD" "$POSTGRES_METRICS_PASSWORD" +update_env_var "$OUTPUT_FILE" "APP_HOSTNAME" "$GEN_APP_HOSTNAME" chmod u+rw,go-rwx "$OUTPUT_FILE" diff --git a/scripts/generate-stack-envs.sh b/scripts/generate-stack-envs.sh new file mode 100755 index 0000000..cf89522 --- /dev/null +++ b/scripts/generate-stack-envs.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Generates .env files for the standalone stacks (stacks/traefik/ and stacks/monitoring/). +# Run this once during initial server setup, before deploying any instances. 
+# +# Required environment variables: +# GEN_LETSENCRYPT_ACME_EMAIL - Email address for Let's Encrypt registration +# GEN_GRAFANA_HOSTNAME - Hostname for the Grafana UI (e.g. grafana.example.com) + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=scripts/lib/env-utils.sh +source "$SCRIPT_DIR/lib/env-utils.sh" + +# shellcheck disable=SC2119 +check_required_commands + +TRAEFIK_ENV="stacks/traefik/.env" +MONITORING_ENV="stacks/monitoring/.env" + +for f in "$TRAEFIK_ENV" "$MONITORING_ENV"; do + if [ -f "$f" ]; then + echo "Error: '$f' already exists. Remove it first if you want to regenerate." >&2 + exit 1 + fi +done + +: "${GEN_LETSENCRYPT_ACME_EMAIL:?Environment variable GEN_LETSENCRYPT_ACME_EMAIL must be set}" +: "${GEN_GRAFANA_HOSTNAME:?Environment variable GEN_GRAFANA_HOSTNAME must be set}" + +GRAFANA_ADMIN_PASSWORD=$(generate_password) +DHIS2_MONITOR_PASSWORD=$(generate_password) + +cp stacks/traefik/.env.template "$TRAEFIK_ENV" +update_env_var "$TRAEFIK_ENV" "LETSENCRYPT_ACME_EMAIL" "$GEN_LETSENCRYPT_ACME_EMAIL" +chmod u+rw,go-rwx "$TRAEFIK_ENV" + +cp stacks/monitoring/.env.template "$MONITORING_ENV" +update_env_var "$MONITORING_ENV" "GRAFANA_HOSTNAME" "$GEN_GRAFANA_HOSTNAME" +update_env_var "$MONITORING_ENV" "GRAFANA_ADMIN_PASSWORD" "$GRAFANA_ADMIN_PASSWORD" +update_env_var "$MONITORING_ENV" "DHIS2_MONITOR_PASSWORD" "$DHIS2_MONITOR_PASSWORD" +chmod u+rw,go-rwx "$MONITORING_ENV" + +echo "Generated $TRAEFIK_ENV" +echo "Generated $MONITORING_ENV" +echo "" +echo "Grafana will be available at: https://${GEN_GRAFANA_HOSTNAME}" +echo "Grafana admin password stored in: $MONITORING_ENV" diff --git a/scripts/lib/env-utils.sh b/scripts/lib/env-utils.sh new file mode 100755 index 0000000..7e47980 --- /dev/null +++ b/scripts/lib/env-utils.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# Shared utilities for .env generation scripts. +# Source this file; do not execute it directly. 
+ +_REQUIRED_COMMANDS=("tr" "head" "fold" "shuf" "sed" "chmod" "cp") + +check_required_commands() { + local missing=() + for cmd in "${_REQUIRED_COMMANDS[@]}" "$@"; do + if ! command -v "$cmd" >/dev/null 2>&1; then + missing+=("$cmd") + fi + done + if [ ${#missing[@]} -ne 0 ]; then + echo "Error: The following required commands are not available:" >&2 + printf " - %s\n" "${missing[@]}" >&2 + echo "" >&2 + echo "Please install the missing commands and try again." >&2 + exit 1 + fi +} + +_LENGTH=32 +_CHARSET='A-Za-z0-9_=.-' + +generate_password() { + local password="" + password+=$(LC_ALL=C tr -dc '[:upper:]' < /dev/urandom | head -c 1) + password+=$(LC_ALL=C tr -dc '[:lower:]' < /dev/urandom | head -c 1) + password+=$(LC_ALL=C tr -dc '0-9' < /dev/urandom | head -c 1) + password+=$(LC_ALL=C tr -dc '_=.-' < /dev/urandom | head -c 1) + local remaining=$((_LENGTH - 4)) + password+=$(LC_ALL=C tr -dc "$_CHARSET" < /dev/urandom | head -c "$remaining") + echo "$password" | fold -w1 | shuf | tr -d '\n' +} + +# Detect GNU vs BSD sed once, available to all sourcing scripts. 
+if sed --version >/dev/null 2>&1; then + SED_FLAGS=(-i) +else + SED_FLAGS=(-i '') +fi + +update_env_var() { + local file="$1" + local key="$2" + local value="$3" + sed "${SED_FLAGS[@]}" "s|^${key}=.*|${key}=${value}|" "$file" +} diff --git a/server-tools/.env.template b/server-tools/.env.template deleted file mode 100644 index 3d0792e..0000000 --- a/server-tools/.env.template +++ /dev/null @@ -1,2 +0,0 @@ -GEN_APP_HOSTNAME=your.domain.com -GEN_LETSENCRYPT_ACME_EMAIL=your.email@example.com diff --git a/server-tools/.envrc b/server-tools/.envrc deleted file mode 100644 index fe7c01a..0000000 --- a/server-tools/.envrc +++ /dev/null @@ -1 +0,0 @@ -dotenv diff --git a/server-tools/README.md b/server-tools/README.md index 0d8cd06..7ce8383 100644 --- a/server-tools/README.md +++ b/server-tools/README.md @@ -1,12 +1,12 @@ # DHIS2 Docker Deployment Ansible Playbook -This Ansible playbook automates the deployment of the DHIS2 Docker stack. +This Ansible playbook provisions a host for running the DHIS2 Docker stack. It installs Docker, applies firewall and security hardening, and clones this repository onto the host. Starting the stack itself is done manually with the `make start-*` targets from the cloned working tree, not by Ansible. ## Features - **Infrastructure Bootstrapping**: Installs required system packages including Docker and Docker Compose - **Security Hardening**: Applies system hardening based on the microk8s-playbook harden.yaml, adapted for Docker -- **Deployment Automation**: Clones the repository, generates .env file, and deploys with selected overlays +- **Repository Checkout**: Clones the deployment repository to `deploy_dir` - **Modularity**: Uses Ansible roles for easy extension and maintenance - **Idempotency**: Safe to run multiple times @@ -15,48 +15,43 @@ This Ansible playbook automates the deployment of the DHIS2 Docker stack. 
- Ansible installed on the control machine - Target server with Ubuntu 24.04 - SSH access to the target server with sudo privileges -- Environment variables set: `GEN_APP_HOSTNAME` and `GEN_LETSENCRYPT_ACME_EMAIL` ## Configuration Edit `group_vars/all.yml` to customize: -- `app_hostname`: Application hostname (set via env var) -- `letsencrypt_email`: Let's Encrypt email (set via env var) -- `overlays`: List of overlays to enable, e.g., `['monitoring']` -- Other variables as needed +- `repo_url`: Repository to clone onto the host +- `deploy_dir`: Where to clone it +- `firewall_allowed_ports`: Host-facing TCP ports to open +- `allowed_ssh_users`: Users allowed to SSH in +- `docker_user` / `docker_group` / `docker_home`: Account that owns the Docker workload ## Usage -1. Set environment variables: +1. Update the inventory file `inventory.ini` according to your needs - ```bash - export GEN_APP_HOSTNAME=your.domain.com - export GEN_LETSENCRYPT_ACME_EMAIL=your.email@example.com - ``` - -2. Update the inventory file `inventory.ini` according to your needs - -3. Copy your public SSH key to the target server +2. Copy your public SSH key to the target server ```bash ssh-copy-id ubuntu@<server-ip> ``` -4. Store your user's sudo password in `./.ansible_become_pass` +3. Store your user's sudo password in `./.ansible_become_pass` -5. Run the playbook: +4. Run the playbook: ```bash make deployment ``` +5. Once provisioning finishes, SSH to the host and start the stacks from the cloned repo using the `make start-*` targets (see the repo's top-level README). 
+ ## Roles - **bootstrap**: Installs Docker, creates users, sets up directories - **firewall**: Configures firewall rules for Docker and host-facing ports - **harden**: Applies security hardening (SSH, kernel, Docker config) -- **deploy**: Clones repo, generates .env, runs docker-compose +- **deploy**: Clones the deployment repository to `deploy_dir` ## Security Notes @@ -64,16 +59,11 @@ Edit `group_vars/all.yml` to customize: - Firewall rules are configured to deny all by default and only allow SSH, HTTP, HTTPS and inter-container communication only on default subnets - AppArmor is enabled - Unattended-upgrades are enabled -- Secrets are handled via environment variables and .env file -The above is only a subset of the security hardening that is applied. For more information, see -the [firewall](roles/firewall/tasks/main.yml) and [harden](roles/harden/tasks/main.yml) roles. +The above is only a subset of the security hardening that is applied. For more information, see the [firewall](roles/firewall/tasks/main.yml) and [harden](roles/harden/tasks/main.yml) roles. ### Firewall Management -All firewall rules for Docker and host-facing ports are managed by the `firewall` role. -See [roles/firewall/tasks/main.yml](roles/firewall/tasks/main.yml) for more details. +All firewall rules for Docker and host-facing ports are managed by the `firewall` role. See [roles/firewall/tasks/main.yml](roles/firewall/tasks/main.yml) for more details. -⚠️ **Important:** Do **not** use UFW or other firewall frontends alongside this setup. Docker bypasses standard host -chains (INPUT/OUTPUT), so UFW rules are ignored or may conflict. All host and container traffic should be managed -exclusively through this Ansible role. +⚠️ **Important:** Do **not** use UFW or other firewall frontends alongside this setup. Docker bypasses standard host chains (INPUT/OUTPUT), so UFW rules are ignored or may conflict. All host and container traffic should be managed exclusively through this Ansible role. 
diff --git a/server-tools/group_vars/all.yml b/server-tools/group_vars/all.yml index 110cffb..585b4a5 100644 --- a/server-tools/group_vars/all.yml +++ b/server-tools/group_vars/all.yml @@ -1,19 +1,7 @@ -# Configuration variables for the DHIS2 Docker deployment playbook - -# Application hostname (required for .env generation) -app_hostname: "{{ lookup('env', 'GEN_APP_HOSTNAME') | mandatory }}" - -# Let's Encrypt ACME email (required for .env generation) -letsencrypt_email: "{{ lookup('env', 'GEN_LETSENCRYPT_ACME_EMAIL') | mandatory }}" - -# List of overlays to enable (e.g., ['monitoring']) -# Note that if you add or remove overlays, all services will be stopped and restarted, so there will be some downtime -overlays: [ ] +# Configuration variables for the DHIS2 Docker server provisioning playbook repo_url: https://github.com/dhis2/docker-deployment -repo_branch: master - deploy_dir: /opt/dhis2 firewall_allowed_ports: [ 22, 80, 443 ] @@ -26,6 +14,3 @@ docker_user: dhis2 docker_group: docker docker_home: "/home/{{ docker_user }}" - -# Enable deployment of DHIS2 -enable_deploy: true diff --git a/server-tools/inventory.ini b/server-tools/inventory.ini index 2886706..c55605a 100644 --- a/server-tools/inventory.ini +++ b/server-tools/inventory.ini @@ -1,2 +1,2 @@ [servers] -your-server ansible_host=192.168.122.227 ansible_user=ubuntu +your-server ansible_host=192.168.122.101 ansible_user=ubuntu diff --git a/server-tools/roles/bootstrap/tasks/main.yml b/server-tools/roles/bootstrap/tasks/main.yml index 11d2832..07339a4 100644 --- a/server-tools/roles/bootstrap/tasks/main.yml +++ b/server-tools/roles/bootstrap/tasks/main.yml @@ -43,11 +43,11 @@ home: "{{ docker_home }}" create_home: yes -- name: Add user to docker group - user: - name: "{{ docker_user }}" - groups: "{{ docker_group }}" - append: yes +#- name: Add user to docker group +# user: +# name: "{{ docker_user }}" +# groups: "{{ docker_group }}" +# append: yes - name: Ensure deploy directory exists file: diff --git 
a/server-tools/roles/deploy/tasks/main.yml b/server-tools/roles/deploy/tasks/main.yml index 3a5d1b3..72ca6f9 100644 --- a/server-tools/roles/deploy/tasks/main.yml +++ b/server-tools/roles/deploy/tasks/main.yml @@ -1,196 +1,11 @@ -- name: Check if deploy dir exists - stat: - path: "{{ deploy_dir }}" - register: dir_stat - - name: Check if deploy dir is a git repo stat: path: "{{ deploy_dir }}/.git" register: git_repo -- name: Check deploy dir contents - find: - path: "{{ deploy_dir }}" - file_type: any - register: dir_contents - when: dir_stat.stat.exists - -- name: Fail if deploy dir exists, is not empty, and is not a git repo - fail: - msg: "Deploy directory {{ deploy_dir }} exists, is not empty, and is not a git repository. Please remove it manually or check for conflicts." - when: dir_stat.stat.exists and not git_repo.stat.exists and dir_contents.files | length > 0 - - name: Clone the repository command: git clone {{ repo_url }} {{ deploy_dir }} when: not git_repo.stat.exists - name: Add deploy directory to git safe directories command: git config --global --add safe.directory {{ deploy_dir }} - -- name: Fetch all branches - command: git fetch --all - args: - chdir: "{{ deploy_dir }}" - -- name: Checkout the specified branch - command: git checkout {{ repo_branch }} - args: - chdir: "{{ deploy_dir }}" - -- name: Pull latest changes - command: git pull - args: - chdir: "{{ deploy_dir }}" - when: git_repo.stat.exists - -- name: Generate .env file - shell: | - export GEN_APP_HOSTNAME="{{ app_hostname }}" - export GEN_LETSENCRYPT_ACME_EMAIL="{{ letsencrypt_email }}" - ./scripts/generate-env.sh - args: - chdir: "{{ deploy_dir }}" - creates: "{{ deploy_dir }}/.env" - -- name: Set .env permissions - file: - path: "{{ deploy_dir }}/.env" - owner: "{{ docker_user }}" - group: "root" - -- name: Create Traefik acme.json - file: - path: "{{ deploy_dir }}/traefik/acme.json" - state: touch - owner: "65534" - group: "65534" - mode: 0600 - -- name: Install Loki Docker driver - 
become: yes - command: ./scripts/install-loki-driver.sh - args: - chdir: "{{ deploy_dir }}" - when: "'monitoring' in overlays" - -- name: Read current overlays - slurp: - src: "{{ deploy_dir }}/current_overlays.json" - register: current_overlays_slurp - ignore_errors: true - -- name: Set old overlays - set_fact: - old_overlays: "{{ (current_overlays_slurp.content | b64decode | from_json) if current_overlays_slurp is succeeded else [] }}" - -- debug: - msg: "old_overlays: {{ old_overlays }}, overlays: {{ overlays }}" - -- name: Set compose files - set_fact: - compose_files: "{{ ['-f docker-compose.yml'] + overlays | map('regex_replace', '^(.*)$', '-f overlays/\\1/docker-compose.yml') | list }}" - -- debug: - msg: "compose_files: {{ compose_files }}" - -- debug: - msg: "overlays changed: {{ old_overlays != overlays }}" - -- name: Stop dhis2 service when overlays changed - systemd: - name: dhis2 - state: stopped - when: old_overlays != overlays - -- name: Write current overlays - copy: - dest: "{{ deploy_dir }}/current_overlays.json" - content: "{{ overlays | to_json }}" - -- name: Create systemd service for dhis2 - copy: - dest: /etc/systemd/system/dhis2.service - content: | - [Unit] - Description=DHIS2 Docker Compose Service - After=docker.service - Requires=docker.service - - [Service] - Type=simple - User={{ docker_user }} - Group={{ docker_group }} - WorkingDirectory={{ deploy_dir }} - ExecStart=/usr/bin/docker compose {{ compose_files | join(' ') }} up - ExecStop=/usr/bin/docker compose {{ compose_files | join(' ') }} down - Restart=always - - [Install] - WantedBy=multi-user.target - -- name: Show systemd service content - command: cat /etc/systemd/system/dhis2.service - register: systemd_content - -- debug: - msg: "{{ systemd_content.stdout }}" - -- name: Reload systemd daemon - systemd: - daemon_reload: yes - -- name: Enable dhis2 service - systemd: - name: dhis2 - enabled: yes - -- name: Find all overlay directories - shell: find overlays/ -name 
"docker-compose.yml" -o -name "docker-compose.yaml" | xargs dirname | sed 's|overlays/||' | sort | uniq - args: - chdir: "{{ deploy_dir }}" - register: all_overlays_raw - -- name: Set all overlays - set_fact: - all_overlays: "{{ all_overlays_raw.stdout_lines | select | list }}" - -- name: Set all compose files - set_fact: - all_compose_files: "{{ ['-f docker-compose.yml'] + all_overlays | map('regex_replace', '^(.*)$', '-f overlays/\\1/docker-compose.yml') | list }}" - -- debug: - msg: "all_overlays: {{ all_overlays }}, all_compose_files: {{ all_compose_files }}" - -- debug: - msg: "Running: docker compose {{ all_compose_files | join(' ') }} down" - -- name: Down all services - command: docker compose {{ all_compose_files | join(' ') }} down - args: - chdir: "{{ deploy_dir }}" - -- debug: - msg: "Starting: docker compose {{ compose_files | join(' ') }} up" - -- debug: - msg: "old_overlays: {{ old_overlays }}, overlays: {{ overlays }}, changed: {{ old_overlays != overlays }}" - -- name: Start dhis2 service - systemd: - name: dhis2 - state: started - -- name: Wait for services to be healthy - command: docker compose {{ compose_files | join(' ') }} ps - args: - chdir: "{{ deploy_dir }}" - register: compose_ps - # TODO: This doesn't work at all... 
if a container goes to unhealthy state this will pass - until: "'(healthy)' in compose_ps.stdout and 'starting' not in compose_ps.stdout and 'unhealthy' not in compose_ps.stdout" - retries: 30 - delay: 10 - -- name: Verify all services are running - assert: - that: "'Exit' not in compose_ps.stdout" - fail_msg: "Some services failed to start" diff --git a/server-tools/site.yml b/server-tools/site.yml index 8bb003f..fa4900b 100644 --- a/server-tools/site.yml +++ b/server-tools/site.yml @@ -1,4 +1,4 @@ -- name: Deploy DHIS2 Docker Stack +- name: Provision DHIS2 Docker host hosts: all become: true @@ -6,5 +6,4 @@ - bootstrap - firewall - harden - - role: deploy - when: enable_deploy | default(false) | bool + - deploy diff --git a/stacks/backup/docker-compose.yml b/stacks/backup/docker-compose.yml new file mode 100644 index 0000000..3bc08c2 --- /dev/null +++ b/stacks/backup/docker-compose.yml @@ -0,0 +1,96 @@ +x-database-image: &database-image + image: postgis/postgis:${POSTGRES_VERSION:-16-master} + +x-file-storage-image: &file-storage-image + image: rclone/rclone:${RCLONE_VERSION:-1.68} + +services: + backup-database: + volumes: + - ${BACKUP_DIR}:/backups + - ./scripts/backup-database.sh:/backup-database.sh:ro + environment: + # -- Database hostname + POSTGRES_HOST: database + # -- Database username + POSTGRES_USER: ${POSTGRES_DB_USERNAME} + # -- Database password + POSTGRES_PASSWORD: ${POSTGRES_DB_PASSWORD} + # -- Database name + POSTGRES_DB: ${POSTGRES_DB:-dhis} + # -- Database backup format + POSTGRES_BACKUP_FORMAT: ${POSTGRES_BACKUP_FORMAT:-custom} + # -- The `PGPASSWORD` environment variable is used by the `pg_dump` command` + PGPASSWORD: ${POSTGRES_DB_PASSWORD} + networks: + - db + entrypoint: [ "/bin/bash", "/backup-database.sh" ] + <<: *database-image + profiles: + - backup + + backup-file-storage: + volumes: + - dhis2:/opt/dhis2:ro + - ${BACKUP_DIR}:/backups + - ./scripts/backup-file-storage.sh:/backup-file-storage.sh:ro + environment: + # -- Backup 
timestamp. Used to name the backup directory and the backup file. Since those are created by different containers, we need to ensure the backup timestamp is the same for both containers. + BACKUP_TIMESTAMP: ${BACKUP_TIMESTAMP} + # -- Directory to back up + BACKUP_SOURCE_PATH: ${BACKUP_SOURCE_PATH:-/opt/dhis2/files} + networks: + - application + entrypoint: [ "/bin/sh", "/backup-file-storage.sh" ] + <<: *file-storage-image + profiles: + - backup + + restore-database: + volumes: + - ${BACKUP_DIR}:/backups:ro + - ./scripts/restore-database.sh:/restore-database.sh:ro + - ./scripts/fix-ownership.sh:/fix-ownership.sh:ro + environment: + # -- Database hostname + POSTGRES_HOST: database + # -- Database username + POSTGRES_USER: ${POSTGRES_DB_USERNAME} + # -- Database password + POSTGRES_PASSWORD: ${POSTGRES_DB_PASSWORD} + # -- Database name + POSTGRES_DB: ${POSTGRES_DB:-dhis} + # -- The `PGPASSWORD` environment variable is read by PostgreSQL client tools such as `pg_restore` + PGPASSWORD: ${POSTGRES_PASSWORD} # NOTE(review): backup-database sets PGPASSWORD from POSTGRES_DB_PASSWORD, matching POSTGRES_USER — confirm POSTGRES_PASSWORD is intended here + # -- Database restore file + DB_RESTORE_FILE: ${DB_RESTORE_FILE} + # -- Number of parallel jobs for pg_restore + DB_RESTORE_NUMBER_OF_JOBS: ${DB_RESTORE_NUMBER_OF_JOBS:-4} + networks: + - db + entrypoint: [ "/bin/bash", "/restore-database.sh" ] + <<: *database-image + profiles: + - restore + + restore-file-storage: + volumes: + - dhis2:/opt/dhis2 + - ${BACKUP_DIR}:/backups:ro + - ./scripts/restore-file-storage.sh:/restore-file-storage.sh:ro + environment: + # -- Directory to restore from + FILE_STORAGE_RESTORE_SOURCE_DIR: ${FILE_STORAGE_RESTORE_SOURCE_DIR} + # -- Directory to restore to + RESTORE_DESTINATION_PATH: ${RESTORE_DESTINATION_PATH:-/opt/dhis2/files} + networks: + - application + entrypoint: [ "/bin/sh", "/restore-file-storage.sh" ] + <<: *file-storage-image + profiles: + - restore + +networks: + db: + name: ${COMPOSE_PROJECT_NAME}-db + external: true diff --git a/stacks/docs/docker-compose.yml b/stacks/docs/docker-compose.yml new file mode 100644 index 0000000..091617a ---
/dev/null +++ b/stacks/docs/docker-compose.yml @@ -0,0 +1,8 @@ +services: + compose-docs: + image: tons/docker-compose-docs:2.1.0 + volumes: + - ../../:/src:ro + environment: + DOCKER_COMPOSE_FILE_GLOBS: /src/docker-compose.yml;/src/overlays/*/docker-compose.yml + network_mode: none diff --git a/stacks/monitoring/.env.template b/stacks/monitoring/.env.template new file mode 100644 index 0000000..d56c3ad --- /dev/null +++ b/stacks/monitoring/.env.template @@ -0,0 +1,26 @@ +# Configuration for the standalone monitoring stack. +# Use scripts/generate-stack-envs.sh to generate stacks/monitoring/.env — do not edit this file directly. + +# -- Hostname for the Grafana UI (routed via the standalone Traefik gateway) +GRAFANA_HOSTNAME=grafana.example.com + +# -- Grafana admin password +GRAFANA_ADMIN_PASSWORD= + +# -- DHIS2 monitoring user credentials — must match DHIS2_MONITOR_USERNAME and +# DHIS2_MONITOR_PASSWORD in all instance .env files so Prometheus can scrape metrics +DHIS2_MONITOR_USERNAME=monitor +DHIS2_MONITOR_PASSWORD= + +# -- Prometheus data retention period +PROMETHEUS_RETENTION_TIME=15d + +# -- Loki log retention period +LOKI_RETENTION_PERIOD=744h + +# -- Image versions (override to pin specific versions) +GRAFANA_VERSION=10.0.0 +PROMETHEUS_VERSION=v2.45.0 +LOKI_VERSION=2.9.0 +NODE_EXPORTER_VERSION=v1.6.1 +CADVISOR_VERSION=v0.47.0 diff --git a/overlays/monitoring/config/grafana/dashboards/cadvisor-dashboard-19792_rev6.json b/stacks/monitoring/config/grafana/dashboards/cadvisor-dashboard-19792_rev6.json similarity index 100% rename from overlays/monitoring/config/grafana/dashboards/cadvisor-dashboard-19792_rev6.json rename to stacks/monitoring/config/grafana/dashboards/cadvisor-dashboard-19792_rev6.json diff --git a/overlays/monitoring/config/grafana/dashboards/node-exporter-full-1860_rev41.json b/stacks/monitoring/config/grafana/dashboards/node-exporter-full-1860_rev41.json similarity index 100% rename from 
overlays/monitoring/config/grafana/dashboards/node-exporter-full-1860_rev41.json rename to stacks/monitoring/config/grafana/dashboards/node-exporter-full-1860_rev41.json diff --git a/overlays/monitoring/config/grafana/dashboards/postgresql-database-9628_rev8.json b/stacks/monitoring/config/grafana/dashboards/postgresql-database-9628_rev8.json similarity index 100% rename from overlays/monitoring/config/grafana/dashboards/postgresql-database-9628_rev8.json rename to stacks/monitoring/config/grafana/dashboards/postgresql-database-9628_rev8.json diff --git a/overlays/monitoring/config/grafana/dashboards/traefik-official-standalone-dashboard-17346_rev9.json b/stacks/monitoring/config/grafana/dashboards/traefik-official-standalone-dashboard-17346_rev9.json similarity index 100% rename from overlays/monitoring/config/grafana/dashboards/traefik-official-standalone-dashboard-17346_rev9.json rename to stacks/monitoring/config/grafana/dashboards/traefik-official-standalone-dashboard-17346_rev9.json diff --git a/overlays/monitoring/config/grafana/provisioning/dashboards/dashboard.yml b/stacks/monitoring/config/grafana/provisioning/dashboards/dashboard.yml similarity index 100% rename from overlays/monitoring/config/grafana/provisioning/dashboards/dashboard.yml rename to stacks/monitoring/config/grafana/provisioning/dashboards/dashboard.yml diff --git a/overlays/monitoring/config/grafana/provisioning/datasources/datasources.yml b/stacks/monitoring/config/grafana/provisioning/datasources/datasources.yml similarity index 100% rename from overlays/monitoring/config/grafana/provisioning/datasources/datasources.yml rename to stacks/monitoring/config/grafana/provisioning/datasources/datasources.yml diff --git a/stacks/monitoring/config/grafana/provisioning/datasources/tempo.yml b/stacks/monitoring/config/grafana/provisioning/datasources/tempo.yml new file mode 100644 index 0000000..46e4c1d --- /dev/null +++ b/stacks/monitoring/config/grafana/provisioning/datasources/tempo.yml @@ 
-0,0 +1,33 @@ +apiVersion: 1 + +datasources: + - name: Tempo + uid: tempo + type: tempo + access: proxy + url: http://tempo:3200 + editable: true + jsonData: + httpMethod: GET + # Enable service graph and node graph visualizations + serviceMap: + datasourceUid: prometheus + nodeGraph: + enabled: true + # Link traces to logs in Loki + tracesToLogs: + datasourceUid: loki + filterByTraceID: true + filterBySpanID: false + mapTagNamesEnabled: true + tags: + - key: service.name + value: compose_service + # Link traces to metrics + tracesToMetrics: + datasourceUid: prometheus + tags: + - key: service.name + value: job + lokiSearch: + datasourceUid: loki diff --git a/stacks/monitoring/config/prometheus/prometheus.yml b/stacks/monitoring/config/prometheus/prometheus.yml new file mode 100644 index 0000000..58e837a --- /dev/null +++ b/stacks/monitoring/config/prometheus/prometheus.yml @@ -0,0 +1,34 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + # DHIS2 application metrics — one target file per instance added by `make start-instance` + # All instances must use the same DHIS2_MONITOR_USERNAME and DHIS2_MONITOR_PASSWORD + # (set in monitoring/.env and matching each instance's .env file). 
+ - job_name: 'dhis2' + file_sd_configs: + - files: + - /etc/prometheus/targets/dhis2/*.json + refresh_interval: 30s + metrics_path: '/api/metrics' + basic_auth: + username: ${DHIS2_MONITOR_USERNAME} + password_file: /run/secrets/dhis2_monitor_password + + # PostgreSQL metrics — one target file per instance added by `make start-instance` + - job_name: 'postgres' + file_sd_configs: + - files: + - /etc/prometheus/targets/postgres/*.json + refresh_interval: 30s + + # Host metrics (runs once for the whole server) + - job_name: 'node-exporter' + static_configs: + - targets: [ 'node-exporter:9100' ] + + # Container metrics (runs once for the whole server) + - job_name: 'cadvisor' + static_configs: + - targets: [ 'cadvisor:8080' ] diff --git a/overlays/monitoring/docker-compose.yml b/stacks/monitoring/docker-compose.yml similarity index 54% rename from overlays/monitoring/docker-compose.yml rename to stacks/monitoring/docker-compose.yml index 9138594..89c81f3 100644 --- a/overlays/monitoring/docker-compose.yml +++ b/stacks/monitoring/docker-compose.yml @@ -1,74 +1,34 @@ +# Standalone monitoring stack for multi-instance deployments. +# +# Deploy this stack once before launching any DHIS2 instances: +# make start-monitoring +# +# Per-instance scrape targets are added automatically to monitoring/targets/dhis2/ and +# monitoring/targets/postgres/ by `make start-instance` and removed by `make stop-instance`. +# Prometheus refreshes these directories every 30 seconds — no restart required. +# +# Loki is exposed on 127.0.0.1:3100 for the Docker logging driver. All instance containers +# using the shared-monitoring overlay push logs here with an `instance` label for filtering. +# +# NOTE: DHIS2_MONITOR_USERNAME and DHIS2_MONITOR_PASSWORD in this file's .env must match +# the values set in all instance .env files, since Prometheus uses a single shared credential. 
+# +# The monitoring and proxy external networks must exist before starting this stack: +# make ensure-networks + x-healthcheck-options: &healthcheck-options interval: 10s timeout: 3s retries: 3 start_period: 30s -x-loki-logging: &loki-logging - driver: loki - options: - loki-url: "http://localhost:3100/loki/api/v1/push" - loki-timeout: "1s" - loki-retries: "2" - services: - app: - environment: - MONITORING_API_ENABLED: on - MONITORING_JVM_ENABLED: on - MONITORING_DBPOOL_ENABLED: on - MONITORING_HIBERNATE_ENABLED: on - MONITORING_UPTIME_ENABLED: on - MONITORING_CPU_ENABLED: on - networks: - - frontend - - application - - database - - monitoring - logging: *loki-logging - - update-admin-password: - networks: - - database - logging: *loki-logging - - create-monitoring-user: - image: alpine:3.22 - # We're installing jq in the script, so root is required. And for the same reason, read_only isn't possible either. We could create a docker file and install it there... - # user: guest:405 - # read_only: true - depends_on: - update-admin-password: - condition: service_completed_successfully - volumes: - - ./scripts/create-monitoring-user.sh:/create-monitoring-user.sh:ro - environment: - # This must match the name of the Docker Compose service running the DHIS2 app and the port it is listening on - DHIS2_HOSTNAME: ${DHIS2_HOSTNAME:-http://app:8080} - DHIS2_ADMIN_USERNAME: ${DHIS2_ADMIN_USERNAME} - DHIS2_ADMIN_PASSWORD: ${DHIS2_ADMIN_PASSWORD} - DHIS2_MONITOR_USERNAME: ${DHIS2_MONITOR_USERNAME} - DHIS2_MONITOR_PASSWORD: ${DHIS2_MONITOR_PASSWORD} - networks: - - application - entrypoint: [ "/bin/sh", "/create-monitoring-user.sh" ] - logging: *loki-logging - cap_drop: - - ALL - security_opt: - - no-new-privileges:true - - database: - networks: - - database - logging: *loki-logging - grafana: image: grafana/grafana:${GRAFANA_VERSION:-10.0.0} volumes: - grafana:/var/lib/grafana - - ./overlays/monitoring/config/grafana/dashboards:/var/lib/grafana/dashboards:ro - - 
./overlays/monitoring/config/grafana/provisioning:/etc/grafana/provisioning:ro + - ./config/grafana/dashboards:/var/lib/grafana/dashboards:ro + - ./config/grafana/provisioning:/etc/grafana/provisioning:ro environment: GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD} GF_SECURITY_ADMIN_USER: admin @@ -78,12 +38,14 @@ services: GF_ANALYTICS_CHECK_FOR_UPDATES: false GF_LOG_LEVEL: warn GF_PATHS_PROVISIONING: /etc/grafana/provisioning - GF_SERVER_ROOT_URL: https://grafana.${APP_HOSTNAME} + GF_SERVER_ROOT_URL: https://${GRAFANA_HOSTNAME} GF_SERVER_SERVE_FROM_SUB_PATH: false DS_PROMETHEUS: prometheus networks: - - frontend - - monitoring + monitoring_internal: + proxy: + aliases: + - monitoring-grafana restart: unless-stopped healthcheck: test: [ "CMD", "curl", "-f", "http://localhost:3000/api/health" ] @@ -104,11 +66,11 @@ services: - loki:/loki environment: LOKI_RETENTION_PERIOD: ${LOKI_RETENTION_PERIOD:-744h} - # The Docker Logging Driver is running on the host network, and therefore we need to expose Loki on it + # The Docker Logging Driver runs on the host network, so Loki must be exposed on it ports: - "127.0.0.1:3100:3100" networks: - - monitoring + - monitoring_internal restart: unless-stopped healthcheck: test: [ "CMD", "wget", "--no-verbose", "--tries=1", "--quiet", "--output-document=/dev/null", "http://localhost:3100/ready" ] @@ -120,13 +82,12 @@ services: security_opt: - no-new-privileges:true - # This is needed to avoid permission issues with the loki container when running as non-root user loki-init: image: busybox:1.37.0 volumes: - loki:/loki networks: - - monitoring + - monitoring_internal command: [ "sh", "-c", "mkdir -p /loki/chunks /loki/rules /loki/wal /loki/boltdb-shipper-compactor && chown -R nobody:nobody /loki" ] user: root security_opt: @@ -136,8 +97,10 @@ services: image: prom/prometheus:${PROMETHEUS_VERSION:-v2.45.0} volumes: - prometheus:/prometheus + - ./targets/dhis2:/etc/prometheus/targets/dhis2:ro + - 
./targets/postgres:/etc/prometheus/targets/postgres:ro configs: - - source: prometheus + - source: prometheus_config target: /etc/prometheus/prometheus.yml mode: 444 secrets: @@ -145,6 +108,7 @@ services: environment: PROMETHEUS_RETENTION_TIME: ${PROMETHEUS_RETENTION_TIME:-15d} networks: + - monitoring_internal - monitoring command: - "--config.file=/etc/prometheus/prometheus.yml" @@ -152,6 +116,7 @@ services: - "--web.console.libraries=/etc/prometheus/console_libraries" - "--web.console.templates=/etc/prometheus/consoles" - "--web.enable-lifecycle" + - "--storage.tsdb.retention.time=${PROMETHEUS_RETENTION_TIME:-15d}" restart: unless-stopped healthcheck: test: [ "CMD", "wget", "--no-verbose", "--tries=1", "--quiet", "--output-document=/dev/null", "http://localhost:9090/-/healthy" ] @@ -159,41 +124,9 @@ services: user: nobody:nobody cap_drop: - ALL - # TODO: If we want read only we can't read the DHIS2_MONITOR_PASSWORD from the host environment. We would need to - # write it to a file and reference the file in our secrets configuration by the end of this file. For the sake of - # simplicity I'll advice that we don't use read only for now. 
- #read_only: true security_opt: - no-new-privileges:true - postgres-exporter: - image: quay.io/prometheuscommunity/postgres-exporter:${POSTGRES_EXPORTER_VERSION:-v0.17.1} - depends_on: - create-monitoring-user: - condition: service_completed_successfully - environment: - DATA_SOURCE_NAME: "postgresql://${POSTGRES_METRICS_USERNAME}:${POSTGRES_METRICS_PASSWORD}@database:5432/${POSTGRES_DB}?sslmode=disable" - networks: - - database - - monitoring - restart: unless-stopped - healthcheck: - test: [ "CMD", "wget", "--no-verbose", "--tries=1", "--quiet", "--output-document=/dev/null", "http://localhost:9187/metrics" ] - <<: *healthcheck-options - logging: *loki-logging - user: nobody:nobody - cap_drop: - - ALL - read_only: true - security_opt: - - no-new-privileges:true - - traefik: - networks: - - monitoring - - frontend - logging: *loki-logging - node-exporter: image: prom/node-exporter:${NODE_EXPORTER_VERSION:-v1.6.1} volumes: @@ -201,7 +134,7 @@ services: - /sys:/host/sys:ro - /:/rootfs:ro networks: - - monitoring + - monitoring_internal command: - '--path.procfs=/host/proc' - '--path.sysfs=/host/sys' @@ -226,10 +159,8 @@ services: - /var/lib/docker:/var/lib/docker:ro - /dev/disk:/dev/disk:ro networks: - - monitoring + - monitoring_internal command: - # For the sake of avoiding: "Unable to get btrfs mountpoint IDs: stat failed on /dev/mapper/luks-id with error: no such file or directory" - # https://github.com/google/cadvisor/issues/3357 - --disable_metrics=disk,referenced_memory restart: unless-stopped healthcheck: @@ -242,6 +173,15 @@ services: - no-new-privileges:true userns_mode: "host" +networks: + monitoring_internal: + monitoring: + name: monitoring + external: true + proxy: + name: proxy + external: true + volumes: loki: {} prometheus: {} @@ -251,31 +191,41 @@ secrets: dhis2_monitor_password: environment: DHIS2_MONITOR_PASSWORD +# Docker Compose performs variable substitution on configs.content, so +# ${DHIS2_MONITOR_USERNAME} is resolved from monitoring/.env 
at deploy time. configs: - prometheus: + prometheus_config: content: | global: scrape_interval: 15s evaluation_interval: 15s scrape_configs: + # DHIS2 application metrics — one target file per instance in targets/dhis2/ + # All instances must use the same DHIS2_MONITOR_USERNAME and DHIS2_MONITOR_PASSWORD. - job_name: 'dhis2' - static_configs: - - targets: [ 'app:8080' ] + file_sd_configs: + - files: + - /etc/prometheus/targets/dhis2/*.json + refresh_interval: 30s metrics_path: '/api/metrics' basic_auth: username: ${DHIS2_MONITOR_USERNAME} password_file: /run/secrets/dhis2_monitor_password - - job_name: 'traefik' - static_configs: - - targets: [ 'traefik:8080' ] - metrics_path: '/metrics' + + # PostgreSQL metrics — one target file per instance in targets/postgres/ - job_name: 'postgres' - static_configs: - - targets: [ 'postgres-exporter:9187' ] + file_sd_configs: + - files: + - /etc/prometheus/targets/postgres/*.json + refresh_interval: 30s + + # Host metrics (runs once for the whole server) - job_name: 'node-exporter' static_configs: - targets: [ 'node-exporter:9100' ] + + # Container metrics (runs once for the whole server) - job_name: 'cadvisor' static_configs: - targets: [ 'cadvisor:8080' ] diff --git a/stacks/monitoring/targets/dhis2/instance.json.template b/stacks/monitoring/targets/dhis2/instance.json.template new file mode 100644 index 0000000..d4a3b50 --- /dev/null +++ b/stacks/monitoring/targets/dhis2/instance.json.template @@ -0,0 +1,8 @@ +[ + { + "targets": ["${PROJECT_NAME}-app:8080"], + "labels": { + "instance": "${PROJECT_NAME}" + } + } +] diff --git a/stacks/monitoring/targets/postgres/instance.json.template b/stacks/monitoring/targets/postgres/instance.json.template new file mode 100644 index 0000000..e637f4d --- /dev/null +++ b/stacks/monitoring/targets/postgres/instance.json.template @@ -0,0 +1,8 @@ +[ + { + "targets": ["${PROJECT_NAME}-postgres-exporter:9187"], + "labels": { + "instance": "${PROJECT_NAME}" + } + } +] diff --git 
a/stacks/postgres/docker-compose.yml b/stacks/postgres/docker-compose.yml new file mode 100644 index 0000000..8544138 --- /dev/null +++ b/stacks/postgres/docker-compose.yml @@ -0,0 +1,56 @@ +x-database-image: &database-image + image: postgis/postgis:${POSTGRES_VERSION:-16-master} + +services: + database: + volumes: + - postgres:/var/lib/postgresql/data + - ../../init-scripts:/docker-entrypoint-initdb.d:ro + - ../../config/postgresql/postgresql.conf:/etc/postgresql/postgresql.conf:ro + - ../../config/postgresql/conf.d:/etc/postgresql/conf.d:ro + environment: + # -- Postgres user password + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + # -- Name of the database + POSTGRES_DB: ${POSTGRES_DB:-dhis} + # -- Database username + POSTGRES_DB_USERNAME: ${POSTGRES_DB_USERNAME} + # -- Database password + POSTGRES_DB_PASSWORD: ${POSTGRES_DB_PASSWORD} + # -- Initdb arguments + POSTGRES_INITDB_ARGS: "--auth-host=scram-sha-256 --auth-local=scram-sha-256" + # -- Metrics username + POSTGRES_METRICS_USERNAME: ${POSTGRES_METRICS_USERNAME} + # -- Metrics user password + POSTGRES_METRICS_PASSWORD: ${POSTGRES_METRICS_PASSWORD} + networks: + db: + aliases: + - database + command: > + postgres + -c config_file=/etc/postgresql/postgresql.conf + -c hba_file=/var/lib/postgresql/data/pg_hba.conf + restart: unless-stopped + healthcheck: + test: [ "CMD-SHELL", "pg_isready -U postgres -d ${POSTGRES_DB:-dhis}" ] # same default as POSTGRES_DB above, so the check still works when the variable is unset + interval: 10s + timeout: 5s + retries: 3 + user: postgres + <<: *database-image + cap_drop: + - ALL + read_only: true + security_opt: + - no-new-privileges:true + tmpfs: + - /var/run/postgresql + +networks: + db: + name: ${COMPOSE_PROJECT_NAME}-db + external: true + +volumes: + postgres: {} diff --git a/stacks/traefik/.env.template b/stacks/traefik/.env.template new file mode 100644 index 0000000..02bcddd --- /dev/null +++ b/stacks/traefik/.env.template @@ -0,0 +1,15 @@ +# Configuration for the standalone Traefik gateway stack.
+# Use scripts/generate-stack-envs.sh to generate stacks/traefik/.env — do not edit this file directly. + +# -- Email address for Let's Encrypt certificate registration and renewal notices +LETSENCRYPT_ACME_EMAIL= + +# -- Let's Encrypt CA server. Use the staging URL while testing to avoid rate limits: +# https://acme-staging-v02.api.letsencrypt.org/directory +LETSENCRYPT_ACME_CASERVER=https://acme-v02.api.letsencrypt.org/directory + +# -- Log level (DEBUG, INFO, WARN, ERROR) +LOG_LEVEL=INFO + +# -- Enable access logs (true/false) +LOG_ACCESS=true diff --git a/stacks/traefik/conf.d/instance.yml.template b/stacks/traefik/conf.d/instance.yml.template new file mode 100644 index 0000000..2e11a71 --- /dev/null +++ b/stacks/traefik/conf.d/instance.yml.template @@ -0,0 +1,36 @@ +# Generated per-instance Traefik routing configuration. +# Created by `make start-instance` — do not edit manually. +# Remove with `make stop-instance PROJECT_NAME=<name>`. + +http: + routers: + ${PROJECT_NAME}-app: + rule: "Host(`${APP_HOSTNAME}`)" + service: ${PROJECT_NAME}-app + entryPoints: + - websecure + tls: + certResolver: letsencrypt + middlewares: + - security + + ${PROJECT_NAME}-glowroot: + rule: "Host(`glowroot.${APP_HOSTNAME}`)" + service: ${PROJECT_NAME}-glowroot + entryPoints: + - websecure + tls: + certResolver: letsencrypt + middlewares: + - security + + services: + ${PROJECT_NAME}-app: + loadBalancer: + servers: + - url: "http://${PROJECT_NAME}-app:8080" + + ${PROJECT_NAME}-glowroot: + loadBalancer: + servers: + - url: "http://${PROJECT_NAME}-app:4000" diff --git a/stacks/traefik/conf.d/middlewares.yml b/stacks/traefik/conf.d/middlewares.yml new file mode 100644 index 0000000..992fb04 --- /dev/null +++ b/stacks/traefik/conf.d/middlewares.yml @@ -0,0 +1,15 @@ +# Shared middleware definitions used by all per-instance route configs. +# This file is always present in conf.d/ and loaded by the standalone Traefik stack.
+
+http:
+  middlewares:
+    security:
+      headers:
+        forceSTSHeader: true
+        stsSeconds: 63072000  # 730 days
+        stsIncludeSubdomains: true
+        stsPreload: true
+        referrerPolicy: "strict-origin-when-cross-origin"
+        frameDeny: true
+        contentTypeNosniff: true
+        browserXssFilter: true
diff --git a/stacks/traefik/conf.d/monitoring.yml.template b/stacks/traefik/conf.d/monitoring.yml.template
new file mode 100644
index 0000000..293f862
--- /dev/null
+++ b/stacks/traefik/conf.d/monitoring.yml.template
@@ -0,0 +1,20 @@
+# Traefik route for the shared monitoring stack (Grafana).
+# Generated by `make start-monitoring` — do not edit manually.
+
+http:
+  routers:
+    monitoring-grafana:
+      rule: "Host(`${GRAFANA_HOSTNAME}`)"
+      service: monitoring-grafana
+      entryPoints:
+        - websecure
+      tls:
+        certResolver: letsencrypt
+      middlewares:
+        - security
+
+  services:
+    monitoring-grafana:
+      loadBalancer:
+        servers:
+          - url: "http://monitoring-grafana:3000"
diff --git a/stacks/traefik/docker-compose.yml b/stacks/traefik/docker-compose.yml
new file mode 100644
index 0000000..cfa28f2
--- /dev/null
+++ b/stacks/traefik/docker-compose.yml
@@ -0,0 +1,86 @@
+# Standalone Traefik gateway stack for multi-instance deployments.
+#
+# Deploy this stack once before launching any DHIS2 instances:
+#   make start-traefik
+#
+# Routes are added automatically to stacks/traefik/conf.d/ by `make start-instance`
+# and removed by `make stop-instance`. Traefik watches the directory and reloads
+# +# The proxy external network must exist before starting this stack: +# make ensure-networks + +services: + traefik-init: + image: alpine:3.22 + volumes: + - cert:/cert:rw + - ../../scripts/init-cert.sh:/init-cert.sh:ro + command: [ "/init-cert.sh" ] + security_opt: + - no-new-privileges:true + + traefik: + image: traefik:v3.5 + depends_on: + traefik-init: + condition: service_completed_successfully + volumes: + - ./conf.d:/etc/traefik/conf.d:ro + - cert:/cert:rw + environment: + # -- Log level + TRAEFIK_LOG_LEVEL: ${LOG_LEVEL:-INFO} + # -- Enable access logs + TRAEFIK_ACCESSLOG: ${LOG_ACCESS:-true} + # -- Access log format + TRAEFIK_ACCESSLOG_FORMAT: ${LOG_FORMAT:-json} + # -- Allow ping + TRAEFIK_PING: true + # -- HTTP entrypoint — redirect all traffic to HTTPS + TRAEFIK_ENTRYPOINTS_WEB_ADDRESS: :80 + TRAEFIK_ENTRYPOINTS_WEB_HTTP_REDIRECTIONS_ENTRYPOINT_TO: websecure + TRAEFIK_ENTRYPOINTS_WEB_HTTP_REDIRECTIONS_ENTRYPOINT_SCHEME: https + # -- HTTPS entrypoint + TRAEFIK_ENTRYPOINTS_WEBSECURE_ADDRESS: :443 + # -- Watch conf.d/ for route changes (picks up new instances within ~1s) + TRAEFIK_PROVIDERS_FILE_DIRECTORY: /etc/traefik/conf.d + TRAEFIK_PROVIDERS_FILE_WATCH: true + # -- Disable Docker provider (no socket access) + TRAEFIK_API: false + TRAEFIK_API_INSECURE: false + # -- Enable Prometheus metrics + TRAEFIK_METRICS_PROMETHEUS: true + # -- Let's Encrypt + TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_EMAIL: ${LETSENCRYPT_ACME_EMAIL} + TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_STORAGE: /cert/acme.json + TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_TLSCHALLENGE: true + TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_CASERVER: ${LETSENCRYPT_ACME_CASERVER:-https://acme-v02.api.letsencrypt.org/directory} + ports: + - "0.0.0.0:80:80" + - "0.0.0.0:443:443" + networks: + - proxy + restart: unless-stopped + healthcheck: + test: [ "CMD", "traefik", "healthcheck" ] + interval: 10s + timeout: 3s + retries: 3 + start_period: 10s + user: nobody:nobody + cap_drop: + 
- ALL + read_only: true + security_opt: + - no-new-privileges:true + tmpfs: + - /tmp + +networks: + proxy: + name: proxy + external: true + +volumes: + cert: {} diff --git a/traefik/dynamic.yml b/stacks/traefik/dynamic.yml similarity index 97% rename from traefik/dynamic.yml rename to stacks/traefik/dynamic.yml index f06bb89..e08d45d 100644 --- a/traefik/dynamic.yml +++ b/stacks/traefik/dynamic.yml @@ -40,6 +40,7 @@ http: referrerPolicy: "strict-origin-when-cross-origin" frameDeny: true contentTypeNosniff: true + browserXssFilter: true services: app: