diff --git a/src/common/core/templates/docgen-metrics.md b/src/common/core/templates/docgen-metrics.md index 7491b7c1..17f6b192 100644 --- a/src/common/core/templates/docgen-metrics.md +++ b/src/common/core/templates/docgen-metrics.md @@ -6,7 +6,9 @@ sidebar_position: 20 ## Prometheus -To enable the Prometheus `/metrics` endpoint, set the `PROMETHEUS_ENABLED` environment variable to `true`. +To enable the Prometheus `/metrics` endpoint, set the `PROMETHEUS_ENABLED` environment variable to `true`. + +When enabled, Flagsmith serves the `/metrics` endpoint on port 9100. The metrics provided by Flagsmith are described below. diff --git a/src/common/gunicorn/conf.py b/src/common/gunicorn/conf.py index 8dc33502..6da7b4fc 100644 --- a/src/common/gunicorn/conf.py +++ b/src/common/gunicorn/conf.py @@ -4,6 +4,7 @@ It is used to correctly support Prometheus metrics in a multi-process environment. """ +import os import typing from prometheus_client.multiprocess import mark_process_dead @@ -13,6 +14,15 @@ from gunicorn.workers.base import Worker # type: ignore[import-untyped] -def worker_exit(server: "Arbiter", worker: "Worker") -> None: +def when_ready(server: "Arbiter") -> None: + """Start the standalone Prometheus metrics server after Gunicorn is ready.""" + prometheus_enabled = os.getenv("PROMETHEUS_ENABLED", "") + if prometheus_enabled.lower() == "true": # Django settings are not available + from common.gunicorn.metrics_server import start_metrics_server + + start_metrics_server() + + +def child_exit(server: "Arbiter", worker: "Worker") -> None: """Detach the process Prometheus metrics collector when a worker exits.""" mark_process_dead(worker.pid) # type: ignore[no-untyped-call] diff --git a/src/common/gunicorn/metrics_server.py b/src/common/gunicorn/metrics_server.py new file mode 100644 index 00000000..d7f54eec --- /dev/null +++ b/src/common/gunicorn/metrics_server.py @@ -0,0 +1,65 @@ +""" +Standalone Prometheus metrics HTTP server. + +This module provides a separate HTTP server for Prometheus metrics, +independent of the main Gunicorn application server. This improves +metrics reliability under high API load. + +The server runs in a daemon thread and serves metrics from the shared +PROMETHEUS_MULTIPROC_DIR directory. +""" + +import logging +import os +import threading + +from prometheus_client import CollectorRegistry, start_http_server +from prometheus_client.multiprocess import MultiProcessCollector + +logger = logging.getLogger(__name__) + +METRICS_SERVER_PORT = 9100 + +_server_started = False +_server_lock = threading.Lock() + + +def get_multiprocess_registry() -> CollectorRegistry: + """Create a registry configured for multiprocess metric collection.""" + registry = CollectorRegistry() + MultiProcessCollector(registry) # type: ignore[no-untyped-call] + return registry + + +def start_metrics_server( + port: int = METRICS_SERVER_PORT, +) -> None: + """ + Start the standalone Prometheus metrics HTTP server. + + This function is idempotent - calling it multiple times will only + start one server. The server runs in a daemon thread. + + Args: + port: The port to serve metrics on. Defaults to 9100. + """ + global _server_started + + with _server_lock: + if _server_started: + logger.debug("Metrics server already started") + return + + prometheus_multiproc_dir = os.environ.get("PROMETHEUS_MULTIPROC_DIR") + if not prometheus_multiproc_dir: + logger.warning("PROMETHEUS_MULTIPROC_DIR not set, skipping metrics server") + return + + registry = get_multiprocess_registry() + + try: + start_http_server(port=port, registry=registry) + _server_started = True + logger.info("Prometheus metrics server started on port %d", port) + except OSError as e: + logger.error("Failed to start metrics server on port %d: %s", port, e) diff --git a/tests/integration/core/snapshots/test_docgen__metrics__runs_expected.txt b/tests/integration/core/snapshots/test_docgen__metrics__runs_expected.txt index 85beb66f..fbb3f898 100644 --- a/tests/integration/core/snapshots/test_docgen__metrics__runs_expected.txt +++ b/tests/integration/core/snapshots/test_docgen__metrics__runs_expected.txt @@ -6,7 +6,9 @@ sidebar_position: 20 ## Prometheus -To enable the Prometheus `/metrics` endpoint, set the `PROMETHEUS_ENABLED` environment variable to `true`. +To enable the Prometheus `/metrics` endpoint, set the `PROMETHEUS_ENABLED` environment variable to `true`. + +When enabled, Flagsmith serves the `/metrics` endpoint on port 9100. The metrics provided by Flagsmith are described below. diff --git a/tests/integration/gunicorn/__init__.py b/tests/integration/gunicorn/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/gunicorn/conftest.py b/tests/integration/gunicorn/conftest.py new file mode 100644 index 00000000..b7a37437 --- /dev/null +++ b/tests/integration/gunicorn/conftest.py @@ -0,0 +1,15 @@ +from typing import Generator + +import pytest + + +@pytest.fixture(autouse=True) +def reset_metrics_server_state() -> Generator[None, None, None]: + """Reset the metrics server global state between tests.""" + from common.gunicorn import metrics_server + + metrics_server._server_started = False + + yield + + metrics_server._server_started = False diff --git a/tests/integration/gunicorn/test_metrics_server.py b/tests/integration/gunicorn/test_metrics_server.py new file mode 100644 index 00000000..ec0ec371 --- /dev/null +++ b/tests/integration/gunicorn/test_metrics_server.py @@ -0,0 +1,90 @@ +import socket +import urllib.request + +import prometheus_client +import pytest + +from common.gunicorn.metrics_server import start_metrics_server +from tests import GetLogsFixture + + +@pytest.mark.prometheus_multiprocess_mode +def test_start_metrics_server__multiprocess_mode__serves_metrics( + unused_tcp_port: int, + test_metric: prometheus_client.Counter, +) -> None: + # Given + test_metric.labels(test_name="standalone_server_test").inc() + + # When + start_metrics_server(port=unused_tcp_port) + + # Then + with urllib.request.urlopen( + f"http://localhost:{unused_tcp_port}/metrics" + ) as response: + content = response.read().decode() + + assert response.status == 200 + assert "pytest_tests_run_total" in content + assert 'test_name="standalone_server_test"' in content + + +def test_start_metrics_server__multiproc_dir_unset__logs_warning_and_skips( + get_logs: GetLogsFixture, +) -> None: + # Given + # PROMETHEUS_MULTIPROC_DIR is not set (default state) + + # When + start_metrics_server() + + # Then + logs = get_logs("common.gunicorn.metrics_server") + assert ( + "WARNING", + "PROMETHEUS_MULTIPROC_DIR not set, skipping metrics server", + ) in logs + + +@pytest.mark.prometheus_multiprocess_mode +def test_start_metrics_server__called_multiple_times__remains_idempotent( + unused_tcp_port: int, +) -> None: + # Given + start_metrics_server(port=unused_tcp_port) + + # When + start_metrics_server(port=unused_tcp_port) + start_metrics_server(port=unused_tcp_port) + + # Then + with urllib.request.urlopen( + f"http://localhost:{unused_tcp_port}/metrics" + ) as response: + assert response.status == 200 + + +@pytest.mark.prometheus_multiprocess_mode +def test_start_metrics_server__port_unavailable__logs_error( + unused_tcp_port: int, + get_logs: GetLogsFixture, +) -> None: + # Given + # Bind to 0.0.0.0 to match prometheus_client's default address + blocker = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + blocker.bind(("0.0.0.0", unused_tcp_port)) + blocker.listen(1) + + try: + # When + start_metrics_server(port=unused_tcp_port) + + # Then + logs = get_logs("common.gunicorn.metrics_server") + assert any( + level == "ERROR" and "Failed to start metrics server" in msg + for level, msg in logs + ) + finally: + blocker.close() diff --git a/tests/unit/common/gunicorn/test_conf.py b/tests/unit/common/gunicorn/test_conf.py new file mode 100644 index 00000000..0229b42d --- /dev/null +++ b/tests/unit/common/gunicorn/test_conf.py @@ -0,0 +1,60 @@ +from unittest.mock import Mock + +import pytest +from pytest_mock import MockerFixture + +from common.gunicorn.conf import child_exit, when_ready + + +def test_child_exit__calls_mark_process_dead_with_worker_pid( + mocker: MockerFixture, +) -> None: + # Given + mark_process_dead_mock = mocker.patch("common.gunicorn.conf.mark_process_dead") + server = Mock() + worker = Mock() + worker.pid = 12345 + + # When + child_exit(server, worker) + + # Then + mark_process_dead_mock.assert_called_once_with(12345) + + +@pytest.mark.parametrize("prometheus_enabled", ("true", "TRUE")) +def test_when_ready__prometheus_enabled__starts_metrics_server( + mocker: MockerFixture, + prometheus_enabled: str, +) -> None: + # Given + mocker.patch.dict("os.environ", {"PROMETHEUS_ENABLED": prometheus_enabled}) + start_metrics_server_mock = mocker.patch( + "common.gunicorn.metrics_server.start_metrics_server" + ) + server = Mock() + + # When + when_ready(server) + + # Then + start_metrics_server_mock.assert_called_once() + + +@pytest.mark.parametrize("prometheus_enabled", ("", "false")) +def test_when_ready__prometheus_disabled__does_not_start_metrics_server( + mocker: MockerFixture, + prometheus_enabled: str, +) -> None: + # Given + mocker.patch.dict("os.environ", {"PROMETHEUS_ENABLED": prometheus_enabled}) + start_metrics_server_mock = mocker.patch( + "common.gunicorn.metrics_server.start_metrics_server" + ) + server = Mock() + + # When + when_ready(server) + + # Then + start_metrics_server_mock.assert_not_called() diff --git a/tests/unit/common/gunicorn/test_utils.py b/tests/unit/common/gunicorn/test_utils.py index 4aaee909..9740f80c 100644 --- a/tests/unit/common/gunicorn/test_utils.py +++ b/tests/unit/common/gunicorn/test_utils.py @@ -44,7 +44,6 @@ def test_run_server__default_config_file__runs_expected( # Given # prevent real forking from Gunicorn mocker.patch("os.fork").return_value = 0 - mark_process_dead_mock = mocker.patch("common.gunicorn.conf.mark_process_dead") pid = os.getpid() @@ -58,9 +57,6 @@ def delay_kill(pid: int = pid) -> None: with pytest.raises(SystemExit): run_server({"bind": f"0.0.0.0:{unused_tcp_port}"}) - # Then - mark_process_dead_mock.assert_called_once_with(pid) - def test_get_route_template__returns_expected__caches_expected( mocker: MockerFixture,