Skip to content
4 changes: 3 additions & 1 deletion src/common/core/templates/docgen-metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ sidebar_position: 20

## Prometheus

To enable the Prometheus `/metrics` endpoint, set the `PROMETHEUS_ENABLED` environment variable to `true`.
To enable the Prometheus `/metrics` endpoint, set the `PROMETHEUS_ENABLED` environment variable to `true`.

When enabled, Flagsmith serves the `/metrics` endpoint on port 9100.

The metrics provided by Flagsmith are described below.

Expand Down
12 changes: 11 additions & 1 deletion src/common/gunicorn/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
It is used to correctly support Prometheus metrics in a multi-process environment.
"""

import os
import typing

from prometheus_client.multiprocess import mark_process_dead
Expand All @@ -13,6 +14,15 @@
from gunicorn.workers.base import Worker # type: ignore[import-untyped]


def worker_exit(server: "Arbiter", worker: "Worker") -> None:
def when_ready(server: "Arbiter") -> None:
    """Launch the standalone Prometheus metrics server once Gunicorn is ready.

    The server is started only when the ``PROMETHEUS_ENABLED`` environment
    variable equals ``true``, compared case-insensitively.
    """
    # Django settings are not available, so the environment is read directly.
    flag = os.getenv("PROMETHEUS_ENABLED", "")
    if flag.lower() != "true":
        return

    # Imported only on the enabled path.
    from common.gunicorn.metrics_server import start_metrics_server

    start_metrics_server()


def child_exit(server: "Arbiter", worker: "Worker") -> None:
    """Mark the exited worker's process as dead for the Prometheus collector.

    ``server`` is part of the hook signature and is not used here.
    """
    exited_pid = worker.pid
    mark_process_dead(exited_pid)  # type: ignore[no-untyped-call]
65 changes: 65 additions & 0 deletions src/common/gunicorn/metrics_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""
Standalone Prometheus metrics HTTP server.

This module provides a separate HTTP server for Prometheus metrics,
independent of the main Gunicorn application server. This improves
metrics reliability under high API load.

The server runs in a daemon thread and serves metrics from the shared
PROMETHEUS_MULTIPROC_DIR directory.
"""

import logging
import os
import threading

from prometheus_client import CollectorRegistry, start_http_server
from prometheus_client.multiprocess import MultiProcessCollector

logger = logging.getLogger(__name__)

METRICS_SERVER_PORT = 9100

_server_started = False
_server_lock = threading.Lock()


def get_multiprocess_registry() -> CollectorRegistry:
    """Build a new registry set up for multiprocess metric collection."""
    multiprocess_registry = CollectorRegistry()
    # The collector is constructed with the registry as its argument;
    # the collector instance itself is not needed afterwards.
    MultiProcessCollector(multiprocess_registry)  # type: ignore[no-untyped-call]
    return multiprocess_registry


def start_metrics_server(
    port: int = METRICS_SERVER_PORT,
) -> None:
    """
    Launch the standalone Prometheus metrics HTTP server.

    Repeated calls are safe: once a server has been started successfully,
    later calls log a debug message and return. Nothing is started when
    ``PROMETHEUS_MULTIPROC_DIR`` is unset or empty. An ``OSError`` raised
    while starting the server is logged and not re-raised.

    Args:
        port: The port to serve metrics on. Defaults to 9100.
    """
    global _server_started

    # The lock serialises the check-and-start sequence.
    with _server_lock:
        if _server_started:
            logger.debug("Metrics server already started")
            return

        if not os.environ.get("PROMETHEUS_MULTIPROC_DIR"):
            logger.warning("PROMETHEUS_MULTIPROC_DIR not set, skipping metrics server")
            return

        multiprocess_registry = get_multiprocess_registry()

        try:
            start_http_server(port=port, registry=multiprocess_registry)
            # Only flag the server as started once the call above succeeded,
            # so a failed attempt can be retried by a later call.
            _server_started = True
            logger.info("Prometheus metrics server started on port %d", port)
        except OSError as exc:
            logger.error("Failed to start metrics server on port %d: %s", port, exc)
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ sidebar_position: 20

## Prometheus

To enable the Prometheus `/metrics` endpoint, set the `PROMETHEUS_ENABLED` environment variable to `true`.
To enable the Prometheus `/metrics` endpoint, set the `PROMETHEUS_ENABLED` environment variable to `true`.

When enabled, Flagsmith serves the `/metrics` endpoint on port 9100.

The metrics provided by Flagsmith are described below.

Expand Down
Empty file.
15 changes: 15 additions & 0 deletions tests/integration/gunicorn/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import Generator

import pytest


@pytest.fixture(autouse=True)
def reset_metrics_server_state() -> Generator[None, None, None]:
    """Clear the metrics server's module-level started flag around each test."""
    import common.gunicorn.metrics_server as server_module

    # Before the test: pretend no server has been started yet.
    server_module._server_started = False

    yield

    # After the test: leave a clean flag behind for the next test.
    server_module._server_started = False
90 changes: 90 additions & 0 deletions tests/integration/gunicorn/test_metrics_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import socket
import urllib.request

import prometheus_client
import pytest

from common.gunicorn.metrics_server import start_metrics_server
from tests import GetLogsFixture


@pytest.mark.prometheus_multiprocess_mode
def test_start_metrics_server__multiprocess_mode__serves_metrics(
    unused_tcp_port: int,
    test_metric: prometheus_client.Counter,
) -> None:
    """The standalone server exposes a metric incremented before startup."""
    # Given
    test_metric.labels(test_name="standalone_server_test").inc()
    metrics_url = f"http://localhost:{unused_tcp_port}/metrics"

    # When
    start_metrics_server(port=unused_tcp_port)

    # Then
    with urllib.request.urlopen(metrics_url) as response:
        body = response.read().decode()

    assert response.status == 200
    assert "pytest_tests_run_total" in body
    assert 'test_name="standalone_server_test"' in body


def test_start_metrics_server__multiproc_dir_unset__logs_warning_and_skips(
    get_logs: GetLogsFixture,
) -> None:
    """A warning is logged when PROMETHEUS_MULTIPROC_DIR is not set."""
    # Given
    # PROMETHEUS_MULTIPROC_DIR is not set (default state)
    expected_record = (
        "WARNING",
        "PROMETHEUS_MULTIPROC_DIR not set, skipping metrics server",
    )

    # When
    start_metrics_server()

    # Then
    captured = get_logs("common.gunicorn.metrics_server")
    assert expected_record in captured


@pytest.mark.prometheus_multiprocess_mode
def test_start_metrics_server__called_multiple_times__remains_idempotent(
    unused_tcp_port: int,
) -> None:
    """Calling the starter again after a successful start keeps the server up."""
    # Given
    start_metrics_server(port=unused_tcp_port)
    metrics_url = f"http://localhost:{unused_tcp_port}/metrics"

    # When
    for _ in range(2):
        start_metrics_server(port=unused_tcp_port)

    # Then
    with urllib.request.urlopen(metrics_url) as response:
        assert response.status == 200


@pytest.mark.prometheus_multiprocess_mode
def test_start_metrics_server__port_unavailable__logs_error(
    unused_tcp_port: int,
    get_logs: GetLogsFixture,
) -> None:
    """An error is logged when the requested port is already taken."""
    # Given
    # The socket is used as a context manager so it is closed even if
    # bind() or listen() raises, not only after the assertions have run.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as blocker:
        # Bind to 0.0.0.0 to match prometheus_client's default address
        blocker.bind(("0.0.0.0", unused_tcp_port))
        blocker.listen(1)

        # When
        start_metrics_server(port=unused_tcp_port)

        # Then
        logs = get_logs("common.gunicorn.metrics_server")
        assert any(
            level == "ERROR" and "Failed to start metrics server" in msg
            for level, msg in logs
        )
60 changes: 60 additions & 0 deletions tests/unit/common/gunicorn/test_conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from unittest.mock import Mock

import pytest
from pytest_mock import MockerFixture

from common.gunicorn.conf import child_exit, when_ready


def test_child_exit__calls_mark_process_dead_with_worker_pid(
    mocker: MockerFixture,
) -> None:
    """``child_exit`` forwards the worker's pid to ``mark_process_dead``."""
    # Given
    patched_mark_dead = mocker.patch("common.gunicorn.conf.mark_process_dead")
    arbiter = Mock()
    exited_worker = Mock(pid=12345)

    # When
    child_exit(arbiter, exited_worker)

    # Then
    patched_mark_dead.assert_called_once_with(12345)


@pytest.mark.parametrize("prometheus_enabled", ("true", "TRUE"))
def test_when_ready__prometheus_enabled__starts_metrics_server(
    mocker: MockerFixture,
    prometheus_enabled: str,
) -> None:
    """The metrics server is started for any casing of ``true``."""
    # Given
    env_overrides = {"PROMETHEUS_ENABLED": prometheus_enabled}
    mocker.patch.dict("os.environ", env_overrides)
    patched_start = mocker.patch(
        "common.gunicorn.metrics_server.start_metrics_server"
    )
    arbiter = Mock()

    # When
    when_ready(arbiter)

    # Then
    patched_start.assert_called_once()


@pytest.mark.parametrize("prometheus_enabled", ("", "false"))
def test_when_ready__prometheus_disabled__does_not_start_metrics_server(
    mocker: MockerFixture,
    prometheus_enabled: str,
) -> None:
    """The metrics server stays off when the flag is empty or ``false``."""
    # Given
    env_overrides = {"PROMETHEUS_ENABLED": prometheus_enabled}
    mocker.patch.dict("os.environ", env_overrides)
    patched_start = mocker.patch(
        "common.gunicorn.metrics_server.start_metrics_server"
    )
    arbiter = Mock()

    # When
    when_ready(arbiter)

    # Then
    patched_start.assert_not_called()
4 changes: 0 additions & 4 deletions tests/unit/common/gunicorn/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ def test_run_server__default_config_file__runs_expected(
# Given
# prevent real forking from Gunicorn
mocker.patch("os.fork").return_value = 0
mark_process_dead_mock = mocker.patch("common.gunicorn.conf.mark_process_dead")

pid = os.getpid()

Expand All @@ -58,9 +57,6 @@ def delay_kill(pid: int = pid) -> None:
with pytest.raises(SystemExit):
run_server({"bind": f"0.0.0.0:{unused_tcp_port}"})

# Then
mark_process_dead_mock.assert_called_once_with(pid)


def test_get_route_template__returns_expected__caches_expected(
mocker: MockerFixture,
Expand Down