From 5e4a6c563500bdc3e4e385312afc6e4ccf6392b0 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Tue, 4 Mar 2025 08:41:53 +0000 Subject: [PATCH] fix(server/prometheus): ensure consistent Prometheus registry is used (#4106) * fix(server/prometheus): ensure consistent Prometheus registry is used - there was a conflicting dependency between Metrics initialization and Modules initialization; resolved by separating registry initialization from metrics initialization - pass in the registry to prevent implicit dependency being broken - when registering a metric, first attempt to remove any of existing metrics with same name to prevent errors - to prevent sneaky uses of the implicit registry, replace default import with explicit import so it is clearer when prometheusClient.registry is used * Add tests for registering metrics --- packages/server/app.ts | 39 +++++--- packages/server/bin/ts-www | 4 +- packages/server/bin/www | 4 +- .../modules/core/graph/plugins/logging.ts | 6 +- .../metrics/apolloSubscriptionMonitoring.ts | 47 +++++----- .../express/metrics/errorMetrics.ts | 24 ++--- .../highFrequencyMetrics/heapSizeAndUsed.ts | 14 ++- .../highfrequencyMonitoring.ts | 14 +-- .../knexConnectionPool.ts | 24 ++++- .../highFrequencyMetrics/processCPUTotal.ts | 12 ++- .../httpServer/httpServerMonitoring.ts | 16 ++-- .../components/knex/knexMonitoring.ts | 92 ++++++++++++------- packages/server/observability/index.ts | 37 ++++++-- packages/server/observability/logging.ts | 1 + .../observability/tests/metrics.spec.ts | 69 ++++++++++++++ packages/server/test/graphqlHelper.ts | 2 +- 16 files changed, 291 insertions(+), 114 deletions(-) create mode 100644 packages/server/observability/tests/metrics.spec.ts diff --git a/packages/server/app.ts b/packages/server/app.ts index e656bb5cb..c8bf2ab7a 100644 --- a/packages/server/app.ts +++ b/packages/server/app.ts @@ -10,7 +10,7 @@ import 'express-async-errors' import cookieParser from 'cookie-parser' import { createTerminus } from '@godaddy/terminus' -import Metrics from '@/observability' +import Metrics, { initPrometheusRegistry } from '@/observability' import { startupLogger, shutdownLogger, @@ -23,8 +23,8 @@ import { sanitizeHeaders } from '@/observability/components/express/expressLogging' -import { errorMetricsMiddleware } from '@/observability/components/express/metrics/errorMetrics' -import prometheusClient from 'prom-client' +import { errorMetricsMiddlewareFactory } from '@/observability/components/express/metrics/errorMetrics' +import prometheusClient, { Registry } from 'prom-client' import { ApolloServer } from '@apollo/server' import { expressMiddleware } from '@apollo/server/express4' @@ -103,9 +103,11 @@ const isWsServer = (server: http.Server | MockWsServer): server is MockWsServer * is that graphql-ws uses an entirely different protocol, so the client-side has to change as well, and so old clients * will be unable to use any WebSocket/subscriptions functionality with the updated server */ -export function buildApolloSubscriptionServer( +export function buildApolloSubscriptionServer(params: { server: http.Server | MockWsServer -): SubscriptionServer { + registers?: Registry[] +}): SubscriptionServer { + const { server, registers } = params const httpServer = isWsServer(server) ? undefined : server const mockServer = isWsServer(server) ? server : undefined @@ -118,7 +120,9 @@ export function buildApolloSubscriptionServer( metricConnectedClients, metricSubscriptionTotalOperations, metricSubscriptionTotalResponses - } = initApolloSubscriptionMonitoring() + } = initApolloSubscriptionMonitoring({ + registers: registers ?? [prometheusClient.register] + }) const getHeaders = (params: { connContext?: PossiblyMockedConnectionContext @@ -256,7 +260,7 @@ export async function buildApolloServer(options?: { schema, plugins: [ statusCodePlugin, - loggingPluginFactory({ register: prometheusClient.register }), + loggingPluginFactory({ registers: [prometheusClient.register] }), ApolloServerPluginLandingPageLocalDefault({ embed: true, includeCookies: true @@ -305,6 +309,8 @@ export async function init() { startupLogger.info('🖼️ Serving for frontend-2...') const app = express() + const promRegister = initPrometheusRegistry() // has to be called before both Metrics and Modules are initialized + app.disable('x-powered-by') // Moves things along automatically on restart. @@ -352,11 +358,14 @@ export async function init() { // Metrics relies on 'regions' table in the database, so much be initialized after migrations in the main database ("migrateDbToLatest({ region: 'main'," etc..) // It also relies on the regional knex clients, which will initialize and run migrations in the respective regions. // It must be initialized after the multiregion module is initialized in ModulesSetup.init - await Metrics(app) + await Metrics({ app, registry: promRegister }) // Init HTTP server & subscription server const server = http.createServer(app) - const subscriptionServer = buildApolloSubscriptionServer(server) + const subscriptionServer = buildApolloSubscriptionServer({ + server, + registers: [promRegister] + }) // Initialize graphql server const graphqlServer = await buildApolloServer({ @@ -370,12 +379,13 @@ export async function init() { ) // At the very end adding default error handler middleware - app.use(errorMetricsMiddleware) + app.use(errorMetricsMiddlewareFactory({ promRegisters: [promRegister] })) app.use(defaultErrorHandler) return { app, graphqlServer, + registers: [promRegister], server, subscriptionServer, readinessCheck: healthchecks.isReady @@ -414,11 +424,13 @@ async function createFrontendProxy() { export async function startHttp(params: { server: http.Server app: Express + registers?: Registry[] graphqlServer: ApolloServer readinessCheck: ReadinessHandler customPortOverride?: number }) { - const { server, app, graphqlServer, readinessCheck, customPortOverride } = params + const { server, app, registers, graphqlServer, readinessCheck, customPortOverride } = + params let bindAddress = getBindAddress() // defaults to 127.0.0.1 let port = getPort() // defaults to 3000 @@ -436,7 +448,10 @@ export async function startHttp(params: { bindAddress = getBindAddress('0.0.0.0') } - monitorActiveConnections(server) + monitorActiveConnections({ + httpServer: server, + registers: registers ?? [prometheusClient.register] + }) app.set('port', port) diff --git a/packages/server/bin/ts-www b/packages/server/bin/ts-www index c7a6b71d8..361cbcb4a 100755 --- a/packages/server/bin/ts-www +++ b/packages/server/bin/ts-www @@ -11,8 +11,8 @@ const { logger } = require('../observability/logging') const { init, startHttp } = require('../app') init() - .then(({ app, graphqlServer, server, readinessCheck }) => - startHttp({ app, graphqlServer, server, readinessCheck }) + .then(({ app, graphqlServer, registers, server, readinessCheck }) => + startHttp({ app, graphqlServer, registers, server, readinessCheck }) ) .catch((err) => { logger.error(err, 'Failed to start server. Exiting with non-zero exit code...') diff --git a/packages/server/bin/www b/packages/server/bin/www index 7849628b6..31e1c2369 100755 --- a/packages/server/bin/www +++ b/packages/server/bin/www @@ -5,8 +5,8 @@ const { logger } = require('../dist/observability/logging') const { init, startHttp } = require('../dist/app') init() - .then(({ app, graphqlServer, server, readinessCheck }) => - startHttp({ app, graphqlServer, server, readinessCheck }) + .then(({ app, graphqlServer, registers, server, readinessCheck }) => + startHttp({ app, graphqlServer, registers, server, readinessCheck }) ) .catch((err) => { logger.error(err, 'Failed to start server. Exiting with non-zero exit code...') diff --git a/packages/server/modules/core/graph/plugins/logging.ts b/packages/server/modules/core/graph/plugins/logging.ts index f7c296dab..2dd2a86b6 100644 --- a/packages/server/modules/core/graph/plugins/logging.ts +++ b/packages/server/modules/core/graph/plugins/logging.ts @@ -29,14 +29,14 @@ const isFieldNode = (node: SelectionNode): node is FieldNode => node.kind === 'F let metricCallCount: Counter export const loggingPluginFactory: (deps: { - register: Registry + registers: Registry[] }) => ApolloServerPlugin = (deps) => ({ serverWillStart: async () => { - deps.register.removeSingleMetric('speckle_server_apollo_calls') + deps.registers.forEach((r) => r.removeSingleMetric('speckle_server_apollo_calls')) metricCallCount = new Counter({ name: 'speckle_server_apollo_calls', help: 'Number of calls', - registers: [deps.register], + registers: deps.registers, labelNames: ['actionName'] }) }, diff --git a/packages/server/observability/components/apollo/metrics/apolloSubscriptionMonitoring.ts b/packages/server/observability/components/apollo/metrics/apolloSubscriptionMonitoring.ts index f99f99e9e..ced3453f8 100644 --- a/packages/server/observability/components/apollo/metrics/apolloSubscriptionMonitoring.ts +++ b/packages/server/observability/components/apollo/metrics/apolloSubscriptionMonitoring.ts @@ -1,15 +1,14 @@ -import prometheusClient from 'prom-client' +import { Counter, Gauge, Registry } from 'prom-client' let apolloSubscriptionMonitoringIsInitialized = false -let metricConnectCounter: prometheusClient.Counter -let metricConnectedClients: prometheusClient.Gauge -let metricSubscriptionTotalOperations: prometheusClient.Counter<'subscriptionType'> -let metricSubscriptionTotalResponses: prometheusClient.Counter< - 'subscriptionType' | 'status' -> +let metricConnectCounter: Counter +let metricConnectedClients: Gauge +let metricSubscriptionTotalOperations: Counter<'subscriptionType'> +let metricSubscriptionTotalResponses: Counter<'subscriptionType' | 'status'> -export const initApolloSubscriptionMonitoring = () => { +export const initApolloSubscriptionMonitoring = (params: { registers: Registry[] }) => { + const { registers } = params if (apolloSubscriptionMonitoringIsInitialized) return { metricConnectCounter, @@ -19,33 +18,37 @@ export const initApolloSubscriptionMonitoring = () => { } // Init metrics - prometheusClient.register.removeSingleMetric('speckle_server_apollo_connect') - metricConnectCounter = new prometheusClient.Counter({ + registers.forEach((r) => r.removeSingleMetric('speckle_server_apollo_connect')) + metricConnectCounter = new Counter({ name: 'speckle_server_apollo_connect', - help: 'Number of connects' + help: 'Number of connects', + registers }) - prometheusClient.register.removeSingleMetric('speckle_server_apollo_clients') - metricConnectedClients = new prometheusClient.Gauge({ + registers.forEach((r) => r.removeSingleMetric('speckle_server_apollo_clients')) + metricConnectedClients = new Gauge({ name: 'speckle_server_apollo_clients', - help: 'Number of currently connected clients' + help: 'Number of currently connected clients', + registers }) - prometheusClient.register.removeSingleMetric( - 'speckle_server_apollo_graphql_total_subscription_operations' + registers.forEach((r) => + r.removeSingleMetric('speckle_server_apollo_graphql_total_subscription_operations') ) - metricSubscriptionTotalOperations = new prometheusClient.Counter({ + metricSubscriptionTotalOperations = new Counter({ name: 'speckle_server_apollo_graphql_total_subscription_operations', help: 'Number of total subscription operations served by this instance', - labelNames: ['subscriptionType'] as const + labelNames: ['subscriptionType'] as const, + registers }) - prometheusClient.register.removeSingleMetric( - 'speckle_server_apollo_graphql_total_subscription_responses' + registers.forEach((r) => + r.removeSingleMetric('speckle_server_apollo_graphql_total_subscription_responses') ) - metricSubscriptionTotalResponses = new prometheusClient.Counter({ + metricSubscriptionTotalResponses = new Counter({ name: 'speckle_server_apollo_graphql_total_subscription_responses', help: 'Number of total subscription responses served by this instance', - labelNames: ['subscriptionType', 'status'] as const + labelNames: ['subscriptionType', 'status'] as const, + registers }) apolloSubscriptionMonitoringIsInitialized = true diff --git a/packages/server/observability/components/express/metrics/errorMetrics.ts b/packages/server/observability/components/express/metrics/errorMetrics.ts index 93cc64547..169d7f6ae 100644 --- a/packages/server/observability/components/express/metrics/errorMetrics.ts +++ b/packages/server/observability/components/express/metrics/errorMetrics.ts @@ -1,22 +1,22 @@ /* istanbul ignore file */ import type { Nullable } from '@speckle/shared' -import prometheusClient from 'prom-client' -import type express from 'express' +import { Counter, type Registry } from 'prom-client' +import type { ErrorRequestHandler } from 'express' -// eslint-disable-next-line @typescript-eslint/no-explicit-any -let metricErrorCount: Nullable> = null +let metricErrorCount: Nullable> = null -export const errorMetricsMiddleware: express.ErrorRequestHandler = ( - err, - req, - res, - next -) => { +export const errorMetricsMiddlewareFactory: (params: { + promRegisters: Registry[] +}) => ErrorRequestHandler = (params) => (err, req, res, next) => { if (metricErrorCount === null) { - metricErrorCount = new prometheusClient.Counter({ + params.promRegisters.forEach((register) => { + register.removeSingleMetric('speckle_server_request_errors') + }) + metricErrorCount = new Counter({ name: 'speckle_server_request_errors', help: 'Number of requests that threw exceptions', - labelNames: ['route'] + labelNames: ['route'], + registers: params.promRegisters }) } diff --git a/packages/server/observability/components/highFrequencyMetrics/heapSizeAndUsed.ts b/packages/server/observability/components/highFrequencyMetrics/heapSizeAndUsed.ts index f820faef5..4d09bcdce 100644 --- a/packages/server/observability/components/highFrequencyMetrics/heapSizeAndUsed.ts +++ b/packages/server/observability/components/highFrequencyMetrics/heapSizeAndUsed.ts @@ -41,15 +41,17 @@ type MetricConfig = { } export const heapSizeAndUsed = ( - registry: Registry, + registers: Registry[], config: MetricConfig = {} ): Metric => { - const registers = registry ? [registry] : undefined const namePrefix = config.prefix ?? '' const labels = config.labels ?? {} const labelNames = Object.keys(labels) const buckets = { ...DEFAULT_NODEJS_HEAP_SIZE_BUCKETS, ...config.buckets } + registers.forEach((r) => { + r.removeSingleMetric(namePrefix + NODEJS_HEAP_SIZE_TOTAL) + }) const heapSizeTotal = new Histogram({ name: namePrefix + NODEJS_HEAP_SIZE_TOTAL, help: 'Process heap size from Node.js in bytes. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.', @@ -57,6 +59,10 @@ export const heapSizeAndUsed = ( buckets: buckets.NODEJS_HEAP_SIZE_TOTAL, labelNames }) + + registers.forEach((r) => { + r.removeSingleMetric(namePrefix + NODEJS_HEAP_SIZE_USED) + }) const heapSizeUsed = new Histogram({ name: namePrefix + NODEJS_HEAP_SIZE_USED, help: 'Process heap size used from Node.js in bytes. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.', @@ -64,6 +70,10 @@ export const heapSizeAndUsed = ( buckets: buckets.NODEJS_HEAP_SIZE_USED, labelNames }) + + registers.forEach((r) => { + r.removeSingleMetric(namePrefix + NODEJS_EXTERNAL_MEMORY) + }) const externalMemUsed = new Histogram({ name: namePrefix + NODEJS_EXTERNAL_MEMORY, help: 'Node.js external memory size in bytes. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.', diff --git a/packages/server/observability/components/highFrequencyMetrics/highfrequencyMonitoring.ts b/packages/server/observability/components/highFrequencyMetrics/highfrequencyMonitoring.ts index 8ede14d8e..48e69fda1 100644 --- a/packages/server/observability/components/highFrequencyMetrics/highfrequencyMonitoring.ts +++ b/packages/server/observability/components/highFrequencyMetrics/highfrequencyMonitoring.ts @@ -24,23 +24,25 @@ type HighFrequencyMonitor = { } export const initHighFrequencyMonitoring = (params: { - register: Registry + registers: Registry[] collectionPeriodMilliseconds: number config: MetricConfig }): HighFrequencyMonitor => { - const { register, collectionPeriodMilliseconds } = params + const { registers, collectionPeriodMilliseconds } = params const config = params.config - const registers = register ? [register] : undefined const namePrefix = config.prefix ?? '' const labels = config.labels ?? {} const labelNames = Object.keys(labels) const metrics = [ - processCpuTotal(register, config), - heapSizeAndUsed(register, config), - knexConnections(register, config) + processCpuTotal(registers, config), + heapSizeAndUsed(registers, config), + knexConnections(registers, config) ] + registers.forEach((r) => { + r.removeSingleMetric(namePrefix + 'self_monitor_time_high_frequency') + }) const selfMonitor = new Histogram({ name: namePrefix + 'self_monitor_time_high_frequency', help: 'The time taken to collect all of the high frequency metrics, seconds.', diff --git a/packages/server/observability/components/highFrequencyMetrics/knexConnectionPool.ts b/packages/server/observability/components/highFrequencyMetrics/knexConnectionPool.ts index 3130029b3..d0c28981e 100644 --- a/packages/server/observability/components/highFrequencyMetrics/knexConnectionPool.ts +++ b/packages/server/observability/components/highFrequencyMetrics/knexConnectionPool.ts @@ -36,13 +36,18 @@ type MetricConfig = { > } -export const knexConnections = (registry: Registry, config: MetricConfig): Metric => { - const registers = registry ? [registry] : undefined +export const knexConnections = ( + registers: Registry[], + config: MetricConfig +): Metric => { const namePrefix = config.prefix ?? '' const labels = config.labels ?? {} const labelNames = [...Object.keys(labels), 'region'] const buckets = { ...DEFAULT_KNEX_TOTAL_BUCKETS, ...config.buckets } + registers.forEach((r) => { + r.removeSingleMetric(namePrefix + KNEX_CONNECTIONS_FREE) + }) const knexConnectionsFree = new Histogram({ name: namePrefix + KNEX_CONNECTIONS_FREE, help: 'Number of free DB connections. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.', @@ -51,6 +56,9 @@ export const knexConnections = (registry: Registry, config: MetricConfig): Metri labelNames }) + registers.forEach((r) => { + r.removeSingleMetric(namePrefix + KNEX_CONNECTIONS_USED) + }) const knexConnectionsUsed = new Histogram({ name: namePrefix + KNEX_CONNECTIONS_USED, help: 'Number of used DB connections', @@ -59,6 +67,9 @@ export const knexConnections = (registry: Registry, config: MetricConfig): Metri labelNames }) + registers.forEach((r) => { + r.removeSingleMetric(namePrefix + KNEX_PENDING_ACQUIRES) + }) const knexPendingAcquires = new Histogram({ name: namePrefix + KNEX_PENDING_ACQUIRES, help: 'Number of pending DB connection aquires', @@ -67,6 +78,9 @@ export const knexConnections = (registry: Registry, config: MetricConfig): Metri labelNames }) + registers.forEach((r) => { + r.removeSingleMetric(namePrefix + KNEX_PENDING_CREATES) + }) const knexPendingCreates = new Histogram({ name: namePrefix + KNEX_PENDING_CREATES, help: 'Number of pending DB connection creates', @@ -75,6 +89,9 @@ export const knexConnections = (registry: Registry, config: MetricConfig): Metri labelNames }) + registers.forEach((r) => { + r.removeSingleMetric(namePrefix + KNEX_PENDING_VALIDATIONS) + }) const knexPendingValidations = new Histogram({ name: namePrefix + KNEX_PENDING_VALIDATIONS, help: 'Number of pending DB connection validations. This is a state between pending acquisition and acquiring a connection.', @@ -83,6 +100,9 @@ export const knexConnections = (registry: Registry, config: MetricConfig): Metri labelNames }) + registers.forEach((r) => { + r.removeSingleMetric(namePrefix + KNEX_REMAINING_CAPACITY) + }) const knexRemainingCapacity = new Histogram({ name: namePrefix + KNEX_REMAINING_CAPACITY, help: 'Remaining capacity of the DB connection pool', diff --git a/packages/server/observability/components/highFrequencyMetrics/processCPUTotal.ts b/packages/server/observability/components/highFrequencyMetrics/processCPUTotal.ts index b5dacbd7a..5e1fa9119 100644 --- a/packages/server/observability/components/highFrequencyMetrics/processCPUTotal.ts +++ b/packages/server/observability/components/highFrequencyMetrics/processCPUTotal.ts @@ -41,15 +41,17 @@ type MetricConfig = { } export const processCpuTotal = ( - registry: Registry, + registers: Registry[], config: MetricConfig = {} ): Metric => { - const registers = registry ? [registry] : undefined const namePrefix = config.prefix ?? '' const labels = config.labels ?? {} const labelNames = Object.keys(labels) const buckets = { ...DEFAULT_CPU_TOTAL_BUCKETS, ...config.buckets } + registers.forEach((r) => { + r.removeSingleMetric(namePrefix + PROCESS_CPU_USER_SECONDS) + }) const cpuUserUsageHistogram = new Histogram({ name: namePrefix + PROCESS_CPU_USER_SECONDS, help: 'Total user CPU time spent in seconds. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.', @@ -57,6 +59,9 @@ export const processCpuTotal = ( buckets: buckets.PROCESS_CPU_USER_SECONDS, registers }) + registers.forEach((r) => { + r.removeSingleMetric(namePrefix + PROCESS_CPU_SYSTEM_SECONDS) + }) const cpuSystemUsageHistogram = new Histogram({ name: namePrefix + PROCESS_CPU_SYSTEM_SECONDS, help: 'Total system CPU time spent in seconds. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.', @@ -64,6 +69,9 @@ export const processCpuTotal = ( buckets: buckets.PROCESS_CPU_SYSTEM_SECONDS, labelNames }) + registers.forEach((r) => { + r.removeSingleMetric(namePrefix + PROCESS_CPU_SECONDS) + }) const cpuUsageHistogram = new Histogram({ name: namePrefix + PROCESS_CPU_SECONDS, help: 'Total user and system CPU time spent in seconds. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.', diff --git a/packages/server/observability/components/httpServer/httpServerMonitoring.ts b/packages/server/observability/components/httpServer/httpServerMonitoring.ts index afa551b03..e084d6888 100644 --- a/packages/server/observability/components/httpServer/httpServerMonitoring.ts +++ b/packages/server/observability/components/httpServer/httpServerMonitoring.ts @@ -1,17 +1,21 @@ /* istanbul ignore file */ import type { Nullable } from '@speckle/shared' -import prometheusClient from 'prom-client' -import type http from 'http' +import { Registry, Gauge } from 'prom-client' +import type { Server } from 'http' // eslint-disable-next-line @typescript-eslint/no-explicit-any -let metricActiveConnections: Nullable> = null +let metricActiveConnections: Nullable> = null -export const monitorActiveConnections = (httpServer: http.Server) => { +export const monitorActiveConnections = (params: { + httpServer: Server + registers: Registry[] +}) => { + const { httpServer, registers } = params if (metricActiveConnections !== null) { - prometheusClient.register.removeSingleMetric('speckle_server_active_connections') + registers.forEach((r) => r.removeSingleMetric('speckle_server_active_connections')) } - metricActiveConnections = new prometheusClient.Gauge({ + metricActiveConnections = new Gauge({ name: 'speckle_server_active_connections', help: 'Number of active http connections', async collect() { diff --git a/packages/server/observability/components/knex/knexMonitoring.ts b/packages/server/observability/components/knex/knexMonitoring.ts index 044cc70ff..8e4981d21 100644 --- a/packages/server/observability/components/knex/knexMonitoring.ts +++ b/packages/server/observability/components/knex/knexMonitoring.ts @@ -1,4 +1,4 @@ -import prometheusClient, { type Registry } from 'prom-client' +import { type Registry, Summary, Counter, Gauge, Histogram } from 'prom-client' import { numberOfFreeConnections } from '@/modules/shared/helpers/dbHelper' import { type Knex } from 'knex' import { Logger } from 'pino' @@ -7,12 +7,12 @@ import { omit } from 'lodash' import { getRequestContext } from '@/observability/components/express/requestContext' import { collectLongTrace } from '@speckle/shared' -let metricQueryDuration: prometheusClient.Summary -let metricQueryErrors: prometheusClient.Counter -let metricConnectionAcquisitionDuration: prometheusClient.Histogram -let metricConnectionPoolErrors: prometheusClient.Counter -let metricConnectionInUseDuration: prometheusClient.Histogram -let metricConnectionPoolReapingDuration: prometheusClient.Histogram +let metricQueryDuration: Summary +let metricQueryErrors: Counter +let metricConnectionAcquisitionDuration: Histogram +let metricConnectionPoolErrors: Counter +let metricConnectionInUseDuration: Histogram +let metricConnectionPoolReapingDuration: Histogram const initializedRegions: string[] = [] let initializedPollingMetrics = false @@ -20,13 +20,16 @@ export const initKnexPrometheusMetrics = async (params: { getAllDbClients: () => Promise< Array<{ client: Knex; isMain: boolean; regionKey: string }> > - register: Registry + registers: Registry[] logger: Logger }) => { + const { registers } = params if (!initializedPollingMetrics) { initializedPollingMetrics = true - new prometheusClient.Gauge({ - registers: [params.register], + + registers.forEach((r) => r.removeSingleMetric('speckle_server_knex_free')) + new Gauge({ + registers, name: 'speckle_server_knex_free', labelNames: ['region'], help: 'Number of free DB connections', @@ -40,8 +43,9 @@ export const initKnexPrometheusMetrics = async (params: { } }) - new prometheusClient.Gauge({ - registers: [params.register], + registers.forEach((r) => r.removeSingleMetric('speckle_server_knex_used')) + new Gauge({ + registers, name: 'speckle_server_knex_used', labelNames: ['region'], help: 'Number of used DB connections', @@ -55,8 +59,9 @@ export const initKnexPrometheusMetrics = async (params: { } }) - new prometheusClient.Gauge({ - registers: [params.register], + registers.forEach((r) => r.removeSingleMetric('speckle_server_knex_pending')) + new Gauge({ + registers, name: 'speckle_server_knex_pending', labelNames: ['region'], help: 'Number of pending DB connection aquires', @@ -70,8 +75,11 @@ export const initKnexPrometheusMetrics = async (params: { } }) - new prometheusClient.Gauge({ - registers: [params.register], + registers.forEach((r) => + r.removeSingleMetric('speckle_server_knex_pending_creates') + ) + new Gauge({ + registers, name: 'speckle_server_knex_pending_creates', labelNames: ['region'], help: 'Number of pending DB connection creates', @@ -85,8 +93,11 @@ export const initKnexPrometheusMetrics = async (params: { } }) - new prometheusClient.Gauge({ - registers: [params.register], + registers.forEach((r) => + r.removeSingleMetric('speckle_server_knex_pending_validations') + ) + new Gauge({ + registers, name: 'speckle_server_knex_pending_validations', labelNames: ['region'], help: 'Number of pending DB connection validations. This is a state between pending acquisition and acquiring a connection.', @@ -100,8 +111,11 @@ export const initKnexPrometheusMetrics = async (params: { } }) - new prometheusClient.Gauge({ - registers: [params.register], + registers.forEach((r) => + r.removeSingleMetric('speckle_server_knex_remaining_capacity') + ) + new Gauge({ + registers, name: 'speckle_server_knex_remaining_capacity', labelNames: ['region'], help: 'Remaining capacity of the DB connection pool', @@ -115,43 +129,57 @@ export const initKnexPrometheusMetrics = async (params: { } }) - metricQueryDuration = new prometheusClient.Summary({ - registers: [params.register], + registers.forEach((r) => r.removeSingleMetric('speckle_server_knex_query_duration')) + metricQueryDuration = new Summary({ + registers, labelNames: ['sqlMethod', 'sqlNumberBindings', 'region'], name: 'speckle_server_knex_query_duration', help: 'Summary of the DB query durations in seconds' }) - metricQueryErrors = new prometheusClient.Counter({ - registers: [params.register], + registers.forEach((r) => r.removeSingleMetric('speckle_server_knex_query_errors')) + metricQueryErrors = new Counter({ + registers, labelNames: ['sqlMethod', 'sqlNumberBindings', 'region'], name: 'speckle_server_knex_query_errors', help: 'Number of DB queries with errors' }) - metricConnectionAcquisitionDuration = new prometheusClient.Histogram({ - registers: [params.register], + registers.forEach((r) => + r.removeSingleMetric('speckle_server_knex_connection_acquisition_duration') + ) + metricConnectionAcquisitionDuration = new Histogram({ + registers, name: 'speckle_server_knex_connection_acquisition_duration', labelNames: ['region'], help: 'Summary of the DB connection acquisition duration, from request to acquire connection from pool until successfully acquired, in seconds' }) - metricConnectionPoolErrors = new prometheusClient.Counter({ - registers: [params.register], + registers.forEach((r) => + r.removeSingleMetric('speckle_server_knex_connection_acquisition_errors') + ) + metricConnectionPoolErrors = new Counter({ + registers, name: 'speckle_server_knex_connection_acquisition_errors', labelNames: ['region'], help: 'Number of DB connection pool acquisition errors' }) - metricConnectionInUseDuration = new prometheusClient.Histogram({ - registers: [params.register], + registers.forEach((r) => + r.removeSingleMetric('speckle_server_knex_connection_usage_duration') + ) + metricConnectionInUseDuration = new Histogram({ + registers, name: 'speckle_server_knex_connection_usage_duration', labelNames: ['region'], help: 'Summary of the DB connection duration, from successful acquisition of connection from pool until release back to pool, in seconds' }) - metricConnectionPoolReapingDuration = new prometheusClient.Histogram({ - registers: [params.register], + registers.forEach((r) => + r.removeSingleMetric('speckle_server_knex_connection_pool_reaping_duration') + ) + metricConnectionPoolReapingDuration = new Histogram({ + registers, name: 'speckle_server_knex_connection_pool_reaping_duration', labelNames: ['region'], help: 'Summary of the DB connection pool reaping duration, in seconds. Reaping is the process of removing idle connections from the pool.' diff --git a/packages/server/observability/index.ts b/packages/server/observability/index.ts index e38237b22..e5f93e0a6 100644 --- a/packages/server/observability/index.ts +++ b/packages/server/observability/index.ts @@ -1,5 +1,5 @@ /* istanbul ignore file */ -import prometheusClient from 'prom-client' +import prometheusClient, { Registry } from 'prom-client' import promBundle from 'express-prom-bundle' import { initKnexPrometheusMetrics } from '@/observability/components/knex/knexMonitoring' @@ -10,18 +10,34 @@ import type express from 'express' import { getAllRegisteredDbClients } from '@/modules/multiregion/utils/dbSelector' let prometheusInitialized = false +let prometheusRegistryInitialized = false -export default async function (app: express.Express) { - if (!prometheusInitialized) { - prometheusInitialized = true +/** + * This has to be called prior to using Prometheus + * @returns The registry of Prometheus metrics which will be served + */ +export function initPrometheusRegistry() { + if (!prometheusRegistryInitialized) { + prometheusRegistryInitialized = true prometheusClient.register.clear() prometheusClient.register.setDefaultLabels({ project: 'speckle-server', app: 'server' }) - prometheusClient.collectDefaultMetrics() + } + + return prometheusClient.register +} + +export default async function (params: { app: express.Express; registry: Registry }) { + const { app, registry } = params + if (!prometheusInitialized) { + prometheusInitialized = true + prometheusClient.collectDefaultMetrics({ + register: registry + }) const highfrequencyMonitoring = initHighFrequencyMonitoring({ - register: prometheusClient.register, + registers: [registry], collectionPeriodMilliseconds: highFrequencyMetricsCollectionPeriodMs(), config: { getDbClients: getAllRegisteredDbClients @@ -30,7 +46,7 @@ export default async function (app: express.Express) { highfrequencyMonitoring.start() await initKnexPrometheusMetrics({ - register: prometheusClient.register, + registers: [registry], getAllDbClients: getAllRegisteredDbClients, logger }) @@ -41,7 +57,8 @@ export default async function (app: express.Express) { includePath: true, httpDurationMetricName: 'speckle_server_request_duration', metricType: 'summary', - autoregister: false + autoregister: false, + promRegistry: registry }) ) } @@ -49,8 +66,8 @@ export default async function (app: express.Express) { // Expose prometheus metrics app.get('/metrics', async (req, res, next) => { try { - res.set('Content-Type', prometheusClient.register.contentType) - res.end(await prometheusClient.register.metrics()) + res.set('Content-Type', registry.contentType) + res.end(await registry.metrics()) } catch (ex: unknown) { res.status(500).end(ex instanceof Error ? ex.message : `${ex}`) next(ex) diff --git a/packages/server/observability/logging.ts b/packages/server/observability/logging.ts index 8e0af6f63..a0020aac6 100644 --- a/packages/server/observability/logging.ts +++ b/packages/server/observability/logging.ts @@ -35,6 +35,7 @@ export const testLogger = extendLoggerComponent(logger, 'test') export const fileUploadsLogger = extendLoggerComponent(logger, 'file-uploads') export const emailLogger = extendLoggerComponent(logger, 'email') export const taskSchedulerLogger = extendLoggerComponent(logger, 'task-scheduler') +export const previewLogger = extendLoggerComponent(logger, 'preview') export type Logger = typeof logger export { extendLoggerComponent, Observability } diff --git a/packages/server/observability/tests/metrics.spec.ts b/packages/server/observability/tests/metrics.spec.ts new file mode 100644 index 000000000..0a71d3978 --- /dev/null +++ b/packages/server/observability/tests/metrics.spec.ts @@ -0,0 +1,69 @@ +import { beforeEachContext, initializeTestServer } from '@/test/hooks' +import { expect } from 'chai' + +describe('Observability', () => { + describe('Metrics', () => { + let serverAddress: string + before(async () => { + const ctx = await beforeEachContext() + ;({ serverAddress } = await initializeTestServer(ctx)) + }) + + describe('Register metrics', () => { + let metricsPageBody = '' + before(async () => { + const metricsResponse = await fetch(`${serverAddress}/metrics`, { + method: 'GET' + }) + metricsPageBody = await metricsResponse.text() + }) + const testCases = [ + 'speckle_server_apollo_calls', + 'speckle_server_request_duration', + 'speckle_server_request_errors', + 'speckle_server_active_connections', + 'speckle_server_apollo_connect', + 'speckle_server_apollo_clients', + 'speckle_server_apollo_graphql_total_subscription_operations', + 'speckle_server_apollo_graphql_total_subscription_responses', + 'speckle_server_active_connections', + 'speckle_server_knex_free', + 'speckle_server_knex_used', + 'speckle_server_knex_pending', + 'speckle_server_knex_pending_creates', + 'speckle_server_knex_pending_validations', + 'speckle_server_knex_remaining_capacity', + 'speckle_server_knex_query_duration', + 'speckle_server_knex_query_errors', + 'speckle_server_knex_connection_acquisition_duration', + 'speckle_server_knex_connection_acquisition_errors', + 'speckle_server_knex_connection_usage_duration', + 'speckle_server_knex_connection_pool_reaping_duration', + 'nodejs_heap_size_total_bytes_high_frequency', + 'nodejs_heap_size_used_bytes_high_frequency', + 'nodejs_external_memory_bytes_high_frequency', + 'self_monitor_time_high_frequency', + 'knex_connections_free_high_frequency', + 'knex_connections_used_high_frequency', + 'knex_pending_acquires_high_frequency', + 'knex_pending_creates_high_frequency', + 'knex_pending_validations_high_frequency', + 'knex_remaining_capacity_high_frequency', + 'process_cpu_user_seconds_total_high_frequency', + 'process_cpu_system_seconds_total_high_frequency', + 'process_cpu_seconds_total_high_frequency' + ] + + testCases.forEach((testCase) => + it(`should register metric ${testCase}`, async () => { + const re = new RegExp(String.raw`(^${testCase}.*)\}\s([\d]+)$`, 'gm') + const match = [...metricsPageBody.matchAll(re)] + if (!match) { + expect(match).not.to.be.null + return '' //HACK force correct type below + } + }) + ) + }) + }) +}) diff --git a/packages/server/test/graphqlHelper.ts b/packages/server/test/graphqlHelper.ts index 00420dbca..aabaf89f5 100644 --- a/packages/server/test/graphqlHelper.ts +++ b/packages/server/test/graphqlHelper.ts @@ -290,7 +290,7 @@ export const testApolloSubscriptionServer = async () => { set(mockWsServer, 'removeListener', mockWsServer.off.bind(mockWsServer)) // backwards compat w/ subscriptions-transport-ws const mockWs = MockSocket.WebSocket as unknown as ws.WebSocket - const apolloSubServer = buildApolloSubscriptionServer(mockWsServer) + const apolloSubServer = buildApolloSubscriptionServer({ server: mockWsServer }) // weakRef to ensure we dont prevent garbage collection const clients: WeakRef[] = []