Files
speckle-server/packages/server/logging/knexMonitoring.ts
T
Iain Sproat a0a1363e01 fix(logging): redact detail in database error message (#2642)
- may contain secrets which should not be logged
2024-08-12 17:38:21 +01:00

274 lines
9.5 KiB
TypeScript

import prometheusClient, { type Registry } from 'prom-client'
import { numberOfFreeConnections } from '@/modules/shared/helpers/dbHelper'
import { type Knex } from 'knex'
import { Logger } from 'pino'
import { toNDecimalPlaces } from '@/modules/core/utils/formatting'
import { omit } from 'lodash'
export const initKnexPrometheusMetrics = (params: {
db: Knex
register: Registry
logger: Logger
}) => {
const normalizeSqlMethod = (sqlMethod: string) => {
if (!sqlMethod) return 'unknown'
switch (sqlMethod.toLocaleLowerCase()) {
case 'first':
return 'select'
default:
return sqlMethod.toLocaleLowerCase()
}
}
const queryStartTime: Record<string, number> = {}
const connectionAcquisitionStartTime: Record<string, number> = {}
const connectionInUseStartTime: Record<string, number> = {}
new prometheusClient.Gauge({
registers: [params.register],
name: 'speckle_server_knex_free',
help: 'Number of free DB connections',
collect() {
this.set(params.db.client.pool.numFree())
}
})
new prometheusClient.Gauge({
registers: [params.register],
name: 'speckle_server_knex_used',
help: 'Number of used DB connections',
collect() {
this.set(params.db.client.pool.numUsed())
}
})
new prometheusClient.Gauge({
registers: [params.register],
name: 'speckle_server_knex_pending',
help: 'Number of pending DB connection aquires',
collect() {
this.set(params.db.client.pool.numPendingAcquires())
}
})
new prometheusClient.Gauge({
registers: [params.register],
name: 'speckle_server_knex_pending_creates',
help: 'Number of pending DB connection creates',
collect() {
this.set(params.db.client.pool.numPendingCreates())
}
})
new prometheusClient.Gauge({
registers: [params.register],
name: 'speckle_server_knex_pending_validations',
help: 'Number of pending DB connection validations. This is a state between pending acquisition and acquiring a connection.',
collect() {
this.set(params.db.client.pool.numPendingValidations())
}
})
new prometheusClient.Gauge({
registers: [params.register],
name: 'speckle_server_knex_remaining_capacity',
help: 'Remaining capacity of the DB connection pool',
collect() {
this.set(numberOfFreeConnections(params.db))
}
})
const metricQueryDuration = new prometheusClient.Summary({
registers: [params.register],
labelNames: ['sqlMethod', 'sqlNumberBindings'],
name: 'speckle_server_knex_query_duration',
help: 'Summary of the DB query durations in seconds'
})
const metricQueryErrors = new prometheusClient.Counter({
registers: [params.register],
labelNames: ['sqlMethod', 'sqlNumberBindings'],
name: 'speckle_server_knex_query_errors',
help: 'Number of DB queries with errors'
})
const metricConnectionAcquisitionDuration = new prometheusClient.Histogram({
registers: [params.register],
name: 'speckle_server_knex_connection_acquisition_duration',
help: 'Summary of the DB connection acquisition duration, from request to acquire connection from pool until successfully acquired, in seconds'
})
const metricConnectionPoolErrors = new prometheusClient.Counter({
registers: [params.register],
name: 'speckle_server_knex_connection_acquisition_errors',
help: 'Number of DB connection pool acquisition errors'
})
const metricConnectionInUseDuration = new prometheusClient.Histogram({
registers: [params.register],
name: 'speckle_server_knex_connection_usage_duration',
help: 'Summary of the DB connection duration, from successful acquisition of connection from pool until release back to pool, in seconds'
})
const metricConnectionPoolReapingDuration = new prometheusClient.Histogram({
registers: [params.register],
name: 'speckle_server_knex_connection_pool_reaping_duration',
help: 'Summary of the DB connection pool reaping duration, in seconds. Reaping is the process of removing idle connections from the pool.'
})
// configure hooks on knex
params.db.on('query', (data) => {
const queryId = data.__knexQueryUid + ''
queryStartTime[queryId] = performance.now()
})
params.db.on('query-response', (_data, querySpec) => {
const queryId = querySpec.__knexQueryUid + ''
const durationMs = performance.now() - queryStartTime[queryId]
const durationSec = toNDecimalPlaces(durationMs / 1000, 2)
delete queryStartTime[queryId]
if (!isNaN(durationSec))
metricQueryDuration
.labels({
sqlMethod: normalizeSqlMethod(querySpec.method),
sqlNumberBindings: querySpec.bindings?.length || -1
})
.observe(durationSec)
params.logger.debug(
{
sql: querySpec.sql,
sqlMethod: normalizeSqlMethod(querySpec.method),
sqlQueryId: queryId,
sqlQueryDurationMs: toNDecimalPlaces(durationMs, 0),
sqlNumberBindings: querySpec.bindings?.length || -1
},
"DB query successfully completed, for method '{sqlMethod}', after {sqlQueryDurationMs}ms"
)
})
params.db.on('query-error', (err, querySpec) => {
const queryId = querySpec.__knexQueryUid + ''
const durationMs = performance.now() - queryStartTime[queryId]
const durationSec = toNDecimalPlaces(durationMs / 1000, 2)
delete queryStartTime[queryId]
if (!isNaN(durationSec))
metricQueryDuration
.labels({
sqlMethod: normalizeSqlMethod(querySpec.method),
sqlNumberBindings: querySpec.bindings?.length || -1
})
.observe(durationSec)
metricQueryErrors.inc()
params.logger.warn(
{
err: omit(err, 'detail'),
sql: querySpec.sql,
sqlMethod: normalizeSqlMethod(querySpec.method),
sqlQueryId: queryId,
sqlQueryDurationMs: toNDecimalPlaces(durationMs, 0),
sqlNumberBindings: querySpec.bindings?.length || -1
},
'DB query errored for {sqlMethod} after {sqlQueryDurationMs}ms'
)
})
const pool = params.db.client.pool
// configure hooks on knex connection pool
pool.on('acquireRequest', (eventId: number) => {
connectionAcquisitionStartTime[eventId] = performance.now()
// params.logger.debug(
// {
// eventId
// },
// 'DB connection acquisition request occurred.'
// )
})
pool.on('acquireSuccess', (eventId: number, resource: unknown) => {
const now = performance.now()
const durationMs = now - connectionAcquisitionStartTime[eventId]
delete connectionAcquisitionStartTime[eventId]
if (!isNaN(durationMs)) metricConnectionAcquisitionDuration.observe(durationMs)
// successful acquisition is the start of usage, so record that start time
let knexUid: string | undefined = undefined
if (resource && typeof resource === 'object' && '__knexUid' in resource) {
const _knexUid = resource['__knexUid']
if (_knexUid && typeof _knexUid === 'string') {
knexUid = _knexUid
connectionInUseStartTime[knexUid] = now
}
}
// params.logger.debug(
// {
// eventId,
// knexUid,
// connectionAcquisitionDurationMs: toNDecimalPlaces(durationMs, 0)
// },
// 'DB connection (knexUid: {knexUid}) acquired after {connectionAcquisitionDurationMs}ms'
// )
})
pool.on('acquireFail', (eventId: number, err: unknown) => {
const now = performance.now()
const durationMs = now - connectionAcquisitionStartTime[eventId]
delete connectionAcquisitionStartTime[eventId]
metricConnectionPoolErrors.inc()
params.logger.warn(
{
err,
eventId,
connectionAcquisitionDurationMs: toNDecimalPlaces(durationMs, 0)
},
'DB connection acquisition failed after {connectionAcquisitionDurationMs}ms'
)
})
// resource returned to pool
pool.on('release', (resource: unknown) => {
if (!(resource && typeof resource === 'object' && '__knexUid' in resource)) return
const knexUid = resource['__knexUid']
if (!knexUid || typeof knexUid !== 'string') return
const now = performance.now()
const durationMs = now - connectionInUseStartTime[knexUid]
if (!isNaN(durationMs)) metricConnectionInUseDuration.observe(durationMs)
// params.logger.debug(
// {
// knexUid,
// connectionInUseDurationMs: toNDecimalPlaces(durationMs, 0)
// },
// 'DB connection (knexUid: {knexUid}) released after {connectionInUseDurationMs}ms'
// )
})
// resource was created and added to the pool
// pool.on('createRequest', (eventId) => {})
// pool.on('createSuccess', (eventId, resource) => {})
// pool.on('createFail', (eventId, err) => {})
// resource is destroyed and evicted from pool
// resource may or may not be invalid when destroySuccess / destroyFail is called
// pool.on('destroyRequest', (eventId, resource) => {})
// pool.on('destroySuccess', (eventId, resource) => {})
// pool.on('destroyFail', (eventId, resource, err) => {})
// when internal reaping event clock is activated / deactivated
let reapingStartTime: number | undefined = undefined
pool.on('startReaping', () => {
reapingStartTime = performance.now()
})
pool.on('stopReaping', () => {
if (!reapingStartTime) return
const durationMs = performance.now() - reapingStartTime
if (!isNaN(durationMs)) metricConnectionPoolReapingDuration.observe(durationMs)
reapingStartTime = undefined
})
// pool is destroyed (after poolDestroySuccess all event handlers are also cleared)
// pool.on('poolDestroyRequest', (eventId) => {})
// pool.on('poolDestroySuccess', (eventId) => {})
}