feat(server): improving knex query stack traces (#3855)

* feat(server): improving knex query stack traces

* minor cleanup

* more improvements
This commit is contained in:
Kristaps Fabians Geikins
2025-01-21 11:59:09 +02:00
committed by GitHub
parent 92d5fe4066
commit 347fa4b462
9 changed files with 101 additions and 12 deletions
+7 -4
View File
@@ -30,10 +30,6 @@ if (isApolloMonitoringEnabled() && !getApolloServerVersion()) {
process.env.APOLLO_SERVER_USER_VERSION = getServerVersion()
}
//knex is a singleton controlled by module so can't wait til app init
const { initOpenTelemetry } = require('./otel')
initOpenTelemetry()
// If running in test env, load .env.test first
// (appRoot necessary, cause env files aren't loaded through require() calls)
if (isTestEnv()) {
@@ -49,6 +45,13 @@ if (isTestEnv()) {
dotenv.config({ path: `${packageRoot}/.env` })
// knex is a singleton controlled by module so can't wait til app init
const { initOpenTelemetry } = require('./otel')
initOpenTelemetry()
const { patchKnex } = require('./modules/core/patches/knex')
patchKnex()
module.exports = {
appRoot,
packageRoot
+17 -8
View File
@@ -5,6 +5,7 @@ import { Logger } from 'pino'
import { toNDecimalPlaces } from '@/modules/core/utils/formatting'
import { omit } from 'lodash'
import { getRequestContext } from '@/logging/requestContext'
import { collectLongTrace } from '@speckle/shared'
let metricQueryDuration: prometheusClient.Summary<string>
let metricQueryErrors: prometheusClient.Counter<string>
@@ -183,6 +184,7 @@ interface QueryEvent extends Knex.Sql {
__knexUid: string
__knexTxId: string
__knexQueryUid: string
__stackTrace: string
}
const initKnexPrometheusMetricsForRegionEvents = async (params: {
@@ -191,20 +193,25 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: {
logger: Logger
}) => {
const { region, db } = params
const queryStartTime: Record<string, number> = {}
const queryMetadata: Record<string, { startTime: number; stackTrace: string }> = {}
const connectionAcquisitionStartTime: Record<string, number> = {}
const connectionInUseStartTime: Record<string, number> = {}
db.on('query', (data: QueryEvent) => {
const queryId = data.__knexQueryUid + ''
queryStartTime[queryId] = performance.now()
queryMetadata[queryId] = {
startTime: performance.now(),
stackTrace: data.__stackTrace
}
})
db.on('query-response', (_response: unknown, data: QueryEvent) => {
const queryId = data.__knexQueryUid + ''
const durationMs = performance.now() - queryStartTime[queryId]
const { startTime = NaN, stackTrace = undefined } = queryMetadata[queryId] || {}
const durationMs = performance.now() - startTime
const durationSec = toNDecimalPlaces(durationMs / 1000, 2)
delete queryStartTime[queryId]
delete queryMetadata[queryId]
if (!isNaN(durationSec))
metricQueryDuration
.labels({
@@ -214,7 +221,6 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: {
})
.observe(durationSec)
const trace = (new Error().stack || '').split('\n').slice(1).join('\n').trim()
const reqCtx = getRequestContext()
// Update reqCtx with DB query metrics
@@ -223,6 +229,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: {
reqCtx.dbMetrics.totalDuration += durationMs || 0
}
const trace = stackTrace || collectLongTrace()
params.logger.info(
{
region,
@@ -240,9 +247,11 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: {
db.on('query-error', (err: unknown, data: QueryEvent) => {
const queryId = data.__knexQueryUid + ''
const durationMs = performance.now() - queryStartTime[queryId]
const { startTime = NaN, stackTrace = undefined } = queryMetadata[queryId] || {}
const durationMs = performance.now() - startTime
const durationSec = toNDecimalPlaces(durationMs / 1000, 2)
delete queryStartTime[queryId]
delete queryMetadata[queryId]
if (!isNaN(durationSec))
metricQueryDuration
@@ -254,7 +263,6 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: {
.observe(durationSec)
metricQueryErrors.inc()
const trace = (new Error().stack || '').split('\n').slice(1).join('\n').trim()
const reqCtx = getRequestContext()
// Update reqCtx with DB query metrics
@@ -263,6 +271,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: {
reqCtx.dbMetrics.totalDuration += durationMs || 0
}
const trace = stackTrace || collectLongTrace()
params.logger.warn(
{
err: typeof err === 'object' ? omit(err, 'detail') : err,
@@ -0,0 +1,27 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import { enableImprovedKnexTelemetryStackTraces } from '@/modules/shared/helpers/envHelper'
import { collectLongTrace } from '@speckle/shared'
import knexQueryCompiler from 'knex/lib/query/querycompiler'
/**
* ⚠️ HERE BE DARKNESS! ⚠️
* We're monkey patching knex which is pretty cursed, but i'm not aware of any other way
* to get a proper stack trace in query event handlers
*/
export const patchKnex = () => {
if (enableImprovedKnexTelemetryStackTraces()) {
// Preserve stack trace on query compilation
const originalToSQL = knexQueryCompiler.prototype.toSQL
const newToSQL: typeof originalToSQL = function (
this: typeof knexQueryCompiler.prototype,
...args: any
) {
const ret = originalToSQL.apply(this, args)
ret.__stackTrace = collectLongTrace()
return ret
}
knexQueryCompiler.prototype.toSQL = newToSQL
}
}
@@ -443,3 +443,7 @@ export const knexAsyncStackTracesEnabled = () => {
export const asyncRequestContextEnabled = () => {
return getBooleanFromEnv('ASYNC_REQUEST_CONTEXT_ENABLED')
}
export function enableImprovedKnexTelemetryStackTraces() {
return getBooleanFromEnv('KNEX_IMPROVED_TELEMETRY_STACK_TRACES')
}
+22
View File
@@ -0,0 +1,22 @@
// used in patches/knex.ts
declare module 'knex/lib/query/querycompiler' {
declare class QueryCompiler {
toSQL(
method: string,
tz: string
): {
method: string
options: Record<string, unknown>
timeout: number | false
cancelOnTimeout: boolean
bindings: unknown[]
__knexQueryUid: string
/**
* This one's patched in by us for telemetry purposes
*/
__stackTrace: string
}
}
export default QueryCompiler
}
+11
View File
@@ -20,3 +20,14 @@ export function throwUncoveredError(e: never): never {
export function createUncoveredError(e: never) {
return new Error(`Uncovered error case ${e}.`)
}
/**
* Note: Only V8 and Node.js support controlling the stack trace limit
*/
export const collectLongTrace = (limit?: number) => {
const originalLimit = Error.stackTraceLimit
Error.stackTraceLimit = limit || 30
const trace = (new Error().stack || '').split('\n').slice(1).join('\n').trim()
Error.stackTraceLimit = originalLimit
return trace
}
@@ -782,6 +782,11 @@ Generate the environment variables for Speckle server and Speckle objects deploy
value: {{ .Values.db.knexAsyncStackTracesEnabled | quote }}
{{- end}}
{{- if .Values.db.knexImprovedTelemetryStackTraces }}
- name: KNEX_IMPROVED_TELEMETRY_STACK_TRACES
value: {{ .Values.db.knexImprovedTelemetryStackTraces | quote }}
{{- end}}
- name: PGSSLMODE
value: "{{ .Values.db.PGSSLMODE }}"
@@ -246,6 +246,11 @@
"description": "If enabled, will provide better stack traces for errors arising out of knex operations",
"default": false
},
"knexImprovedTelemetryStackTraces": {
"type": "boolean",
"description": "If enabled, will provide better stack traces for knex telemetry",
"default": false
},
"databaseName": {
"type": "string",
"description": "(Optional) The name of the Postgres database to which Speckle will connect. Only required for the Database Monitoring utility when the connection string is to a database connection pool and multi-region is disabled, otherwise this value is ignored.",
+3
View File
@@ -197,6 +197,9 @@ db:
## @param db.knexAsyncStackTracesEnabled If enabled, will provide better stack traces for errors arising out of knex operations
##
knexAsyncStackTracesEnabled: false
## @param db.knexImprovedTelemetryStackTraces If enabled, will provide better stack traces for knex telemetry
##
knexImprovedTelemetryStackTraces: false
## @param db.databaseName (Optional) The name of the Postgres database to which Speckle will connect. Only required for the Database Monitoring utility when the connection string is to a database connection pool and multi-region is disabled, otherwise this value is ignored.
databaseName: ''