diff --git a/packages/server/bootstrap.js b/packages/server/bootstrap.js index 0cdb3ef54..35fc7fc18 100644 --- a/packages/server/bootstrap.js +++ b/packages/server/bootstrap.js @@ -30,10 +30,6 @@ if (isApolloMonitoringEnabled() && !getApolloServerVersion()) { process.env.APOLLO_SERVER_USER_VERSION = getServerVersion() } -//knex is a singleton controlled by module so can't wait til app init -const { initOpenTelemetry } = require('./otel') -initOpenTelemetry() - // If running in test env, load .env.test first // (appRoot necessary, cause env files aren't loaded through require() calls) if (isTestEnv()) { @@ -49,6 +45,13 @@ if (isTestEnv()) { dotenv.config({ path: `${packageRoot}/.env` }) +// knex is a singleton controlled by module so can't wait til app init +const { initOpenTelemetry } = require('./otel') +initOpenTelemetry() + +const { patchKnex } = require('./modules/core/patches/knex') +patchKnex() + module.exports = { appRoot, packageRoot diff --git a/packages/server/logging/knexMonitoring.ts b/packages/server/logging/knexMonitoring.ts index 51d3eaf6e..604eb6945 100644 --- a/packages/server/logging/knexMonitoring.ts +++ b/packages/server/logging/knexMonitoring.ts @@ -5,6 +5,7 @@ import { Logger } from 'pino' import { toNDecimalPlaces } from '@/modules/core/utils/formatting' import { omit } from 'lodash' import { getRequestContext } from '@/logging/requestContext' +import { collectLongTrace } from '@speckle/shared' let metricQueryDuration: prometheusClient.Summary let metricQueryErrors: prometheusClient.Counter @@ -183,6 +184,7 @@ interface QueryEvent extends Knex.Sql { __knexUid: string __knexTxId: string __knexQueryUid: string + __stackTrace: string } const initKnexPrometheusMetricsForRegionEvents = async (params: { @@ -191,20 +193,25 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { logger: Logger }) => { const { region, db } = params - const queryStartTime: Record = {} + const queryMetadata: Record = {} const connectionAcquisitionStartTime: Record = {} const connectionInUseStartTime: Record = {} db.on('query', (data: QueryEvent) => { const queryId = data.__knexQueryUid + '' - queryStartTime[queryId] = performance.now() + queryMetadata[queryId] = { + startTime: performance.now(), + stackTrace: data.__stackTrace + } }) db.on('query-response', (_response: unknown, data: QueryEvent) => { const queryId = data.__knexQueryUid + '' - const durationMs = performance.now() - queryStartTime[queryId] + const { startTime = NaN, stackTrace = undefined } = queryMetadata[queryId] || {} + + const durationMs = performance.now() - startTime const durationSec = toNDecimalPlaces(durationMs / 1000, 2) - delete queryStartTime[queryId] + delete queryMetadata[queryId] if (!isNaN(durationSec)) metricQueryDuration .labels({ @@ -214,7 +221,6 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { }) .observe(durationSec) - const trace = (new Error().stack || '').split('\n').slice(1).join('\n').trim() const reqCtx = getRequestContext() // Update reqCtx with DB query metrics @@ -223,6 +229,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { reqCtx.dbMetrics.totalDuration += durationMs || 0 } + const trace = stackTrace || collectLongTrace() params.logger.info( { region, @@ -240,9 +247,11 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { db.on('query-error', (err: unknown, data: QueryEvent) => { const queryId = data.__knexQueryUid + '' - const durationMs = performance.now() - queryStartTime[queryId] + const { startTime = NaN, stackTrace = undefined } = queryMetadata[queryId] || {} + + const durationMs = performance.now() - startTime const durationSec = toNDecimalPlaces(durationMs / 1000, 2) - delete queryStartTime[queryId] + delete queryMetadata[queryId] if (!isNaN(durationSec)) metricQueryDuration @@ -254,7 +263,6 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { .observe(durationSec) metricQueryErrors.inc() - const trace = (new Error().stack || '').split('\n').slice(1).join('\n').trim() const reqCtx = getRequestContext() // Update reqCtx with DB query metrics @@ -263,6 +271,7 @@ const initKnexPrometheusMetricsForRegionEvents = async (params: { reqCtx.dbMetrics.totalDuration += durationMs || 0 } + const trace = stackTrace || collectLongTrace() params.logger.warn( { err: typeof err === 'object' ? omit(err, 'detail') : err, diff --git a/packages/server/modules/core/patches/knex.ts b/packages/server/modules/core/patches/knex.ts new file mode 100644 index 000000000..3db5e65ca --- /dev/null +++ b/packages/server/modules/core/patches/knex.ts @@ -0,0 +1,27 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ +import { enableImprovedKnexTelemetryStackTraces } from '@/modules/shared/helpers/envHelper' +import { collectLongTrace } from '@speckle/shared' +import knexQueryCompiler from 'knex/lib/query/querycompiler' + +/** + * ⚠️ HERE BE DARKNESS! ⚠️ + * We're monkey patching knex which is pretty cursed, but i'm not aware of any other way + * to get a proper stack trace in query event handlers + */ + +export const patchKnex = () => { + if (enableImprovedKnexTelemetryStackTraces()) { + // Preserve stack trace on query compilation + const originalToSQL = knexQueryCompiler.prototype.toSQL + const newToSQL: typeof originalToSQL = function ( + this: typeof knexQueryCompiler.prototype, + ...args: any + ) { + const ret = originalToSQL.apply(this, args) + ret.__stackTrace = collectLongTrace() + return ret + } + + knexQueryCompiler.prototype.toSQL = newToSQL + } +} diff --git a/packages/server/modules/shared/helpers/envHelper.ts b/packages/server/modules/shared/helpers/envHelper.ts index f3e2f65f5..e47f9665c 100644 --- a/packages/server/modules/shared/helpers/envHelper.ts +++ b/packages/server/modules/shared/helpers/envHelper.ts @@ -443,3 +443,7 @@ export const knexAsyncStackTracesEnabled = () => { export const asyncRequestContextEnabled = () => { return getBooleanFromEnv('ASYNC_REQUEST_CONTEXT_ENABLED') } + +export function enableImprovedKnexTelemetryStackTraces() { + return getBooleanFromEnv('KNEX_IMPROVED_TELEMETRY_STACK_TRACES') +} diff --git a/packages/server/type-augmentations/knex.d.ts b/packages/server/type-augmentations/knex.d.ts new file mode 100644 index 000000000..035be8953 --- /dev/null +++ b/packages/server/type-augmentations/knex.d.ts @@ -0,0 +1,22 @@ +// used in patches/knex.ts +declare module 'knex/lib/query/querycompiler' { + declare class QueryCompiler { + toSQL( + method: string, + tz: string + ): { + method: string + options: Record + timeout: number | false + cancelOnTimeout: boolean + bindings: unknown[] + __knexQueryUid: string + /** + * This one's patched in by us for telemetry purposes + */ + __stackTrace: string + } + } + + export default QueryCompiler +} diff --git a/packages/shared/src/core/helpers/error.ts b/packages/shared/src/core/helpers/error.ts index 3fb35b02b..3a16cdd16 100644 --- a/packages/shared/src/core/helpers/error.ts +++ b/packages/shared/src/core/helpers/error.ts @@ -20,3 +20,14 @@ export function throwUncoveredError(e: never): never { export function createUncoveredError(e: never) { return new Error(`Uncovered error case ${e}.`) } + +/** + * Note: Only V8 and Node.js support controlling the stack trace limit + */ +export const collectLongTrace = (limit?: number) => { + const originalLimit = Error.stackTraceLimit + Error.stackTraceLimit = limit || 30 + const trace = (new Error().stack || '').split('\n').slice(1).join('\n').trim() + Error.stackTraceLimit = originalLimit + return trace +} diff --git a/utils/helm/speckle-server/templates/_helpers.tpl b/utils/helm/speckle-server/templates/_helpers.tpl index fd27441fc..d4c1f877d 100644 --- a/utils/helm/speckle-server/templates/_helpers.tpl +++ b/utils/helm/speckle-server/templates/_helpers.tpl @@ -782,6 +782,11 @@ Generate the environment variables for Speckle server and Speckle objects deploy value: {{ .Values.db.knexAsyncStackTracesEnabled | quote }} {{- end}} +{{- if .Values.db.knexImprovedTelemetryStackTraces }} +- name: KNEX_IMPROVED_TELEMETRY_STACK_TRACES + value: {{ .Values.db.knexImprovedTelemetryStackTraces | quote }} +{{- end}} + - name: PGSSLMODE value: "{{ .Values.db.PGSSLMODE }}" diff --git a/utils/helm/speckle-server/values.schema.json b/utils/helm/speckle-server/values.schema.json index 093f61c13..edf79ce32 100644 --- a/utils/helm/speckle-server/values.schema.json +++ b/utils/helm/speckle-server/values.schema.json @@ -246,6 +246,11 @@ "description": "If enabled, will provide better stack traces for errors arising out of knex operations", "default": false }, + "knexImprovedTelemetryStackTraces": { + "type": "boolean", + "description": "If enabled, will provide better stack traces for knex telemetry", + "default": false + }, "databaseName": { "type": "string", "description": "(Optional) The name of the Postgres database to which Speckle will connect. Only required for the Database Monitoring utility when the connection string is to a database connection pool and multi-region is disabled, otherwise this value is ignored.", diff --git a/utils/helm/speckle-server/values.yaml b/utils/helm/speckle-server/values.yaml index e3bd1e233..dffeb5dbf 100644 --- a/utils/helm/speckle-server/values.yaml +++ b/utils/helm/speckle-server/values.yaml @@ -197,6 +197,9 @@ db: ## @param db.knexAsyncStackTracesEnabled If enabled, will provide better stack traces for errors arising out of knex operations ## knexAsyncStackTracesEnabled: false + ## @param db.knexImprovedTelemetryStackTraces If enabled, will provide better stack traces for knex telemetry + ## + knexImprovedTelemetryStackTraces: false ## @param db.databaseName (Optional) The name of the Postgres database to which Speckle will connect. Only required for the Database Monitoring utility when the connection string is to a database connection pool and multi-region is disabled, otherwise this value is ignored. databaseName: ''