From b119f2ee830fad1f4c14057d2b4c858a652ebc46 Mon Sep 17 00:00:00 2001 From: Iain Sproat <68657+iainsproat@users.noreply.github.com> Date: Tue, 9 Jul 2024 14:57:06 +0100 Subject: [PATCH] feat(metrics): collect metrics at a higher frequency than prometheus scrapes (#2479) * feat(metrics): collect metrics at a higher frequency than prometheus scrapes * metrics presented as histograms * Metric for self monitoring time taken to collect the high frequency metrics * Initiate high frequency metrics alongside default metrics * Collect metrics every 100ms --- .../highFrequencyMetrics/heapSizeAndUsed.ts | 95 +++++++++++++++++++ .../highfrequencyMonitoring.ts | 72 ++++++++++++++ .../highFrequencyMetrics/processCPUTotal.ts | 91 ++++++++++++++++++ packages/server/logging/index.js | 10 +- 4 files changed, 267 insertions(+), 1 deletion(-) create mode 100644 packages/server/logging/highFrequencyMetrics/heapSizeAndUsed.ts create mode 100644 packages/server/logging/highFrequencyMetrics/highfrequencyMonitoring.ts create mode 100644 packages/server/logging/highFrequencyMetrics/processCPUTotal.ts diff --git a/packages/server/logging/highFrequencyMetrics/heapSizeAndUsed.ts b/packages/server/logging/highFrequencyMetrics/heapSizeAndUsed.ts new file mode 100644 index 000000000..e01e9e892 --- /dev/null +++ b/packages/server/logging/highFrequencyMetrics/heapSizeAndUsed.ts @@ -0,0 +1,95 @@ +/** + Adapted from prom-client: https://github.com/siimon/prom-client/tree/master/lib/metrics + + Copyright 2015 Simon Nyberg + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and + limitations under the License. + */ + +import { Histogram, Registry } from 'prom-client' +import type { Metric } from '@/logging/highFrequencyMetrics/highfrequencyMonitoring' + +const NODEJS_HEAP_SIZE_TOTAL = 'nodejs_heap_size_total_bytes_high_frequency' +const NODEJS_HEAP_SIZE_USED = 'nodejs_heap_size_used_bytes_high_frequency' +const NODEJS_EXTERNAL_MEMORY = 'nodejs_external_memory_bytes_high_frequency' + +type BucketName = + | typeof NODEJS_HEAP_SIZE_TOTAL + | typeof NODEJS_HEAP_SIZE_USED + | typeof NODEJS_EXTERNAL_MEMORY +/** Default histogram buckets in bytes, keyed by unprefixed metric name so that config.buckets overrides line up with them. */ +const DEFAULT_NODEJS_HEAP_SIZE_BUCKETS = { + [NODEJS_HEAP_SIZE_TOTAL]: [0, 0.1e9, 0.25e9, 0.5e9, 0.75e9, 1e9, 2e9], //TODO: check if this is the right default + [NODEJS_HEAP_SIZE_USED]: [0, 0.1e9, 0.25e9, 0.5e9, 0.75e9, 1e9, 2e9], //TODO: check if this is the right default + [NODEJS_EXTERNAL_MEMORY]: [0, 0.1e9, 0.25e9, 0.5e9, 0.75e9, 1e9, 2e9] //TODO: check if this is the right default +} + +type MetricConfig = { + prefix?: string + labels?: Record<string, string> + buckets?: Partial<Record<BucketName, number[]>> +} +/** Registers heap-total, heap-used and external-memory histograms on `registry`; each collect() call observes the current process.memoryUsage() values. Bucket overrides in config.buckets are keyed by unprefixed metric name. */ +export const heapSizeAndUsed = ( + registry: Registry, + config: MetricConfig = {} +): Metric => { + const registers = registry ? [registry] : undefined + const namePrefix = config.prefix ?? '' + const labels = config.labels ?? {} + const labelNames = Object.keys(labels) + const buckets = { ...DEFAULT_NODEJS_HEAP_SIZE_BUCKETS, ...config.buckets } + + const heapSizeTotal = new Histogram({ + name: namePrefix + NODEJS_HEAP_SIZE_TOTAL, + help: 'Process heap size from Node.js in bytes. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.', + registers, + buckets: buckets[NODEJS_HEAP_SIZE_TOTAL], + labelNames + }) + const heapSizeUsed = new Histogram({ + name: namePrefix + NODEJS_HEAP_SIZE_USED, + help: 'Process heap size used from Node.js in bytes.
This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.', + registers, + buckets: buckets[NODEJS_HEAP_SIZE_USED], + labelNames + }) + const externalMemUsed = new Histogram({ + name: namePrefix + NODEJS_EXTERNAL_MEMORY, + help: 'Node.js external memory size in bytes. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.', + registers, + buckets: buckets[NODEJS_EXTERNAL_MEMORY], + labelNames + }) + + return { + collect: () => { + const memUsage = safeMemoryUsage() + if (memUsage) { + heapSizeTotal.observe(labels, memUsage.heapTotal) + heapSizeUsed.observe(labels, memUsage.heapUsed) + if (memUsage.external !== undefined) { + externalMemUsed.observe(labels, memUsage.external) + } + } + } + } +} +/** Returns process.memoryUsage(), or undefined when that call throws. */ +function safeMemoryUsage() { + try { + return process.memoryUsage() + } catch { + return + } +} diff --git a/packages/server/logging/highFrequencyMetrics/highfrequencyMonitoring.ts b/packages/server/logging/highFrequencyMetrics/highfrequencyMonitoring.ts new file mode 100644 index 000000000..6092c42e8 --- /dev/null +++ b/packages/server/logging/highFrequencyMetrics/highfrequencyMonitoring.ts @@ -0,0 +1,72 @@ +/** + * High frequency monitoring, collects data related to CPU and memory usage + * at a higher frequency than the default prometheus monitoring. It makes the data + * available to Prometheus via a histogram.
+ */ + +import { Histogram, Registry } from 'prom-client' +import { processCpuTotal } from '@/logging/highFrequencyMetrics/processCPUTotal' +import { heapSizeAndUsed } from '@/logging/highFrequencyMetrics/heapSizeAndUsed' + +type MetricConfig = { + prefix?: string + labels?: Record<string, string> + buckets?: Record<string, number[]> +} + +type HighFrequencyMonitor = { + start: () => () => void +} +/** Registers the CPU and heap collectors plus a self-timing histogram on `register`; the returned start() begins periodic collection and returns a function that stops it. */ +export const initHighFrequencyMonitoring = (params: { + register: Registry + collectionPeriodMilliseconds: number + config?: MetricConfig +}): HighFrequencyMonitor => { + const { register, collectionPeriodMilliseconds } = params + const config = params.config ?? {} + const registers = register ? [register] : undefined + const namePrefix = config.prefix ?? '' + const labels = config.labels ?? {} + const labelNames = Object.keys(labels) + + const metrics = [processCpuTotal(register, config), heapSizeAndUsed(register, config)] + + const selfMonitor = new Histogram({ + name: namePrefix + 'self_monitor_time_high_frequency', + help: 'The time taken to collect all of the high frequency metrics, seconds.', + registers, + buckets: [0, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2], + labelNames + }) + + return { + start: collectHighFrequencyMetrics({ + startSelfTimer: () => selfMonitor.startTimer(labels), // apply the configured labels, as the other collectors do + metrics, + collectionPeriodMilliseconds + }) + } +} +/** A collector that records one sample of its metric(s) each time collect() is called. */ +export interface Metric { + collect: () => void +} +/** Builds the start() function: every collectionPeriodMilliseconds it runs each collector and times the whole pass via startSelfTimer. */ +const collectHighFrequencyMetrics = (params: { + startSelfTimer: () => () => unknown + collectionPeriodMilliseconds: number + metrics: Metric[] +}) => { + const { startSelfTimer, metrics, collectionPeriodMilliseconds } = params + return () => { + const intervalId = setInterval(() => { + const end = startSelfTimer() + for (const metric of metrics) { + metric.collect() + } + end() + }, collectionPeriodMilliseconds).unref() // unref: this background timer alone must not keep the process running + return () => clearInterval(intervalId) + } +} diff --git a/packages/server/logging/highFrequencyMetrics/processCPUTotal.ts b/packages/server/logging/highFrequencyMetrics/processCPUTotal.ts new file mode 100644 index 000000000..6d1f44144 ---
/dev/null +++ b/packages/server/logging/highFrequencyMetrics/processCPUTotal.ts @@ -0,0 +1,91 @@ +/** + * Adapted from prom-client: https://github.com/siimon/prom-client/tree/master/lib/metrics + * + Copyright 2015 Simon Nyberg + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +import { Histogram, Registry } from 'prom-client' +import type { Metric } from '@/logging/highFrequencyMetrics/highfrequencyMonitoring' + +const PROCESS_CPU_USER_SECONDS = 'process_cpu_user_seconds_total_high_frequency' +const PROCESS_CPU_SYSTEM_SECONDS = 'process_cpu_system_seconds_total_high_frequency' +const PROCESS_CPU_SECONDS = 'process_cpu_seconds_total_high_frequency' + +type BucketName = + | typeof PROCESS_CPU_USER_SECONDS + | typeof PROCESS_CPU_SYSTEM_SECONDS + | typeof PROCESS_CPU_SECONDS +/** Default histogram buckets in seconds, keyed by unprefixed metric name so that config.buckets overrides line up with them. */ +const DEFAULT_CPU_TOTAL_BUCKETS = { + [PROCESS_CPU_SECONDS]: [0, 0.1, 0.25, 0.5, 0.75, 1, 2], //TODO: check if this is the right default + [PROCESS_CPU_USER_SECONDS]: [0, 0.1, 0.25, 0.5, 0.75, 1, 2], //TODO: check if this is the right default + [PROCESS_CPU_SYSTEM_SECONDS]: [0, 0.1, 0.25, 0.5, 0.75, 1, 2] //TODO: check if this is the right default +} + +type MetricConfig = { + prefix?: string + labels?: Record<string, string> + buckets?: Partial<Record<BucketName, number[]>> +} +/** Registers user, system and combined CPU-time histograms on `registry`; each collect() call observes the CPU seconds used since the previous call (the first call measures from creation). Bucket overrides in config.buckets are keyed by unprefixed metric name. */ +export const processCpuTotal = ( + registry: Registry, + config: MetricConfig = {} +): Metric => { + const registers = registry ? [registry] : undefined + const namePrefix = config.prefix ?? '' + const labels = config.labels ??
{} + const labelNames = Object.keys(labels) + const buckets = { ...DEFAULT_CPU_TOTAL_BUCKETS, ...config.buckets } + + const cpuUserUsageHistogram = new Histogram({ + name: namePrefix + PROCESS_CPU_USER_SECONDS, + help: 'Total user CPU time spent in seconds. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.', + labelNames, + buckets: buckets[PROCESS_CPU_USER_SECONDS], + registers + }) + const cpuSystemUsageHistogram = new Histogram({ + name: namePrefix + PROCESS_CPU_SYSTEM_SECONDS, + help: 'Total system CPU time spent in seconds. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.', + registers, + buckets: buckets[PROCESS_CPU_SYSTEM_SECONDS], + labelNames + }) + const cpuUsageHistogram = new Histogram({ + name: namePrefix + PROCESS_CPU_SECONDS, + help: 'Total user and system CPU time spent in seconds. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.', + registers, + buckets: buckets[PROCESS_CPU_SECONDS], // combined metric uses its own buckets, not the user-CPU ones + labelNames + }) + + let lastCpuUsage = process.cpuUsage() + + return { + collect: () => { + const cpuUsage = process.cpuUsage() + + const userUsageMicros = cpuUsage.user - lastCpuUsage.user + const systemUsageMicros = cpuUsage.system - lastCpuUsage.system + + lastCpuUsage = cpuUsage + + cpuUserUsageHistogram.observe(labels, userUsageMicros / 1e6) + cpuSystemUsageHistogram.observe(labels, systemUsageMicros / 1e6) + cpuUsageHistogram.observe(labels, (userUsageMicros + systemUsageMicros) / 1e6) + } + } +} diff --git a/packages/server/logging/index.js b/packages/server/logging/index.js index c05733da2..aec1e4594 100644 --- a/packages/server/logging/index.js +++ b/packages/server/logging/index.js @@ -5,7 +5,10 @@ const { getMachineId } = require('./machineId') const prometheusClient = require('prom-client') const promBundle = require('express-prom-bundle') -const { initKnexPrometheusMetrics } =
require('./knexMonitoring') +const { initKnexPrometheusMetrics } = require('@/logging/knexMonitoring') +const { + initHighFrequencyMonitoring +} = require('@/logging/highFrequencyMetrics/highfrequencyMonitoring') let prometheusInitialized = false @@ -20,6 +23,11 @@ module.exports = function (app) { app: 'server' }) prometheusClient.collectDefaultMetrics() + const highfrequencyMonitoring = initHighFrequencyMonitoring({ + register: prometheusClient.register, + collectionPeriodMilliseconds: 100 + }) + highfrequencyMonitoring.start() initKnexPrometheusMetrics() const expressMetricsMiddleware = promBundle({