feat(metrics): collect metrics at a higher frequency than prometheus scrapes (#2479)

* feat(metrics): collect metrics at a higher frequency than prometheus scrapes
* metrics presented as histograms
* Metric for self monitoring time taken to collect the high frequency metrics
* Initiate high frequency metrics alongside default metrics
* Collect metrics every 100ms
This commit is contained in:
Iain Sproat
2024-07-09 14:57:06 +01:00
committed by GitHub
parent 61c4f7c9aa
commit b119f2ee83
4 changed files with 267 additions and 1 deletions
@@ -0,0 +1,95 @@
/**
Adapted from prom-client: https://github.com/siimon/prom-client/tree/master/lib/metrics
Copyright 2015 Simon Nyberg
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { Histogram, Registry } from 'prom-client'
import type { Metric } from '@/logging/highFrequencyMetrics/highfrequencyMonitoring'
/** Unprefixed name of the histogram that records the total heap size. */
const NODEJS_HEAP_SIZE_TOTAL = 'nodejs_heap_size_total_bytes_high_frequency'
/** Unprefixed name of the histogram that records the used heap size. */
const NODEJS_HEAP_SIZE_USED = 'nodejs_heap_size_used_bytes_high_frequency'
/** Unprefixed name of the histogram that records the external memory size. */
const NODEJS_EXTERNAL_MEMORY = 'nodejs_external_memory_bytes_high_frequency'

/** Union of the three metric-name string literals declared above. */
type BucketName =
  | typeof NODEJS_HEAP_SIZE_TOTAL
  | typeof NODEJS_HEAP_SIZE_USED
  | typeof NODEJS_EXTERNAL_MEMORY

/**
 * Default bucket upper bounds for each heap histogram: 0, 100 MB, 250 MB,
 * 500 MB, 750 MB, 1 GB and 2 GB (decimal units).
 *
 * Note that the keys are the constant identifiers themselves, not the
 * metric-name strings those constants hold.
 *
 * TODO: check if these are the right defaults
 */
const DEFAULT_NODEJS_HEAP_SIZE_BUCKETS = {
  NODEJS_HEAP_SIZE_TOTAL: [0, 100e6, 250e6, 500e6, 750e6, 1e9, 2e9],
  NODEJS_HEAP_SIZE_USED: [0, 100e6, 250e6, 500e6, 750e6, 1e9, 2e9],
  NODEJS_EXTERNAL_MEMORY: [0, 100e6, 250e6, 500e6, 750e6, 1e9, 2e9]
}

/** Optional settings accepted by the heap metric factory in this file. */
type MetricConfig = {
  /** String prepended to every metric name. */
  prefix?: string
  /** Label name/value pairs attached to every observation. */
  labels?: Record<string, string>
  /** Bucket upper bounds per metric. */
  buckets?: Record<BucketName, number[]>
}
/**
 * Creates three histograms (total heap size, used heap size, external memory)
 * and returns a collector that records one observation per histogram each
 * time `collect()` is called.
 *
 * @param registry - Registry the histograms are registered with. When falsy,
 *   `registers` is passed as `undefined` to the Histogram constructor.
 * @param config - Optional name prefix, labels and bucket overrides. A bucket
 *   override may be keyed either by the metric name (as typed by
 *   `MetricConfig`) or by the identifier-style key used in
 *   `DEFAULT_NODEJS_HEAP_SIZE_BUCKETS`.
 * @returns A `Metric` whose `collect()` samples the process memory usage.
 */
export const heapSizeAndUsed = (
  registry: Registry,
  config: MetricConfig = {}
): Metric => {
  const registers = registry ? [registry] : undefined
  const namePrefix = config.prefix ?? ''
  const labels = config.labels ?? {}
  const labelNames = Object.keys(labels)

  // `config.buckets` is typed by metric name, while the defaults are keyed by
  // identifier-style names. Spreading one over the other and then reading the
  // identifier key meant that correctly typed overrides were never applied.
  // Resolve each histogram explicitly instead: the identifier-style key is
  // checked first so existing callers keep their behaviour, then the metric
  // name, then the default.
  const bucketOverrides: Partial<Record<string, number[]>> = config.buckets ?? {}
  const bucketsFor = (
    defaultsKey: keyof typeof DEFAULT_NODEJS_HEAP_SIZE_BUCKETS,
    metricName: BucketName
  ): number[] =>
    bucketOverrides[defaultsKey] ??
    bucketOverrides[metricName] ??
    DEFAULT_NODEJS_HEAP_SIZE_BUCKETS[defaultsKey]

  const heapSizeTotal = new Histogram({
    name: namePrefix + NODEJS_HEAP_SIZE_TOTAL,
    help: 'Process heap size from Node.js in bytes. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.',
    registers,
    buckets: bucketsFor('NODEJS_HEAP_SIZE_TOTAL', NODEJS_HEAP_SIZE_TOTAL),
    labelNames
  })
  const heapSizeUsed = new Histogram({
    name: namePrefix + NODEJS_HEAP_SIZE_USED,
    help: 'Process heap size used from Node.js in bytes. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.',
    registers,
    buckets: bucketsFor('NODEJS_HEAP_SIZE_USED', NODEJS_HEAP_SIZE_USED),
    labelNames
  })
  const externalMemUsed = new Histogram({
    name: namePrefix + NODEJS_EXTERNAL_MEMORY,
    help: 'Node.js external memory size in bytes. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.',
    registers,
    buckets: bucketsFor('NODEJS_EXTERNAL_MEMORY', NODEJS_EXTERNAL_MEMORY),
    labelNames
  })

  return {
    collect: () => {
      // Nothing is recorded when the memory usage could not be read.
      const memUsage = safeMemoryUsage()
      if (!memUsage) return

      heapSizeTotal.observe(labels, memUsage.heapTotal)
      heapSizeUsed.observe(labels, memUsage.heapUsed)
      // `external` is only recorded when the runtime reports it.
      if (memUsage.external !== undefined) {
        externalMemUsed.observe(labels, memUsage.external)
      }
    }
  }
}
/**
 * Reads the current process memory usage without letting a failure propagate.
 *
 * @returns The value of `process.memoryUsage()`, or `undefined` when that
 *   call throws.
 */
function safeMemoryUsage() {
  let usage: ReturnType<typeof process.memoryUsage> | undefined
  try {
    usage = process.memoryUsage()
  } catch {
    usage = undefined
  }
  return usage
}
@@ -0,0 +1,72 @@
/**
 * High frequency monitoring: collects data related to CPU and memory usage
 * (network usage is not collected yet) at a higher frequency than the default
 * Prometheus monitoring. It makes the data available to Prometheus via histograms.
 */
import { Histogram, Registry } from 'prom-client'
import { processCpuTotal } from '@/logging/highFrequencyMetrics/processCPUTotal'
import { heapSizeAndUsed } from '@/logging/highFrequencyMetrics/heapSizeAndUsed'
/**
 * Optional settings shared by the high frequency metrics created in this file.
 * The same object is handed unchanged to each individual metric factory.
 * - prefix: string prepended to every metric name
 * - labels: label name/value pairs; the names are declared on each histogram
 * - buckets: histogram bucket upper bounds, keyed by string
 */
type MetricConfig = {
prefix?: string
labels?: Record<string, string>
buckets?: Record<string, number[]>
}
/**
 * Handle returned by `initHighFrequencyMonitoring`.
 * Calling `start()` begins periodic collection and returns a function that
 * stops it again.
 */
type HighFrequencyMonitor = {
start: () => () => void
}
/**
 * Creates the high frequency metrics (process CPU and heap/external memory)
 * plus a self-monitoring histogram that times each collection pass.
 *
 * Nothing is collected until `start()` is called on the returned monitor.
 *
 * @param params.register - Registry the histograms are registered with.
 * @param params.collectionPeriodMilliseconds - Delay between collection passes.
 * @param params.config - Optional name prefix, labels and bucket overrides;
 *   passed unchanged to each metric factory.
 * @returns A monitor whose `start()` begins periodic collection and returns a
 *   function that stops it.
 */
export const initHighFrequencyMonitoring = (params: {
  register: Registry
  collectionPeriodMilliseconds: number
  config?: MetricConfig
}): HighFrequencyMonitor => {
  const { register, collectionPeriodMilliseconds } = params
  const config = params.config ?? {}
  const registers = register ? [register] : undefined
  const namePrefix = config.prefix ?? ''
  const labels = config.labels ?? {}
  const labelNames = Object.keys(labels)

  const metrics = [processCpuTotal(register, config), heapSizeAndUsed(register, config)]

  const selfMonitor = new Histogram({
    name: namePrefix + 'self_monitor_time_high_frequency',
    help: 'The time taken to collect all of the high frequency metrics, seconds.',
    registers,
    buckets: [0, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2],
    labelNames
  })

  return {
    start: collectHighFrequencyMetrics({
      selfMonitor,
      metrics,
      collectionPeriodMilliseconds,
      // The self-monitor declares `labelNames`, so its observations must carry
      // the same label values as every other high frequency metric.
      labels
    })
  }
}

/** A metric that records one sample each time `collect()` is called. */
export interface Metric {
  collect: () => void
}

/**
 * Builds the `start` function of the monitor.
 *
 * @param params.selfMonitor - Histogram that times each collection pass.
 * @param params.collectionPeriodMilliseconds - Delay passed to `setInterval`.
 * @param params.metrics - Metrics collected, in order, on every pass.
 * @param params.labels - Label values attached to the self-monitor
 *   observation; defaults to no labels.
 * @returns A function that starts the interval and returns a function that
 *   clears it.
 */
const collectHighFrequencyMetrics = (params: {
  selfMonitor: Histogram<string>
  collectionPeriodMilliseconds: number
  metrics: Metric[]
  labels?: Record<string, string>
}) => {
  const { selfMonitor, metrics, collectionPeriodMilliseconds } = params
  const labels = params.labels ?? {}
  return () => {
    const intervalId = setInterval(() => {
      const end = selfMonitor.startTimer(labels)
      for (const metric of metrics) {
        metric.collect()
      }
      end()
    }, collectionPeriodMilliseconds)
    return () => clearInterval(intervalId)
  }
}
@@ -0,0 +1,91 @@
/**
* Adapted from prom-client: https://github.com/siimon/prom-client/tree/master/lib/metrics
*
Copyright 2015 Simon Nyberg
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { Histogram, Registry } from 'prom-client'
import type { Metric } from '@/logging/highFrequencyMetrics/highfrequencyMonitoring'
/** Unprefixed name of the histogram that records user CPU time. */
const PROCESS_CPU_USER_SECONDS = 'process_cpu_user_seconds_total_high_frequency'
/** Unprefixed name of the histogram that records system CPU time. */
const PROCESS_CPU_SYSTEM_SECONDS = 'process_cpu_system_seconds_total_high_frequency'
/** Unprefixed name of the histogram that records user plus system CPU time. */
const PROCESS_CPU_SECONDS = 'process_cpu_seconds_total_high_frequency'

/** Union of the three metric-name string literals declared above. */
type BucketName =
  | typeof PROCESS_CPU_USER_SECONDS
  | typeof PROCESS_CPU_SYSTEM_SECONDS
  | typeof PROCESS_CPU_SECONDS

/** Returns a fresh copy of the bucket upper bounds shared by all CPU histograms. */
const defaultCpuSecondsBuckets = (): number[] => [0, 0.1, 0.25, 0.5, 0.75, 1, 2]

/**
 * Default bucket upper bounds for each CPU histogram. Each entry is its own
 * array instance.
 *
 * Note that the keys are the constant identifiers themselves, not the
 * metric-name strings those constants hold.
 *
 * TODO: check if these are the right defaults
 */
const DEFAULT_CPU_TOTAL_BUCKETS = {
  PROCESS_CPU_SECONDS: defaultCpuSecondsBuckets(),
  PROCESS_CPU_USER_SECONDS: defaultCpuSecondsBuckets(),
  PROCESS_CPU_SYSTEM_SECONDS: defaultCpuSecondsBuckets()
}

/** Optional settings accepted by the CPU metric factory in this file. */
type MetricConfig = {
  /** String prepended to every metric name. */
  prefix?: string
  /** Label name/value pairs attached to every observation. */
  labels?: Record<string, string>
  /** Bucket upper bounds per metric. */
  buckets?: Record<BucketName, number[]>
}
/**
 * Creates three histograms (user, system and combined CPU time) and returns a
 * collector that records one observation per histogram each time `collect()`
 * is called.
 *
 * Each observation is the CPU time consumed since the previous `collect()`
 * call (or since this factory ran, for the first call), converted from
 * microseconds to seconds.
 *
 * @param registry - Registry the histograms are registered with. When falsy,
 *   `registers` is passed as `undefined` to the Histogram constructor.
 * @param config - Optional name prefix, labels and bucket overrides. A bucket
 *   override may be keyed either by the metric name (as typed by
 *   `MetricConfig`) or by the identifier-style key used in
 *   `DEFAULT_CPU_TOTAL_BUCKETS`.
 * @returns A `Metric` whose `collect()` samples `process.cpuUsage()`.
 */
export const processCpuTotal = (
  registry: Registry,
  config: MetricConfig = {}
): Metric => {
  const registers = registry ? [registry] : undefined
  const namePrefix = config.prefix ?? ''
  const labels = config.labels ?? {}
  const labelNames = Object.keys(labels)

  // `config.buckets` is typed by metric name, while the defaults are keyed by
  // identifier-style names. Spreading one over the other and then reading the
  // identifier key meant that correctly typed overrides were never applied.
  // Resolve each histogram explicitly instead: the identifier-style key is
  // checked first so existing callers keep their behaviour, then the metric
  // name, then the default.
  const bucketOverrides: Partial<Record<string, number[]>> = config.buckets ?? {}
  const bucketsFor = (
    defaultsKey: keyof typeof DEFAULT_CPU_TOTAL_BUCKETS,
    metricName: BucketName
  ): number[] =>
    bucketOverrides[defaultsKey] ??
    bucketOverrides[metricName] ??
    DEFAULT_CPU_TOTAL_BUCKETS[defaultsKey]

  const cpuUserUsageHistogram = new Histogram({
    name: namePrefix + PROCESS_CPU_USER_SECONDS,
    help: 'Total user CPU time spent in seconds. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.',
    labelNames,
    buckets: bucketsFor('PROCESS_CPU_USER_SECONDS', PROCESS_CPU_USER_SECONDS),
    registers
  })
  const cpuSystemUsageHistogram = new Histogram({
    name: namePrefix + PROCESS_CPU_SYSTEM_SECONDS,
    help: 'Total system CPU time spent in seconds. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.',
    registers,
    buckets: bucketsFor('PROCESS_CPU_SYSTEM_SECONDS', PROCESS_CPU_SYSTEM_SECONDS),
    labelNames
  })
  const cpuUsageHistogram = new Histogram({
    name: namePrefix + PROCESS_CPU_SECONDS,
    help: 'Total user and system CPU time spent in seconds. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.',
    registers,
    // Previously this read the user-time buckets, so the combined histogram
    // could never be given its own bucket configuration.
    buckets: bucketsFor('PROCESS_CPU_SECONDS', PROCESS_CPU_SECONDS),
    labelNames
  })

  // Baseline against which the first collect() computes its delta.
  let lastCpuUsage = process.cpuUsage()

  return {
    collect: () => {
      const cpuUsage = process.cpuUsage()
      const userUsageMicros = cpuUsage.user - lastCpuUsage.user
      const systemUsageMicros = cpuUsage.system - lastCpuUsage.system
      lastCpuUsage = cpuUsage

      // process.cpuUsage() reports microseconds; the histograms are in seconds.
      cpuUserUsageHistogram.observe(labels, userUsageMicros / 1e6)
      cpuSystemUsageHistogram.observe(labels, systemUsageMicros / 1e6)
      cpuUsageHistogram.observe(labels, (userUsageMicros + systemUsageMicros) / 1e6)
    }
  }
}
+9 -1
View File
@@ -5,7 +5,10 @@ const { getMachineId } = require('./machineId')
const prometheusClient = require('prom-client')
const promBundle = require('express-prom-bundle')
const { initKnexPrometheusMetrics } = require('./knexMonitoring')
const { initKnexPrometheusMetrics } = require('@/logging/knexMonitoring')
const {
initHighFrequencyMonitoring
} = require('@/logging/highFrequencyMetrics/highfrequencyMonitoring')
let prometheusInitialized = false
@@ -20,6 +23,11 @@ module.exports = function (app) {
app: 'server'
})
prometheusClient.collectDefaultMetrics()
const highfrequencyMonitoring = initHighFrequencyMonitoring({
register: prometheusClient.register,
collectionPeriodMilliseconds: 100
})
highfrequencyMonitoring.start()
initKnexPrometheusMetrics()
const expressMetricsMiddleware = promBundle({