feat(metrics): collect metrics at a higher frequency than prometheus scrapes (#2479)
* feat(metrics): collect metrics at a higher frequency than prometheus scrapes * metrics presented as histograms * Metric for self monitoring time taken to collect the high frequency metrics * Initiate high frequency metrics alongside default metrics * Collect metrics every 100ms
This commit is contained in:
@@ -0,0 +1,95 @@
|
||||
/**
|
||||
Adapted from prom-client: https://github.com/siimon/prom-client/tree/master/lib/metrics
|
||||
|
||||
Copyright 2015 Simon Nyberg
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
import { Histogram, Registry } from 'prom-client'
|
||||
import type { Metric } from '@/logging/highFrequencyMetrics/highfrequencyMonitoring'
|
||||
|
||||
/** Unprefixed names of the histograms registered by this module. */
const NODEJS_HEAP_SIZE_TOTAL = 'nodejs_heap_size_total_bytes_high_frequency'
const NODEJS_HEAP_SIZE_USED = 'nodejs_heap_size_used_bytes_high_frequency'
const NODEJS_EXTERNAL_MEMORY = 'nodejs_external_memory_bytes_high_frequency'

/** Union of the unprefixed metric names; used as keys for caller-supplied bucket overrides. */
type BucketName =
  | typeof NODEJS_HEAP_SIZE_TOTAL
  | typeof NODEJS_HEAP_SIZE_USED
  | typeof NODEJS_EXTERNAL_MEMORY

/**
 * Default histogram bucket upper bounds, in bytes, for each metric.
 * Note that these defaults are keyed by the constant identifiers, not by the
 * metric name strings that `BucketName` describes.
 */
const DEFAULT_NODEJS_HEAP_SIZE_BUCKETS = {
  NODEJS_HEAP_SIZE_TOTAL: [0, 0.1e9, 0.25e9, 0.5e9, 0.75e9, 1e9, 2e9], //TODO: check if this is the right default
  NODEJS_HEAP_SIZE_USED: [0, 0.1e9, 0.25e9, 0.5e9, 0.75e9, 1e9, 2e9], //TODO: check if this is the right default
  NODEJS_EXTERNAL_MEMORY: [0, 0.1e9, 0.25e9, 0.5e9, 0.75e9, 1e9, 2e9] //TODO: check if this is the right default
}

/** Optional settings accepted by the heap metrics collector. */
type MetricConfig = {
  /** String prepended to every metric name. */
  prefix?: string
  /** Label name/value pairs attached to every observation. */
  labels?: Record<string, string>
  /**
   * Bucket overrides keyed by unprefixed metric name. Partial, because any
   * metric that is not listed has a default bucket list to fall back on.
   */
  buckets?: Partial<Record<BucketName, number[]>>
}
|
||||
|
||||
/**
 * Creates three histograms (heap total, heap used, external memory) and
 * returns a collector that records the current `process.memoryUsage()`
 * values into them each time `collect()` is invoked.
 *
 * @param registry - Registry the histograms are registered with. When falsy,
 *   `registers` is passed to prom-client as `undefined`.
 * @param config - Optional name prefix, labels and per-metric bucket overrides.
 *   Overrides are keyed by the unprefixed metric name.
 * @returns A `Metric` whose `collect()` takes one memory usage sample.
 */
export const heapSizeAndUsed = (
  registry: Registry,
  config: MetricConfig = {}
): Metric => {
  const registers = registry ? [registry] : undefined
  const namePrefix = config.prefix ?? ''
  const labels = config.labels ?? {}
  const labelNames = Object.keys(labels)

  // Overrides are keyed by metric name while the defaults are keyed by the
  // constant identifiers, so each list is resolved explicitly. Spreading the
  // two objects together would leave the defaults untouched and silently
  // ignore whatever the caller supplied.
  const bucketOverrides = config.buckets
  const buckets = {
    heapTotal:
      bucketOverrides?.[NODEJS_HEAP_SIZE_TOTAL] ??
      DEFAULT_NODEJS_HEAP_SIZE_BUCKETS.NODEJS_HEAP_SIZE_TOTAL,
    heapUsed:
      bucketOverrides?.[NODEJS_HEAP_SIZE_USED] ??
      DEFAULT_NODEJS_HEAP_SIZE_BUCKETS.NODEJS_HEAP_SIZE_USED,
    external:
      bucketOverrides?.[NODEJS_EXTERNAL_MEMORY] ??
      DEFAULT_NODEJS_HEAP_SIZE_BUCKETS.NODEJS_EXTERNAL_MEMORY
  }

  const heapSizeTotal = new Histogram({
    name: namePrefix + NODEJS_HEAP_SIZE_TOTAL,
    help: 'Process heap size from Node.js in bytes. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.',
    registers,
    buckets: buckets.heapTotal,
    labelNames
  })
  const heapSizeUsed = new Histogram({
    name: namePrefix + NODEJS_HEAP_SIZE_USED,
    help: 'Process heap size used from Node.js in bytes. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.',
    registers,
    buckets: buckets.heapUsed,
    labelNames
  })
  const externalMemUsed = new Histogram({
    name: namePrefix + NODEJS_EXTERNAL_MEMORY,
    help: 'Node.js external memory size in bytes. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.',
    registers,
    buckets: buckets.external,
    labelNames
  })

  return {
    collect: () => {
      const memUsage = safeMemoryUsage()
      // safeMemoryUsage yields undefined when process.memoryUsage() throws;
      // in that case this sample is skipped.
      if (!memUsage) return

      heapSizeTotal.observe(labels, memUsage.heapTotal)
      heapSizeUsed.observe(labels, memUsage.heapUsed)
      if (memUsage.external !== undefined) {
        externalMemUsed.observe(labels, memUsage.external)
      }
    }
  }
}
|
||||
|
||||
/**
 * Reads the current process memory usage.
 *
 * @returns The result of `process.memoryUsage()`, or `undefined` if that call throws.
 */
function safeMemoryUsage() {
  let usage: ReturnType<typeof process.memoryUsage> | undefined
  try {
    usage = process.memoryUsage()
  } catch {
    // Any failure is deliberately reported as "no sample available".
    usage = undefined
  }
  return usage
}
|
||||
@@ -0,0 +1,72 @@
|
||||
/**
|
||||
* High frequency monitoring, collects data related to CPU, memory, and network usage
|
||||
* at a higher frequency than the default prometheus monitoring. It makes the data
|
||||
* available to Prometheus via a histogram.
|
||||
*/
|
||||
|
||||
import { Histogram, Registry } from 'prom-client'
|
||||
import { processCpuTotal } from '@/logging/highFrequencyMetrics/processCPUTotal'
|
||||
import { heapSizeAndUsed } from '@/logging/highFrequencyMetrics/heapSizeAndUsed'
|
||||
|
||||
/** Optional settings forwarded to every high frequency metric collector. */
interface MetricConfig {
  /** String prepended to every metric name. */
  prefix?: string
  /** Label name/value pairs attached to observations. */
  labels?: Record<string, string>
  /** Histogram bucket overrides, keyed by metric name. */
  buckets?: Record<string, number[]>
}

/** Handle returned by `initHighFrequencyMonitoring`. */
interface HighFrequencyMonitor {
  /** Begins periodic collection and returns a function that stops it. */
  start: () => () => void
}
|
||||
|
||||
/**
 * Sets up the high frequency collectors (CPU and heap) plus a self-monitoring
 * histogram that records how long each collection pass takes.
 *
 * Nothing is sampled until `start()` is called on the returned monitor.
 *
 * @param params.register - Registry all histograms are registered with.
 * @param params.collectionPeriodMilliseconds - Delay between collection passes.
 * @param params.config - Optional prefix, labels and bucket overrides that are
 *   forwarded to each collector; prefix and labels also apply to the
 *   self-monitoring histogram.
 * @returns A monitor whose `start()` begins collection and returns a stop function.
 */
export const initHighFrequencyMonitoring = (params: {
  register: Registry
  collectionPeriodMilliseconds: number
  config?: MetricConfig
}): HighFrequencyMonitor => {
  const { register, collectionPeriodMilliseconds } = params
  const config = params.config ?? {}
  const registers = register ? [register] : undefined
  const namePrefix = config.prefix ?? ''
  const labels = config.labels ?? {}
  const labelNames = Object.keys(labels)

  const metrics = [processCpuTotal(register, config), heapSizeAndUsed(register, config)]

  const selfMonitor = new Histogram({
    name: namePrefix + 'self_monitor_time_high_frequency',
    help: 'The time taken to collect all of the high frequency metrics, seconds.',
    registers,
    buckets: [0, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2],
    labelNames
  })

  return {
    start: collectHighFrequencyMetrics({
      selfMonitor,
      metrics,
      collectionPeriodMilliseconds,
      // The histogram declares these label names, so its observations must
      // carry the matching values, just like the CPU and heap histograms do.
      labels
    })
  }
}

/** A collector that takes one sample of its metric(s) each time `collect()` is called. */
export interface Metric {
  collect: () => void
}

/**
 * Builds the `start` function for a monitor.
 *
 * The returned function schedules a repeating timer that runs every collector
 * and records the elapsed time of the pass in `selfMonitor`. Each call to it
 * creates its own timer and returns a function that clears that timer.
 *
 * @param params.labels - Label values applied to the self-monitoring
 *   observations; defaults to no labels.
 */
const collectHighFrequencyMetrics = (params: {
  selfMonitor: Histogram<string>
  collectionPeriodMilliseconds: number
  metrics: Metric[]
  labels?: Record<string, string>
}) => {
  const { selfMonitor, metrics, collectionPeriodMilliseconds, labels = {} } = params

  return () => {
    const intervalId = setInterval(() => {
      const end = selfMonitor.startTimer(labels)
      try {
        for (const metric of metrics) {
          metric.collect()
        }
      } finally {
        // Record the duration even if a collector throws; the error itself
        // still propagates exactly as it did before.
        end()
      }
    }, collectionPeriodMilliseconds)

    return () => clearInterval(intervalId)
  }
}
|
||||
@@ -0,0 +1,91 @@
|
||||
/**
|
||||
* Adapted from prom-client: https://github.com/siimon/prom-client/tree/master/lib/metrics
|
||||
*
|
||||
Copyright 2015 Simon Nyberg
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
import { Histogram, Registry } from 'prom-client'
|
||||
import type { Metric } from '@/logging/highFrequencyMetrics/highfrequencyMonitoring'
|
||||
|
||||
/** Unprefixed names of the histograms registered by this module. */
const PROCESS_CPU_USER_SECONDS = 'process_cpu_user_seconds_total_high_frequency'
const PROCESS_CPU_SYSTEM_SECONDS = 'process_cpu_system_seconds_total_high_frequency'
const PROCESS_CPU_SECONDS = 'process_cpu_seconds_total_high_frequency'

/** Union of the unprefixed metric names; used as keys for caller-supplied bucket overrides. */
type BucketName =
  | typeof PROCESS_CPU_USER_SECONDS
  | typeof PROCESS_CPU_SYSTEM_SECONDS
  | typeof PROCESS_CPU_SECONDS

/**
 * Default histogram bucket upper bounds, in seconds, for each metric.
 * Note that these defaults are keyed by the constant identifiers, not by the
 * metric name strings that `BucketName` describes.
 */
const DEFAULT_CPU_TOTAL_BUCKETS = {
  PROCESS_CPU_SECONDS: [0, 0.1, 0.25, 0.5, 0.75, 1, 2], //TODO: check if this is the right default
  PROCESS_CPU_USER_SECONDS: [0, 0.1, 0.25, 0.5, 0.75, 1, 2], //TODO: check if this is the right default
  PROCESS_CPU_SYSTEM_SECONDS: [0, 0.1, 0.25, 0.5, 0.75, 1, 2] //TODO: check if this is the right default
}

/** Optional settings accepted by the CPU metrics collector. */
type MetricConfig = {
  /** String prepended to every metric name. */
  prefix?: string
  /** Label name/value pairs attached to every observation. */
  labels?: Record<string, string>
  /**
   * Bucket overrides keyed by unprefixed metric name. Partial, because any
   * metric that is not listed has a default bucket list to fall back on.
   */
  buckets?: Partial<Record<BucketName, number[]>>
}
|
||||
|
||||
/**
 * Creates three histograms (user, system, and combined CPU time) and returns a
 * collector that, on each `collect()`, observes the CPU time consumed since the
 * previous `collect()` call (or since this function ran, for the first call).
 *
 * @param registry - Registry the histograms are registered with. When falsy,
 *   `registers` is passed to prom-client as `undefined`.
 * @param config - Optional name prefix, labels and per-metric bucket overrides.
 *   Overrides are keyed by the unprefixed metric name.
 * @returns A `Metric` whose `collect()` records one CPU usage delta, in seconds.
 */
export const processCpuTotal = (
  registry: Registry,
  config: MetricConfig = {}
): Metric => {
  const registers = registry ? [registry] : undefined
  const namePrefix = config.prefix ?? ''
  const labels = config.labels ?? {}
  const labelNames = Object.keys(labels)

  // Overrides are keyed by metric name while the defaults are keyed by the
  // constant identifiers, so each list is resolved explicitly. Spreading the
  // two objects together would leave the defaults untouched and silently
  // ignore whatever the caller supplied.
  const bucketOverrides = config.buckets
  const buckets = {
    user:
      bucketOverrides?.[PROCESS_CPU_USER_SECONDS] ??
      DEFAULT_CPU_TOTAL_BUCKETS.PROCESS_CPU_USER_SECONDS,
    system:
      bucketOverrides?.[PROCESS_CPU_SYSTEM_SECONDS] ??
      DEFAULT_CPU_TOTAL_BUCKETS.PROCESS_CPU_SYSTEM_SECONDS,
    total:
      bucketOverrides?.[PROCESS_CPU_SECONDS] ??
      DEFAULT_CPU_TOTAL_BUCKETS.PROCESS_CPU_SECONDS
  }

  const cpuUserUsageHistogram = new Histogram({
    name: namePrefix + PROCESS_CPU_USER_SECONDS,
    help: 'Total user CPU time spent in seconds. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.',
    labelNames,
    buckets: buckets.user,
    registers
  })
  const cpuSystemUsageHistogram = new Histogram({
    name: namePrefix + PROCESS_CPU_SYSTEM_SECONDS,
    help: 'Total system CPU time spent in seconds. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.',
    registers,
    buckets: buckets.system,
    labelNames
  })
  const cpuUsageHistogram = new Histogram({
    name: namePrefix + PROCESS_CPU_SECONDS,
    help: 'Total user and system CPU time spent in seconds. This data is collected at a higher frequency than Prometheus scrapes, and is presented as a Histogram.',
    registers,
    // The combined histogram uses its own bucket list rather than the user one.
    buckets: buckets.total,
    labelNames
  })

  // Baseline for the first delta.
  let lastCpuUsage = process.cpuUsage()

  return {
    collect: () => {
      const cpuUsage = process.cpuUsage()

      const userUsageMicros = cpuUsage.user - lastCpuUsage.user
      const systemUsageMicros = cpuUsage.system - lastCpuUsage.system

      lastCpuUsage = cpuUsage

      // process.cpuUsage() reports microseconds; the histograms are in seconds.
      cpuUserUsageHistogram.observe(labels, userUsageMicros / 1e6)
      cpuSystemUsageHistogram.observe(labels, systemUsageMicros / 1e6)
      cpuUsageHistogram.observe(labels, (userUsageMicros + systemUsageMicros) / 1e6)
    }
  }
}
|
||||
@@ -5,7 +5,10 @@ const { getMachineId } = require('./machineId')
|
||||
const prometheusClient = require('prom-client')
|
||||
const promBundle = require('express-prom-bundle')
|
||||
|
||||
const { initKnexPrometheusMetrics } = require('./knexMonitoring')
|
||||
const { initKnexPrometheusMetrics } = require('@/logging/knexMonitoring')
|
||||
const {
|
||||
initHighFrequencyMonitoring
|
||||
} = require('@/logging/highFrequencyMetrics/highfrequencyMonitoring')
|
||||
|
||||
let prometheusInitialized = false
|
||||
|
||||
@@ -20,6 +23,11 @@ module.exports = function (app) {
|
||||
app: 'server'
|
||||
})
|
||||
prometheusClient.collectDefaultMetrics()
|
||||
const highfrequencyMonitoring = initHighFrequencyMonitoring({
|
||||
register: prometheusClient.register,
|
||||
collectionPeriodMilliseconds: 100
|
||||
})
|
||||
highfrequencyMonitoring.start()
|
||||
|
||||
initKnexPrometheusMetrics()
|
||||
const expressMetricsMiddleware = promBundle({
|
||||
|
||||
Reference in New Issue
Block a user