feat(server/previews): retry dead preview resurrection if no backpressure (#5099)

This commit is contained in:
Iain Sproat
2025-07-17 13:09:29 +01:00
committed by GitHub
parent 37bf431025
commit fd4588c99b
9 changed files with 77 additions and 4 deletions
@@ -132,3 +132,5 @@ export type BuildUpdateObjectPreview = (params: {
}) => Promise<UpdateObjectPreview>
/**
 * Synchronous callback that receives a preview result payload and returns nothing.
 * NOTE(review): presumably implementations record metrics derived from the payload — confirm against the implementing factory.
 */
export type ObserveMetrics = (params: { payload: PreviewResultPayload }) => void
/**
 * Parameterless asynchronous operation that resolves with a job count for the preview request queue.
 * NOTE(review): which job states are included in the count is decided by the implementation — confirm there.
 */
export type GetNumberOfJobsInRequestQueue = () => Promise<number>
+2 -1
View File
@@ -6,6 +6,7 @@ import {
disablePreviews,
getFeatureFlags,
getPreviewServiceRedisUrl,
getPreviewServiceRetryPeriodMinutes,
getRedisUrl,
getServerOrigin
} from '@/modules/shared/helpers/envHelper'
@@ -101,7 +102,7 @@ export const init: SpeckleModule['init'] = async ({
scheduleExecution,
previewRequestQueue,
responseQueueName,
cronExpression: '*/23 * * * *' // every 23 minutes (kind of random prime number to reduce syncing with other possibly heavy tasks)
cronExpression: `*/${getPreviewServiceRetryPeriodMinutes()} * * * *`
})
]
: [])
@@ -1,12 +1,13 @@
import type {
BuildUpdateObjectPreview,
GetNumberOfJobsInRequestQueue,
RequestObjectPreview
} from '@/modules/previews/domain/operations'
import type { Logger } from '@/observability/logging'
import type { Queue, Job } from 'bull'
import { PreviewStatus } from '@/modules/previews/domain/consts'
import type { JobPayload } from '@speckle/shared/workers/previews'
import { fromJobId, jobIdSchema } from '@speckle/shared/workers/previews'
import { fromJobId } from '@speckle/shared/workers/previews'
export const requestObjectPreviewFactory =
({
@@ -63,3 +64,10 @@ export const requestFailedHandlerFactory =
)
}
}
/**
 * Builds an operation that reports how many jobs are still outstanding in the given queue.
 *
 * Only the `waiting`, `active` and `delayed` counts returned by `queue.getJobCounts()` are summed;
 * any other counts in the result are ignored.
 *
 * @param deps.queue - queue whose job counts are read on every invocation
 * @returns an async function resolving with the summed count
 */
export const getNumberOfJobsInQueueFactory = (deps: {
  queue: Queue<JobPayload>
}): GetNumberOfJobsInRequestQueue => {
  return async () => {
    const { waiting, active, delayed } = await deps.queue.getJobCounts()
    return waiting + active + delayed
  }
}
@@ -1,5 +1,6 @@
import type { Logger } from '@/observability/logging'
import {
GetNumberOfJobsInRequestQueue,
GetPaginatedObjectPreviewsInErrorState,
GetPaginatedObjectPreviewsPage,
GetPaginatedObjectPreviewsTotalCount,
@@ -12,6 +13,7 @@ import { DefaultAppIds } from '@/modules/auth/defaultApps'
import { TokenResourceIdentifierType } from '@/modules/core/domain/tokens/types'
import { GetStreamCollaborators } from '@/modules/core/domain/streams/operations'
import { CreateAndStoreAppToken } from '@/modules/core/domain/tokens/operations'
import { getPreviewServiceMaxQueueBackpressure } from '@/modules/shared/helpers/envHelper'
export const getPaginatedObjectPreviewInErrorStateFactory =
(deps: {
@@ -47,6 +49,7 @@ export const retryFailedPreviewsFactory = (deps: {
serverOrigin: string
createAppToken: CreateAndStoreAppToken
requestObjectPreview: RequestObjectPreview
getNumberOfJobsInQueue: GetNumberOfJobsInRequestQueue
}) => {
const {
getPaginatedObjectPreviewsInErrorState,
@@ -54,7 +57,8 @@ export const retryFailedPreviewsFactory = (deps: {
getStreamCollaborators,
serverOrigin,
createAppToken,
requestObjectPreview
requestObjectPreview,
getNumberOfJobsInQueue
} = deps
return async (params: { logger: Logger }): Promise<boolean> => {
const { logger } = params
@@ -68,6 +72,16 @@ export const retryFailedPreviewsFactory = (deps: {
return false
}
// do not retry if we have backpressure in the queue
const queueLength = await getNumberOfJobsInQueue()
if (queueLength > getPreviewServiceMaxQueueBackpressure()) {
logger.info(
{ queueLength, totalErroredPreviewCount: totalCount },
'Backpressure detected in the preview request queue, queue length is {queueLength} jobs. Found {totalErroredPreviewCount} object previews in error state, but are not retrying any on this iteration.'
)
return false
}
const objPreview = items[0]
const { streamId, objectId } = objPreview
@@ -4,7 +4,10 @@ import {
previewServiceShouldUsePrivateObjectsServerUrl
} from '@/modules/shared/helpers/envHelper'
import type { Queue } from 'bull'
import { requestObjectPreviewFactory } from '@/modules/previews/queues/previews'
import {
getNumberOfJobsInQueueFactory,
requestObjectPreviewFactory
} from '@/modules/previews/queues/previews'
import type { ScheduleExecution } from '@/modules/core/domain/scheduledTasks/operations'
import { getRegisteredDbClients } from '@/modules/multiregion/utils/dbSelector'
import {
@@ -72,6 +75,9 @@ export const scheduleRetryFailedPreviews = async ({
db
}),
storeUserServerAppToken: storeUserServerAppTokenFactory({ db })
}),
getNumberOfJobsInQueue: getNumberOfJobsInQueueFactory({
queue: previewRequestQueue
})
})
)
@@ -513,6 +513,24 @@ export const getPreviewServiceTimeoutMilliseconds = (): number => {
return getIntFromEnv('PREVIEW_SERVICE_TIMEOUT_MILLISECONDS', '3600000') // 1 hour
}
/**
 * Reads the retry period, in minutes, from `PREVIEW_SERVICE_RETRY_PERIOD_MINUTES` (default `'1'`).
 *
 * @returns an integer between 1 and 60 inclusive
 * @throws MisconfiguredEnvironmentError when the value is not an integer (including NaN) or lies outside 1..60
 */
export const getPreviewServiceRetryPeriodMinutes = (): number => {
  const value = getIntFromEnv('PREVIEW_SERVICE_RETRY_PERIOD_MINUTES', '1')
  // Comparisons against NaN are always false, so a range check alone would let an
  // unparseable value through; Number.isInteger rejects NaN and non-integers up front.
  if (!Number.isInteger(value) || value < 1 || value > 60)
    throw new MisconfiguredEnvironmentError(
      `PREVIEW_SERVICE_RETRY_PERIOD_MINUTES must be an integer between 1 and 60, got ${value}`
    )
  return value
}
/**
 * Reads the maximum tolerated preview request queue length from
 * `PREVIEW_SERVICE_MAX_QUEUE_BACKPRESSURE` (default `'1'`).
 *
 * @returns an integer greater than 0
 * @throws MisconfiguredEnvironmentError when the value is not an integer (including NaN) or is below 1
 */
export const getPreviewServiceMaxQueueBackpressure = (): number => {
  const value = getIntFromEnv('PREVIEW_SERVICE_MAX_QUEUE_BACKPRESSURE', '1')
  // A NaN threshold would pass a bare `value < 1` check, and any later `length > NaN`
  // comparison is always false; Number.isInteger rejects NaN and non-integers here.
  if (!Number.isInteger(value) || value < 1)
    throw new MisconfiguredEnvironmentError(
      `PREVIEW_SERVICE_MAX_QUEUE_BACKPRESSURE must be an integer greater than 0, got ${value}`
    )
  return value
}
/**
 * Reads `EMAIL_VERIFICATION_TIMEOUT_MINUTES` through `getIntFromEnv`, passing `'5'` as the default.
 *
 * @returns whatever `getIntFromEnv` yields for that variable; no further validation is applied here
 */
export const emailVerificationTimeoutMinutes = (): number =>
  getIntFromEnv('EMAIL_VERIFICATION_TIMEOUT_MINUTES', '5')
@@ -771,6 +771,12 @@ Generate the environment variables for Speckle server and Speckle objects deploy
- name: PREVIEW_SERVICE_TIMEOUT_MILLISECONDS
value: {{ .Values.preview_service.puppeteer.timeoutMilliseconds | quote }}
{{- end }}
{{- if .Values.featureFlags.retryErroredPreviewsEnabled }}
- name: PREVIEW_SERVICE_MAX_QUEUE_BACKPRESSURE
value: {{ .Values.preview_service.maxQueueBackpressure | quote }}
- name: PREVIEW_SERVICE_RETRY_PERIOD_MINUTES
value: {{ .Values.preview_service.retryPeriodMinutes | quote }}
{{- end }}
{{- end }}
# *** Redis ***
@@ -2058,6 +2058,16 @@
"description": "The maximum number of connections that the Preview Service postgres client will make to the Postgres database.",
"default": 2
},
"maxQueueBackpressure": {
"type": "number",
"description": "The maximum number of items that can be queued in the Preview Service job queue before we stop retrying previously errored preview jobs. This is used to prevent the Preview Service from being overwhelmed with too many jobs.",
"default": 1
},
"retryPeriodMinutes": {
"type": "number",
"description": "The period, in minutes, between retries of previously errored jobs. Must be an integer between 1 and 60.",
"default": 1
},
"puppeteer": {
"type": "object",
"properties": {
+8
View File
@@ -1227,6 +1227,14 @@ preview_service:
##
postgresMaxConnections: 2
## @param preview_service.maxQueueBackpressure The maximum number of items that can be queued in the Preview Service job queue before we stop retrying previously errored preview jobs. This is used to prevent the Preview Service from being overwhelmed with too many jobs.
## If the queue length exceeds this number, previously errored jobs are not retried until the queue length is back at or below this number.
maxQueueBackpressure: 1
## @param preview_service.retryPeriodMinutes The period, in minutes, between retries of previously errored jobs. Must be an integer between 1 and 60.
##
retryPeriodMinutes: 1
puppeteer:
## @param preview_service.puppeteer.userDataDirectory The path to the user data directory. If not set, defaults to '/tmp/puppeteer'. This is mounted in the deployment as a volume with read-write access.
userDataDirectory: ''