feat(server/previews): retry dead preview resurrection if no backpressure (#5099)
This commit is contained in:
@@ -132,3 +132,5 @@ export type BuildUpdateObjectPreview = (params: {
|
||||
}) => Promise<UpdateObjectPreview>
|
||||
|
||||
export type ObserveMetrics = (params: { payload: PreviewResultPayload }) => void
|
||||
|
||||
export type GetNumberOfJobsInRequestQueue = () => Promise<number>
|
||||
|
||||
@@ -6,6 +6,7 @@ import {
|
||||
disablePreviews,
|
||||
getFeatureFlags,
|
||||
getPreviewServiceRedisUrl,
|
||||
getPreviewServiceRetryPeriodMinutes,
|
||||
getRedisUrl,
|
||||
getServerOrigin
|
||||
} from '@/modules/shared/helpers/envHelper'
|
||||
@@ -101,7 +102,7 @@ export const init: SpeckleModule['init'] = async ({
|
||||
scheduleExecution,
|
||||
previewRequestQueue,
|
||||
responseQueueName,
|
||||
cronExpression: '*/23 * * * *' // every 23 minutes (kind of random prime number to reduce syncing with other possibly heavy tasks)
|
||||
cronExpression: `*/${getPreviewServiceRetryPeriodMinutes()} * * * *`
|
||||
})
|
||||
]
|
||||
: [])
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
import type {
|
||||
BuildUpdateObjectPreview,
|
||||
GetNumberOfJobsInRequestQueue,
|
||||
RequestObjectPreview
|
||||
} from '@/modules/previews/domain/operations'
|
||||
import type { Logger } from '@/observability/logging'
|
||||
import type { Queue, Job } from 'bull'
|
||||
import { PreviewStatus } from '@/modules/previews/domain/consts'
|
||||
import type { JobPayload } from '@speckle/shared/workers/previews'
|
||||
import { fromJobId, jobIdSchema } from '@speckle/shared/workers/previews'
|
||||
import { fromJobId } from '@speckle/shared/workers/previews'
|
||||
|
||||
export const requestObjectPreviewFactory =
|
||||
({
|
||||
@@ -63,3 +64,10 @@ export const requestFailedHandlerFactory =
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
export const getNumberOfJobsInQueueFactory =
|
||||
(deps: { queue: Queue<JobPayload> }): GetNumberOfJobsInRequestQueue =>
|
||||
async () => {
|
||||
const counts = await deps.queue.getJobCounts()
|
||||
return counts.waiting + counts.active + counts.delayed
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import type { Logger } from '@/observability/logging'
|
||||
import {
|
||||
GetNumberOfJobsInRequestQueue,
|
||||
GetPaginatedObjectPreviewsInErrorState,
|
||||
GetPaginatedObjectPreviewsPage,
|
||||
GetPaginatedObjectPreviewsTotalCount,
|
||||
@@ -12,6 +13,7 @@ import { DefaultAppIds } from '@/modules/auth/defaultApps'
|
||||
import { TokenResourceIdentifierType } from '@/modules/core/domain/tokens/types'
|
||||
import { GetStreamCollaborators } from '@/modules/core/domain/streams/operations'
|
||||
import { CreateAndStoreAppToken } from '@/modules/core/domain/tokens/operations'
|
||||
import { getPreviewServiceMaxQueueBackpressure } from '@/modules/shared/helpers/envHelper'
|
||||
|
||||
export const getPaginatedObjectPreviewInErrorStateFactory =
|
||||
(deps: {
|
||||
@@ -47,6 +49,7 @@ export const retryFailedPreviewsFactory = (deps: {
|
||||
serverOrigin: string
|
||||
createAppToken: CreateAndStoreAppToken
|
||||
requestObjectPreview: RequestObjectPreview
|
||||
getNumberOfJobsInQueue: GetNumberOfJobsInRequestQueue
|
||||
}) => {
|
||||
const {
|
||||
getPaginatedObjectPreviewsInErrorState,
|
||||
@@ -54,7 +57,8 @@ export const retryFailedPreviewsFactory = (deps: {
|
||||
getStreamCollaborators,
|
||||
serverOrigin,
|
||||
createAppToken,
|
||||
requestObjectPreview
|
||||
requestObjectPreview,
|
||||
getNumberOfJobsInQueue
|
||||
} = deps
|
||||
return async (params: { logger: Logger }): Promise<boolean> => {
|
||||
const { logger } = params
|
||||
@@ -68,6 +72,16 @@ export const retryFailedPreviewsFactory = (deps: {
|
||||
return false
|
||||
}
|
||||
|
||||
// do not retry if we have backpressure in the queue
|
||||
const queueLength = await getNumberOfJobsInQueue()
|
||||
if (queueLength > getPreviewServiceMaxQueueBackpressure()) {
|
||||
logger.info(
|
||||
{ queueLength, totalErroredPreviewCount: totalCount },
|
||||
'Backpressure detected in the preview request queue, queue length is {queueLength} jobs. Found {totalErroredPreviewCount} object previews in error state, but are not retrying any on this iteration.'
|
||||
)
|
||||
return false
|
||||
}
|
||||
|
||||
const objPreview = items[0]
|
||||
const { streamId, objectId } = objPreview
|
||||
|
||||
|
||||
@@ -4,7 +4,10 @@ import {
|
||||
previewServiceShouldUsePrivateObjectsServerUrl
|
||||
} from '@/modules/shared/helpers/envHelper'
|
||||
import type { Queue } from 'bull'
|
||||
import { requestObjectPreviewFactory } from '@/modules/previews/queues/previews'
|
||||
import {
|
||||
getNumberOfJobsInQueueFactory,
|
||||
requestObjectPreviewFactory
|
||||
} from '@/modules/previews/queues/previews'
|
||||
import type { ScheduleExecution } from '@/modules/core/domain/scheduledTasks/operations'
|
||||
import { getRegisteredDbClients } from '@/modules/multiregion/utils/dbSelector'
|
||||
import {
|
||||
@@ -72,6 +75,9 @@ export const scheduleRetryFailedPreviews = async ({
|
||||
db
|
||||
}),
|
||||
storeUserServerAppToken: storeUserServerAppTokenFactory({ db })
|
||||
}),
|
||||
getNumberOfJobsInQueue: getNumberOfJobsInQueueFactory({
|
||||
queue: previewRequestQueue
|
||||
})
|
||||
})
|
||||
)
|
||||
|
||||
@@ -513,6 +513,24 @@ export const getPreviewServiceTimeoutMilliseconds = (): number => {
|
||||
return getIntFromEnv('PREVIEW_SERVICE_TIMEOUT_MILLISECONDS', '3600000') // 1 hour
|
||||
}
|
||||
|
||||
export const getPreviewServiceRetryPeriodMinutes = (): number => {
|
||||
const value = getIntFromEnv('PREVIEW_SERVICE_RETRY_PERIOD_MINUTES', '1')
|
||||
if (value < 1 || value > 60)
|
||||
throw new MisconfiguredEnvironmentError(
|
||||
`PREVIEW_SERVICE_RETRY_PERIOD_MINUTES must be an integer between 1 and 60, got ${value}`
|
||||
)
|
||||
return value
|
||||
}
|
||||
|
||||
export const getPreviewServiceMaxQueueBackpressure = (): number => {
|
||||
const value = getIntFromEnv('PREVIEW_SERVICE_MAX_QUEUE_BACKPRESSURE', '1')
|
||||
if (value < 1)
|
||||
throw new MisconfiguredEnvironmentError(
|
||||
`PREVIEW_SERVICE_MAX_QUEUE_BACKPRESSURE must be an integer greater than 0, got ${value}`
|
||||
)
|
||||
return value
|
||||
}
|
||||
|
||||
export const emailVerificationTimeoutMinutes = (): number => {
|
||||
return getIntFromEnv('EMAIL_VERIFICATION_TIMEOUT_MINUTES', '5')
|
||||
}
|
||||
|
||||
@@ -771,6 +771,12 @@ Generate the environment variables for Speckle server and Speckle objects deploy
|
||||
- name: PREVIEW_SERVICE_TIMEOUT_MILLISECONDS
|
||||
value: {{ .Values.preview_service.puppeteer.timeoutMilliseconds | quote }}
|
||||
{{- end }}
|
||||
{{- if .Values.featureFlags.retryErroredPreviewsEnabled }}
|
||||
- name: PREVIEW_SERVICE_MAX_QUEUE_BACKPRESSURE
|
||||
value: {{ .Values.preview_service.maxQueueBackpressure | quote }}
|
||||
- name: PREVIEW_SERVICE_RETRY_PERIOD_MINUTES
|
||||
value: {{ .Values.preview_service.retryPeriodMinutes | quote }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
# *** Redis ***
|
||||
|
||||
@@ -2058,6 +2058,16 @@
|
||||
"description": "The maximum number of connections that the Preview Service postgres client will make to the Postgres database.",
|
||||
"default": 2
|
||||
},
|
||||
"maxQueueBackpressure": {
|
||||
"type": "number",
|
||||
"description": "The maximum number of items that can be queued in the Preview Service job queue before we stop retrying previously errored preview jobs. This is used to prevent the Preview Service from being overwhelmed with too many jobs.",
|
||||
"default": 1
|
||||
},
|
||||
"retryPeriodMinutes": {
|
||||
"type": "number",
|
||||
"description": "The period, in minutes, between retries of previously errored jobs. Must be an integer between 1 and 60.",
|
||||
"default": 1
|
||||
},
|
||||
"puppeteer": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
||||
@@ -1227,6 +1227,14 @@ preview_service:
|
||||
##
|
||||
postgresMaxConnections: 2
|
||||
|
||||
## @param preview_service.maxQueueBackpressure The maximum number of items that can be queued in the Preview Service job queue before we stop retrying previously errored preview jobs. This is used to prevent the Preview Service from being overwhelmed with too many jobs.
|
||||
## If the queue exceeds this number, the Preview Service will stop retrying previously errored jobs until the queue size is below this number.
|
||||
maxQueueBackpressure: 1
|
||||
|
||||
## @param preview_service.retryPeriodMinutes The period, in minutes, between retries of previously errored jobs. Must be an integer between 1 and 60.
|
||||
##
|
||||
retryPeriodMinutes: 1
|
||||
|
||||
puppeteer:
|
||||
## @param preview_service.puppeteer.userDataDirectory The path to the user data directory. If not set, defaults to '/tmp/puppeteer'. This is mounted in the deployment as a volume with read-write access.
|
||||
userDataDirectory: ''
|
||||
|
||||
Reference in New Issue
Block a user