diff --git a/packages/server/modules/previews/domain/operations.ts b/packages/server/modules/previews/domain/operations.ts index a2175e500..75d43cf01 100644 --- a/packages/server/modules/previews/domain/operations.ts +++ b/packages/server/modules/previews/domain/operations.ts @@ -132,3 +132,5 @@ export type BuildUpdateObjectPreview = (params: { }) => Promise export type ObserveMetrics = (params: { payload: PreviewResultPayload }) => void + +export type GetNumberOfJobsInRequestQueue = () => Promise diff --git a/packages/server/modules/previews/index.ts b/packages/server/modules/previews/index.ts index 2df4a6742..76b05324a 100644 --- a/packages/server/modules/previews/index.ts +++ b/packages/server/modules/previews/index.ts @@ -6,6 +6,7 @@ import { disablePreviews, getFeatureFlags, getPreviewServiceRedisUrl, + getPreviewServiceRetryPeriodMinutes, getRedisUrl, getServerOrigin } from '@/modules/shared/helpers/envHelper' @@ -101,7 +102,7 @@ export const init: SpeckleModule['init'] = async ({ scheduleExecution, previewRequestQueue, responseQueueName, - cronExpression: '*/23 * * * *' // every 23 minutes (kind of random prime number to reduce syncing with other possibly heavy tasks) + cronExpression: `*/${getPreviewServiceRetryPeriodMinutes()} * * * *` }) ] : []) diff --git a/packages/server/modules/previews/queues/previews.ts b/packages/server/modules/previews/queues/previews.ts index f7cabf40e..d6fdc731c 100644 --- a/packages/server/modules/previews/queues/previews.ts +++ b/packages/server/modules/previews/queues/previews.ts @@ -1,12 +1,13 @@ import type { BuildUpdateObjectPreview, + GetNumberOfJobsInRequestQueue, RequestObjectPreview } from '@/modules/previews/domain/operations' import type { Logger } from '@/observability/logging' import type { Queue, Job } from 'bull' import { PreviewStatus } from '@/modules/previews/domain/consts' import type { JobPayload } from '@speckle/shared/workers/previews' -import { fromJobId, jobIdSchema } from '@speckle/shared/workers/previews' +import { fromJobId } from '@speckle/shared/workers/previews' export const requestObjectPreviewFactory = ({ @@ -63,3 +64,10 @@ export const requestFailedHandlerFactory = ) } } + +export const getNumberOfJobsInQueueFactory = + (deps: { queue: Queue }): GetNumberOfJobsInRequestQueue => + async () => { + const counts = await deps.queue.getJobCounts() + return counts.waiting + counts.active + counts.delayed + } diff --git a/packages/server/modules/previews/services/retryErrors.ts b/packages/server/modules/previews/services/retryErrors.ts index dd9643be7..d0ec235cd 100644 --- a/packages/server/modules/previews/services/retryErrors.ts +++ b/packages/server/modules/previews/services/retryErrors.ts @@ -1,5 +1,6 @@ import type { Logger } from '@/observability/logging' import { + GetNumberOfJobsInRequestQueue, GetPaginatedObjectPreviewsInErrorState, GetPaginatedObjectPreviewsPage, GetPaginatedObjectPreviewsTotalCount, @@ -12,6 +13,7 @@ import { DefaultAppIds } from '@/modules/auth/defaultApps' import { TokenResourceIdentifierType } from '@/modules/core/domain/tokens/types' import { GetStreamCollaborators } from '@/modules/core/domain/streams/operations' import { CreateAndStoreAppToken } from '@/modules/core/domain/tokens/operations' +import { getPreviewServiceMaxQueueBackpressure } from '@/modules/shared/helpers/envHelper' export const getPaginatedObjectPreviewInErrorStateFactory = (deps: { @@ -47,6 +49,7 @@ export const retryFailedPreviewsFactory = (deps: { serverOrigin: string createAppToken: CreateAndStoreAppToken requestObjectPreview: RequestObjectPreview + getNumberOfJobsInQueue: GetNumberOfJobsInRequestQueue }) => { const { getPaginatedObjectPreviewsInErrorState, @@ -54,7 +57,8 @@ export const retryFailedPreviewsFactory = (deps: { getStreamCollaborators, serverOrigin, createAppToken, - requestObjectPreview + requestObjectPreview, + getNumberOfJobsInQueue } = deps return async (params: { logger: Logger }): Promise => { const { logger } = params @@ -68,6 +72,16 @@ export const retryFailedPreviewsFactory = (deps: { return false } + // do not retry if we have backpressure in the queue + const queueLength = await getNumberOfJobsInQueue() + if (queueLength > getPreviewServiceMaxQueueBackpressure()) { + logger.info( + { queueLength, totalErroredPreviewCount: totalCount }, + 'Backpressure detected in the preview request queue, queue length is {queueLength} jobs. Found {totalErroredPreviewCount} object previews in error state, but are not retrying any on this iteration.' + ) + return false + } + const objPreview = items[0] const { streamId, objectId } = objPreview diff --git a/packages/server/modules/previews/tasks/tasks.ts b/packages/server/modules/previews/tasks/tasks.ts index f820243d8..23978491f 100644 --- a/packages/server/modules/previews/tasks/tasks.ts +++ b/packages/server/modules/previews/tasks/tasks.ts @@ -4,7 +4,10 @@ import { previewServiceShouldUsePrivateObjectsServerUrl } from '@/modules/shared/helpers/envHelper' import type { Queue } from 'bull' -import { requestObjectPreviewFactory } from '@/modules/previews/queues/previews' +import { + getNumberOfJobsInQueueFactory, + requestObjectPreviewFactory +} from '@/modules/previews/queues/previews' import type { ScheduleExecution } from '@/modules/core/domain/scheduledTasks/operations' import { getRegisteredDbClients } from '@/modules/multiregion/utils/dbSelector' import { @@ -72,6 +75,9 @@ export const scheduleRetryFailedPreviews = async ({ db }), storeUserServerAppToken: storeUserServerAppTokenFactory({ db }) + }), + getNumberOfJobsInQueue: getNumberOfJobsInQueueFactory({ + queue: previewRequestQueue }) }) ) diff --git a/packages/server/modules/shared/helpers/envHelper.ts b/packages/server/modules/shared/helpers/envHelper.ts index 12f0f4dd8..73df103e5 100644 --- a/packages/server/modules/shared/helpers/envHelper.ts +++ b/packages/server/modules/shared/helpers/envHelper.ts @@ -513,6 +513,24 @@ export const getPreviewServiceTimeoutMilliseconds = (): number => { return getIntFromEnv('PREVIEW_SERVICE_TIMEOUT_MILLISECONDS', '3600000') // 1 hour } +export const getPreviewServiceRetryPeriodMinutes = (): number => { + const value = getIntFromEnv('PREVIEW_SERVICE_RETRY_PERIOD_MINUTES', '1') + if (value < 1 || value > 60) + throw new MisconfiguredEnvironmentError( + `PREVIEW_SERVICE_RETRY_PERIOD_MINUTES must be an integer between 1 and 60, got ${value}` + ) + return value +} + +export const getPreviewServiceMaxQueueBackpressure = (): number => { + const value = getIntFromEnv('PREVIEW_SERVICE_MAX_QUEUE_BACKPRESSURE', '1') + if (value < 1) + throw new MisconfiguredEnvironmentError( + `PREVIEW_SERVICE_MAX_QUEUE_BACKPRESSURE must be an integer greater than 0, got ${value}` + ) + return value +} + export const emailVerificationTimeoutMinutes = (): number => { return getIntFromEnv('EMAIL_VERIFICATION_TIMEOUT_MINUTES', '5') } diff --git a/utils/helm/speckle-server/templates/_helpers.tpl b/utils/helm/speckle-server/templates/_helpers.tpl index 447c87f96..f3a656060 100644 --- a/utils/helm/speckle-server/templates/_helpers.tpl +++ b/utils/helm/speckle-server/templates/_helpers.tpl @@ -771,6 +771,12 @@ Generate the environment variables for Speckle server and Speckle objects deploy - name: PREVIEW_SERVICE_TIMEOUT_MILLISECONDS value: {{ .Values.preview_service.puppeteer.timeoutMilliseconds | quote }} {{- end }} +{{- if .Values.featureFlags.retryErroredPreviewsEnabled }} +- name: PREVIEW_SERVICE_MAX_QUEUE_BACKPRESSURE + value: {{ .Values.preview_service.maxQueueBackpressure | quote }} +- name: PREVIEW_SERVICE_RETRY_PERIOD_MINUTES + value: {{ .Values.preview_service.retryPeriodMinutes | quote }} +{{- end }} {{- end }} # *** Redis *** diff --git a/utils/helm/speckle-server/values.schema.json b/utils/helm/speckle-server/values.schema.json index f11c1f79b..9bbbdeed9 100644 --- a/utils/helm/speckle-server/values.schema.json +++ b/utils/helm/speckle-server/values.schema.json @@ -2058,6 +2058,16 @@ "description": "The maximum number of connections that the Preview Service postgres client will make to the Postgres database.", "default": 2 }, + "maxQueueBackpressure": { + "type": "number", + "description": "The maximum number of items that can be queued in the Preview Service job queue before we stop retrying previously errored preview jobs. This is used to prevent the Preview Service from being overwhelmed with too many jobs.", + "default": 1 + }, + "retryPeriodMinutes": { + "type": "number", + "description": "The period, in minutes, between retries of previously errored jobs. Must be an integer between 1 and 60.", + "default": 1 + }, "puppeteer": { "type": "object", "properties": { diff --git a/utils/helm/speckle-server/values.yaml b/utils/helm/speckle-server/values.yaml index 92070b2cf..6daf0165b 100644 --- a/utils/helm/speckle-server/values.yaml +++ b/utils/helm/speckle-server/values.yaml @@ -1227,6 +1227,14 @@ preview_service: ## postgresMaxConnections: 2 + ## @param preview_service.maxQueueBackpressure The maximum number of items that can be queued in the Preview Service job queue before we stop retrying previously errored preview jobs. This is used to prevent the Preview Service from being overwhelmed with too many jobs. + ## If the queue exceeds this number, the Preview Service will stop retrying previously errored jobs until the queue size is below this number. + maxQueueBackpressure: 1 + + ## @param preview_service.retryPeriodMinutes The period, in minutes, between retries of previously errored jobs. Must be an integer between 1 and 60. + ## + retryPeriodMinutes: 1 + puppeteer: ## @param preview_service.puppeteer.userDataDirectory The path to the user data directory. If not set, defaults to '/tmp/puppeteer'. This is mounted in the deployment as a volume with read-write access. userDataDirectory: ''