Service health (#516)

* health checks, sigterm handling for all services
This commit is contained in:
Cristian Balas
2022-01-12 15:18:32 +02:00
committed by GitHub
parent 856c816a7a
commit 019b7ac495
19 changed files with 31796 additions and 31421 deletions
+6 -2
View File
@@ -5,6 +5,7 @@ services:
build:
context: .
dockerfile: packages/frontend/Dockerfile
image: speckle/speckle-frontend:local
restart: always
ports:
- "0.0.0.0:80:80"
@@ -13,6 +14,7 @@ services:
build:
context: .
dockerfile: packages/server/Dockerfile
image: speckle/speckle-server:local
restart: always
environment:
# TODO: Change this to the URL of the speckle server, as accessed from the network
@@ -41,6 +43,7 @@ services:
build:
context: .
dockerfile: packages/preview-service/Dockerfile
image: speckle/speckle-preview-service:local
restart: always
mem_limit: "1000m"
memswap_limit: "1000m"
@@ -52,15 +55,17 @@ services:
build:
context: .
dockerfile: packages/webhook-service/Dockerfile
image: speckle/speckle-webhook-service:local
restart: always
environment:
DEBUG: "webhook-service:*"
PG_CONNECTION_STRING: "postgres://speckle:speckle@postgres/speckle"
fileimport-service:
build:
context: .
dockerfile: packages/fileimport-service/Dockerfile
image: speckle/speckle-fileimport-service:local
restart: always
environment:
DEBUG: "fileimport-service:*"
@@ -72,4 +77,3 @@ services:
S3_BUCKET: "speckle-server"
SPECKLE_SERVER_URL: "http://speckle-server:3000"
-1
View File
@@ -17,5 +17,4 @@ RUN npm ci
COPY packages/fileimport-service .
ENTRYPOINT [ "tini", "--" ]
CMD ["node", "src/daemon.js"]
+17
View File
@@ -9,9 +9,13 @@ const { spawn } = require( 'child_process' )
const ServerAPI = require( '../ifc/api' )
const HEALTHCHECK_FILE_PATH = '/tmp/last_successful_query'
const TMP_FILE_PATH = '/tmp/file_to_import'
const TMP_RESULTS_PATH = '/tmp/import_result.json'
let shouldExit = false
async function startTask() {
let { rows } = await knex.raw( `
UPDATE file_uploads
@@ -155,8 +159,15 @@ function runProcessWithTimeout( cmd, cmdArgs, extraEnv, timeoutMs ) {
}
async function tick() {
if ( shouldExit ) {
process.exit( 0 )
}
try {
let task = await startTask()
fs.writeFile( HEALTHCHECK_FILE_PATH, '' + Date.now(), () => {} )
if ( !task ) {
setTimeout( tick, 1000 )
return
@@ -175,6 +186,12 @@ async function tick() {
/**
 * Entry point of the file import daemon.
 *
 * Logs a startup banner, registers a SIGTERM listener that only flags the
 * process for exit (the flag is checked at the top of `tick()`), and then
 * kicks off the polling loop.
 */
async function main() {
  console.log( 'Starting FileUploads Service...' )

  // Do not exit right away: just raise the flag so the loop can stop on its own.
  const requestShutdown = () => {
    shouldExit = true
    console.log( 'Shutting down...' )
  }
  process.on( 'SIGTERM', requestShutdown )

  tick()
}
-1
View File
@@ -55,5 +55,4 @@ COPY --from=build-stage /opt/preview-service /opt/preview-service
WORKDIR /opt/preview-service
ENTRYPOINT [ "tini", "--" ]
CMD ["node", "bin/www"]
@@ -5,6 +5,12 @@ const knex = require( '../knex' )
const fetch = require( 'node-fetch' )
const ObjectPreview = ( ) => knex( 'object_preview' )
const Previews = ( ) => knex( 'previews' )
const fs = require( 'fs' )
let shouldExit = false
const HEALTHCHECK_FILE_PATH = '/tmp/last_successful_query'
async function startTask() {
let { rows } = await knex.raw( `
@@ -72,8 +78,15 @@ async function doTask( task ) {
}
async function tick() {
if ( shouldExit ) {
process.exit( 0 )
}
try {
let task = await startTask()
fs.writeFile( HEALTHCHECK_FILE_PATH, '' + Date.now(), () => {} )
if ( !task ) {
setTimeout( tick, 1000 )
return
@@ -92,6 +105,11 @@ async function tick() {
/**
 * Boots the preview service worker.
 *
 * Prints the startup message, installs a SIGTERM listener that sets the
 * module-level `shouldExit` flag (read at the start of `tick()`), and
 * starts the first polling iteration.
 */
async function startPreviewService() {
  console.log( '📸 Started Preview Service' )

  process.on( 'SIGTERM', function onSigterm() {
    // Only mark the shutdown request here; the actual exit happens in tick().
    shouldExit = true
    console.log( 'Shutting down...' )
  } )

  tick()
}
-1
View File
@@ -20,5 +20,4 @@ RUN npm ci
COPY packages/server .
ENTRYPOINT [ "tini", "--" ]
CMD ["node", "bin/www"]
+17
View File
@@ -10,6 +10,7 @@ const logger = require( 'morgan-debug' )
const bodyParser = require( 'body-parser' )
const path = require( 'path' )
const debug = require( 'debug' )
const { createTerminus } = require( '@godaddy/terminus' )
const Sentry = require( '@sentry/node' )
const Tracing = require( '@sentry/tracing' )
@@ -152,6 +153,22 @@ exports.startHttp = async ( app, customPortOverride ) => {
app.use( Sentry.Handlers.errorHandler( ) )
// Graceful shutdown wiring through createTerminus (imported from '@godaddy/terminus').
// large timeout to allow large downloads on slow connections to finish
// NOTE(review): hook ordering and the timeout unit (milliseconds) are taken from the
// terminus documentation, not from this file — confirm against the installed version.
createTerminus( server, {
// Signals that start the shutdown sequence.
signals: [ 'SIGTERM', 'SIGINT' ],
// 5 * 60 * 1000 = 300000, i.e. 5 minutes if the unit is milliseconds.
timeout: 5 * 60 * 1000,
// Only logs, under the 'speckle:shutdown' debug namespace.
beforeShutdown: () => {
debug( 'speckle:shutdown' )( 'Shutting down (signal received)...' )
},
// Intentionally empty placeholder for future cleanup work.
onSignal: () => {
// Other custom cleanup after connections are finished
},
// Logs completion and then terminates the process with exit code 0.
onShutdown: () => {
debug( 'speckle:shutdown' )( 'Shutdown completed' )
process.exit( 0 )
}
} )
server.on( 'listening', ( ) => {
debug( 'speckle:startup' )( `🚀 My name is Speckle Server, and I'm running at ${server.address().address}:${server.address().port}` )
app.emit( 'appStarted' )
+31446 -31415
View File
File diff suppressed because it is too large Load Diff
+1
View File
@@ -18,6 +18,7 @@
"test:report": "npm run test -- --reporter mocha-junit-reporter --reporter-options mochaFile=reports/test-results.xml"
},
"dependencies": {
"@godaddy/terminus": "^4.9.0",
"@sentry/node": "^5.29.2",
"@sentry/tracing": "^5.29.2",
"apollo-server-express": "^2.19.0",
-1
View File
@@ -17,5 +17,4 @@ RUN npm ci
COPY packages/webhook-service/src .
ENTRYPOINT [ "tini", "--" ]
CMD ["node", "main.js"]
+17
View File
@@ -2,6 +2,10 @@
const crypto = require( 'crypto' )
const knex = require( './knex' )
const fs = require( 'fs' )
let shouldExit = false
const HEALTHCHECK_FILE_PATH = '/tmp/last_successful_query'
const { makeNetworkRequest, isLocalNetworkUrl } = require( './webhookCaller' )
@@ -77,8 +81,15 @@ async function doTask( task ) {
}
async function tick() {
if ( shouldExit ) {
process.exit( 0 )
}
try {
let task = await startTask()
fs.writeFile( HEALTHCHECK_FILE_PATH, '' + Date.now(), () => {} )
if ( !task ) {
setTimeout( tick, 1000 )
return
@@ -97,6 +108,12 @@ async function tick() {
/**
 * Entry point of the webhook service.
 *
 * Announces startup, subscribes to SIGTERM so the module-level `shouldExit`
 * flag gets raised (the flag is checked at the start of `tick()`), and then
 * starts the polling loop.
 */
async function main() {
  // Raises the exit flag; the process itself is terminated from tick().
  function flagForExit() {
    shouldExit = true
    console.log( 'Shutting down...' )
  }

  console.log( 'Starting Webhook Service...' )
  process.on( 'SIGTERM', flagForExit )
  tick()
}
@@ -27,6 +27,8 @@ spec:
name: postgres-certificate
{{- end }}
terminationGracePeriodSeconds: 310
containers:
- name: main
image: speckle/speckle-server:{{ .Values.docker_image_tag }}
@@ -45,6 +47,32 @@ spec:
mountPath: /postgres-certificate
{{- end }}
# Allow for k8s to remove the pod from the service endpoints to stop receive traffic
lifecycle:
preStop:
exec:
command: ["sleep", "5"]
livenessProbe:
  # account for long-running migrations
  initialDelaySeconds: 600
  periodSeconds: 60
  exec:
    # Queries the local GraphQL endpoint for the server version.
    # Exits 1 (unhealthy) when there is no response body or the body contains "error",
    # and 0 otherwise. The result is converted with Number() because process.exit()
    # only accepts integer codes on Node.js 20+ (a boolean argument throws there).
    command:
      - node
      - -e
      - require('request')('http://localhost:3000/graphql?query={serverInfo{version}}', (e,r,b) => process.exit(Number(!b || b.toLowerCase().includes('error'))))
readinessProbe:
  initialDelaySeconds: 5
  periodSeconds: 10
  exec:
    # Queries the local GraphQL endpoint for the server version.
    # Exits 1 (not ready) when there is no response body or the body contains "error",
    # and 0 otherwise. The result is converted with Number() because process.exit()
    # only accepts integer codes on Node.js 20+ (a boolean argument throws there).
    command:
      - node
      - -e
      - require('request')('http://localhost:3000/graphql?query={serverInfo{version}}', (e,r,b) => process.exit(Number(!b || b.toLowerCase().includes('error'))))
env:
- name: CANONICAL_URL
value: https://{{ .Values.domain }}
@@ -95,6 +123,9 @@ spec:
secretKeyRef:
name: {{ .Values.secretName }}
key: s3_secret_key
- name: S3_CREATE_BUCKET
value: "{{ .Values.s3.create_bucket }}"
{{- end }}
# *** Authentication ***
@@ -29,10 +29,22 @@ spec:
name: postgres-certificate
{{- end }}
# Should be > File import timeout to allow finishing up imports
terminationGracePeriodSeconds: 610
containers:
- name: main
image: speckle/speckle-fileimport-service:{{ .Values.docker_image_tag }}
livenessProbe:
  initialDelaySeconds: 60
  periodSeconds: 60
  exec:
    # Reads the Date.now() timestamp stored in /tmp/last_successful_query and exits 1
    # (unhealthy) when it is more than 15 minutes old, 0 otherwise. A missing file makes
    # readFileSync throw, which also yields a non-zero exit.
    # The comparison is wrapped in Number() because process.exit() only accepts integer
    # codes on Node.js 20+ (a boolean argument throws there).
    command:
      - node
      - -e
      - process.exit(Number(Date.now() - require('fs').readFileSync('/tmp/last_successful_query', 'utf8') > 15 * 60 * 1000))
resources:
requests:
cpu: {{ .Values.fileimport_service.requests.cpu }}
@@ -30,3 +30,16 @@ spec:
limits:
cpu: {{ .Values.frontend.limits.cpu }}
memory: {{ .Values.frontend.limits.memory }}
# Allow for k8s to remove the pod from the service endpoints to stop receive traffic
lifecycle:
preStop:
exec:
command: ["sleep", "5"]
readinessProbe:
httpGet:
path: /
port: 80
initialDelaySeconds: 5
periodSeconds: 5
@@ -27,10 +27,22 @@ spec:
name: postgres-certificate
{{- end }}
# Should be > preview generation time ( 1 hour for good measure )
terminationGracePeriodSeconds: 3600
containers:
- name: main
image: speckle/speckle-preview-service:{{ .Values.docker_image_tag }}
livenessProbe:
  initialDelaySeconds: 60
  periodSeconds: 60
  exec:
    # Reads the Date.now() timestamp stored in /tmp/last_successful_query and exits 1
    # (unhealthy) when it is more than one hour old, 0 otherwise. A missing file makes
    # readFileSync throw, which also yields a non-zero exit.
    # The comparison is wrapped in Number() because process.exit() only accepts integer
    # codes on Node.js 20+ (a boolean argument throws there).
    command:
      - node
      - -e
      - process.exit(Number(Date.now() - require('fs').readFileSync('/tmp/last_successful_query', 'utf8') > 3600 * 1000))
resources:
requests:
cpu: {{ .Values.preview_service.requests.cpu }}
@@ -27,10 +27,22 @@ spec:
name: postgres-certificate
{{- end }}
# Should be > webhook max call time ( ~= 10 seconds )
terminationGracePeriodSeconds: 30
containers:
- name: main
image: speckle/speckle-webhook-service:{{ .Values.docker_image_tag }}
livenessProbe:
  initialDelaySeconds: 60
  periodSeconds: 60
  exec:
    # Reads the Date.now() timestamp stored in /tmp/last_successful_query and exits 1
    # (unhealthy) when it is more than 30 seconds old, 0 otherwise. A missing file makes
    # readFileSync throw, which also yields a non-zero exit.
    # The comparison is wrapped in Number() because process.exit() only accepts integer
    # codes on Node.js 20+ (a boolean argument throws there).
    command:
      - node
      - -e
      - process.exit(Number(Date.now() - require('fs').readFileSync('/tmp/last_successful_query', 'utf8') > 30 * 1000))
resources:
requests:
cpu: {{ .Values.webhook_service.requests.cpu }}
+1
View File
@@ -14,6 +14,7 @@ s3:
endpoint: ""
bucket: ""
access_key: ""
create_bucket: "false"
# secret_key: secret -> s3_secret_key
#redis:
+177
View File
@@ -0,0 +1,177 @@
apiVersion: v1
kind: Namespace
metadata:
name: speckle-test
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: pg
namespace: speckle-test
labels:
app: pg
project: speckle-server
spec:
replicas: 1
selector:
matchLabels:
app: pg
project: speckle-server
template:
metadata:
labels:
app: pg
project: speckle-server
spec:
priorityClassName: high-priority
containers:
- name: main
image: postgres:13.1-alpine
resources:
limits:
cpu: 1000m
memory: 1Gi
env:
- name: POSTGRES_DB
value: speckle
- name: POSTGRES_USER
value: speckle
- name: POSTGRES_PASSWORD
value: speckle
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: redis
namespace: speckle-test
labels:
app: redis
project: speckle-server
spec:
replicas: 1
selector:
matchLabels:
app: redis
project: speckle-server
template:
metadata:
labels:
app: redis
project: speckle-server
spec:
priorityClassName: high-priority
containers:
- name: main
image: redis:6.0-alpine
resources:
limits:
cpu: 1000m
memory: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: minio
namespace: speckle-test
labels:
app: minio
project: speckle-server
spec:
replicas: 1
selector:
matchLabels:
app: minio
project: speckle-server
template:
metadata:
labels:
app: minio
project: speckle-server
spec:
priorityClassName: high-priority
containers:
- name: main
image: minio/minio
args: ["server", "/data", "--console-address", ":9001"]
resources:
limits:
cpu: 1000m
memory: 1Gi
---
apiVersion: v1
kind: Service
metadata:
name: pg
namespace: speckle-test
labels:
app: pg
project: speckle-server
spec:
selector:
app: pg
project: speckle-server
ports:
- protocol: TCP
name: pg
port: 5432
targetPort: 5432
---
apiVersion: v1
kind: Service
metadata:
name: redis
namespace: speckle-test
labels:
app: redis
project: speckle-server
spec:
selector:
app: redis
project: speckle-server
ports:
- protocol: TCP
name: redis
port: 6379
targetPort: 6379
---
apiVersion: v1
kind: Service
metadata:
name: minio
namespace: speckle-test
labels:
app: minio
project: speckle-server
spec:
type: NodePort
selector:
app: minio
project: speckle-server
ports:
- protocol: TCP
name: minio1
port: 9000
targetPort: 9000
- protocol: TCP
name: minio2
port: 9001
targetPort: 9001
---
apiVersion: v1
kind: Secret
metadata:
name: server-vars
namespace: speckle-test
stringData:
postgres_url: postgresql://speckle:speckle@pg/speckle
redis_url: redis://redis
session_secret: hello
email_password: ""
s3_secret_key: minioadmin
google_client_secret: ""
+16
View File
@@ -0,0 +1,16 @@
namespace: speckle-test
domain: myspeckleserver
docker_image_tag: "local"
db:
PGSSLMODE: allow
s3:
endpoint: "http://minio:9000/"
bucket: "speckle"
access_key: "minioadmin"
create_bucket: "true"
cert_manager_issuer: letsencrypt-staging