From d891f9e2bcb8055621c8f5136656c4653cab5047 Mon Sep 17 00:00:00 2001 From: Jonathon Hall Date: Mon, 21 Oct 2024 09:55:17 -0400 Subject: [PATCH] worker.py: Avoid letting a job get stuck if exception occurs Some instances of stuck jobs were observed recently for PureOS. From the logs, I think a Python exception may have occurred after the build completed but before the artifacts were uploaded. I can't tell what might have caused that exception, if it did occur. This change would ensure that a 'rejected' status is sent if this occurs, rather than leaving the job stuck with no result. I applied this change to fennel.pureos.net (the new worker) to try to identify what was causing the stuck jobs, but this never happened again. No jobs got stuck after that and this exception code was never hit, so I can't identify the root cause. Signed-off-by: Jonathon Hall --- spark/worker.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/spark/worker.py b/spark/worker.py index 51816b1..a505bb0 100644 --- a/spark/worker.py +++ b/spark/worker.py @@ -184,19 +184,32 @@ def _request_job(self): # there are no jobs available for us return False - job_module = job_reply.get('module') - job_kind = job_reply.get('kind') - job_id = job_reply.get('uuid') - - if job_kind in self._conf.accepted_job_kinds: - return self._run_job(job_reply) - else: - log.warning( - 'Received job of type {0}::{1} which we can not handle.'.format( - job_module, job_kind + # Now that we've accepted a job, we must reply to the server, even if + # an exception occurs. If we don't, the job is stuck indefinitely and + # we will not be able to accept another. 
+ try: + job_module = job_reply.get('module') + job_kind = job_reply.get('kind') + job_id = job_reply.get('uuid') + + if job_kind in self._conf.accepted_job_kinds: + return self._run_job(job_reply) + else: + log.warning( + 'Received job of type {0}::{1} which we can not handle.'.format( + job_module, job_kind + ) ) - ) + self._conn.send_job_status(job_id, JobStatus.REJECTED) + return False + except: # noqa: E722 pylint: disable=bare-except + import traceback + + tb = traceback.format_exc() + jlog.write(tb) self._conn.send_job_status(job_id, JobStatus.REJECTED) + log.warning(tb) + log.info('Rejected job {} due to exception'.format(job_id)) return False def _update_archive_data(self) -> bool: