| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113 |
--[[
  Move stalled jobs to wait.

    Input:
      KEYS[1] 'stalled' (SET)
      KEYS[2] 'wait', (LIST)
      KEYS[3] 'active', (LIST)
      KEYS[4] 'stalled-check', (KEY)
      KEYS[5] 'meta', (KEY)
      KEYS[6] 'paused', (LIST)
      KEYS[7] 'marker'
      KEYS[8] 'event stream' (STREAM)

      ARGV[1]  Max stalled job count
      ARGV[2]  queue.toKey('')
      ARGV[3]  timestamp
      ARGV[4]  max check time (used as the PX expiry, i.e. milliseconds, of 'stalled-check')

    Output:
      List of job ids that were found stalled in this run (removed from 'active'
      and handed to moveJobToWait). An empty list is returned when the
      'stalled-check' key still exists, i.e. a check ran within the last
      "max check time" window.

    Events:
      'stalled' with stalled job id.
]]
local rcall = redis.call

-- Includes
--- @include "includes/addJobInTargetList"
--- @include "includes/batches"
--- @include "includes/moveJobToWait"
--- @include "includes/trimEvents"

local stalledKey = KEYS[1]
local waitKey = KEYS[2]
local activeKey = KEYS[3]
local stalledCheckKey = KEYS[4]
local metaKey = KEYS[5]
local pausedKey = KEYS[6]
local markerKey = KEYS[7]
local eventStreamKey = KEYS[8]
local maxStalledJobCount = tonumber(ARGV[1])
local queueKeyPrefix = ARGV[2]
local timestamp = ARGV[3]
local maxCheckTime = ARGV[4]

-- Returns true when the job hash holds an "opts" JSON object with a truthy
-- "repeat" entry, false otherwise.
-- The decode is guarded with pcall and a table check on purpose: this helper is
-- called after the job has already been removed from the active list, and Redis
-- does not undo earlier writes when a script raises. An unguarded decode error
-- on malformed or non-object opts would therefore strand the job outside of
-- both 'active' and 'wait'.
local function hasRepeatOpts(jobKey)
    local rawOpts = rcall("HGET", jobKey, "opts")
    if not rawOpts then
        return false
    end

    local ok, opts = pcall(cjson.decode, rawOpts)
    if not ok or type(opts) ~= "table" then
        return false
    end

    return opts["repeat"] and true or false
end

-- Throttle: only one stalled check per maxCheckTime window.
if rcall("EXISTS", stalledCheckKey) == 1 then
    return {}
end

rcall("SET", stalledCheckKey, timestamp, "PX", maxCheckTime)

-- Trim events before emitting them to avoid trimming events emitted in this script
trimEvents(metaKey, eventStreamKey)

-- Move all stalled jobs to wait
local stalling = rcall('SMEMBERS', stalledKey)
local stalled = {}
if (#stalling > 0) then
    -- The candidates are now held in `stalling`; the set is rebuilt at the end of the script.
    rcall('DEL', stalledKey)

    -- Remove from active list
    for _, jobId in ipairs(stalling) do
        -- Markers in waitlist DEPRECATED in v5: Remove in v6.
        if string.sub(jobId, 1, 2) == "0:" then
            -- If the jobId is a delay marker ID we just remove it.
            rcall("LREM", activeKey, 1, jobId)
        else
            local jobKey = queueKeyPrefix .. jobId

            -- Check that the lock is also missing, then we can handle this job as really stalled.
            if (rcall("EXISTS", jobKey .. ":lock") == 0) then
                -- Remove from the active queue.
                local removed = rcall("LREM", activeKey, 1, jobId)

                if (removed > 0) then
                    -- If this job has been stalled too many times, such as if it crashes the worker, then fail it.
                    local stalledCount = rcall("HINCRBY", jobKey, "stc", 1)

                    -- Only flag the job for failure if it exceeds the stall limit AND is not a repeatable job.
                    -- The reason is stored in the "defa" field of the job hash; what consumes that
                    -- field is not visible in this script.
                    if stalledCount > maxStalledJobCount and not hasRepeatOpts(jobKey) then
                        local failedReason = "job stalled more than allowable limit"
                        rcall("HSET", jobKey, "defa", failedReason)
                    end

                    moveJobToWait(metaKey, activeKey, waitKey, pausedKey, markerKey, eventStreamKey, jobId,
                        "RPUSH")

                    -- Emit the stalled event
                    rcall("XADD", eventStreamKey, "*", "event", "stalled", "jobId", jobId)
                    table.insert(stalled, jobId)
                end
            end
        end
    end
end

-- Mark potentially stalled jobs: everything currently in 'active' becomes a
-- candidate for the next run. SADD is issued in batches of 7000 ids per call so
-- unpack never has to expand the whole list at once.
local active = rcall('LRANGE', activeKey, 0, -1)

if (#active > 0) then
    for from, to in batches(#active, 7000) do
        rcall('SADD', stalledKey, unpack(active, from, to))
    end
end

return stalled
|