moveStalledJobsToWait-8.lua 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  --[[
    Move stalled jobs to wait.

    Each run does two things:
      1. Every job id recorded in the 'stalled' set by a previous run that has
         no lock key anymore and is still present in the 'active' list is
         removed from 'active' and handed to moveJobToWait.
      2. The 'stalled' set is rebuilt from the current content of the 'active'
         list, so those ids become the candidates for the next run.

    The whole script is throttled through the 'stalled-check' key: while that
    key exists the script returns immediately without touching anything.

      Input:
        KEYS[1] 'stalled' (SET)
        KEYS[2] 'wait', (LIST)
        KEYS[3] 'active', (LIST)
        KEYS[4] 'stalled-check', (KEY)
        KEYS[5] 'meta', (KEY)
        KEYS[6] 'paused', (LIST)
        KEYS[7] 'marker'
        KEYS[8] 'event stream' (STREAM)

        ARGV[1]  Max stalled job count
        ARGV[2]  queue.toKey('')
        ARGV[3]  timestamp (stored as the value of 'stalled-check')
        ARGV[4]  max check time (used as the PX expiry of 'stalled-check')

      Output:
        List of job ids that were moved by this run. Empty list when the run
        was throttled or nothing was stalled.

      Events:
        'stalled' with stalled job id.
  ]]
local rcall = redis.call

-- Includes
--- @include "includes/addJobInTargetList"
--- @include "includes/batches"
--- @include "includes/moveJobToWait"
--- @include "includes/trimEvents"

local stalledKey = KEYS[1]
local waitKey = KEYS[2]
local activeKey = KEYS[3]
local stalledCheckKey = KEYS[4]
local metaKey = KEYS[5]
local pausedKey = KEYS[6]
local markerKey = KEYS[7]
local eventStreamKey = KEYS[8]
local maxStalledJobCount = tonumber(ARGV[1])
local queueKeyPrefix = ARGV[2]
local timestamp = ARGV[3]
local maxCheckTime = ARGV[4]

-- Throttle: a previous run set the check key and it has not expired yet.
if rcall("EXISTS", stalledCheckKey) == 1 then
  return {}
end

rcall("SET", stalledCheckKey, timestamp, "PX", maxCheckTime)

-- Trim events before emitting them to avoid trimming events emitted in this script
trimEvents(metaKey, eventStreamKey)

-- Move all stalled jobs to wait
local stalling = rcall('SMEMBERS', stalledKey)
local stalled = {}
if (#stalling > 0) then
  -- The set is rebuilt from the active list at the end of the script.
  rcall('DEL', stalledKey)

  -- Remove from active list
  for _, jobId in ipairs(stalling) do
    -- Markers in waitlist DEPRECATED in v5: Remove in v6.
    if string.sub(jobId, 1, 2) == "0:" then
      -- If the jobId is a delay marker ID we just remove it.
      rcall("LREM", activeKey, 1, jobId)
    else
      local jobKey = queueKeyPrefix .. jobId

      -- Check that the lock is also missing, then we can handle this job as really stalled.
      if (rcall("EXISTS", jobKey .. ":lock") == 0) then
        -- Remove from the active queue.
        local removed = rcall("LREM", activeKey, 1, jobId)

        -- removed == 0 means the id was no longer in the active list; skip it.
        if (removed > 0) then
          -- Count this stall on the job hash.
          local stalledCount = rcall("HINCRBY", jobKey, "stc", 1)

          -- Check if this is a repeatable job by looking at job options.
          -- The decode is wrapped in pcall on purpose: at this point the job
          -- has already been removed from the active list, and writes made by
          -- a script are not undone when it raises. An unparsable or
          -- non-object "opts" value must therefore not abort the script, or
          -- the job would be left in neither 'active' nor 'wait'. Such a job
          -- is treated like one without a "repeat" option.
          local jobOpts = rcall("HGET", jobKey, "opts")
          local isRepeatableJob = false
          if jobOpts then
            local decoded, opts = pcall(cjson.decode, jobOpts)
            if decoded and type(opts) == "table" and opts["repeat"] then
              isRepeatableJob = true
            end
          end

          -- If this job has been stalled too many times, such as if it crashes
          -- the worker, record a failure reason in the "defa" field. Repeatable
          -- jobs are exempt. The job is still moved to wait below.
          -- NOTE(review): presumably whoever picks the job up next reads "defa"
          -- and fails it - confirm against the consumer side.
          if stalledCount > maxStalledJobCount and not isRepeatableJob then
            local failedReason = "job stalled more than allowable limit"
            rcall("HSET", jobKey, "defa", failedReason)
          end

          moveJobToWait(metaKey, activeKey, waitKey, pausedKey, markerKey, eventStreamKey, jobId,
            "RPUSH")

          -- Emit the stalled event
          rcall("XADD", eventStreamKey, "*", "event", "stalled", "jobId", jobId)
          table.insert(stalled, jobId)
        end
      end
    end
  end
end

-- Mark potentially stalled jobs: everything currently in the active list
-- becomes a candidate for the next run.
local active = rcall('LRANGE', activeKey, 0, -1)

if (#active > 0) then
  -- SADD is issued in ranges of at most 7000 ids as produced by the included
  -- batches helper (presumably to bound the number of unpacked arguments).
  for from, to in batches(#active, 7000) do
    rcall('SADD', stalledKey, unpack(active, from, to))
  end
end

return stalled