20
20
from logging import getLogger
21
21
from logging .handlers import RotatingFileHandler
22
22
from configparser import ConfigParser
23
+ from threading import Lock
23
24
24
25
import redis
25
26
@@ -59,6 +60,29 @@ def reraise(tp, value, tb=None):
59
60
JOBS_EXPIRATION = 15 # 20 * 60 # how long to keep job ids in the EOS registry (exactly-once-semantics)
60
61
61
62
config = None
63
# exc_info triple of the first exception raised by any SafeThread target,
# or None while no SafeThread has failed.
first_exception_info = None
# Guards the check-and-set of first_exception_info across threads.
safe_thread_lock = Lock()


class SafeThread(threading.Thread):
    """Thread that records an exception raised by its target instead of losing it.

    When the target raises, the ``sys.exc_info()`` triple is stored on the
    thread itself (``exc_info``) and, if no other SafeThread has failed yet,
    in the module-level ``first_exception_info``. The exception is logged and
    does not propagate out of ``run``.

    Args:
        target: callable executed in the new thread.
        name: thread name; also used in the log message on failure.
        args: positional arguments for ``target``.
        kwargs: keyword arguments for ``target`` (``None`` means none).
        daemon: forwarded to ``threading.Thread``.
    """

    def __init__(self, *, target, name, args=(), kwargs=None, daemon=None):
        super().__init__(None, target, name, args, kwargs, daemon=daemon)
        # exc_info triple of this thread's own failure; stays None on success.
        self.exc_info = None

    def run(self):
        """Invoke the target, capturing any exception it raises."""
        global first_exception_info
        try:
            self._target(*self._args, **self._kwargs)
        except BaseException:  # noqa - deliberately catch-all: nothing may escape the worker
            exc_info = sys.exc_info()
            # Record the failure before logging, so that a problem in the
            # logging call cannot cause the exception to go unreported.
            self.exc_info = exc_info
            with safe_thread_lock:
                if first_exception_info is None:
                    first_exception_info = exc_info
            logger.info("exception in '%s'", self.name, exc_info=exc_info)
        finally:
            # Avoid a refcycle if the thread is running a function with
            # an argument that has a member that points to the thread.
            del self._target, self._args, self._kwargs
62
86
63
87
64
88
class LineTimeout (Exception ):
@@ -439,9 +463,7 @@ def _kill():
439
463
except Exception as e :
440
464
self .logger .error (e )
441
465
442
- thread = threading .Thread (target = _kill , name = "killer-%s" % self .job_id )
443
- thread .daemon = True
444
- thread .start ()
466
+ SafeThread (target = _kill , name = "killer-%s" % self .job_id , daemon = True ).start ()
445
467
self .reset_timeout (new_timeout = graceful_timeout + 10 )
446
468
447
469
@@ -451,7 +473,7 @@ def __init__(self, *args, **kwargs):
451
473
super (RebootJob , self ).__init__ (* args , ** kwargs )
452
474
453
475
def start(self):
    """Launch ``reboot_host`` on a background SafeThread named "Reboot"."""
    reboot_thread = SafeThread(target=self.reboot_host, name="Reboot")
    reboot_thread.start()
455
477
456
478
def reboot_host (self ):
457
479
with open (REBOOT_FILENAME , 'w' ) as f :
@@ -497,8 +519,6 @@ def __init__(self):
497
519
self .output_lock = threading .RLock ()
498
520
self .redis = None
499
521
self .host_id = None
500
- self .redis_fetcher = None
501
- self .redis_sender = None
502
522
self .job_poller = None
503
523
self .fds_poller = select .poll ()
504
524
self .fds_to_channels = {}
@@ -620,7 +640,7 @@ def stop_for_reboot(self, requested_by):
620
640
621
641
requested_by .log ("Some jobs not yet finished, setting exit code to 'reboot' and proceeding" )
622
642
with self .pipeline () as pipeline :
623
- for job_id , job in self .current_processes .items ():
643
+ for job_id , job in list ( self .current_processes .items () ):
624
644
if job_id == requested_by .job_id :
625
645
continue
626
646
job .set_result ('reboot' )
@@ -744,35 +764,24 @@ def sync_jobs_progress(self):
744
764
else :
745
765
time .sleep (CYCLE_DURATION )
746
766
747
- def start_worker (self , worker , name ):
748
-
749
- def safe_run ():
750
- try :
751
- return worker ()
752
- except : # noqa
753
- self .exc_info = sys .exc_info ()
754
- logger .debug ("exception in '%s'" , name , exc_info = self .exc_info )
755
-
756
- t = threading .Thread (target = safe_run , name = name )
757
- t .daemon = True
758
- t .start ()
759
- return t
760
-
761
767
def start(self):
    """Run the agent's main loop until ``stop_agent`` is set or a worker fails.

    Clears the module-level ``first_exception_info``, finalizes the previous
    session, reloads ``seen_jobs`` from JOBS_SEEN when that file exists, and
    starts the "RedisFetcher" and "JobProgress" daemon SafeThreads. The loop
    then collects job outputs, sleeping briefly when there were none, and
    re-raises in the calling thread the first exception recorded by a
    SafeThread.
    """
    global first_exception_info
    first_exception_info = None

    self.finalize_previous_session()
    if os.path.isfile(JOBS_SEEN):
        with open(JOBS_SEEN, "r") as seen_file:
            self.seen_jobs = json.load(seen_file)

    fetcher = SafeThread(target=self.fetch_new_jobs, name="RedisFetcher", daemon=True)
    fetcher.start()
    progress = SafeThread(target=self.sync_jobs_progress, name="JobProgress", daemon=True)
    progress.start()

    while not self.stop_agent.is_set():
        had_output = self.get_jobs_outputs()
        if not had_output:
            # Nothing to read this round; back off for a fraction of a cycle.
            time.sleep(CYCLE_DURATION / 10.0)
        if not first_exception_info:
            continue
        # A worker thread failed: surface its exception in this thread.
        logger.debug("re-raising exception from worker")
        reraise(*first_exception_info)
        assert False, "exception should have been raised"
777
786
778
787
def setup (self ):
@@ -819,7 +828,7 @@ def unregister_fileno(self, fileno):
819
828
820
829
821
830
def wait_proc(proc, timeout):
    """Call ``proc.wait`` on a helper thread and wait up to ``timeout`` for it.

    Args:
        proc: object exposing a ``wait()`` method.
        timeout: value passed to ``Thread.join`` as the maximum wait.

    Returns:
        True if the helper thread is no longer alive after the join,
        False if it is still running.
    """
    waiter = SafeThread(target=proc.wait, name='wait_proc')
    waiter.start()
    waiter.join(timeout)
    still_waiting = waiter.is_alive()
    return not still_waiting
0 commit comments