@@ -22,7 +22,7 @@
 
 
 def run_command(command_idx):
-    command, idx = command_idx
+    command, env, idx = command_idx
     gpu_index = current_process()._identity[0] % gpu_count
     if torch.cuda.is_available():
         command += f" -device cuda:{gpu_index}"
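A note on the device assignment just above: workers spawned by a multiprocessing pool carry a 1-based index in `current_process()._identity[0]`, so taking it modulo `gpu_count` round-robins the pool's workers across the available GPUs. A minimal sketch of that mechanism (with a hypothetical `gpu_count` of 2; `_identity` is a private multiprocessing detail, but it is what the script relies on):

```python
from multiprocessing import Pool, current_process

gpu_count = 2  # hypothetical value, for illustration only

def which_gpu(_):
    worker_idx = current_process()._identity[0]  # 1-based worker number
    return worker_idx % gpu_count                # device index: cuda:0 or cuda:1

if __name__ == "__main__":
    with Pool(processes=4) as pool:
        print(pool.map(which_gpu, range(8)))  # only values 0 and 1 appear
```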
@@ -38,6 +38,7 @@ def run_command(command_idx):
         command += " -device cpu"
         print("Command:", idx, "on cpu on process", current_process()._identity[0])
 
+    env_str = " ".join(f"{k}={v}" for k, v in env.items())
     today = date.today()
     os.makedirs("./logs", exist_ok=True)
     try:
@@ -48,28 +49,29 @@ def run_command(command_idx):
             shell=True,
             check=True,
             stderr=err,
-            env={**os.environ, "PYTHONOPTIMIZE": "2"},
+            env={**os.environ, "PYTHONOPTIMIZE": "2", **env},
         )
         os.remove(f"./logs/error_{idx}_{today}.txt")
         elapsed = time.time() - start
         with open("./logs/finished_runs.txt", "a+") as fp:
-            fp.write(f"{idx} -> {today} -> " + str(elapsed) + "s + " + command + "\n")
+            fp.write(f"{idx} -> {today} -> " + str(elapsed) + "s + " + env_str + " " + command + "\n")
     except subprocess.CalledProcessError:
         with open(f"./logs/failed_runs_{today}.txt", "a+") as fp:
-            fp.write(command + "\n")
+            fp.write(env_str + " " + command + "\n")
 
 
 def create_run(
     dataset,
     model,
     optimizer,
     seed,
     epochs,
     es_patience,
     batch_size,
     scheduler_params,
     lr,
     reduction,
+    loss_scaling,
 ):
     scheduler_name, scheduler_params = scheduler_params
     scheduler_params = str(scheduler_params).replace(" ", "")
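The `env=` change in this hunk leans on dict-unpacking order: later entries win, so the per-run `env` dict can override both the inherited environment and the hard-coded `PYTHONOPTIMIZE`, and the same dict is what `env_str` serializes into the log lines. A quick self-contained check of that precedence:

```python
import os

# Per-run overrides; values must be strings in a subprocess env dict.
env = {"loss_scaling_range": "0.5", "PYTHONOPTIMIZE": "0"}
merged = {**os.environ, "PYTHONOPTIMIZE": "2", **env}
assert merged["PYTHONOPTIMIZE"] == "0"        # later ** entries override
assert merged["loss_scaling_range"] == "0.5"  # extra key is added

env_str = " ".join(f"{k}={v}" for k, v in env.items())
print(env_str)  # loss_scaling_range=0.5 PYTHONOPTIMIZE=0
```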
@@ -94,15 +96,19 @@ def create_run(
         f" --disable_progress_bar"
         f" --stderr"
         f" --verbose"
-    ) + (" --half" if torch.cuda.is_available() else "")
+    ) + (
+        " --half" if torch.cuda.is_available() else ""
+    ) + (
+        f" -loss_scaling {loss_scaling}" if loss_scaling is not None else ""
+    )
 
 
 def generate_runs():
     datasets = [
         # 'cifar10',
-        "cifar10",
+        # "cifar10",
         "cifar100",
-        "FashionMNIST",
+        # "FashionMNIST",
     ]
     models = [
         "preresnet18_c10"
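The command assembly above uses implicit concatenation of adjacent f-string literals plus parenthesized conditional suffixes, so each flag is appended only when it applies. The same pattern in miniature (names are stand-ins for the script's variables):

```python
half = True           # stand-in for torch.cuda.is_available()
loss_scaling = None   # stand-in for the loss_scaling argument

cmd = (
    " --verbose"      # base flags, as in create_run
) + (
    " --half" if half else ""
) + (
    f" -loss_scaling {loss_scaling}" if loss_scaling is not None else ""
)
assert cmd == " --verbose --half"
```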
@@ -120,7 +126,7 @@ def generate_runs():
         20
     ]
     batch_sizes = [
-        16, 32
+        2048
     ]
     lrs = [
         0.001
@@ -129,49 +135,29 @@ def generate_runs():
         "mean"
     ]
     schedulers = [
-        # ('IncreaseBSOnPlateau', {'mode': 'min', 'factor': 2.0, 'max_batch_size': max_batch_size}),
-        # ('IncreaseBSOnPlateau', {'mode': 'min', 'factor': 5.0, 'max_batch_size': max_batch_size}),
-        # ('ReduceLROnPlateau', {'mode': 'min', 'factor': 0.5}),
-        # ('ReduceLROnPlateau', {'mode': 'min', 'factor': 0.2}),
-        #
-        # ('StepBS', {'step_size': 30, 'gamma': 2.0, 'max_batch_size': max_batch_size}),
-        # ('StepBS', {'step_size': 50, 'gamma': 2.0, 'max_batch_size': max_batch_size}),
-        # ('StepBS', {'step_size': 30, 'gamma': 5.0, 'max_batch_size': max_batch_size}),
-        # ('StepBS', {'step_size': 50, 'gamma': 5.0, 'max_batch_size': max_batch_size}),
-        #
-        # ('StepLR', {'step_size': 30, 'gamma': 0.5}),
-        # ('StepLR', {'step_size': 50, 'gamma': 0.5}),
-        # ('StepLR', {'step_size': 30, 'gamma': 0.2}),
-        # ('StepLR', {'step_size': 50, 'gamma': 0.2}),
-        ("ExponentialBS", {"gamma": 1.01, "max_batch_size": 1000}),
-        ("ExponentialLR", {"gamma": 0.99}),
-        ("PolynomialBS", {"total_iters": 200, "max_batch_size": 1000}),
-        ("PolynomialLR", {"total_iters": 200}),
-        # ('CosineAnnealingBS', {'total_iters': 200, 'max_batch_size': 1000}),
-        # ('CosineAnnealingLR', {'T_max': 200}),
-        #
-        # ('CosineAnnealingBSWithWarmRestarts', {'t_0': 100, 'factor': 1, 'max_batch_size': 1000}),
-        # ('CosineAnnealingWarmRestarts', {'T_0': 100, 'T_mult': 1}),
-        #
-        # ('CyclicBS', {'min_batch_size': 10, 'base_batch_size': 500, 'step_size_down': 20, 'mode': 'triangular2', 'max_batch_size': 1000}),
-        # ('CyclicLR', {'base_lr': 0.0001, 'max_lr': 0.01, 'step_size_up': 20, 'mode': 'triangular2'}),
-        #
-        # ('OneCycleBS', {'total_steps': 200, 'base_batch_size': 300, 'min_batch_size': 10, 'max_batch_size': 1000}),
-        # ('OneCycleLR', {'total_steps': 200, 'max_lr': 0.01}),
+        ("StepLR", {"step_size": 30, "gamma": 0.5}),
+    ]
+    loss_scalings = [
+        None, "uniform-scaling", "normal-scaling"
+    ]
+    loss_scaling_ranges = [
+        0.1, 0.25, 0.5, 0.75
     ]
 
     runs = []
+    envs = []
     for (
         dataset,
         model,
         optimizer,
         seed,
         epochs,
         es_patience,
         batch_size,
         scheduler_params,
         lr,
         reduction,
+        loss_scaling,
     ) in itertools.product(
         datasets,
         models,
@@ -183,6 +169,7 @@ def generate_runs():
         schedulers,
         lrs,
         reductions,
+        loss_scalings,
     ):
         run = create_run(
             dataset=dataset,
@@ -195,19 +182,26 @@ def generate_runs():
             scheduler_params=scheduler_params,
             lr=lr,
             reduction=reduction,
+            loss_scaling=loss_scaling,
         )
-        runs.append(run)
+        if loss_scaling is not None:
+            for loss_scaling_range in loss_scaling_ranges:
+                runs.append(run)
+                envs.append({"loss_scaling_range": str(loss_scaling_range)})  # env values must be str
+        else:
+            runs.append(run)
+            envs.append({})
 
-    return [f"python main.py {i}" for i in runs]
+    return [f"python main.py {i}" for i in runs], envs
 
 
 if __name__ == "__main__":
     freeze_support()
-    runs = generate_runs()
+    runs, envs = generate_runs()
 
     # # Debug
-    # for i in runs:
-    #     print(i)
+    # for i, env in zip(runs, envs):
+    #     print(env, i)
 
     print(len(runs))
     if last_index == -1 or last_index > len(runs):
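As a sanity check on the grid size after this hunk: a `None` loss scaling contributes one run, while each of the two scaling modes fans out over the four `loss_scaling_range` values, so every combination of the remaining hyperparameters yields 1 + 2 × 4 = 9 runs. In sketch form (assuming, as in the visible lists, that the other axes each have a single entry):

```python
loss_scalings = [None, "uniform-scaling", "normal-scaling"]
loss_scaling_ranges = [0.1, 0.25, 0.5, 0.75]

# One run for None, one per range value for each scaling mode.
runs_per_combo = sum(
    1 if ls is None else len(loss_scaling_ranges) for ls in loss_scalings
)
assert runs_per_combo == 9  # 1 + 2 * 4
```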
@@ -216,5 +210,5 @@ def generate_runs():
     with ProcessPoolExecutor(max_workers=gpu_count * processes_per_gpu) as executor:
         executor.map(
             run_command,
-            [(runs[index], index) for index in range(run_index, last_index)],
+            [(runs[index], envs[index], index) for index in range(run_index, last_index)],
         )
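`executor.map` passes a single argument to the callable for each item, which is why the command, its env dict, and its index travel together as one tuple and are unpacked at the top of `run_command`. A minimal sketch of the same pattern:

```python
from concurrent.futures import ProcessPoolExecutor

def work(args):
    command, env, idx = args  # unpack one tuple, as run_command does
    return f"{idx}: {env} {command}"

if __name__ == "__main__":
    jobs = [("python main.py --verbose", {"loss_scaling_range": "0.5"}, 0)]
    with ProcessPoolExecutor(max_workers=1) as ex:
        print(list(ex.map(work, jobs)))
```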