Skip to content

Commit 3e853ba

Browse files
Added environment variables to experiment runner
1 parent 4335623 commit 3e853ba

File tree

2 files changed

+58
-64
lines changed

2 files changed

+58
-64
lines changed

experiment_runner.py

Lines changed: 57 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@
2222

2323

2424
def run_command(command_idx):
25-
command, idx = command_idx
25+
command, env, idx = command_idx
2626
gpu_index = current_process()._identity[0] % gpu_count
2727
if torch.cuda.is_available():
2828
command += f" -device cuda:{gpu_index}"
@@ -38,6 +38,7 @@ def run_command(command_idx):
3838
command += " -device cpu"
3939
print("Command:", idx, "on cpu on process", current_process()._identity[0])
4040

41+
env_str = " ".join(f"{k}={v}" for k, v in env.items())
4142
today = date.today()
4243
os.makedirs("./logs", exist_ok=True)
4344
try:
@@ -48,28 +49,29 @@ def run_command(command_idx):
4849
shell=True,
4950
check=True,
5051
stderr=err,
51-
env={**os.environ, "PYTHONOPTIMIZE": "2"},
52+
env={**os.environ, "PYTHONOPTIMIZE": "2", **env},
5253
)
5354
os.remove(f"./logs/error_{idx}_{today}.txt")
5455
elapsed = time.time() - start
5556
with open("./logs/finished_runs.txt", "a+") as fp:
56-
fp.write(f"{idx} -> {today} -> " + str(elapsed) + "s + " + command + "\n")
57+
fp.write(f"{idx} -> {today} -> " + str(elapsed) + "s + " + env_str + " " + command + "\n")
5758
except subprocess.CalledProcessError:
5859
with open(f"./logs/failed_runs_{today}.txt", "a+") as fp:
59-
fp.write(command + "\n")
60+
fp.write(env_str + " " + command + "\n")
6061

6162

6263
def create_run(
63-
dataset,
64-
model,
65-
optimizer,
66-
seed,
67-
epochs,
68-
es_patience,
69-
batch_size,
70-
scheduler_params,
71-
lr,
72-
reduction,
64+
dataset,
65+
model,
66+
optimizer,
67+
seed,
68+
epochs,
69+
es_patience,
70+
batch_size,
71+
scheduler_params,
72+
lr,
73+
reduction,
74+
loss_scaling
7375
):
7476
scheduler_name, scheduler_params = scheduler_params
7577
scheduler_params = str(scheduler_params).replace(" ", "")
@@ -94,15 +96,19 @@ def create_run(
9496
f" --disable_progress_bar"
9597
f" --stderr"
9698
f" --verbose"
97-
) + (" --half" if torch.cuda.is_available() else "")
99+
) + (
100+
" --half" if torch.cuda.is_available() else ""
101+
) + (
102+
f" -loss_scaling {loss_scaling}" if loss_scaling is not None else ""
103+
)
98104

99105

100106
def generate_runs():
101107
datasets = [
102108
# 'cifar10',
103-
"cifar10",
109+
# "cifar10",
104110
"cifar100",
105-
"FashionMNIST",
111+
# "FashionMNIST",
106112
]
107113
models = [
108114
"preresnet18_c10"
@@ -120,7 +126,7 @@ def generate_runs():
120126
20
121127
]
122128
batch_sizes = [
123-
16, 32
129+
2048
124130
]
125131
lrs = [
126132
0.001
@@ -129,49 +135,29 @@ def generate_runs():
129135
"mean"
130136
]
131137
schedulers = [
132-
# ('IncreaseBSOnPlateau', {'mode': 'min', 'factor': 2.0, 'max_batch_size': max_batch_size}),
133-
# ('IncreaseBSOnPlateau', {'mode': 'min', 'factor': 5.0, 'max_batch_size': max_batch_size}),
134-
# ('ReduceLROnPlateau', {'mode': 'min', 'factor': 0.5}),
135-
# ('ReduceLROnPlateau', {'mode': 'min', 'factor': 0.2}),
136-
#
137-
# ('StepBS', {'step_size': 30, 'gamma': 2.0, 'max_batch_size': max_batch_size}),
138-
# ('StepBS', {'step_size': 50, 'gamma': 2.0, 'max_batch_size': max_batch_size}),
139-
# ('StepBS', {'step_size': 30, 'gamma': 5.0, 'max_batch_size': max_batch_size}),
140-
# ('StepBS', {'step_size': 50, 'gamma': 5.0, 'max_batch_size': max_batch_size}),
141-
#
142-
# ('StepLR', {'step_size': 30, 'gamma': 0.5}),
143-
# ('StepLR', {'step_size': 50, 'gamma': 0.5}),
144-
# ('StepLR', {'step_size': 30, 'gamma': 0.2}),
145-
# ('StepLR', {'step_size': 50, 'gamma': 0.2}),
146-
("ExponentialBS", {"gamma": 1.01, "max_batch_size": 1000}),
147-
("ExponentialLR", {"gamma": 0.99}),
148-
("PolynomialBS", {"total_iters": 200, "max_batch_size": 1000}),
149-
("PolynomialLR", {"total_iters": 200}),
150-
# ('CosineAnnealingBS', {'total_iters': 200, 'max_batch_size': 1000}),
151-
# ('CosineAnnealingLR', {'T_max': 200, }),
152-
#
153-
# ('CosineAnnealingBSWithWarmRestarts', {'t_0': 100, 'factor': 1, 'max_batch_size': 1000}),
154-
# ('CosineAnnealingWarmRestarts', {'T_0': 100, 'T_mult': 1}),
155-
#
156-
# ('CyclicBS', {'min_batch_size':10, 'base_batch_size': 500, 'step_size_down': 20, 'mode': 'triangular2', 'max_batch_size': 1000}),
157-
# ('CyclicLR', {'base_lr':0.0001, 'max_lr': 0.01, 'step_size_up': 20, 'mode': 'triangular2'}),
158-
#
159-
# ('OneCycleBS', {'total_steps':200, 'base_batch_size': 300, 'min_batch_size': 10, 'max_batch_size': 1000}),
160-
# ('OneCycleLR', {'total_steps':200, 'max_lr': 0.01}),
138+
("StepLR", {"step_size": 30, "gamma": 0.5}),
139+
]
140+
loss_scalings = [
141+
None, "uniform-scaling", "normal-scaling"
142+
]
143+
loss_scaling_ranges = [
144+
0.1, 0.25, 0.5, 0.75
161145
]
162146

163147
runs = []
148+
envs = []
164149
for (
165-
dataset,
166-
model,
167-
optimizer,
168-
seed,
169-
epochs,
170-
es_patience,
171-
batch_size,
172-
scheduler_params,
173-
lr,
174-
reduction,
150+
dataset,
151+
model,
152+
optimizer,
153+
seed,
154+
epochs,
155+
es_patience,
156+
batch_size,
157+
scheduler_params,
158+
lr,
159+
reduction,
160+
loss_scaling
175161
) in itertools.product(
176162
datasets,
177163
models,
@@ -183,6 +169,7 @@ def generate_runs():
183169
schedulers,
184170
lrs,
185171
reductions,
172+
loss_scalings
186173
):
187174
run = create_run(
188175
dataset=dataset,
@@ -195,19 +182,26 @@ def generate_runs():
195182
scheduler_params=scheduler_params,
196183
lr=lr,
197184
reduction=reduction,
185+
loss_scaling=loss_scaling
198186
)
199-
runs.append(run)
187+
if loss_scaling is not None:
188+
for loss_scaling_range in loss_scaling_ranges:
189+
runs.append(run)
190+
envs.append({'loss_scaling_range': loss_scaling_range})
191+
else:
192+
runs.append(run)
193+
envs.append({})
200194

201-
return [f"python main.py {i}" for i in runs]
195+
return [f"python main.py {i}" for i in runs], envs
202196

203197

204198
if __name__ == "__main__":
205199
freeze_support()
206-
runs = generate_runs()
200+
runs, envs = generate_runs()
207201

208202
# # Debug
209-
# for i in runs:
210-
# print(i)
203+
# for i, env in zip(runs, envs):
204+
# print(env, i)
211205

212206
print(len(runs))
213207
if last_index == -1 or last_index > len(runs):
@@ -216,5 +210,5 @@ def generate_runs():
216210
with ProcessPoolExecutor(max_workers=gpu_count * processes_per_gpu) as executor:
217211
executor.map(
218212
run_command,
219-
[(runs[index], index) for index in range(run_index, last_index)],
213+
[(runs[index], envs[index], index) for index in range(run_index, last_index)],
220214
)

main.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@
3838
"-loss_scaling",
3939
default=None,
4040
type=str,
41-
help="loss reduction. Can be default, normal-scaling.",
41+
help="loss reduction. Can be default, normal-scaling, uniform-scaling.",
4242
)
4343
parser.add_argument(
4444
"-fill", default=None, type=float, help="fill value for transformations"

0 commit comments

Comments (0)