From e1780312acc653465dea10b7fb679e7fb8696306 Mon Sep 17 00:00:00 2001
From: Oliver Adams
Date: Mon, 16 Sep 2024 04:47:04 +0000
Subject: [PATCH] Disable dataloader prefetching to fix a repeated-batch bug

There was a bug where consecutive training steps would use the same
batch: each batch was reused for `n_cpus` consecutive steps. This is
tied to the dataloader workers prefetching data, with each worker
independently keeping track of its own `data_counter` member in
`mmap_batch_generator`. I don't completely understand how the
prefetching caused this, but it is consistent with documented PyTorch
behavior: when `num_workers > 0`, each worker process receives its own
copy of an `IterableDataset`, so without explicit sharding every worker
replays the identical batch stream.

The fix substantially changes the training dynamics by bringing
stochasticity back into training. In particular, training error is much
lower, and validation accuracy and recall are much higher. Notably,
validation false positives are also much higher. The training logic in
this codebase is quite idiosyncratic, so some of its settings and logic
likely need to be recalibrated to minimize false positives now that
minibatch SGD behaves correctly.
---
 my_model.yaml         | 32 ++++++++++++++++++++++++++++++++
 openwakeword/train.py |  2 +-
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100755 my_model.yaml

diff --git a/my_model.yaml b/my_model.yaml
new file mode 100755
index 0000000..22c1b3b
--- /dev/null
+++ b/my_model.yaml
@@ -0,0 +1,32 @@
+augmentation_batch_size: 16
+augmentation_rounds: 1
+background_paths:
+- ./audioset_16k
+- ./koda_audio
+#- ./fma
+background_paths_duplication_rate:
+- 1
+batch_n_per_class:
+  ACAV100M_sample: 1024
+  adversarial_negative: 400
+  positive: 400
+custom_negative_phrases: []
+false_positive_validation_data_path: validation_set_features.npy
+feature_data_files:
+  ACAV100M_sample: openwakeword_features_ACAV100M_2000_hrs_16bit.npy
+layer_size: 128
+max_negative_weight: 1500
+model_name: koda_stop
+model_type: dnn
+n_samples: 100000
+n_samples_val: 2000
+output_dir: ./koda_stop_24
+piper_sample_generator_path: ./piper-sample-generator
+rir_paths:
+- ./mit_rirs
+steps: 25000
+target_false_positives_per_hour: 2
+target_phrase:
+- koda stop
+tts_batch_size: 50
+include_adversarial_examples: true
diff --git a/openwakeword/train.py b/openwakeword/train.py
index f564254..f3204a6 100755
--- a/openwakeword/train.py
+++ b/openwakeword/train.py
@@ -862,7 +862,7 @@ def __iter__(self):
     else:
         n_cpus = n_cpus//2
     X_train = torch.utils.data.DataLoader(IterDataset(batch_generator),
-                                          batch_size=None, num_workers=n_cpus, prefetch_factor=16)
+                                          batch_size=None)
     X_val_fp = np.load(config["false_positive_validation_data_path"])
     X_val_fp = np.array([X_val_fp[i:i+input_shape[0]] for i in range(0, X_val_fp.shape[0]-input_shape[0], 1)])  # reshape to match model
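
Note: the duplicate-batch behavior above matches a well-known PyTorch
pitfall in which every DataLoader worker gets its own copy of an
IterableDataset. Below is a minimal, self-contained sketch (not part of
the patch) that reproduces the bug and shows the standard worker-sharding
alternative to dropping `num_workers` entirely. `ToyBatches` and
`shard_across_workers` are hypothetical names standing in for
`mmap_batch_generator`; the `DataLoader`, `IterableDataset`, and
`get_worker_info` APIs are standard PyTorch.

    # sketch_worker_sharding.py -- illustrative only; not part of this patch
    from torch.utils.data import DataLoader, IterableDataset, get_worker_info


    class ToyBatches(IterableDataset):
        """Yields batch indices 0..n-1, mimicking a stateful generator whose
        counter (like `data_counter`) lives separately in each worker."""

        def __init__(self, n_batches, shard_across_workers=False):
            self.n_batches = n_batches
            self.shard = shard_across_workers

        def __iter__(self):
            start, step = 0, 1
            if self.shard:
                info = get_worker_info()
                if info is not None:  # we are inside a worker process
                    start, step = info.id, info.num_workers
            # Without sharding, every worker replays the same 0, 1, 2, ...
            return iter(range(start, self.n_batches, step))


    if __name__ == "__main__":
        # Buggy pattern: each of the 4 workers holds its own copy of the
        # dataset, so every batch index is served 4 times in a row, i.e.
        # the repeated-batch behavior this commit describes.
        dup = DataLoader(ToyBatches(3), batch_size=None, num_workers=4)
        print(list(dup))  # [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]

        # Sharded alternative: parallel loading without duplicate batches.
        ok = DataLoader(ToyBatches(8, shard_across_workers=True),
                        batch_size=None, num_workers=4)
        print(sorted(ok))  # [0, 1, 2, 3, 4, 5, 6, 7]

If keeping prefetching is worthwhile, `IterDataset.__iter__` could shard
its generator via `get_worker_info()` in the same way, retaining the
dataloader's throughput without the duplicate batches.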