From e1780312acc653465dea10b7fb679e7fb8696306 Mon Sep 17 00:00:00 2001
From: Oliver Adams
Date: Mon, 16 Sep 2024 04:47:04 +0000
Subject: [PATCH] Disable dataloader prefetching to fix a repeated-batch bug

There was a bug where consecutive training steps would use the same
batch: each batch was reused for `n_cpus` consecutive steps. This is
tied to the dataloader workers prefetching data, with each worker
independently keeping track of its own `data_counter` member in
`mmap_batch_generator`. I don't completely understand how the
prefetching caused this, but it is consistent with documented PyTorch
behavior: when `num_workers > 0`, each worker process receives its own
copy of an `IterableDataset`, so without explicit sharding every worker
replays the identical batch stream.

The fix substantially changes the training dynamics by bringing
stochasticity back into training. In particular, training error is much
lower, and validation accuracy and recall are much higher. Notably,
validation false positives are also much higher. The training logic in
this codebase is quite idiosyncratic, so some of its settings and logic
likely need to be recalibrated to minimize false positives now that
minibatch SGD behaves correctly.
---
 my_model.yaml         | 32 ++++++++++++++++++++++++++++++++
 openwakeword/train.py |  2 +-
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100755 my_model.yaml

diff --git a/my_model.yaml b/my_model.yaml
new file mode 100755
index 0000000..22c1b3b
--- /dev/null
+++ b/my_model.yaml
@@ -0,0 +1,32 @@
+augmentation_batch_size: 16
+augmentation_rounds: 1
+background_paths:
+- ./audioset_16k
+- ./koda_audio
+#- ./fma
+background_paths_duplication_rate:
+- 1
+batch_n_per_class:
+  ACAV100M_sample: 1024
+  adversarial_negative: 400
+  positive: 400
+custom_negative_phrases: []
+false_positive_validation_data_path: validation_set_features.npy
+feature_data_files:
+  ACAV100M_sample: openwakeword_features_ACAV100M_2000_hrs_16bit.npy
+layer_size: 128
+max_negative_weight: 1500
+model_name: koda_stop
+model_type: dnn
+n_samples: 100000
+n_samples_val: 2000
+output_dir: ./koda_stop_24
+piper_sample_generator_path: ./piper-sample-generator
+rir_paths:
+- ./mit_rirs
+steps: 25000
+target_false_positives_per_hour: 2
+target_phrase:
+- koda stop
+tts_batch_size: 50
+include_adversarial_examples: true
diff --git a/openwakeword/train.py b/openwakeword/train.py
index f564254..f3204a6 100755
--- a/openwakeword/train.py
+++ b/openwakeword/train.py
@@ -862,7 +862,7 @@ def __iter__(self):
     else:
         n_cpus = n_cpus//2
     X_train = torch.utils.data.DataLoader(IterDataset(batch_generator),
-                                          batch_size=None, num_workers=n_cpus, prefetch_factor=16)
+                                          batch_size=None)
     X_val_fp = np.load(config["false_positive_validation_data_path"])
     X_val_fp = np.array([X_val_fp[i:i+input_shape[0]] for i in range(0, X_val_fp.shape[0]-input_shape[0], 1)])  # reshape to match model
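
Note: the duplicate-batch behavior above matches a well-known PyTorch
pitfall in which every DataLoader worker gets its own copy of an
IterableDataset. Below is a minimal, self-contained sketch (not part of
the patch) that reproduces the bug and shows the standard worker-sharding
alternative to dropping `num_workers` entirely. `ToyBatches` and
`shard_across_workers` are hypothetical names standing in for
`mmap_batch_generator`; the `DataLoader`, `IterableDataset`, and
`get_worker_info` APIs are standard PyTorch.

    # sketch_worker_sharding.py -- illustrative only; not part of this patch
    from torch.utils.data import DataLoader, IterableDataset, get_worker_info


    class ToyBatches(IterableDataset):
        """Yields batch indices 0..n-1, mimicking a stateful generator whose
        counter (like `data_counter`) lives separately in each worker."""

        def __init__(self, n_batches, shard_across_workers=False):
            self.n_batches = n_batches
            self.shard = shard_across_workers

        def __iter__(self):
            start, step = 0, 1
            if self.shard:
                info = get_worker_info()
                if info is not None:  # we are inside a worker process
                    start, step = info.id, info.num_workers
            # Without sharding, every worker replays the same 0, 1, 2, ...
            return iter(range(start, self.n_batches, step))


    if __name__ == "__main__":
        # Buggy pattern: each of the 4 workers holds its own copy of the
        # dataset, so every batch index is served 4 times in a row, i.e.
        # the repeated-batch behavior this commit describes.
        dup = DataLoader(ToyBatches(3), batch_size=None, num_workers=4)
        print(list(dup))  # [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]

        # Sharded alternative: parallel loading without duplicate batches.
        ok = DataLoader(ToyBatches(8, shard_across_workers=True),
                        batch_size=None, num_workers=4)
        print(sorted(ok))  # [0, 1, 2, 3, 4, 5, 6, 7]

If keeping prefetching is worthwhile, `IterDataset.__iter__` could shard
its generator via `get_worker_info()` in the same way, retaining the
dataloader's throughput without the duplicate batches.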