models/demos/qwen3/test_dataset/dataset_loader.py (2 changes: 1 addition & 1 deletion)

@@ -18,7 +18,7 @@ def load_prompts(batch_size, prompt_len):
         FileNotFoundError: If dataset file not found
     """
     # Get the JSON file path (assuming it's in the same directory)
-    json_path = os.path.join(os.path.dirname(__file__), "test_dataset_HuggingFaceFW_fineweb_train.json")
+    json_path = os.path.join(os.path.dirname(__file__), "test_dataset_HuggingFaceFW_fineweb-edu_train.json")
 
     with open(json_path, 'r', encoding='utf-8') as f:
         data = json.load(f)
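For orientation, a minimal sketch of calling `load_prompts` after this rename; the batch size, prompt length, and call site below are illustrative assumptions, not taken from this PR.

```python
# Illustrative usage sketch (assumed call site, not part of this PR).
from models.demos.qwen3.test_dataset.dataset_loader import load_prompts

# Reads prompts from test_dataset_HuggingFaceFW_fineweb-edu_train.json,
# which is expected to sit next to dataset_loader.py.
prompts = load_prompts(batch_size=32, prompt_len=1024)
print(f"loaded {len(prompts)} prompts")  # assumes a sized collection is returned
```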
models/demos/qwen3/test_dataset/dataset_maker.py (9 changes: 5 additions & 4 deletions)

@@ -12,19 +12,20 @@

 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-30B-A3B")
 
-dataset_name = "HuggingFaceFW/fineweb"
-# "HuggingFaceFW/fineweb", "wikitext", "cais/mmlu"
+dataset_name = "HuggingFaceFW/fineweb-edu"
+# "HuggingFaceFW/fineweb-edu", "wikitext", "cais/mmlu"
 
 # Dataset configurations
 dataset_configs = {
-    "HuggingFaceFW/fineweb": {"config": None, "streaming": True, "split": "train", "has_subjects": False},
+    "HuggingFaceFW/fineweb-edu": {"config": None, "streaming": True, "split": "train", "has_subjects": False},
     "wikitext": {"config": "wikitext-103-v1", "streaming": False, "split": "train", "has_subjects": False},
     "cais/mmlu": {"config": "all", "streaming": False, "split": "auxiliary_train", "has_subjects": True}
 }
 
-token_lengths = [2**i for i in range(11)] # [1, 2, ... , 512, 1024]
+token_lengths = [512, 1024] # [1, 2, ... , 512, 1024]
 
-num_per_token_length = 128
+num_per_token_length = 1024
 
 #################################################
 
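For context on the new default, a minimal sketch of consuming the `HuggingFaceFW/fineweb-edu` entry declared above with the Hugging Face `datasets` library; the `"text"` field name and the iteration loop are assumptions for illustration, not code from this PR.

```python
# Sketch only: mirrors the streaming config declared in dataset_configs above.
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-30B-A3B")

cfg = {"config": None, "streaming": True, "split": "train"}
ds = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    name=cfg["config"],          # None selects the default config
    split=cfg["split"],          # "train"
    streaming=cfg["streaming"],  # iterate without downloading the whole dataset
)

# Assumption: fineweb-edu rows expose their content in a "text" field.
for i, sample in enumerate(ds):
    token_ids = tokenizer(sample["text"])["input_ids"]
    print(i, len(token_ids))
    if i == 2:
        break
```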