diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py
index 2f2627367..6d9baf90d 100644
--- a/QEfficient/finetune/dataset/custom_dataset.py
+++ b/QEfficient/finetune/dataset/custom_dataset.py
@@ -23,7 +23,7 @@ def load_module_from_py_file(py_file: str) -> object:
     return module
 
 
-def get_custom_dataset(dataset_config, tokenizer, split: str):
+def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None):
     if ":" in dataset_config.file:
         module_path, func_name = dataset_config.file.split(":")
     else:
@@ -38,7 +38,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str):
 
     module = load_module_from_py_file(module_path.as_posix())
     try:
-        return getattr(module, func_name)(dataset_config, tokenizer, split)
+        return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
     except AttributeError as e:
         print(
             f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})."
diff --git a/QEfficient/finetune/utils/dataset_utils.py b/QEfficient/finetune/utils/dataset_utils.py
index 1642a56d3..42d0aae71 100644
--- a/QEfficient/finetune/utils/dataset_utils.py
+++ b/QEfficient/finetune/utils/dataset_utils.py
@@ -51,7 +51,7 @@ def get_dataloader_kwargs(train_config, dataset, dataset_processer, split):
                 )
         else:
             kwargs["sampler"] = torch.utils.data.DistributedSampler(
-                dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank(), shuffle=True
+                dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank(), shuffle=False
            )
             kwargs["batch_size"] = batch_size
             kwargs["drop_last"] = True
diff --git a/docs/source/finetune.md b/docs/source/finetune.md
index e5a8bc475..70bf35e67 100644
--- a/docs/source/finetune.md
+++ b/docs/source/finetune.md
@@ -64,4 +64,44 @@ to visualise the data,
 
 ```python
 tensorboard --logdir runs/ --bind_all
-```
\ No newline at end of file
+```
+
+## Some features/functionalities of the fine-tuning stack:
+ 1) Gradient accumulation: By default, gradients are accumulated for 4 steps. To change this value, pass the command line argument gradient_accumulation_steps (example: '--gradient_accumulation_steps 8').
+ 2) Gradient checkpointing: By default, gradient checkpointing is disabled. To enable it, pass the command line argument gradient_checkpointing.
+
+## Fine-tuning on a custom dataset
+
+To run fine-tuning on any user-specific dataset, prepare the dataset using the following steps:
+
+ 1) Create a directory named 'dataset' inside efficient-transformers.
+ 2) Inside this directory, create a file named 'custom_dataset.py'.
+ 3) Inside the newly created efficient-transformers/dataset/custom_dataset.py, define a function named 'get_custom_dataset'.
+ 4) get_custom_dataset() should have the following four parameters: dataset_config, tokenizer, split, context_length.
+ 5) Inside get_custom_dataset(), the user needs to apply the prompt and tokenize the dataset accordingly. Please refer to the template below on how to define get_custom_dataset().
+ 6) For examples, please refer to the Python files in [dataset](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset). In the case of the Samsum dataset, get_preprocessed_samsum() in efficient-transformers/QEfficient/finetune/dataset/samsum_dataset.py is called.
+ 7) In [dataset_config.py](https://github.com/quic/efficient-transformers/blob/main/QEfficient/finetune/configs/dataset_config.py), for the custom_dataset class, pass the appropriate values for train_split and test_split. Alternatively, these values can be passed as command line arguments to the finetune command, for example "--train_split train".
+ 8) While running fine-tuning, pass the argument "--dataset custom_dataset" to fine-tune on the custom dataset.
+
+A template for get_custom_dataset(), to be defined inside efficient-transformers/dataset/custom_dataset.py, is as follows:
+
+```python
+def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
+
+    # load the dataset
+    # based on split, retrieve only the specific portion of the dataset (train or eval), either here or at the end
+
+    def apply_prompt_template(sample):
+        # transform the passed datapoint by applying the prompt to it
+
+    def tokenize(sample):
+        # tokenize the passed datapoint
+
+    # define the prompt
+    # call apply_prompt_template() for each datapoint:
+    # dataset = dataset.map(apply_prompt_template)
+    # call tokenize() for each datapoint:
+    # dataset = dataset.map(tokenize)
+
+    return dataset
+```
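To complement the template that the doc change above introduces, here is a minimal sketch of what a concrete `get_custom_dataset()` might look like. It is illustrative only: the dataset name (`tatsu-lab/alpaca`), the prompt string, and the column names are assumptions, and the exact fields the training loop expects (e.g. `labels`, padding behaviour) should be checked against the existing files under `QEfficient/finetune/dataset` (for example `samsum_dataset.py`), which remain the authoritative reference.

```python
# Hypothetical efficient-transformers/dataset/custom_dataset.py (illustrative sketch only)
import datasets


def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
    # Load only the requested portion of the dataset (train or eval/test).
    # "tatsu-lab/alpaca" and its column names are assumptions for this example.
    dataset = datasets.load_dataset("tatsu-lab/alpaca", split=split)

    prompt = "### Instruction:\n{instruction}\n\n### Response:\n"

    def apply_prompt_template(sample):
        # Build the prompt/response text pair for one datapoint.
        return {
            "input": prompt.format(instruction=sample["instruction"]),
            "output": sample["output"] + tokenizer.eos_token,
        }

    def tokenize(sample):
        # Tokenize prompt + response; truncate/pad only when context_length is given
        # (padding to max_length assumes the tokenizer has a pad token configured).
        out = tokenizer(
            sample["input"] + sample["output"],
            max_length=context_length,
            truncation=context_length is not None,
            padding="max_length" if context_length else False,
        )
        # Many training loops expect labels alongside input_ids/attention_mask.
        out["labels"] = out["input_ids"].copy()
        return out

    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))
    dataset = dataset.map(tokenize, remove_columns=["input", "output"])
    return dataset
```

Note how `context_length` is only used to cap and pad the tokenized length when it is provided, mirroring the optional `context_length` argument this diff threads through the custom dataset loader.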
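With such a custom dataset in place, a fine-tuning run could then be launched roughly as below. This is a sketch: the `python -m QEfficient.cloud.finetune` entry point and the `model_name`/`gradient_checkpointing` flag spellings are assumptions to be verified against the training configs and argument parser; only `--dataset custom_dataset`, `--train_split`/`--test_split`, and `--gradient_accumulation_steps` are named in the doc text above.

```bash
# Illustrative invocation; verify flag names against the finetune configs.
python -m QEfficient.cloud.finetune \
    --model_name "meta-llama/Llama-3.2-1B" \
    --dataset custom_dataset \
    --train_split train \
    --test_split test \
    --gradient_accumulation_steps 8 \
    --gradient_checkpointing
```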