Changes from all commits (24 commits)
18f57e3
Initial commit to cc-flexflow
chloeg12 Feb 4, 2025
fadb35e
no bugs for backend and frontend
chloeg12 Feb 24, 2025
c5512ba
Finish basic demo
goliaro Mar 7, 2025
2552067
Remove ff_peft.py changes before merging into streamlit
goliaro Mar 7, 2025
8e71456
remove submodules
goliaro Mar 12, 2025
ecbcc4e
Update load fine-tuned model on hf and dataset processing
chloeg12 Mar 17, 2025
e60137a
Resolved merge conflicts between demo-update and flexflow-serve/strea…
chloeg12 Mar 17, 2025
fcea5c2
Finish model upload & dataset
chloeg12 Mar 18, 2025
dcd84a1
Fix max_training_epoch
chloeg12 Mar 18, 2025
b81f743
Delete tmp_datasets folder
chloeg12 Mar 18, 2025
4c6c10c
remove submodules
chloeg12 Mar 21, 2025
5d81d89
remove submodules
chloeg12 Mar 21, 2025
6cb624b
Update training progress in demo, add real-time logging
chloeg12 Mar 28, 2025
913d251
Merge streamlit into demo-update: resolved conflicts
chloeg12 Mar 28, 2025
acf5be4
tmp fix
goliaro Mar 29, 2025
8989b6f
Fix merge conflicts from streamlit branch
goliaro Mar 29, 2025
c893f36
Fix inference concurrency
goliaro Mar 29, 2025
b3a6adc
Merge branch 'streamlit' into demo-update
sfc-gh-goliaro Mar 29, 2025
e62a0bf
Limit one finetune each time
goliaro Mar 29, 2025
651470e
Merge branch 'demo-update' of https://github.com/flexflow/flexflow-se…
goliaro Mar 29, 2025
acac79b
update
goliaro Mar 29, 2025
f7574c7
Remove duplicate function
goliaro Mar 29, 2025
b041aeb
demo
goliaro Mar 29, 2025
37e0bf6
Solve conflicts in merging streamlit
goliaro Apr 10, 2025
2 changes: 1 addition & 1 deletion docker/flexflow-environment/Dockerfile
@@ -114,7 +114,7 @@ RUN pip3 install transformers>=4.47.1 sentencepiece einops
RUN pip3 install tensorflow notebook
# PEFT-related
RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft pytest
- RUN pip3 install streamlit
+ RUN pip3 install uvicorn fastapi streamlit
# flash-attn
RUN if [ "$FF_GPU_BACKEND" = "cuda" ]; then \
    pip3 install flash-attn; \
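The added uvicorn and fastapi packages support the demo's web serving layer alongside streamlit. As a rough, hypothetical sketch of how such a backend could be wired up (the actual demo app is not part of this diff; the module and endpoint names below are made up):

# demo_backend.py -- hypothetical sketch, not from this PR
from fastapi import FastAPI

app = FastAPI()

@app.post("/generate")
def generate(prompt: str):
    # a real backend would forward the prompt to the FlexFlow server
    return {"output": f"echo: {prompt}"}

# launch with: uvicorn demo_backend:app --port 8000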
3 changes: 2 additions & 1 deletion include/flexflow/flexflow_c.h
@@ -605,7 +605,8 @@ void flexflow_model_generate(flexflow_model_t handle_,
int *training_steps,
int **output_length_and_tokens,
int *num_finetuning_losses,
- float *finetuning_losses);
+ float *finetuning_losses,
+ char const **log_filepaths);

void flexflow_model_set_position_offset(flexflow_model_t handle, int offset);

1 change: 1 addition & 0 deletions include/flexflow/request_manager.h
@@ -110,6 +110,7 @@ struct Request {
// if left as -1, it will be set to the number of entries in the dataset
int gradient_accumulation_steps = -1;
// std::vector<int> finetuning_tokens_per_batch;
+ std::string log_filepath;
};
RequestType req_type = REQ_INFERENCE;
RequestGuid guid = BatchConfig::INVALID_GUID;
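This new log_filepath field, together with the char const **log_filepaths out-parameter added to flexflow_model_generate above, is what lets the demo report real-time training progress (see commit 6cb624b). A minimal sketch of how a frontend might follow such a log, assuming a plain-text file appended to during finetuning (the actual log format is not shown in this diff):

# hypothetical sketch: follow a finetuning log for progress updates
import time

def tail_log(log_filepath: str, poll_interval: float = 1.0):
    # Yield lines as they are appended to the log file.
    with open(log_filepath) as f:
        while True:
            line = f.readline()
            if line:
                yield line.rstrip()
            else:
                time.sleep(poll_interval)  # wait for the trainer to write more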
195 changes: 195 additions & 0 deletions inference/python/demo_ff_peft.py
@@ -0,0 +1,195 @@
# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import flexflow.serve as ff
import argparse, json, os
from types import SimpleNamespace


def get_configs():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-config-file",
        help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.",
        type=str,
        default="",
    )
    args = parser.parse_args()

    # Load configs from JSON file (if specified)
    if len(args.config_file) > 0:
        if not os.path.isfile(args.config_file):
            raise FileNotFoundError(f"Config file {args.config_file} not found.")
        try:
            with open(args.config_file) as f:
                return json.load(f)
        except json.JSONDecodeError as e:
            print("JSON format error:")
            print(e)
            raise  # re-raise so a malformed config does not silently return None
    else:
        # Define sample configs
        ff_init_configs = {
            # required parameters
            "num_gpus": 4,
            "memory_per_gpu": 30000,
            "zero_copy_memory_per_node": 40000,
            # optional parameters
            "num_cpus": 4,
            "legion_utility_processors": 8,
            "data_parallelism_degree": 1,
            "tensor_parallelism_degree": 4,
            "pipeline_parallelism_degree": 1,
            "offload": False,
            "offload_reserve_space_size": 8 * 1024,  # 8GB
            "use_4bit_quantization": False,
            "use_8bit_quantization": False,
            "enable_peft": True,
            "profiling": False,
            "inference_debugging": False,
            "fusion": True,
        }
        model_configs = {
            # required parameters
            # "base_model": "JackFram/llama-160m",
            # "inference_peft_model_id": "goliaro/llama-160m-lora",
            # "finetuning_peft_model_id": "goliaro/llama-160m-lora",
            # "base_model": "meta-llama/Meta-Llama-3-8B",
            "base_model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "inference_peft_model_id": "goliaro/llama-3-8b-lora",
            "finetuning_peft_model_id": "goliaro/llama-3-8b-lora-dolly",
            # optional parameters
            "cache_path": os.environ.get("FF_CACHE_PATH", ""),
            "refresh_cache": False,
            "full_precision": False,
            # "prompt": "hello, hellohello",
            "prompt": os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "./prompt_dataset.json",
            ),
            "finetuning_dataset": os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "../prompt/peft_dataset.json",
            ),
            "output_file": "",
            "max_requests_per_batch": 16,
            "max_seq_length": 600,
            "max_tokens_per_batch": 1024,
            "max_concurrent_adapters": 1,
        }
        # Merge dictionaries
        ff_init_configs.update(model_configs)
        return ff_init_configs


def main():
    configs_dict = get_configs()
    configs = SimpleNamespace(**configs_dict)

    # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs
    ff.init(configs_dict)

    # Create the FlexFlow LLM
    ff_data_type = (
        ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF
    )
    llm = ff.LLM(
        configs.base_model,
        data_type=ff_data_type,
        cache_path=configs.cache_path,
        refresh_cache=configs.refresh_cache,
        output_file=configs.output_file,
    )

    # Compile the LLM for inference and load the weights into memory.
    # enable_peft_finetuning is a bool (0 or 1), so adding it to the batch and
    # adapter limits reserves one extra slot when finetuning is enabled.
    generation_config = ff.GenerationConfig(
        do_sample=False, temperature=0.9, topp=0.8, topk=1
    )
    enable_peft_finetuning = len(configs.finetuning_dataset) > 0
    llm.compile(
        generation_config,
        max_requests_per_batch=configs_dict.get("max_requests_per_batch", 1)
        + enable_peft_finetuning,
        max_seq_length=configs_dict.get("max_seq_length", 2048),
        max_tokens_per_batch=configs_dict.get("max_tokens_per_batch", 128),
        num_kv_cache_slots=configs_dict.get("num_kv_cache_slots", -1),
        max_concurrent_adapters=configs_dict.get("max_concurrent_adapters", 1)
        + enable_peft_finetuning,
        enable_peft_finetuning=enable_peft_finetuning,
    )

    llm.start_server()

    # Add inference and/or finetuning lora
    lora_inference_config = None
    lora_finetuning_config = None
    # if len(configs.prompt) > 0:
    #     lora_inference_config = ff.LoraLinearConfig(
    #         llm.cache_path,
    #         configs.inference_peft_model_id,
    #         base_model_name_or_path=configs.base_model,
    #     )
    #     llm.register_peft_adapter(lora_inference_config)

    # if len(configs.finetuning_dataset) > 0:
    #     lora_finetuning_config = ff.LoraLinearConfig(
    #         llm.cache_path,
    #         configs.inference_peft_model_id,
    #         trainable=True,
    #         init_lora_weights=True,
    #         target_modules=["down_proj"],
    #         base_model_name_or_path=configs.base_model,
    #         optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD,
    #         optimizer_kwargs={
    #             "learning_rate": 0.001,
    #             "momentum": 0.0,
    #             "weight_decay": 0.0,
    #             "nesterov": False,
    #         },
    #     )
    #     llm.register_peft_adapter(lora_finetuning_config)

    requests = []
    # Serving
    if len(configs.prompt) > 0:
        with open(configs.prompt) as f:
            prompts = json.load(f)
        inference_requests = [
            ff.Request(
                ff.RequestType.REQ_INFERENCE,
                prompt=prompt,
                max_new_tokens=300,
                # peft_model_id=llm.get_ff_peft_id(lora_inference_config),
                peft_model_id=None,
            )
            for prompt in prompts
        ]
        requests += inference_requests
    # # Finetuning
    # if len(configs.finetuning_dataset) > 0:
    #     finetuning_request = ff.Request(
    #         ff.RequestType.REQ_FINETUNING,
    #         peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),
    #         dataset_filepath=configs.finetuning_dataset,
    #         max_training_epochs=2,
    #     )
    #     requests.append(finetuning_request)

    results = llm.generate(requests)
    print("Output: " + results[0].output_text.decode("utf-8"))

    llm.stop_server()


if __name__ == "__main__":
    print("flexflow PEFT example")
    main()
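The script is driven either by the sample configs above or by a JSON file passed via -config-file (python inference/python/demo_ff_peft.py -config-file my_config.json). Note that keys read as attributes on configs (base_model, cache_path, refresh_cache, full_precision, prompt, finetuning_dataset, output_file) must be present in the file, while the batching limits fall back to defaults through configs_dict.get. An illustrative minimal config (values are examples only):

{
    "num_gpus": 1,
    "memory_per_gpu": 30000,
    "zero_copy_memory_per_node": 40000,
    "base_model": "JackFram/llama-160m",
    "cache_path": "",
    "refresh_cache": false,
    "full_precision": false,
    "prompt": "prompt_dataset.json",
    "finetuning_dataset": "",
    "output_file": ""
}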
3 changes: 3 additions & 0 deletions inference/python/prompt_dataset.json
@@ -0,0 +1,3 @@
[
    "hello"
]
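The demo reads this file as a flat JSON list of prompt strings (the json.load call in demo_ff_peft.py), so serving more prompts just means a longer list, e.g. (illustrative):

[
    "hello",
    "What is FlexFlow?",
    "Summarize LoRA finetuning in one sentence."
]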