Changes from all commits (24 commits)
18f57e3
Initial commit to cc-flexflow
chloeg12 Feb 4, 2025
fadb35e
no bugs for backend and frontend
chloeg12 Feb 24, 2025
c5512ba
Finish basic demo
goliaro Mar 7, 2025
2552067
Remove ff_peft.py changes before merging into streamlit
goliaro Mar 7, 2025
8e71456
remove submodules
goliaro Mar 12, 2025
ecbcc4e
Update load fine-tuned model on hf and dataset processing
chloeg12 Mar 17, 2025
e60137a
Resolved merge conflicts between demo-update and flexflow-serve/strea…
chloeg12 Mar 17, 2025
fcea5c2
Finish model upload & dataset
chloeg12 Mar 18, 2025
dcd84a1
Fix max_training_epoch
chloeg12 Mar 18, 2025
b81f743
Delete tmp_datasets folder
chloeg12 Mar 18, 2025
4c6c10c
remove submodules
chloeg12 Mar 21, 2025
5d81d89
remove submodules
chloeg12 Mar 21, 2025
6cb624b
Update training progress in demo, add real-time logging
chloeg12 Mar 28, 2025
913d251
Merge streamlit into demo-update: resolved conflicts
chloeg12 Mar 28, 2025
acf5be4
tmp fix
goliaro Mar 29, 2025
8989b6f
Fix merge conflicts from streamlit branch
goliaro Mar 29, 2025
c893f36
Fix inference concurrency
goliaro Mar 29, 2025
b3a6adc
Merge branch 'streamlit' into demo-update
sfc-gh-goliaro Mar 29, 2025
e62a0bf
Limit one finetune each time
goliaro Mar 29, 2025
651470e
Merge branch 'demo-update' of https://github.com/flexflow/flexflow-se…
goliaro Mar 29, 2025
acac79b
update
goliaro Mar 29, 2025
f7574c7
Remove duplicate function
goliaro Mar 29, 2025
b041aeb
demo
goliaro Mar 29, 2025
37e0bf6
Solve conflicts in merging streamlit
goliaro Apr 10, 2025
2 changes: 1 addition & 1 deletion docker/flexflow-environment/Dockerfile
@@ -114,7 +114,7 @@ RUN pip3 install transformers>=4.47.1 sentencepiece einops
RUN pip3 install tensorflow notebook
# PEFT-related
RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft pytest
- RUN pip3 install streamlit
+ RUN pip3 install uvicorn fastapi streamlit
# flash-attn
RUN if [ "$FF_GPU_BACKEND" = "cuda" ]; then \
    pip3 install flash-attn; \
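The added uvicorn and fastapi packages support the demo's web serving layer alongside streamlit. As a rough, hypothetical sketch of how such a backend could be wired up (the actual demo app is not part of this diff; the module and endpoint names below are made up):

# demo_backend.py -- hypothetical sketch, not from this PR
from fastapi import FastAPI

app = FastAPI()

@app.post("/generate")
def generate(prompt: str):
    # a real backend would forward the prompt to the FlexFlow server
    return {"output": f"echo: {prompt}"}

# launch with: uvicorn demo_backend:app --port 8000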
3 changes: 2 additions & 1 deletion include/flexflow/flexflow_c.h
@@ -605,7 +605,8 @@ void flexflow_model_generate(flexflow_model_t handle_,
int *training_steps,
int **output_length_and_tokens,
int *num_finetuning_losses,
- float *finetuning_losses);
+ float *finetuning_losses,
+ char const **log_filepaths);

void flexflow_model_set_position_offset(flexflow_model_t handle, int offset);

1 change: 1 addition & 0 deletions include/flexflow/request_manager.h
@@ -110,6 +110,7 @@ struct Request {
// if left as -1, it will be set to the number of entries in the dataset
int gradient_accumulation_steps = -1;
// std::vector<int> finetuning_tokens_per_batch;
+ std::string log_filepath;
};
RequestType req_type = REQ_INFERENCE;
RequestGuid guid = BatchConfig::INVALID_GUID;
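This new log_filepath field, together with the char const **log_filepaths out-parameter added to flexflow_model_generate above, is what lets the demo report real-time training progress (see commit 6cb624b). A minimal sketch of how a frontend might follow such a log, assuming a plain-text file appended to during finetuning (the actual log format is not shown in this diff):

# hypothetical sketch: follow a finetuning log for progress updates
import time

def tail_log(log_filepath: str, poll_interval: float = 1.0):
    # Yield lines as they are appended to the log file.
    with open(log_filepath) as f:
        while True:
            line = f.readline()
            if line:
                yield line.rstrip()
            else:
                time.sleep(poll_interval)  # wait for the trainer to write more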
195 changes: 195 additions & 0 deletions inference/python/demo_ff_peft.py
@@ -0,0 +1,195 @@
# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import flexflow.serve as ff
import argparse, json, os
from types import SimpleNamespace


def get_configs():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-config-file",
        help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.",
        type=str,
        default="",
    )
    args = parser.parse_args()

    # Load configs from JSON file (if specified)
    if len(args.config_file) > 0:
        if not os.path.isfile(args.config_file):
            raise FileNotFoundError(f"Config file {args.config_file} not found.")
        try:
            with open(args.config_file) as f:
                return json.load(f)
        except json.JSONDecodeError as e:
            print("JSON format error:")
            print(e)
            raise  # re-raise so a malformed config does not silently return None
    else:
        # Define sample configs
        ff_init_configs = {
            # required parameters
            "num_gpus": 4,
            "memory_per_gpu": 30000,
            "zero_copy_memory_per_node": 40000,
            # optional parameters
            "num_cpus": 4,
            "legion_utility_processors": 8,
            "data_parallelism_degree": 1,
            "tensor_parallelism_degree": 4,
            "pipeline_parallelism_degree": 1,
            "offload": False,
            "offload_reserve_space_size": 8 * 1024,  # 8GB
            "use_4bit_quantization": False,
            "use_8bit_quantization": False,
            "enable_peft": True,
            "profiling": False,
            "inference_debugging": False,
            "fusion": True,
        }
        model_configs = {
            # required parameters
            # "base_model": "JackFram/llama-160m",
            # "inference_peft_model_id": "goliaro/llama-160m-lora",
            # "finetuning_peft_model_id": "goliaro/llama-160m-lora",
            # "base_model": "meta-llama/Meta-Llama-3-8B",
            "base_model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "inference_peft_model_id": "goliaro/llama-3-8b-lora",
            "finetuning_peft_model_id": "goliaro/llama-3-8b-lora-dolly",
            # optional parameters
            "cache_path": os.environ.get("FF_CACHE_PATH", ""),
            "refresh_cache": False,
            "full_precision": False,
            # "prompt": "hello, hellohello",
            "prompt": os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "./prompt_dataset.json",
            ),
            "finetuning_dataset": os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "../prompt/peft_dataset.json",
            ),
            "output_file": "",
            "max_requests_per_batch": 16,
            "max_seq_length": 600,
            "max_tokens_per_batch": 1024,
            "max_concurrent_adapters": 1,
        }
        # Merge dictionaries
        ff_init_configs.update(model_configs)
        return ff_init_configs


def main():
    configs_dict = get_configs()
    configs = SimpleNamespace(**configs_dict)

    # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs
    ff.init(configs_dict)

    # Create the FlexFlow LLM
    ff_data_type = (
        ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF
    )
    llm = ff.LLM(
        configs.base_model,
        data_type=ff_data_type,
        cache_path=configs.cache_path,
        refresh_cache=configs.refresh_cache,
        output_file=configs.output_file,
    )

    # Compile the LLM for inference and load the weights into memory.
    # enable_peft_finetuning is a bool (0 or 1), so adding it to the batch and
    # adapter limits reserves one extra slot when finetuning is enabled.
    generation_config = ff.GenerationConfig(
        do_sample=False, temperature=0.9, topp=0.8, topk=1
    )
    enable_peft_finetuning = len(configs.finetuning_dataset) > 0
    llm.compile(
        generation_config,
        max_requests_per_batch=configs_dict.get("max_requests_per_batch", 1)
        + enable_peft_finetuning,
        max_seq_length=configs_dict.get("max_seq_length", 2048),
        max_tokens_per_batch=configs_dict.get("max_tokens_per_batch", 128),
        num_kv_cache_slots=configs_dict.get("num_kv_cache_slots", -1),
        max_concurrent_adapters=configs_dict.get("max_concurrent_adapters", 1)
        + enable_peft_finetuning,
        enable_peft_finetuning=enable_peft_finetuning,
    )

    llm.start_server()

    # Add inference and/or finetuning lora
    lora_inference_config = None
    lora_finetuning_config = None
    # if len(configs.prompt) > 0:
    #     lora_inference_config = ff.LoraLinearConfig(
    #         llm.cache_path,
    #         configs.inference_peft_model_id,
    #         base_model_name_or_path=configs.base_model,
    #     )
    #     llm.register_peft_adapter(lora_inference_config)

    # if len(configs.finetuning_dataset) > 0:
    #     lora_finetuning_config = ff.LoraLinearConfig(
    #         llm.cache_path,
    #         configs.inference_peft_model_id,
    #         trainable=True,
    #         init_lora_weights=True,
    #         target_modules=["down_proj"],
    #         base_model_name_or_path=configs.base_model,
    #         optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD,
    #         optimizer_kwargs={
    #             "learning_rate": 0.001,
    #             "momentum": 0.0,
    #             "weight_decay": 0.0,
    #             "nesterov": False,
    #         },
    #     )
    #     llm.register_peft_adapter(lora_finetuning_config)

    requests = []
    # Serving
    if len(configs.prompt) > 0:
        with open(configs.prompt) as f:
            prompts = json.load(f)
        inference_requests = [
            ff.Request(
                ff.RequestType.REQ_INFERENCE,
                prompt=prompt,
                max_new_tokens=300,
                # peft_model_id=llm.get_ff_peft_id(lora_inference_config),
                peft_model_id=None,
            )
            for prompt in prompts
        ]
        requests += inference_requests
    # # Finetuning
    # if len(configs.finetuning_dataset) > 0:
    #     finetuning_request = ff.Request(
    #         ff.RequestType.REQ_FINETUNING,
    #         peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),
    #         dataset_filepath=configs.finetuning_dataset,
    #         max_training_epochs=2,
    #     )
    #     requests.append(finetuning_request)

    results = llm.generate(requests)
    print("Output: " + results[0].output_text.decode("utf-8"))

    llm.stop_server()


if __name__ == "__main__":
    print("flexflow PEFT example")
    main()
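The script is driven either by the sample configs above or by a JSON file passed via -config-file (python inference/python/demo_ff_peft.py -config-file my_config.json). Note that keys read as attributes on configs (base_model, cache_path, refresh_cache, full_precision, prompt, finetuning_dataset, output_file) must be present in the file, while the batching limits fall back to defaults through configs_dict.get. An illustrative minimal config (values are examples only):

{
    "num_gpus": 1,
    "memory_per_gpu": 30000,
    "zero_copy_memory_per_node": 40000,
    "base_model": "JackFram/llama-160m",
    "cache_path": "",
    "refresh_cache": false,
    "full_precision": false,
    "prompt": "prompt_dataset.json",
    "finetuning_dataset": "",
    "output_file": ""
}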
3 changes: 3 additions & 0 deletions inference/python/prompt_dataset.json
@@ -0,0 +1,3 @@
[
    "hello"
]
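The demo reads this file as a flat JSON list of prompt strings (the json.load call in demo_ff_peft.py), so serving more prompts just means a longer list, e.g. (illustrative):

[
    "hello",
    "What is FlexFlow?",
    "Summarize LoRA finetuning in one sentence."
]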