simdistserve/README.md (24 additions, 1 deletion)
@@ -48,7 +48,30 @@ Ideally you should get the following result:
Best per GPU rate: 1.56
Best config: pp_cross=1, tp_prefill=2, pp_prefill=1, tp_decode=1, pp_decode=1
```

### Ratio search
Given the parallel strategies of the prefill and decode instances, search for the best prefill-to-decode instance ratio M:N.
```bash
python -m simdistserve.simulate_ratio \
--prefill-tp 8 \
--prefill-pp 1 \
--decode-tp 8 \
--decode-pp 1 \
--max-prefill-instances 8 \
--max-decode-instances 8 \
--kv-cache-mem-per-gpu 64 \
--kv-transfer-bw 600 \
--model-type "facebook/opt-66b" \
--workload sharegpt --backend distserve \
--prefill-target 200 --decode-target 100 \
--prefill-percentage 90 --decode-percentage 90 \
--max-per-gpu-rate 5 \
--esp 0.25 \
--N 300
```
Output:
```text
Best config: prefill_instance=8, decode_instance=3, per_gpu_rate=2.1875
```
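
Conceptually, the search enumerates prefill/decode instance counts up to the given limits and, for each (M, N) pair, looks for the highest per-GPU rate that still meets both SLO attainment targets. The sketch below illustrates the idea only: `meets_slo_attainment` is a hypothetical stand-in for an actual simulation run, and treating `esp` as a bisection tolerance is an assumption, not taken from the code.

```python
# Minimal sketch of the M:N ratio search (illustrative, not the repository's
# implementation). `meets_slo_attainment` is a hypothetical helper that runs
# one simulation and checks both SLO-attainment targets.
def search_best_ratio(max_prefill, max_decode, gpus_per_prefill, gpus_per_decode,
                      max_per_gpu_rate, esp, meets_slo_attainment):
    best = None  # (per_gpu_rate, prefill_instances, decode_instances)
    for m in range(1, max_prefill + 1):          # prefill instance count M
        for n in range(1, max_decode + 1):       # decode instance count N
            total_gpus = m * gpus_per_prefill + n * gpus_per_decode
            lo, hi = 0.0, max_per_gpu_rate
            # Bisect the largest per-GPU rate that still meets both SLO
            # targets, stopping once the interval is narrower than `esp`.
            while hi - lo > esp:
                mid = (lo + hi) / 2
                if meets_slo_attainment(m, n, request_rate=mid * total_gpus):
                    lo = mid
                else:
                    hi = mid
            if best is None or lo > best[0]:
                best = (lo, m, n)
    return best
```

With `--prefill-tp 8 --prefill-pp 1` and `--decode-tp 8 --decode-pp 1`, each instance occupies 8 GPUs, and the reported result corresponds to M=8 prefill and N=3 decode instances.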
## Architecture

The simulator is written on top of `simpy`, a discrete event simulator built natively in Python.
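
For readers unfamiliar with `simpy`, the minimal example below shows the pattern such a simulator builds on: each activity is a generator process that yields timeouts, and the environment advances simulated time between events. The request and timing names here are illustrative only, not the simulator's actual classes.

```python
import simpy

def serve_request(env, name, prefill_time, decode_time):
    # Each request is a simpy process; `yield env.timeout(...)` models the
    # time spent in prefill and decode without any real waiting.
    yield env.timeout(prefill_time)
    print(f"{env.now:>6.1f}  {name} finished prefill")
    yield env.timeout(decode_time)
    print(f"{env.now:>6.1f}  {name} finished decode")

env = simpy.Environment()
env.process(serve_request(env, "req-0", prefill_time=5.0, decode_time=20.0))
env.process(serve_request(env, "req-1", prefill_time=3.0, decode_time=15.0))
env.run()
```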
simdistserve/base/request.py (26 additions, 0 deletions)
@@ -7,6 +7,8 @@
E_DO_PREFILL = "do_prefill"
E_WAIT_DECODE = "wait_decode"
E_DO_DECODE = "do_decode"
E_WAIT_KVCACHE_MIGRATION = "wait_kvcache_migration"
E_DO_KVCACHE_MIGRATION = "do_kvcache_migration"
E_FINISH_PREFILL = "finish_prefill"
E_FINISH_DECODE = "finish_decode"
E_EXIT_SYSTEM = "exit_system"
@@ -61,10 +63,27 @@ def __init__(
# Set this value if the request belongs to a particular chunk.
# The last worker in the pipeline unsets this value at the end of a chunk.
self.chunk_id = None
# After the request finishes prefill, `kvcache_generated` should be set to `True`.
self.kvcache_generated = False
self.prefill_is_finished = False
# Workers that perform prefill/decode under pipeline parallelism (PP).
self.prefill_workers = []
self.decode_workers = []
# Workers that have already sent/received the KV cache.
self.migrated_prefill_workers = []
self.migrated_decode_workers = []
# Event that fires when KV cache migration finishes.
self.migrate_event = None
# Migration time for a single prefill worker.
self.migrate_time = 0

@property
def current_context_len(self):
return self.prefill_lens + max(0, self.counter)

@property
def kvcache_migrate_is_done(self):
return len(self.migrated_prefill_workers) == len(self.prefill_workers)

def _log_event(self, event, wid=-1):
if not self.env:
@@ -88,6 +107,12 @@ def wait_decode(self, wid=None):

def do_decode(self, wid=None):
self._log_event(E_DO_DECODE, wid=wid)

def wait_kvcache_migration(self, wid=None):
self._log_event(E_WAIT_KVCACHE_MIGRATION, wid=wid)

def do_kvcache_migration(self, wid=None):
self._log_event(E_DO_KVCACHE_MIGRATION, wid=wid)

def _reset_chunked_prefill_metadata(self):
"""Reset the metadata of chunked prefill."""
@@ -111,6 +136,7 @@ def finish_prefill(self, is_finished_one_round=False, wid=None, next_wid=None):
# Reset counter to 0
# TODO: Should we do self.counter += 1?
self.counter = 0
self.prefill_is_finished = True
# Hack to ensure "wait_decode" appears at least once.
self.wait_decode(wid=next_wid)
if not self.should_finish():