simdistserve/README.md (24 additions, 1 deletion)
@@ -48,7 +48,30 @@ Ideally you should get the following result:
Best per GPU rate: 1.56
Best config: pp_cross=1, tp_prefill=2, pp_prefill=1, tp_decode=1, pp_decode=1
```

### Ratio search
Given the parallel strategies of the prefill and decode instances, search for the best prefill-to-decode instance ratio M:N.
```bash
python -m simdistserve.simulate_ratio \
--prefill-tp 8 \
--prefill-pp 1 \
--decode-tp 8 \
--decode-pp 1 \
--max-prefill-instances 8 \
--max-decode-instances 8 \
--kv-cache-mem-per-gpu 64 \
--kv-transfer-bw 600 \
--model-type "facebook/opt-66b" \
--workload sharegpt --backend distserve \
--prefill-target 200 --decode-target 100 \
--prefill-percentage 90 --decode-percentage 90 \
--max-per-gpu-rate 5 \
--esp 0.25 \
--N 300
```
Output:
```text
Best config: prefill_instance=8, decode_instance=3, per_gpu_rate=2.1875
```
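
Conceptually, the search enumerates prefill/decode instance counts up to the given limits and, for each (M, N) pair, looks for the highest per-GPU rate that still meets both SLO attainment targets. The sketch below illustrates the idea only: `meets_slo_attainment` is a hypothetical stand-in for an actual simulation run, and treating `esp` as a bisection tolerance is an assumption, not taken from the code.

```python
# Minimal sketch of the M:N ratio search (illustrative, not the repository's
# implementation). `meets_slo_attainment` is a hypothetical helper that runs
# one simulation and checks both SLO-attainment targets.
def search_best_ratio(max_prefill, max_decode, gpus_per_prefill, gpus_per_decode,
                      max_per_gpu_rate, esp, meets_slo_attainment):
    best = None  # (per_gpu_rate, prefill_instances, decode_instances)
    for m in range(1, max_prefill + 1):          # prefill instance count M
        for n in range(1, max_decode + 1):       # decode instance count N
            total_gpus = m * gpus_per_prefill + n * gpus_per_decode
            lo, hi = 0.0, max_per_gpu_rate
            # Bisect the largest per-GPU rate that still meets both SLO
            # targets, stopping once the interval is narrower than `esp`.
            while hi - lo > esp:
                mid = (lo + hi) / 2
                if meets_slo_attainment(m, n, request_rate=mid * total_gpus):
                    lo = mid
                else:
                    hi = mid
            if best is None or lo > best[0]:
                best = (lo, m, n)
    return best
```

With `--prefill-tp 8 --prefill-pp 1` and `--decode-tp 8 --decode-pp 1`, each instance occupies 8 GPUs, and the reported result corresponds to M=8 prefill and N=3 decode instances.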
## Architecture

The simulator is written on top of `simpy`, a discrete event simulator built natively in Python.
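
For readers unfamiliar with `simpy`, the minimal example below shows the pattern such a simulator builds on: each activity is a generator process that yields timeouts, and the environment advances simulated time between events. The request and timing names here are illustrative only, not the simulator's actual classes.

```python
import simpy

def serve_request(env, name, prefill_time, decode_time):
    # Each request is a simpy process; `yield env.timeout(...)` models the
    # time spent in prefill and decode without any real waiting.
    yield env.timeout(prefill_time)
    print(f"{env.now:>6.1f}  {name} finished prefill")
    yield env.timeout(decode_time)
    print(f"{env.now:>6.1f}  {name} finished decode")

env = simpy.Environment()
env.process(serve_request(env, "req-0", prefill_time=5.0, decode_time=20.0))
env.process(serve_request(env, "req-1", prefill_time=3.0, decode_time=15.0))
env.run()
```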
simdistserve/base/request.py (26 additions, 0 deletions)
@@ -7,6 +7,8 @@
E_DO_PREFILL = "do_prefill"
E_WAIT_DECODE = "wait_decode"
E_DO_DECODE = "do_decode"
E_WAIT_KVCACHE_MIGRATION = "wait_kvcache_migration"
E_DO_KVCACHE_MIGRATION = "do_kvcache_migration"
E_FINISH_PREFILL = "finish_prefill"
E_FINISH_DECODE = "finish_decode"
E_EXIT_SYSTEM = "exit_system"
@@ -61,10 +63,27 @@ def __init__(
# Set this value if the request belongs to a particular chunk.
# The last worker in the pipeline unsets this value at the end of a chunk.
self.chunk_id = None
# After the request finishes prefill, `kvcache_generated` should be set to `True`.
self.kvcache_generated = False
self.prefill_is_finished = False
# Workers that perform prefill/decode under pipeline parallelism (PP).
self.prefill_workers = []
self.decode_workers = []
# Workers that have already sent/received the KV cache.
self.migrated_prefill_workers = []
self.migrated_decode_workers = []
# Event that fires when KV cache migration finishes.
self.migrate_event = None
# Migration time for a single prefill worker.
self.migrate_time = 0

@property
def current_context_len(self):
return self.prefill_lens + max(0, self.counter)

@property
def kvcache_migrate_is_done(self):
return len(self.migrated_prefill_workers) == len(self.prefill_workers)

def _log_event(self, event, wid=-1):
if not self.env:
@@ -88,6 +107,12 @@ def wait_decode(self, wid=None):

def do_decode(self, wid=None):
self._log_event(E_DO_DECODE, wid=wid)

def wait_kvcache_migration(self, wid=None):
self._log_event(E_WAIT_KVCACHE_MIGRATION, wid=wid)

def do_kvcache_migration(self, wid=None):
self._log_event(E_DO_KVCACHE_MIGRATION, wid=wid)

def _reset_chunked_prefill_metadata(self):
"""Reset the metadata of chunked prefill."""
@@ -111,6 +136,7 @@ def finish_prefill(self, is_finished_one_round=False, wid=None, next_wid=None):
# Reset counter to 0
# TODO: Should we do self.counter += 1?
self.counter = 0
self.prefill_is_finished = True
# Hack to ensure "wait_decode" appears at least once.
self.wait_decode(wid=next_wid)
if not self.should_finish():