From 7f488b1ddc00bf123121d94671c09e52389731cf Mon Sep 17 00:00:00 2001
From: psychedelicious <4822129+psychedelicious@users.noreply.github.com>
Date: Sat, 7 Jun 2025 16:40:36 +1000
Subject: [PATCH 1/3] fix(app): reduce peak memory usage

We've long suspected there is a memory leak in Invoke, but that may not
be true. What looks like a memory leak may in fact be the expected
behaviour for our allocation patterns. We observe a ~20 to ~30 MB
increase in memory usage per session executed.

I ran some prolonged tests, measuring the process's RSS in bytes while
doing 200 SDXL generations. Memory usage eventually leveled off at
around 100 generations, by which point it had climbed ~900 MB from its
starting point.

I used tracemalloc to diff the allocations of single session executions
and found that we allocate ~20 MB or so per session in
`ModelPatcher.apply_ti()`.

In `ModelPatcher.apply_ti()` we add tokens to the tokenizer when
handling TIs. The added tokens should be scoped to only the current
invocation, but there is no simple way to remove them afterwards. As a
workaround, we clone the tokenizer, add the TI tokens to the clone, and
use the clone when running compel. Afterwards, the cloned tokenizer is
discarded.

The cloned tokenizer uses ~20 MB of memory and holds referrers/referents
to other compel objects. This is what causes the observed per-session
memory increase.

We'd expect these objects to be GC'd, but Python doesn't do that
immediately. After creating the cond tensors, we move straight on to
denoising, so the GC gets no chance to run and free existing memory
arenas/blocks for reuse. Instead, Python has to request more memory from
the OS.

We can improve the situation by immediately calling `del` on the
tokenizer clone and related objects. In fact, the compel nodes already
`del` some of these objects, but not all of them. Adding the missing
`del`s vastly improves things: we now hit peak RSS in half as many
sessions (~50 or fewer), and the peak is only ~100 MB above the starting
value. There is still a gradual increase in memory usage until we level
off.
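
For reference, the kind of measurement described above can be
reproduced with a minimal sketch like the one below. This is not the
exact harness used; it assumes psutil is available for reading RSS, and
`run_session()` is a hypothetical stand-in for executing one queue
item, not an Invoke API:

    import tracemalloc

    import psutil

    proc = psutil.Process()
    tracemalloc.start()

    before = tracemalloc.take_snapshot()
    run_session()  # hypothetical placeholder for executing one session
    after = tracemalloc.take_snapshot()

    # Top allocation deltas between the two snapshots, grouped by source line.
    for stat in after.compare_to(before, "lineno")[:10]:
        print(stat)

    # Process-wide resident set size, in MB.
    print(f"RSS: {proc.memory_info().rss / 1e6:.1f} MB")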
---
 invokeai/app/invocations/compel.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/invokeai/app/invocations/compel.py b/invokeai/app/invocations/compel.py
index 36c9d7b6522..3ad05bcc9b9 100644
--- a/invokeai/app/invocations/compel.py
+++ b/invokeai/app/invocations/compel.py
@@ -114,6 +114,13 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]:
 
             c, _options = compel.build_conditioning_tensor_for_conjunction(conjunction)
 
+        del compel
+        del patched_tokenizer
+        del tokenizer
+        del ti_manager
+        del text_encoder
+        del text_encoder_info
+
         c = c.detach().to("cpu")
 
         conditioning_data = ConditioningFieldData(conditionings=[BasicConditioningInfo(embeds=c)])
@@ -222,7 +229,10 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]:
         else:
             c_pooled = None
 
+        del compel
+        del patched_tokenizer
         del tokenizer
+        del ti_manager
         del text_encoder
         del text_encoder_info
 

From b6feade57134e803266f1e1450a449bafebbad32 Mon Sep 17 00:00:00 2001
From: psychedelicious <4822129+psychedelicious@users.noreply.github.com>
Date: Sat, 7 Jun 2025 18:23:05 +1000
Subject: [PATCH 2/3] perf(app): skip TI logic when no TIs to apply

---
 invokeai/backend/model_patcher.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/invokeai/backend/model_patcher.py b/invokeai/backend/model_patcher.py
index 3d614077c1f..a1d8bbed0a5 100644
--- a/invokeai/backend/model_patcher.py
+++ b/invokeai/backend/model_patcher.py
@@ -46,6 +46,10 @@ def apply_ti(
         text_encoder: Union[CLIPTextModel, CLIPTextModelWithProjection],
         ti_list: List[Tuple[str, TextualInversionModelRaw]],
     ) -> Iterator[Tuple[CLIPTokenizer, TextualInversionManager]]:
+        if len(ti_list) == 0:
+            yield tokenizer, TextualInversionManager(tokenizer)
+            return
+
         init_tokens_count = None
         new_tokens_added = None
 

From 82fb6e50a9d3e925b0a361b7f9ab734c2cab7dda Mon Sep 17 00:00:00 2001
From: psychedelicious <4822129+psychedelicious@users.noreply.github.com>
Date: Sat, 7 Jun 2025 18:37:25 +1000
Subject: [PATCH 3/3] perf(app): gc before every queue item

This reduces peak memory usage at a negligible cost. Queue items
typically take on the order of seconds, making the time cost of a GC
essentially free. Not a great idea on a hotter code path, though.

---
 .../session_processor/session_processor_default.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/invokeai/app/services/session_processor/session_processor_default.py b/invokeai/app/services/session_processor/session_processor_default.py
index 07b1bacfc48..6c320eabda5 100644
--- a/invokeai/app/services/session_processor/session_processor_default.py
+++ b/invokeai/app/services/session_processor/session_processor_default.py
@@ -1,3 +1,4 @@
+import gc
 import traceback
 from contextlib import suppress
 from threading import BoundedSemaphore, Thread
@@ -439,6 +440,12 @@ def _process(
                     poll_now_event.wait(self._polling_interval)
                     continue
 
+                # GC-ing here can reduce peak memory usage of the invoke process by freeing allocated memory blocks.
+                # Most queue items take seconds to execute, so the relative cost of a GC is very small.
+                # Python will never cede allocated memory back to the OS, so anything we can do to reduce the peak
+                # allocation is well worth it.
+                gc.collect()
+
                 self._invoker.services.logger.info(
                     f"Executing queue item {self._queue_item.item_id}, session {self._queue_item.session_id}"
                 )