diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h
index 63888eab7b4481..63bcbcee912c0f 100644
--- a/Include/internal/pycore_global_objects_fini_generated.h
+++ b/Include/internal/pycore_global_objects_fini_generated.h
@@ -1261,6 +1261,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(size));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sizehint));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(skip_file_prefixes));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(skip_non_matching_threads));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sleep));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sock));
     _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sort));
diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h
index b863a7c970e3d4..681d41cbde32a2 100644
--- a/Include/internal/pycore_global_strings.h
+++ b/Include/internal/pycore_global_strings.h
@@ -752,6 +752,7 @@ struct _Py_global_strings {
         STRUCT_FOR_ID(size)
         STRUCT_FOR_ID(sizehint)
         STRUCT_FOR_ID(skip_file_prefixes)
+        STRUCT_FOR_ID(skip_non_matching_threads)
         STRUCT_FOR_ID(sleep)
         STRUCT_FOR_ID(sock)
         STRUCT_FOR_ID(sort)
diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h
index 3ce7200ffeb6a4..840f6940ca273e 100644
--- a/Include/internal/pycore_runtime_init_generated.h
+++ b/Include/internal/pycore_runtime_init_generated.h
@@ -1259,6 +1259,7 @@ extern "C" {
     INIT_ID(size), \
     INIT_ID(sizehint), \
     INIT_ID(skip_file_prefixes), \
+    INIT_ID(skip_non_matching_threads), \
     INIT_ID(sleep), \
     INIT_ID(sock), \
     INIT_ID(sort), \
diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h
index e76e603230a5db..3bd14f958da839 100644
--- a/Include/internal/pycore_unicodeobject_generated.h
+++ b/Include/internal/pycore_unicodeobject_generated.h
@@ -2796,6 +2796,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
     assert(PyUnicode_GET_LENGTH(string) != 1);
+    string = &_Py_ID(skip_non_matching_threads);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
     string = &_Py_ID(sleep);
     _PyUnicode_InternStatic(interp, &string);
     assert(_PyUnicode_CheckConsistency(string, 1));
diff --git a/Lib/profiling/sampling/collector.py b/Lib/profiling/sampling/collector.py
index 112d3071a1148c..3333e7bc99d177 100644
--- a/Lib/profiling/sampling/collector.py
+++ b/Lib/profiling/sampling/collector.py
@@ -1,5 +1,17 @@
 from abc import ABC, abstractmethod
 
+# Enums are slow
+THREAD_STATE_RUNNING = 0
+THREAD_STATE_IDLE = 1
+THREAD_STATE_GIL_WAIT = 2
+THREAD_STATE_UNKNOWN = 3
+
+STATUS = {
+    THREAD_STATE_RUNNING: "running",
+    THREAD_STATE_IDLE: "idle",
+    THREAD_STATE_GIL_WAIT: "gil_wait",
+    THREAD_STATE_UNKNOWN: "unknown",
+}
 
 class Collector(ABC):
     @abstractmethod
@@ -10,10 +22,12 @@ def collect(self, stack_frames):
     def export(self, filename):
         """Export collected data to a file."""
 
-    def _iter_all_frames(self, stack_frames):
+    def _iter_all_frames(self, stack_frames, skip_idle=False):
         """Iterate over all frame stacks from all interpreters and threads."""
         for interpreter_info in stack_frames:
             for thread_info in interpreter_info.threads:
+                if skip_idle and thread_info.status != THREAD_STATE_RUNNING:
+                    continue
                 frames = thread_info.frame_info
                 if frames:
                     yield frames
diff --git a/Lib/profiling/sampling/pstats_collector.py b/Lib/profiling/sampling/pstats_collector.py
index d492c15bb2aaf8..dec81b60659c53 100644
--- a/Lib/profiling/sampling/pstats_collector.py
+++ b/Lib/profiling/sampling/pstats_collector.py
@@ -5,7 +5,7 @@
 
 
 class PstatsCollector(Collector):
-    def __init__(self, sample_interval_usec):
+    def __init__(self, sample_interval_usec, *, skip_idle=False):
         self.result = collections.defaultdict(
             lambda: dict(total_rec_calls=0, direct_calls=0, cumulative_calls=0)
         )
@@ -14,6 +14,7 @@ def __init__(self, sample_interval_usec):
         self.callers = collections.defaultdict(
             lambda: collections.defaultdict(int)
         )
+        self.skip_idle = skip_idle
 
     def _process_frames(self, frames):
         """Process a single thread's frame stack."""
@@ -40,7 +41,7 @@ def _process_frames(self, frames):
             self.callers[callee][caller] += 1
 
     def collect(self, stack_frames):
-        for frames in self._iter_all_frames(stack_frames):
+        for frames in self._iter_all_frames(stack_frames, skip_idle=self.skip_idle):
             self._process_frames(frames)
 
     def export(self, filename):
diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py
index 8a65f312234730..20437481a0af98 100644
--- a/Lib/profiling/sampling/sample.py
+++ b/Lib/profiling/sampling/sample.py
@@ -15,6 +15,21 @@
 from .stack_collector import CollapsedStackCollector, FlamegraphCollector
 
 _FREE_THREADED_BUILD = sysconfig.get_config_var("Py_GIL_DISABLED") is not None
+
+# Profiling mode constants
+PROFILING_MODE_WALL = 0
+PROFILING_MODE_CPU = 1
+PROFILING_MODE_GIL = 2
+
+
+def _parse_mode(mode_string):
+    """Convert mode string to mode constant."""
+    mode_map = {
+        "wall": PROFILING_MODE_WALL,
+        "cpu": PROFILING_MODE_CPU,
+        "gil": PROFILING_MODE_GIL,
+    }
+    return mode_map[mode_string]
 _HELP_DESCRIPTION = """Sample a process's stack frames and generate profiling data.
 Supports the following target modes:
   - -p PID: Profile an existing process by PID
@@ -120,18 +135,18 @@ def _run_with_sync(original_cmd):
 
 
 class SampleProfiler:
-    def __init__(self, pid, sample_interval_usec, all_threads):
+    def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL):
         self.pid = pid
         self.sample_interval_usec = sample_interval_usec
         self.all_threads = all_threads
         if _FREE_THREADED_BUILD:
             self.unwinder = _remote_debugging.RemoteUnwinder(
-                self.pid, all_threads=self.all_threads
+                self.pid, all_threads=self.all_threads, mode=mode
             )
         else:
             only_active_threads = bool(self.all_threads)
             self.unwinder = _remote_debugging.RemoteUnwinder(
-                self.pid, only_active_thread=only_active_threads
+                self.pid, only_active_thread=only_active_threads, mode=mode
             )
         # Track sample intervals and total sample count
         self.sample_intervals = deque(maxlen=100)
@@ -596,21 +611,25 @@ def sample(
     show_summary=True,
     output_format="pstats",
     realtime_stats=False,
+    mode=PROFILING_MODE_WALL,
 ):
     profiler = SampleProfiler(
-        pid, sample_interval_usec, all_threads=all_threads
+        pid, sample_interval_usec, all_threads=all_threads, mode=mode
     )
     profiler.realtime_stats = realtime_stats
 
+    # Determine skip_idle for collector compatibility
+    skip_idle = mode != PROFILING_MODE_WALL
+
     collector = None
     match output_format:
         case "pstats":
-            collector = PstatsCollector(sample_interval_usec)
+            collector = PstatsCollector(sample_interval_usec, skip_idle=skip_idle)
         case "collapsed":
-            collector = CollapsedStackCollector()
+            collector = CollapsedStackCollector(skip_idle=skip_idle)
             filename = filename or f"collapsed.{pid}.txt"
         case "flamegraph":
-            collector = FlamegraphCollector()
+            collector = FlamegraphCollector(skip_idle=skip_idle)
             filename = filename or f"flamegraph.{pid}.html"
         case _:
             raise ValueError(f"Invalid output format: {output_format}")
@@ -661,6 +680,8 @@ def wait_for_process_and_sample(pid, sort_value, args):
     if not filename and args.format == "collapsed":
         filename = f"collapsed.{pid}.txt"
 
+    mode = _parse_mode(args.mode)
+
     sample(
         pid,
         sort=sort_value,
@@ -672,6 +693,7 @@ def wait_for_process_and_sample(pid, sort_value, args):
         show_summary=not args.no_summary,
         output_format=args.format,
         realtime_stats=args.realtime_stats,
+        mode=mode,
     )
 
 
@@ -726,6 +748,15 @@ def main():
         help="Print real-time sampling statistics (Hz, mean, min, max, stdev) during profiling",
     )
 
+    # Mode options
+    mode_group = parser.add_argument_group("Mode options")
+    mode_group.add_argument(
+        "--mode",
+        choices=["wall", "cpu", "gil"],
+        default="wall",
+        help="Sampling mode: wall (all threads), cpu (only CPU-running threads), gil (only GIL-holding threads)",
+    )
+
     # Output format selection
     output_group = parser.add_argument_group("Output options")
     output_format = output_group.add_mutually_exclusive_group()
@@ -850,6 +881,8 @@ def main():
     elif target_count > 1:
         parser.error("only one target type can be specified: -p/--pid, -m/--module, or script")
 
+    mode = _parse_mode(args.mode)
+
     if args.pid:
         sample(
             args.pid,
@@ -862,6 +895,7 @@ def main():
             show_summary=not args.no_summary,
             output_format=args.format,
             realtime_stats=args.realtime_stats,
+            mode=mode,
         )
     elif args.module or args.args:
         if args.module:
diff --git a/Lib/profiling/sampling/stack_collector.py b/Lib/profiling/sampling/stack_collector.py
index 0588f822cd54f2..6983be70ee0440 100644
--- a/Lib/profiling/sampling/stack_collector.py
+++ b/Lib/profiling/sampling/stack_collector.py
@@ -11,8 +11,11 @@
 
 
 class StackTraceCollector(Collector):
-    def collect(self, stack_frames):
-        for frames in self._iter_all_frames(stack_frames):
+    def __init__(self, *, skip_idle=False):
+        self.skip_idle = skip_idle
+
+    def collect(self, stack_frames, skip_idle=False):
+        for frames in self._iter_all_frames(stack_frames, skip_idle=skip_idle):
             if not frames:
                 continue
             self.process_frames(frames)
@@ -22,7 +25,8 @@ def process_frames(self, frames):
 
 
 class CollapsedStackCollector(StackTraceCollector):
-    def __init__(self):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.stack_counter = collections.Counter()
 
     def process_frames(self, frames):
@@ -46,7 +50,8 @@ def export(self, filename):
 
 
 class FlamegraphCollector(StackTraceCollector):
-    def __init__(self):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.stats = {}
         self._root = {"samples": 0, "children": {}}
         self._total_samples = 0
diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py
index 262e472da7eac3..2f8f5f0e169339 100644
--- a/Lib/test/test_external_inspection.py
+++ b/Lib/test/test_external_inspection.py
@@ -19,6 +19,11 @@
 
 import subprocess
 
+# Profiling mode constants
+PROFILING_MODE_WALL = 0
+PROFILING_MODE_CPU = 1
+PROFILING_MODE_GIL = 2
+
 try:
     from concurrent import interpreters
 except ImportError:
@@ -1670,6 +1675,228 @@ def test_unsupported_platform_error(self):
             str(cm.exception)
         )
 
+class TestDetectionOfThreadStatus(unittest.TestCase):
+    @unittest.skipIf(
+        sys.platform not in ("linux", "darwin", "win32"),
+        "Test only runs on unsupported platforms (not Linux, macOS, or Windows)",
+    )
+    @unittest.skipIf(sys.platform == "android", "Android raises Linux-specific exception")
+    def test_thread_status_detection(self):
+        port = find_unused_port()
+        script = textwrap.dedent(
+            f"""\
+            import time, sys, socket, threading
+            import os
+
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            sock.connect(('localhost', {port}))
+
+            def sleeper():
+                tid = threading.get_native_id()
+                sock.sendall(f'ready:sleeper:{{tid}}\\n'.encode())
+                time.sleep(10000)
+
+            def busy():
+                tid = threading.get_native_id()
+                sock.sendall(f'ready:busy:{{tid}}\\n'.encode())
+                x = 0
+                while True:
+                    x = x + 1
+                time.sleep(0.5)
+
+            t1 = threading.Thread(target=sleeper)
+            t2 = threading.Thread(target=busy)
+            t1.start()
+            t2.start()
+            sock.sendall(b'ready:main\\n')
+            t1.join()
+            t2.join()
+            sock.close()
+            """
+        )
+        with os_helper.temp_dir() as work_dir:
+            script_dir = os.path.join(work_dir, "script_pkg")
+            os.mkdir(script_dir)
+            server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            server_socket.bind(("localhost", port))
+            server_socket.settimeout(SHORT_TIMEOUT)
+            server_socket.listen(1)
+
+            script_name = _make_test_script(script_dir, "thread_status_script", script)
+            client_socket = None
+            try:
+                p = subprocess.Popen([sys.executable, script_name])
+                client_socket, _ = server_socket.accept()
+                server_socket.close()
+                response = b""
+                sleeper_tid = None
+                busy_tid = None
+                while True:
+                    chunk = client_socket.recv(1024)
+                    response += chunk
+                    if b"ready:main" in response and b"ready:sleeper" in response and b"ready:busy" in response:
+                        # Parse TIDs from the response
+                        for line in response.split(b"\n"):
+                            if line.startswith(b"ready:sleeper:"):
+                                try:
+                                    sleeper_tid = int(line.split(b":")[-1])
+                                except Exception:
+                                    pass
+                            elif line.startswith(b"ready:busy:"):
+                                try:
+                                    busy_tid = int(line.split(b":")[-1])
+                                except Exception:
+                                    pass
+                        break
+
+                attempts = 10
+                try:
+                    unwinder = RemoteUnwinder(p.pid, all_threads=True, mode=PROFILING_MODE_CPU,
+                                                skip_non_matching_threads=False)
+                    for _ in range(attempts):
+                        traces = unwinder.get_stack_trace()
+                        # Check if any thread is running
+                        if any(thread_info.status == 0 for interpreter_info in traces
+                               for thread_info in interpreter_info.threads):
+                            break
+                        time.sleep(0.5)  # Give a bit of time to let threads settle
+                except PermissionError:
+                    self.skipTest(
+                        "Insufficient permissions to read the stack trace"
+                    )
+
+
+                # Find threads and their statuses
+                statuses = {}
+                for interpreter_info in traces:
+                    for thread_info in interpreter_info.threads:
+                        statuses[thread_info.thread_id] = thread_info.status
+
+                self.assertIsNotNone(sleeper_tid, "Sleeper thread id not received")
+                self.assertIsNotNone(busy_tid, "Busy thread id not received")
+                self.assertIn(sleeper_tid, statuses, "Sleeper tid not found in sampled threads")
+                self.assertIn(busy_tid, statuses, "Busy tid not found in sampled threads")
+                self.assertEqual(statuses[sleeper_tid], 1, "Sleeper thread should be idle (1)")
+                self.assertEqual(statuses[busy_tid], 0, "Busy thread should be running (0)")
+
+            finally:
+                if client_socket is not None:
+                    client_socket.close()
+                p.terminate()
+                p.wait(timeout=SHORT_TIMEOUT)
+
+    @unittest.skipIf(
+        sys.platform not in ("linux", "darwin", "win32"),
+        "Test only runs on unsupported platforms (not Linux, macOS, or Windows)",
+    )
+    @unittest.skipIf(sys.platform == "android", "Android raises Linux-specific exception")
+    def test_thread_status_gil_detection(self):
+        port = find_unused_port()
+        script = textwrap.dedent(
+            f"""\
+            import time, sys, socket, threading
+            import os
+
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            sock.connect(('localhost', {port}))
+
+            def sleeper():
+                tid = threading.get_native_id()
+                sock.sendall(f'ready:sleeper:{{tid}}\\n'.encode())
+                time.sleep(10000)
+
+            def busy():
+                tid = threading.get_native_id()
+                sock.sendall(f'ready:busy:{{tid}}\\n'.encode())
+                x = 0
+                while True:
+                    x = x + 1
+                time.sleep(0.5)
+
+            t1 = threading.Thread(target=sleeper)
+            t2 = threading.Thread(target=busy)
+            t1.start()
+            t2.start()
+            sock.sendall(b'ready:main\\n')
+            t1.join()
+            t2.join()
+            sock.close()
+            """
+        )
+        with os_helper.temp_dir() as work_dir:
+            script_dir = os.path.join(work_dir, "script_pkg")
+            os.mkdir(script_dir)
+            server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            server_socket.bind(("localhost", port))
+            server_socket.settimeout(SHORT_TIMEOUT)
+            server_socket.listen(1)
+
+            script_name = _make_test_script(script_dir, "thread_status_script", script)
+            client_socket = None
+            try:
+                p = subprocess.Popen([sys.executable, script_name])
+                client_socket, _ = server_socket.accept()
+                server_socket.close()
+                response = b""
+                sleeper_tid = None
+                busy_tid = None
+                while True:
+                    chunk = client_socket.recv(1024)
+                    response += chunk
+                    if b"ready:main" in response and b"ready:sleeper" in response and b"ready:busy" in response:
+                        # Parse TIDs from the response
+                        for line in response.split(b"\n"):
+                            if line.startswith(b"ready:sleeper:"):
+                                try:
+                                    sleeper_tid = int(line.split(b":")[-1])
+                                except Exception:
+                                    pass
+                            elif line.startswith(b"ready:busy:"):
+                                try:
+                                    busy_tid = int(line.split(b":")[-1])
+                                except Exception:
+                                    pass
+                        break
+
+                attempts = 10
+                try:
+                    unwinder = RemoteUnwinder(p.pid, all_threads=True, mode=PROFILING_MODE_GIL,
+                                                skip_non_matching_threads=False)
+                    for _ in range(attempts):
+                        traces = unwinder.get_stack_trace()
+                        # Check if any thread is running
+                        if any(thread_info.status == 0 for interpreter_info in traces
+                               for thread_info in interpreter_info.threads):
+                            break
+                        time.sleep(0.5)  # Give a bit of time to let threads settle
+                except PermissionError:
+                    self.skipTest(
+                        "Insufficient permissions to read the stack trace"
+                    )
+
+
+                # Find threads and their statuses
+                statuses = {}
+                for interpreter_info in traces:
+                    for thread_info in interpreter_info.threads:
+                        statuses[thread_info.thread_id] = thread_info.status
+
+                self.assertIsNotNone(sleeper_tid, "Sleeper thread id not received")
+                self.assertIsNotNone(busy_tid, "Busy thread id not received")
+                self.assertIn(sleeper_tid, statuses, "Sleeper tid not found in sampled threads")
+                self.assertIn(busy_tid, statuses, "Busy tid not found in sampled threads")
+                self.assertEqual(statuses[sleeper_tid], 2, "Sleeper thread should be idle (1)")
+                self.assertEqual(statuses[busy_tid], 0, "Busy thread should be running (0)")
+
+            finally:
+                if client_socket is not None:
+                    client_socket.close()
+                p.terminate()
+                p.wait(timeout=SHORT_TIMEOUT)
+
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/Lib/test/test_profiling/test_sampling_profiler.py b/Lib/test/test_profiling/test_sampling_profiler.py
index a6ca0fea0d46e4..fd6b1862230288 100644
--- a/Lib/test/test_profiling/test_sampling_profiler.py
+++ b/Lib/test/test_profiling/test_sampling_profiler.py
@@ -1999,6 +1999,7 @@ def test_cli_module_argument_parsing(self):
                 show_summary=True,
                 output_format="pstats",
                 realtime_stats=False,
+                mode=0
             )
 
     @unittest.skipIf(is_emscripten, "socket.SO_REUSEADDR does not exist")
@@ -2026,6 +2027,7 @@ def test_cli_module_with_arguments(self):
                 show_summary=True,
                 output_format="pstats",
                 realtime_stats=False,
+                mode=0
             )
 
     @unittest.skipIf(is_emscripten, "socket.SO_REUSEADDR does not exist")
@@ -2053,6 +2055,7 @@ def test_cli_script_argument_parsing(self):
                 show_summary=True,
                 output_format="pstats",
                 realtime_stats=False,
+                mode=0
             )
 
     @unittest.skipIf(is_emscripten, "socket.SO_REUSEADDR does not exist")
@@ -2152,6 +2155,7 @@ def test_cli_module_with_profiler_options(self):
                 show_summary=True,
                 output_format="pstats",
                 realtime_stats=False,
+                mode=0
             )
 
     @unittest.skipIf(is_emscripten, "socket.SO_REUSEADDR does not exist")
@@ -2185,6 +2189,7 @@ def test_cli_script_with_profiler_options(self):
                 show_summary=True,
                 output_format="collapsed",
                 realtime_stats=False,
+                mode=0
             )
 
     def test_cli_empty_module_name(self):
@@ -2396,6 +2401,7 @@ def test_argument_parsing_basic(self):
                 show_summary=True,
                 output_format="pstats",
                 realtime_stats=False,
+                mode=0
             )
 
     def test_sort_options(self):
@@ -2426,5 +2432,360 @@ def test_sort_options(self):
                 mock_sample.reset_mock()
 
 
+class TestCpuModeFiltering(unittest.TestCase):
+    """Test CPU mode filtering functionality (--mode=cpu)."""
+
+    def test_mode_validation(self):
+        """Test that CLI validates mode choices correctly."""
+        # Invalid mode choice should raise SystemExit
+        test_args = ["profiling.sampling.sample", "--mode", "invalid", "-p", "12345"]
+
+        with (
+            mock.patch("sys.argv", test_args),
+            mock.patch("sys.stderr", io.StringIO()) as mock_stderr,
+            self.assertRaises(SystemExit) as cm,
+        ):
+            profiling.sampling.sample.main()
+
+        self.assertEqual(cm.exception.code, 2)  # argparse error
+        error_msg = mock_stderr.getvalue()
+        self.assertIn("invalid choice", error_msg)
+
+    def test_frames_filtered_with_skip_idle(self):
+        """Test that frames are actually filtered when skip_idle=True."""
+        # Create mock frames with different thread statuses
+        class MockThreadInfoWithStatus:
+            def __init__(self, thread_id, frame_info, status):
+                self.thread_id = thread_id
+                self.frame_info = frame_info
+                self.status = status
+
+        # Create test data: running thread, idle thread, and another running thread
+        test_frames = [
+            MockInterpreterInfo(0, [
+                MockThreadInfoWithStatus(1, [MockFrameInfo("active1.py", 10, "active_func1")], 0),  # RUNNING
+                MockThreadInfoWithStatus(2, [MockFrameInfo("idle.py", 20, "idle_func")], 1),        # IDLE
+                MockThreadInfoWithStatus(3, [MockFrameInfo("active2.py", 30, "active_func2")], 0),  # RUNNING
+            ])
+        ]
+
+        # Test with skip_idle=True - should only process running threads
+        collector_skip = PstatsCollector(sample_interval_usec=1000, skip_idle=True)
+        collector_skip.collect(test_frames)
+
+        # Should only have functions from running threads (status 0)
+        active1_key = ("active1.py", 10, "active_func1")
+        active2_key = ("active2.py", 30, "active_func2")
+        idle_key = ("idle.py", 20, "idle_func")
+
+        self.assertIn(active1_key, collector_skip.result)
+        self.assertIn(active2_key, collector_skip.result)
+        self.assertNotIn(idle_key, collector_skip.result)  # Idle thread should be filtered out
+
+        # Test with skip_idle=False - should process all threads
+        collector_no_skip = PstatsCollector(sample_interval_usec=1000, skip_idle=False)
+        collector_no_skip.collect(test_frames)
+
+        # Should have functions from all threads
+        self.assertIn(active1_key, collector_no_skip.result)
+        self.assertIn(active2_key, collector_no_skip.result)
+        self.assertIn(idle_key, collector_no_skip.result)  # Idle thread should be included
+
+    @requires_subprocess()
+    def test_cpu_mode_integration_filtering(self):
+        """Integration test: CPU mode should only capture active threads, not idle ones."""
+        # Script with one mostly-idle thread and one CPU-active thread
+        cpu_vs_idle_script = '''
+import time
+import threading
+
+def idle_worker():
+    time.sleep(999999)
+
+def cpu_active_worker():
+    x = 1
+    while True:
+        x += 1
+
+def main():
+# Start both threads
+    idle_thread = threading.Thread(target=idle_worker)
+    cpu_thread = threading.Thread(target=cpu_active_worker)
+    idle_thread.start()
+    cpu_thread.start()
+    idle_thread.join()
+    cpu_thread.join()
+
+main()
+
+'''
+        with test_subprocess(cpu_vs_idle_script) as proc:
+            with (
+                io.StringIO() as captured_output,
+                mock.patch("sys.stdout", captured_output),
+            ):
+                try:
+                    profiling.sampling.sample.sample(
+                        proc.pid,
+                        duration_sec=0.5,
+                        sample_interval_usec=5000,
+                        mode=1,  # CPU mode
+                        show_summary=False,
+                        all_threads=True,
+                    )
+                except (PermissionError, RuntimeError) as e:
+                    self.skipTest("Insufficient permissions for remote profiling")
+
+                cpu_mode_output = captured_output.getvalue()
+
+            # Test wall-clock mode (mode=0) - should capture both functions
+            with (
+                io.StringIO() as captured_output,
+                mock.patch("sys.stdout", captured_output),
+            ):
+                try:
+                    profiling.sampling.sample.sample(
+                        proc.pid,
+                        duration_sec=0.5,
+                        sample_interval_usec=5000,
+                        mode=0,  # Wall-clock mode
+                        show_summary=False,
+                        all_threads=True,
+                    )
+                except (PermissionError, RuntimeError) as e:
+                    self.skipTest("Insufficient permissions for remote profiling")
+
+                wall_mode_output = captured_output.getvalue()
+
+            # Verify both modes captured samples
+            self.assertIn("Captured", cpu_mode_output)
+            self.assertIn("samples", cpu_mode_output)
+            self.assertIn("Captured", wall_mode_output)
+            self.assertIn("samples", wall_mode_output)
+
+            # CPU mode should strongly favor cpu_active_worker over mostly_idle_worker
+            self.assertIn("cpu_active_worker", cpu_mode_output)
+            self.assertNotIn("idle_worker", cpu_mode_output)
+
+            # Wall-clock mode should capture both types of work
+            self.assertIn("cpu_active_worker", wall_mode_output)
+            self.assertIn("idle_worker", wall_mode_output)
+
+
+class TestGilModeFiltering(unittest.TestCase):
+    """Test GIL mode filtering functionality (--mode=gil)."""
+
+    def test_gil_mode_validation(self):
+        """Test that CLI accepts gil mode choice correctly."""
+        test_args = ["profiling.sampling.sample", "--mode", "gil", "-p", "12345"]
+
+        with (
+            mock.patch("sys.argv", test_args),
+            mock.patch("profiling.sampling.sample.sample") as mock_sample,
+        ):
+            try:
+                profiling.sampling.sample.main()
+            except SystemExit:
+                pass  # Expected due to invalid PID
+
+        # Should have attempted to call sample with mode=2 (GIL mode)
+        mock_sample.assert_called_once()
+        call_args = mock_sample.call_args[1]
+        self.assertEqual(call_args["mode"], 2)  # PROFILING_MODE_GIL
+
+    def test_gil_mode_sample_function_call(self):
+        """Test that sample() function correctly uses GIL mode."""
+        with (
+            mock.patch("profiling.sampling.sample.SampleProfiler") as mock_profiler,
+            mock.patch("profiling.sampling.sample.PstatsCollector") as mock_collector,
+        ):
+            # Mock the profiler instance
+            mock_instance = mock.Mock()
+            mock_profiler.return_value = mock_instance
+
+            # Mock the collector instance
+            mock_collector_instance = mock.Mock()
+            mock_collector.return_value = mock_collector_instance
+
+            # Call sample with GIL mode and a filename to avoid pstats creation
+            profiling.sampling.sample.sample(
+                12345,
+                mode=2,  # PROFILING_MODE_GIL
+                duration_sec=1,
+                sample_interval_usec=1000,
+                filename="test_output.txt",
+            )
+
+            # Verify SampleProfiler was created with correct mode
+            mock_profiler.assert_called_once()
+            call_args = mock_profiler.call_args
+            self.assertEqual(call_args[1]['mode'], 2)  # mode parameter
+
+            # Verify profiler.sample was called
+            mock_instance.sample.assert_called_once()
+
+            # Verify collector.export was called since we provided a filename
+            mock_collector_instance.export.assert_called_once_with("test_output.txt")
+
+    def test_gil_mode_collector_configuration(self):
+        """Test that collectors are configured correctly for GIL mode."""
+        with (
+            mock.patch("profiling.sampling.sample.SampleProfiler") as mock_profiler,
+            mock.patch("profiling.sampling.sample.PstatsCollector") as mock_collector,
+        ):
+            # Mock the profiler instance
+            mock_instance = mock.Mock()
+            mock_profiler.return_value = mock_instance
+
+            # Call sample with GIL mode
+            profiling.sampling.sample.sample(
+                12345,
+                mode=2,  # PROFILING_MODE_GIL
+                output_format="pstats",
+            )
+
+            # Verify collector was created with skip_idle=True (since mode != WALL)
+            mock_collector.assert_called_once()
+            call_args = mock_collector.call_args[1]
+            self.assertTrue(call_args['skip_idle'])
+
+    def test_gil_mode_with_collapsed_format(self):
+        """Test GIL mode with collapsed stack format."""
+        with (
+            mock.patch("profiling.sampling.sample.SampleProfiler") as mock_profiler,
+            mock.patch("profiling.sampling.sample.CollapsedStackCollector") as mock_collector,
+        ):
+            # Mock the profiler instance
+            mock_instance = mock.Mock()
+            mock_profiler.return_value = mock_instance
+
+            # Call sample with GIL mode and collapsed format
+            profiling.sampling.sample.sample(
+                12345,
+                mode=2,  # PROFILING_MODE_GIL
+                output_format="collapsed",
+                filename="test_output.txt",
+            )
+
+            # Verify collector was created with skip_idle=True
+            mock_collector.assert_called_once()
+            call_args = mock_collector.call_args[1]
+            self.assertTrue(call_args['skip_idle'])
+
+    def test_gil_mode_cli_argument_parsing(self):
+        """Test CLI argument parsing for GIL mode with various options."""
+        test_args = [
+            "profiling.sampling.sample",
+            "--mode", "gil",
+            "--interval", "500",
+            "--duration", "5",
+            "-p", "12345"
+        ]
+
+        with (
+            mock.patch("sys.argv", test_args),
+            mock.patch("profiling.sampling.sample.sample") as mock_sample,
+        ):
+            try:
+                profiling.sampling.sample.main()
+            except SystemExit:
+                pass  # Expected due to invalid PID
+
+        # Verify all arguments were parsed correctly
+        mock_sample.assert_called_once()
+        call_args = mock_sample.call_args[1]
+        self.assertEqual(call_args["mode"], 2)  # GIL mode
+        self.assertEqual(call_args["sample_interval_usec"], 500)
+        self.assertEqual(call_args["duration_sec"], 5)
+
+    @requires_subprocess()
+    def test_gil_mode_integration_behavior(self):
+        """Integration test: GIL mode should capture GIL-holding threads."""
+        # Create a test script with GIL-releasing operations
+        gil_test_script = '''
+import time
+import threading
+
+def gil_releasing_work():
+    time.sleep(999999)
+
+def gil_holding_work():
+    x = 1
+    while True:
+        x += 1
+
+def main():
+# Start both threads
+    idle_thread = threading.Thread(target=gil_releasing_work)
+    cpu_thread = threading.Thread(target=gil_holding_work)
+    idle_thread.start()
+    cpu_thread.start()
+    idle_thread.join()
+    cpu_thread.join()
+
+main()
+'''
+        with test_subprocess(gil_test_script) as proc:
+            with (
+                io.StringIO() as captured_output,
+                mock.patch("sys.stdout", captured_output),
+            ):
+                try:
+                    profiling.sampling.sample.sample(
+                        proc.pid,
+                        duration_sec=0.5,
+                        sample_interval_usec=5000,
+                        mode=2,  # GIL mode
+                        show_summary=False,
+                        all_threads=True,
+                    )
+                except (PermissionError, RuntimeError) as e:
+                    self.skipTest("Insufficient permissions for remote profiling")
+
+                gil_mode_output = captured_output.getvalue()
+
+            # Test wall-clock mode for comparison
+            with (
+                io.StringIO() as captured_output,
+                mock.patch("sys.stdout", captured_output),
+            ):
+                try:
+                    profiling.sampling.sample.sample(
+                        proc.pid,
+                        duration_sec=0.5,
+                        sample_interval_usec=5000,
+                        mode=0,  # Wall-clock mode
+                        show_summary=False,
+                        all_threads=True,
+                    )
+                except (PermissionError, RuntimeError) as e:
+                    self.skipTest("Insufficient permissions for remote profiling")
+
+                wall_mode_output = captured_output.getvalue()
+
+            # GIL mode should primarily capture GIL-holding work
+            # (Note: actual behavior depends on threading implementation)
+            self.assertIn("gil_holding_work", gil_mode_output)
+
+            # Wall-clock mode should capture both types of work
+            self.assertIn("gil_holding_work", wall_mode_output)
+
+    def test_mode_constants_are_defined(self):
+        """Test that all profiling mode constants are properly defined."""
+        self.assertEqual(profiling.sampling.sample.PROFILING_MODE_WALL, 0)
+        self.assertEqual(profiling.sampling.sample.PROFILING_MODE_CPU, 1)
+        self.assertEqual(profiling.sampling.sample.PROFILING_MODE_GIL, 2)
+
+    def test_parse_mode_function(self):
+        """Test the _parse_mode function with all valid modes."""
+        self.assertEqual(profiling.sampling.sample._parse_mode("wall"), 0)
+        self.assertEqual(profiling.sampling.sample._parse_mode("cpu"), 1)
+        self.assertEqual(profiling.sampling.sample._parse_mode("gil"), 2)
+
+        # Test invalid mode raises KeyError
+        with self.assertRaises(KeyError):
+            profiling.sampling.sample._parse_mode("invalid")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/Modules/_remote_debugging_module.c b/Modules/_remote_debugging_module.c
index c306143ee73b18..d56f53aebb8717 100644
--- a/Modules/_remote_debugging_module.c
+++ b/Modules/_remote_debugging_module.c
@@ -34,6 +34,31 @@
 #    define HAVE_PROCESS_VM_READV 0
 #endif
 
+// Returns thread status using proc_pidinfo, caches thread_id_offset on first use (macOS only)
+#if defined(__APPLE__) && TARGET_OS_OSX
+#include <libproc.h>
+#include <sys/types.h>
+#define MAX_NATIVE_THREADS 4096
+#endif
+
+#ifdef MS_WINDOWS
+#include <windows.h>
+#include <winternl.h>
+// ntstatus.h conflicts with windows.h so we have to define the NTSTATUS values we need
+#define STATUS_SUCCESS                   ((NTSTATUS)0x00000000L)
+#define STATUS_INFO_LENGTH_MISMATCH      ((NTSTATUS)0xC0000004L)
+typedef enum _WIN32_THREADSTATE {
+    WIN32_THREADSTATE_INITIALIZED = 0,   // Recognized by the kernel
+    WIN32_THREADSTATE_READY       = 1,   // Prepared to run on the next available processor
+    WIN32_THREADSTATE_RUNNING     = 2,   // Currently executing
+    WIN32_THREADSTATE_STANDBY     = 3,   // About to run, only one thread may be in this state at a time
+    WIN32_THREADSTATE_TERMINATED  = 4,   // Finished executing
+    WIN32_THREADSTATE_WAITING     = 5,   // Not ready for the processor, when ready, it will be rescheduled
+    WIN32_THREADSTATE_TRANSITION  = 6,   // Waiting for resources other than the processor
+    WIN32_THREADSTATE_UNKNOWN     = 7    // Thread state is unknown
+} WIN32_THREADSTATE;
+#endif
+
 /* ============================================================================
  * TYPE DEFINITIONS AND STRUCTURES
  * ============================================================================ */
@@ -153,6 +178,7 @@ static PyStructSequence_Desc CoroInfo_desc = {
 // ThreadInfo structseq type - replaces 2-tuple (thread_id, frame_info)
 static PyStructSequence_Field ThreadInfo_fields[] = {
     {"thread_id", "Thread ID"},
+    {"status", "Thread status"},
     {"frame_info", "Frame information"},
     {NULL}
 };
@@ -211,6 +237,19 @@ typedef struct {
     PyTypeObject *AwaitedInfo_Type;
 } RemoteDebuggingState;
 
+enum _ThreadState {
+    THREAD_STATE_RUNNING,
+    THREAD_STATE_IDLE,
+    THREAD_STATE_GIL_WAIT,
+    THREAD_STATE_UNKNOWN
+};
+
+enum _ProfilingMode {
+    PROFILING_MODE_WALL = 0,
+    PROFILING_MODE_CPU = 1,
+    PROFILING_MODE_GIL = 2
+};
+
 typedef struct {
     PyObject_HEAD
     proc_handle_t handle;
@@ -224,12 +263,21 @@ typedef struct {
     _Py_hashtable_t *code_object_cache;
     int debug;
     int only_active_thread;
+    int mode;  // Use enum _ProfilingMode values
+    int skip_non_matching_threads;  // New option to skip threads that don't match mode
     RemoteDebuggingState *cached_state;  // Cached module state
 #ifdef Py_GIL_DISABLED
     // TLBC cache invalidation tracking
     uint32_t tlbc_generation;  // Track TLBC index pool changes
     _Py_hashtable_t *tlbc_cache;  // Cache of TLBC arrays by code object address
 #endif
+#ifdef __APPLE__
+    uint64_t thread_id_offset;
+#endif
+#ifdef MS_WINDOWS
+    PVOID win_process_buffer;
+    ULONG win_process_buffer_size;
+#endif
 } RemoteUnwinderObject;
 
 #define RemoteUnwinder_CAST(op) ((RemoteUnwinderObject *)(op))
@@ -2453,10 +2501,139 @@ process_frame_chain(
     return 0;
 }
 
+static int
+get_thread_status(RemoteUnwinderObject *unwinder, uint64_t tid, uint64_t pthread_id) {
+#if defined(__APPLE__) && TARGET_OS_OSX
+   if (unwinder->thread_id_offset == 0) {
+        uint64_t *tids = (uint64_t *)PyMem_Malloc(MAX_NATIVE_THREADS * sizeof(uint64_t));
+        if (!tids) {
+            PyErr_NoMemory();
+            return -1;
+        }
+        int n = proc_pidinfo(unwinder->handle.pid, PROC_PIDLISTTHREADS, 0, tids, MAX_NATIVE_THREADS * sizeof(uint64_t)) / sizeof(uint64_t);
+        if (n <= 0) {
+            PyMem_Free(tids);
+            return THREAD_STATE_UNKNOWN;
+        }
+        uint64_t min_offset = UINT64_MAX;
+        for (int i = 0; i < n; i++) {
+            uint64_t offset = tids[i] - pthread_id;
+            if (offset < min_offset) {
+                min_offset = offset;
+            }
+        }
+        unwinder->thread_id_offset = min_offset;
+        PyMem_Free(tids);
+    }
+    struct proc_threadinfo ti;
+    uint64_t tid_with_offset = pthread_id + unwinder->thread_id_offset;
+    if (proc_pidinfo(unwinder->handle.pid, PROC_PIDTHREADINFO, tid_with_offset, &ti, sizeof(ti)) != sizeof(ti)) {
+        return THREAD_STATE_UNKNOWN;
+    }
+    if (ti.pth_run_state == TH_STATE_RUNNING) {
+        return THREAD_STATE_RUNNING;
+    }
+    return THREAD_STATE_IDLE;
+#elif defined(__linux__)
+    char stat_path[256];
+    char buffer[2048] = "";
+
+    snprintf(stat_path, sizeof(stat_path), "/proc/%d/task/%lu/stat", unwinder->handle.pid, tid);
+
+    int fd = open(stat_path, O_RDONLY);
+    if (fd == -1) {
+        return THREAD_STATE_UNKNOWN;
+    }
+
+    if (read(fd, buffer, 2047) == 0) {
+        close(fd);
+        return THREAD_STATE_UNKNOWN;
+    }
+    close(fd);
+
+    char *p = strchr(buffer, ')');
+    if (!p) {
+        return THREAD_STATE_UNKNOWN;
+    }
+
+    p += 2;  // Skip ") "
+    if (*p == ' ') {
+        p++;
+    }
+
+    switch (*p) {
+        case 'R':  // Running
+            return THREAD_STATE_RUNNING;
+        case 'S':  // Interruptible sleep
+        case 'D':  // Uninterruptible sleep
+        case 'T':  // Stopped
+        case 'Z':  // Zombie
+        case 'I':  // Idle kernel thread
+            return THREAD_STATE_IDLE;
+        default:
+            return THREAD_STATE_UNKNOWN;
+    }
+#elif defined(MS_WINDOWS)
+    ULONG n;
+    NTSTATUS status = NtQuerySystemInformation(
+        SystemProcessInformation,
+        unwinder->win_process_buffer,
+        unwinder->win_process_buffer_size,
+        &n
+    );
+    if (status == STATUS_INFO_LENGTH_MISMATCH) {
+        // Buffer was too small so we reallocate a larger one and try again.
+        unwinder->win_process_buffer_size = n;
+        PVOID new_buffer = PyMem_Realloc(unwinder->win_process_buffer, n);
+        if (!new_buffer) {
+            return -1;
+        }
+        unwinder->win_process_buffer = new_buffer;
+        return get_thread_status(unwinder, tid, pthread_id);
+    }
+    if (status != STATUS_SUCCESS) {
+        return -1;
+    }
+
+    SYSTEM_PROCESS_INFORMATION *pi = (SYSTEM_PROCESS_INFORMATION *)unwinder->win_process_buffer;
+    while ((ULONG)(ULONG_PTR)pi->UniqueProcessId != unwinder->handle.pid) {
+        if (pi->NextEntryOffset == 0) {
+            // We didn't find the process
+            return -1;
+        }
+        pi = (SYSTEM_PROCESS_INFORMATION *)(((BYTE *)pi) + pi->NextEntryOffset);
+    }
+
+    SYSTEM_THREAD_INFORMATION *ti = (SYSTEM_THREAD_INFORMATION *)((char *)pi + sizeof(SYSTEM_PROCESS_INFORMATION));
+    for (Py_ssize_t i = 0; i < pi->NumberOfThreads; i++, ti++) {
+        if (ti->ClientId.UniqueThread == (HANDLE)tid) {
+            return ti->ThreadState != WIN32_THREADSTATE_RUNNING ? THREAD_STATE_IDLE : THREAD_STATE_RUNNING;
+        }
+    }
+
+    return -1;
+#else
+    return THREAD_STATE_UNKNOWN;
+#endif
+}
+
+typedef struct {
+    unsigned int initialized:1;
+    unsigned int bound:1;
+    unsigned int unbound:1;
+    unsigned int bound_gilstate:1;
+    unsigned int active:1;
+    unsigned int finalizing:1;
+    unsigned int cleared:1;
+    unsigned int finalized:1;
+    unsigned int :24;
+} _thread_status;
+
 static PyObject*
 unwind_stack_for_thread(
     RemoteUnwinderObject *unwinder,
-    uintptr_t *current_tstate
+    uintptr_t *current_tstate,
+    uintptr_t gil_holder_tstate
 ) {
     PyObject *frame_info = NULL;
     PyObject *thread_id = NULL;
@@ -2471,6 +2648,44 @@ unwind_stack_for_thread(
         goto error;
     }
 
+    long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id);
+
+    // Calculate thread status based on mode
+    int status = THREAD_STATE_UNKNOWN;
+    if (unwinder->mode == PROFILING_MODE_CPU) {
+        long pthread_id = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.thread_id);
+        status = get_thread_status(unwinder, tid, pthread_id);
+        if (status == -1) {
+            PyErr_Print();
+            PyErr_SetString(PyExc_RuntimeError, "Failed to get thread status");
+            goto error;
+        }
+    } else if (unwinder->mode == PROFILING_MODE_GIL) {
+#ifdef Py_GIL_DISABLED
+        // All threads are considered running in free threading builds if they have a thread state attached
+        int active = GET_MEMBER(_thread_status, ts, unwinder->debug_offsets.thread_state.status).active;
+        status = active ? THREAD_STATE_RUNNING : THREAD_STATE_GIL_WAIT;
+#else
+        status = (*current_tstate == gil_holder_tstate) ? THREAD_STATE_RUNNING : THREAD_STATE_GIL_WAIT;
+#endif
+    } else {
+        // PROFILING_MODE_WALL - all threads are considered running
+        status = THREAD_STATE_RUNNING;
+    }
+
+    // Check if we should skip this thread based on mode
+    int should_skip = 0;
+    if (unwinder->skip_non_matching_threads && status != THREAD_STATE_RUNNING &&
+        (unwinder->mode == PROFILING_MODE_CPU || unwinder->mode == PROFILING_MODE_GIL)) {
+        should_skip = 1;
+    }
+
+    if (should_skip) {
+        // Advance to next thread and return NULL to skip processing
+        *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next);
+        return NULL;
+    }
+
     uintptr_t frame_addr = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.current_frame);
 
     frame_info = PyList_New(0);
@@ -2491,8 +2706,7 @@ unwind_stack_for_thread(
 
     *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next);
 
-    thread_id = PyLong_FromLongLong(
-        GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id));
+    thread_id = PyLong_FromLongLong(tid);
     if (thread_id == NULL) {
         set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID");
         goto error;
@@ -2505,8 +2719,16 @@ unwind_stack_for_thread(
         goto error;
     }
 
-    PyStructSequence_SetItem(result, 0, thread_id);  // Steals reference
-    PyStructSequence_SetItem(result, 1, frame_info); // Steals reference
+    PyObject *py_status = PyLong_FromLong(status);
+    if (py_status == NULL) {
+        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread status");
+        goto error;
+    }
+    PyErr_Print();
+
+    PyStructSequence_SetItem(result, 0, thread_id);
+    PyStructSequence_SetItem(result, 1, py_status);  // Steals reference
+    PyStructSequence_SetItem(result, 2, frame_info); // Steals reference
 
     cleanup_stack_chunks(&chunks);
     return result;
@@ -2537,7 +2759,9 @@ _remote_debugging.RemoteUnwinder.__init__
     *
     all_threads: bool = False
     only_active_thread: bool = False
+    mode: int = 0
     debug: bool = False
+    skip_non_matching_threads: bool = True
 
 Initialize a new RemoteUnwinder object for debugging a remote Python process.
 
@@ -2546,9 +2770,12 @@ Initialize a new RemoteUnwinder object for debugging a remote Python process.
     all_threads: If True, initialize state for all threads in the process.
                 If False, only initialize for the main thread.
     only_active_thread: If True, only sample the thread holding the GIL.
+    mode: Profiling mode: 0=WALL (wall-time), 1=CPU (cpu-time), 2=GIL (gil-time).
                        Cannot be used together with all_threads=True.
     debug: If True, chain exceptions to explain the sequence of events that
            lead to the exception.
+    skip_non_matching_threads: If True, skip threads that don't match the selected mode.
+                              If False, include all threads regardless of mode.
 
 The RemoteUnwinder provides functionality to inspect and debug a running Python
 process, including examining thread states, stack frames and other runtime data.
@@ -2564,8 +2791,9 @@ static int
 _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
                                                int pid, int all_threads,
                                                int only_active_thread,
-                                               int debug)
-/*[clinic end generated code: output=13ba77598ecdcbe1 input=cfc21663fbe263c4]*/
+                                               int mode, int debug,
+                                               int skip_non_matching_threads)
+/*[clinic end generated code: output=abf5ea5cd58bcb36 input=08fb6ace023ec3b5]*/
 {
     // Validate that all_threads and only_active_thread are not both True
     if (all_threads && only_active_thread) {
@@ -2584,6 +2812,8 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
 
     self->debug = debug;
     self->only_active_thread = only_active_thread;
+    self->mode = mode;
+    self->skip_non_matching_threads = skip_non_matching_threads;
     self->cached_state = NULL;
     if (_Py_RemoteDebug_InitProcHandle(&self->handle, pid) < 0) {
         set_exception_cause(self, PyExc_RuntimeError, "Failed to initialize process handle");
@@ -2656,6 +2886,15 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
     }
 #endif
 
+#if defined(__APPLE__) 
+    self->thread_id_offset = 0;
+#endif
+
+#ifdef MS_WINDOWS
+    self->win_process_buffer = NULL;
+    self->win_process_buffer_size = 0;
+#endif
+
     return 0;
 }
 
@@ -2761,21 +3000,25 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
             goto exit;
         }
 
+        // Get the GIL holder for this interpreter (needed for GIL_WAIT logic)
+        uintptr_t gil_holder_tstate = 0;
+        int gil_locked = GET_MEMBER(int, interp_state_buffer,
+            self->debug_offsets.interpreter_state.gil_runtime_state_locked);
+        if (gil_locked) {
+            gil_holder_tstate = (uintptr_t)GET_MEMBER(PyThreadState*, interp_state_buffer,
+                self->debug_offsets.interpreter_state.gil_runtime_state_holder);
+        }
+
         uintptr_t current_tstate;
         if (self->only_active_thread) {
             // Find the GIL holder for THIS interpreter
-            int gil_locked = GET_MEMBER(int, interp_state_buffer,
-                self->debug_offsets.interpreter_state.gil_runtime_state_locked);
-
             if (!gil_locked) {
                 // This interpreter's GIL is not locked, skip it
                 Py_DECREF(interpreter_threads);
                 goto next_interpreter;
             }
 
-            // Get the GIL holder for this interpreter
-            current_tstate = (uintptr_t)GET_MEMBER(PyThreadState*, interp_state_buffer,
-                self->debug_offsets.interpreter_state.gil_runtime_state_holder);
+            current_tstate = gil_holder_tstate;
         } else if (self->tstate_addr == 0) {
             // Get all threads for this interpreter
             current_tstate = GET_MEMBER(uintptr_t, interp_state_buffer,
@@ -2786,8 +3029,14 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
         }
 
         while (current_tstate != 0) {
-            PyObject* frame_info = unwind_stack_for_thread(self, &current_tstate);
+            PyObject* frame_info = unwind_stack_for_thread(self, &current_tstate, gil_holder_tstate);
             if (!frame_info) {
+                // Check if this was an intentional skip due to mode-based filtering
+                if ((self->mode == PROFILING_MODE_CPU || self->mode == PROFILING_MODE_GIL) && !PyErr_Occurred()) {
+                    // Thread was skipped due to mode filtering, continue to next thread
+                    continue;
+                }
+                // This was an actual error
                 Py_DECREF(interpreter_threads);
                 set_exception_cause(self, PyExc_RuntimeError, "Failed to unwind stack for thread");
                 Py_CLEAR(result);
@@ -3038,6 +3287,12 @@ RemoteUnwinder_dealloc(PyObject *op)
     if (self->code_object_cache) {
         _Py_hashtable_destroy(self->code_object_cache);
     }
+#ifdef MS_WINDOWS
+    if(self->win_process_buffer != NULL) {
+        PyMem_Free(self->win_process_buffer);
+    }
+#endif
+
 #ifdef Py_GIL_DISABLED
     if (self->tlbc_cache) {
         _Py_hashtable_destroy(self->tlbc_cache);
diff --git a/Modules/clinic/_remote_debugging_module.c.h b/Modules/clinic/_remote_debugging_module.c.h
index 9bfcdde407fe3c..7dd54e3124887b 100644
--- a/Modules/clinic/_remote_debugging_module.c.h
+++ b/Modules/clinic/_remote_debugging_module.c.h
@@ -11,7 +11,7 @@ preserve
 
 PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__,
 "RemoteUnwinder(pid, *, all_threads=False, only_active_thread=False,\n"
-"               debug=False)\n"
+"               mode=0, debug=False, skip_non_matching_threads=True)\n"
 "--\n"
 "\n"
 "Initialize a new RemoteUnwinder object for debugging a remote Python process.\n"
@@ -21,9 +21,12 @@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__,
 "    all_threads: If True, initialize state for all threads in the process.\n"
 "                If False, only initialize for the main thread.\n"
 "    only_active_thread: If True, only sample the thread holding the GIL.\n"
+"    mode: Profiling mode: 0=WALL (wall-time), 1=CPU (cpu-time), 2=GIL (gil-time).\n"
 "                       Cannot be used together with all_threads=True.\n"
 "    debug: If True, chain exceptions to explain the sequence of events that\n"
 "           lead to the exception.\n"
+"    skip_non_matching_threads: If True, skip threads that don\'t match the selected mode.\n"
+"                              If False, include all threads regardless of mode.\n"
 "\n"
 "The RemoteUnwinder provides functionality to inspect and debug a running Python\n"
 "process, including examining thread states, stack frames and other runtime data.\n"
@@ -38,7 +41,8 @@ static int
 _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
                                                int pid, int all_threads,
                                                int only_active_thread,
-                                               int debug);
+                                               int mode, int debug,
+                                               int skip_non_matching_threads);
 
 static int
 _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObject *kwargs)
@@ -46,7 +50,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
     int return_value = -1;
     #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
 
-    #define NUM_KEYWORDS 4
+    #define NUM_KEYWORDS 6
     static struct {
         PyGC_Head _this_is_not_used;
         PyObject_VAR_HEAD
@@ -55,7 +59,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
     } _kwtuple = {
         .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
         .ob_hash = -1,
-        .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(debug), },
+        .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(mode), &_Py_ID(debug), &_Py_ID(skip_non_matching_threads), },
     };
     #undef NUM_KEYWORDS
     #define KWTUPLE (&_kwtuple.ob_base.ob_base)
@@ -64,21 +68,23 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
     #  define KWTUPLE NULL
     #endif  // !Py_BUILD_CORE
 
-    static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "debug", NULL};
+    static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "mode", "debug", "skip_non_matching_threads", NULL};
     static _PyArg_Parser _parser = {
         .keywords = _keywords,
         .fname = "RemoteUnwinder",
         .kwtuple = KWTUPLE,
     };
     #undef KWTUPLE
-    PyObject *argsbuf[4];
+    PyObject *argsbuf[6];
     PyObject * const *fastargs;
     Py_ssize_t nargs = PyTuple_GET_SIZE(args);
     Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 1;
     int pid;
     int all_threads = 0;
     int only_active_thread = 0;
+    int mode = 0;
     int debug = 0;
+    int skip_non_matching_threads = 1;
 
     fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser,
             /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
@@ -110,12 +116,30 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
             goto skip_optional_kwonly;
         }
     }
-    debug = PyObject_IsTrue(fastargs[3]);
-    if (debug < 0) {
+    if (fastargs[3]) {
+        mode = PyLong_AsInt(fastargs[3]);
+        if (mode == -1 && PyErr_Occurred()) {
+            goto exit;
+        }
+        if (!--noptargs) {
+            goto skip_optional_kwonly;
+        }
+    }
+    if (fastargs[4]) {
+        debug = PyObject_IsTrue(fastargs[4]);
+        if (debug < 0) {
+            goto exit;
+        }
+        if (!--noptargs) {
+            goto skip_optional_kwonly;
+        }
+    }
+    skip_non_matching_threads = PyObject_IsTrue(fastargs[5]);
+    if (skip_non_matching_threads < 0) {
         goto exit;
     }
 skip_optional_kwonly:
-    return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, debug);
+    return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, mode, debug, skip_non_matching_threads);
 
 exit:
     return return_value;
@@ -297,4 +321,4 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace(PyObject *self, PyObject
 
     return return_value;
 }
-/*[clinic end generated code: output=2ba15411abf82c33 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=2caefeddf7683d32 input=a9049054013a1b77]*/
diff --git a/PCbuild/_remote_debugging.vcxproj b/PCbuild/_remote_debugging.vcxproj
index c55f2908e03d33..a01905fdf2f437 100644
--- a/PCbuild/_remote_debugging.vcxproj
+++ b/PCbuild/_remote_debugging.vcxproj
@@ -92,6 +92,11 @@
   <PropertyGroup>
     <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
   </PropertyGroup>
+  <ItemDefinitionGroup>
+  <Link>
+    <AdditionalDependencies>ntdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+  </Link>
+</ItemDefinitionGroup>
   <ItemGroup>
     <ClCompile Include="..\Modules\_remote_debugging_module.c" />
   </ItemGroup>