Skip to content

Commit 943e684

Browse files
authored
Model breakdown - device utilization (#61)
* added logic for sending gpu utilization data
* added tensor core usage
* change logging output
* changed ignored file names
* changed gitignore
* created disconnection function
* review changes
* deleted commented code
* changed logic for gpu time calculations
* added warm-up step for pytorch profiling
* added message to error_message
* added multiprocess
* Fixed launching deepview from command line
1 parent b5e4f51 commit 943e684

File tree

10 files changed

+1020
-222
lines changed

10 files changed

+1020
-222
lines changed

deepview_profile/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,5 @@
99
__name__ = package["tool"]["poetry"]["name"]
1010
__version__ = package["tool"]["poetry"]["version"]
1111
__description__ = package["tool"]["poetry"]["description"]
12+
13+
from .__main__ import main

deepview_profile/analysis/request_manager.py

Lines changed: 40 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import math
33
import time
44
from concurrent.futures import ThreadPoolExecutor
5-
5+
import torch.multiprocessing as mp
66
from deepview_profile.analysis.runner import analyze_project
77
from deepview_profile.exceptions import AnalysisError
88
from deepview_profile.nvml import NVML
@@ -24,6 +24,7 @@ def __init__(self, enqueue_response, message_sender, connection_manager):
2424
self._message_sender = message_sender
2525
self._connection_manager = connection_manager
2626
self._nvml = NVML()
27+
mp.set_start_method("spawn")
2728

2829
def start(self):
2930
self._nvml.start()
@@ -52,18 +53,13 @@ def _handle_analysis_request(self, analysis_request, context):
5253
context.sequence_number,
5354
*(context.address),
5455
)
55-
connection = self._connection_manager.get_connection(context.address)
56+
connection = self._connection_manager.get_connection(
57+
context.address)
5658
analyzer = analyze_project(
5759
connection.project_root, connection.entry_point, self._nvml)
5860

5961
# Abort early if the connection has been closed
60-
if not context.state.connected:
61-
logger.error(
62-
'Aborting request %d from (%s:%d) early '
63-
'because the client has disconnected.',
64-
context.sequence_number,
65-
*(context.address),
66-
)
62+
if self._early_disconnection_error(context):
6763
return
6864

6965
breakdown = next(analyzer)
@@ -73,13 +69,7 @@ def _handle_analysis_request(self, analysis_request, context):
7369
context,
7470
)
7571

76-
if not context.state.connected:
77-
logger.error(
78-
'Aborting request %d from (%s:%d) early '
79-
'because the client has disconnected.',
80-
context.sequence_number,
81-
*(context.address),
82-
)
72+
if self._early_disconnection_error(context):
8373
return
8474

8575
throughput = next(analyzer)
@@ -90,13 +80,7 @@ def _handle_analysis_request(self, analysis_request, context):
9080
)
9181

9282
# send habitat response
93-
if not context.state.connected:
94-
logger.error(
95-
'Aborting request %d from (%s:%d) early '
96-
'because the client has disconnected.',
97-
context.sequence_number,
98-
*(context.address),
99-
)
83+
if self._early_disconnection_error(context):
10084
return
10185

10286
habitat_resp = next(analyzer)
@@ -106,14 +90,19 @@ def _handle_analysis_request(self, analysis_request, context):
10690
context,
10791
)
10892

93+
# send utilization data
94+
if self._early_disconnection_error(context):
95+
return
96+
97+
utilization_resp = next(analyzer)
98+
self._enqueue_response(
99+
self._send_utilization_response,
100+
utilization_resp,
101+
context
102+
)
103+
109104
# send energy response
110-
if not context.state.connected:
111-
logger.error(
112-
'Aborting request %d from (%s:%d) early '
113-
'because the client has disconnected.',
114-
context.sequence_number,
115-
*(context.address),
116-
)
105+
if self._early_disconnection_error(context):
117106
return
118107

119108
energy_resp = next(analyzer)
@@ -198,3 +187,24 @@ def _send_energy_response(self, energy_resp, context):
198187
except Exception:
199188
logger.exception(
200189
'Exception occurred when sending an energy response.')
190+
191+
def _send_utilization_response(self, utilization_resp, context):
192+
# Called from the main executor. Do not call directly!
193+
try:
194+
self._message_sender.send_utilization_response(
195+
utilization_resp, context)
196+
except Exception:
197+
logger.exception(
198+
'Exception occurred when sending utilization response.')
199+
200+
def _early_disconnection_error(self, context):
201+
if not context.state.connected:
202+
logger.error(
203+
'Aborting request %d from (%s:%d) early '
204+
'because the client has disconnected.',
205+
context.sequence_number,
206+
*(context.address),
207+
)
208+
return True
209+
210+
return False

deepview_profile/analysis/runner.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,31 @@
11
import argparse
22
import logging
33
import os
4-
5-
import torch
64
from deepview_profile.analysis.session import AnalysisSession
75
from deepview_profile.nvml import NVML
86
from deepview_profile.utils import release_memory
97

8+
109
def analyze_project(project_root, entry_point, nvml):
11-
release_memory()
1210
session = AnalysisSession.new_from(project_root, entry_point)
11+
release_memory()
12+
13+
print("analyze_project: running measure_breakdown()")
1314
yield session.measure_breakdown(nvml)
1415
release_memory()
16+
17+
print("analyze_project: running measure_throughput()")
1518
yield session.measure_throughput()
1619
release_memory()
1720

1821
print("analyze_project: running deepview_predict()")
1922
yield session.habitat_predict()
2023
release_memory()
2124

25+
print("analyze_project: running measure_utilization()")
26+
yield session.measure_utilization()
27+
release_memory()
28+
2229
print("analyze_project: running energy_compute()")
2330
yield session.energy_compute()
2431
release_memory()

0 commit comments

Comments
 (0)