[MLP IMPROVEMENTS] Add new sampling for bmm recordings (#57)

johncalesp · web-flow · commit 455d44d21681 · 2024-07-09T14:05:51.000-04:00
* predict linear using kernel time

* include __matmul__ in bmm predictor

* linear model mixed dataset

* reverted to test all models

* reshape __matmul__ to fit into bmm

* test reshape of __matmul__ args

* new sampling strategy for bmm

* additional verifications for bmm

* cap memory for bmm sampling

* readjust bmm mem ceil

* trained bmm mlp model with new data

* deleted temporary debug print

* restore rest of experiments

* fix bmm max memory consumption

* add more L4 samples to train bmm mlp

---------

Co-authored-by: John Calderon <john.calderon@centml.ai>
diff --git a/analyzer/habitat/analysis/predictor.py b/analyzer/habitat/analysis/predictor.py
@@ -2,6 +2,7 @@
 import logging
 import operator
 import numpy as np
+import math
 
 from habitat.analysis import SPECIAL_OPERATIONS
 from habitat.analysis.operation import PredictedOperation
@@ -116,9 +117,9 @@ def predict_operation(self, operation, dest_device, unscaled=False):
             return self._special_scale(operation, dest_device, self._conv2d_scale, unscaled)
         elif operation.name == 'lstm':
             return self._special_scale(operation, dest_device, self._lstm_scale, unscaled)
-        elif operation.name in ['linear','__matmul__']:
+        elif operation.name == 'linear':
             return self._special_scale(operation, dest_device, self._linear_scale, unscaled)
-        elif operation.name == 'bmm':
+        elif operation.name in ['bmm', '__matmul__']:
             return self._special_scale(operation, dest_device, self._bmm_scale, unscaled)
         elif operation.name == 'conv_transpose2d':
             return self._special_scale(operation, dest_device, self._conv_transpose2d_scale, unscaled)
@@ -284,6 +285,7 @@ def _linear_scale(self, operation, dest_device, unscaled=False):
         arguments = [arguments[x] for x in self.linear_pred.model.features]
 
         pred_dest = self.linear_pred.predict(arguments, dest_device.name)
+
         pred_orig = self.linear_pred.predict(arguments, operation.device.name)
 
         if unscaled:
@@ -295,18 +297,30 @@ def _linear_scale(self, operation, dest_device, unscaled=False):
         return operation.run_time_ms * pred_dest / pred_orig
 
     def _bmm_scale(self, operation, dest_device, unscaled=False):
+        # nn.Linear may call __matmul__, which in turn calls bmm, but its
+        # arguments can have more than three dimensions, e.g. [a,b,c,d].
+        # Collapse the leading dimensions so each one becomes [a*b,c,d].
+        reshape_args = []
+        for arg in operation.arguments.args:
+            if len(arg) > 3:
+                reshape_args.append([math.prod(arg[:-2]),arg[-2], arg[-1]])
+            else:
+                reshape_args.append(arg)
+        operation.arguments.args = reshape_args
+        
         merged = name_all_arguments(
             BMM_PARAMS,
             operation.arguments.args,
             operation.arguments.kwargs,
         )
-
+    
         arguments = dict(
             batch=merged['input'][0],
             left=merged['input'][1],
             middle=merged['input'][2],
             right=merged['mat2'][2],
         )
+
         arguments = [arguments[x] for x in self.bmm_pred.model.features]
 
         pred_dest = self.bmm_pred.predict(arguments, dest_device.name)
diff --git a/analyzer/habitat/data/bmm/model.pth b/analyzer/habitat/data/bmm/model.pth
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:628cd9ecca8cda59e0b5277580c996a72bae9b29bf3c5bdabccd9dfa6fc34389
+oid sha256:70c172469e8c1244e7fb53444e324bc8ac2cd4f8552e86a7e2d3444c02f43128
 size 33634474
diff --git a/tools/recording/bmm_sampled_params.pkl b/tools/recording/bmm_sampled_params.pkl
diff --git a/tools/recording/parameter_generator.py b/tools/recording/parameter_generator.py
@@ -4,10 +4,14 @@
 import sys
 import random
 from typing import Dict, List
+import psutil
 
+# Cap bmm samples at 90% of the currently available RAM (avoids out-of-memory errors while sampling)
+CURR_MEM = psutil.virtual_memory()[1]
+BMM_MEM_CEIL = int(0.9 * CURR_MEM)
 
 class main_generator:
-    "Special distribution for conv2d and linear records"
+    "Special distribution for conv2d, bmm, batch_norm, and linear"
 
     def __init__(self, ops):
 
@@ -16,6 +20,10 @@ def __init__(self, ops):
 
         if ops == "conv2d" or ops == "batch_norm":
             filename = "conv2d_sampled_params.pkl"
+
+        elif ops == "bmm":
+            filename = "bmm_sampled_params.pkl"
+
         elif ops == "linear":
             filename = "linear_sampled_params.pkl"
 
@@ -25,7 +33,7 @@ def __init__(self, ops):
         param_dict: Dict[str, int] = dict()
         dist_arr: List[List[int, int]] = []
 
-        if ops == "conv2d" or ops == "batch_norm":
+        if ops in ["conv2d", "bmm", "batch_norm"]:
             # weight by model count
             model_counts: Dict[str, int] = dict()
             for row in data:
@@ -73,7 +81,7 @@ def generate_sample(self):
                 ]
                 if round_sample[2] != 0 and round_sample[3] != 0:
                     return round_sample
-            
+
             elif self._ops == "batch_norm":
                 round_sample = [
                     self.round(sample[0][0]),  # in_channels
@@ -85,6 +93,24 @@ def generate_sample(self):
                 if round_sample[1] != 0:
                     return [round_sample[1]]
 
+            elif self._ops == "bmm":
+                round_sample = [
+                    self.round(sample[0][0]),  # bs
+                    self.round(sample[1][0]),  # left
+                    self.round(sample[2][0]),  # middle
+                    self.round(sample[3][0]),  # right
+                ]
+                # validate non-zeros
+                # check if available memory (RuntimeError DefaultCPUAllocator: can't allocate memory)
+                # 4 for FP32
+                matrix_a_size = 4 * round_sample[0] * round_sample[1] * round_sample[2]
+                matrix_b_size = 4 * round_sample[0] * round_sample[2] * round_sample[3]
+                if (
+                    np.all(round_sample)
+                    and matrix_a_size + matrix_b_size < BMM_MEM_CEIL
+                ):
+                    return round_sample
+
             elif self._ops == "linear":
                 in_features = self.round(sample[0][0])
                 out_features = self.round(sample[1][0])
diff --git a/tools/recording/record_common.py b/tools/recording/record_common.py
@@ -18,7 +18,7 @@
 Some operators such as conv2d and linear need to be sampled from a different distribution (gaussian + uniform)
 main_generator generates these new samples
 """
-SPECIAL_SAMPLING_OPS = ['conv2d','linear', 'batch_norm']
+SPECIAL_SAMPLING_OPS = ['conv2d','linear', 'batch_norm', 'bmm']
 
 class Measurer:
     def __init__(