first naive attempt

carstenbauer · carstenbauer · commit c24234d3a973 · 2022-03-01T21:08:56.000+01:00
diff --git a/src/device/intrinsics/wmma.jl b/src/device/intrinsics/wmma.jl
@@ -14,6 +14,7 @@ const map_ptx_to_jl_array = Dict(
                                  "s8"  => Int8,
                                  "s32" => Int32,
                                  "f16" => Float16,
+                                 "tf32" => Float32,
                                  "f32" => Float32
                                 )
 
@@ -23,6 +24,7 @@ const map_ptx_to_jl_frag = Dict(
                                 "s8"  => UInt32,
                                 "s32" => Int32,
                                 "f16" => NTuple{2, VecElement{Float16}},
+                                "tf32" => Float32,
                                 "f32" => Float32
                                )
 
@@ -40,6 +42,8 @@ const map_frag_sizes = Dict(
                             "a.f16.m16n16k16" => 8,
                             "a.f16.m8n32k16"  => 8,
                             "a.f16.m32n8k16"  => 8,
+                            
+                            "a.tf32.m16n16k8" => 8,
                             # B
                             "b.u8.m16n16k16"  => 2,
                             "b.u8.m8n32k16"   => 4,
@@ -52,6 +56,8 @@ const map_frag_sizes = Dict(
                             "b.f16.m16n16k16" => 8,
                             "b.f16.m8n32k16"  => 8,
                             "b.f16.m32n8k16"  => 8,
+
+                            "b.tf32.m16n16k8" => 8,
                             # C                            
                             "c.s32.m16n16k16" => 8,
                             "c.s32.m8n32k16"  => 8,
@@ -64,6 +70,8 @@ const map_frag_sizes = Dict(
                             "c.f32.m16n16k16" => 8,
                             "c.f32.m8n32k16"  => 8,
                             "c.f32.m32n8k16"  => 8,
+
+                            "c.f32.m16n16k8"  => 8,
                             # D
                             "d.s32.m16n16k16" => 8,
                             "d.s32.m8n32k16"  => 8,
@@ -76,6 +84,8 @@ const map_frag_sizes = Dict(
                             "d.f32.m16n16k16" => 8,
                             "d.f32.m8n32k16"  => 8,
                             "d.f32.m32n8k16"  => 8,
+
+                            "d.f32.m16n16k8"  => 8,
                            )
 
 # Maps PTX AS to CUDA.AS
@@ -87,6 +97,10 @@ const map_ptx_as_to_as_ty = Dict(
 
 # Valid WMMA Operation configurations: Shape (M,N,K), Matrix, Element Type
 
+# TF32-Precision Floating Point
+const ldst_tf32_ab_ops = [(16,16,8)], ["a", "b"], ["tf32"]
+const ldst_tf32_cd_ops = [(16,16,8)], ["c", "d"], ["f32"]
+const wmma_tf32_ops    = [(16,16,8)], ["tf32"], ["f32"], ["f32"]
 # Half-Precision Floating Point
 const ldst_half_ab_ops = [(16,16,16), (32,8,16), (8,32,16)], ["a", "b"], ["f16"]
 const ldst_half_cd_ops = [(16,16,16), (32,8,16), (8,32,16)], ["c", "d"], ["f16", "f32"]
@@ -97,11 +111,12 @@ const ldst_int_cd_ops = [(16,16,16), (32,8,16), (8,32,16)], ["c", "d"], ["s32"]
 const wmma_int_ops    = [(16,16,16), (32,8,16), (8,32,16)], ["s8", "u8"], ["s32"], ["s32"]
 
 const all_ldst_ops = vcat(ldst_half_ab_ops, ldst_half_cd_ops,
-                          ldst_int_ab_ops,  ldst_int_cd_ops)
-const all_wmma_ops = vcat(wmma_half_ops, wmma_int_ops)
+                          ldst_int_ab_ops,  ldst_int_cd_ops,
+                          ldst_tf32_ab_ops, ldst_tf32_cd_ops)
+const all_wmma_ops = vcat(wmma_half_ops, wmma_int_ops, wmma_tf32_ops)
 
 # Valid WMMA operation shapes
-const valid_shapes = [(16, 16, 16), (32, 8, 16), (8, 32, 16)]
+const valid_shapes = [(16, 16, 16), (32, 8, 16), (8, 32, 16), (16,16,8)]
 
 ################################################################################
 # HELPER FUNCTIONS