Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
a109619
Apply Hashem's patch
avbokovoy Sep 30, 2024
bade572
Add missing header files for compiling.
xinyazhang May 22, 2024
c4e350d
Enable HIP Performance GPU kernel and fix compiling errors
xinyazhang May 29, 2024
695b7c5
Only enable fp32 cache_t, and leave fp16/bf16 cache_t as TODO.
xinyazhang May 29, 2024
b19af6b
Note weight_decay problems
xinyazhang Jun 10, 2024
05b49aa
Move files under codegen/training/backward/rocm/ to include/fbgemm_gp…
xinyazhang Jun 10, 2024
13d018c
Add the missing file
xinyazhang Jun 18, 2024
495acec
[Nobag support] Add experimental nobag support
avbokovoy Jun 14, 2024
649f6f1
Fix wrong num_tables initialization
avbokovoy Jun 17, 2024
f57307f
Remove debug output
avbokovoy Jul 2, 2024
336c034
Revert "Add the missing file"
xinyazhang Jun 20, 2024
925a922
support weight decay mode
xinyazhang Jun 20, 2024
5329442
Unify weight_decay_mode to int32_t
xinyazhang Jun 20, 2024
e366ef4
Fix the compiling error.
xinyazhang Jul 18, 2024
3d9dcf5
Fix weight decay integration problem.
xinyazhang Jul 22, 2024
31ba8e2
Clean up new perf kernel (#64)
avbokovoy Aug 8, 2024
8245526
Reorganize wave_reduce function. Rework bit_cast
avbokovoy Aug 9, 2024
856e1f9
Enable fp16 cache type
avbokovoy Aug 9, 2024
0c09c00
Switch to builtin bit_cast to avoid c++20 compilation issues
avbokovoy Aug 9, 2024
7e7f4ca
Fix operation specification in DPP macro
avbokovoy Aug 9, 2024
67d722b
Return missing broadcast operations
avbokovoy Aug 9, 2024
fcd5f58
Account for NRVO in pack function
avbokovoy Aug 12, 2024
f0434d1
Remove redundant cache type check
avbokovoy Aug 13, 2024
631b247
Fix synchronization issue in dpp_reduction
avbokovoy Aug 15, 2024
494400f
Add new bwd kernel unit test
avbokovoy Aug 14, 2024
ec8d429
Fix bwd kernel grid size
avbokovoy Aug 14, 2024
d6fba37
Fix weighted mode
avbokovoy Aug 14, 2024
b8a24b0
Fix whitespaces
avbokovoy Aug 14, 2024
fbfae97
Disable unsupported weight decay modes
avbokovoy Aug 15, 2024
e96e875
Disable nobag test
avbokovoy Aug 15, 2024
e190ff6
Fix wrong reduce_op usage
avbokovoy Aug 15, 2024
bf63532
Address new index packing in bag mode
avbokovoy Aug 15, 2024
9aa3829
Try to unify CPU kernel call with current approach
avbokovoy Aug 21, 2024
e007de4
Fix grid and block size when optimized kernel called
avbokovoy Aug 21, 2024
3d08fcd
Add 160 embedding size support
avbokovoy Aug 26, 2024
9c18707
Update bwd template generation
avbokovoy Oct 4, 2024
d80d6f0
Fix logging in benchmarks
q10 Sep 18, 2024
8752bd3
Mute debug output
avbokovoy Oct 4, 2024
92d768a
Fix grid size for optimized bwd kernel
avbokovoy Oct 10, 2024
bbecd7c
Rework hip bwd kernel instantiation
avbokovoy Oct 10, 2024
5387fa3
Rename iterators + minor fwd kernel clean-up
avbokovoy Oct 10, 2024
ca8473d
Add experimental Vec2 support
avbokovoy Oct 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion fbgemm_gpu/bench/histogram_binning_calibration_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
import torch
from torch import Tensor

logging.basicConfig(level=logging.DEBUG)
logger: logging.Logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

try:
# pyre-ignore[21]
Expand Down
3 changes: 2 additions & 1 deletion fbgemm_gpu/bench/jagged_tensor_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
import torch
from torch.profiler import profile

logging.basicConfig(level=logging.DEBUG)
logger: logging.Logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
open_source: bool = getattr(fbgemm_gpu, "open_source", False)
Expand Down
3 changes: 3 additions & 0 deletions fbgemm_gpu/bench/merge_embeddings_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
# pyre-fixme[21]: Could not find name `ProfilerActivity` in `torch.profiler`.
from torch.profiler import profile, ProfilerActivity

logger: logging.Logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
open_source: bool = getattr(fbgemm_gpu, "open_source", False)

Expand Down
4 changes: 2 additions & 2 deletions fbgemm_gpu/bench/quantize_ops_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
# pyre-ignore[21]
from torch.profiler import profile, ProfilerActivity


logging.basicConfig(level=logging.DEBUG)
logger: logging.Logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
open_source: bool = getattr(fbgemm_gpu, "open_source", False)
Expand Down
3 changes: 2 additions & 1 deletion fbgemm_gpu/bench/sparse_ops_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@

from torch.profiler import profile

logging.basicConfig(level=logging.DEBUG)
logger: logging.Logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
open_source: bool = getattr(fbgemm_gpu, "open_source", False)
Expand Down
3 changes: 2 additions & 1 deletion fbgemm_gpu/bench/split_embeddings_cache_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@

from torch import nn, Tensor

logging.basicConfig(level=logging.DEBUG)
logger: logging.Logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

try:
# pyre-ignore[21]
Expand Down
3 changes: 3 additions & 0 deletions fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@
from torch import Tensor
from torch.profiler import profile

logger: logging.Logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

haveAIBench = False
try:
from aibench_observer.utils.observer import emitMetric
Expand Down
5 changes: 2 additions & 3 deletions fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,13 @@
from torch.autograd.profiler import record_function
from torch.profiler import profile

logging.basicConfig(level=logging.DEBUG)
logger: logging.Logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

load_torch_module(
"//deeplearning/fbgemm/fbgemm_gpu:ssd_split_table_batched_embeddings",
)

logging.basicConfig(level=logging.DEBUG)


@click.group()
def cli() -> None:
Expand Down
3 changes: 2 additions & 1 deletion fbgemm_gpu/bench/stride_gemm_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
import torch
from fbgemm_gpu.bench.bench_utils import benchmark_torch_function

logging.basicConfig(level=logging.DEBUG)
logger: logging.Logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

try:
# pyre-ignore[21]
Expand Down
22 changes: 22 additions & 0 deletions fbgemm_gpu/codegen/genscript/generate_backward_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,26 @@ def generate_backward_indices() -> None:
)

@staticmethod
def generate_rocm_backward_split(**kwargs: Any) -> None:
    """Render the ROCm (HIP) backward split device-kernel template.

    The renderer is invoked with an empty optimizer string and a fixed set
    of template flags. ``kwargs`` is accepted but is not forwarded to the
    render call.

    NOTE(review): the original comment says kernels are generated per
    weighted / VBE / no-bag combination; that expansion presumably happens
    inside ``render_backward_templates`` -- confirm against its definition.
    """
    # Fixed flags handed to the template renderer for the HIP kernel.
    render_flags = {
        "has_gpu_support": True,
        "has_vbe_support": False,
        "has_ssd_support": False,
        "dense": False,
        "gen_once": False,
    }
    hip_template = (
        "training/backward/rocm/embedding_backward_split_device_kernel_template.hip"
    )
    output_name_format = "{}gen_embedding_backward_{}_device_kernel_hip.hip"

    BackwardSplitGenerator.render_backward_templates(
        hip_template,
        "",
        output_name_format,
        render_flags,
    )

def generate_python_sources(
all_optimizers: List[str], ssd_optimizers: List[str]
) -> None:
Expand Down Expand Up @@ -370,6 +390,8 @@ def generate() -> None:
BackwardSplitGenerator.generate_backward_split(
ssd_tensors=ssd_tensors, **optimizer
)
# TODO: only generate the ROCm backward split kernels when building for ROCm (is_rocm)
BackwardSplitGenerator.generate_rocm_backward_split(**optimizer)

# Generate common device kernels for backwards
BackwardSplitGenerator.generate_backward_device()
Expand Down
Loading