Skip to content

Commit 45d6cec

Browse files
committed
Optimize AMD GFX906 flash attention with DS_SWIZZLE instrinsics Author: iacopPBK <[email protected]>
1 parent d72f5f7 commit 45d6cec

File tree

6 files changed

+663
-564
lines changed

6 files changed

+663
-564
lines changed

README.md

Lines changed: 128 additions & 561 deletions
Large diffs are not rendered by default.

SCRIPT_compile_MI50.sh

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
#!/bin/bash
2+
#
3+
# SCRIPT MI50 Compilation Script for llama.cpp
4+
# Optimized build for AMD MI50 (gfx906) with ROCm/HIP support
5+
#
6+
# This script compiles llama.cpp with maximum optimizations for the MI50 GPU
7+
# including server support, flash attention, and all performance features
8+
#
9+
10+
set -e # Exit on any error
11+
12+
# Colors for output
13+
RED='\033[0;31m'
14+
GREEN='\033[0;32m'
15+
YELLOW='\033[1;33m'
16+
BLUE='\033[0;34m'
17+
NC='\033[0m' # No Color
18+
19+
echo -e "${BLUE}======================================${NC}"
20+
echo -e "${BLUE} SCRIPT MI50 llama.cpp Builder ${NC}"
21+
echo -e "${BLUE}======================================${NC}"
22+
23+
# Check if we're in the right directory
24+
if [[ ! -f "CMakeLists.txt" ]]; then
25+
echo -e "${RED}Error: Not in llama.cpp root directory${NC}"
26+
echo "Please run this script from the llama.cpp root directory"
27+
exit 1
28+
fi
29+
30+
# Verify ROCm installation
31+
echo -e "${YELLOW}Checking ROCm installation...${NC}"
32+
if ! command -v rocm_agent_enumerator &> /dev/null; then
33+
echo -e "${RED}Error: ROCm not found. Please install ROCm first.${NC}"
34+
exit 1
35+
fi
36+
37+
# Check for gfx906 support
38+
GPUS=$(rocm_agent_enumerator)
39+
if [[ ! "$GPUS" =~ "gfx906" ]]; then
40+
echo -e "${RED}Warning: gfx906 (MI50) not detected in system${NC}"
41+
echo "Available GPUs: $GPUS"
42+
read -p "Continue anyway? (y/N): " -n 1 -r
43+
echo
44+
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
45+
exit 1
46+
fi
47+
fi
48+
49+
echo -e "${GREEN}✓ ROCm installation verified${NC}"
50+
echo -e "${GREEN}✓ Available GPUs: $GPUS${NC}"
51+
52+
# Set ROCm environment variables for optimal gfx906 compilation
53+
echo -e "${YELLOW}Setting ROCm environment variables for gfx906...${NC}"
54+
export ROCM_PATH=${ROCM_PATH:-/opt/rocm}
55+
export HCC_AMDGPU_TARGET=gfx906
56+
export HSA_OVERRIDE_GFX_VERSION=9.0.6
57+
export AMDGPU_TARGETS=gfx906
58+
export GPU_TARGETS=gfx906
59+
60+
# Clean previous build
61+
echo -e "${YELLOW}Cleaning previous build...${NC}"
62+
rm -rf build
63+
mkdir -p build
64+
65+
# Configure with maximum optimizations
66+
echo -e "${YELLOW}Configuring CMake with MI50 optimizations...${NC}"
67+
cd build
68+
69+
cmake .. \
70+
-DCMAKE_BUILD_TYPE=Release \
71+
-DCMAKE_C_COMPILER=gcc \
72+
-DCMAKE_CXX_COMPILER=g++ \
73+
-DCMAKE_HIP_COMPILER_FORCED=1 \
74+
-DCMAKE_HIP_ARCHITECTURES=gfx906 \
75+
-DCMAKE_C_FLAGS="-O3 -march=native -mtune=native -DNDEBUG -ffast-math -fno-finite-math-only -ffp-contract=fast" \
76+
-DCMAKE_CXX_FLAGS="-O3 -march=native -mtune=native -DNDEBUG -DGGML_HIP_GFX906_OPTIMIZED -ffast-math -fno-finite-math-only -ffp-contract=fast" \
77+
-DCMAKE_HIP_FLAGS=" --offload-arch=gfx906 -DGGML_HIP_GFX906_OPTIMIZED -Wno-ignored-attributes -Wno-cuda-compat -Wno-unused-result -mllvm -amdgpu-simplify-libcall -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-enable-lower-module-lds=false -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -ffast-math -ffp-contract=fast" \
78+
-DGGML_HIP=ON \
79+
-DGGML_HIP_MMQ_MFMA=ON \
80+
-DGGML_HIP_GRAPHS=ON \
81+
-DGGML_HIP_NO_VMM=ON \
82+
-DGGML_HIP_EXPORT_METRICS=ON \
83+
-DGGML_HIP_GFX906_OPTIMIZED=ON \
84+
-DGGML_NATIVE=ON \
85+
-DGGML_CUDA_FA=ON \
86+
-DGGML_CUDA_FA_ALL_QUANTS=ON \
87+
-DGGML_CUDA_FORCE_MMQ=OFF \
88+
-DGGML_CUDA_FORCE_CUBLAS=OFF \
89+
-DGGML_CUDA_NO_PEER_COPY=ON \
90+
-DLLAMA_BUILD_SERVER=ON \
91+
-DLLAMA_BUILD_EXAMPLES=ON \
92+
-DLLAMA_BUILD_TOOLS=ON \
93+
-DLLAMA_BUILD_TESTS=OFF \
94+
-DLLAMA_CURL=ON \
95+
-DLLAMA_STATIC=OFF
96+
97+
if [[ $? -ne 0 ]]; then
98+
echo -e "${RED}✗ CMake configuration failed${NC}"
99+
exit 1
100+
fi
101+
102+
echo -e "${GREEN}✓ CMake configuration successful${NC}"
103+
104+
# Compile with all CPU cores and dump detailed logs
105+
NPROC=$(nproc)
106+
LOG_FILE="compilation_log.txt"
107+
echo -e "${YELLOW}Compiling with $NPROC cores...${NC}"
108+
echo -e "${YELLOW}This may take several minutes...${NC}"
109+
echo -e "${YELLOW}Detailed compilation log will be saved to: $LOG_FILE${NC}"
110+
111+
# Clear previous log
112+
> $LOG_FILE
113+
114+
# Run make with detailed output and save to log file
115+
make -j$NPROC 2>&1 | tee $LOG_FILE
116+
117+
if [[ ${PIPESTATUS[0]} -ne 0 ]]; then
118+
echo -e "${RED}✗ Compilation failed${NC}"
119+
echo -e "${RED}Check $LOG_FILE for detailed error information${NC}"
120+
exit 1
121+
fi
122+
123+
echo -e "${GREEN}✓ Compilation successful!${NC}"
124+
125+
# Verify the build
126+
echo -e "${YELLOW}Verifying build...${NC}"
127+
128+
# Check if main executables were built
129+
EXECUTABLES=(
130+
"bin/llama-cli"
131+
"bin/llama-server"
132+
"bin/llama-bench"
133+
"bin/libggml-hip.so"
134+
)
135+
136+
ALL_GOOD=true
137+
for exec in "${EXECUTABLES[@]}"; do
138+
if [[ -f "$exec" ]]; then
139+
echo -e "${GREEN}$exec built successfully${NC}"
140+
141+
# Check HIP linking for executables (not libraries)
142+
if [[ "$exec" =~ ^bin/llama- && ! "$exec" =~ \.so$ ]]; then
143+
if ldd "$exec" | grep -q "libggml-hip.so"; then
144+
echo -e "${GREEN} ✓ HIP backend linked${NC}"
145+
else
146+
echo -e "${RED} ✗ HIP backend not linked${NC}"
147+
ALL_GOOD=false
148+
fi
149+
fi
150+
else
151+
echo -e "${RED}$exec not found${NC}"
152+
ALL_GOOD=false
153+
fi
154+
done
155+
156+
if [[ "$ALL_GOOD" = false ]]; then
157+
echo -e "${RED}✗ Build verification failed${NC}"
158+
exit 1
159+
fi
160+
161+
# Display ROCm libraries linked
162+
echo -e "${YELLOW}ROCm libraries linked:${NC}"
163+
ldd bin/llama-cli | grep -E "(hip|roc)" | head -5
164+
165+
# Quick functionality test
166+
echo -e "${YELLOW}Testing HIP backend availability...${NC}"
167+
if ./bin/llama-cli --help 2>/dev/null | grep -q "backend"; then
168+
echo -e "${GREEN}✓ llama-cli responding correctly${NC}"
169+
else
170+
echo -e "${RED}✗ llama-cli test failed${NC}"
171+
fi
172+
173+
# Success message
174+
echo
175+
echo -e "${GREEN}======================================${NC}"
176+
echo -e "${GREEN} ✓ BUILD COMPLETED SUCCESSFULLY ${NC}"
177+
echo -e "${GREEN}======================================${NC}"
178+
echo
179+
echo -e "${BLUE}Built executables:${NC}"
180+
echo " • CLI: ./build/bin/llama-cli"
181+
echo " • Server: ./build/bin/llama-server"
182+
echo " • Bench: ./build/bin/llama-bench"
183+
echo
184+
echo -e "${BLUE}Optimizations enabled:${NC}"
185+
echo " • Target GPU: AMD MI50 (gfx906)"
186+
echo " • HIP/ROCm backend with MFMA support"
187+
echo " • Flash Attention kernels"
188+
echo " • All quantization formats"
189+
echo " • Performance metrics export"
190+
echo " • Native CPU optimizations"
191+
echo " • Optimization 5: GFX906 compiler flags (-ffast-math, early-inline, function-calls=false)"
192+
echo
193+
echo -e "${BLUE}Ready to run:${NC}"
194+
echo " ./SCRIPT_launch_server_MI50.sh <model.gguf>"
195+
echo
196+
echo -e "${YELLOW}Note: Make sure to set proper ROCm environment variables before running!${NC}"
197+
echo
198+
echo -e "${BLUE}For debugging with maximum HIP logging:${NC}"
199+
echo " export AMD_LOG_LEVEL=8"
200+
echo " export AMD_LOG_MASK=0xFFFFFFFF"
201+
echo " export AMD_SERIALIZE_KERNEL=3"
202+
echo " ./SCRIPT_launch_server_MI50.sh <model.gguf> 2>&1 | tee hip_debug.log"

SCRIPT_launch_server_MI50.sh

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
#!/bin/bash
#
# Launch llama.cpp server with AMD MI50 ROCm support
# Built for gfx906 architecture
#
# Usage: ./SCRIPT_launch_server_MI50.sh [model_path] [additional_args...]
#

# Set ROCm environment variables for MI50 ONLY (optimal configuration)
export HSA_OVERRIDE_GFX_VERSION=9.0.6
export HIP_VISIBLE_DEVICES=0    # ONLY MI50 (Device 0)
export CUDA_VISIBLE_DEVICES=0   # Additional CUDA compatibility
export ROCR_VISIBLE_DEVICES=0   # ROCr runtime device selection
export GGML_BACKEND_HIP=1
export HCC_AMDGPU_TARGET=gfx906

# Model path: take the first positional argument when given (as the usage
# message advertises), falling back to the hardcoded default. Previously $1
# was silently ignored even though the error text documented it.
MODEL_PATH="${1:-/home/iacopo/Downloads/Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf}"

PARAMS=(
    -m "$MODEL_PATH"
    -ngl 99               # Offload all layers to GPU
    -c 32000              # Context size
    -np 1                 # Parallel requests
    -t $(nproc)           # Use all CPU threads
    --port 8090           # Server port
    --host 0.0.0.0        # Listen on all interfaces
    #--mlock              # Lock model in memory
    #--no-mmap            # Don't use memory mapping
    -b 512                # Batch size
    #--cont-batching      # Enable continuous batching
    --flash-attn on       # Enable flash attention
    --cache-type-k q8_0   # q8_0 quantized K cache (50% memory savings)
    --cache-type-v q8_0   # q8_0 quantized V cache (50% memory savings)
    --main-gpu 0          # Force MI50 as main GPU
    --device "ROCm0"      # Explicit ROCm device
    # --no-warmup         # Skip warmup for consistent profiling
)

# Check if model file exists
if [ ! -f "$MODEL_PATH" ]; then
    echo "Error: Model file not found at: $MODEL_PATH"
    echo "Usage: $0 [model_path] [additional_args...]"
    echo ""
    echo "Example: $0 ./models/llama-2-7b-chat.q4_0.gguf --ctx-size 8192"
    exit 1
fi

# Display GPU info
echo "=== ROCm GPU Information ==="
rocm-smi --showproductname --showtemp --showmeminfo --showuse --showpower
echo ""

# Launch llama.cpp server
echo "=== Launching llama.cpp server with MI50 optimization ==="
echo "Model: $MODEL_PATH"
echo "GPU: MI50 (gfx906)"
# Fixed: banner previously claimed port 8080 while PARAMS binds --port 8090.
echo "Server will be available at: http://localhost:8090"
echo "Parameters: ${PARAMS[*]} ${@:2}"
echo ""

cd "$(dirname "$0")"
./build/bin/llama-server "${PARAMS[@]}" "${@:2}"

0 commit comments

Comments
 (0)