From 317875873ef6b14316c87d011185577172be65bd Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Tue, 4 Jun 2024 13:02:22 +0200
Subject: [PATCH] feat(amdgpu): try to build in single binary
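
Install the hipblas/rocblas toolchain in the release workflow, build a
llama-cpp-hipblas gRPC server variant alongside the CUDA one, and teach
the model initializer to detect AMD GPUs at runtime and prefer the
HIPBLAS variant when its binary is present, falling back to the
existing builds otherwise.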

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 .github/workflows/release.yaml | 10 ++++++++++
 Makefile                       |  8 ++++++++
 pkg/model/initializers.go      | 20 ++++++++++++++++++--
 3 files changed, 36 insertions(+), 2 deletions(-)
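
For reviewers, below is a minimal, self-contained Go sketch of the runtime
selection that the initializers.go hunk extends. GPU discovery, logging, and
the rest of selectGRPCProcess are stubbed out; selectVariant, exists, and the
gpus slice are illustrative stand-ins rather than LocalAI's actual API. Only
the contains-"amd" check plus stat-ing the variant binary mirrors what this
patch adds (the "nvidia" branch is assumed from the surrounding context).

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

const (
	llamaCPPCUDA    = "llama-cpp-cuda"
	llamaCPPHipblas = "llama-cpp-hipblas"
)

// backendPath mirrors the helper used in the patch: each variant's
// grpc-server binary lives directly under the asset directory.
func backendPath(assetDir, backend string) string {
	return filepath.Join(assetDir, backend)
}

// exists reports whether a variant binary is actually shipped, the same
// os.Stat probe the patch performs.
func exists(p string) bool {
	_, err := os.Stat(p)
	return err == nil
}

// selectVariant scans detected GPU descriptions and returns the path of
// the first accelerated variant that is present on disk, or "" to signal
// that the caller should fall back to the CPU builds.
func selectVariant(assetDir string, gpus []string) string {
	for _, gpu := range gpus {
		desc := strings.ToLower(gpu)
		switch {
		case strings.Contains(desc, "nvidia"): // assumed from context
			if p := backendPath(assetDir, llamaCPPCUDA); exists(p) {
				return p
			}
		case strings.Contains(desc, "amd"): // new in this patch
			if p := backendPath(assetDir, llamaCPPHipblas); exists(p) {
				return p
			}
		}
	}
	return ""
}

func main() {
	gpus := []string{"Advanced Micro Devices, Inc. [AMD/ATI]"}
	fmt.Println(selectVariant("backend-assets/grpc", gpus))
}

The fallthrough is the point of the design: when no accelerated binary
matches a detected GPU, the empty result lets the caller keep the default
AVX/fallback choice, so the release build only has to ship the extra
grpc-server variant.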

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 618c81a39af..f9e734c0f5a 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -38,6 +38,15 @@ jobs:
           sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
         env:
           CUDA_VERSION: 12-3
+      - name: "Install hipBLAS"
+        run: |
+          sudo apt-get update && \
+          sudo apt-get install -y --no-install-recommends \
+              hipblas-dev \
+              rocblas-dev && \
+          sudo apt-get clean && \
+          sudo rm -rf /var/lib/apt/lists/* && \
+          sudo ldconfig
       - name: Cache grpc
         id: cache-grpc
         uses: actions/cache@v4
@@ -61,6 +70,7 @@ jobs:
           go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
           export PATH=$PATH:$GOPATH/bin
           export PATH=/usr/local/cuda/bin:$PATH
+          export PATH=/opt/rocm/bin:$PATH
           GO_TAGS=p2p make dist
       - uses: actions/upload-artifact@v4
         with:
diff --git a/Makefile b/Makefile
index f2c03086662..c0abfc2ae80 100644
--- a/Makefile
+++ b/Makefile
@@ -327,6 +327,7 @@ ifeq ($(OS),Darwin)
 	$(info ${GREEN}I Skip CUDA build on MacOS${RESET})
 else
 	$(MAKE) backend-assets/grpc/llama-cpp-cuda
+	$(MAKE) backend-assets/grpc/llama-cpp-hipblas
 endif
 	$(MAKE) build
 	mkdir -p release
@@ -712,6 +713,13 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
 	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda
 
+backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-hipblas
+	$(MAKE) -C backend/cpp/llama-hipblas purge
+	$(info ${GREEN}I llama-cpp build info:hipblas${RESET})
+	BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
+
 backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-grpc
 	$(MAKE) -C backend/cpp/llama-grpc purge
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index d013740ce5d..e9001f0a968 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -37,6 +37,7 @@ const (
 	LLamaCPPAVX      = "llama-cpp-avx"
 	LLamaCPPFallback = "llama-cpp-fallback"
 	LLamaCPPCUDA     = "llama-cpp-cuda"
+	LLamaCPPHipblas  = "llama-cpp-hipblas"
 	LLamaCPPGRPC     = "llama-cpp-grpc"
 
 	Gpt4AllLlamaBackend = "gpt4all-llama"
@@ -93,7 +94,7 @@ ENTRY:
 	if autoDetect {
 		// if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up
 		// when starting the service
-		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda := false, false, false, false, false
+		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas := false, false, false, false, false, false
 		if _, ok := backends[LLamaCPP]; !ok {
 			for _, e := range entry {
 				if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
@@ -116,6 +117,10 @@ ENTRY:
 					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPCUDA)
 					foundLCPPCuda = true
 				}
+				if strings.Contains(e.Name(), LLamaCPPHipblas) && !foundLCPPHipblas {
+					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPHipblas)
+					foundLCPPHipblas = true
+				}
 			}
 		}
 	}
@@ -169,6 +174,7 @@ ENTRY:
 // selectGRPCProcess selects the GRPC process to start based on system capabilities
 func selectGRPCProcess(backend, assetDir string) string {
 	foundCUDA := false
+	foundAMDGPU := false
 	var grpcProcess string
 
 	// Select backend now just for llama.cpp
@@ -195,10 +201,20 @@ func selectGRPCProcess(backend, assetDir string) string {
 					log.Info().Msgf("GPU device found but no CUDA backend present")
 				}
 			}
+			if strings.Contains(gpu.String(), "amd") {
+				p := backendPath(assetDir, LLamaCPPHipblas)
+				if _, err := os.Stat(p); err == nil {
+					log.Info().Msgf("[%s] attempting to load with HIPBLAS variant", backend)
+					grpcProcess = p
+					foundAMDGPU = true
+				} else {
+					log.Info().Msgf("GPU device found but no HIPBLAS backend present")
+				}
+			}
 		}
 	}
 
-	if foundCUDA {
+	if foundCUDA || foundAMDGPU {
 		return grpcProcess
 	}
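
Build note: with a ROCm toolchain on PATH (the workflow exports
/opt/rocm/bin), the new variant should also build standalone via
'make backend-assets/grpc/llama-cpp-hipblas'; the target pins
BUILD_TYPE=hipblas itself, so beyond the usual dependencies only the
hipblas-dev/rocblas-dev packages should be needed.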