cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES

# Locate the CUDA Toolkit; absence is handled at the bottom of this file with a fatal error.
find_package(CUDAToolkit)
4
5if (CUDAToolkit_FOUND)
6 message(STATUS "CUDA Toolkit found")
7
    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        # Pick a default set of CUDA architectures when the user did not specify one.
        #
        # native == GPUs available at build time
        # 50 == Maxwell, lowest CUDA 12 standard
        # 60 == P100, FP16 CUDA intrinsics
        # 61 == Pascal, __dp4a instruction (per-byte integer dot product)
        # 70 == V100, FP16 tensor cores
        # 75 == Turing, int8 tensor cores
        # 80 == Ampere, asynchronous data loading, faster tensor core instructions
        # 86 == RTX 3000, needs CUDA v11.1
        # 89 == RTX 4000, needs CUDA v11.8
        # 120 == Blackwell, needs CUDA v12.8, FP4 tensor cores
        #
        # XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
        # XX-real == compile CUDA code as device code for this specific architecture
        # no suffix == compile as both PTX and device code
        #
        # The default behavior for a non-native build is to build virtual architectures as needed to cover all features
        # needed for best performance and to also build real architectures for the most commonly used GPUs.
        if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
            # "native" needs both a sufficiently new toolkit and CMake >= 3.24 (which introduced the keyword).
            set(CMAKE_CUDA_ARCHITECTURES "native")
        else()
            if (CUDAToolkit_VERSION VERSION_LESS "13")
                # NOTE(review): presumably CUDA 13 dropped support for these older architectures — confirm.
                list(APPEND CMAKE_CUDA_ARCHITECTURES 50-virtual 61-virtual 70-virtual)
            endif ()

            list(APPEND CMAKE_CUDA_ARCHITECTURES 75-virtual 80-virtual 86-real)

            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
                list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real)
            endif()

            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
                # The CUDA architecture 120f-virtual would in principle work for Blackwell support
                # but the newly added "f" suffix conflicted with a preexisting regex for validating CUDA architectures in CMake.
                # So either a recent CMake version or one with the backported fix is needed.
                # The following versions should work:
                # - CMake >= v3.31.8 && CMake < v4.0.0
                # - CMake >= v4.0.2
                # This is NOT documented in the CMake release notes,
                # check Modules/Internal/CMakeCUDAArchitecturesValidate.cmake in the CMake git repository instead.
                # However, the architectures 120a-real and 121a-real should work with basically any CMake version and
                # until the release of e.g. Rubin there is no benefit to shipping virtual architectures for Blackwell.
                list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real)
            endif()
            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.9")
                list(APPEND CMAKE_CUDA_ARCHITECTURES 121a-real)
            endif()
        endif()
    endif()
57
    enable_language(CUDA)

    # TODO: Remove once CCCL 3.2 has been released and bundled with CUDA Toolkit
    if (GGML_CUDA_CUB_3DOT2)
        # Fetch CCCL (CUB/Thrust/libcudacxx) v3.2.0 from GitHub instead of relying on the
        # copy bundled with the CUDA Toolkit. GIT_SHALLOW keeps the clone small since only
        # the tagged commit is needed.
        include(FetchContent)

        FetchContent_Declare(
            CCCL
            GIT_REPOSITORY https://github.com/nvidia/cccl.git
            GIT_TAG v3.2.0
            GIT_SHALLOW TRUE
        )

        FetchContent_MakeAvailable(CCCL)
    endif()
73
    # Replace any plain 12X CUDA architectures with their "architecture-specific" equivalents 12Xa.
    # 12X is forwards-compatible, 12Xa is not.
    # Notably the Blackwell FP4 tensor core instructions are not forwards compatible and therefore need 12Xa.
    # But while 12X vs. 12Xa can be checked in device code there is (to my knowledge) no easy way to do the same check in host code.
    # So for now just replace all instances of 12X with 12Xa, this should be fine until Rubin is released.
    foreach(ARCHS IN ITEMS CMAKE_CUDA_ARCHITECTURES CMAKE_CUDA_ARCHITECTURES_NATIVE)
        # ARCHS holds the *name* of the list variable; ${ARCHS} dereferences it to the list itself.
        set(FIXED_ARCHS "")
        foreach(ARCH IN LISTS ${ARCHS})
            if (ARCH MATCHES "^12[0-9](-real|-virtual)?$")
                # Insert the "a" suffix after the numeric part while keeping any -real/-virtual suffix.
                string(REGEX REPLACE "^(12[0-9])((-real|-virtual)?)$" "\\1a\\2" FIXED_ARCH ${ARCH})
                message(STATUS "Replacing ${ARCH} in ${ARCHS} with ${FIXED_ARCH}")
                list(APPEND FIXED_ARCHS "${FIXED_ARCH}")
            else()
                # Already suffixed (12Xa/12Xf) or not a 12X architecture: keep as-is.
                list(APPEND FIXED_ARCHS "${ARCH}")
            endif()
        endforeach()
        # Write the fixed-up list back to the variable named by ARCHS.
        set(${ARCHS} ${FIXED_ARCHS})
    endforeach()

    # If we try to compile a "native" build it will use the 12X architectures and fail.
    # So we should instead use the native architectures as determined by CMake after replacing 12X with 12Xa.
    # But if at the time of the build no GPUs are connected at all CMAKE_CUDA_ARCHITECTURES_NATIVE will contain
    # garbage that we should not use, hence the sanity-check regex on its contents.
    if (CMAKE_CUDA_ARCHITECTURES STREQUAL "native" AND CMAKE_CUDA_ARCHITECTURES_NATIVE MATCHES "^[0-9]+(a|f)?(-real|-virtual)?(;[0-9]+(a|f)?(-real|-virtual)?|;)*$")
        set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
    endif()
    message(STATUS "Using CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} CMAKE_CUDA_ARCHITECTURES_NATIVE=${CMAKE_CUDA_ARCHITECTURES_NATIVE}")
100
    # Collect headers and sources. NOTE: file(GLOB) is evaluated at configure time only,
    # so newly added files require a re-configure to be picked up.
    file(GLOB GGML_HEADERS_CUDA "*.cuh")
    list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")

    file(GLOB GGML_SOURCES_CUDA "*.cu")
    # template-instances/*.cu — presumably pre-generated template instantiations split into
    # separate translation units to parallelize compilation (TODO confirm against generator script).
    file(GLOB SRCS "template-instances/fattn-tile*.cu")
    list(APPEND GGML_SOURCES_CUDA ${SRCS})
    file(GLOB SRCS "template-instances/fattn-mma*.cu")
    list(APPEND GGML_SOURCES_CUDA ${SRCS})
    file(GLOB SRCS "template-instances/mmq*.cu")
    list(APPEND GGML_SOURCES_CUDA ${SRCS})
    file(GLOB SRCS "template-instances/mmf*.cu")
    list(APPEND GGML_SOURCES_CUDA ${SRCS})

    if (GGML_CUDA_FA_ALL_QUANTS)
        # Build FlashAttention vector kernels for all quantization combinations.
        file(GLOB SRCS "template-instances/fattn-vec*.cu")
        list(APPEND GGML_SOURCES_CUDA ${SRCS})
        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
    else()
        # Only the q4_0/q4_0, q8_0/q8_0 and f16/f16 combinations to reduce compile time/binary size.
        file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
        list(APPEND GGML_SOURCES_CUDA ${SRCS})
        file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
        list(APPEND GGML_SOURCES_CUDA ${SRCS})
        file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu")
        list(APPEND GGML_SOURCES_CUDA ${SRCS})
    endif()
126
127 ggml_add_backend_library(ggml-cuda
128 ${GGML_HEADERS_CUDA}
129 ${GGML_SOURCES_CUDA}
130 )
131
132 add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
133
134 if (GGML_CUDA_GRAPHS)
135 add_compile_definitions(GGML_CUDA_USE_GRAPHS)
136 endif()
137
138 if (GGML_CUDA_FORCE_MMQ)
139 add_compile_definitions(GGML_CUDA_FORCE_MMQ)
140 endif()
141
142 if (GGML_CUDA_FORCE_CUBLAS)
143 add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
144 endif()
145
146 if (GGML_CUDA_NO_VMM)
147 add_compile_definitions(GGML_CUDA_NO_VMM)
148 endif()
149
150 if (NOT GGML_CUDA_FA)
151 add_compile_definitions(GGML_CUDA_NO_FA)
152 endif()
153
154 if (GGML_CUDA_NO_PEER_COPY)
155 add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
156 endif()
157
158 if (GGML_STATIC)
159 if (WIN32)
160 # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
161 target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas)
162 else ()
163 if (GGML_CUDA_CUB_3DOT2)
164 target_link_libraries(ggml-cuda PRIVATE CCCL::CCCL)
165 endif()
166 if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "10.1")
167 target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
168 else()
169 target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static)
170 endif()
171 endif()
172 else()
173 if (GGML_CUDA_CUB_3DOT2)
174 target_link_libraries(ggml-cuda PRIVATE CCCL::CCCL)
175 endif()
176 target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas)
177 endif()
178
179 if (GGML_CUDA_NO_VMM)
180 # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
181 else()
182 target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
183 endif()
184
    # Host compiler flags, forwarded to nvcc via -Xcompiler further below.
    set(CUDA_CXX_FLAGS "")

    # Flags passed directly to nvcc.
    set(CUDA_FLAGS -use_fast_math -extended-lambda)

    if (GGML_CUDA_DEBUG)
        # -lineinfo embeds source line information into device code for profilers/debuggers.
        list(APPEND CUDA_FLAGS -lineinfo)
        add_compile_definitions(GGML_CUDA_DEBUG)
    endif()

    if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
        # Device-code compression mode (nvcc >= 12.8). Options are:
        # - none (not recommended)
        # - speed (nvcc's default)
        # - balance
        # - size
        # GGML_CUDA_COMPRESSION_MODE is expected to be set by the parent build — TODO confirm a default exists.
        list(APPEND CUDA_FLAGS -compress-mode=${GGML_CUDA_COMPRESSION_MODE})
    endif()

    if (GGML_FATAL_WARNINGS)
        # Promote all nvcc warnings to errors.
        list(APPEND CUDA_FLAGS -Werror all-warnings)
    endif()
206
    if (GGML_ALL_WARNINGS AND NOT MSVC)
        # Determine the ID and version of the host compiler that nvcc will invoke, so the
        # matching warning flags can be obtained from ggml_get_flags() below.
        # ".c" presumably serves as a dummy input so nvcc accepts the probe invocation — TODO confirm.
        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
            # Honor an explicitly configured host compiler.
            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
        endif()

        # Ask the host compiler (through nvcc) for its full version banner.
        execute_process(
            COMMAND ${NVCC_CMD} -Xcompiler --version
            OUTPUT_VARIABLE CUDA_CCFULLVER
            ERROR_QUIET
        )

        if (NOT CUDA_CCFULLVER MATCHES clang)
            # No "clang" in the banner: assume GCC and query the version flags directly.
            set(CUDA_CCID "GNU")
            execute_process(
                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
                OUTPUT_VARIABLE CUDA_CCVER
                ERROR_QUIET
                OUTPUT_STRIP_TRAILING_WHITESPACE
            )
        else()
            # Distinguish Apple's clang from upstream clang via the banner text.
            if (CUDA_CCFULLVER MATCHES Apple)
                set(CUDA_CCID "AppleClang")
            else()
                set(CUDA_CCID "Clang")
            endif()
            # Extract the numeric version from the banner, e.g. "... version 17.0.6 ..." -> "17.0.6".
            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
        endif()

        message(STATUS "CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")

        # ggml_get_flags() is expected to populate GF_CXX_FLAGS for the detected compiler — defined elsewhere in the project.
        ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
    endif()
241
242 if (NOT MSVC)
243 list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
244 else()
245 # CCCL 3.2 onwards will require a cpp-standard-compliant preprocessor for MSVC
246 # https://github.com/NVIDIA/cccl/pull/6827
247 list(APPEND CUDA_CXX_FLAGS /Zc:preprocessor)
248 endif()
249
250 list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
251
252 if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
253 list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
254 endif()
255
256 target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
257else()
258 message(FATAL_ERROR "CUDA Toolkit not found")
259endif()