cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES

# Deliberately not REQUIRED: the not-found case is reported with an explicit FATAL_ERROR at the end of this file.
find_package(CUDAToolkit)
  4
if (CUDAToolkit_FOUND)
    message(STATUS "CUDA Toolkit found")

    # Only pick a default architecture list if the user has not provided one
    #     (e.g. via -DCMAKE_CUDA_ARCHITECTURES=... or a toolchain file).
    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        # native == GPUs available at build time
        # 50     == Maxwell, lowest CUDA 12 standard
        # 60     == P100, FP16 CUDA intrinsics
        # 61     == Pascal, __dp4a instruction (per-byte integer dot product)
        # 70     == V100, FP16 tensor cores
        # 75     == Turing, int8 tensor cores
        # 80     == Ampere, asynchronous data loading, faster tensor core instructions
        # 86     == RTX 3000, needs CUDA v11.1
        # 89     == RTX 4000, needs CUDA v11.8
        # 120    == Blackwell, needs CUDA v12.8, FP4 tensor cores
        #
        # XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
        # XX-real    == compile CUDA code as device code for this specific architecture
        # no suffix  == compile as both PTX and device code
        #
        # The default behavior for a non-native build is to build virtual architectures as needed to cover all features needed
        #     for best performance and to also build real architectures for the most commonly used GPUs.
        # "native" resolution requires CUDA >= 11.6 and CMake >= 3.24, hence the version gates.
        if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
            set(CMAKE_CUDA_ARCHITECTURES "native")
        else()
            # Pre-Turing virtual architectures are only added for toolkits older than CUDA 13,
            #     presumably because newer toolkits no longer accept them -- TODO confirm.
            if (CUDAToolkit_VERSION VERSION_LESS "13")
                list(APPEND CMAKE_CUDA_ARCHITECTURES 50-virtual 61-virtual 70-virtual)
            endif ()

            list(APPEND CMAKE_CUDA_ARCHITECTURES 75-virtual 80-virtual 86-real)

            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
                list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real)
            endif()

            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
                # The CUDA architecture 120f-virtual would in principle work for Blackwell support
                #     but the newly added "f" suffix conflicted with a preexisting regex for validating CUDA architectures in CMake.
                # So either a recent CMake version or one with the backported fix is needed.
                # The following versions should work:
                #   - CMake >= v3.31.8 && CMake < v4.0.0
                #   - CMake >= v4.0.2
                # This is NOT documented in the CMake release notes,
                #     check Modules/Internal/CMakeCUDAArchitecturesValidate.cmake in the CMake git repository instead.
                # However, the architectures 120a-real and 121a-real should work with basically any CMake version and
                #     until the release of e.g. Rubin there is no benefit to shipping virtual architectures for Blackwell.
                list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real)
            endif()
            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.9")
                list(APPEND CMAKE_CUDA_ARCHITECTURES 121a-real)
            endif()
        endif()
    endif()

    # Enable the CUDA language only after CMAKE_CUDA_ARCHITECTURES has been decided
    #     so that compiler detection runs against the intended architecture list.
    enable_language(CUDA)

    # TODO: Remove once CCCL 3.2 has been released and bundled with CUDA Toolkit
    if (GGML_CUDA_CUB_3DOT2)
        include(FetchContent)

        FetchContent_Declare(
            CCCL
            GIT_REPOSITORY https://github.com/nvidia/cccl.git
            GIT_TAG        v3.2.0
            GIT_SHALLOW    TRUE
        )

        FetchContent_MakeAvailable(CCCL)
    endif()

    # Replace any plain 12X CUDA architectures with their "architecture-specific" equivalents 12Xa.
    # 12X is forwards-compatible, 12Xa is not.
    # Notably the Blackwell FP4 tensor core instructions are not forwards compatible and therefore need 12Xa.
    # But while 12X vs. 12Xa can be checked in device code there is (to my knowledge) no easy way to do the same check in host code.
    # So for now just replace all instances of 12X with 12Xa, this should be fine until Rubin is released.
    # CMAKE_CUDA_ARCHITECTURES_NATIVE is fixed up as well because it may be
    #     substituted for "native" further below.
    foreach(ARCHS IN ITEMS CMAKE_CUDA_ARCHITECTURES CMAKE_CUDA_ARCHITECTURES_NATIVE)
        set(FIXED_ARCHS "")
        foreach(ARCH IN LISTS ${ARCHS})
            # Matches e.g. "120", "121-real", "125-virtual" but NOT already-suffixed "120a"/"120f" forms.
            if (ARCH MATCHES "^12[0-9](-real|-virtual)?$")
                string(REGEX REPLACE "^(12[0-9])((-real|-virtual)?)$" "\\1a\\2" FIXED_ARCH ${ARCH})
                message(STATUS "Replacing ${ARCH} in ${ARCHS} with ${FIXED_ARCH}")
                list(APPEND FIXED_ARCHS "${FIXED_ARCH}")
            else()
                list(APPEND FIXED_ARCHS "${ARCH}")
            endif()
        endforeach()
        set(${ARCHS} ${FIXED_ARCHS})
    endforeach()

    # If we try to compile a "native" build it will use the 12X architectures and fail.
    # So we should instead use the native architectures as determined by CMake after replacing 12X with 12Xa.
    # But if at the time of the build no GPUs are connected at all CMAKE_CUDA_ARCHITECTURES_NATIVE will contain garbage
    #     that we should not use -- the MATCHES below validates it as a well-formed architecture list first.
    if (CMAKE_CUDA_ARCHITECTURES STREQUAL "native" AND CMAKE_CUDA_ARCHITECTURES_NATIVE MATCHES "^[0-9]+(a|f)?(-real|-virtual)?(;[0-9]+(a|f)?(-real|-virtual)?|;)*$")
        set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
    endif()
    message(STATUS "Using CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} CMAKE_CUDA_ARCHITECTURES_NATIVE=${CMAKE_CUDA_ARCHITECTURES_NATIVE}")

    # Collect headers and sources by glob; note that without CONFIGURE_DEPENDS newly
    #     added files are only picked up after re-running CMake.
    file(GLOB   GGML_HEADERS_CUDA "*.cuh")
    list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")

    file(GLOB   GGML_SOURCES_CUDA "*.cu")
    file(GLOB   SRCS "template-instances/fattn-tile*.cu")
    list(APPEND GGML_SOURCES_CUDA ${SRCS})
    file(GLOB   SRCS "template-instances/fattn-mma*.cu")
    list(APPEND GGML_SOURCES_CUDA ${SRCS})
    file(GLOB   SRCS "template-instances/mmq*.cu")
    list(APPEND GGML_SOURCES_CUDA ${SRCS})
    file(GLOB   SRCS "template-instances/mmf*.cu")
    list(APPEND GGML_SOURCES_CUDA ${SRCS})

    # GGML_CUDA_FA_ALL_QUANTS compiles every fattn-vec template instance;
    #     otherwise only the q4_0/q4_0, q8_0/q8_0 and f16/f16 combinations are built.
    if (GGML_CUDA_FA_ALL_QUANTS)
        file(GLOB   SRCS "template-instances/fattn-vec*.cu")
        list(APPEND GGML_SOURCES_CUDA ${SRCS})
        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
    else()
        file(GLOB   SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
        list(APPEND GGML_SOURCES_CUDA ${SRCS})
        file(GLOB   SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
        list(APPEND GGML_SOURCES_CUDA ${SRCS})
        file(GLOB   SRCS "template-instances/fattn-vec*f16-f16.cu")
        list(APPEND GGML_SOURCES_CUDA ${SRCS})
    endif()

    ggml_add_backend_library(ggml-cuda
                             ${GGML_HEADERS_CUDA}
                             ${GGML_SOURCES_CUDA}
                            )

    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})

    # Map the build options 1:1 onto preprocessor defines for the CUDA sources.
    if (GGML_CUDA_GRAPHS)
        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
    endif()

    if (GGML_CUDA_FORCE_MMQ)
        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
    endif()

    if (GGML_CUDA_FORCE_CUBLAS)
        add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
    endif()

    if (GGML_CUDA_NO_VMM)
        add_compile_definitions(GGML_CUDA_NO_VMM)
    endif()

    # Note the inversion: the option enables the feature, the define disables it.
    if (NOT GGML_CUDA_FA)
        add_compile_definitions(GGML_CUDA_NO_FA)
    endif()

    if (GGML_CUDA_NO_PEER_COPY)
        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
    endif()

    if (GGML_STATIC)
        if (WIN32)
            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas)
        else ()
            if (GGML_CUDA_CUB_3DOT2)
                target_link_libraries(ggml-cuda PRIVATE  CCCL::CCCL)
            endif()
            # CUDA::cublasLt_static is apparently only available with CUDA >= 10.1,
            #     hence the version gate -- TODO confirm against FindCUDAToolkit.
            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "10.1")
                target_link_libraries(ggml-cuda PRIVATE  CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
            else()
                target_link_libraries(ggml-cuda PRIVATE  CUDA::cudart_static CUDA::cublas_static)
            endif()
        endif()
    else()
        if (GGML_CUDA_CUB_3DOT2)
            target_link_libraries(ggml-cuda PRIVATE  CCCL::CCCL)
        endif()
        target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas)
    endif()

    if (GGML_CUDA_NO_VMM)
        # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
    else()
        target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
    endif()

    # CUDA_CXX_FLAGS collects host compiler flags; they are forwarded to nvcc via -Xcompiler below.
    set(CUDA_CXX_FLAGS "")

    # Flags passed to nvcc itself (device-side compilation).
    set(CUDA_FLAGS -use_fast_math -extended-lambda)

    if (GGML_CUDA_DEBUG)
        # -lineinfo embeds source line information into the device code (useful for profilers/debuggers).
        list(APPEND CUDA_FLAGS -lineinfo)
        add_compile_definitions(GGML_CUDA_DEBUG)
    endif()

    # nvcc's -compress-mode option is only available with CUDA >= 12.8.
    if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
        # Options are:
        # - none (not recommended)
        # - speed (nvcc's default)
        # - balance
        # - size
        list(APPEND CUDA_FLAGS -compress-mode=${GGML_CUDA_COMPRESSION_MODE})
    endif()

    if (GGML_FATAL_WARNINGS)
        list(APPEND CUDA_FLAGS -Werror all-warnings)
    endif()

    if (GGML_ALL_WARNINGS AND NOT MSVC)
        # Probe the host compiler nvcc will use so matching warning flags can be chosen.
        # The ".c" argument is presumably a dummy input so nvcc accepts the invocation -- TODO confirm.
        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
        endif()

        # Ask the host compiler (through nvcc) for its version banner.
        execute_process(
            COMMAND ${NVCC_CMD} -Xcompiler --version
            OUTPUT_VARIABLE CUDA_CCFULLVER
            ERROR_QUIET
        )

        # Classify the host compiler from the banner: anything not mentioning "clang" is assumed to be GCC.
        if (NOT CUDA_CCFULLVER MATCHES clang)
            set(CUDA_CCID "GNU")
            execute_process(
                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
                OUTPUT_VARIABLE CUDA_CCVER
                ERROR_QUIET
                OUTPUT_STRIP_TRAILING_WHITESPACE
            )
        else()
            if (CUDA_CCFULLVER MATCHES Apple)
                set(CUDA_CCID "AppleClang")
            else()
                set(CUDA_CCID "Clang")
            endif()
            # Extract the version number following " version " in the banner.
            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
        endif()

        message(STATUS "CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")

        # ggml_get_flags() is expected to populate GF_CXX_FLAGS for this compiler ID/version.
        ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
    endif()

    if (NOT MSVC)
        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
    else()
        # CCCL 3.2 onwards will require a cpp-standard-compliant preprocessor for MSVC
        # https://github.com/NVIDIA/cccl/pull/6827
        list(APPEND CUDA_CXX_FLAGS /Zc:preprocessor)
    endif()

    list(JOIN   CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument

    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
    endif()

    # Restrict the nvcc-specific flags to CUDA translation units via a generator expression.
    target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
else()
    message(FATAL_ERROR "CUDA Toolkit not found")
endif()