author    Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
committer Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
commit    b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree      211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/.devops
download  llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/.devops')
-rw-r--r--  llama.cpp/.devops/cann.Dockerfile             130
-rw-r--r--  llama.cpp/.devops/cpu.Dockerfile               88
-rw-r--r--  llama.cpp/.devops/cuda-new.Dockerfile          95
-rw-r--r--  llama.cpp/.devops/cuda.Dockerfile              94
-rw-r--r--  llama.cpp/.devops/intel.Dockerfile             95
-rw-r--r--  llama.cpp/.devops/llama-cli-cann.Dockerfile    45
-rw-r--r--  llama.cpp/.devops/llama-cpp-cuda.srpm.spec     85
-rw-r--r--  llama.cpp/.devops/llama-cpp.srpm.spec          87
-rw-r--r--  llama.cpp/.devops/musa.Dockerfile             101
-rw-r--r--  llama.cpp/.devops/nix/apps.nix                 21
-rw-r--r--  llama.cpp/.devops/nix/devshells.nix            52
-rw-r--r--  llama.cpp/.devops/nix/docker.nix               37
-rw-r--r--  llama.cpp/.devops/nix/jetson-support.nix       39
-rw-r--r--  llama.cpp/.devops/nix/nixpkgs-instances.nix    45
-rw-r--r--  llama.cpp/.devops/nix/package-gguf-py.nix      38
-rw-r--r--  llama.cpp/.devops/nix/package.nix             243
-rw-r--r--  llama.cpp/.devops/nix/python-scripts.nix       66
-rw-r--r--  llama.cpp/.devops/nix/scope.nix                35
-rw-r--r--  llama.cpp/.devops/nix/sif.nix                  27
-rw-r--r--  llama.cpp/.devops/rocm.Dockerfile             114
-rw-r--r--  llama.cpp/.devops/s390x.Dockerfile            126
-rwxr-xr-x  llama.cpp/.devops/tools.sh                     53
-rw-r--r--  llama.cpp/.devops/vulkan.Dockerfile            90
23 files changed, 1806 insertions, 0 deletions
diff --git a/llama.cpp/.devops/cann.Dockerfile b/llama.cpp/.devops/cann.Dockerfile
new file mode 100644
index 0000000..6de2221
--- /dev/null
+++ b/llama.cpp/.devops/cann.Dockerfile
@@ -0,0 +1,130 @@
+# ==============================================================================
+# ARGUMENTS
+# ==============================================================================
+
+# Define the CANN base image for easier version updates later
+ARG CHIP_TYPE=910b
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
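+#
+# An image for a specific target can then be built like this (illustrative
+# invocation; the tag and target name are assumptions, adjust to your setup):
+#   docker build -f .devops/cann.Dockerfile --build-arg CHIP_TYPE=910b \
+#     --target server -t llama.cpp:cann-server .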
+
+# ==============================================================================
+# BUILD STAGE
+# Compile all binary files and libraries
+# ==============================================================================
+FROM ${CANN_BASE_IMAGE} AS build
+
+# -- Install build dependencies --
+RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \
+ yum clean all && \
+ rm -rf /var/cache/yum
+
+# -- Set the working directory --
+WORKDIR /app
+
+# -- Copy project files --
+COPY . .
+
+# -- Set CANN environment variables (required for compilation) --
+# Using ENV instead of `source` makes the environment variables persist across all subsequent image layers
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
+# ... You can add other environment variables from the original file as needed ...
+# For brevity, only core variables are listed here. You can paste the original ENV list here.
+
+# -- Build llama.cpp --
+# Use the passed CHIP_TYPE argument and add general build options
+ARG CHIP_TYPE
+RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
+ && \
+ cmake -B build \
+ -DGGML_CANN=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DSOC_TYPE=ascend${CHIP_TYPE} \
+ -DUSE_ACL_GRAPH=ON \
+ . && \
+ cmake --build build --config Release -j$(nproc)
+
+# -- Organize build artifacts for copying in later stages --
+# Create a lib directory to store all .so files
+RUN mkdir -p /app/lib && \
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+# Create a full directory to store all executables and Python scripts
+RUN mkdir -p /app/full && \
+ cp build/bin/* /app/full/ && \
+ cp *.py /app/full/ && \
+ cp -r gguf-py /app/full/ && \
+ cp -r requirements /app/full/ && \
+ cp requirements.txt /app/full/
+ # If you have a tools.sh script, make sure it is copied here
+ # cp .devops/tools.sh /app/full/tools.sh
+
+# ==============================================================================
+# BASE STAGE
+# Create a minimal base image with CANN runtime and common libraries
+# ==============================================================================
+FROM ${CANN_BASE_IMAGE} AS base
+
+# -- Install runtime dependencies --
+RUN yum install -y libgomp curl && \
+ yum clean all && \
+ rm -rf /var/cache/yum
+
+# -- Set CANN environment variables (required for runtime) --
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+# ... You can add other environment variables from the original file as needed ...
+
+WORKDIR /app
+
+# Copy compiled .so files from the build stage
+COPY --from=build /app/lib/ /app
+
+# ==============================================================================
+# FINAL STAGES (TARGETS)
+# ==============================================================================
+
+### Target: full
+# Complete image with all tools, Python bindings, and dependencies
+# ==============================================================================
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+# Install Python dependencies
+RUN yum install -y git python3 python3-pip && \
+ pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
+ pip3 install --no-cache-dir -r requirements.txt && \
+ yum clean all && \
+ rm -rf /var/cache/yum
+
+# You need to provide a tools.sh script as the entrypoint
+ENTRYPOINT ["/app/tools.sh"]
+# If there is no tools.sh, you can set the default to start the server
+# ENTRYPOINT ["/app/llama-server"]
+
+### Target: light
+# Lightweight image containing only llama-cli and llama-completion
+# ==============================================================================
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Target: server
+# Dedicated server image containing only llama-server
+# ==============================================================================
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/llama.cpp/.devops/cpu.Dockerfile b/llama.cpp/.devops/cpu.Dockerfile
new file mode 100644
index 0000000..c70a2de
--- /dev/null
+++ b/llama.cpp/.devops/cpu.Dockerfile
@@ -0,0 +1,88 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+ARG TARGETARCH
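+# TARGETARCH is filled in automatically by BuildKit/buildx for the platform
+# being built; an illustrative invocation (tag assumed) would be:
+#   docker buildx build --platform linux/arm64 -f .devops/cpu.Dockerfile \
+#     --target server -t llama.cpp:cpu-server .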
+
+RUN apt-get update && \
+ apt-get install -y build-essential git cmake libssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
+ cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
+ else \
+ echo "Unsupported architecture"; \
+ exit 1; \
+ fi && \
+ cmake --build build -j $(nproc)
+
+RUN mkdir -p /app/lib && \
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+ && cp build/bin/* /app/full \
+ && cp *.py /app/full \
+ && cp -r gguf-py /app/full \
+ && cp -r requirements /app/full \
+ && cp requirements.txt /app/full \
+ && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ubuntu:$UBUNTU_VERSION AS base
+
+RUN apt-get update \
+ && apt-get install -y libgomp1 curl\
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+ && apt-get install -y \
+ git \
+ python3 \
+ python3-pip \
+ && pip install --upgrade pip setuptools wheel \
+ && pip install -r requirements.txt \
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/llama.cpp/.devops/cuda-new.Dockerfile b/llama.cpp/.devops/cuda-new.Dockerfile
new file mode 100644
index 0000000..98dc147
--- /dev/null
+++ b/llama.cpp/.devops/cuda-new.Dockerfile
@@ -0,0 +1,95 @@
+ARG UBUNTU_VERSION=24.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=13.1.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
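+# For a smaller image the build can be restricted to a single architecture,
+# e.g. (illustrative value, pick the compute capability of your GPU):
+#   --build-arg CUDA_DOCKER_ARCH=86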
+
+RUN apt-get update && \
+ apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+ fi && \
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+ cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+ && cp build/bin/* /app/full \
+ && cp *.py /app/full \
+ && cp -r gguf-py /app/full \
+ && cp -r requirements /app/full \
+ && cp requirements.txt /app/full \
+ && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ${BASE_CUDA_RUN_CONTAINER} AS base
+
+RUN apt-get update \
+ && apt-get install -y libgomp1 curl\
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+ && apt-get install -y \
+ git \
+ python3 \
+ python3-pip \
+ python3-wheel \
+ && pip install --break-system-packages --upgrade setuptools \
+ && pip install --break-system-packages -r requirements.txt \
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/llama.cpp/.devops/cuda.Dockerfile b/llama.cpp/.devops/cuda.Dockerfile
new file mode 100644
index 0000000..52f103b
--- /dev/null
+++ b/llama.cpp/.devops/cuda.Dockerfile
@@ -0,0 +1,94 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.4.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+ apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+ fi && \
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+ cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+ && cp build/bin/* /app/full \
+ && cp *.py /app/full \
+ && cp -r gguf-py /app/full \
+ && cp -r requirements /app/full \
+ && cp requirements.txt /app/full \
+ && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ${BASE_CUDA_RUN_CONTAINER} AS base
+
+RUN apt-get update \
+ && apt-get install -y libgomp1 curl\
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+ && apt-get install -y \
+ git \
+ python3 \
+ python3-pip \
+ && pip install --upgrade pip setuptools wheel \
+ && pip install --break-system-packages -r requirements.txt \
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/llama.cpp/.devops/intel.Dockerfile b/llama.cpp/.devops/intel.Dockerfile
new file mode 100644
index 0000000..35ea4ad
--- /dev/null
+++ b/llama.cpp/.devops/intel.Dockerfile
@@ -0,0 +1,95 @@
+ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04
+
+## Build Image
+
+FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
+
+ARG GGML_SYCL_F16=OFF
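+# FP16 SYCL kernels can be enabled at build time, e.g. (illustrative
+# invocation, tag assumed):
+#   docker build -f .devops/intel.Dockerfile --build-arg GGML_SYCL_F16=ON \
+#     --target server -t llama.cpp:intel-server .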
+RUN apt-get update && \
+ apt-get install -y git libssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+ echo "GGML_SYCL_F16 is set" \
+ && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+ fi && \
+ echo "Building with dynamic libs" && \
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
+ cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+ && cp build/bin/* /app/full \
+ && cp *.py /app/full \
+ && cp -r gguf-py /app/full \
+ && cp -r requirements /app/full \
+ && cp requirements.txt /app/full \
+ && cp .devops/tools.sh /app/full/tools.sh
+
+FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
+
+RUN apt-get update \
+ && apt-get install -y libgomp1 curl\
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/lib/ /app
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update && \
+ apt-get install -y \
+ git \
+ python3 \
+ python3-pip \
+ python3-venv && \
+ python3 -m venv /opt/venv && \
+ . /opt/venv/bin/activate && \
+ pip install --upgrade pip setuptools wheel && \
+ pip install -r requirements.txt && \
+ apt autoremove -y && \
+ apt clean -y && \
+ rm -rf /tmp/* /var/tmp/* && \
+ find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
+ find /var/cache -type f -delete
+
+ENV PATH="/opt/venv/bin:$PATH"
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/lib/ /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/lib/ /app
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
+
diff --git a/llama.cpp/.devops/llama-cli-cann.Dockerfile b/llama.cpp/.devops/llama-cli-cann.Dockerfile
new file mode 100644
index 0000000..5bbc9ee
--- /dev/null
+++ b/llama.cpp/.devops/llama-cli-cann.Dockerfile
@@ -0,0 +1,45 @@
+ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
+
+FROM ascendai/cann:$ASCEND_VERSION AS build
+
+WORKDIR /app
+
+COPY . .
+
+RUN yum install -y gcc g++ cmake make openssl-devel
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
+
+# find libascend_hal.so, because the driver hasn't been mounted yet.
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
+
+RUN echo "Building with static libs" && \
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
+ cmake --build build --config Release --target llama-cli && \
+ cmake --build build --config Release --target llama-completion
+
+# TODO: use image with NNRT
+FROM ascendai/cann:$ASCEND_VERSION AS runtime
+COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
+
+ENV LC_ALL=C.utf8
+
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
+
+ENTRYPOINT ["/llama-cli" ]
diff --git a/llama.cpp/.devops/llama-cpp-cuda.srpm.spec b/llama.cpp/.devops/llama-cpp-cuda.srpm.spec
new file mode 100644
index 0000000..4d42a90
--- /dev/null
+++ b/llama.cpp/.devops/llama-cpp-cuda.srpm.spec
@@ -0,0 +1,85 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+# We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+# It is up to the user to install the correct vendor-specific support.
+
+Name: llama.cpp-cuda
+Version: %( date "+%%Y%%m%%d" )
+Release: 1%{?dist}
+Summary: Inference of LLaMA models in pure C/C++ with CUDA support
+License: MIT
+Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires: coreutils make gcc-c++ git cuda-toolkit
+Requires: cuda-toolkit
+URL: https://github.com/ggml-org/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CUDA-accelerated inference for Meta's Llama 2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j GGML_CUDA=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
+cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
+[Unit]
+Description=Llama.cpp server (CUDA build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=never
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-completion
+%{_bindir}/llama-cuda-server
+%{_bindir}/llama-cuda-simple
+/usr/lib/systemd/system/llamacuda.service
+%config /etc/sysconfig/llama
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
diff --git a/llama.cpp/.devops/llama-cpp.srpm.spec b/llama.cpp/.devops/llama-cpp.srpm.spec
new file mode 100644
index 0000000..0a4f430
--- /dev/null
+++ b/llama.cpp/.devops/llama-cpp.srpm.spec
@@ -0,0 +1,87 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+# We need to declare standard versioning if people want to sort latest releases.
+# In the meantime, YYYYMMDD format will be used.
+# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+# It is up to the user to install the correct vendor-specific support.
+
+Name: llama.cpp
+Version: %( date "+%%Y%%m%%d" )
+Release: 1%{?dist}
+Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
+License: MIT
+Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires: coreutils make gcc-c++ git libstdc++-devel
+Requires: libstdc++
+URL: https://github.com/ggml-org/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CPU inference for Meta's Llama 2 models using default options.
+Models are not included in this package and must be downloaded separately.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
+cp -p llama-server %{buildroot}%{_bindir}/llama-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
+[Unit]
+Description=Llama.cpp server, CPU only (no GPU support in this build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llama-server $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=never
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llama-cli
+%{_bindir}/llama-completion
+%{_bindir}/llama-server
+%{_bindir}/llama-simple
+/usr/lib/systemd/system/llama.service
+%config /etc/sysconfig/llama
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
diff --git a/llama.cpp/.devops/musa.Dockerfile b/llama.cpp/.devops/musa.Dockerfile
new file mode 100644
index 0000000..9eb4985
--- /dev/null
+++ b/llama.cpp/.devops/musa.Dockerfile
@@ -0,0 +1,101 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc4.3.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
+
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+ apt-get install -y \
+ build-essential \
+ cmake \
+ python3 \
+ python3-pip \
+ git \
+ libssl-dev \
+ libgomp1
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+ export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+ fi && \
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+ cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+ && cp build/bin/* /app/full \
+ && cp *.py /app/full \
+ && cp -r gguf-py /app/full \
+ && cp -r requirements /app/full \
+ && cp requirements.txt /app/full \
+ && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ${BASE_MUSA_RUN_CONTAINER} AS base
+
+RUN apt-get update \
+ && apt-get install -y libgomp1 curl\
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+ && apt-get install -y \
+ git \
+ python3 \
+ python3-pip \
+ && pip install --upgrade pip setuptools wheel \
+ && pip install -r requirements.txt \
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/llama.cpp/.devops/nix/apps.nix b/llama.cpp/.devops/nix/apps.nix
new file mode 100644
index 0000000..0ecf19f
--- /dev/null
+++ b/llama.cpp/.devops/nix/apps.nix
@@ -0,0 +1,21 @@
+{
+ perSystem =
+ { config, lib, ... }:
+ {
+ apps =
+ let
+ inherit (config.packages) default;
+ binaries = [
+ "llama-cli"
+ "llama-embedding"
+ "llama-server"
+ "llama-quantize"
+ ];
+ mkApp = name: {
+ type = "app";
+ program = "${default}/bin/${name}";
+ };
+ in
+ lib.genAttrs binaries mkApp;
+ };
+}
diff --git a/llama.cpp/.devops/nix/devshells.nix b/llama.cpp/.devops/nix/devshells.nix
new file mode 100644
index 0000000..bfd304a
--- /dev/null
+++ b/llama.cpp/.devops/nix/devshells.nix
@@ -0,0 +1,52 @@
+{ inputs, ... }:
+
+{
+ perSystem =
+ {
+ config,
+ lib,
+ system,
+ ...
+ }:
+ {
+ devShells =
+ let
+ pkgs = import inputs.nixpkgs { inherit system; };
+ stdenv = pkgs.stdenv;
+ scripts = config.packages.python-scripts;
+ in
+ lib.pipe (config.packages) [
+ (lib.concatMapAttrs (
+ name: package: {
+ ${name} = pkgs.mkShell {
+ name = "${name}";
+ inputsFrom = [ package ];
+ shellHook = ''
+ echo "Entering ${name} devShell"
+ '';
+ };
+ "${name}-extra" =
+ if (name == "python-scripts") then
+ null
+ else
+ pkgs.mkShell {
+ name = "${name}-extra";
+ inputsFrom = [
+ package
+ scripts
+ ];
+ # Extra packages that *may* be used by some scripts
+ packages = [
+ pkgs.python3Packages.tiktoken
+ ];
+ shellHook = ''
+ echo "Entering ${name} devShell"
+ addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
+ '';
+ };
+ }
+ ))
+ (lib.filterAttrs (name: value: value != null))
+ ];
+ };
+}
diff --git a/llama.cpp/.devops/nix/docker.nix b/llama.cpp/.devops/nix/docker.nix
new file mode 100644
index 0000000..d607b45
--- /dev/null
+++ b/llama.cpp/.devops/nix/docker.nix
@@ -0,0 +1,37 @@
+{
+ lib,
+ dockerTools,
+ buildEnv,
+ llama-cpp,
+ interactive ? true,
+ coreutils,
+}:
+
+# A tar that can be fed into `docker load`:
+#
+# $ nix build .#llamaPackages.docker
+# $ docker load < result
+
+# For details and variations cf.
+# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
+# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
+# - https://nixery.dev/
+
+# Approximate (compressed) sizes, at the time of writing, are:
+#
+# .#llamaPackages.docker: 125M;
+# .#llamaPackagesCuda.docker: 537M;
+# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
+
+dockerTools.buildLayeredImage {
+ name = llama-cpp.pname;
+ tag = "latest";
+
+ contents =
+ [ llama-cpp ]
+ ++ lib.optionals interactive [
+ coreutils
+ dockerTools.binSh
+ dockerTools.caCertificates
+ ];
+}
diff --git a/llama.cpp/.devops/nix/jetson-support.nix b/llama.cpp/.devops/nix/jetson-support.nix
new file mode 100644
index 0000000..78e2e40
--- /dev/null
+++ b/llama.cpp/.devops/nix/jetson-support.nix
@@ -0,0 +1,39 @@
+{ inputs, ... }:
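+# Package sets pinned to specific Jetson CUDA capabilities. On an
+# aarch64-linux host the resulting flake outputs can be built directly,
+# e.g. (illustrative command): `nix build .#jetson-orin`.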
+{
+ perSystem =
+ {
+ config,
+ system,
+ lib,
+ pkgsCuda,
+ ...
+ }:
+ {
+ legacyPackages =
+ let
+ caps.llamaPackagesXavier = "7.2";
+ caps.llamaPackagesOrin = "8.7";
+ caps.llamaPackagesTX2 = "6.2";
+ caps.llamaPackagesNano = "5.3";
+
+ pkgsFor =
+ cap:
+ import inputs.nixpkgs {
+ inherit system;
+ config = {
+ cudaSupport = true;
+ cudaCapabilities = [ cap ];
+ cudaEnableForwardCompat = false;
+ inherit (pkgsCuda.config) allowUnfreePredicate;
+ };
+ };
+ in
+ builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
+
+ packages = lib.optionalAttrs (system == "aarch64-linux") {
+ jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
+ jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
+ jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
+ };
+ };
+}
diff --git a/llama.cpp/.devops/nix/nixpkgs-instances.nix b/llama.cpp/.devops/nix/nixpkgs-instances.nix
new file mode 100644
index 0000000..40cf58f
--- /dev/null
+++ b/llama.cpp/.devops/nix/nixpkgs-instances.nix
@@ -0,0 +1,45 @@
+{ inputs, ... }:
+{
+ # The _module.args definitions are passed on to modules as arguments. E.g.
+ # the module `{ pkgs ... }: { /* config */ }` implicitly uses
+ # `_module.args.pkgs` (defined in this case by flake-parts).
+ perSystem =
+ { lib, system, ... }:
+ {
+ _module.args = {
+ # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
+ # again, the below creates several nixpkgs instances which the
+ # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
+ #
+ # This is currently "slow" and "expensive", on a certain scale.
+ # This also isn't "right" in that this hinders dependency injection at
+ # the level of flake inputs. This might get removed in the foreseeable
+ # future.
+ #
+ # Note that you can use these expressions without Nix
+      # (`pkgs.callPackage ./.devops/nix/scope.nix { }` is the entry point).
+
+ pkgsCuda = import inputs.nixpkgs {
+ inherit system;
+ # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
+ # and ucx are built with CUDA support)
+ config.cudaSupport = true;
+ config.allowUnfreePredicate =
+ p:
+ builtins.all (
+ license:
+ license.free
+ || builtins.elem license.shortName [
+ "CUDA EULA"
+ "cuDNN EULA"
+ ]
+ ) (p.meta.licenses or (lib.toList p.meta.license));
+ };
+ # Ensure dependencies use ROCm consistently
+ pkgsRocm = import inputs.nixpkgs {
+ inherit system;
+ config.rocmSupport = true;
+ };
+ };
+ };
+}
diff --git a/llama.cpp/.devops/nix/package-gguf-py.nix b/llama.cpp/.devops/nix/package-gguf-py.nix
new file mode 100644
index 0000000..de3ac84
--- /dev/null
+++ b/llama.cpp/.devops/nix/package-gguf-py.nix
@@ -0,0 +1,38 @@
+{
+ lib,
+ llamaVersion,
+ numpy,
+ tqdm,
+ requests,
+ sentencepiece,
+ pyyaml,
+ poetry-core,
+ buildPythonPackage,
+ pytestCheckHook,
+}:
+
+buildPythonPackage {
+ pname = "gguf";
+ version = llamaVersion;
+ pyproject = true;
+ nativeBuildInputs = [ poetry-core ];
+ propagatedBuildInputs = [
+ numpy
+ tqdm
+ sentencepiece
+ pyyaml
+ requests
+ ];
+ src = lib.cleanSource ../../gguf-py;
+ pythonImportsCheck = [
+ "numpy"
+ "gguf"
+ ];
+ nativeCheckInputs = [ pytestCheckHook ];
+ doCheck = true;
+ meta = with lib; {
+ description = "Python package for writing binary files in the GGUF format";
+ license = licenses.mit;
+ maintainers = [ maintainers.ditsuke ];
+ };
+}
diff --git a/llama.cpp/.devops/nix/package.nix b/llama.cpp/.devops/nix/package.nix
new file mode 100644
index 0000000..79a7270
--- /dev/null
+++ b/llama.cpp/.devops/nix/package.nix
@@ -0,0 +1,243 @@
+{
+ lib,
+ glibc,
+ config,
+ stdenv,
+ runCommand,
+ cmake,
+ ninja,
+ pkg-config,
+ git,
+ mpi,
+ blas,
+ cudaPackages,
+ autoAddDriverRunpath,
+ darwin,
+ rocmPackages,
+ vulkan-headers,
+ vulkan-loader,
+ curl,
+ shaderc,
+ useBlas ?
+ builtins.all (x: !x) [
+ useCuda
+ useMetalKit
+ useRocm
+ useVulkan
+ ]
+ && blas.meta.available,
+ useCuda ? config.cudaSupport,
+ useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
+ # Increases the runtime closure size by ~700M
+ useMpi ? false,
+ useRocm ? config.rocmSupport,
+ rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
+ useVulkan ? false,
+ useRpc ? false,
+ llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
+
+ # It's necessary to consistently use backendStdenv when building with CUDA support,
+ # otherwise we get libstdc++ errors downstream.
+ effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
+ enableStatic ? effectiveStdenv.hostPlatform.isStatic,
+ precompileMetalShaders ? false,
+}:
+
+let
+ inherit (lib)
+ cmakeBool
+ cmakeFeature
+ optionalAttrs
+ optionals
+ strings
+ ;
+
+ stdenv = throw "Use effectiveStdenv instead";
+
+ suffices =
+ lib.optionals useBlas [ "BLAS" ]
+ ++ lib.optionals useCuda [ "CUDA" ]
+ ++ lib.optionals useMetalKit [ "MetalKit" ]
+ ++ lib.optionals useMpi [ "MPI" ]
+ ++ lib.optionals useRocm [ "ROCm" ]
+ ++ lib.optionals useVulkan [ "Vulkan" ];
+
+ pnameSuffix =
+ strings.optionalString (suffices != [ ])
+ "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
+ descriptionSuffix = strings.optionalString (
+ suffices != [ ]
+ ) ", accelerated with ${strings.concatStringsSep ", " suffices}";
+
+ xcrunHost = runCommand "xcrunHost" { } ''
+ mkdir -p $out/bin
+ ln -s /usr/bin/xcrun $out/bin
+ '';
+
+ # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
+ # separately
+ darwinBuildInputs =
+ with darwin.apple_sdk.frameworks;
+ [
+ Accelerate
+ CoreVideo
+ CoreGraphics
+ ]
+ ++ optionals useMetalKit [ MetalKit ];
+
+ cudaBuildInputs = with cudaPackages; [
+ cuda_cudart
+ cuda_cccl # <nv/target>
+ libcublas
+ ];
+
+ rocmBuildInputs = with rocmPackages; [
+ clr
+ hipblas
+ rocblas
+ ];
+
+ vulkanBuildInputs = [
+ vulkan-headers
+ vulkan-loader
+ shaderc
+ ];
+in
+
+effectiveStdenv.mkDerivation (finalAttrs: {
+ pname = "llama-cpp${pnameSuffix}";
+ version = llamaVersion;
+
+ # Note: none of the files discarded here are visible in the sandbox or
+ # affect the output hash. This also means they can be modified without
+ # triggering a rebuild.
+ src = lib.cleanSourceWith {
+ filter =
+ name: type:
+ let
+ noneOf = builtins.all (x: !x);
+ baseName = baseNameOf name;
+ in
+ noneOf [
+ (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
+      (lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
+ (lib.hasPrefix "." baseName) # Skip hidden files and directories
+ (baseName == "flake.lock")
+ ];
+ src = lib.cleanSource ../../.;
+ };
+
+ postPatch = ''
+ '';
+
+ # With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
+ # `default.metallib` may be compiled with Metal compiler from XCode
+  # and we need to escape the sandbox on macOS to access the Metal compiler.
+  # `xcrun` is used to find the path of the Metal compiler, which is variable
+  # and not on $PATH.
+ # see https://github.com/ggml-org/llama.cpp/pull/6118 for discussion
+ __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
+
+ nativeBuildInputs =
+ [
+ cmake
+ ninja
+ pkg-config
+ git
+ ]
+ ++ optionals useCuda [
+ cudaPackages.cuda_nvcc
+
+ autoAddDriverRunpath
+ ]
+ ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
+ ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
+
+ buildInputs =
+ optionals effectiveStdenv.isDarwin darwinBuildInputs
+ ++ optionals useCuda cudaBuildInputs
+ ++ optionals useMpi [ mpi ]
+ ++ optionals useRocm rocmBuildInputs
+ ++ optionals useBlas [ blas ]
+ ++ optionals useVulkan vulkanBuildInputs;
+
+ cmakeFlags =
+ [
+ (cmakeBool "LLAMA_BUILD_SERVER" true)
+ (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
+ (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+ (cmakeBool "GGML_NATIVE" false)
+ (cmakeBool "GGML_BLAS" useBlas)
+ (cmakeBool "GGML_CUDA" useCuda)
+ (cmakeBool "GGML_HIP" useRocm)
+ (cmakeBool "GGML_METAL" useMetalKit)
+ (cmakeBool "GGML_VULKAN" useVulkan)
+ (cmakeBool "GGML_STATIC" enableStatic)
+ (cmakeBool "GGML_RPC" useRpc)
+ ]
+ ++ optionals useCuda [
+ (
+ with cudaPackages.flags;
+ cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
+ builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
+ )
+ )
+ ]
+ ++ optionals useRocm [
+ (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+ (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
+ ]
+ ++ optionals useMetalKit [
+ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
+ (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+ ];
+
+ # Environment variables needed for ROCm
+ env = optionalAttrs useRocm {
+ ROCM_PATH = "${rocmPackages.clr}";
+ HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
+ };
+
+ # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
+ # if they haven't been added yet.
+ postInstall = ''
+ mkdir -p $out/include
+ cp $src/include/llama.h $out/include/
+ '';
+
+ meta = {
+ # Configurations we don't want even the CI to evaluate. Results in the
+ # "unsupported platform" messages. This is mostly a no-op, because
+ # cudaPackages would've refused to evaluate anyway.
+ badPlatforms = optionals useCuda lib.platforms.darwin;
+
+ # Configurations that are known to result in build failures. Can be
+ # overridden by importing Nixpkgs with `allowBroken = true`.
+ broken = (useMetalKit && !effectiveStdenv.isDarwin);
+
+ description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
+ homepage = "https://github.com/ggml-org/llama.cpp/";
+ license = lib.licenses.mit;
+
+ # Accommodates `nix run` and `lib.getExe`
+ mainProgram = "llama-cli";
+
+ # These people might respond, on the best effort basis, if you ping them
+ # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
+ # Consider adding yourself to this list if you want to ensure this flake
+ # stays maintained and you're willing to invest your time. Do not add
+ # other people without their consent. Consider removing people after
+ # they've been unreachable for long periods of time.
+
+ # Note that lib.maintainers is defined in Nixpkgs, but you may just add
+ # an attrset following the same format as in
+ # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
+ maintainers = with lib.maintainers; [
+ philiptaron
+ SomeoneSerge
+ ];
+
+ # Extend `badPlatforms` instead
+ platforms = lib.platforms.all;
+ };
+})
diff --git a/llama.cpp/.devops/nix/python-scripts.nix b/llama.cpp/.devops/nix/python-scripts.nix
new file mode 100644
index 0000000..56ea182
--- /dev/null
+++ b/llama.cpp/.devops/nix/python-scripts.nix
@@ -0,0 +1,66 @@
+{
+ lib,
+ stdenv,
+ buildPythonPackage,
+ poetry-core,
+ mkShell,
+ python3Packages,
+ gguf-py,
+}@inputs:
+
+let
+ llama-python-deps = with python3Packages; [
+ numpy
+ sentencepiece
+ transformers
+ protobuf
+ torchWithoutCuda
+ gguf-py
+ tqdm
+
+ # for scripts/compare-llama-bench.py
+ gitpython
+ tabulate
+
+ # for examples/pydantic-models-to-grammar-examples.py
+ docstring-parser
+ pydantic
+
+ ];
+
+ llama-python-test-deps = with python3Packages; [
+ # Server bench
+ matplotlib
+
+ # server tests
+ openai
+ pytest
+ prometheus-client
+ ];
+in
+
+buildPythonPackage ({
+ pname = "llama-scripts";
+ version = "0.0.0";
+ pyproject = true;
+
+  # NOTE: The files filtered out here are not visible in the build sandbox, nor
+  # do they affect the output hash. They can be modified without triggering a rebuild.
+ src = lib.cleanSourceWith {
+ filter =
+ name: type:
+ let
+ any = builtins.any (x: x);
+ baseName = builtins.baseNameOf name;
+ in
+ any [
+ (lib.hasSuffix ".py" name)
+ (baseName == "README.md")
+ (baseName == "pyproject.toml")
+ ];
+ src = lib.cleanSource ../../.;
+ };
+ nativeBuildInputs = [ poetry-core ];
+ nativeCheckInputs = llama-python-test-deps;
+ dependencies = llama-python-deps;
+})
diff --git a/llama.cpp/.devops/nix/scope.nix b/llama.cpp/.devops/nix/scope.nix
new file mode 100644
index 0000000..b4328a7
--- /dev/null
+++ b/llama.cpp/.devops/nix/scope.nix
@@ -0,0 +1,35 @@
+{
+ lib,
+ newScope,
+ python3,
+ llamaVersion ? "0.0.0",
+}:
+
+let
+ pythonPackages = python3.pkgs;
+in
+
+# We're using `makeScope` instead of just writing out an attrset
+# because it allows users to apply overlays later using `overrideScope'`.
+# Cf. https://noogle.dev/f/lib/makeScope
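+#
+# For example, a downstream consumer could override build options later on
+# (hypothetical override; `useVulkan` is an option defined in ./package.nix):
+#
+#   llamaPackages.overrideScope' (final: prev: {
+#     llama-cpp = prev.llama-cpp.override { useVulkan = true; };
+#   })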
+
+lib.makeScope newScope (self: {
+ inherit llamaVersion;
+ gguf-py = self.callPackage ./package-gguf-py.nix {
+ inherit (pythonPackages)
+ numpy
+ tqdm
+ sentencepiece
+ pyyaml
+ pytestCheckHook
+ requests
+ buildPythonPackage
+ poetry-core
+ ;
+ };
+ python-scripts = self.callPackage ./python-scripts.nix { inherit (pythonPackages) buildPythonPackage poetry-core; };
+ llama-cpp = self.callPackage ./package.nix { };
+ docker = self.callPackage ./docker.nix { };
+ docker-min = self.callPackage ./docker.nix { interactive = false; };
+ sif = self.callPackage ./sif.nix { };
+})
diff --git a/llama.cpp/.devops/nix/sif.nix b/llama.cpp/.devops/nix/sif.nix
new file mode 100644
index 0000000..7a5e1dd
--- /dev/null
+++ b/llama.cpp/.devops/nix/sif.nix
@@ -0,0 +1,27 @@
+{
+ lib,
+ singularity-tools,
+ llama-cpp,
+ bashInteractive,
+ interactive ? false,
+}:
+
+let
+ optionalInt = cond: x: if cond then x else 0;
+in
+singularity-tools.buildImage rec {
+ inherit (llama-cpp) name;
+ contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
+
+ # These are excessive (but safe) for most variants. Building singularity
+ # images requires superuser privileges, so we build them inside a VM in a
+ # writable image of pre-determined size.
+ #
+ # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
+ #
+ # Expected image sizes:
+ # - cpu/blas: 150M,
+ # - cuda, all gencodes: 560M,
+ diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
+ memSize = diskSize;
+}
diff --git a/llama.cpp/.devops/rocm.Dockerfile b/llama.cpp/.devops/rocm.Dockerfile
new file mode 100644
index 0000000..14936f8
--- /dev/null
+++ b/llama.cpp/.devops/rocm.Dockerfile
@@ -0,0 +1,114 @@
+ARG UBUNTU_VERSION=24.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=7.0
+ARG AMDGPU_VERSION=7.0
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+### Build image
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+# gfx803, gfx900, gfx906, gfx1032, gfx1101 and gfx1102 are not officially supported;
+# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html
+
+ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
+#ARG ROCM_DOCKER_ARCH='gfx1151'
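+#
+# Building for a single architecture keeps the image considerably smaller,
+# e.g. (illustrative value, pick the gfx target matching your GPU):
+#   docker build -f .devops/rocm.Dockerfile --build-arg ROCM_DOCKER_ARCH=gfx1100 \
+#     --target server -t llama.cpp:rocm-server .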
+
+# Set ROCm architectures
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
+
+RUN apt-get update \
+ && apt-get install -y \
+ build-essential \
+ cmake \
+ git \
+ libssl-dev \
+ curl \
+ libgomp1
+
+WORKDIR /app
+
+COPY . .
+
+RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
+ cmake -S . -B build \
+ -DGGML_HIP=ON \
+ -DGGML_HIP_ROCWMMA_FATTN=ON \
+ -DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \
+ -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \
+ -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
+ && cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib \
+ && find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+ && cp build/bin/* /app/full \
+ && cp *.py /app/full \
+ && cp -r gguf-py /app/full \
+ && cp -r requirements /app/full \
+ && cp requirements.txt /app/full \
+ && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ${BASE_ROCM_DEV_CONTAINER} AS base
+
+RUN apt-get update \
+ && apt-get install -y libgomp1 curl\
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+ && apt-get install -y \
+ git \
+ python3-pip \
+ python3 \
+ python3-wheel\
+ && pip install --break-system-packages --upgrade setuptools \
+ && pip install --break-system-packages -r requirements.txt \
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/llama.cpp/.devops/s390x.Dockerfile b/llama.cpp/.devops/s390x.Dockerfile
new file mode 100644
index 0000000..757cd97
--- /dev/null
+++ b/llama.cpp/.devops/s390x.Dockerfile
@@ -0,0 +1,126 @@
+ARG GCC_VERSION=15.2.0
+ARG UBUNTU_VERSION=24.04
+
+### Build Llama.cpp stage
+FROM gcc:${GCC_VERSION} AS build
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
+ apt update -y && \
+ apt upgrade -y && \
+ apt install -y --no-install-recommends \
+ git cmake ccache ninja-build \
+ # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
+ libopenblas-dev libssl-dev && \
+ rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+COPY . .
+
+RUN --mount=type=cache,target=/root/.ccache \
+ --mount=type=cache,target=/app/build \
+ cmake -S . -B build -G Ninja \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ -DLLAMA_BUILD_TESTS=OFF \
+ -DGGML_NATIVE=OFF \
+ -DGGML_BACKEND_DL=ON \
+ -DGGML_CPU_ALL_VARIANTS=ON \
+ -DGGML_BLAS=ON \
+ -DGGML_BLAS_VENDOR=OpenBLAS && \
+ cmake --build build --config Release -j $(nproc) && \
+ cmake --install build --prefix /opt/llama.cpp
+
+COPY *.py /opt/llama.cpp/bin
+COPY .devops/tools.sh /opt/llama.cpp/bin
+
+COPY gguf-py /opt/llama.cpp/gguf-py
+COPY requirements.txt /opt/llama.cpp/gguf-py
+COPY requirements /opt/llama.cpp/gguf-py/requirements
+
+
+### Collect all llama.cpp binaries, libraries and distro libraries
+FROM scratch AS collector
+
+# Copy llama.cpp binaries and libraries
+COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
+COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
+COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
+
+
+### Base image
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
+ apt update -y && \
+ apt install -y --no-install-recommends \
+ # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
+ # See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
+ curl libgomp1 libopenblas-dev && \
+ apt autoremove -y && \
+ apt clean -y && \
+ rm -rf /tmp/* /var/tmp/* && \
+ find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
+ find /var/cache -type f -delete
+
+# Copy llama.cpp libraries
+COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu
+
+
+### Full
+FROM base AS full
+
+ENV PATH="/root/.cargo/bin:${PATH}"
+WORKDIR /app
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
+ apt update -y && \
+ apt install -y \
+ git cmake libjpeg-dev \
+ python3 python3-pip python3-dev && \
+ apt autoremove -y && \
+ apt clean -y && \
+ rm -rf /tmp/* /var/tmp/* && \
+ find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
+ find /var/cache -type f -delete
+
+RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
+
+COPY --from=collector /llama.cpp/bin /app
+COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
+
+RUN pip install --no-cache-dir --break-system-packages \
+ -r /app/gguf-py/requirements.txt
+
+ENTRYPOINT [ "/app/tools.sh" ]
+
+
+### CLI Only
+FROM base AS light
+
+WORKDIR /llama.cpp/bin
+
+# Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
+
+ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
+
+
+### Server
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+WORKDIR /llama.cpp/bin
+
+# Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
+
+EXPOSE 8080
+
+ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]
diff --git a/llama.cpp/.devops/tools.sh b/llama.cpp/.devops/tools.sh
new file mode 100755
index 0000000..cc5ee17
--- /dev/null
+++ b/llama.cpp/.devops/tools.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+set -e
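+
+# This script is used as the ENTRYPOINT of the "full" images; everything after
+# the image name is forwarded to it, e.g. (illustrative, paths and tags assumed):
+#   docker run -v /models:/models <image> --convert --outtype f16 /models/7B/
+#   docker run -p 8080:8080 -v /models:/models <image> --server -m /models/7B/model-q4_0.gguf --port 8080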
+
+# Read the first argument into a variable
+arg1="$1"
+
+# Shift the arguments to remove the first one
+shift
+
+if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
+ exec python3 ./convert_hf_to_gguf.py "$@"
+elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
+ exec ./llama-quantize "$@"
+elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
+ exec ./llama-cli "$@"
+elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
+ exec ./llama-completion "$@"
+elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
+ exec ./llama-bench "$@"
+elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
+ exec ./llama-perplexity "$@"
+elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
+ echo "Converting PTH to GGML..."
+ for i in $(ls $1/$2/ggml-model-f16.bin*); do
+ if [ -f "${i/f16/q4_0}" ]; then
+ echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
+ else
+ echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
+ exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+ fi
+ done
+elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
+ exec ./llama-server "$@"
+else
+ echo "Unknown command: $arg1"
+ echo "Available commands: "
+ echo " --run (-r): Run a model (chat) previously converted into ggml"
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin"
+ echo " --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
+ echo " --bench (-b): Benchmark the performance of the inference for various parameters."
+ echo " ex: -m model.gguf"
+ echo " --perplexity (-p): Measure the perplexity of a model over a given text."
+ echo " ex: -m model.gguf -f file.txt"
+ echo " --convert (-c): Convert a llama model into ggml"
+ echo " ex: --outtype f16 \"/models/7B/\" "
+ echo " --quantize (-q): Optimize with quantization process ggml"
+ echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
+ echo " --all-in-one (-a): Execute --convert & --quantize"
+ echo " ex: \"/models/\" 7B"
+ echo " --server (-s): Run a model on the server"
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080"
+fi
diff --git a/llama.cpp/.devops/vulkan.Dockerfile b/llama.cpp/.devops/vulkan.Dockerfile
new file mode 100644
index 0000000..5d6c87e
--- /dev/null
+++ b/llama.cpp/.devops/vulkan.Dockerfile
@@ -0,0 +1,90 @@
+ARG UBUNTU_VERSION=26.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget xz-utils
+
+# Install SSL and Vulkan SDK dependencies
+RUN apt install -y libssl-dev curl \
+ libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc
+
+# Build it
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
+ cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+ find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+ && cp build/bin/* /app/full \
+ && cp *.py /app/full \
+ && cp -r gguf-py /app/full \
+ && cp -r requirements /app/full \
+ && cp requirements.txt /app/full \
+ && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ubuntu:$UBUNTU_VERSION AS base
+
+RUN apt-get update \
+ && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
+ libglvnd0 libgl1 libglx0 libegl1 libgles2 \
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+ && apt-get install -y \
+ build-essential \
+ git \
+ python3 \
+ python3-dev \
+ python3-pip \
+ python3-wheel \
+ && pip install --break-system-packages --upgrade setuptools \
+ && pip install --break-system-packages -r requirements.txt \
+ && apt autoremove -y \
+ && apt clean -y \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]