Diffstat (limited to 'llama.cpp/.devops/nix')
-rw-r--r--  llama.cpp/.devops/nix/apps.nix              |  21
-rw-r--r--  llama.cpp/.devops/nix/devshells.nix         |  52
-rw-r--r--  llama.cpp/.devops/nix/docker.nix            |  37
-rw-r--r--  llama.cpp/.devops/nix/jetson-support.nix    |  39
-rw-r--r--  llama.cpp/.devops/nix/nixpkgs-instances.nix |  45
-rw-r--r--  llama.cpp/.devops/nix/package-gguf-py.nix   |  38
-rw-r--r--  llama.cpp/.devops/nix/package.nix           | 243
-rw-r--r--  llama.cpp/.devops/nix/python-scripts.nix    |  66
-rw-r--r--  llama.cpp/.devops/nix/scope.nix             |  35
-rw-r--r--  llama.cpp/.devops/nix/sif.nix               |  27
10 files changed, 603 insertions(+), 0 deletions(-)
diff --git a/llama.cpp/.devops/nix/apps.nix b/llama.cpp/.devops/nix/apps.nix
new file mode 100644
index 0000000..0ecf19f
--- /dev/null
+++ b/llama.cpp/.devops/nix/apps.nix
@@ -0,0 +1,21 @@
+{
+ perSystem =
+ { config, lib, ... }:
+ {
+ apps =
+ let
+ inherit (config.packages) default;
+ binaries = [
+ "llama-cli"
+ "llama-embedding"
+ "llama-server"
+ "llama-quantize"
+ ];
+ mkApp = name: {
+ type = "app";
+ program = "${default}/bin/${name}";
+ };
+ in
+ lib.genAttrs binaries mkApp;
+ };
+}
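
With this module imported by the surrounding flake, each generated app resolves
to a binary of the default package and can be started through the flake CLI.
A sketch (the model path is a placeholder):

  $ nix run .#llama-server
  $ nix run .#llama-cli -- -m ./model.gguf -p "Hello"
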
diff --git a/llama.cpp/.devops/nix/devshells.nix b/llama.cpp/.devops/nix/devshells.nix
new file mode 100644
index 0000000..bfd304a
--- /dev/null
+++ b/llama.cpp/.devops/nix/devshells.nix
@@ -0,0 +1,52 @@
+{ inputs, ... }:
+
+{
+ perSystem =
+ {
+ config,
+ lib,
+ system,
+ ...
+ }:
+ {
+ devShells =
+ let
+ pkgs = import inputs.nixpkgs { inherit system; };
+ stdenv = pkgs.stdenv;
+ scripts = config.packages.python-scripts;
+ in
+ lib.pipe (config.packages) [
+ (lib.concatMapAttrs (
+ name: package: {
+ ${name} = pkgs.mkShell {
+ name = "${name}";
+ inputsFrom = [ package ];
+ shellHook = ''
+ echo "Entering ${name} devShell"
+ '';
+ };
+ "${name}-extra" =
+ if (name == "python-scripts") then
+ null
+ else
+ pkgs.mkShell {
+ name = "${name}-extra";
+ inputsFrom = [
+ package
+ scripts
+ ];
+ # Extra packages that *may* be used by some scripts
+ packages = [
+ pkgs.python3Packages.tiktoken
+ ];
+ shellHook = ''
+ echo "Entering ${name} devShell"
+ addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
+ '';
+ };
+ }
+ ))
+ (lib.filterAttrs (name: value: value != null))
+ ];
+ };
+}
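
Every attribute of config.packages thus gains a plain shell and an "-extra"
variant (suppressed for python-scripts itself). A sketch of entering them,
assuming a package named "default" exists, as apps.nix above expects:

  $ nix develop .#default
  $ nix develop .#default-extra   # additionally pulls in python-scripts and tiktoken
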
diff --git a/llama.cpp/.devops/nix/docker.nix b/llama.cpp/.devops/nix/docker.nix
new file mode 100644
index 0000000..d607b45
--- /dev/null
+++ b/llama.cpp/.devops/nix/docker.nix
@@ -0,0 +1,37 @@
+{
+ lib,
+ dockerTools,
+ buildEnv,
+ llama-cpp,
+ interactive ? true,
+ coreutils,
+}:
+
+# A tar that can be fed into `docker load`:
+#
+# $ nix build .#llamaPackages.docker
+# $ docker load < result
+
+# For details and variations cf.
+# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
+# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
+# - https://nixery.dev/
+
+# Approximate (compressed) sizes, at the time of writing, are:
+#
+# .#llamaPackages.docker: 125M;
+# .#llamaPackagesCuda.docker: 537M;
+# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
+
+dockerTools.buildLayeredImage {
+ name = llama-cpp.pname;
+ tag = "latest";
+
+ contents =
+ [ llama-cpp ]
+ ++ lib.optionals interactive [
+ coreutils
+ dockerTools.binSh
+ dockerTools.caCertificates
+ ];
+}
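
Combining this with the build instructions in the comments above, a sketch of
loading and running the image; the tag is "latest", and the image name follows
llama-cpp.pname, so it may carry a backend suffix (e.g. llama-cpp-blas for the
default BLAS build):

  $ nix build .#llamaPackages.docker
  $ docker load < result
  $ docker run --rm -it llama-cpp-blas:latest llama-cli --help
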
diff --git a/llama.cpp/.devops/nix/jetson-support.nix b/llama.cpp/.devops/nix/jetson-support.nix
new file mode 100644
index 0000000..78e2e40
--- /dev/null
+++ b/llama.cpp/.devops/nix/jetson-support.nix
@@ -0,0 +1,39 @@
+{ inputs, ... }:
+{
+ perSystem =
+ {
+ config,
+ system,
+ lib,
+ pkgsCuda,
+ ...
+ }:
+ {
+ legacyPackages =
+ let
+ caps.llamaPackagesXavier = "7.2";
+ caps.llamaPackagesOrin = "8.7";
+ caps.llamaPackagesTX2 = "6.2";
+ caps.llamaPackagesNano = "5.3";
+
+ pkgsFor =
+ cap:
+ import inputs.nixpkgs {
+ inherit system;
+ config = {
+ cudaSupport = true;
+ cudaCapabilities = [ cap ];
+ cudaEnableForwardCompat = false;
+ inherit (pkgsCuda.config) allowUnfreePredicate;
+ };
+ };
+ in
+ builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
+
+ packages = lib.optionalAttrs (system == "aarch64-linux") {
+ jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
+ jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
+ jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
+ };
+ };
+}
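
A sketch of building for a specific Jetson device; the jetson-* shortcuts are
only defined when evaluating on aarch64-linux, while the per-device scopes stay
addressable explicitly (cf. the size table in docker.nix above):

  $ nix build .#jetson-orin
  $ nix build .#legacyPackages.aarch64-linux.llamaPackagesXavier.llama-cpp
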
diff --git a/llama.cpp/.devops/nix/nixpkgs-instances.nix b/llama.cpp/.devops/nix/nixpkgs-instances.nix
new file mode 100644
index 0000000..40cf58f
--- /dev/null
+++ b/llama.cpp/.devops/nix/nixpkgs-instances.nix
@@ -0,0 +1,45 @@
+{ inputs, ... }:
+{
+ # The _module.args definitions are passed on to modules as arguments. E.g.
+ # the module `{ pkgs, ... }: { /* config */ }` implicitly uses
+ # `_module.args.pkgs` (defined in this case by flake-parts).
+ perSystem =
+ { lib, system, ... }:
+ {
+ _module.args = {
+ # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
+ # again, the below creates several nixpkgs instances which the
+ # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
+ #
+ # This is currently "slow" and "expensive", on a certain scale.
+ # This also isn't "right" in that this hinders dependency injection at
+ # the level of flake inputs. This might get removed in the foreseeable
+ # future.
+ #
+ # Note that you can use these expressions without flakes
+ # (`pkgs.callPackage ./.devops/nix/scope.nix { }` is the entry point).
+
+ pkgsCuda = import inputs.nixpkgs {
+ inherit system;
+ # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
+ # and ucx are built with CUDA support)
+ config.cudaSupport = true;
+ config.allowUnfreePredicate =
+ p:
+ builtins.all (
+ license:
+ license.free
+ || builtins.elem license.shortName [
+ "CUDA EULA"
+ "cuDNN EULA"
+ ]
+ ) (p.meta.licenses or (lib.toList p.meta.license));
+ };
+ # Ensure dependencies use ROCm consistently
+ pkgsRocm = import inputs.nixpkgs {
+ inherit system;
+ config.rocmSupport = true;
+ };
+ };
+ };
+}
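
As the comment notes, these expressions also work against an arbitrary nixpkgs
instance without flakes; a minimal sketch using plain nix-build, run from the
llama.cpp source root:

  $ nix-build -E 'with import <nixpkgs> { }; callPackage ./.devops/nix/scope.nix { }' -A llama-cpp
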
diff --git a/llama.cpp/.devops/nix/package-gguf-py.nix b/llama.cpp/.devops/nix/package-gguf-py.nix
new file mode 100644
index 0000000..de3ac84
--- /dev/null
+++ b/llama.cpp/.devops/nix/package-gguf-py.nix
@@ -0,0 +1,38 @@
+{
+ lib,
+ llamaVersion,
+ numpy,
+ tqdm,
+ requests,
+ sentencepiece,
+ pyyaml,
+ poetry-core,
+ buildPythonPackage,
+ pytestCheckHook,
+}:
+
+buildPythonPackage {
+ pname = "gguf";
+ version = llamaVersion;
+ pyproject = true;
+ nativeBuildInputs = [ poetry-core ];
+ propagatedBuildInputs = [
+ numpy
+ tqdm
+ sentencepiece
+ pyyaml
+ requests
+ ];
+ src = lib.cleanSource ../../gguf-py;
+ pythonImportsCheck = [
+ "numpy"
+ "gguf"
+ ];
+ nativeCheckInputs = [ pytestCheckHook ];
+ doCheck = true;
+ meta = with lib; {
+ description = "Python package for writing binary files in the GGUF format";
+ license = licenses.mit;
+ maintainers = [ maintainers.ditsuke ];
+ };
+}
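
gguf-py is exposed through the scope defined in scope.nix below, so it can be
built on its own or mixed into a Python environment; a sketch, assuming the
scope is published as llamaPackages (the naming the docker.nix comments use)
and that the interpreter matches the scope's python3:

  $ nix build .#llamaPackages.gguf-py
  # or, inside a Nix expression:
  python3.withPackages (ps: [ llamaPackages.gguf-py ])
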
diff --git a/llama.cpp/.devops/nix/package.nix b/llama.cpp/.devops/nix/package.nix
new file mode 100644
index 0000000..79a7270
--- /dev/null
+++ b/llama.cpp/.devops/nix/package.nix
@@ -0,0 +1,243 @@
+{
+ lib,
+ glibc,
+ config,
+ stdenv,
+ runCommand,
+ cmake,
+ ninja,
+ pkg-config,
+ git,
+ mpi,
+ blas,
+ cudaPackages,
+ autoAddDriverRunpath,
+ darwin,
+ rocmPackages,
+ vulkan-headers,
+ vulkan-loader,
+ curl,
+ shaderc,
+ useBlas ?
+ builtins.all (x: !x) [
+ useCuda
+ useMetalKit
+ useRocm
+ useVulkan
+ ]
+ && blas.meta.available,
+ useCuda ? config.cudaSupport,
+ useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
+ # Increases the runtime closure size by ~700M
+ useMpi ? false,
+ useRocm ? config.rocmSupport,
+ rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
+ useVulkan ? false,
+ useRpc ? false,
+ llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
+
+ # It's necessary to consistently use backendStdenv when building with CUDA support,
+ # otherwise we get libstdc++ errors downstream.
+ effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
+ enableStatic ? effectiveStdenv.hostPlatform.isStatic,
+ precompileMetalShaders ? false,
+}:
+
+let
+ inherit (lib)
+ cmakeBool
+ cmakeFeature
+ optionalAttrs
+ optionals
+ strings
+ ;
+
+ stdenv = throw "Use effectiveStdenv instead";
+
+ suffices =
+ lib.optionals useBlas [ "BLAS" ]
+ ++ lib.optionals useCuda [ "CUDA" ]
+ ++ lib.optionals useMetalKit [ "MetalKit" ]
+ ++ lib.optionals useMpi [ "MPI" ]
+ ++ lib.optionals useRocm [ "ROCm" ]
+ ++ lib.optionals useVulkan [ "Vulkan" ];
+
+ pnameSuffix =
+ strings.optionalString (suffices != [ ])
+ "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
+ descriptionSuffix = strings.optionalString (
+ suffices != [ ]
+ ) ", accelerated with ${strings.concatStringsSep ", " suffices}";
+
+ xcrunHost = runCommand "xcrunHost" { } ''
+ mkdir -p $out/bin
+ ln -s /usr/bin/xcrun $out/bin
+ '';
+
+ # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
+ # separately
+ darwinBuildInputs =
+ with darwin.apple_sdk.frameworks;
+ [
+ Accelerate
+ CoreVideo
+ CoreGraphics
+ ]
+ ++ optionals useMetalKit [ MetalKit ];
+
+ cudaBuildInputs = with cudaPackages; [
+ cuda_cudart
+ cuda_cccl # <nv/target>
+ libcublas
+ ];
+
+ rocmBuildInputs = with rocmPackages; [
+ clr
+ hipblas
+ rocblas
+ ];
+
+ vulkanBuildInputs = [
+ vulkan-headers
+ vulkan-loader
+ shaderc
+ ];
+in
+
+effectiveStdenv.mkDerivation (finalAttrs: {
+ pname = "llama-cpp${pnameSuffix}";
+ version = llamaVersion;
+
+ # Note: none of the files discarded here are visible in the sandbox or
+ # affect the output hash. This also means they can be modified without
+ # triggering a rebuild.
+ src = lib.cleanSourceWith {
+ filter =
+ name: type:
+ let
+ noneOf = builtins.all (x: !x);
+ baseName = baseNameOf name;
+ in
+ noneOf [
+ (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
+ (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
+ (lib.hasPrefix "." baseName) # Skip hidden files and directories
+ (baseName == "flake.lock")
+ ];
+ src = lib.cleanSource ../../.;
+ };
+
+ postPatch = ''
+ '';
+
+ # With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
+ # `default.metallib` may be compiled with the Metal compiler from Xcode,
+ # and we need to escape the sandbox on macOS to access that compiler.
+ # `xcrun` is used to find the path of the Metal compiler, which is
+ # variable and not on $PATH.
+ # See https://github.com/ggml-org/llama.cpp/pull/6118 for discussion.
+ __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
+
+ nativeBuildInputs =
+ [
+ cmake
+ ninja
+ pkg-config
+ git
+ ]
+ ++ optionals useCuda [
+ cudaPackages.cuda_nvcc
+
+ autoAddDriverRunpath
+ ]
+ ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
+ ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
+
+ buildInputs =
+ optionals effectiveStdenv.isDarwin darwinBuildInputs
+ ++ optionals useCuda cudaBuildInputs
+ ++ optionals useMpi [ mpi ]
+ ++ optionals useRocm rocmBuildInputs
+ ++ optionals useBlas [ blas ]
+ ++ optionals useVulkan vulkanBuildInputs;
+
+ cmakeFlags =
+ [
+ (cmakeBool "LLAMA_BUILD_SERVER" true)
+ (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
+ (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+ (cmakeBool "GGML_NATIVE" false)
+ (cmakeBool "GGML_BLAS" useBlas)
+ (cmakeBool "GGML_CUDA" useCuda)
+ (cmakeBool "GGML_HIP" useRocm)
+ (cmakeBool "GGML_METAL" useMetalKit)
+ (cmakeBool "GGML_VULKAN" useVulkan)
+ (cmakeBool "GGML_STATIC" enableStatic)
+ (cmakeBool "GGML_RPC" useRpc)
+ ]
+ ++ optionals useCuda [
+ (
+ with cudaPackages.flags;
+ cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
+ builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
+ )
+ )
+ ]
+ ++ optionals useRocm [
+ (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+ (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
+ ]
+ ++ optionals useMetalKit [
+ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
+ (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+ ];
+
+ # Environment variables needed for ROCm
+ env = optionalAttrs useRocm {
+ ROCM_PATH = "${rocmPackages.clr}";
+ HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
+ };
+
+ # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
+ # if they haven't been added yet.
+ postInstall = ''
+ mkdir -p $out/include
+ cp $src/include/llama.h $out/include/
+ '';
+
+ meta = {
+ # Configurations we don't want even the CI to evaluate. These result in
+ # "unsupported platform" messages. This is mostly a no-op, because
+ # cudaPackages would've refused to evaluate anyway.
+ badPlatforms = optionals useCuda lib.platforms.darwin;
+
+ # Configurations that are known to result in build failures. Can be
+ # overridden by importing Nixpkgs with `allowBroken = true`.
+ broken = (useMetalKit && !effectiveStdenv.isDarwin);
+
+ description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
+ homepage = "https://github.com/ggml-org/llama.cpp/";
+ license = lib.licenses.mit;
+
+ # Accommodates `nix run` and `lib.getExe`
+ mainProgram = "llama-cli";
+
+ # These people might respond, on the best effort basis, if you ping them
+ # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
+ # Consider adding yourself to this list if you want to ensure this flake
+ # stays maintained and you're willing to invest your time. Do not add
+ # other people without their consent. Consider removing people after
+ # they've been unreachable for long periods of time.
+
+ # Note that lib.maintainers is defined in Nixpkgs, but you may just add
+ # an attrset following the same format as in
+ # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
+ maintainers = with lib.maintainers; [
+ philiptaron
+ SomeoneSerge
+ ];
+
+ # Extend `badPlatforms` instead
+ platforms = lib.platforms.all;
+ };
+})
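
All the use* knobs are ordinary function arguments, so variants can be derived
without touching this file; a sketch using .override, again assuming the scope
is published as llamaPackages:

  llamaPackages.llama-cpp.override {
    useVulkan = true;
    useRpc = true;
  }
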
diff --git a/llama.cpp/.devops/nix/python-scripts.nix b/llama.cpp/.devops/nix/python-scripts.nix
new file mode 100644
index 0000000..56ea182
--- /dev/null
+++ b/llama.cpp/.devops/nix/python-scripts.nix
@@ -0,0 +1,66 @@
+{
+ lib,
+ stdenv,
+ buildPythonPackage,
+ poetry-core,
+ mkShell,
+ python3Packages,
+ gguf-py,
+}@inputs:
+
+let
+ llama-python-deps = with python3Packages; [
+ numpy
+ sentencepiece
+ transformers
+ protobuf
+ torchWithoutCuda
+ gguf-py
+ tqdm
+
+ # for scripts/compare-llama-bench.py
+ gitpython
+ tabulate
+
+ # for examples/pydantic-models-to-grammar-examples.py
+ docstring-parser
+ pydantic
+
+ ];
+
+ llama-python-test-deps = with python3Packages; [
+ # Server bench
+ matplotlib
+
+ # server tests
+ openai
+ pytest
+ prometheus-client
+ ];
+in
+
+buildPythonPackage ({
+ pname = "llama-scripts";
+ version = "0.0.0";
+ pyproject = true;
+
+ # NOTE: The files filtered out here are not visible in the build sandbox, nor
+ # do they affect the output hash. They can be modified without triggering a rebuild.
+ src = lib.cleanSourceWith {
+ filter =
+ name: type:
+ let
+ any = builtins.any (x: x);
+ baseName = builtins.baseNameOf name;
+ in
+ any [
+ (lib.hasSuffix ".py" name)
+ (baseName == "README.md")
+ (baseName == "pyproject.toml")
+ ];
+ src = lib.cleanSource ../../.;
+ };
+ nativeBuildInputs = [ poetry-core ];
+ nativeCheckInputs = llama-python-test-deps;
+ dependencies = llama-python-deps;
+})
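
A sketch of consuming the scripts, either as a built package or through the
matching devShell that devshells.nix above generates (assuming python-scripts
is exposed under the flake's packages):

  $ nix build .#llamaPackages.python-scripts
  $ nix develop .#python-scripts
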
diff --git a/llama.cpp/.devops/nix/scope.nix b/llama.cpp/.devops/nix/scope.nix
new file mode 100644
index 0000000..b4328a7
--- /dev/null
+++ b/llama.cpp/.devops/nix/scope.nix
@@ -0,0 +1,35 @@
+{
+ lib,
+ newScope,
+ python3,
+ llamaVersion ? "0.0.0",
+}:
+
+let
+ pythonPackages = python3.pkgs;
+in
+
+# We're using `makeScope` instead of just writing out an attrset
+# because it allows users to apply overlays later using `overrideScope'`.
+# Cf. https://noogle.dev/f/lib/makeScope
+
+lib.makeScope newScope (self: {
+ inherit llamaVersion;
+ gguf-py = self.callPackage ./package-gguf-py.nix {
+ inherit (pythonPackages)
+ numpy
+ tqdm
+ sentencepiece
+ pyyaml
+ pytestCheckHook
+ requests
+ buildPythonPackage
+ poetry-core
+ ;
+ };
+ python-scripts = self.callPackage ./python-scripts.nix { inherit (pythonPackages) buildPythonPackage poetry-core; };
+ llama-cpp = self.callPackage ./package.nix { };
+ docker = self.callPackage ./docker.nix { };
+ docker-min = self.callPackage ./docker.nix { interactive = false; };
+ sif = self.callPackage ./sif.nix { };
+})
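
Because this is a makeScope scope, downstream variants are expressed as
overlays; a minimal sketch with overrideScope' that swaps the version seen by
every package in the scope:

  llamaPackages.overrideScope' (final: prev: {
    llamaVersion = "9999-dev";
  })
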
diff --git a/llama.cpp/.devops/nix/sif.nix b/llama.cpp/.devops/nix/sif.nix
new file mode 100644
index 0000000..7a5e1dd
--- /dev/null
+++ b/llama.cpp/.devops/nix/sif.nix
@@ -0,0 +1,27 @@
+{
+ lib,
+ singularity-tools,
+ llama-cpp,
+ bashInteractive,
+ interactive ? false,
+}:
+
+let
+ optionalInt = cond: x: if cond then x else 0;
+in
+singularity-tools.buildImage rec {
+ inherit (llama-cpp) name;
+ contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
+
+ # These are excessive (but safe) for most variants. Building singularity
+ # images requires superuser privileges, so we build them inside a VM in a
+ # writable image of pre-determined size.
+ #
+ # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
+ #
+ # Expected image sizes:
+ # - cpu/blas: 150M,
+ # - cuda, all gencodes: 560M,
+ diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
+ memSize = diskSize;
+}
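
A sketch of building and running the Singularity/Apptainer image, assuming the
llamaPackages naming used elsewhere in this diff and that the store output is
the image file itself (as with current nixpkgs singularity-tools):

  $ nix build .#llamaPackages.sif
  $ apptainer exec ./result llama-cli --help
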