author    Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
committer Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
commit    b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree      211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/examples/sycl
Engage!
Diffstat (limited to 'llama.cpp/examples/sycl')
-rw-r--r--  llama.cpp/examples/sycl/CMakeLists.txt      |   9
-rw-r--r--  llama.cpp/examples/sycl/README.md           |  41
-rwxr-xr-x  llama.cpp/examples/sycl/build.sh            |  23
-rw-r--r--  llama.cpp/examples/sycl/ls-sycl-device.cpp  |  13
-rwxr-xr-x  llama.cpp/examples/sycl/run-llama2.sh       |  31
-rwxr-xr-x  llama.cpp/examples/sycl/test.sh             | 130
-rw-r--r--  llama.cpp/examples/sycl/win-build-sycl.bat  |  31
-rw-r--r--  llama.cpp/examples/sycl/win-run-llama2.bat  |  11
-rw-r--r--  llama.cpp/examples/sycl/win-test.bat        |  11
9 files changed, 300 insertions, 0 deletions
diff --git a/llama.cpp/examples/sycl/CMakeLists.txt b/llama.cpp/examples/sycl/CMakeLists.txt
new file mode 100644
index 0000000..e4d5083
--- /dev/null
+++ b/llama.cpp/examples/sycl/CMakeLists.txt
@@ -0,0 +1,9 @@
+# MIT license
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+set(TARGET llama-ls-sycl-device)
+add_executable(${TARGET} ls-sycl-device.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/llama.cpp/examples/sycl/README.md b/llama.cpp/examples/sycl/README.md
new file mode 100644
index 0000000..8819d87
--- /dev/null
+++ b/llama.cpp/examples/sycl/README.md
@@ -0,0 +1,41 @@
+# llama.cpp/examples/sycl
+
+This example provides tooling for llama.cpp with the SYCL backend on Intel GPUs.
+
+## Tool
+
+|Tool Name|Function|Status|
+|-|-|-|
+|llama-ls-sycl-device|List all SYCL devices with ID, compute capability, max work group size, etc.|Supported|
+
+### llama-ls-sycl-device
+
+Lists all SYCL devices with ID, compute capability, max work group size, etc.
+
+1. Build llama.cpp with SYCL support for the specified target *(using GGML_SYCL_TARGET)*.
+
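+A minimal sketch of this step, mirroring `build.sh` in this directory (flags taken from that script; adjust for your target):
+
+```
+mkdir -p build && cd build
+source /opt/intel/oneapi/setvars.sh
+cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF
+cmake --build . --config Release
+```
+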
+2. Enable the oneAPI runtime environment *(if GGML_SYCL_TARGET is set to INTEL, the default)*:
+
+```
+source /opt/intel/oneapi/setvars.sh
+```
+
+3. Execute
+
+```
+./build/bin/llama-ls-sycl-device
+```
+
+Check the device IDs in the startup log, for example:
+
+```
+found 2 SYCL devices:
+| | | | |Max | |Max |Global | |
+| | | | |compute|Max work|sub |mem | |
+|ID| Device Type| Name|Version|units |group |group|size | Driver version|
+|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|
+| 0| [level_zero:gpu:0]| Intel Arc A770 Graphics| 1.3| 512| 1024| 32| 16225M| 1.3.29138|
+| 1| [level_zero:gpu:1]| Intel UHD Graphics 750| 1.3| 32| 512| 32| 62631M| 1.3.29138|
+
+```
+
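+To pin a run to one device from this list, the scripts in this directory set `ONEAPI_DEVICE_SELECTOR` before launching, e.g.:
+
+```
+export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+```
+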
diff --git a/llama.cpp/examples/sycl/build.sh b/llama.cpp/examples/sycl/build.sh
new file mode 100755
index 0000000..635e74f
--- /dev/null
+++ b/llama.cpp/examples/sycl/build.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# MIT license
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+mkdir -p build
+cd build
+source /opt/intel/oneapi/setvars.sh
+
+# for FP16
+#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference
+
+# for FP32
+cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF
+
+# build example/main
+#cmake --build . --config Release --target main
+
+# build example/llama-bench
+#cmake --build . --config Release --target llama-bench
+
+# build all binaries
+cmake --build . --config Release -j -v
diff --git a/llama.cpp/examples/sycl/ls-sycl-device.cpp b/llama.cpp/examples/sycl/ls-sycl-device.cpp
new file mode 100644
index 0000000..74a8b7f
--- /dev/null
+++ b/llama.cpp/examples/sycl/ls-sycl-device.cpp
@@ -0,0 +1,13 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+
+#include "ggml-sycl.h"
+
+int main() {
+ ggml_backend_sycl_print_sycl_devices();
+ return 0;
+}
diff --git a/llama.cpp/examples/sycl/run-llama2.sh b/llama.cpp/examples/sycl/run-llama2.sh
new file mode 100755
index 0000000..d33f82f
--- /dev/null
+++ b/llama.cpp/examples/sycl/run-llama2.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+# MIT license
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
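+
+# Default to the first Level Zero GPU (device 0); llama-ls-sycl-device lists the available IDs.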
+export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+source /opt/intel/oneapi/setvars.sh
+
+#export GGML_SYCL_DEBUG=1
+
+# ZES_ENABLE_SYSMAN=1 enables reading free GPU memory via sycl::aspect::ext_intel_free_memory. Recommended when --split-mode=layer (the default).
+
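+# Tunables: prompt, model path, GPU layers to offload, and context size.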
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=models/llama-2-7b.Q4_0.gguf
+NGL=99
+CONTEXT=4096
+
+# Allow allocating device memory buffers larger than 4GB.
+export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+
+LOAD_MODE='--mmap'
+if [ $# -gt 0 ]; then
+ GGML_SYCL_DEVICE=$1
+ echo "use $GGML_SYCL_DEVICE as main GPU"
+ #use signle GPU only
+ ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}
+
+else
+ #use multiple GPUs with same max compute units
+ ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
+fi
diff --git a/llama.cpp/examples/sycl/test.sh b/llama.cpp/examples/sycl/test.sh
new file mode 100755
index 0000000..140c191
--- /dev/null
+++ b/llama.cpp/examples/sycl/test.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+# MIT license
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+Help() {
+ cat << EOF
+Usage: $(basename "$0") [OPTIONS]
+
+This script processes files with specified options.
+
+Options:
+ -h, --help Display this help message and exit.
+ -c, --context <value> Set the context length. Larger values need more memory.
+ -p, --prompt <value> Prompt to start generation with.
+ -m, --model <value> Full model file path.
+ -mg,--main-gpu <value> Set the main GPU ID (0 to n) for single-GPU mode.
+ -sm,--split-mode <value> How to split the model across multiple GPUs, one of:
+ - none: use one GPU only
+ - layer (default): split layers and KV across GPUs
+ - row: split rows across GPUs
+ -ngl,--n-gpu-layers <value> Max. number of layers to store in VRAM (default: 99)
+ -lv,--log-verbosity <value> Set the verbosity threshold. Messages with a higher verbosity will be
+ ignored. Values:
+ - 0: generic output
+ - 1: error
+ - 2: warning
+ - 3: info
+ - 4: debug
+
+
+EOF
+}
+
+BIN_FILE=./build/bin/llama-completion
+SEED=0
+GPUS_SETTING=""
+
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=models/llama-2-7b.Q4_0.gguf
+NGL=99
+CONTEXT=4096
+GGML_SYCL_DEVICE=-1
+SPLIT_MODE=layer
+LOG_VERBOSE=3
+while [[ $# -gt 0 ]]; do
+ case "$1" in
+ -c|--context)
+ CONTEXT=$2
+ # Shift twice to consume both the option flag and its value
+ shift
+ shift
+ ;;
+ -p|--prompt)
+ INPUT_PROMPT="$2"
+ # Shift twice to consume both the option flag and its value
+ shift
+ shift
+ ;;
+ -m|--model)
+ MODEL_FILE="$2"
+ # Shift twice to consume both the option flag and its value
+ shift
+ shift
+ ;;
+ -mg|--main-gpu)
+ GGML_SYCL_DEVICE=$2
+ SPLIT_MODE=none
+ # Shift twice to consume both the option flag and its value
+ shift
+ shift
+ ;;
+ -sm|--split-mode)
+ SPLIT_MODE=$2
+ # Shift twice to consume both the option flag and its value
+ shift
+ shift
+ ;;
+ -ngl|--n-gpu-layers)
+ NGL=$2
+ # Shift twice to consume both the option flag and its value
+ shift
+ shift
+ ;;
+ -lv|--log-verbosity)
+ LOG_VERBOSE=$2
+ # Shift twice to consume both the option flag and its value
+ shift
+ shift
+ ;;
+ -h|--help)
+ Help
+ exit 0
+ ;;
+ *)
+ # Handle unknown options or stop processing options
+ echo "Invalid option: $1"
+ # Optional: exit script or shift to treat remaining as positional args
+ exit 1
+ ;;
+ esac
+done
+
+
+
+source /opt/intel/oneapi/setvars.sh
+
+#export GGML_SYCL_DEBUG=1
+
+# ZES_ENABLE_SYSMAN=1 enables reading free GPU memory via sycl::aspect::ext_intel_free_memory. Recommended when --split-mode=layer (the default).
+
+# Allow allocating device memory buffers larger than 4GB.
+export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"
+
+if [ $GGML_SYCL_DEVICE -ne -1 ]; then
+ echo "Use $GGML_SYCL_DEVICE as the main GPU"
+ # use a single GPU only
+ GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
+ export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
+ echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
+else
+ echo "Use all Intel GPUs, including iGPU & dGPU"
+fi
+
+echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
+ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
+
diff --git a/llama.cpp/examples/sycl/win-build-sycl.bat b/llama.cpp/examples/sycl/win-build-sycl.bat
new file mode 100644
index 0000000..fc8b33b
--- /dev/null
+++ b/llama.cpp/examples/sycl/win-build-sycl.bat
@@ -0,0 +1,31 @@
+
+:: MIT license
+:: Copyright (C) 2024 Intel Corporation
+:: SPDX-License-Identifier: MIT
+
+
+IF not exist build (mkdir build)
+cd build
+if %errorlevel% neq 0 goto ERROR
+
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+if %errorlevel% neq 0 goto ERROR
+
+:: for FP16
+:: faster for long-prompt inference
+:: cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+
+:: for FP32
+cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+if %errorlevel% neq 0 goto ERROR
+
+:: build all binary
+cmake --build . -j
+if %errorlevel% neq 0 goto ERROR
+
+cd ..
+exit /B 0
+
+:ERROR
+echo command error: %errorlevel%
+exit /B %errorlevel%
diff --git a/llama.cpp/examples/sycl/win-run-llama2.bat b/llama.cpp/examples/sycl/win-run-llama2.bat
new file mode 100644
index 0000000..1f2dab8
--- /dev/null
+++ b/llama.cpp/examples/sycl/win-run-llama2.bat
@@ -0,0 +1,11 @@
+:: MIT license
+:: Copyright (C) 2024 Intel Corporation
+:: SPDX-License-Identifier: MIT
+
+set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+
+:: Allow allocating device memory buffers larger than 4GB.
+set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+set LOAD_MODE="--mmap"
+.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%
diff --git a/llama.cpp/examples/sycl/win-test.bat b/llama.cpp/examples/sycl/win-test.bat
new file mode 100644
index 0000000..1f2dab8
--- /dev/null
+++ b/llama.cpp/examples/sycl/win-test.bat
@@ -0,0 +1,11 @@
+:: MIT license
+:: Copyright (C) 2024 Intel Corporation
+:: SPDX-License-Identifier: MIT
+
+set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+
+:: Allow allocating device memory buffers larger than 4GB.
+set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+set LOAD_MODE="--mmap"
+.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%