# Configuration for llama.cpp service
# Path to the llama-server binary
LLAMA_SERVER_BINARY="/usr/bin/llama-server"
# Flags passed to llama-server (individual flags are appended below)
LLAMA_SERVER_FLAGS="--port 8000"
# -m, --model FNAME: Model path (default: `models/7B/ggml-model-f16.gguf`)
LLAMA_SERVER_FLAGS+=" -m edit-etc-llama.cpp-service-config"
# General Options
# ----------------
# --verbose-prompt: Print a verbose prompt before generation (default: false)
# LLAMA_SERVER_FLAGS+=" --verbose-prompt"
# -t, --threads N: Number of threads to use during generation (default: -1)
# LLAMA_SERVER_FLAGS+=" -t 4"
# -tb, --threads-batch N: Number of threads for batch and prompt processing (default: same as --threads)
# LLAMA_SERVER_FLAGS+=" -tb 2"
# -C, --cpu-mask M: CPU affinity mask (default: "")
# LLAMA_SERVER_FLAGS+=" -C 0xff"
# -Cr, --cpu-range lo-hi: Range of CPUs for affinity
# LLAMA_SERVER_FLAGS+=" -Cr 0-3"
# --cpu-strict <0|1>: Use strict CPU placement (default: 0)
# LLAMA_SERVER_FLAGS+=" --cpu-strict 1"
# --prio N: Set process/thread priority: 0-normal, 1-medium, 2-high, 3-realtime (default: 0)
# LLAMA_SERVER_FLAGS+=" --prio 2"
# -c, --ctx-size N: Size of the prompt context (default: 4096, 0 = loaded from model)
# LLAMA_SERVER_FLAGS+=" -c 2048"
# -n, --predict, --n-predict N: Number of tokens to predict (default: -1, -1 = infinity)
# LLAMA_SERVER_FLAGS+=" -n 100"
# --keep N: Number of tokens to keep from the initial prompt (default: 0, -1 = all)
# LLAMA_SERVER_FLAGS+=" --keep 10"
# -b, --batch-size N: Logical maximum batch size (default: 2048)
# LLAMA_SERVER_FLAGS+=" -b 128"
# Advanced Options
# ----------------
# -fa, --flash-attn: Enable Flash Attention (default: disabled)
# LLAMA_SERVER_FLAGS+=" -fa"
# --no-perf: Disable internal libllama performance timings (default: false)
# LLAMA_SERVER_FLAGS+=" --no-perf"
# -f, --file FNAME: File containing the prompt (default: none)
# LLAMA_SERVER_FLAGS+=" -f /path/to/prompt.txt"
# --mlock: Force the system to keep the model in RAM
# LLAMA_SERVER_FLAGS+=" --mlock"
# GPU Options
# -----------
# -ngl, --gpu-layers N: Number of layers to store in VRAM
# LLAMA_SERVER_FLAGS+=" -ngl 10"
# -sm, --split-mode {none,layer,row}: Split model across multiple GPUs (default: layer)
# LLAMA_SERVER_FLAGS+=" -sm row"
# Logging Options
# ---------------
# --log-file FNAME: Log output to a file
# LLAMA_SERVER_FLAGS+=" --log-file /var/log/llama.cpp.log"
# --log-colors: Enable colored logging
# LLAMA_SERVER_FLAGS+=" --log-colors"
# --log-timestamps: Enable timestamps in log messages
# LLAMA_SERVER_FLAGS+=" --log-timestamps"
# Example Usage
# -------------
# Combine multiple flags by appending them to LLAMA_SERVER_FLAGS
# LLAMA_SERVER_FLAGS+=" -t 4 -c 2048 --verbose-prompt"