# sglang.conf

# SGLang global configuration
# Loaded by all sglang@<instance> units before the instance-specific config.
# Use this for hardware-specific and site-specific settings.
#
# SGLANG_OPTS: hardware/site flags (applied to all models)
# SGLANG_ARGS: model flags (set in per-instance /etc/sglang/<model>.conf)
#
# Tensor parallelism (set according to your GPU count):
#   --tp-size 1    Single GPU
#   --tp-size 2    Two GPUs (lower latency, more KV cache headroom; scaling is not exactly 2x)
#   --tp-size 4    Four GPUs
#   --tp-size 8    Eight GPUs
#
# Other useful options:
#   --port 30000                 Server port (default: 30000)
#   --mem-fraction-static 0.85   Fraction of GPU memory reserved for model weights + KV cache
#   --context-length 131072      Override model's default context length
#   --disable-radix-cache        Disable prefix caching
#   --disable-cuda-graph         Disable CUDA graphs
#
# See all options: python -m sglang.launch_server --help
# CUDA toolkit path (required by deep_gemm JIT compilation).
# Adjust if the CUDA toolkit is installed somewhere other than /opt/cuda.
CUDA_HOME=/opt/cuda

# Hardware/site flags applied to all models. Here only the server port is set.
# NOTE(review): this file is loaded by every sglang@<instance> unit, so
# instances running at the same time would presumably all get this port;
# confirm that per-instance configs override it where needed.
SGLANG_OPTS="--port 30000"