server-llm.sh
1 #!/bin/bash 2 # 3 # Helper script for deploying llama.cpp server with a single Bash command 4 # 5 # - Works on Linux and macOS 6 # - Supports: CPU, CUDA, Metal 7 # - Can run all GGUF models from HuggingFace 8 # - Can serve requests in parallel 9 # - Always builds latest llama.cpp from GitHub 10 # 11 # Limitations 12 # 13 # - Chat templates are poorly supported (base models recommended) 14 # - Might be unstable! 15 # 16 # Usage: 17 # ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive] 18 # 19 # --port: port number, default is 8888 20 # --repo: path to a repo containing GGUF model files 21 # --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input 22 # --backend: cpu, cuda, metal, depends on the OS 23 # --gpu-id: gpu id, default is 0 24 # --n-parallel: number of parallel requests, default is 8 25 # --n-kv: KV cache size, default is 4096 26 # --verbose: verbose output 27 # --non-interactive: run without asking a permission to run 28 # 29 # Example: 30 # 31 # bash -c "$(curl -s https://ggml.ai/server-llm.sh)" 32 # 33 34 set -e 35 36 # required utils: curl, git, make 37 if ! command -v curl &> /dev/null; then 38 printf "[-] curl not found\n" 39 exit 1 40 fi 41 if ! command -v git &> /dev/null; then 42 printf "[-] git not found\n" 43 exit 1 44 fi 45 if ! command -v make &> /dev/null; then 46 printf "[-] make not found\n" 47 exit 1 48 fi 49 50 # parse arguments 51 is_interactive=1 52 port=8888 53 repo="" 54 wtype="" 55 backend="cpu" 56 57 # if macOS, use metal backend by default 58 if [[ "$OSTYPE" == "darwin"* ]]; then 59 backend="metal" 60 elif command -v nvcc &> /dev/null; then 61 backend="cuda" 62 fi 63 64 gpu_id=0 65 n_parallel=8 66 n_kv=4096 67 verbose=0 68 69 function print_usage { 70 printf "Usage:\n" 71 printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]\n\n" 72 printf " --port: port number, default is 8888\n" 73 printf " --repo: path to a repo containing GGUF model files\n" 74 printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n" 75 printf " --backend: cpu, cuda, metal, depends on the OS\n" 76 printf " --gpu-id: gpu id, default is 0\n" 77 printf " --n-parallel: number of parallel requests, default is 8\n" 78 printf " --n-kv: KV cache size, default is 4096\n" 79 printf " --verbose: verbose output\n\n" 80 printf " --non-interactive: run without asking a permission to run\n" 81 printf "Example:\n\n" 82 printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n' 83 } 84 85 while [[ $# -gt 0 ]]; do 86 key="$1" 87 case $key in 88 --non-interactive) 89 is_interactive=0 90 shift 91 ;; 92 --port) 93 port="$2" 94 shift 95 shift 96 ;; 97 --repo) 98 repo="$2" 99 shift 100 shift 101 ;; 102 --wtype) 103 wtype="$2" 104 shift 105 shift 106 ;; 107 --backend) 108 backend="$2" 109 shift 110 shift 111 ;; 112 --gpu-id) 113 gpu_id="$2" 114 shift 115 shift 116 ;; 117 --n-parallel) 118 n_parallel="$2" 119 shift 120 shift 121 ;; 122 --n-kv) 123 n_kv="$2" 124 shift 125 shift 126 ;; 127 --verbose) 128 verbose=1 129 shift 130 ;; 131 --help) 132 print_usage 133 exit 0 134 ;; 135 *) 136 echo "Unknown argument: $key" 137 print_usage 138 exit 1 139 ;; 140 esac 141 done 142 143 # available weights types 144 wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K") 145 146 wfiles=() 147 for wt in "${wtypes[@]}"; do 148 wfiles+=("") 149 done 150 151 # map wtype input to index 152 if [[ ! -z "$wtype" ]]; then 153 iw=-1 154 is=0 155 for wt in "${wtypes[@]}"; do 156 # uppercase 157 uwt=$(echo "$wt" | tr '[:lower:]' '[:upper:]') 158 if [[ "$uwt" == "$wtype" ]]; then 159 iw=$is 160 break 161 fi 162 is=$((is+1)) 163 done 164 165 if [[ $iw -eq -1 ]]; then 166 printf "[-] Invalid weight type: %s\n" "$wtype" 167 exit 1 168 fi 169 170 wtype="$iw" 171 fi 172 173 # sample repos 174 repos=( 175 "https://huggingface.co/TheBloke/Llama-2-7B-GGUF" 176 "https://huggingface.co/TheBloke/Llama-2-13B-GGUF" 177 "https://huggingface.co/TheBloke/Llama-2-70B-GGUF" 178 "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF" 179 "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF" 180 "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF" 181 "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF" 182 "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF" 183 "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF" 184 "https://huggingface.co/TheBloke/CausalLM-7B-GGUF" 185 ) 186 if [ $is_interactive -eq 1 ]; then 187 printf "\n" 188 printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n" 189 printf " Based on the options that follow, the script might download a model file\n" 190 printf " from the internet, which can be a few GBs in size. The script will also\n" 191 printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n" 192 printf "\n" 193 printf " Upon success, an HTTP server will be started and it will serve the selected\n" 194 printf " model using llama.cpp for demonstration purposes.\n" 195 printf "\n" 196 printf " Please note:\n" 197 printf "\n" 198 printf " - All new data will be stored in the current folder\n" 199 printf " - The server will be listening on all network interfaces\n" 200 printf " - The server will run with default settings which are not always optimal\n" 201 printf " - Do not judge the quality of a model based on the results from this script\n" 202 printf " - Do not use this script to benchmark llama.cpp\n" 203 printf " - Do not use this script in production\n" 204 printf " - This script is only for demonstration purposes\n" 205 printf "\n" 206 printf " If you don't know what you are doing, please press Ctrl-C to abort now\n" 207 printf "\n" 208 printf " Press Enter to continue ...\n\n" 209 210 read 211 fi 212 213 if [[ -z "$repo" ]]; then 214 printf "[+] No repo provided from the command line\n" 215 printf " Please select a number from the list below or enter an URL:\n\n" 216 217 is=0 218 for r in "${repos[@]}"; do 219 printf " %2d) %s\n" $is "$r" 220 is=$((is+1)) 221 done 222 223 # ask for repo until index of sample repo is provided or an URL 224 while [[ -z "$repo" ]]; do 225 printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n" 226 read -p "[+] Select repo: " repo 227 228 # check if the input is a number 229 if [[ "$repo" =~ ^[0-9]+$ ]]; then 230 if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then 231 repo="${repos[$repo]}" 232 else 233 printf "[-] Invalid repo index: %s\n" "$repo" 234 repo="" 235 fi 236 elif [[ "$repo" =~ ^https?:// ]]; then 237 repo="$repo" 238 else 239 printf "[-] Invalid repo URL: %s\n" "$repo" 240 repo="" 241 fi 242 done 243 fi 244 245 # remove suffix 246 repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g') 247 248 printf "[+] Checking for GGUF model files in %s\n" "$repo" 249 250 # find GGUF files in the source 251 # TODO: better logic 252 model_tree="${repo%/}/tree/main" 253 model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g') 254 255 # list all files in the provided git repo 256 printf "[+] Model files:\n\n" 257 for file in $model_files; do 258 # determine iw by grepping the filename with wtypes 259 iw=-1 260 is=0 261 for wt in "${wtypes[@]}"; do 262 # uppercase 263 ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]') 264 if [[ "$ufile" =~ "$wt" ]]; then 265 iw=$is 266 break 267 fi 268 is=$((is+1)) 269 done 270 271 if [[ $iw -eq -1 ]]; then 272 continue 273 fi 274 275 wfiles[$iw]="$file" 276 277 have=" " 278 if [[ -f "$file" ]]; then 279 have="*" 280 fi 281 282 printf " %2d) %s %s\n" $iw "$have" "$file" 283 done 284 285 wfile="${wfiles[$wtype]}" 286 287 # ask for weights type until provided and available 288 while [[ -z "$wfile" ]]; do 289 printf "\n" 290 read -p "[+] Select weight type: " wtype 291 wfile="${wfiles[$wtype]}" 292 293 if [[ -z "$wfile" ]]; then 294 printf "[-] Invalid weight type: %s\n" "$wtype" 295 wtype="" 296 fi 297 done 298 299 printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile" 300 301 url="${repo%/}/resolve/main/$wfile" 302 303 # check file if the model has been downloaded before 304 chk="$wfile.chk" 305 306 # check if we should download the file 307 # - if $wfile does not exist 308 # - if $wfile exists but $chk does not exist 309 # - if $wfile exists and $chk exists but $wfile is newer than $chk 310 # TODO: better logic using git lfs info 311 312 do_download=0 313 314 if [[ ! -f "$wfile" ]]; then 315 do_download=1 316 elif [[ ! -f "$chk" ]]; then 317 do_download=1 318 elif [[ "$wfile" -nt "$chk" ]]; then 319 do_download=1 320 fi 321 322 if [[ $do_download -eq 1 ]]; then 323 printf "[+] Downloading weights from %s\n" "$url" 324 325 # download the weights file 326 curl -o "$wfile" -# -L "$url" 327 328 # create a check file if successful 329 if [[ $? -eq 0 ]]; then 330 printf "[+] Creating check file %s\n" "$chk" 331 touch "$chk" 332 fi 333 else 334 printf "[+] Using cached weights %s\n" "$wfile" 335 fi 336 337 # get latest llama.cpp and build 338 339 printf "[+] Downloading latest llama.cpp\n" 340 341 llama_cpp_dir="__llama_cpp_port_${port}__" 342 343 if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then 344 # if the dir exists and there isn't a file "__ggml_script__" in it, abort 345 printf "[-] Directory %s already exists\n" "$llama_cpp_dir" 346 printf "[-] Please remove it and try again\n" 347 exit 1 348 elif [[ -d "$llama_cpp_dir" ]]; then 349 printf "[+] Directory %s already exists\n" "$llama_cpp_dir" 350 printf "[+] Using cached llama.cpp\n" 351 352 cd "$llama_cpp_dir" 353 git reset --hard 354 git fetch 355 git checkout origin/master 356 357 cd .. 358 else 359 printf "[+] Cloning llama.cpp\n" 360 361 git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir" 362 fi 363 364 # mark that that the directory is made by this script 365 touch "$llama_cpp_dir/__ggml_script__" 366 367 if [[ $verbose -eq 1 ]]; then 368 set -x 369 fi 370 371 # build 372 cd "$llama_cpp_dir" 373 374 make clean 375 376 log="--silent" 377 if [[ $verbose -eq 1 ]]; then 378 log="" 379 fi 380 381 if [[ "$backend" == "cuda" ]]; then 382 printf "[+] Building with CUDA backend\n" 383 LLAMA_CUDA=1 make -j llama-server $log 384 elif [[ "$backend" == "cpu" ]]; then 385 printf "[+] Building with CPU backend\n" 386 make -j llama-server $log 387 elif [[ "$backend" == "metal" ]]; then 388 printf "[+] Building with Metal backend\n" 389 make -j llama-server $log 390 else 391 printf "[-] Unknown backend: %s\n" "$backend" 392 exit 1 393 fi 394 395 # run the server 396 397 printf "[+] Running server\n" 398 399 args="" 400 if [[ "$backend" == "cuda" ]]; then 401 export CUDA_VISIBLE_DEVICES=$gpu_id 402 args="-ngl 999" 403 elif [[ "$backend" == "cpu" ]]; then 404 args="-ngl 0" 405 elif [[ "$backend" == "metal" ]]; then 406 args="-ngl 999" 407 else 408 printf "[-] Unknown backend: %s\n" "$backend" 409 exit 1 410 fi 411 412 if [[ $verbose -eq 1 ]]; then 413 args="$args --verbose" 414 fi 415 416 ./llama-server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args 417 418 exit 0