#!/bin/bash
#
# Helper script for deploying a llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal
# - Can run all GGUF models from HuggingFace
# - Can serve requests in parallel
# - Always builds the latest llama.cpp from GitHub
#
# Limitations
#
# - Chat templates are poorly supported (base models recommended)
# - Might be unstable!
#
# Usage:
#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [--non-interactive]
#
#   --port:            port number, default is 8888
#   --repo:            path to a repo containing GGUF model files
#   --wtype:           weights type (f16, q8_0, q4_0, q4_1), default is user input
#   --backend:         cpu, cuda, metal, depends on the OS
#   --gpu-id:          GPU id, default is 0
#   --n-parallel:      number of parallel requests, default is 8
#   --n-kv:            KV cache size, default is 4096
#   --verbose:         verbose output
#   --non-interactive: run without asking for confirmation
#
# Example:
#
#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
#
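# Or, when running this script locally, pass the options explicitly, e.g. using
# one of the sample repos listed further below:
#
#   ./server-llm.sh --port 8889 --repo https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF --wtype q4_0 --backend cpu
#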

set -e

# required utils: curl, git, make
for cmd in curl git make; do
    if ! command -v "$cmd" &> /dev/null; then
        printf "[-] %s not found\n" "$cmd"
        exit 1
    fi
done

# parse arguments
is_interactive=1
port=8888
repo=""
wtype=""
backend="cpu"

# on macOS use the Metal backend by default; otherwise prefer CUDA if nvcc is available
if [[ "$OSTYPE" == "darwin"* ]]; then
    backend="metal"
elif command -v nvcc &> /dev/null; then
    backend="cuda"
fi

gpu_id=0
n_parallel=8
n_kv=4096
verbose=0

function print_usage {
    printf "Usage:\n"
    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [--non-interactive]\n\n"
    printf "  --port:             port number, default is 8888\n"
    printf "  --repo:             path to a repo containing GGUF model files\n"
    printf "  --wtype:            weights type (f16, q8_0, q4_0, q4_1), default is user input\n"
    printf "  --backend:          cpu, cuda, metal, depends on the OS\n"
    printf "  --gpu-id:           GPU id, default is 0\n"
    printf "  --n-parallel:       number of parallel requests, default is 8\n"
    printf "  --n-kv:             KV cache size, default is 4096\n"
    printf "  --verbose:          verbose output\n"
    printf "  --non-interactive:  run without asking for confirmation\n\n"
    printf "Example:\n\n"
    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
}

while [[ $# -gt 0 ]]; do
    key="$1"
    case $key in
        --non-interactive)
            is_interactive=0
            shift
            ;;
        --port)
            port="$2"
            shift
            shift
            ;;
        --repo)
            repo="$2"
            shift
            shift
            ;;
        --wtype)
            wtype="$2"
            shift
            shift
            ;;
        --backend)
            backend="$2"
            shift
            shift
            ;;
        --gpu-id)
            gpu_id="$2"
            shift
            shift
            ;;
        --n-parallel)
            n_parallel="$2"
            shift
            shift
            ;;
        --n-kv)
            n_kv="$2"
            shift
            shift
            ;;
        --verbose)
            verbose=1
            shift
            ;;
        --help)
            print_usage
            exit 0
            ;;
        *)
            echo "Unknown argument: $key"
            print_usage
            exit 1
            ;;
    esac
done

# available weight types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")

wfiles=()
for wt in "${wtypes[@]}"; do
    wfiles+=("")
done

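# worked example: "--wtype q4_0" is uppercased to "Q4_0" below, which sits at
# index 2 of the wtypes table above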
# map the wtype input to an index in the wtypes table
if [[ -n "$wtype" ]]; then
    # uppercase the user input before matching, so that the lowercase values
    # documented in the usage (e.g. "q4_0") match the uppercase table entries
    wtype=$(echo "$wtype" | tr '[:lower:]' '[:upper:]')

    iw=-1
    is=0
    for wt in "${wtypes[@]}"; do
        if [[ "$wt" == "$wtype" ]]; then
            iw=$is
            break
        fi
        is=$((is+1))
    done

    if [[ $iw -eq -1 ]]; then
        printf "[-] Invalid weight type: %s\n" "$wtype"
        exit 1
    fi

    wtype="$iw"
fi

# sample repos
repos=(
    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)

if [[ $is_interactive -eq 1 ]]; then
    printf "\n"
    printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
    printf "    Based on the options that follow, the script might download a model file\n"
    printf "    from the internet, which can be a few GBs in size. The script will also\n"
    printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
    printf "\n"
    printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
    printf "    model using llama.cpp for demonstration purposes.\n"
    printf "\n"
    printf "    Please note:\n"
    printf "\n"
    printf "    - All new data will be stored in the current folder\n"
    printf "    - The server will be listening on all network interfaces\n"
    printf "    - The server will run with default settings which are not always optimal\n"
    printf "    - Do not judge the quality of a model based on the results from this script\n"
    printf "    - Do not use this script to benchmark llama.cpp\n"
    printf "    - Do not use this script in production\n"
    printf "    - This script is only for demonstration purposes\n"
    printf "\n"
    printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
    printf "\n"
    printf "    Press Enter to continue ...\n\n"

    read
fi

if [[ -z "$repo" ]]; then
    printf "[+] No repo provided from the command line\n"
    printf "    Please select a number from the list below or enter a URL:\n\n"

    is=0
    for r in "${repos[@]}"; do
        printf "    %2d) %s\n" $is "$r"
        is=$((is+1))
    done

    # ask for a repo until a sample-repo index or a URL is provided
    while [[ -z "$repo" ]]; do
        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
        read -p "[+] Select repo: " repo

        # check if the input is a number
        if [[ "$repo" =~ ^[0-9]+$ ]]; then
            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
                repo="${repos[$repo]}"
            else
                printf "[-] Invalid repo index: %s\n" "$repo"
                repo=""
            fi
        elif [[ ! "$repo" =~ ^https?:// ]]; then
            # neither a valid index nor a URL - ask again
            printf "[-] Invalid repo URL: %s\n" "$repo"
            repo=""
        fi
    done
fi

# strip a trailing /tree/main from the repo URL, if present
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//')

printf "[+] Checking for GGUF model files in %s\n" "$repo"

# find the GGUF model files by scraping the HTML of the repo's file tree
# TODO: better logic
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
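
# (sketch, not enabled) a sturdier alternative to scraping would be the
# HuggingFace API, which lists the repo files as JSON; assuming jq is available:
#
#   model_files=$(curl -s "https://huggingface.co/api/models/${repo#https://huggingface.co/}" \
#       | jq -r '.siblings[].rfilename' | grep -i '\.gguf$')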

# list all GGUF files found in the repo
printf "[+] Model files:\n\n"
for file in $model_files; do
    # determine iw by matching the uppercased filename against the wtypes table
    ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')

    iw=-1
    is=0
    for wt in "${wtypes[@]}"; do
        if [[ "$ufile" =~ "$wt" ]]; then
            iw=$is
            break
        fi
        is=$((is+1))
    done

    if [[ $iw -eq -1 ]]; then
        continue
    fi

    wfiles[$iw]="$file"

    # mark files that are already present locally with a "*"
    have=" "
    if [[ -f "$file" ]]; then
        have="*"
    fi

    printf "    %2d) %s %s\n" $iw "$have" "$file"
done

# if a weight type was given on the command line, preselect its file
# (indexing with an empty wtype would otherwise silently select index 0)
wfile=""
if [[ -n "$wtype" ]]; then
    wfile="${wfiles[$wtype]}"
fi

# ask for the weight type until a valid and available index is provided
while [[ -z "$wfile" ]]; do
    printf "\n"
    read -p "[+] Select weight type: " wtype

    # accept only a numeric index into the wtypes table
    if [[ "$wtype" =~ ^[0-9]+$ && "$wtype" -lt ${#wtypes[@]} ]]; then
        wfile="${wfiles[$wtype]}"
    fi

    if [[ -z "$wfile" ]]; then
        printf "[-] Invalid weight type: %s\n" "$wtype"
        wtype=""
    fi
done

printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"

url="${repo%/}/resolve/main/$wfile"

# marker file used to detect whether the model was already downloaded
chk="$wfile.chk"

# download the file only if:
# - $wfile does not exist, or
# - $wfile exists but $chk does not, or
# - both exist but $wfile is newer than $chk
# TODO: better logic using git lfs info

do_download=0

if [[ ! -f "$wfile" ]]; then
    do_download=1
elif [[ ! -f "$chk" ]]; then
    do_download=1
elif [[ "$wfile" -nt "$chk" ]]; then
    do_download=1
fi
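
# (sketch, not enabled) a more robust check could compare the local file size
# against the remote Content-Length before deciding to re-download, e.g.:
#
#   remote_size=$(curl -sIL "$url" | tr -d '\r' | awk 'tolower($1) == "content-length:" { s=$2 } END { print s }')
#   [[ -f "$wfile" ]] && [[ "$(wc -c < "$wfile")" -eq "$remote_size" ]] && do_download=0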

if [[ $do_download -eq 1 ]]; then
    printf "[+] Downloading weights from %s\n" "$url"

    # download the weights file ("set -e" aborts the script if curl fails)
    curl -o "$wfile" -# -L "$url"

    # create a check file on success
    printf "[+] Creating check file %s\n" "$chk"
    touch "$chk"
else
    printf "[+] Using cached weights %s\n" "$wfile"
fi

# get the latest llama.cpp and build

printf "[+] Downloading latest llama.cpp\n"

llama_cpp_dir="__llama_cpp_port_${port}__"

if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
    # the dir exists but was not created by this script - abort to avoid clobbering it
    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[-] Please remove it and try again\n"
    exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[+] Using cached llama.cpp\n"

    cd "$llama_cpp_dir"
    git reset --hard
    git fetch
    git checkout origin/master

    cd ..
else
    printf "[+] Cloning llama.cpp\n"

    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi

# mark that the directory was created by this script
touch "$llama_cpp_dir/__ggml_script__"

if [[ $verbose -eq 1 ]]; then
    set -x
fi

# build
cd "$llama_cpp_dir"

make clean

log="--silent"
if [[ $verbose -eq 1 ]]; then
    log=""
fi

if [[ "$backend" == "cuda" ]]; then
    printf "[+] Building with CUDA backend\n"
    LLAMA_CUDA=1 make -j llama-server $log
elif [[ "$backend" == "cpu" ]]; then
    printf "[+] Building with CPU backend\n"
    make -j llama-server $log
elif [[ "$backend" == "metal" ]]; then
    printf "[+] Building with Metal backend\n"
    # on macOS the llama.cpp Makefile enables Metal by default, so no extra flag is needed
    make -j llama-server $log
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi

# run the server

printf "[+] Running server\n"

args=""
if [[ "$backend" == "cuda" ]]; then
    export CUDA_VISIBLE_DEVICES=$gpu_id
    args="-ngl 999" # offload all layers to the GPU
elif [[ "$backend" == "cpu" ]]; then
    args="-ngl 0"   # keep all layers on the CPU
elif [[ "$backend" == "metal" ]]; then
    args="-ngl 999" # offload all layers to the GPU
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi

if [[ $verbose -eq 1 ]]; then
    args="$args --verbose"
fi

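# once the server is up, it can be queried over HTTP, e.g. via llama.cpp's
# /completion endpoint (example request, shown here for reference):
#
#   curl -s "http://127.0.0.1:${port}/completion" -d '{"prompt": "Hello", "n_predict": 32}'
#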
# note: $args is intentionally left unquoted so that it expands into separate flags
./llama-server -m "../$wfile" --host 0.0.0.0 --port "$port" -c "$n_kv" -np "$n_parallel" $args

exit 0