#!/bin/bash
#
# Use this script only on fresh pods (runpod.io)!
# Otherwise, it can break your environment!
#

if [ -z "$1" ]; then
    echo "Usage: $0 <model>"
    echo "  0: no models"
    echo "  1: tinyllama-1b"
    echo "  2: codellama-7b"
    echo "  3: codellama-13b"
    echo "  4: codellama-34b"
    echo "  5: codellama-7b-instruct"
    echo "  6: codellama-13b-instruct"
    echo "  7: codellama-34b-instruct"

    exit 1
fi

set -x

# setup deps
apt-get update
apt-get install -y git-lfs cmake cmake-curses-gui vim ruby
git-lfs install

if [ ! -d "/workspace" ]; then
    ln -sfn $(pwd) /workspace
fi

# download data
cd /workspace

# helper for cloning repos without doubling the disk size due to .git
git clone https://github.com/iboB/git-lfs-download
ln -sfn /workspace/git-lfs-download/git-lfs-download /usr/local/bin/git-lfs-download
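# usage (as in the model download steps below):
#   git-lfs-download <repo-url> [--without <pattern>]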

# llama.cpp
cd /workspace
git clone https://github.com/ggerganov/llama.cpp

cd llama.cpp

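# Makefile build with CUDA enabled - produces the llama-* binaries (llama-quantize, llama-batched, ...) used below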
LLAMA_CUDA=1 make -j

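# symlink the model dirs into ./models; the targets are created by the download steps further below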
ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3  ./models/tinyllama-1b
ln -sfn /workspace/CodeLlama-7b-hf           ./models/codellama-7b
ln -sfn /workspace/CodeLlama-13b-hf          ./models/codellama-13b
ln -sfn /workspace/CodeLlama-34b-hf          ./models/codellama-34b
ln -sfn /workspace/CodeLlama-7b-Instruct-hf  ./models/codellama-7b-instruct
ln -sfn /workspace/CodeLlama-13b-Instruct-hf ./models/codellama-13b-instruct
ln -sfn /workspace/CodeLlama-34b-Instruct-hf ./models/codellama-34b-instruct

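# Python deps for the HF -> GGUF conversion script (examples/convert-legacy-llama.py) used below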
pip install -r requirements.txt

# cmake build (CUDA) - used for the perf / perplexity runs further below
cd /workspace/llama.cpp

mkdir build-cublas
cd build-cublas

cmake -DLLAMA_CUDA=1 ../
make -j

if [ "$1" -eq "0" ]; then
    exit 0
fi

# download, convert, and quantize the selected model
if [ "$1" -eq "1" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3

    cd /workspace/llama.cpp

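    # convert the HF checkpoint to GGUF f16, then derive the quantized variants from it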
    python3 examples/convert-legacy-llama.py ./models/tinyllama-1b  --outfile ./models/tinyllama-1b/ggml-model-f16.gguf  --outtype f16

    ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "2" ]; then
    cd /workspace

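    # skip (and clean up) the safetensors copies of the weights to avoid downloading them twice
    # (assumption: the conversion below works from the pytorch .bin shards)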
    git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-hf  --without *safetensors*
    rm -v ./CodeLlama-7b-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 examples/convert-legacy-llama.py ./models/codellama-7b  --outfile ./models/codellama-7b/ggml-model-f16.gguf  --outtype f16

    ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "3" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-hf --without *safetensors*
    rm -v ./CodeLlama-13b-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 examples/convert-legacy-llama.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16

    ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "4" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-hf --without *safetensors*
    rm -v ./CodeLlama-34b-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 examples/convert-legacy-llama.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16

    ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "5" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf  --without *safetensors*
    rm -v ./CodeLlama-7b-Instruct-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 examples/convert-legacy-llama.py ./models/codellama-7b-instruct  --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf  --outtype f16

    ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "6" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf --without *safetensors*
    rm -v ./CodeLlama-13b-Instruct-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 examples/convert-legacy-llama.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16

    ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "7" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf --without *safetensors*
    rm -v ./CodeLlama-34b-Instruct-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 examples/convert-legacy-llama.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16

    ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "1" ]; then
    # perf + perplexity
    cd /workspace/llama.cpp/build-cublas

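    # run-all-perf.sh (from llama.cpp/scripts) presumably runs the perf benchmarks for the given model / quantization, passing the quoted llama.cpp args through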
    make -j && ../scripts/run-all-perf.sh tinyllama-1b "f16" "-ngl 99 -t 1 -p 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128,256,512,1024,2048 -n 128"

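    # fetch the wikitext-2 test set and measure perplexity over its first 32 chunks, fully offloaded (-ngl 100)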
    ../scripts/get-wikitext-2.sh
    unzip wikitext-2-raw-v1.zip

    make -j && ./bin/llama-perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32

    # batched
    cd /workspace/llama.cpp

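    # positional args are presumably: prompt, number of parallel sequences (8), sequence length (128), GPU layers (999)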
    LLAMA_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999

    # batched-bench
    cd /workspace/llama.cpp

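    # the trailing 512 / 128 / 1,2,... presumably give the prompt-processing size, the number of generated tokens, and the parallel batch sizes to sweep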
    LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32

    # parallel
    cd /workspace/llama.cpp

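    # simulate 8 parallel clients (-np 8) over 128 sequences (-ns 128) with continuous batching (-cb)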
    LLAMA_CUDA=1 make -j && ./llama-parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb

fi

# speculative
#if [ "$1" -eq "7" ]; then
#    cd /workspace/llama.cpp
#
#    LLAMA_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
#fi

# more benches
#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf  4096 1 99 1 512,3200 128,128,800 1
#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1