pod-llama.sh
#!/bin/bash
#
# Use this script only on fresh pods (runpod.io)!
# Otherwise, it can break your environment!
#

if [ -z "$1" ]; then
    echo "Usage: $0 <data>"
    echo "  0: no models"
    echo "  1: tinyllama-1b"
    echo "  2: codellama-7b"
    echo "  3: codellama-13b"
    echo "  4: codellama-34b"
    echo "  5: codellama-7b-instruct"
    echo "  6: codellama-13b-instruct"
    echo "  7: codellama-34b-instruct"

    exit 1
fi

set -x

# setup deps
apt-get update
apt-get install -y git-lfs cmake cmake-curses-gui vim ruby
git-lfs install

if [ ! -d "/workspace" ]; then
    ln -sfn $(pwd) /workspace
fi

# download data
cd /workspace

# this is useful to git clone repos without doubling the disk size due to .git
git clone https://github.com/iboB/git-lfs-download
ln -sfn /workspace/git-lfs-download/git-lfs-download /usr/local/bin/git-lfs-download

# llama.cpp
cd /workspace
git clone https://github.com/ggerganov/llama.cpp

cd llama.cpp

LLAMA_CUDA=1 make -j

ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b
ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b
ln -sfn /workspace/CodeLlama-13b-hf ./models/codellama-13b
ln -sfn /workspace/CodeLlama-34b-hf ./models/codellama-34b
ln -sfn /workspace/CodeLlama-7b-Instruct-hf ./models/codellama-7b-instruct
ln -sfn /workspace/CodeLlama-13b-Instruct-hf ./models/codellama-13b-instruct
ln -sfn /workspace/CodeLlama-34b-Instruct-hf ./models/codellama-34b-instruct

pip install -r requirements.txt

# cmake
cd /workspace/llama.cpp

mkdir build-cublas
cd build-cublas

cmake -DLLAMA_CUDA=1 ../
make -j

if [ "$1" -eq "0" ]; then
    exit 0
fi

# more models
if [ "$1" -eq "1" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3

    cd /workspace/llama.cpp

    python3 examples/convert-legacy-llama.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16

    ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "2" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-hf --without *safetensors*
    rm -v ./CodeLlama-7b-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 examples/convert-legacy-llama.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16

    ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "3" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-hf --without *safetensors*
    rm -v ./CodeLlama-13b-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 examples/convert-legacy-llama.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16

    ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "4" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-hf --without *safetensors*
    rm -v ./CodeLlama-34b-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 examples/convert-legacy-llama.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16

    ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "5" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf --without *safetensors*
    rm -v ./CodeLlama-7b-Instruct-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 examples/convert-legacy-llama.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16

    ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "6" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf --without *safetensors*
    rm -v ./CodeLlama-13b-Instruct-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 examples/convert-legacy-llama.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16

    ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "7" ]; then
    cd /workspace

    git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf --without *safetensors*
    rm -v ./CodeLlama-34b-Instruct-hf/*safetensors*

    cd /workspace/llama.cpp

    python3 examples/convert-legacy-llama.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16

    ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
    ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k
    ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0
fi

if [ "$1" -eq "1" ]; then
    # perf + perplexity
    cd /workspace/llama.cpp/build-cublas

    make -j && ../scripts/run-all-perf.sh tinyllama-1b "f16" "-ngl 99 -t 1 -p 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128,256,512,1024,2048 -n 128"

    ../scripts/get-wikitext-2.sh
    unzip wikitext-2-raw-v1.zip

    make -j && ./bin/llama-perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32

    # batched
    cd /workspace/llama.cpp

    LLAMA_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999

    # batched-bench
    cd /workspace/llama.cpp

    LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32

    # parallel
    cd /workspace/llama.cpp

    LLAMA_CUDA=1 make -j && ./llama-parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb

fi

# speculative
#if [ "$1" -eq "7" ]; then
#    cd /workspace/llama.cpp
#
#    LLAMA_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
#fi

# more benches
#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
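# Example invocation (a sketch, not executed by this script): on a fresh
# runpod.io pod, a typical first run might pass one of the model indices from
# the usage message above, e.g.
#
#   bash ./pod-llama.sh 5    # build llama.cpp, then download, convert and quantize codellama-7b-instruct
#   bash ./pod-llama.sh 0    # only install dependencies and build llama.cpp, no models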