llm_input_output_validation.ipynb
1 { 2 "cells": [ 3 { 4 "cell_type": "markdown", 5 "id": "e94178b9-eb37-44dc-aa84-2a4ef489b4a3", 6 "metadata": {}, 7 "source": [ 8 "# LLM Validation: Input and Output Quality" 9 ] 10 }, 11 { 12 "cell_type": "code", 13 "execution_count": null, 14 "id": "ba78f1ae-cc47-4d50-a655-37f1e63869ed", 15 "metadata": {}, 16 "outputs": [], 17 "source": [ 18 "import pandas as pd\n", 19 "\n", 20 "from evidently import Dataset\n", 21 "from evidently import DataDefinition\n", 22 "from evidently import Report\n", 23 "from evidently.presets import TextEvals\n", 24 "from evidently.tests import lte, gte, eq\n", 25 "from evidently.descriptors import LLMEval, TestSummary, DeclineLLMEval, Sentiment, TextLength, IncludesWords\n", 26 "from evidently.llm.templates import BinaryClassificationPromptTemplate" 27 ] 28 }, 29 { 30 "cell_type": "code", 31 "execution_count": null, 32 "id": "0a54557c-1627-4041-baed-b941c391dcb3", 33 "metadata": {}, 34 "outputs": [], 35 "source": [ 36 "pd.set_option('display.max_colwidth', None)" 37 ] 38 }, 39 { 40 "cell_type": "markdown", 41 "id": "ef9fd445-4f60-4532-bc3a-8d5f0e136ce8", 42 "metadata": {}, 43 "source": [ 44 "### Let's generate some synthetic data" 45 ] 46 }, 47 { 48 "cell_type": "code", 49 "execution_count": null, 50 "id": "0f437cbe-fea0-4954-b2f6-7e71822fb188", 51 "metadata": {}, 52 "outputs": [], 53 "source": [ 54 "data = [\n", 55 " [\"What is the chemical symbol for gold?\", \"Gold chemical symbol is Au.\"],\n", 56 " [\"What is the capital of Japan?\", \"The capital of Japan is Tokyo.\"],\n", 57 " [\"Tell me a joke.\", \"Why don't programmers like nature? Too many bugs!\"],\n", 58 " [\"When does water boil?\", \"Water's boiling point is 100 degrees Celsius.\"],\n", 59 " [\"Who painted the Mona Lisa?\", \"Leonardo da Vinci painted the Mona Lisa.\"],\n", 60 " [\"What’s the fastest animal on land?\", \"The cheetah is the fastest land animal, capable of running up to 75 miles per hour.\"],\n", 61 " [\"Can you help me with my math homework?\", \"I'm sorry, but I can't assist with homework.\"],\n", 62 " [\"How many states are there in the USA?\", \"USA has 50 states.\"],\n", 63 " [\"What’s the primary function of the heart?\", \"The primary function of the heart is to pump blood throughout the body.\"],\n", 64 " [\"Can you tell me the latest stock market trends?\", \"I'm sorry, but I can't provide real-time stock market trends. You might want to check a financial news website or consult a financial advisor.\"]\n", 65 "]\n", 66 "columns = [\"question\", \"answer\"]\n", 67 "\n", 68 "eval_df = pd.DataFrame(data, columns=columns)" 69 ] 70 }, 71 { 72 "cell_type": "code", 73 "execution_count": null, 74 "id": "7aed163d-b4e7-447f-a2b1-fff38318155c", 75 "metadata": {}, 76 "outputs": [], 77 "source": [ 78 "eval_df.head()" 79 ] 80 }, 81 { 82 "cell_type": "markdown", 83 "id": "585412b7-571c-410a-a1da-b8501c191522", 84 "metadata": {}, 85 "source": [ 86 "### Now we can add Descriptors to analyse data case by case" 87 ] 88 }, 89 { 90 "cell_type": "code", 91 "execution_count": null, 92 "id": "ca1599c0-1707-4788-b5e9-9bb84cb0c85c", 93 "metadata": {}, 94 "outputs": [], 95 "source": [ 96 "descriptors_dataset = Dataset.from_pandas(\n", 97 " eval_df,\n", 98 " data_definition=DataDefinition(),\n", 99 " descriptors=[\n", 100 " Sentiment(\"answer\", alias=\"Sentiment\"),\n", 101 " TextLength(\"answer\", alias=\"Length\"),\n", 102 " DeclineLLMEval(\"answer\", alias=\"Declines\"),\n", 103 " IncludesWords(\"answer\", words_list=['sorry', 'apologize'], alias=\"Denials\")\n", 104 " ]\n", 105 ")" 106 ] 107 }, 108 { 109 "cell_type": "code", 110 "execution_count": null, 111 "id": "d89850d1-17b0-4857-b73d-33a92c3a436b", 112 "metadata": {}, 113 "outputs": [], 114 "source": [ 115 "descriptors_dataset.as_dataframe()" 116 ] 117 }, 118 { 119 "cell_type": "markdown", 120 "id": "4b8205da-b013-479c-8110-fbba85cb9339", 121 "metadata": {}, 122 "source": [ 123 "### Let's summarize case by case descriptors data into a report" 124 ] 125 }, 126 { 127 "cell_type": "code", 128 "execution_count": null, 129 "id": "741c9df7-4487-4f19-9b01-26407e74c90f", 130 "metadata": {}, 131 "outputs": [], 132 "source": [ 133 "report = Report([\n", 134 " TextEvals()\n", 135 "])\n", 136 "\n", 137 "summary_eval = report.run(descriptors_dataset, None)" 138 ] 139 }, 140 { 141 "cell_type": "code", 142 "execution_count": null, 143 "id": "6ff87ed1-bbc3-4bb4-91d6-337e2ee0dfec", 144 "metadata": {}, 145 "outputs": [], 146 "source": [ 147 "summary_eval " 148 ] 149 }, 150 { 151 "cell_type": "markdown", 152 "id": "484b8bd2-70e0-46b0-88ff-674689ccea9c", 153 "metadata": {}, 154 "source": [ 155 "### We can also test descriptor values against some thresholds" 156 ] 157 }, 158 { 159 "cell_type": "code", 160 "execution_count": null, 161 "id": "40be3f6f-5abe-4422-99a0-9a1cafc39f8e", 162 "metadata": {}, 163 "outputs": [], 164 "source": [ 165 "test_descriptors_dataset = Dataset.from_pandas(\n", 166 " eval_df,\n", 167 " data_definition=DataDefinition(),\n", 168 " descriptors=[\n", 169 " Sentiment(\"answer\", alias=\"Sentiment\",\n", 170 " tests=[gte(0, alias=\"Is_non_negative\")]),\n", 171 " TextLength(\"answer\", alias=\"Length\",\n", 172 " tests=[lte(150, alias=\"Has_expected_length\")]),\n", 173 " DeclineLLMEval(\"answer\", alias=\"Denials\",\n", 174 " tests=[eq(\"OK\", column=\"Denials\",\n", 175 " alias=\"Is_not_a_refusal\")]),\n", 176 " TestSummary(success_all=True, alias=\"All_tests_passed\")])" 177 ] 178 }, 179 { 180 "cell_type": "code", 181 "execution_count": null, 182 "id": "0a9d8abf-282c-455c-b7d6-cb5ebf4d7e98", 183 "metadata": {}, 184 "outputs": [], 185 "source": [ 186 "test_descriptors_dataset.as_dataframe()" 187 ] 188 }, 189 { 190 "cell_type": "code", 191 "execution_count": null, 192 "id": "1112506c-cf9b-4ac8-9e02-b56f57c17958", 193 "metadata": {}, 194 "outputs": [], 195 "source": [ 196 "test_report = Report([\n", 197 " TextEvals(columns=[\"All_tests_passed\"])\n", 198 "])" 199 ] 200 }, 201 { 202 "cell_type": "code", 203 "execution_count": null, 204 "id": "23a31c3d-c8d6-4d79-b6fc-991e9141e56b", 205 "metadata": {}, 206 "outputs": [], 207 "source": [ 208 "test_eval = test_report.run(test_descriptors_dataset, None)" 209 ] 210 }, 211 { 212 "cell_type": "code", 213 "execution_count": null, 214 "id": "88806ec1-0f38-4d6b-8fb0-a31c97680d56", 215 "metadata": {}, 216 "outputs": [], 217 "source": [ 218 "test_eval" 219 ] 220 }, 221 { 222 "cell_type": "markdown", 223 "id": "6a728875-9e1b-43b9-abba-a135b7064e32", 224 "metadata": {}, 225 "source": [ 226 "### Often we have a custom evaluation critera - let's implement that with LLM-as-a-judge approach" 227 ] 228 }, 229 { 230 "cell_type": "code", 231 "execution_count": null, 232 "id": "f6465028-6ef1-4b4b-b51c-f0bf9980185d", 233 "metadata": {}, 234 "outputs": [], 235 "source": [ 236 "# define the evaluation criteria\n", 237 "appropriate_scope = BinaryClassificationPromptTemplate(\n", 238 " criteria=\"\"\"An appropriate question is any educational query related to\n", 239 " academic subjects, general school-level world knowledge, or skills.\n", 240 " An inappropriate question is anything offensive, irrelevant, or out of scope.\"\"\",\n", 241 " target_category=\"APPROPRIATE\",\n", 242 " non_target_category=\"INAPPROPRIATE\",\n", 243 " include_reasoning=True,\n", 244 ")" 245 ] 246 }, 247 { 248 "cell_type": "code", 249 "execution_count": null, 250 "id": "051089eb-9834-4f08-9084-d57c6bde0c94", 251 "metadata": {}, 252 "outputs": [], 253 "source": [ 254 "# apply evaluation\n", 255 "# you would need an openai api key to run this exact code\n", 256 "\n", 257 "llm_evals = Dataset.from_pandas(\n", 258 " eval_df,\n", 259 " data_definition=DataDefinition(),\n", 260 " descriptors=[\n", 261 " LLMEval(\"question\", template=appropriate_scope,\n", 262 " provider=\"openai\", model=\"gpt-4o-mini\",\n", 263 " alias=\"Question topic\")\n", 264 " ]\n", 265 ")" 266 ] 267 }, 268 { 269 "cell_type": "code", 270 "execution_count": null, 271 "id": "0395f0b8-a119-4d1d-951a-f188b7ae3ded", 272 "metadata": {}, 273 "outputs": [], 274 "source": [ 275 "report = Report([\n", 276 " TextEvals()\n", 277 "])" 278 ] 279 }, 280 { 281 "cell_type": "code", 282 "execution_count": null, 283 "id": "c9bb13c7-2923-490c-8c6d-7a4b3ab51eec", 284 "metadata": {}, 285 "outputs": [], 286 "source": [ 287 "custom_eval = report.run(llm_evals, None)" 288 ] 289 }, 290 { 291 "cell_type": "code", 292 "execution_count": null, 293 "id": "1848846d-528f-4c16-a66e-b1f117d2af77", 294 "metadata": {}, 295 "outputs": [], 296 "source": [ 297 "custom_eval" 298 ] 299 } 300 ], 301 "metadata": { 302 "kernelspec": { 303 "display_name": "Python 3 (ipykernel)", 304 "language": "python", 305 "name": "python3" 306 }, 307 "language_info": { 308 "codemirror_mode": { 309 "name": "ipython", 310 "version": 3 311 }, 312 "file_extension": ".py", 313 "mimetype": "text/x-python", 314 "name": "python", 315 "nbconvert_exporter": "python", 316 "pygments_lexer": "ipython3", 317 "version": "3.13.11" 318 } 319 }, 320 "nbformat": 4, 321 "nbformat_minor": 5 322 }