ScriptDecoder.java
1 // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML 2 // http://sourceforge.org/projects/htmlparser 3 // Copyright (C) 2004 Derrick Oswald 4 // 5 // Revision Control Information 6 // 7 // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptDecoder.java,v $ 8 // $Author: derrickoswald $ 9 // $Date: 2005/05/15 11:49:04 $ 10 // $Revision: 1.4 $ 11 // 12 // This library is free software; you can redistribute it and/or 13 // modify it under the terms of the GNU Lesser General Public 14 // License as published by the Free Software Foundation; either 15 // version 2.1 of the License, or (at your option) any later version. 16 // 17 // This library is distributed in the hope that it will be useful, 18 // but WITHOUT ANY WARRANTY; without even the implied warranty of 19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 // Lesser General Public License for more details. 21 // 22 // You should have received a copy of the GNU Lesser General Public 23 // License along with this library; if not, write to the Free Software 24 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 // 26 27 package org.htmlparser.scanners; 28 29 import org.htmlparser.lexer.Cursor; 30 import org.htmlparser.lexer.Page; 31 import org.htmlparser.util.ParserException; 32 33 /** 34 * Decode script. 35 * Script obfuscated by the <A href="http://www.microsoft.com/downloads/details.aspx?FamilyId=E7877F67-C447-4873-B1B0-21F0626A6329&displaylang=en" target="_parent">Windows Script Encoder</A> 36 * provided by Microsoft, is converted to plaintext. This code is based loosely 37 * on example code provided by MrBrownstone with changes by Joe Steele, see 38 * <A href="http://www.virtualconspiracy.com/download/scrdec14.c" target="_parent">scrdec14.c</A>. 39 */ 40 public class ScriptDecoder 41 { 42 /** 43 * Termination state. 44 */ 45 public static final int STATE_DONE = 0; 46 47 /** 48 * State on entry. 49 */ 50 public static final int STATE_INITIAL = 1; 51 52 /** 53 * State while reading the encoded length. 54 */ 55 protected static final int STATE_LENGTH = 2; 56 57 /** 58 * State when reading up to decoded text. 59 */ 60 protected static final int STATE_PREFIX = 3; 61 62 /** 63 * State while decoding. 64 */ 65 protected static final int STATE_DECODE = 4; 66 67 /** 68 * State when reading an escape sequence. 69 */ 70 protected static final int STATE_ESCAPE = 5; 71 72 /** 73 * State when reading the checksum. 74 */ 75 protected static final int STATE_CHECKSUM = 6; 76 77 /** 78 * State while exiting. 79 */ 80 protected static final int STATE_FINAL = 7; 81 82 /** 83 * The state to enter when decrypting is complete. 84 * If this is STATE_DONE, the decryption will return with any characters 85 * following the encoded text still unconsumed. 86 * Otherwise, if this is STATE_INITIAL, the input will be exhausted and 87 * all following characters will be contained in the return value 88 * of the <code>Decode()</code> method. 89 */ 90 public static int LAST_STATE = STATE_DONE; 91 92 /** 93 * Table of lookup choice. 94 * The decoding cycles between three flavours determined 95 * by this sequence of 64 choices, corresponding to the 96 * first dimension of the lookup table. 97 */ 98 protected static byte mEncodingIndex[] = 99 { 100 1, 2, 0, 1, 2, 0, 2, 0, 0, 2, 0, 2, 1, 0, 2, 0, 101 1, 0, 2, 0, 1, 1, 2, 0, 0, 2, 1, 0, 2, 0, 0, 2, 102 1, 1, 0, 2, 0, 2, 0, 1, 0, 1, 1, 2, 0, 1, 0, 2, 103 1, 0, 2, 0, 1, 1, 2, 0, 0, 1, 1, 2, 0, 1, 0, 2, 104 }; 105 106 /** 107 * Two dimensional lookup table. 108 * The decoding uses this table to determine the plaintext for 109 * characters that aren't mEscaped. 110 */ 111 protected static char mLookupTable[][] = 112 { 113 { 114 '{', 115 '2', '0', '!', ')', '[', '8', '3', '=', 116 'X', ':', '5', 'e', '9', '\\', 'V', 's', 117 'f', 'N', 'E', 'k', 'b', 'Y', 'x', '^', 118 '}', 'J', 'm', 'q', 0, '`', 0, 'S', 119 0, 'B', '\'', 'H', 'r', 'u', '1', '7', 120 'M', 'R', '"', 'T', 'j', 'G', 'd', '-', 121 ' ', '', '.', 'L', ']', '~', 'l', 'o', 122 'y', 't', 'C', '&', 'v', '%', '$', '+', 123 '(', '#', 'A', '4', '\t', '*', 'D', '?', 124 'w', ';', 'U', 'i', 'a', 'c', 'P', 'g', 125 'Q', 'I', 'O', 'F', 'h', '|', '6', 'p', 126 'n', 'z', '/', '_', 'K', 'Z', ',', 'W', 127 }, 128 { 129 'W', 130 '.', 'G', 'z', 'V', 'B', 'j', '/', '&', 131 'I', 'A', '4', '2', '[', 'v', 'r', 'C', 132 '8', '9', 'p', 'E', 'h', 'q', 'O', '\t', 133 'b', 'D', '#', 'u', 0, '~', 0, '^', 134 0, 'w', 'J', 'a', ']', '"', 'K', 'o', 135 'N', ';', 'L', 'P', 'g', '*', '}', 't', 136 'T', '+', '-', ',', '0', 'n', 'k', 'f', 137 '5', '%', '!', 'd', 'M', 'R', 'c', '?', 138 '{', 'x', ')', '(', 's', 'Y', '3', '', 139 'm', 'U', 'S', '|', ':', '_', 'e', 'F', 140 'X', '1', 'i', 'l', 'Z', 'H', '\'', '\\', 141 '=', '$', 'y', '7', '`', 'Q', ' ', '6', 142 }, 143 { 144 'n', 145 '-', 'u', 'R', '`', 'q', '^', 'I', '\\', 146 'b', '}', ')', '6', ' ', '|', 'z', '', 147 'k', 'c', '3', '+', 'h', 'Q', 'f', 'v', 148 '1', 'd', 'T', 'C', 0, ':', 0, '~', 149 0, 'E', ',', '*', 't', '\'', '7', 'D', 150 'y', 'Y', '/', 'o', '&', 'r', 'j', '9', 151 '{', '?', '8', 'w', 'g', 'S', 'G', '4', 152 'x', ']', '0', '#', 'Z', '[', 'l', 'H', 153 'U', 'p', 'i', '.', 'L', '!', '$', 'N', 154 'P', '\t', 'V', 's', '5', 'a', 'K', 'X', 155 ';', 'W', '"', 'm', 'M', '%', '(', 'F', 156 'J', '2', 'A', '=', '_', 'O', 'B', 'e', 157 }, 158 }; 159 160 /** 161 * The base 64 decoding table. 162 * This array determines the value of decoded base 64 elements. 163 */ 164 protected static int mDigits[]; 165 static 166 { 167 mDigits = new int[0x7b]; 168 for (int i = 0; i < 26; i++) 169 { 170 mDigits['A' + i] = i; 171 mDigits['a' + i] = i + 26; 172 } 173 for (int i = 0; i < 10; i++) 174 mDigits['0' + i] = i + 52; 175 mDigits[0x2b] = '>'; 176 mDigits[0x2f] = '?'; 177 } 178 179 /** 180 * The leader. 181 * The prefix to the encoded script is #@~^nnnnnn== where the n are the 182 * length digits in base64. 183 */ 184 protected static char mLeader[] = 185 { 186 '#', 187 '@', 188 '~', 189 '^', 190 }; 191 192 /** 193 * The prefix. 194 * The prfix separates the encoded text from the length. 195 */ 196 protected static char mPrefix[] = 197 { 198 '=', 199 '=', 200 }; 201 202 /** 203 * The trailer. 204 * The suffix to the encoded script is nnnnnn==^#~@ where the n are the 205 * checksum digits in base64. These characters are the part after the checksum. 206 */ 207 protected static char mTrailer[] = 208 { 209 '=', 210 '=', 211 '^', 212 '#', 213 '~', 214 '@', 215 }; 216 217 /** 218 * Escape sequence characters. 219 */ 220 protected static char mEscapes[] = 221 { 222 '#', 223 '&', 224 '!', 225 '*', 226 '$', 227 }; 228 229 /** 230 * The escaped characters corresponding to the each escape sequence. 231 */ 232 protected static char mEscaped[] = //"\r\n<>@"; 233 { 234 '\r', 235 '\n', 236 '<', 237 '>', 238 '@', 239 }; 240 241 /** 242 * Extract the base 64 encoded number. 243 * This is a very limited subset of base 64 encoded characters. 244 * Six characters are expected. These are translated into a single long 245 * value. For a more complete base 64 codec see for example the base64 246 * package of <A href="http://sourceforge.net/projects/iharder/" target="_parent">iHarder.net</A> 247 * @param p Six base 64 encoded digits. 248 * @return The value of the decoded number. 249 */ 250 protected static long decodeBase64 (char[] p) 251 { 252 long ret; 253 254 ret = 0; 255 256 ret += (mDigits[p[0]] << 2); 257 ret += (mDigits[p[1]] >> 4); 258 ret += (mDigits[p[1]] & 0xf) << 12; 259 ret += ((mDigits[p[2]] >> 2) << 8); 260 ret += ((mDigits[p[2]] & 0x3) << 22); 261 ret += (mDigits[p[3]] << 16); 262 ret += ((mDigits[p[4]] << 2) << 24); 263 ret += ((mDigits[p[5]] >> 4) << 24); 264 265 return (ret); 266 } 267 268 /** 269 * Decode script encoded by the Microsoft obfuscator. 270 * @param page The source for encoded text. 271 * @param cursor The position at which to start decoding. 272 * This is advanced to the end of the encoded text. 273 * @return The plaintext. 274 * @exception ParserException If an error is discovered while decoding. 275 */ 276 public static String Decode (Page page, Cursor cursor) 277 throws 278 ParserException 279 { 280 int state; 281 int substate_initial; 282 int substate_length; 283 int substate_prefix; 284 int substate_checksum; 285 int substate_final; 286 long checksum; 287 long length; 288 char buffer[]; 289 buffer = new char[6]; 290 int index; 291 char character; 292 int input_character; 293 boolean found; 294 StringBuffer ret; 295 296 ret = new StringBuffer (1024); 297 298 state = STATE_INITIAL; 299 substate_initial = 0; 300 substate_length = 0; 301 substate_prefix = 0; 302 substate_checksum = 0; 303 substate_final = 0; 304 length = 0L; 305 checksum = 0L; 306 index = 0; 307 while (STATE_DONE != state) 308 { 309 input_character = page.getCharacter (cursor); 310 character = (char)input_character; 311 if (Page.EOF == input_character) 312 { 313 if ( (STATE_INITIAL != state) 314 || (0 != substate_initial) 315 || (0 != substate_length) 316 || (0 != substate_prefix) 317 || (0 != substate_checksum) 318 || (0 != substate_final)) 319 throw new ParserException ("illegal state for exit"); 320 state = STATE_DONE; 321 } 322 else 323 switch (state) 324 { 325 case STATE_INITIAL: 326 if (character == mLeader[substate_initial]) 327 { 328 substate_initial++; 329 if (substate_initial == mLeader.length) 330 { 331 substate_initial = 0; 332 state = STATE_LENGTH; 333 } 334 } 335 else 336 { 337 // oops, flush 338 for (int k = 0; 0 < substate_initial; k++) 339 { 340 ret.append (mLeader[k++]); 341 substate_initial--; 342 } 343 ret.append (character); 344 } 345 break; 346 347 case STATE_LENGTH: 348 buffer[substate_length] = character; 349 substate_length++; 350 if (substate_length >= buffer.length) 351 { 352 length = decodeBase64 (buffer); 353 if (0 > length) 354 throw new ParserException ("illegal length: " + length); 355 substate_length = 0; 356 state = STATE_PREFIX; 357 } 358 break; 359 360 case STATE_PREFIX: 361 if (character == mPrefix[substate_prefix]) 362 substate_prefix++; 363 else 364 throw new ParserException ("illegal character encountered: " + (int)character + " ('" + character + "')"); 365 if (substate_prefix >= mPrefix.length) 366 { 367 substate_prefix = 0; 368 state = STATE_DECODE; 369 } 370 break; 371 372 case STATE_DECODE: 373 if ('@' == character) 374 state = STATE_ESCAPE; 375 else 376 { 377 if (input_character < 0x80) 378 { 379 if (input_character == '\t') 380 input_character = 0; 381 else if (input_character >= ' ') 382 input_character -= ' ' - 1; 383 else 384 throw new ParserException ("illegal encoded character: " + input_character + " ('" + character + "')"); 385 char ch = mLookupTable[mEncodingIndex[index % 64]][input_character]; 386 ret.append (ch); 387 checksum += ch; 388 index++; 389 } 390 else 391 ret.append (character); 392 } 393 length--; 394 if (0 == length) 395 { 396 index = 0; 397 state = STATE_CHECKSUM; 398 } 399 break; 400 401 case STATE_ESCAPE: 402 found = false; 403 for (int i = 0; i < mEscapes.length; i++) 404 if (character == mEscapes[i]) 405 { 406 found = true; 407 character = mEscaped[i]; 408 } 409 if (!found) 410 throw new ParserException ("unexpected escape character: " + (int)character + " ('" + character + "')"); 411 ret.append (character); 412 checksum += character; 413 index++; 414 state = STATE_DECODE; 415 length--; 416 if (0 == length) 417 { 418 index = 0; 419 state = STATE_CHECKSUM; 420 } 421 break; 422 423 case STATE_CHECKSUM: 424 buffer[substate_checksum] = character; 425 substate_checksum++; 426 if (substate_checksum >= buffer.length) 427 { 428 long check = decodeBase64 (buffer); 429 if (check != checksum) 430 throw new ParserException ("incorrect checksum, expected " + check + ", calculated " + checksum); 431 checksum = 0; 432 substate_checksum = 0; 433 state = STATE_FINAL; 434 } 435 break; 436 437 case STATE_FINAL: 438 if (character == mTrailer[substate_final]) 439 substate_final++; 440 else 441 throw new ParserException ("illegal character encountered: " + (int)character + " ('" + character + "')"); 442 if (substate_final >= mTrailer.length) 443 { 444 substate_final = 0; 445 state = LAST_STATE; 446 } 447 break; 448 default: 449 throw new ParserException ("invalid state: " + state); 450 } 451 } 452 453 return (ret.toString ()); 454 } 455 456 // /** 457 // * Example mainline for decrypting script. 458 // * Change a file with encrypted script into one without. 459 // * <em>WARNING: This does not preserve DOS type line endings.</em> 460 // * @param args Command line arguments. Two file names, input and output. 461 // * Optionally, the character set to use as a third argument. 462 // * @exception IOException If the input file doesn't exist, or the output 463 // * file cannot be created. 464 // * @exception ParserException If there is a decryption problem. 465 // */ 466 // public static void main (String[] args) 467 // throws 468 // IOException, 469 // ParserException 470 // { 471 // String charset; 472 // FileInputStream in; 473 // Page page; 474 // Cursor cursor; 475 // String string; 476 // int ret; 477 // 478 // if (args.length < 2) 479 // { 480 // System.out.println ("Usage: java org.htmlparser.scanners.ScriptDecoder <infile> <outfile> [charset]"); 481 // ret = 1; 482 // } 483 // else 484 // { 485 // if (2 < args.length) 486 // charset = args[2]; 487 // else 488 // charset = "ISO-8859-1"; 489 // in = new FileInputStream (args[0]); 490 // page = new Page (in, charset); 491 // cursor = new Cursor (page, 0); 492 // ScriptDecoder.LAST_STATE = STATE_INITIAL; 493 // string = ScriptDecoder.Decode (page, cursor); 494 // in.close (); 495 // 496 // FileOutputStream outfile = new FileOutputStream (args[1]); 497 // outfile.write (string.getBytes (charset)); 498 // outfile.close (); 499 // ret = (0 != string.length ()) ? 0 : 1; 500 // } 501 // 502 // System.exit (ret); 503 // } 504 }