/ org.htmlparser / src / org / htmlparser / scanners / ScriptDecoder.java
ScriptDecoder.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2004 Derrick Oswald
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptDecoder.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2005/05/15 11:49:04 $
 10  // $Revision: 1.4 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser.scanners;
 28  
 29  import org.htmlparser.lexer.Cursor;
 30  import org.htmlparser.lexer.Page;
 31  import org.htmlparser.util.ParserException;
 32  
 33  /**
 34   * Decode script.
 35   * Script obfuscated by the <A href="http://www.microsoft.com/downloads/details.aspx?FamilyId=E7877F67-C447-4873-B1B0-21F0626A6329&displaylang=en" target="_parent">Windows Script Encoder</A>
 36   * provided by Microsoft, is converted to plaintext. This code is based loosely
 37   * on example code provided by MrBrownstone with changes by Joe Steele, see
 38   * <A href="http://www.virtualconspiracy.com/download/scrdec14.c" target="_parent">scrdec14.c</A>.
 39   */
 40  public class ScriptDecoder
 41  {
 42      /**
 43       * Termination state.
 44       */
 45      public static final int STATE_DONE = 0;
 46  
 47      /**
 48       * State on entry.
 49       */
 50      public static final int STATE_INITIAL = 1;
 51  
 52      /**
 53       * State while reading the encoded length.
 54       */
 55      protected static final int STATE_LENGTH = 2;
 56  
 57      /**
 58       * State when reading up to decoded text.
 59       */
 60      protected static final int STATE_PREFIX = 3;
 61  
 62      /**
 63       * State while decoding.
 64       */
 65      protected static final int STATE_DECODE = 4;
 66  
 67      /**
 68       * State when reading an escape sequence.
 69       */
 70      protected static final int STATE_ESCAPE = 5;
 71  
 72      /**
 73       * State when reading the checksum.
 74       */
 75      protected static final int STATE_CHECKSUM = 6;
 76  
 77      /**
 78       * State while exiting.
 79       */
 80      protected static final int STATE_FINAL = 7;
 81  
 82      /**
 83       * The state to enter when decrypting is complete.
 84       * If this is STATE_DONE, the decryption will return with any characters
 85       * following the encoded text still unconsumed.
 86       * Otherwise, if this is STATE_INITIAL, the input will be exhausted and
 87       * all following characters will be contained in the return value
 88       * of the <code>Decode()</code> method.
 89       */
 90      public static int LAST_STATE = STATE_DONE;
 91  
 92      /**
 93       * Table of lookup choice.
 94       * The decoding cycles between three flavours determined
 95       * by this sequence of 64 choices, corresponding to the
 96       * first dimension of the lookup table.
 97       */
 98      protected static byte mEncodingIndex[] =
 99      {
100          1, 2, 0, 1, 2, 0, 2, 0, 0, 2, 0, 2, 1, 0, 2, 0, 
101          1, 0, 2, 0, 1, 1, 2, 0, 0, 2, 1, 0, 2, 0, 0, 2, 
102          1, 1, 0, 2, 0, 2, 0, 1, 0, 1, 1, 2, 0, 1, 0, 2, 
103          1, 0, 2, 0, 1, 1, 2, 0, 0, 1, 1, 2, 0, 1, 0, 2,
104      };
105  
106      /**
107       * Two dimensional lookup table.
108       * The decoding uses this table to determine the plaintext for
 109       * characters that aren't escaped.
110       */
111      protected static char mLookupTable[][] =
112      {
113          {
114              '{', 
115              '2',  '0',  '!',  ')',  '[',  '8',  '3',  '=', 
116              'X',  ':',  '5',  'e',  '9', '\\',  'V',  's', 
117              'f',  'N',  'E',  'k',  'b',  'Y',  'x',  '^', 
118              '}',  'J',  'm',  'q',    0,  '`',    0,  'S', 
119                0,  'B', '\'',  'H',  'r',  'u',  '1',  '7', 
120              'M',  'R',  '"',  'T',  'j',  'G',  'd',  '-', 
121              ' ',  '',  '.',  'L',  ']',  '~',  'l',  'o', 
122              'y',  't',  'C',  '&',  'v',  '%',  '$',  '+', 
123              '(',  '#',  'A',  '4', '\t',  '*',  'D',  '?', 
124              'w',  ';',  'U',  'i',  'a',  'c',  'P',  'g', 
125              'Q',  'I',  'O',  'F',  'h',  '|',  '6',  'p', 
126              'n',  'z',  '/',  '_',  'K',  'Z',  ',',  'W', 
127          },
128          {
129              'W', 
130              '.',  'G',  'z',  'V',  'B',  'j',  '/',  '&', 
131              'I',  'A',  '4',  '2',  '[',  'v',  'r',  'C', 
132              '8',  '9',  'p',  'E',  'h',  'q',  'O', '\t', 
133              'b',  'D',  '#',  'u',    0,  '~',    0,  '^', 
134                0,  'w',  'J',  'a',  ']',  '"',  'K',  'o', 
135              'N',  ';',  'L',  'P',  'g',  '*',  '}',  't', 
136              'T',  '+',  '-',  ',',  '0',  'n',  'k',  'f', 
137              '5',  '%',  '!',  'd',  'M',  'R',  'c',  '?', 
138              '{',  'x',  ')',  '(',  's',  'Y',  '3',  '', 
139              'm',  'U',  'S',  '|',  ':',  '_',  'e',  'F', 
140              'X',  '1',  'i',  'l',  'Z',  'H', '\'', '\\', 
141              '=',  '$',  'y',  '7',  '`',  'Q',  ' ',  '6', 
142          },
143          {
144              'n', 
145              '-',  'u',  'R',  '`',  'q',  '^',  'I', '\\', 
146              'b',  '}',  ')',  '6',  ' ',  '|',  'z',  '', 
147              'k',  'c',  '3',  '+',  'h',  'Q',  'f',  'v', 
148              '1',  'd',  'T',  'C',    0,  ':',    0,  '~', 
149                0,  'E',  ',',  '*',  't', '\'',  '7',  'D', 
150              'y',  'Y',  '/',  'o',  '&',  'r',  'j',  '9', 
151              '{',  '?',  '8',  'w',  'g',  'S',  'G',  '4', 
152              'x',  ']',  '0',  '#',  'Z',  '[',  'l',  'H', 
153              'U',  'p',  'i',  '.',  'L',  '!',  '$',  'N', 
154              'P', '\t',  'V',  's',  '5',  'a',  'K',  'X', 
155              ';',  'W',  '"',  'm',  'M',  '%',  '(',  'F', 
156              'J',  '2',  'A',  '=',  '_',  'O',  'B',  'e', 
157          },
158      };
159  
160      /**
161       * The base 64 decoding table.
162       * This array determines the value of decoded base 64 elements.
163       */
164      protected static int mDigits[];
165      static
166      {
167          mDigits = new int[0x7b];
168          for (int i = 0; i < 26; i++)
169          {
170              mDigits['A' + i] = i;
171              mDigits['a' + i] = i + 26;
172          }
173          for (int i = 0; i < 10; i++)
174              mDigits['0' + i] = i + 52;
175          mDigits[0x2b] = '>';
176          mDigits[0x2f] = '?';
177      }
178  
179      /**
180       * The leader.
181       * The prefix to the encoded script is #@~^nnnnnn== where the n are the
182       * length digits in base64.
183       */
184      protected static char mLeader[] =
185      { 
186          '#',
187          '@',
188          '~',
189          '^',
190      };
191  
192      /**
193       * The prefix.
194       * The prfix separates the encoded text from the length.
195       */
196      protected static char mPrefix[] =
197      { 
198          '=',
199          '=',
200      };
201  
202      /**
203       * The trailer.
204       * The suffix to the encoded script is nnnnnn==^#~@ where the n are the
205       * checksum digits in base64. These characters are the part after the checksum.
206       */
207      protected static char mTrailer[] =
208      { 
209          '=',
210          '=',
211          '^',
212          '#',
213          '~',
214          '@',
215      };
216  
217      /**
218       * Escape sequence characters.
219       */
220      protected static char mEscapes[] =
221      {
222          '#',
223          '&',
224          '!',
225          '*',
226          '$',
227      };
228  
229      /**
 230       * The escaped characters corresponding to each escape sequence.
231       */
232      protected static char mEscaped[] = //"\r\n<>@";
233      {
234          '\r',
235          '\n',
236          '<',
237          '>',
238          '@',
239      };
240  
241      /**
242       * Extract the base 64 encoded number.
243       * This is a very limited subset of base 64 encoded characters.
244       * Six characters are expected. These are translated into a single long
245       * value. For a more complete base 64 codec see for example the base64
246       * package of <A href="http://sourceforge.net/projects/iharder/" target="_parent">iHarder.net</A>
247       * @param p Six base 64 encoded digits.
248       * @return The value of the decoded number.
249       */
250      protected static long decodeBase64 (char[] p)
251      {
252          long ret;
253          
254          ret = 0;
255  
256          ret +=  (mDigits[p[0]] << 2);
257          ret +=  (mDigits[p[1]] >> 4);
258          ret +=  (mDigits[p[1]] & 0xf) << 12;
259          ret += ((mDigits[p[2]] >> 2) << 8); 
260          ret += ((mDigits[p[2]] & 0x3) << 22);
261          ret +=  (mDigits[p[3]] << 16);
262          ret += ((mDigits[p[4]] << 2) << 24);
263          ret += ((mDigits[p[5]] >> 4) << 24);
264  
265          return (ret);
266      }
267  
268      /**
269       * Decode script encoded by the Microsoft obfuscator.
270       * @param page The source for encoded text.
271       * @param cursor The position at which to start decoding.
272       * This is advanced to the end of the encoded text.
273       * @return The plaintext.
274       * @exception ParserException If an error is discovered while decoding.
275       */
276      public static String Decode (Page page, Cursor cursor)
277          throws
278              ParserException
279      {
280          int state;
281          int substate_initial;
282          int substate_length;
283          int substate_prefix;
284          int substate_checksum;
285          int substate_final;
286          long checksum;
287          long length;
288          char buffer[];
289          buffer = new char[6];
290          int index;
291          char character;
292          int input_character;
293          boolean found;
294          StringBuffer ret;
295          
296          ret = new StringBuffer (1024);
297  
298          state = STATE_INITIAL;
299          substate_initial = 0;
300          substate_length = 0;
301          substate_prefix = 0;
302          substate_checksum = 0;
303          substate_final = 0;
304          length = 0L;
305          checksum = 0L;
306          index = 0;
307          while (STATE_DONE != state)
308          {
309              input_character = page.getCharacter (cursor);
310              character = (char)input_character;
311              if (Page.EOF == input_character)
312              {
313                  if (   (STATE_INITIAL != state)
314                      || (0 != substate_initial)
315                      || (0 != substate_length)
316                      || (0 != substate_prefix)
317                      || (0 != substate_checksum)
318                      || (0 != substate_final))
319                      throw new ParserException ("illegal state for exit");
320                  state = STATE_DONE;
321              }
322              else
323                  switch (state)
324                  {
325                      case STATE_INITIAL:
326                          if (character == mLeader[substate_initial])
327                          {
328                              substate_initial++;
329                              if (substate_initial == mLeader.length)
330                              {
331                                  substate_initial = 0;
332                                  state = STATE_LENGTH;
333                              }
334                          }
335                          else
336                          {
337                              // oops, flush
338                              for (int k = 0; 0 < substate_initial; k++)
339                              {
340                                  ret.append (mLeader[k++]);
341                                  substate_initial--;
342                              }
343                              ret.append (character);
344                          }
345                          break;
346  
347                      case STATE_LENGTH:
348                          buffer[substate_length] = character;
349                          substate_length++;
350                          if (substate_length >= buffer.length)
351                          {
352                              length = decodeBase64 (buffer);
353                              if (0 > length)
354                                  throw new ParserException ("illegal length: " + length);
355                              substate_length = 0;
356                              state = STATE_PREFIX;
357                          }
358                          break;
359  
360                      case STATE_PREFIX:
361                          if (character == mPrefix[substate_prefix])
362                              substate_prefix++;
363                          else
364                              throw new ParserException ("illegal character encountered: " + (int)character + " ('" + character + "')");
365                          if (substate_prefix >= mPrefix.length)
366                          {
367                              substate_prefix = 0;
368                              state = STATE_DECODE;
369                          }
370                          break;
371  
372                      case STATE_DECODE:
373                          if ('@' == character)
374                              state = STATE_ESCAPE;
375                          else
376                          {
377                              if (input_character < 0x80)
378                              {
379                                  if (input_character == '\t')
380                                      input_character = 0;
381                                  else if (input_character >= ' ')
382                                      input_character -= ' ' - 1;
383                                  else
384                                      throw new ParserException ("illegal encoded character: " + input_character + " ('" + character + "')");
385                                  char ch = mLookupTable[mEncodingIndex[index % 64]][input_character];
386                                  ret.append (ch);
387                                  checksum += ch;
388                                  index++;
389                              }
390                              else
391                                  ret.append (character);
392                          }
393                          length--;
394                          if (0 == length)
395                          {
396                              index = 0;
397                              state = STATE_CHECKSUM;
398                          }
399                          break;
400  
401                      case STATE_ESCAPE:
402                          found = false;
403                          for (int i = 0; i < mEscapes.length; i++)
404                              if (character == mEscapes[i])
405                              {
406                                  found = true;
407                                  character = mEscaped[i];
408                              }
409                          if (!found)
410                              throw new ParserException ("unexpected escape character: " + (int)character + " ('" + character + "')");
411                          ret.append (character);
412                          checksum += character;
413                          index++;
414                          state = STATE_DECODE;
415                          length--;
416                          if (0 == length)
417                          {
418                              index = 0;
419                              state = STATE_CHECKSUM;
420                          }
421                          break;
422  
423                      case STATE_CHECKSUM:
424                          buffer[substate_checksum] = character;
425                          substate_checksum++;
426                          if (substate_checksum >= buffer.length)
427                          {
428                              long check = decodeBase64 (buffer);
429                              if (check != checksum)
430                                  throw new ParserException ("incorrect checksum, expected " + check + ", calculated " + checksum);
431                              checksum = 0;
432                              substate_checksum = 0;
433                              state = STATE_FINAL;
434                          }
435                          break;
436  
437                      case STATE_FINAL:
438                          if (character == mTrailer[substate_final])
439                              substate_final++;
440                          else
441                              throw new ParserException ("illegal character encountered: " + (int)character + " ('" + character + "')");
442                          if (substate_final >= mTrailer.length)
443                          {
444                              substate_final = 0;
445                              state = LAST_STATE;
446                          }
447                          break;
448                      default:
449                          throw new ParserException ("invalid state: " + state);
450                  }
451          }
452  
453          return (ret.toString ());
454      }
455  
456  //    /**
457  //     * Example mainline for decrypting script.
458  //     * Change a file with encrypted script into one without.
459  //     * <em>WARNING: This does not preserve DOS type line endings.</em>
460  //     * @param args Command line arguments. Two file names, input and output.
461  //     * Optionally, the character set to use as a third argument.
462  //     * @exception IOException If the input file doesn't exist, or the output
463  //     * file cannot be created.
464  //     * @exception ParserException If there is a decryption problem.
465  //     */
466  //    public static void main (String[] args)
467  //         throws
468  //            IOException,
469  //            ParserException
470  //    {
471  //        String charset;
472  //        FileInputStream in;
473  //        Page page;
474  //        Cursor cursor;
475  //        String string;
476  //        int ret;
477  //        
478  //        if (args.length < 2)
479  //        {
480  //            System.out.println ("Usage: java org.htmlparser.scanners.ScriptDecoder <infile> <outfile> [charset]");
481  //            ret = 1;
482  //        }
483  //        else
484  //        {
485  //            if (2 < args.length)
486  //                charset = args[2];
487  //            else
488  //                charset = "ISO-8859-1";
489  //            in = new FileInputStream (args[0]);
490  //            page = new Page (in, charset);
491  //            cursor = new Cursor (page, 0);
492  //            ScriptDecoder.LAST_STATE = STATE_INITIAL;
493  //            string = ScriptDecoder.Decode (page, cursor);
494  //            in.close ();
495  //            
496  //            FileOutputStream outfile = new FileOutputStream (args[1]);
497  //            outfile.write (string.getBytes (charset));
498  //            outfile.close ();
499  //            ret = (0 != string.length ()) ? 0 : 1;
500  //        }
501  //        
502  //        System.exit (ret);
503  //    }
504  }