Cradicle Explorer

/ org.htmlparser / src / org / htmlparser / util / Translate.java
Translate.java
   1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
   2  // http://sourceforge.org/projects/htmlparser
   3  // Copyright (C) 2004 Derrick Oswald
   4  //
   5  // Revision Control Information
   6  //
   7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/Translate.java,v $
   8  // $Author: derrickoswald $
   9  // $Date: 2004/07/31 16:42:33 $
  10  // $Revision: 1.46 $
  11  //
  12  // This library is free software; you can redistribute it and/or
  13  // modify it under the terms of the GNU Lesser General Public
  14  // License as published by the Free Software Foundation; either
  15  // version 2.1 of the License, or (at your option) any later version.
  16  //
  17  // This library is distributed in the hope that it will be useful,
  18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  20  // Lesser General Public License for more details.
  21  //
  22  // You should have received a copy of the GNU Lesser General Public
  23  // License along with this library; if not, write to the Free Software
  24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  25  //
  26  
  27  package org.htmlparser.util;
  28  
  29  import java.io.BufferedReader;
  30  import java.io.BufferedWriter;
  31  import java.io.IOException;
  32  import java.io.InputStream;
  33  import java.io.InputStreamReader;
  34  import java.io.OutputStreamWriter;
  35  import java.io.PrintStream;
  36  import java.io.PrintWriter;
  37  import java.io.Reader;
  38  import java.io.UnsupportedEncodingException;
  39  
  40  import org.htmlparser.util.sort.Sort;
  41  
  42  /**
  43   * Extended character entity reference.
  44   * Handles kernels within other strings, just for lookup purposes.
  45   */
  46  class CharacterReferenceEx extends CharacterReference
  47  {
  48      /**
  49       * The starting point in the string.
  50       */
  51      protected int mStart;
  52  
  53      /**
  54       * The ending point in the string.
  55       */
  56      protected int mEnd;
  57  
  58      /**
  59       * Zero args constructor.
  60       * This object is only ever used after setting the kernel, start and end.
  61       */
  62      public CharacterReferenceEx ()
  63      {
  64          super ("", 0);
  65      }
  66  
  67      /**
  68       * Set the starting point of the kernel.
  69       */
  70      public void setStart (int start)
  71      {
  72          mStart = start;
  73      }
  74  
  75      /**
  76       * Set the supposed ending point.
  77       * This only specifies an upper bound on the kernel length.
  78       */
  79      public void setEnd (int end)
  80      {
  81          mEnd = end;
  82      }
  83  
  84      /**
  85       * Get this CharacterReference's kernel.
  86       * @return The kernel in the equivalent character entity reference.
  87       */
  88      public String getKernel ()
  89      {
  90          return (mKernel.substring (mStart, mEnd));
  91      }
  92  
  93      //
  94      // Ordered interface
  95      //
  96  
  97      /**
  98       * Compare one reference to another.
  99       * @see org.htmlparser.util.sort.Ordered
 100       */
 101      public int compare (Object that)
 102      {
 103          CharacterReference r;
 104          String kernel;
 105          int length;
 106          int ret;
 107  
 108          ret = 0;
 109          r = (CharacterReference)that;
 110          kernel = r.getKernel ();
 111          length = kernel.length ();
 112          for (int i = mStart, j = 0; i < mEnd; i++, j++)
 113          {
 114              if (j >= length)
 115              {
 116                  ret = 1;
 117                  break;
 118              }
 119              ret = mKernel.charAt (i) - kernel.charAt (j);
 120              if (0 != ret)
 121                  break;
 122          }
 123  
 124          return (ret);
 125      }
 126  }
 127  
 128  /**
 129   * Translate numeric character references and character entity references to unicode characters.
 130   * Based on tables found at <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">
 131   * http://www.w3.org/TR/REC-html40/sgml/entities.html</a>
 132   * <p>Typical usage:
 133   * <pre>
 134   *      String s = Translate.decode (getTextFromHtmlPage ());
 135   * </pre>
 136   * or
 137   * <pre>
 138   *      String s = "&lt;HTML&gt;" + Translate.encode (getArbitraryText ()) + "&lt;/HTML&gt;";
 139   * </pre>
 140   */
 141  public class Translate
 142  {
 143      /**
 144       * If this member is set <code>true</code>, decoding of streams is
 145       * done line by line in order to reduce the maximum memory required.
 146       */
 147      static public boolean DECODE_LINE_BY_LINE = false;
 148  
 149      /**
 150       * If this member is set <code>true</code>, encoding of numeric character
 151       * references uses hexadecimal digits, i.e. &amp;#x25CB;, instead of decimal
 152       * digits.
 153       */
 154      static public boolean ENCODE_HEXADECIMAL = false;
 155  
 156      /**
 157       * Table mapping entity reference kernel to character.
 158       * This is sorted by kernel when the class is loaded.
 159       */
 160      protected static final CharacterReference[] mCharacterReferences =
 161      {
 162          // Portions © International Organization for Standardization 1986
 163          // Permission to copy in any form is granted for use with
 164          // conforming SGML systems and applications as defined in
 165          // ISO 8879, provided this notice is included in all copies.
 166          // Character entity set. Typical invocation:
 167          // <!ENTITY % HTMLlat1 PUBLIC
 168          // "-//W3C//ENTITIES Latin 1//EN//HTML">
 169          // %HTMLlat1;
 170          new CharacterReference ("nbsp",     '\u00a0'), // no-break space = non-breaking space, U+00A0 ISOnum
 171          new CharacterReference ("iexcl",    '\u00a1'), // inverted exclamation mark, U+00A1 ISOnum
 172          new CharacterReference ("cent",     '\u00a2'), // cent sign, U+00A2 ISOnum
 173          new CharacterReference ("pound",    '\u00a3'), // pound sign, U+00A3 ISOnum
 174          new CharacterReference ("curren",   '\u00a4'), // currency sign, U+00A4 ISOnum
 175          new CharacterReference ("yen",      '\u00a5'), // yen sign = yuan sign, U+00A5 ISOnum
 176          new CharacterReference ("brvbar",   '\u00a6'), // broken bar = broken vertical bar, U+00A6 ISOnum
 177          new CharacterReference ("sect",     '\u00a7'), // section sign, U+00A7 ISOnum
 178          new CharacterReference ("uml",      '\u00a8'), // diaeresis = spacing diaeresis, U+00A8 ISOdia
 179          new CharacterReference ("copy",     '\u00a9'), // copyright sign, U+00A9 ISOnum
 180          new CharacterReference ("ordf",     '\u00aa'), // feminine ordinal indicator, U+00AA ISOnum
 181          new CharacterReference ("laquo",    '\u00ab'), // left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum
 182          new CharacterReference ("not",      '\u00ac'), // not sign, U+00AC ISOnum
 183          new CharacterReference ("shy",      '\u00ad'), // soft hyphen = discretionary hyphen, U+00AD ISOnum
 184          new CharacterReference ("reg",      '\u00ae'), // registered sign = registered trade mark sign, U+00AE ISOnum
 185          new CharacterReference ("macr",     '\u00af'), // macron = spacing macron = overline = APL overbar, U+00AF ISOdia
 186          new CharacterReference ("deg",      '\u00b0'), // degree sign, U+00B0 ISOnum
 187          new CharacterReference ("plusmn",   '\u00b1'), // plus-minus sign = plus-or-minus sign, U+00B1 ISOnum
 188          new CharacterReference ("sup2",     '\u00b2'), // superscript two = superscript digit two = squared, U+00B2 ISOnum
 189          new CharacterReference ("sup3",     '\u00b3'), // superscript three = superscript digit three = cubed, U+00B3 ISOnum
 190          new CharacterReference ("acute",    '\u00b4'), // acute accent = spacing acute, U+00B4 ISOdia
 191          new CharacterReference ("micro",    '\u00b5'), // micro sign, U+00B5 ISOnum
 192          new CharacterReference ("para",     '\u00b6'), // pilcrow sign = paragraph sign, U+00B6 ISOnum
 193          new CharacterReference ("middot",   '\u00b7'), // middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum
 194          new CharacterReference ("cedil",    '\u00b8'), // cedilla = spacing cedilla, U+00B8 ISOdia
 195          new CharacterReference ("sup1",     '\u00b9'), // superscript one = superscript digit one, U+00B9 ISOnum
 196          new CharacterReference ("ordm",     '\u00ba'), // masculine ordinal indicator, U+00BA ISOnum
 197          new CharacterReference ("raquo",    '\u00bb'), // right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum
 198          new CharacterReference ("frac14",   '\u00bc'), // vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum
 199          new CharacterReference ("frac12",   '\u00bd'), // vulgar fraction one half = fraction one half, U+00BD ISOnum
 200          new CharacterReference ("frac34",   '\u00be'), // vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum
 201          new CharacterReference ("iquest",   '\u00bf'), // inverted question mark = turned question mark, U+00BF ISOnum
 202          new CharacterReference ("Agrave",   '\u00c0'), // latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1
 203          new CharacterReference ("Aacute",   '\u00c1'), // latin capital letter A with acute, U+00C1 ISOlat1
 204          new CharacterReference ("Acirc",    '\u00c2'), // latin capital letter A with circumflex, U+00C2 ISOlat1
 205          new CharacterReference ("Atilde",   '\u00c3'), // latin capital letter A with tilde, U+00C3 ISOlat1
 206          new CharacterReference ("Auml",     '\u00c4'), // latin capital letter A with diaeresis, U+00C4 ISOlat1
 207          new CharacterReference ("Aring",    '\u00c5'), // latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1
 208          new CharacterReference ("AElig",    '\u00c6'), // latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
 209          new CharacterReference ("Ccedil",   '\u00c7'), // latin capital letter C with cedilla, U+00C7 ISOlat1
 210          new CharacterReference ("Egrave",   '\u00c8'), // latin capital letter E with grave, U+00C8 ISOlat1
 211          new CharacterReference ("Eacute",   '\u00c9'), // latin capital letter E with acute, U+00C9 ISOlat1
 212          new CharacterReference ("Ecirc",    '\u00ca'), // latin capital letter E with circumflex, U+00CA ISOlat1
 213          new CharacterReference ("Euml",     '\u00cb'), // latin capital letter E with diaeresis, U+00CB ISOlat1
 214          new CharacterReference ("Igrave",   '\u00cc'), // latin capital letter I with grave, U+00CC ISOlat1
 215          new CharacterReference ("Iacute",   '\u00cd'), // latin capital letter I with acute, U+00CD ISOlat1
 216          new CharacterReference ("Icirc",    '\u00ce'), // latin capital letter I with circumflex, U+00CE ISOlat1
 217          new CharacterReference ("Iuml",     '\u00cf'), // latin capital letter I with diaeresis, U+00CF ISOlat1
 218          new CharacterReference ("ETH",      '\u00d0'), // latin capital letter ETH, U+00D0 ISOlat1
 219          new CharacterReference ("Ntilde",   '\u00d1'), // latin capital letter N with tilde, U+00D1 ISOlat1
 220          new CharacterReference ("Ograve",   '\u00d2'), // latin capital letter O with grave, U+00D2 ISOlat1
 221          new CharacterReference ("Oacute",   '\u00d3'), // latin capital letter O with acute, U+00D3 ISOlat1
 222          new CharacterReference ("Ocirc",    '\u00d4'), // latin capital letter O with circumflex, U+00D4 ISOlat1
 223          new CharacterReference ("Otilde",   '\u00d5'), // latin capital letter O with tilde, U+00D5 ISOlat1
 224          new CharacterReference ("Ouml",     '\u00d6'), // latin capital letter O with diaeresis, U+00D6 ISOlat1
 225          new CharacterReference ("times",    '\u00d7'), // multiplication sign, U+00D7 ISOnum
 226          new CharacterReference ("Oslash",   '\u00d8'), // latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1
 227          new CharacterReference ("Ugrave",   '\u00d9'), // latin capital letter U with grave, U+00D9 ISOlat1
 228          new CharacterReference ("Uacute",   '\u00da'), // latin capital letter U with acute, U+00DA ISOlat1
 229          new CharacterReference ("Ucirc",    '\u00db'), // latin capital letter U with circumflex, U+00DB ISOlat1
 230          new CharacterReference ("Uuml",     '\u00dc'), // latin capital letter U with diaeresis, U+00DC ISOlat1
 231          new CharacterReference ("Yacute",   '\u00dd'), // latin capital letter Y with acute, U+00DD ISOlat1
 232          new CharacterReference ("THORN",    '\u00de'), // latin capital letter THORN, U+00DE ISOlat1
 233          new CharacterReference ("szlig",    '\u00df'), // latin small letter sharp s = ess-zed, U+00DF ISOlat1
 234          new CharacterReference ("agrave",   '\u00e0'), // latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1
 235          new CharacterReference ("aacute",   '\u00e1'), // latin small letter a with acute, U+00E1 ISOlat1
 236          new CharacterReference ("acirc",    '\u00e2'), // latin small letter a with circumflex, U+00E2 ISOlat1
 237          new CharacterReference ("atilde",   '\u00e3'), // latin small letter a with tilde, U+00E3 ISOlat1
 238          new CharacterReference ("auml",     '\u00e4'), // latin small letter a with diaeresis, U+00E4 ISOlat1
 239          new CharacterReference ("aring",    '\u00e5'), // latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1
 240          new CharacterReference ("aelig",    '\u00e6'), // latin small letter ae = latin small ligature ae, U+00E6 ISOlat1
 241          new CharacterReference ("ccedil",   '\u00e7'), // latin small letter c with cedilla, U+00E7 ISOlat1
 242          new CharacterReference ("egrave",   '\u00e8'), // latin small letter e with grave, U+00E8 ISOlat1
 243          new CharacterReference ("eacute",   '\u00e9'), // latin small letter e with acute, U+00E9 ISOlat1
 244          new CharacterReference ("ecirc",    '\u00ea'), // latin small letter e with circumflex, U+00EA ISOlat1
 245          new CharacterReference ("euml",     '\u00eb'), // latin small letter e with diaeresis, U+00EB ISOlat1
 246          new CharacterReference ("igrave",   '\u00ec'), // latin small letter i with grave, U+00EC ISOlat1
 247          new CharacterReference ("iacute",   '\u00ed'), // latin small letter i with acute, U+00ED ISOlat1
 248          new CharacterReference ("icirc",    '\u00ee'), // latin small letter i with circumflex, U+00EE ISOlat1
 249          new CharacterReference ("iuml",     '\u00ef'), // latin small letter i with diaeresis, U+00EF ISOlat1
 250          new CharacterReference ("eth",      '\u00f0'), // latin small letter eth, U+00F0 ISOlat1
 251          new CharacterReference ("ntilde",   '\u00f1'), // latin small letter n with tilde, U+00F1 ISOlat1
 252          new CharacterReference ("ograve",   '\u00f2'), // latin small letter o with grave, U+00F2 ISOlat1
 253          new CharacterReference ("oacute",   '\u00f3'), // latin small letter o with acute, U+00F3 ISOlat1
 254          new CharacterReference ("ocirc",    '\u00f4'), // latin small letter o with circumflex, U+00F4 ISOlat1
 255          new CharacterReference ("otilde",   '\u00f5'), // latin small letter o with tilde, U+00F5 ISOlat1
 256          new CharacterReference ("ouml",     '\u00f6'), // latin small letter o with diaeresis, U+00F6 ISOlat1
 257          new CharacterReference ("divide",   '\u00f7'), // division sign, U+00F7 ISOnum
 258          new CharacterReference ("oslash",   '\u00f8'), // latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1
 259          new CharacterReference ("ugrave",   '\u00f9'), // latin small letter u with grave, U+00F9 ISOlat1
 260          new CharacterReference ("uacute",   '\u00fa'), // latin small letter u with acute, U+00FA ISOlat1
 261          new CharacterReference ("ucirc",    '\u00fb'), // latin small letter u with circumflex, U+00FB ISOlat1
 262          new CharacterReference ("uuml",     '\u00fc'), // latin small letter u with diaeresis, U+00FC ISOlat1
 263          new CharacterReference ("yacute",   '\u00fd'), // latin small letter y with acute, U+00FD ISOlat1
 264          new CharacterReference ("thorn",    '\u00fe'), // latin small letter thorn, U+00FE ISOlat1
 265          new CharacterReference ("yuml",     '\u00ff'), // latin small letter y with diaeresis, U+00FF ISOlat1
 266          // Mathematical, Greek and Symbolic characters for HTML
 267          // Character entity set. Typical invocation:
 268          // <!ENTITY % HTMLsymbol PUBLIC
 269          // "-//W3C//ENTITIES Symbols//EN//HTML">
 270          // %HTMLsymbol;
 271          // Portions © International Organization for Standardization 1986:
 272          // Permission to copy in any form is granted for use with
 273          // conforming SGML systems and applications as defined in
 274          // ISO 8879, provided this notice is included in all copies.
 275          // Relevant ISO entity set is given unless names are newly introduced.
 276          // New names (i.e., not in ISO 8879 list) do not clash with any
 277          // existing ISO 8879 entity names. ISO 10646 character numbers
 278          // are given for each character, in hex. CDATA values are decimal
 279          // conversions of the ISO 10646 values and refer to the document
 280          // character set. Names are ISO 10646 names.
 281          // Latin Extended-B
 282          new CharacterReference ("fnof",     '\u0192'), // latin small f with hook = function = florin, U+0192 ISOtech
 283          // Greek
 284          new CharacterReference ("Alpha",    '\u0391'), // greek capital letter alpha, U+0391
 285          new CharacterReference ("Beta",     '\u0392'), // greek capital letter beta, U+0392
 286          new CharacterReference ("Gamma",    '\u0393'), // greek capital letter gamma, U+0393 ISOgrk3
 287          new CharacterReference ("Delta",    '\u0394'), // greek capital letter delta, U+0394 ISOgrk3
 288          new CharacterReference ("Epsilon",  '\u0395'), // greek capital letter epsilon, U+0395
 289          new CharacterReference ("Zeta",     '\u0396'), // greek capital letter zeta, U+0396
 290          new CharacterReference ("Eta",      '\u0397'), // greek capital letter eta, U+0397
 291          new CharacterReference ("Theta",    '\u0398'), // greek capital letter theta, U+0398 ISOgrk3
 292          new CharacterReference ("Iota",     '\u0399'), // greek capital letter iota, U+0399
 293          new CharacterReference ("Kappa",    '\u039a'), // greek capital letter kappa, U+039A
 294          new CharacterReference ("Lambda",   '\u039b'), // greek capital letter lambda, U+039B ISOgrk3
 295          new CharacterReference ("Mu",       '\u039c'), // greek capital letter mu, U+039C
 296          new CharacterReference ("Nu",       '\u039d'), // greek capital letter nu, U+039D
 297          new CharacterReference ("Xi",       '\u039e'), // greek capital letter xi, U+039E ISOgrk3
 298          new CharacterReference ("Omicron",  '\u039f'), // greek capital letter omicron, U+039F
 299          new CharacterReference ("Pi",       '\u03a0'), // greek capital letter pi, U+03A0 ISOgrk3
 300          new CharacterReference ("Rho",      '\u03a1'), // greek capital letter rho, U+03A1
 301          // there is no Sigmaf, and no U+03A2 character either
 302          new CharacterReference ("Sigma",    '\u03a3'), // greek capital letter sigma, U+03A3 ISOgrk3
 303          new CharacterReference ("Tau",      '\u03a4'), // greek capital letter tau, U+03A4
 304          new CharacterReference ("Upsilon",  '\u03a5'), // greek capital letter upsilon, U+03A5 ISOgrk3
 305          new CharacterReference ("Phi",      '\u03a6'), // greek capital letter phi, U+03A6 ISOgrk3
 306          new CharacterReference ("Chi",      '\u03a7'), // greek capital letter chi, U+03A7
 307          new CharacterReference ("Psi",      '\u03a8'), // greek capital letter psi, U+03A8 ISOgrk3
 308          new CharacterReference ("Omega",    '\u03a9'), // greek capital letter omega, U+03A9 ISOgrk3
 309          new CharacterReference ("alpha",    '\u03b1'), // greek small letter alpha, U+03B1 ISOgrk3
 310          new CharacterReference ("beta",     '\u03b2'), // greek small letter beta, U+03B2 ISOgrk3
 311          new CharacterReference ("gamma",    '\u03b3'), // greek small letter gamma, U+03B3 ISOgrk3
 312          new CharacterReference ("delta",    '\u03b4'), // greek small letter delta, U+03B4 ISOgrk3
 313          new CharacterReference ("epsilon",  '\u03b5'), // greek small letter epsilon, U+03B5 ISOgrk3
 314          new CharacterReference ("zeta",     '\u03b6'), // greek small letter zeta, U+03B6 ISOgrk3
 315          new CharacterReference ("eta",      '\u03b7'), // greek small letter eta, U+03B7 ISOgrk3
 316          new CharacterReference ("theta",    '\u03b8'), // greek small letter theta, U+03B8 ISOgrk3
 317          new CharacterReference ("iota",     '\u03b9'), // greek small letter iota, U+03B9 ISOgrk3
 318          new CharacterReference ("kappa",    '\u03ba'), // greek small letter kappa, U+03BA ISOgrk3
 319          new CharacterReference ("lambda",   '\u03bb'), // greek small letter lambda, U+03BB ISOgrk3
 320          new CharacterReference ("mu",       '\u03bc'), // greek small letter mu, U+03BC ISOgrk3
 321          new CharacterReference ("nu",       '\u03bd'), // greek small letter nu, U+03BD ISOgrk3
 322          new CharacterReference ("xi",       '\u03be'), // greek small letter xi, U+03BE ISOgrk3
 323          new CharacterReference ("omicron",  '\u03bf'), // greek small letter omicron, U+03BF NEW
 324          new CharacterReference ("pi",       '\u03c0'), // greek small letter pi, U+03C0 ISOgrk3
 325          new CharacterReference ("rho",      '\u03c1'), // greek small letter rho, U+03C1 ISOgrk3
 326          new CharacterReference ("sigmaf",   '\u03c2'), // greek small letter final sigma, U+03C2 ISOgrk3
 327          new CharacterReference ("sigma",    '\u03c3'), // greek small letter sigma, U+03C3 ISOgrk3
 328          new CharacterReference ("tau",      '\u03c4'), // greek small letter tau, U+03C4 ISOgrk3
 329          new CharacterReference ("upsilon",  '\u03c5'), // greek small letter upsilon, U+03C5 ISOgrk3
 330          new CharacterReference ("phi",      '\u03c6'), // greek small letter phi, U+03C6 ISOgrk3
 331          new CharacterReference ("chi",      '\u03c7'), // greek small letter chi, U+03C7 ISOgrk3
 332          new CharacterReference ("psi",      '\u03c8'), // greek small letter psi, U+03C8 ISOgrk3
 333          new CharacterReference ("omega",    '\u03c9'), // greek small letter omega, U+03C9 ISOgrk3
 334          new CharacterReference ("thetasym", '\u03d1'), // greek small letter theta symbol, U+03D1 NEW
 335          new CharacterReference ("upsih",    '\u03d2'), // greek upsilon with hook symbol, U+03D2 NEW
 336          new CharacterReference ("piv",      '\u03d6'), // greek pi symbol, U+03D6 ISOgrk3
 337          // General Punctuation
 338          new CharacterReference ("bull",     '\u2022'), // bullet = black small circle, U+2022 ISOpub
 339          // bullet is NOT the same as bullet operator, U+2219
 340          new CharacterReference ("hellip",   '\u2026'), // horizontal ellipsis = three dot leader, U+2026 ISOpub
 341          new CharacterReference ("prime",    '\u2032'), // prime = minutes = feet, U+2032 ISOtech
 342          new CharacterReference ("Prime",    '\u2033'), // double prime = seconds = inches, U+2033 ISOtech
 343          new CharacterReference ("oline",    '\u203e'), // overline = spacing overscore, U+203E NEW
 344          new CharacterReference ("frasl",    '\u2044'), // fraction slash, U+2044 NEW
 345          // Letterlike Symbols
 346          new CharacterReference ("weierp",   '\u2118'), // script capital P = power set = Weierstrass p, U+2118 ISOamso
 347          new CharacterReference ("image",    '\u2111'), // blackletter capital I = imaginary part, U+2111 ISOamso
 348          new CharacterReference ("real",     '\u211c'), // blackletter capital R = real part symbol, U+211C ISOamso
 349          new CharacterReference ("trade",    '\u2122'), // trade mark sign, U+2122 ISOnum
 350          new CharacterReference ("alefsym",  '\u2135'), // alef symbol = first transfinite cardinal, U+2135 NEW
 351          // alef symbol is NOT the same as hebrew letter alef,
 352          // U+05D0 although the same glyph could be used to depict both characters
 353          // Arrows
 354          new CharacterReference ("larr",     '\u2190'), // leftwards arrow, U+2190 ISOnum
 355          new CharacterReference ("uarr",     '\u2191'), // upwards arrow, U+2191 ISOnum
 356          new CharacterReference ("rarr",     '\u2192'), // rightwards arrow, U+2192 ISOnum
 357          new CharacterReference ("darr",     '\u2193'), // downwards arrow, U+2193 ISOnum
 358          new CharacterReference ("harr",     '\u2194'), // left right arrow, U+2194 ISOamsa
 359          new CharacterReference ("crarr",    '\u21b5'), // downwards arrow with corner leftwards = carriage return, U+21B5 NEW
 360          new CharacterReference ("lArr",     '\u21d0'), // leftwards double arrow, U+21D0 ISOtech
 361          // ISO 10646 does not say that lArr is the same as the 'is implied by' arrow
 362          // but also does not have any other character for that function. So ? lArr can
 363          // be used for 'is implied by' as ISOtech suggests
 364          new CharacterReference ("uArr",     '\u21d1'), // upwards double arrow, U+21D1 ISOamsa
 365          new CharacterReference ("rArr",     '\u21d2'), // rightwards double arrow, U+21D2 ISOtech
 366          // ISO 10646 does not say this is the 'implies' character but does not have 
 367          // another character with this function so ?
 368          // rArr can be used for 'implies' as ISOtech suggests
 369          new CharacterReference ("dArr",     '\u21d3'), // downwards double arrow, U+21D3 ISOamsa
 370          new CharacterReference ("hArr",     '\u21d4'), // left right double arrow, U+21D4 ISOamsa
 371          // Mathematical Operators
 372          new CharacterReference ("forall",   '\u2200'), // for all, U+2200 ISOtech
 373          new CharacterReference ("part",     '\u2202'), // partial differential, U+2202 ISOtech
 374          new CharacterReference ("exist",    '\u2203'), // there exists, U+2203 ISOtech
 375          new CharacterReference ("empty",    '\u2205'), // empty set = null set = diameter, U+2205 ISOamso
 376          new CharacterReference ("nabla",    '\u2207'), // nabla = backward difference, U+2207 ISOtech
 377          new CharacterReference ("isin",     '\u2208'), // element of, U+2208 ISOtech
 378          new CharacterReference ("notin",    '\u2209'), // not an element of, U+2209 ISOtech
 379          new CharacterReference ("ni",       '\u220b'), // contains as member, U+220B ISOtech
 380          // should there be a more memorable name than 'ni'?
 381          new CharacterReference ("prod",     '\u220f'), // n-ary product = product sign, U+220F ISOamsb
 382          // prod is NOT the same character as U+03A0 'greek capital letter pi' though
 383          // the same glyph might be used for both
  384          new CharacterReference ("sum",      '\u2211'), // n-ary summation, U+2211 ISOamsb
 385          // sum is NOT the same character as U+03A3 'greek capital letter sigma'
 386          // though the same glyph might be used for both
 387          new CharacterReference ("minus",    '\u2212'), // minus sign, U+2212 ISOtech
 388          new CharacterReference ("lowast",   '\u2217'), // asterisk operator, U+2217 ISOtech
 389          new CharacterReference ("radic",    '\u221a'), // square root = radical sign, U+221A ISOtech
 390          new CharacterReference ("prop",     '\u221d'), // proportional to, U+221D ISOtech
 391          new CharacterReference ("infin",    '\u221e'), // infinity, U+221E ISOtech
 392          new CharacterReference ("ang",      '\u2220'), // angle, U+2220 ISOamso
 393          new CharacterReference ("and",      '\u2227'), // logical and = wedge, U+2227 ISOtech
 394          new CharacterReference ("or",       '\u2228'), // logical or = vee, U+2228 ISOtech
 395          new CharacterReference ("cap",      '\u2229'), // intersection = cap, U+2229 ISOtech
 396          new CharacterReference ("cup",      '\u222a'), // union = cup, U+222A ISOtech
 397          new CharacterReference ("int",      '\u222b'), // integral, U+222B ISOtech
 398          new CharacterReference ("there4",   '\u2234'), // therefore, U+2234 ISOtech
 399          new CharacterReference ("sim",      '\u223c'), // tilde operator = varies with = similar to, U+223C ISOtech
 400          // tilde operator is NOT the same character as the tilde, U+007E,
 401          // although the same glyph might be used to represent both
 402          new CharacterReference ("cong",     '\u2245'), // approximately equal to, U+2245 ISOtech
 403          new CharacterReference ("asymp",    '\u2248'), // almost equal to = asymptotic to, U+2248 ISOamsr
 404          new CharacterReference ("ne",       '\u2260'), // not equal to, U+2260 ISOtech
 405          new CharacterReference ("equiv",    '\u2261'), // identical to, U+2261 ISOtech
 406          new CharacterReference ("le",       '\u2264'), // less-than or equal to, U+2264 ISOtech
 407          new CharacterReference ("ge",       '\u2265'), // greater-than or equal to, U+2265 ISOtech
 408          new CharacterReference ("sub",      '\u2282'), // subset of, U+2282 ISOtech
 409          new CharacterReference ("sup",      '\u2283'), // superset of, U+2283 ISOtech
  410          // note that nsup, 'not a superset of, U+2285' is not covered by the Symbol 
 411          // font encoding and is not included. Should it be, for symmetry?
 412          // It is in ISOamsn
 413          new CharacterReference ("nsub",     '\u2284'), // not a subset of, U+2284 ISOamsn
 414          new CharacterReference ("sube",     '\u2286'), // subset of or equal to, U+2286 ISOtech
 415          new CharacterReference ("supe",     '\u2287'), // superset of or equal to, U+2287 ISOtech
 416          new CharacterReference ("oplus",    '\u2295'), // circled plus = direct sum, U+2295 ISOamsb
 417          new CharacterReference ("otimes",   '\u2297'), // circled times = vector product, U+2297 ISOamsb
 418          new CharacterReference ("perp",     '\u22a5'), // up tack = orthogonal to = perpendicular, U+22A5 ISOtech
 419          new CharacterReference ("sdot",     '\u22c5'), // dot operator, U+22C5 ISOamsb
 420          // dot operator is NOT the same character as U+00B7 middle dot
 421          // Miscellaneous Technical
 422          new CharacterReference ("lceil",    '\u2308'), // left ceiling = apl upstile, U+2308 ISOamsc
 423          new CharacterReference ("rceil",    '\u2309'), // right ceiling, U+2309 ISOamsc
 424          new CharacterReference ("lfloor",   '\u230a'), // left floor = apl downstile, U+230A ISOamsc
 425          new CharacterReference ("rfloor",   '\u230b'), // right floor, U+230B ISOamsc
 426          new CharacterReference ("lang",     '\u2329'), // left-pointing angle bracket = bra, U+2329 ISOtech
 427          // lang is NOT the same character as U+003C 'less than' 
 428          // or U+2039 'single left-pointing angle quotation mark'
 429          new CharacterReference ("rang",     '\u232a'), // right-pointing angle bracket = ket, U+232A ISOtech
 430          // rang is NOT the same character as U+003E 'greater than' 
 431          // or U+203A 'single right-pointing angle quotation mark'
 432          // Geometric Shapes
 433          new CharacterReference ("loz",      '\u25ca'), // lozenge, U+25CA ISOpub
 434          // Miscellaneous Symbols
 435          new CharacterReference ("spades",   '\u2660'), // black spade suit, U+2660 ISOpub
 436          // black here seems to mean filled as opposed to hollow
 437          new CharacterReference ("clubs",    '\u2663'), // black club suit = shamrock, U+2663 ISOpub
 438          new CharacterReference ("hearts",   '\u2665'), // black heart suit = valentine, U+2665 ISOpub
 439          new CharacterReference ("diams",    '\u2666'), // black diamond suit, U+2666 ISOpub
 440          // Special characters for HTML
 441          // Character entity set. Typical invocation:
 442          // <!ENTITY % HTMLspecial PUBLIC
 443          // "-//W3C//ENTITIES Special//EN//HTML">
 444          // %HTMLspecial;
 445          // Portions © International Organization for Standardization 1986:
 446          // Permission to copy in any form is granted for use with
 447          // conforming SGML systems and applications as defined in
 448          // ISO 8879, provided this notice is included in all copies.
 449          // Relevant ISO entity set is given unless names are newly introduced.
 450          // New names (i.e., not in ISO 8879 list) do not clash with any
 451          // existing ISO 8879 entity names. ISO 10646 character numbers
 452          // are given for each character, in hex. CDATA values are decimal
 453          // conversions of the ISO 10646 values and refer to the document
 454          // character set. Names are ISO 10646 names.
 455          // C0 Controls and Basic Latin
 456          new CharacterReference ("quot",     '\u0022'), // quotation mark = APL quote, U+0022 ISOnum
 457          new CharacterReference ("amp",      '\u0026'), // ampersand, U+0026 ISOnum
 458          new CharacterReference ("lt",       '\u003c'), // less-than sign, U+003C ISOnum
 459          new CharacterReference ("gt",       '\u003e'), // greater-than sign, U+003E ISOnum
 460          // Latin Extended-A
 461          new CharacterReference ("OElig",    '\u0152'), // latin capital ligature OE, U+0152 ISOlat2
 462          new CharacterReference ("oelig",    '\u0153'), // latin small ligature oe, U+0153 ISOlat2
 463          // ligature is a misnomer, this is a separate character in some languages
 464          new CharacterReference ("Scaron",   '\u0160'), // latin capital letter S with caron, U+0160 ISOlat2
 465          new CharacterReference ("scaron",   '\u0161'), // latin small letter s with caron, U+0161 ISOlat2
 466          new CharacterReference ("Yuml",     '\u0178'), // latin capital letter Y with diaeresis, U+0178 ISOlat2
 467          // Spacing Modifier Letters
 468          new CharacterReference ("circ",     '\u02c6'), // modifier letter circumflex accent, U+02C6 ISOpub
 469          new CharacterReference ("tilde",    '\u02dc'), // small tilde, U+02DC ISOdia
 470          // General Punctuation
 471          new CharacterReference ("ensp",     '\u2002'), // en space, U+2002 ISOpub
 472          new CharacterReference ("emsp",     '\u2003'), // em space, U+2003 ISOpub
 473          new CharacterReference ("thinsp",   '\u2009'), // thin space, U+2009 ISOpub
 474          new CharacterReference ("zwnj",     '\u200c'), // zero width non-joiner, U+200C NEW RFC 2070
 475          new CharacterReference ("zwj",      '\u200d'), // zero width joiner, U+200D NEW RFC 2070
 476          new CharacterReference ("lrm",      '\u200e'), // left-to-right mark, U+200E NEW RFC 2070
 477          new CharacterReference ("rlm",      '\u200f'), // right-to-left mark, U+200F NEW RFC 2070
 478          new CharacterReference ("ndash",    '\u2013'), // en dash, U+2013 ISOpub
 479          new CharacterReference ("mdash",    '\u2014'), // em dash, U+2014 ISOpub
 480          new CharacterReference ("lsquo",    '\u2018'), // left single quotation mark, U+2018 ISOnum
 481          new CharacterReference ("rsquo",    '\u2019'), // right single quotation mark, U+2019 ISOnum
 482          new CharacterReference ("sbquo",    '\u201a'), // single low-9 quotation mark, U+201A NEW
 483          new CharacterReference ("ldquo",    '\u201c'), // left double quotation mark, U+201C ISOnum
 484          new CharacterReference ("rdquo",    '\u201d'), // right double quotation mark, U+201D ISOnum
 485          new CharacterReference ("bdquo",    '\u201e'), // double low-9 quotation mark, U+201E NEW
 486          new CharacterReference ("dagger",   '\u2020'), // dagger, U+2020 ISOpub
 487          new CharacterReference ("Dagger",   '\u2021'), // double dagger, U+2021 ISOpub
 488          new CharacterReference ("permil",   '\u2030'), // per mille sign, U+2030 ISOtech
 489          new CharacterReference ("lsaquo",   '\u2039'), // single left-pointing angle quotation mark, U+2039 ISO proposed
 490          // lsaquo is proposed but not yet ISO standardized
 491          new CharacterReference ("rsaquo",   '\u203a'), // single right-pointing angle quotation mark, U+203A ISO proposed
 492          // rsaquo is proposed but not yet ISO standardized
 493          new CharacterReference ("euro",     '\u20ac'), // euro sign, U+20AC NEW
 494      };
 495  
 496      /**
 497       * The dividing point between a simple table lookup and a binary search.
 498       * Characters below the break point are stored in a sparse array allowing
 499       * direct index lookup.
 500       */
 501      protected static final int BREAKPOINT = 0x100;
 502  
 503      /**
 504       * List of references sorted by character.
 505       * The first part of this array, up to <code>BREAKPOINT</code> is stored
 506       * in a direct translational table, indexing into the table with a character
 507       * yields the reference. The second part is dense and sorted by character,
 508       * suitable for binary lookup.
 509       */
 510      protected static final CharacterReference[] mCharacterList;
 511  
 512      static
 513      {
 514          int index;
 515          CharacterReference item;
 516          int character;
 517  
 518          // count below the break point
 519          index = 0;
 520          for (int i = 0; i < mCharacterReferences.length; i++)
 521              if (mCharacterReferences[i].getCharacter () < BREAKPOINT)
 522                  index++;
 523          // allocate enough for the linear table and remainder
 524          mCharacterList = new CharacterReference[BREAKPOINT + mCharacterReferences.length - index];
 525          index = BREAKPOINT;
 526          for (int i = 0; i < mCharacterReferences.length; i++)
 527          {
 528              item = mCharacterReferences[i];
 529              character = mCharacterReferences[i].getCharacter ();
 530              if (character < BREAKPOINT)
 531                  mCharacterList[character] = item;
 532              else
 533              {
 534                  // use a linear search and insertion sort, done only once
 535                  int x = BREAKPOINT;
 536                  while (x < index)
 537                      if (mCharacterList[x].getCharacter () > character)
 538                          break;
 539                      else
 540                          x++;
 541                  int y = index - 1;
 542                  while (y >= x)
 543                  {
 544                      mCharacterList[y + 1] = mCharacterList[y];
 545                      y--;
 546                  }
 547                  mCharacterList[x] = item;
 548                  index++;
 549              }
 550          }
 551          // reorder the original array into kernel order
 552          Sort.QuickSort (mCharacterReferences);
 553      }
 554  
 555      /**
 556       * Private constructor.
 557       * This class is fully static and thread safe.
 558       */
 559      private Translate ()
 560      {
 561      }
 562  
 563      /**
 564       * Binary search for a reference.
 565       * @param array The array of <code>CharacterReference</code> objects.
 566       * @param ref The character to search for.
 567       * @param lo The lower index within which to look.
 568       * @param hi The upper index within which to look.
 569       * @return The index at which reference was found or is to be inserted.
 570       */
 571      protected static int lookup (CharacterReference[] array, char ref, int lo, int hi)
 572      {   int num;
 573          int mid;
 574          int half;
 575          int result;
 576          int ret;
 577  
 578          ret = -1;
 579  
 580          num = (hi - lo) + 1;
 581          while ((-1 == ret) && (lo <= hi))
 582          {
 583              half = num / 2;
 584              mid = lo + ((0 != (num & 1)) ? half : half - 1);
 585              result = ref - array[mid].getCharacter ();
 586              if (0 == result)
 587                  ret = mid;
 588              else if (0 > result)
 589              {
 590                  hi = mid - 1;
 591                  num = ((0 != (num & 1)) ? half : half - 1);
 592              }
 593              else
 594              {
 595                  lo = mid + 1;
 596                  num = half;
 597              }
 598          }
 599          if (-1 == ret)
 600              ret = lo;
 601  
 602          return (ret);
 603      }
 604  
 605      /**
 606       * Look up a reference by character.
 607       * Use a combination of direct table lookup and binary search to find
 608       * the reference corresponding to the character.
 609       * @param character The character to be looked up.
 610       * @return The entity reference for that character or <code>null</code>.
 611       */
 612      public static CharacterReference lookup (char character)
 613      {
 614          int index;
 615          CharacterReference ret;
 616  
 617          if (character < BREAKPOINT)
 618              ret = mCharacterList[character];
 619          else
 620          {
 621              index = lookup (mCharacterList, character, BREAKPOINT, mCharacterList.length - 1);
 622              if (index < mCharacterList.length)
 623              {
 624                  ret = mCharacterList[index];
 625                  if (character != ret.getCharacter ())
 626                      ret = null;
 627              }
 628              else
 629                  ret = null;
 630          }
 631          
 632          return (ret);
 633      }
 634  
 635      /**
 636       * Look up a reference by kernel.
 637       * Use a binary search on the ordered list of known references.
 638       * Since the binary search returns the position at which a new item should
 639       * be inserted, we check the references earlier in the list if there is
 640       * a failure.
 641       * @param key A character reference with the kernel set to the string
 642       * to be found. It need not be truncated at the exact end of the reference.
 643       */
 644      protected static CharacterReference lookup (CharacterReference key)
 645      {
 646          String string;
 647          int index;
 648          String kernel;
 649          char character;
 650          CharacterReference test;
 651          CharacterReference ret;
 652  
 653          // Care should be taken here because some entity references are
 654          // prefixes of others, i.e.:
 655          // \u2209[notin] \u00ac[not]
 656          // \u00ba[ordm] \u2228[or]
 657          // \u03d6[piv] \u03c0[pi]
 658          // \u00b3[sup3] \u2283[sup]
 659          ret = null;
 660          index = Sort.bsearch (mCharacterReferences, key);
 661          string = key.getKernel ();
 662          if (index < mCharacterReferences.length)
 663          {
 664              ret = mCharacterReferences[index];
 665              kernel = ret.getKernel ();
 666              if (!string.regionMatches (
 667                  0,
 668                  kernel,
 669                  0,
 670                  kernel.length ()))
 671              {   // not exact, check references starting with same character
 672                  // to see if a subset matches
 673                  ret = null;
 674              }
 675          }
 676          if (null == ret)
 677          {
 678              character = string.charAt (0);
 679              while (--index >= 0)
 680              {
 681                  test = mCharacterReferences[index];
 682                  kernel = test.getKernel ();
 683                  if (character == kernel.charAt (0))
 684                  {
 685                      if (string.regionMatches (
 686                          0,
 687                          kernel,
 688                          0,
 689                          kernel.length ()))
 690                      {
 691                          ret = test;
 692                          break;
 693                      }
 694                  }
 695                  else
 696                      break;
 697              }
 698          }
 699          
 700          return (ret);
 701      }
 702  
 703      /**
 704       * Look up a reference by kernel.
 705       * Use a binary search on the ordered list of known references.
 706       * <em>This is not very efficient, use {@link org.htmlparser.util.Translate#lookup(org.htmlparser.util.CharacterReference) lookup(CharacterReference)}
 707       * instead.</em>
 708       * @param kernel The string to lookup, i.e. "amp".
 709       * @param start The starting point in the string of the kernel.
 710       * @param end The ending point in the string of the kernel.
 711       * This should be the index of the semicolon if it exists, or failing that,
 712       * at least an index past the last character of the kernel.
 713       * @return The reference that matches the given string, or <code>null</code>
 714       * if it wasn't found.
 715       */
 716      public static CharacterReference lookup (String kernel, int start, int end)
 717      {
 718          CharacterReferenceEx probe;
 719          
 720          probe = new CharacterReferenceEx ();
 721          probe.setKernel (kernel);
 722          probe.setStart (start);
 723          probe.setEnd (end);
 724  
 725          return (lookup (probe));
 726      }
 727  
 728      /**
 729       * Convert a reference to a unicode character.
 730       * Convert a single numeric character reference or character entity reference
 731       * to a unicode character.
 732       * @param string The string to convert. Of the form &xxxx; or &amp;#xxxx; with
 733       * or without the leading ampersand or trailing semi-colon.
 734       * @param start The starting pooint in the string to look for a character reference.
 735       * @param end The ending point in the string to stop looking for a character reference.
 736       * @return The converted character or '' (zero) if the string is an
 737       * invalid reference.
 738       * @deprecated Use {@link #decode(String) decode}.
 739       */
 740      public static char convertToChar (String string, int start, int end)
 741      {
 742          return (decode (string.substring (start, end)).charAt (0));
 743      }
 744  
 745      /**
 746       * Convert a reference to a unicode character.
 747       * Convert a single numeric character reference or character entity reference
 748       * to a unicode character.
 749       * @param string The string to convert. Of the form &xxxx; or &amp;#xxxx; with
 750       * or without the leading ampersand or trailing semi-colon.
 751       * @return The converted character or '' (zero) if the string is an
 752       * invalid reference.
 753       * @deprecated Use {@link #decode(String) decode}.
 754       */
 755      public static char convertToChar (String string)
 756      {
 757          return (decode (string).charAt (0));
 758      }
 759  
 760      /**
 761       * Decode a string containing references.
 762       * Change all numeric character reference and character entity references
 763       * to unicode characters.
 764       * @param string The string to translate.
 765       */
 766      public static String decode (String string)
 767      {
 768          CharacterReferenceEx key;
 769          int amp;
 770          int index;
 771          int length;
 772          StringBuffer buffer;
 773          char character;
 774          int number;
 775          int radix;
 776          int i;
 777          int semi;
 778          boolean done;
 779          CharacterReference item;
 780          String ret;
 781  
 782          if (-1 == (amp = string.indexOf ('&')))
 783              ret = string;
 784          else
 785          {
 786              key = null;
 787              index = 0;
 788              length = string.length ();
 789              buffer = new StringBuffer (length);
 790              do
 791              {
 792                  // equivalent to buffer.append (string.substring (index, amp));
 793                  // but without the allocation of a new String
 794                  while (index < amp)
 795                      buffer.append (string.charAt (index++));
 796                  
 797                  index++;
 798                  if (index < length)
 799                  {
 800                      character = string.charAt (index);
 801                      if ('#' == character)
 802                      {
 803                          // numeric character reference
 804                          index++;
 805                          number = 0;
 806                          radix = 0;
 807                          i = index;
 808                          done = false;
 809                          while ((i < length) && !done)
 810                          {
 811                              character = string.charAt (i);
 812                              switch (character)
 813                              {
 814                                  case '0':
 815                                  case '1':
 816                                  case '2':
 817                                  case '3':
 818                                  case '4':
 819                                  case '5':
 820                                  case '6':
 821                                  case '7':
 822                                  case '8':
 823                                  case '9':
 824                                      if (0 == radix)
 825                                          radix = 10;
 826                                      number = number * radix + (character - '0');
 827                                      break;
 828                                  case 'A':
 829                                  case 'B':
 830                                  case 'C':
 831                                  case 'D':
 832                                  case 'E':
 833                                  case 'F':
 834                                      if (16 == radix)
 835                                          number = number * radix + (character - 'A' + 10);
 836                                      else
 837                                          done = true;
 838                                      break;
 839                                  case 'a':
 840                                  case 'b':
 841                                  case 'c':
 842                                  case 'd':
 843                                  case 'e':
 844                                  case 'f':
 845                                      if (16 == radix)
 846                                          number = number * radix + (character - 'a' + 10);
 847                                      else
 848                                          done = true;
 849                                      break;
 850                                  case 'x':
 851                                  case 'X':
 852                                      if (0 == radix)
 853                                          radix = 16;
 854                                      else
 855                                          done = true;
 856                                      break;
 857                                  case ';':
 858                                      done = true;
 859                                      i++;
 860                                      break;
 861                                  default:
 862                                      done = true;
 863                                      break;
 864                              }
 865                              if (!done)
 866                                  i++;
 867                          }
 868                          if (0 != number)
 869                          {
 870                              buffer.append ((char)number);
 871                              index = i;
 872                              amp = index;
 873                          }
 874                          
 875                      }
 876                      else if (Character.isLetter (character)) // really can't start with a digit eh...
 877                      {
 878                          // character entity reference
 879                          i = index + 1;
 880                          done = false;
 881                          semi = length;
 882                          while ((i < length) && !done)
 883                          {
 884                              character = string.charAt (i);
 885                              if (';' == character)
 886                              {
 887                                  done = true;
 888                                  semi = i;
 889                                  i++;
 890                              }
 891                              else if (Character.isLetterOrDigit (character))
 892                                  i++;
 893                              else
 894                              {
 895                                  done = true;
 896                                  semi = i;
 897                              }
 898                          }
 899                          // new CharacterReference (string.substring (index, semi), 0);
 900                          if (null == key)
 901                              key = new CharacterReferenceEx ();
 902                          key.setKernel (string);
 903                          key.setStart (index);
 904                          key.setEnd (semi);
 905                          item = lookup (key);
 906                          if (null != item)
 907                          {
 908                              buffer.append ((char)item.getCharacter ());
 909                              index += item.getKernel ().length ();
 910                              if ((index < length) && (';' == string.charAt (index)))
 911                                  index++;
 912                              amp = index;
 913                          }
 914                      }
 915                      else
 916                      {
 917                          // need do nothing here, the ampersand will be consumed below
 918                      }
 919                  }
 920                  // gather up unconsumed characters
 921                  while (amp < index)
 922                      buffer.append (string.charAt (amp++));
 923              }
 924              while ((index < length) && (-1 != (amp = string.indexOf ('&', index))));
 925              // equivalent to buffer.append (string.substring (index));
 926              // but without the allocation of a new String
 927              while (index < length)
 928                  buffer.append (string.charAt (index++));
 929              ret = buffer.toString ();
 930          }
 931  
 932          return (ret);
 933      }
 934  
 935      /**
 936       * Decode the characters in a string buffer containing references.
 937       * Change all numeric character reference and character entity references
 938       * to unicode characters.
 939       * @param buffer The StringBuffer containing references.
 940       * @return The decoded string.
 941       */
 942      public static String decode (StringBuffer buffer)
 943      {
 944          return decode (buffer.toString());
 945      }
 946  
 947      /**
 948       * Decode a stream containing references.
 949       * Change all numeric character reference and character entity references
 950       * to unicode characters. If <code>DECODE_LINE_BY_LINE</code> is true,
 951       * the input stream is broken up into lines, terminated by either
 952       * carriage return or newline, in order to reduce the latency and maximum
 953       * buffering memory size required.
 954       * @param in The stream to translate. It is assumed that the input
 955       * stream is encoded with ISO-8859-1 since the table of character
 956       * entity references in this class applies only to ISO-8859-1.
 957       * @param out The stream to write the decoded stream to.
 958       */
 959      public static void decode (InputStream in, PrintStream out)
 960      {
 961          Reader reader;
 962          StringBuffer buffer;
 963          int character;
 964          String string;
 965          boolean newlines;
 966  
 967          try
 968          {
 969              try
 970              {
 971                  reader = new BufferedReader (new InputStreamReader (in, "ISO-8859-1"));
 972              }
 973              catch (UnsupportedEncodingException use)
 974              {
 975                  // yeah, like this will happen; OK, assume the default is ISO-8859-1
 976                  reader = new BufferedReader (new InputStreamReader (in));
 977              }
 978              buffer = new StringBuffer (1024);
 979              newlines = false;
 980              if (DECODE_LINE_BY_LINE)
 981                  while (-1 != (character = reader.read ()))
 982                  {
 983                      if (('\r' == character) || ('\n' == character))
 984                      {
 985                          if (!newlines)
 986                          {
 987                              string = decode (buffer.toString ());
 988                              out.print (string);
 989                              buffer.setLength (0);
 990                              newlines = true;
 991                          }
 992                          buffer.append ((char)character);
 993                      }
 994                      else
 995                      {
 996                          if (newlines)
 997                          {
 998                              out.print (buffer.toString ());
 999                              buffer.setLength (0);
1000                              newlines = false;
1001                          }
1002                          buffer.append ((char)character);
1003                      }
1004                  }
1005              else
1006                  while (-1 != (character = reader.read ()))
1007                      buffer.append ((char)character);
1008              if (0 != buffer.length ())
1009              {
1010                  if (newlines)
1011                      out.print (buffer.toString ());
1012                  else
1013                  {
1014                      string = decode (buffer.toString ());
1015                      out.print (string);
1016                  }
1017              }
1018          }
1019          catch (IOException ioe)
1020          {
1021              out.println ();
1022              out.println (ioe.getMessage ());
1023          }
1024          finally
1025          {
1026              out.flush ();
1027          }
1028      }
1029  
1030      /**
1031       * Convert a character to a numeric character reference.
1032       * Convert a unicode character to a numeric character reference of
1033       * the form &amp;#xxxx;.
1034       * @param character The character to convert.
1035       * @return The converted character.
1036       * @deprecated Use {@link #encode(int) encode}.
1037       */
1038      public static String convertToString (int character)
1039      {
1040          return (encode (character));
1041      }
1042  
1043      /**
1044       * Convert a character to a numeric character reference.
1045       * Convert a unicode character to a numeric character reference of
1046       * the form &amp;#xxxx;.
1047       * @param character The character to convert.
1048       * @return The converted character.
1049       */
1050      public static String encode (int character)
1051      {
1052          StringBuffer ret;
1053  
1054          ret = new StringBuffer (13); /* &#2147483647; */
1055          ret.append ("&#");
1056          if (ENCODE_HEXADECIMAL)
1057          {
1058              ret.append ("x");
1059              ret.append (Integer.toHexString (character));
1060          }
1061          else
1062              ret.append (character);
1063          ret.append (';');
1064  
1065          return (ret.toString ());
1066      }
1067      
1068      /**
1069       * Encode a string to use references.
1070       * Change all characters that are not ISO-8859-1 to their numeric character
1071       * reference or character entity reference.
1072       * @param string The string to translate.
1073       * @return The encoded string.
1074       */
1075      public static String encode (String string)
1076      {
1077          int length;
1078          char c;
1079          CharacterReference candidate;
1080          StringBuffer ret;
1081  
1082          ret = new StringBuffer (string.length () * 6);
1083          length  = string.length ();
1084          for (int i = 0; i < length; i++)
1085          {
1086              c = string.charAt (i);
1087              candidate = lookup (c);
1088              if (null != candidate)
1089              {
1090                  ret.append ('&');
1091                  ret.append (candidate.getKernel ());
1092                  ret.append (';');
1093              }
1094              else if (!(c < 0x007F))
1095              {
1096                  ret.append ("&#");
1097                  if (ENCODE_HEXADECIMAL)
1098                  {
1099                      ret.append ("x");
1100                      ret.append (Integer.toHexString (c));
1101                  }
1102                  else
1103                      ret.append ((int)c);
1104                  ret.append (';');
1105              }
1106              else
1107                  ret.append (c);
1108          }
1109  
1110          return (ret.toString ());
1111      }
1112  
1113      /**
1114       * Encode a stream to use references.
1115       * Change all characters that are not ISO-8859-1 to their numeric character
1116       * reference or character entity reference.
1117       * @param in The stream to translate. It is assumed that the input
1118       * stream is encoded with ISO-8859-1 since the table of character
1119       * entity references in this class applies only to ISO-8859-1.
1120       * @param out The stream to write the decoded stream to.
1121       */
1122      public static void encode (InputStream in, PrintStream out)
1123      {
1124          Reader reader;
1125          char c;
1126          int index;
1127          CharacterReference candidate;
1128          PrintWriter output;
1129  
1130          try
1131          {
1132              reader = new BufferedReader (new InputStreamReader (in, "ISO-8859-1"));
1133              output = new PrintWriter (new BufferedWriter (new OutputStreamWriter (out, "ISO-8859-1")));
1134          }
1135          catch (UnsupportedEncodingException use)
1136          {
1137              // yeah, like this will happen; OK, assume default is ISO-8859-1
1138              reader = new BufferedReader (new InputStreamReader (in));
1139              output = new PrintWriter (new BufferedWriter (new OutputStreamWriter (out)));
1140          }
1141          try
1142          {
1143              while (-1 != (index = reader.read ()))
1144              {
1145                  c = (char)index;
1146                  candidate = lookup (c);
1147                  if (null != candidate)
1148                  {
1149                      output.print ('&');
1150                      output.print (candidate.getKernel ());
1151                      output.print (';');
1152                  }
1153                  else if (!(c < 0x007F))
1154                  {
1155                      output.print ("&#");
1156                      if (ENCODE_HEXADECIMAL)
1157                      {
1158                          output.print ("x");
1159                          output.print (Integer.toHexString (c));
1160                      }
1161                      else
1162                          output.print ((int)c);
1163                      output.print (';');
1164                  }
1165                  else
1166                      output.print (c);
1167              }
1168          }
1169          catch (IOException ioe)
1170          {
1171              output.println ();
1172              output.println (ioe.getMessage ());
1173          }
1174          finally
1175          {
1176              output.flush ();
1177          }
1178      }
1179  
1180      /**
1181       * Numeric character reference and character entity reference to unicode codec.
1182       * Translate the <code>System.in</code> input into an encoded or decoded
1183       * stream and send the results to <code>System.out</code>.
1184       * @param args If arg[0] is <code>-encode</code> perform an encoding on
1185       * <code>System.in</code>, otherwise perform a decoding.
1186       */
1187      public static void main (String[] args)
1188      {
1189          boolean encode;
1190  
1191          if (0 < args.length && args[0].equalsIgnoreCase ("-encode"))
1192              encode = true;
1193          else
1194              encode = false;
1195          if (encode)
1196              encode (System.in, System.out);
1197          else
1198              decode (System.in, System.out);
1199      }
1200  }