Translate.java
1 // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML 2 // http://sourceforge.org/projects/htmlparser 3 // Copyright (C) 2004 Derrick Oswald 4 // 5 // Revision Control Information 6 // 7 // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/Translate.java,v $ 8 // $Author: derrickoswald $ 9 // $Date: 2004/07/31 16:42:33 $ 10 // $Revision: 1.46 $ 11 // 12 // This library is free software; you can redistribute it and/or 13 // modify it under the terms of the GNU Lesser General Public 14 // License as published by the Free Software Foundation; either 15 // version 2.1 of the License, or (at your option) any later version. 16 // 17 // This library is distributed in the hope that it will be useful, 18 // but WITHOUT ANY WARRANTY; without even the implied warranty of 19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 // Lesser General Public License for more details. 21 // 22 // You should have received a copy of the GNU Lesser General Public 23 // License along with this library; if not, write to the Free Software 24 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 // 26 27 package org.htmlparser.util; 28 29 import java.io.BufferedReader; 30 import java.io.BufferedWriter; 31 import java.io.IOException; 32 import java.io.InputStream; 33 import java.io.InputStreamReader; 34 import java.io.OutputStreamWriter; 35 import java.io.PrintStream; 36 import java.io.PrintWriter; 37 import java.io.Reader; 38 import java.io.UnsupportedEncodingException; 39 40 import org.htmlparser.util.sort.Sort; 41 42 /** 43 * Extended character entity reference. 44 * Handles kernels within other strings, just for lookup purposes. 45 */ 46 class CharacterReferenceEx extends CharacterReference 47 { 48 /** 49 * The starting point in the string. 50 */ 51 protected int mStart; 52 53 /** 54 * The ending point in the string. 55 */ 56 protected int mEnd; 57 58 /** 59 * Zero args constructor. 60 * This object is only ever used after setting the kernel, start and end. 61 */ 62 public CharacterReferenceEx () 63 { 64 super ("", 0); 65 } 66 67 /** 68 * Set the starting point of the kernel. 69 */ 70 public void setStart (int start) 71 { 72 mStart = start; 73 } 74 75 /** 76 * Set the supposed ending point. 77 * This only specifies an upper bound on the kernel length. 78 */ 79 public void setEnd (int end) 80 { 81 mEnd = end; 82 } 83 84 /** 85 * Get this CharacterReference's kernel. 86 * @return The kernel in the equivalent character entity reference. 87 */ 88 public String getKernel () 89 { 90 return (mKernel.substring (mStart, mEnd)); 91 } 92 93 // 94 // Ordered interface 95 // 96 97 /** 98 * Compare one reference to another. 99 * @see org.htmlparser.util.sort.Ordered 100 */ 101 public int compare (Object that) 102 { 103 CharacterReference r; 104 String kernel; 105 int length; 106 int ret; 107 108 ret = 0; 109 r = (CharacterReference)that; 110 kernel = r.getKernel (); 111 length = kernel.length (); 112 for (int i = mStart, j = 0; i < mEnd; i++, j++) 113 { 114 if (j >= length) 115 { 116 ret = 1; 117 break; 118 } 119 ret = mKernel.charAt (i) - kernel.charAt (j); 120 if (0 != ret) 121 break; 122 } 123 124 return (ret); 125 } 126 } 127 128 /** 129 * Translate numeric character references and character entity references to unicode characters. 130 * Based on tables found at <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html"> 131 * http://www.w3.org/TR/REC-html40/sgml/entities.html</a> 132 * <p>Typical usage: 133 * <pre> 134 * String s = Translate.decode (getTextFromHtmlPage ()); 135 * </pre> 136 * or 137 * <pre> 138 * String s = "<HTML>" + Translate.encode (getArbitraryText ()) + "</HTML>"; 139 * </pre> 140 */ 141 public class Translate 142 { 143 /** 144 * If this member is set <code>true</code>, decoding of streams is 145 * done line by line in order to reduce the maximum memory required. 146 */ 147 static public boolean DECODE_LINE_BY_LINE = false; 148 149 /** 150 * If this member is set <code>true</code>, encoding of numeric character 151 * references uses hexadecimal digits, i.e. &#x25CB;, instead of decimal 152 * digits. 153 */ 154 static public boolean ENCODE_HEXADECIMAL = false; 155 156 /** 157 * Table mapping entity reference kernel to character. 158 * This is sorted by kernel when the class is loaded. 159 */ 160 protected static final CharacterReference[] mCharacterReferences = 161 { 162 // Portions © International Organization for Standardization 1986 163 // Permission to copy in any form is granted for use with 164 // conforming SGML systems and applications as defined in 165 // ISO 8879, provided this notice is included in all copies. 166 // Character entity set. Typical invocation: 167 // <!ENTITY % HTMLlat1 PUBLIC 168 // "-//W3C//ENTITIES Latin 1//EN//HTML"> 169 // %HTMLlat1; 170 new CharacterReference ("nbsp", '\u00a0'), // no-break space = non-breaking space, U+00A0 ISOnum 171 new CharacterReference ("iexcl", '\u00a1'), // inverted exclamation mark, U+00A1 ISOnum 172 new CharacterReference ("cent", '\u00a2'), // cent sign, U+00A2 ISOnum 173 new CharacterReference ("pound", '\u00a3'), // pound sign, U+00A3 ISOnum 174 new CharacterReference ("curren", '\u00a4'), // currency sign, U+00A4 ISOnum 175 new CharacterReference ("yen", '\u00a5'), // yen sign = yuan sign, U+00A5 ISOnum 176 new CharacterReference ("brvbar", '\u00a6'), // broken bar = broken vertical bar, U+00A6 ISOnum 177 new CharacterReference ("sect", '\u00a7'), // section sign, U+00A7 ISOnum 178 new CharacterReference ("uml", '\u00a8'), // diaeresis = spacing diaeresis, U+00A8 ISOdia 179 new CharacterReference ("copy", '\u00a9'), // copyright sign, U+00A9 ISOnum 180 new CharacterReference ("ordf", '\u00aa'), // feminine ordinal indicator, U+00AA ISOnum 181 new CharacterReference ("laquo", '\u00ab'), // left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum 182 new CharacterReference ("not", '\u00ac'), // not sign, U+00AC ISOnum 183 new CharacterReference ("shy", '\u00ad'), // soft hyphen = discretionary hyphen, U+00AD ISOnum 184 new CharacterReference ("reg", '\u00ae'), // registered sign = registered trade mark sign, U+00AE ISOnum 185 new CharacterReference ("macr", '\u00af'), // macron = spacing macron = overline = APL overbar, U+00AF ISOdia 186 new CharacterReference ("deg", '\u00b0'), // degree sign, U+00B0 ISOnum 187 new CharacterReference ("plusmn", '\u00b1'), // plus-minus sign = plus-or-minus sign, U+00B1 ISOnum 188 new CharacterReference ("sup2", '\u00b2'), // superscript two = superscript digit two = squared, U+00B2 ISOnum 189 new CharacterReference ("sup3", '\u00b3'), // superscript three = superscript digit three = cubed, U+00B3 ISOnum 190 new CharacterReference ("acute", '\u00b4'), // acute accent = spacing acute, U+00B4 ISOdia 191 new CharacterReference ("micro", '\u00b5'), // micro sign, U+00B5 ISOnum 192 new CharacterReference ("para", '\u00b6'), // pilcrow sign = paragraph sign, U+00B6 ISOnum 193 new CharacterReference ("middot", '\u00b7'), // middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum 194 new CharacterReference ("cedil", '\u00b8'), // cedilla = spacing cedilla, U+00B8 ISOdia 195 new CharacterReference ("sup1", '\u00b9'), // superscript one = superscript digit one, U+00B9 ISOnum 196 new CharacterReference ("ordm", '\u00ba'), // masculine ordinal indicator, U+00BA ISOnum 197 new CharacterReference ("raquo", '\u00bb'), // right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum 198 new CharacterReference ("frac14", '\u00bc'), // vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum 199 new CharacterReference ("frac12", '\u00bd'), // vulgar fraction one half = fraction one half, U+00BD ISOnum 200 new CharacterReference ("frac34", '\u00be'), // vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum 201 new CharacterReference ("iquest", '\u00bf'), // inverted question mark = turned question mark, U+00BF ISOnum 202 new CharacterReference ("Agrave", '\u00c0'), // latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 203 new CharacterReference ("Aacute", '\u00c1'), // latin capital letter A with acute, U+00C1 ISOlat1 204 new CharacterReference ("Acirc", '\u00c2'), // latin capital letter A with circumflex, U+00C2 ISOlat1 205 new CharacterReference ("Atilde", '\u00c3'), // latin capital letter A with tilde, U+00C3 ISOlat1 206 new CharacterReference ("Auml", '\u00c4'), // latin capital letter A with diaeresis, U+00C4 ISOlat1 207 new CharacterReference ("Aring", '\u00c5'), // latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 208 new CharacterReference ("AElig", '\u00c6'), // latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 209 new CharacterReference ("Ccedil", '\u00c7'), // latin capital letter C with cedilla, U+00C7 ISOlat1 210 new CharacterReference ("Egrave", '\u00c8'), // latin capital letter E with grave, U+00C8 ISOlat1 211 new CharacterReference ("Eacute", '\u00c9'), // latin capital letter E with acute, U+00C9 ISOlat1 212 new CharacterReference ("Ecirc", '\u00ca'), // latin capital letter E with circumflex, U+00CA ISOlat1 213 new CharacterReference ("Euml", '\u00cb'), // latin capital letter E with diaeresis, U+00CB ISOlat1 214 new CharacterReference ("Igrave", '\u00cc'), // latin capital letter I with grave, U+00CC ISOlat1 215 new CharacterReference ("Iacute", '\u00cd'), // latin capital letter I with acute, U+00CD ISOlat1 216 new CharacterReference ("Icirc", '\u00ce'), // latin capital letter I with circumflex, U+00CE ISOlat1 217 new CharacterReference ("Iuml", '\u00cf'), // latin capital letter I with diaeresis, U+00CF ISOlat1 218 new CharacterReference ("ETH", '\u00d0'), // latin capital letter ETH, U+00D0 ISOlat1 219 new CharacterReference ("Ntilde", '\u00d1'), // latin capital letter N with tilde, U+00D1 ISOlat1 220 new CharacterReference ("Ograve", '\u00d2'), // latin capital letter O with grave, U+00D2 ISOlat1 221 new CharacterReference ("Oacute", '\u00d3'), // latin capital letter O with acute, U+00D3 ISOlat1 222 new CharacterReference ("Ocirc", '\u00d4'), // latin capital letter O with circumflex, U+00D4 ISOlat1 223 new CharacterReference ("Otilde", '\u00d5'), // latin capital letter O with tilde, U+00D5 ISOlat1 224 new CharacterReference ("Ouml", '\u00d6'), // latin capital letter O with diaeresis, U+00D6 ISOlat1 225 new CharacterReference ("times", '\u00d7'), // multiplication sign, U+00D7 ISOnum 226 new CharacterReference ("Oslash", '\u00d8'), // latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1 227 new CharacterReference ("Ugrave", '\u00d9'), // latin capital letter U with grave, U+00D9 ISOlat1 228 new CharacterReference ("Uacute", '\u00da'), // latin capital letter U with acute, U+00DA ISOlat1 229 new CharacterReference ("Ucirc", '\u00db'), // latin capital letter U with circumflex, U+00DB ISOlat1 230 new CharacterReference ("Uuml", '\u00dc'), // latin capital letter U with diaeresis, U+00DC ISOlat1 231 new CharacterReference ("Yacute", '\u00dd'), // latin capital letter Y with acute, U+00DD ISOlat1 232 new CharacterReference ("THORN", '\u00de'), // latin capital letter THORN, U+00DE ISOlat1 233 new CharacterReference ("szlig", '\u00df'), // latin small letter sharp s = ess-zed, U+00DF ISOlat1 234 new CharacterReference ("agrave", '\u00e0'), // latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 235 new CharacterReference ("aacute", '\u00e1'), // latin small letter a with acute, U+00E1 ISOlat1 236 new CharacterReference ("acirc", '\u00e2'), // latin small letter a with circumflex, U+00E2 ISOlat1 237 new CharacterReference ("atilde", '\u00e3'), // latin small letter a with tilde, U+00E3 ISOlat1 238 new CharacterReference ("auml", '\u00e4'), // latin small letter a with diaeresis, U+00E4 ISOlat1 239 new CharacterReference ("aring", '\u00e5'), // latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 240 new CharacterReference ("aelig", '\u00e6'), // latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 241 new CharacterReference ("ccedil", '\u00e7'), // latin small letter c with cedilla, U+00E7 ISOlat1 242 new CharacterReference ("egrave", '\u00e8'), // latin small letter e with grave, U+00E8 ISOlat1 243 new CharacterReference ("eacute", '\u00e9'), // latin small letter e with acute, U+00E9 ISOlat1 244 new CharacterReference ("ecirc", '\u00ea'), // latin small letter e with circumflex, U+00EA ISOlat1 245 new CharacterReference ("euml", '\u00eb'), // latin small letter e with diaeresis, U+00EB ISOlat1 246 new CharacterReference ("igrave", '\u00ec'), // latin small letter i with grave, U+00EC ISOlat1 247 new CharacterReference ("iacute", '\u00ed'), // latin small letter i with acute, U+00ED ISOlat1 248 new CharacterReference ("icirc", '\u00ee'), // latin small letter i with circumflex, U+00EE ISOlat1 249 new CharacterReference ("iuml", '\u00ef'), // latin small letter i with diaeresis, U+00EF ISOlat1 250 new CharacterReference ("eth", '\u00f0'), // latin small letter eth, U+00F0 ISOlat1 251 new CharacterReference ("ntilde", '\u00f1'), // latin small letter n with tilde, U+00F1 ISOlat1 252 new CharacterReference ("ograve", '\u00f2'), // latin small letter o with grave, U+00F2 ISOlat1 253 new CharacterReference ("oacute", '\u00f3'), // latin small letter o with acute, U+00F3 ISOlat1 254 new CharacterReference ("ocirc", '\u00f4'), // latin small letter o with circumflex, U+00F4 ISOlat1 255 new CharacterReference ("otilde", '\u00f5'), // latin small letter o with tilde, U+00F5 ISOlat1 256 new CharacterReference ("ouml", '\u00f6'), // latin small letter o with diaeresis, U+00F6 ISOlat1 257 new CharacterReference ("divide", '\u00f7'), // division sign, U+00F7 ISOnum 258 new CharacterReference ("oslash", '\u00f8'), // latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 259 new CharacterReference ("ugrave", '\u00f9'), // latin small letter u with grave, U+00F9 ISOlat1 260 new CharacterReference ("uacute", '\u00fa'), // latin small letter u with acute, U+00FA ISOlat1 261 new CharacterReference ("ucirc", '\u00fb'), // latin small letter u with circumflex, U+00FB ISOlat1 262 new CharacterReference ("uuml", '\u00fc'), // latin small letter u with diaeresis, U+00FC ISOlat1 263 new CharacterReference ("yacute", '\u00fd'), // latin small letter y with acute, U+00FD ISOlat1 264 new CharacterReference ("thorn", '\u00fe'), // latin small letter thorn, U+00FE ISOlat1 265 new CharacterReference ("yuml", '\u00ff'), // latin small letter y with diaeresis, U+00FF ISOlat1 266 // Mathematical, Greek and Symbolic characters for HTML 267 // Character entity set. Typical invocation: 268 // <!ENTITY % HTMLsymbol PUBLIC 269 // "-//W3C//ENTITIES Symbols//EN//HTML"> 270 // %HTMLsymbol; 271 // Portions © International Organization for Standardization 1986: 272 // Permission to copy in any form is granted for use with 273 // conforming SGML systems and applications as defined in 274 // ISO 8879, provided this notice is included in all copies. 275 // Relevant ISO entity set is given unless names are newly introduced. 276 // New names (i.e., not in ISO 8879 list) do not clash with any 277 // existing ISO 8879 entity names. ISO 10646 character numbers 278 // are given for each character, in hex. CDATA values are decimal 279 // conversions of the ISO 10646 values and refer to the document 280 // character set. Names are ISO 10646 names. 281 // Latin Extended-B 282 new CharacterReference ("fnof", '\u0192'), // latin small f with hook = function = florin, U+0192 ISOtech 283 // Greek 284 new CharacterReference ("Alpha", '\u0391'), // greek capital letter alpha, U+0391 285 new CharacterReference ("Beta", '\u0392'), // greek capital letter beta, U+0392 286 new CharacterReference ("Gamma", '\u0393'), // greek capital letter gamma, U+0393 ISOgrk3 287 new CharacterReference ("Delta", '\u0394'), // greek capital letter delta, U+0394 ISOgrk3 288 new CharacterReference ("Epsilon", '\u0395'), // greek capital letter epsilon, U+0395 289 new CharacterReference ("Zeta", '\u0396'), // greek capital letter zeta, U+0396 290 new CharacterReference ("Eta", '\u0397'), // greek capital letter eta, U+0397 291 new CharacterReference ("Theta", '\u0398'), // greek capital letter theta, U+0398 ISOgrk3 292 new CharacterReference ("Iota", '\u0399'), // greek capital letter iota, U+0399 293 new CharacterReference ("Kappa", '\u039a'), // greek capital letter kappa, U+039A 294 new CharacterReference ("Lambda", '\u039b'), // greek capital letter lambda, U+039B ISOgrk3 295 new CharacterReference ("Mu", '\u039c'), // greek capital letter mu, U+039C 296 new CharacterReference ("Nu", '\u039d'), // greek capital letter nu, U+039D 297 new CharacterReference ("Xi", '\u039e'), // greek capital letter xi, U+039E ISOgrk3 298 new CharacterReference ("Omicron", '\u039f'), // greek capital letter omicron, U+039F 299 new CharacterReference ("Pi", '\u03a0'), // greek capital letter pi, U+03A0 ISOgrk3 300 new CharacterReference ("Rho", '\u03a1'), // greek capital letter rho, U+03A1 301 // there is no Sigmaf, and no U+03A2 character either 302 new CharacterReference ("Sigma", '\u03a3'), // greek capital letter sigma, U+03A3 ISOgrk3 303 new CharacterReference ("Tau", '\u03a4'), // greek capital letter tau, U+03A4 304 new CharacterReference ("Upsilon", '\u03a5'), // greek capital letter upsilon, U+03A5 ISOgrk3 305 new CharacterReference ("Phi", '\u03a6'), // greek capital letter phi, U+03A6 ISOgrk3 306 new CharacterReference ("Chi", '\u03a7'), // greek capital letter chi, U+03A7 307 new CharacterReference ("Psi", '\u03a8'), // greek capital letter psi, U+03A8 ISOgrk3 308 new CharacterReference ("Omega", '\u03a9'), // greek capital letter omega, U+03A9 ISOgrk3 309 new CharacterReference ("alpha", '\u03b1'), // greek small letter alpha, U+03B1 ISOgrk3 310 new CharacterReference ("beta", '\u03b2'), // greek small letter beta, U+03B2 ISOgrk3 311 new CharacterReference ("gamma", '\u03b3'), // greek small letter gamma, U+03B3 ISOgrk3 312 new CharacterReference ("delta", '\u03b4'), // greek small letter delta, U+03B4 ISOgrk3 313 new CharacterReference ("epsilon", '\u03b5'), // greek small letter epsilon, U+03B5 ISOgrk3 314 new CharacterReference ("zeta", '\u03b6'), // greek small letter zeta, U+03B6 ISOgrk3 315 new CharacterReference ("eta", '\u03b7'), // greek small letter eta, U+03B7 ISOgrk3 316 new CharacterReference ("theta", '\u03b8'), // greek small letter theta, U+03B8 ISOgrk3 317 new CharacterReference ("iota", '\u03b9'), // greek small letter iota, U+03B9 ISOgrk3 318 new CharacterReference ("kappa", '\u03ba'), // greek small letter kappa, U+03BA ISOgrk3 319 new CharacterReference ("lambda", '\u03bb'), // greek small letter lambda, U+03BB ISOgrk3 320 new CharacterReference ("mu", '\u03bc'), // greek small letter mu, U+03BC ISOgrk3 321 new CharacterReference ("nu", '\u03bd'), // greek small letter nu, U+03BD ISOgrk3 322 new CharacterReference ("xi", '\u03be'), // greek small letter xi, U+03BE ISOgrk3 323 new CharacterReference ("omicron", '\u03bf'), // greek small letter omicron, U+03BF NEW 324 new CharacterReference ("pi", '\u03c0'), // greek small letter pi, U+03C0 ISOgrk3 325 new CharacterReference ("rho", '\u03c1'), // greek small letter rho, U+03C1 ISOgrk3 326 new CharacterReference ("sigmaf", '\u03c2'), // greek small letter final sigma, U+03C2 ISOgrk3 327 new CharacterReference ("sigma", '\u03c3'), // greek small letter sigma, U+03C3 ISOgrk3 328 new CharacterReference ("tau", '\u03c4'), // greek small letter tau, U+03C4 ISOgrk3 329 new CharacterReference ("upsilon", '\u03c5'), // greek small letter upsilon, U+03C5 ISOgrk3 330 new CharacterReference ("phi", '\u03c6'), // greek small letter phi, U+03C6 ISOgrk3 331 new CharacterReference ("chi", '\u03c7'), // greek small letter chi, U+03C7 ISOgrk3 332 new CharacterReference ("psi", '\u03c8'), // greek small letter psi, U+03C8 ISOgrk3 333 new CharacterReference ("omega", '\u03c9'), // greek small letter omega, U+03C9 ISOgrk3 334 new CharacterReference ("thetasym", '\u03d1'), // greek small letter theta symbol, U+03D1 NEW 335 new CharacterReference ("upsih", '\u03d2'), // greek upsilon with hook symbol, U+03D2 NEW 336 new CharacterReference ("piv", '\u03d6'), // greek pi symbol, U+03D6 ISOgrk3 337 // General Punctuation 338 new CharacterReference ("bull", '\u2022'), // bullet = black small circle, U+2022 ISOpub 339 // bullet is NOT the same as bullet operator, U+2219 340 new CharacterReference ("hellip", '\u2026'), // horizontal ellipsis = three dot leader, U+2026 ISOpub 341 new CharacterReference ("prime", '\u2032'), // prime = minutes = feet, U+2032 ISOtech 342 new CharacterReference ("Prime", '\u2033'), // double prime = seconds = inches, U+2033 ISOtech 343 new CharacterReference ("oline", '\u203e'), // overline = spacing overscore, U+203E NEW 344 new CharacterReference ("frasl", '\u2044'), // fraction slash, U+2044 NEW 345 // Letterlike Symbols 346 new CharacterReference ("weierp", '\u2118'), // script capital P = power set = Weierstrass p, U+2118 ISOamso 347 new CharacterReference ("image", '\u2111'), // blackletter capital I = imaginary part, U+2111 ISOamso 348 new CharacterReference ("real", '\u211c'), // blackletter capital R = real part symbol, U+211C ISOamso 349 new CharacterReference ("trade", '\u2122'), // trade mark sign, U+2122 ISOnum 350 new CharacterReference ("alefsym", '\u2135'), // alef symbol = first transfinite cardinal, U+2135 NEW 351 // alef symbol is NOT the same as hebrew letter alef, 352 // U+05D0 although the same glyph could be used to depict both characters 353 // Arrows 354 new CharacterReference ("larr", '\u2190'), // leftwards arrow, U+2190 ISOnum 355 new CharacterReference ("uarr", '\u2191'), // upwards arrow, U+2191 ISOnum 356 new CharacterReference ("rarr", '\u2192'), // rightwards arrow, U+2192 ISOnum 357 new CharacterReference ("darr", '\u2193'), // downwards arrow, U+2193 ISOnum 358 new CharacterReference ("harr", '\u2194'), // left right arrow, U+2194 ISOamsa 359 new CharacterReference ("crarr", '\u21b5'), // downwards arrow with corner leftwards = carriage return, U+21B5 NEW 360 new CharacterReference ("lArr", '\u21d0'), // leftwards double arrow, U+21D0 ISOtech 361 // ISO 10646 does not say that lArr is the same as the 'is implied by' arrow 362 // but also does not have any other character for that function. So ? lArr can 363 // be used for 'is implied by' as ISOtech suggests 364 new CharacterReference ("uArr", '\u21d1'), // upwards double arrow, U+21D1 ISOamsa 365 new CharacterReference ("rArr", '\u21d2'), // rightwards double arrow, U+21D2 ISOtech 366 // ISO 10646 does not say this is the 'implies' character but does not have 367 // another character with this function so ? 368 // rArr can be used for 'implies' as ISOtech suggests 369 new CharacterReference ("dArr", '\u21d3'), // downwards double arrow, U+21D3 ISOamsa 370 new CharacterReference ("hArr", '\u21d4'), // left right double arrow, U+21D4 ISOamsa 371 // Mathematical Operators 372 new CharacterReference ("forall", '\u2200'), // for all, U+2200 ISOtech 373 new CharacterReference ("part", '\u2202'), // partial differential, U+2202 ISOtech 374 new CharacterReference ("exist", '\u2203'), // there exists, U+2203 ISOtech 375 new CharacterReference ("empty", '\u2205'), // empty set = null set = diameter, U+2205 ISOamso 376 new CharacterReference ("nabla", '\u2207'), // nabla = backward difference, U+2207 ISOtech 377 new CharacterReference ("isin", '\u2208'), // element of, U+2208 ISOtech 378 new CharacterReference ("notin", '\u2209'), // not an element of, U+2209 ISOtech 379 new CharacterReference ("ni", '\u220b'), // contains as member, U+220B ISOtech 380 // should there be a more memorable name than 'ni'? 381 new CharacterReference ("prod", '\u220f'), // n-ary product = product sign, U+220F ISOamsb 382 // prod is NOT the same character as U+03A0 'greek capital letter pi' though 383 // the same glyph might be used for both 384 new CharacterReference ("sum", '\u2211'), // n-ary sumation, U+2211 ISOamsb 385 // sum is NOT the same character as U+03A3 'greek capital letter sigma' 386 // though the same glyph might be used for both 387 new CharacterReference ("minus", '\u2212'), // minus sign, U+2212 ISOtech 388 new CharacterReference ("lowast", '\u2217'), // asterisk operator, U+2217 ISOtech 389 new CharacterReference ("radic", '\u221a'), // square root = radical sign, U+221A ISOtech 390 new CharacterReference ("prop", '\u221d'), // proportional to, U+221D ISOtech 391 new CharacterReference ("infin", '\u221e'), // infinity, U+221E ISOtech 392 new CharacterReference ("ang", '\u2220'), // angle, U+2220 ISOamso 393 new CharacterReference ("and", '\u2227'), // logical and = wedge, U+2227 ISOtech 394 new CharacterReference ("or", '\u2228'), // logical or = vee, U+2228 ISOtech 395 new CharacterReference ("cap", '\u2229'), // intersection = cap, U+2229 ISOtech 396 new CharacterReference ("cup", '\u222a'), // union = cup, U+222A ISOtech 397 new CharacterReference ("int", '\u222b'), // integral, U+222B ISOtech 398 new CharacterReference ("there4", '\u2234'), // therefore, U+2234 ISOtech 399 new CharacterReference ("sim", '\u223c'), // tilde operator = varies with = similar to, U+223C ISOtech 400 // tilde operator is NOT the same character as the tilde, U+007E, 401 // although the same glyph might be used to represent both 402 new CharacterReference ("cong", '\u2245'), // approximately equal to, U+2245 ISOtech 403 new CharacterReference ("asymp", '\u2248'), // almost equal to = asymptotic to, U+2248 ISOamsr 404 new CharacterReference ("ne", '\u2260'), // not equal to, U+2260 ISOtech 405 new CharacterReference ("equiv", '\u2261'), // identical to, U+2261 ISOtech 406 new CharacterReference ("le", '\u2264'), // less-than or equal to, U+2264 ISOtech 407 new CharacterReference ("ge", '\u2265'), // greater-than or equal to, U+2265 ISOtech 408 new CharacterReference ("sub", '\u2282'), // subset of, U+2282 ISOtech 409 new CharacterReference ("sup", '\u2283'), // superset of, U+2283 ISOtech 410 // note that nsup, 'not a superset of, U+2283' is not covered by the Symbol 411 // font encoding and is not included. Should it be, for symmetry? 412 // It is in ISOamsn 413 new CharacterReference ("nsub", '\u2284'), // not a subset of, U+2284 ISOamsn 414 new CharacterReference ("sube", '\u2286'), // subset of or equal to, U+2286 ISOtech 415 new CharacterReference ("supe", '\u2287'), // superset of or equal to, U+2287 ISOtech 416 new CharacterReference ("oplus", '\u2295'), // circled plus = direct sum, U+2295 ISOamsb 417 new CharacterReference ("otimes", '\u2297'), // circled times = vector product, U+2297 ISOamsb 418 new CharacterReference ("perp", '\u22a5'), // up tack = orthogonal to = perpendicular, U+22A5 ISOtech 419 new CharacterReference ("sdot", '\u22c5'), // dot operator, U+22C5 ISOamsb 420 // dot operator is NOT the same character as U+00B7 middle dot 421 // Miscellaneous Technical 422 new CharacterReference ("lceil", '\u2308'), // left ceiling = apl upstile, U+2308 ISOamsc 423 new CharacterReference ("rceil", '\u2309'), // right ceiling, U+2309 ISOamsc 424 new CharacterReference ("lfloor", '\u230a'), // left floor = apl downstile, U+230A ISOamsc 425 new CharacterReference ("rfloor", '\u230b'), // right floor, U+230B ISOamsc 426 new CharacterReference ("lang", '\u2329'), // left-pointing angle bracket = bra, U+2329 ISOtech 427 // lang is NOT the same character as U+003C 'less than' 428 // or U+2039 'single left-pointing angle quotation mark' 429 new CharacterReference ("rang", '\u232a'), // right-pointing angle bracket = ket, U+232A ISOtech 430 // rang is NOT the same character as U+003E 'greater than' 431 // or U+203A 'single right-pointing angle quotation mark' 432 // Geometric Shapes 433 new CharacterReference ("loz", '\u25ca'), // lozenge, U+25CA ISOpub 434 // Miscellaneous Symbols 435 new CharacterReference ("spades", '\u2660'), // black spade suit, U+2660 ISOpub 436 // black here seems to mean filled as opposed to hollow 437 new CharacterReference ("clubs", '\u2663'), // black club suit = shamrock, U+2663 ISOpub 438 new CharacterReference ("hearts", '\u2665'), // black heart suit = valentine, U+2665 ISOpub 439 new CharacterReference ("diams", '\u2666'), // black diamond suit, U+2666 ISOpub 440 // Special characters for HTML 441 // Character entity set. Typical invocation: 442 // <!ENTITY % HTMLspecial PUBLIC 443 // "-//W3C//ENTITIES Special//EN//HTML"> 444 // %HTMLspecial; 445 // Portions © International Organization for Standardization 1986: 446 // Permission to copy in any form is granted for use with 447 // conforming SGML systems and applications as defined in 448 // ISO 8879, provided this notice is included in all copies. 449 // Relevant ISO entity set is given unless names are newly introduced. 450 // New names (i.e., not in ISO 8879 list) do not clash with any 451 // existing ISO 8879 entity names. ISO 10646 character numbers 452 // are given for each character, in hex. CDATA values are decimal 453 // conversions of the ISO 10646 values and refer to the document 454 // character set. Names are ISO 10646 names. 455 // C0 Controls and Basic Latin 456 new CharacterReference ("quot", '\u0022'), // quotation mark = APL quote, U+0022 ISOnum 457 new CharacterReference ("amp", '\u0026'), // ampersand, U+0026 ISOnum 458 new CharacterReference ("lt", '\u003c'), // less-than sign, U+003C ISOnum 459 new CharacterReference ("gt", '\u003e'), // greater-than sign, U+003E ISOnum 460 // Latin Extended-A 461 new CharacterReference ("OElig", '\u0152'), // latin capital ligature OE, U+0152 ISOlat2 462 new CharacterReference ("oelig", '\u0153'), // latin small ligature oe, U+0153 ISOlat2 463 // ligature is a misnomer, this is a separate character in some languages 464 new CharacterReference ("Scaron", '\u0160'), // latin capital letter S with caron, U+0160 ISOlat2 465 new CharacterReference ("scaron", '\u0161'), // latin small letter s with caron, U+0161 ISOlat2 466 new CharacterReference ("Yuml", '\u0178'), // latin capital letter Y with diaeresis, U+0178 ISOlat2 467 // Spacing Modifier Letters 468 new CharacterReference ("circ", '\u02c6'), // modifier letter circumflex accent, U+02C6 ISOpub 469 new CharacterReference ("tilde", '\u02dc'), // small tilde, U+02DC ISOdia 470 // General Punctuation 471 new CharacterReference ("ensp", '\u2002'), // en space, U+2002 ISOpub 472 new CharacterReference ("emsp", '\u2003'), // em space, U+2003 ISOpub 473 new CharacterReference ("thinsp", '\u2009'), // thin space, U+2009 ISOpub 474 new CharacterReference ("zwnj", '\u200c'), // zero width non-joiner, U+200C NEW RFC 2070 475 new CharacterReference ("zwj", '\u200d'), // zero width joiner, U+200D NEW RFC 2070 476 new CharacterReference ("lrm", '\u200e'), // left-to-right mark, U+200E NEW RFC 2070 477 new CharacterReference ("rlm", '\u200f'), // right-to-left mark, U+200F NEW RFC 2070 478 new CharacterReference ("ndash", '\u2013'), // en dash, U+2013 ISOpub 479 new CharacterReference ("mdash", '\u2014'), // em dash, U+2014 ISOpub 480 new CharacterReference ("lsquo", '\u2018'), // left single quotation mark, U+2018 ISOnum 481 new CharacterReference ("rsquo", '\u2019'), // right single quotation mark, U+2019 ISOnum 482 new CharacterReference ("sbquo", '\u201a'), // single low-9 quotation mark, U+201A NEW 483 new CharacterReference ("ldquo", '\u201c'), // left double quotation mark, U+201C ISOnum 484 new CharacterReference ("rdquo", '\u201d'), // right double quotation mark, U+201D ISOnum 485 new CharacterReference ("bdquo", '\u201e'), // double low-9 quotation mark, U+201E NEW 486 new CharacterReference ("dagger", '\u2020'), // dagger, U+2020 ISOpub 487 new CharacterReference ("Dagger", '\u2021'), // double dagger, U+2021 ISOpub 488 new CharacterReference ("permil", '\u2030'), // per mille sign, U+2030 ISOtech 489 new CharacterReference ("lsaquo", '\u2039'), // single left-pointing angle quotation mark, U+2039 ISO proposed 490 // lsaquo is proposed but not yet ISO standardized 491 new CharacterReference ("rsaquo", '\u203a'), // single right-pointing angle quotation mark, U+203A ISO proposed 492 // rsaquo is proposed but not yet ISO standardized 493 new CharacterReference ("euro", '\u20ac'), // euro sign, U+20AC NEW 494 }; 495 496 /** 497 * The dividing point between a simple table lookup and a binary search. 498 * Characters below the break point are stored in a sparse array allowing 499 * direct index lookup. 500 */ 501 protected static final int BREAKPOINT = 0x100; 502 503 /** 504 * List of references sorted by character. 505 * The first part of this array, up to <code>BREAKPOINT</code> is stored 506 * in a direct translational table, indexing into the table with a character 507 * yields the reference. The second part is dense and sorted by character, 508 * suitable for binary lookup. 509 */ 510 protected static final CharacterReference[] mCharacterList; 511 512 static 513 { 514 int index; 515 CharacterReference item; 516 int character; 517 518 // count below the break point 519 index = 0; 520 for (int i = 0; i < mCharacterReferences.length; i++) 521 if (mCharacterReferences[i].getCharacter () < BREAKPOINT) 522 index++; 523 // allocate enough for the linear table and remainder 524 mCharacterList = new CharacterReference[BREAKPOINT + mCharacterReferences.length - index]; 525 index = BREAKPOINT; 526 for (int i = 0; i < mCharacterReferences.length; i++) 527 { 528 item = mCharacterReferences[i]; 529 character = mCharacterReferences[i].getCharacter (); 530 if (character < BREAKPOINT) 531 mCharacterList[character] = item; 532 else 533 { 534 // use a linear search and insertion sort, done only once 535 int x = BREAKPOINT; 536 while (x < index) 537 if (mCharacterList[x].getCharacter () > character) 538 break; 539 else 540 x++; 541 int y = index - 1; 542 while (y >= x) 543 { 544 mCharacterList[y + 1] = mCharacterList[y]; 545 y--; 546 } 547 mCharacterList[x] = item; 548 index++; 549 } 550 } 551 // reorder the original array into kernel order 552 Sort.QuickSort (mCharacterReferences); 553 } 554 555 /** 556 * Private constructor. 557 * This class is fully static and thread safe. 558 */ 559 private Translate () 560 { 561 } 562 563 /** 564 * Binary search for a reference. 565 * @param array The array of <code>CharacterReference</code> objects. 566 * @param ref The character to search for. 567 * @param lo The lower index within which to look. 568 * @param hi The upper index within which to look. 569 * @return The index at which reference was found or is to be inserted. 570 */ 571 protected static int lookup (CharacterReference[] array, char ref, int lo, int hi) 572 { int num; 573 int mid; 574 int half; 575 int result; 576 int ret; 577 578 ret = -1; 579 580 num = (hi - lo) + 1; 581 while ((-1 == ret) && (lo <= hi)) 582 { 583 half = num / 2; 584 mid = lo + ((0 != (num & 1)) ? half : half - 1); 585 result = ref - array[mid].getCharacter (); 586 if (0 == result) 587 ret = mid; 588 else if (0 > result) 589 { 590 hi = mid - 1; 591 num = ((0 != (num & 1)) ? half : half - 1); 592 } 593 else 594 { 595 lo = mid + 1; 596 num = half; 597 } 598 } 599 if (-1 == ret) 600 ret = lo; 601 602 return (ret); 603 } 604 605 /** 606 * Look up a reference by character. 607 * Use a combination of direct table lookup and binary search to find 608 * the reference corresponding to the character. 609 * @param character The character to be looked up. 610 * @return The entity reference for that character or <code>null</code>. 611 */ 612 public static CharacterReference lookup (char character) 613 { 614 int index; 615 CharacterReference ret; 616 617 if (character < BREAKPOINT) 618 ret = mCharacterList[character]; 619 else 620 { 621 index = lookup (mCharacterList, character, BREAKPOINT, mCharacterList.length - 1); 622 if (index < mCharacterList.length) 623 { 624 ret = mCharacterList[index]; 625 if (character != ret.getCharacter ()) 626 ret = null; 627 } 628 else 629 ret = null; 630 } 631 632 return (ret); 633 } 634 635 /** 636 * Look up a reference by kernel. 637 * Use a binary search on the ordered list of known references. 638 * Since the binary search returns the position at which a new item should 639 * be inserted, we check the references earlier in the list if there is 640 * a failure. 641 * @param key A character reference with the kernel set to the string 642 * to be found. It need not be truncated at the exact end of the reference. 643 */ 644 protected static CharacterReference lookup (CharacterReference key) 645 { 646 String string; 647 int index; 648 String kernel; 649 char character; 650 CharacterReference test; 651 CharacterReference ret; 652 653 // Care should be taken here because some entity references are 654 // prefixes of others, i.e.: 655 // \u2209[notin] \u00ac[not] 656 // \u00ba[ordm] \u2228[or] 657 // \u03d6[piv] \u03c0[pi] 658 // \u00b3[sup3] \u2283[sup] 659 ret = null; 660 index = Sort.bsearch (mCharacterReferences, key); 661 string = key.getKernel (); 662 if (index < mCharacterReferences.length) 663 { 664 ret = mCharacterReferences[index]; 665 kernel = ret.getKernel (); 666 if (!string.regionMatches ( 667 0, 668 kernel, 669 0, 670 kernel.length ())) 671 { // not exact, check references starting with same character 672 // to see if a subset matches 673 ret = null; 674 } 675 } 676 if (null == ret) 677 { 678 character = string.charAt (0); 679 while (--index >= 0) 680 { 681 test = mCharacterReferences[index]; 682 kernel = test.getKernel (); 683 if (character == kernel.charAt (0)) 684 { 685 if (string.regionMatches ( 686 0, 687 kernel, 688 0, 689 kernel.length ())) 690 { 691 ret = test; 692 break; 693 } 694 } 695 else 696 break; 697 } 698 } 699 700 return (ret); 701 } 702 703 /** 704 * Look up a reference by kernel. 705 * Use a binary search on the ordered list of known references. 706 * <em>This is not very efficient, use {@link org.htmlparser.util.Translate#lookup(org.htmlparser.util.CharacterReference) lookup(CharacterReference)} 707 * instead.</em> 708 * @param kernel The string to lookup, i.e. "amp". 709 * @param start The starting point in the string of the kernel. 710 * @param end The ending point in the string of the kernel. 711 * This should be the index of the semicolon if it exists, or failing that, 712 * at least an index past the last character of the kernel. 713 * @return The reference that matches the given string, or <code>null</code> 714 * if it wasn't found. 715 */ 716 public static CharacterReference lookup (String kernel, int start, int end) 717 { 718 CharacterReferenceEx probe; 719 720 probe = new CharacterReferenceEx (); 721 probe.setKernel (kernel); 722 probe.setStart (start); 723 probe.setEnd (end); 724 725 return (lookup (probe)); 726 } 727 728 /** 729 * Convert a reference to a unicode character. 730 * Convert a single numeric character reference or character entity reference 731 * to a unicode character. 732 * @param string The string to convert. Of the form &xxxx; or &#xxxx; with 733 * or without the leading ampersand or trailing semi-colon. 734 * @param start The starting pooint in the string to look for a character reference. 735 * @param end The ending point in the string to stop looking for a character reference. 736 * @return The converted character or '