/ org.htmlparser / src / org / htmlparser / lexer / Page.java
Page.java
   1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
   2  // http://sourceforge.org/projects/htmlparser
   3  // Copyright (C) 2004 Derrick Oswald
   4  //
   5  // Revision Control Information
   6  //
   7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v $
   8  // $Author: derrickoswald $
   9  // $Date: 2006/03/19 17:09:09 $
  10  // $Revision: 1.53 $
  11  //
  12  // This library is free software; you can redistribute it and/or
  13  // modify it under the terms of the GNU Lesser General Public
  14  // License as published by the Free Software Foundation; either
  15  // version 2.1 of the License, or (at your option) any later version.
  16  //
  17  // This library is distributed in the hope that it will be useful,
  18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  20  // Lesser General Public License for more details.
  21  //
  22  // You should have received a copy of the GNU Lesser General Public
  23  // License along with this library; if not, write to the Free Software
  24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  25  //
  26  
  27  package org.htmlparser.lexer;
  28  
  29  import java.io.InputStream;
  30  import java.io.IOException;
  31  import java.io.ObjectInputStream;
  32  import java.io.ObjectOutputStream;
  33  import java.io.Serializable;
  34  import java.io.UnsupportedEncodingException;
  35  import java.lang.reflect.InvocationTargetException;
  36  import java.lang.reflect.Method;
  37  import java.net.MalformedURLException;
  38  import java.net.URL;
  39  import java.net.URLConnection;
  40  import java.net.UnknownHostException;
  41  import java.util.zip.GZIPInputStream;
  42  import java.util.zip.InflaterInputStream;
  43  
  44  import org.htmlparser.http.ConnectionManager;
  45  import org.htmlparser.util.ParserException;
  46  
  47  /**
  48   * Represents the contents of an HTML page.
  49   * Contains the source of characters and an index of positions of line
  50   * separators (actually the first character position on the next line).
  51   */
  52  public class Page
  53      implements
  54          Serializable
  55  {
   56      /**
   57       * The default charset.
   58       * This should be <code>{@value}</code>,
   59       * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616)
   60       * section 3.7.1
   61       * <p>Another alias is "8859_1".
   62       */
   63      public static final String DEFAULT_CHARSET = "ISO-8859-1";
   64  
   65      /**
   66       * The default content type.
   67       * In the absence of alternate information, assume html content ({@value}).
   68       */
   69      public static final String DEFAULT_CONTENT_TYPE = "text/html";
   70  
   71      /**
   72       * Character value returned when the page is exhausted.
   73       * This is <code>Source.EOF</code> narrowed to a <code>char</code>.
   74       */
   75      public static final char EOF = (char)Source.EOF;
   76  
   77      /**
   78       * The URL this page is coming from.
   79       * Cached value of <code>getConnection().getURL().toExternalForm()</code> or
   80       * the value last passed to <code>setUrl()</code>; may be <code>null</code>.
   81       */
   82      protected String mUrl;
   83  
   84      /**
   85       * The base URL for this page, or <code>null</code> if not set.
   86       */
   87      protected String mBaseUrl;
   88  
   89      /**
   90       * The source of characters.
   91       */
   92      protected Source mSource;
   93  
   94      /**
   95       * Character positions of the first character in each line.
   96       */
   97      protected PageIndex mIndex;
   98  
   99      /**
  100       * The connection this page is coming from or <code>null</code> (transient: not serialized).
  101       */
  102      protected transient URLConnection mConnection;
  103  
  104      /**
  105       * Connection control (proxy, cookies, authorization). Static, so shared by all pages.
  106       */
  107      protected static ConnectionManager mConnectionManager =
  108          new ConnectionManager ();
 109  
 110      /**
 111       * Construct an empty page.
 112       */
 113      public Page ()
 114      {
 115          this ("");
 116      }
 117  
 118      /**
 119       * Construct a page reading from a URL connection.
 120       * @param connection A fully conditioned connection. The connect()
 121       * method will be called so it need not be connected yet.
 122       * @exception ParserException An exception object wrapping a number of
 123       * possible error conditions, some of which are outlined below.
 124       * <li>IOException If an i/o exception occurs creating the
 125       * source.</li>
 126       * <li>UnsupportedEncodingException if the character set specified in the
 127       * HTTP header is not supported.</li>
 128       */
 129      public Page (URLConnection connection) throws ParserException
 130      {
 131          if (null == connection)
 132              throw new IllegalArgumentException ("connection cannot be null");
 133          setConnection (connection);
 134          mBaseUrl = null;
 135      }
 136  
 137      /**
 138       * Construct a page from a stream encoded with the given charset.
 139       * @param stream The source of bytes.
 140       * @param charset The encoding used.
 141       * If null, defaults to the <code>DEFAULT_CHARSET</code>.
 142       * @exception UnsupportedEncodingException If the given charset
 143       * is not supported.
 144       */
 145      public Page (InputStream stream, String charset)
 146          throws
 147              UnsupportedEncodingException
 148      {
 149          if (null == stream)
 150              throw new IllegalArgumentException ("stream cannot be null");
 151          if (null == charset)
 152              charset = DEFAULT_CHARSET;
 153          mSource = new InputStreamSource (stream, charset);
 154          mIndex = new PageIndex (this);
 155          mConnection = null;
 156          mUrl = null;
 157          mBaseUrl = null;
 158      }
 159  
 160      /**
 161       * Construct a page from the given string.
 162       * @param text The HTML text.
 163       * @param charset <em>Optional</em>. The character set encoding that will
 164       * be reported by {@link #getEncoding}. If charset is <code>null</code>
 165       * the default character set is used.
 166       */
 167      public Page (String text, String charset)
 168      {
 169          if (null == text)
 170              throw new IllegalArgumentException ("text cannot be null");
 171          if (null == charset)
 172              charset = DEFAULT_CHARSET;
 173          mSource = new StringSource (text, charset);
 174          mIndex = new PageIndex (this);
 175          mConnection = null;
 176          mUrl = null;
 177          mBaseUrl = null;
 178      }
 179  
 180      /**
 181       * Construct a page from the given string.
 182       * The page will report that it is using an encoding of
 183       * {@link #DEFAULT_CHARSET}.
 184       * @param text The HTML text.
 185       */
 186      public Page (String text)
 187      {
 188          this (text, null);
 189      }
 190  
 191      /**
 192       * Construct a page from a source.
 193       * @param source The source of characters.
 194       */
 195      public Page (Source source)
 196      {
 197          if (null == source)
 198              throw new IllegalArgumentException ("source cannot be null");
 199          mSource = source;
 200          mIndex = new PageIndex (this);
 201          mConnection = null;
 202          mUrl = null;
 203          mBaseUrl = null;
 204      }
 205  
 206      //
 207      // static methods
 208      //
 209  
 210      /**
 211       * Get the connection manager all Parsers use.
 212       * @return The connection manager.
 213       */
 214      public static ConnectionManager getConnectionManager ()
 215      {
 216          return (mConnectionManager);
 217      }
 218  
 219      /**
 220       * Set the connection manager to use.
 221       * @param manager The new connection manager.
 222       */
 223      public static void setConnectionManager (ConnectionManager manager)
 224      {
 225          mConnectionManager = manager;
 226      }
 227  
 228      /**
 229       * Get a CharacterSet name corresponding to a charset parameter.
 230       * @param content A text line of the form:
 231       * <pre>
 232       * text/html; charset=Shift_JIS
 233       * </pre>
 234       * which is applicable both to the HTTP header field Content-Type and
 235       * the meta tag http-equiv="Content-Type".
 236       * Note this method also handles non-compliant quoted charset directives
 237       * such as:
 238       * <pre>
 239       * text/html; charset="UTF-8"
 240       * </pre>
 241       * and
 242       * <pre>
 243       * text/html; charset='UTF-8'
 244       * </pre>
 245       * @return The character set name to use when reading the input stream.
 246       * For JDKs that have the Charset class this is qualified by passing
 247       * the name to findCharset() to render it into canonical form.
 248       * If the charset parameter is not found in the given string, the default
 249       * character set is returned.
 250       * @see #findCharset
 251       * @see #DEFAULT_CHARSET
 252       */
 253      public String getCharset (String content)
 254      {
 255          final String CHARSET_STRING = "charset";
 256          int index;
 257          String ret;
 258  
 259          if (null == mSource)
 260              ret = DEFAULT_CHARSET;
 261          else
 262              // use existing (possibly supplied) character set:
 263              // bug #1322686 when illegal charset specified
 264              ret = mSource.getEncoding ();
 265          if (null != content)
 266          {
 267              index = content.indexOf (CHARSET_STRING);
 268  
 269              if (index != -1)
 270              {
 271                  content = content.substring (index +
 272                      CHARSET_STRING.length ()).trim ();
 273                  if (content.startsWith ("="))
 274                  {
 275                      content = content.substring (1).trim ();
 276                      index = content.indexOf (";");
 277                      if (index != -1)
 278                          content = content.substring (0, index);
 279  
 280                      //remove any double quotes from around charset string
 281                      if (content.startsWith ("\"") && content.endsWith ("\"")
 282                          && (1 < content.length ()))
 283                          content = content.substring (1, content.length () - 1);
 284  
 285                      //remove any single quote from around charset string
 286                      if (content.startsWith ("'") && content.endsWith ("'")
 287                          && (1 < content.length ()))
 288                          content = content.substring (1, content.length () - 1);
 289  
 290                      ret = findCharset (content, ret);
 291  
 292                      // Charset names are not case-sensitive;
 293                      // that is, case is always ignored when comparing
 294                      // charset names.
 295  //                    if (!ret.equalsIgnoreCase (content))
 296  //                    {
 297  //                        System.out.println (
 298  //                            "detected charset \""
 299  //                            + content
 300  //                            + "\", using \""
 301  //                            + ret
 302  //                            + "\"");
 303  //                    }
 304                  }
 305              }
 306          }
 307  
 308          return (ret);
 309      }
 310  
 311      /**
 312       * Lookup a character set name.
 313       * <em>Vacuous for JVM's without <code>java.nio.charset</code>.</em>
 314       * This uses reflection so the code will still run under prior JDK's but
 315       * in that case the default is always returned.
 316       * @param name The name to look up. One of the aliases for a character set.
 317       * @param fallback The name to return if the lookup fails.
 318       * @return The character set name.
 319       */
 320      public static String findCharset (String name, String fallback)
 321      {
 322          String ret;
 323  
 324          try
 325          {
 326              Class cls;
 327              Method method;
 328              Object object;
 329  
 330              cls = Class.forName ("java.nio.charset.Charset");
 331              method = cls.getMethod ("forName", new Class[] { String.class });
 332              object = method.invoke (null, new Object[] { name });
 333              method = cls.getMethod ("name", new Class[] { });
 334              object = method.invoke (object, new Object[] { });
 335              ret = (String)object;
 336          }
 337          catch (ClassNotFoundException cnfe)
 338          {
 339              // for reflection exceptions, assume the name is correct
 340              ret = name;
 341          }
 342          catch (NoSuchMethodException nsme)
 343          {
 344              // for reflection exceptions, assume the name is correct
 345              ret = name;
 346          }
 347          catch (IllegalAccessException ia)
 348          {
 349              // for reflection exceptions, assume the name is correct
 350              ret = name;
 351          }
 352          catch (InvocationTargetException ita)
 353          {
 354              // java.nio.charset.IllegalCharsetNameException
 355              // and java.nio.charset.UnsupportedCharsetException
 356              // return the default
 357              ret = fallback;
 358              System.out.println (
 359                  "unable to determine cannonical charset name for "
 360                  + name
 361                  + " - using "
 362                  + fallback);
 363          }
 364  
 365          return (ret);
 366      }
 367  
 368      //
 369      // Serialization support
 370      //
 371  
 372      /**
 373       * Serialize the page.
 374       * There are two modes to serializing a page based on the connected state.
 375       * If connected, the URL and the current offset is saved, while if
 376       * disconnected, the underling source is saved.
 377       * @param out The object stream to store this object in.
 378       * @exception IOException If there is a serialization problem.
 379       */
 380      private void writeObject (ObjectOutputStream out)
 381          throws
 382              IOException
 383      {
 384          String href;
 385          Source source;
 386          PageIndex index;
 387  
 388          // two cases, reading from a URL and not
 389          if (null != getConnection ())
 390          {
 391              out.writeBoolean (true);
 392              out.writeInt (mSource.offset ()); // need to preread this much
 393              href = getUrl ();
 394              out.writeObject (href);
 395              setUrl (getConnection ().getURL ().toExternalForm ());
 396              source = getSource ();
 397              mSource = null; // don't serialize the source if we can avoid it
 398              index = mIndex;
 399              mIndex = null; // will get recreated; valid for the new page anyway?
 400              out.defaultWriteObject ();
 401              mSource = source;
 402              mIndex = index;
 403          }
 404          else
 405          {
 406              out.writeBoolean (false);
 407              href = getUrl ();
 408              out.writeObject (href);
 409              setUrl (null); // don't try and read a bogus URL
 410              out.defaultWriteObject ();
 411              setUrl (href);
 412          }
 413      }
 414  
 415      /**
 416       * Deserialize the page.
 417       * For details see <code>writeObject()</code>.
 418       * @param in The object stream to decode.
 419       * @exception IOException If there is a deserialization problem with
 420       * the stream.
 421       * @exception ClassNotFoundException If the deserialized class can't be
 422       * located with the current classpath and class loader.
 423       */
 424      private void readObject (ObjectInputStream in)
 425          throws
 426              IOException,
 427              ClassNotFoundException
 428      {
 429          boolean fromurl;
 430          int offset;
 431          String href;
 432          URL url;
 433          Cursor cursor;
 434  
 435          fromurl = in.readBoolean ();
 436          if (fromurl)
 437          {
 438              offset = in.readInt ();
 439              href = (String)in.readObject ();
 440              in.defaultReadObject ();
 441              // open the URL
 442              if (null != getUrl ())
 443              {
 444                  url = new URL (getUrl ());
 445                  try
 446                  {
 447                      setConnection (url.openConnection ());
 448                  }
 449                  catch (ParserException pe)
 450                  {
 451                      throw new IOException (pe.getMessage ());
 452                  }
 453              }
 454              cursor = new Cursor (this, 0);
 455              for (int i = 0; i < offset; i++)
 456                  try
 457                  {
 458                      getCharacter (cursor);
 459                  }
 460                  catch (ParserException pe)
 461                  {
 462                      throw new IOException (pe.getMessage ());
 463                  }
 464              setUrl (href);
 465          }
 466          else
 467          {
 468              href = (String)in.readObject ();
 469              in.defaultReadObject ();
 470              setUrl (href);
 471          }
 472      }
 473  
 474      /**
 475       * Reset the page by resetting the source of characters.
 476       */
 477      public void reset ()
 478      {
 479          getSource ().reset ();
 480          mIndex = new PageIndex (this); // todo: is this really necessary?
 481      }
 482  
 483      /**
 484       * Close the page by destroying the source of characters.
 485       * @exception IOException If destroying the source encounters an error.
 486       */
 487      public void close () throws IOException
 488      {
 489          if (null != getSource ())
 490              getSource ().destroy ();
 491      }
 492  
 493      /**
 494       * Clean up this page, releasing resources.
 495       * Calls <code>close()</code>.
 496       * @exception Throwable if <code>close()</code> throws an
 497       * <code>IOException</code>.
 498       */
 499      protected void finalize ()
 500          throws
 501              Throwable
 502      {
 503          close ();
 504      }
 505  
 506      /**
 507       * Get the connection, if any.
 508       * @return The connection object for this page, or null if this page
 509       * is built from a stream or a string.
 510       */
 511      public URLConnection getConnection ()
 512      {
 513          return (mConnection);
 514      }
 515  
 516      /**
 517       * Set the URLConnection to be used by this page.
 518       * Starts reading from the given connection.
 519       * This also resets the current url.
 520       * @param connection The connection to use.
 521       * It will be connected by this method.
 522       * @exception ParserException If the <code>connect()</code> method fails,
 523       * or an I/O error occurs opening the input stream or the character set
 524       * designated in the HTTP header is unsupported.
 525       */
 526      public void setConnection (URLConnection connection)
 527          throws
 528              ParserException
 529      {
 530          Stream stream;
 531          String type;
 532          String charset;
 533          String contentEncoding;
 534  
 535          mConnection = connection;
 536          try
 537          {
 538              getConnection ().connect ();
 539          }
 540          catch (UnknownHostException uhe)
 541          {
 542              throw new ParserException ("Connect to "
 543                  + mConnection.getURL ().toExternalForm () + " failed.", uhe);
 544          }
 545          catch (IOException ioe)
 546          {
 547              throw new ParserException ("Exception connecting to "
 548                  + mConnection.getURL ().toExternalForm ()
 549                  + " (" + ioe.getMessage () + ").", ioe);
 550          }
 551          type = getContentType ();
 552          charset = getCharset (type);
 553          try
 554          {
 555              contentEncoding = connection.getContentEncoding();
 556              if ((null != contentEncoding)
 557                  && (-1 != contentEncoding.indexOf ("gzip")))
 558              {
 559                  stream = new Stream (new GZIPInputStream (
 560                      getConnection ().getInputStream ()));
 561              }
 562              else if ((null != contentEncoding)
 563                  && (-1 != contentEncoding.indexOf ("deflate")))
 564              {
 565                  stream = new Stream (new InflaterInputStream (
 566                      getConnection ().getInputStream ()));
 567              }
 568              else
 569              {
 570                  stream = new Stream (getConnection ().getInputStream ());
 571              }
 572  
 573              try
 574              {
 575                  mSource = new InputStreamSource (stream, charset);
 576              }
 577              catch (UnsupportedEncodingException uee)
 578              {
 579  //                StringBuffer msg;
 580  //
 581  //                msg = new StringBuffer (1024);
 582  //                msg.append (getConnection ().getURL ().toExternalForm ());
 583  //                msg.append (" has an encoding (");
 584  //                msg.append (charset);
 585  //                msg.append (") which is not supported, using ");
 586  //                msg.append (DEFAULT_CHARSET);
 587  //                System.out.println (msg.toString ());
 588                  charset = DEFAULT_CHARSET;
 589                  mSource = new InputStreamSource (stream, charset);
 590              }
 591          }
 592          catch (IOException ioe)
 593          {
 594              throw new ParserException ("Exception getting input stream from "
 595                  + mConnection.getURL ().toExternalForm ()
 596                  + " (" + ioe.getMessage () + ").", ioe);
 597          }
 598          mUrl = connection.getURL ().toExternalForm ();
 599          mIndex = new PageIndex (this);
 600      }
 601  
 602      /**
 603       * Get the URL for this page.
 604       * This is only available if the page has a connection
 605       * (<code>getConnection()</code> returns non-null), or the document base has
 606       * been set via a call to <code>setUrl()</code>.
 607       * @return The url for the connection, or <code>null</code> if there is
 608       * no conenction or the document base has not been set.
 609       */
 610      public String getUrl ()
 611      {
 612          return (mUrl);
 613      }
 614  
 615      /**
 616       * Set the URL for this page.
 617       * This doesn't affect the contents of the page, just the interpretation
 618       * of relative links from this point forward.
 619       * @param url The new URL.
 620       */
 621      public void setUrl (String url)
 622      {
 623          mUrl = url;
 624      }
 625  
 626      /**
 627       * Gets the baseUrl.
 628       * @return The base URL for this page, or <code>null</code> if not set.
 629       */
 630      public String getBaseUrl ()
 631      {
 632          return (mBaseUrl);
 633      }
 634  
 635      /**
 636       * Sets the baseUrl.
 637       * @param url The base url for this page.
 638       */
 639      public void setBaseUrl (String url)
 640      {
 641          mBaseUrl = url;
 642      }
 643  
 644      /**
 645       * Get the source this page is reading from.
 646       * @return The current source.
 647       */
 648      public Source getSource ()
 649      {
 650          return (mSource);
 651      }
 652  
 653      /**
 654       * Try and extract the content type from the HTTP header.
 655       * @return The content type.
 656       */
 657      public String getContentType ()
 658      {
 659          URLConnection connection;
 660          String content;
 661          String ret;
 662  
 663          ret = DEFAULT_CONTENT_TYPE;
 664          connection = getConnection ();
 665          if (null != connection)
 666          {
 667              content = connection.getContentType ();
 668              if (null != content)
 669                  ret = content;
 670          }
 671  
 672          return (ret);
 673      }
 674  
 675      /**
 676       * Read the character at the given cursor position.
 677       * The cursor position can be only behind or equal to the
 678       * current source position.
 679       * Returns end of lines (EOL) as \n, by converting \r and \r\n to \n,
 680       * and updates the end-of-line index accordingly
 681       * Advances the cursor position by one (or two in the \r\n case).
 682       * @param cursor The position to read at.
 683       * @return The character at that position, and modifies the cursor to
 684       * prepare for the next read. If the source is exhausted a zero is returned.
 685       * @exception ParserException If an IOException on the underlying source
 686       * occurs, or an attemp is made to read characters in the future (the
 687       * cursor position is ahead of the underlying stream)
 688       */
 689      public char getCharacter (Cursor cursor)
 690          throws
 691              ParserException
 692      {
 693          int i;
 694          int offset;
 695          char ret;
 696  
 697          i = cursor.getPosition ();
 698          offset = mSource.offset ();
 699          if (offset == i)
 700              try
 701              {
 702                  i = mSource.read ();
 703                  if (Source.EOF == i)
 704                      ret = EOF;
 705                  else
 706                  {
 707                      ret = (char)i;
 708                      cursor.advance ();
 709                  }
 710              }
 711              catch (IOException ioe)
 712              {
 713                  throw new ParserException (
 714                      "problem reading a character at position "
 715                      + cursor.getPosition (), ioe);
 716              }
 717          else if (offset > i)
 718          {
 719              // historic read
 720              try
 721              {
 722                  ret = mSource.getCharacter (i);
 723              }
 724              catch (IOException ioe)
 725              {
 726                  throw new ParserException (
 727                      "can't read a character at position "
 728                      + i, ioe);
 729              }
 730              cursor.advance ();
 731          }
 732          else
 733              // hmmm, we could skip ahead, but then what about the EOL index
 734              throw new ParserException (
 735                  "attempt to read future characters from source "
 736                  + i + " > " + mSource.offset ());
 737  
 738          // handle \r
 739          if ('\r' == ret)
 740          {   // switch to single character EOL
 741              ret = '\n';
 742  
 743              // check for a \n in the next position
 744              if (mSource.offset () == cursor.getPosition ())
 745                  try
 746                  {
 747                      i = mSource.read ();
 748                      if (Source.EOF == i)
 749                      {
 750                          // do nothing
 751                      }
 752                      else if ('\n' == (char)i)
 753                          cursor.advance ();
 754                      else
 755                          try
 756                          {
 757                              mSource.unread ();
 758                          }
 759                          catch (IOException ioe)
 760                          {
 761                              throw new ParserException (
 762                                  "can't unread a character at position "
 763                                  + cursor.getPosition (), ioe);
 764                          }
 765                  }
 766                  catch (IOException ioe)
 767                  {
 768                      throw new ParserException (
 769                          "problem reading a character at position "
 770                          + cursor.getPosition (), ioe);
 771                  }
 772              else
 773                  try
 774                  {
 775                      if ('\n' == mSource.getCharacter (cursor.getPosition ()))
 776                          cursor.advance ();
 777                  }
 778                  catch (IOException ioe)
 779                  {
 780                      throw new ParserException (
 781                          "can't read a character at position "
 782                          + cursor.getPosition (), ioe);
 783                  }
 784          }
 785          if ('\n' == ret)
 786              // update the EOL index in any case
 787              mIndex.add (cursor);
 788  
 789          return (ret);
 790      }
 791  
 792      /**
 793       * Get the current encoding being used.
 794       * @return The encoding used to convert characters.
 795       */
 796      public String getEncoding ()
 797      {
 798          return (getSource ().getEncoding ());
 799      }
 800  
 801      /**
 802       * Begins reading from the source with the given character set.
 803       * If the current encoding is the same as the requested encoding,
 804       * this method is a no-op. Otherwise any subsequent characters read from
 805       * this page will have been decoded using the given character set.<p>
 806       * Some magic happens here to obtain this result if characters have already
 807       * been consumed from this page.
 808       * Since a Reader cannot be dynamically altered to use a different character
 809       * set, the underlying stream is reset, a new Source is constructed
 810       * and a comparison made of the characters read so far with the newly
 811       * read characters up to the current position.
 812       * If a difference is encountered, or some other problem occurs,
 813       * an exception is thrown.
 814       * @param character_set The character set to use to convert bytes into
 815       * characters.
 816       * @exception ParserException If a character mismatch occurs between
 817       * characters already provided and those that would have been returned
 818       * had the new character set been in effect from the beginning. An
 819       * exception is also thrown if the underlying stream won't put up with
 820       * these shenanigans.
 821       */
 822      public void setEncoding (String character_set)
 823          throws
 824              ParserException
 825      {
 826          getSource ().setEncoding (character_set);
 827      }
 828  
 829      /**
 830       * Build a URL from the link and base provided.
 831       * @return An absolute URL.
 832       * @param link The (relative) URI.
 833       * @param base The base URL of the page, either from the &lt;BASE&gt; tag
 834       * or, if none, the URL the page is being fetched from.
 835       * @exception MalformedURLException If creating the URL fails.
 836       */
 837      public URL constructUrl (String link, String base)
 838          throws MalformedURLException
 839      {
 840          String path;
 841          boolean modified;
 842          boolean absolute;
 843          int index;
 844          URL url; // constructed URL combining relative link and base
 845  
 846          url = new URL (new URL (base), link);
 847          path = url.getFile ();
 848          modified = false;
 849          absolute = link.startsWith ("/");
 850          if (!absolute)
 851          {   // we prefer to fix incorrect relative links
 852              // this doesn't fix them all, just the ones at the start
 853              while (path.startsWith ("/."))
 854              {
 855                  if (path.startsWith ("/../"))
 856                  {
 857                      path = path.substring (3);
 858                      modified = true;
 859                  }
 860                  else if (path.startsWith ("/./") || path.startsWith("/."))
 861                  {
 862                      path = path.substring (2);
 863                      modified = true;
 864                  }
 865                  else
 866                      break;
 867              }
 868          }
 869          // fix backslashes
 870          while (-1 != (index = path.indexOf ("/\\")))
 871          {
 872              path = path.substring (0, index + 1) + path.substring (index + 2);
 873              modified = true;
 874          }
 875          if (modified)
 876              url = new URL (url, path);
 877  
 878          return (url);
 879      }
 880  
 881      /**
 882       * Create an absolute URL from a relative link.
 883       * @param link The relative portion of a URL.
 884       * @return The fully qualified URL or the original link if it was absolute
 885       * already or a failure occurred.
 886       */
 887      public String getAbsoluteURL (String link)
 888      {
 889          String base;
 890          URL url;
 891          String ret;
 892  
 893          if ((null == link) || ("".equals (link)))
 894              ret = "";
 895          else
 896              try
 897              {
 898                  base =  getBaseUrl ();
 899                  if (null == base)
 900                      base = getUrl ();
 901                  if (null == base)
 902                      ret = link;
 903                  else
 904                  {
 905                      url = constructUrl (link, base);
 906                      ret = url.toExternalForm ();
 907                  }
 908              }
 909              catch (MalformedURLException murle)
 910              {
 911                  ret = link;
 912              }
 913  
 914          return (ret);
 915      }
 916  
 917      /**
 918       * Get the line number for a cursor.
 919       * @param cursor The character offset into the page.
 920       * @return The line number the character is in.
 921       */
 922      public int row (Cursor cursor)
 923      {
 924          return (mIndex.row (cursor));
 925      }
 926  
 927      /**
 928       * Get the line number for a cursor.
 929       * @param position The character offset into the page.
 930       * @return The line number the character is in.
 931       */
 932      public int row (int position)
 933      {
 934          return (mIndex.row (position));
 935      }
 936  
 937      /**
 938       * Get the column number for a cursor.
 939       * @param cursor The character offset into the page.
 940       * @return The character offset into the line this cursor is on.
 941       */
 942      public int column (Cursor cursor)
 943      {
 944          return (mIndex.column (cursor));
 945      }
 946  
 947      /**
 948       * Get the column number for a cursor.
 949       * @param position The character offset into the page.
 950       * @return The character offset into the line this cursor is on.
 951       */
 952      public int column (int position)
 953      {
 954          return (mIndex.column (position));
 955      }
 956  
 957      /**
 958       * Get the text identified by the given limits.
 959       * @param start The starting position, zero based.
 960       * @param end The ending position
 961       * (exclusive, i.e. the character at the ending position is not included),
 962       * zero based.
 963       * @return The text from <code>start</code> to <code>end</code>.
 964       * @see #getText(StringBuffer, int, int)
 965       * @exception IllegalArgumentException If an attempt is made to get
 966       * characters ahead of the current source offset (character position).
 967       */
 968      public String getText (int start, int end)
 969          throws
 970              IllegalArgumentException
 971      {
 972          String ret;
 973  
 974          try
 975          {
 976              ret = mSource.getString (start, end - start);
 977          }
 978          catch (IOException ioe)
 979          {
 980              throw new IllegalArgumentException (
 981                  "can't get the "
 982                  + (end - start)
 983                  + "characters at position "
 984                  + start
 985                  + " - "
 986                  + ioe.getMessage ());
 987          }
 988  
 989          return (ret);
 990      }
 991  
 992      /**
 993       * Put the text identified by the given limits into the given buffer.
 994       * @param buffer The accumulator for the characters.
 995       * @param start The starting position, zero based.
 996       * @param end The ending position
 997       * (exclusive, i.e. the character at the ending position is not included),
 998       * zero based.
 999       * @exception IllegalArgumentException If an attempt is made to get
1000       * characters ahead of the current source offset (character position).
1001       */
1002      public void getText (StringBuffer buffer, int start, int end)
1003          throws
1004              IllegalArgumentException
1005      {
1006          int length;
1007  
1008          if ((mSource.offset () < start) || (mSource.offset () < end))
1009              throw new IllegalArgumentException (
1010                  "attempt to extract future characters from source"
1011                  + start + "|" + end + " > " + mSource.offset ());
1012          if (end < start)
1013          {
1014              length = end;
1015              end = start;
1016              start = length;
1017          }
1018          length = end - start;
1019          try
1020          {
1021              mSource.getCharacters (buffer, start, length);
1022          }
1023          catch (IOException ioe)
1024          {
1025              throw new IllegalArgumentException (
1026                  "can't get the "
1027                  + (end - start)
1028                  + "characters at position "
1029                  + start
1030                  + " - "
1031                  + ioe.getMessage ());
1032          }
1033      }
1034  
1035      /**
1036       * Get all text read so far from the source.
1037       * @return The text from the source.
1038       * @see #getText(StringBuffer)
1039       */
1040      public String getText ()
1041      {
1042          return (getText (0, mSource.offset ()));
1043      }
1044  
1045      /**
1046       * Put all text read so far from the source into the given buffer.
1047       * @param buffer The accumulator for the characters.
1048       * @see #getText(StringBuffer,int,int)
1049       */
1050      public void getText (StringBuffer buffer)
1051      {
1052          getText (buffer, 0, mSource.offset ());
1053      }
1054  
1055      /**
1056       * Put the text identified by the given limits into the given array at the specified offset.
1057       * @param array The array of characters.
1058       * @param offset The starting position in the array where characters are to be placed.
1059       * @param start The starting position, zero based.
1060       * @param end The ending position
1061       * (exclusive, i.e. the character at the ending position is not included),
1062       * zero based.
1063       * @exception IllegalArgumentException If an attempt is made to get
1064       * characters ahead of the current source offset (character position).
1065       */
1066      public void getText (char[] array, int offset, int start, int end)
1067          throws
1068              IllegalArgumentException
1069      {
1070          int length;
1071  
1072          if ((mSource.offset () < start) || (mSource.offset () < end))
1073              throw new IllegalArgumentException ("attempt to extract future characters from source");
1074          if (end < start)
1075          {   // swap
1076              length = end;
1077              end = start;
1078              start = length;
1079          }
1080          length = end - start;
1081          try
1082          {
1083              mSource.getCharacters (array, offset, start, end);
1084          }
1085          catch (IOException ioe)
1086          {
1087              throw new IllegalArgumentException (
1088                  "can't get the "
1089                  + (end - start)
1090                  + "characters at position "
1091                  + start
1092                  + " - "
1093                  + ioe.getMessage ());
1094          }
1095      }
1096  
1097      /**
1098       * Get the text line the position of the cursor lies on.
1099       * @param cursor The position to calculate for.
1100       * @return The contents of the URL or file corresponding to the line number
1101       * containing the cursor position.
1102       */
1103      public String getLine (Cursor cursor)
1104      {
1105          int line;
1106          int size;
1107          int start;
1108          int end;
1109  
1110          line = row (cursor);
1111          size = mIndex.size ();
1112          if (line < size)
1113          {
1114              start = mIndex.elementAt (line);
1115              line++;
1116              if (line <= size)
1117                  end = mIndex.elementAt (line);
1118              else
1119                  end = mSource.offset ();
1120          }
1121          else // current line
1122          {
1123              start = mIndex.elementAt (line - 1);
1124              end = mSource.offset ();
1125          }
1126          
1127              
1128          return (getText (start,  end));
1129      }
1130  
1131      /**
1132       * Get the text line the position of the cursor lies on.
1133       * @param position The position to calculate for.
1134       * @return The contents of the URL or file corresponding to the line number
1135       * containing the cursor position.
1136       */
1137      public String getLine (int position)
1138      {
1139          return (getLine (new Cursor (this, position)));
1140      }
1141      
1142      /**
1143       * Display some of this page as a string.
1144       * @return The last few characters the source read in.
1145       */
1146      public String toString ()
1147      {
1148          StringBuffer buffer;
1149          int start;
1150          String ret;
1151  
1152          if (mSource.offset () > 0)
1153          {
1154              buffer = new StringBuffer (43);
1155              start = mSource.offset () - 40;
1156              if (0 > start)
1157                  start = 0;
1158              else
1159                  buffer.append ("...");
1160              getText (buffer, start, mSource.offset ());
1161              ret = buffer.toString ();
1162          }
1163          else
1164              ret = super.toString ();
1165          
1166          return (ret);
1167      }
1168  }