// Page.java
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v $
// $Author: derrickoswald $
// $Date: 2006/03/19 17:09:09 $
// $Revision: 1.53 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//

package org.htmlparser.lexer;

import java.io.InputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.UnknownHostException;
import java.util.zip.GZIPInputStream;
import java.util.zip.InflaterInputStream;

import org.htmlparser.http.ConnectionManager;
import org.htmlparser.util.ParserException;

/**
 * Represents the contents of an HTML page.
 * Contains the source of characters and an index of positions of line
 * separators (actually the first character position on the next line).
 */
public class Page
    implements
        Serializable
{
    /**
     * The default charset.
     * This should be <code>{@value}</code>,
     * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616)
     * section 3.7.1
     * <p>Another alias is "8859_1".
     */
    public static final String DEFAULT_CHARSET = "ISO-8859-1";

    /**
     * The default content type.
     * In the absence of alternate information, assume html content ({@value}).
     */
    public static final String DEFAULT_CONTENT_TYPE = "text/html";

    /**
     * Character value when the page is exhausted.
     * Has a value of {@value}.
     */
    public static final char EOF = (char)Source.EOF;

    /**
     * The URL this page is coming from.
     * Cached value of <code>getConnection().toExternalForm()</code> or
     * <code>setUrl()</code>.
     */
    protected String mUrl;

    /**
     * The base URL for this page.
     */
    protected String mBaseUrl;

    /**
     * The source of characters.
     */
    protected Source mSource;

    /**
     * Character positions of the first character in each line.
     * An entry is added by {@link #getCharacter} each time an end of line
     * is read, holding the cursor position just past that end of line.
     */
    protected PageIndex mIndex;

    /**
     * The connection this page is coming from or <code>null</code>.
     */
    protected transient URLConnection mConnection;

    /**
     * Connection control (proxy, cookies, authorization).
     */
    protected static ConnectionManager mConnectionManager =
        new ConnectionManager ();

    /**
     * Construct an empty page.
     */
    public Page ()
    {
        this ("");
    }

    /**
     * Construct a page reading from a URL connection.
     * @param connection A fully conditioned connection. The connect()
     * method will be called so it need not be connected yet.
     * @exception ParserException An exception object wrapping a number of
     * possible error conditions, some of which are outlined below.
     * <ul>
     * <li>IOException If an i/o exception occurs creating the
     * source.</li>
     * <li>UnsupportedEncodingException if the character set specified in the
     * HTTP header is not supported.</li>
     * </ul>
     */
    public Page (URLConnection connection) throws ParserException
    {
        if (null == connection)
            throw new IllegalArgumentException ("connection cannot be null");
        setConnection (connection);
        mBaseUrl = null;
    }

    /**
     * Construct a page from a stream encoded with the given charset.
     * @param stream The source of bytes.
     * @param charset The encoding used.
     * If null, defaults to the <code>DEFAULT_CHARSET</code>.
     * @exception UnsupportedEncodingException If the given charset
     * is not supported.
     */
    public Page (InputStream stream, String charset)
        throws
            UnsupportedEncodingException
    {
        if (null == stream)
            throw new IllegalArgumentException ("stream cannot be null");
        if (null == charset)
            charset = DEFAULT_CHARSET;
        mSource = new InputStreamSource (stream, charset);
        mIndex = new PageIndex (this);
        mConnection = null;
        mUrl = null;
        mBaseUrl = null;
    }

    /**
     * Construct a page from the given string.
     * @param text The HTML text.
     * @param charset <em>Optional</em>. The character set encoding that will
     * be reported by {@link #getEncoding}. If charset is <code>null</code>
     * the default character set is used.
     */
    public Page (String text, String charset)
    {
        if (null == text)
            throw new IllegalArgumentException ("text cannot be null");
        if (null == charset)
            charset = DEFAULT_CHARSET;
        mSource = new StringSource (text, charset);
        mIndex = new PageIndex (this);
        mConnection = null;
        mUrl = null;
        mBaseUrl = null;
    }

    /**
     * Construct a page from the given string.
     * The page will report that it is using an encoding of
     * {@link #DEFAULT_CHARSET}.
     * @param text The HTML text.
     */
    public Page (String text)
    {
        this (text, null);
    }

    /**
     * Construct a page from a source.
     * @param source The source of characters.
     */
    public Page (Source source)
    {
        if (null == source)
            throw new IllegalArgumentException ("source cannot be null");
        mSource = source;
        mIndex = new PageIndex (this);
        mConnection = null;
        mUrl = null;
        mBaseUrl = null;
    }

    //
    // static methods
    //

    /**
     * Get the connection manager all Parsers use.
     * @return The connection manager.
     */
    public static ConnectionManager getConnectionManager ()
    {
        return (mConnectionManager);
    }

    /**
     * Set the connection manager to use.
     * @param manager The new connection manager.
     */
    public static void setConnectionManager (ConnectionManager manager)
    {
        mConnectionManager = manager;
    }

    /**
     * Get a CharacterSet name corresponding to a charset parameter.
     * @param content A text line of the form:
     * <pre>
     * text/html; charset=Shift_JIS
     * </pre>
     * which is applicable both to the HTTP header field Content-Type and
     * the meta tag http-equiv="Content-Type".
     * The parameter name is matched without regard to case, so
     * <code>Charset=</code> and <code>CHARSET=</code> are recognized too.
     * Note this method also handles non-compliant quoted charset directives
     * such as:
     * <pre>
     * text/html; charset="UTF-8"
     * </pre>
     * and
     * <pre>
     * text/html; charset='UTF-8'
     * </pre>
     * @return The character set name to use when reading the input stream.
     * For JDKs that have the Charset class this is qualified by passing
     * the name to findCharset() to render it into canonical form.
     * If the charset parameter is not found in the given string, the
     * encoding of the current source is returned, or the default
     * character set if there is no source yet.
     * @see #findCharset
     * @see #DEFAULT_CHARSET
     */
    public String getCharset (String content)
    {
        final String CHARSET_STRING = "charset";
        int index;
        String ret;

        if (null == mSource)
            ret = DEFAULT_CHARSET;
        else
            // use existing (possibly supplied) character set:
            // bug #1322686 when illegal charset specified
            ret = mSource.getEncoding ();
        if (null != content)
        {
            // parameter names are case-insensitive, so don't use indexOf()
            index = indexOfIgnoreCase (content, CHARSET_STRING);

            if (index != -1)
            {
                content = content.substring (index +
                    CHARSET_STRING.length ()).trim ();
                if (content.startsWith ("="))
                {
                    content = content.substring (1).trim ();
                    index = content.indexOf (";");
                    if (index != -1)
                        // drop following parameters and any blanks
                        // that preceded the separator
                        content = content.substring (0, index).trim ();

                    //remove any double quotes from around charset string
                    if (content.startsWith ("\"") && content.endsWith ("\"")
                        && (1 < content.length ()))
                        content = content.substring (1, content.length () - 1);

                    //remove any single quote from around charset string
                    if (content.startsWith ("'") && content.endsWith ("'")
                        && (1 < content.length ()))
                        content = content.substring (1, content.length () - 1);

                    // Charset names are not case-sensitive, so the
                    // canonical name may differ from content only in case.
                    ret = findCharset (content, ret);
                }
            }
        }

        return (ret);
    }

    /**
     * Lookup a character set name.
     * <em>Vacuous for JVM's without <code>java.nio.charset</code>.</em>
     * This uses reflection so the code will still run under prior JDK's but
     * in that case the given name is returned unchanged.
     * @param name The name to look up. One of the aliases for a character set.
     * @param fallback The name to return if the lookup fails.
     * @return The character set name.
     */
    public static String findCharset (String name, String fallback)
    {
        String ret;

        try
        {
            Class cls;
            Method method;
            Object object;

            // equivalent to Charset.forName (name).name ()
            cls = Class.forName ("java.nio.charset.Charset");
            method = cls.getMethod ("forName", new Class[] { String.class });
            object = method.invoke (null, new Object[] { name });
            method = cls.getMethod ("name", new Class[] { });
            object = method.invoke (object, new Object[] { });
            ret = (String)object;
        }
        catch (ClassNotFoundException cnfe)
        {
            // for reflection exceptions, assume the name is correct
            ret = name;
        }
        catch (NoSuchMethodException nsme)
        {
            // for reflection exceptions, assume the name is correct
            ret = name;
        }
        catch (IllegalAccessException ia)
        {
            // for reflection exceptions, assume the name is correct
            ret = name;
        }
        catch (InvocationTargetException ita)
        {
            // java.nio.charset.IllegalCharsetNameException
            // and java.nio.charset.UnsupportedCharsetException
            // return the default
            ret = fallback;
            System.out.println (
                "unable to determine cannonical charset name for "
                + name
                + " - using "
                + fallback);
        }

        return (ret);
    }

    /**
     * Find a token within some text, ignoring case.
     * Works on the original strings, so the returned index is valid for
     * <code>text</code> (which lowercasing a copy would not guarantee).
     * @param text The text to search.
     * @param token The text to look for.
     * @return The index of the first occurrence of <code>token</code>,
     * or -1 if it does not occur.
     */
    private static int indexOfIgnoreCase (String text, String token)
    {
        int length;
        int last;

        length = token.length ();
        last = text.length () - length;
        for (int i = 0; i <= last; i++)
            if (text.regionMatches (true, i, token, 0, length))
                return (i);

        return (-1);
    }

    /**
     * Test if a path segment ends at the given index.
     * @param path The path (and possibly query) portion of a URL.
     * @param index The index just past the candidate segment.
     * @return <code>true</code> if the index is at the end of the path
     * or at a <code>'/'</code> or <code>'?'</code> character.
     */
    private static boolean endsSegment (String path, int index)
    {
        char ch;

        if (index >= path.length ())
            return (true);
        ch = path.charAt (index);

        return (('/' == ch) || ('?' == ch));
    }

    //
    // Serialization support
    //

    /**
     * Serialize the page.
     * There are two modes to serializing a page based on the connected state.
     * If connected, the URL and the current offset is saved, while if
     * disconnected, the underlying source is saved.
     * In both modes the state of this page is the same after the call as
     * it was before.
     * @param out The object stream to store this object in.
     * @exception IOException If there is a serialization problem.
     */
    private void writeObject (ObjectOutputStream out)
        throws
            IOException
    {
        String href;
        Source source;
        PageIndex index;

        // two cases, reading from a URL and not
        href = getUrl ();
        if (null != getConnection ())
        {
            out.writeBoolean (true);
            out.writeInt (mSource.offset ()); // need to preread this much
            out.writeObject (href);
            source = getSource ();
            index = mIndex;
            // the default fields carry the URL to reconnect to
            setUrl (getConnection ().getURL ().toExternalForm ());
            mSource = null; // don't serialize the source if we can avoid it
            mIndex = null; // recreated by setConnection() on deserialization
            try
            {
                out.defaultWriteObject ();
            }
            finally
            {   // put everything back, even if the write failed
                mSource = source;
                mIndex = index;
                setUrl (href);
            }
        }
        else
        {
            out.writeBoolean (false);
            out.writeObject (href);
            setUrl (null); // don't try and read a bogus URL
            try
            {
                out.defaultWriteObject ();
            }
            finally
            {
                setUrl (href);
            }
        }
    }

    /**
     * Deserialize the page.
     * For details see <code>writeObject()</code>.
     * @param in The object stream to decode.
     * @exception IOException If there is a deserialization problem with
     * the stream.
     * @exception ClassNotFoundException If the deserialized class can't be
     * located with the current classpath and class loader.
     */
    private void readObject (ObjectInputStream in)
        throws
            IOException,
            ClassNotFoundException
    {
        boolean fromurl;
        int offset;
        String href;
        URL url;
        Cursor cursor;

        fromurl = in.readBoolean ();
        if (fromurl)
        {
            offset = in.readInt ();
            href = (String)in.readObject ();
            in.defaultReadObject ();
            // open the URL
            if (null != getUrl ())
            {
                url = new URL (getUrl ());
                try
                {
                    setConnection (url.openConnection ());
                }
                catch (ParserException pe)
                {
                    // only the message is carried over, this method can
                    // only throw the two declared exception types
                    throw new IOException (pe.getMessage ());
                }
            }
            // read up to where the original page had read to
            cursor = new Cursor (this, 0);
            for (int i = 0; i < offset; i++)
                try
                {
                    getCharacter (cursor);
                }
                catch (ParserException pe)
                {
                    throw new IOException (pe.getMessage ());
                }
            setUrl (href);
        }
        else
        {
            href = (String)in.readObject ();
            in.defaultReadObject ();
            setUrl (href);
        }
    }

    /**
     * Reset the page by resetting the source of characters.
     */
    public void reset ()
    {
        getSource ().reset ();
        mIndex = new PageIndex (this); // todo: is this really necessary?
    }

    /**
     * Close the page by destroying the source of characters.
     * @exception IOException If destroying the source encounters an error.
     */
    public void close () throws IOException
    {
        if (null != getSource ())
            getSource ().destroy ();
    }

    /**
     * Clean up this page, releasing resources.
     * Calls <code>close()</code>.
     * @exception Throwable if <code>close()</code> throws an
     * <code>IOException</code>.
     */
    protected void finalize ()
        throws
            Throwable
    {
        close ();
    }

    /**
     * Get the connection, if any.
     * @return The connection object for this page, or null if this page
     * is built from a stream or a string.
     */
    public URLConnection getConnection ()
    {
        return (mConnection);
    }

    /**
     * Set the URLConnection to be used by this page.
     * Starts reading from the given connection.
     * This also resets the current url.
     * Content with a content encoding of gzip or deflate (in any case)
     * is decompressed as it is read.
     * @param connection The connection to use.
     * It will be connected by this method.
     * @exception ParserException If the <code>connect()</code> method fails,
     * or an I/O error occurs opening the input stream. If the character set
     * designated in the HTTP header is unsupported, the
     * {@link #DEFAULT_CHARSET} is used instead.
     * @exception IllegalArgumentException If the connection is
     * <code>null</code>.
     */
    public void setConnection (URLConnection connection)
        throws
            ParserException
    {
        Stream stream;
        String type;
        String charset;
        String contentEncoding;

        if (null == connection)
            throw new IllegalArgumentException ("connection cannot be null");
        mConnection = connection;
        try
        {
            getConnection ().connect ();
        }
        catch (UnknownHostException uhe)
        {
            throw new ParserException ("Connect to "
                + mConnection.getURL ().toExternalForm () + " failed.", uhe);
        }
        catch (IOException ioe)
        {
            throw new ParserException ("Exception connecting to "
                + mConnection.getURL ().toExternalForm ()
                + " (" + ioe.getMessage () + ").", ioe);
        }
        type = getContentType ();
        charset = getCharset (type);
        try
        {
            // content coding names are case-insensitive
            contentEncoding = connection.getContentEncoding ();
            if ((null != contentEncoding)
                && (-1 != indexOfIgnoreCase (contentEncoding, "gzip")))
            {
                stream = new Stream (new GZIPInputStream (
                    getConnection ().getInputStream ()));
            }
            else if ((null != contentEncoding)
                && (-1 != indexOfIgnoreCase (contentEncoding, "deflate")))
            {
                stream = new Stream (new InflaterInputStream (
                    getConnection ().getInputStream ()));
            }
            else
            {
                stream = new Stream (getConnection ().getInputStream ());
            }

            try
            {
                mSource = new InputStreamSource (stream, charset);
            }
            catch (UnsupportedEncodingException uee)
            {
                // the encoding in the header is not supported,
                // quietly fall back to the default
                charset = DEFAULT_CHARSET;
                mSource = new InputStreamSource (stream, charset);
            }
        }
        catch (IOException ioe)
        {
            throw new ParserException ("Exception getting input stream from "
                + mConnection.getURL ().toExternalForm ()
                + " (" + ioe.getMessage () + ").", ioe);
        }
        mUrl = connection.getURL ().toExternalForm ();
        mIndex = new PageIndex (this);
    }

    /**
     * Get the URL for this page.
     * This is only available if the page has a connection
     * (<code>getConnection()</code> returns non-null), or the document base has
     * been set via a call to <code>setUrl()</code>.
     * @return The url for the connection, or <code>null</code> if there is
     * no connection or the document base has not been set.
     */
    public String getUrl ()
    {
        return (mUrl);
    }

    /**
     * Set the URL for this page.
     * This doesn't affect the contents of the page, just the interpretation
     * of relative links from this point forward.
     * @param url The new URL.
     */
    public void setUrl (String url)
    {
        mUrl = url;
    }

    /**
     * Gets the baseUrl.
     * @return The base URL for this page, or <code>null</code> if not set.
     */
    public String getBaseUrl ()
    {
        return (mBaseUrl);
    }

    /**
     * Sets the baseUrl.
     * @param url The base url for this page.
     */
    public void setBaseUrl (String url)
    {
        mBaseUrl = url;
    }

    /**
     * Get the source this page is reading from.
     * @return The current source.
     */
    public Source getSource ()
    {
        return (mSource);
    }

    /**
     * Try and extract the content type from the HTTP header.
     * @return The content type reported by the connection, or
     * {@link #DEFAULT_CONTENT_TYPE} if there is no connection or it
     * reports no content type.
     */
    public String getContentType ()
    {
        URLConnection connection;
        String content;
        String ret;

        ret = DEFAULT_CONTENT_TYPE;
        connection = getConnection ();
        if (null != connection)
        {
            content = connection.getContentType ();
            if (null != content)
                ret = content;
        }

        return (ret);
    }

    /**
     * Read the character at the given cursor position.
     * The cursor position can be only behind or equal to the
     * current source position.
     * Returns end of lines (EOL) as \n, by converting \r and \r\n to \n,
     * and updates the end-of-line index accordingly.
     * Advances the cursor position by one (or two in the \r\n case).
     * @param cursor The position to read at.
     * @return The character at that position, and modifies the cursor to
     * prepare for the next read. If the source is exhausted {@link #EOF}
     * is returned and the cursor is not advanced.
     * @exception ParserException If an IOException on the underlying source
     * occurs, or an attempt is made to read characters in the future (the
     * cursor position is ahead of the underlying stream)
     */
    public char getCharacter (Cursor cursor)
        throws
            ParserException
    {
        int i;
        int offset;
        char ret;

        i = cursor.getPosition ();
        offset = mSource.offset ();
        if (offset == i)
        {   // reading new characters from the source
            try
            {
                i = mSource.read ();
                if (Source.EOF == i)
                    ret = EOF;
                else
                {
                    ret = (char)i;
                    cursor.advance ();
                }
            }
            catch (IOException ioe)
            {
                throw new ParserException (
                    "problem reading a character at position "
                    + cursor.getPosition (), ioe);
            }
        }
        else if (offset > i)
        {
            // historic read
            try
            {
                ret = mSource.getCharacter (i);
            }
            catch (IOException ioe)
            {
                throw new ParserException (
                    "can't read a character at position "
                    + i, ioe);
            }
            cursor.advance ();
        }
        else
            // hmmm, we could skip ahead, but then what about the EOL index
            throw new ParserException (
                "attempt to read future characters from source "
                + i + " > " + mSource.offset ());

        // handle \r
        if ('\r' == ret)
        {   // switch to single character EOL
            ret = '\n';

            // check for a \n in the next position
            if (mSource.offset () == cursor.getPosition ())
            {   // the next character hasn't been read yet, peek at it
                try
                {
                    i = mSource.read ();
                    if (Source.EOF == i)
                    {
                        // do nothing
                    }
                    else if ('\n' == (char)i)
                        cursor.advance ();
                    else
                        // not part of the EOL, put it back
                        try
                        {
                            mSource.unread ();
                        }
                        catch (IOException ioe)
                        {
                            throw new ParserException (
                                "can't unread a character at position "
                                + cursor.getPosition (), ioe);
                        }
                }
                catch (IOException ioe)
                {
                    throw new ParserException (
                        "problem reading a character at position "
                        + cursor.getPosition (), ioe);
                }
            }
            else
            {   // the next character has already been read
                try
                {
                    if ('\n' == mSource.getCharacter (cursor.getPosition ()))
                        cursor.advance ();
                }
                catch (IOException ioe)
                {
                    throw new ParserException (
                        "can't read a character at position "
                        + cursor.getPosition (), ioe);
                }
            }
        }
        if ('\n' == ret)
            // update the EOL index in any case
            mIndex.add (cursor);

        return (ret);
    }

    /**
     * Get the current encoding being used.
     * @return The encoding used to convert characters.
     */
    public String getEncoding ()
    {
        return (getSource ().getEncoding ());
    }

    /**
     * Begins reading from the source with the given character set.
     * If the current encoding is the same as the requested encoding,
     * this method is a no-op. Otherwise any subsequent characters read from
     * this page will have been decoded using the given character set.<p>
     * Some magic happens here to obtain this result if characters have already
     * been consumed from this page.
     * Since a Reader cannot be dynamically altered to use a different character
     * set, the underlying stream is reset, a new Source is constructed
     * and a comparison made of the characters read so far with the newly
     * read characters up to the current position.
     * If a difference is encountered, or some other problem occurs,
     * an exception is thrown.
     * @param character_set The character set to use to convert bytes into
     * characters.
     * @exception ParserException If a character mismatch occurs between
     * characters already provided and those that would have been returned
     * had the new character set been in effect from the beginning. An
     * exception is also thrown if the underlying stream won't put up with
     * these shenanigans.
     */
    public void setEncoding (String character_set)
        throws
            ParserException
    {
        getSource ().setEncoding (character_set);
    }

    /**
     * Build a URL from the link and base provided.
     * Leading <code>.</code> and <code>..</code> segments that remain after
     * resolving a relative link are stripped, as are backslashes that
     * directly follow a slash. Names that merely begin with a dot
     * (for example <code>/.well-known/</code>) are left untouched.
     * @return An absolute URL.
     * @param link The (relative) URI.
     * @param base The base URL of the page, either from the &lt;BASE&gt; tag
     * or, if none, the URL the page is being fetched from.
     * @exception MalformedURLException If creating the URL fails.
     */
    public URL constructUrl (String link, String base)
        throws MalformedURLException
    {
        String path;
        boolean modified;
        boolean absolute;
        int index;
        URL url; // constructed URL combining relative link and base

        url = new URL (new URL (base), link);
        path = url.getFile ();
        modified = false;
        absolute = link.startsWith ("/");
        if (!absolute)
        {   // we prefer to fix incorrect relative links
            // this doesn't fix them all, just the ones at the start
            while (path.startsWith ("/."))
            {
                if (path.startsWith ("/../"))
                {
                    path = path.substring (3);
                    modified = true;
                }
                else if (endsSegment (path, 2)
                    || (path.startsWith ("/..") && endsSegment (path, 3)))
                {   // a complete "." segment or a trailing ".." segment
                    path = path.substring (2);
                    modified = true;
                }
                else
                    // an ordinary name that starts with a dot
                    break;
            }
        }
        // fix backslashes
        while (-1 != (index = path.indexOf ("/\\")))
        {
            path = path.substring (0, index + 1) + path.substring (index + 2);
            modified = true;
        }
        if (modified)
            url = new URL (url, path);

        return (url);
    }

    /**
     * Create an absolute URL from a relative link.
     * The base URL is used if it has been set, otherwise the page URL.
     * @param link The relative portion of a URL.
     * @return The fully qualified URL or the original link if it was absolute
     * already or a failure occurred. An empty string is returned for a
     * <code>null</code> or empty link.
     */
    public String getAbsoluteURL (String link)
    {
        String base;
        URL url;
        String ret;

        if ((null == link) || ("".equals (link)))
            ret = "";
        else
            try
            {
                base = getBaseUrl ();
                if (null == base)
                    base = getUrl ();
                if (null == base)
                    ret = link;
                else
                {
                    url = constructUrl (link, base);
                    ret = url.toExternalForm ();
                }
            }
            catch (MalformedURLException murle)
            {
                ret = link;
            }

        return (ret);
    }

    /**
     * Get the line number for a cursor.
     * @param cursor The character offset into the page.
     * @return The line number the character is in.
     */
    public int row (Cursor cursor)
    {
        return (mIndex.row (cursor));
    }

    /**
     * Get the line number for a cursor.
     * @param position The character offset into the page.
     * @return The line number the character is in.
     */
    public int row (int position)
    {
        return (mIndex.row (position));
    }

    /**
     * Get the column number for a cursor.
     * @param cursor The character offset into the page.
     * @return The character offset into the line this cursor is on.
     */
    public int column (Cursor cursor)
    {
        return (mIndex.column (cursor));
    }

    /**
     * Get the column number for a cursor.
     * @param position The character offset into the page.
     * @return The character offset into the line this cursor is on.
     */
    public int column (int position)
    {
        return (mIndex.column (position));
    }

    /**
     * Get the text identified by the given limits.
     * If <code>end</code> is less than <code>start</code> the limits are
     * swapped, as they are by the other <code>getText()</code> methods.
     * @param start The starting position, zero based.
     * @param end The ending position
     * (exclusive, i.e. the character at the ending position is not included),
     * zero based.
     * @return The text from <code>start</code> to <code>end</code>.
     * @see #getText(StringBuffer, int, int)
     * @exception IllegalArgumentException If the underlying source
     * reports an error getting the characters.
     */
    public String getText (int start, int end)
        throws
            IllegalArgumentException
    {
        int swap;
        String ret;

        if (end < start)
        {   // swap
            swap = end;
            end = start;
            start = swap;
        }
        try
        {
            ret = mSource.getString (start, end - start);
        }
        catch (IOException ioe)
        {
            throw new IllegalArgumentException (
                "can't get the "
                + (end - start)
                + " characters at position "
                + start
                + " - "
                + ioe.getMessage ());
        }

        return (ret);
    }

    /**
     * Put the text identified by the given limits into the given buffer.
     * @param buffer The accumulator for the characters.
     * @param start The starting position, zero based.
     * @param end The ending position
     * (exclusive, i.e. the character at the ending position is not included),
     * zero based.
     * @exception IllegalArgumentException If an attempt is made to get
     * characters ahead of the current source offset (character position).
     */
    public void getText (StringBuffer buffer, int start, int end)
        throws
            IllegalArgumentException
    {
        int length;

        if ((mSource.offset () < start) || (mSource.offset () < end))
            throw new IllegalArgumentException (
                "attempt to extract future characters from source "
                + start + "|" + end + " > " + mSource.offset ());
        if (end < start)
        {   // swap
            length = end;
            end = start;
            start = length;
        }
        length = end - start;
        try
        {
            mSource.getCharacters (buffer, start, length);
        }
        catch (IOException ioe)
        {
            throw new IllegalArgumentException (
                "can't get the "
                + (end - start)
                + " characters at position "
                + start
                + " - "
                + ioe.getMessage ());
        }
    }

    /**
     * Get all text read so far from the source.
     * @return The text from the source.
     * @see #getText(StringBuffer)
     */
    public String getText ()
    {
        return (getText (0, mSource.offset ()));
    }

    /**
     * Put all text read so far from the source into the given buffer.
     * @param buffer The accumulator for the characters.
     * @see #getText(StringBuffer,int,int)
     */
    public void getText (StringBuffer buffer)
    {
        getText (buffer, 0, mSource.offset ());
    }

    /**
     * Put the text identified by the given limits into the given array at the specified offset.
     * @param array The array of characters.
     * @param offset The starting position in the array where characters are to be placed.
     * @param start The starting position, zero based.
     * @param end The ending position
     * (exclusive, i.e. the character at the ending position is not included),
     * zero based.
     * @exception IllegalArgumentException If an attempt is made to get
     * characters ahead of the current source offset (character position).
     */
    public void getText (char[] array, int offset, int start, int end)
        throws
            IllegalArgumentException
    {
        int swap;

        if ((mSource.offset () < start) || (mSource.offset () < end))
            throw new IllegalArgumentException ("attempt to extract future characters from source");
        if (end < start)
        {   // swap
            swap = end;
            end = start;
            start = swap;
        }
        try
        {
            // unlike the StringBuffer variant, this takes the end position
            mSource.getCharacters (array, offset, start, end);
        }
        catch (IOException ioe)
        {
            throw new IllegalArgumentException (
                "can't get the "
                + (end - start)
                + " characters at position "
                + start
                + " - "
                + ioe.getMessage ());
        }
    }

    /**
     * Get the text line the position of the cursor lies on.
     * The line runs from the first character after the preceding end of
     * line (or the start of the page) up to the first character of the
     * next line, or to the current source offset if the end of the line
     * hasn't been read yet.
     * @param cursor The position to calculate for.
     * @return The contents of the URL or file corresponding to the line number
     * containing the cursor position.
     */
    public String getLine (Cursor cursor)
    {
        int line;
        int size;
        int start;
        int end;

        line = row (cursor);
        size = mIndex.size ();
        // index entry k holds the position of the first character of
        // row k + 1, so row 0 has no entry and starts at position zero
        if (0 < line)
            start = mIndex.elementAt (line - 1);
        else
            start = 0;
        if (line < size)
            end = mIndex.elementAt (line);
        else // current line
            end = mSource.offset ();

        return (getText (start, end));
    }

    /**
     * Get the text line the position of the cursor lies on.
     * @param position The position to calculate for.
     * @return The contents of the URL or file corresponding to the line number
     * containing the cursor position.
     */
    public String getLine (int position)
    {
        return (getLine (new Cursor (this, position)));
    }

    /**
     * Display some of this page as a string.
     * @return The last few characters (at most 40, preceded by an
     * ellipsis if there are more) the source read in, or the default
     * <code>Object</code> representation if nothing has been read.
     */
    public String toString ()
    {
        StringBuffer buffer;
        int start;
        String ret;

        if ((null != mSource) && (mSource.offset () > 0))
        {
            buffer = new StringBuffer (43);
            start = mSource.offset () - 40;
            if (0 > start)
                start = 0;
            else
                buffer.append ("...");
            getText (buffer, start, mSource.offset ());
            ret = buffer.toString ();
        }
        else
            ret = super.toString ();

        return (ret);
    }
}