StringBean.java
1 // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML 2 // http://sourceforge.org/projects/htmlparser 3 // Copyright (C) 2004 Derrick Oswald 4 // 5 // Revision Control Information 6 // 7 // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v $ 8 // $Author: derrickoswald $ 9 // $Date: 2005/05/15 11:49:03 $ 10 // $Revision: 1.44 $ 11 // 12 // This library is free software; you can redistribute it and/or 13 // modify it under the terms of the GNU Lesser General Public 14 // License as published by the Free Software Foundation; either 15 // version 2.1 of the License, or (at your option) any later version. 16 // 17 // This library is distributed in the hope that it will be useful, 18 // but WITHOUT ANY WARRANTY; without even the implied warranty of 19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 // Lesser General Public License for more details. 21 // 22 // You should have received a copy of the GNU Lesser General Public 23 // License along with this library; if not, write to the Free Software 24 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 // 26 27 package org.htmlparser.beans; 28 29 import java.beans.PropertyChangeListener; 30 import java.beans.PropertyChangeSupport; 31 import java.io.Serializable; 32 import java.net.URLConnection; 33 34 import org.htmlparser.Parser; 35 import org.htmlparser.Text; 36 import org.htmlparser.tags.LinkTag; 37 import org.htmlparser.Tag; 38 import org.htmlparser.util.ParserException; 39 import org.htmlparser.util.EncodingChangeException; 40 import org.htmlparser.util.Translate; 41 import org.htmlparser.visitors.NodeVisitor; 42 43 /** 44 * Extract strings from a URL. 45 * <p>Text within <SCRIPT></SCRIPT> tags is removed.</p> 46 * <p>The text within <PRE></PRE> tags is not altered.</p> 47 * <p>The property <code>Strings</code>, which is the output property is null 48 * until a URL is set. So a typical usage is:</p> 49 * <pre> 50 * StringBean sb = new StringBean (); 51 * sb.setLinks (false); 52 * sb.setReplaceNonBreakingSpaces (true); 53 * sb.setCollapse (true); 54 * sb.setURL ("http://www.netbeans.org"); // the HTTP is performed here 55 * String s = sb.getStrings (); 56 * </pre> 57 * You can also use the StringBean as a NodeVisitor on your own parser, 58 * in which case you have to refetch your page if you change one of the 59 * properties because it resets the Strings property:</p> 60 * <pre> 61 * StringBean sb = new StringBean (); 62 * Parser parser = new Parser ("http://cbc.ca"); 63 * parser.visitAllNodesWith (sb); 64 * String s = sb.getStrings (); 65 * sb.setLinks (true); 66 * parser.reset (); 67 * parser.visitAllNodesWith (sb); 68 * String sl = sb.getStrings (); 69 * </pre> 70 * According to Nick Burch, who contributed the patch, this is handy if you 71 * don't want StringBean to wander off and get the content itself, either 72 * because you already have it, it's not on a website etc. 73 */ 74 public class StringBean extends NodeVisitor implements Serializable 75 { 76 /** 77 * Property name in event where the URL contents changes. 78 */ 79 public static final String PROP_STRINGS_PROPERTY = "strings"; 80 81 /** 82 * Property name in event where the 'embed links' state changes. 83 */ 84 public static final String PROP_LINKS_PROPERTY = "links"; 85 86 /** 87 * Property name in event where the URL changes. 88 */ 89 public static final String PROP_URL_PROPERTY = "URL"; 90 91 /** 92 * Property name in event where the 'replace non-breaking spaces' 93 * state changes. 94 */ 95 public static final String PROP_REPLACE_SPACE_PROPERTY = 96 "replaceNonBreakingSpaces"; 97 98 /** 99 * Property name in event where the 'collapse whitespace' state changes. 100 */ 101 public static final String PROP_COLLAPSE_PROPERTY = "collapse"; 102 103 /** 104 * Property name in event where the connection changes. 105 */ 106 public static final String PROP_CONNECTION_PROPERTY = "connection"; 107 108 /** 109 * A newline. 110 */ 111 private static final String NEWLINE = System.getProperty ("line.separator"); 112 113 /** 114 * The length of the NEWLINE. 115 */ 116 private static final int NEWLINE_SIZE = NEWLINE.length (); 117 118 /** 119 * Bound property support. 120 */ 121 protected PropertyChangeSupport mPropertySupport; 122 123 /** 124 * The parser used to extract strings. 125 */ 126 protected Parser mParser; 127 128 /** 129 * The strings extracted from the URL. 130 */ 131 protected String mStrings; 132 133 /** 134 * If <code>true</code> the link URLs are embedded in the text output. 135 */ 136 protected boolean mLinks; 137 138 /** 139 * If <code>true</code> regular space characters are substituted for 140 * non-breaking spaces in the text output. 141 */ 142 protected boolean mReplaceSpace; 143 144 /** 145 * If <code>true</code> sequences of whitespace characters are replaced 146 * with a single space character. 147 */ 148 protected boolean mCollapse; 149 150 /** 151 * The buffer text is stored in while traversing the HTML. 152 */ 153 protected StringBuffer mBuffer; 154 155 /** 156 * Set <code>true</code> when traversing a SCRIPT tag. 157 */ 158 protected boolean mIsScript; 159 160 /** 161 * Set <code>true</code> when traversing a PRE tag. 162 */ 163 protected boolean mIsPre; 164 165 /** 166 * Set <code>true</code> when traversing a STYLE tag. 167 */ 168 protected boolean mIsStyle; 169 170 /** 171 * Create a StringBean object. 172 * Default property values are set to 'do the right thing': 173 * <p><code>Links</code> is set <code>false</code> so text appears like a 174 * browser would display it, albeit without the colour or underline clues 175 * normally associated with a link.</p> 176 * <p><code>ReplaceNonBreakingSpaces</code> is set <code>true</code>, so 177 * that printing the text works, but the extra information regarding these 178 * formatting marks is available if you set it false.</p> 179 * <p><code>Collapse</code> is set <code>true</code>, so text appears 180 * compact like a browser would display it.</p> 181 */ 182 public StringBean () 183 { 184 super (true, true); 185 mPropertySupport = new PropertyChangeSupport (this); 186 mParser = new Parser (); 187 mStrings = null; 188 mLinks = false; 189 mReplaceSpace = true; 190 mCollapse = true; 191 mBuffer = new StringBuffer (4096); 192 mIsScript = false; 193 mIsPre = false; 194 mIsStyle = false; 195 } 196 197 // 198 // internals 199 // 200 201 /** 202 * Appends a newline to the buffer if there isn't one there already. 203 * Except if the buffer is empty. 204 */ 205 protected void carriageReturn () 206 { 207 int length; 208 209 length = mBuffer.length (); 210 if ((0 != length) // don't append newlines to the beginning of a buffer 211 && ((NEWLINE_SIZE <= length) // not enough chars to hold a NEWLINE 212 && (!mBuffer.substring ( 213 length - NEWLINE_SIZE, length).equals (NEWLINE)))) 214 mBuffer.append (NEWLINE); 215 } 216 217 /** 218 * Add the given text collapsing whitespace. 219 * Use a little finite state machine: 220 * <pre> 221 * state 0: whitepace was last emitted character 222 * state 1: in whitespace 223 * state 2: in word 224 * A whitespace character moves us to state 1 and any other character 225 * moves us to state 2, except that state 0 stays in state 0 until 226 * a non-whitespace and going from whitespace to word we emit a space 227 * before the character: 228 * input: whitespace other-character 229 * state\next 230 * 0 0 2 231 * 1 1 space then 2 232 * 2 1 2 233 * </pre> 234 * @param buffer The buffer to append to. 235 * @param string The string to append. 236 */ 237 protected void collapse (StringBuffer buffer, String string) 238 { 239 int chars; 240 int length; 241 int state; 242 char character; 243 244 chars = string.length (); 245 if (0 != chars) 246 { 247 length = buffer.length (); 248 state = ((0 == length) 249 || (buffer.charAt (length - 1) == ' ') 250 || ((NEWLINE_SIZE <= length) 251 && buffer.substring ( 252 length - NEWLINE_SIZE, length).equals (NEWLINE))) 253 ? 0 : 1; 254 for (int i = 0; i < chars; i++) 255 { 256 character = string.charAt (i); 257 switch (character) 258 { 259 // see HTML specification section 9.1 White space 260 // http://www.w3.org/TR/html4/struct/text.html#h-9.1 261 case '\u0020': 262 case '\u0009': 263 case '\u000C': 264 case '\u200B': 265 case '\r': 266 case '\n': 267 if (0 != state) 268 state = 1; 269 break; 270 default: 271 if (1 == state) 272 buffer.append (' '); 273 state = 2; 274 buffer.append (character); 275 } 276 } 277 } 278 } 279 280 /** 281 * Extract the text from a page. 282 * @return The textual contents of the page. 283 * @exception ParserException If a parse error occurs. 284 */ 285 protected String extractStrings () 286 throws 287 ParserException 288 { 289 String ret; 290 291 mParser.visitAllNodesWith (this); 292 ret = mBuffer.toString (); 293 mBuffer = new StringBuffer(4096); 294 295 return (ret); 296 } 297 298 /** 299 * Assign the <code>Strings</code> property, firing the property change. 300 * @param strings The new value of the <code>Strings</code> property. 301 */ 302 protected void updateStrings (String strings) 303 { 304 String oldValue; 305 306 if ((null == mStrings) || !mStrings.equals (strings)) 307 { 308 oldValue = mStrings; 309 mStrings = strings; 310 mPropertySupport.firePropertyChange ( 311 PROP_STRINGS_PROPERTY, oldValue, strings); 312 } 313 } 314 315 /** 316 * Fetch the URL contents. 317 * Only do work if there is a valid parser with it's URL set. 318 */ 319 protected void setStrings () 320 { 321 if (null != getURL ()) 322 try 323 { 324 try 325 { 326 mParser.visitAllNodesWith (this); 327 updateStrings (mBuffer.toString ()); 328 } 329 finally 330 { 331 mBuffer = new StringBuffer (4096); 332 } 333 } 334 catch (EncodingChangeException ece) 335 { 336 mIsPre = false; 337 mIsScript = false; 338 mIsStyle = false; 339 try 340 { // try again with the encoding now in force 341 mParser.reset (); 342 mBuffer = new StringBuffer (4096); 343 mParser.visitAllNodesWith (this); 344 updateStrings (mBuffer.toString ()); 345 } 346 catch (ParserException pe) 347 { 348 updateStrings (pe.toString ()); 349 } 350 finally 351 { 352 mBuffer = new StringBuffer (4096); 353 } 354 } 355 catch (ParserException pe) 356 { 357 updateStrings (pe.toString ()); 358 } 359 else 360 { 361 // reset in case this StringBean is used as a visitor 362 // on another parser, not it's own 363 mStrings = null; 364 mBuffer = new StringBuffer (4096); 365 } 366 } 367 368 /** 369 * Refetch the URL contents. 370 * Only need to worry if there is already a valid parser and it's 371 * been spent fetching the string contents. 372 */ 373 private void resetStrings () 374 { 375 if (null != mStrings) 376 try 377 { 378 mParser.setURL (getURL ()); 379 setStrings (); 380 } 381 catch (ParserException pe) 382 { 383 updateStrings (pe.toString ()); 384 } 385 } 386 387 // 388 // Property change support. 389 // 390 391 /** 392 * Add a PropertyChangeListener to the listener list. 393 * The listener is registered for all properties. 394 * @param listener The PropertyChangeListener to be added. 395 */ 396 public void addPropertyChangeListener (PropertyChangeListener listener) 397 { 398 mPropertySupport.addPropertyChangeListener (listener); 399 } 400 401 /** 402 * Remove a PropertyChangeListener from the listener list. 403 * This removes a registered PropertyChangeListener. 404 * @param listener The PropertyChangeListener to be removed. 405 */ 406 public void removePropertyChangeListener (PropertyChangeListener listener) 407 { 408 mPropertySupport.removePropertyChangeListener (listener); 409 } 410 411 // 412 // Properties 413 // 414 415 /** 416 * Return the textual contents of the URL. 417 * This is the primary output of the bean. 418 * @return The user visible (what would be seen in a browser) text. 419 */ 420 public String getStrings () 421 { 422 if (null == mStrings) 423 if (0 == mBuffer.length ()) 424 setStrings (); 425 else 426 updateStrings (mBuffer.toString ()); 427 428 return (mStrings); 429 } 430 431 /** 432 * Get the current 'include links' state. 433 * @return <code>true</code> if link text is included in the text extracted 434 * from the URL, <code>false</code> otherwise. 435 */ 436 public boolean getLinks () 437 { 438 return (mLinks); 439 } 440 441 /** 442 * Set the 'include links' state. 443 * If the setting is changed after the URL has been set, the text from the 444 * URL will be reacquired, which is possibly expensive. 445 * @param links Use <code>true</code> if link text is to be included in the 446 * text extracted from the URL, <code>false</code> otherwise. 447 */ 448 public void setLinks (boolean links) 449 { 450 boolean oldValue = mLinks; 451 if (oldValue != links) 452 { 453 mLinks = links; 454 mPropertySupport.firePropertyChange ( 455 PROP_LINKS_PROPERTY, oldValue, links); 456 resetStrings (); 457 } 458 } 459 460 /** 461 * Get the current URL. 462 * @return The URL from which text has been extracted, or <code>null</code> 463 * if this property has not been set yet. 464 */ 465 public String getURL () 466 { 467 return ((null != mParser) ? mParser.getURL () : null); 468 } 469 470 /** 471 * Set the URL to extract strings from. 472 * The text from the URL will be fetched, which may be expensive, so this 473 * property should be set last. 474 * @param url The URL that text should be fetched from. 475 */ 476 public void setURL (String url) 477 { 478 String old; 479 URLConnection conn; 480 481 old = getURL (); 482 conn = getConnection (); 483 if (((null == old) && (null != url)) || ((null != old) 484 && !old.equals (url))) 485 { 486 try 487 { 488 if (null == mParser) 489 mParser = new Parser (url); 490 else 491 mParser.setURL (url); 492 mPropertySupport.firePropertyChange ( 493 PROP_URL_PROPERTY, old, getURL ()); 494 mPropertySupport.firePropertyChange ( 495 PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ()); 496 setStrings (); 497 } 498 catch (ParserException pe) 499 { 500 updateStrings (pe.toString ()); 501 } 502 } 503 } 504 505 /** 506 * Get the current 'replace non breaking spaces' state. 507 * @return <code>true</code> if non-breaking spaces (character '\u00a0', 508 * numeric character reference &#160; or character entity 509 * reference &nbsp;) are to be replaced with normal 510 * spaces (character '\u0020'). 511 */ 512 public boolean getReplaceNonBreakingSpaces () 513 { 514 return (mReplaceSpace); 515 } 516 517 /** 518 * Set the 'replace non breaking spaces' state. 519 * If the setting is changed after the URL has been set, the text from the 520 * URL will be reacquired, which is possibly expensive. 521 * @param replace <code>true</code> if non-breaking spaces 522 * (character '\u00a0', numeric character reference &#160; 523 * or character entity reference &nbsp;) are to be replaced with normal 524 * spaces (character '\u0020'). 525 */ 526 public void setReplaceNonBreakingSpaces (boolean replace) 527 { 528 boolean oldValue = mReplaceSpace; 529 if (oldValue != replace) 530 { 531 mReplaceSpace = replace; 532 mPropertySupport.firePropertyChange (PROP_REPLACE_SPACE_PROPERTY, 533 oldValue, replace); 534 resetStrings (); 535 } 536 } 537 538 /** 539 * Get the current 'collapse whitespace' state. 540 * If set to <code>true</code> this emulates the operation of browsers 541 * in interpretting text where <quote>user agents should collapse input 542 * white space sequences when producing output inter-word space</quote>. 543 * See HTML specification section 9.1 White space 544 * <a href="http://www.w3.org/TR/html4/struct/text.html#h-9.1"> 545 * http://www.w3.org/TR/html4/struct/text.html#h-9.1</a>. 546 * @return <code>true</code> if sequences of whitespace (space '\u0020', 547 * tab '\u0009', form feed '\u000C', zero-width space '\u200B', 548 * carriage-return '\r' and NEWLINE '\n') are to be replaced with a single 549 * space. 550 */ 551 public boolean getCollapse () 552 { 553 return (mCollapse); 554 } 555 556 /** 557 * Set the current 'collapse whitespace' state. 558 * If the setting is changed after the URL has been set, the text from the 559 * URL will be reacquired, which is possibly expensive. 560 * @param collapse If <code>true</code>, sequences of whitespace 561 * will be reduced to a single space. 562 */ 563 public void setCollapse (boolean collapse) 564 { 565 boolean oldValue = mCollapse; 566 if (oldValue != collapse) 567 { 568 mCollapse = collapse; 569 mPropertySupport.firePropertyChange ( 570 PROP_COLLAPSE_PROPERTY, oldValue, collapse); 571 resetStrings (); 572 } 573 } 574 575 /** 576 * Get the current connection. 577 * @return The connection that the parser has or <code>null</code> if it 578 * hasn't been set or the parser hasn't been constructed yet. 579 */ 580 public URLConnection getConnection () 581 { 582 return ((null != mParser) ? mParser.getConnection () : null); 583 } 584 585 /** 586 * Set the parser's connection. 587 * The text from the URL will be fetched, which may be expensive, so this 588 * property should be set last. 589 * @param connection New value of property Connection. 590 */ 591 public void setConnection (URLConnection connection) 592 { 593 String url; 594 URLConnection conn; 595 596 url = getURL (); 597 conn = getConnection (); 598 if (((null == conn) && (null != connection)) 599 || ((null != conn) && !conn.equals (connection))) 600 { 601 try 602 { 603 if (null == mParser) 604 mParser = new Parser (connection); 605 else 606 mParser.setConnection (connection); 607 mPropertySupport.firePropertyChange ( 608 PROP_URL_PROPERTY, url, getURL ()); 609 mPropertySupport.firePropertyChange ( 610 PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ()); 611 setStrings (); 612 } 613 catch (ParserException pe) 614 { 615 updateStrings (pe.toString ()); 616 } 617 } 618 } 619 620 // 621 // NodeVisitor overrides 622 // 623 624 /** 625 * Appends the text to the output. 626 * @param string The text node. 627 */ 628 public void visitStringNode (Text string) 629 { 630 if (!mIsScript && !mIsStyle) 631 { 632 String text = string.getText (); 633 if (!mIsPre) 634 { 635 text = Translate.decode (text); 636 if (getReplaceNonBreakingSpaces ()) 637 text = text.replace ('\u00a0', ' '); 638 if (getCollapse ()) 639 collapse (mBuffer, text); 640 else 641 mBuffer.append (text); 642 } 643 else 644 mBuffer.append (text); 645 } 646 } 647 648 /** 649 * Appends a NEWLINE to the output if the tag breaks flow, and 650 * possibly sets the state of the PRE and SCRIPT flags. 651 * @param tag The tag to examine. 652 */ 653 public void visitTag (Tag tag) 654 { 655 String name; 656 657 if (tag instanceof LinkTag) 658 if (getLinks ()) 659 { // appends the link as text between angle brackets to the output. 660 mBuffer.append ("<"); 661 mBuffer.append (((LinkTag)tag).getLink ()); 662 mBuffer.append (">"); 663 } 664 name = tag.getTagName (); 665 if (name.equalsIgnoreCase ("PRE")) 666 mIsPre = true; 667 else if (name.equalsIgnoreCase ("SCRIPT")) 668 mIsScript = true; 669 else if (name.equalsIgnoreCase ("STYLE")) 670 mIsStyle = true; 671 if (tag.breaksFlow ()) 672 carriageReturn (); 673 } 674 675 /** 676 * Resets the state of the PRE and SCRIPT flags. 677 * @param tag The end tag to process. 678 */ 679 public void visitEndTag (Tag tag) 680 { 681 String name; 682 683 name = tag.getTagName (); 684 if (name.equalsIgnoreCase ("PRE")) 685 mIsPre = false; 686 else if (name.equalsIgnoreCase ("SCRIPT")) 687 mIsScript = false; 688 else if (name.equalsIgnoreCase ("STYLE")) 689 mIsStyle = false; 690 } 691 692 /** 693 * Unit test. 694 * @param args Pass arg[0] as the URL to process. 695 */ 696 public static void main (String[] args) 697 { 698 if (0 >= args.length) 699 System.out.println ("Usage: java -classpath htmlparser.jar" 700 + " org.htmlparser.beans.StringBean <http://whatever_url>"); 701 else 702 { 703 StringBean sb = new StringBean (); 704 sb.setLinks (false); 705 sb.setReplaceNonBreakingSpaces (true); 706 sb.setCollapse (true); 707 sb.setURL (args[0]); 708 System.out.println (sb.getStrings ()); 709 } 710 } 711 }