/ org.htmlparser / src / org / htmlparser / beans / StringBean.java
StringBean.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2004 Derrick Oswald
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2005/05/15 11:49:03 $
 10  // $Revision: 1.44 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser.beans;
 28  
 29  import java.beans.PropertyChangeListener;
 30  import java.beans.PropertyChangeSupport;
 31  import java.io.Serializable;
 32  import java.net.URLConnection;
 33  
 34  import org.htmlparser.Parser;
 35  import org.htmlparser.Text;
 36  import org.htmlparser.tags.LinkTag;
 37  import org.htmlparser.Tag;
 38  import org.htmlparser.util.ParserException;
 39  import org.htmlparser.util.EncodingChangeException;
 40  import org.htmlparser.util.Translate;
 41  import org.htmlparser.visitors.NodeVisitor;
 42  
 43  /**
 44   * Extract strings from a URL.
 45   * <p>Text within &lt;SCRIPT&gt;&lt;/SCRIPT&gt; tags is removed.</p>
 46   * <p>The text within &lt;PRE&gt;&lt;/PRE&gt; tags is not altered.</p>
 47   * <p>The property <code>Strings</code>, which is the output property is null
 48   * until a URL is set. So a typical usage is:</p>
 49   * <pre>
 50   *     StringBean sb = new StringBean ();
 51   *     sb.setLinks (false);
 52   *     sb.setReplaceNonBreakingSpaces (true);
 53   *     sb.setCollapse (true);
 54   *     sb.setURL ("http://www.netbeans.org"); // the HTTP is performed here
 55   *     String s = sb.getStrings ();
 56   * </pre>
 57   * You can also use the StringBean as a NodeVisitor on your own parser,
 58   * in which case you have to refetch your page if you change one of the
 59   * properties because it resets the Strings property:</p>
 60   * <pre>
 61   *     StringBean sb = new StringBean ();
 62   *     Parser parser = new Parser ("http://cbc.ca");
 63   *     parser.visitAllNodesWith (sb);
 64   *     String s = sb.getStrings ();
 65   *     sb.setLinks (true);
 66   *     parser.reset ();
 67   *     parser.visitAllNodesWith (sb);
 68   *     String sl = sb.getStrings ();
 69   * </pre>
 70   * According to Nick Burch, who contributed the patch, this is handy if you
 71   * don't want StringBean to wander off and get the content itself, either
 72   * because you already have it, it's not on a website etc.
 73   */
 74  public class StringBean extends NodeVisitor implements Serializable
 75  {
 76      /**
 77       * Property name in event where the URL contents changes.
 78       */
 79      public static final String PROP_STRINGS_PROPERTY = "strings";
 80  
 81      /**
 82       * Property name in event where the 'embed links' state changes.
 83       */
 84      public static final String PROP_LINKS_PROPERTY = "links";
 85  
 86      /**
 87       * Property name in event where the URL changes.
 88       */
 89      public static final String PROP_URL_PROPERTY = "URL";
 90  
 91      /**
 92       * Property name in event where the 'replace non-breaking spaces'
 93       * state changes.
 94       */
 95      public static final String PROP_REPLACE_SPACE_PROPERTY =
 96          "replaceNonBreakingSpaces";
 97  
 98      /**
 99       * Property name in event where the 'collapse whitespace' state changes.
100       */
101      public static final String PROP_COLLAPSE_PROPERTY = "collapse";
102  
103      /**
104       * Property name in event where the connection changes.
105       */
106      public static final String PROP_CONNECTION_PROPERTY = "connection";
107  
108      /**
109       * A newline.
110       */
111      private static final String NEWLINE = System.getProperty ("line.separator");
112  
113      /**
114       * The length of the NEWLINE.
115       */
116      private static final int NEWLINE_SIZE = NEWLINE.length ();
117  
118      /**
119       * Bound property support.
120       */
121      protected PropertyChangeSupport mPropertySupport;
122  
123      /**
124       * The parser used to extract strings.
125       */
126      protected Parser mParser;
127  
128      /**
129       * The strings extracted from the URL.
130       */
131      protected String mStrings;
132  
133      /**
134       * If <code>true</code> the link URLs are embedded in the text output.
135       */
136      protected boolean mLinks;
137  
138      /**
139       * If <code>true</code> regular space characters are substituted for
140       * non-breaking spaces in the text output.
141       */
142      protected boolean mReplaceSpace;
143  
144      /**
145       * If <code>true</code> sequences of whitespace characters are replaced
146       * with a single space character.
147       */
148      protected boolean mCollapse;
149  
150      /**
151       * The buffer text is stored in while traversing the HTML.
152       */
153      protected StringBuffer mBuffer;
154  
155      /**
156       * Set <code>true</code> when traversing a SCRIPT tag.
157       */
158      protected boolean mIsScript;
159  
160      /**
161       * Set <code>true</code> when traversing a PRE tag.
162       */
163      protected boolean mIsPre;
164  
165      /**
166       * Set <code>true</code> when traversing a STYLE tag.
167       */
168      protected boolean mIsStyle;
169  
170     /**
171       * Create a StringBean object.
172       * Default property values are set to 'do the right thing':
173       * <p><code>Links</code> is set <code>false</code> so text appears like a
174       * browser would display it, albeit without the colour or underline clues
175       * normally associated with a link.</p>
176       * <p><code>ReplaceNonBreakingSpaces</code> is set <code>true</code>, so
177       * that printing the text works, but the extra information regarding these
178       * formatting marks is available if you set it false.</p>
179       * <p><code>Collapse</code> is set <code>true</code>, so text appears
180       * compact like a browser would display it.</p>
181       */
182      public StringBean ()
183      {
184          super (true, true);
185          mPropertySupport = new PropertyChangeSupport (this);
186          mParser = new Parser ();
187          mStrings = null;
188          mLinks = false;
189          mReplaceSpace = true;
190          mCollapse = true;
191          mBuffer = new StringBuffer (4096);
192          mIsScript = false;
193          mIsPre = false;
194          mIsStyle = false;
195      }
196  
197      //
198      // internals
199      //
200  
201      /**
202       * Appends a newline to the buffer if there isn't one there already.
203       * Except if the buffer is empty.
204       */
205      protected void carriageReturn ()
206      {
207          int length;
208  
209          length = mBuffer.length ();
210          if ((0 != length) // don't append newlines to the beginning of a buffer
211              && ((NEWLINE_SIZE <= length) // not enough chars to hold a NEWLINE
212              && (!mBuffer.substring (
213                  length - NEWLINE_SIZE, length).equals (NEWLINE))))
214              mBuffer.append (NEWLINE);
215      }
216  
217      /**
218       * Add the given text collapsing whitespace.
219       * Use a little finite state machine:
220       * <pre>
221       * state 0: whitepace was last emitted character
222       * state 1: in whitespace
223       * state 2: in word
224       * A whitespace character moves us to state 1 and any other character
225       * moves us to state 2, except that state 0 stays in state 0 until
226       * a non-whitespace and going from whitespace to word we emit a space
227       * before the character:
228       *    input:     whitespace   other-character
229       * state\next
230       *    0               0             2
231       *    1               1        space then 2
232       *    2               1             2
233       * </pre>
234       * @param buffer The buffer to append to.
235       * @param string The string to append.
236       */
237      protected void collapse (StringBuffer buffer, String string)
238      {
239          int chars;
240          int length;
241          int state;
242          char character;
243  
244          chars = string.length ();
245          if (0 != chars)
246          {
247              length = buffer.length ();
248              state = ((0 == length)
249                  || (buffer.charAt (length - 1) == ' ')
250                  || ((NEWLINE_SIZE <= length)
251                      && buffer.substring (
252                          length - NEWLINE_SIZE, length).equals (NEWLINE)))
253                  ? 0 : 1;
254              for (int i = 0; i < chars; i++)
255              {
256                  character = string.charAt (i);
257                  switch (character)
258                  {
259                      // see HTML specification section 9.1 White space
260                      // http://www.w3.org/TR/html4/struct/text.html#h-9.1
261                      case '\u0020':
262                      case '\u0009':
263                      case '\u000C':
264                      case '\u200B':
265                      case '\r':
266                      case '\n':
267                          if (0 != state)
268                              state = 1;
269                          break;
270                      default:
271                          if (1 == state)
272                              buffer.append (' ');
273                          state = 2;
274                          buffer.append (character);
275                  }
276              }
277          }
278      }
279  
280      /**
281       * Extract the text from a page.
282       * @return The textual contents of the page.
283       * @exception ParserException If a parse error occurs.
284       */
285      protected String extractStrings ()
286          throws
287              ParserException
288      {
289          String ret;
290  
291          mParser.visitAllNodesWith (this);
292          ret = mBuffer.toString ();
293          mBuffer = new StringBuffer(4096);
294  
295          return (ret);
296      }
297  
298      /**
299       * Assign the <code>Strings</code> property, firing the property change.
300       * @param strings The new value of the <code>Strings</code> property.
301       */
302      protected void updateStrings (String strings)
303      {
304          String oldValue;
305  
306          if ((null == mStrings) || !mStrings.equals (strings))
307          {
308              oldValue = mStrings;
309              mStrings = strings;
310              mPropertySupport.firePropertyChange (
311                  PROP_STRINGS_PROPERTY, oldValue, strings);
312          }
313      }
314  
315      /**
316       * Fetch the URL contents.
317       * Only do work if there is a valid parser with it's URL set.
318       */
319      protected void setStrings ()
320      {
321          if (null != getURL ())
322              try
323              {
324                  try
325                  {
326                      mParser.visitAllNodesWith (this);
327                      updateStrings (mBuffer.toString ());
328                  }
329                  finally
330                  {
331                      mBuffer = new StringBuffer (4096);
332                  }
333              }
334              catch (EncodingChangeException ece)
335              {
336                  mIsPre = false;
337                  mIsScript = false;
338                  mIsStyle = false;
339                  try
340                  {   // try again with the encoding now in force
341                      mParser.reset ();
342                      mBuffer = new StringBuffer (4096);
343                      mParser.visitAllNodesWith (this);
344                      updateStrings (mBuffer.toString ());
345                  }
346                  catch (ParserException pe)
347                  {
348                      updateStrings (pe.toString ());
349                  }
350                  finally
351                  {
352                      mBuffer = new StringBuffer (4096);
353                  }
354               }
355              catch (ParserException pe)
356              {
357                  updateStrings (pe.toString ());
358              }
359          else
360          {
361              // reset in case this StringBean is used as a visitor
362              // on another parser, not it's own
363              mStrings = null;
364              mBuffer = new StringBuffer (4096);
365          }
366      }
367  
368      /**
369       * Refetch the URL contents.
370       * Only need to worry if there is already a valid parser and it's
371       * been spent fetching the string contents.
372       */
373      private void resetStrings ()
374      {
375          if (null != mStrings)
376              try
377              {
378                  mParser.setURL (getURL ());
379                  setStrings ();
380              }
381              catch (ParserException pe)
382              {
383                  updateStrings (pe.toString ());
384              }
385      }
386  
387      //
388      // Property change support.
389      //
390  
391      /**
392       * Add a PropertyChangeListener to the listener list.
393       * The listener is registered for all properties.
394       * @param listener The PropertyChangeListener to be added.
395       */
396      public void addPropertyChangeListener (PropertyChangeListener listener)
397      {
398          mPropertySupport.addPropertyChangeListener (listener);
399      }
400  
401      /**
402       * Remove a PropertyChangeListener from the listener list.
403       * This removes a registered PropertyChangeListener.
404       * @param listener The PropertyChangeListener to be removed.
405       */
406      public void removePropertyChangeListener (PropertyChangeListener listener)
407      {
408          mPropertySupport.removePropertyChangeListener (listener);
409      }
410  
411      //
412      // Properties
413      //
414  
415      /**
416       * Return the textual contents of the URL.
417       * This is the primary output of the bean.
418       * @return The user visible (what would be seen in a browser) text.
419       */
420      public String getStrings ()
421      {
422          if (null == mStrings)
423          if (0 == mBuffer.length ())
424              setStrings ();
425          else
426              updateStrings (mBuffer.toString ());
427  
428          return (mStrings);
429      }
430  
431      /**
432       * Get the current 'include links' state.
433       * @return <code>true</code> if link text is included in the text extracted
434       * from the URL, <code>false</code> otherwise.
435       */
436      public boolean getLinks ()
437      {
438          return (mLinks);
439      }
440  
441      /**
442       * Set the 'include links' state.
443       * If the setting is changed after the URL has been set, the text from the
444       * URL will be reacquired, which is possibly expensive.
445       * @param links Use <code>true</code> if link text is to be included in the
446       * text extracted from the URL, <code>false</code> otherwise.
447       */
448      public void setLinks (boolean links)
449      {
450          boolean oldValue = mLinks;
451          if (oldValue != links)
452          {
453              mLinks = links;
454              mPropertySupport.firePropertyChange (
455                  PROP_LINKS_PROPERTY, oldValue, links);
456              resetStrings ();
457          }
458      }
459  
460      /**
461       * Get the current URL.
462       * @return The URL from which text has been extracted, or <code>null</code>
463       * if this property has not been set yet.
464       */
465      public String getURL ()
466      {
467           return ((null != mParser) ? mParser.getURL () : null);
468      }
469  
470      /**
471       * Set the URL to extract strings from.
472       * The text from the URL will be fetched, which may be expensive, so this
473       * property should be set last.
474       * @param url The URL that text should be fetched from.
475       */
476      public void setURL (String url)
477      {
478          String old;
479          URLConnection conn;
480  
481          old = getURL ();
482          conn = getConnection ();
483          if (((null == old) && (null != url)) || ((null != old)
484              && !old.equals (url)))
485          {
486              try
487              {
488                  if (null == mParser)
489                      mParser = new Parser (url);
490                  else
491                      mParser.setURL (url);
492                  mPropertySupport.firePropertyChange (
493                      PROP_URL_PROPERTY, old, getURL ());
494                  mPropertySupport.firePropertyChange (
495                      PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
496                  setStrings ();
497              }
498              catch (ParserException pe)
499              {
500                  updateStrings (pe.toString ());
501              }
502          }
503      }
504  
505      /**
506       * Get the current 'replace non breaking spaces' state.
507       * @return <code>true</code> if non-breaking spaces (character '&#92;u00a0',
508       * numeric character reference &amp;#160; or character entity
509       * reference &amp;nbsp;) are to be replaced with normal
510       * spaces (character '&#92;u0020').
511       */
512      public boolean getReplaceNonBreakingSpaces ()
513      {
514          return (mReplaceSpace);
515      }
516  
517      /**
518       * Set the 'replace non breaking spaces' state.
519       * If the setting is changed after the URL has been set, the text from the
520       * URL will be reacquired, which is possibly expensive.
521       * @param replace <code>true</code> if non-breaking spaces
522       * (character '&#92;u00a0', numeric character reference &amp;#160;
523       * or character entity reference &amp;nbsp;) are to be replaced with normal
524       * spaces (character '&#92;u0020').
525       */
526      public void setReplaceNonBreakingSpaces (boolean replace)
527      {
528          boolean oldValue = mReplaceSpace;
529          if (oldValue != replace)
530          {
531              mReplaceSpace = replace;
532              mPropertySupport.firePropertyChange (PROP_REPLACE_SPACE_PROPERTY,
533                  oldValue, replace);
534              resetStrings ();
535          }
536      }
537  
538      /**
539       * Get the current 'collapse whitespace' state.
540       * If set to <code>true</code> this emulates the operation of browsers
541       * in interpretting text where <quote>user agents should collapse input
542       * white space sequences when producing output inter-word space</quote>.
543       * See HTML specification section 9.1 White space
544       * <a href="http://www.w3.org/TR/html4/struct/text.html#h-9.1">
545       * http://www.w3.org/TR/html4/struct/text.html#h-9.1</a>.
546       * @return <code>true</code> if sequences of whitespace (space '&#92;u0020',
547       * tab '&#92;u0009', form feed '&#92;u000C', zero-width space '&#92;u200B',
548       * carriage-return '\r' and NEWLINE '\n') are to be replaced with a single
549       * space.
550       */
551      public boolean getCollapse ()
552      {
553          return (mCollapse);
554      }
555  
556      /**
557       * Set the current 'collapse whitespace' state.
558       * If the setting is changed after the URL has been set, the text from the
559       * URL will be reacquired, which is possibly expensive.
560       * @param collapse If <code>true</code>, sequences of whitespace
561       * will be reduced to a single space.
562       */
563      public void setCollapse (boolean collapse)
564      {
565          boolean oldValue = mCollapse;
566          if (oldValue != collapse)
567          {
568              mCollapse = collapse;
569              mPropertySupport.firePropertyChange (
570                      PROP_COLLAPSE_PROPERTY, oldValue, collapse);
571              resetStrings ();
572          }
573      }
574  
575      /**
576       * Get the current connection.
577       * @return The connection that the parser has or <code>null</code> if it
578       * hasn't been set or the parser hasn't been constructed yet.
579       */
580      public URLConnection getConnection ()
581      {
582          return ((null != mParser) ? mParser.getConnection () : null);
583      }
584  
585      /**
586       * Set the parser's connection.
587       * The text from the URL will be fetched, which may be expensive, so this
588       * property should be set last.
589       * @param connection New value of property Connection.
590       */
591      public void setConnection (URLConnection connection)
592      {
593          String url;
594          URLConnection conn;
595  
596          url = getURL ();
597          conn = getConnection ();
598          if (((null == conn) && (null != connection))
599              || ((null != conn) && !conn.equals (connection)))
600          {
601              try
602              {
603                  if (null == mParser)
604                      mParser = new Parser (connection);
605                  else
606                      mParser.setConnection (connection);
607                  mPropertySupport.firePropertyChange (
608                      PROP_URL_PROPERTY, url, getURL ());
609                  mPropertySupport.firePropertyChange (
610                      PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
611                  setStrings ();
612              }
613              catch (ParserException pe)
614              {
615                  updateStrings (pe.toString ());
616              }
617          }
618      }
619  
620      //
621      // NodeVisitor overrides
622      //
623  
624      /**
625       * Appends the text to the output.
626       * @param string The text node.
627       */
628      public void visitStringNode (Text string)
629      {
630          if (!mIsScript && !mIsStyle)
631          {
632              String text = string.getText ();
633              if (!mIsPre)
634              {
635                  text = Translate.decode (text);
636                  if (getReplaceNonBreakingSpaces ())
637                      text = text.replace ('\u00a0', ' ');
638                  if (getCollapse ())
639                      collapse (mBuffer, text);
640                  else
641                      mBuffer.append (text);
642              }
643              else
644                  mBuffer.append (text);
645          }
646      }
647  
648      /**
649       * Appends a NEWLINE to the output if the tag breaks flow, and
650       * possibly sets the state of the PRE and SCRIPT flags.
651       * @param tag The tag to examine.
652       */
653      public void visitTag (Tag tag)
654      {
655          String name;
656  
657          if (tag instanceof LinkTag)
658              if (getLinks ())
659              { // appends the link as text between angle brackets to the output.
660                  mBuffer.append ("<");
661                  mBuffer.append (((LinkTag)tag).getLink ());
662                  mBuffer.append (">");
663              }
664          name = tag.getTagName ();
665          if (name.equalsIgnoreCase ("PRE"))
666              mIsPre = true;
667          else if (name.equalsIgnoreCase ("SCRIPT"))
668              mIsScript = true;
669          else if (name.equalsIgnoreCase ("STYLE"))
670              mIsStyle = true;
671          if (tag.breaksFlow ())
672              carriageReturn ();
673      }
674  
675      /**
676       * Resets the state of the PRE and SCRIPT flags.
677       * @param tag The end tag to process.
678       */
679      public void visitEndTag (Tag tag)
680      {
681          String name;
682  
683          name = tag.getTagName ();
684          if (name.equalsIgnoreCase ("PRE"))
685              mIsPre = false;
686          else if (name.equalsIgnoreCase ("SCRIPT"))
687              mIsScript = false;
688          else if (name.equalsIgnoreCase ("STYLE"))
689              mIsStyle = false;
690      }
691  
692      /**
693       * Unit test.
694       * @param args Pass arg[0] as the URL to process.
695       */
696      public static void main (String[] args)
697      {
698          if (0 >= args.length)
699              System.out.println ("Usage: java -classpath htmlparser.jar"
700                  + " org.htmlparser.beans.StringBean <http://whatever_url>");
701          else
702          {
703              StringBean sb = new StringBean ();
704              sb.setLinks (false);
705              sb.setReplaceNonBreakingSpaces (true);
706              sb.setCollapse (true);
707              sb.setURL (args[0]);
708              System.out.println (sb.getStrings ());
709          }
710      }
711  }