/ org.htmlparser / src / org / htmlparser / beans / FilterBean.java
FilterBean.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2004 Derrick Oswald
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/FilterBean.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2005/09/18 23:40:44 $
 10  // $Revision: 1.4 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser.beans;
 28  
 29  import java.beans.PropertyChangeListener;
 30  import java.beans.PropertyChangeSupport;
 31  import java.io.Serializable;
 32  import java.net.URLConnection;
 33  
 34  import org.htmlparser.NodeFilter;
 35  import org.htmlparser.Parser;
 36  import org.htmlparser.util.NodeList;
 37  import org.htmlparser.util.ParserException;
 38  import org.htmlparser.util.EncodingChangeException;
 39  
 40  /**
 41   * Extract nodes from a URL using a filter.
 42   * <pre>
 43   * <code>
 44   *     FilterBean fb = new FilterBean ("http://cbc.ca");
 45   *     fb.setFilters (new NodeFilter[] { new TagNameFilter ("META") });
 46   *     fb.setURL ("http://cbc.ca");
 47   *     System.out.println (fb.getNodes ().toHtml ());
 48   * </code>
 49   * </pre>
 50   */
 51  public class FilterBean
 52      implements
 53          Serializable
 54  {
 55      /**
 56       * Property name in event where the URL contents changes.
 57       */
 58      public static final String PROP_NODES_PROPERTY = "nodes";
 59  
 60      /**
 61       * Property name in event where the URL contents changes.
 62       */
 63      public static final String PROP_TEXT_PROPERTY = "text";
 64  
 65      /**
 66       * Property name in event where the URL changes.
 67       */
 68      public static final String PROP_URL_PROPERTY = "URL";
 69  
 70      /**
 71       * Property name in event where the connection changes.
 72       */
 73      public static final String PROP_CONNECTION_PROPERTY = "connection";
 74  
 75      /**
 76       * Bound property support.
 77       */
 78      protected PropertyChangeSupport mPropertySupport;
 79  
 80      /**
 81       * The parser used to filter.
 82       */
 83      protected Parser mParser;
 84  
 85      /**
 86       * The filter set.
 87       */
 88      protected NodeFilter[] mFilters;
 89  
 90      /**
 91       * The nodes extracted from the URL.
 92       */
 93      protected NodeList mNodes;
 94  
 95      /**
 96       * The recursion behaviour for elements of the filter array.
 97       * If <code>true</code> the filters are applied recursively.
 98       * @see org.htmlparser.util.NodeList#extractAllNodesThatMatch(NodeFilter, boolean).
 99       */
100      protected boolean mRecursive;
101  
102     /**
103       * Create a FilterBean object.
104       */
105      public FilterBean ()
106      {
107          mPropertySupport = new PropertyChangeSupport (this);
108          mParser = new Parser ();
109          mFilters = null;
110          mNodes = null;
111          mRecursive = true;
112      }
113  
114      //
115      // internals
116      //
117  
118      /**
119       * Assign the <code>Nodes</code> property, firing the property change.
120       * @param nodes The new value of the <code>Nodes</code> property.
121       */
122      protected void updateNodes (NodeList nodes)
123      {
124          NodeList oldValue;
125          String oldText;
126          String newText;
127  
128          if ((null == mNodes) || !mNodes.equals (nodes))
129          {
130              oldValue = mNodes;
131              if (null != oldValue)
132                  oldText = getText ();
133              else
134                  oldText = "";
135              if (null == oldText)
136                  oldText = "";
137              mNodes = nodes;
138              if (null != mNodes) // TODO: fix this null problem
139                  newText = getText ();
140              else // StringBean finds no nodes
141                  newText = "";
142              if (null == newText)
143                  newText = "";
144              mPropertySupport.firePropertyChange (
145                  PROP_NODES_PROPERTY, oldValue, nodes);
146              if (!newText.equals (oldText))
147                  mPropertySupport.firePropertyChange (
148                      PROP_TEXT_PROPERTY, oldText, newText);
149          }
150      }
151  
152      /**
153       * Apply each of the filters.
154       * The first filter is applied to the output of the parser.
155       * Subsequent filters are applied to the output of the prior filter.
156       * @return A list of nodes passed through all filters.
157       * If there are no filters, returns the entire page.
158       * @throws ParserException If an encoding change occurs
159       * or there is some other problem.
160       */
161      protected NodeList applyFilters ()
162          throws
163              ParserException
164      {
165          NodeFilter[] filters;
166          NodeList ret;
167  
168          ret = mParser.parse (null);
169          filters = getFilters ();
170          if (null != filters)
171              for (int i = 0; i < filters.length; i++)
172                  ret = ret.extractAllNodesThatMatch (filters[i], mRecursive);
173  
174          return (ret);
175      }
176  
177      /**
178       * Fetch the URL contents and filter it.
179       * Only do work if there is a valid parser with it's URL set.
180       */
181      protected void setNodes ()
182      {
183          NodeList list;
184  
185          if (null != getURL ())
186              try
187              {
188                  list = applyFilters ();
189                  updateNodes (list);
190              }
191              catch (EncodingChangeException ece)
192              {
193                  try
194                  {   // try again with the encoding now in force
195                      mParser.reset ();
196                      list = applyFilters ();
197                      updateNodes (list);
198                  }
199                  catch (ParserException pe)
200                  {
201                      updateNodes (new NodeList ());
202                  }
203               }
204              catch (ParserException pe)
205              {
206                  updateNodes (new NodeList ());
207              }
208      }
209  
210      //
211      // Property change support.
212      //
213  
214      /**
215       * Add a PropertyChangeListener to the listener list.
216       * The listener is registered for all properties.
217       * @param listener The PropertyChangeListener to be added.
218       */
219      public void addPropertyChangeListener (PropertyChangeListener listener)
220      {
221          mPropertySupport.addPropertyChangeListener (listener);
222      }
223  
224      /**
225       * Remove a PropertyChangeListener from the listener list.
226       * This removes a registered PropertyChangeListener.
227       * @param listener The PropertyChangeListener to be removed.
228       */
229      public void removePropertyChangeListener (PropertyChangeListener listener)
230      {
231          mPropertySupport.removePropertyChangeListener (listener);
232      }
233  
234      //
235      // Properties
236      //
237  
238      /**
239       * Return the nodes of the URL matching the filter.
240       * This is the primary output of the bean.
241       * @return The nodes from the URL matching the current filter.
242       */
243      public NodeList getNodes ()
244      {
245          if (null == mNodes)
246              setNodes ();
247  
248          return (mNodes);
249      }
250  
251      /**
252       * Get the current URL.
253       * @return The URL from which text has been extracted, or <code>null</code>
254       * if this property has not been set yet.
255       */
256      public String getURL ()
257      {
258           return ((null != mParser) ? mParser.getURL () : null);
259      }
260  
261      /**
262       * Set the URL to extract strings from.
263       * The text from the URL will be fetched, which may be expensive, so this
264       * property should be set last.
265       * @param url The URL that text should be fetched from.
266       */
267      public void setURL (String url)
268      {
269          String old;
270          URLConnection conn;
271  
272          old = getURL ();
273          conn = getConnection ();
274          if (((null == old) && (null != url)) || ((null != old)
275              && !old.equals (url)))
276          {
277              try
278              {
279                  if (null == mParser)
280                      mParser = new Parser (url);
281                  else
282                      mParser.setURL (url);
283                  mPropertySupport.firePropertyChange (
284                      PROP_URL_PROPERTY, old, getURL ());
285                  mPropertySupport.firePropertyChange (
286                      PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
287                  setNodes ();
288              }
289              catch (ParserException pe)
290              {
291                  updateNodes (new NodeList ());
292              }
293          }
294      }
295  
296      /**
297       * Get the current connection.
298       * @return The connection that the parser has or <code>null</code> if it
299       * hasn't been set or the parser hasn't been constructed yet.
300       */
301      public URLConnection getConnection ()
302      {
303          return ((null != mParser) ? mParser.getConnection () : null);
304      }
305  
306      /**
307       * Set the parser's connection.
308       * The text from the URL will be fetched, which may be expensive, so this
309       * property should be set last.
310       * @param connection New value of property Connection.
311       */
312      public void setConnection (URLConnection connection)
313      {
314          String url;
315          URLConnection conn;
316  
317          url = getURL ();
318          conn = getConnection ();
319          if (((null == conn) && (null != connection)) || ((null != conn)
320              && !conn.equals (connection)))
321          {
322              try
323              {
324                  if (null == mParser)
325                      mParser = new Parser (connection);
326                  else
327                      mParser.setConnection (connection);
328                  mPropertySupport.firePropertyChange (
329                      PROP_URL_PROPERTY, url, getURL ());
330                  mPropertySupport.firePropertyChange (
331                      PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
332                  setNodes ();
333              }
334              catch (ParserException pe)
335              {
336                  updateNodes (new NodeList ());
337              }
338          }
339      }
340  
341      /**
342       * Get the current filter set.
343       * @return The current filters.
344       */
345      public NodeFilter[] getFilters ()
346      {
347          return (mFilters);
348      }
349  
350      /**
351       * Set the filters for the bean.
352       * If the parser has been set, it is reset and
353       * the nodes are refetched with the new filters.
354       * @param filters The filter set to use.
355       */
356      public void setFilters (NodeFilter[] filters)
357      {
358          mFilters = filters;
359          if (null != getParser ())
360          {
361              getParser ().reset ();
362              setNodes ();
363          }
364      }
365  
366      /**
367       * Get the parser used to fetch nodes.
368       * @return The parser used by the bean.
369       */
370      public Parser getParser ()
371      {
372          return (mParser);
373      }
374  
375      /**
376       * Set the parser for the bean.
377       * The parser is used immediately to fetch the nodes,
378       * which for a null filter means all the nodes
379       * @param parser The parser to use.
380       */
381      public void setParser (Parser parser)
382      {
383          mParser = parser;
384          if (null != getFilters ())
385              setNodes ();
386      }
387  
388      /**
389       * Convenience method to apply a {@link StringBean} to the filter results.
390       * This may yield duplicate or multiple text elements if the node list
391       * contains nodes from two or more levels in the same nested tag heirarchy,
392       * but if the node list contains only one tag, it provides access to the
393       * text within the node.
394       * @return The textual contents of the nodes that pass through the filter set,
395       * as collected by the StringBean. 
396       */
397      public String getText ()
398      {
399          NodeList list;
400          StringBean sb;
401          String ret;
402  
403          list = getNodes ();
404          if (0 != list.size ())
405          {
406              sb = new StringBean ();
407              for (int i = 0; i < list.size (); i++)
408                  list.elementAt (i).accept (sb);
409              ret = sb.getStrings ();
410          }
411          else
412              ret = "";
413          
414          return (ret);
415      }
416  
417      /**
418       * Get the current recursion behaviour.
419       * @return The recursion (applies to children, children's children, etc)
420       * behavior currently being used.
421       */
422      public boolean getRecursive ()
423      {
424          return (mRecursive);
425      }
426  
427      /**
428       * Set the recursion behaviour.
429       * @param recursive If <code>true</code> the
430       * <code>extractAllNodesThatMatch()</code> call is performed recursively.
431       * @see org.htmlparser.util.NodeList#extractAllNodesThatMatch(NodeFilter, boolean).
432       */
433      public void setRecursive (boolean recursive)
434      {
435          mRecursive = recursive;
436      }
437  
438      /**
439       * Unit test.
440       * @param args Pass arg[0] as the URL to process,
441       * and optionally a node name for filtering.
442       */
443      public static void main (String[] args)
444      {
445          if (0 >= args.length)
446              System.out.println ("Usage: java -classpath htmlparser.jar org.htmlparser.beans.FilterBean <http://whatever_url> [node name]");
447          else
448          {
449              FilterBean fb = new FilterBean ();
450              if (1 < args.length)
451                  fb.setFilters (new NodeFilter[] { new org.htmlparser.filters.TagNameFilter (args[1]) });
452              fb.setURL (args[0]);
453              //System.out.println (fb.getNodes ().toHtml ());
454              System.out.println (fb.getText ());
455          }
456      }
457  }