FilterBean.java
1 // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML 2 // http://sourceforge.org/projects/htmlparser 3 // Copyright (C) 2004 Derrick Oswald 4 // 5 // Revision Control Information 6 // 7 // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/FilterBean.java,v $ 8 // $Author: derrickoswald $ 9 // $Date: 2005/09/18 23:40:44 $ 10 // $Revision: 1.4 $ 11 // 12 // This library is free software; you can redistribute it and/or 13 // modify it under the terms of the GNU Lesser General Public 14 // License as published by the Free Software Foundation; either 15 // version 2.1 of the License, or (at your option) any later version. 16 // 17 // This library is distributed in the hope that it will be useful, 18 // but WITHOUT ANY WARRANTY; without even the implied warranty of 19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 // Lesser General Public License for more details. 21 // 22 // You should have received a copy of the GNU Lesser General Public 23 // License along with this library; if not, write to the Free Software 24 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 // 26 27 package org.htmlparser.beans; 28 29 import java.beans.PropertyChangeListener; 30 import java.beans.PropertyChangeSupport; 31 import java.io.Serializable; 32 import java.net.URLConnection; 33 34 import org.htmlparser.NodeFilter; 35 import org.htmlparser.Parser; 36 import org.htmlparser.util.NodeList; 37 import org.htmlparser.util.ParserException; 38 import org.htmlparser.util.EncodingChangeException; 39 40 /** 41 * Extract nodes from a URL using a filter. 42 * <pre> 43 * <code> 44 * FilterBean fb = new FilterBean ("http://cbc.ca"); 45 * fb.setFilters (new NodeFilter[] { new TagNameFilter ("META") }); 46 * fb.setURL ("http://cbc.ca"); 47 * System.out.println (fb.getNodes ().toHtml ()); 48 * </code> 49 * </pre> 50 */ 51 public class FilterBean 52 implements 53 Serializable 54 { 55 /** 56 * Property name in event where the URL contents changes. 57 */ 58 public static final String PROP_NODES_PROPERTY = "nodes"; 59 60 /** 61 * Property name in event where the URL contents changes. 62 */ 63 public static final String PROP_TEXT_PROPERTY = "text"; 64 65 /** 66 * Property name in event where the URL changes. 67 */ 68 public static final String PROP_URL_PROPERTY = "URL"; 69 70 /** 71 * Property name in event where the connection changes. 72 */ 73 public static final String PROP_CONNECTION_PROPERTY = "connection"; 74 75 /** 76 * Bound property support. 77 */ 78 protected PropertyChangeSupport mPropertySupport; 79 80 /** 81 * The parser used to filter. 82 */ 83 protected Parser mParser; 84 85 /** 86 * The filter set. 87 */ 88 protected NodeFilter[] mFilters; 89 90 /** 91 * The nodes extracted from the URL. 92 */ 93 protected NodeList mNodes; 94 95 /** 96 * The recursion behaviour for elements of the filter array. 97 * If <code>true</code> the filters are applied recursively. 98 * @see org.htmlparser.util.NodeList#extractAllNodesThatMatch(NodeFilter, boolean). 99 */ 100 protected boolean mRecursive; 101 102 /** 103 * Create a FilterBean object. 104 */ 105 public FilterBean () 106 { 107 mPropertySupport = new PropertyChangeSupport (this); 108 mParser = new Parser (); 109 mFilters = null; 110 mNodes = null; 111 mRecursive = true; 112 } 113 114 // 115 // internals 116 // 117 118 /** 119 * Assign the <code>Nodes</code> property, firing the property change. 120 * @param nodes The new value of the <code>Nodes</code> property. 121 */ 122 protected void updateNodes (NodeList nodes) 123 { 124 NodeList oldValue; 125 String oldText; 126 String newText; 127 128 if ((null == mNodes) || !mNodes.equals (nodes)) 129 { 130 oldValue = mNodes; 131 if (null != oldValue) 132 oldText = getText (); 133 else 134 oldText = ""; 135 if (null == oldText) 136 oldText = ""; 137 mNodes = nodes; 138 if (null != mNodes) // TODO: fix this null problem 139 newText = getText (); 140 else // StringBean finds no nodes 141 newText = ""; 142 if (null == newText) 143 newText = ""; 144 mPropertySupport.firePropertyChange ( 145 PROP_NODES_PROPERTY, oldValue, nodes); 146 if (!newText.equals (oldText)) 147 mPropertySupport.firePropertyChange ( 148 PROP_TEXT_PROPERTY, oldText, newText); 149 } 150 } 151 152 /** 153 * Apply each of the filters. 154 * The first filter is applied to the output of the parser. 155 * Subsequent filters are applied to the output of the prior filter. 156 * @return A list of nodes passed through all filters. 157 * If there are no filters, returns the entire page. 158 * @throws ParserException If an encoding change occurs 159 * or there is some other problem. 160 */ 161 protected NodeList applyFilters () 162 throws 163 ParserException 164 { 165 NodeFilter[] filters; 166 NodeList ret; 167 168 ret = mParser.parse (null); 169 filters = getFilters (); 170 if (null != filters) 171 for (int i = 0; i < filters.length; i++) 172 ret = ret.extractAllNodesThatMatch (filters[i], mRecursive); 173 174 return (ret); 175 } 176 177 /** 178 * Fetch the URL contents and filter it. 179 * Only do work if there is a valid parser with it's URL set. 180 */ 181 protected void setNodes () 182 { 183 NodeList list; 184 185 if (null != getURL ()) 186 try 187 { 188 list = applyFilters (); 189 updateNodes (list); 190 } 191 catch (EncodingChangeException ece) 192 { 193 try 194 { // try again with the encoding now in force 195 mParser.reset (); 196 list = applyFilters (); 197 updateNodes (list); 198 } 199 catch (ParserException pe) 200 { 201 updateNodes (new NodeList ()); 202 } 203 } 204 catch (ParserException pe) 205 { 206 updateNodes (new NodeList ()); 207 } 208 } 209 210 // 211 // Property change support. 212 // 213 214 /** 215 * Add a PropertyChangeListener to the listener list. 216 * The listener is registered for all properties. 217 * @param listener The PropertyChangeListener to be added. 218 */ 219 public void addPropertyChangeListener (PropertyChangeListener listener) 220 { 221 mPropertySupport.addPropertyChangeListener (listener); 222 } 223 224 /** 225 * Remove a PropertyChangeListener from the listener list. 226 * This removes a registered PropertyChangeListener. 227 * @param listener The PropertyChangeListener to be removed. 228 */ 229 public void removePropertyChangeListener (PropertyChangeListener listener) 230 { 231 mPropertySupport.removePropertyChangeListener (listener); 232 } 233 234 // 235 // Properties 236 // 237 238 /** 239 * Return the nodes of the URL matching the filter. 240 * This is the primary output of the bean. 241 * @return The nodes from the URL matching the current filter. 242 */ 243 public NodeList getNodes () 244 { 245 if (null == mNodes) 246 setNodes (); 247 248 return (mNodes); 249 } 250 251 /** 252 * Get the current URL. 253 * @return The URL from which text has been extracted, or <code>null</code> 254 * if this property has not been set yet. 255 */ 256 public String getURL () 257 { 258 return ((null != mParser) ? mParser.getURL () : null); 259 } 260 261 /** 262 * Set the URL to extract strings from. 263 * The text from the URL will be fetched, which may be expensive, so this 264 * property should be set last. 265 * @param url The URL that text should be fetched from. 266 */ 267 public void setURL (String url) 268 { 269 String old; 270 URLConnection conn; 271 272 old = getURL (); 273 conn = getConnection (); 274 if (((null == old) && (null != url)) || ((null != old) 275 && !old.equals (url))) 276 { 277 try 278 { 279 if (null == mParser) 280 mParser = new Parser (url); 281 else 282 mParser.setURL (url); 283 mPropertySupport.firePropertyChange ( 284 PROP_URL_PROPERTY, old, getURL ()); 285 mPropertySupport.firePropertyChange ( 286 PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ()); 287 setNodes (); 288 } 289 catch (ParserException pe) 290 { 291 updateNodes (new NodeList ()); 292 } 293 } 294 } 295 296 /** 297 * Get the current connection. 298 * @return The connection that the parser has or <code>null</code> if it 299 * hasn't been set or the parser hasn't been constructed yet. 300 */ 301 public URLConnection getConnection () 302 { 303 return ((null != mParser) ? mParser.getConnection () : null); 304 } 305 306 /** 307 * Set the parser's connection. 308 * The text from the URL will be fetched, which may be expensive, so this 309 * property should be set last. 310 * @param connection New value of property Connection. 311 */ 312 public void setConnection (URLConnection connection) 313 { 314 String url; 315 URLConnection conn; 316 317 url = getURL (); 318 conn = getConnection (); 319 if (((null == conn) && (null != connection)) || ((null != conn) 320 && !conn.equals (connection))) 321 { 322 try 323 { 324 if (null == mParser) 325 mParser = new Parser (connection); 326 else 327 mParser.setConnection (connection); 328 mPropertySupport.firePropertyChange ( 329 PROP_URL_PROPERTY, url, getURL ()); 330 mPropertySupport.firePropertyChange ( 331 PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ()); 332 setNodes (); 333 } 334 catch (ParserException pe) 335 { 336 updateNodes (new NodeList ()); 337 } 338 } 339 } 340 341 /** 342 * Get the current filter set. 343 * @return The current filters. 344 */ 345 public NodeFilter[] getFilters () 346 { 347 return (mFilters); 348 } 349 350 /** 351 * Set the filters for the bean. 352 * If the parser has been set, it is reset and 353 * the nodes are refetched with the new filters. 354 * @param filters The filter set to use. 355 */ 356 public void setFilters (NodeFilter[] filters) 357 { 358 mFilters = filters; 359 if (null != getParser ()) 360 { 361 getParser ().reset (); 362 setNodes (); 363 } 364 } 365 366 /** 367 * Get the parser used to fetch nodes. 368 * @return The parser used by the bean. 369 */ 370 public Parser getParser () 371 { 372 return (mParser); 373 } 374 375 /** 376 * Set the parser for the bean. 377 * The parser is used immediately to fetch the nodes, 378 * which for a null filter means all the nodes 379 * @param parser The parser to use. 380 */ 381 public void setParser (Parser parser) 382 { 383 mParser = parser; 384 if (null != getFilters ()) 385 setNodes (); 386 } 387 388 /** 389 * Convenience method to apply a {@link StringBean} to the filter results. 390 * This may yield duplicate or multiple text elements if the node list 391 * contains nodes from two or more levels in the same nested tag heirarchy, 392 * but if the node list contains only one tag, it provides access to the 393 * text within the node. 394 * @return The textual contents of the nodes that pass through the filter set, 395 * as collected by the StringBean. 396 */ 397 public String getText () 398 { 399 NodeList list; 400 StringBean sb; 401 String ret; 402 403 list = getNodes (); 404 if (0 != list.size ()) 405 { 406 sb = new StringBean (); 407 for (int i = 0; i < list.size (); i++) 408 list.elementAt (i).accept (sb); 409 ret = sb.getStrings (); 410 } 411 else 412 ret = ""; 413 414 return (ret); 415 } 416 417 /** 418 * Get the current recursion behaviour. 419 * @return The recursion (applies to children, children's children, etc) 420 * behavior currently being used. 421 */ 422 public boolean getRecursive () 423 { 424 return (mRecursive); 425 } 426 427 /** 428 * Set the recursion behaviour. 429 * @param recursive If <code>true</code> the 430 * <code>extractAllNodesThatMatch()</code> call is performed recursively. 431 * @see org.htmlparser.util.NodeList#extractAllNodesThatMatch(NodeFilter, boolean). 432 */ 433 public void setRecursive (boolean recursive) 434 { 435 mRecursive = recursive; 436 } 437 438 /** 439 * Unit test. 440 * @param args Pass arg[0] as the URL to process, 441 * and optionally a node name for filtering. 442 */ 443 public static void main (String[] args) 444 { 445 if (0 >= args.length) 446 System.out.println ("Usage: java -classpath htmlparser.jar org.htmlparser.beans.FilterBean <http://whatever_url> [node name]"); 447 else 448 { 449 FilterBean fb = new FilterBean (); 450 if (1 < args.length) 451 fb.setFilters (new NodeFilter[] { new org.htmlparser.filters.TagNameFilter (args[1]) }); 452 fb.setURL (args[0]); 453 //System.out.println (fb.getNodes ().toHtml ()); 454 System.out.println (fb.getText ()); 455 } 456 } 457 }