// Parser.java
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v $
// $Author: derrickoswald $
// $Date: 2006/03/20 00:26:01 $
// $Revision: 1.111 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser;

import java.io.Serializable;
import java.net.HttpURLConnection;
import java.net.URLConnection;

import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.http.ConnectionMonitor;
import org.htmlparser.http.HttpHeader;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.IteratorImpl;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserFeedback;
import org.htmlparser.visitors.NodeVisitor;

/**
 * The main parser class.
 * This is the primary class of the HTML Parser library. It provides
 * constructors that take a {@link #Parser(String) String},
 * a {@link #Parser(URLConnection) URLConnection}, or a
 * {@link #Parser(Lexer) Lexer}. In the case of a String, an
 * attempt is made to open it as a URL, and if that fails it assumes it is a
 * local disk file. If you want to actually parse a String, use
 * {@link #setInputHTML setInputHTML()} after using the
 * {@link #Parser() no-args} constructor, or use {@link #createParser}.
 * <p>The Parser provides access to the contents of the
 * page, via a {@link #elements() NodeIterator}, a
 * {@link #parse(NodeFilter) NodeList} or a
 * {@link #visitAllNodesWith NodeVisitor}.
 * <p>Typical usage of the parser is:
 * <code>
 * <pre>
 * Parser parser = new Parser ("http://whatever");
 * NodeList list = parser.parse (null);
 * // do something with your list of nodes.
 * </pre>
 * </code></p>
 * <p>What types of nodes and what can be done with them is dependent on the
 * setup, but in general a node can be converted back to HTML and its
 * children (enclosed nodes) and parent can be obtained, because nodes are
 * nested. See the {@link Node} interface.</p>
 * <p>For example, if the URL contains:<br>
 * <code>
 * {@.html
 * <html>
 * <head>
 * <title>Mondays -- What a bad idea.</title>
 * </head>
 * <body BGCOLOR="#FFFFFF">
 * Most people have a pathological hatred of Mondays...
 * </body>
 * </html>}
 * </code><br>
 * and the example code above is used, the list contains only one element, the
 * {@.html <html>} node. This node is a {@link org.htmlparser.tags tag},
 * which is an object of class
 * {@link org.htmlparser.tags.Html Html} if the default {@link NodeFactory}
 * (a {@link PrototypicalNodeFactory}) is used.</p>
 * <p>To get at further content, the children of the top
 * level nodes must be examined. When digging through a node list one must be
 * conscious of the possibility of whitespace between nodes, e.g. in the example
 * above:
 * <code>
 * <pre>
 * Node node = list.elementAt (0);
 * NodeList sublist = node.getChildren ();
 * System.out.println (sublist.size ());
 * </pre>
 * </code>
 * would print out 5, not 2, because there are newlines after {@.html <html>},
 * {@.html </head>} and {@.html </body>} that are children of the HTML node
 * besides the {@.html <head>} and {@.html <body>} nodes.</p>
 * <p>Because processing nodes is so common, two interfaces are provided to
 * ease this task, {@link org.htmlparser.filters filters}
 * and {@link org.htmlparser.visitors visitors}.
 */
public class Parser
    implements
        Serializable,
        ConnectionMonitor
{
    // Please don't change the formatting of the version variables below.
    // This is done so as to facilitate ant script processing.

    /**
     * The floating point version number ({@value}).
     */
    public static final double
    VERSION_NUMBER = 1.6
    ;

    /**
     * The type of version ({@value}).
     */
    public static final String
    VERSION_TYPE = "Integration Build"
    ;

    /**
     * The date of the version ({@value}).
     */
    public static final String
    VERSION_DATE = "Mar 19, 2006"
    ;

    // End of formatting

    /**
     * The display version ({@value}).
     */
    public static final String VERSION_STRING =
        "" + VERSION_NUMBER
        + " (" + VERSION_TYPE + " " + VERSION_DATE + ")";

    /**
     * Feedback object.
     * Never <code>null</code> once a constructor has run, since
     * {@link #setFeedback} substitutes {@link #DEVNULL} for <code>null</code>.
     */
    protected ParserFeedback mFeedback;

    /**
     * The html lexer associated with this parser.
     */
    protected Lexer mLexer;

    /**
     * A quiet message sink.
     * Use this for no feedback.
     */
    public static final ParserFeedback DEVNULL =
        new DefaultParserFeedback (DefaultParserFeedback.QUIET);

    /**
     * A verbose message sink.
     * Use this for output on <code>System.out</code>.
     */
    public static final ParserFeedback STDOUT = new DefaultParserFeedback ();

    //
    // Static methods
    //

    /**
     * Return the version string of this parser.
     * @return A string of the form:
     * <pre>
     * "[floating point number] ([build-type] [build-date])"
     * </pre>
     */
    public static String getVersion ()
    {
        return (VERSION_STRING);
    }

    /**
     * Return the version number of this parser.
     * @return A floating point number, the whole number part is the major
     * version, and the fractional part is the minor version.
     */
    public static double getVersionNumber ()
    {
        return (VERSION_NUMBER);
    }

    /**
     * Get the connection manager all Parsers use.
     * @return The connection manager.
     * @see #setConnectionManager
     */
    public static ConnectionManager getConnectionManager ()
    {
        return (Page.getConnectionManager ());
    }

    /**
     * Set the connection manager all Parsers use.
     * @param manager The new connection manager.
     * @see #getConnectionManager
     */
    public static void setConnectionManager (ConnectionManager manager)
    {
        Page.setConnectionManager (manager);
    }

    /**
     * Creates the parser on an input string.
     * @param html The string containing HTML.
     * @param charset <em>Optional</em>. The character set encoding that will
     * be reported by {@link #getEncoding}. If charset is <code>null</code>
     * the default character set is used.
     * @return A parser with the <code>html</code> string as input.
     * @throws IllegalArgumentException If <code>html</code> is
     * <code>null</code>.
     */
    public static Parser createParser (String html, String charset)
    {
        Parser ret;

        if (null == html)
            throw new IllegalArgumentException ("html cannot be null");
        ret = new Parser (new Lexer (new Page (html, charset)));

        return (ret);
    }

    //
    // Constructors
    //

    /**
     * Zero argument constructor.
     * The parser is in a safe but useless state parsing an empty string.
     * Set the lexer or connection using {@link #setLexer}
     * or {@link #setConnection}.
     * @see #setLexer(Lexer)
     * @see #setConnection(URLConnection)
     */
    public Parser ()
    {
        this (new Lexer (new Page ("")), DEVNULL);
    }

    /**
     * Construct a parser using the provided lexer and feedback object.
     * This would be used to create a parser for special cases where the
     * normal creation of a lexer on a URLConnection needs to be customized.
     * @param lexer The lexer to draw characters from.
     * @param fb The object to use when information,
     * warning and error messages are produced. If <em>null</em> no feedback
     * is provided.
     * @throws IllegalArgumentException If <code>lexer</code> is
     * <code>null</code>.
     */
    public Parser (Lexer lexer, ParserFeedback fb)
    {
        // the feedback object must be in place first: setLexer() may issue
        // a warning through getFeedback() about non-text content
        setFeedback (fb);
        if (null == lexer)
            throw new IllegalArgumentException ("lexer cannot be null");
        setLexer (lexer);
        setNodeFactory (new PrototypicalNodeFactory ());
    }

    /**
     * Constructor for custom HTTP access.
     * This would be used to create a parser for a URLConnection that needs
     * a special setup or negotiation conditioning beyond what is available
     * from the {@link #getConnectionManager ConnectionManager}.
     * @param connection A fully conditioned connection. The connect()
     * method will be called so it need not be connected yet.
     * @param fb The object to use for message communication.
     * @throws ParserException If the creation of the underlying Lexer
     * cannot be performed.
     */
    public Parser (URLConnection connection, ParserFeedback fb)
        throws
            ParserException
    {
        this (new Lexer (connection), fb);
    }

    /**
     * Creates a Parser object with the location of the resource (URL or file).
     * You would typically create a {@link DefaultParserFeedback} object and
     * pass it in.
     * @see #Parser(URLConnection,ParserFeedback)
     * @param resourceLocn Either the URL or the filename (autodetects).
     * A standard HTTP GET is performed to read the content of the URL.
     * @param feedback The {@link ParserFeedback} object to use when
     * information, warning and error messages are produced.
     * If <em>null</em> no feedback is provided.
     * @throws ParserException If the URL is invalid.
     */
    public Parser (String resourceLocn, ParserFeedback feedback)
        throws
            ParserException
    {
        this (getConnectionManager ().openConnection (resourceLocn), feedback);
    }

    /**
     * Creates a Parser object with the location of the resource (URL or file).
     * A feedback object printing to {@link #STDOUT System.out}
     * (a {@link DefaultParserFeedback}) is used.
     * @param resourceLocn Either the URL or the filename (autodetects).
     * @throws ParserException If the resourceLocn argument does not resolve
     * to a valid page or file.
     */
    public Parser (String resourceLocn) throws ParserException
    {
        this (resourceLocn, STDOUT);
    }

    /**
     * Construct a parser using the provided lexer.
     * A feedback object printing to {@link #STDOUT System.out} is used.
     * This would be used to create a parser for special cases where the
     * normal creation of a lexer on a URLConnection needs to be customized.
     * @param lexer The lexer to draw characters from.
     * @throws IllegalArgumentException If <code>lexer</code> is
     * <code>null</code>.
     */
    public Parser (Lexer lexer)
    {
        this (lexer, STDOUT);
    }

    /**
     * Construct a parser using the provided URLConnection.
     * This would be used to create a parser for a URLConnection that needs
     * a special setup or negotiation conditioning beyond what is available
     * from the {@link #getConnectionManager ConnectionManager}.
     * A feedback object printing to {@link #STDOUT System.out} is used.
     * @see #Parser(URLConnection,ParserFeedback)
     * @param connection A fully conditioned connection. The connect()
     * method will be called so it need not be connected yet.
     * @throws ParserException If the creation of the underlying Lexer
     * cannot be performed.
     */
    public Parser (URLConnection connection) throws ParserException
    {
        this (connection, STDOUT);
    }

    //
    // Bean patterns
    //

    /**
     * Set the connection for this parser.
     * This method creates a new <code>Lexer</code> reading from the connection.
     * @param connection A fully conditioned connection. The connect()
     * method will be called so it need not be connected yet.
     * @exception ParserException if the character set specified in the
     * HTTP header is not supported, or an i/o exception occurs creating the
     * lexer.
     * @throws IllegalArgumentException If <code>connection</code> is
     * <code>null</code>.
     * @see #setLexer
     * @see #getConnection
     */
    public void setConnection (URLConnection connection)
        throws
            ParserException
    {
        if (null == connection)
            throw new IllegalArgumentException ("connection cannot be null");
        setLexer (new Lexer (connection));
    }

    /**
     * Return the current connection.
     * @return The connection either created by the parser or passed into this
     * parser via {@link #setConnection}.
     * @see #setConnection(URLConnection)
     */
    public URLConnection getConnection ()
    {
        return (getLexer ().getPage ().getConnection ());
    }

    /**
     * Set the URL for this parser.
     * This method creates a new Lexer reading from the given URL.
     * Trying to set the url to null or an empty string is a no-op.
     * @param url The new URL for the parser.
     * @throws ParserException If the url is invalid or creation of the
     * underlying Lexer cannot be performed.
     * @see #getURL
     */
    public void setURL (String url)
        throws
            ParserException
    {
        if ((null != url) && !"".equals (url))
            setConnection (getConnectionManager ().openConnection (url));
    }

    /**
     * Return the current URL being parsed.
     * @return The current url. This is the URL for the current page.
     * A string passed into the constructor or set via setURL may be altered,
     * for example, a file name may be modified to be a URL.
     * @see Page#getUrl
     * @see #setURL
     */
    public String getURL ()
    {
        return (getLexer ().getPage ().getUrl ());
    }

    /**
     * Set the encoding for the page this parser is reading from.
     * @param encoding The new character set to use.
     * @throws ParserException If the encoding change causes characters that
     * have already been consumed to differ from the characters that would
     * have been seen had the new encoding been in force.
     * @see org.htmlparser.util.EncodingChangeException
     * @see #getEncoding
     */
    public void setEncoding (String encoding)
        throws
            ParserException
    {
        getLexer ().getPage ().setEncoding (encoding);
    }

    /**
     * Get the encoding for the page this parser is reading from.
     * This item is set from the HTTP header but may be overridden by meta
     * tags in the head, so this may change after the head has been parsed.
     * @return The encoding currently in force.
     * @see #setEncoding
     */
    public String getEncoding ()
    {
        return (getLexer ().getPage ().getEncoding ());
    }

    /**
     * Set the lexer for this parser.
     * The current NodeFactory is transferred to (set on) the given lexer,
     * since the lexer owns the node factory object.
     * It does not adjust the <code>feedback</code> object.
     * Trying to set the lexer to <code>null</code> is a no-op.
     * A warning is sent to the feedback object if the content type of the
     * new lexer's page is known and does not start with "text".
     * @param lexer The lexer object to use.
     * @see #setNodeFactory
     * @see #getLexer
     */
    public void setLexer (Lexer lexer)
    {
        NodeFactory factory;
        String type;

        if (null != lexer)
        {   // move a node factory that's been set to the new lexer
            factory = null;
            if (null != getLexer ())
                factory = getLexer ().getNodeFactory ();
            if (null != factory)
                lexer.setNodeFactory (factory);
            mLexer = lexer;
            // warn about content that's not likely text
            type = mLexer.getPage ().getContentType ();
            if (type != null && !type.startsWith ("text"))
                getFeedback ().warning (
                    "URL "
                    + mLexer.getPage ().getUrl ()
                    + " does not contain text");
        }
    }

    /**
     * Returns the lexer associated with the parser.
     * @return The current lexer.
     * @see #setLexer
     */
    public Lexer getLexer ()
    {
        return (mLexer);
    }

    /**
     * Get the current node factory.
     * @return The current lexer's node factory.
     * @see #setNodeFactory
     */
    public NodeFactory getNodeFactory ()
    {
        return (getLexer ().getNodeFactory ());
    }

    /**
     * Set the current node factory.
     * @param factory The new node factory for the current lexer.
     * @throws IllegalArgumentException If <code>factory</code> is
     * <code>null</code>.
     * @see #getNodeFactory
     */
    public void setNodeFactory (NodeFactory factory)
    {
        if (null == factory)
            throw new IllegalArgumentException ("node factory cannot be null");
        getLexer ().setNodeFactory (factory);
    }

    /**
     * Sets the feedback object used in scanning.
     * @param fb The new feedback object to use. If this is null a
     * {@link #DEVNULL silent feedback object} is used.
     * @see #getFeedback
     */
    public void setFeedback (ParserFeedback fb)
    {
        if (null == fb)
            mFeedback = DEVNULL;
        else
            mFeedback = fb;
    }

    /**
     * Returns the current feedback object.
     * @return The feedback object currently being used.
     * @see #setFeedback
     */
    public ParserFeedback getFeedback ()
    {
        return (mFeedback);
    }

    //
    // Public methods
    //

    /**
     * Reset the parser to start from the beginning again.
     * This assumes support for a reset from the underlying
     * {@link org.htmlparser.lexer.Source} object.
     * <p>This is cheaper (in terms of time) than resetting the URL, i.e.
     * <pre>
     * parser.setURL (parser.getURL ());
     * </pre>
     * because the page is not refetched from the internet.
     * <em>Note: the nodes returned on the second parse are new
     * nodes and not the same nodes returned on the first parse. If you
     * want the same nodes for re-use, collect them in a NodeList with
     * {@link #parse(NodeFilter) parse(null)} and operate on the NodeList.</em>
     */
    public void reset ()
    {
        getLexer ().reset ();
    }

    /**
     * Returns an iterator (enumeration) over the html nodes.
     * {@link org.htmlparser.nodes Nodes} can be of three main types:
     * <ul>
     * <li>{@link org.htmlparser.nodes.TagNode TagNode}</li>
     * <li>{@link org.htmlparser.nodes.TextNode TextNode}</li>
     * <li>{@link org.htmlparser.nodes.RemarkNode RemarkNode}</li>
     * </ul>
     * In general, when parsing with an iterator or processing a NodeList,
     * you will need to use recursion. For example:
     * <code>
     * <pre>
     * void processMyNodes (Node node)
     * {
     *     if (node instanceof TextNode)
     *     {
     *         // downcast to TextNode
     *         TextNode text = (TextNode)node;
     *         // do whatever processing you want with the text
     *         System.out.println (text.getText ());
     *     }
     *     else if (node instanceof RemarkNode)
     *     {
     *         // downcast to RemarkNode
     *         RemarkNode remark = (RemarkNode)node;
     *         // do whatever processing you want with the comment
     *     }
     *     else if (node instanceof TagNode)
     *     {
     *         // downcast to TagNode
     *         TagNode tag = (TagNode)node;
     *         // do whatever processing you want with the tag itself
     *         // ...
     *         // process recursively (nodes within nodes) via getChildren()
     *         NodeList nl = tag.getChildren ();
     *         if (null != nl)
     *             for (NodeIterator i = nl.elements (); i.hasMoreNodes (); )
     *                 processMyNodes (i.nextNode ());
     *     }
     * }
     *
     * Parser parser = new Parser ("http://www.yahoo.com");
     * for (NodeIterator i = parser.elements (); i.hasMoreNodes (); )
     *     processMyNodes (i.nextNode ());
     * </pre>
     * </code>
     * @throws ParserException If a parsing error occurs.
     * @return An iterator over the top level nodes (usually {@.html <html>}).
     */
    public NodeIterator elements () throws ParserException
    {
        return (new IteratorImpl (getLexer (), getFeedback ()));
    }

    /**
     * Parse the given resource, using the filter provided.
     * This can be used to extract information from specific nodes.
     * When used with a <code>null</code> filter it returns an
     * entire page which can then be modified and converted back to HTML
     * (Note: the synthesis use-case is not handled very well; the parser
     * is more often used to extract information from a web page).
     * <p>For example, to replace the entire contents of the HEAD with a
     * single TITLE tag you could do this:
     * <pre>
     * NodeList nl = parser.parse (null); // here is your two node list
     * NodeList heads = nl.extractAllNodesThatMatch (new TagNameFilter ("HEAD"));
     * if (heads.size () &gt; 0) // there may not be a HEAD tag
     * {
     *     Head head = (Head)heads.elementAt (0); // there should be only one
     *     head.removeAll (); // clean out the contents
     *     Tag title = new TitleTag ();
     *     title.setTagName ("title");
     *     title.setChildren (new NodeList (new TextNode ("The New Title")));
     *     Tag title_end = new TitleTag ();
     *     title_end.setTagName ("/title");
     *     title.setEndTag (title_end);
     *     head.add (title);
     * }
     * System.out.println (nl.toHtml ()); // output the modified HTML
     * </pre>
     * @return The list of matching nodes (for a <code>null</code>
     * filter this is all the top level nodes).
     * @param filter The filter to apply to the parsed nodes,
     * or <code>null</code> to retrieve all the top level nodes.
     * @throws ParserException If a parsing error occurs.
     */
    public NodeList parse (NodeFilter filter) throws ParserException
    {
        NodeIterator e;
        Node node;
        NodeList ret;

        ret = new NodeList ();
        for (e = elements (); e.hasMoreNodes (); )
        {
            node = e.nextNode ();
            if (null != filter)
                node.collectInto (ret, filter);
            else
                ret.add (node);
        }

        return (ret);
    }

    /**
     * Apply the given visitor to the current page.
     * The visitor is passed to the <code>accept()</code> method of each node
     * in the page in a depth first traversal. The visitor
     * <code>beginParsing()</code> method is called prior to processing the
     * page and <code>finishedParsing()</code> is called after the processing.
     * @param visitor The visitor to visit all nodes with.
     * @throws ParserException If a parse error occurs while traversing
     * the page with the visitor.
     * @throws IllegalArgumentException If <code>visitor</code> is
     * <code>null</code>.
     */
    public void visitAllNodesWith (NodeVisitor visitor) throws ParserException
    {
        Node node;

        // reject null the same way the other public entry points do,
        // rather than failing with a bare NullPointerException
        if (null == visitor)
            throw new IllegalArgumentException ("visitor cannot be null");
        visitor.beginParsing ();
        for (NodeIterator e = elements (); e.hasMoreNodes (); )
        {
            node = e.nextNode ();
            node.accept (visitor);
        }
        visitor.finishedParsing ();
    }

    /**
     * Initializes the parser with the given input HTML String.
     * Passing an empty string is a no-op: the current lexer (and hence the
     * current input) is left in place.
     * @param inputHTML the input HTML that is to be parsed.
     * @throws ParserException If a error occurs in setting up the
     * underlying Lexer.
     * @throws IllegalArgumentException If <code>inputHTML</code> is
     * <code>null</code>.
     */
    public void setInputHTML (String inputHTML)
        throws
            ParserException
    {
        if (null == inputHTML)
            throw new IllegalArgumentException ("html cannot be null");
        if (!"".equals (inputHTML))
            setLexer (new Lexer (new Page (inputHTML)));
    }

    /**
     * Extract all nodes matching the given filter.
     * @see Node#collectInto(NodeList, NodeFilter)
     * @param filter The filter to be applied to the nodes.
     * @throws ParserException If a parse error occurs.
     * @return A list of nodes matching the filter criteria,
     * i.e. for which the filter's accept method
     * returned <code>true</code>.
     */
    public NodeList extractAllNodesThatMatch (NodeFilter filter)
        throws
            ParserException
    {
        NodeIterator e;
        NodeList ret;

        ret = new NodeList ();
        for (e = elements (); e.hasMoreNodes (); )
            e.nextNode ().collectInto (ret, filter);

        return (ret);
    }

    /**
     * Convenience method to extract all nodes of a given class type.
     * Equivalent to
     * <code>extractAllNodesThatMatch (new NodeClassFilter (nodeType))</code>.
     * @param nodeType The class of the nodes to collect.
     * @throws ParserException If a parse error occurs.
     * @return A list of nodes which have the class specified.
     * @deprecated Use extractAllNodesThatMatch (new NodeClassFilter (nodeType)).
     * @see #extractAllNodesThatMatch
     */
    public Node [] extractAllNodesThatAre (Class nodeType)
        throws
            ParserException
    {
        NodeList ret;

        ret = extractAllNodesThatMatch (new NodeClassFilter (nodeType));

        return (ret.toNodeArray ());
    }

    //
    // ConnectionMonitor interface
    //

    /**
     * Called just prior to calling connect.
     * Part of the ConnectionMonitor interface, this implementation just
     * sends the request header to the feedback object if any.
     * @param connection The connection which is about to be connected.
     * @throws ParserException <em>Not used</em>
     * @see ConnectionMonitor#preConnect
     */
    public void preConnect (HttpURLConnection connection)
        throws
            ParserException
    {
        getFeedback ().info (HttpHeader.getRequestHeader (connection));
    }

    /**
     * Called just after calling connect.
     * Part of the ConnectionMonitor interface, this implementation just
     * sends the response header to the feedback object if any.
     * @param connection The connection that was just connected.
     * @throws ParserException <em>Not used.</em>
     * @see ConnectionMonitor#postConnect
     */
    public void postConnect (HttpURLConnection connection)
        throws
            ParserException
    {
        getFeedback ().info (HttpHeader.getResponseHeader (connection));
    }

    /**
     * The main program, which can be executed from the command line.
     * With no arguments, or with <code>-help</code> as the first argument,
     * a usage message is printed. Otherwise the first argument is parsed
     * and the resulting node list is printed on <code>System.out</code>.
     * @param args A URL or file name to parse, and an optional tag name to be
     * used as a filter.
     */
    public static void main (String [] args)
    {
        Parser parser;
        NodeFilter filter;

        if (args.length < 1 || args[0].equals ("-help"))
        {
            System.out.println ("HTML Parser v" + VERSION_STRING + "\n");
            System.out.println ();
            System.out.println ("Syntax : java -jar htmlparser.jar"
                + " <file/page> [type]");
            System.out.println (" <file/page> the URL or file to be parsed");
            System.out.println (" type the node type, for example:");
            System.out.println (" A - Show only the link tags");
            System.out.println (" IMG - Show only the image tags");
            System.out.println (" TITLE - Show only the title tag");
            System.out.println ();
            System.out.println ("Example : java -jar htmlparser.jar"
                + " http://www.yahoo.com");
            System.out.println ();
        }
        else
            try
            {
                parser = new Parser ();
                if (1 < args.length)
                    filter = new TagNameFilter (args[1]);
                else
                {   // for a simple dump, use more verbose settings
                    filter = null;
                    parser.setFeedback (Parser.STDOUT);
                    getConnectionManager ().setMonitor (parser);
                }
                parser.setURL (args[0]);
                System.out.println (parser.parse (filter));
            }
            catch (ParserException e)
            {
                e.printStackTrace ();
            }
    }
}