/ org.htmlparser / src / org / htmlparser / Parser.java
Parser.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2004 Somik Raha
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2006/03/20 00:26:01 $
 10  // $Revision: 1.111 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser;
 28  
 29  import java.io.Serializable;
 30  import java.net.HttpURLConnection;
 31  import java.net.URLConnection;
 32  
 33  import org.htmlparser.filters.TagNameFilter;
 34  import org.htmlparser.filters.NodeClassFilter;
 35  import org.htmlparser.http.ConnectionManager;
 36  import org.htmlparser.http.ConnectionMonitor;
 37  import org.htmlparser.http.HttpHeader;
 38  import org.htmlparser.lexer.Lexer;
 39  import org.htmlparser.lexer.Page;
 40  import org.htmlparser.util.DefaultParserFeedback;
 41  import org.htmlparser.util.IteratorImpl;
 42  import org.htmlparser.util.NodeIterator;
 43  import org.htmlparser.util.NodeList;
 44  import org.htmlparser.util.ParserException;
 45  import org.htmlparser.util.ParserFeedback;
 46  import org.htmlparser.visitors.NodeVisitor;
 47  
 48  /**
 49   * The main parser class.
 50   * This is the primary class of the HTML Parser library. It provides
 51   * constructors that take a {@link #Parser(String) String},
 52   * a {@link #Parser(URLConnection) URLConnection}, or a
 53   * {@link #Parser(Lexer) Lexer}.  In the case of a String, an
 54   * attempt is made to open it as a URL, and if that fails it assumes it is a
 55   * local disk file. If you want to actually parse a String, use
 56   * {@link #setInputHTML setInputHTML()} after using the
 57   * {@link #Parser() no-args} constructor, or use {@link #createParser}.
 58   * <p>The Parser provides access to the contents of the
 59   * page, via a {@link #elements() NodeIterator}, a
 60   * {@link #parse(NodeFilter) NodeList} or a
 61   * {@link #visitAllNodesWith NodeVisitor}.
 62   * <p>Typical usage of the parser is:
 63   * <code>
 64   * <pre>
 65   * Parser parser = new Parser ("http://whatever");
 66   * NodeList list = parser.parse ();
 67   * // do something with your list of nodes.
 68   * </pre>
 69   * </code></p>
 70   * <p>What types of nodes and what can be done with them is dependant on the
 71   * setup, but in general a node can be converted back to HTML and it's
 72   * children (enclosed nodes) and parent can be obtained, because nodes are
 73   * nested. See the {@link Node} interface.</p>
 74   * <p>For example, if the URL contains:<br>
 75   * <code>
 76   * {@.html
 77   * <html>
 78   * <head>
 79   * <title>Mondays -- What a bad idea.</title>
 80   * </head>
 81   * <body BGCOLOR="#FFFFFF">
 82   * Most people have a pathological hatred of Mondays...
 83   * </body>
 84   * </html>}
 85   * </code><br>
 86   * and the example code above is used, the list contain only one element, the
 87   * {@.html <html>} node.  This node is a {@link org.htmlparser.tags tag},
 88   * which is an object of class
 89   * {@link org.htmlparser.tags.Html Html} if the default {@link NodeFactory}
 90   * (a {@link PrototypicalNodeFactory}) is used.</p>
 91   * <p>To get at further content, the children of the top
 92   * level nodes must be examined. When digging through a node list one must be
 93   * conscious of the possibility of whitespace between nodes, e.g. in the example
 94   * above:
 95   * <code>
 96   * <pre>
 97   * Node node = list.elementAt (0);
 98   * NodeList sublist = node.getChildren ();
 99   * System.out.println (sublist.size ());
100   * </pre>
101   * </code>
102   * would print out 5, not 2, because there are newlines after {@.html <html>},
103   * {@.html </head>} and {@.html </body>} that are children of the HTML node
104   * besides the {@.html <head>} and {@.html <body>} nodes.</p>
105   * <p>Because processing nodes is so common, two interfaces are provided to
106   * ease this task, {@link org.htmlparser.filters filters}
107   * and {@link org.htmlparser.visitors visitors}.
108   */
109  public class Parser
110      implements
111          Serializable,
112          ConnectionMonitor
113  {
114      // Please don't change the formatting of the version variables below.
115      // This is done so as to facilitate ant script processing.
116  
117      /**
118       * The floating point version number ({@value}).
119       */
120      public static final double
121      VERSION_NUMBER = 1.6
122      ;
123  
124      /**
125       * The type of version ({@value}).
126       */
127      public static final String
128      VERSION_TYPE = "Integration Build"
129      ;
130  
131      /**
132       * The date of the version ({@value}).
133       */
134      public static final String
135      VERSION_DATE = "Mar 19, 2006"
136      ;
137  
138      // End of formatting
139  
140      /**
141       * The display version ({@value}).
142       */
143      public static final String VERSION_STRING =
144              "" + VERSION_NUMBER
145              + " (" + VERSION_TYPE + " " + VERSION_DATE + ")";
146  
147      /**
148       * Feedback object.
149       */
150      protected ParserFeedback mFeedback;
151  
152      /**
153       * The html lexer associated with this parser.
154       */
155      protected Lexer mLexer;
156  
157      /**
158       * A quiet message sink.
159       * Use this for no feedback.
160       */
161      public static final ParserFeedback DEVNULL =
162          new DefaultParserFeedback (DefaultParserFeedback.QUIET);
163  
164      /**
165       * A verbose message sink.
166       * Use this for output on <code>System.out</code>.
167       */
168      public static final ParserFeedback STDOUT = new DefaultParserFeedback ();
169  
170      //
171      // Static methods
172      //
173  
174      /**
175       * Return the version string of this parser.
176       * @return A string of the form:
177       * <pre>
178       * "[floating point number] ([build-type] [build-date])"
179       * </pre>
180       */
181      public static String getVersion ()
182      {
183          return (VERSION_STRING);
184      }
185  
186      /**
187       * Return the version number of this parser.
188       * @return A floating point number, the whole number part is the major
189       * version, and the fractional part is the minor version.
190       */
191      public static double getVersionNumber ()
192      {
193          return (VERSION_NUMBER);
194      }
195  
196      /**
197       * Get the connection manager all Parsers use.
198       * @return The connection manager.
199       * @see #setConnectionManager
200       */
201      public static ConnectionManager getConnectionManager ()
202      {
203          return (Page.getConnectionManager ());
204      }
205  
206      /**
207       * Set the connection manager all Parsers use.
208       * @param manager The new connection manager.
209       * @see #getConnectionManager
210       */
211      public static void setConnectionManager (ConnectionManager manager)
212      {
213          Page.setConnectionManager (manager);
214      }
215  
216      /**
217       * Creates the parser on an input string.
218       * @param html The string containing HTML.
219       * @param charset <em>Optional</em>. The character set encoding that will
220       * be reported by {@link #getEncoding}. If charset is <code>null</code>
221       * the default character set is used.
222       * @return A parser with the <code>html</code> string as input.
223       */
224      public static Parser createParser (String html, String charset)
225      {
226          Parser ret;
227  
228          if (null == html)
229              throw new IllegalArgumentException ("html cannot be null");
230          ret = new Parser (new Lexer (new Page (html, charset)));
231  
232          return (ret);
233      }
234  
235      //
236      // Constructors
237      //
238  
239      /**
240       * Zero argument constructor.
241       * The parser is in a safe but useless state parsing an empty string.
242       * Set the lexer or connection using {@link #setLexer}
243       * or {@link #setConnection}.
244       * @see #setLexer(Lexer)
245       * @see #setConnection(URLConnection)
246       */
247      public Parser ()
248      {
249          this (new Lexer (new Page ("")), DEVNULL);
250      }
251  
252      /**
253       * Construct a parser using the provided lexer and feedback object.
254       * This would be used to create a parser for special cases where the
255       * normal creation of a lexer on a URLConnection needs to be customized.
256       * @param lexer The lexer to draw characters from.
257       * @param fb The object to use when information,
258       * warning and error messages are produced. If <em>null</em> no feedback
259       * is provided.
260       */
261      public Parser (Lexer lexer, ParserFeedback fb)
262      {
263          setFeedback (fb);
264          if (null == lexer)
265              throw new IllegalArgumentException ("lexer cannot be null");
266          setLexer (lexer);
267          setNodeFactory (new PrototypicalNodeFactory ());
268      }
269  
270      /**
271       * Constructor for custom HTTP access.
272       * This would be used to create a parser for a URLConnection that needs
273       * a special setup or negotiation conditioning beyond what is available
274       * from the {@link #getConnectionManager ConnectionManager}.
275       * @param connection A fully conditioned connection. The connect()
276       * method will be called so it need not be connected yet.
277       * @param fb The object to use for message communication.
278       * @throws ParserException If the creation of the underlying Lexer
279       * cannot be performed.
280       */
281      public Parser (URLConnection connection, ParserFeedback fb)
282          throws
283              ParserException
284      {
285          this (new Lexer (connection), fb);
286      }
287  
288      /**
289       * Creates a Parser object with the location of the resource (URL or file)
290       * You would typically create a DefaultHTMLParserFeedback object and pass
291       * it in.
292       * @see #Parser(URLConnection,ParserFeedback)
293       * @param resourceLocn Either the URL or the filename (autodetects).
294       * A standard HTTP GET is performed to read the content of the URL.
295       * @param feedback The HTMLParserFeedback object to use when information,
296       * warning and error messages are produced. If <em>null</em> no feedback
297       * is provided.
298       * @throws ParserException If the URL is invalid.
299       */
300      public Parser (String resourceLocn, ParserFeedback feedback)
301          throws
302              ParserException
303      {
304          this (getConnectionManager ().openConnection (resourceLocn), feedback);
305      }
306  
307      /**
308       * Creates a Parser object with the location of the resource (URL or file).
309       * A DefaultHTMLParserFeedback object is used for feedback.
310       * @param resourceLocn Either the URL or the filename (autodetects).
311       * @throws ParserException If the resourceLocn argument does not resolve
312       * to a valid page or file.
313       */
314      public Parser (String resourceLocn) throws ParserException
315      {
316          this (resourceLocn, STDOUT);
317      }
318  
319      /**
320       * Construct a parser using the provided lexer.
321       * A feedback object printing to {@link #STDOUT System.out} is used.
322       * This would be used to create a parser for special cases where the
323       * normal creation of a lexer on a URLConnection needs to be customized.
324       * @param lexer The lexer to draw characters from.
325       */
326      public Parser (Lexer lexer)
327      {
328          this (lexer, STDOUT);
329      }
330  
331      /**
332       * Construct a parser using the provided URLConnection.
333       * This would be used to create a parser for a URLConnection that needs
334       * a special setup or negotiation conditioning beyond what is available
335       * from the {@link #getConnectionManager ConnectionManager}.
336       * A feedback object printing to {@link #STDOUT System.out} is used.
337       * @see #Parser(URLConnection,ParserFeedback)
338       * @param connection A fully conditioned connection. The connect()
339       * method will be called so it need not be connected yet.
340       * @throws ParserException If the creation of the underlying Lexer
341       * cannot be performed.
342       */
343      public Parser (URLConnection connection) throws ParserException
344      {
345          this (connection, STDOUT);
346      }
347  
348      //
349      // Bean patterns
350      //
351  
352      /**
353       * Set the connection for this parser.
354       * This method creates a new <code>Lexer</code> reading from the connection.
355       * @param connection A fully conditioned connection. The connect()
356       * method will be called so it need not be connected yet.
357       * @exception ParserException if the character set specified in the
358       * HTTP header is not supported, or an i/o exception occurs creating the
359       * lexer.
360       * @see #setLexer
361       * @see #getConnection
362       */
363      public void setConnection (URLConnection connection)
364          throws
365              ParserException
366      {
367          if (null == connection)
368              throw new IllegalArgumentException ("connection cannot be null");
369          setLexer (new Lexer (connection));
370      }
371  
372      /**
373       * Return the current connection.
374       * @return The connection either created by the parser or passed into this
375       * parser via {@link #setConnection}.
376       * @see #setConnection(URLConnection)
377       */
378      public URLConnection getConnection ()
379      {
380          return (getLexer ().getPage ().getConnection ());
381      }
382  
383      /**
384       * Set the URL for this parser.
385       * This method creates a new Lexer reading from the given URL.
386       * Trying to set the url to null or an empty string is a no-op.
387       * @param url The new URL for the parser.
388       * @throws ParserException If the url is invalid or creation of the
389       * underlying Lexer cannot be performed.
390       * @see #getURL
391       */
392      public void setURL (String url)
393          throws
394              ParserException
395      {
396          if ((null != url) && !"".equals (url))
397              setConnection (getConnectionManager ().openConnection (url));
398      }
399  
400      /**
401       * Return the current URL being parsed.
402       * @return The current url. This is the URL for the current page.
403       * A string passed into the constructor or set via setURL may be altered,
404       * for example, a file name may be modified to be a URL.
405       * @see Page#getUrl
406       * @see #setURL
407       */
408      public String getURL ()
409      {
410          return (getLexer ().getPage ().getUrl ());
411      }
412  
413      /**
414       * Set the encoding for the page this parser is reading from.
415       * @param encoding The new character set to use.
416       * @throws ParserException If the encoding change causes characters that
417       * have already been consumed to differ from the characters that would
418       * have been seen had the new encoding been in force.
419       * @see org.htmlparser.util.EncodingChangeException
420       * @see #getEncoding
421       */
422      public void setEncoding (String encoding)
423          throws
424              ParserException
425      {
426          getLexer ().getPage ().setEncoding (encoding);
427      }
428  
429      /**
430       * Get the encoding for the page this parser is reading from.
431       * This item is set from the HTTP header but may be overridden by meta
432       * tags in the head, so this may change after the head has been parsed.
433       * @return The encoding currently in force.
434       * @see #setEncoding
435       */
436      public String getEncoding ()
437      {
438          return (getLexer ().getPage ().getEncoding ());
439      }
440  
441      /**
442       * Set the lexer for this parser.
443       * The current NodeFactory is transferred to (set on) the given lexer,
444       * since the lexer owns the node factory object.
445       * It does not adjust the <code>feedback</code> object.
446       * Trying to set the lexer to <code>null</code> is a no-op.
447       * @param lexer The lexer object to use.
448       * @see #setNodeFactory
449       * @see #getLexer
450       */
451      public void setLexer (Lexer lexer)
452      {
453          NodeFactory factory;
454          String type;
455  
456          if (null != lexer)
457          {   // move a node factory that's been set to the new lexer
458              factory = null;
459              if (null != getLexer ())
460                  factory = getLexer ().getNodeFactory ();
461              if (null != factory)
462                  lexer.setNodeFactory (factory);
463              mLexer = lexer;
464              // warn about content that's not likely text
465              type = mLexer.getPage ().getContentType ();
466              if (type != null && !type.startsWith ("text"))
467                  getFeedback ().warning (
468                      "URL "
469                      + mLexer.getPage ().getUrl ()
470                      + " does not contain text");
471          }
472      }
473  
474      /**
475       * Returns the lexer associated with the parser.
476       * @return The current lexer.
477       * @see #setLexer
478       */
479      public Lexer getLexer ()
480      {
481          return (mLexer);
482      }
483  
484      /**
485       * Get the current node factory.
486       * @return The current lexer's node factory.
487       * @see #setNodeFactory
488       */
489      public NodeFactory getNodeFactory ()
490      {
491          return (getLexer ().getNodeFactory ());
492      }
493  
494      /**
495       * Set the current node factory.
496       * @param factory The new node factory for the current lexer.
497       * @see #getNodeFactory
498       */
499      public void setNodeFactory (NodeFactory factory)
500      {
501          if (null == factory)
502              throw new IllegalArgumentException ("node factory cannot be null");
503          getLexer ().setNodeFactory (factory);
504      }
505  
506      /**
507       * Sets the feedback object used in scanning.
508       * @param fb The new feedback object to use. If this is null a
509       * {@link #DEVNULL silent feedback object} is used.
510       * @see #getFeedback
511       */
512      public void setFeedback (ParserFeedback fb)
513      {
514          if (null == fb)
515              mFeedback = DEVNULL;
516          else
517              mFeedback = fb;
518      }
519  
520      /**
521       * Returns the current feedback object.
522       * @return The feedback object currently being used.
523       * @see #setFeedback
524       */
525      public ParserFeedback getFeedback()
526      {
527          return (mFeedback);
528      }
529  
530      //
531      // Public methods
532      //
533  
534      /**
535       * Reset the parser to start from the beginning again.
536       * This assumes support for a reset from the underlying
537       * {@link org.htmlparser.lexer.Source} object.
538       * <p>This is cheaper (in terms of time) than resetting the URL, i.e.
539       * <pre>
540       * parser.setURL (parser.getURL ());
541       * </pre>
542       * because the page is not refetched from the internet.
543       * <em>Note: the nodes returned on the second parse are new
544       * nodes and not the same nodes returned on the first parse. If you
545       * want the same nodes for re-use, collect them in a NodeList with
546       * {@link #parse(NodeFilter) parse(null)} and operate on the NodeList.</em>
547       */
548      public void reset ()
549      {
550          getLexer ().reset ();
551      }
552  
553      /**
554       * Returns an iterator (enumeration) over the html nodes.
555       * {@link org.htmlparser.nodes Nodes} can be of three main types:
556       * <ul>
557       * <li>{@link org.htmlparser.nodes.TagNode TagNode}</li>
558       * <li>{@link org.htmlparser.nodes.TextNode TextNode}</li>
559       * <li>{@link org.htmlparser.nodes.RemarkNode RemarkNode}</li>
560       * </ul>
561       * In general, when parsing with an iterator or processing a NodeList,
562       * you will need to use recursion. For example:
563       * <code>
564       * <pre>
565       * void processMyNodes (Node node)
566       * {
567       *     if (node instanceof TextNode)
568       *     {
569       *         // downcast to TextNode
570       *         TextNode text = (TextNode)node;
571       *         // do whatever processing you want with the text
572       *         System.out.println (text.getText ());
573       *     }
574       *     if (node instanceof RemarkNode)
575       *     {
576       *         // downcast to RemarkNode
577       *         RemarkNode remark = (RemarkNode)node;
578       *         // do whatever processing you want with the comment
579       *     }
580       *     else if (node instanceof TagNode)
581       *     {
582       *         // downcast to TagNode
583       *         TagNode tag = (TagNode)node;
584       *         // do whatever processing you want with the tag itself
585       *         // ...
586       *         // process recursively (nodes within nodes) via getChildren()
587       *         NodeList nl = tag.getChildren ();
588       *         if (null != nl)
589       *             for (NodeIterator i = nl.elements (); i.hasMoreElements (); )
590       *                 processMyNodes (i.nextNode ());
591       *     }
592       * }
593       *
594       * Parser parser = new Parser ("http://www.yahoo.com");
595       * for (NodeIterator i = parser.elements (); i.hasMoreElements (); )
596       *     processMyNodes (i.nextNode ());
597       * </pre>
598       * </code>
599       * @throws ParserException If a parsing error occurs.
600       * @return An iterator over the top level nodes (usually {@.html <html>}).
601       */
602      public NodeIterator elements () throws ParserException
603      {
604          return (new IteratorImpl (getLexer (), getFeedback ()));
605      }
606  
607      /**
608       * Parse the given resource, using the filter provided.
609       * This can be used to extract information from specific nodes.
610       * When used with a <code>null</code> filter it returns an
611       * entire page which can then be modified and converted back to HTML
612       * (Note: the synthesis use-case is not handled very well; the parser
613       * is more often used to extract information from a web page).
614       * <p>For example, to replace the entire contents of the HEAD with a
615       * single TITLE tag you could do this:
616       * <pre>
617       * NodeList nl = parser.parse (null); // here is your two node list
618       * NodeList heads = nl.extractAllNodesThatMatch (new TagNameFilter ("HEAD"))
619       * if (heads.size () > 0) // there may not be a HEAD tag
620       * {
621       *     Head head = heads.elementAt (0); // there should be only one
622       *     head.removeAll (); // clean out the contents
623       *     Tag title = new TitleTag ();
624       *     title.setTagName ("title");
625       *     title.setChildren (new NodeList (new TextNode ("The New Title")));
626       *     Tag title_end = new TitleTag ();
627       *     title_end.setTagName ("/title");
628       *     title.setEndTag (title_end);
629       *     head.add (title);
630       * }
631       * System.out.println (nl.toHtml ()); // output the modified HTML
632       * </pre>
633       * @return The list of matching nodes (for a <code>null</code>
634       * filter this is all the top level nodes).
635       * @param filter The filter to apply to the parsed nodes,
636       * or <code>null</code> to retrieve all the top level nodes.
637       * @throws ParserException If a parsing error occurs.
638       */
639      public NodeList parse (NodeFilter filter) throws ParserException
640      {
641          NodeIterator e;
642          Node node;
643          NodeList ret;
644  
645          ret = new NodeList ();
646          for (e = elements (); e.hasMoreNodes (); )
647          {
648              node = e.nextNode ();
649              if (null != filter)
650                  node.collectInto (ret, filter);
651              else
652                  ret.add (node);
653          }
654  
655          return (ret);
656      }
657  
658      /**
659       * Apply the given visitor to the current page.
660       * The visitor is passed to the <code>accept()</code> method of each node
661       * in the page in a depth first traversal. The visitor
662       * <code>beginParsing()</code> method is called prior to processing the
663       * page and <code>finishedParsing()</code> is called after the processing.
664       * @param visitor The visitor to visit all nodes with.
665       * @throws ParserException If a parse error occurs while traversing
666       * the page with the visitor.
667       */
668      public void visitAllNodesWith (NodeVisitor visitor) throws ParserException
669      {
670          Node node;
671          visitor.beginParsing();
672          for (NodeIterator e = elements(); e.hasMoreNodes(); )
673          {
674              node = e.nextNode();
675              node.accept(visitor);
676          }
677          visitor.finishedParsing();
678      }
679  
680      /**
681       * Initializes the parser with the given input HTML String.
682       * @param inputHTML the input HTML that is to be parsed.
683       * @throws ParserException If a error occurs in setting up the
684       * underlying Lexer.
685       */
686      public void setInputHTML (String inputHTML)
687          throws
688              ParserException
689      {
690          if (null == inputHTML)
691              throw new IllegalArgumentException ("html cannot be null");
692          if (!"".equals (inputHTML))
693              setLexer (new Lexer (new Page (inputHTML)));
694      }
695  
696      /**
697       * Extract all nodes matching the given filter.
698       * @see Node#collectInto(NodeList, NodeFilter)
699       * @param filter The filter to be applied to the nodes.
700       * @throws ParserException If a parse error occurs.
701       * @return A list of nodes matching the filter criteria,
702       * i.e. for which the filter's accept method
703       * returned <code>true</code>.
704       */
705      public NodeList extractAllNodesThatMatch (NodeFilter filter)
706          throws
707              ParserException
708      {
709          NodeIterator e;
710          NodeList ret;
711  
712          ret = new NodeList ();
713          for (e = elements (); e.hasMoreNodes (); )
714              e.nextNode ().collectInto (ret, filter);
715  
716          return (ret);
717      }
718  
719      /**
720       * Convenience method to extract all nodes of a given class type.
721       * Equivalent to
722       * <code>extractAllNodesThatMatch (new NodeClassFilter (nodeType))</code>.
723       * @param nodeType The class of the nodes to collect.
724       * @throws ParserException If a parse error occurs.
725       * @return A list of nodes which have the class specified.
726       * @deprecated Use extractAllNodesThatMatch (new NodeClassFilter (cls)).
727       * @see #extractAllNodesThatAre
728       */
729      public Node [] extractAllNodesThatAre (Class nodeType)
730          throws
731              ParserException
732      {
733          NodeList ret;
734  
735          ret = extractAllNodesThatMatch (new NodeClassFilter (nodeType));
736  
737          return (ret.toNodeArray ());
738      }
739  
740      //
741      // ConnectionMonitor interface
742      //
743  
744      /**
745       * Called just prior to calling connect.
746       * Part of the ConnectionMonitor interface, this implementation just
747       * sends the request header to the feedback object if any.
748       * @param connection The connection which is about to be connected.
749       * @throws ParserException <em>Not used</em>
750       * @see ConnectionMonitor#preConnect
751       */
752      public void preConnect (HttpURLConnection connection)
753          throws
754              ParserException
755      {
756          getFeedback ().info (HttpHeader.getRequestHeader (connection));
757      }
758  
759      /**
760       * Called just after calling connect.
761       * Part of the ConnectionMonitor interface, this implementation just
762       * sends the response header to the feedback object if any.
763       * @param connection The connection that was just connected.
764       * @throws ParserException <em>Not used.</em>
765       * @see ConnectionMonitor#postConnect
766       */
767      public void postConnect (HttpURLConnection connection)
768          throws
769              ParserException
770      {
771          getFeedback ().info (HttpHeader.getResponseHeader (connection));
772      }
773  
774      /**
775       * The main program, which can be executed from the command line.
776       * @param args A URL or file name to parse, and an optional tag name to be
777       * used as a filter.
778       */
779      public static void main (String [] args)
780      {
781          Parser parser;
782          NodeFilter filter;
783  
784          if (args.length < 1 || args[0].equals ("-help"))
785          {
786              System.out.println ("HTML Parser v" + VERSION_STRING + "\n");
787              System.out.println ();
788              System.out.println ("Syntax : java -jar htmlparser.jar"
789                      + " <file/page> [type]");
790              System.out.println ("   <file/page> the URL or file to be parsed");
791              System.out.println ("   type the node type, for example:");
792              System.out.println ("     A - Show only the link tags");
793              System.out.println ("     IMG - Show only the image tags");
794              System.out.println ("     TITLE - Show only the title tag");
795              System.out.println ();
796              System.out.println ("Example : java -jar htmlparser.jar"
797                      + " http://www.yahoo.com");
798              System.out.println ();
799          }
800          else
801              try
802              {
803                  parser = new Parser ();
804                  if (1 < args.length)
805                      filter = new TagNameFilter (args[1]);
806                  else
807                  {   // for a simple dump, use more verbose settings
808                      filter = null;
809                      parser.setFeedback (Parser.STDOUT);
810                      getConnectionManager ().setMonitor (parser);
811                  }
812                  parser.setURL (args[0]);
813                  System.out.println (parser.parse (filter));
814              }
815              catch (ParserException e)
816              {
817                  e.printStackTrace ();
818              }
819      }
820  }