/ org.htmlparser / src / org / htmlparser / tags / CompositeTag.java
CompositeTag.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2004 Somik Raha
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/CompositeTag.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2005/06/20 01:56:32 $
 10  // $Revision: 1.81 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser.tags;
 28  
 29  import java.util.Locale;
 30  
 31  import org.htmlparser.Node;
 32  import org.htmlparser.NodeFilter;
 33  import org.htmlparser.Text;
 34  import org.htmlparser.Tag;
 35  import org.htmlparser.filters.NodeClassFilter;
 36  import org.htmlparser.nodes.AbstractNode;
 37  import org.htmlparser.nodes.TagNode;
 38  import org.htmlparser.scanners.CompositeTagScanner;
 39  import org.htmlparser.util.NodeList;
 40  import org.htmlparser.util.SimpleNodeIterator;
 41  import org.htmlparser.visitors.NodeVisitor;
 42  
 43  /**
 44   * The base class for tags that have an end tag.
 45   * Provided extra accessors for the children above and beyond what the basic
 46   * {@link Tag} provides. Also handles the conversion of it's children for
 47   * the {@link #toHtml toHtml} method.
 48   */
 49  public class CompositeTag extends TagNode
 50  {
 51      /**
 52       * The tag that causes this tag to finish.
 53       * May be a virtual tag generated by the scanning logic.
 54       */
 55      protected Tag mEndTag;
 56  
 57      /**
 58       * The default scanner for non-composite tags.
 59       */
 60      protected final static CompositeTagScanner mDefaultCompositeScanner = new CompositeTagScanner ();
 61  
 62      /**
 63       * Create a composite tag.
 64       */
 65      public CompositeTag ()
 66      {
 67          setThisScanner (mDefaultCompositeScanner);
 68      }
 69      
 70      /**
 71       * Get an iterator over the children of this node.
 72       * @return Am iterator over the children of this node.
 73       */
 74      public SimpleNodeIterator children ()
 75      {
 76          SimpleNodeIterator ret;
 77  
 78          if (null != getChildren ())
 79              ret = getChildren ().elements ();
 80          else
 81              ret = (new NodeList ()).elements ();
 82  
 83          return (ret);
 84      }
 85  
 86      /**
 87       * Get the child of this node at the given position.
 88       * @param index The in the node list of the child.
 89       * @return The child at that index.
 90       */
 91      public Node getChild (int index)
 92      {
 93          return (
 94              (null == getChildren ()) ? null :
 95              getChildren ().elementAt (index));
 96      }
 97  
 98      /**
 99       * Get the children as an array of <code>Node</code> objects.
100       * @return The children in an array.
101       */
102      public Node [] getChildrenAsNodeArray ()
103      {
104          return (
105              (null == getChildren ()) ? new Node[0] :
106              getChildren ().toNodeArray ());
107      }
108  
109      /**
110       * Remove the child at the position given.
111       * @param i The index of the child to remove.
112       */
113      public void removeChild (int i)
114      {
115          if (null != getChildren ())
116              getChildren ().remove (i);
117      }
118  
119      /**
120       * Return the child tags as an iterator.
121       * Equivalent to calling getChildren ().elements ().
122       * @return An iterator over the children.
123       */
124      public SimpleNodeIterator elements()
125      {
126          return (
127              (null == getChildren ()) ? new NodeList ().elements () :
128              getChildren ().elements ());
129      }
130  
131      /**
132       * Return the textual contents of this tag and it's children.
133       * @return The 'browser' text contents of this tag.
134       */
135      public String toPlainTextString() {
136          StringBuffer stringRepresentation = new StringBuffer();
137          for (SimpleNodeIterator e=children();e.hasMoreNodes();) {
138              stringRepresentation.append(e.nextNode().toPlainTextString());
139          }
140          return stringRepresentation.toString();
141      }
142  
143      /**
144       * Add the textual contents of the children of this node to the buffer.
145       * @param sb The buffer to append to.
146       */
147      protected void putChildrenInto(StringBuffer sb)
148      {
149          Node node;
150          for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
151          {
152              node = e.nextNode ();
153              // eliminate virtual tags
154  //            if (!(node.getStartPosition () == node.getEndPosition ()))
155                  sb.append (node.toHtml ());
156          }
157      }
158  
159      /**
160       * Add the textual contents of the end tag of this node to the buffer.
161       * @param sb The buffer to append to.
162       */
163      protected void putEndTagInto(StringBuffer sb)
164      {
165          // eliminate virtual tags
166  //        if (!(endTag.getStartPosition () == endTag.getEndPosition ()))
167              sb.append(getEndTag ().toHtml());
168      }
169  
170      /**
171       * Return this tag as HTML code.
172       * @return This tag and it's contents (children) and the end tag
173       * as HTML code.
174       */
175      public String toHtml() {
176          StringBuffer sb = new StringBuffer();
177          sb.append (super.toHtml ());
178          if (!isEmptyXmlTag())
179          {
180              putChildrenInto(sb);
181              if (null != getEndTag ())
182                  putEndTagInto(sb);
183          }
184          return sb.toString();
185      }
186  
187      /**
188       * Searches all children who for a name attribute. Returns first match.
189       * @param name Attribute to match in tag
190       * @return Tag Tag matching the name attribute
191       */
192      public Tag searchByName(String name) {
193          Node node;
194          Tag tag = null;
195          boolean found = false;
196          for (SimpleNodeIterator e = children();e.hasMoreNodes() && !found;) {
197              node = e.nextNode();
198              if (node instanceof Tag)
199              {
200                  tag = (Tag)node;
201                  String nameAttribute = tag.getAttribute("NAME");
202                  if (nameAttribute!=null && nameAttribute.equals(name))
203                      found=true;
204              }
205          }
206          if (found)
207              return tag;
208          else
209              return null;
210      }
211  
212      /**
213       * Searches for all nodes whose text representation contains the search string.
214       * Collects all nodes containing the search string into a NodeList.
215       * This search is <b>case-insensitive</b> and the search string and the
216       * node text are converted to uppercase using an English locale.
217       * For example, if you wish to find any textareas in a form tag containing
218       * "hello world", the code would be:
219       * <code>
220       * NodeList nodeList = formTag.searchFor("Hello World");
221       * </code>
222       * @param searchString Search criterion.
223       * @return A collection of nodes whose string contents or
224       * representation have the <code>searchString</code> in them.
225       */
226      public NodeList searchFor (String searchString)
227      {
228          return (searchFor (searchString, false));
229      }
230  
231      /**
232       * Searches for all nodes whose text representation contains the search string.
233       * Collects all nodes containing the search string into a NodeList.
234       * For example, if you wish to find any textareas in a form tag containing
235       * "hello world", the code would be:
236       * <code>
237       * NodeList nodeList = formTag.searchFor("Hello World");
238       * </code>
239       * @param searchString Search criterion.
240       * @param caseSensitive If <code>true</code> this search should be case
241       * sensitive. Otherwise, the search string and the node text are converted
242       * to uppercase using an English locale.
243       * @return A collection of nodes whose string contents or
244       * representation have the <code>searchString</code> in them.
245       */
246      public NodeList searchFor (String searchString, boolean caseSensitive)
247      {
248          return (searchFor (searchString, caseSensitive, Locale.ENGLISH));
249      }
250  
251      /**
252       * Searches for all nodes whose text representation contains the search string.
253       * Collects all nodes containing the search string into a NodeList.
254       * For example, if you wish to find any textareas in a form tag containing
255       * "hello world", the code would be:
256       * <code>
257       * NodeList nodeList = formTag.searchFor("Hello World");
258       * </code>
259       * @param searchString Search criterion.
260       * @param caseSensitive If <code>true</code> this search should be case
261       * sensitive. Otherwise, the search string and the node text are converted
262       * to uppercase using the locale provided.
263       * @param locale The locale for uppercase conversion.
264       * @return A collection of nodes whose string contents or
265       * representation have the <code>searchString</code> in them.
266       */
267      public NodeList searchFor (String searchString, boolean caseSensitive, Locale locale)
268      {
269          Node node;
270          String text;
271          NodeList ret;
272          
273          ret = new NodeList ();
274  
275          if (!caseSensitive)
276              searchString = searchString.toUpperCase (locale);
277          for (SimpleNodeIterator e = children (); e.hasMoreNodes (); )
278          {
279              node = e.nextNode ();
280              text = node.toPlainTextString ();
281              if (!caseSensitive)
282                  text = text.toUpperCase (locale);
283              if (-1 != text.indexOf (searchString))
284                  ret.add (node);
285          }
286  
287          return (ret);
288      }
289  
290      /**
291       * Collect all objects that are of a certain type
292       * Note that this will not check for parent types, and will not
293       * recurse through child tags
294       * @param classType The class to search for.
295       * @param recursive If true, recursively search through the children.
296       * @return A list of children found.
297       */
298      public NodeList searchFor (Class classType, boolean recursive)
299      {
300          NodeList children;
301          NodeList ret;
302  
303          children = getChildren ();
304          if (null == children)
305              ret = new NodeList ();
306          else
307              ret = children.extractAllNodesThatMatch (
308                  new NodeClassFilter (classType), recursive);
309  
310          return (ret);
311      }
312  
313      /**
314       * Returns the node number of the first node containing the given text.
315       * This can be useful to index into the composite tag and get other children.
316       * Text is compared without case sensitivity and conversion to uppercase
317       * uses an English locale.
318       * @param text The text to search for.
319       * @return int The node index in the children list of the node containing
320       * the text or -1 if not found.
321       * @see #findPositionOf (String, Locale)
322       */
323      public int findPositionOf (String text)
324      {
325          return (findPositionOf (text, Locale.ENGLISH));
326      }
327  
328      /**
329       * Returns the node number of the first node containing the given text.
330       * This can be useful to index into the composite tag and get other children.
331       * Text is compared without case sensitivity and conversion to uppercase
332       * uses the supplied locale.
333       * @return int The node index in the children list of the node containing
334       * the text or -1 if not found.
335       * @param locale The locale to use in converting to uppercase.
336       * @param text The text to search for.
337       */
338      public int findPositionOf (String text, Locale locale)
339      {
340          Node node;
341          int loc;
342          
343          loc = 0;
344          text = text.toUpperCase (locale);
345          for (SimpleNodeIterator e = children (); e.hasMoreNodes (); )
346          {
347              node = e.nextNode ();
348              if (-1 != node.toPlainTextString ().toUpperCase (locale).indexOf (text))
349                  return loc;
350              loc++;
351          }
352          return -1;
353      }
354  
355      /**
356       * Returns the node number of a child node given the node object.
357       * This would typically be used in conjuction with digUpStringNode,
358       * after which the string node's parent can be used to find the
359       * string node's position. Faster than calling findPositionOf(text)
360       * again. Note that the position is at a linear level alone - there
361       * is no recursion in this method.
362       * @param searchNode The child node to find.
363       * @return The offset of the child tag or -1 if it was not found.
364       */
365      public int findPositionOf(Node searchNode) {
366          Node node;
367          int loc = 0;
368          for (SimpleNodeIterator e=children();e.hasMoreNodes();) {
369              node = e.nextNode();
370              if (node==searchNode) {
371                  return loc;
372              }
373              loc++;
374          }
375          return -1;
376      }
377  
378      /**
379       * Get child at given index
380       * @param index The index into the child node list.
381       * @return Node The child node at the given index or null if none.
382       */
383      public Node childAt (int index)
384      {
385          return (
386              (null == getChildren ()) ? null :
387              getChildren ().elementAt (index));
388      }
389  
390      /**
391       * Collect this node and its child nodes (if-applicable) into the list parameter,
392       * provided the node satisfies the filtering criteria.
393       * <p>This mechanism allows powerful filtering code to be written very easily,
394       * without bothering about collection of embedded tags separately.
395       * e.g. when we try to get all the links on a page, it is not possible to
396       * get it at the top-level, as many tags (like form tags), can contain
397       * links embedded in them. We could get the links out by checking if the
398       * current node is a {@link CompositeTag}, and going through its children.
399       * So this method provides a convenient way to do this.</p>
400       * <p>Using collectInto(), programs get a lot shorter. Now, the code to
401       * extract all links from a page would look like:
402       * <pre>
403       * NodeList list = new NodeList();
404       * NodeFilter filter = new TagNameFilter ("A");
405       * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
406       *      e.nextNode().collectInto(list, filter);
407       * </pre>
408       * Thus, <code>list</code> will hold all the link nodes, irrespective of how
409       * deep the links are embedded.</p>
410       * <p>Another way to accomplish the same objective is:
411       * <pre>
412       * NodeList list = new NodeList();
413       * NodeFilter filter = new TagClassFilter (LinkTag.class);
414       * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
415       *      e.nextNode().collectInto(list, filter);
416       * </pre>
417       * This is slightly less specific because the LinkTag class may be
418       * registered for more than one node name, e.g. &lt;LINK&gt; tags too.</p>
419       * @param list The list to add nodes to.
420       * @param filter The filter to apply.
421       * @see org.htmlparser.filters
422       */
423      public void collectInto (NodeList list, NodeFilter filter)
424      {
425          super.collectInto (list, filter);
426          for (SimpleNodeIterator e = children(); e.hasMoreNodes ();)
427              e.nextNode ().collectInto (list, filter);
428          if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/>
429              getEndTag ().collectInto (list, filter);
430      }
431  
432      /**
433       * Return the HTML code for the children of this tag.
434       * @return A string with the HTML code for the contents of this tag.
435       */
436      public String getChildrenHTML() {
437          StringBuffer buff = new StringBuffer();
438          for (SimpleNodeIterator e = children();e.hasMoreNodes();) {
439              AbstractNode node = (AbstractNode)e.nextNode();
440              buff.append(node.toHtml());
441          }
442          return buff.toString();
443      }
444  
445      /**
446       * Tag visiting code.
447       * Invokes <code>accept()</code> on the start tag and then
448       * walks the child list invoking <code>accept()</code> on each
449       * of the children, finishing up with an <code>accept()</code>
450       * call on the end tag. If <code>shouldRecurseSelf()</code>
451       * returns true it then asks the visitor to visit itself.
452       * @param visitor The <code>NodeVisitor</code> object to be signalled
453       * for each child and possibly this tag.
454       */
455      public void accept (NodeVisitor visitor)
456      {
457          SimpleNodeIterator children;
458          Node child;
459  
460          if (visitor.shouldRecurseSelf ())
461              visitor.visitTag (this);
462          if (visitor.shouldRecurseChildren ())
463          {
464              if (null != getChildren ())
465              {
466                  children = children ();
467                  while (children.hasMoreNodes ())
468                  {
469                      child = children.nextNode ();
470                      child.accept (visitor);
471                  }
472              }
473              if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/>
474                  getEndTag ().accept (visitor);
475          }
476      }
477  
478      /**
479       * Return the number of child nodes in this tag.
480       * @return The child node count.
481       */
482      public int getChildCount()
483      {
484          NodeList children;
485          
486          children = getChildren ();
487  
488          return ((null == children) ? 0 : children.size ());
489      }
490  
491      /**
492       * Get the end tag for this tag.
493       * For example, if the node is {@.html <LABEL>The label</LABLE>}, then
494       * this method would return the {@.html </LABLE>} end tag.
495       * @return The end tag for this node.
496       * <em>Note: If the start and end position of the end tag is the same,
497       * then the end tag was injected (it's a virtual end tag).</em>
498       */
499      public Tag getEndTag()
500      {
501          return (mEndTag);
502      }
503  
504      /**
505       * Set the end tag for this tag.
506       * @param tag The new end tag for this tag.
507       * Note: no checking is perfromed so you can generate bad HTML by setting
508       * the end tag with a name not equal to the name of the start tag,
509       * i.e. {@.html <LABEL>The label</TITLE>}
510       */
511      public void setEndTag (Tag tag)
512      {
513          mEndTag = tag;
514      }
515  
516      /**
517       * Finds a text node, however embedded it might be, and returns
518       * it. The text node will retain links to its parents, so
519       * further navigation is possible.
520       * @param searchText The text to search for.
521       * @return The list of text nodes (recursively) found.
522       */
523      public Text[] digupStringNode(String searchText) {
524          NodeList nodeList = searchFor(searchText);
525          NodeList stringNodes = new NodeList();
526          for (int i=0;i<nodeList.size();i++) {
527              Node node = nodeList.elementAt(i);
528              if (node instanceof Text) {
529                  stringNodes.add(node);
530              } else {
531                  if (node instanceof CompositeTag) {
532                      CompositeTag ctag = (CompositeTag)node;
533                      Text[] nodes = ctag.digupStringNode(searchText);
534                      for (int j=0;j<nodes.length;j++)
535                          stringNodes.add(nodes[j]);
536                  }
537              }
538          }
539          Text[] stringNode = new Text[stringNodes.size()];
540          for (int i=0;i<stringNode.length;i++) {
541              stringNode[i] = (Text)stringNodes.elementAt(i);
542          }
543          return stringNode;
544      }
545  
546      /**
547       * Return a string representation of the contents of this tag, it's children and it's end tag suitable for debugging.
548       * @return A textual representation of the tag.
549       */
550      public String toString ()
551      {
552          StringBuffer ret;
553          
554          ret = new StringBuffer (1024);
555          toString (0, ret);
556          
557          return (ret.toString ());
558      }
559  
560      /**
561       * Return the text contained in this tag.
562       * @return The complete contents of the tag (within the angle brackets).
563       */
564      public String getText ()
565      {
566          String ret;
567          
568          ret = super.toHtml ();
569          ret = ret.substring (1, ret.length () - 1);
570          
571          return (ret);
572      }
573  
574      /**
575       * Return the text between the start tag and the end tag.
576       * @return The contents of the CompositeTag.
577       */
578      public String getStringText ()
579      {
580          String ret;
581          int start = getEndPosition ();
582          int end = mEndTag.getStartPosition ();
583          ret = getPage ().getText (start, end);
584          
585          return (ret);
586      }
587  
588      /**
589       * Return a string representation of the contents of this tag, it's children and it's end tag suitable for debugging.
590       * @param level The indentation level to use.
591       * @param buffer The buffer to append to.
592       */
593      public void toString (int level, StringBuffer buffer)
594      {
595          Node node;
596  
597          for (int i = 0; i < level; i++)
598              buffer.append ("  ");
599          buffer.append (super.toString ());
600          buffer.append (System.getProperty ("line.separator"));
601          for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
602          {
603              node = e.nextNode ();
604              if (node instanceof CompositeTag)
605                  ((CompositeTag)node).toString (level + 1, buffer);
606              else
607              {
608                  for (int i = 0; i <= level; i++)
609                      buffer.append ("  ");
610                  buffer.append (node);
611                  buffer.append (System.getProperty ("line.separator"));
612              }
613          }
614          
615          if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/>
616              // eliminate virtual tags
617  //            if (!(getEndTag ().getStartPosition () == getEndTag ().getEndPosition ()))
618              {
619                  for (int i = 0; i <= level; i++)
620                      buffer.append ("  ");
621                  buffer.append (getEndTag ().toString ());
622                  buffer.append (System.getProperty ("line.separator"));
623              }
624      }
625  }