CompositeTag.java
1 // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML 2 // http://sourceforge.org/projects/htmlparser 3 // Copyright (C) 2004 Somik Raha 4 // 5 // Revision Control Information 6 // 7 // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/CompositeTag.java,v $ 8 // $Author: derrickoswald $ 9 // $Date: 2005/06/20 01:56:32 $ 10 // $Revision: 1.81 $ 11 // 12 // This library is free software; you can redistribute it and/or 13 // modify it under the terms of the GNU Lesser General Public 14 // License as published by the Free Software Foundation; either 15 // version 2.1 of the License, or (at your option) any later version. 16 // 17 // This library is distributed in the hope that it will be useful, 18 // but WITHOUT ANY WARRANTY; without even the implied warranty of 19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 // Lesser General Public License for more details. 21 // 22 // You should have received a copy of the GNU Lesser General Public 23 // License along with this library; if not, write to the Free Software 24 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 // 26 27 package org.htmlparser.tags; 28 29 import java.util.Locale; 30 31 import org.htmlparser.Node; 32 import org.htmlparser.NodeFilter; 33 import org.htmlparser.Text; 34 import org.htmlparser.Tag; 35 import org.htmlparser.filters.NodeClassFilter; 36 import org.htmlparser.nodes.AbstractNode; 37 import org.htmlparser.nodes.TagNode; 38 import org.htmlparser.scanners.CompositeTagScanner; 39 import org.htmlparser.util.NodeList; 40 import org.htmlparser.util.SimpleNodeIterator; 41 import org.htmlparser.visitors.NodeVisitor; 42 43 /** 44 * The base class for tags that have an end tag. 45 * Provided extra accessors for the children above and beyond what the basic 46 * {@link Tag} provides. Also handles the conversion of it's children for 47 * the {@link #toHtml toHtml} method. 48 */ 49 public class CompositeTag extends TagNode 50 { 51 /** 52 * The tag that causes this tag to finish. 53 * May be a virtual tag generated by the scanning logic. 54 */ 55 protected Tag mEndTag; 56 57 /** 58 * The default scanner for non-composite tags. 59 */ 60 protected final static CompositeTagScanner mDefaultCompositeScanner = new CompositeTagScanner (); 61 62 /** 63 * Create a composite tag. 64 */ 65 public CompositeTag () 66 { 67 setThisScanner (mDefaultCompositeScanner); 68 } 69 70 /** 71 * Get an iterator over the children of this node. 72 * @return Am iterator over the children of this node. 73 */ 74 public SimpleNodeIterator children () 75 { 76 SimpleNodeIterator ret; 77 78 if (null != getChildren ()) 79 ret = getChildren ().elements (); 80 else 81 ret = (new NodeList ()).elements (); 82 83 return (ret); 84 } 85 86 /** 87 * Get the child of this node at the given position. 88 * @param index The in the node list of the child. 89 * @return The child at that index. 90 */ 91 public Node getChild (int index) 92 { 93 return ( 94 (null == getChildren ()) ? null : 95 getChildren ().elementAt (index)); 96 } 97 98 /** 99 * Get the children as an array of <code>Node</code> objects. 100 * @return The children in an array. 101 */ 102 public Node [] getChildrenAsNodeArray () 103 { 104 return ( 105 (null == getChildren ()) ? new Node[0] : 106 getChildren ().toNodeArray ()); 107 } 108 109 /** 110 * Remove the child at the position given. 111 * @param i The index of the child to remove. 112 */ 113 public void removeChild (int i) 114 { 115 if (null != getChildren ()) 116 getChildren ().remove (i); 117 } 118 119 /** 120 * Return the child tags as an iterator. 121 * Equivalent to calling getChildren ().elements (). 122 * @return An iterator over the children. 123 */ 124 public SimpleNodeIterator elements() 125 { 126 return ( 127 (null == getChildren ()) ? new NodeList ().elements () : 128 getChildren ().elements ()); 129 } 130 131 /** 132 * Return the textual contents of this tag and it's children. 133 * @return The 'browser' text contents of this tag. 134 */ 135 public String toPlainTextString() { 136 StringBuffer stringRepresentation = new StringBuffer(); 137 for (SimpleNodeIterator e=children();e.hasMoreNodes();) { 138 stringRepresentation.append(e.nextNode().toPlainTextString()); 139 } 140 return stringRepresentation.toString(); 141 } 142 143 /** 144 * Add the textual contents of the children of this node to the buffer. 145 * @param sb The buffer to append to. 146 */ 147 protected void putChildrenInto(StringBuffer sb) 148 { 149 Node node; 150 for (SimpleNodeIterator e = children (); e.hasMoreNodes ();) 151 { 152 node = e.nextNode (); 153 // eliminate virtual tags 154 // if (!(node.getStartPosition () == node.getEndPosition ())) 155 sb.append (node.toHtml ()); 156 } 157 } 158 159 /** 160 * Add the textual contents of the end tag of this node to the buffer. 161 * @param sb The buffer to append to. 162 */ 163 protected void putEndTagInto(StringBuffer sb) 164 { 165 // eliminate virtual tags 166 // if (!(endTag.getStartPosition () == endTag.getEndPosition ())) 167 sb.append(getEndTag ().toHtml()); 168 } 169 170 /** 171 * Return this tag as HTML code. 172 * @return This tag and it's contents (children) and the end tag 173 * as HTML code. 174 */ 175 public String toHtml() { 176 StringBuffer sb = new StringBuffer(); 177 sb.append (super.toHtml ()); 178 if (!isEmptyXmlTag()) 179 { 180 putChildrenInto(sb); 181 if (null != getEndTag ()) 182 putEndTagInto(sb); 183 } 184 return sb.toString(); 185 } 186 187 /** 188 * Searches all children who for a name attribute. Returns first match. 189 * @param name Attribute to match in tag 190 * @return Tag Tag matching the name attribute 191 */ 192 public Tag searchByName(String name) { 193 Node node; 194 Tag tag = null; 195 boolean found = false; 196 for (SimpleNodeIterator e = children();e.hasMoreNodes() && !found;) { 197 node = e.nextNode(); 198 if (node instanceof Tag) 199 { 200 tag = (Tag)node; 201 String nameAttribute = tag.getAttribute("NAME"); 202 if (nameAttribute!=null && nameAttribute.equals(name)) 203 found=true; 204 } 205 } 206 if (found) 207 return tag; 208 else 209 return null; 210 } 211 212 /** 213 * Searches for all nodes whose text representation contains the search string. 214 * Collects all nodes containing the search string into a NodeList. 215 * This search is <b>case-insensitive</b> and the search string and the 216 * node text are converted to uppercase using an English locale. 217 * For example, if you wish to find any textareas in a form tag containing 218 * "hello world", the code would be: 219 * <code> 220 * NodeList nodeList = formTag.searchFor("Hello World"); 221 * </code> 222 * @param searchString Search criterion. 223 * @return A collection of nodes whose string contents or 224 * representation have the <code>searchString</code> in them. 225 */ 226 public NodeList searchFor (String searchString) 227 { 228 return (searchFor (searchString, false)); 229 } 230 231 /** 232 * Searches for all nodes whose text representation contains the search string. 233 * Collects all nodes containing the search string into a NodeList. 234 * For example, if you wish to find any textareas in a form tag containing 235 * "hello world", the code would be: 236 * <code> 237 * NodeList nodeList = formTag.searchFor("Hello World"); 238 * </code> 239 * @param searchString Search criterion. 240 * @param caseSensitive If <code>true</code> this search should be case 241 * sensitive. Otherwise, the search string and the node text are converted 242 * to uppercase using an English locale. 243 * @return A collection of nodes whose string contents or 244 * representation have the <code>searchString</code> in them. 245 */ 246 public NodeList searchFor (String searchString, boolean caseSensitive) 247 { 248 return (searchFor (searchString, caseSensitive, Locale.ENGLISH)); 249 } 250 251 /** 252 * Searches for all nodes whose text representation contains the search string. 253 * Collects all nodes containing the search string into a NodeList. 254 * For example, if you wish to find any textareas in a form tag containing 255 * "hello world", the code would be: 256 * <code> 257 * NodeList nodeList = formTag.searchFor("Hello World"); 258 * </code> 259 * @param searchString Search criterion. 260 * @param caseSensitive If <code>true</code> this search should be case 261 * sensitive. Otherwise, the search string and the node text are converted 262 * to uppercase using the locale provided. 263 * @param locale The locale for uppercase conversion. 264 * @return A collection of nodes whose string contents or 265 * representation have the <code>searchString</code> in them. 266 */ 267 public NodeList searchFor (String searchString, boolean caseSensitive, Locale locale) 268 { 269 Node node; 270 String text; 271 NodeList ret; 272 273 ret = new NodeList (); 274 275 if (!caseSensitive) 276 searchString = searchString.toUpperCase (locale); 277 for (SimpleNodeIterator e = children (); e.hasMoreNodes (); ) 278 { 279 node = e.nextNode (); 280 text = node.toPlainTextString (); 281 if (!caseSensitive) 282 text = text.toUpperCase (locale); 283 if (-1 != text.indexOf (searchString)) 284 ret.add (node); 285 } 286 287 return (ret); 288 } 289 290 /** 291 * Collect all objects that are of a certain type 292 * Note that this will not check for parent types, and will not 293 * recurse through child tags 294 * @param classType The class to search for. 295 * @param recursive If true, recursively search through the children. 296 * @return A list of children found. 297 */ 298 public NodeList searchFor (Class classType, boolean recursive) 299 { 300 NodeList children; 301 NodeList ret; 302 303 children = getChildren (); 304 if (null == children) 305 ret = new NodeList (); 306 else 307 ret = children.extractAllNodesThatMatch ( 308 new NodeClassFilter (classType), recursive); 309 310 return (ret); 311 } 312 313 /** 314 * Returns the node number of the first node containing the given text. 315 * This can be useful to index into the composite tag and get other children. 316 * Text is compared without case sensitivity and conversion to uppercase 317 * uses an English locale. 318 * @param text The text to search for. 319 * @return int The node index in the children list of the node containing 320 * the text or -1 if not found. 321 * @see #findPositionOf (String, Locale) 322 */ 323 public int findPositionOf (String text) 324 { 325 return (findPositionOf (text, Locale.ENGLISH)); 326 } 327 328 /** 329 * Returns the node number of the first node containing the given text. 330 * This can be useful to index into the composite tag and get other children. 331 * Text is compared without case sensitivity and conversion to uppercase 332 * uses the supplied locale. 333 * @return int The node index in the children list of the node containing 334 * the text or -1 if not found. 335 * @param locale The locale to use in converting to uppercase. 336 * @param text The text to search for. 337 */ 338 public int findPositionOf (String text, Locale locale) 339 { 340 Node node; 341 int loc; 342 343 loc = 0; 344 text = text.toUpperCase (locale); 345 for (SimpleNodeIterator e = children (); e.hasMoreNodes (); ) 346 { 347 node = e.nextNode (); 348 if (-1 != node.toPlainTextString ().toUpperCase (locale).indexOf (text)) 349 return loc; 350 loc++; 351 } 352 return -1; 353 } 354 355 /** 356 * Returns the node number of a child node given the node object. 357 * This would typically be used in conjuction with digUpStringNode, 358 * after which the string node's parent can be used to find the 359 * string node's position. Faster than calling findPositionOf(text) 360 * again. Note that the position is at a linear level alone - there 361 * is no recursion in this method. 362 * @param searchNode The child node to find. 363 * @return The offset of the child tag or -1 if it was not found. 364 */ 365 public int findPositionOf(Node searchNode) { 366 Node node; 367 int loc = 0; 368 for (SimpleNodeIterator e=children();e.hasMoreNodes();) { 369 node = e.nextNode(); 370 if (node==searchNode) { 371 return loc; 372 } 373 loc++; 374 } 375 return -1; 376 } 377 378 /** 379 * Get child at given index 380 * @param index The index into the child node list. 381 * @return Node The child node at the given index or null if none. 382 */ 383 public Node childAt (int index) 384 { 385 return ( 386 (null == getChildren ()) ? null : 387 getChildren ().elementAt (index)); 388 } 389 390 /** 391 * Collect this node and its child nodes (if-applicable) into the list parameter, 392 * provided the node satisfies the filtering criteria. 393 * <p>This mechanism allows powerful filtering code to be written very easily, 394 * without bothering about collection of embedded tags separately. 395 * e.g. when we try to get all the links on a page, it is not possible to 396 * get it at the top-level, as many tags (like form tags), can contain 397 * links embedded in them. We could get the links out by checking if the 398 * current node is a {@link CompositeTag}, and going through its children. 399 * So this method provides a convenient way to do this.</p> 400 * <p>Using collectInto(), programs get a lot shorter. Now, the code to 401 * extract all links from a page would look like: 402 * <pre> 403 * NodeList list = new NodeList(); 404 * NodeFilter filter = new TagNameFilter ("A"); 405 * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) 406 * e.nextNode().collectInto(list, filter); 407 * </pre> 408 * Thus, <code>list</code> will hold all the link nodes, irrespective of how 409 * deep the links are embedded.</p> 410 * <p>Another way to accomplish the same objective is: 411 * <pre> 412 * NodeList list = new NodeList(); 413 * NodeFilter filter = new TagClassFilter (LinkTag.class); 414 * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) 415 * e.nextNode().collectInto(list, filter); 416 * </pre> 417 * This is slightly less specific because the LinkTag class may be 418 * registered for more than one node name, e.g. <LINK> tags too.</p> 419 * @param list The list to add nodes to. 420 * @param filter The filter to apply. 421 * @see org.htmlparser.filters 422 */ 423 public void collectInto (NodeList list, NodeFilter filter) 424 { 425 super.collectInto (list, filter); 426 for (SimpleNodeIterator e = children(); e.hasMoreNodes ();) 427 e.nextNode ().collectInto (list, filter); 428 if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/> 429 getEndTag ().collectInto (list, filter); 430 } 431 432 /** 433 * Return the HTML code for the children of this tag. 434 * @return A string with the HTML code for the contents of this tag. 435 */ 436 public String getChildrenHTML() { 437 StringBuffer buff = new StringBuffer(); 438 for (SimpleNodeIterator e = children();e.hasMoreNodes();) { 439 AbstractNode node = (AbstractNode)e.nextNode(); 440 buff.append(node.toHtml()); 441 } 442 return buff.toString(); 443 } 444 445 /** 446 * Tag visiting code. 447 * Invokes <code>accept()</code> on the start tag and then 448 * walks the child list invoking <code>accept()</code> on each 449 * of the children, finishing up with an <code>accept()</code> 450 * call on the end tag. If <code>shouldRecurseSelf()</code> 451 * returns true it then asks the visitor to visit itself. 452 * @param visitor The <code>NodeVisitor</code> object to be signalled 453 * for each child and possibly this tag. 454 */ 455 public void accept (NodeVisitor visitor) 456 { 457 SimpleNodeIterator children; 458 Node child; 459 460 if (visitor.shouldRecurseSelf ()) 461 visitor.visitTag (this); 462 if (visitor.shouldRecurseChildren ()) 463 { 464 if (null != getChildren ()) 465 { 466 children = children (); 467 while (children.hasMoreNodes ()) 468 { 469 child = children.nextNode (); 470 child.accept (visitor); 471 } 472 } 473 if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/> 474 getEndTag ().accept (visitor); 475 } 476 } 477 478 /** 479 * Return the number of child nodes in this tag. 480 * @return The child node count. 481 */ 482 public int getChildCount() 483 { 484 NodeList children; 485 486 children = getChildren (); 487 488 return ((null == children) ? 0 : children.size ()); 489 } 490 491 /** 492 * Get the end tag for this tag. 493 * For example, if the node is {@.html <LABEL>The label</LABLE>}, then 494 * this method would return the {@.html </LABLE>} end tag. 495 * @return The end tag for this node. 496 * <em>Note: If the start and end position of the end tag is the same, 497 * then the end tag was injected (it's a virtual end tag).</em> 498 */ 499 public Tag getEndTag() 500 { 501 return (mEndTag); 502 } 503 504 /** 505 * Set the end tag for this tag. 506 * @param tag The new end tag for this tag. 507 * Note: no checking is perfromed so you can generate bad HTML by setting 508 * the end tag with a name not equal to the name of the start tag, 509 * i.e. {@.html <LABEL>The label</TITLE>} 510 */ 511 public void setEndTag (Tag tag) 512 { 513 mEndTag = tag; 514 } 515 516 /** 517 * Finds a text node, however embedded it might be, and returns 518 * it. The text node will retain links to its parents, so 519 * further navigation is possible. 520 * @param searchText The text to search for. 521 * @return The list of text nodes (recursively) found. 522 */ 523 public Text[] digupStringNode(String searchText) { 524 NodeList nodeList = searchFor(searchText); 525 NodeList stringNodes = new NodeList(); 526 for (int i=0;i<nodeList.size();i++) { 527 Node node = nodeList.elementAt(i); 528 if (node instanceof Text) { 529 stringNodes.add(node); 530 } else { 531 if (node instanceof CompositeTag) { 532 CompositeTag ctag = (CompositeTag)node; 533 Text[] nodes = ctag.digupStringNode(searchText); 534 for (int j=0;j<nodes.length;j++) 535 stringNodes.add(nodes[j]); 536 } 537 } 538 } 539 Text[] stringNode = new Text[stringNodes.size()]; 540 for (int i=0;i<stringNode.length;i++) { 541 stringNode[i] = (Text)stringNodes.elementAt(i); 542 } 543 return stringNode; 544 } 545 546 /** 547 * Return a string representation of the contents of this tag, it's children and it's end tag suitable for debugging. 548 * @return A textual representation of the tag. 549 */ 550 public String toString () 551 { 552 StringBuffer ret; 553 554 ret = new StringBuffer (1024); 555 toString (0, ret); 556 557 return (ret.toString ()); 558 } 559 560 /** 561 * Return the text contained in this tag. 562 * @return The complete contents of the tag (within the angle brackets). 563 */ 564 public String getText () 565 { 566 String ret; 567 568 ret = super.toHtml (); 569 ret = ret.substring (1, ret.length () - 1); 570 571 return (ret); 572 } 573 574 /** 575 * Return the text between the start tag and the end tag. 576 * @return The contents of the CompositeTag. 577 */ 578 public String getStringText () 579 { 580 String ret; 581 int start = getEndPosition (); 582 int end = mEndTag.getStartPosition (); 583 ret = getPage ().getText (start, end); 584 585 return (ret); 586 } 587 588 /** 589 * Return a string representation of the contents of this tag, it's children and it's end tag suitable for debugging. 590 * @param level The indentation level to use. 591 * @param buffer The buffer to append to. 592 */ 593 public void toString (int level, StringBuffer buffer) 594 { 595 Node node; 596 597 for (int i = 0; i < level; i++) 598 buffer.append (" "); 599 buffer.append (super.toString ()); 600 buffer.append (System.getProperty ("line.separator")); 601 for (SimpleNodeIterator e = children (); e.hasMoreNodes ();) 602 { 603 node = e.nextNode (); 604 if (node instanceof CompositeTag) 605 ((CompositeTag)node).toString (level + 1, buffer); 606 else 607 { 608 for (int i = 0; i <= level; i++) 609 buffer.append (" "); 610 buffer.append (node); 611 buffer.append (System.getProperty ("line.separator")); 612 } 613 } 614 615 if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/> 616 // eliminate virtual tags 617 // if (!(getEndTag ().getStartPosition () == getEndTag ().getEndPosition ())) 618 { 619 for (int i = 0; i <= level; i++) 620 buffer.append (" "); 621 buffer.append (getEndTag ().toString ()); 622 buffer.append (System.getProperty ("line.separator")); 623 } 624 } 625 }