CompositeTagScanner.java
1 // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML 2 // http://sourceforge.org/projects/htmlparser 3 // Copyright (C) 2003 Somik Raha 4 // 5 // Revision Control Information 6 // 7 // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v $ 8 // $Author: derrickoswald $ 9 // $Date: 2005/04/10 23:20:44 $ 10 // $Revision: 1.90 $ 11 // 12 // This library is free software; you can redistribute it and/or 13 // modify it under the terms of the GNU Lesser General Public 14 // License as published by the Free Software Foundation; either 15 // version 2.1 of the License, or (at your option) any later version. 16 // 17 // This library is distributed in the hope that it will be useful, 18 // but WITHOUT ANY WARRANTY; without even the implied warranty of 19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 // Lesser General Public License for more details. 21 // 22 // You should have received a copy of the GNU Lesser General Public 23 // License along with this library; if not, write to the Free Software 24 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 // 26 27 package org.htmlparser.scanners; 28 29 import java.util.Vector; 30 31 import org.htmlparser.Attribute; 32 import org.htmlparser.Node; 33 import org.htmlparser.Tag; 34 import org.htmlparser.lexer.Lexer; 35 import org.htmlparser.lexer.Page; 36 import org.htmlparser.scanners.Scanner; 37 import org.htmlparser.util.NodeList; 38 import org.htmlparser.util.ParserException; 39 40 /** 41 * The main scanning logic for nested tags. 42 * When asked to scan, this class gathers nodes into a heirarchy of tags. 43 */ 44 public class CompositeTagScanner extends TagScanner 45 { 46 /** 47 * Determine whether to use JVM or NodeList stack. 48 * This can be set to true to get the original behaviour of 49 * recursion into composite tags on the JVM stack. 50 * This may lead to StackOverFlowException problems in some cases 51 * i.e. Windows. 52 */ 53 private static final boolean mUseJVMStack = false; 54 55 /** 56 * Determine whether unexpected end tags should cause stack roll-up. 57 * This can be set to true to get the original behaviour of gathering 58 * end tags into whatever tag is open. 59 * This can be expensive, but should only be needed in the presence of 60 * bad HTML. 61 */ 62 private static final boolean mLeaveEnds = false; 63 64 /** 65 * Create a composite tag scanner. 66 */ 67 public CompositeTagScanner () 68 { 69 } 70 71 /** 72 * Collect the children. 73 * <p>An initial test is performed for an empty XML tag, in which case 74 * the start tag and end tag of the returned tag are the same and it has 75 * no children.<p> 76 * If it's not an empty XML tag, the lexer is repeatedly asked for 77 * subsequent nodes until an end tag is found or a node is encountered 78 * that matches the tag ender set or end tag ender set. 79 * In the latter case, a virtual end tag is created. 80 * Each node found that is not the end tag is added to 81 * the list of children. The end tag is special and not a child.<p> 82 * Nodes that also have a CompositeTagScanner as their scanner are 83 * recursed into, which provides the nested structure of an HTML page. 84 * This method operates in two possible modes, depending on a private boolean. 85 * It can recurse on the JVM stack, which has caused some overflow problems 86 * in the past, or it can use the supplied stack argument to nest scanning 87 * of child tags within itself. The former is left as an option in the code, 88 * mostly to help subsequent modifiers visualize what the internal nesting 89 * is doing. 90 * @param tag The tag this scanner is responsible for. 91 * @param lexer The source of subsequent nodes. 92 * @param stack The parse stack. May contain pending tags that enclose 93 * this tag. 94 * @return The resultant tag (may be unchanged). 95 */ 96 public Tag scan (Tag tag, Lexer lexer, NodeList stack) throws ParserException 97 { 98 Node node; 99 Tag next; 100 String name; 101 Scanner scanner; 102 Tag ret; 103 104 ret = tag; 105 106 if (ret.isEmptyXmlTag ()) 107 ret.setEndTag (ret); 108 else 109 do 110 { 111 node = lexer.nextNode (false); 112 if (null != node) 113 { 114 if (node instanceof Tag) 115 { 116 next = (Tag)node; 117 name = next.getTagName (); 118 // check for normal end tag 119 if (next.isEndTag () && name.equals (ret.getTagName ())) 120 { 121 ret.setEndTag (next); 122 node = null; 123 } 124 else if (isTagToBeEndedFor (ret, next)) // check DTD 125 { 126 // backup one node. insert a virtual end tag later 127 lexer.setPosition (next.getStartPosition ()); 128 node = null; 129 } 130 else if (!next.isEndTag ()) 131 { 132 // now recurse if there is a scanner for this type of tag 133 scanner = next.getThisScanner (); 134 if (null != scanner) 135 { 136 if (mUseJVMStack) 137 { // JVM stack recursion 138 node = scanner.scan (next, lexer, stack); 139 addChild (ret, node); 140 } 141 else 142 { 143 // fake recursion: 144 if (scanner == this) 145 { 146 if (next.isEmptyXmlTag ()) 147 { 148 next.setEndTag (next); 149 finishTag (next, lexer); 150 addChild (ret, next); 151 } 152 else 153 { 154 stack.add (ret); 155 ret = next; 156 } 157 } 158 else 159 { // normal recursion if switching scanners 160 node = scanner.scan (next, lexer, stack); 161 addChild (ret, node); 162 } 163 } 164 } 165 else 166 addChild (ret, next); 167 } 168 else 169 { 170 if (!mUseJVMStack && !mLeaveEnds) 171 { 172 // Since all non-end tags are consumed by the 173 // previous clause, we're here because we have an 174 // end tag with no opening tag... this could be bad. 175 // There are two cases... 176 // 1) The tag hasn't been registered, in which case 177 // we just add it as a simple child, like it's 178 // opening tag 179 // 2) There may be an opening tag further up the 180 // parse stack that needs closing. 181 // So, we ask the factory for a node like this one 182 // (since end tags never have scanners) and see 183 // if it's scanner is a composite tag scanner. 184 // If it is we walk up the parse stack looking for 185 // something that needs this end tag to finish it. 186 // If there is something, we close off all the tags 187 // walked over and continue on as if nothing 188 // happened. 189 Vector attributes = new Vector (); 190 attributes.addElement (new Attribute (name, null)); 191 Tag opener = lexer.getNodeFactory ().createTagNode ( 192 lexer.getPage (), next.getStartPosition (), next.getEndPosition (), 193 attributes); 194 195 scanner = opener.getThisScanner (); 196 if ((null != scanner) && (scanner == this)) 197 { 198 // uh-oh 199 int index = -1; 200 for (int i = stack.size () - 1; (-1 == index) && (i >= 0); i--) 201 { 202 // short circuit here... assume everything on the stack has this as it's scanner 203 // we'll need to stop if either of those conditions isn't met 204 Tag boffo = (Tag)stack.elementAt (i); 205 if (name.equals (boffo.getTagName ())) 206 index = i; 207 else if (isTagToBeEndedFor (boffo, next)) // check DTD 208 index = i; 209 } 210 if (-1 != index) 211 { 212 // finish off the current one first 213 finishTag (ret, lexer); 214 addChild ((Tag)stack.elementAt (stack.size () - 1), ret); 215 for (int i = stack.size () - 1; i > index; i--) 216 { 217 Tag fred = (Tag)stack.remove (i); 218 finishTag (fred, lexer); 219 addChild ((Tag)stack.elementAt (i - 1), fred); 220 } 221 ret = (Tag)stack.remove (index); 222 node = null; 223 } 224 else 225 addChild (ret, next); // default behaviour 226 } 227 else 228 addChild (ret, next); // default behaviour 229 } 230 else 231 addChild (ret, next); 232 } 233 } 234 else 235 { 236 addChild (ret, node); 237 node.doSemanticAction (); 238 } 239 } 240 241 if (!mUseJVMStack) 242 { 243 // handle coming out of fake recursion 244 if (null == node) 245 { 246 int depth = stack.size (); 247 if (0 != depth) 248 { 249 node = stack.elementAt (depth - 1); 250 if (node instanceof Tag) 251 { 252 Tag precursor = (Tag)node; 253 scanner = precursor.getThisScanner (); 254 if (scanner == this) 255 { 256 stack.remove (depth - 1); 257 finishTag (ret, lexer); 258 addChild (precursor, ret); 259 ret = precursor; 260 } 261 else 262 node = null; // normal recursion 263 } 264 else 265 node = null; // normal recursion 266 } 267 } 268 } 269 } 270 while (null != node); 271 272 finishTag (ret, lexer); 273 274 return (ret); 275 } 276 277 /** 278 * Add a child to the given tag. 279 * @param parent The parent tag. 280 * @param child The child node. 281 */ 282 protected void addChild (Tag parent, Node child) 283 { 284 if (null == parent.getChildren ()) 285 parent.setChildren (new NodeList ()); 286 child.setParent (parent); 287 parent.getChildren ().add (child); 288 } 289 290 /** 291 * Finish off a tag. 292 * Perhap add a virtual end tag. 293 * Set the end tag parent as this tag. 294 * Perform the semantic acton. 295 * @param tag The tag to finish off. 296 * @param lexer A lexer positioned at the end of the tag. 297 */ 298 protected void finishTag (Tag tag, Lexer lexer) 299 throws 300 ParserException 301 { 302 if (null == tag.getEndTag ()) 303 tag.setEndTag (createVirtualEndTag (tag, lexer, lexer.getPage (), lexer.getCursor ().getPosition ())); 304 tag.getEndTag ().setParent (tag); 305 tag.doSemanticAction (); 306 } 307 308 /** 309 * Creates an end tag with the same name as the given tag. 310 * @param tag The tag to end. 311 * @param lexer The object containg the node factory. 312 * @param page The page the tag is on (virtually). 313 * @param position The offset into the page at which the tag is to 314 * be anchored. 315 * @return An end tag with the name '"/" + tag.getTagName()' and a start 316 * and end position at the given position. The fact these positions are 317 * equal may be used to distinguish it as a virtual tag later on. 318 */ 319 protected Tag createVirtualEndTag (Tag tag, Lexer lexer, Page page, int position) 320 throws 321 ParserException 322 { 323 Tag ret; 324 String name; 325 Vector attributes; 326 327 name = "/" + tag.getRawTagName (); 328 attributes = new Vector (); 329 attributes.addElement (new Attribute (name, (String)null)); 330 ret = lexer.getNodeFactory ().createTagNode ( 331 page, position, position, attributes); 332 333 return (ret); 334 } 335 336 /** 337 * Determine if the current tag should be terminated by the given tag. 338 * Examines the 'enders' or 'end tag enders' lists of the current tag 339 * for a match with the given tag. Which list is chosen depends on whether 340 * tag is an end tag ('end tag enders') or not ('enders'). 341 * @param current The tag that might need to be ended. 342 * @param tag The candidate tag that might end the current one. 343 * @return <code>true</code> if the name of the given tag is a member of 344 * the appropriate list. 345 */ 346 public final boolean isTagToBeEndedFor (Tag current, Tag tag) 347 { 348 String name; 349 String[] ends; 350 boolean ret; 351 352 ret = false; 353 354 name = tag.getTagName (); 355 if (tag.isEndTag ()) 356 ends = current.getEndTagEnders (); 357 else 358 ends = current.getEnders (); 359 for (int i = 0; i < ends.length; i++) 360 if (name.equalsIgnoreCase (ends[i])) 361 { 362 ret = true; 363 break; 364 } 365 366 return (ret); 367 } 368 }