/ org.htmlparser / src / org / htmlparser / scanners / CompositeTagScanner.java
CompositeTagScanner.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2003 Somik Raha
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2005/04/10 23:20:44 $
 10  // $Revision: 1.90 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser.scanners;
 28  
 29  import java.util.Vector;
 30  
 31  import org.htmlparser.Attribute;
 32  import org.htmlparser.Node;
 33  import org.htmlparser.Tag;
 34  import org.htmlparser.lexer.Lexer;
 35  import org.htmlparser.lexer.Page;
 36  import org.htmlparser.scanners.Scanner;
 37  import org.htmlparser.util.NodeList;
 38  import org.htmlparser.util.ParserException;
 39  
 40  /**
 41   * The main scanning logic for nested tags.
 42   * When asked to scan, this class gathers nodes into a heirarchy of tags.
 43   */
 44  public class CompositeTagScanner extends TagScanner
 45  {
 46      /**
 47       * Determine whether to use JVM or NodeList stack.
 48       * This can be set to true to get the original behaviour of
 49       * recursion into composite tags on the JVM stack.
 50       * This may lead to StackOverFlowException problems in some cases
 51       * i.e. Windows.
 52       */
 53      private static final boolean mUseJVMStack = false;
 54  
 55      /**
 56       * Determine whether unexpected end tags should cause stack roll-up.
 57       * This can be set to true to get the original behaviour of gathering
 58       * end tags into whatever tag is open.
 59       * This can be expensive, but should only be needed in the presence of
 60       * bad HTML.
 61       */
 62      private static final boolean mLeaveEnds = false;
 63  
 64      /**
 65       * Create a composite tag scanner.
 66       */
 67      public CompositeTagScanner ()
 68      {
 69      }
 70  
 71      /**
 72       * Collect the children.
 73       * <p>An initial test is performed for an empty XML tag, in which case
 74       * the start tag and end tag of the returned tag are the same and it has
 75       * no children.<p>
 76       * If it's not an empty XML tag, the lexer is repeatedly asked for
 77       * subsequent nodes until an end tag is found or a node is encountered
 78       * that matches the tag ender set or end tag ender set.
 79       * In the latter case, a virtual end tag is created.
 80       * Each node found that is not the end tag is added to
 81       * the list of children. The end tag is special and not a child.<p>
 82       * Nodes that also have a CompositeTagScanner as their scanner are
 83       * recursed into, which provides the nested structure of an HTML page.
 84       * This method operates in two possible modes, depending on a private boolean.
 85       * It can recurse on the JVM stack, which has caused some overflow problems
 86       * in the past, or it can use the supplied stack argument to nest scanning
 87       * of child tags within itself. The former is left as an option in the code,
 88       * mostly to help subsequent modifiers visualize what the internal nesting
 89       * is doing.
 90       * @param tag The tag this scanner is responsible for.
 91       * @param lexer The source of subsequent nodes.
 92       * @param stack The parse stack. May contain pending tags that enclose
 93       * this tag.
 94       * @return The resultant tag (may be unchanged).
 95       */
 96      public Tag scan (Tag tag, Lexer lexer, NodeList stack) throws ParserException
 97      {
 98          Node node;
 99          Tag next;
100          String name;
101          Scanner scanner;
102          Tag ret;
103          
104          ret = tag;
105  
106          if (ret.isEmptyXmlTag ())
107              ret.setEndTag (ret);
108          else
109              do
110              {
111                  node = lexer.nextNode (false);
112                  if (null != node)
113                  {
114                      if (node instanceof Tag)
115                      {
116                          next = (Tag)node;
117                          name = next.getTagName ();
118                          // check for normal end tag
119                          if (next.isEndTag () && name.equals (ret.getTagName ()))
120                          {
121                              ret.setEndTag (next);
122                              node = null;
123                          }
124                          else if (isTagToBeEndedFor (ret, next)) // check DTD
125                          {
126                              // backup one node. insert a virtual end tag later
127                              lexer.setPosition (next.getStartPosition ());
128                              node = null;
129                          }
130                          else if (!next.isEndTag ())
131                          {
132                              // now recurse if there is a scanner for this type of tag
133                              scanner = next.getThisScanner ();
134                              if (null != scanner)
135                              {
136                                  if (mUseJVMStack)
137                                  {   // JVM stack recursion
138                                      node = scanner.scan (next, lexer, stack);
139                                      addChild (ret, node);
140                                  }
141                                  else
142                                  {
143                                      // fake recursion:
144                                      if (scanner == this)
145                                      {
146                                          if (next.isEmptyXmlTag ())
147                                          {
148                                              next.setEndTag (next);
149                                              finishTag (next, lexer);
150                                              addChild (ret, next);
151                                          }
152                                          else
153                                          {
154                                              stack.add (ret);
155                                              ret = next;
156                                          }
157                                      }
158                                      else
159                                      {   // normal recursion if switching scanners
160                                          node = scanner.scan (next, lexer, stack);
161                                          addChild (ret, node);
162                                      }
163                                  }
164                              }
165                              else
166                                  addChild (ret, next);
167                          }
168                          else
169                          {
170                              if (!mUseJVMStack && !mLeaveEnds)
171                              {
172                                  // Since all non-end tags are consumed by the
173                                  // previous clause, we're here because we have an
174                                  // end tag with no opening tag... this could be bad.
175                                  // There are two cases...
176                                  // 1) The tag hasn't been registered, in which case
177                                  // we just add it as a simple child, like it's
178                                  // opening tag
179                                  // 2) There may be an opening tag further up the
180                                  // parse stack that needs closing.
181                                  // So, we ask the factory for a node like this one
182                                  // (since end tags never have scanners) and see
183                                  // if it's scanner is a composite tag scanner.
184                                  // If it is we walk up the parse stack looking for
185                                  // something that needs this end tag to finish it.
186                                  // If there is something, we close off all the tags
187                                  // walked over and continue on as if nothing
188                                  // happened.
189                                  Vector attributes = new Vector ();
190                                  attributes.addElement (new Attribute (name, null));
191                                  Tag opener = lexer.getNodeFactory ().createTagNode (
192                                      lexer.getPage (), next.getStartPosition (), next.getEndPosition (),
193                                      attributes);
194  
195                                  scanner = opener.getThisScanner ();
196                                  if ((null != scanner) && (scanner == this))
197                                  {
198                                      // uh-oh
199                                      int index = -1;
200                                      for (int i = stack.size () - 1; (-1 == index) && (i >= 0); i--)
201                                      {
202                                          // short circuit here... assume everything on the stack has this as it's scanner
203                                          // we'll need to stop if either of those conditions isn't met
204                                          Tag boffo = (Tag)stack.elementAt (i);
205                                          if (name.equals (boffo.getTagName ()))
206                                              index = i;
207                                          else if (isTagToBeEndedFor (boffo, next)) // check DTD
208                                              index = i;
209                                      }
210                                      if (-1 != index)
211                                      {
212                                          // finish off the current one first
213                                          finishTag (ret, lexer);
214                                          addChild ((Tag)stack.elementAt (stack.size () - 1), ret);
215                                          for (int i = stack.size () - 1; i > index; i--)
216                                          {
217                                              Tag fred = (Tag)stack.remove (i);
218                                              finishTag (fred, lexer);
219                                              addChild ((Tag)stack.elementAt (i - 1), fred);
220                                          }
221                                          ret = (Tag)stack.remove (index);
222                                          node = null;
223                                      }
224                                      else
225                                          addChild (ret, next); // default behaviour
226                                  }
227                                  else
228                                      addChild (ret, next); // default behaviour
229                              }
230                              else
231                                  addChild (ret, next);
232                          }
233                      }
234                      else
235                      {
236                          addChild (ret, node);
237                          node.doSemanticAction ();
238                      }
239                  }
240  
241                  if (!mUseJVMStack)
242                  {
243                      // handle coming out of fake recursion
244                      if (null == node)
245                      {
246                          int depth = stack.size ();
247                          if (0 != depth)
248                          {
249                              node = stack.elementAt (depth - 1);
250                              if (node instanceof Tag)
251                              {
252                                  Tag precursor = (Tag)node;
253                                  scanner = precursor.getThisScanner ();
254                                  if (scanner == this)
255                                  {
256                                      stack.remove (depth - 1);
257                                      finishTag (ret, lexer);
258                                      addChild (precursor, ret);
259                                      ret = precursor;
260                                  }
261                                  else
262                                      node = null; // normal recursion
263                              }
264                              else
265                                  node = null; // normal recursion
266                          }
267                      }
268                  }
269              }
270              while (null != node);
271  
272          finishTag (ret, lexer);
273  
274          return (ret);
275      }
276  
277      /**
278       * Add a child to the given tag.
279       * @param parent The parent tag.
280       * @param child The child node.
281       */
282      protected void addChild (Tag parent, Node child)
283      {
284          if (null == parent.getChildren ())
285              parent.setChildren (new NodeList ());
286          child.setParent (parent);
287          parent.getChildren ().add (child);
288      }
289  
290      /**
291       * Finish off a tag.
292       * Perhap add a virtual end tag.
293       * Set the end tag parent as this tag.
294       * Perform the semantic acton.
295       * @param tag The tag to finish off.
296       * @param lexer A lexer positioned at the end of the tag.
297       */
298      protected void finishTag (Tag tag, Lexer lexer)
299          throws
300              ParserException
301      {
302          if (null == tag.getEndTag ())
303              tag.setEndTag (createVirtualEndTag (tag, lexer, lexer.getPage (), lexer.getCursor ().getPosition ()));
304          tag.getEndTag ().setParent (tag);
305          tag.doSemanticAction ();
306      }
307  
308      /**
309       * Creates an end tag with the same name as the given tag.
310       * @param tag The tag to end.
311       * @param lexer The object containg the node factory.
312       * @param page The page the tag is on (virtually).
313       * @param position The offset into the page at which the tag is to
314       * be anchored.
315       * @return An end tag with the name '"/" + tag.getTagName()' and a start
316       * and end position at the given position. The fact these positions are
317       * equal may be used to distinguish it as a virtual tag later on.
318       */
319      protected Tag createVirtualEndTag (Tag tag, Lexer lexer, Page page, int position)
320          throws
321              ParserException
322      {
323          Tag ret;
324          String name;
325          Vector attributes;
326          
327          name = "/" + tag.getRawTagName ();
328          attributes = new Vector ();
329          attributes.addElement (new Attribute (name, (String)null));
330          ret = lexer.getNodeFactory ().createTagNode (
331                                      page, position, position, attributes);
332          
333          return (ret);
334      }
335  
336      /**
337       * Determine if the current tag should be terminated by the given tag.
338       * Examines the 'enders' or 'end tag enders' lists of the current tag
339       * for a match with the given tag. Which list is chosen depends on whether
340       * tag is an end tag ('end tag enders') or not ('enders').
341       * @param current The tag that might need to be ended.
342       * @param tag The candidate tag that might end the current one.
343       * @return <code>true</code> if the name of the given tag is a member of
344       * the appropriate list.
345       */
346      public final boolean isTagToBeEndedFor (Tag current, Tag tag)
347      {
348          String name;
349          String[] ends;
350          boolean ret;
351  
352          ret = false;
353  
354          name = tag.getTagName ();
355          if (tag.isEndTag ())
356              ends = current.getEndTagEnders ();
357          else
358              ends = current.getEnders ();
359          for (int i = 0; i < ends.length; i++)
360              if (name.equalsIgnoreCase (ends[i]))
361              {
362                  ret = true;
363                  break;
364              }
365          
366          return (ret);
367      }
368  }