/ org.htmlparser / src / org / htmlparser / scanners / ScriptScanner.java
ScriptScanner.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2003 Somik Raha
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2005/03/12 17:53:10 $
 10  // $Revision: 1.63 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser.scanners;
 28  
 29  import java.util.Vector;
 30  
 31  import org.htmlparser.Attribute;
 32  import org.htmlparser.Node;
 33  import org.htmlparser.NodeFactory;
 34  import org.htmlparser.PrototypicalNodeFactory;
 35  import org.htmlparser.Remark;
 36  import org.htmlparser.Tag;
 37  import org.htmlparser.Text;
 38  import org.htmlparser.lexer.Cursor;
 39  import org.htmlparser.lexer.Lexer;
 40  import org.htmlparser.lexer.Page;
 41  import org.htmlparser.scanners.ScriptDecoder;
 42  import org.htmlparser.tags.ScriptTag;
 43  import org.htmlparser.util.NodeList;
 44  import org.htmlparser.util.ParserException;
 45  
 46  /**
 47   * The ScriptScanner handles script CDATA.
 48   */
 49  public class ScriptScanner
 50      extends
 51          CompositeTagScanner
 52  {
 53      /**
 54       * Strict parsing of CDATA flag.
 55       * If this flag is set true, the parsing of script is performed without
 56       * regard to quotes. This means that erroneous script such as:
 57       * <pre>
 58       * document.write("&lt;/script&gt");
 59       * </pre>
 60       * will be parsed in strict accordance with appendix
 61       * <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data">
 62       * B.3.2 Specifying non-HTML data</a> of the
 63       * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a> and
 64       * hence will be split into two or more nodes. Correct javascript would
 65       * escape the ETAGO:
 66       * <pre>
 67       * document.write("&lt;\/script&gt");
 68       * </pre>
 69       * If true, CDATA parsing will stop at the first ETAGO ("&lt;/") no matter
 70       * whether it is quoted or not. If false, balanced quotes (either single or
 71       * double) will shield an ETAGO. Beacuse of the possibility of quotes within
 72       * single or multiline comments, these are also parsed. In most cases,
 73       * users prefer non-strict handling since there is so much broken script
 74       * out in the wild.
 75       */
 76      public static boolean STRICT = false;
 77  
 78      /**
 79       * Create a script scanner.
 80       */
 81      public ScriptScanner()
 82      {
 83      }
 84  
 85      /**
 86       * Scan for script.
 87       * Accumulates text from the page, until &lt;/[a-zA-Z] is encountered.
 88       * @param tag The tag this scanner is responsible for.
 89       * @param lexer The source of CDATA.
 90       * @param stack The parse stack, <em>not used</em>.
 91       */
 92      public Tag scan (Tag tag, Lexer lexer, NodeList stack)
 93          throws ParserException
 94      {
 95          String language;
 96          String code;
 97          Node content;
 98          int position;
 99          Node node;
100          Attribute attribute;
101          Vector vector;
102  
103          if (tag instanceof ScriptTag)
104          {
105              language = ((ScriptTag)tag).getLanguage ();
106              if ((null != language) &&
107                  (language.equalsIgnoreCase ("JScript.Encode") ||
108                   language.equalsIgnoreCase ("VBScript.Encode")))
109              {
110                  code = ScriptDecoder.Decode (lexer.getPage (), lexer.getCursor ());
111                  ((ScriptTag)tag).setScriptCode (code);
112              }
113          }
114          content = lexer.parseCDATA (!STRICT);
115          position = lexer.getPosition ();
116          node = lexer.nextNode (false);
117          if (null != node)
118              if (!(node instanceof Tag) || !(   ((Tag)node).isEndTag ()
119                  && ((Tag)node).getTagName ().equals (tag.getIds ()[0])))
120              {
121                  lexer.setPosition (position);
122                  node = null;
123              }
124  
125          // build new end tag if required
126          if (null == node)
127          {
128              attribute = new Attribute ("/script", null);
129              vector = new Vector ();
130              vector.addElement (attribute);
131              node = lexer.getNodeFactory ().createTagNode (
132                  lexer.getPage (), position, position, vector);
133          }
134          tag.setEndTag ((Tag)node);
135          if (null != content)
136          {
137              tag.setChildren (new NodeList (content));
138              content.setParent (tag);
139          }
140          node.setParent (tag);
141          tag.doSemanticAction ();
142  
143          return (tag);
144      }
145  }