ScriptScanner.java
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v $
// $Author: derrickoswald $
// $Date: 2005/03/12 17:53:10 $
// $Revision: 1.63 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.scanners;

import java.util.Vector;

import org.htmlparser.Attribute;
import org.htmlparser.Node;
import org.htmlparser.NodeFactory;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.lexer.Cursor;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.scanners.ScriptDecoder;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Scanner for the CDATA content of script tags.
 */
public class ScriptScanner
    extends
        CompositeTagScanner
{
    /**
     * Selects strict or lenient CDATA parsing.
     * When <code>true</code>, script is parsed without regard to quotes,
     * so that erroneous script such as:
     * <pre>
     * document.write("&lt;/script&gt;");
     * </pre>
     * is handled in strict accordance with appendix
     * <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data">
     * B.3.2 Specifying non-HTML data</a> of the
     * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>
     * and is therefore split into two or more nodes. Correct javascript
     * escapes the ETAGO instead:
     * <pre>
     * document.write("&lt;\/script&gt;");
     * </pre>
     * In other words: if <code>true</code>, CDATA parsing stops at the first
     * ETAGO ("&lt;/") whether or not it is quoted; if <code>false</code>,
     * balanced quotes (single or double) shield an ETAGO. Because quotes may
     * also appear inside single or multiline comments, those are parsed too.
     * Non-strict handling is what most users want, given how much broken
     * script exists in the wild.
     */
    public static boolean STRICT = false;

    /**
     * Create a script scanner.
     */
    public ScriptScanner ()
    {
    }

    /**
     * Scan for script.
     * Accumulates text from the page, until &lt;/[a-zA-Z] is encountered,
     * then attaches that text and an end tag to the given tag.
     * If the node following the CDATA is not an end tag whose name equals
     * the first id of <code>tag</code>, the lexer is rewound to just after
     * the CDATA and a zero-length "/script" end tag is created instead.
     * @param tag The tag this scanner is responsible for.
     * @param lexer The source of CDATA.
     * @param stack The parse stack, <em>not used</em>.
     * @return The <code>tag</code> argument, with its end tag and (when
     * CDATA was found) its children set.
     * @throws ParserException Propagated from the lexer, decoder or node
     * factory calls made here.
     */
    public Tag scan (Tag tag, Lexer lexer, NodeList stack)
        throws
            ParserException
    {
        // For the two ".Encode" languages, hand the page and cursor to
        // ScriptDecoder before the CDATA is parsed and keep its result
        // as the tag's script code.
        if (tag instanceof ScriptTag)
        {
            ScriptTag script = (ScriptTag)tag;
            String language = script.getLanguage ();
            if (null != language)
            {
                boolean encoded = language.equalsIgnoreCase ("JScript.Encode")
                    || language.equalsIgnoreCase ("VBScript.Encode");
                if (encoded)
                {
                    script.setScriptCode (
                        ScriptDecoder.Decode (lexer.getPage (), lexer.getCursor ()));
                }
            }
        }

        // STRICT is negated on the way into the lexer.
        Node body = lexer.parseCDATA (!STRICT);
        // Remember where the CDATA ended so the lexer can be rewound.
        int mark = lexer.getPosition ();
        Node closer = lexer.nextNode (false);

        boolean matches = (closer instanceof Tag)
            && ((Tag)closer).isEndTag ()
            && ((Tag)closer).getTagName ().equals (tag.getIds ()[0]);
        if (!matches)
        {
            // Whatever was read is not our end tag: un-read it (if there
            // was anything) and build an end tag at the marked position.
            if (null != closer)
            {
                lexer.setPosition (mark);
            }
            Attribute name = new Attribute ("/script", null);
            Vector attributes = new Vector ();
            attributes.addElement (name);
            closer = lexer.getNodeFactory ().createTagNode (
                lexer.getPage (), mark, mark, attributes);
        }

        tag.setEndTag ((Tag)closer);
        if (null != body)
        {
            tag.setChildren (new NodeList (body));
            body.setParent (tag);
        }
        closer.setParent (tag);
        tag.doSemanticAction ();

        return (tag);
    }
}