PrototypicalNodeFactory.java
1 // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML 2 // http://sourceforge.org/projects/htmlparser 3 // Copyright (C) 2003 Derrick Oswald 4 // 5 // Revision Control Information 6 // 7 // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/PrototypicalNodeFactory.java,v $ 8 // $Author: derrickoswald $ 9 // $Date: 2006/03/19 15:01:24 $ 10 // $Revision: 1.19 $ 11 // 12 // This library is free software; you can redistribute it and/or 13 // modify it under the terms of the GNU Lesser General Public 14 // License as published by the Free Software Foundation; either 15 // version 2.1 of the License, or (at your option) any later version. 16 // 17 // This library is distributed in the hope that it will be useful, 18 // but WITHOUT ANY WARRANTY; without even the implied warranty of 19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 // Lesser General Public License for more details. 21 // 22 // You should have received a copy of the GNU Lesser General Public 23 // License along with this library; if not, write to the Free Software 24 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 // 26 27 package org.htmlparser; 28 29 import java.io.Serializable; 30 import java.util.Hashtable; 31 import java.util.Locale; 32 import java.util.Map; 33 import java.util.Set; 34 import java.util.Vector; 35 36 import org.htmlparser.lexer.Page; 37 import org.htmlparser.nodes.TextNode; 38 import org.htmlparser.nodes.RemarkNode; 39 import org.htmlparser.nodes.TagNode; 40 import org.htmlparser.tags.AppletTag; 41 import org.htmlparser.tags.BaseHrefTag; 42 import org.htmlparser.tags.BodyTag; 43 import org.htmlparser.tags.Bullet; 44 import org.htmlparser.tags.BulletList; 45 import org.htmlparser.tags.DefinitionList; 46 import org.htmlparser.tags.DefinitionListBullet; 47 import org.htmlparser.tags.Div; 48 import org.htmlparser.tags.DoctypeTag; 49 import org.htmlparser.tags.FormTag; 50 import org.htmlparser.tags.FrameSetTag; 51 import org.htmlparser.tags.FrameTag; 52 import org.htmlparser.tags.HeadingTag; 53 import org.htmlparser.tags.HeadTag; 54 import org.htmlparser.tags.Html; 55 import org.htmlparser.tags.ImageTag; 56 import org.htmlparser.tags.InputTag; 57 import org.htmlparser.tags.JspTag; 58 import org.htmlparser.tags.LabelTag; 59 import org.htmlparser.tags.LinkTag; 60 import org.htmlparser.tags.MetaTag; 61 import org.htmlparser.tags.ObjectTag; 62 import org.htmlparser.tags.OptionTag; 63 import org.htmlparser.tags.ParagraphTag; 64 import org.htmlparser.tags.ProcessingInstructionTag; 65 import org.htmlparser.tags.ScriptTag; 66 import org.htmlparser.tags.SelectTag; 67 import org.htmlparser.tags.Span; 68 import org.htmlparser.tags.StyleTag; 69 import org.htmlparser.tags.TableColumn; 70 import org.htmlparser.tags.TableHeader; 71 import org.htmlparser.tags.TableRow; 72 import org.htmlparser.tags.TableTag; 73 import org.htmlparser.tags.TextareaTag; 74 import org.htmlparser.tags.TitleTag; 75 76 /** 77 * A node factory based on the prototype pattern. 78 * This factory uses the prototype pattern to generate new nodes. 79 * These are cloned as needed to form new {@link Text}, {@link Remark} and 80 * {@link Tag} nodes. 81 * <p>Text and remark nodes are generated from prototypes accessed 82 * via the {@link #setTextPrototype(Text) textPrototype} and 83 * {@link #setRemarkPrototype(Remark) remarkPrototype} properties respectively. 84 * Tag nodes are generated as follows: 85 * <p>Prototype tags, in the form of undifferentiated tags, are held in a hash 86 * table. On a request for a tag, the attributes are examined for the name 87 * of the tag to be created. If a prototype of that name has been registered 88 * (exists in the hash table), it is cloned and the clone is given the 89 * characteristics ({@link Attribute Attributes}, start and end position) 90 * of the requested tag.</p> 91 * <p>In the case that no tag has been registered under that name, 92 * a generic tag is created from the prototype acessed via the 93 * {@link #setTagPrototype(Tag) tagPrototype} property.</p> 94 * <p>The hash table of registered tags can be automatically populated with 95 * all the known tags from the {@link org.htmlparser.tags} package when 96 * the factory is constructed, or it can start out empty and be populated 97 * explicitly.</p> 98 * <p>Here is an example of how to override all text issued from 99 * {@link org.htmlparser.nodes.TextNode#toPlainTextString() 100 * Text.toPlainTextString()}, 101 * in this case decoding (converting character references), 102 * which illustrates the use of setting the text prototype: 103 * <pre> 104 * PrototypicalNodeFactory factory = new PrototypicalNodeFactory (); 105 * factory.setTextPrototype ( 106 * // create a inner class that is a subclass of TextNode 107 * new TextNode () { 108 * public String toPlainTextString() 109 * { 110 * String original = super.toPlainTextString (); 111 * return (org.htmlparser.util.Translate.decode (original)); 112 * } 113 * }); 114 * Parser parser = new Parser (); 115 * parser.setNodeFactory (factory); 116 * </pre></p> 117 * <p>Here is an example of using a custom link tag, in this case just 118 * printing the URL, which illustrates registering a tag: 119 * <pre> 120 * 121 * class PrintingLinkTag extends LinkTag 122 * { 123 * public void doSemanticAction () 124 * throws 125 * ParserException 126 * { 127 * System.out.println (getLink ()); 128 * } 129 * } 130 * PrototypicalNodeFactory factory = new PrototypicalNodeFactory (); 131 * factory.registerTag (new PrintingLinkTag ()); 132 * Parser parser = new Parser (); 133 * parser.setNodeFactory (factory); 134 * </pre></p> 135 */ 136 public class PrototypicalNodeFactory 137 implements 138 Serializable, 139 NodeFactory 140 { 141 /** 142 * The prototypical text node. 143 */ 144 protected Text mText; 145 146 /** 147 * The prototypical remark node. 148 */ 149 protected Remark mRemark; 150 151 /** 152 * The prototypical tag node. 153 */ 154 protected Tag mTag; 155 156 /** 157 * The list of tags to return. 158 * The list is keyed by tag name. 159 */ 160 protected Map mBlastocyst; 161 162 /** 163 * Create a new factory with all tags registered. 164 * Equivalent to 165 * {@link #PrototypicalNodeFactory() PrototypicalNodeFactory(false)}. 166 */ 167 public PrototypicalNodeFactory () 168 { 169 this (false); 170 } 171 172 /** 173 * Create a new factory. 174 * @param empty If <code>true</code>, creates an empty factory, 175 * otherwise create a new factory with all tags registered. 176 */ 177 public PrototypicalNodeFactory (boolean empty) 178 { 179 clear (); 180 mText = new TextNode (null, 0, 0); 181 mRemark = new RemarkNode (null, 0, 0); 182 mTag = new TagNode (null, 0, 0, null); 183 if (!empty) 184 registerTags (); 185 } 186 187 /** 188 * Create a new factory with the given tag as the only registered tag. 189 * @param tag The single tag to register in the otherwise empty factory. 190 */ 191 public PrototypicalNodeFactory (Tag tag) 192 { 193 this (true); 194 registerTag (tag); 195 } 196 197 /** 198 * Create a new factory with the given tags registered. 199 * @param tags The tags to register in the otherwise empty factory. 200 */ 201 public PrototypicalNodeFactory (Tag[] tags) 202 { 203 this (true); 204 for (int i = 0; i < tags.length; i++) 205 registerTag (tags[i]); 206 } 207 208 /** 209 * Adds a tag to the registry. 210 * @param id The name under which to register the tag. 211 * <strong>For proper operation, the id should be uppercase so it 212 * will be matched by a Map lookup.</strong> 213 * @param tag The tag to be returned from a {@link #createTagNode} call. 214 * @return The tag previously registered with that id if any, 215 * or <code>null</code> if none. 216 */ 217 public Tag put (String id, Tag tag) 218 { 219 return ((Tag)mBlastocyst.put (id, tag)); 220 } 221 222 /** 223 * Gets a tag from the registry. 224 * @param id The name of the tag to return. 225 * @return The tag registered under the <code>id</code> name, 226 * or <code>null</code> if none. 227 */ 228 public Tag get (String id) 229 { 230 return ((Tag)mBlastocyst.get (id)); 231 } 232 233 /** 234 * Remove a tag from the registry. 235 * @param id The name of the tag to remove. 236 * @return The tag that was registered with that <code>id</code>, 237 * or <code>null</code> if none. 238 */ 239 public Tag remove (String id) 240 { 241 return ((Tag)mBlastocyst.remove (id)); 242 } 243 244 /** 245 * Clean out the registry. 246 */ 247 public void clear () 248 { 249 mBlastocyst = new Hashtable (); 250 } 251 252 /** 253 * Get the list of tag names. 254 * @return The names of the tags currently registered. 255 */ 256 public Set getTagNames () 257 { 258 return (mBlastocyst.keySet ()); 259 } 260 261 /** 262 * Register a tag. 263 * Registers the given tag under every {@link Tag#getIds() id} that the 264 * tag has (i.e. all names returned by {@link Tag#getIds() tag.getIds()}. 265 * <p><strong>For proper operation, the ids are converted to uppercase so 266 * they will be matched by a Map lookup.</strong> 267 * @param tag The tag to register. 268 */ 269 public void registerTag (Tag tag) 270 { 271 String[] ids; 272 273 ids = tag.getIds (); 274 for (int i = 0; i < ids.length; i++) 275 put (ids[i].toUpperCase (Locale.ENGLISH), tag); 276 } 277 278 /** 279 * Unregister a tag. 280 * Unregisters the given tag from every {@link Tag#getIds() id} the tag has. 281 * <p><strong>The ids are converted to uppercase to undo the operation 282 * of registerTag.</strong> 283 * @param tag The tag to unregister. 284 */ 285 public void unregisterTag (Tag tag) 286 { 287 String[] ids; 288 289 ids = tag.getIds (); 290 for (int i = 0; i < ids.length; i++) 291 remove (ids[i].toUpperCase (Locale.ENGLISH)); 292 } 293 294 /** 295 * Register all known tags in the tag package. 296 * Registers tags from the {@link org.htmlparser.tags tag package} by 297 * calling {@link #registerTag(Tag) registerTag()}. 298 * @return 'this' nodefactory as a convenience. 299 */ 300 public PrototypicalNodeFactory registerTags () 301 { 302 registerTag (new AppletTag ()); 303 registerTag (new BaseHrefTag ()); 304 registerTag (new Bullet ()); 305 registerTag (new BulletList ()); 306 registerTag (new DefinitionList ()); 307 registerTag (new DefinitionListBullet ()); 308 registerTag (new DoctypeTag ()); 309 registerTag (new FormTag ()); 310 registerTag (new FrameSetTag ()); 311 registerTag (new FrameTag ()); 312 registerTag (new HeadingTag ()); 313 registerTag (new ImageTag ()); 314 registerTag (new InputTag ()); 315 registerTag (new JspTag ()); 316 registerTag (new LabelTag ()); 317 registerTag (new LinkTag ()); 318 registerTag (new MetaTag ()); 319 registerTag (new ObjectTag ()); 320 registerTag (new OptionTag ()); 321 registerTag (new ParagraphTag ()); 322 registerTag (new ProcessingInstructionTag ()); 323 registerTag (new ScriptTag ()); 324 registerTag (new SelectTag ()); 325 registerTag (new StyleTag ()); 326 registerTag (new TableColumn ()); 327 registerTag (new TableHeader ()); 328 registerTag (new TableRow ()); 329 registerTag (new TableTag ()); 330 registerTag (new TextareaTag ()); 331 registerTag (new TitleTag ()); 332 registerTag (new Div ()); 333 registerTag (new Span ()); 334 registerTag (new BodyTag ()); 335 registerTag (new HeadTag ()); 336 registerTag (new Html ()); 337 338 339 return (this); 340 } 341 342 /** 343 * Get the object that is cloned to generate text nodes. 344 * @return The prototype for {@link Text} nodes. 345 * @see #setTextPrototype 346 */ 347 public Text getTextPrototype () 348 { 349 return (mText); 350 } 351 352 /** 353 * Set the object to be used to generate text nodes. 354 * @param text The prototype for {@link Text} nodes. 355 * If <code>null</code> the prototype is set to the default 356 * ({@link TextNode}). 357 * @see #getTextPrototype 358 */ 359 public void setTextPrototype (Text text) 360 { 361 if (null == text) 362 mText = new TextNode (null, 0, 0); 363 else 364 mText = text; 365 } 366 367 /** 368 * Get the object that is cloned to generate remark nodes. 369 * @return The prototype for {@link Remark} nodes. 370 * @see #setRemarkPrototype 371 */ 372 public Remark getRemarkPrototype () 373 { 374 return (mRemark); 375 } 376 377 /** 378 * Set the object to be used to generate remark nodes. 379 * @param remark The prototype for {@link Remark} nodes. 380 * If <code>null</code> the prototype is set to the default 381 * ({@link RemarkNode}). 382 * @see #getRemarkPrototype 383 */ 384 public void setRemarkPrototype (Remark remark) 385 { 386 if (null == remark) 387 mRemark = new RemarkNode (null, 0, 0); 388 else 389 mRemark = remark; 390 } 391 392 /** 393 * Get the object that is cloned to generate tag nodes. 394 * Clones of this object are returned from {@link #createTagNode} when no 395 * specific tag is found in the list of registered tags. 396 * @return The prototype for {@link Tag} nodes. 397 * @see #setTagPrototype 398 */ 399 public Tag getTagPrototype () 400 { 401 return (mTag); 402 } 403 404 /** 405 * Set the object to be used to generate tag nodes. 406 * Clones of this object are returned from {@link #createTagNode} when no 407 * specific tag is found in the list of registered tags. 408 * @param tag The prototype for {@link Tag} nodes. 409 * If <code>null</code> the prototype is set to the default 410 * ({@link TagNode}). 411 * @see #getTagPrototype 412 */ 413 public void setTagPrototype (Tag tag) 414 { 415 if (null == tag) 416 mTag = new TagNode (null, 0, 0, null); 417 else 418 mTag = tag; 419 } 420 421 // 422 // NodeFactory interface 423 // 424 425 /** 426 * Create a new string node. 427 * @param page The page the node is on. 428 * @param start The beginning position of the string. 429 * @param end The ending position of the string. 430 * @return A text node comprising the indicated characters from the page. 431 */ 432 public Text createStringNode (Page page, int start, int end) 433 { 434 Text ret; 435 436 try 437 { 438 ret = (Text)(getTextPrototype ().clone ()); 439 ret.setPage (page); 440 ret.setStartPosition (start); 441 ret.setEndPosition (end); 442 } 443 catch (CloneNotSupportedException cnse) 444 { 445 ret = new TextNode (page, start, end); 446 } 447 448 return (ret); 449 } 450 451 /** 452 * Create a new remark node. 453 * @param page The page the node is on. 454 * @param start The beginning position of the remark. 455 * @param end The ending positiong of the remark. 456 * @return A remark node comprising the indicated characters from the page. 457 */ 458 public Remark createRemarkNode (Page page, int start, int end) 459 { 460 Remark ret; 461 462 try 463 { 464 ret = (Remark)(getRemarkPrototype ().clone ()); 465 ret.setPage (page); 466 ret.setStartPosition (start); 467 ret.setEndPosition (end); 468 } 469 catch (CloneNotSupportedException cnse) 470 { 471 ret = new RemarkNode (page, start, end); 472 } 473 474 return (ret); 475 } 476 477 /** 478 * Create a new tag node. 479 * Note that the attributes vector contains at least one element, 480 * which is the tag name (standalone attribute) at position zero. 481 * This can be used to decide which type of node to create, or 482 * gate other processing that may be appropriate. 483 * @param page The page the node is on. 484 * @param start The beginning position of the tag. 485 * @param end The ending positiong of the tag. 486 * @param attributes The attributes contained in this tag. 487 * @return A tag node comprising the indicated characters from the page. 488 */ 489 public Tag createTagNode (Page page, int start, int end, Vector attributes) 490 { 491 Attribute attribute; 492 String id; 493 Tag prototype; 494 Tag ret; 495 496 ret = null; 497 498 if (0 != attributes.size ()) 499 { 500 attribute = (Attribute)attributes.elementAt (0); 501 id = attribute.getName (); 502 if (null != id) 503 { 504 try 505 { 506 id = id.toUpperCase (Locale.ENGLISH); 507 if (!id.startsWith ("/")) 508 { 509 if (id.endsWith ("/")) 510 id = id.substring (0, id.length () - 1); 511 prototype = (Tag)mBlastocyst.get (id); 512 if (null != prototype) 513 { 514 ret = (Tag)prototype.clone (); 515 ret.setPage (page); 516 ret.setStartPosition (start); 517 ret.setEndPosition (end); 518 ret.setAttributesEx (attributes); 519 } 520 } 521 } 522 catch (CloneNotSupportedException cnse) 523 { 524 // default to creating a generic one 525 } 526 } 527 } 528 if (null == ret) 529 { // generate a generic node 530 try 531 { 532 ret = (Tag)getTagPrototype ().clone (); 533 ret.setPage (page); 534 ret.setStartPosition (start); 535 ret.setEndPosition (end); 536 ret.setAttributesEx (attributes); 537 } 538 catch (CloneNotSupportedException cnse) 539 { 540 ret = new TagNode (page, start, end, attributes); 541 } 542 } 543 544 return (ret); 545 } 546 }