/ org.htmlparser / src / org / htmlparser / PrototypicalNodeFactory.java
PrototypicalNodeFactory.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2003 Derrick Oswald
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/PrototypicalNodeFactory.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2006/03/19 15:01:24 $
 10  // $Revision: 1.19 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser;
 28  
 29  import java.io.Serializable;
 30  import java.util.Hashtable;
 31  import java.util.Locale;
 32  import java.util.Map;
 33  import java.util.Set;
 34  import java.util.Vector;
 35  
 36  import org.htmlparser.lexer.Page;
 37  import org.htmlparser.nodes.TextNode;
 38  import org.htmlparser.nodes.RemarkNode;
 39  import org.htmlparser.nodes.TagNode;
 40  import org.htmlparser.tags.AppletTag;
 41  import org.htmlparser.tags.BaseHrefTag;
 42  import org.htmlparser.tags.BodyTag;
 43  import org.htmlparser.tags.Bullet;
 44  import org.htmlparser.tags.BulletList;
 45  import org.htmlparser.tags.DefinitionList;
 46  import org.htmlparser.tags.DefinitionListBullet;
 47  import org.htmlparser.tags.Div;
 48  import org.htmlparser.tags.DoctypeTag;
 49  import org.htmlparser.tags.FormTag;
 50  import org.htmlparser.tags.FrameSetTag;
 51  import org.htmlparser.tags.FrameTag;
 52  import org.htmlparser.tags.HeadingTag;
 53  import org.htmlparser.tags.HeadTag;
 54  import org.htmlparser.tags.Html;
 55  import org.htmlparser.tags.ImageTag;
 56  import org.htmlparser.tags.InputTag;
 57  import org.htmlparser.tags.JspTag;
 58  import org.htmlparser.tags.LabelTag;
 59  import org.htmlparser.tags.LinkTag;
 60  import org.htmlparser.tags.MetaTag;
 61  import org.htmlparser.tags.ObjectTag;
 62  import org.htmlparser.tags.OptionTag;
 63  import org.htmlparser.tags.ParagraphTag;
 64  import org.htmlparser.tags.ProcessingInstructionTag;
 65  import org.htmlparser.tags.ScriptTag;
 66  import org.htmlparser.tags.SelectTag;
 67  import org.htmlparser.tags.Span;
 68  import org.htmlparser.tags.StyleTag;
 69  import org.htmlparser.tags.TableColumn;
 70  import org.htmlparser.tags.TableHeader;
 71  import org.htmlparser.tags.TableRow;
 72  import org.htmlparser.tags.TableTag;
 73  import org.htmlparser.tags.TextareaTag;
 74  import org.htmlparser.tags.TitleTag;
 75  
 76  /**
 77   * A node factory based on the prototype pattern.
 78   * This factory uses the prototype pattern to generate new nodes.
 79   * These are cloned as needed to form new {@link Text}, {@link Remark} and
 80   * {@link Tag} nodes.
 81   * <p>Text and remark nodes are generated from prototypes accessed
 82   * via the {@link #setTextPrototype(Text) textPrototype} and
 83   * {@link #setRemarkPrototype(Remark) remarkPrototype} properties respectively.
 84   * Tag nodes are generated as follows:
 85   * <p>Prototype tags, in the form of undifferentiated tags, are held in a hash
 86   * table. On a request for a tag, the attributes are examined for the name
 87   * of the tag to be created. If a prototype of that name has been registered
 88   * (exists in the hash table), it is cloned and the clone is given the
 89   * characteristics ({@link Attribute Attributes}, start and end position)
 90   * of the requested tag.</p>
 91   * <p>In the case that no tag has been registered under that name,
 92   * a generic tag is created from the prototype acessed via the
 93   * {@link #setTagPrototype(Tag) tagPrototype} property.</p>
 94   * <p>The hash table of registered tags can be automatically populated with
 95   * all the known tags from the {@link org.htmlparser.tags} package when
 96   * the factory is constructed, or it can start out empty and be populated
 97   * explicitly.</p>
 98   * <p>Here is an example of how to override all text issued from
 99   * {@link org.htmlparser.nodes.TextNode#toPlainTextString()
100   * Text.toPlainTextString()},
101   * in this case decoding (converting character references),
102   * which illustrates the use of setting the text prototype:
103   * <pre>
104   * PrototypicalNodeFactory factory = new PrototypicalNodeFactory ();
105   * factory.setTextPrototype (
106   *     // create a inner class that is a subclass of TextNode
107   *     new TextNode () {
108   *         public String toPlainTextString()
109   *         {
110   *             String original = super.toPlainTextString ();
111   *             return (org.htmlparser.util.Translate.decode (original));
112   *         }
113   *     });
114   * Parser parser = new Parser ();
115   * parser.setNodeFactory (factory);
116   * </pre></p>
117   * <p>Here is an example of using a custom link tag, in this case just
118   * printing the URL, which illustrates registering a tag:
119   * <pre>
120   *
121   * class PrintingLinkTag extends LinkTag
122   * {
123   *     public void doSemanticAction ()
124   *         throws
125   *             ParserException
126   *     {
127   *         System.out.println (getLink ());
128   *     }
129   * }
130   * PrototypicalNodeFactory factory = new PrototypicalNodeFactory ();
131   * factory.registerTag (new PrintingLinkTag ());
132   * Parser parser = new Parser ();
133   * parser.setNodeFactory (factory);
134   * </pre></p>
135   */
136  public class PrototypicalNodeFactory
137      implements
138          Serializable,
139          NodeFactory
140  {
141      /**
142       * The prototypical text node.
143       */
144      protected Text mText;
145  
146      /**
147       * The prototypical remark node.
148       */
149      protected Remark mRemark;
150  
151      /**
152       * The prototypical tag node.
153       */
154      protected Tag mTag;
155  
156      /**
157       * The list of tags to return.
158       * The list is keyed by tag name.
159       */
160      protected Map mBlastocyst;
161  
162      /**
163       * Create a new factory with all tags registered.
164       * Equivalent to
165       * {@link #PrototypicalNodeFactory() PrototypicalNodeFactory(false)}.
166       */
167      public PrototypicalNodeFactory ()
168      {
169          this (false);
170      }
171  
172      /**
173       * Create a new factory.
174       * @param empty If <code>true</code>, creates an empty factory,
175       * otherwise create a new factory with all tags registered.
176       */
177      public PrototypicalNodeFactory (boolean empty)
178      {
179          clear ();
180          mText = new TextNode (null, 0, 0);
181          mRemark = new RemarkNode (null, 0, 0);
182          mTag = new TagNode (null, 0, 0, null);
183          if (!empty)
184              registerTags ();
185      }
186  
187      /**
188       * Create a new factory with the given tag as the only registered tag.
189       * @param tag The single tag to register in the otherwise empty factory.
190       */
191      public PrototypicalNodeFactory (Tag tag)
192      {
193          this (true);
194          registerTag (tag);
195      }
196  
197      /**
198       * Create a new factory with the given tags registered.
199       * @param tags The tags to register in the otherwise empty factory.
200       */
201      public PrototypicalNodeFactory (Tag[] tags)
202      {
203          this (true);
204          for (int i = 0; i < tags.length; i++)
205              registerTag (tags[i]);
206      }
207  
208      /**
209       * Adds a tag to the registry.
210       * @param id The name under which to register the tag.
211       * <strong>For proper operation, the id should be uppercase so it
212       * will be matched by a Map lookup.</strong>
213       * @param tag The tag to be returned from a {@link #createTagNode} call.
214       * @return The tag previously registered with that id if any,
215       * or <code>null</code> if none.
216       */
217      public Tag put (String id, Tag tag)
218      {
219          return ((Tag)mBlastocyst.put (id, tag));
220      }
221  
222      /**
223       * Gets a tag from the registry.
224       * @param id The name of the tag to return.
225       * @return The tag registered under the <code>id</code> name,
226       * or <code>null</code> if none.
227       */
228      public Tag get (String id)
229      {
230          return ((Tag)mBlastocyst.get (id));
231      }
232  
233      /**
234       * Remove a tag from the registry.
235       * @param id The name of the tag to remove.
236       * @return The tag that was registered with that <code>id</code>,
237       * or <code>null</code> if none.
238       */
239      public Tag remove (String id)
240      {
241          return ((Tag)mBlastocyst.remove (id));
242      }
243  
244      /**
245       * Clean out the registry.
246       */
247      public void clear ()
248      {
249          mBlastocyst = new Hashtable ();
250      }
251  
252      /**
253       * Get the list of tag names.
254       * @return The names of the tags currently registered.
255       */
256      public Set getTagNames ()
257      {
258          return (mBlastocyst.keySet ());
259      }
260  
261      /**
262       * Register a tag.
263       * Registers the given tag under every {@link Tag#getIds() id} that the
264       * tag has (i.e. all names returned by {@link Tag#getIds() tag.getIds()}.
265       * <p><strong>For proper operation, the ids are converted to uppercase so
266       * they will be matched by a Map lookup.</strong>
267       * @param tag The tag to register.
268       */
269      public void registerTag (Tag tag)
270      {
271          String[] ids;
272  
273          ids = tag.getIds ();
274          for (int i = 0; i < ids.length; i++)
275              put (ids[i].toUpperCase (Locale.ENGLISH), tag);
276      }
277  
278      /**
279       * Unregister a tag.
280       * Unregisters the given tag from every {@link Tag#getIds() id} the tag has.
281       * <p><strong>The ids are converted to uppercase to undo the operation
282       * of registerTag.</strong>
283       * @param tag The tag to unregister.
284       */
285      public void unregisterTag (Tag tag)
286      {
287          String[] ids;
288  
289          ids = tag.getIds ();
290          for (int i = 0; i < ids.length; i++)
291              remove (ids[i].toUpperCase (Locale.ENGLISH));
292      }
293  
294      /**
295       * Register all known tags in the tag package.
296       * Registers tags from the {@link org.htmlparser.tags tag package} by
297       * calling {@link #registerTag(Tag) registerTag()}.
298       * @return 'this' nodefactory as a convenience.
299       */
300      public PrototypicalNodeFactory registerTags ()
301      {
302          registerTag (new AppletTag ());
303          registerTag (new BaseHrefTag ());
304          registerTag (new Bullet ());
305          registerTag (new BulletList ());
306          registerTag (new DefinitionList ());
307          registerTag (new DefinitionListBullet ());
308          registerTag (new DoctypeTag ());
309          registerTag (new FormTag ());
310          registerTag (new FrameSetTag ());
311          registerTag (new FrameTag ());
312          registerTag (new HeadingTag ());
313          registerTag (new ImageTag ());
314          registerTag (new InputTag ());
315          registerTag (new JspTag ());
316          registerTag (new LabelTag ());
317          registerTag (new LinkTag ());
318          registerTag (new MetaTag ());
319          registerTag (new ObjectTag ());
320          registerTag (new OptionTag ());
321          registerTag (new ParagraphTag ());
322          registerTag (new ProcessingInstructionTag ());
323          registerTag (new ScriptTag ());
324          registerTag (new SelectTag ());
325          registerTag (new StyleTag ());
326          registerTag (new TableColumn ());
327          registerTag (new TableHeader ());
328          registerTag (new TableRow ());
329          registerTag (new TableTag ());
330          registerTag (new TextareaTag ());
331          registerTag (new TitleTag ());
332          registerTag (new Div ());
333          registerTag (new Span ());
334          registerTag (new BodyTag ());
335          registerTag (new HeadTag ());
336          registerTag (new Html ());
337          
338  
339          return (this);
340      }
341  
342      /**
343       * Get the object that is cloned to generate text nodes.
344       * @return The prototype for {@link Text} nodes.
345       * @see #setTextPrototype
346       */
347      public Text getTextPrototype ()
348      {
349          return (mText);
350      }
351  
352      /**
353       * Set the object to be used to generate text nodes.
354       * @param text The prototype for {@link Text} nodes.
355       * If <code>null</code> the prototype is set to the default
356       * ({@link TextNode}).
357       * @see #getTextPrototype
358       */
359      public void setTextPrototype (Text text)
360      {
361          if (null == text)
362              mText = new TextNode (null, 0, 0);
363          else
364              mText = text;
365      }
366  
367      /**
368       * Get the object that is cloned to generate remark nodes.
369       * @return The prototype for {@link Remark} nodes.
370       * @see #setRemarkPrototype
371       */
372      public Remark getRemarkPrototype ()
373      {
374          return (mRemark);
375      }
376  
377      /**
378       * Set the object to be used to generate remark nodes.
379       * @param remark The prototype for {@link Remark} nodes.
380       * If <code>null</code> the prototype is set to the default
381       * ({@link RemarkNode}).
382       * @see #getRemarkPrototype
383       */
384      public void setRemarkPrototype (Remark remark)
385      {
386          if (null == remark)
387              mRemark = new RemarkNode (null, 0, 0);
388          else
389              mRemark = remark;
390      }
391  
392      /**
393       * Get the object that is cloned to generate tag nodes.
394       * Clones of this object are returned from {@link #createTagNode} when no
395       * specific tag is found in the list of registered tags.
396       * @return The prototype for {@link Tag} nodes.
397       * @see #setTagPrototype
398       */
399      public Tag getTagPrototype ()
400      {
401          return (mTag);
402      }
403  
404      /**
405       * Set the object to be used to generate tag nodes.
406       * Clones of this object are returned from {@link #createTagNode} when no
407       * specific tag is found in the list of registered tags.
408       * @param tag The prototype for {@link Tag} nodes.
409       * If <code>null</code> the prototype is set to the default
410       * ({@link TagNode}).
411       * @see #getTagPrototype
412       */
413      public void setTagPrototype (Tag tag)
414      {
415          if (null == tag)
416              mTag = new TagNode (null, 0, 0, null);
417          else
418              mTag = tag;
419      }
420  
421      //
422      // NodeFactory interface
423      //
424  
425      /**
426       * Create a new string node.
427       * @param page The page the node is on.
428       * @param start The beginning position of the string.
429       * @param end The ending position of the string.
430       * @return A text node comprising the indicated characters from the page.
431       */
432      public Text createStringNode (Page page, int start, int end)
433      {
434          Text ret;
435  
436          try
437          {
438              ret = (Text)(getTextPrototype ().clone ());
439              ret.setPage (page);
440              ret.setStartPosition (start);
441              ret.setEndPosition (end);
442          }
443          catch (CloneNotSupportedException cnse)
444          {
445              ret = new TextNode (page, start, end);
446          }
447  
448          return (ret);
449      }
450  
451      /**
452       * Create a new remark node.
453       * @param page The page the node is on.
454       * @param start The beginning position of the remark.
455       * @param end The ending positiong of the remark.
456       * @return A remark node comprising the indicated characters from the page.
457       */
458      public Remark createRemarkNode (Page page, int start, int end)
459      {
460          Remark ret;
461  
462          try
463          {
464              ret = (Remark)(getRemarkPrototype ().clone ());
465              ret.setPage (page);
466              ret.setStartPosition (start);
467              ret.setEndPosition (end);
468          }
469          catch (CloneNotSupportedException cnse)
470          {
471              ret = new RemarkNode (page, start, end);
472          }
473  
474          return (ret);
475      }
476  
477      /**
478       * Create a new tag node.
479       * Note that the attributes vector contains at least one element,
480       * which is the tag name (standalone attribute) at position zero.
481       * This can be used to decide which type of node to create, or
482       * gate other processing that may be appropriate.
483       * @param page The page the node is on.
484       * @param start The beginning position of the tag.
485       * @param end The ending positiong of the tag.
486       * @param attributes The attributes contained in this tag.
487       * @return A tag node comprising the indicated characters from the page.
488       */
489      public Tag createTagNode (Page page, int start, int end, Vector attributes)
490      {
491          Attribute attribute;
492          String id;
493          Tag prototype;
494          Tag ret;
495  
496          ret = null;
497  
498          if (0 != attributes.size ())
499          {
500              attribute = (Attribute)attributes.elementAt (0);
501              id = attribute.getName ();
502              if (null != id)
503              {
504                  try
505                  {
506                      id = id.toUpperCase (Locale.ENGLISH);
507                      if (!id.startsWith ("/"))
508                      {
509                          if (id.endsWith ("/"))
510                              id = id.substring (0, id.length () - 1);
511                          prototype = (Tag)mBlastocyst.get (id);
512                          if (null != prototype)
513                          {
514                              ret = (Tag)prototype.clone ();
515                              ret.setPage (page);
516                              ret.setStartPosition (start);
517                              ret.setEndPosition (end);
518                              ret.setAttributesEx (attributes);
519                          }
520                      }
521                  }
522                  catch (CloneNotSupportedException cnse)
523                  {
524                      // default to creating a generic one
525                  }
526              }
527          }
528          if (null == ret)
529          {   // generate a generic node
530              try
531              {
532                  ret = (Tag)getTagPrototype ().clone ();
533                  ret.setPage (page);
534                  ret.setStartPosition (start);
535                  ret.setEndPosition (end);
536                  ret.setAttributesEx (attributes);
537              }
538              catch (CloneNotSupportedException cnse)
539              {
540                  ret = new TagNode (page, start, end, attributes);
541              }
542          }
543  
544          return (ret);
545      }
546  }