/ org.htmlparser / src / org / htmlparser / nodes / AbstractNode.java
AbstractNode.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2004 Somik Raha
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes/AbstractNode.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2005/10/26 22:01:23 $
 10  // $Revision: 1.5 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser.nodes;
 28  
 29  import java.io.Serializable;
 30  
 31  import org.htmlparser.Node;
 32  import org.htmlparser.NodeFilter;
 33  import org.htmlparser.lexer.Page;
 34  import org.htmlparser.util.NodeList;
 35  import org.htmlparser.util.ParserException;
 36  import org.htmlparser.visitors.NodeVisitor;
 37  
 38  /**
 39   * The concrete base class for all types of nodes (tags, text remarks).
 40   * This class provides basic functionality to hold the {@link Page}, the
 41   * starting and ending position in the page, the parent and the list of
 42   * {@link NodeList children}.
 43   */
 44  public abstract class AbstractNode implements Node, Serializable
 45  {
 46      /**
 47       * The page this node came from.
 48       */
 49      protected Page mPage;
 50  
 51      /**
 52       * The beginning position of the tag in the line
 53       */
 54      protected int nodeBegin;
 55  
 56      /**
 57       * The ending position of the tag in the line
 58       */
 59      protected int nodeEnd;
 60  
 61      /**
 62       * The parent of this node.
 63       */
 64      protected Node parent;
 65  
 66      /**
 67       * The children of this node.
 68       */
 69      protected NodeList children;
 70  
 71      /**
 72       * Create an abstract node with the page positions given.
 73       * Remember the page and start & end cursor positions.
 74       * @param page The page this tag was read from.
 75       * @param start The starting offset of this node within the page.
 76       * @param end The ending offset of this node within the page.
 77       */
 78      public AbstractNode (Page page, int start, int end)
 79      {
 80          mPage = page;
 81          nodeBegin = start;
 82          nodeEnd = end;
 83          parent = null;
 84          children = null;
 85      }
 86  
 87      /**
 88       * Clone this object.
 89       * Exposes java.lang.Object clone as a public method.
 90       * @return A clone of this object.
 91       * @exception CloneNotSupportedException This shouldn't be thrown since
 92       * the {@link Node} interface extends Cloneable.
 93       */
 94      public Object clone() throws CloneNotSupportedException
 95      {
 96          return (super.clone ());
 97      }
 98  
 99      /**
100       * Returns a string representation of the node.
101       * It allows a simple string transformation
102       * of a web page, regardless of node type.<br>
103       * Typical application code (for extracting only the text from a web page)
104       * would then be simplified to:<br>
105       * <pre>
106       * Node node;
107       * for (Enumeration e = parser.elements (); e.hasMoreElements (); )
108       * {
109       *     node = (Node)e.nextElement();
110       *     System.out.println (node.toPlainTextString ());
111       *     // or do whatever processing you wish with the plain text string
112       * }
113       * </pre>
114       * @return The 'browser' content of this node.
115       */
116      public abstract String toPlainTextString ();
117  
118      /**
119       * Return the HTML that generated this node.
120       * This method will make it easier when using html parser to reproduce html
121       * pages (with or without modifications).
122       * Applications reproducing html can use this method on nodes which are to
123       * be used or transferred as they were recieved, with the original html.
124       * @return The HTML code for this node.
125       */
126      public abstract String toHtml ();
127  
128      /**
129       * Return a string representation of the node.
130       * Subclasses must define this method, and this is typically to be used in the manner<br>
131       * <pre>System.out.println(node)</pre>
132       * @return A textual representation of the node suitable for debugging
133       */
134      public abstract String toString ();
135  
136      /**
137       * Collect this node and its child nodes (if-applicable) into the collectionList parameter, provided the node
138       * satisfies the filtering criteria.<P>
139       * 
140       * This mechanism allows powerful filtering code to be written very easily,
141       * without bothering about collection of embedded tags separately.
142       * e.g. when we try to get all the links on a page, it is not possible to
143       * get it at the top-level, as many tags (like form tags), can contain
144       * links embedded in them. We could get the links out by checking if the
145       * current node is a {@link org.htmlparser.tags.CompositeTag}, and going through its children.
146       * So this method provides a convenient way to do this.<P>
147       * 
148       * Using collectInto(), programs get a lot shorter. Now, the code to
149       * extract all links from a page would look like:
150       * <pre>
151       * NodeList collectionList = new NodeList();
152       * NodeFilter filter = new TagNameFilter ("A");
153       * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
154       *      e.nextNode().collectInto(collectionList, filter);
155       * </pre>
156       * Thus, collectionList will hold all the link nodes, irrespective of how
157       * deep the links are embedded.<P>
158       * 
159       * Another way to accomplish the same objective is:
160       * <pre>
161       * NodeList collectionList = new NodeList();
162       * NodeFilter filter = new TagClassFilter (LinkTag.class);
163       * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
164       *      e.nextNode().collectInto(collectionList, filter);
165       * </pre>
166       * This is slightly less specific because the LinkTag class may be
167       * registered for more than one node name, e.g. &lt;LINK&gt; tags too.
168       * @param list The node list to collect acceptable nodes into.
169       * @param filter The filter to determine which nodes are retained.
170       */
171      public void collectInto (NodeList list, NodeFilter filter)
172      {
173          if (filter.accept (this))
174              list.add (this);
175      }
176  
177      /**
178       * Get the page this node came from.
179       * @return The page that supplied this node.
180       */
181      public Page getPage ()
182      {
183          return (mPage);
184      }
185  
186      /**
187       * Set the page this node came from.
188       * @param page The page that supplied this node.
189       */
190      public void setPage (Page page)
191      {
192          mPage = page;
193      }
194  
195      /**
196       * Gets the starting position of the node.
197       * @return The start position.
198       */
199      public int getStartPosition ()
200      {
201          return (nodeBegin);
202      }
203  
204      /**
205       * Sets the starting position of the node.
206       * @param position The new start position.
207       */
208      public void setStartPosition (int position)
209      {
210          nodeBegin = position;
211      }
212  
213      /**
214       * Gets the ending position of the node.
215       * @return The end position.
216       */
217      public int getEndPosition ()
218      {
219          return (nodeEnd);
220      }
221  
222      /**
223       * Sets the ending position of the node.
224       * @param position The new end position.
225       */
226      public void setEndPosition (int position)
227      {
228          nodeEnd = position;
229      }
230  
231      /**
232       * Visit this node.
233       * @param visitor The visitor that is visiting this node.
234       */
235      public abstract void accept (NodeVisitor visitor);
236  
237      /**
238       * Get the parent of this node.
239       * This will always return null when parsing without scanners,
240       * i.e. if semantic parsing was not performed.
241       * The object returned from this method can be safely cast to a <code>CompositeTag</code>.
242       * @return The parent of this node, if it's been set, <code>null</code> otherwise.
243       */
244      public Node getParent ()
245      {
246          return (parent);
247      }
248  
249      /**
250       * Sets the parent of this node.
251       * @param node The node that contains this node. Must be a <code>CompositeTag</code>.
252       */
253      public void setParent (Node node)
254      {
255          parent = node;
256      }
257  
258      /**
259       * Get the children of this node.
260       * @return The list of children contained by this node, if it's been set, <code>null</code> otherwise.
261       */
262      public NodeList getChildren ()
263      {
264          return (children);
265      }
266  
267      /**
268       * Set the children of this node.
269       * @param children The new list of children this node contains.
270       */
271      public void setChildren (NodeList children)
272      {
273          this.children = children;
274      }
275      
276      /**
277       * Get the first child of this node.
278       * @return The first child in the list of children contained by this node,
279       * <code>null</code> otherwise.
280       */
281      public Node getFirstChild ()
282      {
283          if (children == null)
284              return null;
285          if (children.size() == 0)
286              return null;
287          return children.elementAt(0);
288      }
289      
290      /**
291       * Get the last child of this node.
292       * @return The last child in the list of children contained by this node,
293       * <code>null</code> otherwise.
294       */
295      public Node getLastChild ()
296      {
297          if (children == null)
298              return null;
299          int numChildren = children.size();
300          if (numChildren == 0)
301              return null;
302          return children.elementAt(numChildren - 1);
303      }
304      
305      /**
306       * Get the previous sibling to this node.
307       * @return The previous sibling to this node if one exists,
308       * <code>null</code> otherwise.
309       */
310      public Node getPreviousSibling ()
311      {
312          Node parentNode = this.getParent();
313          if (parentNode == null)//root node
314              return null;
315          NodeList siblings = parentNode.getChildren();
316          if (siblings == null)//this should actually be an error
317              return null;
318          int numSiblings = siblings.size();
319          if (numSiblings < 2)//need at least one other node to have a chance of having any siblings
320              return null;
321          int positionInParent = -1;
322          for (int i = 0; i < numSiblings; i++)
323          {
324              if (siblings.elementAt(i) == this)
325              {
326                  positionInParent = i;
327                  break;
328              }
329          }
330          if (positionInParent < 1)//no previous siblings
331              return null;
332          return siblings.elementAt(positionInParent - 1);
333      }
334      
335      /**
336       * Get the next sibling to this node.
337       * @return The next sibling to this node if one exists,
338       * <code>null</code> otherwise.
339       */
340      public Node getNextSibling ()
341      {
342          Node parentNode = this.getParent();
343          if (parentNode == null)//root node
344              return null;
345          NodeList siblings = parentNode.getChildren();
346          if (siblings == null)//this should actually be an error
347              return null;
348          int numSiblings = siblings.size();
349          if (numSiblings < 2)//need at least one other node to have a chance of having any siblings
350              return null;
351          int positionInParent = -1;
352          for (int i = 0; i < numSiblings; i++)
353          {
354              if (siblings.elementAt(i) == this)
355              {
356                  positionInParent = i;
357                  break;
358              }
359          }
360          if (positionInParent == -1)//this should actually be an error
361              return null;
362          if (positionInParent == (numSiblings - 1))//no next sibling
363              return null;
364          return siblings.elementAt(positionInParent + 1);
365      }
366  
367      /**
368       * Returns the text of the node.
369       * @return The text of this node. The default is <code>null</code>.
370       */
371      public String getText ()
372      {
373          return null;
374      }
375  
376      /**
377       * Sets the string contents of the node.
378       * @param text The new text for the node.
379       */
380      public void setText(String text)
381      {
382      }
383  
384      /**
385       * Perform the meaning of this tag.
386       * The default action is to do nothing.
387       * @exception ParserException <em>Not used.</em> Provides for subclasses
388       * that may want to indicate an exceptional condition.
389       */
390      public void doSemanticAction ()
391          throws
392              ParserException
393      {
394      }
395  }