AbstractNode.java
1 // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML 2 // http://sourceforge.org/projects/htmlparser 3 // Copyright (C) 2004 Somik Raha 4 // 5 // Revision Control Information 6 // 7 // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes/AbstractNode.java,v $ 8 // $Author: derrickoswald $ 9 // $Date: 2005/10/26 22:01:23 $ 10 // $Revision: 1.5 $ 11 // 12 // This library is free software; you can redistribute it and/or 13 // modify it under the terms of the GNU Lesser General Public 14 // License as published by the Free Software Foundation; either 15 // version 2.1 of the License, or (at your option) any later version. 16 // 17 // This library is distributed in the hope that it will be useful, 18 // but WITHOUT ANY WARRANTY; without even the implied warranty of 19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 // Lesser General Public License for more details. 21 // 22 // You should have received a copy of the GNU Lesser General Public 23 // License along with this library; if not, write to the Free Software 24 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 // 26 27 package org.htmlparser.nodes; 28 29 import java.io.Serializable; 30 31 import org.htmlparser.Node; 32 import org.htmlparser.NodeFilter; 33 import org.htmlparser.lexer.Page; 34 import org.htmlparser.util.NodeList; 35 import org.htmlparser.util.ParserException; 36 import org.htmlparser.visitors.NodeVisitor; 37 38 /** 39 * The concrete base class for all types of nodes (tags, text remarks). 40 * This class provides basic functionality to hold the {@link Page}, the 41 * starting and ending position in the page, the parent and the list of 42 * {@link NodeList children}. 43 */ 44 public abstract class AbstractNode implements Node, Serializable 45 { 46 /** 47 * The page this node came from. 48 */ 49 protected Page mPage; 50 51 /** 52 * The beginning position of the tag in the line 53 */ 54 protected int nodeBegin; 55 56 /** 57 * The ending position of the tag in the line 58 */ 59 protected int nodeEnd; 60 61 /** 62 * The parent of this node. 63 */ 64 protected Node parent; 65 66 /** 67 * The children of this node. 68 */ 69 protected NodeList children; 70 71 /** 72 * Create an abstract node with the page positions given. 73 * Remember the page and start & end cursor positions. 74 * @param page The page this tag was read from. 75 * @param start The starting offset of this node within the page. 76 * @param end The ending offset of this node within the page. 77 */ 78 public AbstractNode (Page page, int start, int end) 79 { 80 mPage = page; 81 nodeBegin = start; 82 nodeEnd = end; 83 parent = null; 84 children = null; 85 } 86 87 /** 88 * Clone this object. 89 * Exposes java.lang.Object clone as a public method. 90 * @return A clone of this object. 91 * @exception CloneNotSupportedException This shouldn't be thrown since 92 * the {@link Node} interface extends Cloneable. 93 */ 94 public Object clone() throws CloneNotSupportedException 95 { 96 return (super.clone ()); 97 } 98 99 /** 100 * Returns a string representation of the node. 101 * It allows a simple string transformation 102 * of a web page, regardless of node type.<br> 103 * Typical application code (for extracting only the text from a web page) 104 * would then be simplified to:<br> 105 * <pre> 106 * Node node; 107 * for (Enumeration e = parser.elements (); e.hasMoreElements (); ) 108 * { 109 * node = (Node)e.nextElement(); 110 * System.out.println (node.toPlainTextString ()); 111 * // or do whatever processing you wish with the plain text string 112 * } 113 * </pre> 114 * @return The 'browser' content of this node. 115 */ 116 public abstract String toPlainTextString (); 117 118 /** 119 * Return the HTML that generated this node. 120 * This method will make it easier when using html parser to reproduce html 121 * pages (with or without modifications). 122 * Applications reproducing html can use this method on nodes which are to 123 * be used or transferred as they were recieved, with the original html. 124 * @return The HTML code for this node. 125 */ 126 public abstract String toHtml (); 127 128 /** 129 * Return a string representation of the node. 130 * Subclasses must define this method, and this is typically to be used in the manner<br> 131 * <pre>System.out.println(node)</pre> 132 * @return A textual representation of the node suitable for debugging 133 */ 134 public abstract String toString (); 135 136 /** 137 * Collect this node and its child nodes (if-applicable) into the collectionList parameter, provided the node 138 * satisfies the filtering criteria.<P> 139 * 140 * This mechanism allows powerful filtering code to be written very easily, 141 * without bothering about collection of embedded tags separately. 142 * e.g. when we try to get all the links on a page, it is not possible to 143 * get it at the top-level, as many tags (like form tags), can contain 144 * links embedded in them. We could get the links out by checking if the 145 * current node is a {@link org.htmlparser.tags.CompositeTag}, and going through its children. 146 * So this method provides a convenient way to do this.<P> 147 * 148 * Using collectInto(), programs get a lot shorter. Now, the code to 149 * extract all links from a page would look like: 150 * <pre> 151 * NodeList collectionList = new NodeList(); 152 * NodeFilter filter = new TagNameFilter ("A"); 153 * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) 154 * e.nextNode().collectInto(collectionList, filter); 155 * </pre> 156 * Thus, collectionList will hold all the link nodes, irrespective of how 157 * deep the links are embedded.<P> 158 * 159 * Another way to accomplish the same objective is: 160 * <pre> 161 * NodeList collectionList = new NodeList(); 162 * NodeFilter filter = new TagClassFilter (LinkTag.class); 163 * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) 164 * e.nextNode().collectInto(collectionList, filter); 165 * </pre> 166 * This is slightly less specific because the LinkTag class may be 167 * registered for more than one node name, e.g. <LINK> tags too. 168 * @param list The node list to collect acceptable nodes into. 169 * @param filter The filter to determine which nodes are retained. 170 */ 171 public void collectInto (NodeList list, NodeFilter filter) 172 { 173 if (filter.accept (this)) 174 list.add (this); 175 } 176 177 /** 178 * Get the page this node came from. 179 * @return The page that supplied this node. 180 */ 181 public Page getPage () 182 { 183 return (mPage); 184 } 185 186 /** 187 * Set the page this node came from. 188 * @param page The page that supplied this node. 189 */ 190 public void setPage (Page page) 191 { 192 mPage = page; 193 } 194 195 /** 196 * Gets the starting position of the node. 197 * @return The start position. 198 */ 199 public int getStartPosition () 200 { 201 return (nodeBegin); 202 } 203 204 /** 205 * Sets the starting position of the node. 206 * @param position The new start position. 207 */ 208 public void setStartPosition (int position) 209 { 210 nodeBegin = position; 211 } 212 213 /** 214 * Gets the ending position of the node. 215 * @return The end position. 216 */ 217 public int getEndPosition () 218 { 219 return (nodeEnd); 220 } 221 222 /** 223 * Sets the ending position of the node. 224 * @param position The new end position. 225 */ 226 public void setEndPosition (int position) 227 { 228 nodeEnd = position; 229 } 230 231 /** 232 * Visit this node. 233 * @param visitor The visitor that is visiting this node. 234 */ 235 public abstract void accept (NodeVisitor visitor); 236 237 /** 238 * Get the parent of this node. 239 * This will always return null when parsing without scanners, 240 * i.e. if semantic parsing was not performed. 241 * The object returned from this method can be safely cast to a <code>CompositeTag</code>. 242 * @return The parent of this node, if it's been set, <code>null</code> otherwise. 243 */ 244 public Node getParent () 245 { 246 return (parent); 247 } 248 249 /** 250 * Sets the parent of this node. 251 * @param node The node that contains this node. Must be a <code>CompositeTag</code>. 252 */ 253 public void setParent (Node node) 254 { 255 parent = node; 256 } 257 258 /** 259 * Get the children of this node. 260 * @return The list of children contained by this node, if it's been set, <code>null</code> otherwise. 261 */ 262 public NodeList getChildren () 263 { 264 return (children); 265 } 266 267 /** 268 * Set the children of this node. 269 * @param children The new list of children this node contains. 270 */ 271 public void setChildren (NodeList children) 272 { 273 this.children = children; 274 } 275 276 /** 277 * Get the first child of this node. 278 * @return The first child in the list of children contained by this node, 279 * <code>null</code> otherwise. 280 */ 281 public Node getFirstChild () 282 { 283 if (children == null) 284 return null; 285 if (children.size() == 0) 286 return null; 287 return children.elementAt(0); 288 } 289 290 /** 291 * Get the last child of this node. 292 * @return The last child in the list of children contained by this node, 293 * <code>null</code> otherwise. 294 */ 295 public Node getLastChild () 296 { 297 if (children == null) 298 return null; 299 int numChildren = children.size(); 300 if (numChildren == 0) 301 return null; 302 return children.elementAt(numChildren - 1); 303 } 304 305 /** 306 * Get the previous sibling to this node. 307 * @return The previous sibling to this node if one exists, 308 * <code>null</code> otherwise. 309 */ 310 public Node getPreviousSibling () 311 { 312 Node parentNode = this.getParent(); 313 if (parentNode == null)//root node 314 return null; 315 NodeList siblings = parentNode.getChildren(); 316 if (siblings == null)//this should actually be an error 317 return null; 318 int numSiblings = siblings.size(); 319 if (numSiblings < 2)//need at least one other node to have a chance of having any siblings 320 return null; 321 int positionInParent = -1; 322 for (int i = 0; i < numSiblings; i++) 323 { 324 if (siblings.elementAt(i) == this) 325 { 326 positionInParent = i; 327 break; 328 } 329 } 330 if (positionInParent < 1)//no previous siblings 331 return null; 332 return siblings.elementAt(positionInParent - 1); 333 } 334 335 /** 336 * Get the next sibling to this node. 337 * @return The next sibling to this node if one exists, 338 * <code>null</code> otherwise. 339 */ 340 public Node getNextSibling () 341 { 342 Node parentNode = this.getParent(); 343 if (parentNode == null)//root node 344 return null; 345 NodeList siblings = parentNode.getChildren(); 346 if (siblings == null)//this should actually be an error 347 return null; 348 int numSiblings = siblings.size(); 349 if (numSiblings < 2)//need at least one other node to have a chance of having any siblings 350 return null; 351 int positionInParent = -1; 352 for (int i = 0; i < numSiblings; i++) 353 { 354 if (siblings.elementAt(i) == this) 355 { 356 positionInParent = i; 357 break; 358 } 359 } 360 if (positionInParent == -1)//this should actually be an error 361 return null; 362 if (positionInParent == (numSiblings - 1))//no next sibling 363 return null; 364 return siblings.elementAt(positionInParent + 1); 365 } 366 367 /** 368 * Returns the text of the node. 369 * @return The text of this node. The default is <code>null</code>. 370 */ 371 public String getText () 372 { 373 return null; 374 } 375 376 /** 377 * Sets the string contents of the node. 378 * @param text The new text for the node. 379 */ 380 public void setText(String text) 381 { 382 } 383 384 /** 385 * Perform the meaning of this tag. 386 * The default action is to do nothing. 387 * @exception ParserException <em>Not used.</em> Provides for subclasses 388 * that may want to indicate an exceptional condition. 389 */ 390 public void doSemanticAction () 391 throws 392 ParserException 393 { 394 } 395 }