// org.htmlparser / src / org / htmlparser / tags / LinkTag.java
// LinkTag.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2004 Somik Raha
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/LinkTag.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2005/04/10 23:20:45 $
 10  // $Revision: 1.54 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser.tags;
 28  
 29  import org.htmlparser.Node;
 30  import org.htmlparser.util.ParserUtils;
 31  import org.htmlparser.util.SimpleNodeIterator;
 32  
 33  /**
 34   * Identifies a link tag.
 35   */
 36  public class LinkTag extends CompositeTag
 37  {
 38      /**
 39       * The set of names handled by this tag.
 40       */
 41      private static final String[] mIds = new String[] {"A"};
 42  
 43      /**
 44       * The set of tag names that indicate the end of this tag.
 45       */
 46      private static final String[] mEnders = new String[] {"A", "P", "DIV", "TD", "TR", "FORM", "LI"};
 47  
 48      /**
 49       * The set of end tag names that indicate the end of this tag.
 50       */
 51      private static final String[] mEndTagEnders = new String[] {"P", "DIV", "TD", "TR", "FORM", "LI", "BODY", "HTML"};
 52  
 53      /**
 54       * The URL where the link points to
 55       */
 56      protected String mLink;
 57  
 58      /**
 59       * Set to true when the link was a mailto: URL.
 60       */
 61      private boolean mailLink;
 62  
 63      /**
 64       * Set to true when the link was a javascript: URL.
 65       */
 66      private boolean javascriptLink;
 67  
 68      /**
 69       * Constructor creates an LinkTag object, which basically stores the location
 70       * where the link points to, and the text it contains.
 71       * <p>
 72       * In order to get the contents of the link tag, use the method linkData(),
 73       * which returns an enumeration of nodes encapsulated within the link.
 74       * <p>
 75       * The following code will get all the images inside a link tag.
 76       * <pre>
 77       * Node node ;
 78       * ImageTag imageTag;
 79       * for (Enumeration e=linkTag.linkData();e.hasMoreElements();) {
 80       *      node = (Node)e.nextElement();
 81       *      if (node instanceof ImageTag) {
 82       *          imageTag = (ImageTag)node;
 83       *          // Process imageTag
 84       *      }
 85       * }
 86       * </pre>
 87       */
 88      public LinkTag ()
 89      {
 90      }
 91  
 92      /**
 93       * Return the set of names handled by this tag.
 94       * @return The names to be matched that create tags of this type.
 95       */
 96      public String[] getIds ()
 97      {
 98          return (mIds);
 99      }
100  
101      /**
102       * Return the set of tag names that cause this tag to finish.
103       * @return The names of following tags that stop further scanning.
104       */
105      public String[] getEnders ()
106      {
107          return (mEnders);
108      }
109  
110      /**
111       * Return the set of end tag names that cause this tag to finish.
112       * @return The names of following end tags that stop further scanning.
113       */
114      public String[] getEndTagEnders ()
115      {
116          return (mEndTagEnders);
117      }
118  
119      /**
120       * Get the <code>ACCESSKEY</code> attribute, if any.
121       * @return The value of the <code>ACCESSKEY</code> attribute,
122       * or <code>null</code> if the attribute doesn't exist.
123       */
124      public String getAccessKey()
125      {
126          return (getAttribute("ACCESSKEY"));
127      }
128  
129      /**
130       * Returns the url as a string, to which this link points.
131       * This string has had the "mailto:" and "javascript:" protocol stripped
132       * off the front (if those predicates return <code>true</code>) but not
133       * for other protocols. Don't ask me why, it's a legacy thing.
134       * @return The URL for this <code>A</code> tag.
135       */
136      public String getLink()
137      {
138          if (null == mLink)
139          {
140              mailLink=false;
141              javascriptLink = false;
142              mLink = extractLink ();
143  
144              int mailto = mLink.indexOf("mailto");
145              if (mailto==0)
146              {
147                  // yes it is
148                  mailto = mLink.indexOf(":");
149                  mLink = mLink.substring(mailto+1);
150                  mailLink = true;
151              }
152              int javascript = mLink.indexOf("javascript:");
153              if (javascript == 0)
154              {
155                  mLink = mLink.substring(11); // this magic number is "javascript:".length()
156                  javascriptLink = true;
157              }
158          }
159          return (mLink);
160      }
161  
162      /**
163       * Returns the text contained inside this link tag.
164       * @return The textual contents between the {@.html <A></A>} pair.
165       */
166      public String getLinkText()
167      {
168          String ret;
169  
170          if (null != getChildren ())
171              ret = getChildren ().asString ();
172          else
173              ret = "";
174  
175          return (ret);
176      }
177  
178      /**
179       * Is this a mail address
180       * @return boolean true/false
181       */
182      public boolean isMailLink()
183      {
184          getLink (); // force an evaluation of the booleans
185          return (mailLink);
186      }
187  
188      /**
189       * Tests if the link is javascript
190       * @return flag indicating if the link is a javascript code
191       */
192      public boolean isJavascriptLink()
193      {
194          getLink (); // force an evaluation of the booleans
195          return (javascriptLink);
196      }
197  
198      /**
199       * Tests if the link is an FTP link.
200       *
201       * @return flag indicating if this link is an FTP link
202       */
203      public boolean isFTPLink() {
204          return getLink ().indexOf("ftp://")==0;
205      }
206  
207      /**
208       * Tests if the link is an IRC link.
209       * @return flag indicating if this link is an IRC link
210       */
211      public boolean isIRCLink() {
212          return getLink ().indexOf("irc://")==0;
213      }
214  
215      /**
216       * Tests if the link is an HTTP link.
217       *
218       * @return flag indicating if this link is an HTTP link
219       */
220      public boolean isHTTPLink()
221      {
222          return (!isFTPLink() && !isHTTPSLink() && !isJavascriptLink() && !isMailLink() && !isIRCLink());
223      }
224  
225      /**
226       * Tests if the link is an HTTPS link.
227       *
228       * @return flag indicating if this link is an HTTPS link
229       */
230      public boolean isHTTPSLink() {
231              return getLink ().indexOf("https://")==0;
232      }
233  
234          /**
235       * Tests if the link is an HTTP link or one of its variations (HTTPS, etc.).
236       *
237       * @return flag indicating if this link is an HTTP link or one of its variations (HTTPS, etc.)
238       */
239      public boolean isHTTPLikeLink() {
240              return isHTTPLink() || isHTTPSLink();
241      }
242  
243  
244      /**
245       * Insert the method's description here.
246       * Creation date: (8/3/2001 1:49:31 AM)
247       * @param newMailLink boolean
248       */
249      public void setMailLink(boolean newMailLink)
250      {
251          mailLink = newMailLink;
252      }
253  
254      /**
255       * Set the link as a javascript link.
256       *
257       * @param newJavascriptLink flag indicating if the link is a javascript code
258       */
259      public void setJavascriptLink(boolean newJavascriptLink)
260      {
261          javascriptLink = newJavascriptLink;
262      }
263  
264      /**
265       * Return the contents of this link node as a string suitable for debugging.
266       * @return A string representation of this node.
267       */
268      public String toString()
269      {
270          StringBuffer sb = new StringBuffer();
271          sb.append("Link to : "+ getLink() + "; titled : "+getLinkText ()+"; begins at : "+getStartPosition ()+"; ends at : "+getEndPosition ()+ ", AccessKey=");
272          if (getAccessKey ()==null)
273              sb.append("null\n");
274          else
275              sb.append(getAccessKey ()+"\n");
276          if (null != getChildren ())
277          {
278              sb.append("  "+"LinkData\n");
279              sb.append("  "+"--------\n");
280  
281              Node node;
282              int i = 0;
283              for (SimpleNodeIterator e=children();e.hasMoreNodes();)
284              {
285                  node = e.nextNode();
286                  sb.append("   "+(i++)+ " ");
287                  sb.append(node.toString()+"\n");
288              }
289          }
290          sb.append("  "+"*** END of LinkData ***\n");
291          return sb.toString();
292      }
293  
294      /**
295       * Set the <code>HREF</code> attribute.
296       * @param link The new value of the <code>HREF</code> attribute.
297       */
298      public void setLink(String link)
299      {
300          mLink = link;
301          setAttribute ("HREF", link);
302      }
303  
304      /**
305       * This method returns an enumeration of data that it contains
306       * @return Enumeration
307       * @deprecated Use children() instead.
308       */
309      public SimpleNodeIterator linkData() {
310          return children();
311      }
312  
313      /**
314       * Extract the link from the HREF attribute.
315       * @return The URL from the HREF attibute. This is absolute if the tag has
316       * a valid page.
317       */
318      public String extractLink ()
319      {
320          String ret;
321  
322          ret =  getAttribute ("HREF");
323          if (null != ret)
324          {
325              ret = ParserUtils.removeChars (ret,'\n');
326              ret = ParserUtils.removeChars (ret,'\r');
327          }
328          if (null != getPage ())
329              ret = getPage ().getAbsoluteURL (ret);
330  
331          return (ret);
332      }
333  }