/ org.htmlparser / src / org / htmlparser / util / LinkProcessor.java
LinkProcessor.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2004 Somik Raha
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/LinkProcessor.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2004/07/31 16:42:34 $
 10  // $Revision: 1.35 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser.util;
 28  
 29  import java.io.Serializable;
 30  import java.net.MalformedURLException;
 31  import java.net.URL;
 32  
 33  /**
 34   * Processor class for links, is present basically as a utility class.
 35   * @deprecated Use a Page object instead.
 36   */
 37  public class LinkProcessor
 38      implements
 39          Serializable
 40  {
 41      /**
 42       * Overriding base URL.
 43       * If set, this is used instead of a provided base URL in extract().
 44       */
 45      private String baseUrl;
 46  
 47      /**
 48       * Create an HTMLLinkProcessor.
 49       */
 50      public LinkProcessor ()
 51      {
 52          baseUrl = null;
 53      }
 54  
 55      /**
 56       * Create an absolute URL from a possibly relative link and a base URL.
 57       * @param link The reslative portion of a URL.
 58       * @param base The base URL unless overridden by the current baseURL property.
 59       * @return The fully qualified URL or the original link if a failure occured.
 60       * @deprecated Use Page.getAbsoluteURL() instead.
 61       */
 62      public String extract (String link, String base)
 63      {
 64          String ret;
 65  
 66          try
 67          {
 68              if (null == link)
 69                  link = "";
 70              else
 71                  link = stripQuotes (link);
 72              if (null != getBaseUrl ())
 73                  base = getBaseUrl ();
 74              if ((null == base) || ("".equals (link)))
 75                  ret = link;
 76              else
 77              {
 78                  URL url = constructUrl(link, base);
 79                  ret = url.toExternalForm ();
 80              }
 81          }
 82          catch (MalformedURLException murle)
 83          {
 84              ret = link;
 85          }
 86  
 87          return (Translate.decode (ret));
 88      }
 89  
 90      /**
 91       * Remove double or single quotes from the string.
 92       */
 93      public String stripQuotes (String string)
 94      {
 95          // remove any double quotes from around string
 96          if (string.startsWith ("\"") && string.endsWith ("\"") && (1 < string.length ()))
 97              string = string.substring (1, string.length () - 1);
 98  
 99          // remove any single quote from around string
100          if (string.startsWith ("'") && string.endsWith ("'") && (1 < string.length ()))
101              string = string.substring (1, string.length () - 1);
102  
103          return (string);
104      }
105  
106      /**
107       * @deprecated Use Page.constructUrl() instead.
108       */
109      public URL constructUrl(String link, String base)
110          throws MalformedURLException {
111          String path;
112          boolean modified;
113          boolean absolute;
114          int index;
115          URL url; // constructed URL combining relative link and base
116          url = new URL (new URL (base), link);
117          path = url.getFile ();
118          modified = false;
119          absolute = link.startsWith ("/");
120          if (!absolute) {   // we prefer to fix incorrect relative links
121              // this doesn't fix them all, just the ones at the start
122              while (path.startsWith ("/.")) {
123                  if (path.startsWith ("/../")) {
124                      path = path.substring (3);
125                      modified = true;
126                  }
127                  else if (path.startsWith ("/./") || path.startsWith("/.")) {
128                      path = path.substring (2);
129                      modified = true;
130                  } else break;
131              }
132          }
133          // fix backslashes
134          while (-1 != (index = path.indexOf ("/\\"))) {
135              path = path.substring (0, index + 1) + path.substring (index + 2);
136              modified = true;
137          }
138          if (modified)
139              url = new URL (url, path);
140          return url;
141      }
142  
143      /**
144       * Turn spaces into %20.
145       * @param url The url containing spaces.
146       * @return The URL with spaces as %20 sequences.
147       * @deprecated Use Parser.fixSpaces() instead.
148       */
149      public static String fixSpaces (String url)
150      {
151          int index;
152          int length;
153          char ch;
154          StringBuffer returnURL;
155  
156          index = url.indexOf (' ');
157          if (-1 != index)
158          {
159              length = url.length ();
160              returnURL = new StringBuffer (length * 3);
161              returnURL.append (url.substring (0, index));
162              for (int i = index; i < length; i++)
163              {
164                  ch = url.charAt (i);
165                  if (ch==' ')
166                      returnURL.append ("%20");
167                  else
168                      returnURL.append (ch);
169              }
170              url = returnURL.toString ();
171          }
172  
173          return (url);
174      }
175  
176      /**
177       * Check if a resource is a valid URL.
178       * @param resourceLocn The resource to test.
179       * @return <code>true</code> if the resource is a valid URL.
180       */
181      public static boolean isURL (String resourceLocn) {
182          boolean ret;
183  
184          try
185          {
186              new URL (resourceLocn);
187              ret = true;
188          }
189          catch (MalformedURLException murle)
190          {
191              ret = false;
192          }
193  
194          return (ret);
195      }
196  
197      /**
198       * Returns the baseUrl.
199       * @return String
200       */
201      public String getBaseUrl ()
202      {
203          return baseUrl;
204      }
205  
206      /**
207       * Sets the baseUrl.
208       * @param baseUrl The baseUrl to set
209       */
210      public void setBaseUrl (String baseUrl)
211      {
212          this.baseUrl = baseUrl;
213      }
214  
215      /**
216       * @deprecated Removing the last slash from a URL is a bad idea.
217       */
218      public static String removeLastSlash(String baseUrl) {
219        if(baseUrl.charAt(baseUrl.length()-1)=='/')
220        {
221           return baseUrl.substring(0,baseUrl.length()-1);
222        }
223        else
224        {
225           return baseUrl;
226        }
227      }
228  
229  }