LinkProcessor.java
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/LinkProcessor.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/31 16:42:34 $
// $Revision: 1.35 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.util;

import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * Processor class for links, is present basically as a utility class.
 * @deprecated Use a Page object instead.
 */
public class LinkProcessor
    implements
        Serializable
{
    /**
     * Overriding base URL.
     * If set, this is used instead of a provided base URL in extract().
     */
    private String baseUrl;

    /**
     * Create a LinkProcessor with no overriding base URL.
     */
    public LinkProcessor ()
    {
        baseUrl = null;
    }

    /**
     * Create an absolute URL from a possibly relative link and a base URL.
     * Surrounding quotes are stripped from the link first, and the result
     * is passed through <code>Translate.decode()</code> before it is returned.
     * @param link The relative portion of a URL. A <code>null</code> link is
     * treated as the empty string.
     * @param base The base URL unless overridden by the current baseURL property.
     * @return The fully qualified URL, or the original (unquoted) link if the
     * link is empty, no base is available, or a failure occurred.
     * @deprecated Use Page.getAbsoluteURL() instead.
     */
    public String extract (String link, String base)
    {
        String ret;

        try
        {
            if (null == link)
            {
                link = "";
            }
            else
            {
                link = stripQuotes (link);
            }
            // the baseUrl property, when set, wins over the supplied base
            if (null != getBaseUrl ())
            {
                base = getBaseUrl ();
            }
            if ((null == base) || ("".equals (link)))
            {
                ret = link;
            }
            else
            {
                ret = constructUrl (link, base).toExternalForm ();
            }
        }
        catch (MalformedURLException murle)
        {
            // best effort: an unparsable base or link falls back to the raw link
            ret = link;
        }

        return (Translate.decode (ret));
    }

    /**
     * Remove double or single quotes from the string.
     * One pair of enclosing double quotes is removed first, then one pair
     * of enclosing single quotes, so <code>"'x'"</code> yields <code>x</code>.
     * @param string The possibly quoted text; must not be <code>null</code>.
     * @return The text without its enclosing quotes.
     */
    public String stripQuotes (String string)
    {
        // remove any double quotes from around string
        string = stripEnclosing (string, "\"");
        // remove any single quote from around string
        string = stripEnclosing (string, "'");

        return (string);
    }

    /**
     * Remove one leading and one trailing occurrence of a quote character.
     * @param string The text to examine.
     * @param quote The single-character quote to look for.
     * @return The text without the enclosing pair, or the text unchanged if
     * it is not enclosed by the quote at both ends.
     */
    private static String stripEnclosing (String string, String quote)
    {
        // the length test keeps a lone quote character from matching both ends
        if ((1 < string.length ()) && string.startsWith (quote) && string.endsWith (quote))
        {
            return (string.substring (1, string.length () - 1));
        }

        return (string);
    }

    /**
     * Build a URL from the link and base, repairing some common link errors.
     * Leading <code>.</code> and <code>..</code> segments that could not be
     * resolved against the base are dropped (for links that do not start
     * with a slash), and a backslash directly following a slash is removed.
     * Names that merely begin with a dot, such as <code>.htaccess</code>,
     * are left intact.
     * @param link The relative portion of a URL.
     * @param base The base URL.
     * @return The URL combining the link and the base.
     * @exception MalformedURLException If the base or the combination of
     * base and link cannot be parsed as a URL.
     * @deprecated Use Page.constructUrl() instead.
     */
    public URL constructUrl (String link, String base)
        throws MalformedURLException
    {
        String path;
        boolean modified;
        int index;
        int end;
        URL url; // constructed URL combining relative link and base

        url = new URL (new URL (base), link);
        path = url.getFile ();
        modified = false;
        if (!link.startsWith ("/"))
        {
            // we prefer to fix incorrect relative links
            // this doesn't fix them all, just the ones at the start
            while (path.startsWith ("/."))
            {
                end = path.startsWith ("/..") ? 3 : 2;
                if (!endsSegment (path, end))
                {
                    // an ordinary name beginning with a dot, not a dot segment
                    break;
                }
                path = path.substring (end);
                if (!path.startsWith ("/"))
                {
                    // the dot segment was the whole path (maybe with a query)
                    path = "/" + path;
                }
                modified = true;
            }
        }
        // fix backslashes
        while (-1 != (index = path.indexOf ("/\\")))
        {
            path = path.substring (0, index + 1) + path.substring (index + 2);
            modified = true;
        }
        if (modified)
        {
            url = new URL (url, path);
        }

        return (url);
    }

    /**
     * Test whether a path segment ends at the given position.
     * @param path The path, possibly followed by a query, as returned by
     * <code>URL.getFile()</code>.
     * @param position The index just past the candidate segment.
     * @return <code>true</code> if the position is at the end of the text,
     * at a slash or at the start of the query.
     */
    private static boolean endsSegment (String path, int position)
    {
        char ch;

        if (position >= path.length ())
        {
            return (true);
        }
        ch = path.charAt (position);

        return (('/' == ch) || ('?' == ch));
    }

    /**
     * Turn spaces into %20.
     * @param url The url containing spaces.
     * @return The URL with spaces as %20 sequences, or the same string if it
     * contains no spaces.
     * @deprecated Use Parser.fixSpaces() instead.
     */
    public static String fixSpaces (String url)
    {
        int first;
        int length;
        char ch;
        StringBuffer buffer;

        first = url.indexOf (' ');
        if (-1 == first)
        {
            return (url);
        }

        length = url.length ();
        // worst case every remaining character is a space and triples in size
        buffer = new StringBuffer (length * 3);
        buffer.append (url.substring (0, first));
        for (int i = first; i < length; i++)
        {
            ch = url.charAt (i);
            if (' ' == ch)
            {
                buffer.append ("%20");
            }
            else
            {
                buffer.append (ch);
            }
        }

        return (buffer.toString ());
    }

    /**
     * Check if a resource is a valid URL.
     * @param resourceLocn The resource to test.
     * @return <code>true</code> if the resource is a valid URL, i.e. it can
     * be used to construct a <code>java.net.URL</code>.
     */
    public static boolean isURL (String resourceLocn)
    {
        try
        {
            new URL (resourceLocn);
        }
        catch (MalformedURLException murle)
        {
            return (false);
        }

        return (true);
    }

    /**
     * Returns the baseUrl.
     * @return The overriding base URL, or <code>null</code> if none is set.
     */
    public String getBaseUrl ()
    {
        return (baseUrl);
    }

    /**
     * Sets the baseUrl.
     * @param baseUrl The baseUrl to set
     */
    public void setBaseUrl (String baseUrl)
    {
        this.baseUrl = baseUrl;
    }

    /**
     * Remove a single trailing slash, if there is one.
     * @param baseUrl The URL text to trim; must not be <code>null</code>.
     * @return The text without its final slash, or the text unchanged
     * (including the empty string) if it does not end with a slash.
     * @deprecated Removing the last slash from a URL is a bad idea.
     */
    public static String removeLastSlash (String baseUrl)
    {
        // endsWith() is safe on the empty string, unlike charAt(length() - 1)
        if (baseUrl.endsWith ("/"))
        {
            return (baseUrl.substring (0, baseUrl.length () - 1));
        }

        return (baseUrl);
    }
}