WikiCapturer.java
1 // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML 2 // http://sourceforge.org/projects/htmlparser 3 // Copyright (C) 2003 Derrick Oswald 4 // 5 // Revision Control Information 6 // 7 // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/WikiCapturer.java,v $ 8 // $Author: derrickoswald $ 9 // $Date: 2005/04/12 11:27:42 $ 10 // $Revision: 1.3 $ 11 // 12 // This library is free software; you can redistribute it and/or 13 // modify it under the terms of the GNU Lesser General Public 14 // License as published by the Free Software Foundation; either 15 // version 2.1 of the License, or (at your option) any later version. 16 // 17 // This library is distributed in the hope that it will be useful, 18 // but WITHOUT ANY WARRANTY; without even the implied warranty of 19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 // Lesser General Public License for more details. 21 // 22 // You should have received a copy of the GNU Lesser General Public 23 // License along with this library; if not, write to the Free Software 24 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 // 26 27 package org.htmlparser.parserapplications; 28 29 import java.io.File; 30 import java.io.IOException; 31 import java.net.MalformedURLException; 32 import java.net.URL; 33 import javax.swing.JFileChooser; 34 import javax.swing.JOptionPane; 35 import org.htmlparser.filters.AndFilter; 36 import org.htmlparser.filters.HasAttributeFilter; 37 import org.htmlparser.filters.NotFilter; 38 import org.htmlparser.filters.OrFilter; 39 import org.htmlparser.filters.TagNameFilter; 40 41 /** 42 * Save a wikiwikiweb locally. 43 * Illustrative program to save a wiki locally. 44 */ 45 public class WikiCapturer 46 extends 47 SiteCapturer 48 { 49 /** 50 * Create a wikicapturer. 51 */ 52 public WikiCapturer () 53 { 54 } 55 56 /** 57 * Returns <code>true</code> if the link is one we are interested in. 58 * @param link The link to be checked. 59 * @return <code>true</code> if the link has the source URL as a prefix 60 * and doesn't contain '?' or '#'; the former because we won't be able to 61 * handle server side queries in the static target directory structure and 62 * the latter because presumably the full page with that reference has 63 * already been captured previously. This performs a case insensitive 64 * comparison, which is cheating really, but it's cheap. 65 */ 66 protected boolean isToBeCaptured (String link) 67 { 68 boolean ret; 69 70 ret = super.isToBeCaptured (link); 71 72 // eliminate PhpWiki specific pages 73 if (ret) 74 if (link.endsWith ("PhpWikiAdministration")) 75 ret = false; 76 else if (link.endsWith ("PhpWikiDocumentation")) 77 ret = false; 78 else if (link.endsWith ("TextFormattingRules")) 79 ret = false; 80 else if (link.endsWith ("NewMarkupTestPage")) 81 ret = false; 82 else if (link.endsWith ("OldMarkupTestPage")) 83 ret = false; 84 else if (link.endsWith ("OldTextFormattingRules")) 85 ret = false; 86 else if (link.endsWith ("PgsrcTranslation")) 87 ret = false; 88 else if (link.endsWith ("HowToUseWiki")) 89 ret = false; 90 else if (link.endsWith ("MoreAboutMechanics")) 91 ret = false; 92 else if (link.endsWith ("AddingPages")) 93 ret = false; 94 else if (link.endsWith ("WikiWikiWeb")) 95 ret = false; 96 else if (link.endsWith ("UserPreferences")) 97 ret = false; 98 else if (link.endsWith ("PhpWiki")) 99 ret = false; 100 else if (link.endsWith ("WabiSabi")) 101 ret = false; 102 else if (link.endsWith ("EditText")) 103 ret = false; 104 else if (link.endsWith ("FindPage")) 105 ret = false; 106 else if (link.endsWith ("RecentChanges")) 107 ret = false; 108 else if (link.endsWith ("RecentEdits")) 109 ret = false; 110 else if (link.endsWith ("RecentVisitors")) 111 ret = false; 112 else if (link.endsWith ("SteveWainstead")) 113 ret = false; 114 115 return (ret); 116 } 117 118 /** 119 * Mainline to capture a web site locally. 120 * @param args The command line arguments. 121 * There are three arguments the web site to capture, the local directory 122 * to save it to, and a flag (true or false) to indicate whether resources 123 * such as images and video are to be captured as well. 124 * These are requested via dialog boxes if not supplied. 125 * @exception MalformedURLException If the supplied URL is invalid. 126 * @exception IOException If an error occurs reading the pages or resources. 127 */ 128 public static void main (String[] args) 129 throws 130 MalformedURLException, 131 IOException 132 { 133 WikiCapturer worker; 134 String url; 135 JFileChooser chooser; 136 URL source; 137 String path; 138 File target; 139 Boolean capture; 140 int ret; 141 142 worker = new WikiCapturer (); 143 if (0 >= args.length) 144 { 145 url = (String)JOptionPane.showInputDialog ( 146 null, 147 "Enter the URL to capture:", 148 "Web Site", 149 JOptionPane.PLAIN_MESSAGE, 150 null, 151 null, 152 "http://htmlparser.sourceforge.net/wiki"); 153 if (null != url) 154 worker.setSource (url); 155 else 156 System.exit (1); 157 } 158 else 159 worker.setSource (args[0]); 160 if (1 >= args.length) 161 { 162 url = worker.getSource (); 163 source = new URL (url); 164 path = new File (new File ("." + File.separator), source.getHost () + File.separator).getCanonicalPath (); 165 target = new File (path); 166 chooser = new JFileChooser (target); 167 chooser.setDialogType (JFileChooser.SAVE_DIALOG); 168 chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY); 169 chooser.setSelectedFile (target); // this doesn't frickin' work 170 chooser.setMultiSelectionEnabled (false); 171 chooser.setDialogTitle ("Target Directory"); 172 ret = chooser.showSaveDialog (null); 173 if (ret == JFileChooser.APPROVE_OPTION) 174 worker.setTarget (chooser.getSelectedFile ().getAbsolutePath ()); 175 else 176 System.exit (1); 177 } 178 else 179 worker.setTarget (args[1]); 180 if (2 >= args.length) 181 { 182 capture = (Boolean)JOptionPane.showInputDialog ( 183 null, 184 "Should resources be captured:", 185 "Capture Resources", 186 JOptionPane.PLAIN_MESSAGE, 187 null, 188 new Object[] { Boolean.TRUE, Boolean.FALSE}, 189 Boolean.TRUE); 190 if (null != capture) 191 worker.setCaptureResources (capture.booleanValue ()); 192 else 193 System.exit (1); 194 } 195 else 196 worker.setCaptureResources ((Boolean.valueOf (args[2]).booleanValue ())); 197 worker.setFilter ( 198 new NotFilter ( 199 new OrFilter ( 200 new AndFilter ( 201 new TagNameFilter ("DIV"), 202 new HasAttributeFilter ("id", "navbar")), 203 new OrFilter ( 204 new AndFilter ( 205 new TagNameFilter ("DIV"), 206 new HasAttributeFilter ("id", "actionbar")), 207 new AndFilter ( 208 new TagNameFilter ("DIV"), 209 new HasAttributeFilter ("id", "xhtml-validator")))))); 210 worker.capture (); 211 212 System.exit (0); 213 } 214 }