/ org.htmlparser / src / org / htmlparser / parserapplications / WikiCapturer.java
WikiCapturer.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2003 Derrick Oswald
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/WikiCapturer.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2005/04/12 11:27:42 $
 10  // $Revision: 1.3 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser.parserapplications;
 28  
 29  import java.io.File;
 30  import java.io.IOException;
 31  import java.net.MalformedURLException;
 32  import java.net.URL;
 33  import javax.swing.JFileChooser;
 34  import javax.swing.JOptionPane;
 35  import org.htmlparser.filters.AndFilter;
 36  import org.htmlparser.filters.HasAttributeFilter;
 37  import org.htmlparser.filters.NotFilter;
 38  import org.htmlparser.filters.OrFilter;
 39  import org.htmlparser.filters.TagNameFilter;
 40  
 41  /**
 42   * Save a wikiwikiweb locally.
 43   * Illustrative program to save a wiki locally.
 44   */
 45  public class WikiCapturer
 46      extends
 47          SiteCapturer
 48  {
 49      /**
 50       * Create a wikicapturer.
 51       */
 52      public WikiCapturer ()
 53      {
 54      }
 55  
 56      /**
 57       * Returns <code>true</code> if the link is one we are interested in.
 58       * @param link The link to be checked.
 59       * @return <code>true</code> if the link has the source URL as a prefix
 60       * and doesn't contain '?' or '#'; the former because we won't be able to
 61       * handle server side queries in the static target directory structure and
 62       * the latter because presumably the full page with that reference has
 63       * already been captured previously. This performs a case insensitive
 64       * comparison, which is cheating really, but it's cheap.
 65       */
 66      protected boolean isToBeCaptured (String link)
 67      {
 68          boolean ret;
 69          
 70          ret = super.isToBeCaptured (link);
 71  
 72          // eliminate PhpWiki specific pages
 73          if (ret)
 74              if (link.endsWith ("PhpWikiAdministration"))
 75                  ret = false;
 76              else if (link.endsWith ("PhpWikiDocumentation"))
 77                  ret = false;
 78              else if (link.endsWith ("TextFormattingRules"))
 79                  ret = false;
 80              else if (link.endsWith ("NewMarkupTestPage"))
 81                  ret = false;
 82              else if (link.endsWith ("OldMarkupTestPage"))
 83                  ret = false;
 84              else if (link.endsWith ("OldTextFormattingRules"))
 85                  ret = false;
 86              else if (link.endsWith ("PgsrcTranslation"))
 87                  ret = false;
 88              else if (link.endsWith ("HowToUseWiki"))
 89                  ret = false;
 90              else if (link.endsWith ("MoreAboutMechanics"))
 91                  ret = false;
 92              else if (link.endsWith ("AddingPages"))
 93                  ret = false;
 94              else if (link.endsWith ("WikiWikiWeb"))
 95                  ret = false;
 96              else if (link.endsWith ("UserPreferences"))
 97                  ret = false;
 98              else if (link.endsWith ("PhpWiki"))
 99                  ret = false;
100              else if (link.endsWith ("WabiSabi"))
101                  ret = false;
102              else if (link.endsWith ("EditText"))
103                  ret = false;
104              else if (link.endsWith ("FindPage"))
105                  ret = false;
106              else if (link.endsWith ("RecentChanges"))
107                  ret = false;
108              else if (link.endsWith ("RecentEdits"))
109                  ret = false;
110              else if (link.endsWith ("RecentVisitors"))
111                  ret = false;
112              else if (link.endsWith ("SteveWainstead"))
113                  ret = false;
114  
115          return (ret);
116      }
117  
118      /**
119       * Mainline to capture a web site locally.
120       * @param args The command line arguments.
121       * There are three arguments the web site to capture, the local directory
122       * to save it to, and a flag (true or false) to indicate whether resources
123       * such as images and video are to be captured as well.
124       * These are requested via dialog boxes if not supplied.
125       * @exception MalformedURLException If the supplied URL is invalid.
126       * @exception IOException If an error occurs reading the pages or resources.
127       */
128      public static void main (String[] args)
129          throws
130              MalformedURLException,
131              IOException
132      {
133          WikiCapturer worker;
134          String url;
135          JFileChooser chooser;
136          URL source;
137          String path;
138          File target;
139          Boolean capture;
140          int ret;
141          
142          worker = new WikiCapturer ();
143          if (0 >= args.length)
144          {
145              url = (String)JOptionPane.showInputDialog (
146                  null,
147                  "Enter the URL to capture:",
148                  "Web Site",
149                  JOptionPane.PLAIN_MESSAGE,
150                  null,
151                  null,
152                  "http://htmlparser.sourceforge.net/wiki");
153              if (null != url)
154                  worker.setSource (url);
155              else
156                  System.exit (1);
157          }
158          else
159              worker.setSource (args[0]);
160          if (1 >= args.length)
161          {
162              url = worker.getSource ();
163              source = new URL (url);
164              path = new File (new File ("." + File.separator), source.getHost () + File.separator).getCanonicalPath ();
165              target = new File (path);
166              chooser = new JFileChooser (target);
167              chooser.setDialogType (JFileChooser.SAVE_DIALOG);
168              chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY);
169              chooser.setSelectedFile (target); // this doesn't frickin' work
170              chooser.setMultiSelectionEnabled (false);
171              chooser.setDialogTitle ("Target Directory");
172              ret = chooser.showSaveDialog (null);
173              if (ret == JFileChooser.APPROVE_OPTION)
174                  worker.setTarget (chooser.getSelectedFile ().getAbsolutePath ());
175              else
176                  System.exit (1);
177          }
178          else
179              worker.setTarget (args[1]);
180          if (2 >= args.length)
181          {
182              capture = (Boolean)JOptionPane.showInputDialog (
183                  null,
184                  "Should resources be captured:",
185                  "Capture Resources",
186                  JOptionPane.PLAIN_MESSAGE,
187                  null,
188                  new Object[] { Boolean.TRUE, Boolean.FALSE},
189                  Boolean.TRUE);
190              if (null != capture)
191                  worker.setCaptureResources (capture.booleanValue ());
192              else
193                  System.exit (1);
194          }
195          else
196              worker.setCaptureResources ((Boolean.valueOf (args[2]).booleanValue ()));
197          worker.setFilter (
198              new NotFilter (
199                  new OrFilter (
200                      new AndFilter (
201                          new TagNameFilter ("DIV"),
202                          new HasAttributeFilter ("id", "navbar")), 
203                      new OrFilter (
204                          new AndFilter (
205                              new TagNameFilter ("DIV"),
206                              new HasAttributeFilter ("id", "actionbar")),
207                          new AndFilter (
208                              new TagNameFilter ("DIV"),
209                              new HasAttributeFilter ("id", "xhtml-validator"))))));
210          worker.capture ();
211          
212          System.exit (0);
213      }
214  }