/ org.htmlparser / src / org / htmlparser / visitors / TextExtractingVisitor.java
TextExtractingVisitor.java
 1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
 2  // http://sourceforge.net/projects/htmlparser
 3  // Copyright (C) 2004 Somik Raha
 4  //
 5  // Revision Control Information
 6  //
 7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/TextExtractingVisitor.java,v $
 8  // $Author: derrickoswald $
 9  // $Date: 2004/05/24 16:18:36 $
10  // $Revision: 1.42 $
11  //
12  // This library is free software; you can redistribute it and/or
13  // modify it under the terms of the GNU Lesser General Public
14  // License as published by the Free Software Foundation; either
15  // version 2.1 of the License, or (at your option) any later version.
16  //
17  // This library is distributed in the hope that it will be useful,
18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  // Lesser General Public License for more details.
21  //
22  // You should have received a copy of the GNU Lesser General Public
23  // License along with this library; if not, write to the Free Software
24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  //
26  
27  package org.htmlparser.visitors;
28  
29  import org.htmlparser.Text;
30  import org.htmlparser.Tag;
31  import org.htmlparser.util.Translate;
32  
33  
34  /**
35   * Extracts text from a web page.
36   * Usage:
37   * <code>
38   * Parser parser = new Parser(...);
39   * TextExtractingVisitor visitor = new TextExtractingVisitor();
40   * parser.visitAllNodesWith(visitor);
41   * String textInPage = visitor.getExtractedText();
42   * </code>
43   */
44  public class TextExtractingVisitor extends NodeVisitor {
45      private StringBuffer textAccumulator;
46      private boolean preTagBeingProcessed;
47  
48      public TextExtractingVisitor() {
49          textAccumulator = new StringBuffer();
50          preTagBeingProcessed = false;
51      }
52  
53      public String getExtractedText() {
54          return textAccumulator.toString();
55      }
56  
57      public void visitStringNode(Text stringNode) {
58          String text = stringNode.getText();
59          if (!preTagBeingProcessed) {
60              text = Translate.decode(text);
61              text = replaceNonBreakingSpaceWithOrdinarySpace(text);
62          }
63          textAccumulator.append(text);
64      }
65  
66      private String replaceNonBreakingSpaceWithOrdinarySpace(String text) {
67          return text.replace('\u00a0',' ');
68      }
69  
70      public void visitTag(Tag tag)
71      {
72          if (isPreTag(tag))
73              preTagBeingProcessed = true;
74      }
75  
76      public void visitEndTag(Tag tag)
77      {
78          if (isPreTag(tag))
79              preTagBeingProcessed = false;
80      }
81  
82      private boolean isPreTag(Tag tag) {
83          return tag.getTagName().equals("PRE");
84      }
85  
86  }