TextExtractingVisitor.java
1 // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML 2 // http://sourceforge.org/projects/htmlparser 3 // Copyright (C) 2004 Somik Raha 4 // 5 // Revision Control Information 6 // 7 // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/TextExtractingVisitor.java,v $ 8 // $Author: derrickoswald $ 9 // $Date: 2004/05/24 16:18:36 $ 10 // $Revision: 1.42 $ 11 // 12 // This library is free software; you can redistribute it and/or 13 // modify it under the terms of the GNU Lesser General Public 14 // License as published by the Free Software Foundation; either 15 // version 2.1 of the License, or (at your option) any later version. 16 // 17 // This library is distributed in the hope that it will be useful, 18 // but WITHOUT ANY WARRANTY; without even the implied warranty of 19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 // Lesser General Public License for more details. 21 // 22 // You should have received a copy of the GNU Lesser General Public 23 // License along with this library; if not, write to the Free Software 24 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 // 26 27 package org.htmlparser.visitors; 28 29 import org.htmlparser.Text; 30 import org.htmlparser.Tag; 31 import org.htmlparser.util.Translate; 32 33 34 /** 35 * Extracts text from a web page. 36 * Usage: 37 * <code> 38 * Parser parser = new Parser(...); 39 * TextExtractingVisitor visitor = new TextExtractingVisitor(); 40 * parser.visitAllNodesWith(visitor); 41 * String textInPage = visitor.getExtractedText(); 42 * </code> 43 */ 44 public class TextExtractingVisitor extends NodeVisitor { 45 private StringBuffer textAccumulator; 46 private boolean preTagBeingProcessed; 47 48 public TextExtractingVisitor() { 49 textAccumulator = new StringBuffer(); 50 preTagBeingProcessed = false; 51 } 52 53 public String getExtractedText() { 54 return textAccumulator.toString(); 55 } 56 57 public void visitStringNode(Text stringNode) { 58 String text = stringNode.getText(); 59 if (!preTagBeingProcessed) { 60 text = Translate.decode(text); 61 text = replaceNonBreakingSpaceWithOrdinarySpace(text); 62 } 63 textAccumulator.append(text); 64 } 65 66 private String replaceNonBreakingSpaceWithOrdinarySpace(String text) { 67 return text.replace('\u00a0',' '); 68 } 69 70 public void visitTag(Tag tag) 71 { 72 if (isPreTag(tag)) 73 preTagBeingProcessed = true; 74 } 75 76 public void visitEndTag(Tag tag) 77 { 78 if (isPreTag(tag)) 79 preTagBeingProcessed = false; 80 } 81 82 private boolean isPreTag(Tag tag) { 83 return tag.getTagName().equals("PRE"); 84 } 85 86 }