ParserUtils.java
1 // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML 2 // http://sourceforge.org/projects/htmlparser 3 // Copyright (C) 2004 Somik Raha 4 // 5 // Revision Control Information 6 // 7 // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/ParserUtils.java,v $ 8 // $Author: derrickoswald $ 9 // $Date: 2005/05/15 11:49:05 $ 10 // $Revision: 1.47 $ 11 // 12 // This library is free software; you can redistribute it and/or 13 // modify it under the terms of the GNU Lesser General Public 14 // License as published by the Free Software Foundation; either 15 // version 2.1 of the License, or (at your option) any later version. 16 // 17 // This library is distributed in the hope that it will be useful, 18 // but WITHOUT ANY WARRANTY; without even the implied warranty of 19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 // Lesser General Public License for more details. 21 // 22 // You should have received a copy of the GNU Lesser General Public 23 // License along with this library; if not, write to the Free Software 24 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 // 26 27 package org.htmlparser.util; 28 29 import java.io.UnsupportedEncodingException; 30 import java.util.ArrayList; 31 32 import org.htmlparser.Node; 33 import org.htmlparser.NodeFilter; 34 import org.htmlparser.Parser; 35 import org.htmlparser.Tag; 36 import org.htmlparser.filters.NodeClassFilter; 37 import org.htmlparser.filters.TagNameFilter; 38 import org.htmlparser.lexer.Lexer; 39 import org.htmlparser.lexer.Page; 40 import org.htmlparser.tags.CompositeTag; 41 import org.htmlparser.util.NodeList; 42 import org.htmlparser.util.ParserException; 43 44 45 public class ParserUtils 46 { 47 public static String removeChars(String s, char occur) { 48 StringBuffer newString = new StringBuffer(); 49 char ch; 50 for (int i = 0; i < s.length(); i++) { 51 ch = s.charAt(i); 52 if (ch != occur) 53 newString.append(ch); 54 } 55 return newString.toString(); 56 } 57 58 public static String removeEscapeCharacters(String inputString) { 59 inputString = ParserUtils.removeChars(inputString, '\r'); 60 inputString = ParserUtils.removeChars(inputString, '\n'); 61 inputString = ParserUtils.removeChars(inputString, '\t'); 62 return inputString; 63 } 64 65 public static String removeTrailingBlanks(String text) { 66 char ch = ' '; 67 while (ch == ' ') { 68 ch = text.charAt(text.length() - 1); 69 if (ch == ' ') 70 text = text.substring(0, text.length() - 1); 71 } 72 return text; 73 } 74 75 /** 76 * Search given node and pick up any objects of given type. 77 * @param node The node to search. 78 * @param type The class to search for. 79 * @return A node array with the matching nodes. 80 */ 81 public static Node[] findTypeInNode(Node node, Class type) 82 { 83 NodeFilter filter; 84 NodeList ret; 85 86 ret = new NodeList (); 87 filter = new NodeClassFilter (type); 88 node.collectInto (ret, filter); 89 90 return (ret.toNodeArray ()); 91 } 92 93 /** 94 * Split the input string considering as string separator 95 * all the not numerical characters 96 * with the only exception of the characters specified in charsDoNotBeRemoved param. 97 * <BR>For example if you call splitButDigits("<DIV> +12.5, +3.4 </DIV>", "+."), 98 * <BR>you obtain an array of strings {"+12.5", "+3.4"} as output (1,2,3,4 and 5 are digits and +,. are chars that do not be removed). 99 * @param input The string in input. 100 * @param charsDoNotBeRemoved The chars that do not be removed. 101 * @return The array of strings as output. 102 */ 103 public static String[] splitButDigits (String input, String charsDoNotBeRemoved) 104 { 105 106 ArrayList output = new ArrayList(); 107 int minCapacity = 0; 108 StringBuffer str = new StringBuffer(); 109 110 boolean charFound = false; 111 boolean toBeAdd = false; 112 for (int index=0; index<input.length(); index++) 113 { 114 charFound=false; 115 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 116 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 117 charFound=true; 118 if ((Character.isDigit(input.charAt(index))) || (charFound)) 119 { 120 str.append(input.charAt(index)); 121 toBeAdd=false; 122 } 123 else 124 if (!toBeAdd) 125 toBeAdd=true; 126 // finished to parse one string 127 if (toBeAdd && (str.length()!=0)) { 128 minCapacity++; 129 output.ensureCapacity(minCapacity); 130 if (output.add(str.toString())) 131 str = new StringBuffer(); 132 else 133 minCapacity--; 134 } 135 } 136 // add the last string 137 if (str.length()!=0) { 138 minCapacity++; 139 output.ensureCapacity(minCapacity); 140 if (output.add(str.toString())) 141 str = new StringBuffer(); 142 else 143 minCapacity--; 144 } 145 146 output.trimToSize(); 147 Object[] outputObj = output.toArray(); 148 String[] outputStr = new String[output.size()]; 149 for (int i=0; i<output.size(); i++) 150 outputStr[i] = new String((String) outputObj[i]); 151 return outputStr; 152 153 } 154 155 /** 156 * Remove from the input string all the not numerical characters 157 * with the only exception of the characters specified in charsDoNotBeRemoved param. 158 * <BR>For example if you call trimButDigits("<DIV> +12.5 </DIV>", "+."), 159 * <BR>you obtain a string "+12.5" as output (1,2 and 5 are digits and +,. are chars that do not be removed). 160 * <BR>For example if you call trimButDigits("<DIV> +1 2 . 5 </DIV>", "+."), 161 * <BR>you obtain a string "+12.5" as output (the spaces between 1 and 2, 2 and ., . and 5 are removed). 162 * @param input The string in input. 163 * @param charsDoNotBeRemoved The chars that do not be removed. 164 * @return The string as output. 165 */ 166 public static String trimButDigits (String input, String charsDoNotBeRemoved) 167 { 168 169 StringBuffer output = new StringBuffer(); 170 171 boolean charFound=false; 172 for (int index=0; index<input.length(); index++) 173 { 174 charFound=false; 175 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 176 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 177 charFound=true; 178 if ((Character.isDigit(input.charAt(index))) || (charFound)) 179 output.append(input.charAt(index)); 180 } 181 182 return output.toString(); 183 184 } 185 186 /** 187 * Remove from the beginning and the end of the input string all the not numerical characters 188 * with the only exception of the characters specified in charsDoNotBeRemoved param. 189 * <BR>The removal process removes only chars at the beginning and at the end of the string. 190 * <BR>For example if you call trimButDigitsBeginEnd("<DIV> +12.5 </DIV>", "+."), 191 * <BR>you obtain a string "+12.5" as output (1,2 and 5 are digits and +,. are chars that do not be removed). 192 * <BR>For example if you call trimButDigitsBeginEnd("<DIV> +1 2 . 5 </DIV>", "+."), 193 * <BR>you obtain a string "+1 2 . 5" as output (the spacess inside the string are not removed). 194 * @param input - The string in input. 195 * @param charsDoNotBeRemoved - The chars that do not be removed. 196 * @return The string as output. 197 */ 198 public static String trimButDigitsBeginEnd (String input, String charsDoNotBeRemoved) 199 { 200 201 String output = new String(); 202 203 int begin=0; 204 int end=input.length()-1; 205 boolean charFound=false; 206 boolean ok=true; 207 for (int index=begin; (index<input.length()) && ok; index++) 208 { 209 charFound=false; 210 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 211 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 212 charFound=true; 213 if ( (Character.isDigit(input.charAt(index))) || (charFound) ) 214 { 215 begin=index; 216 ok=false; 217 } 218 } 219 ok=true; 220 for (int index=end; (index>=0) && ok; index--) 221 { 222 charFound=false; 223 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 224 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 225 charFound=true; 226 if ( (Character.isDigit(input.charAt(index))) || (charFound) ) 227 { 228 end=index; 229 ok=false; 230 } 231 } 232 output=input.substring(begin,end+1); 233 234 return output; 235 236 } 237 238 /** 239 * Split the input string considering as string separator 240 * all the spaces and tabs like chars and 241 * the chars specified in the input variable charsToBeRemoved. 242 * <BR>For example if you call splitSpaces("<DIV> +12.5, +3.4 </DIV>", "<>DIV/,"), 243 * <BR>you obtain an array of strings {"+12.5", "+3.4"} as output (space chars and <,>,D,I,V,/ and the comma are chars that must be removed). 244 * @param input The string in input. 245 * @param charsToBeRemoved The chars to be removed. 246 * @return The array of strings as output. 247 */ 248 public static String[] splitSpaces (String input, String charsToBeRemoved) 249 { 250 251 ArrayList output = new ArrayList(); 252 int minCapacity = 0; 253 StringBuffer str = new StringBuffer(); 254 255 boolean charFound = false; 256 boolean toBeAdd = false; 257 for (int index=0; index<input.length(); index++) 258 { 259 charFound=false; 260 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 261 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 262 charFound=true; 263 if (!((Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound))) 264 { 265 str.append(input.charAt(index)); 266 toBeAdd=false; 267 } 268 else 269 if (!toBeAdd) 270 toBeAdd=true; 271 // finished to parse one string 272 if (toBeAdd && (str.length()!=0)) { 273 minCapacity++; 274 output.ensureCapacity(minCapacity); 275 if (output.add(str.toString())) 276 str = new StringBuffer(); 277 else 278 minCapacity--; 279 } 280 } 281 // add the last string 282 if (str.length()!=0) { 283 minCapacity++; 284 output.ensureCapacity(minCapacity); 285 if (output.add(str.toString())) 286 str = new StringBuffer(); 287 else 288 minCapacity--; 289 } 290 291 output.trimToSize(); 292 Object[] outputObj = output.toArray(); 293 String[] outputStr = new String[output.size()]; 294 for (int i=0; i<output.size(); i++) 295 outputStr[i] = new String((String) outputObj[i]); 296 return outputStr; 297 298 } 299 300 /** 301 * Remove from the input string all the spaces and tabs like chars. 302 * Remove also the chars specified in the input variable charsToBeRemoved. 303 * <BR>For example if you call trimSpaces("<DIV> +12.5 </DIV>", "<>DIV/"), 304 * <BR>you obtain a string "+12.5" as output (space chars and <,>,D,I,V,/ are chars that must be removed). 305 * <BR>For example if you call trimSpaces("<DIV> Trim All Spaces Also The Ones Inside The String </DIV>", "<>DIV/"), 306 * <BR>you obtain a string "TrimAllSpacesAlsoTheOnesInsideTheString" as output (all the spaces inside the string are removed). 307 * @param input The string in input. 308 * @param charsToBeRemoved The chars to be removed. 309 * @return The string as output. 310 */ 311 public static String trimSpaces (String input, String charsToBeRemoved) 312 { 313 314 StringBuffer output = new StringBuffer(); 315 316 boolean charFound=false; 317 for (int index=0; index<input.length(); index++) 318 { 319 charFound=false; 320 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 321 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 322 charFound=true; 323 if (!((Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound))) 324 output.append(input.charAt(index)); 325 } 326 327 return output.toString(); 328 329 } 330 331 /** 332 * Remove from the beginning and the end of the input string all the spaces and tabs like chars. 333 * Remove also the chars specified in the input variable charsToBeRemoved. 334 * <BR>The removal process removes only chars at the beginning and at the end of the string. 335 * <BR>For example if you call trimSpacesBeginEnd("<DIV> +12.5 </DIV>", "<>DIV/"), 336 * <BR>you obtain a string "+12.5" as output (space chars and <,>,D,I,V,/ are chars that must be removed). 337 * <BR>For example if you call trimSpacesBeginEnd("<DIV> Trim all spaces but not the ones inside the string </DIV>", "<>DIV/"), 338 * <BR>you obtain a string "Trim all spaces but not the ones inside the string" as output (all the spaces inside the string are preserved). 339 * @param input The string in input. 340 * @param charsToBeRemoved The chars to be removed. 341 * @return The string as output. 342 */ 343 public static String trimSpacesBeginEnd (String input, String charsToBeRemoved) 344 { 345 346 String output = new String(); 347 348 int begin=0; 349 int end=input.length()-1; 350 boolean charFound=false; 351 boolean ok=true; 352 for (int index=begin; (index<input.length()) && ok; index++) 353 { 354 charFound=false; 355 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 356 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 357 charFound=true; 358 if (!( (Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound) )) 359 { 360 begin=index; 361 ok=false; 362 } 363 } 364 ok=true; 365 for (int index=end; (index>=0) && ok; index--) 366 { 367 charFound=false; 368 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 369 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 370 charFound=true; 371 if (!( (Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound) )) 372 { 373 end=index; 374 ok=false; 375 } 376 } 377 output=input.substring(begin,end+1); 378 379 return output; 380 381 } 382 383 /** 384 * Split the input string considering as string separator 385 * all the characters 386 * with the only exception of the characters specified in charsDoNotBeRemoved param. 387 * <BR>For example if you call splitButChars("<DIV> +12.5, +3.4 </DIV>", "+.1234567890"), 388 * <BR>you obtain an array of strings {"+12.5", "+3.4"} as output (+,.,1,2,3,4,5,6,7,8,9,0 are chars that do not be removed). 389 * @param input The string in input. 390 * @param charsDoNotBeRemoved The chars that do not be removed. 391 * @return The array of strings as output. 392 */ 393 public static String[] splitButChars (String input, String charsDoNotBeRemoved) 394 { 395 396 ArrayList output = new ArrayList(); 397 int minCapacity = 0; 398 StringBuffer str = new StringBuffer(); 399 400 boolean charFound = false; 401 boolean toBeAdd = false; 402 for (int index=0; index<input.length(); index++) 403 { 404 charFound=false; 405 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 406 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 407 charFound=true; 408 if (charFound) 409 { 410 str.append(input.charAt(index)); 411 toBeAdd=false; 412 } 413 else 414 if (!toBeAdd) 415 toBeAdd=true; 416 // finished to parse one string 417 if (toBeAdd && (str.length()!=0)) { 418 minCapacity++; 419 output.ensureCapacity(minCapacity); 420 if (output.add(str.toString())) 421 str = new StringBuffer(); 422 else 423 minCapacity--; 424 } 425 } 426 // add the last string 427 if (str.length()!=0) { 428 minCapacity++; 429 output.ensureCapacity(minCapacity); 430 if (output.add(str.toString())) 431 str = new StringBuffer(); 432 else 433 minCapacity--; 434 } 435 436 output.trimToSize(); 437 Object[] outputObj = output.toArray(); 438 String[] outputStr = new String[output.size()]; 439 for (int i=0; i<output.size(); i++) 440 outputStr[i] = new String((String) outputObj[i]); 441 return outputStr; 442 443 } 444 445 /** 446 * Remove from the input string all the characters 447 * with the only exception of the characters specified in charsDoNotBeRemoved param. 448 * <BR>For example if you call trimButChars("<DIV> +12.5 </DIV>", "+.1234567890"), 449 * <BR>you obtain a string "+12.5" as output (+,.,1,2,3,4,5,6,7,8,9,0 are chars that do not be removed). 450 * <BR>For example if you call trimButChars("<DIV> +1 2 . 5 </DIV>", "+.1234567890"), 451 * <BR>you obtain a string "+12.5" as output (the spaces between 1 and 2, 2 and ., . and 5 are removed). 452 * @param input The string in input. 453 * @param charsDoNotBeRemoved The chars that do not be removed. 454 * @return The string as output. 455 */ 456 public static String trimButChars (String input, String charsDoNotBeRemoved) 457 { 458 459 StringBuffer output = new StringBuffer(); 460 461 boolean charFound=false; 462 for (int index=0; index<input.length(); index++) 463 { 464 charFound=false; 465 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 466 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 467 charFound=true; 468 if (charFound) 469 output.append(input.charAt(index)); 470 } 471 472 return output.toString(); 473 474 } 475 476 /** 477 * Remove from the beginning and the end of the input string all the characters 478 * with the only exception of the characters specified in charsDoNotBeRemoved param. 479 * <BR>The removal process removes only chars at the beginning and at the end of the string. 480 * <BR>For example if you call trimButCharsBeginEnd("<DIV> +12.5 </DIV>", "+.1234567890"), 481 * <BR>you obtain a string "+12.5" as output (+,.,1,2,3,4,5,6,7,8,9,0 are chars that do not be removed). 482 * <BR>For example if you call trimButCharsBeginEnd("<DIV> +1 2 . 5 </DIV>", "+.1234567890"), 483 * <BR>you obtain a string "+1 2 . 5" as output (the spaces inside the string are not removed). 484 * @param input The string in input. 485 * @param charsDoNotBeRemoved The chars that do not be removed. 486 * @return The string as output. 487 */ 488 public static String trimButCharsBeginEnd (String input, String charsDoNotBeRemoved) 489 { 490 491 String output = new String(); 492 493 int begin=0; 494 int end=input.length()-1; 495 boolean charFound=false; 496 boolean ok=true; 497 for (int index=begin; (index<input.length()) && ok; index++) 498 { 499 charFound=false; 500 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 501 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 502 charFound=true; 503 if (charFound) 504 { 505 begin=index; 506 ok=false; 507 } 508 } 509 ok=true; 510 for (int index=end; (index>=0) && ok; index--) 511 { 512 charFound=false; 513 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 514 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 515 charFound=true; 516 if (charFound) 517 { 518 end=index; 519 ok=false; 520 } 521 } 522 output=input.substring(begin,end+1); 523 524 return output; 525 526 } 527 528 /** 529 * Split the input string considering as string separator 530 * the chars specified in the input variable charsToBeRemoved. 531 * <BR>For example if you call splitChars("<DIV> +12.5, +3.4 </DIV>", " <>DIV/,"), 532 * <BR>you obtain an array of strings {"+12.5", "+3.4"} as output (space chars and <,>,D,I,V,/ and the comma are chars that must be removed). 533 * @param input The string in input. 534 * @param charsToBeRemoved The chars to be removed. 535 * @return The array of strings as output. 536 */ 537 public static String[] splitChars (String input, String charsToBeRemoved) 538 { 539 540 ArrayList output = new ArrayList(); 541 int minCapacity = 0; 542 StringBuffer str = new StringBuffer(); 543 544 boolean charFound = false; 545 boolean toBeAdd = false; 546 for (int index=0; index<input.length(); index++) 547 { 548 charFound=false; 549 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 550 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 551 charFound=true; 552 if (!(charFound)) 553 { 554 str.append(input.charAt(index)); 555 toBeAdd=false; 556 } 557 else 558 if (!toBeAdd) 559 toBeAdd=true; 560 // finished to parse one string 561 if (toBeAdd && (str.length()!=0)) { 562 minCapacity++; 563 output.ensureCapacity(minCapacity); 564 if (output.add(str.toString())) 565 str = new StringBuffer(); 566 else 567 minCapacity--; 568 } 569 } 570 // add the last string 571 if (str.length()!=0) { 572 minCapacity++; 573 output.ensureCapacity(minCapacity); 574 if (output.add(str.toString())) 575 str = new StringBuffer(); 576 else 577 minCapacity--; 578 } 579 580 output.trimToSize(); 581 Object[] outputObj = output.toArray(); 582 String[] outputStr = new String[output.size()]; 583 for (int i=0; i<output.size(); i++) 584 outputStr[i] = new String((String) outputObj[i]); 585 return outputStr; 586 587 } 588 589 /** 590 * Remove from the input string all the chars specified in the input variable charsToBeRemoved. 591 * <BR>For example if you call trimChars("<DIV> +12.5 </DIV>", "<>DIV/ "), 592 * <BR>you obtain a string "+12.5" as output (<,>,D,I,V,/ and space char are chars that must be removed). 593 * <BR>For example if you call trimChars("<DIV> Trim All Chars Also The Ones Inside The String </DIV>", "<>DIV/ "), 594 * <BR>you obtain a string "TrimAllCharsAlsoTheOnesInsideTheString" as output (all the spaces inside the string are removed). 595 * @param input The string in input. 596 * @param charsToBeRemoved The chars to be removed. 597 * @return The string as output. 598 */ 599 public static String trimChars (String input, String charsToBeRemoved) 600 { 601 602 StringBuffer output = new StringBuffer(); 603 604 boolean charFound=false; 605 for (int index=0; index<input.length(); index++) 606 { 607 charFound=false; 608 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 609 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 610 charFound=true; 611 if (!(charFound)) 612 output.append(input.charAt(index)); 613 } 614 615 return output.toString(); 616 617 } 618 619 /** 620 * Remove from the beginning and the end of the input string all the chars specified in the input variable charsToBeRemoved. 621 * <BR>The removal process removes only chars at the beginning and at the end of the string. 622 * <BR>For example if you call trimCharsBeginEnd("<DIV> +12.5 </DIV>", "<>DIV/ "), 623 * <BR>you obtain a string "+12.5" as output (' ' is a space char and <,>,D,I,V,/ are chars that must be removed). 624 * <BR>For example if you call trimCharsBeginEnd("<DIV> Trim all spaces but not the ones inside the string </DIV>", "<>DIV/ "), 625 * <BR>you obtain a string "Trim all spaces but not the ones inside the string" as output (all the spaces inside the string are preserved). 626 * @param input The string in input. 627 * @param charsToBeRemoved The chars to be removed. 628 * @return The string as output. 629 */ 630 public static String trimCharsBeginEnd (String input, String charsToBeRemoved) 631 { 632 633 String output = new String(); 634 635 int begin=0; 636 int end=input.length()-1; 637 boolean charFound=false; 638 boolean ok=true; 639 for (int index=begin; (index<input.length()) && ok; index++) 640 { 641 charFound=false; 642 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 643 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 644 charFound=true; 645 if (!(charFound)) 646 { 647 begin=index; 648 ok=false; 649 } 650 } 651 ok=true; 652 for (int index=end; (index>=0) && ok; index--) 653 { 654 charFound=false; 655 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 656 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 657 charFound=true; 658 if (!(charFound)) 659 { 660 end=index; 661 ok=false; 662 } 663 } 664 output=input.substring(begin,end+1); 665 666 return output; 667 668 } 669 670 /** 671 * Split the input string in a string array, 672 * considering the tags as delimiter for splitting. 673 * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). 674 */ 675 public static String[] splitTags (String input, String[] tags) 676 throws ParserException, UnsupportedEncodingException 677 { 678 return splitTags (input, tags, true, true); 679 } 680 681 /** 682 * Split the input string in a string array, 683 * considering the tags as delimiter for splitting. 684 * <BR>For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}), 685 * <BR>you obtain a string array {"Begin ", " ALL OK"} as output (splitted <DIV> tags and their content recursively). 686 * <BR>For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, false), 687 * <BR>you obtain a string array {"Begin ", "<DIV> +12.5 </DIV>", " ALL OK"} as output (splitted <DIV> tags and not their content and no recursively). 688 * <BR>For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, true, false), 689 * <BR>you obtain a string array {"Begin ", " +12.5 ", " ALL OK"} as output (splitted <DIV> tags and not their content recursively). 690 * <BR>For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, true), 691 * <BR>you obtain a string array {"Begin ", " ALL OK"} as output (splitted <DIV> tags and their content). 692 * @param input The string in input. 693 * @param tags The tags to be used as splitting delimiter. 694 * @param recursive Optional parameter (true if not present), if true delete all the tags recursively. 695 * @param insideTag Optional parameter (true if not present), if true delete also the content of the tags. 696 * @return The string array containing the strings delimited by tags. 697 */ 698 public static String[] splitTags (String input, String[] tags, boolean recursive, boolean insideTag) 699 throws ParserException, UnsupportedEncodingException 700 { 701 702 ArrayList outputArrayList = new ArrayList(); 703 int minCapacity = 0; 704 String output = new String(); 705 String inputModified = new String(input); 706 String[] outputStr = new String[] {}; 707 708 String dummyString = createDummyString (' ', input.length()); 709 710 // loop inside the different tags to be trimmed 711 for (int i=0; i<tags.length; i++) 712 { 713 714 // loop inside the tags of the same type 715 NodeList links = getLinks (inputModified, tags[i], recursive); 716 for (int j=0; j<links.size(); j++) 717 { 718 CompositeTag beginTag = (CompositeTag)links.elementAt(j); 719 Tag endTag = beginTag.getEndTag(); 720 721 // positions of begin and end tags 722 int beginTagBegin = beginTag.getStartPosition (); 723 int endTagBegin = beginTag.getEndPosition (); 724 int beginTagEnd = endTag.getStartPosition (); 725 int endTagEnd = endTag.getEndPosition (); 726 727 if (insideTag) 728 { 729 dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagEnd); 730 } 731 else 732 { 733 dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagBegin); 734 dummyString = modifyDummyString (new String(dummyString), beginTagEnd, endTagEnd); 735 } 736 } 737 for (int k=dummyString.indexOf(' '); (k<dummyString.length()) && (k!=-1);) 738 { 739 int kNew = dummyString.indexOf('*',k); 740 if (kNew!=-1) 741 { 742 output = inputModified.substring(k,kNew); 743 k = dummyString.indexOf(' ',kNew); 744 745 minCapacity++; 746 outputArrayList.ensureCapacity(minCapacity); 747 if (outputArrayList.add(output)) 748 output = new String(); 749 else 750 minCapacity--; 751 } 752 else 753 { 754 output = inputModified.substring(k,dummyString.length()); 755 k = kNew; 756 757 minCapacity++; 758 outputArrayList.ensureCapacity(minCapacity); 759 if (outputArrayList.add(output)) 760 output = new String(); 761 else 762 minCapacity--; 763 } 764 } 765 StringBuffer outputStringBuffer = new StringBuffer(); 766 outputArrayList.trimToSize(); 767 Object[] outputObj = outputArrayList.toArray(); 768 outputStr = new String[outputArrayList.size()]; 769 for (int j=0; j<outputArrayList.size(); j++) 770 { 771 outputStr[j] = new String((String) outputObj[j]); 772 outputStringBuffer.append(outputStr[j]); 773 } 774 outputArrayList = new ArrayList(); 775 inputModified = new String(outputStringBuffer.toString()); 776 dummyString = createDummyString (' ', inputModified.length()); 777 } 778 779 return outputStr; 780 781 } 782 783 /** 784 * Split the input string in a string array, 785 * considering the tags as delimiter for splitting. 786 * <BR>Use Class class as input parameter 787 * instead of tags[] string array. 788 * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). 789 */ 790 public static String[] splitTags (String input, Class nodeType) 791 throws ParserException, UnsupportedEncodingException 792 { 793 return splitTags (input, new NodeClassFilter (nodeType), true, true); 794 } 795 796 /** 797 * Split the input string in a string array, 798 * considering the tags as delimiter for splitting. 799 * <BR>Use Class class as input parameter 800 * instead of tags[] string array. 801 * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). 802 */ 803 public static String[] splitTags (String input, Class nodeType, boolean recursive, boolean insideTag) 804 throws ParserException, UnsupportedEncodingException 805 { 806 return splitTags (input, new NodeClassFilter (nodeType), recursive, insideTag); 807 } 808 809 /** 810 * Split the input string in a string array, 811 * considering the tags as delimiter for splitting. 812 * <BR>Use NodeFilter class as input parameter 813 * instead of tags[] string array. 814 * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). 815 */ 816 public static String[] splitTags (String input, NodeFilter filter) 817 throws ParserException, UnsupportedEncodingException 818 { 819 return splitTags (input, filter, true, true); 820 } 821 822 /** 823 * Split the input string in a string array, 824 * considering the tags as delimiter for splitting. 825 * <BR>Use NodeFilter class as input parameter 826 * instead of tags[] string array. 827 * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). 828 */ 829 public static String[] splitTags (String input, NodeFilter filter, boolean recursive, boolean insideTag) 830 throws ParserException, UnsupportedEncodingException 831 { 832 833 ArrayList outputArrayList = new ArrayList(); 834 int minCapacity = 0; 835 String output = new String(); 836 837 String dummyString = createDummyString (' ', input.length()); 838 839 // loop inside the tags of the same type 840 NodeList links = getLinks (input, filter, recursive); 841 for (int j=0; j<links.size(); j++) 842 { 843 CompositeTag beginTag = (CompositeTag)links.elementAt(j); 844 Tag endTag = beginTag.getEndTag(); 845 846 // positions of begin and end tags 847 int beginTagBegin = beginTag.getStartPosition (); 848 int endTagBegin = beginTag.getEndPosition (); 849 int beginTagEnd = endTag.getStartPosition (); 850 int endTagEnd = endTag.getEndPosition (); 851 852 if (insideTag) 853 { 854 dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagEnd); 855 } 856 else 857 { 858 dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagBegin); 859 dummyString = modifyDummyString (new String(dummyString), beginTagEnd, endTagEnd); 860 } 861 } 862 for (int k=dummyString.indexOf(' '); (k<dummyString.length()) && (k!=-1);) 863 { 864 int kNew = dummyString.indexOf('*',k); 865 if (kNew!=-1) 866 { 867 output = input.substring(k,kNew); 868 k = dummyString.indexOf(' ',kNew); 869 870 minCapacity++; 871 outputArrayList.ensureCapacity(minCapacity); 872 if (outputArrayList.add(output)) 873 output = new String(); 874 else 875 minCapacity--; 876 } 877 else 878 { 879 output = input.substring(k,dummyString.length()); 880 k = kNew; 881 882 minCapacity++; 883 outputArrayList.ensureCapacity(minCapacity); 884 if (outputArrayList.add(output)) 885 output = new String(); 886 else 887 minCapacity--; 888 } 889 890 } 891 892 outputArrayList.trimToSize(); 893 Object[] outputObj = outputArrayList.toArray(); 894 String[] outputStr = new String[outputArrayList.size()]; 895 for (int i=0; i<outputArrayList.size(); i++) 896 outputStr[i] = new String((String) outputObj[i]); 897 return outputStr; 898 899 } 900 901 /** 902 * Trim the input string, removing all the tags in the input string. 903 * <BR>The method trims all the substrings included in the input string of the following type: 904 * "<XXX>", where XXX could be a string of any type. 905 * <BR>If you set to true the inside parameter, the method deletes also the YYY string in the following input string: 906 * "<XXX>YYY<ZZZ>", note that ZZZ is not necessary the closing tag of XXX. 907 * @param input The string in input. 908 * @param inside If true, it forces the method to delete also what is inside the tags. 909 * @return The string without tags. 910 */ 911 public static String trimAllTags (String input, boolean inside) 912 { 913 914 StringBuffer output = new StringBuffer(); 915 916 if (inside) { 917 if ((input.indexOf('<')==-1) || (input.lastIndexOf('>')==-1) || (input.lastIndexOf('>')<input.indexOf('<'))) { 918 output.append(input); 919 } else { 920 output.append(input.substring(0, input.indexOf('<'))); 921 output.append(input.substring(input.lastIndexOf('>')+1, input.length())); 922 } 923 } else { 924 boolean write = true; 925 for (int index=0; index<input.length(); index++) 926 { 927 if (input.charAt(index)=='<' && write) 928 write = false; 929 if (write) 930 output.append(input.charAt(index)); 931 if (input.charAt(index)=='>' && (!write)) 932 write = true; 933 } 934 } 935 936 return output.toString(); 937 } 938 939 940 /** 941 * Trim all tags in the input string and 942 * return a string like the input one 943 * without the tags and their content. 944 * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). 945 */ 946 public static String trimTags (String input, String[] tags) 947 throws ParserException, UnsupportedEncodingException 948 { 949 return trimTags (input, tags, true, true); 950 } 951 952 /** 953 * Trim all tags in the input string and 954 * return a string like the input one 955 * without the tags and their content (optional). 956 * <BR>For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}), 957 * <BR>you obtain a string " ALL OK" as output (trimmed <DIV> tags and their content recursively). 958 * <BR>For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, false), 959 * <BR>you obtain a string "<DIV> +12.5 </DIV> ALL OK" as output (trimmed <DIV> tags and not their content and no recursively). 960 * <BR>For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, true, false), 961 * <BR>you obtain a string " +12.5 ALL OK" as output (trimmed <DIV> tags and not their content recursively). 962 * <BR>For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, true), 963 * <BR>you obtain a string " ALL OK" as output (trimmed <DIV> tags and their content). 964 * @param input The string in input. 965 * @param tags The tags to be removed. 966 * @param recursive Optional parameter (true if not present), if true delete all the tags recursively. 967 * @param insideTag Optional parameter (true if not present), if true delete also the content of the tags. 968 * @return The string without tags. 969 */ 970 public static String trimTags (String input, String[] tags, boolean recursive, boolean insideTag) 971 throws ParserException, UnsupportedEncodingException 972 { 973 974 StringBuffer output = new StringBuffer(); 975 String inputModified = new String(input); 976 String dummyString = createDummyString (' ', input.length()); 977 978 // loop inside the different tags to be trimmed 979 for (int i=0; i<tags.length; i++) 980 { 981 output = new StringBuffer(); 982 983 // loop inside the tags of the same type 984 NodeList links = getLinks (inputModified, tags[i], recursive); 985 for (int j=0; j<links.size(); j++) 986 { 987 CompositeTag beginTag = (CompositeTag)links.elementAt(j); 988 Tag endTag = beginTag.getEndTag(); 989 990 // positions of begin and end tags 991 int beginTagBegin = beginTag.getStartPosition (); 992 int endTagBegin = beginTag.getEndPosition (); 993 int beginTagEnd = endTag.getStartPosition (); 994 int endTagEnd = endTag.getEndPosition (); 995 996 997 if (insideTag) 998 { 999 dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagEnd); 1000 } 1001 else 1002 { 1003 dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagBegin); 1004 dummyString = modifyDummyString (new String(dummyString), beginTagEnd, endTagEnd); 1005 } 1006 } 1007 for (int k=dummyString.indexOf(' '); (k<dummyString.length()) && (k!=-1);) 1008 { 1009 int kNew = dummyString.indexOf('*',k); 1010 if (kNew!=-1) 1011 { 1012 output = output.append(inputModified.substring(k,kNew)); 1013 k = dummyString.indexOf(' ',kNew); 1014 } 1015 else 1016 { 1017 output = output.append(inputModified.substring(k,dummyString.length())); 1018 k = kNew; 1019 } 1020 } 1021 inputModified = new String(output); 1022 dummyString = createDummyString (' ', inputModified.length()); 1023 } 1024 1025 return output.toString(); 1026 1027 } 1028 1029 /** 1030 * Trim all tags in the input string and 1031 * return a string like the input one 1032 * without the tags and their content. 1033 * <BR>Use Class class as input parameter 1034 * instead of tags[] string array. 1035 * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). 1036 */ 1037 public static String trimTags (String input, Class nodeType) 1038 throws ParserException, UnsupportedEncodingException 1039 { 1040 return trimTags (input, new NodeClassFilter (nodeType), true, true); 1041 } 1042 1043 /** 1044 * Trim all tags in the input string and 1045 * return a string like the input one 1046 * without the tags and their content (optional). 1047 * <BR>Use Class class as input parameter 1048 * instead of tags[] string array. 1049 * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). 1050 */ 1051 public static String trimTags (String input, Class nodeType, boolean recursive, boolean insideTag) 1052 throws ParserException, UnsupportedEncodingException 1053 { 1054 return trimTags (input, new NodeClassFilter (nodeType), recursive, insideTag); 1055 } 1056 1057 /** 1058 * Trim all tags in the input string and 1059 * return a string like the input one 1060 * without the tags and their content. 1061 * <BR>Use NodeFilter class as input parameter 1062 * instead of tags[] string array. 1063 * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). 1064 */ 1065 public static String trimTags (String input, NodeFilter filter) 1066 throws ParserException, UnsupportedEncodingException 1067 { 1068 return trimTags (input, filter, true, true); 1069 } 1070 1071 /** 1072 * Trim all tags in the input string and 1073 * return a string like the input one 1074 * without the tags and their content (optional). 1075 * <BR>Use NodeFilter class as input parameter 1076 * instead of tags[] string array. 1077 * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). 1078 */ 1079 public static String trimTags (String input, NodeFilter filter, boolean recursive, boolean insideTag) 1080 throws ParserException, UnsupportedEncodingException 1081 { 1082 1083 StringBuffer output = new StringBuffer(); 1084 1085 String dummyString = createDummyString (' ', input.length()); 1086 1087 // loop inside the tags of the same type 1088 NodeList links = getLinks (input, filter, recursive); 1089 for (int j=0; j<links.size(); j++) 1090 { 1091 CompositeTag beginTag = (CompositeTag)links.elementAt(j); 1092 Tag endTag = beginTag.getEndTag(); 1093 1094 // positions of begin and end tags 1095 int beginTagBegin = beginTag.getStartPosition (); 1096 int endTagBegin = beginTag.getEndPosition (); 1097 int beginTagEnd = endTag.getStartPosition (); 1098 int endTagEnd = endTag.getEndPosition (); 1099 1100 if (insideTag) 1101 { 1102 dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagEnd); 1103 } 1104 else 1105 { 1106 dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagBegin); 1107 dummyString = modifyDummyString (new String(dummyString), beginTagEnd, endTagEnd); 1108 } 1109 } 1110 for (int k=dummyString.indexOf(' '); (k<dummyString.length()) && (k!=-1);) 1111 { 1112 int kNew = dummyString.indexOf('*',k); 1113 if (kNew!=-1) 1114 { 1115 output = output.append(input.substring(k,kNew)); 1116 k = dummyString.indexOf(' ',kNew); 1117 } 1118 else 1119 { 1120 output = output.append(input.substring(k,dummyString.length())); 1121 k = kNew; 1122 } 1123 1124 } 1125 1126 return output.toString(); 1127 1128 } 1129 1130 /** 1131 * Create a Parser Object having a String Object as input (instead of a url or a string representing the url location). 1132 * <BR>The string will be parsed as it would be a file. 1133 * @param input The string in input. 1134 * @return The Parser Object with the string as input stream. 1135 */ 1136 public static Parser createParserParsingAnInputString (String input) 1137 throws ParserException, UnsupportedEncodingException 1138 { 1139 1140 Parser parser = new Parser(); 1141 Lexer lexer = new Lexer(); 1142 Page page = new Page(input); 1143 lexer.setPage(page); 1144 parser.setLexer(lexer); 1145 1146 return parser; 1147 1148 } 1149 1150 private static NodeList getLinks (String output, String tag, boolean recursive) 1151 throws ParserException, UnsupportedEncodingException 1152 { 1153 1154 Parser parser = new Parser(); 1155 NodeFilter filterLink = new TagNameFilter (tag); 1156 NodeList links = new NodeList (); 1157 parser = createParserParsingAnInputString(output); 1158 links = parser.extractAllNodesThatMatch(filterLink); 1159 1160 // loop to remove tags added recursively 1161 // so if you have selected 'not recursive option' 1162 // you have only the tag container and not the contained tags. 1163 if (!recursive) 1164 { 1165 for (int j=0; j<links.size(); j++) 1166 { 1167 CompositeTag jStartTag = (CompositeTag)links.elementAt(j); 1168 Tag jEndTag = jStartTag.getEndTag(); 1169 int jStartTagBegin = jStartTag.getStartPosition (); 1170 int jEndTagEnd = jEndTag.getEndPosition (); 1171 for (int k=0; k<links.size(); k++) 1172 { 1173 CompositeTag kStartTag = (CompositeTag)links.elementAt(k); 1174 Tag kEndTag = kStartTag.getEndTag(); 1175 int kStartTagBegin = kStartTag.getStartPosition (); 1176 int kEndTagEnd = kEndTag.getEndPosition (); 1177 if ((k!=j) && (kStartTagBegin>jStartTagBegin) && (kEndTagEnd<jEndTagEnd)) 1178 { 1179 links.remove(k); 1180 k--; 1181 j--; 1182 } 1183 } 1184 } 1185 } 1186 1187 return links; 1188 1189 } 1190 1191 private static NodeList getLinks (String output, NodeFilter filter, boolean recursive) 1192 throws ParserException, UnsupportedEncodingException 1193 { 1194 1195 Parser parser = new Parser(); 1196 NodeList links = new NodeList (); 1197 parser = createParserParsingAnInputString(output); 1198 links = parser.extractAllNodesThatMatch(filter); 1199 1200 // loop to remove tags added recursively 1201 // so if you have selected 'not recursive option' 1202 // you have only the tag container and not the contained tags. 1203 if (!recursive) 1204 { 1205 for (int j=0; j<links.size(); j++) 1206 { 1207 CompositeTag jStartTag = (CompositeTag)links.elementAt(j); 1208 Tag jEndTag = jStartTag.getEndTag(); 1209 int jStartTagBegin = jStartTag.getStartPosition (); 1210 int jEndTagEnd = jEndTag.getEndPosition (); 1211 for (int k=0; k<links.size(); k++) 1212 { 1213 CompositeTag kStartTag = (CompositeTag)links.elementAt(k); 1214 Tag kEndTag = kStartTag.getEndTag(); 1215 int kStartTagBegin = kStartTag.getStartPosition (); 1216 int kEndTagEnd = kEndTag.getEndPosition (); 1217 if ((k!=j) && (kStartTagBegin>jStartTagBegin) && (kEndTagEnd<jEndTagEnd)) 1218 { 1219 links.remove(k); 1220 k--; 1221 j--; 1222 } 1223 } 1224 } 1225 } 1226 1227 return links; 1228 1229 } 1230 1231 private static String createDummyString (char fillingChar, int length) 1232 { 1233 StringBuffer dummyStringBuffer = new StringBuffer(); 1234 for (int j=0; j<length; j++) 1235 dummyStringBuffer = dummyStringBuffer.append(fillingChar); 1236 return new String(dummyStringBuffer); 1237 } 1238 1239 private static String modifyDummyString (String dummyString, int beginTag, int endTag) 1240 { 1241 String dummyStringInterval = createDummyString ('*', endTag-beginTag); 1242 return new String(dummyString.substring(0, beginTag) + dummyStringInterval + dummyString.substring(endTag, dummyString.length())); 1243 } 1244 1245 }