InputStreamSource.java
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/InputStreamSource.java,v $
// $Author: derrickoswald $
// $Date: 2005/10/25 01:26:09 $
// $Revision: 1.9 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//

package org.htmlparser.lexer;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.UnsupportedEncodingException;

import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.ParserException;

/**
 * A source of characters based on an InputStream such as from a URLConnection.
 * <p>Bytes are decoded through an {@link InputStreamReader} and every decoded
 * character is kept in an internal buffer. Keeping the characters is what lets
 * {@link #reset}, {@link #unread}, {@link #getCharacter} and
 * {@link #getString} revisit text that has already been read.
 */
public class InputStreamSource
    extends
        Source
{
    /**
     * The default initial size, in characters, of the character buffer.
     * Has a default value of 16384.
     */
    public static int BUFFER_SIZE = 16384;

    /**
     * The stream of bytes.
     * Set to <code>null</code> when the source is closed.
     */
    protected transient InputStream mStream;

    /**
     * The character set in use.
     */
    protected String mEncoding;

    /**
     * The converter from bytes to characters.
     * Set to <code>null</code> once the underlying stream has been drained.
     */
    protected transient InputStreamReader mReader;

    /**
     * The characters read so far.
     */
    protected char[] mBuffer;

    /**
     * The number of valid characters in the buffer.
     */
    protected int mLevel;

    /**
     * The offset of the next character returned by read().
     */
    protected int mOffset;

    /**
     * The bookmark, or -1 if no mark has been set.
     */
    protected int mMark;

    /**
     * Create a source of characters using the default character set.
     * @param stream The stream of bytes to use.
     * @exception UnsupportedEncodingException If the default character set
     * is unsupported.
     */
    public InputStreamSource (InputStream stream)
        throws
            UnsupportedEncodingException
    {
        this (stream, null, BUFFER_SIZE);
    }

    /**
     * Create a source of characters.
     * @param stream The stream of bytes to use.
     * @param charset The character set used in encoding the stream.
     * @exception UnsupportedEncodingException If the character set
     * is unsupported.
     */
    public InputStreamSource (InputStream stream, String charset)
        throws
            UnsupportedEncodingException
    {
        this (stream, charset, BUFFER_SIZE);
    }

    /**
     * Create a source of characters.
     * @param stream The stream of bytes to use. A <code>null</code> stream,
     * or one that does not support mark, is wrapped in a {@link Stream}.
     * @param charset The character set used in encoding the stream, or
     * <code>null</code> to use the platform default.
     * @param size The initial character buffer size.
     * @exception UnsupportedEncodingException If the character set
     * is unsupported.
     */
    public InputStreamSource (InputStream stream, String charset, int size)
        throws
            UnsupportedEncodingException
    {
        if (null == stream)
        {
            stream = new Stream (null);
        }
        else if (!stream.markSupported ())
        {
            // bug #1044707 mark()/reset() issues:
            // wrap the stream so we can reset
            stream = new Stream (stream);
        }
        // else
        //     just because mark is supported doesn't guarantee
        //     proper reset operation; there is no call to mark
        //     in this code, so if reset misbehaves there is an
        //     appropriate message in setEncoding() to suggest
        //     wrapping it in a Stream.
        //     This was deemed better than an attempt to call
        //     reset at this point just to check if we would
        //     succeed later, or to call mark with an arbitrary
        //     lookahead size
        mStream = stream;
        if (null == charset)
        {
            mReader = new InputStreamReader (stream);
            mEncoding = mReader.getEncoding ();
        }
        else
        {
            mEncoding = charset;
            mReader = new InputStreamReader (stream, charset);
        }
        mBuffer = new char[size];
        mLevel = 0;
        mOffset = 0;
        mMark = -1;
    }

    //
    // Serialization support
    //

    /**
     * Serialization support.
     * The stream and reader are transient, so an open source first reads
     * everything that remains into the character buffer; the read position
     * is put back afterwards, even if draining fails.
     * @param out Where to write this object.
     * @exception IOException If serialization has a problem.
     */
    private void writeObject (ObjectOutputStream out)
        throws
            IOException
    {
        if (null != mStream)
        {
            // remember the offset, drain the input stream, restore the offset
            int offset = mOffset;
            char[] scratch = new char[4096];
            try
            {
                while (EOF != read (scratch))
                {
                    // the characters are retained in mBuffer; scratch is discarded
                }
            }
            finally
            {
                // a failed drain must not move the caller's read position
                mOffset = offset;
            }
        }

        out.defaultWriteObject ();
    }

    /**
     * Deserialization support.
     * @param in Where to read this object from.
     * @exception IOException If deserialization has a problem.
     * @exception ClassNotFoundException If a serialized class cannot be found.
     */
    private void readObject (ObjectInputStream in)
        throws
            IOException,
            ClassNotFoundException
    {
        in.defaultReadObject ();
        if (null != mBuffer) // buffer is null when destroy's been called
        {
            // pretend we're open: a null mStream means "closed" everywhere else
            mStream = new ByteArrayInputStream (new byte[0]);
        }
    }

    /**
     * Get the input stream being used.
     * @return The current input stream, or <code>null</code> if the source
     * is closed.
     */
    public InputStream getStream ()
    {
        return (mStream);
    }

    /**
     * Get the encoding being used to convert characters.
     * @return The current encoding.
     */
    public String getEncoding ()
    {
        return (mEncoding);
    }

    /**
     * Begins reading from the source with the given character set.
     * If the current encoding is the same as the requested encoding,
     * this method is a no-op. Otherwise any subsequent characters read from
     * this page will have been decoded using the given character set.<p>
     * Some magic happens here to obtain this result if characters have already
     * been consumed from this source.
     * Since a Reader cannot be dynamically altered to use a different character
     * set, the underlying stream is reset, a new reader is constructed
     * and a comparison made of the characters read so far with the newly
     * read characters up to the current position.
     * If a difference is encountered, or some other problem occurs,
     * an exception is thrown.<p>
     * If the character set is not supported, the exception is thrown before
     * the stream is reset, so the source keeps its current encoding and
     * position. After a character mismatch the source is left using the new
     * encoding, positioned at the same character offset as before the call.
     * @param character_set The character set to use to convert bytes into
     * characters.
     * @exception ParserException If a character mismatch occurs between
     * characters already provided and those that would have been returned
     * had the new character set been in effect from the beginning. An
     * exception is also thrown if the underlying stream won't put up with
     * these shenanigans.
     */
    public void setEncoding (String character_set)
        throws
            ParserException
    {
        String encoding;
        InputStream stream;
        InputStreamReader reader;
        char[] buffer;
        int offset;
        char[] new_chars;

        encoding = getEncoding ();
        if (encoding.equalsIgnoreCase (character_set))
            return;

        stream = getStream ();
        buffer = mBuffer;
        offset = mOffset;

        // Build the new converter first: constructing it reads nothing from
        // the stream, and an unsupported character set is then reported
        // while this source is still completely untouched.
        try
        {
            reader = new InputStreamReader (stream, character_set);
        }
        catch (UnsupportedEncodingException uee)
        {
            throw new ParserException (uee.getMessage (), uee);
        }

        try
        {
            stream.reset ();
        }
        catch (IOException ioe)
        {   // bug #1044707 mark()/reset() issues
            throw new ParserException ("Stream reset failed ("
                + ioe.getMessage ()
                + "), try wrapping it with a org.htmlparser.lexer.Stream",
                ioe);
        }

        // start over with the new converter and an empty buffer
        mEncoding = character_set;
        mReader = reader;
        mBuffer = new char[mBuffer.length];
        mLevel = 0;
        mOffset = 0;
        mMark = -1;

        if (0 != offset)
        {
            // re-read up to the old position and verify that what was
            // already handed out decodes identically under the new encoding
            new_chars = new char[offset];
            try
            {
                if (offset != read (new_chars))
                    throw new ParserException ("reset stream failed");
            }
            catch (IOException ioe)
            {
                throw new ParserException (ioe.getMessage (), ioe);
            }
            for (int i = 0; i < offset; i++)
                if (new_chars[i] != buffer[i])
                    throw new EncodingChangeException ("character mismatch (new: "
                        + new_chars[i]
                        + " [0x"
                        + Integer.toString (new_chars[i], 16)
                        + "] != old: "
                        + buffer[i]
                        + " [0x"
                        + Integer.toString (buffer[i], 16)
                        + "]) for encoding change from "
                        + encoding
                        + " to "
                        + character_set
                        + " at character offset "
                        + i);
        }
    }

    /**
     * Fetch more characters from the underlying reader.
     * Has no effect if the underlying reader has been drained.
     * A single read is attempted, so fewer than <code>min</code> characters
     * may be added. If the buffer lacks room for <code>min</code> more
     * characters it is replaced by one at least twice as large.
     * @param min The minimum to read.
     * @exception IOException If the underlying reader read() throws one.
     */
    protected void fill (int min)
        throws
            IOException
    {
        char[] target;
        int space;
        int capacity;
        int needed;
        int count;

        if (null == mReader) // mReader goes null when it's been sucked dry
            return;

        space = mBuffer.length - mLevel; // room left in the current buffer
        if (space < min) // oops, better get some buffer space
        {
            // unknown length... keep doubling
            capacity = mBuffer.length * 2;
            needed = mLevel + min;
            if (capacity < needed) // or satisfy min, whichever is greater
                capacity = needed;
            else
                min = capacity - mLevel; // read the max
            target = new char[capacity];
        }
        else
        {
            target = mBuffer;
            min = space;
        }

        // read into the end of the 'new' buffer
        count = mReader.read (target, mLevel, min);
        if (EOF == count)
        {
            mReader.close ();
            mReader = null;
        }
        else
        {
            if (mBuffer != target)
            {   // copy the characters previously read
                System.arraycopy (mBuffer, 0, target, 0, mLevel);
                mBuffer = target;
            }
            mLevel += count;
        }
        // todo, should repeat on read shorter than original min
    }

    //
    // Reader overrides
    //

    /**
     * Does nothing.
     * It's supposed to close the source, but use destroy() instead.
     * @exception IOException <em>not used</em>
     * @see #destroy
     */
    public void close () throws IOException
    {
    }

    /**
     * Read a single character.
     * This method will block until a character is available,
     * an I/O error occurs, or the end of the stream is reached.
     * @return The character read, as an integer in the range 0 to 65535
     * (<tt>0x00-0xffff</tt>), or {@link #EOF EOF} if the end of the stream has
     * been reached
     * @exception IOException If an I/O error occurs.
     */
    public int read () throws IOException
    {
        if (mLevel - mOffset < 1)
        {
            // nothing buffered: need the stream
            if (null == mStream)
                throw new IOException ("source is closed");
            fill (1);
            if (mOffset >= mLevel)
                return (EOF);
        }

        return (mBuffer[mOffset++]);
    }

    /**
     * Read characters into a portion of an array. This method will block
     * until some input is available, an I/O error occurs, or the end of the
     * stream is reached.
     * @param cbuf Destination buffer
     * @param off Offset at which to start storing characters
     * @param len Maximum number of characters to read
     * @return The number of characters read, or {@link #EOF EOF} if the end of
     * the stream has been reached
     * @exception IOException If an I/O error occurs, the source is closed,
     * <code>cbuf</code> is <code>null</code>, or <code>off</code> or
     * <code>len</code> is negative.
     */
    public int read (char[] cbuf, int off, int len) throws IOException
    {
        int buffered;
        int ret;

        if (null == mStream)
            throw new IOException ("source is closed");
        if ((null == cbuf) || (0 > off) || (0 > len))
            throw new IOException ("illegal argument read ("
                + ((null == cbuf) ? "null" : "cbuf")
                + ", " + off + ", " + len + ")");

        buffered = mLevel - mOffset;
        if (buffered < len)
            fill (len - buffered); // minimum to satisfy this request
        if (mOffset >= mLevel)
            ret = EOF;
        else
        {
            ret = Math.min (mLevel - mOffset, len);
            System.arraycopy (mBuffer, mOffset, cbuf, off, ret);
            mOffset += ret;
        }

        return (ret);
    }

    /**
     * Read characters into an array.
     * This method will block until some input is available, an I/O error occurs,
     * or the end of the stream is reached.
     * @param cbuf Destination buffer.
     * @return The number of characters read, or {@link #EOF EOF} if the end of
     * the stream has been reached.
     * @exception IOException If an I/O error occurs.
     */
    public int read (char[] cbuf) throws IOException
    {
        return (read (cbuf, 0, cbuf.length));
    }

    /**
     * Reset the source.
     * Repositions the read point to the mark, or to zero if no mark has
     * been set.
     * @exception IllegalStateException If the source has been closed.
     */
    public void reset ()
        throws
            IllegalStateException
    {
        if (null == mStream)
            throw new IllegalStateException ("source is closed");
        mOffset = (-1 != mMark) ? mMark : 0;
    }

    /**
     * Tell whether this source supports the mark() operation.
     * @return <code>true</code>.
     */
    public boolean markSupported ()
    {
        return (true);
    }

    /**
     * Mark the present position in the source.
     * Subsequent calls to {@link #reset()}
     * will attempt to reposition the source to this point.
     * @param readAheadLimit <em>Not used.</em>
     * @exception IOException If the source is closed.
     */
    public void mark (int readAheadLimit) throws IOException
    {
        if (null == mStream)
            throw new IOException ("source is closed");
        mMark = mOffset;
    }

    /**
     * Tell whether this source is ready to be read.
     * @return <code>true</code> if the next read() is guaranteed not to block
     * for input, <code>false</code> otherwise.
     * Note that returning false does not guarantee that the next read will block.
     * @exception IOException If the source is closed.
     */
    public boolean ready () throws IOException
    {
        if (null == mStream)
            throw new IOException ("source is closed");
        return (mOffset < mLevel);
    }

    /**
     * Skip characters.
     * This method will block until some characters are available,
     * an I/O error occurs, or the end of the stream is reached.
     * <em>Note: n is treated as an int</em>
     * @param n The number of characters to skip.
     * @return The number of characters actually skipped, or
     * {@link #EOF EOF} if no characters are available to skip.
     * @exception IllegalArgumentException If <code>n</code> is negative.
     * @exception IOException If an I/O error occurs.
     */
    public long skip (long n)
        throws
            IOException,
            IllegalArgumentException
    {
        long ret;

        if (null == mStream)
            throw new IOException ("source is closed");
        if (0 > n)
            throw new IllegalArgumentException ("cannot skip backwards");

        if (mLevel - mOffset < n)
            fill ((int)(n - (mLevel - mOffset))); // minimum to satisfy this request
        if (mOffset >= mLevel)
            ret = EOF;
        else
        {
            ret = Math.min (mLevel - mOffset, n);
            mOffset += ret;
        }

        return (ret);
    }

    //
    // Methods not in your Daddy's Reader
    //

    /**
     * Undo the read of a single character.
     * @exception IOException If the source is closed or no characters have
     * been read.
     */
    public void unread () throws IOException
    {
        if (null == mStream)
            throw new IOException ("source is closed");
        if (0 >= mOffset)
            throw new IOException ("can't unread no characters");
        mOffset--;
    }

    /**
     * Retrieve a character again.
     * @param offset The offset of the character.
     * @return The character at <code>offset</code>.
     * @exception IOException If the offset is beyond the characters read
     * from the stream so far or the source is closed.
     */
    public char getCharacter (int offset) throws IOException
    {
        if (null == mStream)
            throw new IOException ("source is closed");
        // bound by the filled portion of the buffer, not its capacity:
        // slots at or beyond mLevel have never held stream data
        if (offset >= mLevel)
            throw new IOException ("illegal read ahead");

        return (mBuffer[offset]);
    }

    /**
     * Retrieve characters again.
     * @param array The array of characters.
     * @param offset The starting position in the array where characters are to be placed.
     * @param start The starting position, zero based.
     * @param end The ending position
     * (exclusive, i.e. the character at the ending position is not included),
     * zero based.
     * @exception IOException If the source is closed.
     */
    public void getCharacters (char[] array, int offset, int start, int end) throws IOException
    {
        if (null == mStream)
            throw new IOException ("source is closed");
        System.arraycopy (mBuffer, start, array, offset, end - start);
    }

    /**
     * Retrieve a string.
     * @param offset The offset of the first character.
     * @param length The number of characters to retrieve.
     * @return A string containing the <code>length</code> characters at <code>offset</code>.
     * @exception IOException If (offset + length) is beyond the characters
     * read from the stream so far or the source is closed.
     */
    public String getString (int offset, int length) throws IOException
    {
        if (null == mStream)
            throw new IOException ("source is closed");
        // compare by subtraction so that offset + length cannot overflow,
        // and against mLevel so that unfilled buffer slots are never exposed
        if (length > mLevel - offset)
            throw new IOException ("illegal read ahead");

        return (new String (mBuffer, offset, length));
    }

    /**
     * Append characters already read into a <code>StringBuffer</code>.
     * @param buffer The buffer to append to.
     * @param offset The offset of the first character.
     * @param length The number of characters to retrieve.
     * @exception IOException If the source is closed.
     */
    public void getCharacters (StringBuffer buffer, int offset, int length) throws IOException
    {
        if (null == mStream)
            throw new IOException ("source is closed");
        buffer.append (mBuffer, offset, length);
    }

    /**
     * Close the source.
     * Once a source has been closed, further {@link #read() read},
     * {@link #ready ready}, {@link #mark mark}, {@link #reset reset},
     * {@link #skip skip}, {@link #unread unread},
     * {@link #getCharacter getCharacter} or {@link #getString getString}
     * invocations will throw an IOException
     * (an IllegalStateException in the case of {@link #reset reset}).
     * Closing a previously-closed source, however, has no effect.
     * The source is marked closed and its buffer released even if closing
     * the underlying reader fails.
     * @exception IOException If an I/O error occurs
     */
    public void destroy () throws IOException
    {
        InputStreamReader reader;

        // clear all state before closing, so that a failing close()
        // cannot leave the source half open
        reader = mReader;
        mStream = null;
        mReader = null;
        mBuffer = null;
        mLevel = 0;
        mOffset = 0;
        mMark = -1;
        if (null != reader)
            reader.close ();
    }

    /**
     * Get the position (in characters).
     * @return The number of characters that have already been read, or
     * {@link #EOF EOF} if the source is closed.
     */
    public int offset ()
    {
        return ((null == mStream) ? EOF : mOffset);
    }

    /**
     * Get the number of available characters.
     * @return The number of characters that can be read without blocking or
     * zero if the source is closed.
     */
    public int available ()
    {
        return ((null == mStream) ? 0 : (mLevel - mOffset));
    }
}