/ org.htmlparser / src / org / htmlparser / lexer / InputStreamSource.java
InputStreamSource.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2004 Derrick Oswald
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/InputStreamSource.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2005/10/25 01:26:09 $
 10  // $Revision: 1.9 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser.lexer;
 28  
 29  import java.io.ByteArrayInputStream;
 30  import java.io.IOException;
 31  import java.io.InputStream;
 32  import java.io.InputStreamReader;
 33  import java.io.ObjectInputStream;
 34  import java.io.ObjectOutputStream;
 35  import java.io.UnsupportedEncodingException;
 36  
 37  import org.htmlparser.util.EncodingChangeException;
 38  import org.htmlparser.util.ParserException;
 39  
 40  /**
 41   * A source of characters based on an InputStream such as from a URLConnection.
 42   */
 43  public class InputStreamSource
 44      extends
 45          Source
 46  {
 47      /**
 48       * The default initial size of the character buffer.
 49       * Has a default value of 16384 characters.
 50       */
 51      public static int BUFFER_SIZE = 16384;
 52  
 53      /**
 54       * The stream of bytes being decoded.
 55       * Set to <code>null</code> when the source is closed by destroy().
 56       */
 57      protected transient InputStream mStream;
 58  
 59      /**
 60       * The name of the character set used to decode the stream.
 61       */
 62      protected String mEncoding;
 63  
 64      /**
 65       * The converter from bytes to characters; null once drained or destroyed.
 66       */
 67      protected transient InputStreamReader mReader;
 68  
 69      /**
 70       * The characters read so far; replaced by a larger array as it fills.
 71       */
 72      protected char[] mBuffer;
 73  
 74      /**
 75       * The number of valid characters in the buffer.
 76       */
 77      protected int mLevel;
 78  
 79      /**
 80       * The offset of the next character returned by read().
 81       */
 82      protected int mOffset;
 83  
 84      /**
 85       * The bookmark set by mark(), or -1 when no mark has been set.
 86       */
 87      protected int mMark;
 88  
 /**
  * Build a character source that decodes the stream with the platform's
  * default character set and a buffer of {@link #BUFFER_SIZE} characters.
  * @param stream The stream of bytes to decode.
  * @exception UnsupportedEncodingException Propagated from the
  * three-argument constructor this one delegates to.
  */
 public InputStreamSource (InputStream stream) throws UnsupportedEncodingException
 {
     // a null charset asks the full constructor for the default encoding
     this (stream, null, BUFFER_SIZE);
 }
101  
/**
 * Build a character source that decodes the stream with the named
 * character set and a buffer of {@link #BUFFER_SIZE} characters.
 * @param stream The stream of bytes to decode.
 * @param charset The name of the character set the bytes are encoded in.
 * @exception UnsupportedEncodingException If the named character set
 * is not supported.
 */
public InputStreamSource (InputStream stream, String charset) throws UnsupportedEncodingException
{
    // only the buffer size is defaulted here
    this (stream, charset, BUFFER_SIZE);
}
115  
116      /**
117       * Create a source of characters.
118       * @param stream The stream of bytes to use.
119       * @param charset The character set used in encoding the stream.
120       * @param size The initial character buffer size.
121       * @exception UnsupportedEncodingException If the character set
122       * is unsupported.
123       */
124      public InputStreamSource (InputStream stream, String charset, int size)
125          throws
126              UnsupportedEncodingException
127      {
128          if (null == stream)
129              stream = new Stream (null);
130          else
131              // bug #1044707 mark()/reset() issues
132              if (!stream.markSupported ())
133                  // wrap the stream so we can reset
134                  stream = new Stream (stream);
135              // else
136                  // just because mark is supported doesn't guarantee
137                  // proper reset operation; there is no call to mark
138                  // in this code, so if reset misbehaves there is an
139                  // appropriate message in setEncoding() to suggest
140                  // wraping it in a Stream.
141                  // This was deemed better than an attempt to call
142                  // reset at this point just to check if we would
143                  // succeed later, or to call mark with an arbitrary
144                  // lookahead size
145          mStream = stream;
146          if (null == charset)
147          {
148              mReader = new InputStreamReader (stream);
149              mEncoding = mReader.getEncoding ();
150          }
151          else
152          {
153              mEncoding = charset;
154              mReader = new InputStreamReader (stream, charset);
155          }
156          mBuffer = new char[size];
157          mLevel = 0;
158          mOffset = 0;
159          mMark = -1;
160      }
161  
162      //
163      // Serialization support
164      //
165  
166      /**
167       * Serialization support.
168       * @param out Where to write this object.
169       * @exception IOException If serialization has a problem.
170       */
171      private void writeObject (ObjectOutputStream out)
172          throws
173              IOException
174      {
175          int offset;
176          char[] buffer;
177  
178          if (null != mStream)
179          {
180              // remember the offset, drain the input stream, restore the offset
181              offset = mOffset;
182              buffer = new char[4096];
183              while (EOF != read (buffer))
184                  ;
185              mOffset = offset;
186          }
187  
188          out.defaultWriteObject ();
189      }
190  
191      /**
192       * Deserialization support.
193       * @param in Where to read this object from.
194       * @exception IOException If deserialization has a problem.
195       */
196      private void readObject (ObjectInputStream in)
197          throws
198              IOException,
199              ClassNotFoundException
200      {
201          in.defaultReadObject ();
202          if (null != mBuffer) // buffer is null when destroy's been called
203              // pretend we're open, mStream goes null when exhausted
204              mStream = new ByteArrayInputStream (new byte[0]);
205      }
206  
207      /**
208       * Get the input stream being used.
209       * @return The current input stream.
210       */
211      public InputStream getStream ()
212      {
213          return (mStream);
214      }
215  
216      /**
217       * Get the encoding being used to convert characters.
218       * @return The current encoding.
219       */
220      public String getEncoding ()
221      {
222          return (mEncoding);
223      }
224  
225      /**
226       * Begins reading from the source with the given character set.
227       * If the current encoding is the same as the requested encoding,
228       * this method is a no-op. Otherwise any subsequent characters read from
229       * this page will have been decoded using the given character set.<p>
230       * Some magic happens here to obtain this result if characters have already
231       * been consumed from this source.
232       * Since a Reader cannot be dynamically altered to use a different character
233       * set, the underlying stream is reset, a new Source is constructed
234       * and a comparison made of the characters read so far with the newly
235       * read characters up to the current position.
236       * If a difference is encountered, or some other problem occurs,
237       * an exception is thrown.
238       * @param character_set The character set to use to convert bytes into
239       * characters.
240       * @exception ParserException If a character mismatch occurs between
241       * characters already provided and those that would have been returned
242       * had the new character set been in effect from the beginning. An
243       * exception is also thrown if the underlying stream won't put up with
244       * these shenanigans.
245       */
246      public void setEncoding (String character_set)
247          throws
248              ParserException
249      {
250          String encoding;
251          InputStream stream;
252          char[] buffer;
253          int offset;
254          char[] new_chars;
255  
256          encoding = getEncoding ();
257          if (!encoding.equalsIgnoreCase (character_set))
258          {
259              stream = getStream ();
260              try
261              {
262                  buffer = mBuffer;
263                  offset = mOffset;
264                  stream.reset ();
265                  try
266                  {
267                      mEncoding = character_set;
268                      mReader = new InputStreamReader (stream, character_set);
269                      mBuffer = new char[mBuffer.length];
270                      mLevel = 0;
271                      mOffset = 0;
272                      mMark = -1;
273                      if (0 != offset)
274                      {
275                          new_chars = new char[offset];
276                          if (offset != read (new_chars))
277                              throw new ParserException ("reset stream failed");
278                          for (int i = 0; i < offset; i++)
279                              if (new_chars[i] != buffer[i])
280                                  throw new EncodingChangeException ("character mismatch (new: "
281                                  + new_chars[i]
282                                  + " [0x"
283                                  + Integer.toString (new_chars[i], 16)
284                                  + "] != old: "
285                                  + " [0x"
286                                  + Integer.toString (buffer[i], 16)
287                                  + buffer[i]
288                                  + "]) for encoding change from "
289                                  + encoding
290                                  + " to "
291                                  + character_set
292                                  + " at character offset "
293                                  + i);
294                      }
295                  }
296                  catch (IOException ioe)
297                  {
298                      throw new ParserException (ioe.getMessage (), ioe);
299                  }
300              }
301              catch (IOException ioe)
302              {   // bug #1044707 mark()/reset() issues
303                  throw new ParserException ("Stream reset failed ("
304                      + ioe.getMessage ()
305                      + "), try wrapping it with a org.htmlparser.lexer.Stream",
306                      ioe);
307              }
308          }
309      }
310  
311      /**
312       * Fetch more characters from the underlying reader.
313       * Has no effect if the underlying reader has been drained.
314       * @param min The minimum to read.
315       * @exception IOException If the underlying reader read() throws one.
316       */
317      protected void fill (int min)
318          throws
319              IOException
320      {
321          char[] buffer;
322          int size;
323          int read;
324  
325          if (null != mReader) // mReader goes null when it's been sucked dry
326          {
327              size = mBuffer.length - mLevel; // available space
328              if (size < min) // oops, better get some buffer space
329              {
330                  // unknown length... keep doubling
331                  size = mBuffer.length * 2;
332                  read = mLevel + min;
333                  if (size < read) // or satisfy min, whichever is greater
334                      size = read;
335                  else
336                      min = size - mLevel; // read the max
337                  buffer = new char[size];
338              }
339              else
340              {
341                  buffer = mBuffer;
342                  min = size;
343              }
344  
345              // read into the end of the 'new' buffer
346              read = mReader.read (buffer, mLevel, min);
347              if (EOF == read)
348              {
349                  mReader.close ();
350                  mReader = null;
351              }
352              else
353              {
354                  if (mBuffer != buffer)
355                  {   // copy the bytes previously read
356                      System.arraycopy (mBuffer, 0, buffer, 0, mLevel);
357                      mBuffer = buffer;
358                  }
359                  mLevel += read;
360              }
361              // todo, should repeat on read shorter than original min
362          }
363      }
364  
365      //
366      // Reader overrides
367      //
368  
369      /**
370       * Does nothing.
371       * It's supposed to close the source, but use destroy() instead.
372       * @exception IOException <em>not used</em>
373       * @see #destroy
374       */
375      public void close () throws IOException
376      {
377      }
378  
379      /**
380       * Read a single character.
381       * This method will block until a character is available,
382       * an I/O error occurs, or the end of the stream is reached.
383       * @return The character read, as an integer in the range 0 to 65535
384       * (<tt>0x00-0xffff</tt>), or {@link #EOF EOF} if the end of the stream has
385       * been reached
386       * @exception IOException If an I/O error occurs.
387       */
388      public int read () throws IOException
389      {
390          int ret;
391  
392          if (mLevel - mOffset < 1)
393          {
394              if (null == mStream)
395                  throw new IOException ("source is closed");
396              fill (1);
397              if (mOffset >= mLevel)
398                  ret = EOF;
399              else
400                  ret = mBuffer[mOffset++];
401          }
402          else
403              ret = mBuffer[mOffset++];
404  
405          return (ret);
406      }
407  
408      /**
409       * Read characters into a portion of an array.  This method will block
410       * until some input is available, an I/O error occurs, or the end of the
411       * stream is reached.
412       * @param cbuf Destination buffer
413       * @param off Offset at which to start storing characters
414       * @param len Maximum number of characters to read
415       * @return The number of characters read, or {@link #EOF EOF} if the end of
416       * the stream has been reached
417       * @exception IOException If an I/O error occurs.
418       */
419      public int read (char[] cbuf, int off, int len) throws IOException
420      {
421          int ret;
422  
423          if (null == mStream)
424              throw new IOException ("source is closed");
425          if ((null == cbuf) || (0 > off) || (0 > len))
426              throw new IOException ("illegal argument read ("
427                  + ((null == cbuf) ? "null" : "cbuf")
428                  + ", " + off + ", " + len + ")");
429          if (mLevel - mOffset < len)
430              fill (len - (mLevel - mOffset)); // minimum to satisfy this request
431          if (mOffset >= mLevel)
432              ret = EOF;
433          else
434          {
435              ret = Math.min (mLevel - mOffset, len);
436              System.arraycopy (mBuffer, mOffset, cbuf, off, ret);
437              mOffset += ret;
438          }
439  
440          return (ret);
441      }
442  
443      /**
444       * Read characters into an array.
445       * This method will block until some input is available, an I/O error occurs,
446       * or the end of the stream is reached.
447       * @param cbuf Destination buffer.
448       * @return The number of characters read, or {@link #EOF EOF} if the end of
449       * the stream has been reached.
450       * @exception IOException If an I/O error occurs.
451       */
452      public int read (char[] cbuf) throws IOException
453      {
454          return (read (cbuf, 0, cbuf.length));
455      }
456  
457      /**
458       * Reset the source.
459       * Repositions the read point to begin at zero.
460       * @exception IllegalStateException If the source has been closed.
461       */
462      public void reset ()
463          throws
464              IllegalStateException
465      {
466          if (null == mStream)
467              throw new IllegalStateException ("source is closed");
468          if (-1 != mMark)
469              mOffset = mMark;
470          else
471              mOffset = 0;
472      }
473  
474      /**
475       * Tell whether this source supports the mark() operation.
476       * @return <code>true</code>.
477       */
478      public boolean markSupported ()
479      {
480          return (true);
481      }
482  
483      /**
484       * Mark the present position in the source.
485       * Subsequent calls to {@link #reset()}
486       * will attempt to reposition the source to this point.
487       * @param  readAheadLimit <em>Not used.</em>
488       * @exception IOException If the source is closed.
489       *
490       */
491      public void mark (int readAheadLimit) throws IOException
492      {
493          if (null == mStream)
494              throw new IOException ("source is closed");
495          mMark = mOffset;
496      }
497  
498      /**
499       * Tell whether this source is ready to be read.
500       * @return <code>true</code> if the next read() is guaranteed not to block
501       * for input, <code>false</code> otherwise.
502       * Note that returning false does not guarantee that the next read will block.
503       * @exception IOException If the source is closed.
504       */
505      public boolean ready () throws IOException
506      {
507          if (null == mStream)
508              throw new IOException ("source is closed");
509          return (mOffset < mLevel);
510      }
511  
512      /**
513       * Skip characters.
514       * This method will block until some characters are available,
515       * an I/O error occurs, or the end of the stream is reached.
516       * <em>Note: n is treated as an int</em>
517       * @param n The number of characters to skip.
518       * @return The number of characters actually skipped
519       * @exception IllegalArgumentException If <code>n</code> is negative.
520       * @exception IOException If an I/O error occurs.
521       */
522      public long skip (long n)
523          throws
524              IOException,
525              IllegalArgumentException
526      {
527          long ret;
528  
529          if (null == mStream)
530              throw new IOException ("source is closed");
531          if (0 > n)
532              throw new IllegalArgumentException ("cannot skip backwards");
533          else
534          {
535              if (mLevel - mOffset < n)
536                  fill ((int)(n - (mLevel - mOffset))); // minimum to satisfy this request
537              if (mOffset >= mLevel)
538                  ret = EOF;
539              else
540              {
541                  ret = Math.min (mLevel - mOffset, n);
542                  mOffset += ret;
543              }
544          }
545  
546          return (ret);
547      }
548  
549      //
550      // Methods not in your Daddy's Reader
551      //
552  
553      /**
554       * Undo the read of a single character.
555       * @exception IOException If the source is closed or no characters have
556       * been read.
557       */
558      public void unread () throws IOException
559      {
560          if (null == mStream)
561              throw new IOException ("source is closed");
562          if (0 < mOffset)
563              mOffset--;
564          else
565              throw new IOException ("can't unread no characters");
566      }
567  
568      /**
569       * Retrieve a character again.
570       * @param offset The offset of the character.
571       * @return The character at <code>offset</code>.
572       * @exception IOException If the offset is beyond {@link #offset()} or the
573       * source is closed.
574       */
575      public char getCharacter (int offset) throws IOException
576      {
577          char ret;
578  
579          if (null == mStream)
580              throw new IOException ("source is closed");
581          if (offset >= mBuffer.length)
582              throw new IOException ("illegal read ahead");
583          else
584              ret = mBuffer[offset];
585          
586          return (ret);
587      }
588  
589      /**
590       * Retrieve characters again.
591       * @param array The array of characters.
592       * @param offset The starting position in the array where characters are to be placed.
593       * @param start The starting position, zero based.
594       * @param end The ending position
595       * (exclusive, i.e. the character at the ending position is not included),
596       * zero based.
597       * @exception IOException If the start or end is beyond {@link #offset()}
598       * or the source is closed.
599       */
600      public void getCharacters (char[] array, int offset, int start, int end) throws IOException
601      {
602          if (null == mStream)
603              throw new IOException ("source is closed");
604          System.arraycopy (mBuffer, start, array, offset, end - start);
605      }
606      
607      /**
608       * Retrieve a string.
609       * @param offset The offset of the first character.
610       * @param length The number of characters to retrieve.
611       * @return A string containing the <code>length</code> characters at <code>offset</code>.
612       * @exception IOException If the offset or (offset + length) is beyond
613       * {@link #offset()} or the source is closed.
614       */
615      public String getString (int offset, int length) throws IOException
616      {
617          String ret;
618  
619          if (null == mStream)
620              throw new IOException ("source is closed");
621          if (offset + length > mBuffer.length)
622              throw new IOException ("illegal read ahead");
623          else
624              ret = new String (mBuffer, offset, length);
625          
626          return (ret);
627      }
628  
629      /**
630       * Append characters already read into a <code>StringBuffer</code>.
631       * @param buffer The buffer to append to.
632       * @param offset The offset of the first character.
633       * @param length The number of characters to retrieve.
634       * @exception IOException If the offset or (offset + length) is beyond
635       * {@link #offset()} or the source is closed.
636       */
637      public void getCharacters (StringBuffer buffer, int offset, int length) throws IOException
638      {
639          if (null == mStream)
640              throw new IOException ("source is closed");
641          buffer.append (mBuffer, offset, length);
642      }
643  
644      /**
645       * Close the source.
646       * Once a source has been closed, further {@link #read() read},
647       * {@link #ready ready}, {@link #mark mark}, {@link #reset reset},
648       * {@link #skip skip}, {@link #unread unread},
649       * {@link #getCharacter getCharacter} or {@link #getString getString}
650       * invocations will throw an IOException.
651       * Closing a previously-closed source, however, has no effect.
652       * @exception IOException If an I/O error occurs
653       */
654      public void destroy () throws IOException
655      {
656          mStream = null;
657          if (null != mReader)
658              mReader.close ();
659          mReader = null;
660          mBuffer = null;
661          mLevel = 0;
662          mOffset = 0;
663          mMark = -1;
664      }
665  
666      /**
667       * Get the position (in characters).
668       * @return The number of characters that have already been read, or
669       * {@link #EOF EOF} if the source is closed.
670       */
671      public int offset ()
672      {
673          int ret;
674  
675          if (null == mStream)
676              ret = EOF;
677          else
678              ret = mOffset;
679  
680          return (ret);
681      }
682  
683      /**
684       * Get the number of available characters.
685       * @return The number of characters that can be read without blocking or
686       * zero if the source is closed.
687       */
688      public int available ()
689      {
690          int ret;
691  
692          if (null == mStream)
693              ret = 0;
694          else
695              ret = mLevel - mOffset;
696  
697          return (ret);
698      }
699  }