/ org.htmlparser / src / org / htmlparser / lexer / Lexer.java
Lexer.java
   1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
   2  // http://sourceforge.org/projects/htmlparser
   3  // Copyright (C) 2004 Derrick Oswald
   4  //
   5  // Revision Control Information
   6  //
   7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v $
   8  // $Author: derrickoswald $
   9  // $Date: 2006/03/19 21:26:32 $
  10  // $Revision: 1.44 $
  11  //
  12  // This library is free software; you can redistribute it and/or
  13  // modify it under the terms of the GNU Lesser General Public
  14  // License as published by the Free Software Foundation; either
  15  // version 2.1 of the License, or (at your option) any later version.
  16  //
  17  // This library is distributed in the hope that it will be useful,
  18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  20  // Lesser General Public License for more details.
  21  //
  22  // You should have received a copy of the GNU Lesser General Public
  23  // License along with this library; if not, write to the Free Software
  24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  25  //
  26  
  27  package org.htmlparser.lexer;
  28  
  29  import java.io.Serializable;
  30  import java.net.MalformedURLException;
  31  import java.net.URLConnection;
  32  import java.util.Vector;
  33  
  34  import org.htmlparser.Node;
  35  import org.htmlparser.NodeFactory;
  36  import org.htmlparser.Remark;
  37  import org.htmlparser.Text;
  38  import org.htmlparser.Tag;
  39  import org.htmlparser.http.ConnectionManager;
  40  import org.htmlparser.nodes.RemarkNode;
  41  import org.htmlparser.nodes.TextNode;
  42  import org.htmlparser.nodes.TagNode;
  43  import org.htmlparser.util.ParserException;
  44  
  45  /**
  46   * This class parses the HTML stream into nodes.
  47   * There are three major types of nodes (lexemes):
  48   * <ul>
  49   * <li>Remark</li>
  50   * <li>Text</li>
  51   * <li>Tag</li>
  52   * </ul>
  53   * Each time <code>nextNode()</code> is called, another node is returned until
  54   * the stream is exhausted, and <code>null</code> is returned.
  55   */
  56  public class Lexer
  57      implements
  58          Serializable,
  59          NodeFactory
  60  {
   61      /**
   62       * The page that lexemes are read from; assigned via setPage(), which rejects null.
   63       */
   64      protected Page mPage;
   65  
   66      /**
   67       * The current position on the page; assigned via setCursor(), which rejects null.
   68       */
   69      protected Cursor mCursor;
   70  
   71      /**
   72       * The factory for new nodes; the Lexer(Page) constructor installs the lexer itself.
   73       */
   74      protected NodeFactory mFactory;
   75  
   76      /**
   77       * Line number to trigger on, or -1 (the initial value) to skip the check.
   78       * This is tested on each <code>nextNode()</code> call, as a debugging aid.
   79       * Alter this value and set a breakpoint on the guarded statement.
   80       * Remember, these line numbers are zero based, while most editors are
   81       * one based.
   82       * @see #nextNode
   83       */
   84      protected static int mDebugLineTrigger = -1;
  85  
  86      /**
  87       * Creates a new instance of a Lexer.
  88       */
  89      public Lexer ()
  90      {
  91          this (new Page (""));
  92      }
  93  
  94      /**
  95       * Creates a new instance of a Lexer.
  96       * @param page The page with HTML text.
  97       */
  98      public Lexer (Page page)
  99      {
 100          setPage (page);
 101          setCursor (new Cursor (page, 0));
 102          setNodeFactory (this);
 103      }
 104  
 105      /**
 106       * Creates a new instance of a Lexer.
 107       * @param text The text to parse.
 108       */
 109      public Lexer (String text)
 110      {
 111          this (new Page (text));
 112      }
 113  
 114      /**
 115       * Creates a new instance of a Lexer.
 116       * @param connection The url to parse.
 117       * @exception ParserException If an error occurs opening the connection.
 118       */
 119      public Lexer (URLConnection connection)
 120          throws
 121              ParserException
 122      {
 123          this (new Page (connection));
 124      }
 125  
 126      /**
 127       * Reset the lexer to start parsing from the beginning again.
 128       * The underlying components are reset such that the next call to
 129       * <code>nextNode()</code> will return the first lexeme on the page.
 130       */
 131      public void reset ()
 132      {
 133          getPage ().reset ();
 134          setCursor (new Cursor (getPage (), 0));
 135      }
 136  
 137      /**
 138       * Get the page this lexer is working on.
 139       * @return The page that nodes are being read from.
 140       */
 141      public Page getPage ()
 142      {
 143          return (mPage);
 144      }
 145  
 146      /**
 147       * Set the page this lexer is working on.
 148       * @param page The page that nodes will be read from.
 149       */
 150      public void setPage (Page page)
 151      {
 152          if (null == page)
 153              throw new IllegalArgumentException ("page cannot be null");
 154          // todo: sanity checks
 155          mPage = page;
 156      }
 157  
 158      /**
 159       * Get the current scanning position.
 160       * @return The lexer's cursor position.
 161       */
 162      public Cursor getCursor ()
 163      {
 164          return (mCursor);
 165      }
 166  
 167      /**
 168       * Set the current scanning position.
 169       * @param cursor The lexer's new cursor position.
 170       */
 171      public void setCursor (Cursor cursor)
 172      {
 173          if (null == cursor)
 174              throw new IllegalArgumentException ("cursor cannot be null");
 175          // todo: sanity checks
 176          mCursor = cursor;
 177      }
 178  
 179      /**
 180       * Get the current node factory.
 181       * @return The lexer's node factory.
 182       */
 183      public NodeFactory getNodeFactory ()
 184      {
 185          return (mFactory);
 186      }
 187  
 188      /**
 189       * Set the current node factory.
 190       * @param factory The node factory to be used by the lexer.
 191       */
 192      public void setNodeFactory (NodeFactory factory)
 193      {
 194          if (null == factory)
 195              throw new IllegalArgumentException ("node factory cannot be null");
 196          mFactory = factory;
 197      }
 198  
 199      /**
 200       * Get the current cursor position.
 201       * @return The current character offset into the source.
 202       */
 203      public int getPosition ()
 204      {
 205          return (getCursor ().getPosition ());
 206      }
 207  
 208      /**
 209       * Set the current cursor position.
 210       * @param position The new character offset into the source.
 211       */
 212      public void setPosition (int position)
 213      {
 214          // todo: sanity checks
 215          getCursor ().setPosition (position);
 216      }
 217  
 218      /**
 219       * Get the current line number.
 220       * @return The line number the lexer's working on.
 221       */
 222      public int getCurrentLineNumber ()
 223      {
 224          return (getPage ().row (getCursor ()));
 225      }
 226  
 227      /**
 228       * Get the current line.
 229       * @return The string the lexer's working on.
 230       */
 231      public String getCurrentLine ()
 232      {
 233          return (getPage ().getLine (getCursor ()));
 234      }
 235  
 236      /**
 237       * Get the next node from the source.
 238       * @return A Remark, Text or Tag, or <code>null</code> if no
 239       * more lexemes are present.
 240       * @exception ParserException If there is a problem with the
 241       * underlying page.
 242       */
 243      public Node nextNode ()
 244          throws
 245              ParserException
 246      {
 247          return nextNode (false);
 248      }
 249  
 250      /**
 251       * Get the next node from the source.
 252       * @param quotesmart If <code>true</code>, strings ignore quoted contents.
 253       * @return A Remark, Text or Tag, or <code>null</code> if no
 254       * more lexemes are present.
 255       * @exception ParserException If there is a problem with the
 256       * underlying page.
 257       */
 258      public Node nextNode (boolean quotesmart)
 259          throws
 260              ParserException
 261      {
 262          int start;
 263          char ch;
 264          Node ret;
 265  
 266          // debugging suppport
 267          if (-1 != mDebugLineTrigger)
 268          {
 269              Page page = getPage ();
 270              int lineno = page.row (mCursor);
 271              if (mDebugLineTrigger < lineno)
 272                  mDebugLineTrigger = lineno + 1; // trigger on next line too
 273          }
 274          start = mCursor.getPosition ();
 275          ch = mPage.getCharacter (mCursor);
 276          switch (ch)
 277          {
 278              case Page.EOF:
 279                  ret = null;
 280                  break;
 281              case '<':
 282                  ch = mPage.getCharacter (mCursor);
 283                  if (Page.EOF == ch)
 284                      ret = makeString (start, mCursor.getPosition ());
 285                  else if ('%' == ch)
 286                  {
 287                      mCursor.retreat ();
 288                      ret = parseJsp (start);
 289                  }
 290                  else if ('?' == ch)
 291                  {
 292                      mCursor.retreat ();
 293                      ret = parsePI (start);
 294                  }
 295                  else if ('/' == ch || '%' == ch || Character.isLetter (ch))
 296                  {
 297                      mCursor.retreat ();
 298                      ret = parseTag (start);
 299                  }
 300                  else if ('!' == ch)
 301                  {
 302                      ch = mPage.getCharacter (mCursor);
 303                      if (Page.EOF == ch)
 304                          ret = makeString (start, mCursor.getPosition ());
 305                      else
 306                      {
 307                          if ('>' == ch) // handle <!>
 308                              ret = makeRemark (start, mCursor.getPosition ());
 309                          else
 310                          {
 311                              mCursor.retreat (); // remark/tag need this char
 312                              if ('-' == ch)
 313                                  ret = parseRemark (start, quotesmart);
 314                              else
 315                              {
 316                                  mCursor.retreat (); // tag needs prior one too
 317                                  ret = parseTag (start);
 318                              }
 319                          }
 320                      }
 321                  }
 322                  else
 323                      ret = parseString (start, quotesmart);
 324                  break;
 325              default:
 326                  mCursor.retreat (); // string needs to see leading foreslash
 327                  ret = parseString (start, quotesmart);
 328                  break;
 329          }
 330  
 331          return (ret);
 332      }
 333  
 334      /**
 335       * Advance the cursor through a JIS escape sequence.
 336       * @param cursor A cursor positioned within the escape sequence.
 337       * @exception ParserException If a problem occurs reading from the source.
 338       */
 339      protected void scanJIS (Cursor cursor)
 340          throws
 341              ParserException
 342      {
 343          boolean done;
 344          char ch;
 345          int state;
 346  
 347          done = false;
 348          state = 0;
 349          while (!done)
 350          {
 351              ch = mPage.getCharacter (cursor);
 352              if (Page.EOF == ch)
 353                  done = true;
 354              else
 355                  switch (state)
 356                  {
 357                      case 0:
 358                          if (0x1b == ch) // escape
 359                              state = 1;
 360                          break;
 361                      case 1:
 362                          if ('(' == ch)
 363                              state = 2;
 364                          else
 365                              state = 0;
 366                          break;
 367                      case 2:
 368                          if ('J' == ch)
 369                              done = true;
 370                          else
 371                              state = 0;
 372                          break;
 373                      default:
 374                          throw new IllegalStateException ("state " + state);
 375                  }
 376          }
 377      }
 378  
 379      /**
 380       * Parse a string node.
 381       * Scan characters until "&lt;/", "&lt;%", "&lt;!" or &lt; followed by a
 382       * letter is encountered, or the input stream is exhausted, in which
 383       * case <code>null</code> is returned.
 384       * @param start The position at which to start scanning.
 385       * @param quotesmart If <code>true</code>, strings ignore quoted contents.
 386       * @return The parsed node.
 387       * @exception ParserException If a problem occurs reading from the source.
 388       */
 389      protected Node parseString (int start, boolean quotesmart)
 390          throws
 391              ParserException
 392      {
 393          boolean done;
 394          char ch;
 395          char quote;
 396  
 397          done = false;
 398          quote = 0;
 399          while (!done)
 400          {
 401              ch = mPage.getCharacter (mCursor);
 402              if (Page.EOF == ch)
 403                  done = true;
 404              else if (0x1b == ch) // escape
 405              {
 406                  ch = mPage.getCharacter (mCursor);
 407                  if (Page.EOF == ch)
 408                      done = true;
 409                  else if ('$' == ch)
 410                  {
 411                      ch = mPage.getCharacter (mCursor);
 412                      if (Page.EOF == ch)
 413                          done = true;
 414                      else if ('B' == ch)
 415                          scanJIS (mCursor);
 416                      else
 417                      {
 418                          mCursor.retreat ();
 419                          mCursor.retreat ();
 420                      }
 421                  }
 422                  else
 423                      mCursor.retreat ();
 424              }
 425              else if (quotesmart && (0 == quote)
 426                  && (('\'' == ch) || ('"' == ch)))
 427                  quote = ch; // enter quoted state
 428              // patch from Gernot Fricke to handle escaped closing quote
 429              else if (quotesmart && (0 != quote) && ('\\' == ch))
 430              {
 431                  ch = mPage.getCharacter (mCursor); // try to consume escape
 432                  if ((Page.EOF != ch)
 433                      && ('\\' != ch) // escaped backslash
 434                      && (ch != quote)) // escaped quote character
 435                         // ( reflects ["] or [']  whichever opened the quotation)
 436                      mCursor.retreat(); // unconsume char if char not an escape
 437              }
 438              else if (quotesmart && (ch == quote))
 439                  quote = 0; // exit quoted state
 440              else if (quotesmart && (0 == quote) && (ch == '/'))
 441              {
 442                  // handle multiline and double slash comments (with a quote)
 443                  // in script like:
 444                  // I can't handle single quotations.
 445                  ch = mPage.getCharacter (mCursor);
 446                  if (Page.EOF == ch)
 447                      done = true;
 448                  else if ('/' == ch)
 449                  {
 450                      do
 451                          ch = mPage.getCharacter (mCursor);
 452                      while ((Page.EOF != ch) && ('\n' != ch));
 453                  }
 454                  else if ('*' == ch)
 455                  {
 456                      do
 457                      {
 458                          do
 459                              ch = mPage.getCharacter (mCursor);
 460                          while ((Page.EOF != ch) && ('*' != ch));
 461                          ch = mPage.getCharacter (mCursor);
 462                          if (ch == '*')
 463                              mCursor.retreat ();
 464                      }
 465                      while ((Page.EOF != ch) && ('/' != ch));
 466                  }
 467                  else
 468                      mCursor.retreat ();
 469              }
 470              else if ((0 == quote) && ('<' == ch))
 471              {
 472                  ch = mPage.getCharacter (mCursor);
 473                  if (Page.EOF == ch)
 474                      done = true;
 475                  // the order of these tests might be optimized for speed:
 476                  else if ('/' == ch || Character.isLetter (ch)
 477                      || '!' == ch || '%' == ch || '?' == ch)
 478                  {
 479                      done = true;
 480                      mCursor.retreat ();
 481                      mCursor.retreat ();
 482                  }
 483                  else
 484                  {
 485                      // it's not a tag, so keep going, but check for quotes
 486                      mCursor.retreat ();
 487                  }
 488              }
 489          }
 490  
 491          return (makeString (start, mCursor.getPosition ()));
 492      }
 493  
 494      /**
 495       * Create a string node based on the current cursor and the one provided.
 496       * @param start The starting point of the node.
 497       * @param end The ending point of the node.
 498       * @exception ParserException If the nodefactory creation of the text
 499       * node fails.
 500       * @return The new Text node.
 501       */
 502      protected Node makeString (int start, int end)
 503          throws
 504              ParserException
 505      {
 506          int length;
 507          Node ret;
 508  
 509          length = end - start;
 510          if (0 != length)
 511              // got some characters
 512              ret = getNodeFactory ().createStringNode (
 513                  this.getPage (), start, end);
 514          else
 515              ret = null;
 516  
 517          return (ret);
 518      }
 519  
 520      /**
 521       * Generate a whitespace 'attribute',
 522       * @param attributes The list so far.
 523       * @param bookmarks The array of positions.
 524       */
 525      private void whitespace (Vector attributes, int[] bookmarks)
 526      {
 527          if (bookmarks[1] > bookmarks[0])
 528              attributes.addElement (new PageAttribute (
 529                  mPage, -1, -1, bookmarks[0], bookmarks[1], (char)0));
 530      }
 531  
 532      /**
 533       * Generate a standalone attribute -- font.
 534       * @param attributes The list so far.
 535       * @param bookmarks The array of positions.
 536       */
 537      private void standalone (Vector attributes, int[] bookmarks)
 538      {
 539          attributes.addElement (new PageAttribute (
 540              mPage, bookmarks[1], bookmarks[2], -1, -1, (char)0));
 541      }
 542  
 543      /**
 544       * Generate an empty attribute -- color=.
 545       * @param attributes The list so far.
 546       * @param bookmarks The array of positions.
 547       */
 548      private void empty (Vector attributes, int[] bookmarks)
 549      {
 550          attributes.addElement (new PageAttribute (
 551              mPage, bookmarks[1], bookmarks[2], bookmarks[2] + 1, -1, (char)0));
 552      }
 553  
 554      /**
 555       * Generate an unquoted attribute -- size=1.
 556       * @param attributes The list so far.
 557       * @param bookmarks The array of positions.
 558       */
 559      private void naked (Vector attributes, int[] bookmarks)
 560      {
 561          attributes.addElement (new PageAttribute (
 562              mPage, bookmarks[1], bookmarks[2], bookmarks[3],
 563              bookmarks[4], (char)0));
 564      }
 565  
 566      /**
 567       * Generate an single quoted attribute -- width='100%'.
 568       * @param attributes The list so far.
 569       * @param bookmarks The array of positions.
 570       */
 571      private void single_quote (Vector attributes, int[] bookmarks)
 572      {
 573          attributes.addElement (new PageAttribute (
 574              mPage, bookmarks[1], bookmarks[2], bookmarks[4] + 1,
 575              bookmarks[5], '\''));
 576      }
 577  
 578      /**
 579       * Generate an double quoted attribute -- CONTENT="Test Development".
 580       * @param attributes The list so far.
 581       * @param bookmarks The array of positions.
 582       */
 583      private void double_quote (Vector attributes, int[] bookmarks)
 584      {
 585          attributes.addElement (new PageAttribute (
 586              mPage, bookmarks[1], bookmarks[2], bookmarks[5] + 1,
 587              bookmarks[6], '"'));
 588      }
 589  
 590      /**
 591       * Parse a tag.
 592       * Parse the name and attributes from a start tag.<p>
 593       * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2">
 594       * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a>
 595       * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2<p>
 596       * <cite>
 597       * 3.2.2 Attributes<p>
 598       * Elements may have associated properties, called attributes, which may
 599       * have values (by default, or set by authors or scripts). Attribute/value
 600       * pairs appear before the final ">" of an element's start tag. Any number
 601       * of (legal) attribute value pairs, separated by spaces, may appear in an
 602       * element's start tag. They may appear in any order.<p>
 603       * In this example, the id attribute is set for an H1 element:
 604       * <code>
 605       * &lt;H1 id="section1"&gt;
 606       * </code>
 607       * This is an identified heading thanks to the id attribute
 608       * <code>
 609       * &lt;/H1&gt;
 610       * </code>
 611       * By default, SGML requires that all attribute values be delimited using
 612       * either double quotation marks (ASCII decimal 34) or single quotation
 613       * marks (ASCII decimal 39). Single quote marks can be included within the
 614       * attribute value when the value is delimited by double quote marks, and
 615       * vice versa. Authors may also use numeric character references to
 616       * represent double quotes (&amp;#34;) and single quotes (&amp;#39;).
 617       * For doublequotes authors can also use the character entity reference
 618       * &amp;quot;.<p>
 619       * In certain cases, authors may specify the value of an attribute without
 620       * any quotation marks. The attribute value may only contain letters
 621       * (a-z and A-Z), digits (0-9), hyphens (ASCII decimal 45),
 622       * periods (ASCII decimal 46), underscores (ASCII decimal 95),
 623       * and colons (ASCII decimal 58). We recommend using quotation marks even
 624       * when it is possible to eliminate them.<p>
 625       * Attribute names are always case-insensitive.<p>
 626       * Attribute values are generally case-insensitive. The definition of each
 627       * attribute in the reference manual indicates whether its value is
 628       * case-insensitive.<p>
 629       * All the attributes defined by this specification are listed in the
 630       * attribute index.<p>
 631       * </cite>
 632       * <p>
 633       * This method uses a state machine with the following states:
 634       * <ol>
 635       * <li>state 0 - outside of any attribute</li>
  636       * <li>state 1 - within attribute name</li>
 637       * <li>state 2 - equals hit</li>
 638       * <li>state 3 - within naked attribute value.</li>
 639       * <li>state 4 - within single quoted attribute value</li>
 640       * <li>state 5 - within double quoted attribute value</li>
  641       * <li>state 6 - whitespace after an attribute name, which could lead to state 2 (=) or state 0</li>
 642       * </ol>
 643       * <p>
 644       * The starting point for the various components is stored in an array
 645       * of integers that match the initiation point for the states one-for-one,
 646       * i.e. bookmarks[0] is where state 0 began, bookmarks[1] is where state 1
 647       * began, etc.
 648       * Attributes are stored in a <code>Vector</code> having
 649       * one slot for each whitespace or attribute/value pair.
 650       * The first slot is for attribute name (kind of like a standalone attribute).
 651       * @param start The position at which to start scanning.
 652       * @return The parsed tag.
 653       * @exception ParserException If a problem occurs reading from the source.
 654       */
 655      protected Node parseTag (int start)
 656          throws
 657              ParserException
 658      {
 659          boolean done;
 660          char ch;
 661          int state;
 662          int[] bookmarks;
 663          Vector attributes;
 664  
 665          done = false;
 666          attributes = new Vector ();
 667          state = 0;
 668          bookmarks = new int[8];
 669          bookmarks[0] = mCursor.getPosition ();
 670          while (!done)
 671          {
 672              bookmarks[state + 1] = mCursor.getPosition ();
 673              ch = mPage.getCharacter (mCursor);
 674              switch (state)
 675              {
 676                  case 0: // outside of any attribute
 677                      if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch))
 678                      {
 679                          if ('<' == ch)
 680                          {
 681                              // don't consume the opening angle
 682                              mCursor.retreat ();
 683                              bookmarks[state + 1] = mCursor.getPosition ();
 684                          }
 685                          whitespace (attributes, bookmarks);
 686                          done = true;
 687                      }
 688                      else if (!Character.isWhitespace (ch))
 689                      {
 690                          whitespace (attributes, bookmarks);
 691                          state = 1;
 692                      }
 693                      break;
 694                  case 1: // within attribute name
 695                      if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch))
 696                      {
 697                          if ('<' == ch)
 698                          {
 699                              // don't consume the opening angle
 700                              mCursor.retreat ();
 701                              bookmarks[state + 1] = mCursor.getPosition ();
 702                          }
 703                          standalone (attributes, bookmarks);
 704                          done = true;
 705                      }
 706                      else if (Character.isWhitespace (ch))
 707                      {
 708                          // whitespaces might be followed by next attribute or an equal sign
 709                          // see Bug #891058 Bug in lexer.
 710                          bookmarks[6] = bookmarks[2]; // setting the bookmark[0] is done in state 6 if applicable
 711                          state = 6;
 712                      }
 713                      else if ('=' == ch)
 714                          state = 2;
 715                      break;
 716                  case 2: // equals hit
 717                      if ((Page.EOF == ch) || ('>' == ch))
 718                      {
 719                          empty (attributes, bookmarks);
 720                          done = true;
 721                      }
 722                      else if ('\'' == ch)
 723                      {
 724                          state = 4;
 725                          bookmarks[4] = bookmarks[3];
 726                      }
 727                      else if ('"' == ch)
 728                      {
 729                          state = 5;
 730                          bookmarks[5] = bookmarks[3];
 731                      }
 732                      else if (Character.isWhitespace (ch))
 733                      { 
 734                          // collect white spaces after "=" into the assignment string;
 735                          // do nothing
 736                          // see Bug #891058 Bug in lexer.
 737                      }
 738                      else
 739                          state = 3;
 740                      break;
 741                  case 3: // within naked attribute value
 742                      if ((Page.EOF == ch) || ('>' == ch))
 743                      {
 744                          naked (attributes, bookmarks);
 745                          done = true;
 746                      }
 747                      else if (Character.isWhitespace (ch))
 748                      {
 749                          naked (attributes, bookmarks);
 750                          bookmarks[0] = bookmarks[4];
 751                          state = 0;
 752                      }
 753                      break;
 754                  case 4: // within single quoted attribute value
 755                      if (Page.EOF == ch)
 756                      {
 757                          single_quote (attributes, bookmarks);
 758                          done = true; // complain?
 759                      }
 760                      else if ('\'' == ch)
 761                      {
 762                          single_quote (attributes, bookmarks);
 763                          bookmarks[0] = bookmarks[5] + 1;
 764                          state = 0;
 765                      }
 766                      break;
 767                  case 5: // within double quoted attribute value
 768                      if (Page.EOF == ch)
 769                      {
 770                          double_quote (attributes, bookmarks);
 771                          done = true; // complain?
 772                      }
 773                      else if ('"' == ch)
 774                      {
 775                          double_quote (attributes, bookmarks);
 776                          bookmarks[0] = bookmarks[6] + 1;
 777                          state = 0;
 778                      }
 779                      break;
 780                  // patch for lexer state correction by
 781                  // Gernot Fricke
 782                  // See Bug # 891058 Bug in lexer.
 783                  case 6: // undecided for state 0 or 2
 784                          // we have read white spaces after an attributte name
 785                      if (Page.EOF == ch)
 786                      {
 787                          // same as last else clause
 788                          standalone (attributes, bookmarks);
 789                    	    bookmarks[0]=bookmarks[6];
 790                    	    mCursor.retreat();
 791                    	    state=0;
 792                      }
 793                      else if (Character.isWhitespace (ch))
 794                      { 
 795                          // proceed
 796                      } 
 797                      else if ('=' == ch) // yepp. the white spaces belonged to the equal.
 798                      {
 799                          bookmarks[2] = bookmarks[6];
 800                          bookmarks[3] = bookmarks[7];
 801                          state=2;
 802                      }
 803                      else
 804                      {
 805                          // white spaces were not ended by equal
 806                          // meaning the attribute was a stand alone attribute
 807                          // now: create the stand alone attribute and rewind 
 808                          // the cursor to the end of the white spaces
 809                          // and restart scanning as whitespace attribute.
 810                    	    standalone (attributes, bookmarks);
 811                    	    bookmarks[0]=bookmarks[6];
 812                    	    mCursor.retreat();
 813                    	    state=0;
 814                     	}
 815                      break;
 816                  default:
 817                      throw new IllegalStateException ("how the fuck did we get in state " + state);
 818              }
 819          }
 820  
 821          return (makeTag (start, mCursor.getPosition (), attributes));
 822      }
 823  
 824      /**
 825       * Create a tag node based on the current cursor and the one provided.
 826       * @param start The starting point of the node.
 827       * @param end The ending point of the node.
 828       * @param attributes The attributes parsed from the tag.
 829       * @exception ParserException If the nodefactory creation of the tag node fails.
 830       * @return The new Tag node.
 831       */
 832      protected Node makeTag (int start, int end, Vector attributes)
 833          throws
 834              ParserException
 835      {
 836          int length;
 837          Node ret;
 838  
 839          length = end - start;
 840          if (0 != length)
 841          {   // return tag based on second character, '/', '%', Letter (ch), '!'
 842              if (2 > length)
 843                  // this is an error
 844                  return (makeString (start, end));
 845              ret = getNodeFactory ().createTagNode (this.getPage (), start, end, attributes);
 846          }
 847          else
 848              ret = null;
 849  
 850          return (ret);
 851      }
 852  
 853      /**
 854       * Parse a comment.
 855       * Parse a remark markup.<p>
 856       * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4">
 857       * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a>
 858       * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4<p>
 859       * <cite>
 860       * 3.2.4 Comments<p>
 861       * HTML comments have the following syntax:<p>
 862       * <code>
 863       * &lt;!-- this is a comment --&gt;<p>
 864       * &lt;!-- and so is this one,<p>
 865       *     which occupies more than one line --&gt;<p>
 866       * </code>
 867       * White space is not permitted between the markup declaration
 868       * open delimiter("&lt;!") and the comment open delimiter ("--"),
 869       * but is permitted between the comment close delimiter ("--") and
 870       * the markup declaration close delimiter ("&gt;").
 871       * A common error is to include a string of hyphens ("---") within a comment.
 872       * Authors should avoid putting two or more adjacent hyphens inside comments.
 873       * Information that appears between comments has no special meaning
 874       * (e.g., character references are not interpreted as such).
 875       * Note that comments are markup.<p>
 876       * </cite>
 877       * <p>
 878       * This method uses a state machine with the following states:
 879       * <ol>
 880       * <li>state 0 - prior to the first open delimiter</li>
 881       * <li>state 1 - prior to the second open delimiter</li>
 882       * <li>state 2 - prior to the first closing delimiter</li>
 883       * <li>state 3 - prior to the second closing delimiter</li>
 884       * <li>state 4 - prior to the terminating &gt;</li>
 885       * </ol>
 886       * <p>
 887       * All comment text (everything excluding the &lt; and &gt;), is included
 888       * in the remark text.
 889       * We allow terminators like --!&gt; even though this isn't part of the spec.
 890       * @param start The position at which to start scanning.
 891       * @param quotesmart If <code>true</code>, strings ignore quoted contents.
 892       * @return The parsed node.
 893       * @exception ParserException If a problem occurs reading from the source.
 894       */
 895      protected Node parseRemark (int start, boolean quotesmart)
 896          throws
 897              ParserException
 898      {
 899          boolean done;
 900          char ch;
 901          int state;
 902  
 903          done = false;
 904          state = 0;
 905          while (!done)
 906          {
 907              ch = mPage.getCharacter (mCursor);
 908              if (Page.EOF == ch)
 909                  done = true;
 910              else
 911                  switch (state)
 912                  {
 913                      case 0: // prior to the first open delimiter
 914                          if ('>' == ch)
 915                              done = true;
 916                          if ('-' == ch)
 917                              state = 1;
 918                          else
 919                              return (parseString (start, quotesmart));
 920                          break;
 921                      case 1: // prior to the second open delimiter
 922                          if ('-' == ch)
 923                          {
 924                              // handle <!--> because netscape does
 925                              ch = mPage.getCharacter (mCursor);
 926                              if (Page.EOF == ch)
 927                                  done = true;
 928                              else if ('>' == ch)
 929                                  done = true;
 930                              else
 931                              {
 932                                  mCursor.retreat ();
 933                                  state = 2;
 934                              }                        
 935                          }
 936                          else
 937                              return (parseString (start, quotesmart));
 938                          break;
 939                      case 2: // prior to the first closing delimiter
 940                          if ('-' == ch)
 941                              state = 3;
 942                          else if (Page.EOF == ch)
 943                              return (parseString (start, quotesmart)); // no terminator
 944                          break;
 945                      case 3: // prior to the second closing delimiter
 946                          if ('-' == ch)
 947                              state = 4;
 948                          else
 949                              state = 2;
 950                          break;
 951                      case 4: // prior to the terminating >
 952                          if ('>' == ch)
 953                              done = true;
 954                          else if (('!' == ch) || ('-' == ch) || Character.isWhitespace (ch))
 955                          {
 956                              // stay in state 4
 957                          }
 958                          else
 959                              state = 2;
 960                          break;
 961                      default:
 962                          throw new IllegalStateException ("how the fuck did we get in state " + state);
 963                  }
 964          }
 965  
 966          return (makeRemark (start, mCursor.getPosition ()));
 967      }
 968  
 969      /**
 970       * Create a remark node based on the current cursor and the one provided.
 971       * @param start The starting point of the node.
 972       * @param end The ending point of the node.
 973       * @exception ParserException If the nodefactory creation of the remark node fails.
 974       * @return The new Remark node.
 975       */
 976      protected Node makeRemark (int start, int end)
 977          throws
 978              ParserException
 979      {
 980          int length;
 981          Node ret;
 982  
 983          length = end - start;
 984          if (0 != length)
 985          {   // return tag based on second character, '/', '%', Letter (ch), '!'
 986              if (2 > length)
 987                  // this is an error
 988                  return (makeString (start, end));
 989              ret = getNodeFactory ().createRemarkNode (this.getPage (), start, end);
 990          }
 991          else
 992              ret = null;
 993          
 994          return (ret);
 995      }
 996  
 997      /**
 998       * Parse a java server page node.
 999       * Scan characters until "%&gt;" is encountered, or the input stream is
1000       * exhausted, in which case <code>null</code> is returned.
1001       * @param start The position at which to start scanning.
1002       * @return The parsed node.
1003       * @exception ParserException If a problem occurs reading from the source.
1004       */
1005      protected Node parseJsp (int start)
1006          throws
1007              ParserException
1008      {
1009          boolean done;
1010          char ch;
1011          int state;
1012          Vector attributes;
1013          int code;
1014  
1015          done = false;
1016          state = 0;
1017          code = 0;
1018          attributes = new Vector ();
1019          // <%xyz%>
1020          // 012223d
1021          // <%=xyz%>
1022          // 0122223d
1023          // <%@xyz%d
1024          // 0122223d
1025          while (!done)
1026          {
1027              ch = mPage.getCharacter (mCursor);
1028              switch (state)
1029              {
1030                  case 0: // prior to the percent
1031                      switch (ch)
1032                      {
1033                          case '%': // <%
1034                              state = 1;
1035                              break;
1036                          // case Page.EOF: // <\0
1037                          // case '>': // <>
1038                          default:
1039                              done = true;
1040                              break;
1041                      }
1042                      break;
1043                  case 1: // prior to the optional qualifier
1044                      switch (ch)
1045                      {
1046                          case Page.EOF:   // <%\0
1047                          case '>': // <%>
1048                              done = true;
1049                              break;
1050                          case '=': // <%=
1051                          case '@': // <%@
1052                              code = mCursor.getPosition ();
1053                              attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0));
1054                              state = 2;
1055                              break;
1056                          default:  // <%x
1057                              code = mCursor.getPosition () - 1;
1058                              attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0));
1059                              state = 2;
1060                              break;
1061                      }
1062                      break;
1063                  case 2: // prior to the closing percent
1064                      switch (ch)
1065                      {
1066                          case Page.EOF: // <%x\0
1067                          case '>': // <%x>
1068                              done = true;
1069                              break;
1070                          case '\'':
1071                          case '"':// <%???"
1072                              state = ch;
1073                              break;
1074                          case '%': // <%???%
1075                              state = 3;
1076                              break;
1077                          case '/': // // or /*
1078                              ch = mPage.getCharacter (mCursor);
1079                              if (ch == '/') 
1080                              {   // find the \n or \r
1081                                  while(true)
1082                                  {
1083                                      ch = mPage.getCharacter (mCursor);
1084                                      if (ch == Page.EOF)
1085                                      {
1086                                          done = true;
1087                                          break;
1088                                      }
1089                                      else if (ch == '\n' || ch == '\r')
1090                                      {
1091                                          break;
1092                                      }
1093                                  }
1094                              }
1095                              else if (ch == '*')
1096                              {
1097                                  do
1098                                  {
1099                                      do
1100                                          ch = mPage.getCharacter (mCursor);
1101                                      while ((Page.EOF != ch) && ('*' != ch));
1102                                      ch = mPage.getCharacter (mCursor);
1103                                      if (ch == '*')
1104                                          mCursor.retreat ();
1105                                  }
1106                                  while ((Page.EOF != ch) && ('/' != ch));
1107                              }
1108                              else
1109                              {
1110                                  mCursor.retreat ();
1111                              }
1112                              break;
1113                          default:  // <%???x
1114                              break;
1115                      }
1116                      break;
1117                  case 3:
1118                      switch (ch)
1119                      {
1120                          case Page.EOF: // <%x??%\0
1121                              done = true;
1122                              break;
1123                          case '>':
1124                              state = 4;
1125                              done = true;
1126                              break;
1127                          default:  // <%???%x
1128                              state = 2;
1129                              break;
1130                      }
1131                      break;
1132                  case '"':
1133                      switch (ch)
1134                      {
1135                          case Page.EOF: // <%x??"\0
1136                              done = true;
1137                              break;
1138                          case '"':
1139                              state = 2;
1140                              break;
1141                          default:  // <%???'??x
1142                              break;
1143                      }
1144                      break;
1145                  case '\'':
1146                      switch (ch)
1147                      {
1148                          case Page.EOF: // <%x??'\0
1149                              done = true;
1150                              break;
1151                          case '\'':
1152                              state = 2;
1153                              break;
1154                          default:  // <%???"??x
1155                              break;
1156                      }
1157                      break;
1158                  default:
1159                      throw new IllegalStateException ("how the fuck did we get in state " + state);
1160              }
1161          }
1162  
1163          if (4 == state) // normal exit
1164          {
1165              if (0 != code)
1166              {
1167                  state = mCursor.getPosition () - 2; // reuse state
1168                  attributes.addElement (new PageAttribute (mPage, code, state, -1, -1, (char)0));
1169                  attributes.addElement (new PageAttribute (mPage, state, state + 1, -1, -1, (char)0));
1170              }
1171              else
1172                  throw new IllegalStateException ("jsp with no code!");
1173          }
1174          else
1175              return (parseString (start, true)); // hmmm, true?
1176  
1177          return (makeTag (start, mCursor.getPosition (), attributes));
1178      }
1179  
1180      /**
1181       * Parse an XML processing instruction.
1182       * Scan characters until "?&gt;" is encountered, or the input stream is
1183       * exhausted, in which case <code>null</code> is returned.
1184       * @param start The position at which to start scanning.
1185       * @return The parsed node.
1186       * @exception ParserException If a problem occurs reading from the source.
1187       */
1188      protected Node parsePI (int start)
1189          throws
1190              ParserException
1191      {
1192          boolean done;
1193          char ch;
1194          int state;
1195          Vector attributes;
1196          int code;
1197  
1198          done = false;
1199          state = 0;
1200          code = 0;
1201          attributes = new Vector ();
1202          // <?xyz?>
1203          // 011112d
1204          while (!done)
1205          {
1206              ch = mPage.getCharacter (mCursor);
1207              switch (state)
1208              {
1209                  case 0: // prior to the question mark
1210                      switch (ch)
1211                      {
1212                          case '?': // <?
1213                              code = mCursor.getPosition ();
1214                              attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0));
1215                              state = 1;
1216                              break;
1217                          // case Page.EOF: // <\0
1218                          // case '>': // <>
1219                          default:
1220                              done = true;
1221                              break;
1222                      }
1223                      break;
1224                  case 1: // prior to the closing question mark
1225                      switch (ch)
1226                      {
1227                          case Page.EOF: // <?x\0
1228                          case '>': // <?x>
1229                              done = true;
1230                              break;
1231                          case '\'':
1232                          case '"':// <?..."
1233                              state = ch;
1234                              break;
1235                          case '?': // <?...?
1236                              state = 2;
1237                              break;
1238                          default:  // <?...x
1239                              break;
1240                      }
1241                      break;
1242                  case 2:
1243                      switch (ch)
1244                      {
1245                          case Page.EOF: // <?x..?\0
1246                              done = true;
1247                              break;
1248                          case '>':
1249                              state = 3;
1250                              done = true;
1251                              break;
1252                          default:  // <?...?x
1253                              state = 1;
1254                              break;
1255                      }
1256                      break;
1257                  case '"':
1258                      switch (ch)
1259                      {
1260                          case Page.EOF: // <?x.."\0
1261                              done = true;
1262                              break;
1263                          case '"':
1264                              state = 1;
1265                              break;
1266                          default:  // <?...'.x
1267                              break;
1268                      }
1269                      break;
1270                  case '\'':
1271                      switch (ch)
1272                      {
1273                          case Page.EOF: // <?x..'\0
1274                              done = true;
1275                              break;
1276                          case '\'':
1277                              state = 1;
1278                              break;
1279                          default:  // <?..."..x
1280                              break;
1281                      }
1282                      break;
1283                  default:
1284                      throw new IllegalStateException ("how the fuck did we get in state " + state);
1285              }
1286          }
1287  
1288          if (3 == state) // normal exit
1289          {
1290              if (0 != code)
1291              {
1292                  state = mCursor.getPosition () - 2; // reuse state
1293                  attributes.addElement (new PageAttribute (mPage, code, state, -1, -1, (char)0));
1294                  attributes.addElement (new PageAttribute (mPage, state, state + 1, -1, -1, (char)0));
1295              }
1296              else
1297                  throw new IllegalStateException ("processing instruction with no content");
1298          }
1299          else
1300              return (parseString (start, true)); // hmmm, true?
1301  
1302          return (makeTag (start, mCursor.getPosition (), attributes));
1303      }
1304  
1305      /**
1306       * Return CDATA as a text node.
1307       * According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data">
1308       * B.3.2 Specifying non-HTML data</a> of the
1309       * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>:<br>
1310       * <quote>
1311       * <b>Element content</b><br>
1312       * When script or style data is the content of an element (SCRIPT and STYLE),
1313       * the data begins immediately after the element start tag and ends at the
1314       * first ETAGO ("&lt;/") delimiter followed by a name start character ([a-zA-Z]);
1315       * note that this may not be the element's end tag.
1316       * Authors should therefore escape "&lt;/" within the content. Escape mechanisms
1317       * are specific to each scripting or style sheet language.
1318       * </quote>
1319       * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none.
1320       * @exception ParserException If a problem occurs reading from the source.
1321       */
1322      public Node parseCDATA ()
1323          throws
1324              ParserException
1325      {
1326          return (parseCDATA (false));
1327      }
1328  
1329      /**
1330       * Return CDATA as a text node.
1331       * Slightly less rigid than {@link #parseCDATA()} this method provides for
1332       * parsing CDATA that may contain quoted strings that have embedded
1333       * ETAGO ("&lt;/") delimiters and skips single and multiline comments.
1334       * @param quotesmart If <code>true</code> the strict definition of CDATA is
1335       * extended to allow for single or double quoted ETAGO ("&lt;/") sequences.
1336       * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none.
1337       * @see #parseCDATA()
1338       * @exception ParserException If a problem occurs reading from the source.
1339       */
1340      public Node parseCDATA (boolean quotesmart)
1341          throws
1342              ParserException
1343      {
1344          int start;
1345          int state;
1346          boolean done;
1347          char quote;
1348          char ch;
1349          int end;
1350          boolean comment;
1351  
1352          start = mCursor.getPosition ();
1353          state = 0;
1354          done = false;
1355          quote = 0;
1356          comment = false;
1357  
1358          while (!done)
1359          {
1360              ch = mPage.getCharacter (mCursor);
1361              switch (state)
1362              {
1363                  case 0: // prior to ETAGO
1364                      switch (ch)
1365                      {
1366                          case Page.EOF:
1367                              done = true;
1368                              break;
1369                          case '\'':
1370                              if (quotesmart && !comment)
1371                                  if (0 == quote)
1372                                      quote = '\''; // enter quoted state
1373                                  else if ('\'' == quote)
1374                                      quote = 0; // exit quoted state
1375                              break;
1376                          case '"':
1377                              if (quotesmart && !comment)
1378                                  if (0 == quote)
1379                                      quote = '"'; // enter quoted state
1380                                  else if ('"' == quote)
1381                                      quote = 0; // exit quoted state
1382                              break;
1383                          case '\\':
1384                              if (quotesmart)
1385                                  if (0 != quote)
1386                                  {
1387                                      ch = mPage.getCharacter (mCursor); // try to consume escaped character
1388                                      if (Page.EOF == ch)
1389                                          done = true;
1390                                      else if (  (ch != '\\') && (ch != quote))
1391                                          mCursor.retreat (); // unconsume char if character was not an escapable char.
1392                                  }
1393                              break;
1394                          case '/':
1395                              if (quotesmart)
1396                                  if (0 == quote)
1397                                  {
1398                                      // handle multiline and double slash comments (with a quote)
1399                                      ch = mPage.getCharacter (mCursor);
1400                                      if (Page.EOF == ch)
1401                                          done = true;
1402                                      else if ('/' == ch)
1403                                          comment = true;
1404                                      else if ('*' == ch)
1405                                      {
1406                                          do
1407                                          {
1408                                              do
1409                                                  ch = mPage.getCharacter (mCursor);
1410                                              while ((Page.EOF != ch) && ('*' != ch));
1411                                              ch = mPage.getCharacter (mCursor);
1412                                              if (ch == '*')
1413                                                  mCursor.retreat ();
1414                                          }
1415                                          while ((Page.EOF != ch) && ('/' != ch));
1416                                      }
1417                                      else
1418                                          mCursor.retreat ();
1419                                  }
1420                              break;
1421                          case '\n':
1422                              comment = false;
1423                              break;
1424                          case '<':
1425                              if (quotesmart)
1426                              {
1427                                  if (0 == quote)
1428                                      state = 1;
1429                              }
1430                              else
1431                                  state = 1;
1432                              break;
1433                          default:
1434                              break;
1435                      }
1436                      break;
1437                  case 1: // <
1438                      switch (ch)
1439                      {
1440                          case Page.EOF:
1441                              done = true;
1442                              break;
1443                          case '/':
1444                              state = 2;
1445                              break;
1446                          case '!':
1447                              ch = mPage.getCharacter (mCursor);
1448                              if (Page.EOF == ch)
1449                                  done = true;
1450                              else if ('-' == ch)
1451                              {
1452                                  ch = mPage.getCharacter (mCursor);
1453                                  if (Page.EOF == ch)
1454                                      done = true;
1455                                  else if ('-' == ch)
1456                                      state = 3;
1457                                  else
1458                                      state = 0;
1459                              }
1460                              else
1461                                  state = 0;
1462                              break;
1463                          default:
1464                              state = 0;
1465                              break;
1466                      }
1467                      break;
1468                  case 2: // </
1469                      comment = false;
1470                      if (Page.EOF == ch)
1471                          done = true;
1472                      else if (Character.isLetter (ch))
1473                      {
1474                          done = true;
1475                          // back up to the start of ETAGO
1476                          mCursor.retreat ();
1477                          mCursor.retreat ();
1478                          mCursor.retreat ();
1479                      }
1480                      else
1481                          state = 0;
1482                      break;
1483                  case 3: // <!
1484                      comment = false;
1485                      if (Page.EOF == ch)
1486                          done = true;
1487                      else if ('-' == ch)
1488                      {
1489                          ch = mPage.getCharacter (mCursor);
1490                          if (Page.EOF == ch)
1491                              done = true;
1492                          else if ('-' == ch)
1493                          {
1494                              ch = mPage.getCharacter (mCursor);
1495                              if (Page.EOF == ch)
1496                                  done = true;
1497                              else if ('>' == ch)
1498                                  state = 0;
1499                              else
1500                              {
1501                                  mCursor.retreat ();
1502                                  mCursor.retreat ();
1503                              }
1504                          }
1505                          else
1506                              mCursor.retreat ();
1507                      }
1508                      break;
1509                  default:
1510                      throw new IllegalStateException ("how the fuck did we get in state " + state);
1511              }
1512          }
1513          end = mCursor.getPosition ();
1514  
1515          return (makeString (start, end));
1516      }
1517  
1518      //
1519      // NodeFactory interface
1520      //
1521  
1522      /**
1523       * Build the text node that covers a span of characters on a page.
1524       * @param page The page on which the node resides.
1525       * @param start The position at which the string begins.
1526       * @param end The position at which the string ends.
1527       * @return A newly constructed TextNode for the given span.
1528       */
1529      public Text createStringNode (Page page,  int start, int end)
1530      {
1531          return new TextNode (page, start, end);
1532      }
1533  
1534      /**
1535       * Build the remark (comment) node that covers a span of a page.
1536       * @param page The page on which the node resides.
1537       * @param start The position at which the remark begins.
1538       * @param end The position at which the remark ends.
1539       * @return A newly constructed RemarkNode for the given span.
1540       */
1541      public Remark createRemarkNode (Page page,  int start, int end)
1542      {
1543          return new RemarkNode (page, start, end);
1544      }
1545  
1546      /**
1547       * Build the tag node that covers a span of a page.
1548       * The attributes vector always holds at least one element: the
1549       * tag name, stored as a standalone attribute at position zero.
1550       * Implementations can inspect it to choose which kind of node to
1551       * create, or to gate any other processing that may be appropriate.
1552       * @param page The page on which the node resides.
1553       * @param start The position at which the tag begins.
1554       * @param end The position at which the tag ends.
1555       * @param attributes The attributes found in the tag, tag name first.
1556       * @return A newly constructed TagNode for the given span and attributes.
1557       */
1558      public Tag createTagNode (Page page, int start, int end, Vector attributes)
1559      {
1560          return new TagNode (page, start, end, attributes);
1561      }
1562  
1563      /**
1564       * Command line entry point: lexes the page at a URL, printing each node.
1565       * @param args [0] The URL to parse.
1566       * @exception MalformedURLException If the provided URL cannot be resolved.
1567       * @exception ParserException If the parse fails.
1568       */
1569      public static void main (String[] args)
1570          throws
1571              MalformedURLException,
1572              ParserException
1573      {
1574          if (args.length <= 0)
1575          {
1576              System.out.println ("usage: java -jar htmllexer.jar <url>");
1577              return;
1578          }
1579  
1580          try
1581          {
1582              ConnectionManager connections = Page.getConnectionManager ();
1583              Lexer lexer = new Lexer (connections.openConnection (args[0]));
1584              // print every node, stopping once nextNode yields null
1585              for (Node item = lexer.nextNode (false); null != item; item = lexer.nextNode (false))
1586                  System.out.println (item.toString ());
1587          }
1588          catch (ParserException failure)
1589          {
1590              // report the failure and, when one is attached, its underlying cause
1591              System.out.println (failure.getMessage ());
1592              if (failure.getThrowable () != null)
1593                  System.out.println (failure.getThrowable ().getMessage ());
1594          }
1595      }
1596  }