/ org.htmlparser / src / org / htmlparser / Attribute.java
Attribute.java
  1  // HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
  2  // http://sourceforge.org/projects/htmlparser
  3  // Copyright (C) 2004 Derrick Oswald
  4  //
  5  // Revision Control Information
  6  //
  7  // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Attribute.java,v $
  8  // $Author: derrickoswald $
  9  // $Date: 2005/11/15 02:09:10 $
 10  // $Revision: 1.8 $
 11  //
 12  // This library is free software; you can redistribute it and/or
 13  // modify it under the terms of the GNU Lesser General Public
 14  // License as published by the Free Software Foundation; either
 15  // version 2.1 of the License, or (at your option) any later version.
 16  //
 17  // This library is distributed in the hope that it will be useful,
 18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20  // Lesser General Public License for more details.
 21  //
 22  // You should have received a copy of the GNU Lesser General Public
 23  // License along with this library; if not, write to the Free Software
 24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25  //
 26  
 27  package org.htmlparser;
 28  
 29  import java.io.Serializable;
 30  
 31  /**
 32   * An attribute within a tag.
 33   * Holds the name, assignment string, value and quote character.
 34   * <p>
 35   * This class was made deliberately simple. Except for
 36   * {@link #setRawValue RawValue}, the properties are completely orthogonal,
 37   * that is: each property is independent of the others. This means you have
 38   * enough rope here to hang yourself, and it's very easy to create
 39   * malformed HTML. Where it's obvious, warnings and notes have been provided
 40   * in the setters javadocs, but it is up to you -- the programmer --
 41   * to ensure that the contents of the four fields will yield valid HTML
 42   * (if that's what you want).
 43   * <p>
 44   * Be especially mindful of quotes and assignment strings. These are handled
 45   * by the constructors where it's obvious, but in general, you need to set
 46   * them explicitly when building an attribute. For example to construct
 47   * the attribute <b><code>label="A multi word value."</code></b> you could use:
 48   * <pre>
 49   *     attribute = new Attribute ();
 50   *     attribute.setName ("label");
 51   *     attribute.setAssignment ("=");
 52   *     attribute.setValue ("A multi word value.");
 53   *     attribute.setQuote ('"');
 54   * </pre>
 55   * or
 56   * <pre>
 57   *     attribute = new Attribute ();
 58   *     attribute.setName ("label");
 59   *     attribute.setAssignment ("=");
 60   *     attribute.setRawValue ("A multi word value.");
 61   * </pre>
 62   * or
 63   * <pre>
 64   *     attribute = new Attribute ("label", "A multi word value.");
 65   * </pre>
 66   * Note that the assignment value and quoting need to be set separately when
 67   * building the attribute from scratch using the properties.
 68   * <p>
 69   * <table width="100.0%" align="Center" border="1">
 70   *   <caption>Valid States for Attributes.</caption>
 71   *   <tr>
 72   *     <th align="Center">Description</th>
 73   *     <th align="Center">toString()</th>
 74   *     <th align="Center">Name</th>
 75   *     <th align="Center">Assignment</th>
 76   *     <th align="Center">Value</th>
 77   *     <th align="Center">Quote</th>
 78   *   </tr>
 79   *   <tr>
 80   *     <td align="Center">whitespace attribute</td>
 81   *     <td align="Center">value</td>
 82   *     <td align="Center"><code>null</code></td>
 83   *     <td align="Center"><code>null</code></td>
 84   *     <td align="Center">"value"</td>
 85   *     <td align="Center"><code>0</code></td>
 86   *   </tr>
 87   *   <tr>
 88   *     <td align="Center">standalone attribute</td>
 89   *     <td align="Center">name</td>
 90   *     <td align="Center">"name"</td>
 91   *     <td align="Center"><code>null</code></td>
 92   *     <td align="Center"><code>null</code></td>
 93   *     <td align="Center"><code>0</code></td>
 94   *   </tr>
 95   *   <tr>
 96   *     <td align="Center">empty attribute</td>
 97   *     <td align="Center">name=</td>
 98   *     <td align="Center">"name"</td>
 99   *     <td align="Center">"="</td>
100   *     <td align="Center"><code>null</code></td>
101   *     <td align="Center"><code>0</code></td>
102   *   </tr>
103   *   <tr>
104   *     <td align="Center">empty single quoted attribute</td>
105   *     <td align="Center">name=''</td>
106   *     <td align="Center">"name"</td>
107   *     <td align="Center">"="</td>
108   *     <td align="Center"><code>null</code></td>
109   *     <td align="Center"><code>'</code></td>
110   *   </tr>
111   *   <tr>
112   *     <td align="Center">empty double quoted attribute</td>
113   *     <td align="Center">name=""</td>
114   *     <td align="Center">"name"</td>
115   *     <td align="Center">"="</td>
116   *     <td align="Center"><code>null</code></td>
117   *     <td align="Center"><code>"</code></td>
118   *   </tr>
119   *   <tr>
120   *     <td align="Center">naked attribute</td>
121   *     <td align="Center">name=value</td>
122   *     <td align="Center">"name"</td>
123   *     <td align="Center">"="</td>
124   *     <td align="Center">"value"</td>
125   *     <td align="Center"><code>0</code></td>
126   *   </tr>
127   *   <tr>
128   *     <td align="Center">single quoted attribute</td>
129   *     <td align="Center">name='value'</td>
130   *     <td align="Center">"name"</td>
131   *     <td align="Center">"="</td>
132   *     <td align="Center">"value"</td>
133   *     <td align="Center"><code>'</code></td>
134   *   </tr>
135   *   <tr>
136   *     <td align="Center">double quoted attribute</td>
137   *     <td align="Center">name="value"</td>
138   *     <td align="Center">"name"</td>
139   *     <td align="Center">"="</td>
140   *     <td align="Center">"value"</td>
141   *     <td align="Center"><code>"</code></td>
142   *   </tr>
143   * </table>
144   * <br>In words:
145   * <br>If Name is null, and Assignment is null, and Quote is zero,
146   *   it's whitespace and Value has the whitespace text -- value
147   * <br>If Name is not null, and both Assignment and Value are null
148   *   it's a standalone attribute -- name
149   * <br>If Name is not null, and Assignment is an equals sign, and Quote is zero
150   *   it's an empty attribute -- name=
151   * <br>If Name is not null, and Assignment is an equals sign,
152   *   and Value is "" or null, and Quote is '
153   *   it's an empty single quoted attribute -- name=''
154   * <br>If Name is not null, and Assignment is an equals sign,
155   *   and Value is "" or null, and Quote is "
156   *   it's an empty double quoted attribute -- name=""
157   * <br>If Name is not null, and Assignment is an equals sign,
158   *   and Value is something, and Quote is zero
159   *   it's a naked attribute -- name=value
160   * <br>If Name is not null, and Assignment is an equals sign,
161   *   and Value is something, and Quote is '
162   *   it's a single quoted attribute -- name='value'
163   * <br>If Name is not null, and Assignment is an equals sign,
164   *   and Value is something, and Quote is "
165   *   it's a double quoted attribute -- name="value"
166   * <br>All other states are invalid HTML.
167   * <p>
168   * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2">
169   * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a>
170   * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2:<p>
171   * <cite>
172   * 3.2.2 Attributes<p>
173   * Elements may have associated properties, called attributes, which may
174   * have values (by default, or set by authors or scripts). Attribute/value
175   * pairs appear before the final ">" of an element's start tag. Any number
176   * of (legal) attribute value pairs, separated by spaces, may appear in an
177   * element's start tag. They may appear in any order.<p>
178   * In this example, the id attribute is set for an H1 element:
179   * <pre>
180   * <code>
181   * {@.html
182   *  <H1 id="section1">
183   *  This is an identified heading thanks to the id attribute
184   *  </H1>}
185   * </code>
186   * </pre>
187   * By default, SGML requires that all attribute values be delimited using
188   * either double quotation marks (ASCII decimal 34) or single quotation
189   * marks (ASCII decimal 39). Single quote marks can be included within the
190   * attribute value when the value is delimited by double quote marks, and
191   * vice versa. Authors may also use numeric character references to
192   * represent double quotes (&amp;#34;) and single quotes (&amp;#39;).
193   * For doublequotes authors can also use the character entity reference
194   * &amp;quot;.<p>
195   * In certain cases, authors may specify the value of an attribute without
196   * any quotation marks. The attribute value may only contain letters
197   * (a-z and A-Z), digits (0-9), hyphens (ASCII decimal 45),
198   * periods (ASCII decimal 46), underscores (ASCII decimal 95),
199   * and colons (ASCII decimal 58). We recommend using quotation marks even
200   * when it is possible to eliminate them.<p>
201   * Attribute names are always case-insensitive.<p>
202   * Attribute values are generally case-insensitive. The definition of each
203   * attribute in the reference manual indicates whether its value is
204   * case-insensitive.<p>
205   * All the attributes defined by this specification are listed in the
206   * <a href="http://www.w3.org/TR/html4/index/attributes.html">attribute
207   * index</a>.<p>
208   * </cite>
209   * <p>
210   */
public class Attribute
    implements
        Serializable
{
    /**
     * The name of this attribute.
     * The part before the equals sign, or the stand-alone attribute.
     * This will be <code>null</code> if the attribute is whitespace.
     */
    protected String mName;

    /**
     * The assignment string of the attribute.
     * The equals sign.
     * This will be <code>null</code> if the attribute is a
     * stand-alone attribute.
     */
    protected String mAssignment;

    /**
     * The value of the attribute.
     * The part after the equals sign.
     * This will be <code>null</code> if the attribute is an empty or
     * stand-alone attribute.
     */
    protected String mValue;

    /**
     * The quote, if any, surrounding the value of the attribute, if any.
     * This will be zero if there are no quotes around the value.
     */
    protected char mQuote;

    /**
     * Create an attribute with the name, assignment, value and quote given.
     * If the quote value is zero, assigns the value using {@link #setRawValue}
     * which sets the quote character to a proper value if necessary.
     * Otherwise the value and quote are stored exactly as given.
     * All four properties are assigned through their setters, in the order
     * name, assignment, value, quote.
     * @param name The name of this attribute.
     * @param assignment The assignment string of this attribute.
     * @param value The value of this attribute.
     * @param quote The quote around the value of this attribute.
     */
    public Attribute (String name, String assignment, String value, char quote)
    {
        setName (name);
        setAssignment (assignment);
        if (0 == quote)
            setRawValue (value);
        else
        {
            setValue (value);
            setQuote (quote);
        }
    }

    /**
     * Create an attribute with the name, value and quote given.
     * Uses an equals sign as the assignment string if the value is not
     * <code>null</code>, and calls {@link #setRawValue} to get the
     * correct quoting if <code>quote</code> is zero.
     * If the value is <code>null</code> the assignment string is also
     * <code>null</code>, so that the result is a stand-alone attribute
     * for which {@link #isStandAlone} is <code>true</code> and
     * {@link #isEmpty} is <code>false</code>.
     * @param name The name of this attribute.
     * @param value The value of this attribute.
     * @param quote The quote around the value of this attribute.
     */
    public Attribute (String name, String value, char quote)
    {
        // a missing value means there is no equals sign either: using ""
        // here would make a stand-alone attribute look like an empty one
        this (name, (null == value ? null : "="), value, quote);
    }

    /**
     * Create a whitespace attribute with the value given.
     * The name and assignment are set to <code>null</code> and the quote
     * to zero.
     * @param value The value of this attribute.
     * @exception IllegalArgumentException if the value contains other than
     * whitespace. To set a real value use {@link #Attribute(String,String)}.
     * @exception NullPointerException if the value is <code>null</code>.
     */
    public Attribute (String value)
        throws
            IllegalArgumentException
    {
        if (0 != value.trim ().length ())
            throw new IllegalArgumentException ("non whitespace value");

        setName (null);
        setAssignment (null);
        setValue (value);
        setQuote ((char)0);
    }

    /**
     * Create an attribute with the name and value given.
     * Uses an equals sign as the assignment string if the value is not
     * <code>null</code>, and calls {@link #setRawValue} to get the
     * correct quoting.
     * If the value is <code>null</code> the assignment string is also
     * <code>null</code>, which yields a stand-alone attribute.
     * @param name The name of this attribute.
     * @param value The value of this attribute.
     */
    public Attribute (String name, String value)
    {
        this (name, (null == value ? null : "="), value, (char)0);
    }

    /**
     * Create an attribute with the name, assignment string and value given.
     * Calls {@link #setRawValue} to get the correct quoting.
     * @param name The name of this attribute.
     * @param assignment The assignment string of this attribute.
     * @param value The value of this attribute.
     */
    public Attribute (String name, String assignment, String value)
    {
        this (name, assignment, value, (char)0);
    }

    /**
     * Create an empty attribute.
     * The name, assignment and value are <code>null</code> and the quote is
     * zero. This will provide "" from the {@link #toString} and
     * {@link #toString(StringBuffer)} methods.
     */
    public Attribute ()
    {
        this (null, null, null, (char)0);
    }

    /**
     * Get the name of this attribute.
     * The part before the equals sign, or the contents of the
     * stand-alone attribute.
     * @return The name, or <code>null</code> if it's just a whitespace
     * 'attribute'.
     * @see #setName
     */
    public String getName ()
    {
        return (mName);
    }

    /**
     * Get the name of this attribute.
     * Nothing is appended if the name is <code>null</code>.
     * @param buffer The buffer to place the name in.
     * @see #getName()
     * @see #setName
     */
    public void getName (StringBuffer buffer)
    {
        if (null != mName)
            buffer.append (mName);
    }

    /**
     * Set the name of this attribute.
     * Set the part before the equals sign, or the contents of the
     * stand-alone attribute.
     * <em>WARNING:</em> Setting this to <code>null</code> can result in
     * malformed HTML if the assignment string is not <code>null</code>.
     * @param name The new name.
     * @see #getName
     * @see #getName(StringBuffer)
     */
    public void setName (String name)
    {
        mName = name;
    }

    /**
     * Get the assignment string of this attribute.
     * This is usually just an equals sign, but in poorly formed attributes it
     * can include whitespace on either or both sides of an equals sign.
     * @return The assignment string, or <code>null</code> if none was set.
     * @see #setAssignment
     */
    public String getAssignment ()
    {
        return (mAssignment);
    }

    /**
     * Get the assignment string of this attribute.
     * Nothing is appended if the assignment string is <code>null</code>.
     * @param buffer The buffer to place the assignment string in.
     * @see #getAssignment()
     * @see #setAssignment
     */
    public void getAssignment (StringBuffer buffer)
    {
        if (null != mAssignment)
            buffer.append (mAssignment);
    }

    /**
     * Set the assignment string of this attribute.
     * <em>WARNING:</em> Setting this property to other than an equals sign
     * or <code>null</code> will result in malformed HTML. In the case of a
     * <code>null</code>, the {@link  #setValue value} should also be set to
     * <code>null</code>.
     * @param assignment The new assignment string.
     * @see #getAssignment
     * @see #getAssignment(StringBuffer)
     */
    public void setAssignment (String assignment)
    {
        mAssignment = assignment;
    }

    /**
     * Get the value of the attribute.
     * The part after the equals sign, or the text if it's just a whitespace
     * 'attribute'.
     * <em>NOTE:</em> This does not include any quotes that may have enclosed
     * the value when it was read. To get the un-stripped value use
     * {@link  #getRawValue}.
     * @return The value, or <code>null</code> if it's a stand-alone or
     * empty attribute, or the text if it's just a whitespace 'attribute'.
     * @see #setValue
     */
    public String getValue ()
    {
        return (mValue);
    }

    /**
     * Get the value of the attribute.
     * Nothing is appended if the value is <code>null</code>.
     * @param buffer The buffer to place the value in.
     * @see #getValue()
     * @see #setValue
     */
    public void getValue (StringBuffer buffer)
    {
        if (null != mValue)
            buffer.append (mValue);
    }

    /**
     * Set the value of the attribute.
     * The part after the equals sign, or the text if it's a whitespace
     * 'attribute'.
     * <em>WARNING:</em> Setting this property to a value that needs to be
     * quoted without also setting the quote character will result in malformed
     * HTML.
     * @param value The new value.
     * @see #getValue
     * @see #getValue(StringBuffer)
     */
    public void setValue (String value)
    {
        mValue = value;
    }

    /**
     * Get the quote, if any, surrounding the value of the attribute, if any.
     * @return Either ' or " if the attribute value was quoted, or zero
     * if there are no quotes around it.
     * @see #setQuote
     */
    public char getQuote ()
    {
        return (mQuote);
    }

    /**
     * Get the quote, if any, surrounding the value of the attribute, if any.
     * Nothing is appended if the quote is zero.
     * @param buffer The buffer to place the quote in.
     * @see #getQuote()
     * @see #setQuote
     */
    public void getQuote (StringBuffer buffer)
    {
        if (0 != mQuote)
            buffer.append (mQuote);
    }

    /**
     * Set the quote surrounding the value of the attribute.
     * <em>WARNING:</em> Setting this property to zero will result in malformed
     * HTML if the {@link  #getValue value} needs to be quoted (i.e. contains
     * whitespace).
     * @param quote The new quote value.
     * @see #getQuote
     * @see #getQuote(StringBuffer)
     */
    public void setQuote (char quote)
    {
        mQuote = quote;
    }

    /**
     * Get the raw value of the attribute.
     * The part after the equals sign, or the text if it's just a whitespace
     * 'attribute'. This includes the quotes around the value if any.
     * <em>NOTE:</em> When {@link #isValued} is <code>false</code> this
     * returns <code>null</code> even if a quote character is set, whereas
     * {@link #getRawValue(StringBuffer)} would still append the two quotes.
     * @return The value, or <code>null</code> if the attribute has no value
     * (i.e. it's a stand-alone or empty attribute),
     * or the text if it's just a whitespace 'attribute'.
     * @see #setRawValue
     */
    public String getRawValue ()
    {
        char quote;
        StringBuffer buffer;

        if (!isValued ())
            return (null);

        quote = getQuote ();
        if (0 == quote)
            return (getValue ());

        // the value is fetched through getValue(StringBuffer), so its
        // length isn't known up front and the buffer is left at default size
        buffer = new StringBuffer ();
        buffer.append (quote);
        getValue (buffer);
        buffer.append (quote);

        return (buffer.toString ());
    }

    /**
     * Get the raw value of the attribute.
     * The part after the equals sign, or the text if it's just a whitespace
     * 'attribute'. This includes the quotes around the value if any.
     * Appends the quote (if not zero), the value (if not <code>null</code>)
     * and the quote again.
     * @param buffer The string buffer to append the attribute value to.
     * @see #getRawValue()
     * @see #setRawValue
     */
    public void getRawValue (StringBuffer buffer)
    {
        getQuote (buffer);
        getValue (buffer);
        getQuote (buffer);
    }

    /**
     * Set the value of the attribute and the quote character.
     * If the value is <code>null</code> or pure whitespace, assign it
     * 'as is' and reset the quote character to zero. If not, check for
     * leading and trailing double or single quotes, and if found use this
     * as the quote character and the inner contents of <code>value</code>
     * as the real value.
     * Otherwise, examine the string to determine if quotes are needed
     * and an appropriate quote character if so. This may involve changing
     * double quotes within the string to <code>&amp;quot;</code>
     * character entity references.
     * @param value The new value.
     * @see #getRawValue
     * @see #getRawValue(StringBuffer)
     */
    public void setRawValue (String value)
    {
        char quote;

        quote = 0;
        if ((null != value) && (0 != value.trim ().length ()))
        {
            if (isEnclosedBy (value, '\''))
            {
                quote = '\'';
                value = value.substring (1, value.length () - 1);
            }
            else if (isEnclosedBy (value, '"'))
            {
                quote = '"';
                value = value.substring (1, value.length () - 1);
            }
            else
            {
                quote = chooseQuote (value);
                // a double quote delimiter chosen for a value that itself
                // contains double quotes means both kinds of quote occur
                // in it, so the inner double quotes have to be encoded
                if (('"' == quote) && (-1 != value.indexOf ('"')))
                    value = encodeDoubleQuotes (value);
            }
        }
        setValue (value);
        setQuote (quote);
    }

    /**
     * Test if the text both starts and ends with the given quote character.
     * A lone quote character doesn't count: at least two characters are
     * required.
     * @param text The text to examine.
     * @param quote The quote character to look for.
     * @return <code>true</code> if the text is delimited by the quote.
     */
    private static boolean isEnclosedBy (String text, char quote)
    {
        int length;

        length = text.length ();

        return ((2 <= length)
            && (quote == text.charAt (0))
            && (quote == text.charAt (length - 1)));
    }

    /**
     * Determine the quote character needed to delimit an unquoted value.
     * Quotes are needed if the value contains anything other than letters,
     * digits, hyphens, periods, underscores and colons. Double quotes are
     * preferred, single quotes are used if the value contains double quotes
     * but no single quotes, and double quotes are used (requiring encoding
     * of the contained double quotes) if it contains both.
     * @param value The value to examine.
     * @return The quote character to use, or zero if none is needed.
     */
    private static char chooseQuote (String value)
    {
        boolean needed;
        boolean hasSingle;
        boolean hasDouble;
        char ch;

        needed = false;
        hasSingle = false;
        hasDouble = false;
        for (int i = 0; i < value.length (); i++)
        {
            ch = value.charAt (i);
            if ('\'' == ch)
                hasSingle = true;
            else if ('"' == ch)
                hasDouble = true;
            else if (('-' != ch) && ('.' != ch) && ('_' != ch)
                && (':' != ch) && !Character.isLetterOrDigit (ch))
                needed = true;
        }

        if (!needed && !hasSingle && !hasDouble)
            return ((char)0);

        // single quotes are only used when they avoid encoding altogether
        return ((hasDouble && !hasSingle) ? '\'' : '"');
    }

    /**
     * Replace every double quote in the text with <code>&amp;quot;</code>.
     * Done by hand rather than with <code>String.replaceAll</code>,
     * which requires JDK 1.4.
     * @param text The text to encode.
     * @return The text with the double quotes encoded.
     */
    private static String encodeDoubleQuotes (String text)
    {
        final String ref = "&quot;";
        StringBuffer buffer;
        char ch;

        buffer = new StringBuffer (text.length () + 4 * ref.length ());
        for (int i = 0; i < text.length (); i++)
        {
            ch = text.charAt (i);
            if ('"' == ch)
                buffer.append (ref);
            else
                buffer.append (ch);
        }

        return (buffer.toString ());
    }

    /**
     * Predicate to determine if this attribute is whitespace.
     * @return <code>true</code> if this attribute is whitespace
     * (its name is <code>null</code>),
     * <code>false</code> if it is a real attribute.
     */
    public boolean isWhitespace ()
    {
        return (null == getName ());
    }

    /**
     * Predicate to determine if this attribute has no equals sign (or value).
     * @return <code>true</code> if this attribute is a standalone attribute
     * (it has a name but its assignment string is <code>null</code>),
     * <code>false</code> if has an equals sign.
     */
    public boolean isStandAlone ()
    {
        return ((null != getName ()) && (null == getAssignment ()));
    }

    /**
     * Predicate to determine if this attribute has an equals sign but no value.
     * @return <code>true</code> if this attribute is an empty attribute
     * (its assignment string is not <code>null</code> but its value is),
     * <code>false</code> if has an equals sign and a value.
     */
    public boolean isEmpty ()
    {
        return ((null != getAssignment ()) && (null == getValue ()));
    }

    /**
     * Predicate to determine if this attribute has a value.
     * @return <code>true</code> if this attribute has a value
     * (its value is not <code>null</code>),
     * <code>false</code> if it is empty or standalone.
     */
    public boolean isValued ()
    {
        return (null != getValue ());
    }

    /**
     * Get the length of the string value of this attribute.
     * This is the sum of the lengths of the name, assignment string and
     * value (where not <code>null</code>), plus two if the quote is not zero.
     * @return The number of characters required to express this attribute.
     */
    public int getLength ()
    {
        String name;
        String assignment;
        String value;
        int ret;

        ret = 0;
        name = getName ();
        if (null != name)
            ret += name.length ();
        assignment = getAssignment ();
        if (null != assignment)
            ret += assignment.length ();
        value = getValue ();
        if (null != value)
            ret += value.length ();
        if (0 != getQuote ())
            ret += 2; // one opening and one closing quote

        return (ret);
    }

    /**
     * Get a text representation of this attribute.
     * Suitable for insertion into a tag, the output is one of
     * the forms:
     * <pre>
     * value
     * name
     * name=
     * name=value
     * name='value'
     * name="value"
     * </pre>
     * @return A string that can be used within a tag.
     */
    public String toString ()
    {
        StringBuffer ret;

        // presize the buffer to avoid extra StringBuffer allocations
        ret = new StringBuffer (getLength ());
        toString (ret);

        return (ret.toString ());
    }

    /**
     * Get a text representation of this attribute.
     * Appends the name, the assignment string and the raw value, in that
     * order.
     * @param buffer The accumulator for placing the text into.
     * @see #toString()
     */
    public void toString (StringBuffer buffer)
    {
        getName (buffer);
        getAssignment (buffer);
        getRawValue (buffer);
    }

}