/ libxml2 / test / valid / REC-xml-19980210.xml
REC-xml-19980210.xml
   1  <?xml version='1.0' encoding='ISO-8859-1' standalone='no'?>
   2  <!DOCTYPE spec SYSTEM "dtds/spec.dtd" [
   3  
   4  <!-- LAST TOUCHED BY: Tim Bray, 8 February 1997 -->
   5  
   6  <!-- The words 'FINAL EDIT' in comments mark places where changes
   7  need to be made after approval of the document by the ERB, before
   8  publication.  -->
   9  
  10  <!ENTITY XML.version "1.0">
  11  <!ENTITY doc.date "10 February 1998">
  12  <!ENTITY iso6.doc.date "19980210">
  13  <!ENTITY w3c.doc.date "02-Feb-1998">
  14  <!ENTITY draft.day '10'>
  15  <!ENTITY draft.month 'February'>
  16  <!ENTITY draft.year '1998'>
  17  
  18  <!ENTITY WebSGML 
  19   'WebSGML Adaptations Annex to ISO 8879'>
  20  
  21  <!ENTITY lt     "<"> 
  22  <!ENTITY gt     ">"> 
  23  <!ENTITY xmlpio "'&lt;?xml'">
  24  <!ENTITY pic    "'?>'">
  25  <!ENTITY br     "\n">
  26  <!ENTITY cellback '#c0d9c0'>
  27  <!ENTITY mdash  "--"> <!-- &#x2014, but nsgmls doesn't grok hex -->
  28  <!ENTITY com    "--">
  29  <!ENTITY como   "--">
  30  <!ENTITY comc   "--">
  31  <!ENTITY hcro   "&amp;#x">
  32  <!-- <!ENTITY nbsp "&#xA0;"> -->
  33  <!ENTITY nbsp   "&#160;">
  34  <!ENTITY magicents "<code>amp</code>,
  35  <code>lt</code>,
  36  <code>gt</code>,
  37  <code>apos</code>,
  38  <code>quot</code>">
  39   
  40  <!-- audience and distribution status:  for use at publication time -->
  41  <!ENTITY doc.audience "public review and discussion">
  42  <!ENTITY doc.distribution "may be distributed freely, as long as
  43  all text and legal notices remain intact">
  44  
  45  ]>
  46  
  47  <!-- for Panorama *-->
  48  <?VERBATIM "eg" ?>
  49  
  50  <spec>
  51  <header>
  52  <title>Extensible Markup Language (XML) 1.0</title>
  53  <version></version>
  54  <w3c-designation>REC-xml-&iso6.doc.date;</w3c-designation>
  55  <w3c-doctype>W3C Recommendation</w3c-doctype>
  56  <pubdate><day>&draft.day;</day><month>&draft.month;</month><year>&draft.year;</year></pubdate>
  57  
  58  <publoc>
  59  <loc  href="http://www.w3.org/TR/1998/REC-xml-&iso6.doc.date;">
  60  http://www.w3.org/TR/1998/REC-xml-&iso6.doc.date;</loc>
  61  <loc  href="http://www.w3.org/TR/1998/REC-xml-&iso6.doc.date;.xml">
  62  http://www.w3.org/TR/1998/REC-xml-&iso6.doc.date;.xml</loc>
  63  <loc  href="http://www.w3.org/TR/1998/REC-xml-&iso6.doc.date;.html">
  64  http://www.w3.org/TR/1998/REC-xml-&iso6.doc.date;.html</loc>
  65  <loc  href="http://www.w3.org/TR/1998/REC-xml-&iso6.doc.date;.pdf">
  66  http://www.w3.org/TR/1998/REC-xml-&iso6.doc.date;.pdf</loc>
  67  <loc  href="http://www.w3.org/TR/1998/REC-xml-&iso6.doc.date;.ps">
  68  http://www.w3.org/TR/1998/REC-xml-&iso6.doc.date;.ps</loc>
  69  </publoc>
  70  <latestloc>
  71  <loc  href="http://www.w3.org/TR/REC-xml">
  72  http://www.w3.org/TR/REC-xml</loc>
  73  </latestloc>
  74  <prevlocs>
  75  <loc  href="http://www.w3.org/TR/PR-xml-971208">
  76  http://www.w3.org/TR/PR-xml-971208</loc>
  77  <!--
  78  <loc  href='http://www.w3.org/TR/WD-xml-961114'>
  79  http://www.w3.org/TR/WD-xml-961114</loc>
  80  <loc  href='http://www.w3.org/TR/WD-xml-lang-970331'>
  81  http://www.w3.org/TR/WD-xml-lang-970331</loc>
  82  <loc  href='http://www.w3.org/TR/WD-xml-lang-970630'>
  83  http://www.w3.org/TR/WD-xml-lang-970630</loc>
  84  <loc  href='http://www.w3.org/TR/WD-xml-970807'>
  85  http://www.w3.org/TR/WD-xml-970807</loc>
  86  <loc  href='http://www.w3.org/TR/WD-xml-971117'>
  87  http://www.w3.org/TR/WD-xml-971117</loc>-->
  88  </prevlocs>
  89  <authlist>
  90  <author><name>Tim Bray</name>
  91  <affiliation>Textuality and Netscape</affiliation>
  92  <email 
  93  href="mailto:tbray@textuality.com">tbray@textuality.com</email></author>
  94  <author><name>Jean Paoli</name>
  95  <affiliation>Microsoft</affiliation>
  96  <email href="mailto:jeanpa@microsoft.com">jeanpa@microsoft.com</email></author>
  97  <author><name>C. M. Sperberg-McQueen</name>
  98  <affiliation>University of Illinois at Chicago</affiliation>
  99  <email href="mailto:cmsmcq@uic.edu">cmsmcq@uic.edu</email></author>
 100  </authlist>
 101  <abstract>
 102  <p>The Extensible Markup Language (XML) is a subset of
 103  SGML that is completely described in this document. Its goal is to
 104  enable generic SGML to be served, received, and processed on the Web
 105  in the way that is now possible with HTML. XML has been designed for
 106  ease of implementation and for interoperability with both SGML and
 107  HTML.</p>
 108  </abstract>
 109  <status>
 110  <p>This document has been reviewed by W3C Members and
 111  other interested parties and has been endorsed by the
 112  Director as a W3C Recommendation. It is a stable
 113  document and may be used as reference material or cited
 114  as a normative reference from another document. W3C's
 115  role in making the Recommendation is to draw attention
 116  to the specification and to promote its widespread
 117  deployment. This enhances the functionality and
 118  interoperability of the Web.</p>
 119  <p>
 120  This document specifies a syntax created by subsetting an existing,
 121  widely used international text processing standard (Standard
 122  Generalized Markup Language, ISO 8879:1986(E) as amended and
 123  corrected) for use on the World Wide Web.  It is a product of the W3C
 124  XML Activity, details of which can be found at <loc
 125  href='http://www.w3.org/XML'>http://www.w3.org/XML</loc>.  A list of
 126  current W3C Recommendations and other technical documents can be found
 127  at <loc href='http://www.w3.org/TR'>http://www.w3.org/TR</loc>.
 128  </p>
 129  <p>This specification uses the term URI, which is defined by <bibref
 130  ref="Berners-Lee"/>, a work in progress expected to update <bibref
 131  ref="RFC1738"/> and <bibref ref="RFC1808"/>. 
 132  </p>
 133  <p>The list of known errors in this specification is 
 134  available at 
 135  <loc href='http://www.w3.org/XML/xml-19980210-errata'>http://www.w3.org/XML/xml-19980210-errata</loc>.</p>
 136  <p>Please report errors in this document to 
 137  <loc href='mailto:xml-editor@w3.org'>xml-editor@w3.org</loc>.
 138  </p>
 139  </status>
 140  
 141  
 142  <pubstmt>
 143  <p>Chicago, Vancouver, Mountain View, et al.:
 144  World-Wide Web Consortium, XML Working Group, 1996, 1997.</p>
 145  </pubstmt>
 146  <sourcedesc>
 147  <p>Created in electronic form.</p>
 148  </sourcedesc>
 149  <langusage>
 150  <language id='EN'>English</language>
 151  <language id='ebnf'>Extended Backus-Naur Form (formal grammar)</language>
 152  </langusage>
 153  <revisiondesc>
 154  <slist>
 155  <sitem>1997-12-03 : CMSMcQ : yet further changes</sitem>
 156  <sitem>1997-12-02 : TB : further changes (see TB to XML WG,
 157  2 December 1997)</sitem>
 158  <sitem>1997-12-02 : CMSMcQ : deal with as many corrections and
 159  comments from the proofreaders as possible:
 160  entify hard-coded document date in pubdate element,
 161  change expansion of entity WebSGML,
 162  update status description as per Dan Connolly (am not sure
 163  about reference to Berners-Lee et al.),
 164  add 'The' to abstract as per WG decision,
 165  move Relationship to Existing Standards to back matter and
 166  combine with References,
 167  re-order back matter so normative appendices come first,
 168  re-tag back matter so informative appendices are tagged informdiv1,
 169  remove XXX XXX from list of 'normative' specs in prose,
 170  move some references from Other References to Normative References,
 171  add RFC 1738, 1808, and 2141 to Other References (they are not
 172  normative since we do not require the processor to enforce any 
 173  rules based on them),
 174  add reference to 'Fielding draft' (Berners-Lee et al.),
 175  move notation section to end of body,
 176  drop URIchar non-terminal and use SkipLit instead,
 177  lose stray reference to defunct nonterminal 'markupdecls',
 178  move reference to Aho et al. into appendix (Tim's right),
 179  add prose note saying that hash marks and fragment identifiers are
 180  NOT part of the URI formally speaking, and are NOT legal in 
 181  system identifiers (processor 'may' signal an error).
 182  Work through:
 183  Tim Bray reacting to James Clark,
 184  Tim Bray on his own,
 185  Eve Maler,
 186  
 187  NOT DONE YET:
 188  change binary / text to unparsed / parsed.
 189  handle James's suggestion about &lt; in attribute values
 190  uppercase hex characters,
 191  namechar list,
 192  </sitem>
 193  <sitem>1997-12-01 : JB : add some column-width parameters</sitem>
 194  <sitem>1997-12-01 : CMSMcQ : begin round of changes to incorporate
 195  recent WG decisions and other corrections:
 196  binding sources of character encoding info (27 Aug / 3 Sept),
 197  correct wording of Faust quotation (restore dropped line),
 198  drop SDD from EncodingDecl,
 199  change text at version number 1.0,
 200  drop misleading (wrong!) sentence about ignorables and extenders,
 201  modify definition of PCData to make bar on msc grammatical,
 202  change grammar's handling of internal subset (drop non-terminal markupdecls),
 203  change definition of includeSect to allow conditional sections,
 204  add integral-declaration constraint on internal subset,
 205  drop misleading / dangerous sentence about relationship of
 206  entities with system storage objects,
 207  change table body tag to htbody as per EM change to DTD,
 208  add rule about space normalization in public identifiers,
 209  add description of how to generate our name-space rules from 
 210  Unicode character database (needs further work!).
 211  </sitem>
 212  <sitem>1997-10-08 : TB : Removed %-constructs again, new rules
 213  for PE appearance.</sitem>
 214  <sitem>1997-10-01 : TB : Case-sensitive markup; cleaned up
 215  element-type defs, lotsa little edits for style</sitem>
 216  <sitem>1997-09-25 : TB : Change to elm's new DTD, with
 217  substantial detail cleanup as a side-effect</sitem>
 218  <sitem>1997-07-24 : CMSMcQ : correct error (lost *) in definition 
 219  of ignoreSectContents (thanks to Makoto Murata)</sitem>
 220  <sitem>Allow all empty elements to have end-tags, consistent with
 221  SGML TC (as per JJC).</sitem>
 222  <sitem>1997-07-23 : CMSMcQ : pre-emptive strike on pending corrections:
 223  introduce the term 'empty-element tag', note that all empty elements
 224  may use it, and elements declared EMPTY must use it.
 225  Add WFC requiring encoding decl to come first in an entity.
 226  Redefine notations to point to PIs as well as binary entities.
 227  Change autodetection table by removing bytes 3 and 4 from 
 228  examples with Byte Order Mark.
 229  Add content model as a term and clarify that it applies to both
 230  mixed and element content.
 231  </sitem>
 232  <sitem>1997-06-30 : CMSMcQ : change date, some cosmetic changes,
 233  changes to productions for choice, seq, Mixed, NotationType,
 234  Enumeration.  Follow James Clark's suggestion and prohibit 
 235  conditional sections in internal subset.  TO DO:  simplify
 236  production for ignored sections as a result, since we don't 
 237  need to worry about parsers which don't expand PErefs finding
 238  a conditional section.</sitem>
 239  <sitem>1997-06-29 : TB : various edits</sitem>
 240  <sitem>1997-06-29 : CMSMcQ : further changes:
 241  Suppress old FINAL EDIT comments and some dead material.
 242  Revise occurrences of % in grammar to exploit Henry Thompson's pun,
 243  especially markupdecl and attdef.
 244  Remove RMD requirement relating to element content (?).
 245  </sitem>
 246  <sitem>1997-06-28 : CMSMcQ : Various changes for 1 July draft:
 247  Add text for draconian error handling (introduce
 248  the term Fatal Error).
 249  RE deleta est (changing wording from 
 250  original announcement to restrict the requirement to validating
 251  parsers).
 252  Tag definition of validating processor and link to it.
 253  Add colon as name character.
 254  Change def of %operator.
 255  Change standard definitions of lt, gt, amp.
 256  Strip leading zeros from #x00nn forms.</sitem>
 257  <sitem>1997-04-02 : CMSMcQ : final corrections of editorial errors
 258  found in last night's proofreading.  Reverse course once more on
 259  well-formed:   Webster's Second hyphenates it, and that's enough
 260  for me.</sitem>
 261  <sitem>1997-04-01 : CMSMcQ : corrections from JJC, EM, HT, and self</sitem>
 262  <sitem>1997-03-31 : Tim Bray : many changes</sitem>
 263  <sitem>1997-03-29 : CMSMcQ : some Henry Thompson (on entity handling),
 264  some Charles Goldfarb, some ERB decisions (PE handling in miscellaneous
 265  declarations).  Changed Ident element to accept def attribute.
 266  Allow normalization of Unicode characters.  Move def of systemliteral
 267  into section on literals.</sitem>
 268  <sitem>1997-03-28 : CMSMcQ : make as many corrections as possible, from
 269  Terry Allen, Norbert Mikula, James Clark, Jon Bosak, Henry Thompson,
 270  Paul Grosso, and self.  Among other things:  give in on "well formed"
 271  (Terry is right), tentatively rename QuotedCData as AttValue
 272  and Literal as EntityValue to be more informative, since attribute
 273  values are the <emph>only</emph> place QuotedCData was used, and
 274  vice versa for entity text and Literal. (I'd call it Entity Text, 
 275  but 8879 uses that name for both internal and external entities.)</sitem>
 276  <sitem>1997-03-26 : CMSMcQ : resynch the two forks of this draft, reapply
 277  my changes dated 03-20 and 03-21.  Normalize old 'may not' to 'must not'
 278  except in the one case where it meant 'may or may not'.</sitem>
 279  <sitem>1997-03-21 : TB : massive changes on plane flight from Chicago
 280  to Vancouver</sitem>
 281  <sitem>1997-03-21 : CMSMcQ : correct as many reported errors as possible.
 282  </sitem>
 283  <sitem>1997-03-20 : CMSMcQ : correct typos listed in CMSMcQ hand copy of spec.</sitem>
 284  <sitem>1997-03-20 : CMSMcQ : cosmetic changes preparatory to revision for
 285  WWW conference April 1997:  restore some of the internal entity 
 286  references (e.g. to docdate, etc.), change character xA0 to &amp;nbsp;
 287  and define nbsp as &amp;#160;, and refill a lot of paragraphs for
 288  legibility.</sitem>
 289  <sitem>1996-11-12 : CMSMcQ : revise using Tim's edits:
 290  Add list type of NUMBERED and change most lists either to
 291  BULLETS or to NUMBERED.
 292  Suppress QuotedNames, Names (not used).
 293  Correct trivial-grammar doc type decl.
 294  Rename 'marked section' as 'CDATA section' passim.
 295  Also edits from James Clark:
 296  Define the set of characters from which [^abc] subtracts.
 297  Charref should use just [0-9] not Digit.
 298  Location info needs cleaner treatment:  remove?  (ERB
 299  question).
 300  One example of a PI has wrong pic.
 301  Clarify discussion of encoding names.
 302  Encoding failure should lead to unspecified results; don't
 303  prescribe error recovery.
 304  Don't require exposure of entity boundaries.
 305  Ignore white space in element content.
 306  Reserve entity names of the form u-NNNN.
 307  Clarify relative URLs.
 308  And some of my own:
 309  Correct productions for content model:  model cannot
 310  consist of a name, so "elements ::= cp" is no good.
 311  </sitem>
 312  <sitem>1996-11-11 : CMSMcQ : revise for style.
 313  Add new rhs to entity declaration, for parameter entities.</sitem>
 314  <sitem>1996-11-10 : CMSMcQ : revise for style.
 315  Fix / complete section on names, characters.
 316  Add sections on parameter entities, conditional sections.
 317  Still to do:  Add compatibility note on deterministic content models.
 318  Finish stylistic revision.</sitem>
 319  <sitem>1996-10-31 : TB : Add Entity Handling section</sitem>
 320  <sitem>1996-10-30 : TB : Clean up term &amp; termdef.  Slip in
 321  ERB decision re EMPTY.</sitem>
 322  <sitem>1996-10-28 : TB : Change DTD.  Implement some of Michael's
 323  suggestions.  Change comments back to //.  Introduce language for
 324  XML namespace reservation.  Add section on white-space handling.
 325  Lots more cleanup.</sitem>
 326  <sitem>1996-10-24 : CMSMcQ : quick tweaks, implement some ERB
 327  decisions.  Characters are not integers.  Comments are /* */ not //.
 328  Add bibliographic refs to 10646, HyTime, Unicode.
 329  Rename old Cdata as MsData since it's <emph>only</emph> seen
 330  in marked sections.  Call them attribute-value pairs not
 331  name-value pairs, except once.  Internal subset is optional, needs
 332  '?'.  Implied attributes should be signaled to the app, not
 333  have values supplied by processor.</sitem>
 334  <sitem>1996-10-16 : TB : track down &amp; excise all DSD references;
 335  introduce some EBNF for entity declarations.</sitem>
 336  <sitem>1996-10-?? : TB : consistency check, fix up scraps so
 337  they all parse, get formatter working, correct a few productions.</sitem>
 338  <sitem>1996-10-10/11 : CMSMcQ : various maintenance, stylistic, and
 339  organizational changes:
 340  Replace a few literals with xmlpio and
 341  pic entities, to make them consistent and ensure we can change pic
 342  reliably when the ERB votes.
 343  Drop paragraph on recognizers from notation section.
 344  Add match, exact match to terminology.
 345  Move old 2.2 XML Processors and Apps into intro.
 346  Mention comments, PIs, and marked sections in discussion of
 347  delimiter escaping.
 348  Streamline discussion of doctype decl syntax.
 349  Drop old section of 'PI syntax' for doctype decl, and add
 350  section on partial-DTD summary PIs to end of Logical Structures
 351  section.
 352  Revise DSD syntax section to use Tim's subset-in-a-PI
 353  mechanism.</sitem>
 354  <sitem>1996-10-10 : TB : eliminate name recognizers (and more?)</sitem>
 355  <sitem>1996-10-09 : CMSMcQ : revise for style, consistency through 2.3
 356  (Characters)</sitem>
 357  <sitem>1996-10-09 : CMSMcQ : re-unite everything for convenience,
 358  at least temporarily, and revise quickly</sitem>
 359  <sitem>1996-10-08 : TB : first major homogenization pass</sitem>
 360  <sitem>1996-10-08 : TB : turn "current" attribute on div type into 
 361  CDATA</sitem>
 362  <sitem>1996-10-02 : TB : remould into skeleton + entities</sitem>
 363  <sitem>1996-09-30 : CMSMcQ : add a few more sections prior to exchange
 364                              with Tim.</sitem>
 365  <sitem>1996-09-20 : CMSMcQ : finish transcribing notes.</sitem>
 366  <sitem>1996-09-19 : CMSMcQ : begin transcribing notes for draft.</sitem>
 367  <sitem>1996-09-13 : CMSMcQ : made outline from notes of 09-06,
 368  do some housekeeping</sitem>
 369  </slist>
 370  </revisiondesc>
 371  </header>
 372  <body> 
 373  <div1 id='sec-intro'>
 374  <head>Introduction</head>
 375  <p>Extensible Markup Language, abbreviated XML, describes a class of
 376  data objects called <termref def="dt-xml-doc">XML documents</termref> and
 377  partially describes the behavior of 
 378  computer programs which process them. XML is an application profile or
 379  restricted form of SGML, the Standard Generalized Markup 
 380  Language <bibref ref='ISO8879'/>.
 381  By construction, XML documents 
 382  are conforming SGML documents.
 383  </p>
 384  <p>XML documents are made up of storage units called <termref
 385  def="dt-entity">entities</termref>, which contain either parsed
 386  or unparsed data.
 387  Parsed data is made up of <termref def="dt-character">characters</termref>,
 388  some 
 389  of which form <termref def="dt-chardata">character data</termref>, 
 390  and some of which form <termref def="dt-markup">markup</termref>.
 391  Markup encodes a description of the document's storage layout and
 392  logical structure. XML provides a mechanism to impose constraints on
 393  the storage layout and logical structure.</p>
 394  <p><termdef id="dt-xml-proc" term="XML Processor">A software module
 395  called an <term>XML processor</term> is used to read XML documents
 396  and provide access to their content and structure.</termdef> <termdef
 397  id="dt-app" term="Application">It is assumed that an XML processor is
 398  doing its work on behalf of another module, called the
 399  <term>application</term>.</termdef> This specification describes the
 400  required behavior of an XML processor in terms of how it must read XML
 401  data and the information it must provide to the application.</p>
 402   
 403  <div2 id='sec-origin-goals'>
 404  <head>Origin and Goals</head>
 405  <p>XML was developed by an XML Working Group (originally known as the
 406  SGML Editorial Review Board) formed under the auspices of the World
 407  Wide Web Consortium (W3C) in 1996.
 408  It was chaired by Jon Bosak of Sun
 409  Microsystems with the active participation of an XML Special
 410  Interest Group (previously known as the SGML Working Group) also
 411  organized by the W3C. The membership of the XML Working Group is given
 412  in an appendix. Dan Connolly served as the WG's contact with the W3C.
 413  </p>
 414  <p>The design goals for XML are:<olist>
 415  <item><p>XML shall be straightforwardly usable over the
 416  Internet.</p></item>
 417  <item><p>XML shall support a wide variety of applications.</p></item>
 418  <item><p>XML shall be compatible with SGML.</p></item>
 419  <item><p>It shall be easy to write programs which process XML
 420  documents.</p></item>
 421  <item><p>The number of optional features in XML is to be kept to the
 422  absolute minimum, ideally zero.</p></item>
 423  <item><p>XML documents should be human-legible and reasonably
 424  clear.</p></item>
 425  <item><p>The XML design should be prepared quickly.</p></item>
 426  <item><p>The design of XML shall be formal and concise.</p></item>
 427  <item><p>XML documents shall be easy to create.</p></item>
 428  <item><p>Terseness in XML markup is of minimal importance.</p></item></olist>
 429  </p>
 430  <p>This specification, 
 431  together with associated standards
 432  (Unicode and ISO/IEC 10646 for characters,
 433  Internet RFC 1766 for language identification tags, 
 434  ISO 639 for language name codes, and 
 435  ISO 3166 for country name codes),
 436  provides all the information necessary to understand 
 437  XML Version &XML.version;
 438  and construct computer programs to process it.</p>
 439  <p>This version of the XML specification
 440  <!-- is for &doc.audience;.-->
 441  &doc.distribution;.</p>
 442  
 443  </div2>
 444   
 445  
 446  
 447   
 448  <div2 id='sec-terminology'>
 449  <head>Terminology</head>
 450   
 451  <p>The terminology used to describe XML documents is defined in the body of
 452  this specification.
 453  The terms defined in the following list are used in building those
 454  definitions and in describing the actions of an XML processor:
 455  <glist>
 456  <gitem>
 457  <label>may</label>
 458  <def><p><termdef id="dt-may" term="May">Conforming documents and XML
 459  processors are permitted to but need not behave as
 460  described.</termdef></p></def>
 461  </gitem>
 462  <gitem>
 463  <label>must</label>
 464  <def><p>Conforming documents and XML processors 
 465  are required to behave as described; otherwise they are in error.
 466  <!-- do NOT change this! this is what defines a violation of
 467  a 'must' clause as 'an error'. -MSM -->
 468  </p></def>
 469  </gitem>
 470  <gitem>
 471  <label>error</label>
 472  <def><p><termdef id='dt-error' term='Error'
 473  >A violation of the rules of this
 474  specification; results are
 475  undefined.  Conforming software may detect and report an error and may
 476  recover from it.</termdef></p></def>
 477  </gitem>
 478  <gitem>
 479  <label>fatal error</label>
 480  <def><p><termdef id="dt-fatal" term="Fatal Error">An error
 481  which a conforming <termref def="dt-xml-proc">XML processor</termref>
 482  must detect and report to the application.
 483  After encountering a fatal error, the
 484  processor may continue
 485  processing the data to search for further errors and may report such
 486  errors to the application.  In order to support correction of errors,
 487  the processor may make unprocessed data from the document (with
 488  intermingled character data and markup) available to the application.
 489  Once a fatal error is detected, however, the processor must not
 490  continue normal processing (i.e., it must not
 491  continue to pass character data and information about the document's
 492  logical structure to the application in the normal way).
 493  </termdef></p></def>
 494  </gitem>
 495  <gitem>
 496  <label>at user option</label>
 497  <def><p>Conforming software may or must (depending on the modal verb in the
 498  sentence) behave as described; if it does, it must
 499  provide users a means to enable or disable the behavior
 500  described.</p></def>
 501  </gitem>
 502  <gitem>
 503  <label>validity constraint</label>
 504  <def><p>A rule which applies to all 
 505  <termref def="dt-valid">valid</termref> XML documents.
 506  Violations of validity constraints are errors; they must, at user option, 
 507  be reported by 
 508  <termref def="dt-validating">validating XML processors</termref>.</p></def>
 509  </gitem>
 510  <gitem>
 511  <label>well-formedness constraint</label>
 512  <def><p>A rule which applies to all <termref
 513  def="dt-wellformed">well-formed</termref> XML documents.
 514  Violations of well-formedness constraints are 
 515  <termref def="dt-fatal">fatal errors</termref>.</p></def>
 516  </gitem>
 517  
 518  <gitem>
 519  <label>match</label>
 520  <def><p><termdef id="dt-match" term="match">(Of strings or names:) 
 521  Two strings or names being compared must be identical.
 522  Characters with multiple possible representations in ISO/IEC 10646 (e.g.
 523  characters with 
 524  both precomposed and base+diacritic forms) match only if they have the
 525  same representation in both strings.
 526  At user option, processors may normalize such characters to
 527  some canonical form.
 528  No case folding is performed. 
 529  (Of strings and rules in the grammar:)  
 530  A string matches a grammatical production if it belongs to the
 531  language generated by that production.
 532  (Of content and content models:)
 533  An element matches its declaration when it conforms
 534  in the fashion described in the constraint
 535  <specref ref='elementvalid'/>.
 536  </termdef>
 537  </p></def>
 538  </gitem>
 539  <gitem>
 540  <label>for compatibility</label>
 541  <def><p><termdef id="dt-compat" term="For Compatibility">A feature of
 542  XML included solely to ensure that XML remains compatible with SGML.
 543  </termdef></p></def>
 544  </gitem>
 545  <gitem>
 546  <label>for interoperability</label>
 547  <def><p><termdef id="dt-interop" term="For interoperability">A
 548  non-binding recommendation included to increase the chances that XML
 549  documents can be processed by the existing installed base of SGML
 550  processors which predate the
 551  &WebSGML;.</termdef></p></def>
 552  </gitem>
 553  </glist>
 554  </p>
 555  </div2>
 556  
 557   
 558  </div1>
 559  <!-- &Docs; -->
 560   
 561  <div1 id='sec-documents'>
 562  <head>Documents</head>
 563   
 564  <p><termdef id="dt-xml-doc" term="XML Document">
 565  A data object is an
 566  <term>XML document</term> if it is
 567  <termref def="dt-wellformed">well-formed</termref>, as
 568  defined in this specification.
 569  A well-formed XML document may in addition be
 570  <termref def="dt-valid">valid</termref> if it meets certain further 
 571  constraints.</termdef></p>
 572   
 573  <p>Each XML document has both a logical and a physical structure.
 574  Physically, the document is composed of units called <termref
 575  def="dt-entity">entities</termref>.  An entity may <termref
 576  def="dt-entref">refer</termref> to other entities to cause their
 577  inclusion in the document. A document begins in a "root"  or <termref
 578  def="dt-docent">document entity</termref>.
 579  Logically, the document is composed of declarations, elements, 
 580  comments,
 581  character references, and
 582  processing
 583  instructions, all of which are indicated in the document by explicit
 584  markup.
 585  The logical and physical structures must nest properly, as described  
 586  in <specref ref='wf-entities'/>.
 587  </p>
 588   
 589  <div2 id='sec-well-formed'>
 590  <head>Well-Formed XML Documents</head>
 591   
 592  <p><termdef id="dt-wellformed" term="Well-Formed">
 593  A textual object is 
 594  a well-formed XML document if:</termdef>
 595  <olist>
 596  <item><p>Taken as a whole, it
 597  matches the production labeled <nt def='NT-document'>document</nt>.</p></item>
 598  <item><p>It
 599  meets all the well-formedness constraints given in this specification.</p>
 600  </item>
 601  <item><p>Each of the <termref def='dt-parsedent'>parsed entities</termref> 
 602  which is referenced directly or indirectly within the document is
 603  <titleref href='wf-entities'>well-formed</titleref>.</p></item>
 604  </olist></p>
 605  <p>
 606  <scrap lang='ebnf' id='document'>
 607  <head>Document</head>
 608  <prod id='NT-document'><lhs>document</lhs>
 609  <rhs><nt def='NT-prolog'>prolog</nt> 
 610  <nt def='NT-element'>element</nt> 
 611  <nt def='NT-Misc'>Misc</nt>*</rhs></prod>
 612  </scrap>
 613  </p>
 614  <p>Matching the <nt def="NT-document">document</nt> production 
 615  implies that:
 616  <olist>
 617  <item><p>It contains one or more
 618  <termref def="dt-element">elements</termref>.</p>
 619  </item>
 620  <!--* N.B. some readers (notably JC) find the following
 621  paragraph awkward and redundant.  I agree it's logically redundant:
 622  it *says* it is summarizing the logical implications of
 623  matching the grammar, and that means by definition it's
 624  logically redundant.  I don't think it's rhetorically
 625  redundant or unnecessary, though, so I'm keeping it.  It
 626  could however use some recasting when the editors are feeling
 627  stronger. -MSM *-->
 628  <item><p><termdef id="dt-root" term="Root Element">There is  exactly
 629  one element, called the <term>root</term>, or document element,  no
 630  part of which appears in the <termref
 631  def="dt-content">content</termref> of any other element.</termdef>
 632  For all other elements, if the start-tag is in the content of another
 633  element, the end-tag is in the content of the same element.  More
 634  simply stated, the elements, delimited by start- and end-tags, nest
 635  properly within each other.
 636  </p></item>
 637  </olist>
 638  </p>
 639  <p><termdef id="dt-parentchild" term="Parent/Child">As a consequence 
 640  of this,
 641  for each non-root element
 642  <code>C</code> in the document, there is one other element <code>P</code>
 643  in the document such that 
 644  <code>C</code> is in the content of <code>P</code>, but is not in
 645  the content of any other element that is in the content of
 646  <code>P</code>.  
 647  <code>P</code> is referred to as the
 648  <term>parent</term> of <code>C</code>, and <code>C</code> as a
 649  <term>child</term> of <code>P</code>.</termdef></p></div2>
 650   
 651  <div2 id="charsets">
 652  <head>Characters</head>
 653   
 654  <p><termdef id="dt-text" term="Text">A parsed entity contains
 655  <term>text</term>, a sequence of 
 656  <termref def="dt-character">characters</termref>, 
 657  which may represent markup or character data.</termdef> 
 658  <termdef id="dt-character" term="Character">A <term>character</term> 
 659  is an atomic unit of text as specified by
 660  ISO/IEC 10646 <bibref ref="ISO10646"/>.
 661  Legal characters are tab, carriage return, line feed, and the legal
 662  graphic characters of Unicode and ISO/IEC 10646.
 663  The use of "compatibility characters", as defined in section 6.8
 664  of <bibref ref='Unicode'/>, is discouraged.
 665  </termdef> 
 666  <scrap lang="ebnf" id="char32">
 667  <head>Character Range</head>
 668  <prodgroup pcw2="4" pcw4="17.5" pcw5="11">
 669  <prod id="NT-Char"><lhs>Char</lhs> 
 670  <rhs>#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] 
 671  | [#x10000-#x10FFFF]</rhs> 
 672  <com>any Unicode character, excluding the
 673  surrogate blocks, FFFE, and FFFF.</com> </prod>
 674  </prodgroup>
 675  </scrap>
 676  </p>
 677  
 678  <p>The mechanism for encoding character code points into bit patterns may
 679  vary from entity to entity. All XML processors must accept the UTF-8
 680  and UTF-16 encodings of 10646; the mechanisms for signaling which of
 681  the two is in use, or for bringing other encodings into play, are
 682  discussed later, in <specref ref='charencoding'/>.
 683  </p>
 684  <!--
 685  <p>Regardless of the specific encoding used, any character in the ISO/IEC
 686  10646 character set may be referred to by the decimal or hexadecimal
 687  equivalent of its 
 688  UCS-4 code value.
 689  </p>-->
 690  </div2>
 691   
 692  <div2 id='sec-common-syn'>
 693  <head>Common Syntactic Constructs</head>
 694   
 695  <p>This section defines some symbols used widely in the grammar.</p>
 696  <p><nt def="NT-S">S</nt> (white space) consists of one or more space (#x20)
 697  characters, carriage returns, line feeds, or tabs.
 698  
 699  <scrap lang="ebnf" id='white'>
 700  <head>White Space</head>
 701  <prodgroup pcw2="4" pcw4="17.5" pcw5="11">
 702  <prod id='NT-S'><lhs>S</lhs>
 703  <rhs>(#x20 | #x9 | #xD | #xA)+</rhs>
 704  </prod>
 705  </prodgroup>
 706  </scrap></p>
 707  <p>Characters are classified for convenience as letters, digits, or other
 708  characters.  Letters consist of an alphabetic or syllabic 
 709  base character possibly
 710  followed by one or more combining characters, or of an ideographic
 711  character.  
 712  Full definitions of the specific characters in each class
 713  are given in <specref ref='CharClasses'/>.</p>
 714  <p><termdef id="dt-name" term="Name">A <term>Name</term> is a token
 715  beginning with a letter or one of a few punctuation characters, and continuing
 716  with letters, digits, hyphens, underscores, colons, or full stops, together
 717  known as name characters.</termdef>
 718  Names beginning with the string "<code>xml</code>", or any string
 719  which would match <code>(('X'|'x') ('M'|'m') ('L'|'l'))</code>, are
 720  reserved for standardization in this or future versions of this
 721  specification.
 722  </p>
 723  <note>
 724  <p>The colon character within XML names is reserved for experimentation with
 725  name spaces.  
 726  Its meaning is expected to be
 727  standardized at some future point, at which point those documents 
 728  using the colon for experimental purposes may need to be updated.
 729  (There is no guarantee that any name-space mechanism
 730  adopted for XML will in fact use the colon as a name-space delimiter.)
 731  In practice, this means that authors should not use the colon in XML
 732  names except as part of name-space experiments, but that XML processors
 733  should accept the colon as a name character.</p>
 734  </note>
 735  <p>An
 736  <nt def='NT-Nmtoken'>Nmtoken</nt> (name token) is any mixture of
 737  name characters.
 738  <scrap lang='ebnf'>
 739  <head>Names and Tokens</head>
 740  <prod id='NT-NameChar'><lhs>NameChar</lhs>
 741  <rhs><nt def="NT-Letter">Letter</nt> 
 742  | <nt def='NT-Digit'>Digit</nt> 
 743  | '.' | '-' | '_' | ':'
 744  | <nt def='NT-CombiningChar'>CombiningChar</nt> 
 745  | <nt def='NT-Extender'>Extender</nt></rhs>
 746  </prod>
 747  <prod id='NT-Name'><lhs>Name</lhs>
 748  <rhs>(<nt def='NT-Letter'>Letter</nt> | '_' | ':')
 749  (<nt def='NT-NameChar'>NameChar</nt>)*</rhs></prod>
 750  <prod id='NT-Names'><lhs>Names</lhs>
 751  <rhs><nt def='NT-Name'>Name</nt> 
 752  (<nt def='NT-S'>S</nt> <nt def='NT-Name'>Name</nt>)*</rhs></prod>
 753  <prod id='NT-Nmtoken'><lhs>Nmtoken</lhs>
 754  <rhs>(<nt def='NT-NameChar'>NameChar</nt>)+</rhs></prod>
 755  <prod id='NT-Nmtokens'><lhs>Nmtokens</lhs>
 756  <rhs><nt def='NT-Nmtoken'>Nmtoken</nt> (<nt def='NT-S'>S</nt> <nt def='NT-Nmtoken'>Nmtoken</nt>)*</rhs></prod>
 757  </scrap>
 758  </p>
 759  <p>Literal data is any quoted string not containing
 760  the quotation mark used as a delimiter for that string.
 761  Literals are used
 762  for specifying the content of internal entities
 763  (<nt def='NT-EntityValue'>EntityValue</nt>),
 764  the values of attributes (<nt def='NT-AttValue'>AttValue</nt>), 
 765  and external identifiers 
 766  (<nt def="NT-SystemLiteral">SystemLiteral</nt>).  
 767  Note that a <nt def='NT-SystemLiteral'>SystemLiteral</nt>
 768  can be parsed without scanning for markup.
 769  <scrap lang='ebnf'>
 770  <head>Literals</head>
 771  <prod id='NT-EntityValue'><lhs>EntityValue</lhs>
 772  <rhs>'"' 
 773  ([^%&amp;"] 
 774  | <nt def='NT-PEReference'>PEReference</nt> 
 775  | <nt def='NT-Reference'>Reference</nt>)*
 776  '"' 
 777  </rhs>
 778  <rhs>|&nbsp; 
 779  "'" 
 780  ([^%&amp;'] 
 781  | <nt def='NT-PEReference'>PEReference</nt> 
 782  | <nt def='NT-Reference'>Reference</nt>)* 
 783  "'"</rhs>
 784  </prod>
 785  <prod id='NT-AttValue'><lhs>AttValue</lhs>
 786  <rhs>'"' 
 787  ([^&lt;&amp;"] 
 788  | <nt def='NT-Reference'>Reference</nt>)* 
 789  '"' 
 790  </rhs>
 791  <rhs>|&nbsp; 
 792  "'" 
 793  ([^&lt;&amp;'] 
 794  | <nt def='NT-Reference'>Reference</nt>)* 
 795  "'"</rhs>
 796  </prod>
 797  <prod id="NT-SystemLiteral"><lhs>SystemLiteral</lhs>
 798  <rhs>('"' [^"]* '"') |&nbsp;("'" [^']* "'")
 799  </rhs>
 800  </prod>
 801  <prod id="NT-PubidLiteral"><lhs>PubidLiteral</lhs>
 802  <rhs>'"' <nt def='NT-PubidChar'>PubidChar</nt>* 
 803  '"' 
 804  | "'" (<nt def='NT-PubidChar'>PubidChar</nt> - "'")* "'"</rhs>
 805  </prod>
 806  <prod id="NT-PubidChar"><lhs>PubidChar</lhs>
 807  <rhs>#x20 | #xD | #xA 
 808  |&nbsp;[a-zA-Z0-9]
 809  |&nbsp;[-'()+,./:=?;!*#@$_%]</rhs>
 810  </prod>
 811  </scrap>
 812  </p>
 813  
 814  </div2>
 815  
 816  <div2 id='syntax'>
 817  <head>Character Data and Markup</head>
 818   
 819  <p><termref def='dt-text'>Text</termref> consists of intermingled 
 820  <termref def="dt-chardata">character
 821  data</termref> and markup.
 822  <termdef id="dt-markup" term="Markup"><term>Markup</term> takes the form of
 823  <termref def="dt-stag">start-tags</termref>,
 824  <termref def="dt-etag">end-tags</termref>,
 825  <termref def="dt-empty">empty-element tags</termref>,
 826  <termref def="dt-entref">entity references</termref>,
 827  <termref def="dt-charref">character references</termref>,
 828  <termref def="dt-comment">comments</termref>,
 829  <termref def="dt-cdsection">CDATA section</termref> delimiters,
 830  <termref def="dt-doctype">document type declarations</termref>, and
 831  <termref def="dt-pi">processing instructions</termref>.
 832  </termdef>
 833  </p>
 834  <p><termdef id="dt-chardata" term="Character Data">All text that is not markup
 835  constitutes the <term>character data</term> of
 836  the document.</termdef></p>
 837  <p>The ampersand character (&amp;) and the left angle bracket (&lt;)
 838  may appear in their literal form <emph>only</emph> when used as markup
 839  delimiters, or within a <termref def="dt-comment">comment</termref>, a
 840  <termref def="dt-pi">processing instruction</termref>, 
 841  or a <termref def="dt-cdsection">CDATA section</termref>.  
 842  
 843  They are also legal within the <termref def='dt-litentval'>literal entity
 844  value</termref> of an internal entity declaration; see
 845  <specref ref='wf-entities'/>.
 846  <!-- FINAL EDIT:  restore internal entity decl or leave it out. -->
 847  If they are needed elsewhere,
 848  they must be <termref def="dt-escape">escaped</termref>
 849  using either <termref def='dt-charref'>numeric character references</termref>
 850  or the strings
 851  "<code>&amp;amp;</code>" and "<code>&amp;lt;</code>" respectively. 
 852  The right angle
 853  bracket (>) may be represented using the string
 854  "<code>&amp;gt;</code>", and must, <termref def='dt-compat'>for
 855  compatibility</termref>, 
 856  be escaped using
 857  "<code>&amp;gt;</code>" or a character reference 
 858  when it appears in the string
 859  "<code>]]&gt;</code>"
 860  in content, 
 861  when that string is not marking the end of 
 862  a <termref def="dt-cdsection">CDATA section</termref>. 
 863  </p>
 864  <p>
 865  In the content of elements, character data 
 866  is any string of characters which does
 867  not contain the start-delimiter of any markup.  
 868  In a CDATA section, character data
 869  is any string of characters not including the CDATA-section-close
 870  delimiter, "<code>]]&gt;</code>".</p>
 871  <p>
 872  To allow attribute values to contain both single and double quotes, the
 873  apostrophe or single-quote character (') may be represented as
 874  "<code>&amp;apos;</code>", and the double-quote character (") as
 875  "<code>&amp;quot;</code>".
 876  <scrap lang="ebnf">
 877  <head>Character Data</head>
 878  <prod id='NT-CharData'>
 879  <lhs>CharData</lhs>
 880  <rhs>[^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)</rhs>
 881  </prod>
 882  </scrap>
 883  </p>
 884  </div2>
 885   
 886  <div2 id='sec-comments'>
 887  <head>Comments</head>
 888   
 889  <p><termdef id="dt-comment" term="Comment"><term>Comments</term> may 
 890  appear anywhere in a document outside other 
 891  <termref def='dt-markup'>markup</termref>; in addition,
 892  they may appear within the document type declaration
 893  at places allowed by the grammar.
 894  They are not part of the document's <termref def="dt-chardata">character
 895  data</termref>; an XML
 896  processor may, but need not, make it possible for an application to
 897  retrieve the text of comments.
 898  <termref def="dt-compat">For compatibility</termref>, the string
 899  "<code>--</code>" (double-hyphen) must not occur within
 900  comments.
 901  <scrap lang="ebnf">
 902  <head>Comments</head>
 903  <prod id='NT-Comment'><lhs>Comment</lhs>
 904  <rhs>'&lt;!--'
 905  ((<nt def='NT-Char'>Char</nt> - '-') 
 906  | ('-' (<nt def='NT-Char'>Char</nt> - '-')))* 
 907  '-->'</rhs>
 908  </prod>
 909  </scrap>
 910  </termdef></p>
 911  <p>An example of a comment:
 912  <eg>&lt;!&como; declarations for &lt;head> &amp; &lt;body> &comc;&gt;</eg>
 913  </p>
 914  </div2>
 915   
 916  <div2 id='sec-pi'>
 917  <head>Processing Instructions</head>
 918   
 919  <p><termdef id="dt-pi" term="Processing instruction"><term>Processing
 920  instructions</term> (PIs) allow documents to contain instructions
 921  for applications.
 922   
 923  <scrap lang="ebnf">
 924  <head>Processing Instructions</head>
 925  <prod id='NT-PI'><lhs>PI</lhs>
 926  <rhs>'&lt;?' <nt def='NT-PITarget'>PITarget</nt> 
 927  (<nt def='NT-S'>S</nt> 
 928  (<nt def='NT-Char'>Char</nt>* - 
 929  (<nt def='NT-Char'>Char</nt>* &pic; <nt def='NT-Char'>Char</nt>*)))?
 930  &pic;</rhs></prod>
 931  <prod id='NT-PITarget'><lhs>PITarget</lhs>
 932  <rhs><nt def='NT-Name'>Name</nt> - 
 933  (('X' | 'x') ('M' | 'm') ('L' | 'l'))</rhs>
 934  </prod>
 935  </scrap></termdef>
 936  PIs are not part of the document's <termref def="dt-chardata">character
 937  data</termref>, but must be passed through to the application. The
 938  PI begins with a target (<nt def='NT-PITarget'>PITarget</nt>) used
 939  to identify the application to which the instruction is directed.  
 940  The target names "<code>XML</code>", "<code>xml</code>", and so on are
 941  reserved for standardization in this or future versions of this
 942  specification.
 943  The 
 944  XML <termref def='dt-notation'>Notation</termref> mechanism
 945  may be used for
 946  formal declaration of PI targets.
 947  </p>
 948  </div2>
 949   
 950  <div2 id='sec-cdata-sect'>
 951  <head>CDATA Sections</head>
 952   
 953  <p><termdef id="dt-cdsection" term="CDATA Section"><term>CDATA sections</term>
 954  may occur 
 955  anywhere character data may occur; they are
 956  used to escape blocks of text containing characters which would
 957  otherwise be recognized as markup.  CDATA sections begin with the
 958  string "<code>&lt;![CDATA[</code>" and end with the string
 959  "<code>]]&gt;</code>":
 960  <scrap lang="ebnf">
 961  <head>CDATA Sections</head>
 962  <prod id='NT-CDSect'><lhs>CDSect</lhs>
 963  <rhs><nt def='NT-CDStart'>CDStart</nt> 
 964  <nt def='NT-CData'>CData</nt> 
 965  <nt def='NT-CDEnd'>CDEnd</nt></rhs></prod>
 966  <prod id='NT-CDStart'><lhs>CDStart</lhs>
 967  <rhs>'&lt;![CDATA['</rhs>
 968  </prod>
 969  <prod id='NT-CData'><lhs>CData</lhs>
 970  <rhs>(<nt def='NT-Char'>Char</nt>* - 
 971  (<nt def='NT-Char'>Char</nt>* ']]&gt;' <nt def='NT-Char'>Char</nt>*))
 972  </rhs>
 973  </prod>
 974  <prod id='NT-CDEnd'><lhs>CDEnd</lhs>
 975  <rhs>']]&gt;'</rhs>
 976  </prod>
 977  </scrap>
 978  
 979  Within a CDATA section, only the <nt def='NT-CDEnd'>CDEnd</nt> string is
 980  recognized as markup, so that left angle brackets and ampersands may occur in
 981  their literal form; they need not (and cannot) be escaped using
 982  "<code>&amp;lt;</code>" and "<code>&amp;amp;</code>".  CDATA sections
 983  cannot nest.</termdef>
 984  </p>
 985  
 986  <p>An example of a CDATA section, in which "<code>&lt;greeting></code>" and 
 987  "<code>&lt;/greeting></code>"
 988  are recognized as <termref def='dt-chardata'>character data</termref>, not
 989  <termref def='dt-markup'>markup</termref>:
 990  <eg>&lt;![CDATA[&lt;greeting>Hello, world!&lt;/greeting>]]&gt;</eg>
 991  </p>
 992  </div2>
 993   
 994  <div2 id='sec-prolog-dtd'>
 995  <head>Prolog and Document Type Declaration</head>
 996   
 997  <p><termdef id='dt-xmldecl' term='XML Declaration'>XML documents 
 998  may, and should, 
 999  begin with an <term>XML declaration</term> which specifies
1000  the version of
1001  XML being used.</termdef>
1002  For example, the following is a complete XML document, <termref
1003  def="dt-wellformed">well-formed</termref> but not
1004  <termref def="dt-valid">valid</termref>:
1005  <eg><![CDATA[<?xml version="1.0"?>
1006  <greeting>Hello, world!</greeting>
1007  ]]></eg>
1008  and so is this:
1009  <eg><![CDATA[<greeting>Hello, world!</greeting>
1010  ]]></eg>
1011  </p>
1012  
1013  <p>The version number "<code>1.0</code>" should be used to indicate
1014  conformance to this version of this specification; it is an error
1015  for a document to use the value "<code>1.0</code>" 
1016  if it does not conform to this version of this specification.
1017  It is the intent
1018  of the XML working group to give later versions of this specification
1019  numbers other than "<code>1.0</code>", but this intent does not
1020  indicate a
1021  commitment to produce any future versions of XML, nor, if any are produced, to
1022  use any particular numbering scheme.
1023  Since future versions are not ruled out, this construct is provided 
1024  as a means to allow the possibility of automatic version recognition, should
1025  it become necessary.
1026  Processors may signal an error if they receive documents labeled with 
1027  versions they do not support. 
1028  </p>
1029  <p>The function of the markup in an XML document is to describe its
1030  storage and logical structure and to associate attribute-value pairs
1031  with its logical structures.  XML provides a mechanism, the <termref
1032  def="dt-doctype">document type declaration</termref>, to define
1033  constraints on the logical structure and to support the use of
1034  predefined storage units.
1035  
1036  <termdef id="dt-valid" term="Validity">An XML document is 
1037  <term>valid</term> if it has an associated document type
1038  declaration and if the document
1039  complies with the constraints expressed in it.</termdef></p>
1040  <p>The document type declaration must appear before
1041  the first <termref def="dt-element">element</termref> in the document.
1042  <scrap lang="ebnf" id='xmldoc'>
1043  <head>Prolog</head>
1044  <prodgroup pcw2="6" pcw4="17.5" pcw5="9">
1045  <prod id='NT-prolog'><lhs>prolog</lhs>
1046  <rhs><nt def='NT-XMLDecl'>XMLDecl</nt>? 
1047  <nt def='NT-Misc'>Misc</nt>* 
1048  (<nt def='NT-doctypedecl'>doctypedecl</nt> 
1049  <nt def='NT-Misc'>Misc</nt>*)?</rhs></prod>
1050  <prod id='NT-XMLDecl'><lhs>XMLDecl</lhs>
1051  <rhs>&xmlpio; 
1052  <nt def='NT-VersionInfo'>VersionInfo</nt> 
1053  <nt def='NT-EncodingDecl'>EncodingDecl</nt>? 
1054  <nt def='NT-SDDecl'>SDDecl</nt>? 
1055  <nt def="NT-S">S</nt>? 
1056  &pic;</rhs>
1057  </prod>
1058  <prod id='NT-VersionInfo'><lhs>VersionInfo</lhs>
1059  <rhs><nt def="NT-S">S</nt> 'version' <nt def='NT-Eq'>Eq</nt> 
1060  ("'" <nt def="NT-VersionNum">VersionNum</nt> "'" 
1061  | '"' <nt def="NT-VersionNum">VersionNum</nt> '"')</rhs>
1062  </prod>
1063  <prod id='NT-Eq'><lhs>Eq</lhs>
1064  <rhs><nt def='NT-S'>S</nt>? '=' <nt def='NT-S'>S</nt>?</rhs></prod>
1065  <prod id="NT-VersionNum">
1066  <lhs>VersionNum</lhs>
1067  <rhs>([a-zA-Z0-9_.:] | '-')+</rhs>
1068  </prod>
1069  <prod id='NT-Misc'><lhs>Misc</lhs>
1070  <rhs><nt def='NT-Comment'>Comment</nt> | <nt def='NT-PI'>PI</nt> | 
1071  <nt def='NT-S'>S</nt></rhs></prod>
1072  </prodgroup>
1073  </scrap></p>
1074  
1075  <p><termdef id="dt-doctype" term="Document Type Declaration">The XML
1076  <term>document type declaration</term> 
1077  contains or points to 
1078  <termref def='dt-markupdecl'>markup declarations</termref> 
1079  that provide a grammar for a
1080  class of documents.  
1081  This grammar is known as a document type definition,
1082  or <term>DTD</term>.  
1083  The document type declaration can point to an external subset (a
1084  special kind of 
1085  <termref def='dt-extent'>external entity</termref>) containing markup
1086  declarations, or can 
1087  contain the markup declarations directly in an internal subset, or can do
1088  both.   
1089  The DTD for a document consists of both subsets taken
1090  together.</termdef>
1091  </p>
1092  <p><termdef id="dt-markupdecl" term="markup declaration">
1093  A <term>markup declaration</term> is 
1094  an <termref def="dt-eldecl">element type declaration</termref>, 
1095  an <termref def="dt-attdecl">attribute-list declaration</termref>, 
1096  an <termref def="dt-entdecl">entity declaration</termref>, or
1097  a <termref def="dt-notdecl">notation declaration</termref>.
1098  </termdef>
1099  These declarations may be contained in whole or in part
1100  within <termref def='dt-PE'>parameter entities</termref>,
1101  as described in the well-formedness and validity constraints below.
1102  For fuller information, see
1103  <specref ref="sec-physical-struct"/>.</p>
1104  <scrap lang="ebnf" id='dtd'>
1105  <head>Document Type Definition</head>
1106  <prodgroup pcw2="6" pcw4="17.5" pcw5="9">
1107  <prod id='NT-doctypedecl'><lhs>doctypedecl</lhs>
1108  <rhs>'&lt;!DOCTYPE' <nt def='NT-S'>S</nt> 
1109  <nt def='NT-Name'>Name</nt> (<nt def='NT-S'>S</nt> 
1110  <nt def='NT-ExternalID'>ExternalID</nt>)? 
1111  <nt def='NT-S'>S</nt>? ('[' 
1112  (<nt def='NT-markupdecl'>markupdecl</nt> 
1113  | <nt def='NT-PEReference'>PEReference</nt> 
1114  | <nt def='NT-S'>S</nt>)*
1115  ']' 
1116  <nt def='NT-S'>S</nt>?)? '>'</rhs>
1117  <vc def="vc-roottype"/>
1118  </prod>
1119  <prod id='NT-markupdecl'><lhs>markupdecl</lhs>
1120  <rhs><nt def='NT-elementdecl'>elementdecl</nt> 
1121  | <nt def='NT-AttlistDecl'>AttlistDecl</nt> 
1122  | <nt def='NT-EntityDecl'>EntityDecl</nt> 
1123  | <nt def='NT-NotationDecl'>NotationDecl</nt> 
1124  | <nt def='NT-PI'>PI</nt> 
1125  | <nt def='NT-Comment'>Comment</nt>
1126  </rhs>
1127  <vc def='vc-PEinMarkupDecl'/>
1128  <wfc def="wfc-PEinInternalSubset"/>
1129  </prod>
1130  
1131  </prodgroup>
1132  </scrap>
1133  
1134  <p>The markup declarations may be made up in whole or in part of
1135  the <termref def='dt-repltext'>replacement text</termref> of 
1136  <termref def='dt-PE'>parameter entities</termref>.
1137  The productions later in this specification for
1138  individual nonterminals (<nt def='NT-elementdecl'>elementdecl</nt>,
1139  <nt def='NT-AttlistDecl'>AttlistDecl</nt>, and so on) describe 
1140  the declarations <emph>after</emph> all the parameter entities have been 
1141  <termref def='dt-include'>included</termref>.</p>
1142  
1143  <vcnote id="vc-roottype">
1144  <head>Root Element Type</head>
1145  <p>
1146  The <nt def='NT-Name'>Name</nt> in the document type declaration must
1147  match the element type of the <termref def='dt-root'>root element</termref>.
1148  </p>
1149  </vcnote>
1150  
1151  <vcnote id='vc-PEinMarkupDecl'>
1152  <head>Proper Declaration/PE Nesting</head>
1153  <p>Parameter-entity 
1154  <termref def='dt-repltext'>replacement text</termref> must be properly nested
1155  with markup declarations. 
1156  That is to say, if either the first character
1157  or the last character of a markup
1158  declaration (<nt def='NT-markupdecl'>markupdecl</nt> above)
1159  is contained in the replacement text for a 
1160  <termref def='dt-PERef'>parameter-entity reference</termref>,
1161  both must be contained in the same replacement text.</p>
1162  </vcnote>
1163  <wfcnote id="wfc-PEinInternalSubset">
1164  <head>PEs in Internal Subset</head>
1165  <p>In the internal DTD subset, 
1166  <termref def='dt-PERef'>parameter-entity references</termref>
1167  can occur only where markup declarations can occur, not
1168  within markup declarations.  (This does not apply to
1169  references that occur in
1170  external parameter entities or to the external subset.)
1171  </p>
1172  </wfcnote>
1173  <p>
1174  Like the internal subset, the external subset and 
1175  any external parameter entities referred to in the DTD 
1176  must consist of a series of complete markup declarations of the types 
1177  allowed by the non-terminal symbol
1178  <nt def="NT-markupdecl">markupdecl</nt>, interspersed with white space
1179  or <termref def="dt-PERef">parameter-entity references</termref>.
1180  However, portions of the contents
1181  of the 
1182  external subset or of external parameter entities may conditionally be ignored
1183  by using 
1184  the <termref def="dt-cond-section">conditional section</termref>
1185  construct; this is not allowed in the internal subset.
1186  
1187  <scrap id="ext-Subset">
1188  <head>External Subset</head>
1189  <prodgroup pcw2="6" pcw4="17.5" pcw5="9">
1190  <prod id='NT-extSubset'><lhs>extSubset</lhs>
1191  <rhs><nt def='NT-TextDecl'>TextDecl</nt>?
1192  <nt def='NT-extSubsetDecl'>extSubsetDecl</nt></rhs></prod>
1193  <prod id='NT-extSubsetDecl'><lhs>extSubsetDecl</lhs>
1194  <rhs>(
1195  <nt def='NT-markupdecl'>markupdecl</nt> 
1196  | <nt def='NT-conditionalSect'>conditionalSect</nt> 
1197  | <nt def='NT-PEReference'>PEReference</nt> 
1198  | <nt def='NT-S'>S</nt>
1199  )*</rhs>
1200  </prod>
1201  </prodgroup>
1202  </scrap></p>
1203  <p>The external subset and external parameter entities also differ 
1204  from the internal subset in that in them,
1205  <termref def="dt-PERef">parameter-entity references</termref>
1206  are permitted <emph>within</emph> markup declarations,
1207  not only <emph>between</emph> markup declarations.</p>
1208  <p>An example of an XML document with a document type declaration:
1209  <eg><![CDATA[<?xml version="1.0"?>
1210  <!DOCTYPE greeting SYSTEM "hello.dtd">
1211  <greeting>Hello, world!</greeting>
1212  ]]></eg>
1213  The <termref def="dt-sysid">system identifier</termref> 
1214  "<code>hello.dtd</code>" gives the URI of a DTD for the document.</p>
1215  <p>The declarations can also be given locally, as in this 
1216  example:
1217  <eg><![CDATA[<?xml version="1.0" encoding="UTF-8" ?>
1218  <!DOCTYPE greeting [
1219    <!ELEMENT greeting (#PCDATA)>
1220  ]>
1221  <greeting>Hello, world!</greeting>
1222  ]]></eg>
1223  If both the external and internal subsets are used, the 
1224  internal subset is considered to occur before the external subset.
1225  <!-- 'is considered to'? boo. whazzat mean? -->
1226  This has the effect that entity and attribute-list declarations in the
1227  internal subset take precedence over those in the external subset.
1228  </p>
1229  </div2>
1230   
1231  <div2 id='sec-rmd'>
1232  <head>Standalone Document Declaration</head>
1233  <p>Markup declarations can affect the content of the document,
1234  as passed from an <termref def="dt-xml-proc">XML processor</termref> 
1235  to an application; examples are attribute defaults and entity
1236  declarations.
1237  The standalone document declaration,
1238  which may appear as a component of the XML declaration, signals
1239  whether or not there are such declarations which appear external to 
1240  the <termref def='dt-docent'>document entity</termref>.
1241  <scrap lang="ebnf" id='fulldtd'>
1242  <head>Standalone Document Declaration</head>
1243  <prodgroup pcw2="4" pcw4="19.5" pcw5="9">
1244  <prod id='NT-SDDecl'><lhs>SDDecl</lhs>
1245  <rhs>
1246  <nt def="NT-S">S</nt> 
1247  'standalone' <nt def='NT-Eq'>Eq</nt> 
1248  (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
1249  </rhs>
1250  <vc def='vc-check-rmd'/></prod>
1251  </prodgroup>
1252  </scrap></p>
1253  <p>
1254  In a standalone document declaration, the value "<code>yes</code>" indicates
1255  that there 
1256  are no markup declarations external to the <termref def='dt-docent'>document
1257  entity</termref> (either in the DTD external subset, or in an
1258  external parameter entity referenced from the internal subset)
1259  which affect the information passed from the XML processor to
1260  the application.  
1261  The value "<code>no</code>" indicates that there are or may be such
1262  external markup declarations.
1263  Note that the standalone document declaration only 
1264  denotes the presence of external <emph>declarations</emph>; the presence, in a
1265  document, of 
1266  references to external <emph>entities</emph>, when those entities are
1267  internally declared, 
1268  does not change its standalone status.</p>
1269  <p>If there are no external markup declarations, the standalone document
1270  declaration has no meaning. 
1271  If there are external markup declarations but there is no standalone
1272  document declaration, the value "<code>no</code>" is assumed.</p>
1273  <p>Any XML document for which <code>standalone="no"</code> holds can 
1274  be converted algorithmically to a standalone document, 
1275  which may be desirable for some network delivery applications.</p>
1276  <vcnote id='vc-check-rmd'>
1277  <head>Standalone Document Declaration</head>
1278  <p>The standalone document declaration must have
1279  the value "<code>no</code>" if any external markup declarations
1280  contain declarations of:</p><ulist>
1281  <item><p>attributes with <termref def="dt-default">default</termref> values, if
1282  elements to which
1283  these attributes apply appear in the document without
1284  specifications of values for these attributes, or</p></item>
1285  <item><p>entities (other than &magicents;), 
1286  if <termref def="dt-entref">references</termref> to those
1287  entities appear in the document, or</p>
1288  </item>
1289  <item><p>attributes with values subject to
1290  <titleref href='AVNormalize'>normalization</titleref>, where the
1291  attribute appears in the document with a value which will
1292  change as a result of normalization, or</p>
1293  </item>
1294  <item>
1295  <p>element types with <termref def="dt-elemcontent">element content</termref>, 
1296  if white space occurs
1297  directly within any instance of those types.
1298  </p></item>
1299  </ulist>
1300  
1301  </vcnote>
1302  <p>An example XML declaration with a standalone document declaration:<eg
1303  >&lt;?xml version="&XML.version;" standalone='yes'?></eg></p>
1304  </div2>
1305  <div2 id='sec-white-space'>
1306  <head>White Space Handling</head>
1307  
1308  <p>In editing XML documents, it is often convenient to use "white space"
1309  (spaces, tabs, and blank lines, denoted by the nonterminal 
1310  <nt def='NT-S'>S</nt> in this specification) to
1311  set apart the markup for greater readability.  Such white space is typically
1312  not intended for inclusion in the delivered version of the document.
1313  On the other hand, "significant" white space that should be preserved in the
1314  delivered version is common, for example in poetry and
1315  source code.</p>
1316  <p>An <termref def='dt-xml-proc'>XML processor</termref> 
1317  must always pass all characters in a document that are not
1318  markup through to the application.   A <termref def='dt-validating'>
1319  validating XML processor</termref> must also inform the application
1320  which  of these characters constitute white space appearing
1321  in <termref def="dt-elemcontent">element content</termref>.
1322  </p>
1323  <p>A special <termref def='dt-attr'>attribute</termref> 
1324  named <kw>xml:space</kw> may be attached to an element
1325  to signal an intention that in that element,
1326  white space should be preserved by applications.
1327  In valid documents, this attribute, like any other, must be 
1328  <termref def="dt-attdecl">declared</termref> if it is used.
1329  When declared, it must be given as an 
1330  <termref def='dt-enumerated'>enumerated type</termref> whose only
1331  possible values are "<code>default</code>" and "<code>preserve</code>".
1332  For example:<eg><![CDATA[    <!ATTLIST poem   xml:space (default|preserve) 'preserve'>]]></eg></p>
1333  <p>The value "<code>default</code>" signals that applications'
1334  default white-space processing modes are acceptable for this element; the
1335  value "<code>preserve</code>" indicates the intent that applications preserve
1336  all the white space.
1337  This declared intent is considered to apply to all elements within the content
1338  of the element where it is specified, unless overridden with another instance
1339  of the <kw>xml:space</kw> attribute.
1340  </p>
1341  <p>The <termref def='dt-root'>root element</termref> of any document
1342  is considered to have signaled no intentions as regards application space
1343  handling, unless it provides a value for 
1344  this attribute or the attribute is declared with a default value.
1345  </p>
1346  
1347  </div2>
1348  <div2 id='sec-line-ends'>
1349  <head>End-of-Line Handling</head>
1350  <p>XML <termref def='dt-parsedent'>parsed entities</termref> are often stored in
1351  computer files which, for editing convenience, are organized into lines.
1352  These lines are typically separated by some combination of the characters
1353  carriage-return (#xD) and line-feed (#xA).</p>
1354  <p>To simplify the tasks of <termref def='dt-app'>applications</termref>,
1355  wherever an external parsed entity or the literal entity value
1356  of an internal parsed entity contains either the literal 
1357  two-character sequence "#xD#xA" or a standalone literal
1358  #xD, an <termref def='dt-xml-proc'>XML processor</termref> must 
1359  pass to the application the single character #xA.
1360  (This behavior can 
1361  conveniently be produced by normalizing all 
1362  line breaks to #xA on input, before parsing.)
1363  </p>
1364  </div2>
1365  <div2 id='sec-lang-tag'>
1366  <head>Language Identification</head>
1367  <p>In document processing, it is often useful to
1368  identify the natural or formal language 
1369  in which the content is 
1370  written.
1371  A special <termref def="dt-attr">attribute</termref> named
1372  <kw>xml:lang</kw> may be inserted in
1373  documents to specify the 
1374  language used in the contents and attribute values 
1375  of any element in an XML document.
1376  In valid documents, this attribute, like any other, must be 
1377  <termref def="dt-attdecl">declared</termref> if it is used.
1378  The values of the attribute are language identifiers as defined
1379  by <bibref ref="RFC1766"/>, "Tags for the Identification of Languages":
1380  <scrap lang='ebnf'>
1381  <head>Language Identification</head>
1382  <prod id='NT-LanguageID'><lhs>LanguageID</lhs>
1383  <rhs><nt def='NT-Langcode'>Langcode</nt> 
1384  ('-' <nt def='NT-Subcode'>Subcode</nt>)*</rhs></prod>
1385  <prod id='NT-Langcode'><lhs>Langcode</lhs>
1386  <rhs><nt def='NT-ISO639Code'>ISO639Code</nt> | 
1387  <nt def='NT-IanaCode'>IanaCode</nt> | 
1388  <nt def='NT-UserCode'>UserCode</nt></rhs>
1389  </prod>
1390  <prod id='NT-ISO639Code'><lhs>ISO639Code</lhs>
1391  <rhs>([a-z] | [A-Z]) ([a-z] | [A-Z])</rhs></prod>
1392  <prod id='NT-IanaCode'><lhs>IanaCode</lhs>
1393  <rhs>('i' | 'I') '-' ([a-z] | [A-Z])+</rhs></prod>
1394  <prod id='NT-UserCode'><lhs>UserCode</lhs>
1395  <rhs>('x' | 'X') '-' ([a-z] | [A-Z])+</rhs></prod>
1396  <prod id='NT-Subcode'><lhs>Subcode</lhs>
1397  <rhs>([a-z] | [A-Z])+</rhs></prod>
1398  </scrap>
1399  The <nt def='NT-Langcode'>Langcode</nt> may be any of the following:
1400  <ulist>
1401  <item><p>a two-letter language code as defined by 
1402  <bibref ref="ISO639"/>, "Codes
1403  for the representation of names of languages"</p></item>
1404  <item><p>a language identifier registered with the Internet
1405  Assigned Numbers Authority <bibref ref='IANA'/>; these begin with the 
1406  prefix "<code>i-</code>" (or "<code>I-</code>")</p></item>
1407  <item><p>a language identifier assigned by the user, or agreed on
1408  between parties in private use; these must begin with the
1409  prefix "<code>x-</code>" or "<code>X-</code>" in order to ensure that they do not conflict 
1410  with names later standardized or registered with IANA</p></item>
1411  </ulist></p>
1412  <p>There may be any number of <nt def='NT-Subcode'>Subcode</nt> segments; if
1413  the first 
1414  subcode segment exists and the Subcode consists of two 
1415  letters, then it must be a country code from 
1416  <bibref ref="ISO3166"/>, "Codes 
1417  for the representation of names of countries."
1418  If the first 
1419  subcode consists of more than two letters, it must be
1420  a subcode for the language in question registered with IANA,
1421  unless the <nt def='NT-Langcode'>Langcode</nt> begins with the prefix 
1422  "<code>x-</code>" or
1423  "<code>X-</code>". </p>
1424  <p>It is customary to give the language code in lower case, and
1425  the country code (if any) in upper case.
1426  Note that these values, unlike other names in XML documents,
1427  are case insensitive.</p>
1428  <p>For example:
1429  <eg><![CDATA[<p xml:lang="en">The quick brown fox jumps over the lazy dog.</p>
1430  <p xml:lang="en-GB">What colour is it?</p>
1431  <p xml:lang="en-US">What color is it?</p>
1432  <sp who="Faust" desc='leise' xml:lang="de">
1433    <l>Habe nun, ach! Philosophie,</l>
1434    <l>Juristerei, und Medizin</l>
1435    <l>und leider auch Theologie</l>
1436    <l>durchaus studiert mit heißem Bemüh'n.</l>
1437    </sp>]]></eg></p>
1438  <!--<p>The xml:lang value is considered to apply both to the contents of an
1439  element and 
1440  (unless otherwise via attribute default values) to the
1441  values of all of its attributes with free-text (CDATA) values.  -->
1442  <p>The intent declared with <kw>xml:lang</kw> is considered to apply to
1443  all attributes and content of the element where it is specified,
1444  unless overridden with an instance of <kw>xml:lang</kw>
1445  on another element within that content.</p>
1446  <!--
1447  If no
1448  value is specified for xml:lang on an element, and no default value is
1449  defined for it in the DTD, then the xml:lang attribute of any element
1450  takes the same value it has in the parent element, if any.  The two
1451  technical terms in the following example both have the same effective
1452  value for xml:lang:
1453  
1454    <p xml:lang="en">Here the keywords are
1455    <term xml:lang="en">shift</term> and
1456    <term>reduce</term>. ...</p>
1457  
1458  The application, not the XML processor, is responsible for this '
1459  inheritance' of attribute values.
1460  -->
1461  <p>A simple declaration for <kw>xml:lang</kw> might take
1462  the form
1463  <eg>xml:lang  NMTOKEN  #IMPLIED</eg>
1464  but specific default values may also be given, if appropriate.  In a
1465  collection of French poems for English students, with glosses and
1466  notes in English, the xml:lang attribute might be declared this way:
1467  <eg><![CDATA[    <!ATTLIST poem   xml:lang NMTOKEN 'fr'>
1468      <!ATTLIST gloss  xml:lang NMTOKEN 'en'>
1469      <!ATTLIST note   xml:lang NMTOKEN 'en'>]]></eg>
1470  </p>
1471  
1472  </div2>
1473  </div1>
1474  <!-- &Elements; -->
1475   
1476  <div1 id='sec-logical-struct'>
1477  <head>Logical Structures</head>
1478   
1479  <p><termdef id="dt-element" term="Element">Each <termref
1480  def="dt-xml-doc">XML document</termref> contains one or more
1481  <term>elements</term>, the boundaries of which are 
1482  either delimited by <termref def="dt-stag">start-tags</termref> 
1483  and <termref def="dt-etag">end-tags</termref>, or, for <termref
1484  def="dt-empty">empty</termref> elements, by an <termref
1485  def="dt-eetag">empty-element tag</termref>. Each element has a type,
1486  identified by name, sometimes called its "generic
1487  identifier" (GI), and may have a set of
1488  attribute specifications.</termdef>  Each attribute specification 
1489  has a <termref
1490  def="dt-attrname">name</termref> and a <termref
1491  def="dt-attrval">value</termref>.
1492  </p>
1493  <scrap lang='ebnf'><head>Element</head>
1494  <prod id='NT-element'><lhs>element</lhs>
1495  <rhs><nt def='NT-EmptyElemTag'>EmptyElemTag</nt></rhs>
1496  <rhs>| <nt def='NT-STag'>STag</nt> <nt def='NT-content'>content</nt> 
1497  <nt def='NT-ETag'>ETag</nt></rhs>
1498  <wfc def='GIMatch'/>
1499  <vc def='elementvalid'/>
1500  </prod>
1501  </scrap>
1502  <p>This specification does not constrain the semantics, use, or (beyond
1503  syntax) names of the element types and attributes, except that names
1504  beginning with a match to <code>(('X'|'x')('M'|'m')('L'|'l'))</code>
1505  are reserved for standardization in this or future versions of this
1506  specification.
1507  </p>
1508  <wfcnote id='GIMatch'>
1509  <head>Element Type Match</head>
1510  <p>
1511  The <nt def='NT-Name'>Name</nt> in an element's end-tag must match 
1512  the element type in
1513  the start-tag.
1514  </p>
1515  </wfcnote>
1516  <vcnote id='elementvalid'>
1517  <head>Element Valid</head>
1518  <p>An element is
1519  valid if
1520  there is a declaration matching 
1521  <nt def='NT-elementdecl'>elementdecl</nt> where the
1522  <nt def='NT-Name'>Name</nt> matches the element type, and
1523  one of the following holds:</p>
1524  <olist>
1525  <item><p>The declaration matches <kw>EMPTY</kw> and the element has no 
1526  <termref def='dt-content'>content</termref>.</p></item>
1527  <item><p>The declaration matches <nt def='NT-children'>children</nt> and
1528  the sequence of 
1529  <termref def="dt-parentchild">child elements</termref>
1530  belongs to the language generated by the regular expression in
1531  the content model, with optional white space (characters 
1532  matching the nonterminal <nt def='NT-S'>S</nt>) between each pair
1533  of child elements.</p></item>
1534  <item><p>The declaration matches <nt def='NT-Mixed'>Mixed</nt> and 
1535  the content consists of <termref def='dt-chardata'>character 
1536  data</termref> and <termref def='dt-parentchild'>child elements</termref>
1537  whose types match names in the content model.</p></item>
1538  <item><p>The declaration matches <kw>ANY</kw>, and the types
1539  of any <termref def='dt-parentchild'>child elements</termref> have
1540  been declared.</p></item>
1541  </olist>
1542  </vcnote>
1543  
1544  <div2 id='sec-starttags'>
1545  <head>Start-Tags, End-Tags, and Empty-Element Tags</head>
1546   
1547  <p><termdef id="dt-stag" term="Start-Tag">The beginning of every
1548  non-empty XML element is marked by a <term>start-tag</term>.
1549  <scrap lang='ebnf'>
1550  <head>Start-tag</head>
1551  <prodgroup pcw2="6" pcw4="15" pcw5="11.5">
1552  <prod id='NT-STag'><lhs>STag</lhs>
1553  <rhs>'&lt;' <nt def='NT-Name'>Name</nt> 
1554  (<nt def='NT-S'>S</nt> <nt def='NT-Attribute'>Attribute</nt>)* 
1555  <nt def='NT-S'>S</nt>? '>'</rhs>
1556  <wfc def="uniqattspec"/>
1557  </prod>
1558  <prod id='NT-Attribute'><lhs>Attribute</lhs>
1559  <rhs><nt def='NT-Name'>Name</nt> <nt def='NT-Eq'>Eq</nt> 
1560  <nt def='NT-AttValue'>AttValue</nt></rhs>
1561  <vc def='ValueType'/>
1562  <wfc def='NoExternalRefs'/>
1563  <wfc def='CleanAttrVals'/></prod>
1564  </prodgroup>
1565  </scrap>
1566  The <nt def='NT-Name'>Name</nt> in
1567  the start- and end-tags gives the 
1568  element's <term>type</term>.</termdef>
1569  <termdef id="dt-attr" term="Attribute">
1570  The <nt def='NT-Name'>Name</nt>-<nt def='NT-AttValue'>AttValue</nt> pairs are
1571  referred to as 
1572  the <term>attribute specifications</term> of the element</termdef>,
1573  <termdef id="dt-attrname" term="Attribute Name">with the 
1574  <nt def='NT-Name'>Name</nt> in each pair
1575  referred to as the <term>attribute name</term></termdef> and
1576  <termdef id="dt-attrval" term="Attribute Value">the content of the
1577  <nt def='NT-AttValue'>AttValue</nt> (the text between the
1578  <code>'</code> or <code>"</code> delimiters)
1579  as the <term>attribute value</term>.</termdef>
1580  </p>
1581  <wfcnote id='uniqattspec'>
1582  <head>Unique Att Spec</head>
1583  <p>
1584  No attribute name may appear more than once in the same start-tag
1585  or empty-element tag.
1586  </p>
1587  </wfcnote>
1588  <vcnote id='ValueType'>
1589  <head>Attribute Value Type</head>
1590  <p>
1591  The attribute must have been declared; the value must be of the type 
1592  declared for it.
1593  (For attribute types, see <specref ref='attdecls'/>.)
1594  </p>
1595  </vcnote>
1596  <wfcnote id='NoExternalRefs'>
1597  <head>No External Entity References</head>
1598  <p>
1599  Attribute values cannot contain direct or indirect entity references 
1600  to external entities.
1601  </p>
1602  </wfcnote>
1603  <wfcnote id='CleanAttrVals'>
1604  <head>No <code>&lt;</code> in Attribute Values</head>
1605  <p>The <termref def='dt-repltext'>replacement text</termref> of any entity
1606  referred to directly or indirectly in an attribute
1607  value (other than "<code>&amp;lt;</code>") must not contain
1608  a <code>&lt;</code>.
1609  </p></wfcnote>
1610  <p>An example of a start-tag:
1611  <eg>&lt;termdef id="dt-dog" term="dog"></eg></p>
1612  <p><termdef id="dt-etag" term="End Tag">The end of every element 
1613  that begins with a start-tag must
1614  be marked by an <term>end-tag</term>
1615  containing a name that echoes the element's type as given in the
1616  start-tag:
1617  <scrap lang='ebnf'>
1618  <head>End-tag</head>
1619  <prodgroup pcw2="6" pcw4="15" pcw5="11.5">
1620  <prod id='NT-ETag'><lhs>ETag</lhs>
1621  <rhs>'&lt;/' <nt def='NT-Name'>Name</nt> 
1622  <nt def='NT-S'>S</nt>? '>'</rhs></prod>
1623  </prodgroup>
1624  </scrap>
1625  </termdef></p>
1626  <p>An example of an end-tag:<eg>&lt;/termdef></eg></p>
1627  <p><termdef id="dt-content" term="Content">The 
1628  <termref def='dt-text'>text</termref> between the start-tag and
1629  end-tag is called the element's
1630  <term>content</term>:
1631  <scrap lang='ebnf'>
1632  <head>Content of Elements</head>
1633  <prodgroup pcw2="6" pcw4="15" pcw5="11.5">
1634  <prod id='NT-content'><lhs>content</lhs>
1635  <rhs>(<nt def='NT-element'>element</nt> | <nt def='NT-CharData'>CharData</nt> 
1636  | <nt def='NT-Reference'>Reference</nt> | <nt def='NT-CDSect'>CDSect</nt> 
1637  | <nt def='NT-PI'>PI</nt> | <nt def='NT-Comment'>Comment</nt>)*</rhs>
1638  </prod>
1639  </prodgroup>
1640  </scrap>
1641  </termdef></p>
1642  <p><termdef id="dt-empty" term="Empty">If an element is <term>empty</term>,
1643  it must be represented either by a start-tag immediately followed
1644  by an end-tag or by an empty-element tag.</termdef>
1645  <termdef id="dt-eetag" term="empty-element tag">An 
1646  <term>empty-element tag</term> takes a special form:
1647  <scrap lang='ebnf'>
1648  <head>Tags for Empty Elements</head>
1649  <prodgroup pcw2="6" pcw4="15" pcw5="11.5">
1650  <prod id='NT-EmptyElemTag'><lhs>EmptyElemTag</lhs>
1651  <rhs>'&lt;' <nt def='NT-Name'>Name</nt> (<nt def='NT-S'>S</nt> 
1652  <nt def='NT-Attribute'>Attribute</nt>)* <nt def='NT-S'>S</nt>? 
1653  '/&gt;'</rhs>
1654  <wfc def="uniqattspec"/>
1655  </prod>
1656  </prodgroup>
1657  </scrap>
1658  </termdef></p>
1659  <p>Empty-element tags may be used for any element which has no
1660  content, whether or not it is declared using the keyword
1661  <kw>EMPTY</kw>.
1662  <termref def='dt-interop'>For interoperability</termref>, the empty-element
1663  tag must be used, and can only be used, for elements which are
1664  <termref def='dt-eldecl'>declared</termref> <kw>EMPTY</kw>.</p>
1665  <p>Examples of empty elements:
1666  <eg>&lt;IMG align="left"
1667   src="http://www.w3.org/Icons/WWW/w3c_home" />
1668  &lt;br>&lt;/br>
1669  &lt;br/></eg></p>
1670  </div2>
1671   
1672  <div2 id='elemdecls'>
1673  <head>Element Type Declarations</head>
1674   
1675  <p>The <termref def="dt-element">element</termref> structure of an
1676  <termref def="dt-xml-doc">XML document</termref> may, for 
1677  <termref def="dt-valid">validation</termref> purposes, 
1678  be constrained
1679  using element type and attribute-list declarations.
1680  An element type declaration constrains the element's
1681  <termref def="dt-content">content</termref>.
1682  </p>
1683  
1684  <p>Element type declarations often constrain which element types can
1685  appear as <termref def="dt-parentchild">children</termref> of the element.
1686  At user option, an XML processor may issue a warning
1687  when a declaration mentions an element type for which no declaration
1688  is provided, but this is not an error.</p>
1689  <p><termdef id="dt-eldecl" term="Element Type declaration">An <term>element
1690  type declaration</term> takes the form:
1691  <scrap lang='ebnf'>
1692  <head>Element Type Declaration</head>
1693  <prodgroup pcw2="5.5" pcw4="18" pcw5="9">
1694  <prod id='NT-elementdecl'><lhs>elementdecl</lhs>
1695  <rhs>'&lt;!ELEMENT' <nt def='NT-S'>S</nt> 
1696  <nt def='NT-Name'>Name</nt> 
1697  <nt def='NT-S'>S</nt> 
1698  <nt def='NT-contentspec'>contentspec</nt>
1699  <nt def='NT-S'>S</nt>? '>'</rhs>
1700  <vc def='EDUnique'/></prod>
1701  <prod id='NT-contentspec'><lhs>contentspec</lhs>
1702  <rhs>'EMPTY' 
1703  | 'ANY' 
1704  | <nt def='NT-Mixed'>Mixed</nt> 
1705  | <nt def='NT-children'>children</nt>
1706  </rhs>
1707  </prod>
1708  </prodgroup>
1709  </scrap>
1710  where the <nt def='NT-Name'>Name</nt> gives the element type 
1711  being declared.</termdef>
1712  </p>
1713  
1714  <vcnote id='EDUnique'>
1715  <head>Unique Element Type Declaration</head>
1716  <p>
1717  No element type may be declared more than once.
1718  </p>
1719  </vcnote>
1720  
1721  <p>Examples of element type declarations:
1722  <eg>&lt;!ELEMENT br EMPTY>
1723  &lt;!ELEMENT p (#PCDATA|emph)* >
1724  &lt;!ELEMENT %name.para; %content.para; >
1725  &lt;!ELEMENT container ANY></eg></p>
1726   
1727  <div3 id='sec-element-content'>
1728  <head>Element Content</head>
1729   
1730  <p><termdef id='dt-elemcontent' term='Element content'>An element <termref
1731  def="dt-stag">type</termref> has
1732  <term>element content</term> when elements of that
1733  type must contain only <termref def='dt-parentchild'>child</termref> 
1734  elements (no character data), optionally separated by 
1735  white space (characters matching the nonterminal 
1736  <nt def='NT-S'>S</nt>).
1737  </termdef>
1738  In this case, the
1739  constraint includes a content model, a simple grammar governing
1740  the allowed types of the child
1741  elements and the order in which they are allowed to appear.  
1742  The grammar is built on
1743  content particles (<nt def='NT-cp'>cp</nt>s), which consist of names, 
1744  choice lists of content particles, or
1745  sequence lists of content particles:
1746  <scrap lang='ebnf'>
1747  <head>Element-content Models</head>
1748  <prodgroup pcw2="5.5" pcw4="16" pcw5="11">
1749  <prod id='NT-children'><lhs>children</lhs>
1750  <rhs>(<nt def='NT-choice'>choice</nt> 
1751  | <nt def='NT-seq'>seq</nt>) 
1752  ('?' | '*' | '+')?</rhs></prod>
1753  <prod id='NT-cp'><lhs>cp</lhs>
1754  <rhs>(<nt def='NT-Name'>Name</nt> 
1755  | <nt def='NT-choice'>choice</nt> 
1756  | <nt def='NT-seq'>seq</nt>) 
1757  ('?' | '*' | '+')?</rhs></prod>
1758  <prod id='NT-choice'><lhs>choice</lhs>
1759  <rhs>'(' <nt def='NT-S'>S</nt>? <nt def='NT-cp'>cp</nt> 
1760  ( <nt def='NT-S'>S</nt>? '|' <nt def='NT-S'>S</nt>? <nt def='NT-cp'>cp</nt> )*
1761  <nt def='NT-S'>S</nt>? ')'</rhs>
1762  <vc def='vc-PEinGroup'/></prod>
1763  <prod id='NT-seq'><lhs>seq</lhs>
1764  <rhs>'(' <nt def='NT-S'>S</nt>? <nt def='NT-cp'>cp</nt> 
1765  ( <nt def='NT-S'>S</nt>? ',' <nt def='NT-S'>S</nt>? <nt def='NT-cp'>cp</nt> )*
1766  <nt def='NT-S'>S</nt>? ')'</rhs>
1767  <vc def='vc-PEinGroup'/></prod>
1768  
1769  </prodgroup>
1770  </scrap>
1771  where each <nt def='NT-Name'>Name</nt> is the type of an element which may
1772  appear as a <termref def="dt-parentchild">child</termref>.  
1773  Any content
1774  particle in a choice list may appear in the <termref
1775  def="dt-elemcontent">element content</termref> at the location where
1776  the choice list appears in the grammar;
1777  content particles occurring in a sequence list must each
1778  appear in the <termref def="dt-elemcontent">element content</termref> in the
1779  order given in the list.  
1780  The optional character following a name or list governs
1781  whether the element or the content particles in the list may occur one
1782  or more (<code>+</code>), zero or more (<code>*</code>), or zero or 
1783  one times (<code>?</code>).  
1784  The absence of such an operator means that the element or content particle
1785  must appear exactly once.
1786  This syntax
1787  and meaning are identical to those used in the productions in this
1788  specification.</p>
1789  <p>
1790  The content of an element matches a content model if and only if it is
1791  possible to trace out a path through the content model, obeying the
1792  sequence, choice, and repetition operators and matching each element in
1793  the content against an element type in the content model.  <termref
1794  def='dt-compat'>For compatibility</termref>, it is an error
1795  if an element in the document can
1796  match more than one occurrence of an element type in the content model.
1797  For more information, see <specref ref="determinism"/>.
1798  <!-- appendix <specref ref="determinism"/>. -->
1799  <!-- appendix on deterministic content models. -->
1800  </p>
1801  <vcnote id='vc-PEinGroup'>
1802  <head>Proper Group/PE Nesting</head>
1803  <p>Parameter-entity 
1804  <termref def='dt-repltext'>replacement text</termref> must be properly nested
1805  with parenthesized groups.
1806  That is to say, if either of the opening or closing parentheses
1807  in a <nt def='NT-choice'>choice</nt>, <nt def='NT-seq'>seq</nt>, or
1808  <nt def='NT-Mixed'>Mixed</nt> construct 
1809  is contained in the replacement text for a 
1810  <termref def='dt-PERef'>parameter entity</termref>,
1811  both must be contained in the same replacement text.</p>
1812  <p><termref def='dt-interop'>For interoperability</termref>, 
1813  if a parameter-entity reference appears in a 
1814  <nt def='NT-choice'>choice</nt>, <nt def='NT-seq'>seq</nt>, or
1815  <nt def='NT-Mixed'>Mixed</nt> construct, its replacement text
1816  should not be empty, and 
1817  neither the first nor last non-blank
1818  character of the replacement text should be a connector 
1819  (<code>|</code> or <code>,</code>).
1820  </p>
1821  </vcnote>
1822  <p>Examples of element-content models:
1823  <eg>&lt;!ELEMENT spec (front, body, back?)>
1824  &lt;!ELEMENT div1 (head, (p | list | note)*, div2*)>
1825  &lt;!ELEMENT dictionary-body (%div.mix; | %dict.mix;)*></eg></p>
1826  </div3>
1827  
1828  <div3 id='sec-mixed-content'>
1829  <head>Mixed Content</head>
1830   
1831  <p><termdef id='dt-mixed' term='Mixed Content'>An element 
1832  <termref def='dt-stag'>type</termref> has 
1833  <term>mixed content</term> when elements of that type may contain
1834  character data, optionally interspersed with
1835  <termref def="dt-parentchild">child</termref> elements.</termdef>
1836  In this case, the types of the child elements
1837  may be constrained, but not their order or their number of occurrences:
1838  <scrap lang='ebnf'>
1839  <head>Mixed-content Declaration</head>
1840  <prodgroup pcw2="5.5" pcw4="16" pcw5="11">
1841  <prod id='NT-Mixed'><lhs>Mixed</lhs>
1842  <rhs>'(' <nt def='NT-S'>S</nt>? 
1843  '#PCDATA'
1844  (<nt def='NT-S'>S</nt>? 
1845  '|' 
1846  <nt def='NT-S'>S</nt>? 
1847  <nt def='NT-Name'>Name</nt>)* 
1848  <nt def='NT-S'>S</nt>? 
1849  ')*' </rhs>
1850  <rhs>| '(' <nt def='NT-S'>S</nt>? '#PCDATA' <nt def='NT-S'>S</nt>? ')'
1851  </rhs><vc def='vc-PEinGroup'/>
1852  <vc def='vc-MixedChildrenUnique'/>
1853  </prod>
1854  
1855  </prodgroup>
1856  </scrap>
1857  where the <nt def='NT-Name'>Name</nt>s give the types of elements
1858  that may appear as children.
1859  </p>
1860  <vcnote id='vc-MixedChildrenUnique'>
1861  <head>No Duplicate Types</head>
1862  <p>The same name must not appear more than once in a single mixed-content
1863  declaration.
1864  </p></vcnote>
1865  <p>Examples of mixed content declarations:
1866  <eg>&lt;!ELEMENT p (#PCDATA|a|ul|b|i|em)*>
1867  &lt;!ELEMENT p (#PCDATA | %font; | %phrase; | %special; | %form;)* >
1868  &lt;!ELEMENT b (#PCDATA)></eg></p>
1869  </div3>
1870  </div2>
1871   
1872  <div2 id='attdecls'>
1873  <head>Attribute-List Declarations</head>
1874   
1875  <p><termref def="dt-attr">Attributes</termref> are used to associate
1876  name-value pairs with <termref def="dt-element">elements</termref>.
1877  Attribute specifications may appear only within <termref
1878  def="dt-stag">start-tags</termref>
1879  and <termref def="dt-eetag">empty-element tags</termref>; 
1880  thus, the productions used to
1881  recognize them appear in <specref ref='sec-starttags'/>.  
1882  Attribute-list
1883  declarations may be used:
1884  <ulist>
1885  <item><p>To define the set of attributes pertaining to a given
1886  element type.</p></item>
1887  <item><p>To establish type constraints for these
1888  attributes.</p></item>
1889  <item><p>To provide <termref def="dt-default">default values</termref>
1890  for attributes.</p></item>
1891  </ulist>
1892  </p>
1893  <p><termdef id="dt-attdecl" term="Attribute-List Declaration">
1894  <term>Attribute-list declarations</term> specify the name, data type, and default
1895  value (if any) of each attribute associated with a given element type:
1896  <scrap lang='ebnf'>
1897  <head>Attribute-list Declaration</head>
1898  <prod id='NT-AttlistDecl'><lhs>AttlistDecl</lhs>
1899  <rhs>'&lt;!ATTLIST' <nt def='NT-S'>S</nt> 
1900  <nt def='NT-Name'>Name</nt> 
1901  <nt def='NT-AttDef'>AttDef</nt>*
1902  <nt def='NT-S'>S</nt>? '&gt;'</rhs>
1903  </prod>
1904  <prod id='NT-AttDef'><lhs>AttDef</lhs>
1905  <rhs><nt def='NT-S'>S</nt> <nt def='NT-Name'>Name</nt> 
1906  <nt def='NT-S'>S</nt> <nt def='NT-AttType'>AttType</nt> 
1907  <nt def='NT-S'>S</nt> <nt def='NT-DefaultDecl'>DefaultDecl</nt></rhs>
1908  </prod>
1909  </scrap>
1910  The <nt def="NT-Name">Name</nt> in the
1911  <nt def='NT-AttlistDecl'>AttlistDecl</nt> rule is the type of an element.  At
1912  user option, an XML processor may issue a warning if attributes are
1913  declared for an element type not itself declared, but this is not an
1914  error.  The <nt def='NT-Name'>Name</nt> in the 
1915  <nt def='NT-AttDef'>AttDef</nt> rule is
1916  the name of the attribute.</termdef></p>
1917  <p>
1918  When more than one <nt def='NT-AttlistDecl'>AttlistDecl</nt> is provided for a
1919  given element type, the contents of all those provided are merged.  When
1920  more than one definition is provided for the same attribute of a
1921  given element type, the first declaration is binding and later
1922  declarations are ignored.  
1923  <termref def='dt-interop'>For interoperability,</termref> writers of DTDs
1924  may choose to provide at most one attribute-list declaration
1925  for a given element type, at most one attribute definition
1926  for a given attribute name, and at least one attribute definition
1927  in each attribute-list declaration.
1928  For interoperability, an XML processor may at user option
1929  issue a warning when more than one attribute-list declaration is
1930  provided for a given element type, or more than one attribute definition
1931  is provided 
1932  for a given attribute, but this is not an error.
1933  </p>
1934  
1935  <div3 id='sec-attribute-types'>
1936  <head>Attribute Types</head>
1937   
1938  <p>XML attribute types are of three kinds:  a string type, a
1939  set of tokenized types, and enumerated types.  The string type may take
1940  any literal string as a value; the tokenized types have varying lexical
1941  and semantic constraints, as noted:
1942  <scrap lang='ebnf'>
1943  <head>Attribute Types</head>
1944  <prodgroup pcw4="14" pcw5="11.5">
1945  <prod id='NT-AttType'><lhs>AttType</lhs>
1946  <rhs><nt def='NT-StringType'>StringType</nt> 
1947  | <nt def='NT-TokenizedType'>TokenizedType</nt> 
1948  | <nt def='NT-EnumeratedType'>EnumeratedType</nt>
1949  </rhs>
1950  </prod>
1951  <prod id='NT-StringType'><lhs>StringType</lhs>
1952  <rhs>'CDATA'</rhs>
1953  </prod>
1954  <prod id='NT-TokenizedType'><lhs>TokenizedType</lhs>
1955  <rhs>'ID'</rhs>
1956  <vc def='id'/>
1957  <vc def='one-id-per-el'/>
1958  <vc def='id-default'/>
1959  <rhs>| 'IDREF'</rhs>
1960  <vc def='idref'/>
1961  <rhs>| 'IDREFS'</rhs>
1962  <vc def='idref'/>
1963  <rhs>| 'ENTITY'</rhs>
1964  <vc def='entname'/>
1965  <rhs>| 'ENTITIES'</rhs>
1966  <vc def='entname'/>
1967  <rhs>| 'NMTOKEN'</rhs>
1968  <vc def='nmtok'/>
1969  <rhs>| 'NMTOKENS'</rhs>
1970  <vc def='nmtok'/></prod>
1971  </prodgroup>
1972  </scrap>
1973  </p>
1974  <vcnote id='id' >
1975  <head>ID</head>
1976  <p>
1977  Values of type <kw>ID</kw> must match the 
1978  <nt def='NT-Name'>Name</nt> production.  
1979  A name must not appear more than once in
1980  an XML document as a value of this type; i.e., ID values must uniquely
1981  identify the elements which bear them.   
1982  </p>
1983  </vcnote>
1984  <vcnote id='one-id-per-el'>
1985  <head>One ID per Element Type</head>
1986  <p>No element type may have more than one ID attribute specified.</p>
1987  </vcnote>
1988  <vcnote id='id-default'>
1989  <head>ID Attribute Default</head>
1990  <p>An ID attribute must have a declared default of <kw>#IMPLIED</kw> or
1991  <kw>#REQUIRED</kw>.</p>
1992  </vcnote>
1993  <vcnote id='idref'>
1994  <head>IDREF</head>
1995  <p>
1996  Values of type <kw>IDREF</kw> must match
1997  the <nt def="NT-Name">Name</nt> production, and
1998  values of type <kw>IDREFS</kw> must match
1999  <nt def="NT-Names">Names</nt>; 
2000  each <nt def='NT-Name'>Name</nt> must match the value of an ID attribute on 
2001  some element in the XML document; i.e., <kw>IDREF</kw> values must 
2002  match the value of some ID attribute. 
2003  </p>
2004  </vcnote>
2005  <vcnote id='entname'>
2006  <head>Entity Name</head>
2007  <p>
2008  Values of type <kw>ENTITY</kw> 
2009  must match the <nt def="NT-Name">Name</nt> production, and
2010  values of type <kw>ENTITIES</kw> must match
2011  <nt def="NT-Names">Names</nt>;
2012  each <nt def="NT-Name">Name</nt> must 
2013  match the
2014  name of an <termref def="dt-unparsed">unparsed entity</termref> declared in the
2015  <termref def="dt-doctype">DTD</termref>.
2016  </p>
2017  </vcnote>
2018  <vcnote id='nmtok'>
2019  <head>Name Token</head>
2020  <p>
2021  Values of type <kw>NMTOKEN</kw> must match the
2022  <nt def="NT-Nmtoken">Nmtoken</nt> production;
2023  values of type <kw>NMTOKENS</kw> must 
2024  match <nt def="NT-Nmtokens">Nmtokens</nt>.
2025  </p>
2026  </vcnote>
2027  <!-- why?
2028  <p>The XML processor must normalize attribute values before
2029  passing them to the application, as described in 
2030  <specref ref="AVNormalize"/>.</p>-->
2031  <p><termdef id='dt-enumerated' term='Enumerated Attribute
2032  Values'><term>Enumerated attributes</term> can take one 
2033  of a list of values provided in the declaration</termdef>. There are two
2034  kinds of enumerated types:
2035  <scrap lang='ebnf'>
2036  <head>Enumerated Attribute Types</head>
2037  <prod id='NT-EnumeratedType'><lhs>EnumeratedType</lhs> 
2038  <rhs><nt def='NT-NotationType'>NotationType</nt> 
2039  | <nt def='NT-Enumeration'>Enumeration</nt>
2040  </rhs></prod>
2041  <prod id='NT-NotationType'><lhs>NotationType</lhs> 
2042  <rhs>'NOTATION' 
2043  <nt def='NT-S'>S</nt> 
2044  '(' 
2045  <nt def='NT-S'>S</nt>?  
2046  <nt def='NT-Name'>Name</nt> 
2047  (<nt def='NT-S'>S</nt>? '|' <nt def='NT-S'>S</nt>?  
2048  <nt def='NT-Name'>Name</nt>)*
2049  <nt def='NT-S'>S</nt>? ')'
2050  </rhs>
2051  <vc def='notatn' /></prod>
2052  <prod id='NT-Enumeration'><lhs>Enumeration</lhs> 
2053  <rhs>'(' <nt def='NT-S'>S</nt>?
2054  <nt def='NT-Nmtoken'>Nmtoken</nt> 
2055  (<nt def='NT-S'>S</nt>? '|' 
2056  <nt def='NT-S'>S</nt>?  
2057  <nt def='NT-Nmtoken'>Nmtoken</nt>)* 
2058  <nt def='NT-S'>S</nt>? 
2059  ')'</rhs> 
2060  <vc def='enum'/></prod>
2061  </scrap>
2062  A <kw>NOTATION</kw> attribute identifies a 
2063  <termref def='dt-notation'>notation</termref>, declared in the 
2064  DTD with associated system and/or public identifiers, to
2065  be used in interpreting the element to which the attribute
2066  is attached.
2067  </p>
2068  
2069  <vcnote id='notatn'>
2070  <head>Notation Attributes</head>
2071  <p>
2072  Values of this type must match
2073  one of the <titleref href='Notations'>notation</titleref> names included in
2074  the declaration; all notation names in the declaration must
2075  be declared.
2076  </p>
2077  </vcnote>
2078  <vcnote id='enum'>
2079  <head>Enumeration</head>
2080  <p>
2081  Values of this type
2082  must match one of the <nt def='NT-Nmtoken'>Nmtoken</nt> tokens in the
2083  declaration. 
2084  </p>
2085  </vcnote>
2086  <p><termref def='dt-interop'>For interoperability,</termref> the same
2087  <nt def='NT-Nmtoken'>Nmtoken</nt> should not occur more than once in the
2088  enumerated attribute types of a single element type.
2089  </p>
2090  </div3>
2091  
2092  <div3 id='sec-attr-defaults'>
2093  <head>Attribute Defaults</head>
2094   
2095  <p>An <termref def="dt-attdecl">attribute declaration</termref> provides
2096  information on whether
2097  the attribute's presence is required, and if not, how an XML processor should
2098  react if a declared attribute is absent in a document.
2099  <scrap lang='ebnf'>
2100  <head>Attribute Defaults</head>
2101  <prodgroup pcw4="14" pcw5="11.5">
2102  <prod id='NT-DefaultDecl'><lhs>DefaultDecl</lhs>
2103  <rhs>'#REQUIRED' 
2104  |&nbsp;'#IMPLIED' </rhs>
2105  <rhs>| (('#FIXED' <nt def='NT-S'>S</nt>)? <nt def='NT-AttValue'>AttValue</nt>)</rhs>
2106  <vc def='RequiredAttr'/>
2107  <vc def='defattrvalid'/>
2108  <wfc def="CleanAttrVals"/>
2109  <vc def='FixedAttr'/>
2110  </prod>
2111  </prodgroup>
2112  </scrap>
2113  
2114  </p>
2115  <p>In an attribute declaration, <kw>#REQUIRED</kw> means that the
2116  attribute must always be provided, <kw>#IMPLIED</kw> that no default 
2117  value is provided.
2118  <!-- not any more!!
2119  <kw>#IMPLIED</kw> means that if the attribute is omitted
2120  from an element of this type,
2121  the XML processor must inform the application
2122  that no value was specified; no constraint is placed on the behavior
2123  of the application. -->
2124  <termdef id="dt-default" term="Attribute Default">If the 
2125  declaration
2126  is neither <kw>#REQUIRED</kw> nor <kw>#IMPLIED</kw>, then the
2127  <nt def='NT-AttValue'>AttValue</nt> value contains the declared
2128  <term>default</term> value; the <kw>#FIXED</kw> keyword states that
2129  the attribute must always have the default value.
2130  If a default value
2131  is declared, when an XML processor encounters an omitted attribute, it
2132  is to behave as though the attribute were present with 
2133  the declared default value.</termdef></p>
2134  <vcnote id='RequiredAttr'>
2135  <head>Required Attribute</head>
2136  <p>If the default declaration is the keyword <kw>#REQUIRED</kw>, then
2137  the attribute must be specified for
2138  all elements of the type in the attribute-list declaration.
2139  </p></vcnote>
2140  <vcnote id='defattrvalid'>
2141  <head>Attribute Default Legal</head>
2142  <p>
2143  The declared
2144  default value must meet the lexical constraints of the declared attribute type.
2145  </p>
2146  </vcnote>
2147  <vcnote id='FixedAttr'>
2148  <head>Fixed Attribute Default</head>
2149  <p>If an attribute has a default value declared with the 
2150  <kw>#FIXED</kw> keyword, instances of that attribute must
2151  match the default value.
2152  </p></vcnote>
2153  
2154  <p>Examples of attribute-list declarations:
2155  <eg>&lt;!ATTLIST termdef
2156            id      ID      #REQUIRED
2157            name    CDATA   #IMPLIED>
2158  &lt;!ATTLIST list
2159            type    (bullets|ordered|glossary)  "ordered">
2160  &lt;!ATTLIST form
2161            method  CDATA   #FIXED "POST"></eg></p>
2162  </div3>
2163  <div3 id='AVNormalize'>
2164  <head>Attribute-Value Normalization</head>
2165  <p>Before the value of an attribute is passed to the application
2166  or checked for validity, the
2167  XML processor must normalize it as follows:
2168  <ulist>
2169  <item><p>a character reference is processed by appending the referenced    
2170  character to the attribute value</p></item>
2171  <item><p>an entity reference is processed by recursively processing the
2172  replacement text of the entity</p></item>
2173  <item><p>a whitespace character (#x20, #xD, #xA, #x9) is processed by
2174  appending #x20 to the normalized value, except that only a single #x20
2175  is appended for a "#xD#xA" sequence that is part of an external
2176  parsed entity or the literal entity value of an internal parsed
2177  entity</p></item>
2178  <item><p>other characters are processed by appending them to the normalized
2179  value</p>
2180  </item></ulist>
2181  </p>
2182  <p>If the declared value is not CDATA, then the XML processor must
2183  further process the normalized attribute value by discarding any
2184  leading and trailing space (#x20) characters, and by replacing
2185  sequences of space (#x20) characters by a single space (#x20)
2186  character.</p>
2187  <p>
2188  All attributes for which no declaration has been read should be treated
2189  by a non-validating processor as if declared
2190  <kw>CDATA</kw>.
2191  </p>
2192  </div3>
2193  </div2>
2194  <div2 id='sec-condition-sect'>
2195  <head>Conditional Sections</head>
2196  <p><termdef id='dt-cond-section' term='conditional section'>
2197  <term>Conditional sections</term> are portions of the
2198  <termref def='dt-doctype'>document type declaration external subset</termref>
2199  which are 
2200  included in, or excluded from, the logical structure of the DTD based on
2201  the keyword which governs them.</termdef>
2202  <scrap lang='ebnf'>
2203  <head>Conditional Section</head>
2204  <prodgroup pcw2="9" pcw4="14.5">
2205  <prod id='NT-conditionalSect'><lhs>conditionalSect</lhs>
2206  <rhs><nt def='NT-includeSect'>includeSect</nt>
2207  | <nt def='NT-ignoreSect'>ignoreSect</nt>
2208  </rhs>
2209  </prod>
2210  <prod id='NT-includeSect'><lhs>includeSect</lhs>
2211  <rhs>'&lt;![' S? 'INCLUDE' S? '[' 
2212  
2213  <nt def="NT-extSubsetDecl">extSubsetDecl</nt>
2214  ']]&gt;'
2215  </rhs>
2216  </prod>
2217  <prod id='NT-ignoreSect'><lhs>ignoreSect</lhs>
2218  <rhs>'&lt;![' S? 'IGNORE' S? '[' 
2219  <nt def="NT-ignoreSectContents">ignoreSectContents</nt>*
2220  ']]&gt;'</rhs>
2221  </prod>
2222  
2223  <prod id='NT-ignoreSectContents'><lhs>ignoreSectContents</lhs>
2224  <rhs><nt def='NT-Ignore'>Ignore</nt>
2225  ('&lt;![' <nt def='NT-ignoreSectContents'>ignoreSectContents</nt> ']]&gt;' 
2226  <nt def='NT-Ignore'>Ignore</nt>)*</rhs></prod>
2227  <prod id='NT-Ignore'><lhs>Ignore</lhs>
2228  <rhs><nt def='NT-Char'>Char</nt>* - 
2229  (<nt def='NT-Char'>Char</nt>* ('&lt;![' | ']]&gt;') 
2230  <nt def='NT-Char'>Char</nt>*)
2231  </rhs></prod>
2232  
2233  </prodgroup>
2234  </scrap>
2235  </p>
2236  <p>Like the internal and external DTD subsets, a conditional section
2237  may contain one or more complete declarations,
2238  comments, processing instructions, 
2239  or nested conditional sections, intermingled with white space.
2240  </p>
2241  <p>If the keyword of the
2242  conditional section is <kw>INCLUDE</kw>, then the contents of the conditional
2243  section are part of the DTD.
2244  If the keyword of the conditional
2245  section is <kw>IGNORE</kw>, then the contents of the conditional section are
2246  not logically part of the DTD.
2247  Note that for reliable parsing, the contents of even ignored
2248  conditional sections must be read in order to
2249  detect nested conditional sections and ensure that the end of the
2250  outermost (ignored) conditional section is properly detected.
2251  If a conditional section with a
2252  keyword of <kw>INCLUDE</kw> occurs within a larger conditional
2253  section with a keyword of <kw>IGNORE</kw>, both the outer and the
2254  inner conditional sections are ignored.</p>
2255  <p>If the keyword of the conditional section is a 
2256  parameter-entity reference, the parameter entity must be replaced by its
2257  content before the processor decides whether to
2258  include or ignore the conditional section.</p>
2259  <p>An example:
2260  <eg>&lt;!ENTITY % draft 'INCLUDE' >
2261  &lt;!ENTITY % final 'IGNORE' >
2262   
2263  &lt;![%draft;[
2264  &lt;!ELEMENT book (comments*, title, body, supplements?)>
2265  ]]&gt;
2266  &lt;![%final;[
2267  &lt;!ELEMENT book (title, body, supplements?)>
2268  ]]&gt;
2269  </eg>
2270  </p>
2271  </div2>
2272  
2273  
2274  <!-- 
2275  <div2 id='sec-pass-to-app'>
2276  <head>XML Processor Treatment of Logical Structure</head>
2277  <p>When an XML processor encounters a start-tag, it must make
2278  at least the following information available to the application:
2279  <ulist>
2280  <item>
2281  <p>the element type's generic identifier</p>
2282  </item>
2283  <item>
2284  <p>the names of attributes known to apply to this element type
2285  (validating processors must make available names of all attributes
2286  declared for the element type; non-validating processors must
2287  make available at least the names of the attributes for which
2288  values are specified).
2289  </p>
2290  </item>
2291  </ulist>
2292  </p>
2293  </div2>
2294  --> 
2295  
2296  </div1>
2297  <!-- &Entities; -->
2298   
2299  <div1 id='sec-physical-struct'>
2300  <head>Physical Structures</head>
2301   
2302  <p><termdef id="dt-entity" term="Entity">An XML document may consist
2303  of one or many storage units.   These are called
2304  <term>entities</term>; they all have <term>content</term> and are all
2305  (except for the document entity, see below, and 
2306  the <termref def='dt-doctype'>external DTD subset</termref>) 
2307  identified by <term>name</term>.
2308  </termdef>
2309  Each XML document has one entity
2310  called the <termref def="dt-docent">document entity</termref>, which serves
2311  as the starting point for the <termref def="dt-xml-proc">XML
2312  processor</termref> and may contain the whole document.</p>
2313  <p>Entities may be either parsed or unparsed.
2314  <termdef id="dt-parsedent" term="Parsed Entity">A <term>parsed entity's</term>
2315  contents are referred to as its 
2316  <termref def='dt-repltext'>replacement text</termref>;
2317  this <termref def="dt-text">text</termref> is considered an
2318  integral part of the document.</termdef></p>
2319  
2320  <p><termdef id="dt-unparsed" term="Unparsed Entity">An 
2321  <term>unparsed entity</term> 
2322  is a resource whose contents may or may not be
2323  <termref def='dt-text'>text</termref>, and if text, may not be XML.
2324  Each unparsed entity
2325  has an associated <termref
2326  def="dt-notation">notation</termref>, identified by name.
2327  Beyond a requirement
2328  that an XML processor make the identifiers for the entity and 
2329  notation available to the application,
2330  XML places no constraints on the contents of unparsed entities.</termdef> 
2331  </p>
2332  <p>
2333  Parsed entities are invoked by name using entity references;
2334  unparsed entities by name, given in the value of <kw>ENTITY</kw>
2335  or <kw>ENTITIES</kw>
2336  attributes.</p>
2337  <p><termdef id='gen-entity' term='general entity'
2338  ><term>General entities</term>
2339  are entities for use within the document content.
2340  In this specification, general entities are sometimes referred 
2341  to with the unqualified term <emph>entity</emph> when this leads
2342  to no ambiguity.</termdef> 
2343  <termdef id='dt-PE' term='Parameter entity'>Parameter entities 
2344  are parsed entities for use within the DTD.</termdef>
2345  These two types of entities use different forms of reference and
2346  are recognized in different contexts.
2347  Furthermore, they occupy different namespaces; a parameter entity and
2348  a general entity with the same name are two distinct entities.
2349  </p>
2350  
2351  <div2 id='sec-references'>
2352  <head>Character and Entity References</head>
2353  <p><termdef id="dt-charref" term="Character Reference">
2354  A <term>character reference</term> refers to a specific character in the
2355  ISO/IEC 10646 character set, for example one not directly accessible from
2356  available input devices.
2357  <scrap lang='ebnf'>
2358  <head>Character Reference</head>
2359  <prod id='NT-CharRef'><lhs>CharRef</lhs>
2360  <rhs>'&amp;#' [0-9]+ ';' </rhs>
2361  <rhs>| '&hcro;' [0-9a-fA-F]+ ';'</rhs>
2362  <wfc def="wf-Legalchar"/>
2363  </prod>
2364  </scrap>
2365  <wfcnote id="wf-Legalchar">
2366  <head>Legal Character</head>
2367  <p>Characters referred to using character references must
2368  match the production for
2369  <nt def="NT-Char">Char</nt>.</p>
2370  </wfcnote>
2371  If the character reference begins with "<code>&amp;#x</code>", the digits and
2372  letters up to the terminating <code>;</code> provide a hexadecimal
2373  representation of the character's code point in ISO/IEC 10646.
2374  If it begins just with "<code>&amp;#</code>", the digits up to the terminating
2375  <code>;</code> provide a decimal representation of the character's 
2376  code point.
2377  </termdef>
2378  </p>
2379  <p><termdef id="dt-entref" term="Entity Reference">An <term>entity
2380  reference</term> refers to the content of a named entity.</termdef>
2381  <termdef id='dt-GERef' term='General Entity Reference'>References to 
2382  parsed general entities
2383  use ampersand (<code>&amp;</code>) and semicolon (<code>;</code>) as
2384  delimiters.</termdef>
2385  <termdef id='dt-PERef' term='Parameter-entity reference'>
2386  <term>Parameter-entity references</term> use percent-sign (<code>%</code>) and
2387  semicolon 
2388  (<code>;</code>) as delimiters.</termdef>
2389  </p>
2390  <scrap lang="ebnf">
2391  <head>Entity Reference</head>
2392  <prod id='NT-Reference'><lhs>Reference</lhs>
2393  <rhs><nt def='NT-EntityRef'>EntityRef</nt> 
2394  | <nt def='NT-CharRef'>CharRef</nt></rhs></prod>
2395  <prod id='NT-EntityRef'><lhs>EntityRef</lhs>
2396  <rhs>'&amp;' <nt def='NT-Name'>Name</nt> ';'</rhs>
2397  <wfc def='wf-entdeclared'/>
2398  <vc def='vc-entdeclared'/>
2399  <wfc def='textent'/>
2400  <wfc def='norecursion'/>
2401  </prod>
2402  <prod id='NT-PEReference'><lhs>PEReference</lhs>
2403  <rhs>'%' <nt def='NT-Name'>Name</nt> ';'</rhs>
2404  <vc def='vc-entdeclared'/>
2405  <wfc def='norecursion'/>
2406  <wfc def='indtd'/>
2407  </prod>
2408  </scrap>
2409  
2410  <wfcnote id='wf-entdeclared'>
2411  <head>Entity Declared</head>
2412  <p>In a document without any DTD, a document with only an internal
2413  DTD subset which contains no parameter entity references, or a document with
2414  "<code>standalone='yes'</code>", 
2415  the <nt def='NT-Name'>Name</nt> given in the entity reference must 
2416  <termref def="dt-match">match</termref> that in an 
2417  <titleref href='sec-entity-decl'>entity declaration</titleref>, except that
2418  well-formed documents need not declare 
2419  any of the following entities: &magicents;.  
2420  The declaration of a parameter entity must precede any reference to it.
2421  Similarly, the declaration of a general entity must precede any
2422  reference to it which appears in a default value in an attribute-list
2423  declaration.</p>
2424  <p>Note that if entities are declared in the external subset or in 
2425  external parameter entities, a non-validating processor is 
2426  <titleref href='include-if-valid'>not obligated to</titleref> read
2427  and process their declarations; for such documents, the rule that
2428  an entity must be declared is a well-formedness constraint only
2429  if <titleref href='sec-rmd'>standalone='yes'</titleref>.</p>
2430  </wfcnote>
2431  <vcnote id="vc-entdeclared">
2432  <head>Entity Declared</head>
2433  <p>In a document with an external subset or external parameter
2434  entities with "<code>standalone='no'</code>",
2435  the <nt def='NT-Name'>Name</nt> given in the entity reference must <termref
2436  def="dt-match">match</termref> that in an 
2437  <titleref href='sec-entity-decl'>entity declaration</titleref>.
2438  For interoperability, valid documents should declare the entities 
2439  &magicents;, in the form
2440  specified in <specref ref="sec-predefined-ent"/>.
2441  The declaration of a parameter entity must precede any reference to it.
2442  Similarly, the declaration of a general entity must precede any
2443  reference to it which appears in a default value in an attribute-list
2444  declaration.</p>
2445  </vcnote>
2446  <!-- FINAL EDIT:  is this duplication too clumsy? -->
2447  <wfcnote id='textent'>
2448  <head>Parsed Entity</head>
2449  <p>
2450  An entity reference must not contain the name of an <termref
2451  def="dt-unparsed">unparsed entity</termref>. Unparsed entities may be referred
2452  to only in <termref def="dt-attrval">attribute values</termref> declared to
2453  be of type <kw>ENTITY</kw> or <kw>ENTITIES</kw>.
2454  </p>
2455  </wfcnote>
2456  <wfcnote id='norecursion'>
2457  <head>No Recursion</head>
2458  <p>
2459  A parsed entity must not contain a recursive reference to itself,
2460  either directly or indirectly.
2461  </p>
2462  </wfcnote>
2463  <wfcnote id='indtd'>
2464  <head>In DTD</head>
2465  <p>
2466  Parameter-entity references may only appear in the 
2467  <termref def='dt-doctype'>DTD</termref>.
2468  </p>
2469  </wfcnote>
2470  <p>Examples of character and entity references:
2471  <eg>Type &lt;key>less-than&lt;/key> (&hcro;3C;) to save options.
2472  This document was prepared on &amp;docdate; and
2473  is classified &amp;security-level;.</eg></p>
2474  <p>Example of a parameter-entity reference:
2475  <eg><![CDATA[<!-- declare the parameter entity "ISOLat2"... -->
2476  <!ENTITY % ISOLat2
2477           SYSTEM "http://www.xml.com/iso/isolat2-xml.entities" >
2478  <!-- ... now reference it. -->
2479  %ISOLat2;]]></eg></p>
2480  </div2>
2481   
2482  <div2 id='sec-entity-decl'>
2483  <head>Entity Declarations</head>
2484   
2485  <p><termdef id="dt-entdecl" term="entity declaration">
2486  Entities are declared thus:
2487  <scrap lang='ebnf'>
2488  <head>Entity Declaration</head>
2489  <prodgroup pcw2="5" pcw4="18.5">
2490  <prod id='NT-EntityDecl'><lhs>EntityDecl</lhs>
2491  <rhs><nt def="NT-GEDecl">GEDecl</nt><!--</rhs><com>General entities</com>
2492  <rhs>--> | <nt def="NT-PEDecl">PEDecl</nt></rhs>
2493  <!--<com>Parameter entities</com>-->
2494  </prod>
2495  <prod id='NT-GEDecl'><lhs>GEDecl</lhs>
2496  <rhs>'&lt;!ENTITY' <nt def='NT-S'>S</nt> <nt def='NT-Name'>Name</nt> 
2497  <nt def='NT-S'>S</nt> <nt def='NT-EntityDef'>EntityDef</nt> 
2498  <nt def='NT-S'>S</nt>? '&gt;'</rhs>
2499  </prod>
2500  <prod id='NT-PEDecl'><lhs>PEDecl</lhs>
2501  <rhs>'&lt;!ENTITY' <nt def='NT-S'>S</nt> '%' <nt def='NT-S'>S</nt> 
2502  <nt def='NT-Name'>Name</nt> <nt def='NT-S'>S</nt> 
2503  <nt def='NT-PEDef'>PEDef</nt> <nt def='NT-S'>S</nt>? '&gt;'</rhs>
2504  <!--<com>Parameter entities</com>-->
2505  </prod>
2506  <prod id='NT-EntityDef'><lhs>EntityDef</lhs>
2507  <rhs><nt def='NT-EntityValue'>EntityValue</nt>
2508  <!--</rhs>
2509  <rhs>-->| (<nt def='NT-ExternalID'>ExternalID</nt> 
2510  <nt def='NT-NDataDecl'>NDataDecl</nt>?)</rhs>
2511  <!-- <nt def='NT-ExternalDef'>ExternalDef</nt></rhs> -->
2512  </prod>
2513  <!-- FINAL EDIT: what happened to WFs here? -->
2514  <prod id='NT-PEDef'><lhs>PEDef</lhs>
2515  <rhs><nt def='NT-EntityValue'>EntityValue</nt> 
2516  | <nt def='NT-ExternalID'>ExternalID</nt></rhs></prod>
2517  </prodgroup>
2518  </scrap>
2519  The <nt def='NT-Name'>Name</nt> identifies the entity in an
2520  <termref def="dt-entref">entity reference</termref> or, in the case of an
2521  unparsed entity, in the value of an <kw>ENTITY</kw> or <kw>ENTITIES</kw>
2522  attribute.
2523  If the same entity is declared more than once, the first declaration
2524  encountered is binding; at user option, an XML processor may issue a
2525  warning if entities are declared multiple times.</termdef>
2526  </p>
2527  
2528  <div3 id='sec-internal-ent'>
2529  <head>Internal Entities</head>
2530   
2531  <p><termdef id='dt-internent' term="Internal Entity Replacement Text">If 
2532  the entity definition is an 
2533  <nt def='NT-EntityValue'>EntityValue</nt>,  
2534  the defined entity is called an <term>internal entity</term>.  
2535  There is no separate physical
2536  storage object, and the content of the entity is given in the
2537  declaration. </termdef>
2538  Note that some processing of entity and character references in the
2539  <termref def='dt-litentval'>literal entity value</termref> may be required to
2540  produce the correct <termref def='dt-repltext'>replacement 
2541  text</termref>: see <specref ref='intern-replacement'/>.
2542  </p>
2543  <p>An internal entity is a <termref def="dt-parsedent">parsed
2544  entity</termref>.</p>
2545  <p>Example of an internal entity declaration:
2546  <eg>&lt;!ENTITY Pub-Status "This is a pre-release of the
2547   specification."></eg></p>
2548  </div3>
2549   
2550  <div3 id='sec-external-ent'>
2551  <head>External Entities</head>
2552   
2553  <p><termdef id="dt-extent" term="External Entity">If the entity is not
2554  internal, it is an <term>external
2555  entity</term>, declared as follows:
2556  <scrap lang='ebnf'>
2557  <head>External Entity Declaration</head>
2558  <!--
2559  <prod id='NT-ExternalDef'><lhs>ExternalDef</lhs>
2560  <rhs></prod> -->
2561  <prod id='NT-ExternalID'><lhs>ExternalID</lhs>
2562  <rhs>'SYSTEM' <nt def='NT-S'>S</nt> 
2563  <nt def='NT-SystemLiteral'>SystemLiteral</nt></rhs>
2564  <rhs>| 'PUBLIC' <nt def='NT-S'>S</nt> 
2565  <nt def='NT-PubidLiteral'>PubidLiteral</nt> 
2566  <nt def='NT-S'>S</nt> 
2567  <nt def='NT-SystemLiteral'>SystemLiteral</nt>
2568  </rhs>
2569  </prod>
2570  <prod id='NT-NDataDecl'><lhs>NDataDecl</lhs>
2571  <rhs><nt def='NT-S'>S</nt> 'NDATA' <nt def='NT-S'>S</nt> 
2572  <nt def='NT-Name'>Name</nt></rhs>
2573  <vc def='not-declared'/></prod>
2574  </scrap>
2575  If the <nt def='NT-NDataDecl'>NDataDecl</nt> is present, this is a
2576  general <termref def="dt-unparsed">unparsed
2577  entity</termref>; otherwise it is a parsed entity.</termdef></p>
2578  <vcnote id='not-declared'>
2579  <head>Notation Declared</head>
2580  <p>
2581  The <nt def='NT-Name'>Name</nt> must match the declared name of a
2582  <termref def="dt-notation">notation</termref>.
2583  </p>
2584  </vcnote>
2585  <p><termdef id="dt-sysid" term="System Identifier">The
2586  <nt def='NT-SystemLiteral'>SystemLiteral</nt> 
2587  is called the entity's <term>system identifier</term>. It is a URI,
2588  which may be used to retrieve the entity.</termdef>
2589  Note that the hash mark (<code>#</code>) and fragment identifier 
2590  frequently used with URIs are not, formally, part of the URI itself; 
2591  an XML processor may signal an error if a fragment identifier is 
2592  given as part of a system identifier.
2593  Unless otherwise provided by information outside the scope of this
2594  specification (e.g. a special XML element type defined by a particular
2595  DTD, or a processing instruction defined by a particular application
2596  specification), relative URIs are relative to the location of the
2597  resource within which the entity declaration occurs.
2598  A URI might thus be relative to the 
2599  <termref def='dt-docent'>document entity</termref>, to the entity
2600  containing the <termref def='dt-doctype'>external DTD subset</termref>, 
2601  or to some other <termref def='dt-extent'>external parameter entity</termref>.
2602  </p>
2603  <p>An XML processor should handle a non-ASCII character in a URI by
2604  representing the character in UTF-8 as one or more bytes, and then 
2605  escaping these bytes with the URI escaping mechanism (i.e., by
2606  converting each byte to %HH, where HH is the hexadecimal notation of the
2607  byte value).</p>
2608  <p><termdef id="dt-pubid" term="Public identifier">
2609  In addition to a system identifier, an external identifier may
2610  include a <term>public identifier</term>.</termdef>  
2611  An XML processor attempting to retrieve the entity's content may use the public
2612  identifier to try to generate an alternative URI.  If the processor
2613  is unable to do so, it must use the URI specified in the system
2614  literal.  Before a match is attempted, all strings
2615  of white space in the public identifier must be normalized to single space characters (#x20),
2616  and leading and trailing white space must be removed.</p>
2617  <p>Examples of external entity declarations:
2618  <eg>&lt;!ENTITY open-hatch
2619           SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml">
2620  &lt;!ENTITY open-hatch
2621           PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN"
2622           "http://www.textuality.com/boilerplate/OpenHatch.xml">
2623  &lt;!ENTITY hatch-pic
2624           SYSTEM "../grafix/OpenHatch.gif"
2625           NDATA gif ></eg></p>
2626  </div3>
2627   
2628  </div2>
2629  
2630  <div2 id='TextEntities'>
2631  <head>Parsed Entities</head>
2632  <div3 id='sec-TextDecl'>
2633  <head>The Text Declaration</head>
2634  <p>External parsed entities may each begin with a <term>text
2635  declaration</term>. 
2636  <scrap lang='ebnf'>
2637  <head>Text Declaration</head>
2638  <prodgroup pcw4="12.5" pcw5="13">
2639  <prod id='NT-TextDecl'><lhs>TextDecl</lhs>
2640  <rhs>&xmlpio; 
2641  <nt def='NT-VersionInfo'>VersionInfo</nt>?
2642  <nt def='NT-EncodingDecl'>EncodingDecl</nt>
2643  <nt def='NT-S'>S</nt>? &pic;</rhs>
2644  </prod>
2645  </prodgroup>
2646  </scrap>
2647  </p>
2648  <p>The text declaration must be provided literally, not
2649  by reference to a parsed entity.
2650  No text declaration may appear at any position other than the beginning of
2651  an external parsed entity.</p>
2652  </div3>
2653  <div3 id='wf-entities'>
2654  <head>Well-Formed Parsed Entities</head>
2655  <p>The document entity is well-formed if it matches the production labeled
2656  <nt def='NT-document'>document</nt>.
2657  An external general 
2658  parsed entity is well-formed if it matches the production labeled
2659  <nt def='NT-extParsedEnt'>extParsedEnt</nt>.
2660  An external parameter
2661  entity is well-formed if it matches the production labeled
2662  <nt def='NT-extPE'>extPE</nt>.
2663  <scrap lang='ebnf'>
2664  <head>Well-Formed External Parsed Entity</head>
2665  <prod id='NT-extParsedEnt'><lhs>extParsedEnt</lhs>
2666  <rhs><nt def='NT-TextDecl'>TextDecl</nt>? 
2667  <nt def='NT-content'>content</nt></rhs>
2668  </prod>
2669  <prod id='NT-extPE'><lhs>extPE</lhs>
2670  <rhs><nt def='NT-TextDecl'>TextDecl</nt>? 
2671  <nt def='NT-extSubsetDecl'>extSubsetDecl</nt></rhs>
2672  </prod>
2673  </scrap>
2674  An internal general parsed entity is well-formed if its replacement text 
2675  matches the production labeled
2676  <nt def='NT-content'>content</nt>.
2677  All internal parameter entities are well-formed by definition.
2678  </p>
2679  <p>A consequence of well-formedness in entities is that the logical 
2680  and physical structures in an XML document are properly nested; no 
2681  <termref def='dt-stag'>start-tag</termref>,
2682  <termref def='dt-etag'>end-tag</termref>,
2683  <termref def="dt-empty">empty-element tag</termref>,
2684  <termref def='dt-element'>element</termref>, 
2685  <termref def='dt-comment'>comment</termref>, 
2686  <termref def='dt-pi'>processing instruction</termref>, 
2687  <termref def='dt-charref'>character
2688  reference</termref>, or
2689  <termref def='dt-entref'>entity reference</termref> 
2690  can begin in one entity and end in another.</p>
2691  </div3>
2692  <div3 id='charencoding'>
2693  <head>Character Encoding in Entities</head>
2694   
2695  <p>Each external parsed entity in an XML document may use a different
2696  encoding for its characters. All XML processors must be able to read
2697  entities in either UTF-8 or UTF-16. 
2698  
2699  </p>
2700  <p>Entities encoded in UTF-16 must
2701  begin with the Byte Order Mark described by ISO/IEC 10646 Annex E and
2702  Unicode Appendix B (the ZERO WIDTH NO-BREAK SPACE character, #xFEFF).
2703  This is an encoding signature, not part of either the markup or the
2704  character data of the XML document.
2705  XML processors must be able to use this character to
2706  differentiate between UTF-8 and UTF-16 encoded documents.</p>
2707  <p>Although an XML processor is required to read only entities in
2708  the UTF-8 and UTF-16 encodings, it is recognized that other encodings are
2709  used around the world, and it may be desired for XML processors
2710  to read entities that use them.
2711  Parsed entities which are stored in an encoding other than
2712  UTF-8 or UTF-16 must begin with a <titleref href='sec-TextDecl'>text
2713  declaration</titleref> containing an encoding declaration:
2714  <scrap lang='ebnf'>
2715  <head>Encoding Declaration</head>
2716  <prod id='NT-EncodingDecl'><lhs>EncodingDecl</lhs>
2717  <rhs><nt def="NT-S">S</nt>
2718  'encoding' <nt def='NT-Eq'>Eq</nt> 
2719  ('"' <nt def='NT-EncName'>EncName</nt> '"' | 
2720  "'" <nt def='NT-EncName'>EncName</nt> "'" )
2721  </rhs>
2722  </prod>
2723  <prod id='NT-EncName'><lhs>EncName</lhs>
2724  <rhs>[A-Za-z] ([A-Za-z0-9._] | '-')*</rhs>
2725  <com>Encoding name contains only Latin characters</com>
2726  </prod>
2727  </scrap>
2728  In the <termref def='dt-docent'>document entity</termref>, the encoding
2729  declaration is part of the <termref def="dt-xmldecl">XML declaration</termref>.
2730  The <nt def="NT-EncName">EncName</nt> is the name of the encoding used.
2731  </p>
2732  <!-- FINAL EDIT:  check name of IANA and charset names -->
2733  <p>In an encoding declaration, the values
2734  "<code>UTF-8</code>",
2735  "<code>UTF-16</code>",
2736  "<code>ISO-10646-UCS-2</code>", and
2737  "<code>ISO-10646-UCS-4</code>" should be 
2738  used for the various encodings and transformations of Unicode /
2739  ISO/IEC 10646, the values
2740  "<code>ISO-8859-1</code>",
2741  "<code>ISO-8859-2</code>", ...
2742  "<code>ISO-8859-9</code>" should be used for the parts of ISO 8859, and
2743  the values
2744  "<code>ISO-2022-JP</code>",
2745  "<code>Shift_JIS</code>", and
2746  "<code>EUC-JP</code>"
2747  should be used for the various encoded forms of JIS X-0208-1997.  XML
2748  processors may recognize other encodings; it is recommended that
2749  character encodings registered (as <emph>charset</emph>s) 
2750  with the Internet Assigned Numbers
2751  Authority <bibref ref='IANA'/>, other than those just listed, should be
2752  referred to
2753  using their registered names.
2754  Note that these registered names are defined to be 
2755  case-insensitive, so processors wishing to match against them 
2756  should do so in a case-insensitive
2757  way.</p>
2758  <p>In the absence of information provided by an external
2759  transport protocol (e.g. HTTP or MIME), 
2760  it is an <termref def="dt-error">error</termref> for an entity including
2761  an encoding declaration to be presented to the XML processor 
2762  in an encoding other than that named in the declaration, 
2763  for an encoding declaration to occur other than at the beginning 
2764  of an external entity, or for
2765  an entity which begins with neither a Byte Order Mark nor an encoding
2766  declaration to use an encoding other than UTF-8.
2767  Note that since ASCII
2768  is a subset of UTF-8, ordinary ASCII entities do not strictly need
2769  an encoding declaration.</p>
2770  
2771  <p>It is a <termref def='dt-fatal'>fatal error</termref> when an XML processor
2772  encounters an entity with an encoding that it is unable to process.</p>
2773  <p>Examples of encoding declarations:
2774  <eg>&lt;?xml encoding='UTF-8'?>
2775  &lt;?xml encoding='EUC-JP'?></eg></p>
2776  </div3>
2777  </div2>
2778  <div2 id='entproc'>
2779  <head>XML Processor Treatment of Entities and References</head>
2780  <p>The table below summarizes the contexts in which character references,
2781  entity references, and invocations of unparsed entities might appear and the
2782  required behavior of an <termref def='dt-xml-proc'>XML processor</termref> in
2783  each case.  
2784  The labels in the leftmost column describe the recognition context:
2785  <glist>
2786  <gitem><label>Reference in Content</label>
2787  <def><p>as a reference
2788  anywhere after the <termref def='dt-stag'>start-tag</termref> and
2789  before the <termref def='dt-etag'>end-tag</termref> of an element; corresponds
2790  to the nonterminal <nt def='NT-content'>content</nt>.</p></def>
2791  </gitem>
2792  <gitem>
2793  <label>Reference in Attribute Value</label>
2794  <def><p>as a reference within either the value of an attribute in a 
2795  <termref def='dt-stag'>start-tag</termref>, or a default
2796  value in an <termref def='dt-attdecl'>attribute declaration</termref>;
2797  corresponds to the nonterminal
2798  <nt def='NT-AttValue'>AttValue</nt>.</p></def></gitem>
2799  <gitem>
2800  <label>Occurs as Attribute Value</label>
2801  <def><p>as a <nt def='NT-Name'>Name</nt>, not a reference, appearing either as
2802  the value of an 
2803  attribute which has been declared as type <kw>ENTITY</kw>, or as one of
2804  the space-separated tokens in the value of an attribute which has been
2805  declared as type <kw>ENTITIES</kw>.</p>
2806  </def></gitem>
2807  <gitem><label>Reference in Entity Value</label>
2808  <def><p>as a reference
2809  within a parameter or internal entity's 
2810  <termref def='dt-litentval'>literal entity value</termref> in
2811  the entity's declaration; corresponds to the nonterminal 
2812  <nt def='NT-EntityValue'>EntityValue</nt>.</p></def></gitem>
2813  <gitem><label>Reference in DTD</label>
2814  <def><p>as a reference within either the internal or external subsets of the 
2815  <termref def='dt-doctype'>DTD</termref>, but outside
2816  of an <nt def='NT-EntityValue'>EntityValue</nt> or
2817  <nt def="NT-AttValue">AttValue</nt>.</p></def>
2818  </gitem>
2819  </glist></p>
2820  <htable border='1' cellpadding='7' align='center'>
2821  <htbody>
2822  <tr><td bgcolor='&cellback;' rowspan='2' colspan='1'></td>
2823  <td bgcolor='&cellback;' align='center' valign='bottom' colspan='4'>Entity Type</td>
2824  <td bgcolor='&cellback;' rowspan='2' align='center'>Character</td>
2825  </tr>
2826  <tr align='center' valign='bottom'>
2827  <td bgcolor='&cellback;'>Parameter</td>
2828  <td bgcolor='&cellback;'>Internal
2829  General</td>
2830  <td bgcolor='&cellback;'>External Parsed
2831  General</td>
2832  <td bgcolor='&cellback;'>Unparsed</td>
2833  </tr>
2834  <tr align='center' valign='middle'>
2835  
2836  <td bgcolor='&cellback;' align='right'>Reference
2837  in Content</td>
2838  <td bgcolor='&cellback;'><titleref href='not-recognized'>Not recognized</titleref></td>
2839  <td bgcolor='&cellback;'><titleref href='included'>Included</titleref></td>
2840  <td bgcolor='&cellback;'><titleref href='include-if-valid'>Included if validating</titleref></td>
2841  <td bgcolor='&cellback;'><titleref href='forbidden'>Forbidden</titleref></td>
2842  <td bgcolor='&cellback;'><titleref href='included'>Included</titleref></td>
2843  </tr>
2844  <tr align='center' valign='middle'>
2845  <td bgcolor='&cellback;' align='right'>Reference
2846  in Attribute Value</td>
2847  <td bgcolor='&cellback;'><titleref href='not-recognized'>Not recognized</titleref></td>
2848  <td bgcolor='&cellback;'><titleref href='inliteral'>Included in literal</titleref></td>
2849  <td bgcolor='&cellback;'><titleref href='forbidden'>Forbidden</titleref></td>
2850  <td bgcolor='&cellback;'><titleref href='forbidden'>Forbidden</titleref></td>
2851  <td bgcolor='&cellback;'><titleref href='included'>Included</titleref></td>
2852  </tr>
2853  <tr align='center' valign='middle'>
2854  <td bgcolor='&cellback;' align='right'>Occurs as
2855  Attribute Value</td>
2856  <td bgcolor='&cellback;'><titleref href='not-recognized'>Not recognized</titleref></td>
2857  <td bgcolor='&cellback;'><titleref href='forbidden'>Forbidden</titleref></td>
2858  <td bgcolor='&cellback;'><titleref href='forbidden'>Forbidden</titleref></td>
2859  <td bgcolor='&cellback;'><titleref href='notify'>Notify</titleref></td>
2860  <td bgcolor='&cellback;'><titleref href='not-recognized'>Not recognized</titleref></td>
2861  </tr>
2862  <tr align='center' valign='middle'>
2863  <td bgcolor='&cellback;' align='right'>Reference
2864  in EntityValue</td>
2865  <td bgcolor='&cellback;'><titleref href='inliteral'>Included in literal</titleref></td>
2866  <td bgcolor='&cellback;'><titleref href='bypass'>Bypassed</titleref></td>
2867  <td bgcolor='&cellback;'><titleref href='bypass'>Bypassed</titleref></td>
2868  <td bgcolor='&cellback;'><titleref href='forbidden'>Forbidden</titleref></td>
2869  <td bgcolor='&cellback;'><titleref href='included'>Included</titleref></td>
2870  </tr>
2871  <tr align='center' valign='middle'>
2872  <td bgcolor='&cellback;' align='right'>Reference
2873  in DTD</td>
2874  <td bgcolor='&cellback;'><titleref href='as-PE'>Included as PE</titleref></td>
2875  <td bgcolor='&cellback;'><titleref href='forbidden'>Forbidden</titleref></td>
2876  <td bgcolor='&cellback;'><titleref href='forbidden'>Forbidden</titleref></td>
2877  <td bgcolor='&cellback;'><titleref href='forbidden'>Forbidden</titleref></td>
2878  <td bgcolor='&cellback;'><titleref href='forbidden'>Forbidden</titleref></td>
2879  </tr>
2880  </htbody>
2881  </htable>
2882  <div3 id='not-recognized'>
2883  <head>Not Recognized</head>
2884  <p>Outside the DTD, the <code>%</code> character has no
2885  special significance; thus, what would be parameter entity references in the
2886  DTD are not recognized as markup in <nt def='NT-content'>content</nt>.
2887  Similarly, the names of unparsed entities are not recognized except
2888  when they appear in the value of an appropriately declared attribute.
2889  </p>
2890  </div3>
2891  <div3 id='included'>
2892  <head>Included</head>
2893  <p><termdef id="dt-include" term="Include">An entity is 
2894  <term>included</term> when its 
2895  <termref def='dt-repltext'>replacement text</termref> is retrieved 
2896  and processed, in place of the reference itself,
2897  as though it were part of the document at the location the
2898  reference was recognized.
2899  The replacement text may contain both 
2900  <termref def='dt-chardata'>character data</termref>
2901  and (except for parameter entities) <termref def="dt-markup">markup</termref>,
2902  which must be recognized in
2903  the usual way, except that the replacement text of entities used to escape
2904  markup delimiters (the entities &magicents;) is always treated as
2905  data.  (The string "<code>AT&amp;amp;T;</code>" expands to
2906  "<code>AT&amp;T;</code>" and the remaining ampersand is not recognized
2907  as an entity-reference delimiter.) 
2908  A character reference is <term>included</term> when the indicated
2909  character is processed in place of the reference itself.
2910  </termdef></p>
2911  </div3>
2912  <div3 id='include-if-valid'>
2913  <head>Included If Validating</head>
2914  <p>When an XML processor recognizes a reference to a parsed entity, in order
2915  to <termref def="dt-valid">validate</termref>
2916  the document, the processor must 
2917  <termref def="dt-include">include</termref> its
2918  replacement text.
2919  If the entity is external, and the processor is not
2920  attempting to validate the XML document, the
2921  processor <termref def="dt-may">may</termref>, but need not, 
2922  include the entity's replacement text.
2923  If a non-validating processor does not include the replacement text,
2924  it must inform the application that it recognized, but did not
2925  read, the entity.</p>
2926  <p>This rule is based on the recognition that the automatic inclusion
2927  provided by the SGML and XML entity mechanism, primarily designed
2928  to support modularity in authoring, is not necessarily 
2929  appropriate for other applications, in particular document browsing.
2930  Browsers, for example, when encountering an external parsed entity reference,
2931  might choose to provide a visual indication of the entity's
2932  presence and retrieve it for display only on demand.
2933  </p>
2934  </div3>
2935  <div3 id='forbidden'>
2936  <head>Forbidden</head>
2937  <p>The following are forbidden, and constitute
2938  <termref def='dt-fatal'>fatal</termref> errors:
2939  <ulist>
2940  <item><p>the appearance of a reference to an
2941  <termref def='dt-unparsed'>unparsed entity</termref>.
2942  </p></item>
2943  <item><p>the appearance of any character or general-entity reference in the
2944  DTD except within an <nt def='NT-EntityValue'>EntityValue</nt> or 
2945  <nt def="NT-AttValue">AttValue</nt>.</p></item>
2946  <item><p>a reference to an external entity in an attribute value.</p>
2947  </item>
2948  </ulist>
2949  </p>
2950  </div3>
2951  <div3 id='inliteral'>
2952  <head>Included in Literal</head>
2953  <p>When an <termref def='dt-entref'>entity reference</termref> appears in an
2954  attribute value, or a parameter entity reference appears in a literal entity
2955  value, its <termref def='dt-repltext'>replacement text</termref> is
2956  processed in place of the reference itself as though it
2957  were part of the document at the location the reference was recognized,
2958  except that a single or double quote character in the replacement text
2959  is always treated as a normal data character and will not terminate the
2960  literal. 
2961  For example, this is well-formed:
2962  <eg><![CDATA[<!ENTITY % YN '"Yes"' >
2963  <!ENTITY WhatHeSaid "He said %YN;" >]]></eg>
2964  while this is not:
2965  <eg>&lt;!ENTITY EndAttr "27'" >
2966  &lt;element attribute='a-&amp;EndAttr;></eg>
2967  </p></div3>
2968  <div3 id='notify'>
2969  <head>Notify</head>
2970  <p>When the name of an <termref def='dt-unparsed'>unparsed
2971  entity</termref> appears as a token in the
2972  value of an attribute of declared type <kw>ENTITY</kw> or <kw>ENTITIES</kw>,
2973  a validating processor must inform the
2974  application of the <termref def='dt-sysid'>system</termref> 
2975  and <termref def='dt-pubid'>public</termref> (if any)
2976  identifiers for both the entity and its associated
2977  <termref def="dt-notation">notation</termref>.</p>
2978  </div3>
2979  <div3 id='bypass'>
2980  <head>Bypassed</head>
2981  <p>When a general entity reference appears in the
2982  <nt def='NT-EntityValue'>EntityValue</nt> in an entity declaration,
2983  it is bypassed and left as is.</p>
2984  </div3>
2985  <div3 id='as-PE'>
2986  <head>Included as PE</head>
2987  <p>Just as with external parsed entities, parameter entities
2988  need only be <titleref href='include-if-valid'>included if
2989  validating</titleref>. 
2990  When a parameter-entity reference is recognized in the DTD
2991  and included, its 
2992  <termref def='dt-repltext'>replacement
2993  text</termref> is enlarged by the attachment of one leading and one following
2994  space (#x20) character; the intent is to constrain the replacement
2995  text of parameter 
2996  entities to contain an integral number of grammatical tokens in the DTD.
2997  </p>
2998  </div3>
2999  
3000  </div2>
3001  <div2 id='intern-replacement'>
3002  <head>Construction of Internal Entity Replacement Text</head>
3003  <p>In discussing the treatment
3004  of internal entities, it is  
3005  useful to distinguish two forms of the entity's value.
3006  <termdef id="dt-litentval" term='Literal Entity Value'>The <term>literal
3007  entity value</term> is the quoted string actually
3008  present in the entity declaration, corresponding to the
3009  non-terminal <nt def='NT-EntityValue'>EntityValue</nt>.</termdef>
3010  <termdef id='dt-repltext' term='Replacement Text'>The <term>replacement
3011  text</term> is the content of the entity, after
3012  replacement of character references and parameter-entity
3013  references.
3014  </termdef></p>
3015  
3016  <p>The literal entity value 
3017  as given in an internal entity declaration
3018  (<nt def='NT-EntityValue'>EntityValue</nt>) may contain character,
3019  parameter-entity, and general-entity references.
3020  Such references must be contained entirely within the
3021  literal entity value.
3022  The actual replacement text that is 
3023  <termref def='dt-include'>included</termref> as described above
3024  must contain the <emph>replacement text</emph> of any 
3025  parameter entities referred to, and must contain the character
3026  referred to, in place of any character references in the
3027  literal entity value; however,
3028  general-entity references must be left as-is, unexpanded.
3029  For example, given the following declarations:
3030  
3031  <eg><![CDATA[<!ENTITY % pub    "&#xc9;ditions Gallimard" >
3032  <!ENTITY   rights "All rights reserved" >
3033  <!ENTITY   book   "La Peste: Albert Camus, 
3034  &#xA9; 1947 %pub;. &rights;" >]]></eg>
3035  then the replacement text for the entity "<code>book</code>" is:
3036  <eg>La Peste: Albert Camus, 
3037  &#169; 1947 &#201;ditions Gallimard. &amp;rights;</eg>
3038  The general-entity reference "<code>&amp;rights;</code>" would be expanded
3039  should the reference "<code>&amp;book;</code>" appear in the document's
3040  content or an attribute value.</p>
3041  <p>These simple rules may have complex interactions; for a detailed
3042  discussion of a difficult example, see
3043  <specref ref='sec-entexpand'/>.
3044  </p>
3045  
3046  </div2>
3047  <div2 id='sec-predefined-ent'>
3048  <head>Predefined Entities</head>
3049  <p><termdef id="dt-escape" term="escape">Entity and character
3050  references can both be used to <term>escape</term> the left angle bracket,
3051  ampersand, and other delimiters.   A set of general entities
3052  (&magicents;) is specified for this purpose.
3053  Numeric character references may also be used; they are
3054  expanded immediately when recognized and must be treated as
3055  character data, so the numeric character references
3056  "<code>&amp;#60;</code>" and "<code>&amp;#38;</code>" may be used to 
3057  escape <code>&lt;</code> and <code>&amp;</code> when they occur
3058  in character data.</termdef></p>
3059  <p>All XML processors must recognize these entities whether they
3060  are declared or not.  
3061  <termref def='dt-interop'>For interoperability</termref>,
3062  valid XML documents should declare these
3063  entities, like any others, before using them.
3064  If the entities in question are declared, they must be declared
3065  as internal entities whose replacement text is the single
3066  character being escaped or a character reference to
3067  that character, as shown below.
3068  <eg><![CDATA[<!ENTITY lt     "&#38;#60;"> 
3069  <!ENTITY gt     "&#62;"> 
3070  <!ENTITY amp    "&#38;#38;"> 
3071  <!ENTITY apos   "&#39;"> 
3072  <!ENTITY quot   "&#34;"> 
3073  ]]></eg>
3074  Note that the <code>&lt;</code> and <code>&amp;</code> characters
3075  in the declarations of "<code>lt</code>" and "<code>amp</code>"
3076  are doubly escaped to meet the requirement that entity replacement
3077  be well-formed.
3078  </p>
3079  </div2>
3080  
3081  <div2 id='Notations'>
3082  <head>Notation Declarations</head>
3083   
3084  <p><termdef id="dt-notation" term="Notation"><term>Notations</term> identify by
3085  name the format of <termref def="dt-unparsed">unparsed
3086  entities</termref>, the
3087  format of elements which bear a notation attribute, 
3088  or the application to which  
3089  a <termref def="dt-pi">processing instruction</termref> is
3090  addressed.</termdef></p>
3091  <p><termdef id="dt-notdecl" term="Notation Declaration">
3092  <term>Notation declarations</term>
3093  provide a name for the notation, for use in
3094  entity and attribute-list declarations and in attribute specifications,
3095  and an external identifier for the notation which may allow an XML
3096  processor or its client application to locate a helper application
3097  capable of processing data in the given notation.
3098  <scrap lang='ebnf'>
3099  <head>Notation Declarations</head>
3100  <prod id='NT-NotationDecl'><lhs>NotationDecl</lhs>
3101  <rhs>'&lt;!NOTATION' <nt def='NT-S'>S</nt> <nt def='NT-Name'>Name</nt> 
3102  <nt def='NT-S'>S</nt> 
3103  (<nt def='NT-ExternalID'>ExternalID</nt> | 
3104  <nt def='NT-PublicID'>PublicID</nt>)
3105  <nt def='NT-S'>S</nt>? '>'</rhs></prod>
3106  <prod id='NT-PublicID'><lhs>PublicID</lhs>
3107  <rhs>'PUBLIC' <nt def='NT-S'>S</nt> 
3108  <nt def='NT-PubidLiteral'>PubidLiteral</nt> 
3109  </rhs></prod>
3110  </scrap>
3111  </termdef></p>
3112  <p>XML processors must provide applications with the name and external
3113  identifier(s) of any notation declared and referred to in an attribute
3114  value, attribute definition, or entity declaration.  They may
3115  additionally resolve the external identifier into the
3116  <termref def="dt-sysid">system identifier</termref>,
3117  file name, or other information needed to allow the
3118  application to call a processor for data in the notation described.  (It
3119  is not an error, however, for XML documents to declare and refer to
3120  notations for which notation-specific applications are not available on
3121  the system where the XML processor or application is running.)</p>
3122  </div2>
3123  
3124   
3125  <div2 id='sec-doc-entity'>
3126  <head>Document Entity</head>
3127   
3128  <p><termdef id="dt-docent" term="Document Entity">The <term>document
3129  entity</term> serves as the root of the entity
3130  tree and a starting-point for an <termref def="dt-xml-proc">XML
3131  processor</termref>.</termdef>
3132  This specification does
3133  not specify how the document entity is to be located by an XML
3134  processor; unlike other entities, the document entity has no name and might
3135  well appear on a processor input stream 
3136  without any identification at all.</p>
3137  </div2>
3138  
3139  
3140  </div1>
3141  <!-- &Conformance; -->
3142   
3143  <div1 id='sec-conformance'>
3144  <head>Conformance</head>
3145   
3146  <div2 id='proc-types'>
3147  <head>Validating and Non-Validating Processors</head>
3148  <p>Conforming <termref def="dt-xml-proc">XML processors</termref> fall into two
3149  classes: validating and non-validating.</p>
3150  <p>Validating and non-validating processors alike must report
3151  violations of this specification's well-formedness constraints
3152  in the content of the
3153  <termref def='dt-docent'>document entity</termref> and any 
3154  other <termref def='dt-parsedent'>parsed entities</termref> that 
3155  they read.</p>
3156  <p><termdef id="dt-validating" term="Validating Processor">
3157  <term>Validating processors</term> must report
3158  violations of the constraints expressed by the declarations in the
3159  <termref def="dt-doctype">DTD</termref>, and
3160  failures to fulfill the validity constraints given
3161  in this specification.
3162  </termdef>
3163  To accomplish this, validating XML processors must read and process the entire
3164  DTD and all external parsed entities referenced in the document.
3165  </p>
3166  <p>Non-validating processors are required to check only the 
3167  <termref def='dt-docent'>document entity</termref>, including
3168  the entire internal DTD subset, for well-formedness.
3169  <termdef id='dt-use-mdecl' term='Process Declarations'>
3170  While they are not required to check the document for validity,
3171  they are required to 
3172  <term>process</term> all the declarations they read in the
3173  internal DTD subset and in any parameter entity that they
3174  read, up to the first reference
3175  to a parameter entity that they do <emph>not</emph> read; that is to 
3176  say, they must
3177  use the information in those declarations to
3178  <titleref href='AVNormalize'>normalize</titleref> attribute values,
3179  <titleref href='included'>include</titleref> the replacement text of 
3180  internal entities, and supply 
3181  <titleref href='sec-attr-defaults'>default attribute values</titleref>.
3182  </termdef>
3183  They must not <termref def='dt-use-mdecl'>process</termref>
3184  <termref def='dt-entdecl'>entity declarations</termref> or 
3185  <termref def='dt-attdecl'>attribute-list declarations</termref> 
3186  encountered after a reference to a parameter entity that is not
3187  read, since the entity may have contained overriding declarations.
3188  </p>
3189  </div2>
3190  <div2 id='safe-behavior'>
3191  <head>Using XML Processors</head>
3192  <p>The behavior of a validating XML processor is highly predictable; it
3193  must read every piece of a document and report all well-formedness and
3194  validity violations.
3195  Less is required of a non-validating processor; it need not read any
3196  part of the document other than the document entity.
3197  This has two effects that may be important to users of XML processors:
3198  <ulist>
3199  <item><p>Certain well-formedness errors, specifically those that require
3200  reading external entities, may not be detected by a non-validating processor.
3201  Examples include the constraints entitled 
3202  <titleref href='wf-entdeclared'>Entity Declared</titleref>, 
3203  <titleref href='wf-textent'>Parsed Entity</titleref>, and
3204  <titleref href='wf-norecursion'>No Recursion</titleref>, as well
3205  as some of the cases described as
3206  <titleref href='forbidden'>forbidden</titleref> in 
3207  <specref ref='entproc'/>.</p></item>
3208  <item><p>The information passed from the processor to the application may
3209  vary, depending on whether the processor reads
3210  parameter and external entities.
3211  For example, a non-validating processor may not 
3212  <titleref href='AVNormalize'>normalize</titleref> attribute values,
3213  <titleref href='included'>include</titleref> the replacement text of 
3214  internal entities, or supply 
3215  <titleref href='sec-attr-defaults'>default attribute values</titleref>,
3216  where doing so depends on having read declarations in 
3217  external or parameter entities.</p></item>
3218  </ulist>
3219  </p>
3220  <p>For maximum reliability in interoperating between different XML
3221  processors, applications which use non-validating processors should not 
3222  rely on any behaviors not required of such processors.
3223  Applications which require facilities such as the use of default
3224  attributes or internal entities which are declared in external
3225  entities should use validating XML processors.</p>
3226  </div2>
3227  </div1>
3228  
3229  <div1 id='sec-notation'>
3230  <head>Notation</head>
3231   
3232  <p>The formal grammar of XML is given in this specification using a simple
3233  Extended Backus-Naur Form (EBNF) notation.  Each rule in the grammar defines
3234  one symbol, in the form
3235  <eg>symbol ::= expression</eg></p>
3236  <p>Symbols are written with an initial capital letter if they are
3237  defined by a regular expression, or with an initial lower case letter 
3238  otherwise.
3239  Literal strings are quoted.
3240  
3241  </p>
3242  
3243  <p>Within the expression on the right-hand side of a rule, the following
3244  expressions are used to match strings of one or more characters:
3245  <glist>
3246  <gitem>
3247  <label><code>#xN</code></label>
3248  <def><p>where <code>N</code> is a hexadecimal integer, the
3249  expression matches the character in ISO/IEC 10646 whose canonical
3250  (UCS-4) 
3251  code value, when interpreted as an unsigned binary number, has
3252  the value indicated.  The number of leading zeros in the
3253  <code>#xN</code> form is insignificant; the number of leading
3254  zeros in the corresponding code value 
3255  is governed by the character
3256  encoding in use and is not significant for XML.</p></def>
3257  </gitem>
3258  <gitem>
3259  <label><code>[a-zA-Z]</code>, <code>[#xN-#xN]</code></label>
3260  <def><p>matches any <termref def='dt-character'>character</termref> 
3261  with a value in the range(s) indicated (inclusive).</p></def>
3262  </gitem>
3263  <gitem>
3264  <label><code>[^a-z]</code>, <code>[^#xN-#xN]</code></label>
3265  <def><p>matches any <termref def='dt-character'>character</termref> 
3266  with a value <emph>outside</emph> the
3267  range indicated.</p></def>
3268  </gitem>
3269  <gitem>
3270  <label><code>[^abc]</code>, <code>[^#xN#xN#xN]</code></label>
3271  <def><p>matches any <termref def='dt-character'>character</termref>
3272  with a value not among the characters given.</p></def>
3273  </gitem>
3274  <gitem>
3275  <label><code>"string"</code></label>
3276  <def><p>matches a literal string <termref def="dt-match">matching</termref>
3277  that given inside the double quotes.</p></def>
3278  </gitem>
3279  <gitem>
3280  <label><code>'string'</code></label>
3281  <def><p>matches a literal string <termref def="dt-match">matching</termref>
3282  that given inside the single quotes.</p></def>
3283  </gitem>
3284  </glist>
3285  These symbols may be combined to match more complex patterns as follows,
3286  where <code>A</code> and <code>B</code> represent simple expressions:
3287  <glist>
3288  <gitem>
3289  <label>(<code>expression</code>)</label>
3290  <def><p><code>expression</code> is treated as a unit 
3291  and may be combined as described in this list.</p></def>
3292  </gitem>
3293  <gitem>
3294  <label><code>A?</code></label>
3295  <def><p>matches <code>A</code> or nothing; optional <code>A</code>.</p></def>
3296  </gitem>
3297  <gitem>
3298  <label><code>A B</code></label>
3299  <def><p>matches <code>A</code> followed by <code>B</code>.</p></def>
3300  </gitem>
3301  <gitem>
3302  <label><code>A | B</code></label>
3303  <def><p>matches <code>A</code> or <code>B</code> but not both.</p></def>
3304  </gitem>
3305  <gitem>
3306  <label><code>A - B</code></label>
3307  <def><p>matches any string that matches <code>A</code> but does not match
3308  <code>B</code>.
3309  </p></def>
3310  </gitem>
3311  <gitem>
3312  <label><code>A+</code></label>
3313  <def><p>matches one or more occurrences of <code>A</code>.</p></def>
3314  </gitem>
3315  <gitem>
3316  <label><code>A*</code></label>
3317  <def><p>matches zero or more occurrences of <code>A</code>.</p></def>
3318  </gitem>
3319  
3320  </glist>
3321  Other notations used in the productions are:
3322  <glist>
3323  <gitem>
3324  <label><code>/* ... */</code></label>
3325  <def><p>comment.</p></def>
3326  </gitem>
3327  <gitem>
3328  <label><code>[ wfc: ... ]</code></label>
3329  <def><p>well-formedness constraint; this identifies by name a 
3330  constraint on 
3331  <termref def="dt-wellformed">well-formed</termref> documents
3332  associated with a production.</p></def>
3333  </gitem>
3334  <gitem>
3335  <label><code>[ vc: ... ]</code></label>
3336  <def><p>validity constraint; this identifies by name a constraint on
3337  <termref def="dt-valid">valid</termref> documents associated with
3338  a production.</p></def>
3339  </gitem>
3340  </glist>
3341  </p></div1>
3342  
3343  </body>
3344  <back>
3345  <!-- &SGML; -->
3346   
3347  
3348  <!-- &Biblio; -->
3349  <div1 id='sec-bibliography'>
3350  
3351  <head>References</head>
3352  <div2 id='sec-existing-stds'>
3353  <head>Normative References</head>
3354  
3355  <blist>
3356  <bibl id='IANA' key='IANA'>
3357  IANA (Internet Assigned Numbers Authority). <emph>Official Names for 
3358  Character Sets</emph>,
3359  ed. Keld Simonsen et al.
3360  See <loc href='ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets'>ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets</loc>.
3361  </bibl>
3362  
3363  <bibl id='RFC1766' key='IETF RFC 1766'>
3364  IETF (Internet Engineering Task Force).
3365  <emph>RFC 1766:  Tags for the Identification of Languages</emph>,
3366  ed. H. Alvestrand.
3367  1995.
3368  </bibl>
3369  
3370  <bibl id='ISO639' key='ISO 639'>ISO
3371  (International Organization for Standardization).
3372  <emph>ISO 639:1988 (E).
3373  Code for the representation of names of languages.</emph>
3374  [Geneva]:  International Organization for
3375  Standardization, 1988.</bibl>
3376  
3377  <bibl id='ISO3166' key='ISO 3166'>ISO
3378  (International Organization for Standardization).
3379  <emph>ISO 3166-1:1997 (E).
3380  Codes for the representation of names of countries and their subdivisions 
3381  &mdash; Part 1: Country codes.</emph>
3382  [Geneva]:  International Organization for
3383  Standardization, 1997.</bibl>
3384  
3385  <bibl id='ISO10646' key='ISO/IEC 10646'>ISO
3386  (International Organization for Standardization).
3387  <emph>ISO/IEC 10646-1993 (E).  Information technology &mdash; Universal
3388  Multiple-Octet Coded Character Set (UCS) &mdash; Part 1:
3389  Architecture and Basic Multilingual Plane.</emph>
3390  [Geneva]:  International Organization for
3391  Standardization, 1993 (plus amendments AM 1 through AM 7).
3392  </bibl>
3393  
3394  <bibl id='Unicode' key='Unicode'>The Unicode Consortium.
3395  <emph>The Unicode Standard, Version 2.0.</emph>
3396  Reading, Mass.:  Addison-Wesley Developers Press, 1996.</bibl>
3397  
3398  </blist>
3399  
3400  </div2>
3401  
3402  <div2><head>Other References</head> 
3403  
3404  <blist>
3405  
3406  <bibl id='Aho' key='Aho/Ullman'>Aho, Alfred V., 
3407  Ravi Sethi, and Jeffrey D. Ullman.
3408  <emph>Compilers:  Principles, Techniques, and Tools</emph>.
3409  Reading:  Addison-Wesley, 1986, rpt. corr. 1988.</bibl>
3410  
3411  <bibl id="Berners-Lee" xml-link="simple" key="Berners-Lee et al.">
3412  Berners-Lee, T., R. Fielding, and L. Masinter.
3413  <emph>Uniform Resource Identifiers (URI):  Generic Syntax and
3414  Semantics</emph>.
3415  1997.
3416  (Work in progress; see updates to RFC1738.)</bibl>
3417  
3418  <bibl id='ABK' key='Br&#252;ggemann-Klein'>Br&#252;ggemann-Klein, Anne.
3419  <emph>Regular Expressions into Finite Automata</emph>.
3420  Extended abstract in I. Simon, Hrsg., LATIN 1992, 
3421  S. 97-98. Springer-Verlag, Berlin 1992. 
3422  Full Version in Theoretical Computer Science 120: 197-213, 1993.
3423  
3424  </bibl>
3425  
3426  <bibl id='ABKDW' key='Br&#252;ggemann-Klein and Wood'>Br&#252;ggemann-Klein, Anne,
3427  and Derick Wood.
3428  <emph>Deterministic Regular Languages</emph>.
3429  Universit&#228;t Freiburg, Institut f&#252;r Informatik,
3430  Bericht 38, Oktober 1991.
3431  </bibl>
3432  
3433  <bibl id='Clark' key='Clark'>James Clark.
3434  Comparison of SGML and XML. See
3435  <loc href='http://www.w3.org/TR/NOTE-sgml-xml-971215'>http://www.w3.org/TR/NOTE-sgml-xml-971215</loc>.
3436  </bibl>
3437  <bibl id="RFC1738" xml-link="simple" key="IETF RFC1738">
3438  IETF (Internet Engineering Task Force).
3439  <emph>RFC 1738:  Uniform Resource Locators (URL)</emph>, 
3440  ed. T. Berners-Lee, L. Masinter, M. McCahill.
3441  1994.
3442  </bibl>
3443  
3444  <bibl id="RFC1808" xml-link="simple" key="IETF RFC1808">
3445  IETF (Internet Engineering Task Force).
3446  <emph>RFC 1808:  Relative Uniform Resource Locators</emph>, 
3447  ed. R. Fielding.
3448  1995.
3449  </bibl>
3450  
3451  <bibl id="RFC2141" xml-link="simple" key="IETF RFC2141">
3452  IETF (Internet Engineering Task Force).
3453  <emph>RFC 2141:  URN Syntax</emph>, 
3454  ed. R. Moats.
3455  1997.
3456  </bibl>
3457  
3458  <bibl id='ISO8879' key='ISO 8879'>ISO
3459  (International Organization for Standardization).
3460  <emph>ISO 8879:1986(E).  Information processing &mdash; Text and Office
3461  Systems &mdash; Standard Generalized Markup Language (SGML).</emph>  First
3462  edition &mdash; 1986-10-15.  [Geneva]:  International Organization for
3463  Standardization, 1986.
3464  </bibl>
3465  
3466  
3467  <bibl id='ISO10744' key='ISO/IEC 10744'>ISO
3468  (International Organization for Standardization).
3469  <emph>ISO/IEC 10744-1992 (E).  Information technology &mdash;
3470  Hypermedia/Time-based Structuring Language (HyTime).
3471  </emph>
3472  [Geneva]:  International Organization for
3473  Standardization, 1992.
3474  <emph>Extended Facilities Annexe.</emph>
3475  [Geneva]:  International Organization for
3476  Standardization, 1996. 
3477  </bibl>
3478  
3479  
3480  
3481  </blist>
3482  </div2>
3483  </div1>
3484  <div1 id='CharClasses'>
3485  <head>Character Classes</head>
3486  <p>Following the characteristics defined in the Unicode standard,
3487  characters are classed as base characters (among others, these
3488  contain the alphabetic characters of the Latin alphabet, without
3489  diacritics), ideographic characters, and combining characters (among
3490  others, this class contains most diacritics); these classes combine
3491  to form the class of letters.  Digits and extenders are
3492  also distinguished.
3493  <scrap lang="ebnf" id="CHARACTERS">
3494  <head>Characters</head>
3495  <prodgroup pcw3="3" pcw4="15">
3496  <prod id="NT-Letter"><lhs>Letter</lhs>
3497  <rhs><nt def="NT-BaseChar">BaseChar</nt> 
3498  | <nt def="NT-Ideographic">Ideographic</nt></rhs> </prod>
3499  <prod id='NT-BaseChar'><lhs>BaseChar</lhs>
3500  <rhs>[#x0041-#x005A]
3501  |&nbsp;[#x0061-#x007A]
3502  |&nbsp;[#x00C0-#x00D6]
3503  |&nbsp;[#x00D8-#x00F6]
3504  |&nbsp;[#x00F8-#x00FF]
3505  |&nbsp;[#x0100-#x0131]
3506  |&nbsp;[#x0134-#x013E]
3507  |&nbsp;[#x0141-#x0148]
3508  |&nbsp;[#x014A-#x017E]
3509  |&nbsp;[#x0180-#x01C3]
3510  |&nbsp;[#x01CD-#x01F0]
3511  |&nbsp;[#x01F4-#x01F5]
3512  |&nbsp;[#x01FA-#x0217]
3513  |&nbsp;[#x0250-#x02A8]
3514  |&nbsp;[#x02BB-#x02C1]
3515  |&nbsp;#x0386
3516  |&nbsp;[#x0388-#x038A]
3517  |&nbsp;#x038C
3518  |&nbsp;[#x038E-#x03A1]
3519  |&nbsp;[#x03A3-#x03CE]
3520  |&nbsp;[#x03D0-#x03D6]
3521  |&nbsp;#x03DA
3522  |&nbsp;#x03DC
3523  |&nbsp;#x03DE
3524  |&nbsp;#x03E0
3525  |&nbsp;[#x03E2-#x03F3]
3526  |&nbsp;[#x0401-#x040C]
3527  |&nbsp;[#x040E-#x044F]
3528  |&nbsp;[#x0451-#x045C]
3529  |&nbsp;[#x045E-#x0481]
3530  |&nbsp;[#x0490-#x04C4]
3531  |&nbsp;[#x04C7-#x04C8]
3532  |&nbsp;[#x04CB-#x04CC]
3533  |&nbsp;[#x04D0-#x04EB]
3534  |&nbsp;[#x04EE-#x04F5]
3535  |&nbsp;[#x04F8-#x04F9]
3536  |&nbsp;[#x0531-#x0556]
3537  |&nbsp;#x0559
3538  |&nbsp;[#x0561-#x0586]
3539  |&nbsp;[#x05D0-#x05EA]
3540  |&nbsp;[#x05F0-#x05F2]
3541  |&nbsp;[#x0621-#x063A]
3542  |&nbsp;[#x0641-#x064A]
3543  |&nbsp;[#x0671-#x06B7]
3544  |&nbsp;[#x06BA-#x06BE]
3545  |&nbsp;[#x06C0-#x06CE]
3546  |&nbsp;[#x06D0-#x06D3]
3547  |&nbsp;#x06D5
3548  |&nbsp;[#x06E5-#x06E6]
3549  |&nbsp;[#x0905-#x0939]
3550  |&nbsp;#x093D
3551  |&nbsp;[#x0958-#x0961]
3552  |&nbsp;[#x0985-#x098C]
3553  |&nbsp;[#x098F-#x0990]
3554  |&nbsp;[#x0993-#x09A8]
3555  |&nbsp;[#x09AA-#x09B0]
3556  |&nbsp;#x09B2
3557  |&nbsp;[#x09B6-#x09B9]
3558  |&nbsp;[#x09DC-#x09DD]
3559  |&nbsp;[#x09DF-#x09E1]
3560  |&nbsp;[#x09F0-#x09F1]
3561  |&nbsp;[#x0A05-#x0A0A]
3562  |&nbsp;[#x0A0F-#x0A10]
3563  |&nbsp;[#x0A13-#x0A28]
3564  |&nbsp;[#x0A2A-#x0A30]
3565  |&nbsp;[#x0A32-#x0A33]
3566  |&nbsp;[#x0A35-#x0A36]
3567  |&nbsp;[#x0A38-#x0A39]
3568  |&nbsp;[#x0A59-#x0A5C]
3569  |&nbsp;#x0A5E
3570  |&nbsp;[#x0A72-#x0A74]
3571  |&nbsp;[#x0A85-#x0A8B]
3572  |&nbsp;#x0A8D
3573  |&nbsp;[#x0A8F-#x0A91]
3574  |&nbsp;[#x0A93-#x0AA8]
3575  |&nbsp;[#x0AAA-#x0AB0]
3576  |&nbsp;[#x0AB2-#x0AB3]
3577  |&nbsp;[#x0AB5-#x0AB9]
3578  |&nbsp;#x0ABD
3579  |&nbsp;#x0AE0
3580  |&nbsp;[#x0B05-#x0B0C]
3581  |&nbsp;[#x0B0F-#x0B10]
3582  |&nbsp;[#x0B13-#x0B28]
3583  |&nbsp;[#x0B2A-#x0B30]
3584  |&nbsp;[#x0B32-#x0B33]
3585  |&nbsp;[#x0B36-#x0B39]
3586  |&nbsp;#x0B3D
3587  |&nbsp;[#x0B5C-#x0B5D]
3588  |&nbsp;[#x0B5F-#x0B61]
3589  |&nbsp;[#x0B85-#x0B8A]
3590  |&nbsp;[#x0B8E-#x0B90]
3591  |&nbsp;[#x0B92-#x0B95]
3592  |&nbsp;[#x0B99-#x0B9A]
3593  |&nbsp;#x0B9C
3594  |&nbsp;[#x0B9E-#x0B9F]
3595  |&nbsp;[#x0BA3-#x0BA4]
3596  |&nbsp;[#x0BA8-#x0BAA]
3597  |&nbsp;[#x0BAE-#x0BB5]
3598  |&nbsp;[#x0BB7-#x0BB9]
3599  |&nbsp;[#x0C05-#x0C0C]
3600  |&nbsp;[#x0C0E-#x0C10]
3601  |&nbsp;[#x0C12-#x0C28]
3602  |&nbsp;[#x0C2A-#x0C33]
3603  |&nbsp;[#x0C35-#x0C39]
3604  |&nbsp;[#x0C60-#x0C61]
3605  |&nbsp;[#x0C85-#x0C8C]
3606  |&nbsp;[#x0C8E-#x0C90]
3607  |&nbsp;[#x0C92-#x0CA8]
3608  |&nbsp;[#x0CAA-#x0CB3]
3609  |&nbsp;[#x0CB5-#x0CB9]
3610  |&nbsp;#x0CDE
3611  |&nbsp;[#x0CE0-#x0CE1]
3612  |&nbsp;[#x0D05-#x0D0C]
3613  |&nbsp;[#x0D0E-#x0D10]
3614  |&nbsp;[#x0D12-#x0D28]
3615  |&nbsp;[#x0D2A-#x0D39]
3616  |&nbsp;[#x0D60-#x0D61]
3617  |&nbsp;[#x0E01-#x0E2E]
3618  |&nbsp;#x0E30
3619  |&nbsp;[#x0E32-#x0E33]
3620  |&nbsp;[#x0E40-#x0E45]
3621  |&nbsp;[#x0E81-#x0E82]
3622  |&nbsp;#x0E84
3623  |&nbsp;[#x0E87-#x0E88]
3624  |&nbsp;#x0E8A
3625  |&nbsp;#x0E8D
3626  |&nbsp;[#x0E94-#x0E97]
3627  |&nbsp;[#x0E99-#x0E9F]
3628  |&nbsp;[#x0EA1-#x0EA3]
3629  |&nbsp;#x0EA5
3630  |&nbsp;#x0EA7
3631  |&nbsp;[#x0EAA-#x0EAB]
3632  |&nbsp;[#x0EAD-#x0EAE]
3633  |&nbsp;#x0EB0
3634  |&nbsp;[#x0EB2-#x0EB3]
3635  |&nbsp;#x0EBD
3636  |&nbsp;[#x0EC0-#x0EC4]
3637  |&nbsp;[#x0F40-#x0F47]
3638  |&nbsp;[#x0F49-#x0F69]
3639  |&nbsp;[#x10A0-#x10C5]
3640  |&nbsp;[#x10D0-#x10F6]
3641  |&nbsp;#x1100
3642  |&nbsp;[#x1102-#x1103]
3643  |&nbsp;[#x1105-#x1107]
3644  |&nbsp;#x1109
3645  |&nbsp;[#x110B-#x110C]
3646  |&nbsp;[#x110E-#x1112]
3647  |&nbsp;#x113C
3648  |&nbsp;#x113E
3649  |&nbsp;#x1140
3650  |&nbsp;#x114C
3651  |&nbsp;#x114E
3652  |&nbsp;#x1150
3653  |&nbsp;[#x1154-#x1155]
3654  |&nbsp;#x1159
3655  |&nbsp;[#x115F-#x1161]
3656  |&nbsp;#x1163
3657  |&nbsp;#x1165
3658  |&nbsp;#x1167
3659  |&nbsp;#x1169
3660  |&nbsp;[#x116D-#x116E]
3661  |&nbsp;[#x1172-#x1173]
3662  |&nbsp;#x1175
3663  |&nbsp;#x119E
3664  |&nbsp;#x11A8
3665  |&nbsp;#x11AB
3666  |&nbsp;[#x11AE-#x11AF]
3667  |&nbsp;[#x11B7-#x11B8]
3668  |&nbsp;#x11BA
3669  |&nbsp;[#x11BC-#x11C2]
3670  |&nbsp;#x11EB
3671  |&nbsp;#x11F0
3672  |&nbsp;#x11F9
3673  |&nbsp;[#x1E00-#x1E9B]
3674  |&nbsp;[#x1EA0-#x1EF9]
3675  |&nbsp;[#x1F00-#x1F15]
3676  |&nbsp;[#x1F18-#x1F1D]
3677  |&nbsp;[#x1F20-#x1F45]
3678  |&nbsp;[#x1F48-#x1F4D]
3679  |&nbsp;[#x1F50-#x1F57]
3680  |&nbsp;#x1F59
3681  |&nbsp;#x1F5B
3682  |&nbsp;#x1F5D
3683  |&nbsp;[#x1F5F-#x1F7D]
3684  |&nbsp;[#x1F80-#x1FB4]
3685  |&nbsp;[#x1FB6-#x1FBC]
3686  |&nbsp;#x1FBE
3687  |&nbsp;[#x1FC2-#x1FC4]
3688  |&nbsp;[#x1FC6-#x1FCC]
3689  |&nbsp;[#x1FD0-#x1FD3]
3690  |&nbsp;[#x1FD6-#x1FDB]
3691  |&nbsp;[#x1FE0-#x1FEC]
3692  |&nbsp;[#x1FF2-#x1FF4]
3693  |&nbsp;[#x1FF6-#x1FFC]
3694  |&nbsp;#x2126
3695  |&nbsp;[#x212A-#x212B]
3696  |&nbsp;#x212E
3697  |&nbsp;[#x2180-#x2182]
3698  |&nbsp;[#x3041-#x3094]
3699  |&nbsp;[#x30A1-#x30FA]
3700  |&nbsp;[#x3105-#x312C]
3701  |&nbsp;[#xAC00-#xD7A3]
3702  </rhs></prod>
3703  <prod id='NT-Ideographic'><lhs>Ideographic</lhs>
3704  <rhs>[#x4E00-#x9FA5]
3705  |&nbsp;#x3007
3706  |&nbsp;[#x3021-#x3029]
3707  </rhs></prod>
3708  <prod id='NT-CombiningChar'><lhs>CombiningChar</lhs>
3709  <rhs>[#x0300-#x0345]
3710  |&nbsp;[#x0360-#x0361]
3711  |&nbsp;[#x0483-#x0486]
3712  |&nbsp;[#x0591-#x05A1]
3713  |&nbsp;[#x05A3-#x05B9]
3714  |&nbsp;[#x05BB-#x05BD]
3715  |&nbsp;#x05BF
3716  |&nbsp;[#x05C1-#x05C2]
3717  |&nbsp;#x05C4
3718  |&nbsp;[#x064B-#x0652]
3719  |&nbsp;#x0670
3720  |&nbsp;[#x06D6-#x06DC]
3721  |&nbsp;[#x06DD-#x06DF]
3722  |&nbsp;[#x06E0-#x06E4]
3723  |&nbsp;[#x06E7-#x06E8]
3724  |&nbsp;[#x06EA-#x06ED]
3725  |&nbsp;[#x0901-#x0903]
3726  |&nbsp;#x093C
3727  |&nbsp;[#x093E-#x094C]
3728  |&nbsp;#x094D
3729  |&nbsp;[#x0951-#x0954]
3730  |&nbsp;[#x0962-#x0963]
3731  |&nbsp;[#x0981-#x0983]
3732  |&nbsp;#x09BC
3733  |&nbsp;#x09BE
3734  |&nbsp;#x09BF
3735  |&nbsp;[#x09C0-#x09C4]
3736  |&nbsp;[#x09C7-#x09C8]
3737  |&nbsp;[#x09CB-#x09CD]
3738  |&nbsp;#x09D7
3739  |&nbsp;[#x09E2-#x09E3]
3740  |&nbsp;#x0A02
3741  |&nbsp;#x0A3C
3742  |&nbsp;#x0A3E
3743  |&nbsp;#x0A3F
3744  |&nbsp;[#x0A40-#x0A42]
3745  |&nbsp;[#x0A47-#x0A48]
3746  |&nbsp;[#x0A4B-#x0A4D]
3747  |&nbsp;[#x0A70-#x0A71]
3748  |&nbsp;[#x0A81-#x0A83]
3749  |&nbsp;#x0ABC
3750  |&nbsp;[#x0ABE-#x0AC5]
3751  |&nbsp;[#x0AC7-#x0AC9]
3752  |&nbsp;[#x0ACB-#x0ACD]
3753  |&nbsp;[#x0B01-#x0B03]
3754  |&nbsp;#x0B3C
3755  |&nbsp;[#x0B3E-#x0B43]
3756  |&nbsp;[#x0B47-#x0B48]
3757  |&nbsp;[#x0B4B-#x0B4D]
3758  |&nbsp;[#x0B56-#x0B57]
3759  |&nbsp;[#x0B82-#x0B83]
3760  |&nbsp;[#x0BBE-#x0BC2]
3761  |&nbsp;[#x0BC6-#x0BC8]
3762  |&nbsp;[#x0BCA-#x0BCD]
3763  |&nbsp;#x0BD7
3764  |&nbsp;[#x0C01-#x0C03]
3765  |&nbsp;[#x0C3E-#x0C44]
3766  |&nbsp;[#x0C46-#x0C48]
3767  |&nbsp;[#x0C4A-#x0C4D]
3768  |&nbsp;[#x0C55-#x0C56]
3769  |&nbsp;[#x0C82-#x0C83]
3770  |&nbsp;[#x0CBE-#x0CC4]
3771  |&nbsp;[#x0CC6-#x0CC8]
3772  |&nbsp;[#x0CCA-#x0CCD]
3773  |&nbsp;[#x0CD5-#x0CD6]
3774  |&nbsp;[#x0D02-#x0D03]
3775  |&nbsp;[#x0D3E-#x0D43]
3776  |&nbsp;[#x0D46-#x0D48]
3777  |&nbsp;[#x0D4A-#x0D4D]
3778  |&nbsp;#x0D57
3779  |&nbsp;#x0E31
3780  |&nbsp;[#x0E34-#x0E3A]
3781  |&nbsp;[#x0E47-#x0E4E]
3782  |&nbsp;#x0EB1
3783  |&nbsp;[#x0EB4-#x0EB9]
3784  |&nbsp;[#x0EBB-#x0EBC]
3785  |&nbsp;[#x0EC8-#x0ECD]
3786  |&nbsp;[#x0F18-#x0F19]
3787  |&nbsp;#x0F35
3788  |&nbsp;#x0F37
3789  |&nbsp;#x0F39
3790  |&nbsp;#x0F3E
3791  |&nbsp;#x0F3F
3792  |&nbsp;[#x0F71-#x0F84]
3793  |&nbsp;[#x0F86-#x0F8B]
3794  |&nbsp;[#x0F90-#x0F95]
3795  |&nbsp;#x0F97
3796  |&nbsp;[#x0F99-#x0FAD]
3797  |&nbsp;[#x0FB1-#x0FB7]
3798  |&nbsp;#x0FB9
3799  |&nbsp;[#x20D0-#x20DC]
3800  |&nbsp;#x20E1
3801  |&nbsp;[#x302A-#x302F]
3802  |&nbsp;#x3099
3803  |&nbsp;#x309A
3804  </rhs></prod>
3805  <prod id='NT-Digit'><lhs>Digit</lhs>
3806  <rhs>[#x0030-#x0039]
3807  |&nbsp;[#x0660-#x0669]
3808  |&nbsp;[#x06F0-#x06F9]
3809  |&nbsp;[#x0966-#x096F]
3810  |&nbsp;[#x09E6-#x09EF]
3811  |&nbsp;[#x0A66-#x0A6F]
3812  |&nbsp;[#x0AE6-#x0AEF]
3813  |&nbsp;[#x0B66-#x0B6F]
3814  |&nbsp;[#x0BE7-#x0BEF]
3815  |&nbsp;[#x0C66-#x0C6F]
3816  |&nbsp;[#x0CE6-#x0CEF]
3817  |&nbsp;[#x0D66-#x0D6F]
3818  |&nbsp;[#x0E50-#x0E59]
3819  |&nbsp;[#x0ED0-#x0ED9]
3820  |&nbsp;[#x0F20-#x0F29]
3821  </rhs></prod>
3822  <prod id='NT-Extender'><lhs>Extender</lhs>
3823  <rhs>#x00B7
3824  |&nbsp;#x02D0
3825  |&nbsp;#x02D1
3826  |&nbsp;#x0387
3827  |&nbsp;#x0640
3828  |&nbsp;#x0E46
3829  |&nbsp;#x0EC6
3830  |&nbsp;#x3005
3831  |&nbsp;[#x3031-#x3035]
3832  |&nbsp;[#x309D-#x309E]
3833  |&nbsp;[#x30FC-#x30FE]
3834  </rhs></prod>
3835  
3836  </prodgroup>
3837  </scrap>
3838  </p>
3839  <p>The character classes defined here can be derived from the
3840  Unicode character database as follows:
3841  <ulist>
3842  <item>
3843  <p>Name-start characters must have one of the categories Ll, Lu,
3844  Lo, Lt, Nl.</p>
3845  </item>
3846  <item>
3847  <p>Name characters other than Name-start characters 
3848  must have one of the categories Mc, Me, Mn, Lm, or Nd.</p>
3849  </item>
3850  <item>
3851  <p>Characters in the compatibility area (i.e. with character code
3852  greater than #xF900 and less than #xFFFE) are not allowed in XML
3853  names.</p>
3854  </item>
3855  <item>
3856  <p>Characters which have a font or compatibility decomposition (i.e. those
3857  with a "compatibility formatting tag" in field 5 of the database --
3858  marked by field 5 beginning with a "&lt;") are not allowed.</p>
3859  </item>
3860  <item>
3861  <p>The following characters are treated as name-start characters
3862  rather than name characters, because the property file classifies
3863  them as Alphabetic:  [#x02BB-#x02C1], #x0559, #x06E5, #x06E6.</p>
3864  </item>
3865  <item>
3866  <p>Characters #x20DD-#x20E0 are excluded (in accordance with 
3867  Unicode, section 5.14).</p>
3868  </item>
3869  <item>
3870  <p>Character #x00B7 is classified as an extender, because the
3871  property list so identifies it.</p>
3872  </item>
3873  <item>
3874  <p>Character #x0387 is added as a name character, because #x00B7
3875  is its canonical equivalent.</p>
3876  </item>
3877  <item>
3878  <p>Characters ':' and '_' are allowed as name-start characters.</p>
3879  </item>
3880  <item>
3881  <p>Characters '-' and '.' are allowed as name characters.</p>
3882  </item>
3883  </ulist>
3884  </p>
3885  </div1>
3886  <inform-div1 id="sec-xml-and-sgml">
3887  <head>XML and SGML</head>
3888   
3889  <p>XML is designed to be a subset of SGML, in that every
3890  <termref def="dt-valid">valid</termref> XML document should also be a
3891  conformant SGML document.
3892  For a detailed comparison of the additional restrictions that XML places on
3893  documents beyond those of SGML, see <bibref ref='Clark'/>.
3894  </p>
3895  </inform-div1>
3896  <inform-div1 id="sec-entexpand">
3897  <head>Expansion of Entity and Character References</head>
3898  <p>This appendix contains some examples illustrating the
3899  sequence of entity- and character-reference recognition and
3900  expansion, as specified in <specref ref='entproc'/>.</p>
3901  <p>
3902  If the DTD contains the declaration 
3903  <eg><![CDATA[<!ENTITY example "<p>An ampersand (&#38;#38;) may be escaped
3904  numerically (&#38;#38;#38;) or with a general entity
3905  (&amp;amp;).</p>" >
3906  ]]></eg>
3907  then the XML processor will recognize the character references 
3908  when it parses the entity declaration, and resolve them before 
3909  storing the following string as the
3910  value of the entity "<code>example</code>":
3911  <eg><![CDATA[<p>An ampersand (&#38;) may be escaped
3912  numerically (&#38;#38;) or with a general entity
3913  (&amp;amp;).</p>
3914  ]]></eg>
3915  A reference in the document to "<code>&amp;example;</code>" 
3916  will cause the text to be reparsed, at which time the 
3917  start- and end-tags of the "<code>p</code>" element will be recognized 
3918  and the three references will be recognized and expanded, 
3919  resulting in a "<code>p</code>" element with the following content
3920  (all data, no delimiters or markup):
3921  <eg><![CDATA[An ampersand (&) may be escaped
3922  numerically (&#38;) or with a general entity
3923  (&amp;).
3924  ]]></eg>
3925  </p>
3926  <p>A more complex example will illustrate the rules and their
3927  effects fully.  In the following example, the line numbers are
3928  solely for reference.
3929  <eg><![CDATA[1 <?xml version='1.0'?>
3930  2 <!DOCTYPE test [
3931  3 <!ELEMENT test (#PCDATA) >
3932  4 <!ENTITY % xx '&#37;zz;'>
3933  5 <!ENTITY % zz '&#60;!ENTITY tricky "error-prone" >' >
3934  6 %xx;
3935  7 ]>
3936  8 <test>This sample shows a &tricky; method.</test>
3937  ]]></eg>
3938  This produces the following:
3939  <ulist spacing="compact">
3940  <item><p>in line 4, the reference to character 37 is expanded immediately,
3941  and the parameter entity "<code>xx</code>" is stored in the symbol
3942  table with the value "<code>%zz;</code>".  Since the replacement text
3943  is not rescanned, the reference to parameter entity "<code>zz</code>"
3944  is not recognized.  (And it would be an error if it were, since
3945  "<code>zz</code>" is not yet declared.)</p></item>
3946  <item><p>in line 5, the character reference "<code>&amp;#60;</code>" is
3947  expanded immediately and the parameter entity "<code>zz</code>" is
3948  stored with the replacement text 
3949  "<code>&lt;!ENTITY tricky "error-prone" ></code>",
3950  which is a well-formed entity declaration.</p></item>
3951  <item><p>in line 6, the reference to "<code>xx</code>" is recognized,
3952  and the replacement text of "<code>xx</code>" (namely 
3953  "<code>%zz;</code>") is parsed.  The reference to "<code>zz</code>"
3954  is recognized in its turn, and its replacement text 
3955  ("<code>&lt;!ENTITY tricky "error-prone" ></code>") is parsed.
3956  The general entity "<code>tricky</code>" has now been
3957  declared, with the replacement text "<code>error-prone</code>".</p></item>
3958  <item><p>
3959  in line 8, the reference to the general entity "<code>tricky</code>" is
3960  recognized, and it is expanded, so the full content of the
3961  "<code>test</code>" element is the self-describing (and ungrammatical) string
3962  <emph>This sample shows a error-prone method.</emph>
3963  </p></item>
3964  </ulist>
3965  </p>
3966  </inform-div1> 
3967  <inform-div1 id="determinism">
3968  <head>Deterministic Content Models</head>
3969  <p><termref def='dt-compat'>For compatibility</termref>, it is
3970  required
3971  that content models in element type declarations be deterministic.  
3972  </p>
3973  <!-- FINAL EDIT:  WebSGML allows ambiguity? -->
3974  <p>SGML
3975  requires deterministic content models (it calls them
3976  "unambiguous"); XML processors built using SGML systems may
3977  flag non-deterministic content models as errors.</p>
3978  <p>For example, the content model <code>((b, c) | (b, d))</code> is
3979  non-deterministic, because given an initial <code>b</code> the parser
3980  cannot know which <code>b</code> in the model is being matched without
3981  looking ahead to see which element follows the <code>b</code>.
3982  In this case, the two references to
3983  <code>b</code> can be collapsed 
3984  into a single reference, making the model read
3985  <code>(b, (c | d))</code>.  An initial <code>b</code> now clearly
3986  matches only a single name in the content model.  The parser doesn't
3987  need to look ahead to see what follows; either <code>c</code> or
3988  <code>d</code> would be accepted.</p>
3989  <p>More formally:  a finite state automaton may be constructed from the
3990  content model using the standard algorithms, e.g. algorithm 3.5 
3991  in section 3.9
3992  of Aho, Sethi, and Ullman <bibref ref='Aho'/>.
3993  In many such algorithms, a follow set is constructed for each 
3994  position in the regular expression (i.e., each leaf 
3995  node in the 
3996  syntax tree for the regular expression);
3997  if any position has a follow set in which 
3998  more than one following position is 
3999  labeled with the same element type name, 
4000  then the content model is in error
4001  and may be reported as an error.
4002  </p>
4003  <p>Algorithms exist which allow many but not all non-deterministic
4004  content models to be reduced automatically to equivalent deterministic
4005  models; see Br&#252;ggemann-Klein 1991 <bibref ref='ABK'/>.</p>
4006  </inform-div1>
4007  <inform-div1 id="sec-guessing">
4008  <head>Autodetection of Character Encodings</head>
4009  <p>The XML encoding declaration functions as an internal label on each
4010  entity, indicating which character encoding is in use.  Before an XML
4011  processor can read the internal label, however, it apparently has to
4012  know what character encoding is in use&mdash;which is what the internal label
4013  is trying to indicate.  In the general case, this is a hopeless
4014  situation. It is not entirely hopeless in XML, however, because XML
4015  limits the general case in two ways:  each implementation is assumed
4016  to support only a finite set of character encodings, and the XML
4017  encoding declaration is restricted in position and content in order to
4018  make it feasible to autodetect the character encoding in use in each
4019  entity in normal cases.  Also, in many cases other sources of information
4020  are available in addition to the XML data stream itself.  
4021  Two cases may be distinguished, 
4022  depending on whether the XML entity is presented to the
4023  processor without, or with, any accompanying
4024  (external) information.  We consider the first case first.
4025  </p>
4026  <p>
4027  Because each XML entity not in UTF-8 or UTF-16 format <emph>must</emph>
4028  begin with an XML encoding declaration, in which the first characters
4029  must be '<code>&lt;?xml</code>', any conforming processor can detect,
4030  after two to four octets of input, which of the following cases apply. 
4031  In reading this list, it may help to know that in UCS-4, '&lt;' is
4032  "<code>#x0000003C</code>" and '?' is "<code>#x0000003F</code>", and the Byte
4033  Order Mark required of UTF-16 data streams is "<code>#xFEFF</code>".</p>
4034  <p>
4035  <ulist>
4036  <item>
4037  <p><code>00 00 00 3C</code>: UCS-4, big-endian machine (1234 order)</p>
4038  </item>
4039  <item>
4040  <p><code>3C 00 00 00</code>: UCS-4, little-endian machine (4321 order)</p>
4041  </item>
4042  <item>
4043  <p><code>00 00 3C 00</code>: UCS-4, unusual octet order (2143)</p>
4044  </item>
4045  <item>
4046  <p><code>00 3C 00 00</code>: UCS-4, unusual octet order (3412)</p>
4047  </item>
4048  <item>
4049  <p><code>FE FF</code>: UTF-16, big-endian</p>
4050  </item>
4051  <item>
4052  <p><code>FF FE</code>: UTF-16, little-endian</p>
4053  </item>
4054  <item>
4055  <p><code>00 3C 00 3F</code>: UTF-16, big-endian, no Byte Order Mark
4056  (and thus, strictly speaking, in error)</p>
4057  </item>
4058  <item>
4059  <p><code>3C 00 3F 00</code>: UTF-16, little-endian, no Byte Order Mark
4060  (and thus, strictly speaking, in error)</p>
4061  </item>
4062  <item>
4063  <p><code>3C 3F 78 6D</code>: UTF-8, ISO 646, ASCII, some part of ISO 8859, 
4064  Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding
4065  which ensures that the characters of ASCII have their normal positions,
4066  width,
4067  and values; the actual encoding declaration must be read to 
4068  detect which of these applies, but since all of these encodings
4069  use the same bit patterns for the ASCII characters, the encoding 
4070  declaration itself may be read reliably
4071  </p>
4072  </item>
4073  <item>
4074  <p><code>4C 6F A7 94</code>: EBCDIC (in some flavor; the full
4075  encoding declaration must be read to tell which code page is in 
4076  use)</p>
4077  </item>
4078  <item>
4079  <p>other: UTF-8 without an encoding declaration, or else 
4080  the data stream is corrupt, fragmentary, or enclosed in
4081  a wrapper of some kind</p>
4082  </item>
4083  </ulist>
4084  </p>
4085  <p>
4086  This level of autodetection is enough to read the XML encoding
4087  declaration and parse the character-encoding identifier, which is
4088  still necessary to distinguish the individual members of each family
4089  of encodings (e.g. to tell UTF-8 from 8859, and the parts of 8859
4090  from each other, or to distinguish the specific EBCDIC code page in
4091  use, and so on).
4092  </p>
4093  <p>
4094  Because the contents of the encoding declaration are restricted to
4095  ASCII characters, a processor can reliably read the entire encoding
4096  declaration as soon as it has detected which family of encodings is in
4097  use.  Since in practice, all widely used character encodings fall into
4098  one of the categories above, the XML encoding declaration allows
4099  reasonably reliable in-band labeling of character encodings, even when
4100  external sources of information at the operating-system or
4101  transport-protocol level are unreliable.
4102  </p>
4103  <p>
4104  Once the processor has detected the character encoding in use, it can
4105  act appropriately, whether by invoking a separate input routine for
4106  each case, or by calling the proper conversion function on each
4107  character of input. 
4108  </p>
4109  <p>
4110  Like any self-labeling system, the XML encoding declaration will not
4111  work if any software changes the entity's character set or encoding
4112  without updating the encoding declaration.  Implementors of
4113  character-encoding routines should be careful to ensure the accuracy
4114  of the internal and external information used to label the entity.
4115  </p>
4116  <p>The second possible case occurs when the XML entity is accompanied
4117  by encoding information, as in some file systems and some network
4118  protocols.
4119  When multiple sources of information are available,
4120  
4121  their relative
4122  priority and the preferred method of handling conflict should be
4123  specified as part of the higher-level protocol used to deliver XML.
4124  Rules for the relative priority of the internal label and the
4125  MIME-type label in an external header, for example, should be part of the
4126  RFC document defining the text/xml and application/xml MIME types. In
4127  the interests of interoperability, however, the following rules
4128  are recommended.
4129  <ulist>
4130  <item><p>If an XML entity is in a file, the Byte-Order Mark
4131  and encoding-declaration PI are used (if present) to determine the
4132  character encoding.  All other heuristics and sources of information
4133  are solely for error recovery.
4134  </p></item>
4135  <item><p>If an XML entity is delivered with a
4136  MIME type of text/xml, then the <code>charset</code> parameter
4137  on the MIME type determines the
4138  character encoding method; all other heuristics and sources of
4139  information are solely for error recovery.
4140  </p></item>
4141  <item><p>If an XML entity is delivered 
4142  with a
4143  MIME type of application/xml, then the Byte-Order Mark and
4144  encoding-declaration PI are used (if present) to determine the
4145  character encoding.  All other heuristics and sources of
4146  information are solely for error recovery.
4147  </p></item>
4148  </ulist>
4149  These rules apply only in the absence of protocol-level documentation;
4150  in particular, when the MIME types text/xml and application/xml are
4151  defined, the recommendations of the relevant RFC will supersede
4152  these rules.
4153  </p>
4154  
4155  </inform-div1>
4156  
4157  <inform-div1 id="sec-xml-wg">
4158  <head>W3C XML Working Group</head>
4159   
4160  <p>This specification was prepared and approved for publication by the
4161  W3C XML Working Group (WG).  WG approval of this specification does
4162  not necessarily imply that all WG members voted for its approval.  
4163  The current and former members of the XML WG are:</p>
4164   
4165  <orglist>
4166  <member><name>Jon Bosak, Sun</name><role>Chair</role></member>
4167  <member><name>James Clark</name><role>Technical Lead</role></member>
4168  <member><name>Tim Bray, Textuality and Netscape</name><role>XML Co-editor</role></member>
4169  <member><name>Jean Paoli, Microsoft</name><role>XML Co-editor</role></member>
4170  <member><name>C. M. Sperberg-McQueen, U. of Ill.</name><role>XML
4171  Co-editor</role></member>
4172  <member><name>Dan Connolly, W3C</name><role>W3C Liaison</role></member>
4173  <member><name>Paula Angerstein, Texcel</name></member>
4174  <member><name>Steve DeRose, INSO</name></member>
4175  <member><name>Dave Hollander, HP</name></member>
4176  <member><name>Eliot Kimber, ISOGEN</name></member>
4177  <member><name>Eve Maler, ArborText</name></member>
4178  <member><name>Tom Magliery, NCSA</name></member>
4179  <member><name>Murray Maloney, Muzmo and Grif</name></member>
4180  <member><name>Makoto Murata, Fuji Xerox Information Systems</name></member>
4181  <member><name>Joel Nava, Adobe</name></member>
4182  <member><name>Conleth O'Connell, Vignette</name></member>
4183  <member><name>Peter Sharpe, SoftQuad</name></member>
4184  <member><name>John Tigue, DataChannel</name></member>
4185  </orglist>
4186  
4187  </inform-div1>
4188  </back>
4189  </spec>
4190  <!-- Keep this comment at the end of the file
4191  Local variables:
4192  mode: sgml
4193  sgml-default-dtd-file:"~/sgml/spec.ced"
4194  sgml-omittag:t
4195  sgml-shorttag:t
4196  End:
4197  -->