/ lib / html / parser.pyc
parser.pyc
  1  o

  2  5�Hc�E�@s�dZddlZddlZddlmZdgZe�d�Ze�d�Ze�d�Z	e�d�Z
  3  e�d	�Ze�d
  4  �Ze�d�Z
e�d�Ze�d
�Ze�dej�Ze�d
  5  �Ze�d�ZGdd�dej�ZdS)zA parser for HTML and XHTML.�N)�unescape�
  6  HTMLParserz[&<]z
  7  &[a-zA-Z#]z%&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]z)&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]z	<[a-zA-Z]�>z--\s*>z+([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*z]((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*aF
  8    <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  9    (?:[\s/]*                          # optional whitespace before attribute name
 10      (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
 11        (?:\s*=+\s*                    # value indicator
 12          (?:'[^']*'                   # LITA-enclosed value
 13            |"[^"]*"                   # LIT-enclosed value
 14            |(?!['"])[^>\s]*           # bare value
 15           )
 16          \s*                          # possibly followed by a space
 17         )?(?:\s|/(?!>))*
 18       )*
 19     )?
 20    \s*                                # trailing whitespace
 21  z#</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>c@s�eZdZdZdZdd�dd�Zdd�Zd	d
 22  �Zdd�Zd
Z	dd�Z
 23  dd�Zdd�Zdd�Z
dd�Zd7dd�Zdd�Zdd�Zdd �Zd!d"�Zd#d$�Zd%d&�Zd'd(�Zd)d*�Zd+d,�Zd-d.�Zd/d0�Zd1d2�Zd3d4�Zd5d6�Zd
S)8raEFind tags and other markup and call handler functions.
 24  
 25      Usage:
 26          p = HTMLParser()
 27          p.feed(data)
 28          ...
 29          p.close()
 30  
 31      Start tags are handled by calling self.handle_starttag() or
 32      self.handle_startendtag(); end tags by self.handle_endtag().  The
 33      data between tags is passed from the parser to the derived class
 34      by calling self.handle_data() with the data as argument (the data
 35      may be split up in arbitrary chunks).  If convert_charrefs is
 36      True the character references are converted automatically to the
 37      corresponding Unicode character (and self.handle_data() is no
 38      longer split in chunks), otherwise they are passed by calling
 39      self.handle_entityref() or self.handle_charref() with the string
 40      containing respectively the named or numeric reference as the
 41      argument.
 42      )�script�styleT)�convert_charrefscCs||_|��dS)z�Initialize and reset this instance.
 43  
 44          If convert_charrefs is True (the default), all character references
 45          are automatically converted to the corresponding Unicode characters.
 46          N)r�reset)�selfr�r
 47  �sC:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2800.0_x64__qbz5n2kfra8p0\lib\html\parser.py�__init__VszHTMLParser.__init__cCs(d|_d|_t|_d|_tj�|�dS)z1Reset this instance.  Loses all unprocessed data.�z???N)�rawdata�lasttag�interesting_normal�interesting�
 48  cdata_elem�_markupbase�
 49  ParserBaser�r	r
 50  r
 51  rr_s
 52  zHTMLParser.resetcCs|j||_|�d�dS)z�Feed data to the parser.
 53  
 54          Call this as often as you want, with as little or as much text
 55          as you want (may include '\n').
 56          rN)r�goahead�r	�datar
 57  r
 58  r�feedgszHTMLParser.feedcCs|�d�dS)zHandle any buffered data.�N)rrr
 59  r
 60  r�closepszHTMLParser.closeNcCs|jS)z)Return full source of start tag: '<...>'.)�_HTMLParser__starttag_textrr
 61  r
 62  r�get_starttag_textvszHTMLParser.get_starttag_textcCs$|��|_t�d|jtj�|_dS)Nz</\s*%s\s*>)�lowerr�re�compile�Ir)r	�elemr
 63  r
 64  r�set_cdata_modezs
 65  zHTMLParser.set_cdata_modecCst|_d|_dS�N)rrrrr
 66  r
 67  r�clear_cdata_mode~s
 68  zHTMLParser.clear_cdata_modecCs:|j}d}t|�}||k�r�|jr;|js;|�d|�}|dkr:|�dt||d��}|dkr8t�d��	||�s8�n�|}n|j
 69  �	||�}|rI|��}n|jrN�n�|}||kro|jrf|jsf|�t
|||���n	|�|||��|�||�}||kr{�nj|j}|d|��rt�||�r�|�|�}	n>|d|�r�|�|�}	n3|d|�r�|�|�}	n(|d|�r�|�|�}	n|d	|�r�|�|�}	n|d
 70  |kr�|�d�|d
 71  }	n�n|	dk�r|sאn|�d|d
 72  �}	|	dkr�|�d|d
 73  �}	|	dkr�|d
 74  }	n|	d
 75  7}	|j�r|j�s|�t
|||	���n	|�|||	��|�||	�}n�|d|��rlt�||�}|�rO|��d
d�}
 76  |�|
 77  �|��}	|d|	d
 78  ��sH|	d
 79  }	|�||	�}q	d||d�v�rk|�|||d
��|�||d
�}ny|d|��r�t�||�}|�r�|�d
 80  �}
 81  |�|
 82  �|��}	|d|	d
 83  ��s�|	d
 84  }	|�||	�}q	t�||�}|�r�|�r�|��||d�k�r�|��}	|	|k�r�|}	|�||d
 85  �}n|d
 86  |k�r�|�d�|�||d
 87  �}nnJd��||ks|�r||k�r|j�s|j�r|j�s|�t
|||���n	|�|||��|�||�}||d�|_dS)Nr�<�&�"z[\s;]�</�<!--�<?�<!rrz&#�������;zinteresting.search() lied)r�lenrr�find�rfind�maxrr �searchr�start�handle_datar�	updatepos�
 88  startswith�starttagopen�match�parse_starttag�parse_endtag�
parse_comment�parse_pi�parse_html_declaration�charref�group�handle_charref�end�	entityref�handle_entityref�
 89  incomplete)r	rCr�i�n�jZampposr:r8�k�namer
 90  r
 91  rr�s�
 92  ��
 93  
 94  
 95  
 96  
 97  
 98  
 99  
100  �
101  
102  
103  
104  
105  �kzHTMLParser.goaheadcCs�|j}|||d�dksJd��|||d�dkr |�|�S|||d�dkr/|�|�S|||d���d	krX|�d
106  |d�}|dkrIdS|�||d|��|dS|�|�S)
Nr-r,z+unexpected call to parse_html_declaration()�r*�z<![�	z	<!doctyperr.r)rr=�parse_marked_sectionrr1�handle_decl�parse_bogus_comment)r	rGr�gtposr
107  r
108  rr?�s
109  
110  
111  z!HTMLParser.parse_html_declarationrcCs`|j}|||d�dvsJd��|�d|d�}|dkrdS|r,|�||d|��|dS)Nr-)r,r)z"unexpected call to parse_comment()rr.r)rr1�handle_comment)r	rG�reportr�posr
112  r
113  rrQszHTMLParser.parse_bogus_commentcCsd|j}|||d�dksJd��t�||d�}|sdS|��}|�||d|��|��}|S)Nr-r+zunexpected call to parse_pi()r.)r�picloser4r5�	handle_pirC)r	rGrr:rIr
114  r
115  rr> szHTMLParser.parse_picCs�d|_|�|�}|dkr|S|j}|||�|_g}t�||d�}|s(Jd��|��}|�d���|_}||kr�t	�||�}|sCnS|�ddd�\}	}
116  }|
117  sRd}n-|dd�dkrd|dd�ksyn|dd�dkrw|dd�krnn|dd�}|r�t
118  |�}|�|	��|f�|��}||ks:|||���}|d	vr�|�
�\}
}d
119  |jvr�|
|j�d
120  �}
t|j�|j�d
121  �}n|t|j�}|�|||��|S|�d�r�|�||�|S|�||�||jvr�|�|�|S)Nrrz#unexpected call to parse_starttag()r-rM�'r.�")r�/>�
122  rZ)r�check_for_whole_start_tagr�tagfind_tolerantr:rCrArr�attrfind_tolerantr�append�strip�getpos�countr0r2r6�endswith�handle_startendtag�handle_starttag�CDATA_CONTENT_ELEMENTSr#)r	rG�endposr�attrsr:rJ�tag�m�attrname�rest�	attrvaluerC�lineno�offsetr
123  r
124  rr;,sX
125  &(�
126  
127  �
128  �
129  
130  zHTMLParser.parse_starttagcCs�|j}t�||�}|rU|��}|||d�}|dkr|dS|dkr?|�d|�r-|dS|�d|�r5dS||kr;|S|dS|dkrEdS|dvrKdS||krQ|S|dStd	��)
131  Nrr�/rZr-r.r
z6abcdefghijklmnopqrstuvwxyz=/ABCDEFGHIJKLMNOPQRSTUVWXYZzwe should not get here!)r�locatestarttagend_tolerantr:rCr8�AssertionError)r	rGrrjrI�nextr
132  r
133  rr\_s.z$HTMLParser.check_for_whole_start_tagcCs*|j}|||d�dksJd��t�||d�}|sdS|��}t�||�}|sn|jdur9|�|||��|St�||d�}|sV|||d�dkrQ|dS|�	|�S|�
134  d���}|�d|���}|�
|�|dS|�
135  d���}|jdur�||jkr�|�|||��|S|�
|�|��|S)	Nr-r)zunexpected call to parse_endtagrr.rMz</>r)r�	endendtagr4rC�
136  endtagfindr:rr6r]rQrArr1�
handle_endtagr%)r	rGrr:rRZ	namematch�tagnamer"r
137  r
138  rr<�s8
139  
140  
141  
142  
143  
144  zHTMLParser.parse_endtagcCs|�||�|�|�dSr$)rerv�r	rirhr
145  r
146  rrd�szHTMLParser.handle_startendtagcC�dSr$r
147  rxr
148  r
149  rre��zHTMLParser.handle_starttagcCryr$r
150  )r	rir
151  r
152  rrv�rzzHTMLParser.handle_endtagcCryr$r
153  �r	rKr
154  r
155  rrB�rzzHTMLParser.handle_charrefcCryr$r
156  r{r
157  r
158  rrE�rzzHTMLParser.handle_entityrefcCryr$r
159  rr
160  r
161  rr6�rzzHTMLParser.handle_datacCryr$r
162  rr
163  r
164  rrS�rzzHTMLParser.handle_commentcCryr$r
165  )r	�declr
166  r
167  rrP�rzzHTMLParser.handle_declcCryr$r
168  rr
169  r
170  rrW�rzzHTMLParser.handle_picCryr$r
171  rr
172  r
173  r�unknown_decl�rzzHTMLParser.unknown_decl)r)�__name__�
174  __module__�__qualname__�__doc__rfrrrrrrr#r%rr?rQr>r;r\r<rdrervrBrEr6rSrPrWr}r
175  r
176  r
177  rr>s8		z
178  3"()r�rr�htmlr�__all__r rrFrDr@r9rV�commentcloser]r^�VERBOSErqrtrurrr
179  r
180  r
181  r�<module>s,
182  
183  
184  
185  
186  
187  
188  
189  
190  ��
191  
192