/ lib / htmlpurifier / Filter / ExtractStyleBlocks.php
ExtractStyleBlocks.php
  1  <?php
  2  
/**
 * No-op error handler.
 *
 * HTMLPurifier_Filter_ExtractStyleBlocks::cleanCSS() installs this with
 * set_error_handler() for the duration of the CSSTidy parse() call, so
 * that errors raised while parsing are discarded instead of reaching the
 * previously installed handler. The body is intentionally empty.
 *
 * Why is this a top level function? Because PHP 5.2.0 doesn't seem to
 * understand how to interpret this filter if it's a static method.
 * It's all really silly, but if we go this route it might be reasonable
 * to coalesce all of these methods into one.
 */
function htmlpurifier_filter_extractstyleblocks_muteerrorhandler()
{
}
 10  
 11  /**
 12   * This filter extracts <style> blocks from input HTML, cleans them up
 13   * using CSSTidy, and then places them in $purifier->context->get('StyleBlocks')
 14   * so they can be used elsewhere in the document.
 15   *
 16   * @note
 17   *      See tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php for
 18   *      sample usage.
 19   *
 20   * @note
 21   *      This filter can also be used on stylesheets not included in the
 22   *      document--something purists would probably prefer. Just directly
 23   *      call HTMLPurifier_Filter_ExtractStyleBlocks->cleanCSS()
 24   */
 25  class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter
 26  {
 27      /**
 28       * @type string
 29       */
 30      public $name = 'ExtractStyleBlocks';
 31  
 32      /**
 33       * @type array
 34       */
 35      private $_styleMatches = array();
 36  
 37      /**
 38       * @type csstidy
 39       */
 40      private $_tidy;
 41  
 42      /**
 43       * @type HTMLPurifier_AttrDef_HTML_ID
 44       */
 45      private $_id_attrdef;
 46  
 47      /**
 48       * @type HTMLPurifier_AttrDef_CSS_Ident
 49       */
 50      private $_class_attrdef;
 51  
 52      /**
 53       * @type HTMLPurifier_AttrDef_Enum
 54       */
 55      private $_enum_attrdef;
 56  
 57      public function __construct()
 58      {
 59          $this->_tidy = new csstidy();
 60          $this->_tidy->set_cfg('lowercase_s', false);
 61          $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true);
 62          $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident();
 63          $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(
 64              array(
 65                  'first-child',
 66                  'link',
 67                  'visited',
 68                  'active',
 69                  'hover',
 70                  'focus'
 71              )
 72          );
 73      }
 74  
 75      /**
 76       * Save the contents of CSS blocks to style matches
 77       * @param array $matches preg_replace style $matches array
 78       */
 79      protected function styleCallback($matches)
 80      {
 81          $this->_styleMatches[] = $matches[1];
 82      }
 83  
 84      /**
 85       * Removes inline <style> tags from HTML, saves them for later use
 86       * @param string $html
 87       * @param HTMLPurifier_Config $config
 88       * @param HTMLPurifier_Context $context
 89       * @return string
 90       * @todo Extend to indicate non-text/css style blocks
 91       */
 92      public function preFilter($html, $config, $context)
 93      {
 94          $tidy = $config->get('Filter.ExtractStyleBlocks.TidyImpl');
 95          if ($tidy !== null) {
 96              $this->_tidy = $tidy;
 97          }
 98          $html = preg_replace_callback('#<style(?:\s.*)?>(.+)</style>#isU', array($this, 'styleCallback'), $html);
 99          $style_blocks = $this->_styleMatches;
100          $this->_styleMatches = array(); // reset
101          $context->register('StyleBlocks', $style_blocks); // $context must not be reused
102          if ($this->_tidy) {
103              foreach ($style_blocks as &$style) {
104                  $style = $this->cleanCSS($style, $config, $context);
105              }
106          }
107          return $html;
108      }
109  
110      /**
111       * Takes CSS (the stuff found in <style>) and cleans it.
112       * @warning Requires CSSTidy <http://csstidy.sourceforge.net/>
113       * @param string $css CSS styling to clean
114       * @param HTMLPurifier_Config $config
115       * @param HTMLPurifier_Context $context
116       * @throws HTMLPurifier_Exception
117       * @return string Cleaned CSS
118       */
119      public function cleanCSS($css, $config, $context)
120      {
121          // prepare scope
122          $scope = $config->get('Filter.ExtractStyleBlocks.Scope');
123          if ($scope !== null) {
124              $scopes = array_map('trim', explode(',', $scope));
125          } else {
126              $scopes = array();
127          }
128          // remove comments from CSS
129          $css = trim($css);
130          if (strncmp('<!--', $css, 4) === 0) {
131              $css = substr($css, 4);
132          }
133          if (strlen($css) > 3 && substr($css, -3) == '-->') {
134              $css = substr($css, 0, -3);
135          }
136          $css = trim($css);
137          set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler');
138          $this->_tidy->parse($css);
139          restore_error_handler();
140          $css_definition = $config->getDefinition('CSS');
141          $html_definition = $config->getDefinition('HTML');
142          $new_css = array();
143          foreach ($this->_tidy->css as $k => $decls) {
144              // $decls are all CSS declarations inside an @ selector
145              $new_decls = array();
146              foreach ($decls as $selector => $style) {
147                  $selector = trim($selector);
148                  if ($selector === '') {
149                      continue;
150                  } // should not happen
151                  // Parse the selector
152                  // Here is the relevant part of the CSS grammar:
153                  //
154                  // ruleset
155                  //   : selector [ ',' S* selector ]* '{' ...
156                  // selector
157                  //   : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
158                  // combinator
159                  //   : '+' S*
160                  //   : '>' S*
161                  // simple_selector
162                  //   : element_name [ HASH | class | attrib | pseudo ]*
163                  //   | [ HASH | class | attrib | pseudo ]+
164                  // element_name
165                  //   : IDENT | '*'
166                  //   ;
167                  // class
168                  //   : '.' IDENT
169                  //   ;
170                  // attrib
171                  //   : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
172                  //     [ IDENT | STRING ] S* ]? ']'
173                  //   ;
174                  // pseudo
175                  //   : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
176                  //   ;
177                  //
178                  // For reference, here are the relevant tokens:
179                  //
180                  // HASH         #{name}
181                  // IDENT        {ident}
182                  // INCLUDES     ==
183                  // DASHMATCH    |=
184                  // STRING       {string}
185                  // FUNCTION     {ident}\(
186                  //
187                  // And the lexical scanner tokens
188                  //
189                  // name         {nmchar}+
190                  // nmchar       [_a-z0-9-]|{nonascii}|{escape}
191                  // nonascii     [\240-\377]
192                  // escape       {unicode}|\\[^\r\n\f0-9a-f]
193                  // unicode      \\{h}}{1,6}(\r\n|[ \t\r\n\f])?
194                  // ident        -?{nmstart}{nmchar*}
195                  // nmstart      [_a-z]|{nonascii}|{escape}
196                  // string       {string1}|{string2}
197                  // string1      \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
198                  // string2      \'([^\n\r\f\\"]|\\{nl}|{escape})*\'
199                  //
200                  // We'll implement a subset (in order to reduce attack
201                  // surface); in particular:
202                  //
203                  //      - No Unicode support
204                  //      - No escapes support
205                  //      - No string support (by proxy no attrib support)
206                  //      - element_name is matched against allowed
207                  //        elements (some people might find this
208                  //        annoying...)
209                  //      - Pseudo-elements one of :first-child, :link,
210                  //        :visited, :active, :hover, :focus
211  
212                  // handle ruleset
213                  $selectors = array_map('trim', explode(',', $selector));
214                  $new_selectors = array();
215                  foreach ($selectors as $sel) {
216                      // split on +, > and spaces
217                      $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);
218                      // even indices are chunks, odd indices are
219                      // delimiters
220                      $nsel = null;
221                      $delim = null; // guaranteed to be non-null after
222                      // two loop iterations
223                      for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
224                          $x = $basic_selectors[$i];
225                          if ($i % 2) {
226                              // delimiter
227                              if ($x === ' ') {
228                                  $delim = ' ';
229                              } else {
230                                  $delim = ' ' . $x . ' ';
231                              }
232                          } else {
233                              // simple selector
234                              $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
235                              $sdelim = null;
236                              $nx = null;
237                              for ($j = 0, $cc = count($components); $j < $cc; $j++) {
238                                  $y = $components[$j];
239                                  if ($j === 0) {
240                                      if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
241                                          $nx = $y;
242                                      } else {
243                                          // $nx stays null; this matters
244                                          // if we don't manage to find
245                                          // any valid selector content,
246                                          // in which case we ignore the
247                                          // outer $delim
248                                      }
249                                  } elseif ($j % 2) {
250                                      // set delimiter
251                                      $sdelim = $y;
252                                  } else {
253                                      $attrdef = null;
254                                      if ($sdelim === '#') {
255                                          $attrdef = $this->_id_attrdef;
256                                      } elseif ($sdelim === '.') {
257                                          $attrdef = $this->_class_attrdef;
258                                      } elseif ($sdelim === ':') {
259                                          $attrdef = $this->_enum_attrdef;
260                                      } else {
261                                          throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
262                                      }
263                                      $r = $attrdef->validate($y, $config, $context);
264                                      if ($r !== false) {
265                                          if ($r !== true) {
266                                              $y = $r;
267                                          }
268                                          if ($nx === null) {
269                                              $nx = '';
270                                          }
271                                          $nx .= $sdelim . $y;
272                                      }
273                                  }
274                              }
275                              if ($nx !== null) {
276                                  if ($nsel === null) {
277                                      $nsel = $nx;
278                                  } else {
279                                      $nsel .= $delim . $nx;
280                                  }
281                              } else {
282                                  // delimiters to the left of invalid
283                                  // basic selector ignored
284                              }
285                          }
286                      }
287                      if ($nsel !== null) {
288                          if (!empty($scopes)) {
289                              foreach ($scopes as $s) {
290                                  $new_selectors[] = "$s $nsel";
291                              }
292                          } else {
293                              $new_selectors[] = $nsel;
294                          }
295                      }
296                  }
297                  if (empty($new_selectors)) {
298                      continue;
299                  }
300                  $selector = implode(', ', $new_selectors);
301                  foreach ($style as $name => $value) {
302                      if (!isset($css_definition->info[$name])) {
303                          unset($style[$name]);
304                          continue;
305                      }
306                      $def = $css_definition->info[$name];
307                      $ret = $def->validate($value, $config, $context);
308                      if ($ret === false) {
309                          unset($style[$name]);
310                      } else {
311                          $style[$name] = $ret;
312                      }
313                  }
314                  $new_decls[$selector] = $style;
315              }
316              $new_css[$k] = $new_decls;
317          }
318          // remove stuff that shouldn't be used, could be reenabled
319          // after security risks are analyzed
320          $this->_tidy->css = $new_css;
321          $this->_tidy->import = array();
322          $this->_tidy->charset = null;
323          $this->_tidy->namespace = null;
324          $css = $this->_tidy->print->plain();
325          // we are going to escape any special characters <>& to ensure
326          // that no funny business occurs (i.e. </style> in a font-family prop).
327          if ($config->get('Filter.ExtractStyleBlocks.Escaping')) {
328              $css = str_replace(
329                  array('<', '>', '&'),
330                  array('\3C ', '\3E ', '\26 '),
331                  $css
332              );
333          }
334          return $css;
335      }
336  }
337  
338  // vim: et sw=4 sts=4