<?php
/**
 * Zend Framework (http://framework.zend.com/)
 *
 * @link      http://github.com/zendframework/zf2 for the canonical source repository
 * @copyright Copyright (c) 2005-2014 Zend Technologies USA Inc. (http://www.zend.com)
 * @license   http://framework.zend.com/license/new-bsd New BSD License
 */

namespace Zend\Escaper;

/**
 * Context specific methods for use in secure output escaping.
 *
 * Each public escape*() method targets exactly one output context (HTML body,
 * HTML attribute, Javascript, CSS, URL component). The attribute, Javascript
 * and CSS escapers work on UTF-8 internally: input is converted from the
 * configured encoding, escaped, and converted back.
 */
class Escaper
{
    /**
     * Entity Map mapping Unicode codepoints to any available named HTML entities.
     *
     * While HTML supports far more named entities, the lowest common denominator
     * has become HTML5's XML Serialisation which is restricted to those named
     * entities that XML supports. Using HTML entities would result in this error:
     *     XML Parsing Error: undefined entity
     *
     * @var array
     */
    protected static $htmlNamedEntityMap = array(
        34 => 'quot', // quotation mark
        38 => 'amp',  // ampersand
        60 => 'lt',   // less-than sign
        62 => 'gt',   // greater-than sign
    );

    /**
     * Current encoding for escaping. If not UTF-8, we convert strings from this encoding
     * pre-escaping and back to this encoding post-escaping.
     *
     * @var string
     */
    protected $encoding = 'utf-8';

    /**
     * Holds the value of the special flags passed as second parameter to
     * htmlspecialchars(). We modify these for PHP 5.4 to take advantage
     * of the new ENT_SUBSTITUTE flag for correctly dealing with invalid
     * UTF-8 sequences.
     *
     * @var int
     */
    protected $htmlSpecialCharsFlags = ENT_QUOTES;

    /**
     * Static Matcher which escapes characters for HTML Attribute contexts
     *
     * @var callable
     */
    protected $htmlAttrMatcher;

    /**
     * Static Matcher which escapes characters for Javascript contexts
     *
     * @var callable
     */
    protected $jsMatcher;

    /**
     * Static Matcher which escapes characters for CSS Attribute contexts
     *
     * @var callable
     */
    protected $cssMatcher;

    /**
     * List of all encodings supported by this class
     *
     * @var array
     */
    protected $supportedEncodings = array(
        'iso-8859-1',   'iso8859-1',    'iso-8859-5',   'iso8859-5',
        'iso-8859-15',  'iso8859-15',   'utf-8',        'cp866',
        'ibm866',       '866',          'cp1251',       'windows-1251',
        'win-1251',     '1251',         'cp1252',       'windows-1252',
        '1252',         'koi8-r',       'koi8-ru',      'koi8r',
        'big5',         '950',          'gb2312',       '936',
        'big5-hkscs',   'shift_jis',    'sjis',         'sjis-win',
        'cp932',        '932',          'euc-jp',       'eucjp',
        'eucjp-win',    'macroman'
    );

    /**
     * Constructor: Single parameter allows setting of global encoding for use by
     * the current object. If PHP 5.4 is detected, additional ENT_SUBSTITUTE flag
     * is set for htmlspecialchars() calls.
     *
     * @param  string $encoding Case-insensitive encoding name; null keeps the 'utf-8' default
     * @throws Exception\InvalidArgumentException If the encoding is blank or not supported
     */
    public function __construct($encoding = null)
    {
        if ($encoding !== null) {
            $encoding = (string) $encoding;
            if ($encoding === '') {
                throw new Exception\InvalidArgumentException(
                    get_class($this) . ' constructor parameter does not allow a blank value'
                );
            }

            $encoding = strtolower($encoding);
            // Strict comparison: a loose in_array() treats numeric strings as numbers,
            // so values such as '0866' or '9.5e2' would match '866' / '950' and slip
            // through to htmlspecialchars() as unsupported charset names.
            if (!in_array($encoding, $this->supportedEncodings, true)) {
                throw new Exception\InvalidArgumentException(
                    'Value of \'' . $encoding . '\' passed to ' . get_class($this)
                    . ' constructor parameter is invalid. Provide an encoding supported by htmlspecialchars()'
                );
            }

            $this->encoding = $encoding;
        }

        if (defined('ENT_SUBSTITUTE')) {
            $this->htmlSpecialCharsFlags |= ENT_SUBSTITUTE;
        }

        // set matcher callbacks
        $this->htmlAttrMatcher = array($this, 'htmlAttrMatcher');
        $this->jsMatcher       = array($this, 'jsMatcher');
        $this->cssMatcher      = array($this, 'cssMatcher');
    }

    /**
     * Return the encoding that all output/input is expected to be encoded in.
     *
     * @return string
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * Escape a string for the HTML Body context where there are very few characters
     * of special meaning. Internally this will use htmlspecialchars().
     *
     * @param  string $string
     * @return string
     */
    public function escapeHtml($string)
    {
        return htmlspecialchars($string, $this->htmlSpecialCharsFlags, $this->encoding);
    }

    /**
     * Escape a string for the HTML Attribute context. We use an extended set of characters
     * to escape that are not covered by htmlspecialchars() to cover cases where an attribute
     * might be unquoted or quoted illegally (e.g. backticks are valid quotes for IE).
     *
     * @param  string $string
     * @throws Exception\RuntimeException If the input is not valid in the configured encoding
     * @return string
     */
    public function escapeHtmlAttr($string)
    {
        $string = $this->toUtf8($string);
        // Empty and all-digit strings contain nothing to escape.
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9,\.\-_]/iSu', $this->htmlAttrMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Escape a string for the Javascript context. This does not use json_encode(). An extended
     * set of characters are escaped beyond ECMAScript's rules for Javascript literal string
     * escaping in order to prevent misinterpretation of Javascript as HTML leading to the
     * injection of special characters and entities. The escaping used should be tolerant
     * of cases where HTML escaping was not applied on top of Javascript escaping correctly.
     * Backslash escaping is not used as it still leaves the escaped character as-is and so
     * is not useful in a HTML context.
     *
     * @param  string $string
     * @throws Exception\RuntimeException If the input is not valid in the configured encoding
     * @return string
     */
    public function escapeJs($string)
    {
        $string = $this->toUtf8($string);
        // Empty and all-digit strings contain nothing to escape.
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9,\._]/iSu', $this->jsMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Escape a string for the URI or Parameter contexts. This should not be used to escape
     * an entire URI - only a subcomponent being inserted. The function is a simple proxy
     * to rawurlencode() which now implements RFC 3986 since PHP 5.3 completely.
     *
     * @param  string $string
     * @return string
     */
    public function escapeUrl($string)
    {
        return rawurlencode($string);
    }

    /**
     * Escape a string for the CSS context. CSS escaping can be applied to any string being
     * inserted into CSS and escapes everything except alphanumerics.
     *
     * @param  string $string
     * @throws Exception\RuntimeException If the input is not valid in the configured encoding
     * @return string
     */
    public function escapeCss($string)
    {
        $string = $this->toUtf8($string);
        // Empty and all-digit strings contain nothing to escape.
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9]/iSu', $this->cssMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Callback function for preg_replace_callback that applies HTML Attribute
     * escaping to all matches.
     *
     * @param  array $matches Match array; element 0 is a single UTF-8 encoded character
     * @return string A named entity, a hex character reference, or &#xFFFD;
     */
    protected function htmlAttrMatcher($matches)
    {
        $chr = $matches[0];
        $ord = ord($chr);

        /**
         * The following replaces characters undefined in HTML with the
         * hex entity for the Unicode replacement character. The entity is
         * pure ASCII, so it survives the conversion back to any of the
         * supported non-UTF-8 encodings (a literal U+FFFD would not).
         *
         * NOTE: $ord is the first byte of the match here, so for multi-byte
         * characters the 0x7f-0x9f range test can only ever hit DEL (0x7f);
         * U+0080-U+009F fall through to the numeric entity branch below.
         */
        if (($ord <= 0x1f && $chr != "\t" && $chr != "\n" && $chr != "\r")
            || ($ord >= 0x7f && $ord <= 0x9f)
        ) {
            return '&#xFFFD;';
        }

        /**
         * Check if the current character to escape has a name entity we should
         * replace it with while grabbing the integer value of the character.
         * UTF-32BE is used so the bytes are the code point itself, including
         * for characters above U+FFFF (UTF-16 would yield a surrogate pair).
         */
        if (strlen($chr) > 1) {
            $chr = $this->convertEncoding($chr, 'UTF-32BE', 'UTF-8');
        }

        $hex = bin2hex($chr);
        $ord = hexdec($hex);
        if (isset(static::$htmlNamedEntityMap[$ord])) {
            return '&' . static::$htmlNamedEntityMap[$ord] . ';';
        }

        /**
         * Per OWASP recommendations, we'll use upper hex entities
         * for any other characters where a named entity does not exist.
         */
        if ($ord > 255) {
            return sprintf('&#x%04X;', $ord);
        }
        return sprintf('&#x%02X;', $ord);
    }

    /**
     * Callback function for preg_replace_callback that applies Javascript
     * escaping to all matches.
     *
     * @param  array $matches Match array; element 0 is a single UTF-8 encoded character
     * @return string \xHH for single-byte characters, otherwise one or two \uHHHH escapes
     */
    protected function jsMatcher($matches)
    {
        $chr = $matches[0];
        if (strlen($chr) == 1) {
            return sprintf('\\x%02X', ord($chr));
        }

        $chr = $this->convertEncoding($chr, 'UTF-16BE', 'UTF-8');
        $hex = strtoupper(bin2hex($chr));
        if (strlen($hex) <= 4) {
            return sprintf('\\u%04s', $hex);
        }

        // Characters above U+FFFF are two UTF-16 code units; Javascript needs
        // each surrogate as its own \uHHHH escape.
        $highSurrogate = substr($hex, 0, 4);
        $lowSurrogate  = substr($hex, 4, 4);
        return sprintf('\\u%04s\\u%04s', $highSurrogate, $lowSurrogate);
    }

    /**
     * Callback function for preg_replace_callback that applies CSS
     * escaping to all matches.
     *
     * @param  array $matches Match array; element 0 is a single UTF-8 encoded character
     * @return string Backslash, upper-case hex code point, and a terminating space
     */
    protected function cssMatcher($matches)
    {
        $chr = $matches[0];
        if (strlen($chr) == 1) {
            $ord = ord($chr);
        } else {
            // UTF-32BE bytes are the code point itself, also above U+FFFF.
            $chr = $this->convertEncoding($chr, 'UTF-32BE', 'UTF-8');
            $ord = hexdec(bin2hex($chr));
        }
        return sprintf('\\%X ', $ord);
    }

    /**
     * Converts a string to UTF-8 from the base encoding. The base encoding is set via this
     * class' constructor.
     *
     * @param  string $string
     * @throws Exception\RuntimeException If the result is not valid UTF-8
     * @return string
     */
    protected function toUtf8($string)
    {
        if ($this->getEncoding() === 'utf-8') {
            $result = $string;
        } else {
            $result = $this->convertEncoding($string, 'UTF-8', $this->getEncoding());
        }

        if (!$this->isUtf8($result)) {
            throw new Exception\RuntimeException(sprintf(
                'String to be escaped was not valid UTF-8 or could not be converted: %s',
                $result
            ));
        }

        return $result;
    }

    /**
     * Converts a string from UTF-8 to the base encoding. The base encoding is set via this
     * class' constructor.
     *
     * @param  string $string
     * @return string
     */
    protected function fromUtf8($string)
    {
        if ($this->getEncoding() === 'utf-8') {
            return $string;
        }

        return $this->convertEncoding($string, $this->getEncoding(), 'UTF-8');
    }

    /**
     * Checks if a given string appears to be valid UTF-8 or not.
     *
     * @param  string $string
     * @return bool
     */
    protected function isUtf8($string)
    {
        // preg_match() with the /u modifier fails on a subject that is not valid UTF-8.
        return $string === '' || preg_match('/^./su', $string) === 1;
    }

    /**
     * Encoding conversion helper which wraps iconv and mbstring where they exist or throws
     * an exception where neither is available.
     *
     * @param  string       $string
     * @param  string       $to
     * @param  array|string $from
     * @throws Exception\RuntimeException If neither iconv nor mbstring is available
     * @return string The converted string, or '' when the conversion fails
     */
    protected function convertEncoding($string, $to, $from)
    {
        if (function_exists('iconv')) {
            $result = iconv($from, $to, $string);
        } elseif (function_exists('mb_convert_encoding')) {
            $result = mb_convert_encoding($string, $to, $from);
        } else {
            throw new Exception\RuntimeException(
                get_class($this)
                . ' requires either the iconv or mbstring extension to be installed'
                . ' when escaping for non UTF-8 strings.'
            );
        }

        if ($result === false) {
            return ''; // return non-fatal blank string on encoding errors from users
        }
        return $result;
    }
}