Utf8.php #1

<?php
/**
 * Perforce Swarm
 *
 * @copyright   2012 Perforce Software. All rights reserved.
 * @license     Please see LICENSE.txt in top-level folder of this distribution.
 * @version     <release>/<patch>
 */

namespace P4\Filter;

/**
 * Filter that makes text safe to treat as UTF-8.
 *
 * Two independent steps are available:
 *  - 'convert': input with high bytes but no UTF-8 looking sequence is assumed
 *    to be Windows-1252 (or Mac OS Roman) and converted to UTF-8.
 *  - 'replace': every invalid UTF-8 byte sequence is replaced with '�'.
 */
class Utf8
{
    const REPLACE_INVALID       = 'replace';
    const CONVERT_ENCODING      = 'convert';

    protected $options          = array(
        'replace'               => true,
        'convert'               => false
    );

    /**
     * Constructor, can optionally take options.
     *
     * When an options array is given its 'convert' entry is honoured; the
     * positional $convert argument only overrides it when explicitly passed.
     *
     * @param array|bool    $replaceOrOptions   options array or setting for 'replace invalid'
     * @param bool          $convert            convert enabled/disabled
     */
    public function __construct($replaceOrOptions = true, $convert = false)
    {
        if (is_array($replaceOrOptions)) {
            $this->setOptions($replaceOrOptions);

            // do not clobber the array's 'convert' entry with the default value
            // of $convert; only an explicitly passed second argument wins.
            if (func_num_args() > 1) {
                $this->setConvertEncoding($convert);
            }

            return;
        }

        $this->setReplaceInvalid($replaceOrOptions);
        $this->setConvertEncoding($convert);
    }

    /**
     * Sets all options for this filter. Options missing from the passed
     * array fall back to their defaults (replace: true, convert: false).
     *
     * @param   array   $options    options to use
     * @return  $this   to maintain a fluent interface
     */
    public function setOptions($options)
    {
        // array union keeps the caller's entries and only fills in missing keys
        $this->options = $options + array(
            static::REPLACE_INVALID  => true,
            static::CONVERT_ENCODING => false
        );

        return $this;
    }

    /**
     * Returns all options for this filter
     *
     * @return  array   options
     */
    public function getOptions()
    {
        return $this->options;
    }

    /**
     * Used to update the 'replace invalid' option.
     *
     * If enabled, invalid UTF-8 byte sequences will be replaced with
     * the replacement character '�'.
     *
     * @param   bool    $enabled    pass true/false to enable disable
     * @return  Utf8    provides a fluent interface
     */
    public function setReplaceInvalid($enabled)
    {
        $this->options[static::REPLACE_INVALID] = (bool) $enabled;
        return $this;
    }

    /**
     * Returns the current replace invalid setting. Default is true.
     *
     * @return  bool    true if invalid bytes will be replaced false otherwise
     */
    public function getReplaceInvalid()
    {
        return $this->options[static::REPLACE_INVALID];
    }

    /**
     * Used to update the 'convert encoding' option.
     *
     * If enabled, input which contain high bytes but no valid utf-8 character
     * sequences will be assumed to be Windows-1252 (ISO-8859-1 compatible)
     * and converted to UTF-8 prior to further processing.
     *
     * @param   bool    $enabled    pass true/false to enable disable
     * @return  Utf8    provides a fluent interface
     */
    public function setConvertEncoding($enabled)
    {
        $this->options[static::CONVERT_ENCODING] = (bool) $enabled;
        return $this;
    }

    /**
     * Returns the current convert setting. Default is false.
     *
     * @return  bool    true if encoding conversion will be attempted false otherwise
     */
    public function getConvertEncoding()
    {
        return $this->options[static::CONVERT_ENCODING];
    }

    /**
     * This filter performs two primary operations; both can be enabled/disabled.
     *
     * The filter will check for strings containing high bits but no valid UTF-8
     * characters. These will be converted from Windows-1252 to UTF-8. This feature
     * is disabled by default.
     *
     * This filter will also replace all invalid utf-8 byte sequences with the
     * character '�'. This feature is enabled by default.
     *
     * Arrays are processed recursively (values only, keys are left untouched)
     * and non-string values are returned unchanged.
     *
     * @param  mixed    $value  utf8 text (or array of text) that may contain invalid byte sequences.
     * @return mixed    filtered string, array with filtered values, or the
     *                  untouched input when it is neither a string nor an array.
     */
    public function filter($value)
    {
        // if we were passed an array, recursively sanitize its values.
        if (is_array($value)) {
            foreach ($value as $key => $element) {
                $value[$key] = $this->filter($element);
            }

            return $value;
        }

        // only sanitize strings.
        if (!is_string($value)) {
            return $value;
        }

        // conversion must run first; it may leave invalid sequences behind
        // which the replace step then cleans up.
        if ($this->getConvertEncoding()) {
            $value = $this->convertEncoding($value);
        }

        if ($this->getReplaceInvalid()) {
            $value = $this->replaceInvalid($value);
        }

        return $value;
    }

    /**
     * For inputs which contain high bytes but _no_ valid UTF-8 byte sequences
     * this method will assume the input is in Windows-1252 (ISO-8859-1 compatible)
     * encoding and convert it to UTF-8 assuming mb_convert_encoding or iconv are
     * available. If the conversion fails the input is returned unchanged.
     *
     * @param   string  $value  text that may be in UTF-8, Windows-1252 or ISO-8859-1 format.
     * @return  string  utf8 version of text; invalid byte sequences may be present.
     */
    protected function convertEncoding($value)
    {
        // nothing to do if we lack support for encoding conversion
        if (!function_exists('mb_convert_encoding') && !function_exists('iconv')) {
            return $value;
        }

        // if no high bytes are present, no conversion needed
        if (!preg_match('/[\x80-\xFF]/', $value)) {
            return $value;
        }

        // start by checking if the input has at least one utf-8 looking sequence in it
        $hasUtf8 = preg_match(
            '/(
              [\xC0-\xDF][\x80-\xBF]                            # Two byte sequence
            | [\xE0-\xEF][\x80-\xBF]{2}                         # Three byte
            | [\xF0-\xF7][\x80-\xBF]{3}                         # Four byte
            | [\xF8-\xFB][\x80-\xBF]{4}                         # Five byte
            | [\xFC-\xFD][\x80-\xBF]{5}                         # Six byte
            )/x',
            $value
        );

        // if we have a single utf-8 looking sequence, assume utf-8 input
        if ($hasUtf8) {
            return $value;
        }

        // start with a 'from' of windows-1252
        $from = 'windows-1252';

        // last ditch check; if invalid Windows-1252 bytes are present swap
        // to mac os roman, it just might be that :)
        if (preg_match('/[\x81\x8D\x8F\x90\x9D]/', $value)) {
            $from = 'macintosh';
        }

        // convert from input format, mb_convert doesn't support macintosh
        // but is otherwise our first choice as we expect its faster.
        $converted = false;
        if ($from !== 'macintosh' && function_exists('mb_convert_encoding')) {
            $converted = mb_convert_encoding($value, 'utf-8', $from);
        } elseif (function_exists('iconv')) {
            $converted = iconv($from, 'utf-8', $value);
        }

        // iconv returns false on failure (e.g. unsupported charset); keep the
        // original bytes in that case rather than passing false along.
        return is_string($converted) ? $converted : $value;
    }

    /**
     * This method will replace all invalid utf-8 byte sequences with the character '�'.
     *
     * Besides stray, truncated and overlong sequences this also rejects encoded
     * UTF-16 surrogates and code points above U+10FFFF.
     *
     * @param   string  $value  utf8 text that may contain invalid byte sequences.
     * @return  string  valid utf8 version of text; invalid bytes replaced with '�'.
     */
    protected function replaceInvalid($value)
    {
        // we use a regex instead of mb_convert_encoding as the mbstring module may not be present and
        // our replacement character is ignored on php 5.3. we can't use iconv as it too might not be
        // present and it simply drops bad bytes.
        //
        // the overlong, surrogate and out of range rules consume the whole sequence when it is
        // complete; otherwise the trailing continuation bytes would be exempted by the look-behind
        // rules further down and survive as invalid output.
        $value = preg_replace(
            '/(
              [\xC0-\xC1]                                           # Invalid UTF-8 Bytes
            | [\xF5-\xFF]                                           # Invalid UTF-8 Bytes
            | \xE0[\x80-\x9F][\x80-\xBF]?                           # Overlong 3 byte form plus its trailing byte
            | \xF0[\x80-\x8F](?:[\x80-\xBF]{2})?                    # Overlong 4 byte form plus its trailing bytes
            | \xED[\xA0-\xBF][\x80-\xBF]                            # Encoded UTF-16 surrogate
            | \xF4[\x90-\xBF][\x80-\xBF]{2}                         # Code point beyond 10FFFF
            | [\xC2-\xDF](?![\x80-\xBF])                            # Invalid UTF-8 Sequence Start
            | [\xE0-\xEF](?![\x80-\xBF]{2})                         # Invalid UTF-8 Sequence Start
            | [\xF0-\xF4](?![\x80-\xBF]{3})                         # Invalid UTF-8 Sequence Start
            | (?<=[\x0-\x7F\xF5-\xFF])[\x80-\xBF]                   # Invalid UTF-8 Sequence Middle
            | (?<![\xC2-\xDF]|[\xE0-\xEF]|[\xE0-\xEF][\x80-\xBF]    # Overlong Sequence
                |[\xF0-\xF4]|[\xF0-\xF4][\x80-\xBF]|[\xF0-\xF4]
                [\x80-\xBF]{2})[\x80-\xBF]
            | (?<=[\xE0-\xEF])[\x80-\xBF](?![\x80-\xBF])            # Short 3 byte sequence
            | (?<=[\xF0-\xF4])[\x80-\xBF](?![\x80-\xBF]{2})         # Short 4 byte sequence
            | (?<=[\xF0-\xF4][\x80-\xBF])[\x80-\xBF](?![\x80-\xBF]) # Short 4 byte sequence (2)
            )/x',
            "�",
            $value
        );

        return $value;
    }
}
#	Change	User	Description	Committed
#1	18334	Liz Lam	initial add of jambox