[a-zA-Z0-9_]+)?(?:\#(?[a-zA-Z0-9_]+))?\)';
    // NOTE(review): the start of the constant above lies outside the reviewed
    // range and is left untouched. _extractAttributes() reads the named groups
    // "attr_class" and "attr_id", so this pattern has to define them - the
    // visible "(?[" looks like a stripped "(?<attr_id>". Please verify.

    const MATCH_ATTR_STYLE = "\{(?<attr_style>[^\}\n]+)\}";
    const MATCH_ATTR_LANG  = '\[(?<attr_lang>[a-zA-Z_]+)\]';
    const MATCH_ATTR_ALIGN = '(?<attr_align>\<\>?|\>|=)';

    /**
     * Token tree
     *
     * @var Zend_Markup_TokenList
     */
    protected $_tree;

    /**
     * Current token
     *
     * @var Zend_Markup_Token
     */
    protected $_current;

    /**
     * Source to tokenize
     *
     * @var string
     */
    protected $_value = '';

    /**
     * Length of the value in bytes (the pointer is a byte offset as well)
     *
     * @var int
     */
    protected $_valueLen = 0;

    /**
     * Current pointer (byte offset into the value)
     *
     * @var int
     */
    protected $_pointer = 0;

    /**
     * The buffer
     *
     * @var string
     */
    protected $_buffer = '';

    /**
     * Paragraph token that is currently being assembled by the tokenizer
     *
     * @var array
     */
    protected $_temp = array();

    /**
     * Simple tag translation
     *
     * @var array
     */
    protected $_simpleTags = array(
        '*'  => 'strong',
        '**' => 'bold',
        '_'  => 'emphasized',
        '__' => 'italic',
        '??' => 'citation',
        '-'  => 'deleted',
        '+'  => 'insert',
        '^'  => 'superscript',
        '~'  => 'subscript',
        '%'  => 'span',
        // these are a little more complicated
        '@'  => 'code',
        '!'  => 'img',
    );

    /**
     * Token array
     *
     * @var array
     */
    protected $_tokens = array();

    /**
     * Prepare the parsing of a Textile string, the real work is done in
     * {@link _tokenize()} and {@link _createTree()}
     *
     * @param  string $value
     *
     * @throws Zend_Markup_Parser_Exception when the value is not a string or is an empty string
     * @return Zend_Markup_TokenList
     */
    public function parse($value)
    {
        if (!is_string($value)) {
            /**
             * @see Zend_Markup_Parser_Exception
             */
            require_once 'Zend/Markup/Parser/Exception.php';
            throw new Zend_Markup_Parser_Exception('Value to parse should be a string.');
        }

        // compare against '' explicitly: empty() would also reject the valid input "0"
        if ($value === '') {
            /**
             * @see Zend_Markup_Parser_Exception
             */
            require_once 'Zend/Markup/Parser/Exception.php';
            throw new Zend_Markup_Parser_Exception('Value to parse cannot be left empty.');
        }

        // first make sure we only have LF newlines, also trim the value
        $this->_value = str_replace(array("\r\n", "\r"), "\n", $value);
        $this->_value = trim($this->_value);

        // initialize variables and tokenize; the length has to be counted in
        // bytes because the pointer is advanced with strlen() and is handed to
        // preg_match() as a byte offset
        $this->_valueLen = strlen($this->_value);
        $this->_pointer  = 0;
        $this->_buffer   = '';
        $this->_temp     = array();
        $this->_tokens   = array();

        $this->_tokenize();

        // create the tree
        $this->_tree = new Zend_Markup_TokenList();

        $this->_current = new Zend_Markup_Token('', Zend_Markup_Token::TYPE_NONE, 'Zend_Markup_Root');
        $this->_tree->addChild($this->_current);

        $this->_createTree();

        return $this->_tree;
    }

    /**
     * Tokenize a textile string
     *
     * Walks over the value with a small state machine and fills
     * {@link $_tokens} with plain arrays describing text and tag tokens.
     *
     * @return void
     */
    protected function _tokenize()
    {
        $state = self::STATE_NEW_PARAGRAPH;

        $attrsMatch = implode('|', array(
            self::MATCH_ATTR_CLASSID,
            self::MATCH_ATTR_STYLE,
            self::MATCH_ATTR_LANG,
            self::MATCH_ATTR_ALIGN
        ));

        while ($this->_pointer < $this->_valueLen) {
            switch ($state) {
                case self::STATE_SCAN:
                    $matches = array();
                    // the group names are read below as $matches['acronym'] and $matches['title']
                    $acronym = '(?<acronym>[A-Z]{2,})\((?<title>[^\)]+)\)';
                    $regex   = '#\G(?<text>.*?)(?:'
                             . "(?:(?<nl_paragraph>\n{2,})|(?<nl_break>\n))|"
                             . '(?<tag>'
                             . "(?<name>\*{1,2}|_{1,2}|\?{2}|\-|\+|\~|\^|%|@|!|$|{$acronym}"
                             . '|":(?<url>[^\s]+)|")'
                             . "(?:{$attrsMatch})*)"
                             . ')#si';

                    if (!preg_match($regex, $this->_value, $matches, 0, $this->_pointer)) {
                        // the pattern could not be applied (e.g. a PCRE error);
                        // treat the remainder as plain text so the pointer
                        // still advances and the loop terminates
                        $rest    = substr($this->_value, $this->_pointer);
                        $matches = array(0 => $rest, 'text' => $rest);
                    }

                    $this->_pointer += strlen($matches[0]);

                    if ($matches['text'] !== '') {
                        $this->_buffer .= $matches['text'];
                    }

                    // first add the buffer ("0" is valid text, so no empty() here)
                    if ($this->_buffer !== '') {
                        $this->_tokens[] = array(
                            'tag'  => $this->_buffer,
                            'type' => Zend_Markup_Token::TYPE_NONE
                        );

                        $this->_buffer = '';
                    }

                    if (!empty($matches['nl_paragraph'])) {
                        // remember the newlines, the paragraph token itself is
                        // emitted by the STATE_NEW_PARAGRAPH handler
                        $this->_temp = array(
                            'tag'        => $matches['nl_paragraph'],
                            'name'       => 'p',
                            'type'       => Zend_Markup_Token::TYPE_TAG,
                            'attributes' => array()
                        );

                        $state = self::STATE_NEW_PARAGRAPH;
                    } elseif (!empty($matches['nl_break'])) {
                        $this->_tokens[] = array(
                            'tag'        => $matches['nl_break'],
                            'name'       => 'break',
                            'type'       => Zend_Markup_Token::TYPE_TAG,
                            'attributes' => array()
                        );

                        $state = self::STATE_NEWLINE;
                    } elseif (!empty($matches['tag'])) {
                        if (isset($this->_simpleTags[$matches['name']])) {
                            // now add the new token
                            $this->_tokens[] = array(
                                'tag'        => $matches['tag'],
                                'type'       => Zend_Markup_Token::TYPE_TAG,
                                'name'       => $this->_simpleTags[$matches['name']],
                                'attributes' => $this->_extractAttributes($matches)
                            );
                        } else {
                            $attributes = $this->_extractAttributes($matches);

                            if ($matches['tag'][0] === '"') {
                                // a link: '"' opens it, '":<url>' closes it
                                if (isset($matches['url'])) {
                                    $attributes['url'] = $matches['url'];
                                }

                                $this->_tokens[] = array(
                                    'tag'        => $matches['tag'],
                                    'type'       => Zend_Markup_Token::TYPE_TAG,
                                    'name'       => 'url',
                                    'attributes' => $attributes
                                );
                            } else {
                                // an acronym is emitted as opening tag, text and closing tag
                                $this->_tokens[] = array(
                                    'tag'        => '',
                                    'type'       => Zend_Markup_Token::TYPE_TAG,
                                    'name'       => 'acronym',
                                    'attributes' => array(
                                        'title' => $matches['title']
                                    )
                                );
                                $this->_tokens[] = array(
                                    'tag'  => $matches['acronym'],
                                    'type' => Zend_Markup_Token::TYPE_NONE
                                );
                                $this->_tokens[] = array(
                                    'tag'        => '(' . $matches['title'] . ')',
                                    'type'       => Zend_Markup_Token::TYPE_TAG,
                                    'name'       => 'acronym',
                                    'attributes' => array()
                                );
                            }
                        }

                        $state = self::STATE_SCAN;
                    }
                    break;

                case self::STATE_NEW_PARAGRAPH:
                    if (empty($this->_temp)) {
                        // very first paragraph of the value
                        $this->_temp = array(
                            'tag'        => '',
                            'name'       => 'p',
                            'type'       => Zend_Markup_Token::TYPE_TAG,
                            'attributes' => array()
                        );
                    } else {
                        // the first newline closes the previous paragraph, the
                        // remaining newlines belong to the new paragraph tag
                        $this->_tokens[] = array(
                            'tag'        => "\n",
                            'name'       => 'p',
                            'type'       => Zend_Markup_Token::TYPE_TAG,
                            'attributes' => array()
                        );

                        $this->_temp['tag'] = substr($this->_temp['tag'], 1);
                    }

                    $matches = array();
                    $regex   = "#\G(?<name>(h[1-6]|p)|(?:\#|\*))(?:{$attrsMatch})*(?(2)\.\s|\s)#i";

                    if (!preg_match($regex, $this->_value, $matches, 0, $this->_pointer)) {
                        // no explicit block signature, open a plain paragraph
                        $this->_tokens[] = $this->_temp;
                        $state           = self::STATE_SCAN;
                        break;
                    }

                    $this->_pointer += strlen($matches[0]);

                    if ($matches['name'] === 'p') {
                        $this->_temp['tag']       .= $matches[0];
                        $this->_temp['attributes'] = $this->_extractAttributes($matches);

                        $this->_tokens[] = $this->_temp;
                        $this->_temp     = array();
                    } else {
                        $this->_tokens[] = $this->_temp;
                        $this->_temp     = array();

                        $this->_tokens[] = $this->_createBlockToken($matches);
                    }

                    $state = self::STATE_SCAN;
                    break;

                case self::STATE_NEWLINE:
                    $matches = array();
                    $regex   = "#\G(?<name>(h[1-6])|(?:\#|\*))(?:{$attrsMatch})*(?(2)\.\s|\s)#si";

                    if (!preg_match($regex, $this->_value, $matches, 0, $this->_pointer)) {
                        $state = self::STATE_SCAN;
                        break;
                    }

                    $this->_pointer += strlen($matches[0]);

                    // the state is left unchanged here; the next iteration
                    // falls back to STATE_SCAN once this pattern stops matching
                    $this->_tokens[] = $this->_createBlockToken($matches);
                    break;
            }
        }
    }

    /**
     * Build the token array for a heading or list signature
     *
     * "#" and "*" are both translated to a "list" token, "#" additionally
     * gets the attribute list=decimal. Any other name is used as it was matched.
     *
     * @param  array $matches Result of one of the block signature patterns
     *
     * @return array
     */
    protected function _createBlockToken(array $matches)
    {
        $name       = $matches['name'];
        $attributes = $this->_extractAttributes($matches);

        if ($name === '#') {
            $name               = 'list';
            $attributes['list'] = 'decimal';
        } elseif ($name === '*') {
            $name = 'list';
        }

        return array(
            'tag'        => $matches[0],
            'name'       => $name,
            'type'       => Zend_Markup_Token::TYPE_TAG,
            'attributes' => $attributes
        );
    }

    /**
     * Create a tree from the tokenized text
     *
     * Consumes {@link $_tokens} and attaches Zend_Markup_Token objects below
     * {@link $_current}, which parse() points at the root token beforehand.
     *
     * @return void
     */
    protected function _createTree()
    {
        // false while inside an img/url tag: tags are then added as plain text
        $inside = true;

        foreach ($this->_tokens as $key => $token) {
            // first check if the token is a stopper
            if ($this->_isStopper($token, $this->_current)) {
                if ($this->_current->getName() == 'li') {
                    // list items are handled differently
                    if (isset($this->_tokens[$key + 1])
                        && ($this->_tokens[$key + 1]['type'] == Zend_Markup_Token::TYPE_TAG)
                        && ($this->_tokens[$key + 1]['name'] == 'list')
                    ) {
                        // the next item is a correct tag
                        $this->_current->setStopper($token['tag']);

                        $this->_current = $this->_current->getParent();
                    } else {
                        // close the list
                        $this->_current->setStopper($token['tag']);

                        $this->_current = $this->_current->getParent()->getParent();

                        // go up in the tree until we found the end
                        while ($this->_isStopper($token, $this->_current)) {
                            $this->_current->setStopper($token['tag']);

                            $this->_current = $this->_current->getParent();
                        }
                    }
                } else {
                    // go up in the tree until we found the end of stoppers
                    while ($this->_isStopper($token, $this->_current)) {
                        $this->_current->setStopper($token['tag']);

                        if (!empty($token['attributes'])) {
                            foreach ($token['attributes'] as $name => $value) {
                                $this->_current->addAttribute($name, $value);
                            }
                        }

                        $this->_current = $this->_current->getParent();
                    }
                }

                $inside = true;
            } elseif (($token['type'] == Zend_Markup_Token::TYPE_TAG) && $inside) {
                if ($token['name'] == 'break') {
                    // add the newline and continue parsing
                    $this->_current->addChild(new Zend_Markup_Token(
                        $token['tag'],
                        Zend_Markup_Token::TYPE_NONE,
                        '',
                        array(),
                        $this->_current
                    ));
                } else {
                    // handle a list item
                    if ($token['name'] == 'list') {
                        $attributes = array();
                        if (isset($token['attributes']['list'])) {
                            // the list type belongs to the list, not to the item
                            $attributes['list'] = $token['attributes']['list'];
                            unset($token['attributes']['list']);
                        }

                        if ($this->_current->getName() != 'list') {
                            // the list isn't started yet, create it
                            $child = new Zend_Markup_Token(
                                '',
                                Zend_Markup_Token::TYPE_TAG,
                                'list',
                                $attributes,
                                $this->_current
                            );

                            $this->_current->addChild($child);

                            $this->_current = $child;
                        }
                        $token['name'] = 'li';
                    } elseif (($token['name'] == 'img') || ($token['name'] == 'url')) {
                        $inside = false;
                    }

                    // add the token
                    $child = new Zend_Markup_Token(
                        $token['tag'],
                        Zend_Markup_Token::TYPE_TAG,
                        $token['name'],
                        $token['attributes'],
                        $this->_current
                    );

                    $this->_current->addChild($child);

                    $this->_current = $child;
                }
            } else {
                // simply add the token as text
                $this->_current->addChild(new Zend_Markup_Token(
                    $token['tag'],
                    Zend_Markup_Token::TYPE_NONE,
                    '',
                    array(),
                    $this->_current
                ));
            }
        }
    }

    /**
     * Check if a tag is a stopper
     *
     * Headings, lists and list items are closed by a "break" or "p" tag
     * token, a "break" is never closed, and every other token is closed by a
     * tag token carrying its own name.
     *
     * @param  array             $token
     * @param  Zend_Markup_Token $current
     *
     * @return bool
     */
    protected function _isStopper(array $token, Zend_Markup_Token $current)
    {
        switch ($current->getName()) {
            case 'h1':
            case 'h2':
            case 'h3':
            case 'h4':
            case 'h5':
            case 'h6':
            case 'list':
            case 'li':
                if (($token['type'] == Zend_Markup_Token::TYPE_TAG)
                    && (($token['name'] == 'break') || ($token['name'] == 'p'))
                ) {
                    return true;
                }
                break;
            case 'break':
                return false;
            default:
                if (($token['type'] == Zend_Markup_Token::TYPE_TAG) && ($token['name'] == $current->getName())) {
                    return true;
                }
                break;
        }

        return false;
    }

    /**
     * Extract the attributes
     *
     * Translates the named groups attr_class, attr_id, attr_style, attr_lang
     * and attr_align of a match into class, id, style, lang and align.
     *
     * @param  array $matches
     *
     * @return array
     */
    protected function _extractAttributes(array $matches)
    {
        $attributes = array();

        $groups = array(
            'attr_class' => 'class',
            'attr_id'    => 'id',
            'attr_style' => 'style',
            'attr_lang'  => 'lang',
        );

        foreach ($groups as $group => $attribute) {
            // unmatched groups are missing or ''; "0" is a valid value, so no empty()
            if (isset($matches[$group]) && $matches[$group] !== '') {
                $attributes[$attribute] = $matches[$group];
            }
        }

        if (!empty($matches['attr_align'])) {
            switch ($matches['attr_align']) {
                case '=':
                    $attributes['align'] = 'center';
                    break;
                case '>':
                    $attributes['align'] = 'right';
                    break;
                case '<>':
                    $attributes['align'] = 'justify';
                    break;
                default:
                case '<':
                    $attributes['align'] = 'left';
                    break;
            }
        }

        return $attributes;
    }
}