/**
 * P4Cms Content Lucene Document.
 *
 * - Allows a Zend Search Lucene document to be created from a content entry.
 * - Determines whether a P4Cms Content field should be indexed.
 * - Specifies how a Content field should be indexed.
 *
 * @copyright   2011 Perforce Software. All rights reserved.
 * @license     Please see LICENSE.txt in top-level folder of this distribution.
 * @version     <release>/<patch>
 * @todo        consider extracting additional metadata for certain document types
 */
class P4Cms_Content_LuceneDocument extends Zend_Search_Lucene_Document
/**
 * The content entry this lucene document represents.
 * Assigned by the constructor and exposed via getContentEntry().
 *
 * @var P4Cms_Content|null
 */
protected $_content = null;
/**
 * Make a new lucene document instance for the given content entry.
 *
 * Keeps a reference to the entry and then converts the entry's fields
 * into lucene document fields so the document is ready to be indexed.
 *
 * @param   P4Cms_Content   $content    the content entry to make a lucene document for.
 */
public function __construct(P4Cms_Content $content)
{
    $this->_content = $content;

    // setup the lucene document fields.
    $this->_loadFields();
}
/**
 * Accessor for the content entry that backs this Lucene document.
 *
 * @return  P4Cms_Content   the content entry this lucene document represents.
 */
public function getContentEntry()
{
    $entry = $this->_content;

    return $entry;
}
/**
 * Convert the content fields into lucene document fields.
 *
 * Walks every field reported by _getContentFields(), skips those whose
 * definition disables indexing, converts the remainder and adds each
 * successfully converted field to this document.
 *
 * @return  void
 */
protected function _loadFields()
{
    foreach ($this->_getContentFields() as $name => $data) {
        // skip fields that should not be indexed.
        if ($this->_isIndexDisabled($name, $data)) {
            continue;
        }

        // convert and add field to document; conversion yields null
        // for values that cannot be turned into a lucene field.
        $field = $this->_toLuceneField($name, $data);
        if ($field instanceof Zend_Search_Lucene_Field) {
            $this->addField($field);
        }
    }
}
/**
 * Build the complete list of fields for the content entry.
 *
 * The list starts with the built-in fields (uri, title, excerpt, contentId,
 * contentType, resource, privilege), is merged with the element definitions
 * of the entry's content type, and is finally populated with the values and
 * metadata held by the entry itself. If a field's metadata carries a
 * filename and no 'filename' field exists yet, one is added.
 *
 * @return  array   the list of all content fields and their details/values.
 */
protected function _getContentFields()
{
    $entry = $this->getContentEntry();
    $type  = $entry->getContentType();

    // settings shared by several of the built-in fields.
    $unindexed = array('index' => array('type' => 'unindexed'));
    $plainText = array('mimeType' => 'text/plain');

    // start with default/built-in/required fields.
    $fields                = array();
    $fields['uri']         = array(
        'value'  => $entry->getUri(),
        'search' => array('index' => array('type' => 'keyword'))
    );
    $fields['title']       = array(
        'value'  => $entry->getTitle(),
        'search' => array('index' => array('type' => 'text'))
    );
    $fields['excerpt']     = array(
        'value'  => $entry->getExcerpt(),
        'search' => $unindexed
    );
    $fields['contentId']   = array(
        'value'    => $entry->getId(),
        'search'   => $unindexed,
        'metadata' => $plainText
    );
    $fields['contentType'] = array(
        'value'    => $entry->getContentTypeId(),
        'search'   => $unindexed,
        'metadata' => $plainText
    );
    $fields['resource']    = array(
        'value'  => 'content',
        'search' => $unindexed
    );
    $fields['privilege']   = array(
        'value'  => 'access',
        'search' => $unindexed
    );

    // add the fields from content type.
    $fields = $this->_mergeFields($fields, $type->getElements());

    // add in values and metadata from the content entry.
    foreach ($entry->getValues() as $field => $value) {
        // values without a matching field definition are ignored.
        if (!array_key_exists($field, $fields)) {
            continue;
        }

        $fields[$field]['value']    = $value;
        $fields[$field]['metadata'] = $entry->getFieldMetadata($field);

        // add filename if it does not exist already
        $hasFilename = isset($fields[$field]['metadata']['filename']);
        if ($hasFilename && !array_key_exists('filename', $fields)) {
            $fields['filename']['value']  = $fields[$field]['metadata']['filename'];
            $fields['filename']['search'] = array('index' => array('type' => 'unstored'));
        }
    }

    return $fields;
}
/**
 * Convert from a field definition/value to a lucene document field.
 *
 * The value is written to a temporary file so its mime-type (and, when
 * mbstring is unavailable, its encoding) can be detected. The temporary
 * file is always removed before this method returns or throws.
 *
 * @param   string  $name   the name of the field to convert.
 * @param   array   $data   the details and value of the field.
 * @return  Zend_Search_Lucene_Field|null   lucene document field object or null if we
 *                                          can't create one.
 */
protected function _toLuceneField($name, $data)
{
    // presently we can't do anything reasonable with arrays/objects/etc.
    // in the meantime, we just defend against these data types.
    if (!array_key_exists('value', $data) || !is_scalar($data['value'])) {
        return null;
    }

    // write value to a temp file.
    $tempFile = tempnam(sys_get_temp_dir(), $name);

    try {
        file_put_contents($tempFile, $data['value']);

        // detect mime-type and encoding.
        $data['tempFile'] = $tempFile;
        $encoding         = $this->_detectEncoding($data);
        $data['encoding'] = $encoding ?: 'utf8'; // default to utf8
        $data['mimeType'] = isset($data['metadata']['mimeType'])
            ? $data['metadata']['mimeType']
            : P4Cms_Validate_File_MimeType::getTypeOfFile($tempFile);

        // determine lucene field type.
        $type = $this->_getLuceneFieldType($name, $data);

        // attempt to filter/prepare the value and create a lucene field
        // of the appropriate type; the type doubles as the name of the
        // static factory method on Zend_Search_Lucene_Field.
        try {
            $value = $this->_prepareFieldValue($name, $data);
            $field = Zend_Search_Lucene_Field::$type($name, $value, $data['encoding']);
        } catch (P4Cms_Content_Exception $e) {
            // a value that cannot be prepared is simply not indexed.
            $field = null;
        }
    } catch (Exception $e) {
        // don't leak the temp file when something unexpected goes wrong.
        if (is_string($tempFile) && is_file($tempFile)) {
            unlink($tempFile);
        }
        throw $e;
    }

    // clean-up temp.
    if (is_string($tempFile) && is_file($tempFile)) {
        unlink($tempFile);
    }

    return $field;
}
/**
 * Determine the correct lucene field type to use for the given
 * content field definition/value. Checks for explicit index type
 * in field data - defaults to 'unstored'.
 *
 * Only string values that exactly match one of the known types are
 * honoured; anything else (unknown names, booleans, numbers, arrays)
 * falls back to the default.
 *
 * @param   string  $name   the name of the field to convert.
 * @param   array   $data   the details and value of the field.
 * @return  string  the type of lucene field to use:
 *
 *                    keyword - [ ] tokenized [x] indexed [x] stored
 *                  unindexed - [ ] tokenized [ ] indexed [x] stored
 *                     binary - [ ] tokenized [ ] indexed [x] stored
 *                       text - [x] tokenized [x] indexed [x] stored
 *                   unstored - [x] tokenized [x] indexed [ ] stored
 */
protected function _getLuceneFieldType($name, $data)
{
    $types = array('keyword', 'unindexed', 'binary', 'text', 'unstored');

    $type = isset($data['search']['index']['type'])
        ? $data['search']['index']['type']
        : null;

    // if the field definition specifies a valid type, use it.
    // the comparison is strict because a loose in_array() would accept
    // values such as true, which are not usable as a field type.
    if (is_string($type) && in_array($type, $types, true)) {
        return $type;
    }

    return 'unstored';
}
/**
 * Report whether indexing has been switched off for a field.
 *
 * A field is excluded when its definition carries a truthy
 * search/index/disabled flag.
 *
 * @param   string  $name   the name of the field to be indexed.
 * @param   array   $data   the details and value of the field.
 * @return  bool    true if we should not index this field; false otherwise.
 */
protected function _isIndexDisabled($name, $data)
{
    // !empty() is exactly "is set and truthy".
    return !empty($data['search']['index']['disabled']);
}
/**
 * Get the filters to apply to the given field value before it is indexed.
 * The filters to use can be specified in the content type field definition
 * under search/index/filters; they are handed to a form element as-is so
 * that the form's filter plugin loading resolves them to filter objects.
 *
 * @param   string  $name   the name of the field to be indexed (currently unused,
 *                          kept so the signature stays stable).
 * @param   array   $data   the details and value of the field.
 * @return  array   the set of filters to apply to the field value.
 *
 * @todo    automatically select filters based on mime-type and/or file extension
 *          alternatively, publish as pub/sub topic to collect filters.
 */
protected function _getIndexFilters($name, $data)
{
    // early exit if the field definition does not specify any filters;
    // there is no point building a form just to get an empty list back.
    if (empty($data['search']['index']['filters'])) {
        return array();
    }

    $filters = $data['search']['index']['filters'];

    // use a form with a dummy element to leverage filter plugin loading.
    $form = new P4Cms_Form;
    $form->addElement('text', 'dummy', array('filters' => $filters));

    return $form->getElement('dummy')->getFilters();
}
/**
 * Run a field value through its index filters so it is ready for indexing.
 *
 * Values whose mime-type does not start with 'text/' can only be indexed
 * when at least one filter is configured for the field.
 *
 * @param   string  $name   the name of the field to be indexed.
 * @param   array   $data   the details and value of the field.
 * @return  string  $value  the prepared value.
 * @throws  P4Cms_Content_Exception     if the value cannot be prepared.
 */
protected function _prepareFieldValue($name, $data)
{
    $filters = $this->_getIndexFilters($name, $data);

    // filters are required for non-text values.
    if (empty($filters)) {
        $isText = strpos($data['mimeType'], 'text/') === 0;
        if (!$isText) {
            throw new P4Cms_Content_Exception(
                "Cannot prepare non-plain-text value without filters."
            );
        }
    }

    // feed the value through each filter in turn and return the result.
    $prepared = $data['value'];
    foreach ($filters as $step) {
        $prepared = $step->filter($prepared);
    }

    return $prepared;
}
/**
 * Detect the encoding of a field value.
 *
 * Uses mbstring on the in-memory value when that extension is loaded.
 * Otherwise falls back to fileinfo, reading the charset reported for the
 * temporary file that holds the value.
 *
 * @param   array   $data   the field data; 'value' is read when mbstring is
 *                          loaded, 'tempFile' (path to a file containing the
 *                          value) is read otherwise.
 * @return  string|false    the encoding or false if cannot be detected.
 */
protected function _detectEncoding($data)
{
    if (extension_loaded('mbstring')) {
        return mb_detect_encoding($data['value']);
    }

    // without fileinfo or a file to inspect there is nothing to go on.
    if (!function_exists('finfo_open') || !isset($data['tempFile'])) {
        return false;
    }

    $finfo = finfo_open(FILEINFO_MIME);
    if ($finfo === false) {
        return false;
    }

    // get mime type and encoding for the file,
    // e.g. "text/plain; charset=us-ascii".
    $mime = finfo_file($finfo, $data['tempFile']);

    // release the handle; newer PHP versions hand back an object that is
    // freed automatically, so only resources need closing explicitly.
    if (is_resource($finfo)) {
        finfo_close($finfo);
    }

    if (!is_string($mime)
        || !preg_match('/^(.*)\/(.*); charset=(.*)$/', $mime, $matches)
    ) {
        return false;
    }

    return trim($matches[3]);
}
/**
 * Merge two sets of fields.
 *
 * Options in the base fields will be replaced by the ones from
 * append fields and the default settings in the base will be
 * kept if none is set in the appending fields.
 *
 * This works like array_merge_recursive but instead of making
 * values with the same key an array, the value in the first array
 * is replaced. Scalar values under numeric keys are treated as list
 * items: they are appended to the base unless an identical value
 * (same type and value) is already present.
 *
 * @param   array   $a  the base fields.
 * @param   array   $b  the append fields.
 * @return  array   the merged fields.
 */
protected function _mergeFields($a, $b)
{
    // normalize both sides to arrays; an empty base becomes an empty list.
    if (!is_array($a)) {
        $a = empty($a) ? array() : array($a);
    }
    if (!is_array($b)) {
        $b = array($b);
    }

    foreach ($b as $key => $value) {
        $exists = array_key_exists($key, $a);

        // named keys that are new to the base are copied over as-is.
        if (!$exists && !is_numeric($key)) {
            $a[$key] = $value;
            continue;
        }

        // a numeric key may be missing from the base; treat that as
        // "no current value" rather than reading an undefined offset.
        $current = $exists ? $a[$key] : null;

        if (is_array($value) || is_array($current)) {
            $a[$key] = $this->_mergeFields($current, $value);
        } else if (is_numeric($key)) {
            // strict so that distinct values which merely compare equal
            // (e.g. '1e3' and '1000') are not mistaken for duplicates.
            if (!in_array($value, $a, true)) {
                $a[] = $value;
            }
        } else {
            $a[$key] = $value;
        }
    }

    return $a;
}