Tuesday, July 6, 2010

How to truncate an HTML/XHTML/XML snippet in PHP

<?php
/**
 * Escape the five XML special characters in a string.
 *
 * Replaces &, ", ', < and > with &amp;, &quot;, &apos;, &lt; and &gt;.
 * Existing entities are not recognised: an input that already contains
 * "&amp;" comes back as "&amp;amp;".
 *
 * @param string $xml Raw text to escape.
 * @return string The escaped text.
 */
function xmlentities($xml) {
    // '&' must stay first: str_replace applies the pairs in order, and the
    // later replacements introduce ampersands that must not be re-escaped.
    $entityMap = array(
        '&' => '&amp;',
        '"' => '&quot;',
        "'" => '&apos;',
        '<' => '&lt;',
        '>' => '&gt;',
    );

    $escaped = str_replace(array_keys($entityMap), array_values($entityMap), $xml);

    return $escaped;
}

/**
 * Truncate text on a word boundary so that it fits within a byte budget.
 *
 * The text is split on runs of spaces, newlines and tabs. Words are then
 * re-joined, each one preceded by a single space (so a non-empty result
 * always starts with a space, and that space counts towards the limit).
 * As soon as the next word would push the result past $char_limit, $append
 * is added and processing stops. $append itself is not counted against the
 * limit, and lengths are measured with strlen(), i.e. in bytes.
 *
 * Unlike a strtok()-based loop, this implementation does not touch PHP's
 * global strtok() state, so it is safe to call while the caller is in the
 * middle of its own strtok() tokenisation.
 *
 * @param string $text       Text to shorten.
 * @param int    $char_limit Maximum length of the joined words, in bytes.
 * @param string $append     Marker added when the text had to be cut.
 * @return string The whitespace-normalised, possibly truncated text.
 */
function char_limit($text, $char_limit, $append = '...') {
    // Same delimiter set as before (" \n\t"); empty pieces are dropped so
    // consecutive delimiters collapse exactly as they did with strtok().
    $words = preg_split('/[ \n\t]+/', (string) $text, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false) {
        $words = array();
    }

    $result = '';

    foreach ($words as $word) {
        // +1 accounts for the separating space that precedes every word.
        if (strlen($word) + strlen($result) + 1 > $char_limit) {
            $result .= $append;
            break;
        }

        $result .= ' ' . $word;
    }

    return $result;
}

/**
 * Handler set for PHP's event-based XML parser that re-serialises the parsed
 * markup into $out while capping the amount of text emitted.
 *
 * Tags and attributes are copied through; character data is counted (in
 * bytes, via strlen) against $maxChars. When a run of character data would
 * reach the limit it is cut on a word boundary with char_limit(), $append is
 * written, every element still open is closed, and all later events are
 * ignored.
 *
 * Relies on the xmlentities() and char_limit() functions defined in this file.
 */
class TruncatingParser {
    /** Names of elements that have been opened in $out and not yet closed. */
    public $tagStack = array();

    /** Character data collected since the last flush; null when none is pending. */
    public $bufferedCdata = null;

    /** The serialised, possibly truncated markup produced so far. */
    public $out = '';

    /** Raw string written right after the cut point (not escaped, not counted). */
    public $append = '';

    /** Text budget in bytes; with the default of -1 nothing is ever emitted. */
    public $maxChars = -1;

    /** Bytes of character data emitted so far; set to $maxChars once truncated. */
    public $currentChars = 0;

    /**
     * Start-tag handler: writes the opening tag and its attributes.
     *
     * @param mixed  $parser  Parser handle supplied by the XML extension (unused).
     * @param string $tagName Element name.
     * @param array  $attrs   Attribute name => value map.
     */
    public function startElement($parser, $tagName, $attrs) {
        // Text that preceded this tag must be written (and counted) first.
        $this->flushCharacterData();

        if ($this->currentChars >= $this->maxChars) {
            return;
        }

        $this->out .= '<' . $tagName;

        foreach ($attrs as $key => $value) {
            $this->out .= ' ' . $key . '="' . xmlentities($value) . '"';
        }

        $this->out .= '>';

        $this->tagStack[] = $tagName;
    }

    /**
     * End-tag handler: writes the closing tag unless output was already cut
     * (in which case flushCharacterData() has closed every open element).
     *
     * @param mixed  $parser  Parser handle supplied by the XML extension (unused).
     * @param string $tagName Element name.
     */
    public function endElement($parser, $tagName) {
        $this->flushCharacterData();

        if ($this->currentChars >= $this->maxChars) {
            return;
        }

        $this->out .= '</' . $tagName . '>';
        array_pop($this->tagStack);
    }

    /**
     * Character-data handler: buffers text until the next tag event, so that
     * consecutive chunks are measured and truncated as one run.
     *
     * @param mixed  $parser Parser handle supplied by the XML extension (unused).
     * @param string $data   A chunk of character data.
     */
    public function characterData($parser, $data) {
        if ($this->currentChars >= $this->maxChars) {
            return;
        }

        // Appending to null yields the chunk itself, so no special first-chunk case.
        $this->bufferedCdata .= $data;
    }

    /**
     * Write the buffered character data to $out, truncating it and closing
     * all open elements if it would reach the limit. Always clears the buffer.
     */
    public function flushCharacterData() {
        $text = $this->bufferedCdata;
        $this->bufferedCdata = null;

        if ($text === null || $text === '' || $this->currentChars >= $this->maxChars) {
            return;
        }

        $length = strlen($text);

        if ($this->currentChars + $length >= $this->maxChars) {
            $remaining = $this->maxChars - $this->currentChars;
            $this->out .= xmlentities(char_limit($text, $remaining, ''));
            $this->out .= $this->append;

            // Mark the budget as exhausted so every later event is ignored.
            $this->currentChars = $this->maxChars;

            // Close whatever is still open, innermost element first.
            while (count($this->tagStack) > 0) {
                $this->out .= '</' . array_pop($this->tagStack) . '>';
            }
        } else {
            // The buffered text is written with the same escaping as the
            // truncated branch; emitting it raw would let characters such as
            // '<' and '&' through unescaped and corrupt the markup.
            $this->out .= xmlentities($text);
            $this->currentChars += $length;
        }
    }
}

/**
 * Truncate an HTML/XHTML/XML snippet to a text budget without breaking tags.
 *
 * The snippet is wrapped in a temporary <div> root, parsed with PHP's XML
 * parser and re-serialised through a TruncatingParser, which cuts the text
 * on a word boundary and closes any elements left open. The temporary root
 * is stripped again before returning.
 *
 * @param string $text        Markup snippet; once wrapped in <div>...</div>
 *                            it has to parse as XML.
 * @param int    $char_limit  Maximum amount of character data to keep
 *                            (measured in bytes by TruncatingParser).
 * @param string $append_text Raw string inserted at the cut point.
 * @return string The truncated markup, or '' if the snippet could not be
 *                parsed or nothing was emitted.
 */
function xml_char_limit($text, $char_limit, $append_text = '') {
    $parser = new TruncatingParser();
    $parser->maxChars = $char_limit;
    $parser->append = $append_text;

    $xml_parser = xml_parser_create();
    // Keep tag names as written instead of upper-casing them.
    xml_parser_set_option($xml_parser, XML_OPTION_CASE_FOLDING, 0);

    // Bind the handlers with callable arrays. This replaces the former
    // xml_set_object($xml_parser, &$parser) call: call-time pass-by-reference
    // is a fatal error since PHP 5.4.
    xml_set_element_handler(
        $xml_parser,
        array($parser, 'startElement'),
        array($parser, 'endElement')
    );
    xml_set_character_data_handler($xml_parser, array($parser, 'characterData'));

    $parsed = xml_parse($xml_parser, '<div>' . $text . '</div>', true);

    // Release the parser on every path, including parse failure. From PHP 8.0
    // the parser is an object that is freed automatically, so only legacy
    // resource handles need the explicit call.
    if (is_resource($xml_parser)) {
        xml_parser_free($xml_parser);
    }

    if (!$parsed) {
        return '';
    }

    $out = $parser->out;

    // Strip out the temporary root tag that was added: 5 bytes of "<div>"
    // at the front and 6 bytes of "</div>" at the end.
    $inner = substr($out, 5, strlen($out) - 11);

    // When nothing was emitted (e.g. a limit <= 0) older PHP versions return
    // false from substr(); always hand back a string.
    return ($inner === false) ? '' : $inner;
}
?>