Attempt at switching from HTMLPurifier to htmLawed and replacing kses with htmLawed calls; this is done for performance and resource considerations. The common entry point in EGroupware is still html::purify, but htmLawed is now doing the work. Let me know if there are any issues with this change; if we cannot find issues within EGroupware, we will remove the related source code of kses and HTMLPurifier in the next step.

This commit is contained in:
Klaus Leithoff 2012-05-25 12:23:11 +00:00
parent 7e92918f9e
commit 2509d7783f
3 changed files with 105 additions and 228 deletions

View File

@ -80,10 +80,8 @@ class egw_htmLawed
*/
$this->Configuration = array('comment'=>0,
'balance'=>0,
//'keep_bad'=>3,
'balance'=>0,//turn off tag-balancing (config['balance']=>0). That will not introduce any security risk; only standards-compliant tag nesting check/filtering will be turned off (basic tag-balance will remain; i.e., there won't be any unclosed tag, etc., after filtering)
'tidy'=>1,
//'direct_list_nest'=>1,
'elements' => "* -script",
'schemes'=>'href: file, ftp, http, https, mailto; src: cid, data, file, ftp, http, https; *:file, http, https',
'hook_tag' =>"hl_my_tag_transform",
@ -113,16 +111,25 @@ class egw_htmLawed
* hl_my_tag_transform
*
* function to provide individual checks for element attribute pairs
* implemented so far: img checking for alt attribute == image; set this to empty
* implemented so far: img checking for alt attribute == image; set this to empty
* a checking for title, replacing @
*/
function hl_my_tag_transform($element, $attribute_array)
{
//error_log(__METHOD__.__LINE__." ".$element.'->'.array2string($attribute_array));
//if ($element=='a') error_log(__METHOD__.__LINE__." ".$element.'->'.array2string($attribute_array));
// Elements other than 'img' or 'img' without a 'img' attribute are returned unchanged
if($element == 'img' && isset($attribute_array['alt']))
if($element == 'img')
{
// Re-build 'alt'
$attribute_array['alt'] = ($attribute_array['alt']=='image'?'':$attribute_array['alt']);
if (isset($attribute_array['alt'])) $attribute_array['alt'] = ($attribute_array['alt']=='image'?'':$attribute_array['alt']);
if (isset($attribute_array['alt'])&&strpos($attribute_array['alt'],'@')!==false) $attribute_array['alt']=str_replace('@','(at)',$attribute_array['alt']);
}
if($element == 'a')
{
if (isset($attribute_array['title']))
{
if (strpos($attribute_array['title'],'@')!==false) $attribute_array['title']=str_replace('@','(at)',$attribute_array['title']);
}
}
/*
// Elements other than 'span' or 'span' without a 'style' attribute are returned unchanged
@ -165,3 +172,67 @@ function hl_my_tag_transform($element, $attribute_array)
return "<{$element}{$attributes}". (isset($empty_elements[$element]) ? ' /' : ''). '>';
}
/**
 * hl_email_tag_transform
 *
 * Rebuilds the opening tag of a single element from its (possibly adjusted) attributes.
 * Adjustments applied:
 *  img - an alt text of 'image' is emptied, '@' in alt is replaced by '(at)'
 *      - a src that is not between 5 and 399 bytes long is replaced by an error icon
 *      - unless the felamimail preference allowExternalIMGs is set, every src that does
 *        not start with cid: is replaced by an error icon
 *  a   - '@' in title is replaced by '(at)', '@' in href by '%40'
 *      - an href pointing to a local anchor (#name) is turned into a GoToAnchor() call,
 *        so the page is not reloaded
 *
 * @param string $element tag name
 * @param array $attribute_array attribute name => attribute value
 * @return string opening tag including all attributes
 */
function hl_email_tag_transform($element, $attribute_array)
{
	if ($element == 'img')
	{
		// Re-build 'alt'
		if (isset($attribute_array['alt']))
		{
			if ($attribute_array['alt'] == 'image') $attribute_array['alt'] = '';
			$attribute_array['alt'] = str_replace('@', '(at)', $attribute_array['alt']);
		}
		if (isset($attribute_array['src']))
		{
			// alt may be missing; use an empty text as base for the blocked-notice
			$alt = isset($attribute_array['alt']) ? $attribute_array['alt'] : '';
			$src_length = strlen($attribute_array['src']);
			// the length of the src itself has to be compared against both bounds
			if (!($src_length > 4 && $src_length < 400))
			{
				$attribute_array['alt'] = $alt.' [blocked (reason: url length):'.$attribute_array['src'].']';
				if (!isset($attribute_array['title'])) $attribute_array['title'] = $attribute_array['alt'];
				$attribute_array['src'] = common::image('phpgwapi','dialog_error');
			}
			// elseif: the replacement icon set above must not be reported as external image again
			elseif (empty($GLOBALS['egw_info']['user']['preferences']['felamimail']['allowExternalIMGs']))
			{
				if (!preg_match('/^cid:.*/', $attribute_array['src']))
				{
					$attribute_array['alt'] = $alt.' [blocked external image:'.$attribute_array['src'].']';
					if (!isset($attribute_array['title'])) $attribute_array['title'] = $attribute_array['alt'];
					$attribute_array['src'] = common::image('phpgwapi','dialog_error');
				}
			}
		}
	}
	if ($element == 'a')
	{
		if (isset($attribute_array['title']))
		{
			$attribute_array['title'] = str_replace('@', '(at)', $attribute_array['title']);
		}
		// NOTE(review): an existing id is overwritten by the name here; behaviour kept as found
		if (isset($attribute_array['name']) && isset($attribute_array['id'])) $attribute_array['id'] = $attribute_array['name'];
		// anchors without href (e.g. <a name="...">) are left alone
		if (isset($attribute_array['href']))
		{
			$attribute_array['href'] = str_replace('@', '%40', $attribute_array['href']);
			if (strpos($attribute_array['href'], '#') === 0)
			{
				// the fragment ends up inside a javascript string literal and comes from untrusted mail content:
				// keep only characters valid in anchor names (plus non-ASCII bytes), so quotes, backslashes,
				// percent-escapes or entities can not be used to break out of the literal
				$anchor = preg_replace('/[^A-Za-z0-9_.:\-\x80-\xff]/', '', substr($attribute_array['href'], 1));
				$attribute_array['href'] = "javascript:GoToAnchor('".$anchor."');";
			}
		}
	}
	// Build the attributes string
	$attributes = '';
	foreach($attribute_array as $k => $v)
	{
		$attributes .= " {$k}=\"{$v}\"";
	}
	// Return the opening tag with attributes; elements without content get closed immediately
	static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1);
	return "<{$element}{$attributes}". (isset($empty_elements[$element]) ? ' /' : ''). '>';
}

View File

@ -183,101 +183,6 @@ class html
return preg_replace( $Expr, "<a href=\"http://$0\" target=\"_blank\">$0</a>", $result );
}
/**
 * activates URLs in a text, URLs get replaced by html-links using htmlpurify
 *
 * mailto: addresses that are not already part of a link are rewritten into a
 * spam-saving javascript link first; afterwards the text is run through
 * self::purify() with AutoFormat.Linkify enabled.
 *
 * @param string $content text containing URLs
 * @return string html with activated links; content that is empty or shorter than 20 bytes is returned unchanged
 */
static function activateLinks($content)
{
	if (!$content || strlen($content) < 20) return $content;	// performance

	// exclude mailto: addresses which are already the target of a link (directly preceded by a quote or href=),
	// otherwise the markup of existing links would be destroyed by the replacement below
	$NotAnchor = '(?<!"|\'|href=|href\s=\s|href=\s|href\s=)';

	// spamsaver emailaddress
	$result = preg_replace('/'.$NotAnchor.'mailto:([a-z0-9._-]+)@([a-z0-9_-]+)\.([a-z0-9._-]+)/i',
		'<a href="#" onclick="document.location=\'mai\'+\'lto:\\1\'+unescape(\'%40\')+\'\\2.\\3\'; return false;">\\1 AT \\2 DOT \\3</a>',
		$content);

	$config = self::purifyCreateDefaultConfig();
	$config->set('Core.Encoding', (self::$charset?self::$charset:'UTF-8'));
	// maybe the two following lines are useful for caching???
	$config->set('HTML.DefinitionID', 'activatelinks');
	$config->set('HTML.DefinitionRev', 1);
	// doctype and tidylevel
	$config->set('HTML.Doctype', 'XHTML 1.0 Transitional');
	$config->set('HTML.TidyLevel', 'light');
	// EnableID is needed for anchor tags
	$config->set('Attr.EnableID',true);
	// enable target attributes
	$config->set('Attr.AllowedFrameTargets','_blank,_top,_self,_parent');
	// actual allowed tags and attributes
	$config->set('URI.AllowedSchemes', array('http'=>true, 'https'=>true, 'ftp'=>true, 'file'=>true, 'cid'=>true, 'data'=>true));
	$config->set('AutoFormat.RemoveEmpty', true);
	$config->set('HTML.Allowed', 'br,p[align],b,i,u,s,em,pre,tt,strong,strike,sub,sup,center,div[align|style],hr[class|style],'.
		'font[size|color],'.
		'ul[type],ol[type|start],li,'.
		'h1,h2,h3,h4,h5,h6,'.
		'span[class|style],'.
		'table[class|border|cellpadding|cellspacing|width|style|align|bgcolor|align],'.
		'tbody,thead,tfoot,colgroup,'.
		'col[width|span],'.
		'blockquote[class|cite|dir],'.
		'tr[class|style|align|bgcolor|align|valign],'.
		'td[class|colspan|rowspan|width|style|align|bgcolor|align|valign|nowrap],'.
		'th[class|colspan|rowspan|width|style|align|bgcolor|align|valign|nowrap],'.
		'a[href|target|name|title],'.
		'img[src|alt|title|align|style|width|height]');
	$config->set('Attr.DefaultInvalidImage', 'Image removed by htmlpurify');
	// cache the purifier definitions in the configured temp dir, falling back to the system one
	$config->set('Cache.SerializerPath', ($GLOBALS['egw_info']['server']['temp_dir']?$GLOBALS['egw_info']['server']['temp_dir']:sys_get_temp_dir()));
	// let the purifier turn plain URLs into links
	$config->set('AutoFormat.Linkify',true);

	return self::purify($result,$config);
}
/**
 * Runs html through self::purify() with a configuration that has the
 * AutoFormat.DisplayLinkURI directive switched on
 *
 * Apart from that directive the same tags, attributes and URI schemes are
 * allowed as in activateLinks.
 *
 * @param string $_html html to process
 * @return string result of self::purify() for the given html
 */
static function deactivateLinks($_html)
{
	$config = self::purifyCreateDefaultConfig();

	if (self::$charset)
	{
		$encoding = self::$charset;
	}
	else
	{
		$encoding = 'UTF-8';
	}
	// list of allowed tags with their allowed attributes
	$allowed = implode('', array(
		'br,p[align],b,i,u,s,em,pre,tt,strong,strike,sub,sup,center,div[align|style],hr[class|style],',
		'font[size|color],',
		'ul[type],ol[type|start],li,',
		'h1,h2,h3,h4,h5,h6,',
		'span[class|style],',
		'table[class|border|cellpadding|cellspacing|width|style|align|bgcolor|align],',
		'tbody,thead,tfoot,colgroup,',
		'col[width|span],',
		'blockquote[class|cite|dir],',
		'tr[class|style|align|bgcolor|align|valign],',
		'td[class|colspan|rowspan|width|style|align|bgcolor|align|valign|nowrap],',
		'th[class|colspan|rowspan|width|style|align|bgcolor|align|valign|nowrap],',
		'a[href|target|name|title],',
		'img[src|alt|title|align|style|width|height]',
	));
	// serializer cache goes to the configured temp dir, or the system one if none is configured
	if ($GLOBALS['egw_info']['server']['temp_dir'])
	{
		$cache_path = $GLOBALS['egw_info']['server']['temp_dir'];
	}
	else
	{
		$cache_path = sys_get_temp_dir();
	}
	// directives in the order they get applied
	$directives = array(
		array('Core.Encoding', $encoding),
		// maybe the two following directives are useful for caching???
		array('HTML.DefinitionID', 'deactivatelinks'),
		array('HTML.DefinitionRev', 1),
		// doctype and tidylevel
		array('HTML.Doctype', 'XHTML 1.0 Transitional'),
		array('HTML.TidyLevel', 'light'),
		// EnableID is needed for anchor tags
		array('Attr.EnableID', true),
		// enable target attributes
		array('Attr.AllowedFrameTargets', '_blank,_top,_self,_parent'),
		// actual allowed schemes, tags and attributes
		array('URI.AllowedSchemes', array('http'=>true, 'https'=>true, 'ftp'=>true, 'file'=>true, 'cid'=>true, 'data'=>true)),
		array('AutoFormat.RemoveEmpty', true),
		array('HTML.Allowed', $allowed),
		array('Attr.DefaultInvalidImage', 'Image removed by htmlpurify'),
		array('Cache.SerializerPath', $cache_path),
		array('AutoFormat.DisplayLinkURI', true),
	);
	foreach($directives as $directive)
	{
		$config->set($directive[0], $directive[1]);
	}
	return self::purify($_html, $config);
}
/**
* escapes chars with special meaning in html as entities
*
@ -1395,129 +1300,21 @@ class html
return $html;
}
/**
 * Loads the HTMLPurifier library and returns its default configuration
 *
 * @return HTMLPurifier_Config object as returned by HTMLPurifier_Config::createDefault()
 */
static function purifyCreateDefaultConfig()
{
	$library_files = array(
		// adds htmlpurifiers library to include_path
		EGW_API_INC.'/htmlpurifier/library/HTMLPurifier.path.php',
		// includes most of the required files, for best performance with bytecode caches
		EGW_API_INC.'/htmlpurifier/library/HTMLPurifier.includes.php',
		// installs an autoloader for other files
		EGW_API_INC.'/htmlpurifier/library/HTMLPurifier.autoload.php',
	);
	foreach($library_files as $library_file)
	{
		require_once($library_file);
	}
	return HTMLPurifier_Config::createDefault();
}
/**
 * Builds the HTMLPurifier configuration used for tidying html
 *
 * Starts from html::purifyCreateDefaultConfig() and applies the directives listed below.
 *
 * @return HTMLPurifier_Config object
 */
static function purifyCreateHTMLTidyConfig()
{
	$config = html::purifyCreateDefaultConfig();

	if (self::$charset)
	{
		$encoding = self::$charset;
	}
	else
	{
		$encoding = 'UTF-8';
	}
	// list of allowed tags with their allowed attributes
	$allowed = implode('', array(
		'br,p[align],b,i,u,s,em,pre,tt,strong,strike,sub,sup,center,div[align|style],hr[class|style],',
		'font[size|color],',
		'ul[type],ol[type|start],li,',
		'h1,h2,h3,h4,h5,h6,',
		'span[class|style],',
		'table[class|border|cellpadding|cellspacing|width|style|align|bgcolor|align],',
		'tbody,thead,tfoot,colgroup,',
		'col[width|span],',
		'blockquote[class|cite|dir],',
		'tr[class|style|align|bgcolor|align|valign],',
		'td[class|colspan|rowspan|width|style|align|bgcolor|align|valign|nowrap],',
		'th[class|colspan|rowspan|width|style|align|bgcolor|align|valign|nowrap],',
		'a[href|target|name|title],',
		'img[src|alt|title|align|style|width|height]',
	));
	// serializer cache goes to the configured temp dir, or the system one if none is configured
	if ($GLOBALS['egw_info']['server']['temp_dir'])
	{
		$cache_path = $GLOBALS['egw_info']['server']['temp_dir'];
	}
	else
	{
		$cache_path = sys_get_temp_dir();
	}
	// directives in the order they get applied
	$directives = array(
		// maybe the two following directives are useful for caching???
		array('HTML.DefinitionID', 'egroupwareHTMLTidyConfig'),
		array('HTML.DefinitionRev', 1),
		array('Core.Encoding', $encoding),
		array('Core.RemoveInvalidImg', false),
		// doctype and tidylevel
		array('HTML.Doctype', 'XHTML 1.0 Transitional'),
		array('HTML.TidyLevel', 'light'),
		array('Attr.EnableID', true),
		// enable target attributes
		array('Attr.AllowedFrameTargets', '_blank,_top,_self,_parent'),
		// actual allowed schemes, tags and attributes
		array('URI.AllowedSchemes', array('http'=>true, 'https'=>true, 'ftp'=>true, 'file'=>true, 'cid'=>true)),
		array('AutoFormat.RemoveEmpty', true),
		array('HTML.Allowed', $allowed),
		// set a second time, now including the data scheme
		array('URI.AllowedSchemes', array('http'=>true, 'https'=>true, 'ftp'=>true, 'file'=>true, 'cid'=>true, 'data'=>true)),
		array('Cache.SerializerPath', $cache_path),
	);
	foreach($directives as $directive)
	{
		$config->set($directive[0], $directive[1]);
	}
	return $config;
}
/**
* Runs HTMLPurifier over supplied html to remove malicious code
*
* @param string $html
* @param HTMLPurifier_Config $config=null
*/
static function purify($html,$config=null)
static function purify($html,$config=null,$spec=array(),$_force=false)
{
static $purifier;
$defaultConfig = array('valid_xhtml'=>1,'safe'=>1);
if (empty($html)) return $html; // no need to process further
if (is_null($purifier) || !is_null($config))
{
// add htmlpurifiers library to include_path
require_once(EGW_API_INC.'/htmlpurifier/library/HTMLPurifier.path.php');
// include most of the required files, for best performance with bytecode caches
require_once(EGW_API_INC.'/htmlpurifier/library/HTMLPurifier.includes.php');
// installs an autoloader for other files
require_once(EGW_API_INC.'/htmlpurifier/library/HTMLPurifier.autoload.php');
// testcase to test the processing of purify
//$html = "<h1 onclick=\"alert('hallo');\"> h1 </h1>".$html;
if (is_null($config))
{
$config = HTMLPurifier_Config::createDefault();
$config->set('Core.Encoding', (self::$charset?self::$charset:'UTF-8'));
// maybe the two following lines are useful for caching???
$config->set('HTML.DefinitionID', 'egroupware');
$config->set('HTML.DefinitionRev', 1);
// doctype and tidylevel
$config->set('HTML.Doctype', 'XHTML 1.0 Transitional');
$config->set('HTML.TidyLevel', 'light');
// EnableID is needed for anchor tags
$config->set('Attr.EnableID',true);
// enable target attributes
$config->set('Attr.AllowedFrameTargets','_blank,_top,_self,_parent');
// actual allowed tags and attributes
$config->set('HTML.Allowed', 'br,p[class|align|style],b,i,u,s,em,pre,tt,strong,strike,sub,sup,center,div[class|align|style],hr[class|style],'.
'ul[class|type],ol[class|type|start],li,'.
'h1,h2,h3,h4,h5,h6,'.
'span[class|style],'.
'table[class|border|cellpadding|cellspacing|width|style|align|bgcolor|align],'.
'tbody,thead,tfoot,colgroup,'.
'col[class|width|span],'.
'blockquote[class|cite|dir],'.
'tr[class|style|align|bgcolor|align|valign],'.
'td[class|colspan|rowspan|width|style|align|bgcolor|align|valign|nowrap],'.
'th[class|colspan|rowspan|width|style|align|bgcolor|align|valign|nowrap],'.
'a[class|href|target|name|title],'.
'img[class|src|alt|title|align|style|width|height]');
$config->set('Cache.SerializerPath', ($GLOBALS['egw_info']['server']['temp_dir']?$GLOBALS['egw_info']['server']['temp_dir']:sys_get_temp_dir()));
}
$purifier = new HTMLPurifier($config);
// the latter may enable you to modify the config later on, but by now
// the effort for e.g. enabling anchor tags is already included above
//$def =& $purifier->config->getHTMLDefinition(true);
//$def->addAttribute('a', 'name', 'Text');
}
$result = $purifier->purify( $html );
//error_log(__METHOD__.$purifier->version);
return $result;
$htmLawed = new egw_htmLawed();
if (is_array($config) && $_force===false) $config = array_merge($defaultConfig, $config);
if (empty($config)) $config = $defaultConfig;
//error_log(__METHOD__.__LINE__.array2string($config));
return $htmLawed->egw_htmLawed($html,$config,$spec);
}
/**

View File

@ -1036,7 +1036,7 @@ class translation
// some characterreplacements, as they fail to translate
$sar = array(
'@(\x84|\x93|\x94)@',
'@(\x96|\x97)@',
'@(\x96|\x97|\x1a)@',
'@(\x91|\x92)@',
'@(\x85)@',
'@(\x86)@',
@ -1138,29 +1138,38 @@ class translation
static function replaceTagsCompletley(&$_body,$tag,$endtag='',$addbracesforendtag=true)
{
if ($tag) $tag = strtolower($tag);
$singleton = false;
if ($endtag=='/>') $singleton =true;
if ($endtag == '' || empty($endtag) || !isset($endtag))
{
$endtag = $tag;
} else {
$endtag = strtolower($endtag);
//error_log(__METHOD__.' Using EndTag:'.$endtag);
//error_log(__METHOD__.' Using EndTag:'.$endtag);
}
// strip tags out of the message completely with their content
$taglen=strlen($tag);
$endtaglen=strlen($endtag);
if ($_body) {
if ($addbracesforendtag === true )
if ($singleton)
{
$_body = preg_replace('~<'.$tag.'[^>]*?>(.*)</'.$endtag.'[\s]*>~simU','',$_body);
// remove left over tags, unfinished ones, and so on
$_body = preg_replace('~<'.$tag.'[^>]*?>~si','',$_body);
$_body = preg_replace('~<'.$tag.'[^>].*? '.$endtag.'~simU','',$_body);
}
if ($addbracesforendtag === false )
else
{
$_body = preg_replace('~<'.$tag.'[^>]*?>(.*)'.$endtag.'~simU','',$_body);
// remove left over tags, unfinished ones, and so on
$_body = preg_replace('~<'.$tag.'[^>]*?>~si','',$_body);
$_body = preg_replace('~'.$endtag.'~','',$_body);
if ($addbracesforendtag === true )
{
$_body = preg_replace('~<'.$tag.'[^>]*?>(.*)</'.$endtag.'[\s]*>~simU','',$_body);
// remove left over tags, unfinished ones, and so on
$_body = preg_replace('~<'.$tag.'[^>]*?>~si','',$_body);
}
if ($addbracesforendtag === false )
{
$_body = preg_replace('~<'.$tag.'[^>]*?>(.*)'.$endtag.'~simU','',$_body);
// remove left over tags, unfinished ones, and so on
$_body = preg_replace('~<'.$tag.'[^>]*?>~si','',$_body);
$_body = preg_replace('~'.$endtag.'~','',$_body);
}
}
}
}