egroupware/phpgwapi/inc/class.egw_htmLawed.inc.php

347 lines
16 KiB
PHP

<?php
/**
* eGroupWare API: egw class to include (and configure (basic)) htmLawed by Santosh Patnaik
*
* @link http://www.egroupware.org
* @license http://opensource.org/licenses/gpl-license.php GPL - GNU General Public License
* @package api
* @subpackage htmLawed
* @author Klaus Leithoff <kl-AT-stylite.de>
* @version $Id$
*/
require_once(EGW_API_INC.'/htmLawed/htmLawed.php');
/**
 * Autoloadable wrapper around htmLawed (by Santosh Patnaik).
 *
 * The class itself uses nothing EGroupware specific beyond html::getStyles(); it holds a
 * default htmLawed configuration and spec and forwards the actual filtering to htmLawed().
 * The default configuration names the global function hl_my_tag_transform as 'hook_tag',
 * i.e. as callback for further tag / attribute validation.
 */
class egw_htmLawed
{
	/**
	 * Default htmLawed configuration (option name => value), filled in by the constructor
	 *
	 * @var array
	 */
	public $Configuration;

	/**
	 * Default htmLawed $spec
	 *
	 * The $spec argument can be used to disallow an otherwise legal attribute for an element,
	 * or to restrict the attribute's values. This can also be helpful as a security measure
	 * (e.g., in certain versions of browsers, certain values can cause buffer overflows and
	 * denial of service attacks), or in enforcing admin policy compliance. $spec is specified
	 * as a string of text containing one or more rules, with multiple rules separated from each
	 * other by a semi-colon (;)
	 *
	 * @var string
	 */
	public $Spec;

	/**
	 * Constructor: sets up the default configuration and spec
	 */
	public function __construct()
	{
		// Reference list of htmLawed options (kept for documentation only, never evaluated):
		/*
		$cfg = array(
		'abs_url'=>array('3', '0', 'absolute/relative URL conversion', '-1'),
		'and_mark'=>array('2', '0', 'mark original <em>&amp;</em> chars', '0', 'd'=>1), // 'd' to disable
		'anti_link_spam'=>array('1', '0', 'modify <em>href</em> values as an anti-link spam measure', '0', array(array('30', '1', '', 'regex for extra <em>rel</em>'), array('30', '2', '', 'regex for no <em>href</em>'))),
		'anti_mail_spam'=>array('1', '0', 'replace <em>@</em> in <em>mailto:</em> URLs', '0', '8', 'NO@SPAM', 'replacement'),
		'balance'=>array('2', '1', 'fix nestings and balance tags', '0'),
		'base_url'=>array('', '', 'base URL', '25'),
		'cdata'=>array('4', 'nil', 'allow <em>CDATA</em> sections', 'nil'),
		'clean_ms_char'=>array('3', '0', 'replace bad characters introduced by Microsoft apps. like <em>Word</em>', '0'),
		'comment'=>array('4', 'nil', 'allow HTML comments', 'nil'),
		'css_expression'=>array('2', 'nil', 'allow dynamic expressions in CSS style properties', 'nil'),
		'deny_attribute'=>array('1', '0', 'denied attributes', '0', '50', '', 'these'),
		'direct_list_nest'=>array('2', 'nil', 'allow direct nesting of a list within another without requiring it to be a list item', 'nil'),
		'elements'=>array('', '', 'allowed elements', '50'),
		'hexdec_entity'=>array('3', '1', 'convert hexadecimal numeric entities to decimal ones, or vice versa', '0'),
		'hook'=>array('', '', 'name of hook function', '25'),
		'hook_tag'=>array('', '', 'name of custom function to further check attribute values', '25'),
		'keep_bad'=>array('7', '6', 'keep, or remove <em>bad</em> tag content', '0'),
		'lc_std_val'=>array('2', '1', 'lower-case std. attribute values like <em>radio</em>', '0'),
		'make_tag_strict'=>array('3', 'nil', 'transform deprecated elements', 'nil'), // 3 is a new own config value, to indicate that transformation is to be performed, but don't transform font as size transformation of numeric sizes to keywords alters the intended result too much
		'named_entity'=>array('2', '1', 'allow named entities, or convert numeric ones', '0'),
		'no_deprecated_attr'=>array('3', '1', 'allow deprecated attributes, or transform them', '0'),
		'parent'=>array('', 'div', 'name of parent element', '25'),
		'safe'=>array('2', '0', 'for most <em>safe</em> HTML', '0'),
		'schemes'=>array('', 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https', 'allowed URL protocols', '50'),
		'show_setting'=>array('', 'htmLawed_setting', 'variable name to record <em>finalized</em> htmLawed settings', '25', 'd'=>1),
		'style_pass'=>array('2', 'nil', 'do not look at <em>style</em> attribute values', 'nil'),
		'tidy'=>array('3', '0', 'beautify/compact', '-1', '8', '1t1', 'format'),
		'unique_ids'=>array('2', '1', 'unique <em>id</em> values', '0', '8', 'my_', 'prefix'),
		'valid_xhtml'=>array('2', 'nil', 'auto-set various parameters for most valid XHTML', 'nil'),
		'xml:lang'=>array('3', 'nil', 'auto-add <em>xml:lang</em> attribute', '0'),
		'allow_for_inline' => array('table'),//block elements allowed for nesting when only inline is allowed; Example span does not allow block elements as table; table is the only element tested so far
		);
		*/
		$defaults = array();
		// remove comments
		$defaults['comment'] = 1;
		// 3 is an own (non-standard) value: transform deprecated elements, but leave font alone,
		// as converting numeric font sizes to keywords alters the intended result too much
		$defaults['make_tag_strict'] = 3;
		// tag-balancing switched off; this does not introduce a security risk, only the
		// standards-compliant nesting check/filtering is skipped (basic tag-balance remains,
		// i.e. there won't be any unclosed tag after filtering)
		$defaults['balance'] = 0;
		// tidy eats away even some wanted whitespace, so it is switched off; it was used for
		// its compacting and beautifying capabilities (better html for further processing)
		$defaults['tidy'] = 0;
		$defaults['elements'] = "* -script";
		$defaults['deny_attribute'] = 'on*';
		$defaults['schemes'] = 'href: file, ftp, http, https, mailto; src: cid, data, file, ftp, http, https; *:file, http, https';
		$defaults['hook_tag'] = "hl_my_tag_transform";

		$this->Configuration = $defaults;
		$this->Spec = 'img=alt(noneof="image"/default="")';
	}

	/**
	 * Run htmLawed over the given html
	 *
	 * An array $Config is merged over the default configuration (passed keys win); an empty
	 * $Config or $Spec falls back to the defaults set up by the constructor. The result of
	 * html::getStyles() is prepended to the filtered html.
	 *
	 * @param string $html2check text to check
	 * @param mixed $Config text or array
	 * @param mixed $Spec text or array; can be used to disallow an otherwise legal attribute for an element
	 * @return string cleaned/fixed html
	 */
	public function egw_htmLawed($html2check, $Config=null, $Spec=array())
	{
		if (is_array($Config) && is_array($this->Configuration))
		{
			$Config = array_merge($this->Configuration, $Config);
		}
		if (empty($Config))
		{
			$Config = $this->Configuration;
		}
		if (empty($Spec))
		{
			$Spec = $this->Spec;
		}
		// Filtered style sections are allowed throughout egroupware: the content of
		// <style> ... </style> is fetched via html::getStyles (which, per the original notes,
		// processes it for known security risks) and put back in front of the purified html.
		// NOTE(review): getStyles presumably strips the style sections from $html2check
		// (by reference) - confirm; it must stay in front of the htmLawed call.
		$styles = html::getStyles($html2check);
		$purified = htmLawed($html2check, $Config, $Spec);

		return ($styles ? $styles : '').$purified;
	}
}
/**
 * hl_my_tag_transform
 *
 * htmLawed 'hook_tag' callback: receives every tag after htmLawed processed it and returns
 * the tag string to put into the output. Individual checks implemented so far:
 *	td         - background is dropped unless it contains the link of the own installation
 *	img        - alt == 'image' is emptied, @ in alt is replaced by (at)
 *	any tag    - @ in title is replaced by (at); every occurrence of 'script' is removed from style
 *	blockquote - @ in cite is replaced by (at)
 *	a          - javascript:GoToAnchor('x'); hrefs (as produced by hl_email_tag_transform) are
 *	             turned back into #x, other denied:javascript hrefs are emptied
 *
 * @param string $element name of the tag
 * @param array|int $attribute_array attributes (name => value) of an opening tag;
 *	a numeric value (the default 0) means a closing tag is being handled
 * @return string the opening tag including its attributes, or the closing tag
 */
function hl_my_tag_transform($element, $attribute_array=0)
{
	// If second argument is not received, it means a closing tag is being handled
	if (is_numeric($attribute_array))
	{
		return "</$element>";
	}
	if ($element == 'td' && isset($attribute_array['background']))
	{
		// only internal background images are allowed; without an egw object the
		// background can not be verified as internal and is dropped too
		$internal = isset($GLOBALS['egw']) && is_object($GLOBALS['egw']) &&
			stripos($attribute_array['background'], $GLOBALS['egw']->link('/index.php')) !== false;
		if (!$internal)
		{
			unset($attribute_array['background']);
		}
	}
	// img: re-build 'alt'; images without an alt attribute are left alone
	if ($element == 'img' && isset($attribute_array['alt']))
	{
		if ($attribute_array['alt'] == 'image')
		{
			$attribute_array['alt'] = '';
		}
		$attribute_array['alt'] = str_replace('@', '(at)', $attribute_array['alt']);
	}
	if (isset($attribute_array['title']))
	{
		$attribute_array['title'] = str_replace('@', '(at)', $attribute_array['title']);
	}
	if ($element == 'blockquote' && isset($attribute_array['cite']))
	{
		$attribute_array['cite'] = str_replace('@', '(at)', $attribute_array['cite']);
	}
	if (isset($attribute_array['style']))
	{
		// remove repeatedly: a single pass would turn e.g. "scrscriptipt" back into "script"
		while (stripos($attribute_array['style'], 'script') !== false)
		{
			$attribute_array['style'] = str_ireplace('script', '', $attribute_array['style']);
		}
	}
	// anchors without href (eg. <a name="top">) have nothing to rebuild
	if ($element == 'a' && isset($attribute_array['href']))
	{
		$href = $attribute_array['href'];
		// rebuild anchors, if processed by hl_email_tag_transform
		if (strpos($href, "denied:javascript:GoToAnchor('") === 0)
		{
			$href = str_ireplace("');", '', str_ireplace("denied:javascript:GoToAnchor('", "#", $href));
		}
		if (strpos($href, "javascript:GoToAnchor('") === 0)
		{
			$href = str_ireplace("');", '', str_ireplace("javascript:GoToAnchor('", "#", $href));
		}
		// every other javascript href denied by htmLawed is emptied
		if (strpos($href, 'denied:javascript') === 0)
		{
			$href = '';
		}
		$attribute_array['href'] = $href;
	}
	// Build the attributes string
	$attributes = '';
	foreach ($attribute_array as $k => $v)
	{
		$attributes .= " {$k}=\"{$v}\"";
	}
	// Return the opening tag with attributes; empty elements get closed right away
	static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1);
	return "<{$element}{$attributes}". (isset($empty_elements[$element]) ? ' /' : ''). '>';
}
/**
 * hl_email_tag_transform
 *
 * htmLawed 'hook_tag' callback variant for html mail: receives every tag after htmLawed
 * processed it and returns the tag string to put into the output. Individual checks
 * implemented so far:
 *	img        - alt == 'image' is emptied, @ in alt is replaced by (at)
 *	           - control for/on external images (user preference mail/allowExternalIMGs) and src-length
 *	a          - @ in title is replaced by (at), in href by %40
 *	           - navigate to local anchors without reloading the page (javascript:GoToAnchor)
 *	blockquote - @ in cite is replaced by (at)
 *	td         - only cid: background images are allowed
 *	any tag    - every occurrence of 'script' is removed from style
 *	div        - excess div elements, that carry no style or class or id info, are thrown away
 *
 * The function keeps static state between calls (last opened element and a counter of nested
 * attribute-less divs), so its result depends on the sequence of tags it was called with.
 *
 * @param string $element name of the tag
 * @param array|int $attribute_array attributes (name => value) of an opening tag;
 *	a numeric value (the default 0) means a closing tag is being handled
 * @return string the opening tag including its attributes, the closing tag,
 *	or '' for a thrown away div tag
 */
function hl_email_tag_transform($element, $attribute_array=0)
{
	static $lastelement;
	static $throwawaycounter;
	if (is_null($lastelement)) $lastelement='';
	if (is_null($throwawaycounter)) $throwawaycounter = 0;

	// div throwaway bookkeeping; statement order is significant and kept as it was:
	// an attribute-less div (opening or closing) following an attribute-less div
	if ($element=='div' && $element==$lastelement && ($attribute_array==0 || empty($attribute_array)))
	{
		if (is_array($attribute_array)) $throwawaycounter++;	// opening tag
		if ($attribute_array==0 && $throwawaycounter>0) $throwawaycounter--;	// closing tag
		if ($throwawaycounter>1) return '';
	}
	// an other element opened after a div ends the run of nested divs
	if ($lastelement=='div' && $element!=$lastelement && is_array($attribute_array)) $throwawaycounter = 0;
	if (is_array($attribute_array) && !empty($attribute_array) && $element=='div')
	{
		// divs carrying attributes are remembered under a different name, so they never count as excess
		$lastelement = 'div_with_attr';
	}
	else
	{
		if (is_array($attribute_array)) $lastelement = $element;
	}
	// If second argument is not received, it means a closing tag is being handled
	if (is_numeric($attribute_array))
	{
		if ($element==$lastelement) $lastelement='';
		return "</$element>";
	}
	if ($element == 'td' && isset($attribute_array['background']))
	{
		// only cid style background images are allowed
		if (stripos($attribute_array['background'], 'cid:') === false)
		{
			unset($attribute_array['background']);
		}
	}
	if ($element == 'img')
	{
		// Re-build 'alt'
		if (isset($attribute_array['alt']))
		{
			if ($attribute_array['alt'] == 'image')
			{
				$attribute_array['alt'] = '';
			}
			$attribute_array['alt'] = str_replace('@', '(at)', $attribute_array['alt']);
		}
		if (isset($attribute_array['src']))
		{
			// block urls that are too short or too long; the upper bound used to read
			// strlen($src<400), which measured a boolean instead of the url
			// NOTE(review): with the limit enforced, data: urls of 400 and more chars are blocked too - confirm the limit is wanted that low
			$src_length = strlen($attribute_array['src']);
			if (!($src_length > 4 && $src_length < 400))
			{
				$attribute_array['alt'] = (isset($attribute_array['alt']) ? $attribute_array['alt'] : '').
					' [blocked (reason: url length):'.$attribute_array['src'].']';
				if (!isset($attribute_array['title'])) $attribute_array['title'] = $attribute_array['alt'];
				$attribute_array['src'] = common::image('phpgwapi','dialog_error');
			}
			// external images are only shown, if the user allows it; a missing preference counts as not allowed
			if (empty($GLOBALS['egw_info']['user']['preferences']['mail']['allowExternalIMGs']))
			{
				if (!preg_match('/^cid:.*/', $attribute_array['src']))
				{
					$attribute_array['alt'] = (isset($attribute_array['alt']) ? $attribute_array['alt'] : '').
						' [blocked external image:'.$attribute_array['src'].']';
					if (!isset($attribute_array['title'])) $attribute_array['title'] = $attribute_array['alt'];
					$attribute_array['src'] = common::image('mail','no-image-shown');
					$attribute_array['border'] = 1;
					// remove "border: 0px none;" like style rules, so the border set above is not overruled
					if (!empty($attribute_array['style']) && stripos($attribute_array['style'], 'border') !== false)
					{
						$attribute_array['style'] = preg_replace('~border(:|-left:|-right:|-bottom:|-top:)+ (0px)+ (none)+;~si','',$attribute_array['style']);
					}
				}
			}
		}
	}
	if (isset($attribute_array['style']))
	{
		// remove repeatedly: a single pass would turn e.g. "scrscriptipt" back into "script"
		while (stripos($attribute_array['style'], 'script') !== false)
		{
			$attribute_array['style'] = str_ireplace('script', '', $attribute_array['style']);
		}
	}
	if (isset($attribute_array['title']))
	{
		$attribute_array['title'] = str_replace('@', '(at)', $attribute_array['title']);
	}
	if ($element == 'blockquote' && isset($attribute_array['cite']))
	{
		$attribute_array['cite'] = str_replace('@', '(at)', $attribute_array['cite']);
	}
	if ($element == 'a')
	{
		// anchors without href (eg. <a name="top">) only get their id handled
		$has_href = isset($attribute_array['href']);
		if ($has_href && strpos($attribute_array['href'], 'denied:javascript') === 0)
		{
			$attribute_array['href'] = '';
		}
		// NOTE(review): overwrites an existing id with the name; possibly meant for anchors WITHOUT an id - confirm
		if (isset($attribute_array['name']) && isset($attribute_array['id'])) $attribute_array['id'] = $attribute_array['name'];
		if ($has_href)
		{
			$attribute_array['href'] = str_replace('@', '%40', $attribute_array['href']);
			// local anchors are navigated to by javascript, if the mail code asked for it
			if (strpos($attribute_array['href'], '#') === 0 && (isset(emailadmin_imapbase::$htmLawed_config['transform_anchor']) && emailadmin_imapbase::$htmLawed_config['transform_anchor'] === true))
			{
				$attribute_array['href'] = "javascript:GoToAnchor('".trim(substr($attribute_array['href'], 1))."');";
			}
		}
	}
	// Build the attributes string
	$attributes = '';
	foreach ($attribute_array as $k => $v)
	{
		$attributes .= " {$k}=\"{$v}\"";
	}
	// Return the opening tag with attributes; empty elements get closed right away
	static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1);
	return "<{$element}{$attributes}". (isset($empty_elements[$element]) ? ' /' : ''). '>';
}