* @version $Id$ */ require_once(EGW_API_INC.'/htmLawed/htmLawed.php'); /** * This class does NOT use anything EGroupware specific, it just calls htmLawed and supports autoloading * while matching egw namespace requirements. It also provides (as a non class function ) a hook_tag function * to do further tag / attribute validation */ class egw_htmLawed { /** * config options see constructor * * @var Configuration */ var $Configuration; /** * The $spec argument can be used to disallow an otherwise legal attribute for an element, * or to restrict the attribute's values. This can also be helpful as a security measure * (e.g., in certain versions of browsers, certain values can cause buffer overflows and * denial of service attacks), or in enforcing admin policy compliance. $spec is specified * as a string of text containing one or more rules, with multiple rules separated from each * other by a semi-colon (;) * * @var Spec */ var $Spec; /** * Constructor */ function __construct() { // may hold some Standard configuration /* $cfg = array( 'abs_url'=>array('3', '0', 'absolute/relative URL conversion', '-1'), 'and_mark'=>array('2', '0', 'mark original & chars', '0', 'd'=>1), // 'd' to disable 'anti_link_spam'=>array('1', '0', 'modify href values as an anti-link spam measure', '0', array(array('30', '1', '', 'regex for extra rel'), array('30', '2', '', 'regex for no href'))), 'anti_mail_spam'=>array('1', '0', 'replace @ in mailto: URLs', '0', '8', 'NO@SPAM', 'replacement'), 'balance'=>array('2', '1', 'fix nestings and balance tags', '0'), 'base_url'=>array('', '', 'base URL', '25'), 'cdata'=>array('4', 'nil', 'allow CDATA sections', 'nil'), 'clean_ms_char'=>array('3', '0', 'replace bad characters introduced by Microsoft apps. like Word', '0'), 'comment'=>array('4', 'nil', 'allow HTML comments', 'nil'), 'css_expression'=>array('2', 'nil', 'allow dynamic expressions in CSS style properties', 'nil'), 'deny_attribute'=>array('1', '0', 'denied attributes', '0', '50', '', 'these'), 'direct_list_nest'=>array('2', 'nil', 'allow direct nesting of a list within another without requiring it to be a list item', 'nil'), 'elements'=>array('', '', 'allowed elements', '50'), 'hexdec_entity'=>array('3', '1', 'convert hexadecimal numeric entities to decimal ones, or vice versa', '0'), 'hook'=>array('', '', 'name of hook function', '25'), 'hook_tag'=>array('', '', 'name of custom function to further check attribute values', '25'), 'keep_bad'=>array('7', '6', 'keep, or remove bad tag content', '0'), 'lc_std_val'=>array('2', '1', 'lower-case std. attribute values like radio', '0'), 'make_tag_strict'=>array('3', 'nil', 'transform deprecated elements', 'nil'), 'named_entity'=>array('2', '1', 'allow named entities, or convert numeric ones', '0'), 'no_deprecated_attr'=>array('3', '1', 'allow deprecated attributes, or transform them', '0'), 'parent'=>array('', 'div', 'name of parent element', '25'), 'safe'=>array('2', '0', 'for most safe HTML', '0'), 'schemes'=>array('', 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https', 'allowed URL protocols', '50'), 'show_setting'=>array('', 'htmLawed_setting', 'variable name to record finalized htmLawed settings', '25', 'd'=>1), 'style_pass'=>array('2', 'nil', 'do not look at style attribute values', 'nil'), 'tidy'=>array('3', '0', 'beautify/compact', '-1', '8', '1t1', 'format'), 'unique_ids'=>array('2', '1', 'unique id values', '0', '8', 'my_', 'prefix'), 'valid_xhtml'=>array('2', 'nil', 'auto-set various parameters for most valid XHTML', 'nil'), 'xml:lang'=>array('3', 'nil', 'auto-add xml:lang attribute', '0'), ); */ $this->Configuration = array('comment'=>1, //remove comments 'balance'=>0,//turn off tag-balancing (config['balance']=>0). That will not introduce any security risk; only standards-compliant tag nesting check/filtering will be turned off (basic tag-balance will remain; i.e., there won't be any unclosed tag, etc., after filtering) 'tidy'=>1, 'elements' => "* -script", 'schemes'=>'href: file, ftp, http, https, mailto; src: cid, data, file, ftp, http, https; *:file, http, https', 'hook_tag' =>"hl_my_tag_transform", ); $this->Spec = 'img=alt(noneof="image"/default="")'; } /** * egw_htmlLawed * * @param varchar $html2check =text input Text to check * @param mixed $Config = text or array * @param mixed $Spec =text or array; The '$spec' argument can be used to disallow an otherwise legal attribute for an element * @return varchar cleaned/fixed html */ function egw_htmLawed($html2check, $Config=null, $Spec=array()) { //error_log(__METHOD__.__LINE__.' Input:'.$html2check); if (is_array($Config) && is_array($this->Configuration)) $Config = array_merge($this->Configuration, $Config); if (empty($Config)) $Config = $this->Configuration; if (empty($Spec)) $Spec = $this->Spec; return htmLawed($html2check, $Config, $Spec); } } /** * hl_my_tag_transform * * function to provide individual checks for element attribute pairs * implemented so far: img checking for alt attribute == image; set this to empty * a checking for title, replacing @ * blockquote checking for cite, replacing @ */ function hl_my_tag_transform($element, $attribute_array=0) { // If second argument is not received, it means a closing tag is being handled if(is_numeric($attribute_array)){ return ""; } //if ($element=='img') error_log(__METHOD__.__LINE__." ".$element.'->'.array2string($attribute_array)); // Elements other than 'img' or 'img' without a 'img' attribute are returned unchanged if($element == 'img') { // Re-build 'alt' if (isset($attribute_array['alt'])) $attribute_array['alt'] = ($attribute_array['alt']=='image'?'':$attribute_array['alt']); if (isset($attribute_array['alt'])&&strpos($attribute_array['alt'],'@')!==false) $attribute_array['alt']=str_replace('@','(at)',$attribute_array['alt']); } if (isset($attribute_array['title'])) { if (strpos($attribute_array['title'],'@')!==false) $attribute_array['title']=str_replace('@','(at)',$attribute_array['title']); } if ($element == 'blockquote') { if (isset($attribute_array['cite'])) { if (strpos($attribute_array['cite'],'@')!==false) $attribute_array['cite']=str_replace('@','(at)',$attribute_array['cite']); } } /* // Elements other than 'span' or 'span' without a 'style' attribute are returned unchanged if($element == 'span' && isset($attribute_array['style'])) { // Identify CSS properties and values $css = explode(';', $attribute_array['style']); $style = array(); foreach($css as $v){ if(($p = strpos($v, ':')) > 1 && $p < strlen($v)){ $css_property_name = trim(substr($v, 0, $p)); $css_property_value = trim(substr($v, $p+1)); $style[] = "$css_property_name: $css_property_value"; } } // Alter the CSS property as required // Black Arial must be at a font-size of 24 if(isset($style['font-family']) && $style['font-family'] == 'Arial' && isset($style['color']) && $style['color'] == '#000000'){ $style['font-size'] == '24'; } // And so on for other criteria // ... // Re-build 'style' $attribute_array['style'] = implode('; ', $style); } */ // unwanted javascript static $pregFindScript = '/\b(on(before)?(abort|blur|change|click|dblclick|error|focus|keydown|keypress|keyup|load|mousedown|mousemove|mouseout|mouseover|mouseup|reset|select|submit|unload))\b/i'; // Build the attributes string $attributes = ''; foreach($attribute_array as $k=>$v){ //error_log(__METHOD__.__LINE__.' '.$k.'->'.preg_match($preg,$k)); if (!preg_match($pregFindScript,$k)) $attributes .= " {$k}=\"{$v}\""; } // Return the opening tag with attributes static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); return "<{$element}{$attributes}". (isset($empty_elements[$element]) ? ' /' : ''). '>'; } /** * hl_email_tag_transform * * function to provide individual checks for element attribute pairs * implemented so far: img -checking for alt attribute == image; set this to empty * -control for/on external Images and src-length * a -checking for title and href, replacing @ accordingly * -navigate to local anchors without reloading the page * blockquote -checking for cite, replacing @ * throwing away excess div elements, that carry no style or class or id info */ function hl_email_tag_transform($element, $attribute_array=0) { static $lastelement; static $throwawaycounter; if (is_null($throwawaycounter)) $throwawaycounter = 0; //if ($throwawaycounter>250) error_log(__METHOD__.__LINE__.' '.$throwawaycounter); if ($element=='div' && $element==$lastelement && ($attribute_array==0 || empty($attribute_array))) { if (is_array($attribute_array)) $throwawaycounter++; if ($attribute_array==0 && $throwawaycounter>0) $throwawaycounter--; if ($throwawaycounter>0) return ''; } if (is_array($attribute_array) && !empty($attribute_array) && $element=='div') { $lastelement = 'div_with_attr'; } else { $lastelement = $element; } // If second argument is not received, it means a closing tag is being handled if(is_numeric($attribute_array)){ return ""; } //if ($element=='a') error_log(__METHOD__.__LINE__." ".$element.'->'.array2string($attribute_array)); // Elements other than 'img' or 'img' without a 'img' attribute are returned unchanged if($element == 'img') { // Re-build 'alt' if (isset($attribute_array['alt'])) $attribute_array['alt'] = ($attribute_array['alt']=='image'?'':$attribute_array['alt']); if (isset($attribute_array['alt'])&&strpos($attribute_array['alt'],'@')!==false) $attribute_array['alt']=str_replace('@','(at)',$attribute_array['alt']); // $GLOBALS['egw_info']['user']['preferences']['felamimail']['allowExternalIMGs'] ? '' : 'match' => '/^cid:.*/'), if (isset($attribute_array['src'])) { if (!(strlen($attribute_array['src'])>4 && strlen($attribute_array['src']<400))) { $attribute_array['alt']= $attribute_array['alt'].' [blocked (reason: url length):'.$attribute_array['src'].']'; if (!isset($attribute_array['title'])) $attribute_array['title']=$attribute_array['alt']; $attribute_array['src']=common::image('phpgwapi','dialog_error'); } if (!$GLOBALS['egw_info']['user']['preferences']['felamimail']['allowExternalIMGs']) { if (!preg_match('/^cid:.*/',$attribute_array['src'])) { $attribute_array['alt']= $attribute_array['alt'].' [blocked external image:'.$attribute_array['src'].']'; if (!isset($attribute_array['title'])) $attribute_array['title']=$attribute_array['alt']; $attribute_array['src']=common::image('phpgwapi','dialog_error'); } } } } if (isset($attribute_array['title'])) { if (strpos($attribute_array['title'],'@')!==false) $attribute_array['title']=str_replace('@','(at)',$attribute_array['title']); } if ($element == 'blockquote') { if (isset($attribute_array['cite'])) { if (strpos($attribute_array['cite'],'@')!==false) $attribute_array['cite']=str_replace('@','(at)',$attribute_array['cite']); } } if($element == 'a') { //error_log(array2string($attribute_array)); if (strpos($attribute_array['href'],'denied:javascript')===0) $attribute_array['href']=''; if (isset($attribute_array['name']) && isset($attribute_array['id'])) $attribute_array['id'] = $attribute_array['name']; if (strpos($attribute_array['href'],'@')!==false) $attribute_array['href'] = str_replace('@','%40',$attribute_array['href']); if (strpos($attribute_array['href'],'#')===0) { $attribute_array['href'] = "javascript:GoToAnchor('".trim(substr($attribute_array['href'],1))."');"; } } // unwanted javascript static $pregFindScript = '/\b(on(before)?(abort|blur|change|click|dblclick|error|focus|keydown|keypress|keyup|load|mousedown|mousemove|mouseout|mouseover|mouseup|reset|select|submit|unload))\b/i'; // Build the attributes string $attributes = ''; foreach($attribute_array as $k=>$v){ //error_log(__METHOD__.__LINE__.' '.$k.'->'.preg_match($preg,$k)); if (!preg_match($pregFindScript,$k)) $attributes .= " {$k}=\"{$v}\""; } // Return the opening tag with attributes static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); return "<{$element}{$attributes}". (isset($empty_elements[$element]) ? ' /' : ''). '>'; }