* @version $Id$ */ require_once(EGW_API_INC.'/htmLawed/htmLawed.php'); /** * This class does NOT use anything EGroupware specific, it just calls htmLawed and supports autoloading * while matching egw namespace requirements. It also provides (as a non class function ) a hook_tag function * to do further tag / attribute validation */ class egw_htmLawed { /** * config options see constructor * * @var Configuration */ var $Configuration; /** * The $spec argument can be used to disallow an otherwise legal attribute for an element, * or to restrict the attribute's values. This can also be helpful as a security measure * (e.g., in certain versions of browsers, certain values can cause buffer overflows and * denial of service attacks), or in enforcing admin policy compliance. $spec is specified * as a string of text containing one or more rules, with multiple rules separated from each * other by a semi-colon (;) * * @var Spec */ var $Spec; /** * Constructor */ function __construct() { // may hold some Standard configuration /* $cfg = array( 'abs_url'=>array('3', '0', 'absolute/relative URL conversion', '-1'), 'and_mark'=>array('2', '0', 'mark original & chars', '0', 'd'=>1), // 'd' to disable 'anti_link_spam'=>array('1', '0', 'modify href values as an anti-link spam measure', '0', array(array('30', '1', '', 'regex for extra rel'), array('30', '2', '', 'regex for no href'))), 'anti_mail_spam'=>array('1', '0', 'replace @ in mailto: URLs', '0', '8', 'NO@SPAM', 'replacement'), 'balance'=>array('2', '1', 'fix nestings and balance tags', '0'), 'base_url'=>array('', '', 'base URL', '25'), 'cdata'=>array('4', 'nil', 'allow CDATA sections', 'nil'), 'clean_ms_char'=>array('3', '0', 'replace bad characters introduced by Microsoft apps. like Word', '0'), 'comment'=>array('4', 'nil', 'allow HTML comments', 'nil'), 'css_expression'=>array('2', 'nil', 'allow dynamic expressions in CSS style properties', 'nil'), 'deny_attribute'=>array('1', '0', 'denied attributes', '0', '50', '', 'these'), 'direct_list_nest'=>array('2', 'nil', 'allow direct nesting of a list within another without requiring it to be a list item', 'nil'), 'elements'=>array('', '', 'allowed elements', '50'), 'hexdec_entity'=>array('3', '1', 'convert hexadecimal numeric entities to decimal ones, or vice versa', '0'), 'hook'=>array('', '', 'name of hook function', '25'), 'hook_tag'=>array('', '', 'name of custom function to further check attribute values', '25'), 'keep_bad'=>array('7', '6', 'keep, or remove bad tag content', '0'), 'lc_std_val'=>array('2', '1', 'lower-case std. attribute values like radio', '0'), 'make_tag_strict'=>array('3', 'nil', 'transform deprecated elements', 'nil'), 3 is a new own config value, to indicate that transformation is to be performed, but don't transform font as size transformation of numeric sizes to keywords alters the intended result too much 'named_entity'=>array('2', '1', 'allow named entities, or convert numeric ones', '0'), 'no_deprecated_attr'=>array('3', '1', 'allow deprecated attributes, or transform them', '0'), 'parent'=>array('', 'div', 'name of parent element', '25'), 'safe'=>array('2', '0', 'for most safe HTML', '0'), 'schemes'=>array('', 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https', 'allowed URL protocols', '50'), 'show_setting'=>array('', 'htmLawed_setting', 'variable name to record finalized htmLawed settings', '25', 'd'=>1), 'style_pass'=>array('2', 'nil', 'do not look at style attribute values', 'nil'), 'tidy'=>array('3', '0', 'beautify/compact', '-1', '8', '1t1', 'format'), 'unique_ids'=>array('2', '1', 'unique id values', '0', '8', 'my_', 'prefix'), 'valid_xhtml'=>array('2', 'nil', 'auto-set various parameters for most valid XHTML', 'nil'), 'xml:lang'=>array('3', 'nil', 'auto-add xml:lang attribute', '0'), 'allow_for_inline' => array('table'),//block elements allowed for nesting when only inline is allowed; Example span does not allow block elements as table; table is the only element tested so far ); */ $this->Configuration = array('comment'=>1, //remove comments 'make_tag_strict'=>3,//3 is a new own config value, to indicate that transformation is to be performed, but don't transform font, as size transformation of numeric sizes to keywords alters the intended result too much 'balance'=>0,//turn off tag-balancing (config['balance']=>0). That will not introduce any security risk; only standards-compliant tag nesting check/filtering will be turned off (basic tag-balance will remain; i.e., there won't be any unclosed tag, etc., after filtering) // tidy eats away even some wanted whitespace, so we switch it off; // we used it for its compacting and beautifying capabilities, which resulted in better html for further processing 'tidy'=>0, 'elements' => "* -script", 'deny_attribute' => 'on*', 'schemes'=>'href: file, ftp, http, https, mailto; src: cid, data, file, ftp, http, https; *:file, http, https', 'hook_tag' =>"hl_my_tag_transform", ); $this->Spec = 'img=alt(noneof="image"/default="")'; } /** * egw_htmlLawed * * @param varchar $html2check =text input Text to check * @param mixed $Config = text or array * @param mixed $Spec =text or array; The '$spec' argument can be used to disallow an otherwise legal attribute for an element * @return varchar cleaned/fixed html */ function egw_htmLawed($html2check, $Config=null, $Spec=array()) { //error_log(__METHOD__.__LINE__.' Input:'.$html2check); if (is_array($Config) && is_array($this->Configuration)) $Config = array_merge($this->Configuration, $Config); if (empty($Config)) $Config = $this->Configuration; if (empty($Spec)) $Spec = $this->Spec; return htmLawed($html2check, $Config, $Spec); } } /** * hl_my_tag_transform * * function to provide individual checks for element attribute pairs * implemented so far: img checking for alt attribute == image; set this to empty * a checking for title, replacing @ * blockquote checking for cite, replacing @ */ function hl_my_tag_transform($element, $attribute_array=0) { // If second argument is not received, it means a closing tag is being handled if(is_numeric($attribute_array)){ return ""; } //if ($element=='img') error_log(__METHOD__.__LINE__." ".$element.'->'.array2string($attribute_array)); if ($element=='td' && isset($attribute_array['background'])) { if (stripos($attribute_array['background'],$GLOBALS['egw']->link('/index.php'))!==false) { //error_log(__METHOD__.__LINE__.array2string($attribute_array)); //$attribute_array['background'] = 'url('.$attribute_array['background'].');'; } else { // $attribute_array['background']='denied:'.$attribute_array['background']; unset($attribute_array['background']);// only cid style background images are allowed } } // Elements other than 'img' or 'img' without a 'img' attribute are returned unchanged if($element == 'img') { // Re-build 'alt' if (isset($attribute_array['alt'])) $attribute_array['alt'] = ($attribute_array['alt']=='image'?'':$attribute_array['alt']); if (isset($attribute_array['alt'])&&strpos($attribute_array['alt'],'@')!==false) $attribute_array['alt']=str_replace('@','(at)',$attribute_array['alt']); } if (isset($attribute_array['title'])) { if (strpos($attribute_array['title'],'@')!==false) $attribute_array['title']=str_replace('@','(at)',$attribute_array['title']); } if ($element == 'blockquote') { if (isset($attribute_array['cite'])) { if (strpos($attribute_array['cite'],'@')!==false) $attribute_array['cite']=str_replace('@','(at)',$attribute_array['cite']); } } /* // Elements other than 'span' or 'span' without a 'style' attribute are returned unchanged if($element == 'span' && isset($attribute_array['style'])) { // Identify CSS properties and values $css = explode(';', $attribute_array['style']); $style = array(); foreach($css as $v){ if(($p = strpos($v, ':')) > 1 && $p < strlen($v)){ $css_property_name = trim(substr($v, 0, $p)); $css_property_value = trim(substr($v, $p+1)); $style[] = "$css_property_name: $css_property_value"; } } // Alter the CSS property as required // Black Arial must be at a font-size of 24 if(isset($style['font-family']) && $style['font-family'] == 'Arial' && isset($style['color']) && $style['color'] == '#000000'){ $style['font-size'] == '24'; } // And so on for other criteria // ... // Re-build 'style' $attribute_array['style'] = implode('; ', $style); } */ if (isset($attribute_array['style']) && stripos($attribute_array['style'],'script')!==false) $attribute_array['style'] = str_ireplace('script','',$attribute_array['style']); // Build the attributes string $attributes = ''; foreach($attribute_array as $k=>$v){ $attributes .= " {$k}=\"{$v}\""; } // Return the opening tag with attributes static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); return "<{$element}{$attributes}". (isset($empty_elements[$element]) ? ' /' : ''). '>'; } /** * hl_email_tag_transform * * function to provide individual checks for element attribute pairs * implemented so far: img -checking for alt attribute == image; set this to empty * -control for/on external Images and src-length * a -checking for title and href, replacing @ accordingly * -navigate to local anchors without reloading the page * blockquote -checking for cite, replacing @ * throwing away excess div elements, that carry no style or class or id info */ function hl_email_tag_transform($element, $attribute_array=0) { static $lastelement; static $throwawaycounter; if (is_null($lastelement)) $lastelement=''; if (is_null($throwawaycounter)) $throwawaycounter = 0; //if ($throwawaycounter>250) error_log(__METHOD__.__LINE__.' '.$throwawaycounter); if ($element=='div' && $element==$lastelement && ($attribute_array==0 || empty($attribute_array))) { if (is_array($attribute_array)) $throwawaycounter++; if ($attribute_array==0 && $throwawaycounter>0) $throwawaycounter--; if ($throwawaycounter>0) return ''; } if ($lastelement=='div' && $element!=$lastelement && is_array($attribute_array)) $throwawaycounter = 0; if (is_array($attribute_array) && !empty($attribute_array) && $element=='div') { $lastelement = 'div_with_attr'; } else { if (is_array($attribute_array)) $lastelement = $element; } // If second argument is not received, it means a closing tag is being handled if(is_numeric($attribute_array)){ if($element==$lastelement) $lastelement=''; return ""; } //if ($element=='a') error_log(__METHOD__.__LINE__." ".$element.'->'.array2string($attribute_array)); if ($element=='td' && isset($attribute_array['background'])) { if (stripos($attribute_array['background'],'cid:')!==false) { //error_log(__METHOD__.__LINE__.array2string($attribute_array)); //$attribute_array['background'] = 'url('.$attribute_array['background'].');'; } else { // $attribute_array['background']='denied:'.$attribute_array['background']; unset($attribute_array['background']);// only cid style background images are allowed } } // Elements other than 'img' or 'img' without a 'img' attribute are returned unchanged if($element == 'img') { // Re-build 'alt' if (isset($attribute_array['alt'])) $attribute_array['alt'] = ($attribute_array['alt']=='image'?'':$attribute_array['alt']); if (isset($attribute_array['alt'])&&strpos($attribute_array['alt'],'@')!==false) $attribute_array['alt']=str_replace('@','(at)',$attribute_array['alt']); // $GLOBALS['egw_info']['user']['preferences']['felamimail']['allowExternalIMGs'] ? '' : 'match' => '/^cid:.*/'), if (isset($attribute_array['src'])) { if (!(strlen($attribute_array['src'])>4 && strlen($attribute_array['src']<400))) { $attribute_array['alt']= $attribute_array['alt'].' [blocked (reason: url length):'.$attribute_array['src'].']'; if (!isset($attribute_array['title'])) $attribute_array['title']=$attribute_array['alt']; $attribute_array['src']=common::image('phpgwapi','dialog_error'); } if (!$GLOBALS['egw_info']['user']['preferences']['felamimail']['allowExternalIMGs']) { if (!preg_match('/^cid:.*/',$attribute_array['src'])) { $attribute_array['alt']= $attribute_array['alt'].' [blocked external image:'.$attribute_array['src'].']'; if (!isset($attribute_array['title'])) $attribute_array['title']=$attribute_array['alt']; $attribute_array['src']=common::image('felamimail','no-image-shown'); $attribute_array['border'] = 1; if ($attribute_array['style']) { if (stripos($attribute_array['style'],'border')!==false) $attribute_array['style'] = preg_replace('~border(:|-left:|-right:|-bottom:|-top:)+ (0px)+ (none)+;~si','',$attribute_array['style']); } } } } } if (isset($attribute_array['style']) && stripos($attribute_array['style'],'script')!==false) $attribute_array['style'] = str_ireplace('script','',$attribute_array['style']); if (isset($attribute_array['title'])) { if (strpos($attribute_array['title'],'@')!==false) $attribute_array['title']=str_replace('@','(at)',$attribute_array['title']); } if ($element == 'blockquote') { if (isset($attribute_array['cite'])) { if (strpos($attribute_array['cite'],'@')!==false) $attribute_array['cite']=str_replace('@','(at)',$attribute_array['cite']); } } if($element == 'a') { //error_log(array2string($attribute_array)); if (strpos($attribute_array['href'],'denied:javascript')===0) $attribute_array['href']=''; if (isset($attribute_array['name']) && isset($attribute_array['id'])) $attribute_array['id'] = $attribute_array['name']; if (strpos($attribute_array['href'],'@')!==false) $attribute_array['href'] = str_replace('@','%40',$attribute_array['href']); if (strpos($attribute_array['href'],'#')===0) { $attribute_array['href'] = "javascript:GoToAnchor('".trim(substr($attribute_array['href'],1))."');"; } } // Build the attributes string $attributes = ''; foreach($attribute_array as $k=>$v){ $attributes .= " {$k}=\"{$v}\""; } // Return the opening tag with attributes static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); return "<{$element}{$attributes}". (isset($empty_elements[$element]) ? ' /' : ''). '>'; }