From 6408fdfc010ed8b23c596fa7786f910957b839f1 Mon Sep 17 00:00:00 2001 From: Klaus Leithoff Date: Mon, 15 Oct 2012 09:35:30 +0000 Subject: [PATCH] update htmLawed to Version 1.1.14 (including adaptions from http://www.bioinformatics.org/phplabware/forum/viewtopic.php?id=235) --- phpgwapi/inc/htmLawed/htmLawed.php | 45 +++-- phpgwapi/inc/htmLawed/htmLawedTest.php | 65 ++++++- phpgwapi/inc/htmLawed/htmLawed_README.htm | 181 ++++++++++++-------- phpgwapi/inc/htmLawed/htmLawed_README.txt | 172 ++++++++++--------- phpgwapi/inc/htmLawed/htmLawed_TESTCASE.txt | 23 ++- 5 files changed, 309 insertions(+), 177 deletions(-) diff --git a/phpgwapi/inc/htmLawed/htmLawed.php b/phpgwapi/inc/htmLawed/htmLawed.php index 090c09f85d..c7be528964 100644 --- a/phpgwapi/inc/htmLawed/htmLawed.php +++ b/phpgwapi/inc/htmLawed/htmLawed.php @@ -1,7 +1,7 @@ ', $x, ''; + } + elseif($do < 3 or isset($ok['#pcdata'])){echo $x;} elseif(strpos($x, "\x02\x04")){ foreach(preg_split('`(\x01\x02[^\x01\x02]+\x02\x01)`', $x, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $v){ echo (substr($v, 0, 2) == "\x01\x02" ? $v : ($do > 4 ? 
preg_replace('`\S`', '', $v) : '')); @@ -202,7 +205,7 @@ for($i=-1, $ci=count($t); ++$i<$ci;){ }elseif($do > 4){echo preg_replace('`\S`', '', $x);} } // get markup - if(!preg_match('`^(/?)([a-zA-Z1-6]+)([^>]*)>(.*)`sm', $t[$i], $r)){$x = $t[$i]; continue;} + if(!preg_match('`^(/?)([a-z1-6]+)([^>]*)>(.*)`sm', $t[$i], $r)){$x = $t[$i]; continue;} $s = null; $e = null; $a = null; $x = null; list($all, $s, $e, $a, $x) = $r; // close tag if($s){ @@ -224,6 +227,9 @@ for($i=-1, $ci=count($t); ++$i<$ci;){ if((($ql && isset($cB[$p])) or (isset($cB[$in]) && !$ql)) && !isset($eB[$e]) && !isset($ok[$e])){ array_splice($t, $i, 0, 'div>'); unset($e, $x); ++$ci; --$i; continue; } + if($e == 'div' && !isset($ok['div']) && strpos($a, '-htmlawed-transform')){ + $t[$i] = "span{$a}>{$x}"; unset($e, $x); --$i; continue; + } // if no open ele, $in = parent; mostly immediate parent-child relation should hold if(!$ql or !isset($eN[$e]) or !array_intersect($q, $cN2)){ if(!isset($ok[$e])){ @@ -412,7 +418,6 @@ $t = $t[0]; if($t == '< '){return '< ';} if($t == '>'){return '>';} if(!preg_match('`^<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>$`m', $t, $m)){ - error_log(__METHOD__.__LINE__.' Keep Bad:'.$C['keep_bad'].'->'.array2string($t)); //return str_replace(array('<', '>'), array('<', '>'), $t); return (($C['keep_bad']%2) ? str_replace(array('<', '>'), array('<', '>'), $t) : ''); }elseif(!isset($C['elements'][($e = strtolower($m[2]))])){ @@ -633,7 +638,13 @@ if($e == 'font'){ if(preg_match('`size\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m) && isset($fs[($m = trim($m[2]))])){ $a2 .= ' font-size: '. $fs[$m]. ';'; } - $e = 'span'; return ltrim($a2); +// $e = 'span'; return ltrim($a2); +// replace the above with following + if($GLOBALS['C']['balance']){ + $e = 'div'; return 'display: inline; -htmlawed-transform: 1; '. ltrim($a2); + }else{ + $e = 'span'; return ltrim($a2); + } } if($t == 2){$e = 0; return 0;} return ''; @@ -649,16 +660,16 @@ if(($w = strtolower($w)) == -1){ } $s = strpos(" $w", 't') ? 
"\t" : ' '; $s = preg_match('`\d`', $w, $m) ? str_repeat($s, $m[0]) : str_repeat($s, ($s == "\t" ? 1 : 2)); -$n = preg_match('`[ts]([1-9])`', $w, $m) ? $m[1] : 0; +$N = preg_match('`[ts]([1-9])`', $w, $m) ? $m[1] : 0; $a = array('br'=>1); $b = array('button'=>1, 'input'=>1, 'option'=>1); $c = array('caption'=>1, 'dd'=>1, 'dt'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'isindex'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'object'=>1, 'p'=>1, 'pre'=>1, 'td'=>1, 'textarea'=>1, 'th'=>1); -$d = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'form'=>1, 'hr'=>1, 'iframe'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1); -$to = explode('<', $t); -$do = 1; -while($do){ - $n = $no; - $t = $to; +$d = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'form'=>1, 'hr'=>1, 'iframe'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1); +$T = explode('<', $t); +$X = 1; +while($X){ + $n = $N; + $t = $T; ob_start(); if(isset($d[$p])){echo str_repeat($s, ++$n);} echo ltrim(array_shift($t)); @@ -670,7 +681,7 @@ while($do){ if(isset($d[$y])){ if(!$x){ if($n){echo "\n", str_repeat($s, --$n), "$e\n", str_repeat($s, $n);} - else{++$no; ob_end_clean(); continue 2;} + else{++$N; ob_end_clean(); continue 2;} } else{echo "\n", str_repeat($s, $n), "$e\n", str_repeat($s, ($x != 1 ? 
++$n : $n));} echo ltrim($r); continue; @@ -684,7 +695,7 @@ while($do){ }elseif(!$y){echo $f, $e, $f, ltrim($r); }else{echo $e, $r;} } - $do = 0; + $X = 0; } $t = preg_replace('`[\n]\s*?[\n]+`', "\n", ob_get_contents()); ob_end_clean(); @@ -697,7 +708,7 @@ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array( function hl_version(){ // rel -return '1.1.13'; +return '1.1.14'; // eof } diff --git a/phpgwapi/inc/htmLawed/htmLawedTest.php b/phpgwapi/inc/htmLawed/htmLawedTest.php index 136c9575d9..63346b1f32 100644 --- a/phpgwapi/inc/htmLawed/htmLawedTest.php +++ b/phpgwapi/inc/htmLawed/htmLawedTest.php @@ -1,8 +1,8 @@ =this.Diff_Timeout?Number.MAX_VALUE:(new Date).getTime()+1E3*this.Diff_Timeout);if(null==a||null==b)throw Error("Null input. (diff_main)");if(a==b)return a?[[0,a]]:[];"undefined"==typeof c&&(c=!0);var e=c,f=this.diff_commonPrefix(a,b),c=a.substring(0,f),a=a.substring(f),b=b.substring(f),f=this.diff_commonSuffix(a,b),g=a.substring(a.length-f),a=a.substring(0,a.length-f),b=b.substring(0,b.length-f),a=this.diff_compute_(a, +b,e,d);c&&a.unshift([0,c]);g&&a.push([0,g]);this.diff_cleanupMerge(a);return a}; +diff_match_patch.prototype.diff_compute_=function(a,b,c,d){if(!a)return[[1,b]];if(!b)return[[-1,a]];var e=a.length>b.length?a:b,f=a.length>b.length?b:a,g=e.indexOf(f);if(-1!=g)return c=[[1,e.substring(0,g)],[0,f],[1,e.substring(g+f.length)]],a.length>b.length&&(c[0][0]=c[2][0]=-1),c;if(1==f.length)return[[-1,a],[1,b]];return(e=this.diff_halfMatch_(a,b))?(f=e[0],a=e[1],g=e[2],b=e[3],e=e[4],f=this.diff_main(f,g,c,d),c=this.diff_main(a,b,c,d),f.concat([[0,e]],c)):c&&100c);u++){for(var n=-u+q;n<=u-s;n+=2){var l=g+n,m;m=n==-u||n!=u&&j[l-1]d)s+=2;else if(r>e)q+=2;else if(p&&(l=g+k-n,0<=l&&l= +t)return this.diff_bisectSplit_(a,b,m,r,c)}}for(n=-u+o;n<=u-v;n+=2){l=g+n;t=n==-u||n!=u&&i[l-1]d)v+=2;else if(m>e)o+=2;else if(!p&&(l=g+k-n,0<=l&&l=t)))return this.diff_bisectSplit_(a,b,m,r,c)}}return[[-1,a],[1,b]]}; 
+diff_match_patch.prototype.diff_bisectSplit_=function(a,b,c,d,e){var f=a.substring(0,c),g=b.substring(0,d),a=a.substring(c),b=b.substring(d),f=this.diff_main(f,g,!1,e),e=this.diff_main(a,b,!1,e);return f.concat(e)}; +diff_match_patch.prototype.diff_linesToChars_=function(a,b){function c(a){for(var b="",c=0,f=-1,g=d.length;fd?a=a.substring(c-d):c=a.length?[h,j,n,l,g]:null}if(0>=this.Diff_Timeout)return null; +var d=a.length>b.length?a:b,e=a.length>b.length?b:a;if(4>d.length||2*e.lengthd[4].length?g:d:d:g;var j;a.length>b.length?(g=h[0],d=h[1],e=h[2],j=h[3]):(e=h[0],j=h[1],g=h[2],d=h[3]);h=h[4];return[g,d,e,j,h]}; +diff_match_patch.prototype.diff_cleanupSemantic=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=0,h=0,j=0,i=0;f=e){if(d>=b.length/2||d>=c.length/2)a.splice(f,0,[0,c.substring(0,d)]),a[f-1][1]=b.substring(0,b.length-d),a[f+1][1]=c.substring(d),f++}else if(e>=b.length/2||e>=c.length/2)a.splice(f,0,[0,b.substring(0,e)]),a[f-1][0]=1,a[f-1][1]=c.substring(0,c.length-e),a[f+1][0]=-1,a[f+1][1]=b.substring(e),f++;f++}f++}}; +diff_match_patch.prototype.diff_cleanupSemanticLossless=function(a){function b(a,b){if(!a||!b)return 6;var c=a.charAt(a.length-1),d=b.charAt(0),e=c.match(diff_match_patch.nonAlphaNumericRegex_),f=d.match(diff_match_patch.nonAlphaNumericRegex_),g=e&&c.match(diff_match_patch.whitespaceRegex_),h=f&&d.match(diff_match_patch.whitespaceRegex_),c=g&&c.match(diff_match_patch.linebreakRegex_),d=h&&d.match(diff_match_patch.linebreakRegex_),i=c&&a.match(diff_match_patch.blanklineEndRegex_),j=d&&b.match(diff_match_patch.blanklineStartRegex_); +return i||j?5:c||d?4:e&&!g&&h?3:g||h?2:e||f?1:0}for(var c=1;c=i&&(i=k,g=d,h=e,j=f)}a[c-1][1]!=g&&(g?a[c-1][1]=g:(a.splice(c-1,1),c--),a[c][1]= 
+h,j?a[c+1][1]=j:(a.splice(c+1,1),c--))}c++}};diff_match_patch.nonAlphaNumericRegex_=/[^a-zA-Z0-9]/;diff_match_patch.whitespaceRegex_=/\s/;diff_match_patch.linebreakRegex_=/[\r\n]/;diff_match_patch.blanklineEndRegex_=/\n\r?\n$/;diff_match_patch.blanklineStartRegex_=/^\r?\n\r?\n/; +diff_match_patch.prototype.diff_cleanupEfficiency=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=!1,h=!1,j=!1,i=!1;fb)break;e=c;f=d}return a.length!=g&&-1===a[g][0]?f:f+(b-e)}; +diff_match_patch.prototype.diff_prettyHtml=function(a){for(var b=[],c=/&/g,d=//g,f=/\n/g,g=0;g¬
");switch(h){case 1:b[g]=''+j+"";break;case -1:b[g]=''+j+"";break;case 0:b[g]=""+j+""}}return b.join("")}; +diff_match_patch.prototype.diff_text1=function(a){for(var b=[],c=0;cthis.Match_MaxBits)throw Error("Pattern too long for this browser.");var e=this.match_alphabet_(b),f=this,g=this.Match_Threshold,h=a.indexOf(b,c);-1!=h&&(g=Math.min(d(0,h),g),h=a.lastIndexOf(b,c+b.length),-1!=h&&(g=Math.min(d(0,h),g)));for(var j=1<=i;o--){var v=e[a.charAt(o-1)];k[o]=0===s?(k[o+1]<<1|1)&v:(k[o+1]<<1|1)&v|(q[o+1]|q[o])<<1|1|q[o+1];if(k[o]&j&&(v=d(s,o-1),v<=g))if(g=v,h=o-1,h>c)i=Math.max(1,2*c-h);else break}if(d(s+1,c)>g)break;q=k}return h}; +diff_match_patch.prototype.match_alphabet_=function(a){for(var b={},c=0;c=2*this.Patch_Margin&& +e&&(this.patch_addContext_(a,h),c.push(a),a=new diff_match_patch.patch_obj,e=0,h=d,f=g)}1!==i&&(f+=k.length);-1!==i&&(g+=k.length)}e&&(this.patch_addContext_(a,h),c.push(a));return c};diff_match_patch.prototype.patch_deepCopy=function(a){for(var b=[],c=0;cthis.Match_MaxBits){if(j=this.match_main(b,h.substring(0,this.Match_MaxBits),g),-1!=j&&(i=this.match_main(b,h.substring(h.length-this.Match_MaxBits),g+h.length-this.Match_MaxBits),-1==i||j>=i))j=-1}else j=this.match_main(b,h,g); +if(-1==j)e[f]=!1,d-=a[f].length2-a[f].length1;else if(e[f]=!0,d=j-g,g=-1==i?b.substring(j,j+h.length):b.substring(j,i+this.Match_MaxBits),h==g)b=b.substring(0,j)+this.diff_text2(a[f].diffs)+b.substring(j+h.length);else if(g=this.diff_main(h,g,!1),h.length>this.Match_MaxBits&&this.diff_levenshtein(g)/h.length>this.Patch_DeleteThreshold)e[f]=!1;else{this.diff_cleanupSemanticLossless(g);for(var h=0,k,i=0;ie[0][1].length){var f=b-e[0][1].length;e[0][1]=c.substring(e[0][1].length)+e[0][1];d.start1-=f;d.start2-=f;d.length1+=f;d.length2+=f}d=a[a.length-1];e=d.diffs;0==e.length||0!=e[e.length-1][0]?(e.push([0, +c]),d.length1+=b,d.length2+=b):b>e[e.length-1][1].length&&(f=b-e[e.length-1][1].length,e[e.length-1][1]+=c.substring(0,f),d.length1+=f,d.length2+=f);return c}; 
+diff_match_patch.prototype.patch_splitMax=function(a){for(var b=this.Match_MaxBits,c=0;c2*b?(h.length1+=i.length,e+=i.length,j=!1,h.diffs.push([g,i]),d.diffs.shift()):(i=i.substring(0,b-h.length1-this.Patch_Margin),h.length1+=i.length,e+=i.length,0===g?(h.length2+=i.length,f+=i.length):j=!1,h.diffs.push([g,i]),i==d.diffs[0][1]?d.diffs.shift():d.diffs[0][1]=d.diffs[0][1].substring(i.length))}g=this.diff_text2(h.diffs);g=g.substring(g.length-this.Patch_Margin);i=this.diff_text1(d.diffs).substring(0,this.Patch_Margin);""!==i&& +(h.length1+=i.length,h.length2+=i.length,0!==h.diffs.length&&0===h.diffs[h.diffs.length-1][0]?h.diffs[h.diffs.length-1][1]+=i:h.diffs.push([0,i]));j||a.splice(++c,0,h)}}};diff_match_patch.prototype.patch_toText=function(a){for(var b=[],c=0;c htmLawed (<?php echo hl_version();?>) test @@ -555,7 +608,7 @@ if($do){
Output code »
', format($out), '
', (!isset($_POST['text'][$_hlimit]) ? '
Output binary »' : ''), '
Output rendered »
', $out, '
'; + echo '
Output code »
', format($out), '
', (!isset($_POST['text'][$_hlimit]) ? ' Output binary »' : ''), ' Diff »
Output rendered »
', $out, '
'; } else{ ?> diff --git a/phpgwapi/inc/htmLawed/htmLawed_README.htm b/phpgwapi/inc/htmLawed/htmLawed_README.htm index 4589ef1aa7..dbf4f308be 100644 --- a/phpgwapi/inc/htmLawed/htmLawed_README.htm +++ b/phpgwapi/inc/htmLawed/htmLawed_README.htm @@ -110,8 +110,8 @@ span.totop a, span.totop a:visited {color: #6699cc;}

-
htmLawed_README.txt, 22 July 2012
-htmLawed 1.1.13, 22 July 2012
+
htmLawed_README.txt, 17 September 2012
+htmLawed 1.1.14, 8 August 2012
Copyright Santosh Patnaik
Dual licensed with LGPL 3 and GPL 2+
A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed 
@@ -121,9 +121,9 @@ A PHP Labware internal utility - 1  About htmLawed (to top)

-  htmLawed is a highly customizable single-file PHP script to make text secure, and standard- and admin policy-compliant for use in the body of HTML 4, XHTML 1 or 1.1, or generic XML documents. It is thus a configurable input (X)HTML filter, processor, purifier, sanitizer, beautifier, etc., and an alternative to the HTMLTidy application.
+  htmLawed is a PHP script to process text with HTML markup to make it more compliant with HTML standards  and administrative policies. It works by making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting (XSS) attacks, allowing only specified HTML tags and attributes, and so on. Such lawing in of HTML in text used in (X)HTML or XML documents ensures that it is in accordance with the aesthetics, safety and usability requirements set by administrators.

-  The lawing in of input text is needed to ensure that HTML code in the text is standard-compliant, does not introduce security vulnerabilities, and does not break the aesthetics, design or layout of web-pages. htmLawed tries to do this by, for example, making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting (XSS) attacks, and allowing only specified HTML elements/tags and attributes.
+  htmLawed is highly customizable, and fast with low memory usage. Its free and open-source code is in one small file, does not require extensions or libraries, and works in older versions of PHP as well. It is a good alternative to the HTML Tidy application.

1.1  Example uses @@ -151,8 +151,8 @@ A PHP Labware internal utility - img  ^`
+  *  can restrict elements  ^~`
+  *  ensures proper closure of empty elements like img  ^`
  *  transform deprecated elements like u  ^~`
  *  HTML comments and CDATA sections can be permitted  ^~`
  *  elements like script, object and form can be permitted  ~
@@ -161,7 +161,7 @@ A PHP Labware internal utility -
alt for image  ^`
-  *  transform deprecated attributes  ^~`
+  *  transforms deprecated attributes  ^~`
  *  attributes declared only once  ^`

  *  restrict attribute values, including element-specifically  ^~`
@@ -214,52 +214,74 @@ A PHP Labware internal utility -
1.3  History

(to top)

-  htmLawed was developed for use with LabWiki, a wiki software developed at PHP Labware, as a suitable software could not be found. Existing PHP software like Kses and HTMLPurifier were deemed inadequate, slow, resource-intensive, or dependent on external applications like HTML Tidy.
+  htmLawed was created in 2007 for use with LabWiki, a wiki software developed at PHP Labware, as a suitable software could not be found. Existing PHP software like Kses and HTMLPurifier were deemed inadequate, slow, resource-intensive, or dependent on an extension or external application like HTML Tidy. The core logic of htmLawed, that of identifying HTML elements and attributes, was based on the Kses (version 0.2.2) HTML filter software of Ulf Harnhammar (it can still be used with code that uses Kses; see section 2.6.).

-  htmLawed started as a modification of Ulf Harnhammar's Kses (version 0.2.2) software, and is compatible with code that uses Kses; see section 2.6.
+  See section 4.3 for a detailed log of changes in htmLawed over the years, and section 4.10 for acknowledgements.

1.4  License & copyright

(to top)

-  htmLawed is free and open-source software dual licensed under LGPL license version 3, and GPL license version 2 (or later), and copyrighted by Santosh Patnaik, MD, PhD.
+  htmLawed is free and open-source software copyrighted by Santosh Patnaik, MD, PhD, and dual licensed under LGPL license version 3, and GPL license version 2 (or later).

1.5  Terms used here

(to top)

-  *  administrator - or admin; person setting up the code to pass input through htmLawed; also, user
+  In this document, only HTML body-level elements are considered. htmLawed does not have support for head-level elements, body, and the frame-level elements, frameset, frame and noframes, and these elements are ignored here.
+
+  *  administrator - or admin; person setting up the code that utilizes htmLawed; also, user
  *  attributes - name-value pairs like href="http://x.com" in opening tags
-  *  author - writer
+  *  author - see writer
  *  character - atomic unit of text; internally represented by a numeric code-point as specified by the encoding or charset in use
  *  entity - markup like &gt; and &#160; used to refer to a character
  *  element - HTML element like a and img
-  *  element content -  content between the opening and closing tags of an element, like click of <a href="x">click</a>
+  *  element content -  content between the opening and closing tags of an element, like click of the <a href="x">click</a> element
  *  HTML - implies XHTML unless specified otherwise
-  *  input - text string given to htmLawed to process
+  *  HTML body - Complete HTML documents typically have a head and a body container. Information in head specifies title of the document, etc., whereas that in the body informs what is to be displayed on a web-page; it is only the elements for body, except frames, frameset and noframes that htmLawed is concerned with
+  *  input - text given to htmLawed to process
  *  processing - involves filtering, correction, etc., of input
-  *  safe - absence or reduction of certain characters and HTML elements and attributes in the input that can otherwise potentially and circumstantially expose web-site users to security vulnerabilities like cross-site scripting attacks (XSS)
-  *  scheme - URL protocol like http and ftp
-  *  specs - standard specifications
+  *  safe - absence or reduction of certain characters and HTML elements and attributes in HTML of text that can otherwise potentially, and circumstantially, expose text readers to security vulnerabilities like cross-site scripting attacks (XSS)
+  *  scheme - a URL protocol like http and ftp
+  *  specifications - standard specifications, for HTML4, HTML5, Ruby, etc.
  *  style property - terms like border and height for which declarations are made in values for the style attribute of elements
  *  tag - markers like <a href="x"> and </a> delineating element content; the opening tag can contain attributes
  *  tag content - consists of tag markers < and >, element names like div, and possibly attributes
  *  user - administrator
  *  writer - end-user like a blog commenter providing the input that is to be processed; also, author
+
+

+1.6  Availability +

(to top)
+
+  htmLawed can be downloaded for free at its website. Besides the htmLawed.php file, the download has the htmLawed documentation (this document) in plain text and HTML formats, a script for testing, and a text file for test-cases. htmLawed is also available as a PHP class (OOP code) on its website.
+

2  Usage

(to top)

-  htmLawed should work with PHP 4.4 and higher. Either include() the htmLawed.php file or copy-paste the entire code.
+  htmLawed works in PHP version 4.4 or higher. Either include() the htmLawed.php file, or copy-paste the entire code. To use with PHP 4.3, have the following code included:

-  To easily test htmLawed using a form-based interface, use the provided demo (htmLawed.php and htmLawedTest.php should be in the same directory on the web-server).
+ +    if(!function_exists('ctype_digit')){ +
+ +     function ctype_digit($var){ +
+ +      return ((int) $var == $var); +
+ +     } +
+ +    }
Note: For code for usage of the htmLawed class (for htmLawed in OOP), please refer to this page on the htmLawed website; the filtering itself can be configured, etc., as described here.

2.1  Simple @@ -271,7 +293,13 @@ A PHP Labware internal utility -     $processed = htmLawed($text);

Note: If input is from a $_GET or $_POST value, and magic quotes are enabled on the PHP setup, run stripslashes() on the input before passing to htmLawed.
+  With the htmLawed class (
section 1.6), usage is:
+
+ +    $processed = htmLawed::hl($text); +
+
Notes: (1) If input is from a $_GET or $_POST value, and magic quotes are enabled on the PHP setup, run stripslashes() on the input before passing to htmLawed. (2) htmLawed does not have support for head-level elements, body, and the frame-level elements, frameset, frame and noframes.

  By default, htmLawed will process the text allowing all valid HTML elements/tags, secure URL scheme/CSS style properties, etc. It will allow CDATA sections and HTML comments, balance tags, and ensure proper nesting of elements. Such actions can be configured using two other optional arguments -- $config and $spec:

@@ -279,9 +307,7 @@ A PHP Labware internal utility -     $processed = htmLawed($text, $config, $spec);

-  These extra parameters are detailed below. Some examples are shown in
section 2.9.
-
Note: For maximum protection against XSS and other scripting attacks (e.g., by disallowing Javascript code), consider using the safe parameter; see section 3.6.
+  The $config and $spec arguments are detailed below. Some examples are shown in section 2.9. For maximum protection against XSS and other scripting attacks (e.g., by disallowing Javascript code), consider using the safe parameter; see section 3.6.

@@ -321,13 +347,13 @@ A PHP Labware internal utility - section 3.4.7

  0 - no measure taken  *
array("regex1", "regex2") - will ensure a rel attribute with nofollow in its value in case the href attribute value matches the regular expression pattern regex1, and/or will remove href if its value matches the regular expression pattern regex2. E.g., array("/./", "/://\W*(?!(abc\.com|xyz\.org))/"); see section 3.4.7 for more.
array("regex1", "regex2") - will ensure a rel attribute with nofollow in its value in case the href attribute value matches the regular expression pattern regex1, and/or will remove href if its value matches the regular expression pattern regex2. E.g., array("/./", "/://\W*(?!(abc\.com|xyz\.org))/"); see section 3.4.7 for more.

  anti_mail_spam
  Anti-mail-spam measure; see section 3.4.7

  0 - no measure taken  *
word - @ in mail address in href attribute value is replaced with specified word
word - @ in mail address in href attribute value is replaced with specified word

  balance
  Balance tags for well-formedness and proper nesting; see section 3.3.3
@@ -371,7 +397,7 @@ A PHP Labware internal utility - section 3.4

  0 - none  *
string - dictated by values in string
string - dictated by values in string
  on* (like onfocus) attributes not allowed - "

  direct_nest_list
@@ -397,13 +423,13 @@ A PHP Labware internal utility - $config or $spec before htmLawed starts its main work; see section 3.7

  0 - no hook function  *
name - name is name of the hook function (kses_hook  ^)
name - name is name of the hook function (kses_hook  ^)

  hook_tag
  Name of an optional hook function to alter tag content finalized by htmLawed; see section 3.4.9

  0 - no hook function  *
name - name is name of the hook function
name - name is name of the hook function

  keep_bad
  Neutralize bad tags by converting < and > to entities, or remove them; see section 3.3.3
@@ -477,7 +503,7 @@ A PHP Labware internal utility - 0 - no  ^
  1 - remove duplicate and/or invalid ones  *
word - remove invalid ones and replace duplicate ones with new and unique ones based on the word; the admin-specified word, like my_, should begin with a letter (a-z) and can contain letters, digits, ., _, -, and :.
word - remove invalid ones and replace duplicate ones with new and unique ones based on the word; the admin-specified word, like my_, should begin with a letter (a-z) and can contain letters, digits, ., _, -, and :.

  valid_xhtml
  Magic parameter to make input the most valid XHTML without needing to specify other relevant $config parameters; see
section 3.5
@@ -497,7 +523,7 @@ A PHP Labware internal utility - 2.3  Extra HTML specifications using the $spec parameter

(to top)

-  The $spec argument can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policy compliance. $spec is specified as a string of text containing one or more rules, with multiple rules separated from each other by a semi-colon (;). E.g.,
+  The $spec argument of htmLawed can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policies. $spec is specified as a string of text containing one or more rules, with multiple rules separated from each other by a semi-colon (;). E.g.,

    $spec = 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt'; @@ -526,7 +552,7 @@ A PHP Labware internal utility - a=-*, href, title - none except href and title
  *  a=-*, -id, href, title - none except href and title

-  Rules regarding attribute values are optionally specified inside round brackets after attribute names in slash ('/')-separated parameter = value pairs. E.g., title(maxlen=30/minlen=5). None, or one or more of the following parameters may be specified:
+  Rules regarding attribute values are optionally specified inside round brackets after attribute names in slash ('/')-separated parameter = value pairs. E.g., title(maxlen=30/minlen=5). None or one or more of the following parameters may be specified:

  *  oneof - one or more choices separated by | that the value should match; if only one choice is provided, then the value must match that choice

@@ -558,7 +584,7 @@ A PHP Labware internal utility -
;, ,, /, (, ), |, ~ and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be escaped by enclosing in pairs of double-quotes ("). A back-tick (`) can be used to escape a literal ". An example rule illustrating this is input=value(maxlen=30/match="/^\w/"/default="your `"ID`"").

Note: To deny an attribute for all elements for which it is legal, $config["deny_attribute"] (see
section 3.4) can be used instead of $spec. Also, attributes can be allowed element-specifically through $spec while being denied globally through $config["deny_attribute"]. The hook_tag parameter (section 3.4.9) can also be used to implement the $spec functionality.
Note: To deny an attribute for all elements for which it is legal, $config["deny_attribute"] (see section 3.4) can be used instead of $spec. Also, attributes can be allowed element-specifically through $spec while being denied globally through $config["deny_attribute"]. The hook_tag parameter (section 3.4.9) can also be possibly used to implement a functionality like that achieved using $spec functionality.

  $spec can also be used to permit custom, non-standard attributes as well as custom rules for standard attributes. Thus, the following value of $spec will permit the custom uses of the standard rel attribute in input (not permitted as per standards) and of a non-standard attribute, vFlag, in img.

@@ -566,14 +592,14 @@ A PHP Labware internal utility -     $spec = 'img=vFlag; input=rel'

-  The attribute names can contain alphabets, colons (:) and hyphens (-) but must start with an alphabet.
+  The attribute names can contain alphabets, colons (:) and hyphens (-), but they must start with an alphabet.

2.4  Performance time & memory usage

(to top)

-  The time and memory used by htmLawed depends on its configuration and the size of the input, and the amount, nestedness and well-formedness of the HTML markup within it. In particular, tag balancing and beautification each can increase the processing time by about a quarter.
+  The time and memory consumed during text processing by htmLawed depends on its configuration, the size of the input, and the amount, nestedness and well-formedness of the HTML markup within the input. In particular, tag balancing and beautification each can increase the processing time by about a quarter.

  The htmLawed demo can be used to evaluate the performance and effects of different types of input and $config.
@@ -582,15 +608,13 @@ A PHP Labware internal utility - 2.5  Some security risks to keep in mind (to top)

-  When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially dangerous HTML code which is meant to steal user-data, deface a website, render a page non-functional, etc.
-
-  Unless end-users, either people or software, supplying the content are completely trusted, security issues arising from the degree of HTML usage permission has to be kept in mind. For example, following increase security risks:
+  When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially dangerous HTML code which is meant to steal user-data, deface a website, render a page non-functional, etc. Unless end-users, either people or software, supplying the content are completely trusted, security issues arising from the degree of HTML usage permitted through htmLawed's setting should be considered. For example, the following increase security risks:

  *  Allowing script, applet, embed, iframe or object elements, or certain of their attributes like allowscriptaccess

  *  Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., <!--[if gte IE 4]><script>alert("xss");</script><![endif]-->

-  *  Allowing dynamic CSS expressions (a feature of the IE browser)
+  *  Allowing dynamic CSS expressions (some Internet Explorer versions are vulnerable)

  *  Allowing the style attribute

@@ -598,7 +622,7 @@ A PHP Labware internal utility - *style* attribute brings in risks of click-jacking, phishing, web-page overlays, etc., even when the safe parameter is enabled (see section 3.6). Except for URLs and a few other things like CSS dynamic expressions, htmLawed currently does not check every CSS style property. It does provide ways for the code-developer implementing htmLawed to do such checks through htmLawed's $spec argument, and through the hook_tag parameter (see section 3.4.8 for more). Disallowing style completely and relying on CSS classes and stylesheet files is recommended.

-  htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML meta tags, this can permit an exploit (like Google's UTF-7/XSS vulnerability of the past).
+  htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permissive circumstances, such as when the character encoding is left undefined through HTTP headers or HTML meta tags, this can allow for an exploit (like Google's UTF-7/XSS vulnerability of the past).

@@ -667,7 +691,7 @@ A PHP Labware internal utility - 2.7  Tolerance for ill-written HTML

(to top)

-  htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be read as HTML, and be considered mere plain text instead. Following statements indicate the degree of looseness that htmLawed can work with, and can be provided in instructions to writers:
+  htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be read as HTML, and may therefore get identified as mere plain text. The following statements indicate the degree of looseness that htmLawed can work with, and can be provided in instructions to writers:

  *  Tags must be flanked by < and > with no > inside -- any needed > should be put in as &gt;. It is possible for tag content (element name and attributes) to be spread over many lines instead of being on one. A space may be present between the tag content and >, like <div > and <img / >, but not after the <.

@@ -675,13 +699,13 @@ A PHP Labware internal utility - &#0160;, &#x07ff;) with 0 is okay as long as the number of characters between the & and the ; does not exceed 8. All entities must end with ; though.

-  *  Named character entities must be properly cased. E.g., &Lt; or &TILDE; will not be let through without modification.
+  *  Named character entities must be properly cased. Thus, &Lt; or &TILDE; will not be recognized as entities and will be neutralized.

-  *  HTML comments should not be inside element tags (okay between tags), and should begin with <!-- and end with -->. Characters like <, >, and & may be allowed inside depending on $config, but any --> inside should be put in as --&gt;. Any -- inside will be automatically converted to -, and a space will be added before the comment delimiter -->.
+  *  HTML comments should not be inside element tags (they can be between tags), and should begin with <!-- and end with -->. Characters like <, >, and & may be allowed inside depending on $config, but any --> inside should be put in as --&gt;. Any -- inside will be automatically converted to -, and a space will be added before the comment delimiter -->.

  *  CDATA sections should not be inside element tags, and can be in element content only if plain text is allowed for that element. They should begin with <![CDATA[ and end with ]]>. Characters like <, >, and & may be allowed inside depending on $config, but any ]]> inside should be put in as ]]&gt;.

@@ -696,22 +720,22 @@ A PHP Labware internal utility -
$config["unique_ids"] not 0 and the id attribute being permitted, writers should carefully avoid using duplicate or invalid id values as even though htmLawed will correct/remove the values, the final output may not be the one desired. E.g., when <a id="home"></a><input id="home" /><label for="home"></label> is processed into
<a id="home"></a><input id="prefix_home" /><label for="home"></label>.

-  *  Note that even if intended HTML is lost in a highly ill-written input, the processed output will be more secure and standard-compliant.
+  *  Even if intended HTML is lost from an ill-written input, the processed output will be more secure and standard-compliant.

  *  For URLs, unless $config["scheme"] is appropriately set, writers should avoid using escape characters or entities in schemes. E.g., htt&#112; (which many browsers will read as the harmless http) may be considered bad by htmLawed.

-  *  htmLawed will attempt to put plain text present directly inside blockquote, form, map and noscript elements (illegal as per the specs) inside auto-generated div elements.
+  *  htmLawed will attempt to put plain text present directly inside blockquote, form, map and noscript elements (illegal as per the specifications) inside auto-generated div elements.

2.8  Limitations & work-arounds

(to top)

-  htmLawed's main objective is to make the input text more standard-compliant, secure for web-page readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with work-arounds.
+  htmLawed's main objective is to make the input text more standard-compliant, secure for readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with work-arounds.

-  It should be borne in mind that no browser application is 100% standard-compliant, and that some of the standard specs (like asking for normalization of white-spacing within textarea elements) are clearly wrong. Regarding security, note that unsafe HTML code is not necessarily legally invalid.
+  It should be borne in mind that no browser application is 100% standard-compliant, and that some of the standard specifications (like asking for normalization of white-spacing within textarea elements) are clearly wrong. Regarding security, note that unsafe HTML code is not legally invalid per se.

-  *  htmLawed is meant for input that goes into the body of HTML documents. HTML's head-level elements are not supported, nor are the frameset elements frameset, frame and noframes.
+  *  htmLawed is meant for input that goes into the body of HTML documents. HTML's head-level elements are not supported, nor are the frameset elements frameset, frame and noframes. Content of the latter elements can, however, be individually filtered through htmLawed.

  *  It cannot transform the non-standard embed elements to the standard-compliant object elements. Yet, it can allow embed elements if permitted (embed is widely used and supported). Admins can certainly use the hook_tag parameter (section 3.4.9) to deploy a custom embed-to-object converter function.

@@ -721,7 +745,7 @@ A PHP Labware internal utility - width="20m" with the dimension in non-standard m is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. Admins should look at the hook_tag parameter (section 3.4.9) or $spec to enforce finer checks.

-  *  The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specs. Only a few of the proprietary attributes are supported.
+  *  The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specifications. Only a few of the proprietary attributes are supported.

  *  Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. Admins should look at using the hook_tag parameter (section 3.4.9) or $spec for finer checks. Perhaps the best option is to disallow style but allow class attributes with the right oneof or match values for class, and have the various class style properties in .css CSS stylesheet files.

@@ -733,11 +757,11 @@ A PHP Labware internal utility - http to https. Having absolute URLs may be a standard-requirement, e.g., when HTML is embedded in email messages, whereas altering URLs for other purposes is beyond htmLawed's goals. Admins may be able to use a custom hook function to enforce such checks (hook_tag parameter; see section 3.4.9).

-  *  Pairs of opening and closing tags that do not enclose any content (like <em></em>) are not removed. This may be against the standard specs for certain elements (e.g., table). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code.
+  *  Pairs of opening and closing tags that do not enclose any content (like <em></em>) are not removed. This may be against the standard specifications for certain elements (e.g., table). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code.

-  *  htmLawed does not check for certain element orderings described in the standard specs (e.g., in a table, tbody is allowed before tfoot). Admins may be able to use a custom hook function to enforce such checks (hook_tag parameter; see section 3.4.9).
+  *  htmLawed does not check for certain element orderings described in the standard specifications (e.g., in a table, tbody is allowed before tfoot). Admins may be able to use a custom hook function to enforce such checks (hook_tag parameter; see section 3.4.9).

-  *  htmLawed does not check the number of nested elements. E.g., it will allow two caption elements in a table element, illegal as per the specs. Admins may be able to use a custom hook function to enforce such checks (hook_tag parameter; see section 3.4.9).
+  *  htmLawed does not check the number of nested elements. E.g., it will allow two caption elements in a table element, illegal as per the specifications. Admins may be able to use a custom hook function to enforce such checks (hook_tag parameter; see section 3.4.9).

  *  htmLawed might convert certain entities to actual characters and remove backslashes and CSS comment-markers (/*) in style attribute values in order to detect malicious HTML like crafted IE-specific dynamic expressions like &#101;xpression.... If this is too harsh, admins can allow CSS expressions through htmLawed core but then use a custom function through the hook_tag parameter (section 3.4.9) to more specifically identify CSS expressions in the style attribute values. Also, using $config["style_pass"], it is possible to have htmLawed pass style attribute values without even looking at them (section 3.4.8).

@@ -745,7 +769,9 @@ A PHP Labware internal utility - section 3.1).

-  *  htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML meta tags, this can permit an exploit (like Google's UTF-7/XSS vulnerability of the past).
+  *  htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML meta tags, this can permit an exploit (like Google's UTF-7/XSS vulnerability of the past). Also, htmLawed can mangle input text if it is not well-formed in terms of character encoding. Administrators can consider using code available elsewhere to check well-formedness of input text characters to correct any defect.
+
+  *  htmLawed is expected to work with input texts in ASCII-compatible single byte encodings such as national variants of ASCII (like ISO-646-DE/German of the ISO 646 standard), extended ASCII variants (like ISO 8859-10/Turkish of the ISO 8859/ISO Latin standard), ISO 8859-based Windows variants (like Windows 1252), EBCDIC, Shift JIS (Japanese), GB-Roman (Chinese), and KS-Roman (Korean). It should also properly handle texts with variable byte encodings like UTF-7 (Unicode) and UTF-8 (Unicode). However, htmLawed may mangle input texts with double byte encodings like UTF-16 (Unicode), JIS X 0208:1997 (Japanese) and KS X 1001:1992 (Korean), or the UTF-32 (Unicode) quadruple byte encoding. If an input text has such an encoding, administrators can use PHP's iconv functions, or some other means, to convert text to UTF-8 before passing it to htmLawed.

  *  Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts.
@@ -832,12 +858,21 @@ A PHP Labware internal utility -     $spec = 'a=title';
+    $out = htmLawed($in, $config, $spec); +
+
+  Allowing a custom attribute, vFlag, in img and permitting custom use of the standard attribute, rel, in input --
+
+ +    $spec = 'img=vFlag; input=rel'; +
+     $out = htmLawed($in, $config, $spec);

  Some case-studies are presented below.

1. A blog administrator wants to allow only a, em, strike, strong and u in comments, but needs strike and u transformed to span for better XHTML 1-strict compliance, and, he wants the a links to be to http or https resources:
1. A blog administrator wants to allow only a, em, strike, strong and u in comments, but needs strike and u transformed to span for better XHTML 1-strict compliance, and, he wants the a links to point only to http or https resources:

    $processed = htmLawed($in, array('elements'=>'a, em, strike, strong, u', 'make_tag_strict'=>1, 'safe'=>1, 'schemes'=>'*:http, https'), 'a=href'); @@ -1689,7 +1724,7 @@ A PHP Labware internal utility -
3.9  Retaining non-HTML tags in input with mixed markup (to top)

-  htmLawed does not remove certain characters that though invalid are nevertheless discouraged in HTML documents as per the specs (see section 5.1). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the <, > and & characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code).
+  htmLawed does not remove certain characters that, though valid, are nevertheless discouraged in HTML documents as per the specifications (see section 5.1). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the <, > and & characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code).

  To deal with such mixed markup, the input text can be pre-processed to hide the non-HTML markup by specifically replacing the <, > and & characters with some of the HTML-discouraged characters (see section 3.1.2). Post-htmLawed processing, the replacements are reverted.

@@ -1718,7 +1753,7 @@ A PHP Labware internal utility - 4.1  Support (to top)

-  A careful re-reading of this documentation will very likely answer your questions.
+  A careful reading of this documentation may provide an answer.

  Software updates and forum-based community-support may be found at http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. For general PHP issues (not htmLawed-specific), support may be found through internet searches and at http://php.net.
@@ -1728,18 +1763,18 @@ A PHP Labware internal utility - (to top)

  See section 2.8.
-
-  Readers are advised to cross-check information given in this document.

4.3  Change-log

(to top)

-  (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the htmLawed.php file may be updated independently if the secondary files are revised.)
+  (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the htmLawed.php file, may be updated without a change-log entry if the secondary files, but not htmLawed per se, are revised.)

  Version number - Release date. Notes

+  1.1.14 - 8 August 2012. Fix for possible segmental loss of incremental indentation during tidying when balance is disabled; fix for non-effectuation under some circumstances of a corrective behavior to preserve plain text within elements like blockquote.
+
  1.1.13 - 22 July 2012. Added feature allowing use of custom, non-standard attributes or custom rules for standard attributes

  1.1.12 - 5 July 2012. Fix for a bug in identifying an unquoted value of the face attribute
@@ -1772,11 +1807,11 @@ A PHP Labware internal utility - $config["hook_tag"] and $config["format"] introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug
+  1.1 - 29 June 2008. $config["hook_tag"] and $config["tidy"] introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug

-  1.0.9 - 11 June 2008. Fixed bug in invalid HTML code-point entity check
+  1.0.9 - 11 June 2008. Fix for a bug in checks for invalid HTML code-point entities

-  1.0.8 - 15 May 2008. bordercolor attribute for table, td and tr
+  1.0.8 - 15 May 2008. Permit bordercolor attribute for table, td and tr

  1.0.7 - 1 May 2008. Support for wmode attribute for embed; $config["show_setting"] introduced; improved $config["elements"] evaluation

@@ -1786,7 +1821,7 @@ A PHP Labware internal utility -
blockquote, form, map and noscript

-  1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); a bug allowing td directly inside table fixed; safe $config parameter added
+  1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); fix for a bug allowing td directly inside table; $config["safe"] introduced

  1.0.2 - 13 February 2008. Improved implementation of $config["keep_bad"]

@@ -1819,7 +1854,7 @@ A PHP Labware internal utility -
4.6  Comparison with HTMLPurifier (to top)

-  The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it (as of mid-2009):
+  The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it (as of year 2010):

  *  does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2)

@@ -1885,7 +1920,7 @@ A PHP Labware internal utility - 5.2  Valid attribute-element combinations (to top)

-  Valid attribute-element combinations as per W3C specs.
+  Valid attribute-element combinations as per W3C specs.

  *  includes deprecated attributes (marked ^), attributes for the non-standard embed element (marked *), and the proprietary bordercolor (marked ~)
  *  only non-frameset, HTML body elements
@@ -2095,11 +2130,11 @@ A PHP Labware internal utility - $in - 1st argument; a text string; the input text to be processed. Any extraneous slashes added by PHP when magic quotes are enabled should be removed beforehand using PHP's stripslashes() function.
+  *  $in - first argument; a text string; the input text to be processed. Any extraneous slashes added by PHP when magic quotes are enabled should be removed beforehand using PHP's stripslashes() function.

-  *  $config - 2nd argument; an associative array; optional (named $C in htmLawed code). The array has keys with names like balance and keep_bad, and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the configurable parameters (indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through $config. Finalized $config is thus a filtered and possibly larger array.
+  *  $config - second argument; an associative array; optional; named $C within htmLawed code. The array has keys with names like balance and keep_bad, and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the configurable parameters (indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through $config. Finalized $config is thus a filtered and possibly larger array.

-  *  $spec - 3rd argument; a text string; optional. The string has rules, written in an htmLawed-designated format, specifying element-specific attribute and attribute value restrictions. Function hl_spec() is used to convert the string to an associative-array for internal use. Finalized $spec is thus an array.
+  *  $spec - third argument; a text string; optional. The string has rules, written in an htmLawed-designated format, specifying element-specific attribute and attribute value restrictions. Function hl_spec() is used to convert the string to an associative-array, named $S within htmLawed code, for internal use. Finalized $spec is thus an array.

  Finalized $config and $spec are made global variables while htmLawed is at work. Values of any pre-existing global variables with same names are noted, and their values are restored after htmLawed finishes processing the input (to capture the finalized values, the show_setting parameter of $config should be used). Depending on $config, another global variable hl_Ids, to track id attribute values for uniqueness, may be set. Unlike the other two variables, this one is not reset (or unset) post-processing.

@@ -2126,14 +2161,14 @@ A PHP Labware internal utility -
htmLawed() identifies tags using regex and processes them with the help of hl_tag() --  a large function that analyzes tag content, filtering it as per HTML standards, $config and $spec. Among other things, hl_tag() transforms deprecated elements using hl_tag2(), removes attributes from closing tags, checks attribute values as per $spec rules using hl_attrval(), and checks URL protocols using hl_prot(). htmLawed() performs tag balancing and nesting checks with a call to hl_bal(), and optionally compacts/beautifies the output with proper white-spacing with a call to hl_tidy(). The latter temporarily replaces white-space, and <, > and & characters inside pre, script and textarea elements, and HTML comments and CDATA sections with control characters (code-points 1 to 5, and 7).

-  htmLawed permits the use of custom code or hook functions at two stages. The first, called inside htmLawed(), allows the input text as well as the finalized $config and $spec values to be altered right after the initial processing (see
section 3.7). The second is called by hl_tag() once the tag content is finalized (see section 3.4.9).
+  htmLawed permits the use of custom code or hook functions at two stages. The first, called inside htmLawed(), allows the input text as well as the finalized $config and $spec values to be altered right after the initial processing (see section 3.7). The second is called by hl_tag() once the tag content is finalized (see section 3.4.9).

-  Being dictated by the external and stable HTML standard, htmLawed's objective is very clear-cut and less concerned with tweakability. The code is only minimally annotated with comments -- it is not meant to instruct; PHP developers familiar with the HTML specs will see the logic, and others can always refer to the htmLawed documentation. The compact structuring of the statements is meant to aid in quickly grasping the logic, at least when viewed with code syntax highlighted. +  The functionality of htmLawed is dictated by the external HTML standard. It is thus coded for a clear-cut objective with not much concern for tweakability. The code is only minimally annotated with comments -- it is not meant to instruct; PHP developers familiar with the HTML specifications will see the logic, and others can always refer to the htmLawed documentation. The compact structuring of the statements is meant to aid a quick grasp of the logic.

-


HTM version of htmLawed_README.txt generated on 22 Jul, 2012 using rTxt2htm from PHP Labware +


HTM version of htmLawed_README.txt generated on 18 Sep, 2012 using rTxt2htm from PHP Labware
- + \ No newline at end of file diff --git a/phpgwapi/inc/htmLawed/htmLawed_README.txt b/phpgwapi/inc/htmLawed/htmLawed_README.txt index 33b1d7fa42..ef0c5a3fde 100644 --- a/phpgwapi/inc/htmLawed/htmLawed_README.txt +++ b/phpgwapi/inc/htmLawed/htmLawed_README.txt @@ -1,6 +1,6 @@ /* -htmLawed_README.txt, 22 July 2012 -htmLawed 1.1.13, 22 July 2012 +htmLawed_README.txt, 17 September 2012 +htmLawed 1.1.14, 8 August 2012 Copyright Santosh Patnaik Dual licensed with LGPL 3 and GPL 2+ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed @@ -73,9 +73,9 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern == 1 About htmLawed ================================================ - htmLawed is a highly customizable single-file PHP script to make text secure, and standard- and admin policy-compliant for use in the body of HTML 4, XHTML 1 or 1.1, or generic XML documents. It is thus a configurable input (X)HTML filter, processor, purifier, sanitizer, beautifier, etc., and an alternative to the HTMLTidy:- http://tidy.sourceforge.net application. - - The `lawing in` of input text is needed to ensure that HTML code in the text is standard-compliant, does not introduce security vulnerabilities, and does not break the aesthetics, design or layout of web-pages. htmLawed tries to do this by, for example, making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting ('XSS') attacks, and allowing only specified HTML elements/tags and attributes. + htmLawed is a PHP script to process text with HTML markup to make it more compliant with HTML standards and administrative policies. It works by making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting (XSS) attacks, allowing only specified HTML tags and attributes, and so on. 
Such `lawing in` of HTML in text used in (X)HTML or XML documents ensures that it is in accordance with the aesthetics, safety and usability requirements set by administrators. + + htmLawed is highly customizable, and fast with low memory usage. Its free and open-source code is in one small file, does not require extensions or libraries, and works in older versions of PHP as well. It is a good alternative to the HTML Tidy:- http://tidy.sourceforge.net application. -- 1.1 Example uses ------------------------------------------------ @@ -102,8 +102,8 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * *beautify* or *compact* HTML ^~` - * *restrict elements* ^~` - * proper closure of empty elements like 'img' ^` + * can *restrict elements* ^~` + * ensures proper closure of empty elements like 'img' ^` * *transform deprecated elements* like 'u' ^~` * HTML *comments* and 'CDATA' sections can be permitted ^~` * elements like 'script', 'object' and 'form' can be permitted ~ @@ -112,7 +112,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * remove *invalid attributes* ^` * element and attribute names are *lower-cased* ^ * provide *required attributes*, like 'alt' for 'image' ^` - * *transform deprecated attributes* ^~` + * *transforms deprecated attributes* ^~` * attributes *declared only once* ^` * *restrict attribute values*, including *element-specifically* ^~` @@ -164,33 +164,36 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern -- 1.3 History ----------------------------------------------------o - htmLawed was developed for use with 'LabWiki', a wiki software developed at PHP Labware, as a suitable software could not be found. Existing PHP software like 'Kses' and 'HTMLPurifier' were deemed inadequate, slow, resource-intensive, or dependent on external applications like 'HTML Tidy'. 
- - htmLawed started as a modification of Ulf Harnhammar's 'Kses' (version 0.2.2) software, and is compatible with code that uses 'Kses'; see section:- #2.6. + htmLawed was created in 2007 for use with 'LabWiki', a wiki software developed at PHP Labware, as a suitable software could not be found. Existing PHP software like 'Kses' and 'HTMLPurifier' were deemed inadequate, slow, resource-intensive, or dependent on an extension or external application like 'HTML Tidy'. The core logic of htmLawed, that of identifying HTML elements and attributes, was based on the 'Kses' (version 0.2.2) HTML filter software of Ulf Harnhammar (it can still be used with code that uses 'Kses'; see section:- #2.6.). + + See section:- #4.3 for a detailed log of changes in htmLawed over the years, and section:- #4.10 for acknowledgements. -- 1.4 License & copyright ----------------------------------------o - htmLawed is free and open-source software dual licensed under LGPL license version 3:- http://www.gnu.org/licenses/lgpl-3.0.txt, and GPL license version 2:- http://www.gnu.org/licenses/gpl-2.0.txt (or later), and copyrighted by Santosh Patnaik, MD, PhD. + htmLawed is free and open-source software copyrighted by Santosh Patnaik, MD, PhD, and dual licensed under LGPL license version 3:- http://www.gnu.org/licenses/lgpl-3.0.txt, and GPL license version 2:- http://www.gnu.org/licenses/gpl-2.0.txt (or later). -- 1.5 Terms used here --------------------------------------------o - * `administrator` - or admin; person setting up the code to pass input through htmLawed; also, `user` + In this document, only HTML body-level elements are considered. htmLawed does not have support for head-level elements, 'body', and the frame-level elements, 'frameset', 'frame' and 'noframes', and these elements are ignored here. 
+ + * `administrator` - or admin; person setting up the code that utilizes htmLawed; also, `user` * `attributes` - name-value pairs like 'href="http://x.com"' in opening tags - * `author` - `writer` + * `author` - see `writer` * `character` - atomic unit of text; internally represented by a numeric `code-point` as specified by the `encoding` or `charset` in use * `entity` - markup like '>' and ' ' used to refer to a character * `element` - HTML element like 'a' and 'img' - * `element content` - content between the opening and closing tags of an element, like 'click' of 'click' + * `element content` - content between the opening and closing tags of an element, like 'click' of the 'click' element * `HTML` - implies XHTML unless specified otherwise - * `input` - text string given to htmLawed to process + * `HTML body` - Complete HTML documents typically have a `head` and a `body` container. Information in `head` specifies title of the document, etc., whereas that in the body informs what is to be displayed on a web-page; it is only the elements for `body`, except 'frames', 'frameset' and 'noframes' that htmLawed is concerned with + * `input` - text given to htmLawed to process * `processing` - involves filtering, correction, etc., of input - * `safe` - absence or reduction of certain characters and HTML elements and attributes in the input that can otherwise potentially and circumstantially expose web-site users to security vulnerabilities like cross-site scripting attacks (XSS) - * `scheme` - URL protocol like 'http' and 'ftp' - * `specs` - standard specifications + * `safe` - absence or reduction of certain characters and HTML elements and attributes in HTML of text that can otherwise potentially, and circumstantially, expose text readers to security vulnerabilities like cross-site scripting attacks (XSS) + * `scheme` - a URL protocol like 'http' and 'ftp' + * `specifications` - standard specifications, for HTML4, HTML5, Ruby, etc. 
* `style property` - terms like 'border' and 'height' for which declarations are made in values for the 'style' attribute of elements * `tag` - markers like '' and '' delineating element content; the opening tag can contain attributes * `tag content` - consists of tag markers '<' and '>', element names like 'div', and possibly attributes @@ -198,14 +201,22 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * `writer` - end-user like a blog commenter providing the input that is to be processed; also, `author` +-- 1.6 Availability ------------------------------------------------o + + + htmLawed can be downloaded for free at its website:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. Besides the 'htmLawed.php' file, the download has the htmLawed documentation (this document) in plain text:- htmLawed_README.txt and HTML:- htmLawed_README.htm formats, a script for testing:- htmLawedTest.php, and a text file for test-cases:- htmLawed_TESTCASE.txt. htmLawed is also available as a PHP class (OOP code) on its website. + + == 2 Usage ========================================================oo - htmLawed should work with PHP 4.4 and higher. Either 'include()' the 'htmLawed.php' file or copy-paste the entire code. + htmLawed works in PHP version 4.4 or higher. Either 'include()' the 'htmLawed.php' file, or copy-paste the entire code. To use with PHP 4.3, have the following code included: - To easily *test* htmLawed using a form-based interface, use the provided demo:- htmLawedTest.php ('htmLawed.php' and 'htmLawedTest.php' should be in the same directory on the web-server). - - *Note*: For code for usage of the htmLawed class (for htmLawed in OOP), please refer to this page:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/oop.htm on the htmLawed website; the filtering itself can be configured, etc., as described here. 
+ if(!function_exists('ctype_digit')){ + function ctype_digit($var){ + return ((int) $var == $var); + } + } -- 2.1 Simple ------------------------------------------------------ @@ -214,16 +225,18 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern The input text to be processed, '$text', is passed as an argument of type string; 'htmLawed()' returns the processed string: $processed = htmLawed($text); + + With the 'htmLawed class' (section:- #1.6), usage is: + + $processed = htmLawed::hl($text); - *Note*: If input is from a '$_GET' or '$_POST' value, and 'magic quotes' are enabled on the PHP setup, run 'stripslashes()' on the input before passing to htmLawed. + *Notes*: (1) If input is from a '$_GET' or '$_POST' value, and 'magic quotes' are enabled on the PHP setup, run 'stripslashes()' on the input before passing to htmLawed. (2) htmLawed does not have support for head-level elements, 'body', and the frame-level elements, 'frameset', 'frame' and 'noframes'. By default, htmLawed will process the text allowing all valid HTML elements/tags, secure URL scheme/CSS style properties, etc. It will allow 'CDATA' sections and HTML comments, balance tags, and ensure proper nesting of elements. Such actions can be configured using two other optional arguments -- '$config' and '$spec': - $processed = htmLawed($text, $config, $spec); + $processed = htmLawed($text, $config, $spec); - These extra parameters are detailed below. Some examples are shown in section:- #2.9. - - *Note*: For maximum protection against 'XSS' and other scripting attacks (e.g., by disallowing Javascript code), consider using the 'safe' parameter; see section:- #3.6. + The '$config' and '$spec' arguments are detailed below. Some examples are shown in section:- #2.9. For maximum protection against 'XSS' and other scripting attacks (e.g., by disallowing Javascript code), consider using the 'safe' parameter; see section:- #3.6. 
-- 2.2 Configuring htmLawed using the '$config' parameter ---------o @@ -256,13 +269,13 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern Anti-link-spam measure; see section:- #3.4.7 '0' - no measure taken * - 'array("regex1", "regex2")' - will ensure a 'rel' attribute with 'nofollow' in its value in case the 'href' attribute value matches the regular expression pattern 'regex1', and/or will remove 'href' if its value matches the regular expression pattern 'regex2'. E.g., 'array("/./", "/://\W*(?!(abc\.com|xyz\.org))/")'; see section:- #3.4.7 for more. + `array("regex1", "regex2")` - will ensure a 'rel' attribute with 'nofollow' in its value in case the 'href' attribute value matches the regular expression pattern 'regex1', and/or will remove 'href' if its value matches the regular expression pattern 'regex2'. E.g., 'array("/./", "/://\W*(?!(abc\.com|xyz\.org))/")'; see section:- #3.4.7 for more. *anti_mail_spam* Anti-mail-spam measure; see section:- #3.4.7 '0' - no measure taken * - 'word' - '@' in mail address in 'href' attribute value is replaced with specified 'word' + `word` - '@' in mail address in 'href' attribute value is replaced with specified `word` *balance* Balance tags for well-formedness and proper nesting; see section:- #3.3.3 @@ -306,7 +319,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern Denied HTML attributes; see section:- #3.4 '0' - none * - 'string' - dictated by values in 'string' + `string` - dictated by values in `string` 'on*' (like 'onfocus') attributes not allowed - " *direct_nest_list* @@ -332,13 +345,13 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern Name of an optional hook function to alter the input string, '$config' or '$spec' before htmLawed starts its main work; see section:- #3.7 '0' - no hook function * - 'name' - 'name' is name of the hook function ('kses_hook' ^) + `name` - `name` is name of the hook function 
('kses_hook' ^) *hook_tag* Name of an optional hook function to alter tag content finalized by htmLawed; see section:- #3.4.9 '0' - no hook function * - 'name' - 'name' is name of the hook function + `name` - `name` is name of the hook function *keep_bad* Neutralize bad tags by converting '<' and '>' to entities, or remove them; see section:- #3.3.3 @@ -412,7 +425,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern '0' - no ^ '1' - remove duplicate and/or invalid ones * - 'word' - remove invalid ones and replace duplicate ones with new and unique ones based on the 'word'; the admin-specified 'word', like 'my_', should begin with a letter (a-z) and can contain letters, digits, '.', '_', '-', and ':'. + `word` - remove invalid ones and replace duplicate ones with new and unique ones based on the `word`; the admin-specified `word`, like 'my_', should begin with a letter (a-z) and can contain letters, digits, '.', '_', '-', and ':'. *valid_xhtml* Magic parameter to make input the most valid XHTML without needing to specify other relevant '$config' parameters; see section:- #3.5 @@ -431,7 +444,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern -- 2.3 Extra HTML specifications using the $spec parameter --------o - The '$spec' argument can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policy compliance. '$spec' is specified as a string of text containing one or more `rules`, with multiple rules separated from each other by a semi-colon (';'). E.g., + The '$spec' argument of htmLawed can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. 
This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policies. '$spec' is specified as a string of text containing one or more `rules`, with multiple rules separated from each other by a semi-colon (';'). E.g., $spec = 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt'; $processed = htmLawed($text, $config, $spec); @@ -454,7 +467,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * 'a=-*, href, title' - none except 'href' and 'title' * 'a=-*, -id, href, title' - none except 'href' and 'title' - Rules regarding *attribute values* are optionally specified inside round brackets after attribute names in slash ('/')-separated `parameter = value` pairs. E.g., 'title(maxlen=30/minlen=5)'. None, or one or more of the following parameters may be specified: + Rules regarding *attribute values* are optionally specified inside round brackets after attribute names in slash ('/')-separated `parameter = value` pairs. E.g., 'title(maxlen=30/minlen=5)'. None or one or more of the following parameters may be specified: * 'oneof' - one or more choices separated by '|' that the value should match; if only one choice is provided, then the value must match that choice @@ -486,19 +499,19 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern *Special characters*: The characters ';', ',', '/', '(', ')', '|', '~' and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be `escaped` by enclosing in pairs of double-quotes ('"'). A back-tick ('`') can be used to escape a literal '"'. An example rule illustrating this is 'input=value(maxlen=30/match="/^\w/"/default="your `"ID`"")'. 
- *Note*: To deny an attribute for all elements for which it is legal, '$config["deny_attribute"]' (see section:- #3.4) can be used instead of '$spec'. Also, attributes can be allowed element-specifically through '$spec' while being denied globally through '$config["deny_attribute"]'. The 'hook_tag' parameter (section:- #3.4.9) can also be used to implement the '$spec' functionality. + *Note*: To deny an attribute for all elements for which it is legal, '$config["deny_attribute"]' (see section:- #3.4) can be used instead of '$spec'. Also, attributes can be allowed element-specifically through '$spec' while being denied globally through '$config["deny_attribute"]'. The 'hook_tag' parameter (section:- #3.4.9) can possibly also be used to implement functionality like that achieved using '$spec'. '$spec' can also be used to permit custom, non-standard attributes as well as custom rules for standard attributes. Thus, the following value of '$spec' will permit the custom uses of the standard 'rel' attribute in 'input' (not permitted as per standards) and of a non-standard attribute, 'vFlag', in 'img'. $spec = 'img=vFlag; input=rel' - The attribute names can contain alphabets, colons (:) and hyphens (-) but must start with an alphabet. + The attribute names can contain letters, colons (:) and hyphens (-), but they must start with a letter. -- 2.4 Performance time & memory usage ----------------------------o - The time and memory used by htmLawed depends on its configuration and the size of the input, and the amount, nestedness and well-formedness of the HTML markup within it. In particular, tag balancing and beautification each can increase the processing time by about a quarter. + The time and memory consumed during text processing by htmLawed depend on its configuration, the size of the input, and the amount, nestedness and well-formedness of the HTML markup within the input.
In particular, tag balancing and beautification each can increase the processing time by about a quarter. The htmLawed demo:- htmLawedTest.php can be used to evaluate the performance and effects of different types of input and '$config'. @@ -506,15 +519,13 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern -- 2.5 Some security risks to keep in mind ------------------------o - When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially `dangerous` HTML code which is meant to steal user-data, deface a website, render a page non-functional, etc. - - Unless end-users, either people or software, supplying the content are completely trusted, security issues arising from the degree of HTML usage permission has to be kept in mind. For example, following increase security risks: + When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially `dangerous` HTML code which is meant to steal user-data, deface a website, render a page non-functional, etc. Unless end-users, either people or software, supplying the content are completely trusted, security issues arising from the degree of HTML usage permitted through htmLawed's setting should be considered. 
For example, the following increase security risks: * Allowing 'script', 'applet', 'embed', 'iframe' or 'object' elements, or certain of their attributes like 'allowscriptaccess' * Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., '<!--[if gte IE 4]><script>alert("xss");</script><![endif]-->') - * Allowing dynamic CSS expressions (a feature of the IE browser) + * Allowing dynamic CSS expressions (some Internet Explorer versions are vulnerable) * Allowing the 'style' attribute @@ -522,7 +533,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern Permitting the '*style*' attribute brings in risks of `click-jacking`, `phishing`, web-page overlays, etc., `even` when the 'safe' parameter is enabled (see section:- #3.6). Except for URLs and a few other things like CSS dynamic expressions, htmLawed currently does not check every CSS style property. It does provide ways for the code-developer implementing htmLawed to do such checks through htmLawed's '$spec' argument, and through the 'hook_tag' parameter (see section:- #3.4.8 for more). Disallowing 'style' completely and relying on CSS classes and stylesheet files is recommended. - htmLawed does not check or correct the character *encoding* of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML 'meta' tags, this can permit an exploit (like Google's UTF-7/XSS vulnerability of the past). + htmLawed does not check or correct the character *encoding* of the input it receives. In conjunction with permissive circumstances, such as when the character encoding is left undefined through HTTP headers or HTML 'meta' tags, this can allow for an exploit (like Google's `UTF-7/XSS` vulnerability of the past).
-- 2.6 Use without modifying old 'kses()' code --------------------o @@ -559,7 +570,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern -- 2.7 Tolerance for ill-written HTML -----------------------------o - htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be `read` as HTML, and be considered mere plain text instead. Following statements indicate the degree of `looseness` that htmLawed can work with, and can be provided in instructions to writers: + htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be `read` as HTML, and may therefore get identified as mere plain text. The following statements indicate the degree of `looseness` that htmLawed can work with, and can be provided in instructions to writers: * Tags must be flanked by '<' and '>' with no '>' inside -- any needed '>' should be put in as '&gt;'. It is possible for tag content (element name and attributes) to be spread over many lines instead of being on one. A space may be present between the tag content and '>', like '
' and '', but not after the '<'. @@ -567,13 +578,13 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * Attribute string of elements may be liberally spaced with tabs, line-breaks, etc. - * Attribute values may not be double-quoted, or may be single-quoted. + * Attribute values may be single- and not double-quoted. * Left-padding of numeric entities (like, ' ', '&#x07ff;') with '0' is okay as long as the number of characters between the '&' and the ';' does not exceed 8. All entities must end with ';' though. - * Named character entities must be properly cased. E.g., '&Lt;' or '&TILDE;' will not be let through without modification. + * Named character entities must be properly cased. Thus, '&Lt;' or '&TILDE;' will not be recognized as entities and will be `neutralized`. - * HTML comments should not be inside element tags (okay between tags), and should begin with '<!--' and end with '-->'. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any '-->' inside should be put in as '--&gt;'. Any '--' inside will be automatically converted to '-', and a space will be added before the comment delimiter '-->'. + * HTML comments should not be inside element tags (they can be between tags), and should begin with '<!--' and end with '-->'. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any '-->' inside should be put in as '--&gt;'. Any '--' inside will be automatically converted to '-', and a space will be added before the comment delimiter '-->'. * 'CDATA' sections should not be inside element tags, and can be in element content only if plain text is allowed for that element. They should begin with '<![CDATA[' and end with ']]>'. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any ']]>' inside should be put in as ']]&gt;'.
@@ -588,21 +599,21 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * With '$config["unique_ids"]' not '0' and the 'id' attribute being permitted, writers should carefully avoid using duplicate or invalid 'id' values as even though htmLawed will correct/remove the values, the final output may not be the one desired. E.g., when '' is processed into ''. - * Note that even if intended HTML is lost in a highly ill-written input, the processed output will be more secure and standard-compliant. + * Even if intended HTML is lost from an ill-written input, the processed output will be more secure and standard-compliant. * For URLs, unless '$config["schemes"]' is appropriately set, writers should avoid using escape characters or entities in schemes. E.g., 'htt&#112;' (which many browsers will read as the harmless 'http') may be considered bad by htmLawed. - * htmLawed will attempt to put plain text present directly inside 'blockquote', 'form', 'map' and 'noscript' elements (illegal as per the specs) inside auto-generated 'div' elements. + * htmLawed will attempt to put plain text present directly inside 'blockquote', 'form', 'map' and 'noscript' elements (illegal as per the specifications) inside auto-generated 'div' elements. -- 2.8 Limitations & work-arounds ---------------------------------o - htmLawed's main objective is to make the input text `more` standard-compliant, secure for web-page readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with work-arounds. + htmLawed's main objective is to make the input text `more` standard-compliant, secure for readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with work-arounds. 
- It should be borne in mind that no browser application is 100% standard-compliant, and that some of the standard specs (like asking for normalization of white-spacing within 'textarea' elements) are clearly wrong. Regarding security, note that `unsafe` HTML code is not necessarily legally invalid. + It should be borne in mind that no browser application is 100% standard-compliant, and that some of the standard specifications (like asking for normalization of white-spacing within 'textarea' elements) are clearly wrong. Regarding security, note that `unsafe` HTML code is not legally invalid per se. - * htmLawed is meant for input that goes into the 'body' of HTML documents. HTML's head-level elements are not supported, nor are the frameset elements 'frameset', 'frame' and 'noframes'. + * htmLawed is meant for input that goes into the 'body' of HTML documents. HTML's head-level elements are not supported, nor are the frameset elements 'frameset', 'frame' and 'noframes'. Content of the latter elements can, however, be individually filtered through htmLawed. * It cannot transform the non-standard 'embed' elements to the standard-compliant 'object' elements. Yet, it can allow 'embed' elements if permitted ('embed' is widely used and supported). Admins can certainly use the 'hook_tag' parameter (section:- #3.4.9) to deploy a custom embed-to-object converter function. @@ -612,7 +623,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * By default, htmLawed won't check many attribute values for standard compliance. E.g., 'width="20m"' with the dimension in non-standard 'm' is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. Admins should look at the 'hook_tag' parameter (section:- #3.4.9) or '$spec' to enforce finer checks. - * The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specs. 
Only a few of the proprietary attributes are supported. + * The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specifications. Only a few of the proprietary attributes are supported. * Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. Admins should look at using the 'hook_tag' parameter (section:- #3.4.9) or '$spec' for finer checks. Perhaps the best option is to disallow 'style' but allow 'class' attributes with the right 'oneof' or 'match' values for 'class', and have the various class style properties in '.css' CSS stylesheet files. @@ -624,11 +635,11 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * Except for optionally converting absolute or relative URLs to the other type, htmLawed will not alter URLs (e.g., to change the value of query strings or to convert 'http' to 'https'). Having absolute URLs may be a standard-requirement, e.g., when HTML is embedded in email messages, whereas altering URLs for other purposes is beyond htmLawed's goals. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). - * Pairs of opening and closing tags that do not enclose any content (like '') are not removed. This may be against the standard specs for certain elements (e.g., 'table'). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code. + * Pairs of opening and closing tags that do not enclose any content (like '') are not removed. This may be against the standard specifications for certain elements (e.g., 'table'). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code. 
- * htmLawed does not check for certain element orderings described in the standard specs (e.g., in a 'table', 'tbody' is allowed before 'tfoot'). Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). + * htmLawed does not check for certain element orderings described in the standard specifications (e.g., in a 'table', 'tbody' is allowed before 'tfoot'). Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). - * htmLawed does not check the number of nested elements. E.g., it will allow two 'caption' elements in a 'table' element, illegal as per the specs. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). + * htmLawed does not check the number of nested elements. E.g., it will allow two 'caption' elements in a 'table' element, illegal as per the specifications. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). * htmLawed might convert certain entities to actual characters and remove backslashes and CSS comment-markers ('/*') in 'style' attribute values in order to detect malicious HTML like crafted IE-specific dynamic expressions like 'expression...'. If this is too harsh, admins can allow CSS expressions through htmLawed core but then use a custom function through the 'hook_tag' parameter (section:- #3.4.9) to more specifically identify CSS expressions in the 'style' attribute values. Also, using '$config["style_pass"]', it is possible to have htmLawed pass 'style' attribute values without even looking at them (section:- #3.4.8). @@ -636,10 +647,12 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern * Because of poor Unicode support in PHP, htmLawed does not remove the `high value` HTML-invalid characters with multi-byte code-points. 
Such characters however are extremely unlikely to be in the input (see section:- #3.1). - * htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML 'meta' tags, this can permit an exploit (like Google's UTF-7/XSS vulnerability of the past). + * htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML 'meta' tags, this can permit an exploit (like Google's `UTF-7/XSS` vulnerability of the past). Also, htmLawed can mangle input text if it is not well-formed in terms of character encoding. Administrators can consider using code available elsewhere to check well-formedness of input text characters to correct any defect. + + * htmLawed is expected to work with input texts in ASCII-compatible single byte encodings such as national variants of ASCII (like ISO-646-DE/German of the ISO 646 standard), extended ASCII variants (like ISO 8859-9/Turkish of the ISO 8859/ISO Latin standard), ISO 8859-based Windows variants (like Windows 1252), EBCDIC, Shift JIS (Japanese), GB-Roman (Chinese), and KS-Roman (Korean). It should also properly handle texts with variable byte encodings like UTF-7 (Unicode) and UTF-8 (Unicode). However, htmLawed may mangle input texts with double byte encodings like UTF-16 (Unicode), JIS X 0208:1997 (Japanese) and KS X 1001:1992 (Korean), or the UTF-32 (Unicode) quadruple byte encoding. If an input text has such an encoding, administrators can use PHP's iconv:- http://php.net/manual/en/book.iconv.php functions, or some other means, to convert text to UTF-8 before passing it to htmLawed. 
* Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts. - + -- 2.9 Examples of usage -------------------------------------------o @@ -689,9 +702,14 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern $spec = 'a=title'; $out = htmLawed($in, $config, $spec); + Allowing a custom attribute, 'vFlag', in 'img' and permitting custom use of the standard attribute, 'rel', in 'input' -- + + $spec = 'img=vFlag; input=rel'; + $out = htmLawed($in, $config, $spec); + Some case-studies are presented below. - *1.* A blog administrator wants to allow only 'a', 'em', 'strike', 'strong' and 'u' in comments, but needs 'strike' and 'u' transformed to 'span' for better XHTML 1-strict compliance, and, he wants the 'a' links to be to 'http' or 'https' resources: + *1.* A blog administrator wants to allow only 'a', 'em', 'strike', 'strong' and 'u' in comments, but needs 'strike' and 'u' transformed to 'span' for better XHTML 1-strict compliance, and, he wants the 'a' links to point only to 'http' or 'https' resources: $processed = htmLawed($in, array('elements'=>'a, em, strike, strong, u', 'make_tag_strict'=>1, 'safe'=>1, 'schemes'=>'*:http, https'), 'a=href'); @@ -1287,7 +1305,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern -- 3.9 Retaining non-HTML tags in input with mixed markup ---------o - htmLawed does not remove certain characters that though invalid are nevertheless discouraged in HTML documents as per the specs (see section:- #5.1). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the '<', '>' and '&' characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). 
Or it can be programming code meant for execution/evaluation (such as embedded PHP code). + htmLawed does not remove certain characters that, though not invalid, are nevertheless `discouraged` in HTML documents as per the specifications (see section:- #5.1). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the '<', '>' and '&' characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code). To deal with such mixed markup, the input text can be pre-processed to hide the non-HTML markup by specifically replacing the '<', '>' and '&' characters with some of the HTML-discouraged characters (see section:- #3.1.2). Post-htmLawed processing, the replacements are reverted. @@ -1308,7 +1326,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern -- 4.1 Support ----------------------------------------------------- - A careful re-reading of this documentation will very likely answer your questions. + A careful reading of this documentation may provide an answer. Software updates and forum-based community-support may be found at http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. For general PHP issues (not htmLawed-specific), support may be found through internet searches and at http://php.net. @@ -1318,16 +1336,16 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern See section:- #2.8. - Readers are advised to cross-check information given in this document. 
- -- 4.3 Change-log -------------------------------------------------o - (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the 'htmLawed.php' file may be updated independently if the secondary files are revised.) + (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the 'htmLawed.php' file, may be updated without a change-log entry if the secondary files, but not htmLawed per se, are revised.) `Version number - Release date. Notes` + 1.1.14 - 8 August 2012. Fix for possible segmental loss of incremental indentation during 'tidying' when 'balance' is disabled; fix for non-effectuation under some circumstances of a corrective behavior to preserve plain text within elements like 'blockquote'. + 1.1.13 - 22 July 2012. Added feature allowing use of custom, non-standard attributes or custom rules for standard attributes 1.1.12 - 5 July 2012. Fix for a bug in identifying an unquoted value of the 'face' attribute @@ -1360,11 +1378,11 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern 1.1.1 - 27 September 2008. Better nesting correction when omitable closing tags are absent - 1.1 - 29 June 2008. '$config["hook_tag"]' and '$config["format"]' introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug + 1.1 - 29 June 2008. '$config["hook_tag"]' and '$config["tidy"]' introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug - 1.0.9 - 11 June 2008. Fixed bug in invalid HTML code-point entity check + 1.0.9 - 11 June 2008. Fix for a bug in checks for invalid HTML code-point entities - 1.0.8 - 15 May 2008. 'bordercolor' attribute for 'table', 'td' and 'tr' + 1.0.8 - 15 May 2008. Permit 'bordercolor' attribute for 'table', 'td' and 'tr' 1.0.7 - 1 May 2008. 
Support for 'wmode' attribute for 'embed'; '$config["show_setting"]' introduced; improved '$config["elements"]' evaluation @@ -1374,7 +1392,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern 1.0.4 - 10 March 2008. Improved corrections for 'blockquote', 'form', 'map' and 'noscript' - 1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); a bug allowing 'td' directly inside 'table' fixed; 'safe' '$config' parameter added + 1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); fix for a bug allowing 'td' directly inside 'table'; '$config["safe"]' introduced 1.0.2 - 13 February 2008. Improved implementation of '$config["keep_bad"]' @@ -1404,7 +1422,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern -- 4.6 Comparison with 'HTMLPurifier' -----------------------------o - The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it (as of mid-2009): + The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it (as of year 2010): * does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2) @@ -1463,7 +1481,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern -- 5.2 Valid attribute-element combinations -----------------------o - Valid attribute-element combinations as per W3C specs. + Valid attribute-element combinations as per W3C:- http://www.w3c.org specs. 
* includes deprecated attributes (marked '^'), attributes for the non-standard 'embed' element (marked '*'), and the proprietary 'bordercolor' (marked '~') * only non-frameset, HTML body elements @@ -1667,11 +1685,11 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern *Function arguments* for htmLawed are: - * '$in' - 1st argument; a text string; the *input text* to be processed. Any extraneous slashes added by PHP when `magic quotes` are enabled should be removed beforehand using PHP's 'stripslashes()' function. + * '$in' - first argument; a text string; the *input text* to be processed. Any extraneous slashes added by PHP when `magic quotes` are enabled should be removed beforehand using PHP's 'stripslashes()' function. - * '$config' - 2nd argument; an associative array; optional (named '$C' in htmLawed code). The array has keys with names like 'balance' and 'keep_bad', and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the *configurable parameters* (indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through '$config'. `Finalized` '$config' is thus a filtered and possibly larger array. + * '$config' - second argument; an associative array; optional; named '$C' within htmLawed code. The array has keys with names like 'balance' and 'keep_bad', and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the *configurable parameters* (indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through '$config'. `Finalized` '$config' is thus a filtered and possibly larger array. - * '$spec' - 3rd argument; a text string; optional. The string has rules, written in an htmLawed-designated format, *specifying* element-specific attribute and attribute value restrictions. 
Function 'hl_spec()' is used to convert the string to an associative-array for internal use. `Finalized` '$spec' is thus an array. + * '$spec' - third argument; a text string; optional. The string has rules, written in an htmLawed-designated format, *specifying* element-specific attribute and attribute value restrictions. Function 'hl_spec()' is used to convert the string to an associative-array, named '$S' within htmLawed code, for internal use. `Finalized` '$spec' is thus an array. `Finalized` '$config' and '$spec' are made *global variables* while htmLawed is at work. Values of any pre-existing global variables with the same names are noted, and their values are restored after htmLawed finishes processing the input (to capture the `finalized` values, the 'show_setting' parameter of '$config' should be used). Depending on '$config', another global variable 'hl_Ids', to track 'id' attribute values for uniqueness, may be set. Unlike the other two variables, this one is not reset (or unset) post-processing. @@ -1698,9 +1716,9 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern After this `initial processing` 'htmLawed()' identifies tags using regex and processes them with the help of 'hl_tag()' -- a large function that analyzes tag content, filtering it as per HTML standards, '$config' and '$spec'. Among other things, 'hl_tag()' transforms deprecated elements using 'hl_tag2()', removes attributes from closing tags, checks attribute values as per '$spec' rules using 'hl_attrval()', and checks URL protocols using 'hl_prot()'. 'htmLawed()' performs tag balancing and nesting checks with a call to 'hl_bal()', and optionally compacts/beautifies the output with proper white-spacing with a call to 'hl_tidy()'. The latter temporarily replaces white-space, and '<', '>' and '&' characters inside 'pre', 'script' and 'textarea' elements, and HTML comments and CDATA sections with control characters (code-points '1' to '5', and '7'). 
- htmLawed permits the use of custom code or *hook functions* at two stages. The first, called inside 'htmLawed()', allows the input text as well as the finalized $config and $spec values to be altered right after the initial processing (see section:- #3.7). The second is called by 'hl_tag()' once the tag content is finalized (see section:- #3.4.9). + htmLawed permits the use of custom code or *hook functions* at two stages. The first, called inside 'htmLawed()', allows the input text as well as the finalized '$config' and '$spec' values to be altered right after the initial processing (see section:- #3.7). The second is called by 'hl_tag()' once the tag content is finalized (see section:- #3.4.9). - Being dictated by the external and stable HTML standard, htmLawed's objective is very clear-cut and less concerned with tweakability. The code is only minimally annotated with comments -- it is not meant to instruct; PHP developers familiar with the HTML specs will see the logic, and others can always refer to the htmLawed documentation. The compact structuring of the statements is meant to aid in quickly grasping the logic, at least when viewed with code syntax highlighted. + The functionality of htmLawed is dictated by the external HTML standard. It is thus coded for a clear-cut objective with not much concern for tweakability. The code is only minimally annotated with comments -- it is not meant to instruct; PHP developers familiar with the HTML specifications will see the logic, and others can always refer to the htmLawed documentation. The compact structuring of the statements is meant to aid a quick grasp of the logic. 
___________________________________________________________________oo @@ -1709,4 +1727,4 @@ ___________________________________________________________________oo @@encoding: utf-8 @@keywords: htmLawed, HTM, HTML, HTML Tidy, converter, filter, formatter, purifier, sanitizer, XSS, input, PHP, software, code, script, security, cross-site scripting, hack, sanitize, remove, standards, tags, attributes, elements @@language: en -@@title: htmLawed documentation +@@title: htmLawed documentation \ No newline at end of file diff --git a/phpgwapi/inc/htmLawed/htmLawed_TESTCASE.txt b/phpgwapi/inc/htmLawed/htmLawed_TESTCASE.txt index 6a266f0271..ea99e9b54f 100644 --- a/phpgwapi/inc/htmLawed/htmLawed_TESTCASE.txt +++ b/phpgwapi/inc/htmLawed/htmLawed_TESTCASE.txt @@ -1,6 +1,6 @@ /* -htmLawed_TESTCASE.txt, 22 July 2012 -htmLawed 1.1.13, 22 July 2012 +htmLawed_TESTCASE.txt, 14 August 2012 +htmLawed 1.1.14, 8 August 2012 Copyright Santosh Patnaik Dual licensed with LGPL 3 and GPL 2+ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed @@ -47,6 +47,11 @@ character encoding to Unicode/UTF-8
abc
def

abc
def
ghi

abc
def
ghi
+
QQQ
x

+
x
QQQ

+
x
QQQ
x

+
x
QQQ

x


+
(try with blockquote parent)
CDATA sections
@@ -129,8 +134,9 @@ Disallowed tag p Invalid: a
Empty: a, a, atext
Content invalid: 12
-Content invalid?:

(try setting 'form' as parent) -Casing: +Content invalid?:

(try setting 'form' as parent)
+Casing:
+Check for tidy:



hi
Entities
@@ -346,6 +352,11 @@ na Alemanha. r2c1r2c2
+
Tag transformation
+Font element intended as 'inline' element:

hi


+Font element intended as 'block' element:
hi

+Font element intended as 'block' element:
hi
QQQ

+
URLs
Relative and absolute: , , , , , ,
@@ -403,6 +414,10 @@ script:eval(document.all.mycode.expr)')">hi
> 3
<._.> hi!
<<< ALERT >>>
+ some stuff
+
+
+
if(13age){say 'teen'}
age >51 and a smoking history of >51 pack-years was
age > 51 and a smoking history of >51 pack-years was