* API/htmLawed: update to version 1.1.17

This commit is contained in:
Klaus Leithoff 2014-05-27 13:52:02 +00:00
parent a7ecc60a5e
commit f94e6bb501
5 changed files with 76 additions and 56 deletions

View File

@ -1,7 +1,7 @@
<?php
/*
htmLawed 1.1.14, 8 August 2012
htmLawed 1.1.17, 11 March 2014
Copyright Santosh Patnaik
Dual licensed with LGPL 3 and GPL 2+
A PHP Labware internal utility; www.bioinformatics.org/phplabware/internal_utilities/htmLawed
@ -80,9 +80,6 @@ $C['style_pass'] = empty($C['style_pass']) ? 0 : 1;
$C['tidy'] = empty($C['tidy']) ? 0 : $C['tidy'];
$C['unique_ids'] = isset($C['unique_ids']) ? $C['unique_ids'] : 1;
$C['xml:lang'] = isset($C['xml:lang']) ? $C['xml:lang'] : 0;
// own config options
// block elements allowed for nesting when only inline is allowed; Example span does not allow block elements as table; table is the only element tested so far
$C['allow_for_inline'] = isset($C['allow_for_inline'])?$C['allow_for_inline']:0;
if(isset($GLOBALS['C'])){$reC = $GLOBALS['C'];}
$GLOBALS['C'] = $C;
@ -160,7 +157,6 @@ $cT = array('colgroup'=>1, 'dd'=>1, 'dt'=>1, 'li'=>1, 'option'=>1, 'p'=>1, 'td'=
// block/inline type; ins & del both type; #pcdata: text
$eB = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'del'=>1, 'dir'=>1, 'dl'=>1, 'div'=>1, 'fieldset'=>1, 'form'=>1, 'ins'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'isindex'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'table'=>1, 'ul'=>1);
$eI = array('#pcdata'=>1, 'a'=>1, 'abbr'=>1, 'acronym'=>1, 'applet'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'br'=>1, 'button'=>1, 'cite'=>1, 'code'=>1, 'del'=>1, 'dfn'=>1, 'em'=>1, 'embed'=>1, 'font'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'kbd'=>1, 'label'=>1, 'map'=>1, 'object'=>1, 'q'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'select'=>1, 'script'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1, 'tt'=>1, 'u'=>1, 'var'=>1);
if($GLOBALS['C']['allow_for_inline'] && is_array($GLOBALS['C']['allow_for_inline'])) foreach($GLOBALS['C']['allow_for_inline'] as $khai => $vhai) {$eI[$vhai]=1;}//allow table as if it was an inline element as <span> some Text <table>...</table> more text</span> is quite common
$eN = array('a'=>1, 'big'=>1, 'button'=>1, 'fieldset'=>1, 'font'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'label'=>1, 'object'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1); // Exclude from specific ele; $cN values
$eO = array('area'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'dd'=>1, 'dt'=>1, 'legend'=>1, 'li'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'script'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'thead'=>1, 'th'=>1, 'tr'=>1); // Missing in $eB & $eI
$eF = $eB + $eI;
@ -231,9 +227,6 @@ for($i=-1, $ci=count($t); ++$i<$ci;){
if((($ql && isset($cB[$p])) or (isset($cB[$in]) && !$ql)) && !isset($eB[$e]) && !isset($ok[$e])){
array_splice($t, $i, 0, 'div>'); unset($e, $x); ++$ci; --$i; continue;
}
if($e == 'div' && !isset($ok['div']) && strpos($a, '-htmlawed-transform')){
$t[$i] = "span{$a}>{$x}"; unset($e, $x); --$i; continue;
}
// if no open ele, $in = parent; mostly immediate parent-child relation should hold
if(!$ql or !isset($eN[$e]) or !array_intersect($q, $cN2)){
if(!isset($ok[$e])){
@ -343,7 +336,7 @@ $c = isset($C['schemes'][$c]) ? $C['schemes'][$c] : $C['schemes']['*'];
static $d = 'denied:';
if(isset($c['!']) && substr($p, 0, 7) != $d){$p = "$d$p";}
if(isset($c['*']) or !strcspn($p, '#?;') or (substr($p, 0, 7) == $d)){return "{$b}{$p}{$a}";} // All ok, frag, query, param
if(preg_match('`^([a-z\d\-+.&#; ]+?)(:|&#(58|x3a);|%3a|\\\\0{0,4}3a).`i', $p, $m) && !isset($c[strtolower($m[1])])){ // Denied prot
if(preg_match('`^([^:?[@!$()*,=/\'\]]+?)(:|&#(58|x3a);|%3a|\\\\0{0,4}3a).`i', $p, $m) && !isset($c[strtolower($m[1])])){ // Denied prot
return "{$b}{$d}{$p}{$a}";
}
if($C['abs_url']){
@ -386,7 +379,7 @@ return $r;
function hl_spec($t){
// final $spec
$s = array();
$t = str_replace(array("\t", "\r", "\n", ' '), '', preg_replace('/"(?>(`.|[^"])*)"/sme', 'substr(str_replace(array(";", "|", "~", " ", ",", "/", "(", ")", \'`"\'), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", "\""), "$0"), 1, -1)', trim($t)));
$t = str_replace(array("\t", "\r", "\n", ' '), '', preg_replace_callback('/"(?>(`.|[^"])*)"/sm', create_function('$m', 'return substr(str_replace(array(";", "|", "~", " ", ",", "/", "(", ")", \'`"\'), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", "\""), $m[0]), 1, -1);'), trim($t)));
for($i = count(($t = explode(';', $t))); --$i>=0;){
$w = $t[$i];
if(empty($w) or ($e = strpos($w, '=')) === false or !strlen(($a = substr($w, $e+1)))){continue;}
@ -422,8 +415,7 @@ $t = $t[0];
if($t == '< '){return '&lt; ';}
if($t == '>'){return '&gt;';}
if(!preg_match('`^<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>$`m', $t, $m)){
//return str_replace(array('<', '>'), array('&lt;', '&gt;'), $t);
return (($C['keep_bad']%2) ? str_replace(array('<', '>'), array('&lt;', '&gt;'), $t) : '');
return str_replace(array('<', '>'), array('&lt;', '&gt;'), $t);
}elseif(!isset($C['elements'][($e = strtolower($m[2]))])){
return (($C['keep_bad']%2) ? str_replace(array('<', '>'), array('&lt;', '&gt;'), $t) : '');
}
@ -443,7 +435,6 @@ if(!empty($m[1])){
// open tag & attr
static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept-charset'=>array('form'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accesskey'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'legend'=>1, 'textarea'=>1), 'action'=>array('form'=>1), 'align'=>array('caption'=>1, 'embed'=>1, 'applet'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'legend'=>1, 'table'=>1, 'hr'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'p'=>1, 'col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'tr'=>1, 'td'=>1, 'th'=>1), 'border'=>array('table'=>1, 'img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('input'=>1), 'cite'=>array('blockquote'=>1, 'q'=>1, 'del'=>1, 'ins'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('object'=>1, 'applet'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'coords'=>array('area'=>1, 'a'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1), 'declare'=>array('object'=>1), 'defer'=>array('script'=>1), 'dir'=>array('bdo'=>1), 'disabled'=>array('button'=>1, 'input'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'flashvars'=>array('embed'=>1), 'for'=>array('label'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('embed'=>1, 'iframe'=>1, 'td'=>1, 'th'=>1, 'img'=>1, 'object'=>1, 'applet'=>1), 'href'=>array('a'=>1, 'area'=>1), 'hreflang'=>array('a'=>1), 'hspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'label'=>array('option'=>1, 'optgroup'=>1), 'language'=>array('script'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'maxlength'=>array('input'=>1), 'method'=>array('form'=>1), 'model'=>array('embed'=>1), 'multiple'=>array('select'=>1), 'name'=>array('button'=>1, 'embed'=>1, 'textarea'=>1, 'applet'=>1, 'select'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'a'=>1, 'input'=>1, 'object'=>1, 'map'=>1, 'param'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'onblur'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onchange'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'onfocus'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onreset'=>array('form'=>1), 'onselect'=>array('input'=>1, 'textarea'=>1), 'onsubmit'=>array('form'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'prompt'=>array('isindex'=>1), 'readonly'=>array('textarea'=>1, 'input'=>1), 'rel'=>array('a'=>1), 'rev'=>array('a'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scrolling'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('area'=>1, 'a'=>1), 'size'=>array('hr'=>1, 'font'=>1, 'input'=>1, 'select'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('embed'=>1, 'script'=>1, 'input'=>1, 'iframe'=>1, 'img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'summary'=>array('table'=>1), 'tabindex'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'object'=>1, 'select'=>1, 'textarea'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'embed'=>1, 'object'=>1, 'param'=>1, 'script'=>1, 'input'=>1, 'li'=>1, 'ol'=>1, 'ul'=>1, 'button'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('input'=>1, 'option'=>1, 'param'=>1, 'button'=>1, 'li'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'width'=>array('embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'object'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'applet'=>1, 'col'=>1, 'colgroup'=>1, 'pre'=>1), 'wmode'=>array('embed'=>1), 'xml:space'=>array('pre'=>1, 'script'=>1, 'style'=>1)); // Ele-specific
$aN['background']=(isset($aN['background'])?array_merge($aN['background'],array('td'=>1)):array('td'=>1));//allow attribute background for (additional) elements, its not allowed by HTML Standards exept for body, but used anyway; one should check for the url to be allowed by application means
static $aNE = array('checked'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'disabled'=>1, 'ismap'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'readonly'=>1, 'selected'=>1); // Empty
static $aNP = array('action'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 'href'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'usemap'=>1); // Need scheme check; excludes style, on* & src
static $aNU = array('class'=>array('param'=>1, 'script'=>1), 'dir'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'id'=>array('script'=>1), 'lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'xml:lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'onclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'ondblclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeydown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeypress'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeyup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousedown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousemove'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseout'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseover'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'style'=>array('param'=>1, 'script'=>1), 'title'=>array('param'=>1, 'script'=>1)); // Univ & exceptions
@ -632,7 +623,7 @@ if($e == 'dir' or $e == 'menu'){$e = 'ul'; return '';}
if($e == 's' or $e == 'strike'){$e = 'span'; return 'text-decoration: line-through;';}
if($e == 'u'){$e = 'span'; return 'text-decoration: underline;';}
static $fs = array('0'=>'xx-small', '1'=>'xx-small', '2'=>'small', '3'=>'medium', '4'=>'large', '5'=>'x-large', '6'=>'xx-large', '7'=>'300%', '-1'=>'smaller', '-2'=>'60%', '+1'=>'larger', '+2'=>'150%', '+3'=>'200%', '+4'=>'300%');
if($e == 'font' && $t !=3){//3 is a new make_tag_strict config value, to indicate that transformation is to be performed, but don't transform font, as size transformation of numeric sizes to keywords alters the intended result too much
if($e == 'font'){
$a2 = '';
if(preg_match('`face\s*=\s*(\'|")([^=]+?)\\1`i', $a, $m) or preg_match('`face\s*=(\s*)(\S+)`i', $a, $m)){
$a2 .= ' font-family: '. str_replace('"', '\'', trim($m[2])). ';';
@ -643,13 +634,7 @@ if($e == 'font' && $t !=3){//3 is a new make_tag_strict config value, to indicat
if(preg_match('`size\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m) && isset($fs[($m = trim($m[2]))])){
$a2 .= ' font-size: '. $fs[$m]. ';';
}
// $e = 'span'; return ltrim($a2);
// replace the above with following
if($GLOBALS['C']['balance']){
$e = 'div'; return 'display: inline; -htmlawed-transform: 1; '. ltrim($a2);
}else{
$e = 'span'; return ltrim($a2);
}
}
if($t == 2){$e = 0; return 0;}
return '';
@ -659,7 +644,7 @@ return '';
function hl_tidy($t, $w, $p){
// Tidy/compact HTM
if(strpos(' pre,script,textarea', "$p,")){return $t;}
$t = str_replace(' </', '</', preg_replace(array('`(<\w[^>]*(?<!/)>)\s+`', '`\s+`', '`(<\w[^>]*(?<!/)>) `'), array(' $1', ' ', '$1'), preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)(</\2>)`sm'), create_function('$m', 'return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", " "), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];'), $t)));
$t = preg_replace('`\s+`', ' ', preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)(</\2>)`sm'), create_function('$m', 'return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", " "), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];'), $t));
if(($w = strtolower($w)) == -1){
return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t);
}
@ -667,7 +652,7 @@ $s = strpos(" $w", 't') ? "\t" : ' ';
$s = preg_match('`\d`', $w, $m) ? str_repeat($s, $m[0]) : str_repeat($s, ($s == "\t" ? 1 : 2));
$N = preg_match('`[ts]([1-9])`', $w, $m) ? $m[1] : 0;
$a = array('br'=>1);
$b = array('button'=>1, 'input'=>1, 'option'=>1);
$b = array('button'=>1, 'input'=>1, 'option'=>1, 'param'=>1);
$c = array('caption'=>1, 'dd'=>1, 'dt'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'isindex'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'object'=>1, 'p'=>1, 'pre'=>1, 'td'=>1, 'textarea'=>1, 'th'=>1);
$d = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'form'=>1, 'hr'=>1, 'iframe'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1);
$T = explode('<', $t);
@ -689,20 +674,20 @@ while($X){
else{++$N; ob_end_clean(); continue 2;}
}
else{echo "\n", str_repeat($s, $n), "$e\n", str_repeat($s, ($x != 1 ? ++$n : $n));}
echo ltrim($r); continue;
echo $r; continue;
}
$f = "\n". str_repeat($s, $n);
if(isset($c[$y])){
if(!$x){echo $e, $f, ltrim($r);}
if(!$x){echo $e, $f, $r;}
else{echo $f, $e, $r;}
}elseif(isset($b[$y])){echo $f, $e, $r;
}elseif(isset($a[$y])){echo $e, $f, ltrim($r);
}elseif(!$y){echo $f, $e, $f, ltrim($r);
}elseif(isset($a[$y])){echo $e, $f, $r;
}elseif(!$y){echo $f, $e, $f, $r;
}else{echo $e, $r;}
}
$X = 0;
}
$t = preg_replace('`[\n]\s*?[\n]+`', "\n", ob_get_contents());
$t = str_replace(array("\n ", " \n"), "\n", preg_replace('`[\n]\s*?[\n]+`', "\n", ob_get_contents()));
ob_end_clean();
if(($l = strpos(" $w", 'r') ? (strpos(" $w", 'n') ? "\r\n" : "\r") : 0)){
$t = str_replace("\n", $l, $t);
@ -713,7 +698,7 @@ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array(
function hl_version(){
// rel
return '1.1.14';
return '1.1.17';
// eof
}

View File

@ -1,8 +1,8 @@
<?php
/*
htmLawedTest.php, 8 August 2012
htmLawed 1.1.14, 8 August 2012
htmLawedTest.php, 28 May 2013
htmLawed 1.1.17, 11 March 2014
Copyright Santosh Patnaik
Dual licensed with LGPL 3 and GPL 2+
A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed
@ -262,9 +262,6 @@ function sndUnproc(){
var i = document.getElementById('text');
if(!i){return;}
i = i.value;
i = i.replace(/>/g, '&gt;');
i = i.replace(/</g, '&lt;');
i = i.replace(/"/g, '&quot;');
var w = window.open('htmLawedTest.php?pre=1', 'hlprehtm');
var f = document.createElement('form');
f.enctype = 'application/x-www-form-urlencoded';
@ -272,10 +269,14 @@ function sndUnproc(){
f.acceptCharset = '<?php echo htmlspecialchars($_POST['enc']); ?>';
if(f.style){f.style.display = 'none';}
else{f.visibility = 'hidden';}
f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="token" id="token" value="<?php echo $token; ?>" /><input style="display:none;" type="hidden" name="<?php echo htmlspecialchars($_sid); ?>" id="<?php echo htmlspecialchars($_sid); ?>" value="' + readCookie('<?php echo htmlspecialchars($_sid); ?>') + '" /><input style="display:none;" type="hidden" name="inputH" id="inputH" value="'+ i+ '" /></p>';
f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="token" id="token" value="<?php echo $token; ?>" /><input style="display:none;" type="hidden" name="<?php echo htmlspecialchars($_sid); ?>" id="<?php echo htmlspecialchars($_sid); ?>" value="' + readCookie('<?php echo htmlspecialchars($_sid); ?>') + '" /></p>';
f.action = 'htmLawedTest.php?pre=1';
f.target = 'hlprehtm';
f.method = 'post';
var t = document.createElement('textarea');
t.name = 'inputH';
t.value = i;
f.appendChild(t);
var b = document.getElementsByTagName('body')[0];
b.appendChild(f);
f.submit();
@ -285,9 +286,6 @@ function sndValidn(id, type){
var i = document.getElementById(id);
if(!i){return;}
i = i.value;
i = i.replace(/>/g, '&gt;');
i = i.replace(/</g, '&lt;');
i = i.replace(/"/g, '&quot;');
var w = window.open('http://validator.w3.org/check', 'validate'+id+type);
var f = document.createElement('form');
f.enctype = 'application/x-www-form-urlencoded';
@ -295,9 +293,13 @@ function sndValidn(id, type){
f.acceptCharset = '<?php echo htmlspecialchars($_POST['enc']); ?>';
if(f.style){f.style.display = 'none';}
else{f.visibility = 'hidden';}
f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="fragment" id="fragment" value="'+ i+ '" /><input style="display:none;" type="hidden" name="prefill" id="prefill" value="1" /><input style="display:none;" type="hidden" name="prefill_doctype" id="prefill_doctype" value="'+ type+ '" /><input style="display:none;" type="hidden" name="group" id="group" value="1" /><input type="hidden" name="ss" id="ss" value="1" /></p>';
f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="prefill" id="prefill" value="1" /><input style="display:none;" type="hidden" name="prefill_doctype" id="prefill_doctype" value="'+ type+ '" /><input style="display:none;" type="hidden" name="group" id="group" value="1" /><input type="hidden" name="ss" id="ss" value="1" /></p>';
f.action = 'http://validator.w3.org/check';
f.target = 'validate'+id+type;
var t = document.createElement('textarea');
t.name = 'fragment';
t.value = i;
f.appendChild(t);
var b = document.getElementsByTagName('body')[0];
b.appendChild(f);
f.submit();

View File

@ -110,8 +110,8 @@ span.totop a, span.totop a:visited {color: #6699cc;}
<div id="body">
<br />
<div class="comment">htmLawed_README.txt, 17 September 2012<br />
htmLawed 1.1.14, 8 August 2012<br />
<div class="comment">htmLawed_README.txt, 11 March 2014<br />
htmLawed 1.1.17, 11 March 2014<br />
Copyright Santosh Patnaik<br />
Dual licensed with LGPL 3 and GPL 2+<br />
A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed</a>&#160;</div>
@ -121,7 +121,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s1" id="s1"></a><span class="item-no">1</span>&#160; About htmLawed
</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; htmLawed is a PHP script to process text with HTML markup to make it more comliant with HTML standards &#160;and administrative policies. It works by making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting (XSS) attacks, allowing only specified HTML tags and attributes, and so on. Such <em>lawing in</em>&#160;of HTML in text used in (X)HTML or XML documents ensures that it is in accordance with the aesthetics, safety and usability requirements set by administrators.<br />
&#160; htmLawed is a PHP script to process text with HTML markup to make it more compliant with HTML standards and administrative policies. It works by making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting (XSS) attacks, allowing only specified HTML tags and attributes, and so on. Such <em>lawing in</em>&#160;of HTML in text used in (X)HTML or XML documents ensures that it is in accordance with the aesthetics, safety and usability requirements set by administrators.<br />
<br />
&#160; htmLawed is highly customizable, and fast with low memory usage. Its free and open-source code is in one small file, does not require extensions or libraries, and works in older versions of PHP as well. It is a good alternative to the HTML <a href="http://tidy.sourceforge.net">Tidy</a>&#160;application.<br />
@ -915,7 +915,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text.<br />
<br />
&#160; The <span class="term">$config["clean_ms_char"]</span>&#160;parameter should not be used if authors do not copy-paste Microsoft-created text, or if the input text is not believed to use the <span class="term">Windows 1252</span>&#160;(<span class="term">Cp-1252</span>) or a similar encoding like <span class="term">Cp-1251</span>. Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.<br />
&#160; The <span class="term">$config["clean_ms_char"]</span>&#160;parameter should not be used if authors do not copy-paste Microsoft-created text, or if the input text is not believed to use the <span class="term">Windows 1252</span>&#160;(<span class="term">Cp-1252</span>) or a similar encoding like <span class="term">Cp-1251</span>&#160;(otherwise, for example when UTF-8 encoding is in use, Japanese or Korean characters can get mangled). Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.<br />
</div>
<div class="sub-section"><h3>
@ -1773,15 +1773,21 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; <em>Version number - Release date. Notes</em><br />
<br />
&#160; 1.1.17 - 11 March 2014. Removed use of PHP function preg_replace with <span class="term">e</span>&#160;modifier for compatibility with PHP 5.5<br />
<br />
&#160; 1.1.16 - 29 August 2013. Fix for a potential security vulnerability arising from specially encoded space characters in URL schemes/protocols<br />
<br />
&#160; 1.1.15 - 11 August 2013. Improved tidying/prettifying functionality<br />
<br />
&#160; 1.1.14 - 8 August 2012. Fix for possible segmental loss of incremental indentation during <span class="term">tidying</span>&#160;when <span class="term">balance</span>&#160;is disabled; fix for non-effectuation under some circumstances of a corrective behavior to preserve plain text within elements like <span class="term">blockquote</span>.<br />
<br />
&#160; 1.1.13 - 22 July 2012. Added feature allowing use of custom, non-standard attributes or custom rules for standard attributes<br />
<br />
&#160; 1.1.12 - 5 July 2012. Fix for a bug in identifying an unquoted value of the <span class="term">face</span>&#160;attribute<br />
<br />
&#160; 1.1.11 - 5 June 2012. Fix for possible problem with handling of multi-byte characters in attribute values in an mbstring.func_overload enviroment. <span class="term">$config["hook_tag"]</span>, if specified, now receives names of elements in closing tags.<br />
&#160; 1.1.11 - 5 June 2012. Fix for possible problem with handling of multi-byte characters in attribute values in an mbstring.func_overload environment. <span class="term">$config["hook_tag"]</span>, if specified, now receives names of elements in closing tags.<br />
<br />
&#160; 1.1.10 - 22 October 2011. Fix for a bug in the <span class="term">tidy</span>&#160;functionality that caused the entire input to be replaced with a single space; new parameter, <span class="term">$config["direct_list_nest"]</span>&#160;to allow direct descendance of a list in a list. (5 April 2012. Dual licensing from LGPLv3 to LGPLv3 and GPLv2+.)<br />
&#160; 1.1.10 - 22 October 2011. Fix for a bug in the <span class="term">tidy</span>&#160;functionality that caused the entire input to be replaced with a single space; new parameter, <span class="term">$config["direct_list_nest"]</span>&#160;to allow direct descendence of a list in a list. (5 April 2012. Dual licensing from LGPLv3 to LGPLv3 and GPLv2+.)<br />
<br />
&#160; 1.1.9.5 - 6 July 2011. Minor correction of a rule for nesting of <span class="term">li</span>&#160;within <span class="term">dir</span><br />
<br />
@ -1898,7 +1904,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s4.10" id="s4.10"></a><span class="item-no">4.10</span>&#160; Acknowledgements
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Ulf Harnhammer, Gareth Heyes, Klaus Leithoff, Lukasz Pilorz, Shelley Powers, Edward Yang, and many anonymous users.<br />
&#160; Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Dac Chartrand, Ulf Harnhammer, Gareth Heyes, Klaus Leithoff, Lukasz Pilorz, Shelley Powers, Harro Verton, Edward Yang, and many anonymous users.<br />
<br />
&#160; Thank you!<br />
@ -2167,7 +2173,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
</div>
</div>
<br />
<hr /><br /><br /><span class="subtle"><small>HTM version of <em><a href="htmLawed_README.txt">htmLawed_README.txt</a></em> generated on 18 Sep, 2012 using <a href="http://www.bioinformatics.org/phplabware/internal_utilities">rTxt2htm</a> from PHP Labware</small></span>
<hr /><br /><br /><span class="subtle"><small>HTM version of <em><a href="htmLawed_README.txt">htmLawed_README.txt</a></em> generated on 11 Mar, 2014 using <a href="http://www.bioinformatics.org/phplabware/internal_utilities">rTxt2htm</a> from PHP Labware</small></span>
</div><!-- ended div body -->
</div><!-- ended div top -->
</body>

View File

@ -1,6 +1,6 @@
/*
htmLawed_README.txt, 17 September 2012
htmLawed 1.1.14, 8 August 2012
htmLawed_README.txt, 11 March 2014
htmLawed 1.1.17, 11 March 2014
Copyright Santosh Patnaik
Dual licensed with LGPL 3 and GPL 2+
A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed
@ -73,7 +73,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
== 1 About htmLawed ================================================
htmLawed is a PHP script to process text with HTML markup to make it more comliant with HTML standards and administrative policies. It works by making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting (XSS) attacks, allowing only specified HTML tags and attributes, and so on. Such `lawing in` of HTML in text used in (X)HTML or XML documents ensures that it is in accordance with the aesthetics, safety and usability requirements set by administrators.
htmLawed is a PHP script to process text with HTML markup to make it more compliant with HTML standards and administrative policies. It works by making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting (XSS) attacks, allowing only specified HTML tags and attributes, and so on. Such `lawing in` of HTML in text used in (X)HTML or XML documents ensures that it is in accordance with the aesthetics, safety and usability requirements set by administrators.
htmLawed is highly customizable, and fast with low memory usage. Its free and open-source code is in one small file, does not require extensions or libraries, and works in older versions of PHP as well. It is a good alternative to the HTML Tidy:- http://tidy.sourceforge.net application.
@ -744,7 +744,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text.
The '$config["clean_ms_char"]' parameter should not be used if authors do not copy-paste Microsoft-created text, or if the input text is not believed to use the 'Windows 1252' ('Cp-1252') or a similar encoding like 'Cp-1251'. Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.
The '$config["clean_ms_char"]' parameter should not be used if authors do not copy-paste Microsoft-created text, or if the input text is not believed to use the 'Windows 1252' ('Cp-1252') or a similar encoding like 'Cp-1251' (otherwise, for example when UTF-8 encoding is in use, Japanese or Korean characters can get mangled). Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.
-- 3.2 Character references/entities ------------------------------o
@ -1344,15 +1344,21 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
`Version number - Release date. Notes`
1.1.17 - 11 March 2014. Removed use of PHP function preg_replace with 'e' modifier for compatibility with PHP 5.5
1.1.16 - 29 August 2013. Fix for a potential security vulnerability arising from specially encoded space characters in URL schemes/protocols
1.1.15 - 11 August 2013. Improved tidying/prettifying functionality
1.1.14 - 8 August 2012. Fix for possible segmental loss of incremental indentation during 'tidying' when 'balance' is disabled; fix for non-effectuation under some circumstances of a corrective behavior to preserve plain text within elements like 'blockquote'.
1.1.13 - 22 July 2012. Added feature allowing use of custom, non-standard attributes or custom rules for standard attributes
1.1.12 - 5 July 2012. Fix for a bug in identifying an unquoted value of the 'face' attribute
1.1.11 - 5 June 2012. Fix for possible problem with handling of multi-byte characters in attribute values in an mbstring.func_overload enviroment. '$config["hook_tag"]', if specified, now receives names of elements in closing tags.
1.1.11 - 5 June 2012. Fix for possible problem with handling of multi-byte characters in attribute values in an mbstring.func_overload environment. '$config["hook_tag"]', if specified, now receives names of elements in closing tags.
1.1.10 - 22 October 2011. Fix for a bug in the 'tidy' functionality that caused the entire input to be replaced with a single space; new parameter, '$config["direct_list_nest"]' to allow direct descendance of a list in a list. (5 April 2012. Dual licensing from LGPLv3 to LGPLv3 and GPLv2+.)
1.1.10 - 22 October 2011. Fix for a bug in the 'tidy' functionality that caused the entire input to be replaced with a single space; new parameter, '$config["direct_list_nest"]' to allow direct descendence of a list in a list. (5 April 2012. Dual licensing from LGPLv3 to LGPLv3 and GPLv2+.)
1.1.9.5 - 6 July 2011. Minor correction of a rule for nesting of 'li' within 'dir'
@ -1462,7 +1468,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
-- 4.10 Acknowledgements ------------------------------------------o
Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Ulf Harnhammer, Gareth Heyes, Klaus Leithoff, Lukasz Pilorz, Shelley Powers, Edward Yang, and many anonymous users.
Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Dac Chartrand, Ulf Harnhammer, Gareth Heyes, Klaus Leithoff, Lukasz Pilorz, Shelley Powers, Harro Verton, Edward Yang, and many anonymous users.
Thank you!

View File

@ -1,6 +1,6 @@
/*
htmLawed_TESTCASE.txt, 14 August 2012
htmLawed 1.1.14, 8 August 2012
htmLawed_TESTCASE.txt, 27 August 2013
htmLawed 1.1.17, 11 March 2014
Copyright Santosh Patnaik
Dual licensed with LGPL 3 and GPL 2+
A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed
@ -28,6 +28,7 @@ character encoding to Unicode/UTF-8
<strong>Deprecated:</strong> <a id="id7" target="self" name="n">a</a>, <hr noshade="noshade" /><br />
<strong>Casing:</strong> <a HREF=""></a><br />
<strong>Custom:</strong> <img alt="image" my:data="portrait" /><br />
<strong>Data-*:</strong> <a data-xml="x" data-xmnt="x" data-xmlnt="x" data-xmn:t="x" data-xmxm="x">a</a><br />
<strong>Admin-restricted?:</strong> <a href="x" onclick="alert();"></a>
<h6>Attribute values</h6>
@ -205,6 +206,13 @@ text <img src="none" alt="none" /> <b>t<em> e <strong> x </strong> t</em></b>
<strong>Malformed:</strong> <![cdata check ]]>, < ![CDATA check ]]>, < ![CDATA check ] ]><br />
Invalid:</strong> <em <!-- check -->>comment in tag content</em>, <!--check-->
<h6>HTML5</h6>
<strong>figure and figcaption:</strong> <figure><img src="picture.jpg" alt="picture"><figcaption>Caption for the awesome picture</figcaption></figure>
<strong>article:</strong> <h1>A</h1><p>B</p><article><h2>C</h2></article><article><h2>E</h2><p>F</p><p>G</p></article>
<strong>meter</strong>: <p>Heat <meter min="100" max="200" value="150">150</meter>.</p>
<strong>datalist</strong>: <input list="b" /><datalist id="b"><option value="c"><option value="d"></datalist>
<h6>Ins-Del</h6>
(depending on context, these elements can be of either block or inline type)<br />
@ -265,6 +273,10 @@ Invalid:</strong> <em <!-- check -->>comment in tag content</em>, <!--check-->
</form>
</li></ul>
</td></tr></table></li></ol>
<strong>Menu</strong>: <menu type="toolbar"><li><menu label="File">
<button type="button" onclick="new()">New...</button>
</menu></li><li><menu label="Edit"><button type="button" onclick="cut()">Cut...</button></menu></li>
</menu>
<h6>Microdata</h6>
@ -279,6 +291,10 @@ Find me at <a href="http://www.xy.com" itemprop="url">www.xy.com</a>
<strong>XML declaration</strong>: <?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" /><br />
<strong>XML-invalid character code-point (may not replicate)</strong>: <p class=3DMsoNormal>“Where is he?” asked both Mary the one so lovely and Jane.</p>
<h6>Nesting</h6>
<strong>Block or inline a</strong>: <p><a href="link">text</a></p><a href="link"><div>hi</div></a><br />
<h6>Non-English text-1</h6>
Inscrieţi-vă acum la a Zecea Conferinţă Internaţională<br />
@ -326,6 +342,7 @@ na Alemanha.
<rp>(</rp><rt>aaa</rt><rp>)</rp>
</ruby>
<h6>Tables</h6>
<strong>Omitted closing tags:</strong> <table>
@ -357,6 +374,9 @@ na Alemanha.
<strong>Font element intended as 'block' element:</strong> <div><font color='red'><div>hi</div></font></div><br />
<strong>Font element intended as 'block' element:</strong> <center><font color='red'><div>hi</div><div>QQQ</div></font></center><br />
<h6>Tidy</h6>
<strong>White-space handling:</strong> abc<em> def </em> ghi abc <em>def</em> ghi
<h6>URLs</h6>
<strong>Relative and absolute:</strong> <a href="mailto:x"></a>, <a href="http://a.com/b/c/d.f"></a>, <a href="./../d.f"></a>, <a href="./d.f"></a>, <a href="d.f"></a>, <a href="#s"></a>, <a href="./../../d.f#s"></a><br />
@ -382,6 +402,7 @@ src=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#1
<a style=";-moz-binding:url(http://lukasz.pilorz.net/xss/xss.xml#xss)" href="http://example.com">test</a><br />
<strong>Bad IE7:</strong> <a href="http://x&x=%22+style%3d%22background-image%3a+expression%28alert
%28%27xss%3f%29%29">x</a><br />
<strong>Opera:</strong> <a href="\xE2\x80\x83javascript:alert(123)">link</a>
<strong>Bad IE7:</strong> <a style=color:expr/*comment*/ession(alert(document.domain))>xxx</a><br />
<strong>Bad IE7:</strong> <a href="xxx" style="background: exp&#x72;ession(alert('xss'));">xxx</a><br />
<strong>Bad IE7:</strong> <a href="xxx" style="background: &#101;xpression(alert('xss'));">xxx</a><br />