Merge: more messing around with encoding, this time ">" was getting stripped

the source was plaintext going into an XML document without parsing html styles
This commit is contained in:
nathan 2025-01-24 11:18:44 -07:00
parent 2457b77cca
commit 3f5a0900f1
3 changed files with 170 additions and 51 deletions

View File

@ -1403,47 +1403,48 @@ abstract class Merge
}
}
}
if (!empty($is_xml)) // zip'ed xml document (eg. OO)
{
// Numeric fields
$names = array();
// Numeric fields
$names = array();
// Tags we can replace with the target document's version
$replace_tags = array();
// only keep tags, if we have xsl extension available
if(class_exists('XSLTProcessor') && class_exists('DOMDocument') && $this->parse_html_styles)
// Tags we can replace with the target document's version
$replace_tags = array();
// only keep tags, if we have xsl extension available
if(class_exists('XSLTProcessor') && class_exists('DOMDocument') && $this->parse_html_styles)
{
switch($mimetype . $mso_application_progid)
{
switch($mimetype . $mso_application_progid)
{
case 'text/html':
$replace_tags = array(
'<b>', '<strong>', '<i>', '<em>', '<u>', '<span>', '<ol>', '<ul>', '<li>',
'<table>', '<tr>', '<td>', '<a>', '<style>', '<img>',
);
break;
case 'application/vnd.oasis.opendocument.text': // open office
case 'application/vnd.oasis.opendocument.spreadsheet':
case 'application/vnd.oasis.opendocument.presentation':
case 'application/vnd.oasis.opendocument.text-template':
case 'application/vnd.oasis.opendocument.spreadsheet-template':
case 'application/vnd.oasis.opendocument.presentation-template':
$replace_tags = array(
'<b>', '<strong>', '<i>', '<em>', '<u>', '<span>', '<ol>', '<ul>', '<li>',
'<table>', '<tr>', '<td>', '<a>',
);
break;
case 'application/xmlWord.Document': // Word 2003*/
case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': // ms office 2007
case 'application/vnd.ms-word.document.macroenabled.12':
case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
case 'application/vnd.ms-excel.sheet.macroenabled.12':
$replace_tags = array(
'<b>', '<strong>', '<i>', '<em>', '<u>', '<span>', '<ol>', '<ul>', '<li>',
'<table>', '<tr>', '<td>',
);
break;
}
case 'text/html':
$replace_tags = array(
'<b>', '<strong>', '<i>', '<em>', '<u>', '<span>', '<ol>', '<ul>', '<li>',
'<table>', '<tr>', '<td>', '<a>', '<style>', '<img>',
);
break;
case 'application/vnd.oasis.opendocument.text': // open office
case 'application/vnd.oasis.opendocument.spreadsheet':
case 'application/vnd.oasis.opendocument.presentation':
case 'application/vnd.oasis.opendocument.text-template':
case 'application/vnd.oasis.opendocument.spreadsheet-template':
case 'application/vnd.oasis.opendocument.presentation-template':
$replace_tags = array(
'<b>', '<strong>', '<i>', '<em>', '<u>', '<span>', '<ol>', '<ul>', '<li>',
'<table>', '<tr>', '<td>', '<a>',
);
break;
case 'application/xmlWord.Document': // Word 2003*/
case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': // ms office 2007
case 'application/vnd.ms-word.document.macroenabled.12':
case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
case 'application/vnd.ms-excel.sheet.macroenabled.12':
$replace_tags = array(
'<b>', '<strong>', '<i>', '<em>', '<u>', '<span>', '<ol>', '<ul>', '<li>',
'<table>', '<tr>', '<td>',
);
break;
}
}
if(!empty($is_xml)) // zip'ed xml document (eg. OO)
{
// clean replacements from array values and html or html-entities, which mess up xml
foreach($replacements as $name => &$value)
{
@ -1454,7 +1455,6 @@ abstract class Merge
continue;
}
// decode html entities back to utf-8
if(is_string($value) && (strpos($value, '&') !== false) && $this->parse_html_styles)
{
$value = html_entity_decode($value, ENT_QUOTES, $charset);
@ -1462,16 +1462,18 @@ abstract class Merge
// remove all non-decodable entities
if(strpos($value, '&') !== false)
{
$value = preg_replace('/&[^; ]+;/', '', $value);
//$value = preg_replace('/&[^; ]+;/', '', $value);
}
}
if(!$this->parse_html_styles || (
strpos($value, "\n") !== FALSE &&
strpos($value, '<br') === FALSE && strpos($value, '<span') === FALSE && strpos($value, '<p') === FALSE && strpos($value, '<div') === FALSE
))
if(!$this->parse_html_styles)
{
// Encode special chars so they don't break the file
$value = htmlspecialchars($value, ENT_NOQUOTES);
//$value = htmlspecialchars($value, ENT_NOQUOTES);
strip_tags(str_replace(
array("\r", '<p>', "</p>\n", '</p>', '<div>', '</div>', '<br />'),
array('', '', "\n", "\n", '', "\n", "\n"), $value
), implode('', $replace_tags)
);
}
else
{
@ -1491,8 +1493,9 @@ abstract class Merge
else
{
// Strip some specific stuff to avoid the extra new lines
$value = str_replace(["<html>\n", "<head>\n<title></title>\n</head>\n", "<body>\n",
"</body>\n", "</html>\n"], '', $cleaned);
$value = str_replace(["<html>\n", '<html>', "<head>\n<title></title>\n</head>\n",
"<body>\n", '<body>',
"</body>\n", "</html>\n", '</html>'], '', $cleaned);
}
}
// replace </p> and <br /> with CRLF (remove <p> and CRLF)
@ -1564,15 +1567,27 @@ abstract class Merge
if($this->parse_html_styles)
{
$replacements = str_replace(
array('&', "\r", "\n", '&amp;lt;', '&amp;gt;'),
array('&amp;', '', $this->line_feed, '&lt;', '&gt;'),
array('&', "\r", "\n", '&amp;amp;', '&amp;lt;', '&amp;gt;', '&amp;nbsp;'),
array('&amp;', '', $this->line_feed, '&amp;', '&lt;', '&gt;', ' '),
$replacements
);
}
else
{
// Need to at least handle new lines, or it'll be run together on one line
$replacements = str_replace(array("\r", "\n"), array('', $this->line_feed), $replacements);
$replacements = str_replace(
array("\r", "\n", '&amp;amp;'),
array('', $this->line_feed, '&amp;'),
$replacements
);
}
}
else
{
// HTML into non-XML (plaintext template)
foreach($replacements as $name => &$value)
{
$value = html_entity_decode($value, ENT_QUOTES, $charset);
}
}
if($mimetype == 'application/x-yaml')
@ -2227,7 +2242,6 @@ abstract class Merge
copy($content_url, $archive);
$content_url = 'zip://' . $archive . '#' . ($content_file = 'content.xml');
$styles_url = 'zip://'.$archive.'#'.($styles_file = 'styles.xml');
$this->parse_html_styles = true;
break;
case 'application/vnd.openxmlformats-officedocument.wordprocessingml.d': // mimetypes in vfs are limited to 64 chars
$mimetype = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';

View File

@ -0,0 +1,73 @@
<?php
namespace Storage;
use EGroupware\Api\LoggedInTest;
use PHPUnit\Framework\TestCase;
require_once __DIR__ . '/../LoggedInTest.php';
require_once __DIR__ . '/TestMerge.php';
/**
 * Tests merging replacement values into minimal plain-text and HTML targets,
 * using the TestMerge stub to supply the replacement values.
 */
class MergeTest extends LoggedInTest
{
	/** Minimal target document: one placeholder and nothing else */
	const SIMPLE_TARGET = "{{replacement}}";

	/**
	 * Merge stub under test, recreated before every test
	 *
	 * Declared explicitly so setUp() does not create a dynamic property.
	 *
	 * @var TestMerge
	 */
	protected $merge;

	/**
	 * Give every test a fresh merge object, so replacements never leak between tests
	 */
	protected function setUp() : void
	{
		$this->merge = new TestMerge();
	}

	/**
	 * Test plain text into a simple text document
	 *
	 * @param string $testText Value supplied as the replacement
	 * @param string $expectedText Expected merged document
	 *
	 * @dataProvider textToTextProvider
	 */
	public function testTextToText($testText, $expectedText) : void
	{
		$errors = [];
		$this->merge->setReplacements(['$$replacement$$' => $testText]);

		$result = $this->merge->merge_string(self::SIMPLE_TARGET, [1], $errors, "text/plain");

		$this->assertEmpty($errors, "Errors when merging");
		$this->assertSame($expectedText, $result);
	}

	/**
	 * Datasets for testTextToText(): [replacement value, expected merged document]
	 *
	 * Every value is expected to come back unchanged from a plain-text target.
	 *
	 * @return array
	 */
	public static function textToTextProvider() : array
	{
		return [
			'plain text'         => ["Plain text", "Plain text"],
			'newline'            => ["New\nline text", "New\nline text"],
			'special characters' => ['Special -> characters <- & stuff', 'Special -> characters <- & stuff'],
			// HTML is text too
			'html tags'          => ['<b>Contains HTML</b>', '<b>Contains HTML</b>'],
			// HTML is text too
			'html line break'    => ['HTML<br />newline', "HTML<br />newline"],
			'multi-line'         => ["Multi-line:\n1. First line\n -> Second\n", "Multi-line:\n1. First line\n -> Second\n"],
		];
	}

	/**
	 * Test plain text into a simple HTML document
	 *
	 * Unlike the plain-text target, the value does not come back unchanged:
	 * the datasets expect newlines as <br/> and non-allowed tags removed.
	 *
	 * @param string $testText Value supplied as the replacement
	 * @param string $expectedText Expected merged document
	 *
	 * @dataProvider textToHtmlProvider
	 */
	public function testTextToHtml($testText, $expectedText) : void
	{
		$errors = [];
		$this->merge->setReplacements(['$$replacement$$' => $testText]);

		$result = $this->merge->merge_string(self::SIMPLE_TARGET, [1], $errors, "text/html");

		$this->assertEmpty($errors, "Errors when merging");
		$this->assertSame($expectedText, $result);
	}

	/**
	 * Datasets for testTextToHtml(): [replacement value, expected merged document]
	 *
	 * @return array
	 */
	public static function textToHtmlProvider() : array
	{
		return [
			'plain text'         => ["Plain text", "Plain text"],
			// Newlines get parsed anyway
			'newline'            => ["New\nline text", "New<br/>line text"],
			// strip_tags() is not smart, everything from the "<" onwards is lost. This could be improved
			'special characters' => ['Special -> characters <- & stuff', 'Special -> characters '],
			// Some tags are allowed
			'allowed tags'       => ['<b>Contains<br /> HTML</b>', '<b>Contains<br/> HTML</b>'],
			'stripped tags'      => ['<q>Contains HTML that will be stripped</q>', 'Contains HTML that will be stripped'],
			'multi-line'         => ["Multi-line:\n1. First line\n -> Second\n", "Multi-line:<br/>1. First line<br/> -> Second<br/>"],
		];
	}
}

View File

@ -0,0 +1,32 @@
<?php
namespace Storage;
use EGroupware\Api\Storage\Merge;
/**
 * Concrete Merge class for testing
 *
 * Returns whatever replacements the test injected, instead of looking
 * them up for an entry.
 */
class TestMerge extends Merge
{
	/**
	 * Replacements handed out by get_replacements(), keyed by placeholder
	 *
	 * @var array
	 */
	private $replacements = [];

	/**
	 * Set the replacements returned for every merged ID
	 *
	 * @param array $replacements placeholder => value
	 */
	public function setReplacements(array $replacements)
	{
		$this->replacements = $replacements;
	}

	/**
	 * Switch HTML style parsing on or off for subsequent merges
	 *
	 * @param boolean $parseHtmlStyles
	 */
	public function setParseHtmlStyles($parseHtmlStyles)
	{
		// Merge reads $this->parse_html_styles; writing to a camelCase name
		// would only create an unused property and leave the flag untouched
		$this->parse_html_styles = $parseHtmlStyles;
	}

	/**
	 * @inheritDoc
	 *
	 * Ignores $id and $content and returns the injected replacements as they are.
	 */
	protected function get_replacements($id, &$content = null)
	{
		return $this->replacements;
	}
}