Improve combine on same Y

This commit is contained in:
Johannes Zillmann 2017-01-28 22:56:44 +01:00
parent df07968c4d
commit b5177a9ed3

View File

@ -4,6 +4,47 @@ import PdfPage from '../PdfPage.jsx';
import ContentView from '../ContentView.jsx'; import ContentView from '../ContentView.jsx';
import Annotation from '../Annotation.jsx'; import Annotation from '../Annotation.jsx';
function combineTextItems(textItems:TextItem[]) {
var numChars = 0;
var sumWidth = 0;
var maxHeight = 0;
textItems.forEach(textItem => {
if (textItem.width > 0) {
numChars += textItem.text.length;
sumWidth += textItem.width;
}
maxHeight = Math.max(textItem.height, maxHeight);
});
const avgCharacterWidth = Math.round(sumWidth / numChars);
var combinedText = '';
var sumWidthWithWhitespaces = sumWidth;
var lastItemX;
var lastItemWidth;
textItems.forEach(textItem => {
if (lastItemX && textItem.x - lastItemX - lastItemWidth > avgCharacterWidth) {
combinedText += ' ';
sumWidthWithWhitespaces += avgCharacterWidth;
}
combinedText += textItem.text;
lastItemX = textItem.x;
lastItemWidth = textItem.width > 0 ? textItem.width : avgCharacterWidth / 2 * textItem.text.length;
});
return new TextItem({
x: textItems[0].x,
y: textItems[0].y,
width: sumWidthWithWhitespaces,
height: maxHeight,
text: combinedText,
annotation: new Annotation({
category: 'combined',
color: 'green'
})
});
}
export default class CombineSameYTransformation extends Transformation { export default class CombineSameYTransformation extends Transformation {
constructor() { constructor() {
@ -20,50 +61,36 @@ export default class CombineSameYTransformation extends Transformation {
category: 'removed', category: 'removed',
color: 'red' color: 'red'
}); });
const combinedAnnotation = new Annotation({
category: 'combined',
color: 'green'
});
return pages.map(pdfPage => { return pages.map(pdfPage => {
const newTextItems = []; const newTextItems = [];
var lastTextItem; var textItemsWithSameY = [];
pdfPage.textItems.forEach(textItem => {
if (!lastTextItem) {
lastTextItem = textItem;
} else {
if (textItem.y == lastTextItem.y) { //combine
if (!lastTextItem.annotation) { var completeTextItemsWithSameY = function(textItemsWithSameY) {
lastTextItem.annotation = removedAnnotation; if (textItemsWithSameY.length == 1) {
newTextItems.push(lastTextItem); newTextItems.push(textItemsWithSameY[0]);
} } else {
// add removed text-items
textItemsWithSameY.forEach(textItem => {
textItem.annotation = removedAnnotation; textItem.annotation = removedAnnotation;
newTextItems.push(textItem); newTextItems.push(textItem);
});
newTextItems.push(combineTextItems(textItemsWithSameY));
}
}
var combinedText = lastTextItem.text; pdfPage.textItems.forEach(textItem => {
//TODO make 5 dependent on text size or biggest gap? if (textItemsWithSameY.length == 0 || textItem.y == textItemsWithSameY[textItemsWithSameY.length - 1].y) {
if (textItem.x - lastTextItem.x - lastTextItem.width > 7) { //fill array
combinedText += ' '; textItemsWithSameY.push(textItem);
} } else {
combinedText += textItem.text; //rotate
completeTextItemsWithSameY(textItemsWithSameY);
lastTextItem = new TextItem({ textItemsWithSameY = [textItem];
x: lastTextItem.x,
y: lastTextItem.y,
width: textItem.x - lastTextItem.x + textItem.width,
height: lastTextItem.height, //might this cause problems ?
text: combinedText,
annotation: combinedAnnotation
});
} else { //rotate
newTextItems.push(lastTextItem);
lastTextItem = textItem;
}
} }
}); });
if (lastTextItem) { if (textItemsWithSameY.length > 0) {
newTextItems.push(lastTextItem); completeTextItemsWithSameY(textItemsWithSameY);
} }
return { return {