Remove width and height from the key used to detect repetitive elements, to increase accuracy

This commit is contained in:
Johannes Zillmann 2017-02-03 12:37:39 +01:00
parent 91087d550b
commit 12f6105d90

View File

@ -21,7 +21,7 @@ function hashCodeIgnoringNumbers(string) {
function combineCoordinates(textItem) { function combineCoordinates(textItem) {
var hashCode = hashCodeIgnoringNumbers(textItem.text); var hashCode = hashCodeIgnoringNumbers(textItem.text);
return `${textItem.x}-${textItem.y}-${textItem.width}-${textItem.height}-${hashCode}`; return `${textItem.x}-${textItem.y}-${hashCode}`;
} }
// Remove elements with similar content on same page positions, like page numbers, license information, etc... // Remove elements with similar content on same page positions, like page numbers, license information, etc...
@ -37,7 +37,7 @@ export default class RemoveRepetitiveElements extends Transformation {
transform(pages:PdfPage[]) { transform(pages:PdfPage[]) {
//build repetition counts for every element //build repetition counts for every element
var repetitionCounts = {}; const repetitionCounts = {};
pages.forEach(pdfPage => { pages.forEach(pdfPage => {
pdfPage.textItems.forEach(textItem => { pdfPage.textItems.forEach(textItem => {
var combinedCoordinates = combineCoordinates(textItem); var combinedCoordinates = combineCoordinates(textItem);