mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-21 01:58:01 +02:00
Remove width and height from the key used to detect repetitive items, to increase accuracy
This commit is contained in:
parent
91087d550b
commit
12f6105d90
@ -21,7 +21,7 @@ function hashCodeIgnoringNumbers(string) {
|
||||
|
||||
function combineCoordinates(textItem) {
|
||||
var hashCode = hashCodeIgnoringNumbers(textItem.text);
|
||||
return `${textItem.x}-${textItem.y}-${textItem.width}-${textItem.height}-${hashCode}`;
|
||||
return `${textItem.x}-${textItem.y}-${hashCode}`;
|
||||
}
|
||||
|
||||
// Remove elements with similar content at the same page positions, like page numbers, license information, etc.
|
||||
@ -37,7 +37,7 @@ export default class RemoveRepetitiveElements extends Transformation {
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
//build repetition counts for every element
|
||||
var repetitionCounts = {};
|
||||
const repetitionCounts = {};
|
||||
pages.forEach(pdfPage => {
|
||||
pdfPage.textItems.forEach(textItem => {
|
||||
var combinedCoordinates = combineCoordinates(textItem);
|
||||
|
Loading…
x
Reference in New Issue
Block a user