mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-21 18:11:54 +02:00
Remove width and height from the key used to detect repetitive items, to increase accuracy
This commit is contained in:
parent
91087d550b
commit
12f6105d90
@ -21,7 +21,7 @@ function hashCodeIgnoringNumbers(string) {
|
|||||||
|
|
||||||
function combineCoordinates(textItem) {
|
function combineCoordinates(textItem) {
|
||||||
var hashCode = hashCodeIgnoringNumbers(textItem.text);
|
var hashCode = hashCodeIgnoringNumbers(textItem.text);
|
||||||
return `${textItem.x}-${textItem.y}-${textItem.width}-${textItem.height}-${hashCode}`;
|
return `${textItem.x}-${textItem.y}-${hashCode}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove elements with similar content at the same page positions, like page numbers, license information, etc...
|
// Remove elements with similar content at the same page positions, like page numbers, license information, etc...
|
||||||
@ -37,7 +37,7 @@ export default class RemoveRepetitiveElements extends Transformation {
|
|||||||
|
|
||||||
transform(pages:PdfPage[]) {
|
transform(pages:PdfPage[]) {
|
||||||
//build repetition counts for every element
|
//build repetition counts for every element
|
||||||
var repetitionCounts = {};
|
const repetitionCounts = {};
|
||||||
pages.forEach(pdfPage => {
|
pages.forEach(pdfPage => {
|
||||||
pdfPage.textItems.forEach(textItem => {
|
pdfPage.textItems.forEach(textItem => {
|
||||||
var combinedCoordinates = combineCoordinates(textItem);
|
var combinedCoordinates = combineCoordinates(textItem);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user