// Mirror of https://github.com/jzillmann/pdf-to-markdown.git
// Synced 2025-07-13 20:45:14 +02:00
// 63 lines, 2.2 KiB, JavaScript
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
|
import PdfPage from '../PdfPage.jsx';
|
|
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
|
|
|
import { isDigit } from '../../functions.jsx'
|
|
|
|
|
|
function hashCodeIgnoringNumbers(string) {
|
|
var hash = 0, i, charCode, len;
|
|
if (string.length === 0) return hash;
|
|
for (i = 0, len = string.length; i < len; i++) {
|
|
charCode = string.charCodeAt(i);
|
|
if (!isDigit(charCode)) {
|
|
hash = ((hash << 5) - hash) + charCode;
|
|
hash |= 0; // Convert to 32bit integer
|
|
}
|
|
}
|
|
return hash;
|
|
}
|
|
|
|
function combineCoordinates(textItem) {
|
|
var hashCode = hashCodeIgnoringNumbers(textItem.text);
|
|
return `${textItem.x}-${textItem.y}-${hashCode}`;
|
|
}
|
|
|
|
// Remove elements with similar content on same page positions, like page numbers, licenes information, etc...
|
|
export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
|
|
|
|
constructor() {
|
|
super("Remove Repetitive Elements");
|
|
}
|
|
|
|
transform(pages:PdfPage[]) {
|
|
//build repetition counts for every element
|
|
const repetitionCounts = {};
|
|
pages.forEach(pdfPage => {
|
|
pdfPage.textItems.forEach(textItem => {
|
|
var combinedCoordinates = combineCoordinates(textItem);
|
|
repetitionCounts[combinedCoordinates] = repetitionCounts[combinedCoordinates] ? repetitionCounts[combinedCoordinates] + 1 : 1;
|
|
});
|
|
});
|
|
|
|
// annotate elements with repetition as removed
|
|
pages.forEach(pdfPage => {
|
|
pdfPage.textItems.forEach(textItem => {
|
|
var combinedCoordinates = combineCoordinates(textItem);
|
|
if (repetitionCounts[combinedCoordinates] > 1) {
|
|
// console.debug("page " + pdfPage.index + " removed :" + repetitionCounts[combinedCoordinates] + " :" + textItem.text);
|
|
textItem.annotation = REMOVED_ANNOTATION;
|
|
}
|
|
});
|
|
});
|
|
return pages;
|
|
}
|
|
|
|
processAnnotations(pages:PdfPage[]) {
|
|
pages.forEach(page => {
|
|
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
|
});
|
|
return pages;
|
|
}
|
|
|
|
} |