Files
pdf-to-markdown/src/javascript/models/transformations/RemoveRepetitiveElements.jsx
2017-02-14 21:47:54 +01:00

63 lines
2.2 KiB
JavaScript

import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import PdfPage from '../PdfPage.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
import { isDigit } from '../../functions.jsx'
/**
 * Computes a signed 32-bit hash of a string while skipping every character
 * whose char code the imported `isDigit` helper accepts.
 *
 * Two strings that differ only in the skipped characters (for example
 * "Page 3" and "Page 12", assuming `isDigit` matches '0'-'9') therefore
 * produce the same hash. An empty string, or a string made up solely of
 * skipped characters, hashes to 0.
 *
 * @param string the text to hash
 * @returns the hash as a signed 32-bit integer
 */
function hashCodeIgnoringNumbers(string) {
    let result = 0;
    for (let index = 0; index < string.length; index += 1) {
        const code = string.charCodeAt(index);
        if (isDigit(code)) {
            continue;
        }
        // (result << 5) - result is result * 31; "| 0" truncates to a signed 32-bit integer.
        result = ((result << 5) - result + code) | 0;
    }
    return result;
}
/**
 * Builds the lookup key used to recognise repeated text items: the item's
 * `x` and `y` values followed by the hash of its `text` (computed by
 * `hashCodeIgnoringNumbers`), joined with '-'.
 *
 * @param textItem object providing `x`, `y` and `text`
 * @returns a string of the form "<x>-<y>-<hash>"
 */
function combineCoordinates(textItem) {
    const contentHash = hashCodeIgnoringNumbers(textItem.text);
    const position = `${textItem.x}-${textItem.y}`;
    return `${position}-${contentHash}`;
}
// Remove elements with similar content at the same page positions, like page numbers, license information, etc.
export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
constructor() {
super("Remove Repetitive Elements");
}
transform(pages:PdfPage[]) {
//build repetition counts for every element
const repetitionCounts = {};
pages.forEach(pdfPage => {
pdfPage.textItems.forEach(textItem => {
var combinedCoordinates = combineCoordinates(textItem);
repetitionCounts[combinedCoordinates] = repetitionCounts[combinedCoordinates] ? repetitionCounts[combinedCoordinates] + 1 : 1;
});
});
// annotate elements with repetition as removed
pages.forEach(pdfPage => {
pdfPage.textItems.forEach(textItem => {
var combinedCoordinates = combineCoordinates(textItem);
if (repetitionCounts[combinedCoordinates] > 1) {
// console.debug("page " + pdfPage.index + " removed :" + repetitionCounts[combinedCoordinates] + " :" + textItem.text);
textItem.annotation = REMOVED_ANNOTATION;
}
});
});
return pages;
}
processAnnotations(pages:PdfPage[]) {
pages.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
});
return pages;
}
}