From 08739c08848f2fd316ba283344343ffb3f20acf5 Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Fri, 27 Jan 2017 21:40:49 +0100 Subject: [PATCH] Remove repetitive items (page header/footer) --- src/javascript/components/DebugView.jsx | 5 +- src/javascript/components/PdfPageView.jsx | 10 ++- src/javascript/models/Annotation.jsx | 9 +++ src/javascript/models/AppState.jsx | 9 ++- src/javascript/models/TextItem.jsx | 3 +- .../CombineSameYTransformation.jsx | 2 +- .../RemoveRepetitiveElements.jsx | 70 +++++++++++++++++++ .../RoundCoordinatesTransformation.jsx | 2 +- .../models/transformations/Transformation.jsx | 5 ++ 9 files changed, 109 insertions(+), 6 deletions(-) create mode 100644 src/javascript/models/Annotation.jsx create mode 100644 src/javascript/models/transformations/RemoveRepetitiveElements.jsx diff --git a/src/javascript/components/DebugView.jsx b/src/javascript/components/DebugView.jsx index d78163b..b0d81ac 100644 --- a/src/javascript/components/DebugView.jsx +++ b/src/javascript/components/DebugView.jsx @@ -61,9 +61,12 @@ export default class DebugView extends React.Component { var contentView; var lastTransformation; for (var i = 0; i <= currentTransformation; i++) { + if (lastTransformation) { + transformedPages = lastTransformation.processAnnotations(transformedPages); + } transformedPages = transformations[i].transform(transformedPages); - lastTransformation = transformations[i]; contentView = transformations[i].contentView(); + lastTransformation = transformations[i]; } var pageComponents; diff --git a/src/javascript/components/PdfPageView.jsx b/src/javascript/components/PdfPageView.jsx index 5d9e11e..f61c24d 100644 --- a/src/javascript/components/PdfPageView.jsx +++ b/src/javascript/components/PdfPageView.jsx @@ -34,10 +34,15 @@ export default class PdfPageView extends React.Component { Height + + Annotation + - { this.props.pdfPage.textItems.map((textItem, i) => + { this.props.pdfPage.textItems.map((textItem, i) => { i } @@ -56,6 +61,9 @@ 
export default class PdfPageView extends React.Component { { textItem.height } + + { textItem.annotation ? textItem.annotation.category : '' } + ) } diff --git a/src/javascript/models/Annotation.jsx b/src/javascript/models/Annotation.jsx new file mode 100644 index 0000000..65eb890 --- /dev/null +++ b/src/javascript/models/Annotation.jsx @@ -0,0 +1,9 @@ +// Annotation for a text item +export default class Annotation { + + constructor(options) { + this.category = options.category; + this.color = options.color; + } + +} diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index 79acc82..ffb0203 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -3,6 +3,7 @@ import { Enum } from 'enumify'; import NoOpTransformation from './transformations/NoOpTransformation.jsx'; import RoundCoordinatesTransformation from './transformations/RoundCoordinatesTransformation.jsx'; import CombineSameYTransformation from './transformations/CombineSameYTransformation.jsx'; +import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx' import ToTextPagesTransformation from './transformations/ToTextPagesTransformation.jsx'; import ToSingleTextPageTransformation from './transformations/ToSingleTextPageTransformation.jsx' @@ -14,7 +15,13 @@ export default class AppState { this.mainView = View.UPLOAD; this.fileBuffer; this.pdfPages = []; - this.transformations = [new NoOpTransformation(), new RoundCoordinatesTransformation(), new CombineSameYTransformation(), new ToTextPagesTransformation(), new ToSingleTextPageTransformation()]; + this.transformations = [ + new NoOpTransformation(), + new RoundCoordinatesTransformation(), + new CombineSameYTransformation(), + new RemoveRepetitiveElements(), + new ToTextPagesTransformation(), + new ToSingleTextPageTransformation()]; //bind functions this.render = this.render.bind(this); diff --git a/src/javascript/models/TextItem.jsx 
b/src/javascript/models/TextItem.jsx index 1a83a2e..c1765d4 100644 --- a/src/javascript/models/TextItem.jsx +++ b/src/javascript/models/TextItem.jsx @@ -1,4 +1,4 @@ -//Holds individual text items of a page +//A text item, i.e. a line, within a page export default class TextItem { constructor(options) { @@ -7,6 +7,7 @@ export default class TextItem { this.width = options.width; this.height = options.height; this.text = options.text; + this.annotation = options.annotation; } } diff --git a/src/javascript/models/transformations/CombineSameYTransformation.jsx b/src/javascript/models/transformations/CombineSameYTransformation.jsx index 941f695..5f41af6 100644 --- a/src/javascript/models/transformations/CombineSameYTransformation.jsx +++ b/src/javascript/models/transformations/CombineSameYTransformation.jsx @@ -6,7 +6,7 @@ import ContentView from '../ContentView.jsx'; export default class CombineSameYTransformation extends Transformation { constructor() { - super("Combine text on same Y"); + super("Combine Text On Same Y"); } contentView() { diff --git a/src/javascript/models/transformations/RemoveRepetitiveElements.jsx b/src/javascript/models/transformations/RemoveRepetitiveElements.jsx new file mode 100644 index 0000000..f341f38 --- /dev/null +++ b/src/javascript/models/transformations/RemoveRepetitiveElements.jsx @@ -0,0 +1,70 @@ +import Transformation from './Transformation.jsx'; +import Annotation from '../Annotation.jsx'; +import PdfPage from '../PdfPage.jsx'; +import ContentView from '../ContentView.jsx'; + + +function hashCodeIgnoringNumbers(string) { + var hash = 0, i, charCode, len, isNumber; + if (string.length === 0) return hash; + for (i = 0, len = string.length; i < len; i++) { + charCode = string.charCodeAt(i); + isNumber = charCode >= 48 && charCode <= 57; + if (!isNumber) { + hash = ((hash << 5) - hash) + charCode; + hash |= 0; // Convert to 32bit integer + } + } + return hash; +} + +function combineCoordinates(textItem) { + var hashCode = 
hashCodeIgnoringNumbers(textItem.text); + return `${textItem.x}-${textItem.y}-${textItem.width}-${textItem.height}-${hashCode}`; +} + +// Remove elements with similar content on same page positions, like page numbers, license information, etc... +export default class RemoveRepetitiveElements extends Transformation { + + constructor() { + super("Remove Repetitive Elements"); + } + + contentView() { + return ContentView.PDF; + } + + transform(pages:PdfPage[]) { + //build repetition counts for every element + var repetitionCounts = {}; + pages.forEach(pdfPage => { + pdfPage.textItems.forEach(textItem => { + var combinedCoordinates = combineCoordinates(textItem); + repetitionCounts[combinedCoordinates] = repetitionCounts[combinedCoordinates] ? repetitionCounts[combinedCoordinates] + 1 : 1; + }); + }); + + // annotate elements with repetition as removed + pages.forEach(pdfPage => { + pdfPage.textItems.forEach(textItem => { + var combinedCoordinates = combineCoordinates(textItem); + if (repetitionCounts[combinedCoordinates] > 1) { + // console.debug("page " + pdfPage.index + " removed :" + repetitionCounts[combinedCoordinates] + " :" + textItem.text); + textItem.annotation = new Annotation({ + category: 'removed', + color: 'red' + }); + } + }); + }); + return pages; + } + + processAnnotations(pages:PdfPage[]) { + pages.forEach(page => { + page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed'); + }); + return pages; + } + +} \ No newline at end of file diff --git a/src/javascript/models/transformations/RoundCoordinatesTransformation.jsx b/src/javascript/models/transformations/RoundCoordinatesTransformation.jsx index fa30a7a..f4ec82e 100644 --- a/src/javascript/models/transformations/RoundCoordinatesTransformation.jsx +++ b/src/javascript/models/transformations/RoundCoordinatesTransformation.jsx @@ -5,7 +5,7 @@ import ContentView from '../ContentView.jsx'; export default class RoundCoordinatesTransformation 
extends Transformation { constructor() { - super("Round coordinates"); + super("Round Coordinates"); } contentView() { diff --git a/src/javascript/models/transformations/Transformation.jsx b/src/javascript/models/transformations/Transformation.jsx index 4b0b8d4..4173066 100644 --- a/src/javascript/models/transformations/Transformation.jsx +++ b/src/javascript/models/transformations/Transformation.jsx @@ -25,5 +25,10 @@ export default class Transformation { throw new TypeError("Do not call abstract method foo from child."); } + // Annotations which have been added during transform() can now be cleaned-up / handled + processAnnotations(pages) { // eslint-disable-line no-unused-vars + return pages; + } + } \ No newline at end of file