diff --git a/src/javascript/components/DebugView.jsx b/src/javascript/components/DebugView.jsx
index d78163b..b0d81ac 100644
--- a/src/javascript/components/DebugView.jsx
+++ b/src/javascript/components/DebugView.jsx
@@ -61,9 +61,12 @@ export default class DebugView extends React.Component {
var contentView;
var lastTransformation;
for (var i = 0; i <= currentTransformation; i++) {
+ if (lastTransformation) {
+ transformedPages = lastTransformation.processAnnotations(transformedPages);
+ }
transformedPages = transformations[i].transform(transformedPages);
- lastTransformation = transformations[i];
contentView = transformations[i].contentView();
+ lastTransformation = transformations[i];
}
var pageComponents;
diff --git a/src/javascript/components/PdfPageView.jsx b/src/javascript/components/PdfPageView.jsx
index 5d9e11e..f61c24d 100644
--- a/src/javascript/components/PdfPageView.jsx
+++ b/src/javascript/components/PdfPageView.jsx
@@ -34,10 +34,15 @@ export default class PdfPageView extends React.Component {
Height
|
+
+ Annotation
+ |
- { this.props.pdfPage.textItems.map((textItem, i) =>
+ { this.props.pdfPage.textItems.map((textItem, i) =>
{ i }
|
@@ -56,6 +61,9 @@ export default class PdfPageView extends React.Component {
{ textItem.height }
|
+
+ { textItem.annotation ? textItem.annotation.category : '' }
+ |
) }
diff --git a/src/javascript/models/Annotation.jsx b/src/javascript/models/Annotation.jsx
new file mode 100644
index 0000000..65eb890
--- /dev/null
+++ b/src/javascript/models/Annotation.jsx
@@ -0,0 +1,9 @@
+// Annotation for a text item
+export default class Annotation {
+
+ constructor(options) {
+ this.category = options.category;
+ this.color = options.color;
+ }
+
+}
diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx
index 79acc82..ffb0203 100644
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -3,6 +3,7 @@ import { Enum } from 'enumify';
import NoOpTransformation from './transformations/NoOpTransformation.jsx';
import RoundCoordinatesTransformation from './transformations/RoundCoordinatesTransformation.jsx';
import CombineSameYTransformation from './transformations/CombineSameYTransformation.jsx';
+import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import ToTextPagesTransformation from './transformations/ToTextPagesTransformation.jsx';
import ToSingleTextPageTransformation from './transformations/ToSingleTextPageTransformation.jsx'
@@ -14,7 +15,13 @@ export default class AppState {
this.mainView = View.UPLOAD;
this.fileBuffer;
this.pdfPages = [];
- this.transformations = [new NoOpTransformation(), new RoundCoordinatesTransformation(), new CombineSameYTransformation(), new ToTextPagesTransformation(), new ToSingleTextPageTransformation()];
+ this.transformations = [
+ new NoOpTransformation(),
+ new RoundCoordinatesTransformation(),
+ new CombineSameYTransformation(),
+ new RemoveRepetitiveElements(),
+ new ToTextPagesTransformation(),
+ new ToSingleTextPageTransformation()];
//bind functions
this.render = this.render.bind(this);
diff --git a/src/javascript/models/TextItem.jsx b/src/javascript/models/TextItem.jsx
index 1a83a2e..c1765d4 100644
--- a/src/javascript/models/TextItem.jsx
+++ b/src/javascript/models/TextItem.jsx
@@ -1,4 +1,4 @@
-//Holds individual text items of a page
+//A text item, i.e. a line, within a page
export default class TextItem {
constructor(options) {
@@ -7,6 +7,7 @@ export default class TextItem {
this.width = options.width;
this.height = options.height;
this.text = options.text;
+ this.annotation = options.annotation;
}
}
diff --git a/src/javascript/models/transformations/CombineSameYTransformation.jsx b/src/javascript/models/transformations/CombineSameYTransformation.jsx
index 941f695..5f41af6 100644
--- a/src/javascript/models/transformations/CombineSameYTransformation.jsx
+++ b/src/javascript/models/transformations/CombineSameYTransformation.jsx
@@ -6,7 +6,7 @@ import ContentView from '../ContentView.jsx';
export default class CombineSameYTransformation extends Transformation {
constructor() {
- super("Combine text on same Y");
+ super("Combine Text On Same Y");
}
contentView() {
diff --git a/src/javascript/models/transformations/RemoveRepetitiveElements.jsx b/src/javascript/models/transformations/RemoveRepetitiveElements.jsx
new file mode 100644
index 0000000..f341f38
--- /dev/null
+++ b/src/javascript/models/transformations/RemoveRepetitiveElements.jsx
@@ -0,0 +1,70 @@
+import Transformation from './Transformation.jsx';
+import Annotation from '../Annotation.jsx';
+import PdfPage from '../PdfPage.jsx';
+import ContentView from '../ContentView.jsx';
+
+
+function hashCodeIgnoringNumbers(string) {
+ var hash = 0, i, charCode, len, isNumber;
+ if (string.length === 0) return hash;
+ for (i = 0, len = string.length; i < len; i++) {
+ charCode = string.charCodeAt(i);
+ isNumber = charCode >= 48 && charCode <= 57;
+ if (!isNumber) {
+ hash = ((hash << 5) - hash) + charCode;
+ hash |= 0; // Convert to 32bit integer
+ }
+ }
+ return hash;
+}
+
+function combineCoordinates(textItem) {
+ var hashCode = hashCodeIgnoringNumbers(textItem.text);
+ return `${textItem.x}-${textItem.y}-${textItem.width}-${textItem.height}-${hashCode}`;
+}
+
+// Remove elements with similar content at the same page positions, like page numbers, license information, etc.
+export default class RemoveRepetitiveElements extends Transformation {
+
+ constructor() {
+ super("Remove Repetitive Elements");
+ }
+
+ contentView() {
+ return ContentView.PDF;
+ }
+
+ transform(pages:PdfPage[]) {
+ //build repetition counts for every element
+ var repetitionCounts = {};
+ pages.forEach(pdfPage => {
+ pdfPage.textItems.forEach(textItem => {
+ var combinedCoordinates = combineCoordinates(textItem);
+ repetitionCounts[combinedCoordinates] = repetitionCounts[combinedCoordinates] ? repetitionCounts[combinedCoordinates] + 1 : 1;
+ });
+ });
+
+ // annotate elements with repetition as removed
+ pages.forEach(pdfPage => {
+ pdfPage.textItems.forEach(textItem => {
+ var combinedCoordinates = combineCoordinates(textItem);
+ if (repetitionCounts[combinedCoordinates] > 1) {
+ // console.debug("page " + pdfPage.index + " removed :" + repetitionCounts[combinedCoordinates] + " :" + textItem.text);
+ textItem.annotation = new Annotation({
+ category: 'removed',
+ color: 'red'
+ });
+ }
+ });
+ });
+ return pages;
+ }
+
+ processAnnotations(pages:PdfPage[]) {
+ pages.forEach(page => {
+ page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
+ });
+ return pages;
+ }
+
+}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/RoundCoordinatesTransformation.jsx b/src/javascript/models/transformations/RoundCoordinatesTransformation.jsx
index fa30a7a..f4ec82e 100644
--- a/src/javascript/models/transformations/RoundCoordinatesTransformation.jsx
+++ b/src/javascript/models/transformations/RoundCoordinatesTransformation.jsx
@@ -5,7 +5,7 @@ import ContentView from '../ContentView.jsx';
export default class RoundCoordinatesTransformation extends Transformation {
constructor() {
- super("Round coordinates");
+ super("Round Coordinates");
}
contentView() {
diff --git a/src/javascript/models/transformations/Transformation.jsx b/src/javascript/models/transformations/Transformation.jsx
index 4b0b8d4..4173066 100644
--- a/src/javascript/models/transformations/Transformation.jsx
+++ b/src/javascript/models/transformations/Transformation.jsx
@@ -25,5 +25,10 @@ export default class Transformation {
throw new TypeError("Do not call abstract method foo from child.");
}
+ // Annotations which have been added during transform() can now be cleaned up / handled
+ processAnnotations(pages) { // eslint-disable-line no-unused-vars
+ return pages;
+ }
+
}
\ No newline at end of file