Remove repetitive items (page header/footer)

This commit is contained in:
Johannes Zillmann 2017-01-27 21:40:49 +01:00
parent e43cf9a6a9
commit 08739c0884
9 changed files with 109 additions and 6 deletions

View File

@ -61,9 +61,12 @@ export default class DebugView extends React.Component {
var contentView; var contentView;
var lastTransformation; var lastTransformation;
for (var i = 0; i <= currentTransformation; i++) { for (var i = 0; i <= currentTransformation; i++) {
if (lastTransformation) {
transformedPages = lastTransformation.processAnnotations(transformedPages);
}
transformedPages = transformations[i].transform(transformedPages); transformedPages = transformations[i].transform(transformedPages);
lastTransformation = transformations[i];
contentView = transformations[i].contentView(); contentView = transformations[i].contentView();
lastTransformation = transformations[i];
} }
var pageComponents; var pageComponents;

View File

@ -34,10 +34,15 @@ export default class PdfPageView extends React.Component {
<th> <th>
Height Height
</th> </th>
<th>
Annotation
</th>
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
{ this.props.pdfPage.textItems.map((textItem, i) => <tr key={ i }> { this.props.pdfPage.textItems.map((textItem, i) => <tr key={ i } style={ textItem.annotation ? {
color: textItem.annotation.color
} : null }>
<td> <td>
{ i } { i }
</td> </td>
@ -56,6 +61,9 @@ export default class PdfPageView extends React.Component {
<td> <td>
{ textItem.height } { textItem.height }
</td> </td>
<td>
{ textItem.annotation ? textItem.annotation.category : '' }
</td>
</tr> </tr>
) } ) }
</tbody> </tbody>

View File

@ -0,0 +1,9 @@
// Marker that a transformation attaches to a text item.
// `category` names what the marker means (e.g. 'removed') and `color`
// is the color the debug table uses when rendering the annotated row.
export default class Annotation {

    // options: object providing the `category` and `color` of the annotation.
    constructor(options) {
        const { category, color } = options;
        this.category = category;
        this.color = color;
    }

}

View File

@ -3,6 +3,7 @@ import { Enum } from 'enumify';
import NoOpTransformation from './transformations/NoOpTransformation.jsx'; import NoOpTransformation from './transformations/NoOpTransformation.jsx';
import RoundCoordinatesTransformation from './transformations/RoundCoordinatesTransformation.jsx'; import RoundCoordinatesTransformation from './transformations/RoundCoordinatesTransformation.jsx';
import CombineSameYTransformation from './transformations/CombineSameYTransformation.jsx'; import CombineSameYTransformation from './transformations/CombineSameYTransformation.jsx';
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import ToTextPagesTransformation from './transformations/ToTextPagesTransformation.jsx'; import ToTextPagesTransformation from './transformations/ToTextPagesTransformation.jsx';
import ToSingleTextPageTransformation from './transformations/ToSingleTextPageTransformation.jsx' import ToSingleTextPageTransformation from './transformations/ToSingleTextPageTransformation.jsx'
@ -14,7 +15,13 @@ export default class AppState {
this.mainView = View.UPLOAD; this.mainView = View.UPLOAD;
this.fileBuffer; this.fileBuffer;
this.pdfPages = []; this.pdfPages = [];
this.transformations = [new NoOpTransformation(), new RoundCoordinatesTransformation(), new CombineSameYTransformation(), new ToTextPagesTransformation(), new ToSingleTextPageTransformation()]; this.transformations = [
new NoOpTransformation(),
new RoundCoordinatesTransformation(),
new CombineSameYTransformation(),
new RemoveRepetitiveElements(),
new ToTextPagesTransformation(),
new ToSingleTextPageTransformation()];
//bind functions //bind functions
this.render = this.render.bind(this); this.render = this.render.bind(this);

View File

@ -1,4 +1,4 @@
//Holds individual text items of a page //A text item, i.e. a line, within a page
export default class TextItem { export default class TextItem {
constructor(options) { constructor(options) {
@ -7,6 +7,7 @@ export default class TextItem {
this.width = options.width; this.width = options.width;
this.height = options.height; this.height = options.height;
this.text = options.text; this.text = options.text;
this.annotation = options.annotation;
} }
} }

View File

@ -6,7 +6,7 @@ import ContentView from '../ContentView.jsx';
export default class CombineSameYTransformation extends Transformation { export default class CombineSameYTransformation extends Transformation {
constructor() { constructor() {
super("Combine text on same Y"); super("Combine Text On Same Y");
} }
contentView() { contentView() {

View File

@ -0,0 +1,70 @@
import Transformation from './Transformation.jsx';
import Annotation from '../Annotation.jsx';
import PdfPage from '../PdfPage.jsx';
import ContentView from '../ContentView.jsx';
// Computes a 32-bit integer hash of the given string while skipping the
// ASCII digits '0'-'9', so strings that differ only in their digits
// (e.g. "Page 1" and "Page 2") produce the same hash.
// Returns 0 for an empty string or a string made up of digits only.
function hashCodeIgnoringNumbers(string) {
    const DIGIT_ZERO = 48; // char code of '0'
    const DIGIT_NINE = 57; // char code of '9'
    let hash = 0;
    for (let position = 0; position < string.length; position++) {
        const code = string.charCodeAt(position);
        if (code >= DIGIT_ZERO && code <= DIGIT_NINE) {
            continue;
        }
        // hash * 31 + code, truncated to a signed 32-bit integer
        hash = (((hash << 5) - hash) + code) | 0;
    }
    return hash;
}
// Builds a string key for a text item from its x, y, width and height plus
// the digit-insensitive hash of its text. Items with equal keys sit at the
// same position with the same size and have text differing at most in digits.
function combineCoordinates(textItem) {
    const textHash = hashCodeIgnoringNumbers(textItem.text);
    const { x, y, width, height } = textItem;
    return `${x}-${y}-${width}-${height}-${textHash}`;
}
// Category assigned to text items which transform() flags for removal.
const REMOVED_CATEGORY = 'removed';

// Remove elements with similar content on the same page positions across
// several pages, like page numbers, license information, etc.
//
// transform() only annotates the repetitive items (so they can be displayed
// as flagged); processAnnotations() then drops the flagged items.
export default class RemoveRepetitiveElements extends Transformation {

    constructor() {
        super("Remove Repetitive Elements");
    }

    // Reports ContentView.PDF as the content view for this transformation.
    contentView() {
        return ContentView.PDF;
    }

    // Annotates every text item whose position/size/text key (see
    // combineCoordinates) occurs on more than one page with a red
    // 'removed' annotation. The given pages are annotated in place and returned.
    transform(pages:PdfPage[]) {
        // Compute each item's key exactly once, keeping the page/item layout
        // so the second pass can look keys up by index.
        const keysByPage = pages.map(pdfPage => pdfPage.textItems.map(textItem => combineCoordinates(textItem)));

        // Count on how many distinct pages each key occurs. Deduplicating per
        // page keeps items that are merely duplicated within a single page
        // from being mistaken for a repeating header/footer.
        const pageCountByKey = new Map();
        keysByPage.forEach(pageKeys => {
            new Set(pageKeys).forEach(key => {
                pageCountByKey.set(key, (pageCountByKey.get(key) || 0) + 1);
            });
        });

        // Annotate elements which repeat across pages as removed.
        pages.forEach((pdfPage, pageIndex) => {
            pdfPage.textItems.forEach((textItem, itemIndex) => {
                const key = keysByPage[pageIndex][itemIndex];
                if (pageCountByKey.get(key) > 1) {
                    textItem.annotation = new Annotation({
                        category: REMOVED_CATEGORY,
                        color: 'red'
                    });
                }
            });
        });
        return pages;
    }

    // Drops every text item carrying a 'removed' annotation from the given
    // pages (each page's textItems array is replaced) and returns the pages.
    processAnnotations(pages:PdfPage[]) {
        pages.forEach(page => {
            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== REMOVED_CATEGORY);
        });
        return pages;
    }

}

View File

@ -5,7 +5,7 @@ import ContentView from '../ContentView.jsx';
export default class RoundCoordinatesTransformation extends Transformation { export default class RoundCoordinatesTransformation extends Transformation {
constructor() { constructor() {
super("Round coordinates"); super("Round Coordinates");
} }
contentView() { contentView() {

View File

@ -25,5 +25,10 @@ export default class Transformation {
throw new TypeError("Do not call abstract method foo from child."); throw new TypeError("Do not call abstract method foo from child.");
} }
// Annotations which have been added during transform() can now be cleaned-up / handled.
// The debug view calls this on the previously applied transformation, right before the
// next transformation's transform() runs.
// The default implementation returns the given pages unchanged; subclasses override it
// when they annotate items (e.g. to drop items annotated as removed).
processAnnotations(pages) { // eslint-disable-line no-unused-vars
return pages;
}
} }