mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-25 09:13:49 +01:00
Remove repetitive items (page header/footer)
This commit is contained in:
parent
e43cf9a6a9
commit
08739c0884
@ -61,9 +61,12 @@ export default class DebugView extends React.Component {
|
|||||||
var contentView;
|
var contentView;
|
||||||
var lastTransformation;
|
var lastTransformation;
|
||||||
for (var i = 0; i <= currentTransformation; i++) {
|
for (var i = 0; i <= currentTransformation; i++) {
|
||||||
|
if (lastTransformation) {
|
||||||
|
transformedPages = lastTransformation.processAnnotations(transformedPages);
|
||||||
|
}
|
||||||
transformedPages = transformations[i].transform(transformedPages);
|
transformedPages = transformations[i].transform(transformedPages);
|
||||||
lastTransformation = transformations[i];
|
|
||||||
contentView = transformations[i].contentView();
|
contentView = transformations[i].contentView();
|
||||||
|
lastTransformation = transformations[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
var pageComponents;
|
var pageComponents;
|
||||||
|
@ -34,10 +34,15 @@ export default class PdfPageView extends React.Component {
|
|||||||
<th>
|
<th>
|
||||||
Height
|
Height
|
||||||
</th>
|
</th>
|
||||||
|
<th>
|
||||||
|
Annotation
|
||||||
|
</th>
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
{ this.props.pdfPage.textItems.map((textItem, i) => <tr key={ i }>
|
{ this.props.pdfPage.textItems.map((textItem, i) => <tr key={ i } style={ textItem.annotation ? {
|
||||||
|
color: textItem.annotation.color
|
||||||
|
} : null }>
|
||||||
<td>
|
<td>
|
||||||
{ i }
|
{ i }
|
||||||
</td>
|
</td>
|
||||||
@ -56,6 +61,9 @@ export default class PdfPageView extends React.Component {
|
|||||||
<td>
|
<td>
|
||||||
{ textItem.height }
|
{ textItem.height }
|
||||||
</td>
|
</td>
|
||||||
|
<td>
|
||||||
|
{ textItem.annotation ? textItem.annotation.category : '' }
|
||||||
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
) }
|
) }
|
||||||
</tbody>
|
</tbody>
|
||||||
|
9
src/javascript/models/Annotation.jsx
Normal file
9
src/javascript/models/Annotation.jsx
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
// Annotation for a text item.
// Plain value holder: `category` names what the annotation means (e.g. 'removed')
// and `color` is the color associated with it.
export default class Annotation {

    // options: object providing `category` and `color`; both are copied onto the instance as-is.
    constructor(options) {
        const { category, color } = options;
        this.category = category;
        this.color = color;
    }

}
|
@ -3,6 +3,7 @@ import { Enum } from 'enumify';
|
|||||||
import NoOpTransformation from './transformations/NoOpTransformation.jsx';
|
import NoOpTransformation from './transformations/NoOpTransformation.jsx';
|
||||||
import RoundCoordinatesTransformation from './transformations/RoundCoordinatesTransformation.jsx';
|
import RoundCoordinatesTransformation from './transformations/RoundCoordinatesTransformation.jsx';
|
||||||
import CombineSameYTransformation from './transformations/CombineSameYTransformation.jsx';
|
import CombineSameYTransformation from './transformations/CombineSameYTransformation.jsx';
|
||||||
|
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||||
import ToTextPagesTransformation from './transformations/ToTextPagesTransformation.jsx';
|
import ToTextPagesTransformation from './transformations/ToTextPagesTransformation.jsx';
|
||||||
import ToSingleTextPageTransformation from './transformations/ToSingleTextPageTransformation.jsx'
|
import ToSingleTextPageTransformation from './transformations/ToSingleTextPageTransformation.jsx'
|
||||||
|
|
||||||
@ -14,7 +15,13 @@ export default class AppState {
|
|||||||
this.mainView = View.UPLOAD;
|
this.mainView = View.UPLOAD;
|
||||||
this.fileBuffer;
|
this.fileBuffer;
|
||||||
this.pdfPages = [];
|
this.pdfPages = [];
|
||||||
this.transformations = [new NoOpTransformation(), new RoundCoordinatesTransformation(), new CombineSameYTransformation(), new ToTextPagesTransformation(), new ToSingleTextPageTransformation()];
|
this.transformations = [
|
||||||
|
new NoOpTransformation(),
|
||||||
|
new RoundCoordinatesTransformation(),
|
||||||
|
new CombineSameYTransformation(),
|
||||||
|
new RemoveRepetitiveElements(),
|
||||||
|
new ToTextPagesTransformation(),
|
||||||
|
new ToSingleTextPageTransformation()];
|
||||||
|
|
||||||
//bind functions
|
//bind functions
|
||||||
this.render = this.render.bind(this);
|
this.render = this.render.bind(this);
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
//Holds individual text items of a page
|
//A text item, i.e. a line, within a page
|
||||||
export default class TextItem {
|
export default class TextItem {
|
||||||
|
|
||||||
constructor(options) {
|
constructor(options) {
|
||||||
@ -7,6 +7,7 @@ export default class TextItem {
|
|||||||
this.width = options.width;
|
this.width = options.width;
|
||||||
this.height = options.height;
|
this.height = options.height;
|
||||||
this.text = options.text;
|
this.text = options.text;
|
||||||
|
this.annotation = options.annotation;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -6,7 +6,7 @@ import ContentView from '../ContentView.jsx';
|
|||||||
export default class CombineSameYTransformation extends Transformation {
|
export default class CombineSameYTransformation extends Transformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Combine text on same Y");
|
super("Combine Text On Same Y");
|
||||||
}
|
}
|
||||||
|
|
||||||
contentView() {
|
contentView() {
|
||||||
|
@ -0,0 +1,70 @@
|
|||||||
|
import Transformation from './Transformation.jsx';
|
||||||
|
import Annotation from '../Annotation.jsx';
|
||||||
|
import PdfPage from '../PdfPage.jsx';
|
||||||
|
import ContentView from '../ContentView.jsx';
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Computes a 32-bit integer hash of a string while skipping ASCII digits ('0'-'9'),
 * so texts that differ only in their numbers (e.g. "Page 1" / "Page 27") hash equally.
 *
 * @param string the text to hash; null/undefined is treated like an empty string
 * @returns the hash as a signed 32-bit integer; 0 for empty, missing or digit-only text
 */
function hashCodeIgnoringNumbers(string) {
    let hash = 0;
    // A text item without text must not abort the whole transformation.
    if (string == null) {
        return hash;
    }
    const length = string.length;
    for (let index = 0; index < length; index++) {
        const charCode = string.charCodeAt(index);
        const isDigit = charCode >= 48 && charCode <= 57; // '0'..'9'
        if (isDigit) {
            continue;
        }
        // hash * 31 + charCode, written with a shift
        hash = ((hash << 5) - hash) + charCode;
        hash |= 0; // Convert to 32bit integer
    }
    return hash;
}
|
||||||
|
|
||||||
|
/**
 * Builds the key used to recognise a text item again: its x, y, width and height
 * plus the digit-insensitive hash of its text, joined with '-'.
 */
function combineCoordinates(textItem) {
    const textHash = hashCodeIgnoringNumbers(textItem.text);
    const parts = [textItem.x, textItem.y, textItem.width, textItem.height, textHash];
    return parts.map(part => `${part}`).join('-');
}
|
||||||
|
|
||||||
|
// Remove elements with similar content on same page positions, like page numbers, license information, etc...
//
// transform() only annotates such elements as 'removed'; they are actually dropped
// from the pages in processAnnotations().
export default class RemoveRepetitiveElements extends Transformation {

    constructor() {
        super("Remove Repetitive Elements");
    }

    contentView() {
        return ContentView.PDF;
    }

    // Annotates every text item whose position/content key (see combineCoordinates)
    // occurs on more than one page. The given pages are modified in place and returned.
    transform(pages:PdfPage[]) {
        // Count on how many distinct pages each key occurs. Counting pages rather than
        // occurrences keeps an item that is merely duplicated within a single page
        // from being mistaken for a repeating header/footer.
        const pageCounts = new Map();
        pages.forEach(pdfPage => {
            const keysOnPage = new Set(pdfPage.textItems.map(textItem => combineCoordinates(textItem)));
            keysOnPage.forEach(key => {
                pageCounts.set(key, (pageCounts.get(key) || 0) + 1);
            });
        });

        // annotate elements with repetition as removed
        pages.forEach(pdfPage => {
            pdfPage.textItems.forEach(textItem => {
                if (pageCounts.get(combineCoordinates(textItem)) > 1) {
                    textItem.annotation = new Annotation({
                        category: 'removed',
                        color: 'red'
                    });
                }
            });
        });
        return pages;
    }

    // Drops all text items that carry a 'removed' annotation. The pages are modified in place and returned.
    processAnnotations(pages:PdfPage[]) {
        pages.forEach(page => {
            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
        });
        return pages;
    }

}
|
@ -5,7 +5,7 @@ import ContentView from '../ContentView.jsx';
|
|||||||
export default class RoundCoordinatesTransformation extends Transformation {
|
export default class RoundCoordinatesTransformation extends Transformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Round coordinates");
|
super("Round Coordinates");
|
||||||
}
|
}
|
||||||
|
|
||||||
contentView() {
|
contentView() {
|
||||||
|
@ -25,5 +25,10 @@ export default class Transformation {
|
|||||||
throw new TypeError("Do not call abstract method foo from child.");
|
throw new TypeError("Do not call abstract method foo from child.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Annotations which have been added during transform() can now be cleaned-up / handled.
// This base implementation does nothing and returns the given pages unchanged;
// a transformation that annotates text items can override it to act on those annotations.
|
||||||
|
processAnnotations(pages) { // eslint-disable-line no-unused-vars
|
||||||
|
return pages;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user