mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-24 06:29:06 +01:00
Remove repetitive items (page header/footer)
This commit is contained in:
parent
e43cf9a6a9
commit
08739c0884
@ -61,9 +61,12 @@ export default class DebugView extends React.Component {
|
||||
var contentView;
|
||||
var lastTransformation;
|
||||
for (var i = 0; i <= currentTransformation; i++) {
|
||||
if (lastTransformation) {
|
||||
transformedPages = lastTransformation.processAnnotations(transformedPages);
|
||||
}
|
||||
transformedPages = transformations[i].transform(transformedPages);
|
||||
lastTransformation = transformations[i];
|
||||
contentView = transformations[i].contentView();
|
||||
lastTransformation = transformations[i];
|
||||
}
|
||||
|
||||
var pageComponents;
|
||||
|
@ -34,10 +34,15 @@ export default class PdfPageView extends React.Component {
|
||||
<th>
|
||||
Height
|
||||
</th>
|
||||
<th>
|
||||
Annotation
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{ this.props.pdfPage.textItems.map((textItem, i) => <tr key={ i }>
|
||||
{ this.props.pdfPage.textItems.map((textItem, i) => <tr key={ i } style={ textItem.annotation ? {
|
||||
color: textItem.annotation.color
|
||||
} : null }>
|
||||
<td>
|
||||
{ i }
|
||||
</td>
|
||||
@ -56,6 +61,9 @@ export default class PdfPageView extends React.Component {
|
||||
<td>
|
||||
{ textItem.height }
|
||||
</td>
|
||||
<td>
|
||||
{ textItem.annotation ? textItem.annotation.category : '' }
|
||||
</td>
|
||||
</tr>
|
||||
) }
|
||||
</tbody>
|
||||
|
9
src/javascript/models/Annotation.jsx
Normal file
9
src/javascript/models/Annotation.jsx
Normal file
@ -0,0 +1,9 @@
|
||||
// Annotation for a text item
|
||||
export default class Annotation {
|
||||
|
||||
constructor(options) {
|
||||
this.category = options.category;
|
||||
this.color = options.color;
|
||||
}
|
||||
|
||||
}
|
@ -3,6 +3,7 @@ import { Enum } from 'enumify';
|
||||
import NoOpTransformation from './transformations/NoOpTransformation.jsx';
|
||||
import RoundCoordinatesTransformation from './transformations/RoundCoordinatesTransformation.jsx';
|
||||
import CombineSameYTransformation from './transformations/CombineSameYTransformation.jsx';
|
||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||
import ToTextPagesTransformation from './transformations/ToTextPagesTransformation.jsx';
|
||||
import ToSingleTextPageTransformation from './transformations/ToSingleTextPageTransformation.jsx'
|
||||
|
||||
@ -14,7 +15,13 @@ export default class AppState {
|
||||
this.mainView = View.UPLOAD;
|
||||
this.fileBuffer;
|
||||
this.pdfPages = [];
|
||||
this.transformations = [new NoOpTransformation(), new RoundCoordinatesTransformation(), new CombineSameYTransformation(), new ToTextPagesTransformation(), new ToSingleTextPageTransformation()];
|
||||
this.transformations = [
|
||||
new NoOpTransformation(),
|
||||
new RoundCoordinatesTransformation(),
|
||||
new CombineSameYTransformation(),
|
||||
new RemoveRepetitiveElements(),
|
||||
new ToTextPagesTransformation(),
|
||||
new ToSingleTextPageTransformation()];
|
||||
|
||||
//bind functions
|
||||
this.render = this.render.bind(this);
|
||||
|
@ -1,4 +1,4 @@
|
||||
//Holds individual text items of a page
|
||||
//A text item, i.e. a line, within a page
|
||||
export default class TextItem {
|
||||
|
||||
constructor(options) {
|
||||
@ -7,6 +7,7 @@ export default class TextItem {
|
||||
this.width = options.width;
|
||||
this.height = options.height;
|
||||
this.text = options.text;
|
||||
this.annotation = options.annotation;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -6,7 +6,7 @@ import ContentView from '../ContentView.jsx';
|
||||
export default class CombineSameYTransformation extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("Combine text on same Y");
|
||||
super("Combine Text On Same Y");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
|
@ -0,0 +1,70 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import Annotation from '../Annotation.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
|
||||
/**
 * Computes a 32-bit integer hash of a string while skipping every ASCII digit
 * ('0'..'9'), so that texts differing only in their digits hash to the same value.
 *
 * @param string the text to hash
 * @returns the signed 32-bit hash; 0 when there is nothing but digits (or nothing at all)
 */
function hashCodeIgnoringNumbers(string) {
    const DIGIT_ZERO = 48; // char code of '0'
    const DIGIT_NINE = 57; // char code of '9'

    let result = 0;
    let position = 0;
    const total = string.length;
    while (position < total) {
        const code = string.charCodeAt(position);
        position += 1;
        if (code >= DIGIT_ZERO && code <= DIGIT_NINE) {
            continue; // digits do not contribute to the hash
        }
        // hash * 31 + code, truncated to a signed 32-bit integer
        result = (result * 31 + code) | 0;
    }
    return result;
}
|
||||
|
||||
/**
 * Builds the lookup key of a text item: its x, y, width and height plus the
 * digit-insensitive hash of its text, joined with '-'.
 *
 * @param textItem the item whose position, size and text form the key
 * @returns the key as a string
 */
function combineCoordinates(textItem) {
    const textHash = hashCodeIgnoringNumbers(textItem.text);
    const parts = [textItem.x, textItem.y, textItem.width, textItem.height, textHash];
    return parts.map(part => `${part}`).join('-');
}
|
||||
|
||||
// Remove elements with similar content on same page positions, like page numbers, license information, etc...
|
||||
export default class RemoveRepetitiveElements extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("Remove Repetitive Elements");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
//build repetition counts for every element
|
||||
var repetitionCounts = {};
|
||||
pages.forEach(pdfPage => {
|
||||
pdfPage.textItems.forEach(textItem => {
|
||||
var combinedCoordinates = combineCoordinates(textItem);
|
||||
repetitionCounts[combinedCoordinates] = repetitionCounts[combinedCoordinates] ? repetitionCounts[combinedCoordinates] + 1 : 1;
|
||||
});
|
||||
});
|
||||
|
||||
// annotate elements with repetition as removed
|
||||
pages.forEach(pdfPage => {
|
||||
pdfPage.textItems.forEach(textItem => {
|
||||
var combinedCoordinates = combineCoordinates(textItem);
|
||||
if (repetitionCounts[combinedCoordinates] > 1) {
|
||||
// console.debug("page " + pdfPage.index + " removed :" + repetitionCounts[combinedCoordinates] + " :" + textItem.text);
|
||||
textItem.annotation = new Annotation({
|
||||
category: 'removed',
|
||||
color: 'red'
|
||||
});
|
||||
}
|
||||
});
|
||||
});
|
||||
return pages;
|
||||
}
|
||||
|
||||
processAnnotations(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||
});
|
||||
return pages;
|
||||
}
|
||||
|
||||
}
|
@ -5,7 +5,7 @@ import ContentView from '../ContentView.jsx';
|
||||
export default class RoundCoordinatesTransformation extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("Round coordinates");
|
||||
super("Round Coordinates");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
|
@ -25,5 +25,10 @@ export default class Transformation {
|
||||
throw new TypeError("Do not call abstract method foo from child.");
|
||||
}
|
||||
|
||||
// Annotations which have been added during transform() can now be cleaned-up / handled
|
||||
// Base implementation: returns `pages` exactly as received.
processAnnotations(pages) { // eslint-disable-line no-unused-vars
|
||||
return pages;
|
||||
}
|
||||
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user