Pimp up transformation pipeline with ParseResult object

This commit is contained in:
Johannes Zillmann 2017-02-15 07:03:44 +01:00
parent c08105ecaf
commit 55506576f5
16 changed files with 136 additions and 93 deletions

View File

@ -9,6 +9,8 @@ import MenuItem from 'react-bootstrap/lib/MenuItem'
import Label from 'react-bootstrap/lib/Label'
import Checkbox from 'react-bootstrap/lib/Checkbox'
import ParseResult from '../models/ParseResult.jsx';
// A view which displays the content of the given pages transformed by the given transformations
export default class DebugView extends React.Component {
@ -63,18 +65,20 @@ export default class DebugView extends React.Component {
const currentTransformationName = transformations[currentTransformation].name;
var transformedPages = pdfPages;
var parseResult = new ParseResult({
content: pdfPages
});
var lastTransformation;
for (var i = 0; i <= currentTransformation; i++) {
if (lastTransformation) {
transformedPages = lastTransformation.processAnnotations(transformedPages);
parseResult = lastTransformation.completeTransform(parseResult);
}
transformedPages = transformations[i].transform(transformedPages);
parseResult = transformations[i].transform(parseResult);
lastTransformation = transformations[i];
}
transformedPages = transformedPages.filter((elem, i) => pageNr == -1 || i == pageNr);
const pageComponents = transformedPages.map(page => lastTransformation.createPageView(page, this.state.modificationsOnly));
parseResult.content = parseResult.content.filter((elem, i) => pageNr == -1 || i == pageNr);
const pageComponents = parseResult.content.map(page => lastTransformation.createPageView(page, this.state.modificationsOnly));
const showModificationCheckbox = lastTransformation.showModificationCheckbox();
return (

View File

@ -5,6 +5,8 @@ import ButtonToolbar from 'react-bootstrap/lib/ButtonToolbar'
import ButtonGroup from 'react-bootstrap/lib/ButtonGroup'
import Button from 'react-bootstrap/lib/Button'
import ParseResult from '../models/ParseResult.jsx';
export default class ResultView extends React.Component {
static propTypes = {
@ -18,19 +20,21 @@ export default class ResultView extends React.Component {
componentWillMount() {
const {pdfPages, transformations} = this.props;
var transformedPages = pdfPages;
var parseResult = new ParseResult({
content: pdfPages
});
var lastTransformation;
transformations.forEach(transformation => {
if (lastTransformation) {
transformedPages = lastTransformation.processAnnotations(transformedPages);
parseResult = lastTransformation.completeTransform(parseResult);
}
transformedPages = transformation.transform(transformedPages);
parseResult = transformation.transform(parseResult);
lastTransformation = transformation;
});
this.state = {
preview: true,
text: transformedPages[0].text
text: parseResult.content[0].text
};
}

View File

@ -0,0 +1,10 @@
// The result of a PDF parse or of a single Transformation step.
// Plain value holder: the constructor copies the three known fields from the
// given options bag and ignores everything else.
export default class ParseResult {

    // options: optional object with `content`, `summary` and `globals`.
    // A missing (undefined/null) options bag is tolerated and leaves all
    // three fields undefined instead of throwing a TypeError.
    constructor(options) {
        const { content, summary, globals } = options || {};
        this.content = content; // like PdfPages[]
        this.summary = summary; // something to show only for the transformation
        this.globals = globals; // properties accessible for the following transformations
    }

}

View File

@ -1,6 +1,6 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
function combineTextItems(textItems:TextItem[]) {
@ -46,9 +46,8 @@ export default class CombineSameY extends ToPdfViewTransformation {
super("Combine Text On Same Y");
}
transform(pages:PdfPage[]) {
return pages.map(pdfPage => {
transform(parseResult:ParseResult) {
const newContent = parseResult.content.map(pdfPage => {
const newTextItems = [];
var textItemsWithSameY = [];
@ -84,14 +83,19 @@ export default class CombineSameY extends ToPdfViewTransformation {
textItems: newTextItems
};
});
return new ParseResult({
...parseResult,
content: newContent
});
}
processAnnotations(pages:PdfPage[]) {
pages.forEach(page => {
completeTransform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
page.textItems.forEach(textItem => textItem.annotation = null)
});
return pages;
return parseResult;
}
}

View File

@ -1,6 +1,6 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
import { isNumber } from '../../functions.jsx'
@ -11,12 +11,13 @@ export default class DetectFootnotes extends ToPdfViewTransformation {
super("Detect Footnotes");
}
transform(pages:PdfPage[]) {
transform(parseResult:ParseResult) {
var nextFooterNumber = 1;
var potentialFootnoteItem;
var foundFootnotes = 0;
return pages.map(page => {
const newContent = parseResult.content.map(page => {
const newTextItems = [];
for (var i = 0; i < page.textItems.length; i++) {
const item = page.textItems[i];
@ -36,6 +37,7 @@ export default class DetectFootnotes extends ToPdfViewTransformation {
}));
//TODO respect multiline!!
nextFooterNumber++;
foundFootnotes++;
}
potentialFootnoteItem = null;
} else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
@ -49,14 +51,21 @@ export default class DetectFootnotes extends ToPdfViewTransformation {
textItems: newTextItems
};
});
return new ParseResult({
...parseResult,
content: newContent,
summary: {
footnotes: foundFootnotes
}
});
}
processAnnotations(pages:PdfPage[]) {
pages.forEach(page => {
completeTransform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
page.textItems.forEach(textItem => textItem.annotation = null)
});
return pages;
return parseResult;
}
}

View File

@ -1,6 +1,6 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
@ -10,8 +10,8 @@ export default class DetectLinks extends ToPdfViewTransformation {
super("Detect Links");
}
transform(pages:PdfPage[]) {
pages.forEach(page => {
transform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
const newTextItems = [];
page.textItems.forEach(item => {
newTextItems.push(item);
@ -40,15 +40,15 @@ export default class DetectLinks extends ToPdfViewTransformation {
});
page.textItems = newTextItems;
});
return pages;
return parseResult;
}
processAnnotations(pages:PdfPage[]) {
pages.forEach(page => {
completeTransform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
page.textItems.forEach(textItem => textItem.annotation = null)
});
return pages;
return parseResult;
}
}

View File

@ -1,6 +1,6 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ParseResult from '../ParseResult.jsx';
import Annotation from '../Annotation.jsx';
import Headline from '../markdown/Headline.jsx';
@ -69,14 +69,14 @@ export default class HeadlineDetector extends ToPdfViewTransformation {
// - heights which start a page are likely to be headlines
// - the maximum height is likely a headline
// - heights which occur on more than one page are likely to be headlines
transform(pages:PdfPage[]) {
const heightAnalyzation = analyzeHeigths(pages);
transform(parseResult:ParseResult) {
const heightAnalyzation = analyzeHeigths(parseResult.content);
var paragraphHeight = heightAnalyzation.mostUsedHeight + 1;
// text items that are taller than the paragraph height and sit at the top of a page are likely to be headlines
const likelyHeadingHeights = new Set();
pages.forEach(page => {
parseResult.content.forEach(page => {
page.textItems.forEach(item => {
if (item.height > paragraphHeight && heightAnalyzation.maxYPerPage[page.index] == item.y) {
likelyHeadingHeights.add(item.height);
@ -116,7 +116,7 @@ export default class HeadlineDetector extends ToPdfViewTransformation {
}
}
return pages.map(page => {
const newContent = parseResult.content.map(page => {
const newTextItems = [];
page.textItems.forEach(item => {
if (item.height <= paragraphHeight) {
@ -141,13 +141,18 @@ export default class HeadlineDetector extends ToPdfViewTransformation {
textItems: newTextItems
};
});
return new ParseResult({
...parseResult,
content: newContent,
});
}
processAnnotations(pages:PdfPage[]) {
pages.forEach(page => {
completeTransform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
page.textItems.forEach(textItem => textItem.annotation = null)
});
return pages;
return parseResult;
}
}

View File

@ -1,6 +1,6 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION, UNCHANGED_ANNOTATION } from '../Annotation.jsx';
import { hasUpperCaseCharacterInMiddleOfWord } from '../../functions.jsx'
@ -13,10 +13,8 @@ export default class HeadlineToUppercase extends ToPdfViewTransformation {
super("Headlines Uppercase");
}
transform(pages:PdfPage[]) {
return pages.map(page => {
transform(parseResult:ParseResult) {
const newContent = parseResult.content.map(page => {
const newTextItems = [];
page.textItems.forEach(item => {
if (item.markdownElement && item.markdownElement.constructor.name === 'Headline') {
@ -42,14 +40,19 @@ export default class HeadlineToUppercase extends ToPdfViewTransformation {
textItems: newTextItems
};
});
return new ParseResult({
...parseResult,
content: newContent,
});
}
processAnnotations(pages:PdfPage[]) {
pages.forEach(page => {
completeTransform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
page.textItems.forEach(textItem => textItem.annotation = null)
});
return pages;
return parseResult;
}
}

View File

@ -1,5 +1,5 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import PdfPage from '../PdfPage.jsx';
import ParseResult from '../ParseResult.jsx';
export default class NoOp extends ToPdfViewTransformation {
@ -7,8 +7,8 @@ export default class NoOp extends ToPdfViewTransformation {
super("Original");
}
transform(pdfPages:PdfPage[]) {
return pdfPages;
transform(parseResult:ParseResult) {
return parseResult;
}
}

View File

@ -1,5 +1,5 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import PdfPage from '../PdfPage.jsx';
import ParseResult from '../ParseResult.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
import { isDigit } from '../../functions.jsx'
@ -30,10 +30,10 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
super("Remove Repetitive Elements");
}
transform(pages:PdfPage[]) {
transform(parseResult:ParseResult) {
//build repetition counts for every element
const repetitionCounts = {};
pages.forEach(pdfPage => {
parseResult.content.forEach(pdfPage => {
pdfPage.textItems.forEach(textItem => {
var combinedCoordinates = combineCoordinates(textItem);
repetitionCounts[combinedCoordinates] = repetitionCounts[combinedCoordinates] ? repetitionCounts[combinedCoordinates] + 1 : 1;
@ -41,7 +41,7 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
});
// annotate elements with repetition as removed
pages.forEach(pdfPage => {
parseResult.content.forEach(pdfPage => {
pdfPage.textItems.forEach(textItem => {
var combinedCoordinates = combineCoordinates(textItem);
if (repetitionCounts[combinedCoordinates] > 1) {
@ -50,14 +50,14 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
}
});
});
return pages;
return parseResult;
}
processAnnotations(pages:PdfPage[]) {
pages.forEach(page => {
completeTransform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
});
return pages;
return parseResult;
}
}

View File

@ -1,6 +1,6 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
@ -11,8 +11,8 @@ export default class RemoveWhitespaces extends ToPdfViewTransformation {
this.showWhitespaces = true;
}
transform(pages:PdfPage[]) {
pages.forEach(page => {
transform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
const newTextItems = [];
page.textItems.forEach(item => {
newTextItems.push(item);
@ -37,15 +37,15 @@ export default class RemoveWhitespaces extends ToPdfViewTransformation {
});
page.textItems = newTextItems;
});
return pages;
return parseResult;
}
processAnnotations(pages:PdfPage[]) {
pages.forEach(page => {
completeTransform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
page.textItems.forEach(textItem => textItem.annotation = null)
});
return pages;
return parseResult;
}
}

View File

@ -1,5 +1,5 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import PdfPage from '../PdfPage.jsx';
import ParseResult from '../ParseResult.jsx';
export default class RoundCoordinates extends ToPdfViewTransformation {
@ -7,8 +7,8 @@ export default class RoundCoordinates extends ToPdfViewTransformation {
super("Round Coordinates");
}
transform(pdfPages:PdfPage[]) {
return pdfPages.map(pdfPage => {
transform(parseResult:ParseResult) {
const newContent = parseResult.content.map(pdfPage => {
return {
...pdfPage,
textItems: pdfPage.textItems.map(textItem => {
@ -22,6 +22,10 @@ export default class RoundCoordinates extends ToPdfViewTransformation {
})
};
});
return new ParseResult({
...parseResult,
content: newContent,
});
}
}

View File

@ -1,7 +1,7 @@
import React from 'react';
import Transformation from './Transformation.jsx';
import BlockPageView from '../../components/debug/BlockPageView.jsx';
import PdfPage from '../PdfPage.jsx';
import ParseResult from '../ParseResult.jsx';
import BlockPage from '../BlockPage.jsx';
export default class ToBlockSystem extends Transformation {
@ -14,9 +14,9 @@ export default class ToBlockSystem extends Transformation {
return <BlockPageView key={ page.index } page={ page } />;
}
transform(pages:PdfPage[]) {
transform(parseResult:ParseResult) {
const blocks = [];
pages.forEach(page => {
parseResult.content.forEach(page => {
var minDiff = 99;
var lastY = 0;
page.textItems.forEach(item => {
@ -62,14 +62,13 @@ export default class ToBlockSystem extends Transformation {
});
rollup("Block")
});
return [new BlockPage({
index: 0,
blocks: blocks
})];
}
processAnnotations(pages) {
return pages;
return new ParseResult({
...parseResult,
content: [new BlockPage({
index: 0,
blocks: blocks
})],
});
}
}

View File

@ -1,6 +1,7 @@
import React from 'react';
import MarkdownPageView from '../../components/debug/MarkdownPageView.jsx';
import Transformation from './Transformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextPage from '../TextPage.jsx';
export default class ToMarkdown extends Transformation {
@ -13,17 +14,20 @@ export default class ToMarkdown extends Transformation {
return <MarkdownPageView key={ page.index } page={ page } />;
}
transform(pages:TextPage[]) {
transform(parseResult:ParseResult) {
var text = '';
pages.forEach(page => {
parseResult.content.forEach(page => {
page.blocks.forEach((block) => {
text += block.text + '\n\n';
});
});
return [new TextPage({
index: 0,
text: text
})];
return new ParseResult({
...parseResult,
content: [new TextPage({
index: 0,
text: text
})],
});
}
}

View File

@ -1,6 +1,5 @@
import React from 'react';
import Transformation from './Transformation.jsx';
import PdfPage from '../PdfPage.jsx';
import PdfPageView from '../../components/debug/PdfPageView.jsx';
// Abstract pdfView transformation
@ -30,8 +29,4 @@ export default class ToPdfViewTransformation extends Transformation {
showWhitespaces={ this.showWhitespaces } />;
}
transform(pdfPages:PdfPage[]) {
return pdfPages;
}
}

View File

@ -1,3 +1,5 @@
import ParseResult from '../ParseResult.jsx';
// A transformation from a PdfPage to a PdfPage
export default class Transformation {
@ -23,14 +25,14 @@ export default class Transformation {
throw new TypeError("Do not call abstract method foo from child.");
}
// Transform incoming pages (like PdfPage[]) into different pages (either PdfPages[] or TextPages[])
transform(pages) { // eslint-disable-line no-unused-vars
// Transform an incoming ParseResult into an outgoing ParseResult
transform(parseResult: ParseResult) { // eslint-disable-line no-unused-vars
throw new TypeError("Do not call abstract method foo from child.");
}
// Annotations which have been added during transform() can now be cleaned-up / handled
processAnnotations(pages) { // eslint-disable-line no-unused-vars
return pages;
// Sometimes transform() only visualizes a change. This method then applies the actual change.
completeTransform(parseResult: ParseResult) { // eslint-disable-line no-unused-vars
return parseResult;
}