mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-28 18:53:40 +01:00
Move pageView construction into Transformer
This commit is contained in:
parent
92a4337387
commit
41bc2f6c34
@ -9,11 +9,6 @@ import MenuItem from 'react-bootstrap/lib/MenuItem'
|
||||
import Label from 'react-bootstrap/lib/Label'
|
||||
import Checkbox from 'react-bootstrap/lib/Checkbox'
|
||||
|
||||
import ContentView from '../models/ContentView.jsx';
|
||||
import PdfPageView from './debug/PdfPageView.jsx';
|
||||
import BlockPageView from './debug/BlockPageView.jsx';
|
||||
import MarkdownPageView from './debug/MarkdownPageView.jsx';
|
||||
|
||||
// A view which displays the content of the given pages transformed by the given transformations
|
||||
export default class DebugView extends React.Component {
|
||||
|
||||
@ -69,32 +64,18 @@ export default class DebugView extends React.Component {
|
||||
const currentTransformationName = transformations[currentTransformation].name;
|
||||
|
||||
var transformedPages = pdfPages;
|
||||
var contentView;
|
||||
var lastTransformation;
|
||||
for (var i = 0; i <= currentTransformation; i++) {
|
||||
if (lastTransformation) {
|
||||
transformedPages = lastTransformation.processAnnotations(transformedPages);
|
||||
}
|
||||
transformedPages = transformations[i].transform(transformedPages);
|
||||
contentView = transformations[i].contentView();
|
||||
lastTransformation = transformations[i];
|
||||
}
|
||||
|
||||
transformedPages = transformedPages.filter((elem, i) => pageNr == -1 || i == pageNr);
|
||||
var pageComponents;
|
||||
var showModificationCheckbox = false;
|
||||
switch (contentView) {
|
||||
case ContentView.PDF:
|
||||
pageComponents = transformedPages.map(page => <PdfPageView key={ page.index } pdfPage={ page } modificationsOnly={ this.state.modificationsOnly } />);
|
||||
showModificationCheckbox = true;
|
||||
break;
|
||||
case ContentView.BLOCK:
|
||||
pageComponents = transformedPages.map(page => <BlockPageView key={ page.index } page={ page } />);
|
||||
break;
|
||||
case ContentView.MARKDOWN:
|
||||
pageComponents = transformedPages.map(page => <MarkdownPageView key={ page.index } page={ page } />);
|
||||
break;
|
||||
}
|
||||
const pageComponents = transformedPages.map(page => lastTransformation.createPageView(page, this.state.modificationsOnly));
|
||||
const showModificationCheckbox = lastTransformation.showModificationCheckbox();
|
||||
|
||||
return (
|
||||
<div>
|
||||
|
@ -1,5 +0,0 @@
|
||||
import { Enum } from 'enumify';
|
||||
|
||||
export default class ContentView extends Enum {
|
||||
}
|
||||
ContentView.initEnum(['PDF', 'BLOCK', 'MARKDOWN'])
|
@ -1,7 +1,6 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
function combineTextItems(textItems:TextItem[]) {
|
||||
@ -41,16 +40,12 @@ function combineTextItems(textItems:TextItem[]) {
|
||||
});
|
||||
}
|
||||
|
||||
export default class CombineSameY extends Transformation {
|
||||
export default class CombineSameY extends ToPdfViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Combine Text On Same Y");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
|
||||
return pages.map(pdfPage => {
|
||||
|
@ -1,21 +1,16 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
import { isNumber } from '../../functions.jsx'
|
||||
|
||||
export default class DetectFootnotes extends Transformation {
|
||||
export default class DetectFootnotes extends ToPdfViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Footnotes");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
|
||||
var nextFooterNumber = 1;
|
||||
|
@ -1,20 +1,15 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
export default class DetectLinks extends Transformation {
|
||||
export default class DetectLinks extends ToPdfViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Links");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
const newTextItems = [];
|
||||
|
@ -1,7 +1,6 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
import Annotation from '../Annotation.jsx';
|
||||
|
||||
import Headline from '../markdown/Headline.jsx';
|
||||
@ -59,16 +58,12 @@ function findNextMajorHeight(heights, currentHeight, headlineLevels) {
|
||||
}
|
||||
|
||||
|
||||
export default class HeadlineDetector extends Transformation {
|
||||
export default class HeadlineDetector extends ToPdfViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Headlines");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
// Strategy:
|
||||
// - find most used height => this & every height below is paragraph
|
||||
// - heights which start a page are likely to be headlines
|
||||
|
@ -1,23 +1,18 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION, UNCHANGED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
import { hasUpperCaseCharacterInMiddleOfWord } from '../../functions.jsx'
|
||||
|
||||
// Uppercase headlines are often parsed with very mixed character with pdf.js, like 'A heAdLine'.
|
||||
// This tries to detect them and make them all uppercase.
|
||||
export default class HeadlineToUppercase extends Transformation {
|
||||
export default class HeadlineToUppercase extends ToPdfViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Headlines Uppercase");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
|
||||
|
||||
|
@ -1,17 +1,12 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
export default class NoOp extends Transformation {
|
||||
export default class NoOp extends ToPdfViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Original");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pdfPages:PdfPage[]) {
|
||||
return pdfPages;
|
||||
}
|
||||
|
@ -1,6 +1,5 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
import { isDigit } from '../../functions.jsx'
|
||||
@ -25,16 +24,12 @@ function combineCoordinates(textItem) {
|
||||
}
|
||||
|
||||
// Remove elements with similar content at the same page positions, like page numbers, license information, etc.
|
||||
export default class RemoveRepetitiveElements extends Transformation {
|
||||
export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Remove Repetitive Elements");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
//build repetition counts for every element
|
||||
const repetitionCounts = {};
|
||||
|
@ -1,20 +1,15 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
export default class RemoveWhitespaces extends Transformation {
|
||||
export default class RemoveWhitespaces extends ToPdfViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Remove Whitespaces");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
const newTextItems = [];
|
||||
|
@ -1,17 +1,12 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
export default class RoundCoordinates extends Transformation {
|
||||
export default class RoundCoordinates extends ToPdfViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Round Coordinates");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pdfPages:PdfPage[]) {
|
||||
return pdfPages.map(pdfPage => {
|
||||
return {
|
||||
|
@ -1,7 +1,8 @@
|
||||
import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import BlockPageView from '../../components/debug/BlockPageView.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import BlockPage from '../BlockPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
export default class ToBlockSystem extends Transformation {
|
||||
|
||||
@ -9,12 +10,8 @@ export default class ToBlockSystem extends Transformation {
|
||||
super("To Block System");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.BLOCK;
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return false;
|
||||
createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars
|
||||
return <BlockPageView key={ page.index } page={ page } />;
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
|
@ -1,6 +1,7 @@
|
||||
import React from 'react';
|
||||
import MarkdownPageView from '../../components/debug/MarkdownPageView.jsx';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import TextPage from '../TextPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
export default class ToMarkdown extends Transformation {
|
||||
|
||||
@ -8,12 +9,8 @@ export default class ToMarkdown extends Transformation {
|
||||
super("To Markdown");
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return false;
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.MARKDOWN;
|
||||
createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars
|
||||
return <MarkdownPageView key={ page.index } page={ page } />;
|
||||
}
|
||||
|
||||
transform(pages:TextPage[]) {
|
||||
|
@ -0,0 +1,32 @@
|
||||
import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import PdfPageView from '../../components/debug/PdfPageView.jsx';
|
||||
|
||||
// Abstract base class for transformations whose pages are displayed with PdfPageView
|
||||
// Base class for transformations whose resulting pages are rendered as PdfPageView components.
// It cannot be instantiated directly; concrete transformations extend it and pass their
// display name to the constructor.
export default class ToPdfViewTransformation extends Transformation {
|
||||
|
||||
// name: forwarded unchanged to the Transformation constructor.
// Throws a TypeError when this class itself (rather than a subclass) is constructed.
constructor(name) {
|
||||
super(name);
|
||||
if (this.constructor === ToPdfViewTransformation) {
|
||||
throw new TypeError("Can not construct abstract class.");
|
||||
}
|
||||
}
|
||||
|
||||
// Always returns true for this kind of transformation.
showPageSelection() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Always returns true for this kind of transformation.
showModificationCheckbox() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Builds the view element for a single page: a PdfPageView keyed by page.index that
// receives the page as `pdfPage` and the `modificationsOnly` flag as given.
createPageView(page, modificationsOnly) {
|
||||
return <PdfPageView key={ page.index } pdfPage={ page } modificationsOnly={ modificationsOnly } />;
|
||||
}
|
||||
|
||||
// Default transform: returns the given pages unchanged.
transform(pdfPages:PdfPage[]) {
|
||||
return pdfPages;
|
||||
}
|
||||
|
||||
}
|
@ -12,11 +12,14 @@ export default class Transformation {
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns with which type the transformed pages can be viewed
|
||||
contentView() {
|
||||
showModificationCheckbox() {
|
||||
return false;
|
||||
}
|
||||
|
||||
createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars
|
||||
throw new TypeError("Do not call abstract method foo from child.");
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user