mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-28 18:53:40 +01:00
Move pageView construction into Transformer
This commit is contained in:
parent
92a4337387
commit
41bc2f6c34
@ -9,11 +9,6 @@ import MenuItem from 'react-bootstrap/lib/MenuItem'
|
|||||||
import Label from 'react-bootstrap/lib/Label'
|
import Label from 'react-bootstrap/lib/Label'
|
||||||
import Checkbox from 'react-bootstrap/lib/Checkbox'
|
import Checkbox from 'react-bootstrap/lib/Checkbox'
|
||||||
|
|
||||||
import ContentView from '../models/ContentView.jsx';
|
|
||||||
import PdfPageView from './debug/PdfPageView.jsx';
|
|
||||||
import BlockPageView from './debug/BlockPageView.jsx';
|
|
||||||
import MarkdownPageView from './debug/MarkdownPageView.jsx';
|
|
||||||
|
|
||||||
// A view which displays the content of the given pages transformed by the given transformations
|
// A view which displays the content of the given pages transformed by the given transformations
|
||||||
export default class DebugView extends React.Component {
|
export default class DebugView extends React.Component {
|
||||||
|
|
||||||
@ -69,32 +64,18 @@ export default class DebugView extends React.Component {
|
|||||||
const currentTransformationName = transformations[currentTransformation].name;
|
const currentTransformationName = transformations[currentTransformation].name;
|
||||||
|
|
||||||
var transformedPages = pdfPages;
|
var transformedPages = pdfPages;
|
||||||
var contentView;
|
|
||||||
var lastTransformation;
|
var lastTransformation;
|
||||||
for (var i = 0; i <= currentTransformation; i++) {
|
for (var i = 0; i <= currentTransformation; i++) {
|
||||||
if (lastTransformation) {
|
if (lastTransformation) {
|
||||||
transformedPages = lastTransformation.processAnnotations(transformedPages);
|
transformedPages = lastTransformation.processAnnotations(transformedPages);
|
||||||
}
|
}
|
||||||
transformedPages = transformations[i].transform(transformedPages);
|
transformedPages = transformations[i].transform(transformedPages);
|
||||||
contentView = transformations[i].contentView();
|
|
||||||
lastTransformation = transformations[i];
|
lastTransformation = transformations[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
transformedPages = transformedPages.filter((elem, i) => pageNr == -1 || i == pageNr);
|
transformedPages = transformedPages.filter((elem, i) => pageNr == -1 || i == pageNr);
|
||||||
var pageComponents;
|
const pageComponents = transformedPages.map(page => lastTransformation.createPageView(page, this.state.modificationsOnly));
|
||||||
var showModificationCheckbox = false;
|
const showModificationCheckbox = lastTransformation.showModificationCheckbox();
|
||||||
switch (contentView) {
|
|
||||||
case ContentView.PDF:
|
|
||||||
pageComponents = transformedPages.map(page => <PdfPageView key={ page.index } pdfPage={ page } modificationsOnly={ this.state.modificationsOnly } />);
|
|
||||||
showModificationCheckbox = true;
|
|
||||||
break;
|
|
||||||
case ContentView.BLOCK:
|
|
||||||
pageComponents = transformedPages.map(page => <BlockPageView key={ page.index } page={ page } />);
|
|
||||||
break;
|
|
||||||
case ContentView.MARKDOWN:
|
|
||||||
pageComponents = transformedPages.map(page => <MarkdownPageView key={ page.index } page={ page } />);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div>
|
<div>
|
||||||
|
@ -1,5 +0,0 @@
|
|||||||
import { Enum } from 'enumify';
|
|
||||||
|
|
||||||
export default class ContentView extends Enum {
|
|
||||||
}
|
|
||||||
ContentView.initEnum(['PDF', 'BLOCK', 'MARKDOWN'])
|
|
@ -1,7 +1,6 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||||
import TextItem from '../TextItem.jsx';
|
import TextItem from '../TextItem.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
|
||||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
function combineTextItems(textItems:TextItem[]) {
|
function combineTextItems(textItems:TextItem[]) {
|
||||||
@ -41,16 +40,12 @@ function combineTextItems(textItems:TextItem[]) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
export default class CombineSameY extends Transformation {
|
export default class CombineSameY extends ToPdfViewTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Combine Text On Same Y");
|
super("Combine Text On Same Y");
|
||||||
}
|
}
|
||||||
|
|
||||||
contentView() {
|
|
||||||
return ContentView.PDF;
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(pages:PdfPage[]) {
|
transform(pages:PdfPage[]) {
|
||||||
|
|
||||||
return pages.map(pdfPage => {
|
return pages.map(pdfPage => {
|
||||||
|
@ -1,21 +1,16 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||||
import TextItem from '../TextItem.jsx';
|
import TextItem from '../TextItem.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
|
||||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
import { isNumber } from '../../functions.jsx'
|
import { isNumber } from '../../functions.jsx'
|
||||||
|
|
||||||
export default class DetectFootnotes extends Transformation {
|
export default class DetectFootnotes extends ToPdfViewTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Detect Footnotes");
|
super("Detect Footnotes");
|
||||||
}
|
}
|
||||||
|
|
||||||
contentView() {
|
|
||||||
return ContentView.PDF;
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(pages:PdfPage[]) {
|
transform(pages:PdfPage[]) {
|
||||||
|
|
||||||
var nextFooterNumber = 1;
|
var nextFooterNumber = 1;
|
||||||
|
@ -1,20 +1,15 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||||
import TextItem from '../TextItem.jsx';
|
import TextItem from '../TextItem.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
|
||||||
|
|
||||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
export default class DetectLinks extends Transformation {
|
export default class DetectLinks extends ToPdfViewTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Detect Links");
|
super("Detect Links");
|
||||||
}
|
}
|
||||||
|
|
||||||
contentView() {
|
|
||||||
return ContentView.PDF;
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(pages:PdfPage[]) {
|
transform(pages:PdfPage[]) {
|
||||||
pages.forEach(page => {
|
pages.forEach(page => {
|
||||||
const newTextItems = [];
|
const newTextItems = [];
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||||
import TextItem from '../TextItem.jsx';
|
import TextItem from '../TextItem.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
|
||||||
import Annotation from '../Annotation.jsx';
|
import Annotation from '../Annotation.jsx';
|
||||||
|
|
||||||
import Headline from '../markdown/Headline.jsx';
|
import Headline from '../markdown/Headline.jsx';
|
||||||
@ -59,16 +58,12 @@ function findNextMajorHeight(heights, currentHeight, headlineLevels) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export default class HeadlineDetector extends Transformation {
|
export default class HeadlineDetector extends ToPdfViewTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Detect Headlines");
|
super("Detect Headlines");
|
||||||
}
|
}
|
||||||
|
|
||||||
contentView() {
|
|
||||||
return ContentView.PDF;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Strategy:
|
// Strategy:
|
||||||
// - find most used height => this & every height below is paragraph
|
// - find most used height => this & every height below is paragraph
|
||||||
// - heights which start a page are likely to be headlines
|
// - heights which start a page are likely to be headlines
|
||||||
|
@ -1,23 +1,18 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||||
import TextItem from '../TextItem.jsx';
|
import TextItem from '../TextItem.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
|
||||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION, UNCHANGED_ANNOTATION } from '../Annotation.jsx';
|
import { ADDED_ANNOTATION, REMOVED_ANNOTATION, UNCHANGED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
import { hasUpperCaseCharacterInMiddleOfWord } from '../../functions.jsx'
|
import { hasUpperCaseCharacterInMiddleOfWord } from '../../functions.jsx'
|
||||||
|
|
||||||
// Uppercase headlines are often parsed by pdf.js with inconsistently mixed character case, like 'A heAdLine'.
|
// Uppercase headlines are often parsed by pdf.js with inconsistently mixed character case, like 'A heAdLine'.
|
||||||
// This tries to detect them and make them all uppercase.
|
// This tries to detect them and make them all uppercase.
|
||||||
export default class HeadlineToUppercase extends Transformation {
|
export default class HeadlineToUppercase extends ToPdfViewTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Headlines Uppercase");
|
super("Headlines Uppercase");
|
||||||
}
|
}
|
||||||
|
|
||||||
contentView() {
|
|
||||||
return ContentView.PDF;
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(pages:PdfPage[]) {
|
transform(pages:PdfPage[]) {
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,17 +1,12 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
|
||||||
|
|
||||||
export default class NoOp extends Transformation {
|
export default class NoOp extends ToPdfViewTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Original");
|
super("Original");
|
||||||
}
|
}
|
||||||
|
|
||||||
contentView() {
|
|
||||||
return ContentView.PDF;
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(pdfPages:PdfPage[]) {
|
transform(pdfPages:PdfPage[]) {
|
||||||
return pdfPages;
|
return pdfPages;
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
|
||||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
import { isDigit } from '../../functions.jsx'
|
import { isDigit } from '../../functions.jsx'
|
||||||
@ -25,16 +24,12 @@ function combineCoordinates(textItem) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Remove elements with similar content on same page positions, like page numbers, license information, etc...
|
// Remove elements with similar content on same page positions, like page numbers, license information, etc...
|
||||||
export default class RemoveRepetitiveElements extends Transformation {
|
export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Remove Repetitive Elements");
|
super("Remove Repetitive Elements");
|
||||||
}
|
}
|
||||||
|
|
||||||
contentView() {
|
|
||||||
return ContentView.PDF;
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(pages:PdfPage[]) {
|
transform(pages:PdfPage[]) {
|
||||||
//build repetition counts for every element
|
//build repetition counts for every element
|
||||||
const repetitionCounts = {};
|
const repetitionCounts = {};
|
||||||
|
@ -1,20 +1,15 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||||
import TextItem from '../TextItem.jsx';
|
import TextItem from '../TextItem.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
|
||||||
|
|
||||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
export default class RemoveWhitespaces extends Transformation {
|
export default class RemoveWhitespaces extends ToPdfViewTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Remove Whitespaces");
|
super("Remove Whitespaces");
|
||||||
}
|
}
|
||||||
|
|
||||||
contentView() {
|
|
||||||
return ContentView.PDF;
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(pages:PdfPage[]) {
|
transform(pages:PdfPage[]) {
|
||||||
pages.forEach(page => {
|
pages.forEach(page => {
|
||||||
const newTextItems = [];
|
const newTextItems = [];
|
||||||
|
@ -1,17 +1,12 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
|
||||||
|
|
||||||
export default class RoundCoordinates extends Transformation {
|
export default class RoundCoordinates extends ToPdfViewTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Round Coordinates");
|
super("Round Coordinates");
|
||||||
}
|
}
|
||||||
|
|
||||||
contentView() {
|
|
||||||
return ContentView.PDF;
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(pdfPages:PdfPage[]) {
|
transform(pdfPages:PdfPage[]) {
|
||||||
return pdfPages.map(pdfPage => {
|
return pdfPages.map(pdfPage => {
|
||||||
return {
|
return {
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
|
import React from 'react';
|
||||||
import Transformation from './Transformation.jsx';
|
import Transformation from './Transformation.jsx';
|
||||||
|
import BlockPageView from '../../components/debug/BlockPageView.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import BlockPage from '../BlockPage.jsx';
|
import BlockPage from '../BlockPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
|
||||||
|
|
||||||
export default class ToBlockSystem extends Transformation {
|
export default class ToBlockSystem extends Transformation {
|
||||||
|
|
||||||
@ -9,12 +10,8 @@ export default class ToBlockSystem extends Transformation {
|
|||||||
super("To Block System");
|
super("To Block System");
|
||||||
}
|
}
|
||||||
|
|
||||||
contentView() {
|
createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars
|
||||||
return ContentView.BLOCK;
|
return <BlockPageView key={ page.index } page={ page } />;
|
||||||
}
|
|
||||||
|
|
||||||
showPageSelection() {
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(pages:PdfPage[]) {
|
transform(pages:PdfPage[]) {
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
|
import React from 'react';
|
||||||
|
import MarkdownPageView from '../../components/debug/MarkdownPageView.jsx';
|
||||||
import Transformation from './Transformation.jsx';
|
import Transformation from './Transformation.jsx';
|
||||||
import TextPage from '../TextPage.jsx';
|
import TextPage from '../TextPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
|
||||||
|
|
||||||
export default class ToMarkdown extends Transformation {
|
export default class ToMarkdown extends Transformation {
|
||||||
|
|
||||||
@ -8,12 +9,8 @@ export default class ToMarkdown extends Transformation {
|
|||||||
super("To Markdown");
|
super("To Markdown");
|
||||||
}
|
}
|
||||||
|
|
||||||
showPageSelection() {
|
createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars
|
||||||
return false;
|
return <MarkdownPageView key={ page.index } page={ page } />;
|
||||||
}
|
|
||||||
|
|
||||||
contentView() {
|
|
||||||
return ContentView.MARKDOWN;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(pages:TextPage[]) {
|
transform(pages:TextPage[]) {
|
||||||
|
@ -0,0 +1,32 @@
|
|||||||
|
import React from 'react';
|
||||||
|
import Transformation from './Transformation.jsx';
|
||||||
|
import PdfPage from '../PdfPage.jsx';
|
||||||
|
import PdfPageView from '../../components/debug/PdfPageView.jsx';
|
||||||
|
|
||||||
|
// Abstract pdfView transformation
|
||||||
|
export default class ToPdfViewTransformation extends Transformation {
|
||||||
|
|
||||||
|
constructor(name) {
|
||||||
|
super(name);
|
||||||
|
if (this.constructor === ToPdfViewTransformation) {
|
||||||
|
throw new TypeError("Can not construct abstract class.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
showPageSelection() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
showModificationCheckbox() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
createPageView(page, modificationsOnly) {
|
||||||
|
return <PdfPageView key={ page.index } pdfPage={ page } modificationsOnly={ modificationsOnly } />;
|
||||||
|
}
|
||||||
|
|
||||||
|
transform(pdfPages:PdfPage[]) {
|
||||||
|
return pdfPages;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -12,11 +12,14 @@ export default class Transformation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
showPageSelection() {
|
showPageSelection() {
|
||||||
return true;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns with which type the transformed pages can be viewed
|
showModificationCheckbox() {
|
||||||
contentView() {
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars
|
||||||
throw new TypeError("Do not call abstract method foo from child.");
|
throw new TypeError("Do not call abstract method foo from child.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user