mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-22 07:43:46 +01:00
Add text view
This commit is contained in:
parent
5f500ad110
commit
e43cf9a6a9
@ -6,7 +6,7 @@ import TopBar from './TopBar.jsx';
|
|||||||
import { View } from '../models/AppState.jsx';
|
import { View } from '../models/AppState.jsx';
|
||||||
import PdfUploadView from './PdfUploadView.jsx';
|
import PdfUploadView from './PdfUploadView.jsx';
|
||||||
import LoadingView from './LoadingView.jsx';
|
import LoadingView from './LoadingView.jsx';
|
||||||
import PdfView from './PdfView.jsx';
|
import DebugView from './DebugView.jsx';
|
||||||
|
|
||||||
export default class App extends React.Component {
|
export default class App extends React.Component {
|
||||||
|
|
||||||
@ -26,8 +26,8 @@ export default class App extends React.Component {
|
|||||||
case View.LOADING:
|
case View.LOADING:
|
||||||
mainView = <LoadingView fileBuffer={ appState.fileBuffer } storePdfPagesFunction={ appState.storePdfPages } />
|
mainView = <LoadingView fileBuffer={ appState.fileBuffer } storePdfPagesFunction={ appState.storePdfPages } />
|
||||||
break;
|
break;
|
||||||
case View.PDF_VIEW:
|
case View.DEBUG:
|
||||||
mainView = <PdfView pdfPages={ appState.pdfPages } transformations={ appState.transformations } />
|
mainView = <DebugView pdfPages={ appState.pdfPages } transformations={ appState.transformations } />
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,10 +6,12 @@ import Button from 'react-bootstrap/lib/Button'
|
|||||||
import DropdownButton from 'react-bootstrap/lib/DropdownButton'
|
import DropdownButton from 'react-bootstrap/lib/DropdownButton'
|
||||||
import MenuItem from 'react-bootstrap/lib/MenuItem'
|
import MenuItem from 'react-bootstrap/lib/MenuItem'
|
||||||
|
|
||||||
|
import ContentView from '../models/ContentView.jsx';
|
||||||
import PdfPageView from './PdfPageView.jsx';
|
import PdfPageView from './PdfPageView.jsx';
|
||||||
|
import TextPageView from './TextPageView.jsx';
|
||||||
|
|
||||||
// A view which displays the TextItems of multiple PdfPages
|
// A view which displays the content of the given pages transformed by the given transformations
|
||||||
export default class PdfView extends React.Component {
|
export default class DebugView extends React.Component {
|
||||||
|
|
||||||
static propTypes = {
|
static propTypes = {
|
||||||
pdfPages: React.PropTypes.array.isRequired,
|
pdfPages: React.PropTypes.array.isRequired,
|
||||||
@ -55,39 +57,51 @@ export default class PdfView extends React.Component {
|
|||||||
|
|
||||||
const currentTransformationName = transformations[currentTransformation].name;
|
const currentTransformationName = transformations[currentTransformation].name;
|
||||||
|
|
||||||
const transformedPdfPages = pdfPages.filter((elem, i) => pageNr == -1 || i == pageNr).map(pdfPage => {
|
var transformedPages = pdfPages.filter((elem, i) => pageNr == -1 || i == pageNr);
|
||||||
for (var i = 0; i <= currentTransformation; i++) {
|
var contentView;
|
||||||
pdfPage = transformations[i].transform(pdfPage);
|
var lastTransformation;
|
||||||
}
|
for (var i = 0; i <= currentTransformation; i++) {
|
||||||
return pdfPage;
|
transformedPages = transformations[i].transform(transformedPages);
|
||||||
});
|
lastTransformation = transformations[i];
|
||||||
|
contentView = transformations[i].contentView();
|
||||||
|
}
|
||||||
|
|
||||||
var pageComponents = transformedPdfPages.map(page => <PdfPageView key={ page.index } pdfPage={ page } />);
|
var pageComponents;
|
||||||
|
switch (contentView) {
|
||||||
|
case ContentView.PDF:
|
||||||
|
pageComponents = transformedPages.map(page => <PdfPageView key={ page.index } pdfPage={ page } />);
|
||||||
|
break;
|
||||||
|
case ContentView.TEXT:
|
||||||
|
//transformedPages.forEach(p => console.debug(p));
|
||||||
|
pageComponents = transformedPages.map(page => <TextPageView key={ page.index } page={ page } />);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div>
|
<div>
|
||||||
<div>
|
<div>
|
||||||
<table style={ { width: '100%' } }>
|
{ lastTransformation.showPageSelection() &&
|
||||||
<caption>
|
<table style={ { width: '100%' } }>
|
||||||
Pages
|
<caption>
|
||||||
</caption>
|
Pages
|
||||||
<tbody>
|
</caption>
|
||||||
<tr>
|
<tbody>
|
||||||
<td>
|
<tr>
|
||||||
<ButtonToolbar>
|
<td>
|
||||||
<ButtonGroup>
|
<ButtonToolbar>
|
||||||
<Button onClick={ this.selectPage.bind(this, -1) } className={ pageNr == -1 ? 'active' : '' }>
|
<ButtonGroup>
|
||||||
All
|
<Button onClick={ this.selectPage.bind(this, -1) } className={ pageNr == -1 ? 'active' : '' }>
|
||||||
</Button>
|
All
|
||||||
{ pdfPages.map((pdfPage, i) => <Button key={ i } onClick={ this.selectPage.bind(this, i) } className={ pageNr == i ? 'active' : '' }>
|
</Button>
|
||||||
{ i + 1 }
|
{ pdfPages.map((pdfPage, i) => <Button key={ i } onClick={ this.selectPage.bind(this, i) } className={ pageNr == i ? 'active' : '' }>
|
||||||
</Button>) }
|
{ i + 1 }
|
||||||
</ButtonGroup>
|
</Button>) }
|
||||||
</ButtonToolbar>
|
</ButtonGroup>
|
||||||
</td>
|
</ButtonToolbar>
|
||||||
</tr>
|
</td>
|
||||||
</tbody>
|
</tr>
|
||||||
</table>
|
</tbody>
|
||||||
|
</table> }
|
||||||
<br/>
|
<br/>
|
||||||
<table>
|
<table>
|
||||||
<caption>
|
<caption>
|
20
src/javascript/components/TextPageView.jsx
Normal file
20
src/javascript/components/TextPageView.jsx
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
import React from 'react';
|
||||||
|
|
||||||
|
export default class TextPageView extends React.Component {
|
||||||
|
|
||||||
|
static propTypes = {
|
||||||
|
page: React.PropTypes.object.isRequired,
|
||||||
|
};
|
||||||
|
|
||||||
|
render() {
|
||||||
|
const header = "Page " + (this.props.page.index + 1);
|
||||||
|
return (
|
||||||
|
<div>
|
||||||
|
<h2>{ header }</h2>
|
||||||
|
<textarea rows="45" cols="150" defaultValue={ this.props.page.text }>
|
||||||
|
</textarea>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -3,6 +3,8 @@ import { Enum } from 'enumify';
|
|||||||
import NoOpTransformation from './transformations/NoOpTransformation.jsx';
|
import NoOpTransformation from './transformations/NoOpTransformation.jsx';
|
||||||
import RoundCoordinatesTransformation from './transformations/RoundCoordinatesTransformation.jsx';
|
import RoundCoordinatesTransformation from './transformations/RoundCoordinatesTransformation.jsx';
|
||||||
import CombineSameYTransformation from './transformations/CombineSameYTransformation.jsx';
|
import CombineSameYTransformation from './transformations/CombineSameYTransformation.jsx';
|
||||||
|
import ToTextPagesTransformation from './transformations/ToTextPagesTransformation.jsx';
|
||||||
|
import ToSingleTextPageTransformation from './transformations/ToSingleTextPageTransformation.jsx'
|
||||||
|
|
||||||
// Holds the state of the Application
|
// Holds the state of the Application
|
||||||
export default class AppState {
|
export default class AppState {
|
||||||
@ -12,7 +14,7 @@ export default class AppState {
|
|||||||
this.mainView = View.UPLOAD;
|
this.mainView = View.UPLOAD;
|
||||||
this.fileBuffer;
|
this.fileBuffer;
|
||||||
this.pdfPages = [];
|
this.pdfPages = [];
|
||||||
this.transformations = [new NoOpTransformation(), new RoundCoordinatesTransformation(), new CombineSameYTransformation()];
|
this.transformations = [new NoOpTransformation(), new RoundCoordinatesTransformation(), new CombineSameYTransformation(), new ToTextPagesTransformation(), new ToSingleTextPageTransformation()];
|
||||||
|
|
||||||
//bind functions
|
//bind functions
|
||||||
this.render = this.render.bind(this);
|
this.render = this.render.bind(this);
|
||||||
@ -34,7 +36,7 @@ export default class AppState {
|
|||||||
storePdfPages(pdfPages) {
|
storePdfPages(pdfPages) {
|
||||||
this.pdfPages = pdfPages;
|
this.pdfPages = pdfPages;
|
||||||
this.fileBuffer = null;
|
this.fileBuffer = null;
|
||||||
this.mainView = View.PDF_VIEW;
|
this.mainView = View.DEBUG;
|
||||||
this.render();
|
this.render();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -42,4 +44,4 @@ export default class AppState {
|
|||||||
|
|
||||||
export class View extends Enum {
|
export class View extends Enum {
|
||||||
}
|
}
|
||||||
View.initEnum(['UPLOAD', 'LOADING', 'PDF_VIEW'])
|
View.initEnum(['UPLOAD', 'LOADING', 'DEBUG'])
|
5
src/javascript/models/ContentView.jsx
Normal file
5
src/javascript/models/ContentView.jsx
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
import { Enum } from 'enumify';
|
||||||
|
|
||||||
|
export default class ContentView extends Enum {
|
||||||
|
}
|
||||||
|
ContentView.initEnum(['PDF', 'TEXT'])
|
9
src/javascript/models/TextPage.jsx
Normal file
9
src/javascript/models/TextPage.jsx
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
// A page which holds plain text displayable via TextPageView
|
||||||
|
export default class TextPage {
|
||||||
|
|
||||||
|
constructor(options) {
|
||||||
|
this.index = options.index;
|
||||||
|
this.text = options.text;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,6 +1,7 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import Transformation from './Transformation.jsx';
|
||||||
import TextItem from '../TextItem.jsx';
|
import TextItem from '../TextItem.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
|
import ContentView from '../ContentView.jsx';
|
||||||
|
|
||||||
export default class CombineSameYTransformation extends Transformation {
|
export default class CombineSameYTransformation extends Transformation {
|
||||||
|
|
||||||
@ -8,50 +9,55 @@ export default class CombineSameYTransformation extends Transformation {
|
|||||||
super("Combine text on same Y");
|
super("Combine text on same Y");
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(pdfPage:PdfPage) {
|
contentView() {
|
||||||
|
return ContentView.PDF;
|
||||||
|
}
|
||||||
|
|
||||||
const newTextItems = [];
|
transform(pages:PdfPage[]) {
|
||||||
var lastTextItem;
|
return pages.map(pdfPage => {
|
||||||
pdfPage.textItems.forEach(textItem => {
|
const newTextItems = [];
|
||||||
if (!lastTextItem) {
|
var lastTextItem;
|
||||||
lastTextItem = textItem;
|
pdfPage.textItems.forEach(textItem => {
|
||||||
} else {
|
if (!lastTextItem) {
|
||||||
if (textItem.y == lastTextItem.y) {
|
|
||||||
//combine
|
|
||||||
|
|
||||||
// console.debug("last=" + lastTextItem.text + ", x=" + lastTextItem.x + ", width=" + lastTextItem.width);
|
|
||||||
// console.debug("new=" + textItem.text + ", x=" + textItem.x + ", width=" + textItem.width);
|
|
||||||
// console.debug("diff=" + (textItem.x - lastTextItem.x - lastTextItem.width));
|
|
||||||
|
|
||||||
var combinedText = lastTextItem.text;
|
|
||||||
//TODO make 5 dependent on text size or biggest gap?
|
|
||||||
if (textItem.x - lastTextItem.x - lastTextItem.width > 7) {
|
|
||||||
combinedText += ' ';
|
|
||||||
}
|
|
||||||
combinedText += textItem.text;
|
|
||||||
|
|
||||||
lastTextItem = new TextItem({
|
|
||||||
x: lastTextItem.x,
|
|
||||||
y: lastTextItem.y,
|
|
||||||
width: textItem.x - lastTextItem.x + textItem.width,
|
|
||||||
height: lastTextItem.height, //might this cause problems ?
|
|
||||||
text: combinedText
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
//rotate
|
|
||||||
newTextItems.push(lastTextItem);
|
|
||||||
lastTextItem = textItem;
|
lastTextItem = textItem;
|
||||||
}
|
} else {
|
||||||
}
|
if (textItem.y == lastTextItem.y) {
|
||||||
});
|
//combine
|
||||||
if (lastTextItem) {
|
|
||||||
newTextItems.push(lastTextItem);
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
// console.debug("last=" + lastTextItem.text + ", x=" + lastTextItem.x + ", width=" + lastTextItem.width);
|
||||||
...pdfPage,
|
// console.debug("new=" + textItem.text + ", x=" + textItem.x + ", width=" + textItem.width);
|
||||||
textItems: newTextItems
|
// console.debug("diff=" + (textItem.x - lastTextItem.x - lastTextItem.width));
|
||||||
};
|
|
||||||
|
var combinedText = lastTextItem.text;
|
||||||
|
//TODO make 5 dependent on text size or biggest gap?
|
||||||
|
if (textItem.x - lastTextItem.x - lastTextItem.width > 7) {
|
||||||
|
combinedText += ' ';
|
||||||
|
}
|
||||||
|
combinedText += textItem.text;
|
||||||
|
|
||||||
|
lastTextItem = new TextItem({
|
||||||
|
x: lastTextItem.x,
|
||||||
|
y: lastTextItem.y,
|
||||||
|
width: textItem.x - lastTextItem.x + textItem.width,
|
||||||
|
height: lastTextItem.height, //might this cause problems ?
|
||||||
|
text: combinedText
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
//rotate
|
||||||
|
newTextItems.push(lastTextItem);
|
||||||
|
lastTextItem = textItem;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (lastTextItem) {
|
||||||
|
newTextItems.push(lastTextItem);
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
...pdfPage,
|
||||||
|
textItems: newTextItems
|
||||||
|
};
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -1,5 +1,6 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import Transformation from './Transformation.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
|
import ContentView from '../ContentView.jsx';
|
||||||
|
|
||||||
export default class NoOpTransformation extends Transformation {
|
export default class NoOpTransformation extends Transformation {
|
||||||
|
|
||||||
@ -7,8 +8,12 @@ export default class NoOpTransformation extends Transformation {
|
|||||||
super("Original");
|
super("Original");
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(pdfPage:PdfPage) {
|
contentView() {
|
||||||
return pdfPage;
|
return ContentView.PDF;
|
||||||
|
}
|
||||||
|
|
||||||
|
transform(pdfPages:PdfPage[]) {
|
||||||
|
return pdfPages;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -1,5 +1,6 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import Transformation from './Transformation.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
|
import ContentView from '../ContentView.jsx';
|
||||||
|
|
||||||
export default class RoundCoordinatesTransformation extends Transformation {
|
export default class RoundCoordinatesTransformation extends Transformation {
|
||||||
|
|
||||||
@ -7,19 +8,25 @@ export default class RoundCoordinatesTransformation extends Transformation {
|
|||||||
super("Round coordinates");
|
super("Round coordinates");
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(pdfPage:PdfPage) {
|
contentView() {
|
||||||
return {
|
return ContentView.PDF;
|
||||||
...pdfPage,
|
}
|
||||||
textItems: pdfPage.textItems.map(textItem => {
|
|
||||||
return {
|
transform(pdfPages:PdfPage[]) {
|
||||||
...textItem,
|
return pdfPages.map(pdfPage => {
|
||||||
x: Math.round(textItem.x),
|
return {
|
||||||
y: Math.round(textItem.y),
|
...pdfPage,
|
||||||
width: Math.round(textItem.width),
|
textItems: pdfPage.textItems.map(textItem => {
|
||||||
height: Math.round(textItem.height)
|
return {
|
||||||
}
|
...textItem,
|
||||||
})
|
x: Math.round(textItem.x),
|
||||||
};
|
y: Math.round(textItem.y),
|
||||||
|
width: Math.round(textItem.width),
|
||||||
|
height: Math.round(textItem.height)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
};
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -0,0 +1,28 @@
|
|||||||
|
import Transformation from './Transformation.jsx';
|
||||||
|
import TextPage from '../TextPage.jsx';
|
||||||
|
import ContentView from '../ContentView.jsx';
|
||||||
|
|
||||||
|
export default class ToSingleTextPageTransformation extends Transformation {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super("To Single Text Page");
|
||||||
|
}
|
||||||
|
|
||||||
|
showPageSelection() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
contentView() {
|
||||||
|
return ContentView.TEXT;
|
||||||
|
}
|
||||||
|
|
||||||
|
transform(pages:TextPage[]) {
|
||||||
|
var text = '';
|
||||||
|
pages.forEach(page => text += page.text + '\n');
|
||||||
|
return [new TextPage({
|
||||||
|
index: 0,
|
||||||
|
text: text
|
||||||
|
})];
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,27 @@
|
|||||||
|
import Transformation from './Transformation.jsx';
|
||||||
|
import PdfPage from '../PdfPage.jsx';
|
||||||
|
import TextPage from '../TextPage.jsx';
|
||||||
|
import ContentView from '../ContentView.jsx';
|
||||||
|
|
||||||
|
export default class ToTextPagesTransformation extends Transformation {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super("To Text Pages");
|
||||||
|
}
|
||||||
|
|
||||||
|
contentView() {
|
||||||
|
return ContentView.TEXT;
|
||||||
|
}
|
||||||
|
|
||||||
|
transform(pdfPages:PdfPage[]) {
|
||||||
|
return pdfPages.map(pdfPage => {
|
||||||
|
var text = '';
|
||||||
|
pdfPage.textItems.forEach(textItem => text += textItem.text + '\n');
|
||||||
|
return new TextPage({
|
||||||
|
index: pdfPage.index,
|
||||||
|
text: text
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,5 +1,3 @@
|
|||||||
import PdfPage from '../PdfPage.jsx';
|
|
||||||
|
|
||||||
// A transformation from a PdfPage to a PdfPage
|
// A transformation from a PdfPage to a PdfPage
|
||||||
export default class Transformation {
|
export default class Transformation {
|
||||||
|
|
||||||
@ -13,7 +11,19 @@ export default class Transformation {
|
|||||||
this.name = name;
|
this.name = name;
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(pdfPage:PdfPage) { // eslint-disable-line no-unused-vars
|
showPageSelection() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the ContentView with which the transformed pages can be displayed
|
||||||
|
contentView() {
|
||||||
throw new TypeError("Do not call abstract method foo from child.");
|
throw new TypeError("Do not call abstract method foo from child.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Transform incoming pages (like PdfPage[]) into different pages (either PdfPage[] or TextPage[])
|
||||||
|
transform(pages) { // eslint-disable-line no-unused-vars
|
||||||
|
throw new TypeError("Do not call abstract method foo from child.");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user