mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-12 08:38:19 +01:00
Add text view
This commit is contained in:
parent
5f500ad110
commit
e43cf9a6a9
@ -6,7 +6,7 @@ import TopBar from './TopBar.jsx';
|
||||
import { View } from '../models/AppState.jsx';
|
||||
import PdfUploadView from './PdfUploadView.jsx';
|
||||
import LoadingView from './LoadingView.jsx';
|
||||
import PdfView from './PdfView.jsx';
|
||||
import DebugView from './DebugView.jsx';
|
||||
|
||||
export default class App extends React.Component {
|
||||
|
||||
@ -26,8 +26,8 @@ export default class App extends React.Component {
|
||||
case View.LOADING:
|
||||
mainView = <LoadingView fileBuffer={ appState.fileBuffer } storePdfPagesFunction={ appState.storePdfPages } />
|
||||
break;
|
||||
case View.PDF_VIEW:
|
||||
mainView = <PdfView pdfPages={ appState.pdfPages } transformations={ appState.transformations } />
|
||||
case View.DEBUG:
|
||||
mainView = <DebugView pdfPages={ appState.pdfPages } transformations={ appState.transformations } />
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -6,10 +6,12 @@ import Button from 'react-bootstrap/lib/Button'
|
||||
import DropdownButton from 'react-bootstrap/lib/DropdownButton'
|
||||
import MenuItem from 'react-bootstrap/lib/MenuItem'
|
||||
|
||||
import ContentView from '../models/ContentView.jsx';
|
||||
import PdfPageView from './PdfPageView.jsx';
|
||||
import TextPageView from './TextPageView.jsx';
|
||||
|
||||
// A view which displays the TextItems of multiple PdfPages
|
||||
export default class PdfView extends React.Component {
|
||||
// A view which displays the content of the given pages transformed by the given transformations
|
||||
export default class DebugView extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
pdfPages: React.PropTypes.array.isRequired,
|
||||
@ -55,39 +57,51 @@ export default class PdfView extends React.Component {
|
||||
|
||||
const currentTransformationName = transformations[currentTransformation].name;
|
||||
|
||||
const transformedPdfPages = pdfPages.filter((elem, i) => pageNr == -1 || i == pageNr).map(pdfPage => {
|
||||
for (var i = 0; i <= currentTransformation; i++) {
|
||||
pdfPage = transformations[i].transform(pdfPage);
|
||||
}
|
||||
return pdfPage;
|
||||
});
|
||||
var transformedPages = pdfPages.filter((elem, i) => pageNr == -1 || i == pageNr);
|
||||
var contentView;
|
||||
var lastTransformation;
|
||||
for (var i = 0; i <= currentTransformation; i++) {
|
||||
transformedPages = transformations[i].transform(transformedPages);
|
||||
lastTransformation = transformations[i];
|
||||
contentView = transformations[i].contentView();
|
||||
}
|
||||
|
||||
var pageComponents = transformedPdfPages.map(page => <PdfPageView key={ page.index } pdfPage={ page } />);
|
||||
var pageComponents;
|
||||
switch (contentView) {
|
||||
case ContentView.PDF:
|
||||
pageComponents = transformedPages.map(page => <PdfPageView key={ page.index } pdfPage={ page } />);
|
||||
break;
|
||||
case ContentView.TEXT:
|
||||
//transformedPages.forEach(p => console.debug(p));
|
||||
pageComponents = transformedPages.map(page => <TextPageView key={ page.index } page={ page } />);
|
||||
break;
|
||||
}
|
||||
|
||||
return (
|
||||
<div>
|
||||
<div>
|
||||
<table style={ { width: '100%' } }>
|
||||
<caption>
|
||||
Pages
|
||||
</caption>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<ButtonToolbar>
|
||||
<ButtonGroup>
|
||||
<Button onClick={ this.selectPage.bind(this, -1) } className={ pageNr == -1 ? 'active' : '' }>
|
||||
All
|
||||
</Button>
|
||||
{ pdfPages.map((pdfPage, i) => <Button key={ i } onClick={ this.selectPage.bind(this, i) } className={ pageNr == i ? 'active' : '' }>
|
||||
{ i + 1 }
|
||||
</Button>) }
|
||||
</ButtonGroup>
|
||||
</ButtonToolbar>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
{ lastTransformation.showPageSelection() &&
|
||||
<table style={ { width: '100%' } }>
|
||||
<caption>
|
||||
Pages
|
||||
</caption>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<ButtonToolbar>
|
||||
<ButtonGroup>
|
||||
<Button onClick={ this.selectPage.bind(this, -1) } className={ pageNr == -1 ? 'active' : '' }>
|
||||
All
|
||||
</Button>
|
||||
{ pdfPages.map((pdfPage, i) => <Button key={ i } onClick={ this.selectPage.bind(this, i) } className={ pageNr == i ? 'active' : '' }>
|
||||
{ i + 1 }
|
||||
</Button>) }
|
||||
</ButtonGroup>
|
||||
</ButtonToolbar>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table> }
|
||||
<br/>
|
||||
<table>
|
||||
<caption>
|
20
src/javascript/components/TextPageView.jsx
Normal file
20
src/javascript/components/TextPageView.jsx
Normal file
@ -0,0 +1,20 @@
|
||||
import React from 'react';
|
||||
|
||||
export default class TextPageView extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
page: React.PropTypes.object.isRequired,
|
||||
};
|
||||
|
||||
render() {
|
||||
const header = "Page " + (this.props.page.index + 1);
|
||||
return (
|
||||
<div>
|
||||
<h2>{ header }</h2>
|
||||
<textarea rows="45" cols="150" defaultValue={ this.props.page.text }>
|
||||
</textarea>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
}
|
@ -3,6 +3,8 @@ import { Enum } from 'enumify';
|
||||
import NoOpTransformation from './transformations/NoOpTransformation.jsx';
|
||||
import RoundCoordinatesTransformation from './transformations/RoundCoordinatesTransformation.jsx';
|
||||
import CombineSameYTransformation from './transformations/CombineSameYTransformation.jsx';
|
||||
import ToTextPagesTransformation from './transformations/ToTextPagesTransformation.jsx';
|
||||
import ToSingleTextPageTransformation from './transformations/ToSingleTextPageTransformation.jsx'
|
||||
|
||||
// Holds the state of the Application
|
||||
export default class AppState {
|
||||
@ -12,7 +14,7 @@ export default class AppState {
|
||||
this.mainView = View.UPLOAD;
|
||||
this.fileBuffer;
|
||||
this.pdfPages = [];
|
||||
this.transformations = [new NoOpTransformation(), new RoundCoordinatesTransformation(), new CombineSameYTransformation()];
|
||||
this.transformations = [new NoOpTransformation(), new RoundCoordinatesTransformation(), new CombineSameYTransformation(), new ToTextPagesTransformation(), new ToSingleTextPageTransformation()];
|
||||
|
||||
//bind functions
|
||||
this.render = this.render.bind(this);
|
||||
@ -34,7 +36,7 @@ export default class AppState {
|
||||
storePdfPages(pdfPages) {
|
||||
this.pdfPages = pdfPages;
|
||||
this.fileBuffer = null;
|
||||
this.mainView = View.PDF_VIEW;
|
||||
this.mainView = View.DEBUG;
|
||||
this.render();
|
||||
}
|
||||
|
||||
@ -42,4 +44,4 @@ export default class AppState {
|
||||
|
||||
export class View extends Enum {
|
||||
}
|
||||
View.initEnum(['UPLOAD', 'LOADING', 'PDF_VIEW'])
|
||||
View.initEnum(['UPLOAD', 'LOADING', 'DEBUG'])
|
5
src/javascript/models/ContentView.jsx
Normal file
5
src/javascript/models/ContentView.jsx
Normal file
@ -0,0 +1,5 @@
|
||||
import { Enum } from 'enumify';
|
||||
|
||||
// Enumeration of the ways transformed pages can be displayed.
// A transformation reports one of these values from contentView(), and the
// view switches on it to pick the page component (PdfPageView or TextPageView).
export default class ContentView extends Enum {
|
||||
}
|
||||
// Registers the two enum instances: ContentView.PDF and ContentView.TEXT.
ContentView.initEnum(['PDF', 'TEXT'])
|
9
src/javascript/models/TextPage.jsx
Normal file
9
src/javascript/models/TextPage.jsx
Normal file
@ -0,0 +1,9 @@
|
||||
// A page which holds TextItems displayable via PdfPageView
|
||||
export default class TextPage {
|
||||
|
||||
constructor(options) {
|
||||
this.index = options.index;
|
||||
this.text = options.text;
|
||||
}
|
||||
|
||||
}
|
@ -1,6 +1,7 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
export default class CombineSameYTransformation extends Transformation {
|
||||
|
||||
@ -8,50 +9,55 @@ export default class CombineSameYTransformation extends Transformation {
|
||||
super("Combine text on same Y");
|
||||
}
|
||||
|
||||
transform(pdfPage:PdfPage) {
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
const newTextItems = [];
|
||||
var lastTextItem;
|
||||
pdfPage.textItems.forEach(textItem => {
|
||||
if (!lastTextItem) {
|
||||
lastTextItem = textItem;
|
||||
} else {
|
||||
if (textItem.y == lastTextItem.y) {
|
||||
//combine
|
||||
|
||||
// console.debug("last=" + lastTextItem.text + ", x=" + lastTextItem.x + ", width=" + lastTextItem.width);
|
||||
// console.debug("new=" + textItem.text + ", x=" + textItem.x + ", width=" + textItem.width);
|
||||
// console.debug("diff=" + (textItem.x - lastTextItem.x - lastTextItem.width));
|
||||
|
||||
var combinedText = lastTextItem.text;
|
||||
//TODO make 5 dependent on text size or biggest gap?
|
||||
if (textItem.x - lastTextItem.x - lastTextItem.width > 7) {
|
||||
combinedText += ' ';
|
||||
}
|
||||
combinedText += textItem.text;
|
||||
|
||||
lastTextItem = new TextItem({
|
||||
x: lastTextItem.x,
|
||||
y: lastTextItem.y,
|
||||
width: textItem.x - lastTextItem.x + textItem.width,
|
||||
height: lastTextItem.height, //might this cause problems ?
|
||||
text: combinedText
|
||||
});
|
||||
} else {
|
||||
//rotate
|
||||
newTextItems.push(lastTextItem);
|
||||
transform(pages:PdfPage[]) {
|
||||
return pages.map(pdfPage => {
|
||||
const newTextItems = [];
|
||||
var lastTextItem;
|
||||
pdfPage.textItems.forEach(textItem => {
|
||||
if (!lastTextItem) {
|
||||
lastTextItem = textItem;
|
||||
}
|
||||
}
|
||||
});
|
||||
if (lastTextItem) {
|
||||
newTextItems.push(lastTextItem);
|
||||
}
|
||||
} else {
|
||||
if (textItem.y == lastTextItem.y) {
|
||||
//combine
|
||||
|
||||
return {
|
||||
...pdfPage,
|
||||
textItems: newTextItems
|
||||
};
|
||||
// console.debug("last=" + lastTextItem.text + ", x=" + lastTextItem.x + ", width=" + lastTextItem.width);
|
||||
// console.debug("new=" + textItem.text + ", x=" + textItem.x + ", width=" + textItem.width);
|
||||
// console.debug("diff=" + (textItem.x - lastTextItem.x - lastTextItem.width));
|
||||
|
||||
var combinedText = lastTextItem.text;
|
||||
//TODO make 5 dependent on text size or biggest gap?
|
||||
if (textItem.x - lastTextItem.x - lastTextItem.width > 7) {
|
||||
combinedText += ' ';
|
||||
}
|
||||
combinedText += textItem.text;
|
||||
|
||||
lastTextItem = new TextItem({
|
||||
x: lastTextItem.x,
|
||||
y: lastTextItem.y,
|
||||
width: textItem.x - lastTextItem.x + textItem.width,
|
||||
height: lastTextItem.height, //might this cause problems ?
|
||||
text: combinedText
|
||||
});
|
||||
} else {
|
||||
//rotate
|
||||
newTextItems.push(lastTextItem);
|
||||
lastTextItem = textItem;
|
||||
}
|
||||
}
|
||||
});
|
||||
if (lastTextItem) {
|
||||
newTextItems.push(lastTextItem);
|
||||
}
|
||||
|
||||
return {
|
||||
...pdfPage,
|
||||
textItems: newTextItems
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
}
|
@ -1,5 +1,6 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
export default class NoOpTransformation extends Transformation {
|
||||
|
||||
@ -7,8 +8,12 @@ export default class NoOpTransformation extends Transformation {
|
||||
super("Original");
|
||||
}
|
||||
|
||||
transform(pdfPage:PdfPage) {
|
||||
return pdfPage;
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pdfPages:PdfPage[]) {
|
||||
return pdfPages;
|
||||
}
|
||||
|
||||
}
|
@ -1,5 +1,6 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
export default class RoundCoordinatesTransformation extends Transformation {
|
||||
|
||||
@ -7,19 +8,25 @@ export default class RoundCoordinatesTransformation extends Transformation {
|
||||
super("Round coordinates");
|
||||
}
|
||||
|
||||
transform(pdfPage:PdfPage) {
|
||||
return {
|
||||
...pdfPage,
|
||||
textItems: pdfPage.textItems.map(textItem => {
|
||||
return {
|
||||
...textItem,
|
||||
x: Math.round(textItem.x),
|
||||
y: Math.round(textItem.y),
|
||||
width: Math.round(textItem.width),
|
||||
height: Math.round(textItem.height)
|
||||
}
|
||||
})
|
||||
};
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pdfPages:PdfPage[]) {
|
||||
return pdfPages.map(pdfPage => {
|
||||
return {
|
||||
...pdfPage,
|
||||
textItems: pdfPage.textItems.map(textItem => {
|
||||
return {
|
||||
...textItem,
|
||||
x: Math.round(textItem.x),
|
||||
y: Math.round(textItem.y),
|
||||
width: Math.round(textItem.width),
|
||||
height: Math.round(textItem.height)
|
||||
}
|
||||
})
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,28 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import TextPage from '../TextPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
export default class ToSingleTextPageTransformation extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("To Single Text Page");
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return false;
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.TEXT;
|
||||
}
|
||||
|
||||
transform(pages:TextPage[]) {
|
||||
var text = '';
|
||||
pages.forEach(page => text += page.text + '\n');
|
||||
return [new TextPage({
|
||||
index: 0,
|
||||
text: text
|
||||
})];
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,27 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import TextPage from '../TextPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
export default class ToTextPagesTransformation extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("To Text Pages");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.TEXT;
|
||||
}
|
||||
|
||||
transform(pdfPages:PdfPage[]) {
|
||||
return pdfPages.map(pdfPage => {
|
||||
var text = '';
|
||||
pdfPage.textItems.forEach(textItem => text += textItem.text + '\n');
|
||||
return new TextPage({
|
||||
index: pdfPage.index,
|
||||
text: text
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
}
|
@ -1,5 +1,3 @@
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
|
||||
// A transformation from a list of pages (like PdfPage[]) to a list of pages (either PdfPage[] or TextPage[])
|
||||
export default class Transformation {
|
||||
|
||||
@ -13,7 +11,19 @@ export default class Transformation {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
transform(pdfPage:PdfPage) { // eslint-disable-line no-unused-vars
|
||||
// Returns whether the page selection should be shown for this transformation.
// Defaults to true; subclasses may override it.
showPageSelection() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns with which type the transformed pages can be viewed
|
||||
contentView() {
|
||||
throw new TypeError("Do not call abstract method foo from child.");
|
||||
}
|
||||
|
||||
// Transform incoming pages (like PdfPage[]) into different pages (either PdfPages[] or TextPages[])
|
||||
transform(pages) { // eslint-disable-line no-unused-vars
|
||||
throw new TypeError("Do not call abstract method foo from child.");
|
||||
}
|
||||
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user