mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-16 18:00:46 +01:00
Replace text with block system
This commit is contained in:
parent
1ca9fa4362
commit
3a1241896b
@ -11,7 +11,7 @@ import Checkbox from 'react-bootstrap/lib/Checkbox'
|
|||||||
|
|
||||||
import ContentView from '../models/ContentView.jsx';
|
import ContentView from '../models/ContentView.jsx';
|
||||||
import PdfPageView from './debug/PdfPageView.jsx';
|
import PdfPageView from './debug/PdfPageView.jsx';
|
||||||
import TextPageView from './debug/TextPageView.jsx';
|
import BlockPageView from './debug/BlockPageView.jsx';
|
||||||
import MarkdownPageView from './debug/MarkdownPageView.jsx';
|
import MarkdownPageView from './debug/MarkdownPageView.jsx';
|
||||||
|
|
||||||
// A view which displays the content of the given pages transformed by the given transformations
|
// A view which displays the content of the given pages transformed by the given transformations
|
||||||
@ -88,8 +88,8 @@ export default class DebugView extends React.Component {
|
|||||||
pageComponents = transformedPages.map(page => <PdfPageView key={ page.index } pdfPage={ page } modificationsOnly={ this.state.modificationsOnly } />);
|
pageComponents = transformedPages.map(page => <PdfPageView key={ page.index } pdfPage={ page } modificationsOnly={ this.state.modificationsOnly } />);
|
||||||
showModificationCheckbox = true;
|
showModificationCheckbox = true;
|
||||||
break;
|
break;
|
||||||
case ContentView.TEXT:
|
case ContentView.BLOCK:
|
||||||
pageComponents = transformedPages.map(page => <TextPageView key={ page.index } page={ page } />);
|
pageComponents = transformedPages.map(page => <BlockPageView key={ page.index } page={ page } />);
|
||||||
break;
|
break;
|
||||||
case ContentView.MARKDOWN:
|
case ContentView.MARKDOWN:
|
||||||
pageComponents = transformedPages.map(page => <MarkdownPageView key={ page.index } page={ page } />);
|
pageComponents = transformedPages.map(page => <MarkdownPageView key={ page.index } page={ page } />);
|
||||||
|
49
src/javascript/components/debug/BlockPageView.jsx
Normal file
49
src/javascript/components/debug/BlockPageView.jsx
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
import React from 'react';
|
||||||
|
|
||||||
|
import Table from 'react-bootstrap/lib/Table'
|
||||||
|
|
||||||
|
export default class BlockPageView extends React.Component {
|
||||||
|
|
||||||
|
static propTypes = {
|
||||||
|
page: React.PropTypes.object.isRequired,
|
||||||
|
};
|
||||||
|
|
||||||
|
render() {
|
||||||
|
var blocks = this.props.page.blocks;
|
||||||
|
|
||||||
|
const content = <div>
|
||||||
|
<Table responsive>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>
|
||||||
|
#
|
||||||
|
</th>
|
||||||
|
<th>
|
||||||
|
Category
|
||||||
|
</th>
|
||||||
|
<th>
|
||||||
|
Text
|
||||||
|
</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{ blocks.map((block, i) => <tr key={ i }>
|
||||||
|
<td>
|
||||||
|
{ i }
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
{ block.category }
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<pre style={ { display: 'inline-block' } }>{ block.text }</pre>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
) }
|
||||||
|
</tbody>
|
||||||
|
</Table>
|
||||||
|
</div>
|
||||||
|
return (
|
||||||
|
content
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
@ -1,24 +0,0 @@
|
|||||||
import React from 'react';
|
|
||||||
|
|
||||||
export default class TextPageView extends React.Component {
|
|
||||||
|
|
||||||
static propTypes = {
|
|
||||||
page: React.PropTypes.object.isRequired,
|
|
||||||
};
|
|
||||||
|
|
||||||
render() {
|
|
||||||
const header = "Page " + (this.props.page.index + 1);
|
|
||||||
return (
|
|
||||||
<div>
|
|
||||||
<h2>{ header }</h2>
|
|
||||||
<textarea
|
|
||||||
rows="45"
|
|
||||||
cols="150"
|
|
||||||
value={ this.props.page.text }
|
|
||||||
readOnly="readonly">
|
|
||||||
</textarea>
|
|
||||||
</div>
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -9,7 +9,7 @@ import DetectLinks from './transformations/DetectLinks.jsx'
|
|||||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||||
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
||||||
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||||
import ToTextPages from './transformations/ToTextPages.jsx';
|
import ToBlockSystem from './transformations/ToBlockSystem.jsx';
|
||||||
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
||||||
|
|
||||||
// Holds the state of the Application
|
// Holds the state of the Application
|
||||||
@ -30,7 +30,7 @@ export default class AppState {
|
|||||||
new RemoveRepetitiveElements(),
|
new RemoveRepetitiveElements(),
|
||||||
new HeadlineDetector(),
|
new HeadlineDetector(),
|
||||||
new HeadlineToUppercase(),
|
new HeadlineToUppercase(),
|
||||||
new ToTextPages(),
|
new ToBlockSystem(),
|
||||||
new ToMarkdown()];
|
new ToMarkdown()];
|
||||||
|
|
||||||
//bind functions
|
//bind functions
|
||||||
|
9
src/javascript/models/BlockPage.jsx
Normal file
9
src/javascript/models/BlockPage.jsx
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
// A page which holds blocks displayable via BlockPageView
export default class BlockPage {

    /**
     * @param {Object} [options] - construction options; may be omitted
     * @param {*} [options.index] - page index as supplied by the caller, stored unchanged
     * @param {Array} [options.blocks] - the blocks of this page; an empty list when omitted
     */
    constructor(options = {}) {
        this.index = options.index;
        // Views and later transformations iterate over the blocks unconditionally,
        // so fall back to an empty list instead of leaving the field undefined.
        this.blocks = options.blocks || [];
    }

}
|
@ -2,4 +2,4 @@ import { Enum } from 'enumify';
|
|||||||
|
|
||||||
export default class ContentView extends Enum {
|
export default class ContentView extends Enum {
|
||||||
}
|
}
|
||||||
ContentView.initEnum(['PDF', 'TEXT', 'MARKDOWN'])
|
ContentView.initEnum(['PDF', 'BLOCK', 'MARKDOWN'])
|
78
src/javascript/models/transformations/ToBlockSystem.jsx
Normal file
78
src/javascript/models/transformations/ToBlockSystem.jsx
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
import Transformation from './Transformation.jsx';
|
||||||
|
import PdfPage from '../PdfPage.jsx';
|
||||||
|
import BlockPage from '../BlockPage.jsx';
|
||||||
|
import ContentView from '../ContentView.jsx';
|
||||||
|
|
||||||
|
export default class ToBlockSystem extends Transformation {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super("To Block System");
|
||||||
|
}
|
||||||
|
|
||||||
|
contentView() {
|
||||||
|
return ContentView.BLOCK;
|
||||||
|
}
|
||||||
|
|
||||||
|
showPageSelection() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
transform(pages:PdfPage[]) {
|
||||||
|
const blocks = [];
|
||||||
|
pages.forEach(page => {
|
||||||
|
var minDiff = 99;
|
||||||
|
var lastY = 0;
|
||||||
|
page.textItems.forEach(item => {
|
||||||
|
if (lastY > 0) {
|
||||||
|
const yDiff = lastY - item.y - item.height;
|
||||||
|
if (yDiff > 0) {
|
||||||
|
minDiff = Math.min(minDiff, yDiff);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lastY = item.y;
|
||||||
|
});
|
||||||
|
|
||||||
|
var text;
|
||||||
|
const rollup = (category) => {
|
||||||
|
if (text && text.length > 0) {
|
||||||
|
// console.debug("Push[" + blocks.length + "]: " + text);
|
||||||
|
blocks.push({
|
||||||
|
category: category,
|
||||||
|
text: text
|
||||||
|
});
|
||||||
|
}
|
||||||
|
text = null;
|
||||||
|
};
|
||||||
|
|
||||||
|
lastY = 0;
|
||||||
|
page.textItems.forEach(item => {
|
||||||
|
if (item.markdownElement) {
|
||||||
|
rollup("Block");
|
||||||
|
text = item.markdownElement.transformText(item.text);
|
||||||
|
rollup(item.markdownElement.constructor.name);
|
||||||
|
} else if (!text) {
|
||||||
|
text = item.text;
|
||||||
|
} else {
|
||||||
|
const yDiff = lastY - item.y - item.height;
|
||||||
|
if (yDiff > minDiff + 2) {
|
||||||
|
rollup("Block");
|
||||||
|
text = item.text;
|
||||||
|
} else {
|
||||||
|
text += '\n' + item.text;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lastY = item.y;
|
||||||
|
});
|
||||||
|
rollup("Block")
|
||||||
|
});
|
||||||
|
return [new BlockPage({
|
||||||
|
index: 0,
|
||||||
|
blocks: blocks
|
||||||
|
})];
|
||||||
|
}
|
||||||
|
|
||||||
|
processAnnotations(pages) {
|
||||||
|
return pages;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -18,7 +18,11 @@ export default class ToMarkdown extends Transformation {
|
|||||||
|
|
||||||
transform(pages:TextPage[]) {
|
transform(pages:TextPage[]) {
|
||||||
var text = '';
|
var text = '';
|
||||||
pages.forEach(page => text += page.text + '\n');
|
pages.forEach(page => {
|
||||||
|
page.blocks.forEach((block) => {
|
||||||
|
text += block.text + '\n\n';
|
||||||
|
});
|
||||||
|
});
|
||||||
return [new TextPage({
|
return [new TextPage({
|
||||||
index: 0,
|
index: 0,
|
||||||
text: text
|
text: text
|
||||||
|
Loading…
Reference in New Issue
Block a user