mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-24 06:29:06 +01:00
Replace text with block system
This commit is contained in:
parent
1ca9fa4362
commit
3a1241896b
@ -11,7 +11,7 @@ import Checkbox from 'react-bootstrap/lib/Checkbox'
|
||||
|
||||
import ContentView from '../models/ContentView.jsx';
|
||||
import PdfPageView from './debug/PdfPageView.jsx';
|
||||
import TextPageView from './debug/TextPageView.jsx';
|
||||
import BlockPageView from './debug/BlockPageView.jsx';
|
||||
import MarkdownPageView from './debug/MarkdownPageView.jsx';
|
||||
|
||||
// A view which displays the content of the given pages transformed by the given transformations
|
||||
@ -88,8 +88,8 @@ export default class DebugView extends React.Component {
|
||||
pageComponents = transformedPages.map(page => <PdfPageView key={ page.index } pdfPage={ page } modificationsOnly={ this.state.modificationsOnly } />);
|
||||
showModificationCheckbox = true;
|
||||
break;
|
||||
case ContentView.TEXT:
|
||||
pageComponents = transformedPages.map(page => <TextPageView key={ page.index } page={ page } />);
|
||||
case ContentView.BLOCK:
|
||||
pageComponents = transformedPages.map(page => <BlockPageView key={ page.index } page={ page } />);
|
||||
break;
|
||||
case ContentView.MARKDOWN:
|
||||
pageComponents = transformedPages.map(page => <MarkdownPageView key={ page.index } page={ page } />);
|
||||
|
49
src/javascript/components/debug/BlockPageView.jsx
Normal file
49
src/javascript/components/debug/BlockPageView.jsx
Normal file
@ -0,0 +1,49 @@
|
||||
import React from 'react';
|
||||
|
||||
import Table from 'react-bootstrap/lib/Table'
|
||||
|
||||
export default class BlockPageView extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
page: React.PropTypes.object.isRequired,
|
||||
};
|
||||
|
||||
render() {
|
||||
var blocks = this.props.page.blocks;
|
||||
|
||||
const content = <div>
|
||||
<Table responsive>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>
|
||||
#
|
||||
</th>
|
||||
<th>
|
||||
Category
|
||||
</th>
|
||||
<th>
|
||||
Text
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{ blocks.map((block, i) => <tr key={ i }>
|
||||
<td>
|
||||
{ i }
|
||||
</td>
|
||||
<td>
|
||||
{ block.category }
|
||||
</td>
|
||||
<td>
|
||||
<pre style={ { display: 'inline-block' } }>{ block.text }</pre>
|
||||
</td>
|
||||
</tr>
|
||||
) }
|
||||
</tbody>
|
||||
</Table>
|
||||
</div>
|
||||
return (
|
||||
content
|
||||
);
|
||||
}
|
||||
}
|
@ -1,24 +0,0 @@
|
||||
import React from 'react';
|
||||
|
||||
export default class TextPageView extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
page: React.PropTypes.object.isRequired,
|
||||
};
|
||||
|
||||
render() {
|
||||
const header = "Page " + (this.props.page.index + 1);
|
||||
return (
|
||||
<div>
|
||||
<h2>{ header }</h2>
|
||||
<textarea
|
||||
rows="45"
|
||||
cols="150"
|
||||
value={ this.props.page.text }
|
||||
readOnly="readonly">
|
||||
</textarea>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
}
|
@ -9,7 +9,7 @@ import DetectLinks from './transformations/DetectLinks.jsx'
|
||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
||||
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||
import ToTextPages from './transformations/ToTextPages.jsx';
|
||||
import ToBlockSystem from './transformations/ToBlockSystem.jsx';
|
||||
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
||||
|
||||
// Holds the state of the Application
|
||||
@ -30,7 +30,7 @@ export default class AppState {
|
||||
new RemoveRepetitiveElements(),
|
||||
new HeadlineDetector(),
|
||||
new HeadlineToUppercase(),
|
||||
new ToTextPages(),
|
||||
new ToBlockSystem(),
|
||||
new ToMarkdown()];
|
||||
|
||||
//bind functions
|
||||
|
9
src/javascript/models/BlockPage.jsx
Normal file
9
src/javascript/models/BlockPage.jsx
Normal file
@ -0,0 +1,9 @@
|
||||
// A page which holds blocks displayable via BlockPageView
|
||||
export default class BlockPage {
|
||||
|
||||
constructor(options) {
|
||||
this.index = options.index;
|
||||
this.blocks = options.blocks;
|
||||
}
|
||||
|
||||
}
|
@ -2,4 +2,4 @@ import { Enum } from 'enumify';
|
||||
|
||||
export default class ContentView extends Enum {
|
||||
}
|
||||
ContentView.initEnum(['PDF', 'TEXT', 'MARKDOWN'])
|
||||
ContentView.initEnum(['PDF', 'BLOCK', 'MARKDOWN'])
|
78
src/javascript/models/transformations/ToBlockSystem.jsx
Normal file
78
src/javascript/models/transformations/ToBlockSystem.jsx
Normal file
@ -0,0 +1,78 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import BlockPage from '../BlockPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
export default class ToBlockSystem extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("To Block System");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.BLOCK;
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return false;
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
const blocks = [];
|
||||
pages.forEach(page => {
|
||||
var minDiff = 99;
|
||||
var lastY = 0;
|
||||
page.textItems.forEach(item => {
|
||||
if (lastY > 0) {
|
||||
const yDiff = lastY - item.y - item.height;
|
||||
if (yDiff > 0) {
|
||||
minDiff = Math.min(minDiff, yDiff);
|
||||
}
|
||||
}
|
||||
lastY = item.y;
|
||||
});
|
||||
|
||||
var text;
|
||||
const rollup = (category) => {
|
||||
if (text && text.length > 0) {
|
||||
// console.debug("Push[" + blocks.length + "]: " + text);
|
||||
blocks.push({
|
||||
category: category,
|
||||
text: text
|
||||
});
|
||||
}
|
||||
text = null;
|
||||
};
|
||||
|
||||
lastY = 0;
|
||||
page.textItems.forEach(item => {
|
||||
if (item.markdownElement) {
|
||||
rollup("Block");
|
||||
text = item.markdownElement.transformText(item.text);
|
||||
rollup(item.markdownElement.constructor.name);
|
||||
} else if (!text) {
|
||||
text = item.text;
|
||||
} else {
|
||||
const yDiff = lastY - item.y - item.height;
|
||||
if (yDiff > minDiff + 2) {
|
||||
rollup("Block");
|
||||
text = item.text;
|
||||
} else {
|
||||
text += '\n' + item.text;
|
||||
}
|
||||
}
|
||||
lastY = item.y;
|
||||
});
|
||||
rollup("Block")
|
||||
});
|
||||
return [new BlockPage({
|
||||
index: 0,
|
||||
blocks: blocks
|
||||
})];
|
||||
}
|
||||
|
||||
processAnnotations(pages) {
|
||||
return pages;
|
||||
}
|
||||
|
||||
}
|
@ -18,7 +18,11 @@ export default class ToMarkdown extends Transformation {
|
||||
|
||||
transform(pages:TextPage[]) {
|
||||
var text = '';
|
||||
pages.forEach(page => text += page.text + '\n');
|
||||
pages.forEach(page => {
|
||||
page.blocks.forEach((block) => {
|
||||
text += block.text + '\n\n';
|
||||
});
|
||||
});
|
||||
return [new TextPage({
|
||||
index: 0,
|
||||
text: text
|
||||
|
Loading…
Reference in New Issue
Block a user