Replace text with block system

This commit is contained in:
Johannes Zillmann 2017-02-12 19:37:21 +01:00
parent 1ca9fa4362
commit 3a1241896b
8 changed files with 147 additions and 31 deletions

View File

@ -11,7 +11,7 @@ import Checkbox from 'react-bootstrap/lib/Checkbox'
import ContentView from '../models/ContentView.jsx';
import PdfPageView from './debug/PdfPageView.jsx';
import TextPageView from './debug/TextPageView.jsx';
import BlockPageView from './debug/BlockPageView.jsx';
import MarkdownPageView from './debug/MarkdownPageView.jsx';
// A view which displays the content of the given pages transformed by the given transformations
@ -88,8 +88,8 @@ export default class DebugView extends React.Component {
pageComponents = transformedPages.map(page => <PdfPageView key={ page.index } pdfPage={ page } modificationsOnly={ this.state.modificationsOnly } />);
showModificationCheckbox = true;
break;
case ContentView.TEXT:
pageComponents = transformedPages.map(page => <TextPageView key={ page.index } page={ page } />);
case ContentView.BLOCK:
pageComponents = transformedPages.map(page => <BlockPageView key={ page.index } page={ page } />);
break;
case ContentView.MARKDOWN:
pageComponents = transformedPages.map(page => <MarkdownPageView key={ page.index } page={ page } />);

View File

@ -0,0 +1,49 @@
import React from 'react';
import Table from 'react-bootstrap/lib/Table'
export default class BlockPageView extends React.Component {
static propTypes = {
page: React.PropTypes.object.isRequired,
};
render() {
var blocks = this.props.page.blocks;
const content = <div>
<Table responsive>
<thead>
<tr>
<th>
#
</th>
<th>
Category
</th>
<th>
Text
</th>
</tr>
</thead>
<tbody>
{ blocks.map((block, i) => <tr key={ i }>
<td>
{ i }
</td>
<td>
{ block.category }
</td>
<td>
<pre style={ { display: 'inline-block' } }>{ block.text }</pre>
</td>
</tr>
) }
</tbody>
</Table>
</div>
return (
content
);
}
}

View File

@ -1,24 +0,0 @@
import React from 'react';
export default class TextPageView extends React.Component {
static propTypes = {
page: React.PropTypes.object.isRequired,
};
render() {
const header = "Page " + (this.props.page.index + 1);
return (
<div>
<h2>{ header }</h2>
<textarea
rows="45"
cols="150"
value={ this.props.page.text }
readOnly="readonly">
</textarea>
</div>
);
}
}

View File

@ -9,7 +9,7 @@ import DetectLinks from './transformations/DetectLinks.jsx'
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
import ToTextPages from './transformations/ToTextPages.jsx';
import ToBlockSystem from './transformations/ToBlockSystem.jsx';
import ToMarkdown from './transformations/ToMarkdown.jsx'
// Holds the state of the Application
@ -30,7 +30,7 @@ export default class AppState {
new RemoveRepetitiveElements(),
new HeadlineDetector(),
new HeadlineToUppercase(),
new ToTextPages(),
new ToBlockSystem(),
new ToMarkdown()];
//bind functions

View File

@ -0,0 +1,9 @@
// Page model carrying a list of blocks; BlockPageView renders pages of this shape.
export default class BlockPage {

    // options.index  - position of the page
    // options.blocks - the blocks belonging to the page
    constructor(options) {
        const { index, blocks } = options;
        this.index = index;
        this.blocks = blocks;
    }
}

View File

@ -2,4 +2,4 @@ import { Enum } from 'enumify';
// Enum (built on enumify's Enum) naming the kinds of content view; transformations return
// one of its members from contentView() and the debug view switches on it to pick a page component.
export default class ContentView extends Enum {
}
// NOTE(review): two initEnum calls are listed because this is a diff hunk; presumably the
// 'TEXT' variant is the line being replaced and the 'BLOCK' variant is its replacement.
// Confirm against the actual file, which should contain only one of them.
ContentView.initEnum(['PDF', 'TEXT', 'MARKDOWN'])
ContentView.initEnum(['PDF', 'BLOCK', 'MARKDOWN'])

View File

@ -0,0 +1,78 @@
import Transformation from './Transformation.jsx';
import PdfPage from '../PdfPage.jsx';
import BlockPage from '../BlockPage.jsx';
import ContentView from '../ContentView.jsx';
export default class ToBlockSystem extends Transformation {
constructor() {
super("To Block System");
}
contentView() {
return ContentView.BLOCK;
}
showPageSelection() {
return false;
}
transform(pages:PdfPage[]) {
const blocks = [];
pages.forEach(page => {
var minDiff = 99;
var lastY = 0;
page.textItems.forEach(item => {
if (lastY > 0) {
const yDiff = lastY - item.y - item.height;
if (yDiff > 0) {
minDiff = Math.min(minDiff, yDiff);
}
}
lastY = item.y;
});
var text;
const rollup = (category) => {
if (text && text.length > 0) {
// console.debug("Push[" + blocks.length + "]: " + text);
blocks.push({
category: category,
text: text
});
}
text = null;
};
lastY = 0;
page.textItems.forEach(item => {
if (item.markdownElement) {
rollup("Block");
text = item.markdownElement.transformText(item.text);
rollup(item.markdownElement.constructor.name);
} else if (!text) {
text = item.text;
} else {
const yDiff = lastY - item.y - item.height;
if (yDiff > minDiff + 2) {
rollup("Block");
text = item.text;
} else {
text += '\n' + item.text;
}
}
lastY = item.y;
});
rollup("Block")
});
return [new BlockPage({
index: 0,
blocks: blocks
})];
}
processAnnotations(pages) {
return pages;
}
}

View File

@ -18,7 +18,11 @@ export default class ToMarkdown extends Transformation {
transform(pages:TextPage[]) {
var text = '';
pages.forEach(page => text += page.text + '\n');
pages.forEach(page => {
page.blocks.forEach((block) => {
text += block.text + '\n\n';
});
});
return [new TextPage({
index: 0,
text: text