mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-22 07:43:46 +01:00
WIP globalize display of globals and summary/messages
This commit is contained in:
parent
62fd0155ed
commit
b7db48af4b
@ -8,6 +8,8 @@ import Pagination from 'react-bootstrap/lib/Pagination'
|
||||
import MenuItem from 'react-bootstrap/lib/MenuItem'
|
||||
import Label from 'react-bootstrap/lib/Label'
|
||||
import Checkbox from 'react-bootstrap/lib/Checkbox'
|
||||
import Collapse from 'react-bootstrap/lib/Collapse'
|
||||
import Panel from 'react-bootstrap/lib/Panel'
|
||||
|
||||
import ParseResult from '../models/ParseResult.jsx';
|
||||
|
||||
@ -24,7 +26,8 @@ export default class DebugView extends React.Component {
|
||||
this.state = {
|
||||
currentTransformation: 0,
|
||||
pageNr: -1,
|
||||
modificationsOnly: false
|
||||
modificationsOnly: false,
|
||||
showStatistics: false
|
||||
};
|
||||
}
|
||||
|
||||
@ -58,6 +61,13 @@ export default class DebugView extends React.Component {
|
||||
});
|
||||
}
|
||||
|
||||
showStatistics() {
|
||||
this.setState({
|
||||
showStatistics: !this.state.showStatistics
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
|
||||
render() {
|
||||
const {currentTransformation, pageNr} = this.state;
|
||||
@ -78,9 +88,18 @@ export default class DebugView extends React.Component {
|
||||
}
|
||||
|
||||
parseResult.content = parseResult.content.filter((elem, i) => pageNr == -1 || i == pageNr);
|
||||
const summaryComponent = lastTransformation.createSummaryView(parseResult);
|
||||
const pageComponents = parseResult.content.map(page => lastTransformation.createPageView(page, this.state.modificationsOnly));
|
||||
const showModificationCheckbox = lastTransformation.showModificationCheckbox();
|
||||
const statisticsAsList = Object.keys(parseResult.globals).map((key, i) => {
|
||||
return <li key={ i }>
|
||||
{ key + ': ' + parseResult.globals[key] }
|
||||
</li>
|
||||
});
|
||||
const messagesAsList = parseResult.messages.map((message, i) => {
|
||||
return <li key={ i }>
|
||||
{ message }
|
||||
</li>
|
||||
});
|
||||
|
||||
return (
|
||||
<div>
|
||||
@ -103,7 +122,7 @@ export default class DebugView extends React.Component {
|
||||
ellipsis
|
||||
boundaryLinks
|
||||
items={ pdfPages.length }
|
||||
maxButtons={ 18 }
|
||||
maxButtons={ 17 }
|
||||
activePage={ this.state.pageNr + 1 }
|
||||
onSelect={ this.selectPage.bind(this) } />
|
||||
</div>
|
||||
@ -141,6 +160,11 @@ export default class DebugView extends React.Component {
|
||||
Show only modifications
|
||||
</Checkbox> }
|
||||
</ButtonGroup>
|
||||
<ButtonGroup>
|
||||
<Checkbox onClick={ ::this.showStatistics }>
|
||||
Show Statistics
|
||||
</Checkbox>
|
||||
</ButtonGroup>
|
||||
</ButtonToolbar>
|
||||
</td>
|
||||
<td style={ { padding: '5px' } }>
|
||||
@ -150,10 +174,24 @@ export default class DebugView extends React.Component {
|
||||
</Label>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<Collapse in={ this.state.showStatistics }>
|
||||
<Panel bsStyle="default">
|
||||
<ul>
|
||||
{ statisticsAsList }
|
||||
</ul>
|
||||
</Panel>
|
||||
</Collapse>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<hr/>
|
||||
{ summaryComponent }
|
||||
{ !this.state.showStatistics &&
|
||||
<hr style={ { marginTop: '5px' } } /> }
|
||||
<ul>
|
||||
{ messagesAsList }
|
||||
</ul>
|
||||
{ pageComponents }
|
||||
</div>
|
||||
);
|
||||
|
@ -81,7 +81,7 @@ export default class TextItemTable extends React.Component {
|
||||
)
|
||||
|
||||
return (
|
||||
<Table responsive bordered>
|
||||
<Table responsive condensed bordered>
|
||||
{ tableHeader }
|
||||
<tbody>
|
||||
{ textItemRows }
|
||||
|
@ -34,6 +34,7 @@ export default class AppState {
|
||||
new DetectTOC(),
|
||||
new DetectLists(),
|
||||
new DetectCodeBlocks(),
|
||||
|
||||
// new DetectFormats(),
|
||||
// new CombineSameY(),
|
||||
// new RemoveWhitespaces(),
|
||||
|
@ -3,8 +3,8 @@ export default class ParseResult {
|
||||
|
||||
constructor(options) {
|
||||
this.content = options.content; // like PdfPages[]
|
||||
this.summary = options.summary; // something to show only for the transformation
|
||||
this.globals = options.globals; // properties accessible for the following transformations
|
||||
this.globals = options.globals; // properties accessible for all the following transformations in debug mode
|
||||
this.messages = options.messages; // something to show only for the transformation in debug mode
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,4 +1,3 @@
|
||||
import React from 'react';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
|
||||
@ -8,38 +7,6 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
||||
super("Calculate Statistics");
|
||||
}
|
||||
|
||||
createSummaryView(parseResult:ParseResult) {
|
||||
return <div>
|
||||
<ul>
|
||||
<li>
|
||||
{ 'Most-used height: ' + parseResult.globals.mostUsedHeight + ' ' }
|
||||
</li>
|
||||
<li>
|
||||
{ 'Most-used font: ' + parseResult.globals.mostUsedFont + ' ' }
|
||||
</li>
|
||||
<li>
|
||||
{ 'Most-used distance: ' + parseResult.globals.mostUsedDistance + ' ' }
|
||||
</li>
|
||||
<li>
|
||||
{ 'Max height: ' + parseResult.globals.maxHeight + ' ' }
|
||||
</li>
|
||||
<li>
|
||||
{ 'Max height font: ' + parseResult.globals.maxHeightFont + ' ' }
|
||||
</li>
|
||||
<hr/>
|
||||
<li>
|
||||
{ 'Items per height: ' + JSON.stringify(parseResult.summary.heightToOccurrence) + ' ' }
|
||||
</li>
|
||||
<li>
|
||||
{ 'Items per font: ' + JSON.stringify(parseResult.summary.fontToOccurrence) + ' ' }
|
||||
</li>
|
||||
<li>
|
||||
{ 'Items per distance: ' + JSON.stringify(parseResult.summary.distanceToOccurrence) + ' ' }
|
||||
</li>
|
||||
</ul>
|
||||
</div>;
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
|
||||
// Parse heights
|
||||
@ -102,11 +69,11 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
||||
maxHeight: maxHeight,
|
||||
maxHeightFont: maxHeightFont,
|
||||
},
|
||||
summary: {
|
||||
heightToOccurrence: heightToOccurrence,
|
||||
fontToOccurrence: fontToOccurrence,
|
||||
distanceToOccurrence: distanceToOccurrence,
|
||||
}
|
||||
messages: [
|
||||
'Items per height: ' + JSON.stringify(heightToOccurrence),
|
||||
'Items per font: ' + JSON.stringify(fontToOccurrence),
|
||||
'Items per distance: ' + JSON.stringify(distanceToOccurrence)
|
||||
]
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
import React from 'react';
|
||||
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import PdfBlock from '../PdfBlock.jsx';
|
||||
@ -14,13 +13,6 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
|
||||
super("Detect Code/Quotes");
|
||||
}
|
||||
|
||||
createSummaryView(parseResult:ParseResult) {
|
||||
return <div>
|
||||
Detected
|
||||
{ ' ' + parseResult.summary.foundBlocks + ' ' } code/quote blocks.
|
||||
</div>;
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedHeight, mostUsedDistance} = parseResult.globals;
|
||||
|
||||
@ -83,9 +75,7 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
summary: {
|
||||
foundBlocks: foundBlocks
|
||||
}
|
||||
messages: ['Detected ' + foundBlocks + ' code/quote blocks.']
|
||||
});
|
||||
|
||||
}
|
||||
|
@ -1,4 +1,3 @@
|
||||
import React from 'react';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
@ -12,14 +11,6 @@ export default class DetectFootnotes extends ToPdfViewTransformation {
|
||||
super("Detect Footnotes");
|
||||
}
|
||||
|
||||
createSummaryView(parseResult:ParseResult) {
|
||||
return <div>
|
||||
Detected
|
||||
{ ' ' + parseResult.summary.footnotes + ' ' } footnotes.
|
||||
</div>;
|
||||
}
|
||||
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
|
||||
var nextFooterNumber = 1;
|
||||
@ -60,12 +51,11 @@ export default class DetectFootnotes extends ToPdfViewTransformation {
|
||||
textItems: newTextItems
|
||||
};
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: newContent,
|
||||
summary: {
|
||||
footnotes: foundFootnotes
|
||||
}
|
||||
messages: ['Detected ' + foundFootnotes + ' footnotes']
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
import React from 'react';
|
||||
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
@ -15,13 +14,6 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
|
||||
super("Detect Lists");
|
||||
}
|
||||
|
||||
createSummaryView(parseResult:ParseResult) {
|
||||
return <div>
|
||||
Detected
|
||||
{ ' ' + parseResult.summary.foundBlocks + ' ' } list blocks.
|
||||
</div>;
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var foundBlocks = 0;
|
||||
@ -111,9 +103,7 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
summary: {
|
||||
foundBlocks: foundBlocks
|
||||
}
|
||||
messages: ['Detected ' + foundBlocks + ' list blocks.']
|
||||
});
|
||||
|
||||
}
|
||||
|
@ -1,4 +1,3 @@
|
||||
import React from 'react';
|
||||
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import PdfBlockPage from '../PdfBlockPage.jsx';
|
||||
@ -11,13 +10,6 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
|
||||
super("Detect Blocks");
|
||||
}
|
||||
|
||||
createSummaryView(parseResult:ParseResult) {
|
||||
return <div>
|
||||
Splitted into
|
||||
{ ' ' + parseResult.summary.createdBlocks + ' ' } blocks.
|
||||
</div>;
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var createdBlocks = 0;
|
||||
@ -53,12 +45,11 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: newContent,
|
||||
summary: {
|
||||
createdBlocks: createdBlocks
|
||||
}
|
||||
messages: ['Splitted into ' + createdBlocks + ' blocks']
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
import React from 'react';
|
||||
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
@ -15,14 +14,6 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
super("Detect Table of Contents");
|
||||
}
|
||||
|
||||
createSummaryView(parseResult:ParseResult) {
|
||||
return <div>
|
||||
Detected
|
||||
{ ' ' + parseResult.summary.foundTocPages + ' ' } table of content pages.
|
||||
</div>;
|
||||
}
|
||||
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var foundTocPages = 0;
|
||||
@ -82,9 +73,7 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
summary: {
|
||||
foundTocPages: foundTocPages
|
||||
}
|
||||
messages: ['Detected ' + foundTocPages + ' table of content pages']
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
import React from 'react';
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
@ -27,19 +26,6 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
|
||||
super("Remove Repetitive Elements");
|
||||
}
|
||||
|
||||
createSummaryView(parseResult:ParseResult) {
|
||||
return <div>
|
||||
<ul>
|
||||
<li>
|
||||
{ 'Removed Header: ' + parseResult.summary.removedHeader + ' ' }
|
||||
</li>
|
||||
<li>
|
||||
{ 'Removed Footers: ' + parseResult.summary.removedFooter + ' ' }
|
||||
</li>
|
||||
</ul>
|
||||
</div>;
|
||||
}
|
||||
|
||||
// The idea is the following:
|
||||
// - For each page, collect all items of the first, and all items of the last line
|
||||
// - Calculate how often these items occur across all pages (hash ignoring numbers, whitespace, upper/lowercase)
|
||||
@ -104,18 +90,11 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
summary: {
|
||||
removedHeader: removedHeader,
|
||||
removedFooter: removedFooter,
|
||||
}
|
||||
messages: [
|
||||
'Removed Header: ' + removedHeader,
|
||||
'Removed Footers: ' + removedFooter
|
||||
]
|
||||
});
|
||||
}
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
parseResult.content.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
}
|
@ -33,6 +33,7 @@ export default class ToPdfBlockViewTransformation extends Transformation {
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
// The usual cleanup
|
||||
parseResult.messages = [];
|
||||
parseResult.content.forEach(page => {
|
||||
page.blocks = page.blocks.filter(block => !block.annotation || block.annotation !== REMOVED_ANNOTATION);
|
||||
page.blocks.forEach(block => block.annotation = null);
|
||||
|
@ -1,6 +1,8 @@
|
||||
import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import PdfPageView from '../../components/debug/PdfPageView.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
// Abstract class for transformations producing a PdfPage to be shown in the PdfView
|
||||
export default class ToPdfViewTransformation extends Transformation {
|
||||
@ -29,4 +31,15 @@ export default class ToPdfViewTransformation extends Transformation {
|
||||
showWhitespaces={ this.showWhitespaces } />;
|
||||
}
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
// The usual cleanup
|
||||
parseResult.messages = [];
|
||||
parseResult.content.forEach(page => {
|
||||
page.textItems = page.textItems.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION);
|
||||
page.textItems.forEach(block => block.annotation = null);
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -21,10 +21,6 @@ export default class Transformation {
|
||||
return false;
|
||||
}
|
||||
|
||||
createSummaryView(parseResult:ParseResult) { // eslint-disable-line no-unused-vars
|
||||
return null;
|
||||
}
|
||||
|
||||
createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars
|
||||
throw new TypeError("Do not call abstract method foo from child.");
|
||||
}
|
||||
@ -36,6 +32,7 @@ export default class Transformation {
|
||||
|
||||
// Sometimes transform() only visualizes a change. This method then performs the actual change.
|
||||
completeTransform(parseResult: ParseResult) { // eslint-disable-line no-unused-vars
|
||||
parseResult.messages = [];
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
|
@ -11,6 +11,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
var foundVerticals = 0;
|
||||
const newContent = parseResult.content.map(page => {
|
||||
const newTextItems = [];
|
||||
// var oneCharacterItems = [];
|
||||
@ -65,6 +66,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
|
||||
text: combinedText,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
foundVerticals++;
|
||||
} else {
|
||||
oneCharacterItems.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
|
||||
}
|
||||
@ -87,15 +89,9 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: newContent,
|
||||
messages: ["Converted " + foundVerticals + " verticals"]
|
||||
});
|
||||
}
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
parseResult.content.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user