[WIP] Cleanup page / item handling

This commit is contained in:
Johannes Zillmann 2017-03-07 21:59:15 +01:00
parent 6f69566e98
commit 111124fbf3
42 changed files with 527 additions and 399 deletions

View File

@ -28,10 +28,10 @@ export default class App extends React.Component {
mainView = <LoadingView fileBuffer={ appState.fileBuffer } storePdfPagesFunction={ appState.storePdfPages } />
break;
case View.RESULT:
mainView = <ResultView pdfPages={ appState.pdfPages } transformations={ appState.transformations } />
mainView = <ResultView pages={ appState.pages } transformations={ appState.transformations } />
break;
case View.DEBUG:
mainView = <DebugView pdfPages={ appState.pdfPages } transformations={ appState.transformations } />
mainView = <DebugView pages={ appState.pages } transformations={ appState.transformations } />
break;
default:
throw `View ${this.props.appState.mainView} not supported!`;
@ -46,7 +46,7 @@ export default class App extends React.Component {
</div>
</Grid>
</div>
);
);
}
}

View File

@ -17,7 +17,7 @@ import ParseResult from '../models/ParseResult.jsx';
export default class DebugView extends React.Component {
static propTypes = {
pdfPages: React.PropTypes.array.isRequired,
pages: React.PropTypes.array.isRequired,
transformations: React.PropTypes.array.isRequired,
};
@ -71,12 +71,12 @@ export default class DebugView extends React.Component {
render() {
const {currentTransformation, pageNr} = this.state;
const {pdfPages, transformations} = this.props;
const {pages, transformations} = this.props;
const currentTransformationName = transformations[currentTransformation].name;
var parseResult = new ParseResult({
content: pdfPages
pages: pages
});
var lastTransformation;
for (var i = 0; i <= currentTransformation; i++) {
@ -87,8 +87,8 @@ export default class DebugView extends React.Component {
lastTransformation = transformations[i];
}
parseResult.content = parseResult.content.filter((elem, i) => pageNr == -1 || i == pageNr);
const pageComponents = parseResult.content.map(page => lastTransformation.createPageView(page, this.state.modificationsOnly));
parseResult.pages = parseResult.pages.filter((elem, i) => pageNr == -1 || i == pageNr);
const pageComponents = parseResult.pages.map(page => lastTransformation.createPageView(page, this.state.modificationsOnly));
const showModificationCheckbox = lastTransformation.showModificationCheckbox();
const statisticsAsList = Object.keys(parseResult.globals).map((key, i) => {
return <li key={ i }>
@ -121,7 +121,7 @@ export default class DebugView extends React.Component {
last
ellipsis
boundaryLinks
items={ pdfPages.length }
items={ pages.length }
maxButtons={ 17 }
activePage={ this.state.pageNr + 1 }
onSelect={ this.selectPage.bind(this) } />
@ -194,6 +194,6 @@ export default class DebugView extends React.Component {
</ul>
{ pageComponents }
</div>
);
);
}
}

View File

@ -3,7 +3,7 @@ import React from 'react';
import pdfjs from 'pdfjs-dist'; // eslint-disable-line no-unused-vars
import { Line } from 'rc-progress';
import PdfPage from '../models/PdfPage.jsx';
import Page from '../models/Page.jsx';
import TextItem from '../models/TextItem.jsx';
export default class LoadingView extends React.Component {
@ -17,19 +17,19 @@ export default class LoadingView extends React.Component {
super(props);
this.state = {
parsedPages: 0,
pdfPages: []
pages: []
};
}
anounceInitialParse(pdfPages) {
anounceInitialParse(pages) {
this.setState({
pdfPages: pdfPages
pages: pages
});
}
anouncePageParsed(index, textItems) {
//TODO might make problems.. concat unordered and order at the end ?
this.state.pdfPages[index].textItems = textItems; // eslint-disable-line react/no-direct-mutation-state
this.state.pages[index].items = textItems; // eslint-disable-line react/no-direct-mutation-state
this.setState({
parsedPages: this.state.parsedPages + 1
});
@ -44,13 +44,13 @@ export default class LoadingView extends React.Component {
// console.debug(pdfDocument);
const numPages = pdfDocument.numPages;
// const numPages = 4; // hack
var pdfPages = [];
var pages = [];
for (var i = 0; i < numPages; i++) {
pdfPages.push(new PdfPage({
pages.push(new Page({
index: i
}));
}
anounceInitialParseFunction(pdfPages);
anounceInitialParseFunction(pages);
for (var j = 1; j <= numPages; j++) {
pdfDocument.getPage(j).then(function(page) {
var scale = 1.0;
@ -96,14 +96,14 @@ export default class LoadingView extends React.Component {
}
render() {
const {parsedPages, pdfPages} = this.state;
const {parsedPages, pages} = this.state;
var percentDone = 0;
var details = '';
if (pdfPages.length > 0) {
percentDone = parsedPages / pdfPages.length * 100;
details = parsedPages + ' / ' + pdfPages.length
if (parsedPages == pdfPages.length) {
this.props.storePdfPagesFunction(this.state.pdfPages);
if (pages.length > 0) {
percentDone = parsedPages / pages.length * 100;
details = parsedPages + ' / ' + pages.length
if (parsedPages == pages.length) {
this.props.storePdfPagesFunction(this.state.pages);
}
}
return (

View File

@ -10,7 +10,7 @@ import ParseResult from '../models/ParseResult.jsx';
export default class ResultView extends React.Component {
static propTypes = {
pdfPages: React.PropTypes.array.isRequired,
pages: React.PropTypes.array.isRequired,
transformations: React.PropTypes.array.isRequired,
};
@ -19,9 +19,9 @@ export default class ResultView extends React.Component {
}
componentWillMount() {
const {pdfPages, transformations} = this.props;
const {pages, transformations} = this.props;
var parseResult = new ParseResult({
content: pdfPages
pages: pages
});
var lastTransformation;
transformations.forEach(transformation => {
@ -32,10 +32,15 @@ export default class ResultView extends React.Component {
lastTransformation = transformation;
});
var text = '';
parseResult.pages.forEach(page => {
page.items.forEach(item => {
text += item + '\n';
});
});
this.state = {
preview: true,
text: parseResult.content[0].text
text: text
};
}
@ -90,7 +95,7 @@ export default class ResultView extends React.Component {
<hr/>
{ textComponent }
</div>
);
);
}
}

View File

@ -1,49 +0,0 @@
import React from 'react';
import Table from 'react-bootstrap/lib/Table'
export default class BlockPageView extends React.Component {
static propTypes = {
page: React.PropTypes.object.isRequired,
};
render() {
var blocks = this.props.page.blocks;
const content = <div>
<Table responsive>
<thead>
<tr>
<th>
#
</th>
<th>
Category
</th>
<th>
Text
</th>
</tr>
</thead>
<tbody>
{ blocks.map((block, i) => <tr key={ i }>
<td>
{ i }
</td>
<td>
{ block.category }
</td>
<td>
<pre style={ { display: 'inline-block' } }>{ block.text }</pre>
</td>
</tr>
) }
</tbody>
</Table>
</div>
return (
content
);
}
}

View File

@ -1,23 +1,17 @@
import React from 'react';
import PageView from './PageView.jsx';
import Remarkable from 'remarkable';
export default class MarkdownPageView extends React.Component {
export default class MarkdownPageView extends PageView {
static propTypes = {
page: React.PropTypes.object.isRequired,
};
render() {
createItemViews(items, showWhitespaces) { // eslint-disable-line no-unused-vars
const remarkable = new Remarkable({
breaks: true
});
const html = remarkable.render(this.props.page.text);
return (
<div>
<div dangerouslySetInnerHTML={ { __html: html } } />
</div>
);
const html = remarkable.render(items[0]);
return <div>
<div dangerouslySetInnerHTML={ { __html: html } } />
</div>
}
}

View File

@ -0,0 +1,41 @@
import React from 'react';
// Abstract view for a Page
export default class PageView extends React.Component {
static propTypes = {
page: React.PropTypes.object.isRequired,
modificationsOnly: React.PropTypes.bool,
showWhitespaces: React.PropTypes.bool
};
createItemViews(items, showWhitespaces) { // eslint-disable-line no-unused-vars
throw new TypeError("Do not call abstract method foo from child.");
}
render() {
const {page, modificationsOnly, showWhitespaces} = this.props;
var items = page.items;
if (modificationsOnly) {
items = items.filter(block => block.annotation);
}
var content;
if (items.length == 0 && modificationsOnly) {
content = <div/>
} else {
const itemViews = this.createItemViews(items, showWhitespaces);
const header = "Page " + (page.index + 1);
content = <div>
<h2>{ header }</h2>
<hr/>
{ itemViews }
</div>
}
return (
content
);
}
}

View File

@ -1,35 +0,0 @@
import React from 'react';
import TextItemTable from './TextItemTable.jsx';
// View for a PdfPage
export default class PdfPageView extends React.Component {
static propTypes = {
pdfPage: React.PropTypes.object.isRequired,
modificationsOnly: React.PropTypes.bool.isRequired,
showWhitespaces: React.PropTypes.bool
};
render() {
const {pdfPage, modificationsOnly, showWhitespaces} = this.props;
const header = "Page " + (pdfPage.index + 1);
var textItems = pdfPage.textItems;
if (modificationsOnly) {
textItems = textItems.filter(item => item.annotation);
}
var content;
if (textItems.length == 0 && modificationsOnly) {
content = <div/>
} else {
content = <div>
<h2>{ header }</h2>
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
</div>
}
return (
content
);
}
}

View File

@ -1,24 +1,12 @@
import React from 'react';
import PageView from './PageView.jsx';
import TextItemTable from './TextItemTable.jsx';
// View for a PdfBlockPage
export default class PdfBlockPageView extends React.Component {
// View for a Page which items are of kind TextItemBlock
export default class TextItemBlockPageView extends PageView {
static propTypes = {
pdfPage: React.PropTypes.object.isRequired,
modificationsOnly: React.PropTypes.bool.isRequired,
showWhitespaces: React.PropTypes.bool
};
render() {
const {pdfPage, modificationsOnly, showWhitespaces} = this.props;
var blocks = pdfPage.blocks;
if (modificationsOnly) {
blocks = blocks.filter(block => block.annotation);
}
const blockTables = blocks.map((block, i) => {
createItemViews(items, showWhitespaces) {
const blockTables = items.map((block, i) => {
var textItems = block.textItems;
const blockType = block.type ? ' - ' + block.type : null;
const blockAnnotation = block.annotation ? <span>{ ' - ' + block.annotation.category }</span>
@ -56,19 +44,7 @@ export default class PdfBlockPageView extends React.Component {
</div>
</div>
});
var content;
if (blocks.length == 0 && modificationsOnly) {
content = <div/>
} else {
const header = "Page " + (pdfPage.index + 1);
content = <div>
<h2>{ header }</h2>
{ blockTables }
</div>
}
return (
content
);
return blockTables;
}
}

View File

@ -0,0 +1,12 @@
import React from 'react';
import PageView from './PageView.jsx';
import TextItemTable from './TextItemTable.jsx';
// View for a Page which items are of kind TextItem
export default class TextItemPageView extends PageView {
createItemViews(items, showWhitespaces) {
return <TextItemTable textItems={ items } showWhitespaces={ showWhitespaces } />
}
}

View File

@ -0,0 +1,41 @@
import React from 'react';
import PageView from './PageView.jsx';
import Table from 'react-bootstrap/lib/Table'
export default class TextPageView extends PageView {
createItemViews(items, showWhitespaces) { // eslint-disable-line no-unused-vars
return <div>
<Table responsive>
<thead>
<tr>
<th>
#
</th>
<th>
Category
</th>
<th>
Text
</th>
</tr>
</thead>
<tbody>
{ items.map((block, i) => <tr key={ i }>
<td>
{ i }
</td>
<td>
{ block.category }
</td>
<td>
<pre style={ { display: 'inline-block' } }>{ block.text }</pre>
</td>
</tr>
) }
</tbody>
</Table>
</div>
}
}

View File

@ -26,7 +26,7 @@ export default class AppState {
this.renderFunction = options.renderFunction;
this.mainView = View.UPLOAD;
this.fileBuffer;
this.pdfPages = [];
this.pages = [];
this.transformations = [
new CalculateGlobalStats(),
new RemoveRepetitiveElements(),
@ -66,8 +66,8 @@ export default class AppState {
this.render()
}
storePdfPages(pdfPages) {
this.pdfPages = pdfPages;
storePdfPages(pages) {
this.pages = pages;
this.fileBuffer = null;
this.mainView = View.RESULT;
this.render();

View File

@ -1,9 +0,0 @@
// A page which holds blocks displayable via BlockPageView
export default class BlockPage {
constructor(options) {
this.index = options.index;
this.blocks = options.blocks;
}
}

View File

@ -1,4 +1,4 @@
import PdfBlock from './BlockPage.jsx';
import TextItemBlock from './TextItemBlock.jsx';
import TextItemCombiner from './TextItemCombiner.jsx';
import TextItem from './TextItem.jsx';
@ -31,7 +31,7 @@ export function headlineByLevel(level) {
throw "Unsupported headline level: " + level;
}
export function blockToText(block: PdfBlock) {
export function blockToText(block: TextItemBlock) {
switch (block.type) {
case CODE_BLOCK:
return '```\n' + concatTextItems(block.textItems) + '```'

View File

@ -0,0 +1,9 @@
// A page which holds PageItems displayable via PdfPageView
export default class Page {
constructor(options) {
this.index = options.index;
this.items = options.items || []; //PageItem
}
}

View File

@ -0,0 +1,13 @@
// A abstract PageItem class, can be TextItem, or TextItemBlock
export default class PageItem {
constructor(options) {
if (this.constructor === PageItem) {
throw new TypeError("Can not construct abstract class.");
}
this.type = options.type;
this.annotation = options.annotation;
this.parsedElements = options.parsedElements;
}
}

View File

@ -2,7 +2,7 @@
export default class ParseResult {
constructor(options) {
this.content = options.content; // like PdfPages[]
this.pages = options.pages; // like Page[]
this.globals = options.globals; // properties accasable for all the following transformations in debug mode
this.messages = options.messages; // something to show only for the transformation in debug mode
}

View File

@ -1,11 +0,0 @@
// A block within a PdfPage
export default class PdfBlock {
constructor(options) {
this.textItems = options.textItems;
this.type = options.type;
this.annotation = options.annotation;
this.parsedElements = options.parsedElements;
}
}

View File

@ -1,9 +0,0 @@
// A page which holds TextItems grouped by block displayable via PdfPageBlockView
export default class PdfBlockPage {
constructor(options) {
this.index = options.index;
this.blocks = options.blocks;
}
}

View File

@ -1,9 +0,0 @@
// A page which holds TextItems displayable via PdfPageView
export default class PdfPage {
constructor(options) {
this.index = options.index;
this.textItems = []
}
}

View File

@ -1,7 +1,10 @@
import PageItem from './PageItem.jsx'
//A text item, i.e. a line or a word within a page
export default class TextItem {
export default class TextItem extends PageItem {
constructor(options) {
super(options);
this.x = options.x;
this.y = options.y;
this.width = options.width;
@ -10,8 +13,6 @@ export default class TextItem {
this.font = options.font;
this.fontAscent = options.fontAscent;
this.fontDescent = options.fontDescent;
this.annotation = options.annotation;
this.markdownElement = options.markdownElement;
}
}

View File

@ -0,0 +1,11 @@
import PageItem from './PageItem.jsx'
// A PageItem that groups several TextItems of a Page into one block.
export default class TextItemBlock extends PageItem {

    // Hands options to the PageItem constructor and additionally keeps options.textItems.
    constructor(options) {
        super(options);
        const {textItems} = options;
        this.textItems = textItems;
    }
}

View File

@ -1,9 +0,0 @@
// A page which holds TextItems displayable via PdfPageView
export default class TextPage {
constructor(options) {
this.index = options.index;
this.text = options.text;
}
}

View File

@ -1,17 +0,0 @@
import MarkdownElement from './MarkdownElement.jsx';
// Markdown headline element: prefixes text with as many '#' characters as its level.
export default class Headline extends MarkdownElement {

    // options.level - headline level, stored as given.
    // The base class is always initialised with a new line before and after.
    constructor(options) {
        const lineBreaks = {
            newLineBefore: true,
            newLineAfter: true
        };
        super(lineBreaks);
        this.level = options.level;
    }

    // Returns the text preceded by the '#' markers and a single space.
    transformText(text) {
        const marker = '#'.repeat(this.level);
        return marker + ' ' + text;
    }
}

View File

@ -1,16 +0,0 @@
// An text item detected as markdown element
export default class MarkdownElement {
constructor(options) {
if (this.constructor === MarkdownElement) {
throw new TypeError("Can not construct abstract class.");
}
this.newLineBefore = options.newLineBefore;
this.newLineAfter = options.newLineAfter;
}
transformText(text) { // eslint-disable-line no-unused-vars
throw new TypeError("Do not call abstract method foo from child.");
}
}

View File

@ -1,7 +1,7 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
export default class CalculateGlobalStats extends ToPdfViewTransformation {
export default class CalculateGlobalStats extends ToTextItemTransformation {
constructor() {
super("Calculate Statistics");
@ -14,8 +14,8 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
const fontToOccurrence = {};
var maxHeight = 0;
var maxHeightFont;
parseResult.content.forEach(page => {
page.textItems.forEach(item => {
parseResult.pages.forEach(page => {
page.items.forEach(item => {
heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1;
fontToOccurrence[item.font] = fontToOccurrence[item.font] ? fontToOccurrence[item.font] + 1 : 1;
if (item.height > maxHeight) {
@ -29,9 +29,9 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
// Parse line distances
const distanceToOccurrence = {};
parseResult.content.forEach(page => {
parseResult.pages.forEach(page => {
var lastItemOfMostUsedHeight;
page.textItems.forEach(item => {
page.items.forEach(item => {
if (item.height == mostUsedHeight && item.text.trim().length > 0) {
if (lastItemOfMostUsedHeight && item.y != lastItemOfMostUsedHeight.y) {
const distance = lastItemOfMostUsedHeight.y - item.y;
@ -49,10 +49,10 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
//Make a copy of the originals so all following transformation don't modify them
const newContent = parseResult.content.map(pdfPage => {
const newPages = parseResult.pages.map(page => {
return {
...pdfPage,
textItems: pdfPage.textItems.map(textItem => {
...page,
items: page.items.map(textItem => {
return {
...textItem,
}
@ -61,7 +61,7 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
});
return new ParseResult({
...parseResult,
content: newContent,
pages: newPages,
globals: {
mostUsedHeight: mostUsedHeight,
mostUsedFont: mostUsedFont,

View File

@ -1,4 +1,4 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import TextItem from '../TextItem.jsx';
import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
@ -40,7 +40,7 @@ function combineTextItems(textItems:TextItem[]) {
});
}
export default class CombineSameY extends ToPdfViewTransformation {
export default class CombineSameY extends ToTextItemTransformation {
constructor() {
super("Combine Text On Same Y");

View File

@ -1,13 +1,13 @@
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import PdfBlock from '../PdfBlock.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { CODE_BLOCK } from '../MarkdownElements.jsx';
import { minXFromBlocks } from '../../textItemFunctions.jsx';
//Detect quotes, code etc.. which is transformed to markdown code syntax
export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
export default class DetectCodeBlocks extends ToTextItemBlockTransformation {
constructor() {
super("Detect Code/Quotes");
@ -21,8 +21,8 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
mostUsedDistance: mostUsedDistance
});
parseResult.content.forEach(page => {
var minX = minXFromBlocks(page.blocks);
parseResult.pages.forEach(page => {
var minX = minXFromBlocks(page.items);
if (minX) {
const itemAreSuitable = (items) => {
for ( let item of items ) {
@ -37,7 +37,7 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
};
const newBlocks = [];
var preceedingCodeBlock;
page.blocks.forEach(block => {
page.items.forEach(block => {
if (block.type) {
newBlocks.push(block);
preceedingCodeBlock = null;
@ -54,7 +54,7 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
preceedingCodeBlock.textItems = preceedingCodeBlock.textItems.concat(combineResult.textItems);
preceedingCodeBlock.parsedElements.add(combineResult.parsedElements);
} else {
preceedingCodeBlock = new PdfBlock({
preceedingCodeBlock = new TextItemBlock({
type: CODE_BLOCK,
annotation: ADDED_ANNOTATION,
textItems: combineResult.textItems,
@ -69,7 +69,7 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
}
}
});
page.blocks = newBlocks;
page.items = newBlocks;
}
});

View File

@ -1,12 +1,12 @@
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import PdfBlock from '../PdfBlock.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { FOOTNOTE_BLOCK } from '../MarkdownElements.jsx';
//Detect quotes, code etc.. which is transformed to markdown code syntax
export default class DetectFootnotes extends ToPdfBlockViewTransformation {
export default class DetectFootnotes extends ToTextItemBlockTransformation {
constructor() {
super("Detect Footnotes");
@ -19,17 +19,17 @@ export default class DetectFootnotes extends ToPdfBlockViewTransformation {
mostUsedDistance: mostUsedDistance,
});
parseResult.content.forEach(page => {
parseResult.pages.forEach(page => {
const newBlocks = [];
var lastFootnote;
page.blocks.forEach(block => {
page.items.forEach(block => {
newBlocks.push(block);
if (!block.type && block.textItems[0].y < 200) {
const combineResult = textCombiner.combine(block.textItems);
if (combineResult.parsedElements.footnotes.length > 0) {
block.annotation = REMOVED_ANNOTATION;
foundFootnotes.push.apply(foundFootnotes, combineResult.parsedElements.footnotes);
lastFootnote = new PdfBlock({
lastFootnote = new TextItemBlock({
textItems: combineResult.textItems,
type: FOOTNOTE_BLOCK,
annotation: ADDED_ANNOTATION,
@ -48,7 +48,7 @@ export default class DetectFootnotes extends ToPdfBlockViewTransformation {
lastFootnote = null;
}
});
page.blocks = newBlocks;
page.items = newBlocks;
});
return new ParseResult({

View File

@ -0,0 +1,198 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
import { HEADLINE1, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
// Transformation that turns text blocks into headline blocks based on the height of their text.
// NOTE(review): work in progress - most of the level detection below is still commented out.
export default class DetectHeadlines extends ToTextItemBlockTransformation {
// Registers the transformation under the name "Detect Headlines".
constructor() {
super("Detect Headlines");
}
// Runs the headline detection on parseResult.pages (the pages are modified in place) and
// returns a new ParseResult that carries the same data plus diagnostic messages.
// Reads mostUsedHeight, mostUsedDistance, maxHeight and tocPages from parseResult.globals.
transform(parseResult:ParseResult) {
// NOTE(review): never incremented anywhere in this method, so the
// 'Found headlines' message below always reports 0.
var foundHeadlines = 0;
const {mostUsedHeight, mostUsedDistance, maxHeight, tocPages} = parseResult.globals;
const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance,
});
//Set max headlines (all headers on the same page are max level 2)
// convertMaxHeaders rewrites the items of the affected pages and returns
// page.index + 1 for each of them.
const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight, textCombiner);
// The height flow / occurrence values computed here are only used for the messages at the end.
var headlineHeightFlowBeforeToc = [];
var headlineHeightsOccurenceBeforeToc = {};
var firstPageAfterToc = 0;
if (tocPages && tocPages.length > 0) {
// Pages in front of the TOC: tocPages[0] is passed as the exclusive upper bound.
[headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], textCombiner, mostUsedHeight, maxHeaderPages);
// The second analysis starts right behind the last TOC entry.
firstPageAfterToc = tocPages[tocPages.length - 1] + 1;
}
const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, textCombiner, mostUsedHeight, maxHeaderPages);
// TODO ==> do flow analysis (remove out of flow or snap, start with 2nd)
// TODO ==> parse seperately between beforeToc and after
// TODO ==> Kala chakra, all uppercase
// TODO ==> TOC headlines
//var topHeadlinePassed = false;
// NOTE(review): headlineHeightMap is not used by the active code.
const headlineHeightMap = {};
// Height recorded for each headline level that has been assigned.
const headlineSizePerLevel = {};
var currentHeadlineLevel;
parseResult.pages.forEach(page => {
// Every original block is kept; detected headline blocks are appended right after their source block.
const newBlocks = [];
page.items.forEach(block => {
newBlocks.push(block);
// Candidates: blocks without type and annotation whose first text item is taller than mostUsedHeight.
if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
const combineResult = textCombiner.combine(block.textItems);
// Only blocks that combine into exactly one text item are considered.
if (combineResult.textItems.length == 1) {
const height = combineResult.textItems[0].height;
// Text of maximal height becomes a level-1 headline; the source block is marked as removed.
if (height == maxHeight) {
block.annotation = REMOVED_ANNOTATION;
currentHeadlineLevel = 1;
headlineSizePerLevel[currentHeadlineLevel] = height
addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
}
// else if (currentHeadlineLevel) {
// const currentLevelSize = headlineSizePerLevel[currentHeadlineLevel];
// if (height < currentLevelSize) {
// const nextLevelSize = headlineSizePerLevel[currentHeadlineLevel + 1];
// // if(!nextLevelSize)
// if (currentHeadlineLevel < 6) {
// currentHeadlineLevel++;
// }
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
// headlineSizePerLevel[currentHeadlineLevel] = height;
// } else if (height > currentLevelSize) {
// const preLevelSize = headlineSizePerLevel[currentHeadlineLevel - 1];
// if (currentHeadlineLevel > 1) {
// currentHeadlineLevel--;
// }
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
// headlineSizePerLevel[currentHeadlineLevel] = height;
// } else {
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
// }
// }
}
}
});
page.items = newBlocks;
});
// NOTE(review): both maps are only filled by the commented-out block below,
// so the messages currently report them as empty objects.
const heightToOccurrence = {};
const fontToOccurrence = {};
// parseResult.content.forEach(page => {
// const newBlocks = [];
// page.blocks.forEach(block => {
// newBlocks.push(block);
// if (!block.type && block.textItems[0].height > mostUsedHeight) {
// foundHeadlines++;
// block.annotation = REMOVED_ANNOTATION;
// const combineResult = textCombiner.combine(block.textItems);
// const height = combineResult.textItems[0].height;
// const font = combineResult.textItems[0].font;
// heightToOccurrence[height] = heightToOccurrence[height] ? heightToOccurrence[height] + 1 : 1;
// fontToOccurrence[font] = fontToOccurrence[font] ? fontToOccurrence[font] + 1 : 1;
// newBlocks.push(new PdfBlock({
// textItems: combineResult.textItems,
// type: HEADLINE1,
// annotation: ADDED_ANNOTATION,
// parsedElements: combineResult.parsedElements
// }));
// }
// });
// page.blocks = newBlocks;
// });
// Same data as the input (the pages were modified in place above) plus the diagnostic messages.
return new ParseResult({
...parseResult,
messages: [
'Found headlines: ' + foundHeadlines,
'Height repetition: ' + JSON.stringify(heightToOccurrence),
'Font repetition: ' + JSON.stringify(fontToOccurrence),
'Pages with max Header: ' + maxHeaderPages,
'Headline Height Flow (before TOC): ' + headlineHeightFlowBeforeToc,
'Headline Heights Occurence (before TOC): ' + JSON.stringify(headlineHeightsOccurenceBeforeToc),
'Headline Height Flow: ' + headlineHeightFlowAfterToc,
'Headline Heights Occurence: ' + JSON.stringify(headlineHeightsOccurenceAfterToc),
]
});
}
}
// Appends a new TextItemBlock to newBlocks. The block takes its text items and parsed
// elements from combineResult, gets headlineLevel as type and is annotated as added.
function addNewBlock(newBlocks, combineResult, headlineLevel) {
    const {textItems, parsedElements} = combineResult;
    const headlineBlock = new TextItemBlock({
        textItems: textItems,
        type: headlineLevel,
        annotation: ADDED_ANNOTATION,
        parsedElements: parsedElements
    });
    newBlocks.push(headlineBlock);
}
// Converts the large text on every page that contains text of maximal height into headlines.
// The items of those pages are replaced in place; the return value lists page.index + 1
// for every such page.
function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) {
    // Pass 1: collect the pages holding an untyped block whose first text item has the maximal height.
    const pagesWithMaxHeight = new Set();
    for (const page of pages) {
        for (const candidate of page.items) {
            if (!candidate.type && candidate.textItems[0].height == maxHeight) {
                pagesWithMaxHeight.add(page);
            }
        }
    }

    // Pass 2: on those pages, blocks taller than this threshold (a quarter of the way from
    // the most used height up to the maximal height) are turned into headlines.
    const secondLevelThreshold = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
    for (const maxPage of pagesWithMaxHeight) {
        const rebuiltItems = [];
        for (const candidate of maxPage.items) {
            // The source block always stays in the list.
            rebuiltItems.push(candidate);
            const firstHeight = candidate.textItems[0].height;
            if (candidate.type || !(firstHeight > secondLevelThreshold)) {
                continue;
            }
            candidate.annotation = REMOVED_ANNOTATION;
            const combined = textCombiner.combine(candidate.textItems);
            if (firstHeight == maxHeight) {
                addNewBlock(rebuiltItems, combined, HEADLINE1);
            } else if (combined.textItems.length == 1) {
                // Smaller text only counts as headline when it combines into a single item.
                addNewBlock(rebuiltItems, combined, HEADLINE2);
            }
        }
        maxPage.items = rebuiltItems;
    }

    return [...pagesWithMaxHeight].map(maxPage => maxPage.index + 1);
}
function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeight, maxHeaderPages) {
const headlineHeightFlow = [];
const headlineHeightsOccurences = {};
var lastHeadlineHeight;
for (var i = from; i < to; i++) {
const page = pages[i];
if (!maxHeaderPages.includes(page.index + 1)) {
page.items.forEach(block => {
if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
const combineResult = textCombiner.combine(block.textItems);
if (combineResult.textItems.length == 1) {
const height = combineResult.textItems[0].height;
headlineHeightsOccurences[height] = headlineHeightsOccurences[height] ? headlineHeightsOccurences[height] + 1 : 1 ;
if (!lastHeadlineHeight || height != lastHeadlineHeight) {
headlineHeightFlow.push(height);
//headlineFontFlow.push(combineResult.textItems[0].font)
lastHeadlineHeight = height;
}
}
}
});
}
}
return [headlineHeightFlow, headlineHeightsOccurences];
}

View File

@ -1,14 +1,14 @@
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import PdfBlock from '../PdfBlock.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { PARAGRAPH, LIST_BLOCK } from '../MarkdownElements.jsx';
import { minXFromBlocks } from '../../textItemFunctions.jsx';
//Detect quotes, code etc.. which is transformed to markdown code syntax
export default class DetectLists extends ToPdfBlockViewTransformation {
export default class DetectLists extends ToTextItemBlockTransformation {
constructor() {
super("Detect Lists");
@ -21,11 +21,11 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
mostUsedDistance: mostUsedDistance
});
parseResult.content.forEach(page => {
var minX = minXFromBlocks(page.blocks);
parseResult.pages.forEach(page => {
var minX = minXFromBlocks(page.items);
if (minX) {
const newBlocks = [];
page.blocks.forEach(block => {
page.items.forEach(block => {
newBlocks.push(block);
if (!block.type) {
const combineResult = textCombiner.combine(block.textItems);
@ -81,14 +81,14 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
});
if (itemsBeforeFirstLineItem.length > 0) {
newBlocks.push(new PdfBlock({
newBlocks.push(new TextItemBlock({
textItems: itemsBeforeFirstLineItem,
type: PARAGRAPH,
annotation: ADDED_ANNOTATION
}));
}
//TODO display with whitespace pre support
newBlocks.push(new PdfBlock({
newBlocks.push(new TextItemBlock({
textItems: listBlockItems,
type: LIST_BLOCK,
annotation: ADDED_ANNOTATION,
@ -97,7 +97,7 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
}
}
});
page.blocks = newBlocks;
page.items = newBlocks;
}
});

View File

@ -1,10 +1,10 @@
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import Page from '../Page.jsx';
import ParseResult from '../ParseResult.jsx';
import PdfBlockPage from '../PdfBlockPage.jsx';
import PdfBlock from '../PdfBlock.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import { minXFromTextItems } from '../../textItemFunctions.jsx';
export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
export default class DetectPdfBlocks extends ToTextItemBlockTransformation {
constructor() {
super("Detect Blocks");
@ -13,20 +13,20 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
var createdBlocks = 0;
const newContent = parseResult.content.map(page => {
var minX = minXFromTextItems(page.textItems);
const newPages = parseResult.pages.map(page => {
var minX = minXFromTextItems(page.items);
const blocks = [];
var textItemsInBlock = [];
const completBlock = () => {
if (textItemsInBlock.length > 0) { //can happen on empty page
blocks.push(new PdfBlock({
blocks.push(new TextItemBlock({
textItems: textItemsInBlock
}));
textItemsInBlock = [];
}
};
var lastItem;
page.textItems.forEach(item => {
page.items.forEach(item => {
if (lastItem) {
if (shouldSplit(lastItem, item, minX, mostUsedDistance)) {
@ -39,16 +39,16 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
completBlock();
createdBlocks += blocks.length;
return new PdfBlockPage({
return new Page({
...page,
blocks: blocks
items: blocks
});
});
return new ParseResult({
...parseResult,
content: newContent,
pages: newPages,
messages: ['Splitted into ' + createdBlocks + ' blocks']
});
}

View File

@ -1,7 +1,7 @@
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import PdfBlock from '../PdfBlock.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import HeadlineFinder from '../HeadlineFinder.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
@ -9,16 +9,16 @@ import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
import { isDigit } from '../../functions.jsx'
//Detect table of contents pages
export default class DetectTOC extends ToPdfBlockViewTransformation {
export default class DetectTOC extends ToTextItemBlockTransformation {
constructor() {
super("Detect Table of Contents");
super("Detect TOC");
}
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
const tocPages = [];
const maxPagesToEvaluate = Math.min(20, parseResult.content.length);
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance
});
@ -26,14 +26,14 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
const linkLeveler = new LinkLeveler();
var tocLinks = [];
var lastTocPage;
parseResult.content.slice(0, maxPagesToEvaluate).forEach(page => {
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
var linesCount = 0;
var linesWithDigitsCount = 0;
var lineItemsWithDigits = [];
const unknownBlocks = new Set();
var headlineBlock;
const pageTocLinks = [];
page.blocks.forEach(block => {
page.items.forEach(block => {
var blockHasLinesWithDigits = false;
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
var lastLineTextWithoutNumber;
@ -87,20 +87,20 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
tocLinks = tocLinks.concat(pageTocLinks);
const newBlocks = [];
page.blocks.forEach((block) => {
page.items.forEach((block) => {
if (!unknownBlocks.has(block)) {
block.annotation = REMOVED_ANNOTATION;
}
newBlocks.push(block);
if (block === headlineBlock) {
newBlocks.push(new PdfBlock({
newBlocks.push(new TextItemBlock({
textItems: textCombiner.combine(block.textItems).textItems,
type: HEADLINE2,
annotation: ADDED_ANNOTATION
}));
}
});
page.blocks = newBlocks;
page.items = newBlocks;
}
});
@ -109,12 +109,12 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
const notFoundHeadlines = [];
if (tocPages.length > 0) {
tocLinks.forEach(tocLink => {
var linkedPage = parseResult.content[tocLink.pageNumber - 1];
var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
var foundHeadline = false;
if (linkedPage) {
foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
if (!foundHeadline) { // pages are off by 1 ?
linkedPage = parseResult.content[tocLink.pageNumber];
linkedPage = parseResult.pages[tocLink.pageNumber];
if (linkedPage) {
foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
}
@ -126,7 +126,7 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
notFoundHeadlines.push(tocLink);
}
});
lastTocPage.blocks.push(new PdfBlock({
lastTocPage.items.push(new TextItemBlock({
textItems: tocLinks.map(tocLink => {
tocLink.textItem.text = ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text;
return tocLink.textItem
@ -164,7 +164,7 @@ function findHeadline(page, tocLink, textCombiner) {
});
var blockIndex = 0;
var lastBlock;
for ( var block of page.blocks ) {
for ( var block of page.items ) {
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
for ( var item of itemsGroupedByY ) {
const headlineItems = headlineFinder.consume(item);
@ -175,7 +175,7 @@ function findHeadline(page, tocLink, textCombiner) {
// 2 line headline
lastBlock.annotation = REMOVED_ANNOTATION;
}
page.blocks.splice(blockIndex + 1, 0, new PdfBlock({
page.items.splice(blockIndex + 1, 0, new TextItemBlock({
textItems: [new TextItem({
...usedItems[0],
text: headline

View File

@ -1,4 +1,4 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
@ -20,7 +20,7 @@ function hashCodeIgnoringSpacesAndNumbers(string) {
// Remove elements with similar content on same page positions, like page numbers, license information, etc...
export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
export default class RemoveRepetitiveElements extends ToTextItemTransformation {
constructor() {
super("Remove Repetitive Elements");
@ -36,8 +36,8 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
const pageStore = [];
const minLineHashRepetitions = {};
const maxLineHashRepetitions = {};
parseResult.content.forEach(pdfPage => {
const minMaxItems = pdfPage.textItems.reduce((itemStore, item) => {
parseResult.pages.forEach(page => {
const minMaxItems = page.items.reduce((itemStore, item) => {
if (item.y < itemStore.minY) {
itemStore.minElements = [item];
itemStore.minY = item.y;
@ -73,14 +73,14 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
// now annotate all removed items
var removedHeader = 0;
var removedFooter = 0;
parseResult.content.forEach((pdfPage, i) => {
if (minLineHashRepetitions[pageStore[i].minLineHash] >= Math.max(3, parseResult.content.length * 2 / 3)) {
parseResult.pages.forEach((page, i) => {
if (minLineHashRepetitions[pageStore[i].minLineHash] >= Math.max(3, parseResult.pages.length * 2 / 3)) {
pageStore[i].minElements.forEach(item => {
item.annotation = REMOVED_ANNOTATION;
});
removedFooter++;
}
if (maxLineHashRepetitions[pageStore[i].maxLineHash] >= Math.max(3, parseResult.content.length * 2 / 3)) {
if (maxLineHashRepetitions[pageStore[i].maxLineHash] >= Math.max(3, parseResult.pages.length * 2 / 3)) {
pageStore[i].maxElements.forEach(item => {
item.annotation = REMOVED_ANNOTATION;
});

View File

@ -2,7 +2,6 @@ import React from 'react';
import MarkdownPageView from '../../components/debug/MarkdownPageView.jsx';
import Transformation from './Transformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextPage from '../TextPage.jsx';
export default class ToMarkdown extends Transformation {
@ -15,18 +14,15 @@ export default class ToMarkdown extends Transformation {
}
transform(parseResult:ParseResult) {
var text = '';
parseResult.content.forEach(page => {
page.blocks.forEach((block) => {
parseResult.pages.forEach(page => {
var text = '';
page.items.forEach(block => {
text += block.text + '\n';
});
page.items = [text];
});
return new ParseResult({
...parseResult,
content: [new TextPage({
index: 0,
text: text
})],
});
}

View File

@ -1,45 +0,0 @@
import React from 'react';
import Transformation from './Transformation.jsx';
import ParseResult from '../ParseResult.jsx';
import PdfPageView from '../../components/debug/PdfPageView.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
// Abstract class for transformations producing a PdfPage to be shown in the PdfView.
// Instantiating this class directly throws; only subclasses can be constructed.
export default class ToPdfViewTransformation extends Transformation {
// name: forwarded unchanged to the Transformation base constructor.
// Throws a TypeError when the class being constructed is ToPdfViewTransformation itself.
constructor(name) {
super(name);
if (this.constructor === ToPdfViewTransformation) {
throw new TypeError("Can not construct abstract class.");
}
// Passed to PdfPageView as its showWhitespaces prop in createPageView(); off by default.
this.showWhitespaces = false;
}
// Always returns true (presumably enables the page selector in the debug UI - confirm against the caller).
showPageSelection() {
return true;
}
// Always returns true (presumably enables the "modifications only" checkbox in the debug UI - confirm against the caller).
showModificationCheckbox() {
return true;
}
// Creates the PdfPageView element for a single page, using page.index as the React key
// and forwarding the modificationsOnly flag and this.showWhitespaces.
createPageView(page, modificationsOnly) {
return <PdfPageView
key={ page.index }
pdfPage={ page }
modificationsOnly={ modificationsOnly }
showWhitespaces={ this.showWhitespaces } />;
}
// Cleans up the given parseResult in place and returns that same object:
// resets messages and, per page, drops removed text items and clears remaining annotations.
completeTransform(parseResult:ParseResult) {
// The usual cleanup
parseResult.messages = [];
parseResult.content.forEach(page => {
// Keep only items that are not annotated with REMOVED_ANNOTATION ...
page.textItems = page.textItems.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION);
// ... then reset the annotation on every item that was kept.
page.textItems.forEach(block => block.annotation = null);
});
return parseResult;
}
}

View File

@ -1,8 +1,7 @@
import React from 'react';
import Transformation from './Transformation.jsx';
import BlockPageView from '../../components/debug/BlockPageView.jsx';
import TextPageView from '../../components/debug/TextPageView.jsx';
import ParseResult from '../ParseResult.jsx';
import BlockPage from '../BlockPage.jsx';
import { blockToText } from '../MarkdownElements.jsx';
export default class ToTextBlocks extends Transformation {
@ -12,27 +11,23 @@ export default class ToTextBlocks extends Transformation {
}
createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars
return <BlockPageView key={ page.index } page={ page } />;
return <TextPageView key={ page.index } page={ page } />;
}
transform(parseResult:ParseResult) {
const blocks = [];
parseResult.content.forEach(page => {
page.blocks.forEach(block => {
parseResult.pages.forEach(page => {
const textItems = [];
page.items.forEach(block => {
const category = block.type ? block.type : 'Unknown';
blocks.push({
textItems.push({
category: category,
text: blockToText(block)
});
});
page.items = textItems;
});
return new ParseResult({
...parseResult,
content: [new BlockPage({
index: 0,
blocks: blocks
})],
});
}

View File

@ -0,0 +1,44 @@
import React from 'react';
import Transformation from './Transformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItemBlockPageView from '../../components/debug/TextItemBlockPageView.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
// Abstract class for transformations producing TextItemBlock(s) to be shown in the TextItemBlockPageView.
// Instantiating this class directly throws; only subclasses can be constructed.
export default class ToTextItemBlockTransformation extends Transformation {
// name: forwarded unchanged to the Transformation base constructor.
// Throws a TypeError when the class being constructed is ToTextItemBlockTransformation itself.
constructor(name) {
super(name);
if (this.constructor === ToTextItemBlockTransformation) {
throw new TypeError("Can not construct abstract class.");
}
// Passed to TextItemBlockPageView as its showWhitespaces prop in createPageView(); off by default.
this.showWhitespaces = false;
}
// Always returns true (presumably enables the page selector in the debug UI - confirm against the caller).
showPageSelection() {
return true;
}
// Always returns true (presumably enables the "modifications only" checkbox in the debug UI - confirm against the caller).
showModificationCheckbox() {
return true;
}
// Creates the TextItemBlockPageView element for a single page, using page.index as the React key
// and forwarding the modificationsOnly flag and this.showWhitespaces.
createPageView(page, modificationsOnly) {
return <TextItemBlockPageView
key={ page.index }
page={ page }
modificationsOnly={ modificationsOnly }
showWhitespaces={ this.showWhitespaces } />;
}
// Cleans up the given parseResult in place and returns that same object:
// resets messages and, per page, drops removed items and clears remaining annotations.
completeTransform(parseResult:ParseResult) {
// The usual cleanup
parseResult.messages = [];
parseResult.pages.forEach(page => {
// Keep only items that are not annotated with REMOVED_ANNOTATION ...
page.items = page.items.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION);
// ... then reset the annotation on every item that was kept.
page.items.forEach(item => item.annotation = null);
});
return parseResult;
}
}

View File

@ -1,15 +1,15 @@
import React from 'react';
import Transformation from './Transformation.jsx';
import ParseResult from '../ParseResult.jsx';
import PdfBlockPageView from '../../components/debug/PdfBlockPageView.jsx';
import TextItemPageView from '../../components/debug/TextItemPageView.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
// Abstract class for transformations producing a PdfBlockPage to be shown in the PdfBlockView
export default class ToPdfBlockViewTransformation extends Transformation {
// Abstract class for transformations producing TextItem(s) to be shown in the TextItemPageView
export default class ToTextItemTransformation extends Transformation {
constructor(name) {
super(name);
if (this.constructor === ToPdfBlockViewTransformation) {
if (this.constructor === ToTextItemTransformation) {
throw new TypeError("Can not construct abstract class.");
}
this.showWhitespaces = false;
@ -24,9 +24,9 @@ export default class ToPdfBlockViewTransformation extends Transformation {
}
createPageView(page, modificationsOnly) {
return <PdfBlockPageView
return <TextItemPageView
key={ page.index }
pdfPage={ page }
page={ page }
modificationsOnly={ modificationsOnly }
showWhitespaces={ this.showWhitespaces } />;
}
@ -34,11 +34,12 @@ export default class ToPdfBlockViewTransformation extends Transformation {
completeTransform(parseResult:ParseResult) {
// The usual cleanup
parseResult.messages = [];
parseResult.content.forEach(page => {
page.blocks = page.blocks.filter(block => !block.annotation || block.annotation !== REMOVED_ANNOTATION);
page.blocks.forEach(block => block.annotation = null);
parseResult.pages.forEach(page => {
page.items = page.items.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION);
page.items.forEach(item => item.annotation = null);
});
return parseResult;
}
}

View File

@ -14,7 +14,7 @@ export default class Transformation {
}
showPageSelection() {
return false;
return true;
}
showModificationCheckbox() {

View File

@ -1,10 +1,10 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
// Converts vertical text to horizontal
export default class VerticalToHorizontal extends ToPdfViewTransformation {
export default class VerticalToHorizontal extends ToTextItemTransformation {
constructor() {
super("Vertical to Horizontal Text");
@ -12,7 +12,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
transform(parseResult:ParseResult) {
var foundVerticals = 0;
const newContent = parseResult.content.map(page => {
const newPages = parseResult.pages.map(page => {
const newTextItems = [];
// var oneCharacterItems = [];
@ -33,7 +33,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
//TODO generic state machine code ?
const leftOver = page.textItems.reduce((oneCharacterItems, item) => {
const leftOver = page.items.reduce((oneCharacterItems, item) => {
if (item.text.trim().length == 1) {
if (oneCharacterItems.length == 0) {
oneCharacterItems.push(item);
@ -84,12 +84,12 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
return {
...page,
textItems: newTextItems
items: newTextItems
};
});
return new ParseResult({
...parseResult,
content: newContent,
pages: newPages,
messages: ["Converted " + foundVerticals + " verticals"]
});
}

View File

@ -1,7 +1,7 @@
import PdfBlock from './models/PdfBlock.jsx';
import TextItemBlock from './models/TextItemBlock.jsx';
import TextItem from './models/TextItem.jsx';
export function minXFromBlocks(blocks:PdfBlock[]) {
export function minXFromBlocks(blocks:TextItemBlock[]) {
var minX = 999;
blocks.forEach(block => {
block.textItems.forEach(item => {