mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-24 16:54:12 +01:00
[WIP] Cleanup page / item handling
This commit is contained in:
parent
6f69566e98
commit
111124fbf3
@ -28,10 +28,10 @@ export default class App extends React.Component {
|
||||
mainView = <LoadingView fileBuffer={ appState.fileBuffer } storePdfPagesFunction={ appState.storePdfPages } />
|
||||
break;
|
||||
case View.RESULT:
|
||||
mainView = <ResultView pdfPages={ appState.pdfPages } transformations={ appState.transformations } />
|
||||
mainView = <ResultView pages={ appState.pages } transformations={ appState.transformations } />
|
||||
break;
|
||||
case View.DEBUG:
|
||||
mainView = <DebugView pdfPages={ appState.pdfPages } transformations={ appState.transformations } />
|
||||
mainView = <DebugView pages={ appState.pages } transformations={ appState.transformations } />
|
||||
break;
|
||||
default:
|
||||
throw `View ${this.props.appState.mainView} not supported!`;
|
||||
@ -46,7 +46,7 @@ export default class App extends React.Component {
|
||||
</div>
|
||||
</Grid>
|
||||
</div>
|
||||
);
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -17,7 +17,7 @@ import ParseResult from '../models/ParseResult.jsx';
|
||||
export default class DebugView extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
pdfPages: React.PropTypes.array.isRequired,
|
||||
pages: React.PropTypes.array.isRequired,
|
||||
transformations: React.PropTypes.array.isRequired,
|
||||
};
|
||||
|
||||
@ -71,12 +71,12 @@ export default class DebugView extends React.Component {
|
||||
|
||||
render() {
|
||||
const {currentTransformation, pageNr} = this.state;
|
||||
const {pdfPages, transformations} = this.props;
|
||||
const {pages, transformations} = this.props;
|
||||
|
||||
const currentTransformationName = transformations[currentTransformation].name;
|
||||
|
||||
var parseResult = new ParseResult({
|
||||
content: pdfPages
|
||||
pages: pages
|
||||
});
|
||||
var lastTransformation;
|
||||
for (var i = 0; i <= currentTransformation; i++) {
|
||||
@ -87,8 +87,8 @@ export default class DebugView extends React.Component {
|
||||
lastTransformation = transformations[i];
|
||||
}
|
||||
|
||||
parseResult.content = parseResult.content.filter((elem, i) => pageNr == -1 || i == pageNr);
|
||||
const pageComponents = parseResult.content.map(page => lastTransformation.createPageView(page, this.state.modificationsOnly));
|
||||
parseResult.pages = parseResult.pages.filter((elem, i) => pageNr == -1 || i == pageNr);
|
||||
const pageComponents = parseResult.pages.map(page => lastTransformation.createPageView(page, this.state.modificationsOnly));
|
||||
const showModificationCheckbox = lastTransformation.showModificationCheckbox();
|
||||
const statisticsAsList = Object.keys(parseResult.globals).map((key, i) => {
|
||||
return <li key={ i }>
|
||||
@ -121,7 +121,7 @@ export default class DebugView extends React.Component {
|
||||
last
|
||||
ellipsis
|
||||
boundaryLinks
|
||||
items={ pdfPages.length }
|
||||
items={ pages.length }
|
||||
maxButtons={ 17 }
|
||||
activePage={ this.state.pageNr + 1 }
|
||||
onSelect={ this.selectPage.bind(this) } />
|
||||
@ -194,6 +194,6 @@ export default class DebugView extends React.Component {
|
||||
</ul>
|
||||
{ pageComponents }
|
||||
</div>
|
||||
);
|
||||
);
|
||||
}
|
||||
}
|
@ -3,7 +3,7 @@ import React from 'react';
|
||||
import pdfjs from 'pdfjs-dist'; // eslint-disable-line no-unused-vars
|
||||
import { Line } from 'rc-progress';
|
||||
|
||||
import PdfPage from '../models/PdfPage.jsx';
|
||||
import Page from '../models/Page.jsx';
|
||||
import TextItem from '../models/TextItem.jsx';
|
||||
|
||||
export default class LoadingView extends React.Component {
|
||||
@ -17,19 +17,19 @@ export default class LoadingView extends React.Component {
|
||||
super(props);
|
||||
this.state = {
|
||||
parsedPages: 0,
|
||||
pdfPages: []
|
||||
pages: []
|
||||
};
|
||||
}
|
||||
|
||||
anounceInitialParse(pdfPages) {
|
||||
anounceInitialParse(pages) {
|
||||
this.setState({
|
||||
pdfPages: pdfPages
|
||||
pages: pages
|
||||
});
|
||||
}
|
||||
|
||||
anouncePageParsed(index, textItems) {
|
||||
//TODO might make problems.. concat unordered and order at the end ?
|
||||
this.state.pdfPages[index].textItems = textItems; // eslint-disable-line react/no-direct-mutation-state
|
||||
this.state.pages[index].items = textItems; // eslint-disable-line react/no-direct-mutation-state
|
||||
this.setState({
|
||||
parsedPages: this.state.parsedPages + 1
|
||||
});
|
||||
@ -44,13 +44,13 @@ export default class LoadingView extends React.Component {
|
||||
// console.debug(pdfDocument);
|
||||
const numPages = pdfDocument.numPages;
|
||||
// const numPages = 4; // hack
|
||||
var pdfPages = [];
|
||||
var pages = [];
|
||||
for (var i = 0; i < numPages; i++) {
|
||||
pdfPages.push(new PdfPage({
|
||||
pages.push(new Page({
|
||||
index: i
|
||||
}));
|
||||
}
|
||||
anounceInitialParseFunction(pdfPages);
|
||||
anounceInitialParseFunction(pages);
|
||||
for (var j = 1; j <= numPages; j++) {
|
||||
pdfDocument.getPage(j).then(function(page) {
|
||||
var scale = 1.0;
|
||||
@ -96,14 +96,14 @@ export default class LoadingView extends React.Component {
|
||||
}
|
||||
|
||||
render() {
|
||||
const {parsedPages, pdfPages} = this.state;
|
||||
const {parsedPages, pages} = this.state;
|
||||
var percentDone = 0;
|
||||
var details = '';
|
||||
if (pdfPages.length > 0) {
|
||||
percentDone = parsedPages / pdfPages.length * 100;
|
||||
details = parsedPages + ' / ' + pdfPages.length
|
||||
if (parsedPages == pdfPages.length) {
|
||||
this.props.storePdfPagesFunction(this.state.pdfPages);
|
||||
if (pages.length > 0) {
|
||||
percentDone = parsedPages / pages.length * 100;
|
||||
details = parsedPages + ' / ' + pages.length
|
||||
if (parsedPages == pages.length) {
|
||||
this.props.storePdfPagesFunction(this.state.pages);
|
||||
}
|
||||
}
|
||||
return (
|
||||
|
@ -10,7 +10,7 @@ import ParseResult from '../models/ParseResult.jsx';
|
||||
export default class ResultView extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
pdfPages: React.PropTypes.array.isRequired,
|
||||
pages: React.PropTypes.array.isRequired,
|
||||
transformations: React.PropTypes.array.isRequired,
|
||||
};
|
||||
|
||||
@ -19,9 +19,9 @@ export default class ResultView extends React.Component {
|
||||
}
|
||||
|
||||
componentWillMount() {
|
||||
const {pdfPages, transformations} = this.props;
|
||||
const {pages, transformations} = this.props;
|
||||
var parseResult = new ParseResult({
|
||||
content: pdfPages
|
||||
pages: pages
|
||||
});
|
||||
var lastTransformation;
|
||||
transformations.forEach(transformation => {
|
||||
@ -32,10 +32,15 @@ export default class ResultView extends React.Component {
|
||||
lastTransformation = transformation;
|
||||
});
|
||||
|
||||
var text = '';
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(item => {
|
||||
text += item + '\n';
|
||||
});
|
||||
});
|
||||
this.state = {
|
||||
preview: true,
|
||||
text: parseResult.content[0].text
|
||||
|
||||
text: text
|
||||
};
|
||||
}
|
||||
|
||||
@ -90,7 +95,7 @@ export default class ResultView extends React.Component {
|
||||
<hr/>
|
||||
{ textComponent }
|
||||
</div>
|
||||
);
|
||||
);
|
||||
}
|
||||
|
||||
}
|
@ -1,49 +0,0 @@
|
||||
import React from 'react';
|
||||
|
||||
import Table from 'react-bootstrap/lib/Table'
|
||||
|
||||
export default class BlockPageView extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
page: React.PropTypes.object.isRequired,
|
||||
};
|
||||
|
||||
render() {
|
||||
var blocks = this.props.page.blocks;
|
||||
|
||||
const content = <div>
|
||||
<Table responsive>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>
|
||||
#
|
||||
</th>
|
||||
<th>
|
||||
Category
|
||||
</th>
|
||||
<th>
|
||||
Text
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{ blocks.map((block, i) => <tr key={ i }>
|
||||
<td>
|
||||
{ i }
|
||||
</td>
|
||||
<td>
|
||||
{ block.category }
|
||||
</td>
|
||||
<td>
|
||||
<pre style={ { display: 'inline-block' } }>{ block.text }</pre>
|
||||
</td>
|
||||
</tr>
|
||||
) }
|
||||
</tbody>
|
||||
</Table>
|
||||
</div>
|
||||
return (
|
||||
content
|
||||
);
|
||||
}
|
||||
}
|
@ -1,23 +1,17 @@
|
||||
import React from 'react';
|
||||
import PageView from './PageView.jsx';
|
||||
import Remarkable from 'remarkable';
|
||||
|
||||
export default class MarkdownPageView extends React.Component {
|
||||
export default class MarkdownPageView extends PageView {
|
||||
|
||||
static propTypes = {
|
||||
page: React.PropTypes.object.isRequired,
|
||||
};
|
||||
|
||||
render() {
|
||||
createItemViews(items, showWhitespaces) { // eslint-disable-line no-unused-vars
|
||||
const remarkable = new Remarkable({
|
||||
breaks: true
|
||||
});
|
||||
|
||||
const html = remarkable.render(this.props.page.text);
|
||||
return (
|
||||
<div>
|
||||
<div dangerouslySetInnerHTML={ { __html: html } } />
|
||||
</div>
|
||||
);
|
||||
const html = remarkable.render(items[0]);
|
||||
return <div>
|
||||
<div dangerouslySetInnerHTML={ { __html: html } } />
|
||||
</div>
|
||||
}
|
||||
|
||||
}
|
41
src/javascript/components/debug/PageView.jsx
Normal file
41
src/javascript/components/debug/PageView.jsx
Normal file
@ -0,0 +1,41 @@
|
||||
import React from 'react';
|
||||
|
||||
// Abstract view for a Page
|
||||
export default class PageView extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
page: React.PropTypes.object.isRequired,
|
||||
modificationsOnly: React.PropTypes.bool,
|
||||
showWhitespaces: React.PropTypes.bool
|
||||
};
|
||||
|
||||
createItemViews(items, showWhitespaces) { // eslint-disable-line no-unused-vars
|
||||
throw new TypeError("Do not call abstract method foo from child.");
|
||||
}
|
||||
|
||||
render() {
|
||||
const {page, modificationsOnly, showWhitespaces} = this.props;
|
||||
|
||||
var items = page.items;
|
||||
if (modificationsOnly) {
|
||||
items = items.filter(block => block.annotation);
|
||||
}
|
||||
|
||||
|
||||
var content;
|
||||
if (items.length == 0 && modificationsOnly) {
|
||||
content = <div/>
|
||||
} else {
|
||||
const itemViews = this.createItemViews(items, showWhitespaces);
|
||||
const header = "Page " + (page.index + 1);
|
||||
content = <div>
|
||||
<h2>{ header }</h2>
|
||||
<hr/>
|
||||
{ itemViews }
|
||||
</div>
|
||||
}
|
||||
return (
|
||||
content
|
||||
);
|
||||
}
|
||||
}
|
@ -1,35 +0,0 @@
|
||||
import React from 'react';
|
||||
import TextItemTable from './TextItemTable.jsx';
|
||||
|
||||
// View for a PdfPage
|
||||
export default class PdfPageView extends React.Component {
|
||||
|
||||
static propTypes = {
|
||||
pdfPage: React.PropTypes.object.isRequired,
|
||||
modificationsOnly: React.PropTypes.bool.isRequired,
|
||||
showWhitespaces: React.PropTypes.bool
|
||||
};
|
||||
|
||||
render() {
|
||||
const {pdfPage, modificationsOnly, showWhitespaces} = this.props;
|
||||
const header = "Page " + (pdfPage.index + 1);
|
||||
var textItems = pdfPage.textItems;
|
||||
if (modificationsOnly) {
|
||||
textItems = textItems.filter(item => item.annotation);
|
||||
}
|
||||
|
||||
var content;
|
||||
if (textItems.length == 0 && modificationsOnly) {
|
||||
content = <div/>
|
||||
} else {
|
||||
content = <div>
|
||||
<h2>{ header }</h2>
|
||||
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
|
||||
</div>
|
||||
}
|
||||
|
||||
return (
|
||||
content
|
||||
);
|
||||
}
|
||||
}
|
@ -1,24 +1,12 @@
|
||||
import React from 'react';
|
||||
import PageView from './PageView.jsx';
|
||||
import TextItemTable from './TextItemTable.jsx';
|
||||
|
||||
// View for a PdfBlockPage
|
||||
export default class PdfBlockPageView extends React.Component {
|
||||
// View for a Page whose items are of kind TextItemBlock
|
||||
export default class TextItemBlockPageView extends PageView {
|
||||
|
||||
static propTypes = {
|
||||
pdfPage: React.PropTypes.object.isRequired,
|
||||
modificationsOnly: React.PropTypes.bool.isRequired,
|
||||
showWhitespaces: React.PropTypes.bool
|
||||
};
|
||||
|
||||
render() {
|
||||
const {pdfPage, modificationsOnly, showWhitespaces} = this.props;
|
||||
|
||||
var blocks = pdfPage.blocks;
|
||||
if (modificationsOnly) {
|
||||
blocks = blocks.filter(block => block.annotation);
|
||||
}
|
||||
|
||||
const blockTables = blocks.map((block, i) => {
|
||||
createItemViews(items, showWhitespaces) {
|
||||
const blockTables = items.map((block, i) => {
|
||||
var textItems = block.textItems;
|
||||
const blockType = block.type ? ' - ' + block.type : null;
|
||||
const blockAnnotation = block.annotation ? <span>{ ' - ' + block.annotation.category }</span>
|
||||
@ -56,19 +44,7 @@ export default class PdfBlockPageView extends React.Component {
|
||||
</div>
|
||||
</div>
|
||||
});
|
||||
|
||||
var content;
|
||||
if (blocks.length == 0 && modificationsOnly) {
|
||||
content = <div/>
|
||||
} else {
|
||||
const header = "Page " + (pdfPage.index + 1);
|
||||
content = <div>
|
||||
<h2>{ header }</h2>
|
||||
{ blockTables }
|
||||
</div>
|
||||
}
|
||||
return (
|
||||
content
|
||||
);
|
||||
return blockTables;
|
||||
}
|
||||
|
||||
}
|
12
src/javascript/components/debug/TextItemPageView.jsx
Normal file
12
src/javascript/components/debug/TextItemPageView.jsx
Normal file
@ -0,0 +1,12 @@
|
||||
import React from 'react';
|
||||
import PageView from './PageView.jsx';
|
||||
import TextItemTable from './TextItemTable.jsx';
|
||||
|
||||
// View for a Page which items are of kind TextItem
|
||||
export default class TextItemPageView extends PageView {
|
||||
|
||||
createItemViews(items, showWhitespaces) {
|
||||
return <TextItemTable textItems={ items } showWhitespaces={ showWhitespaces } />
|
||||
}
|
||||
|
||||
}
|
41
src/javascript/components/debug/TextPageView.jsx
Normal file
41
src/javascript/components/debug/TextPageView.jsx
Normal file
@ -0,0 +1,41 @@
|
||||
import React from 'react';
|
||||
import PageView from './PageView.jsx';
|
||||
import Table from 'react-bootstrap/lib/Table'
|
||||
|
||||
export default class TextPageView extends PageView {
|
||||
|
||||
createItemViews(items, showWhitespaces) { // eslint-disable-line no-unused-vars
|
||||
return <div>
|
||||
<Table responsive>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>
|
||||
#
|
||||
</th>
|
||||
<th>
|
||||
Category
|
||||
</th>
|
||||
<th>
|
||||
Text
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{ items.map((block, i) => <tr key={ i }>
|
||||
<td>
|
||||
{ i }
|
||||
</td>
|
||||
<td>
|
||||
{ block.category }
|
||||
</td>
|
||||
<td>
|
||||
<pre style={ { display: 'inline-block' } }>{ block.text }</pre>
|
||||
</td>
|
||||
</tr>
|
||||
) }
|
||||
</tbody>
|
||||
</Table>
|
||||
</div>
|
||||
}
|
||||
|
||||
}
|
@ -26,7 +26,7 @@ export default class AppState {
|
||||
this.renderFunction = options.renderFunction;
|
||||
this.mainView = View.UPLOAD;
|
||||
this.fileBuffer;
|
||||
this.pdfPages = [];
|
||||
this.pages = [];
|
||||
this.transformations = [
|
||||
new CalculateGlobalStats(),
|
||||
new RemoveRepetitiveElements(),
|
||||
@ -66,8 +66,8 @@ export default class AppState {
|
||||
this.render()
|
||||
}
|
||||
|
||||
storePdfPages(pdfPages) {
|
||||
this.pdfPages = pdfPages;
|
||||
storePdfPages(pages) {
|
||||
this.pages = pages;
|
||||
this.fileBuffer = null;
|
||||
this.mainView = View.RESULT;
|
||||
this.render();
|
||||
|
@ -1,9 +0,0 @@
|
||||
// A page which holds blocks displayable via BlockPageView
|
||||
export default class BlockPage {
|
||||
|
||||
constructor(options) {
|
||||
this.index = options.index;
|
||||
this.blocks = options.blocks;
|
||||
}
|
||||
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
import PdfBlock from './BlockPage.jsx';
|
||||
import TextItemBlock from './TextItemBlock.jsx';
|
||||
import TextItemCombiner from './TextItemCombiner.jsx';
|
||||
import TextItem from './TextItem.jsx';
|
||||
|
||||
@ -31,7 +31,7 @@ export function headlineByLevel(level) {
|
||||
throw "Unsupported headline level: " + level;
|
||||
}
|
||||
|
||||
export function blockToText(block: PdfBlock) {
|
||||
export function blockToText(block: TextItemBlock) {
|
||||
switch (block.type) {
|
||||
case CODE_BLOCK:
|
||||
return '```\n' + concatTextItems(block.textItems) + '```'
|
||||
|
9
src/javascript/models/Page.jsx
Normal file
9
src/javascript/models/Page.jsx
Normal file
@ -0,0 +1,9 @@
|
||||
// A page which holds PageItems, displayable via the PageView components.
export default class Page {

    index; // zero-based position of the page within the document
    items; // PageItem[] - the content of the page

    // options.index - index of the page
    // options.items - optional PageItem[]; defaults to an empty array
    constructor(options) {
        // Tolerate a missing options object instead of crashing on property access.
        const opts = options || {};
        this.index = opts.index;
        this.items = opts.items || [];
    }

}
|
13
src/javascript/models/PageItem.jsx
Normal file
13
src/javascript/models/PageItem.jsx
Normal file
@ -0,0 +1,13 @@
|
||||
// An abstract PageItem class; concrete kinds are TextItem and TextItemBlock.
export default class PageItem {

    type; // element type of the item, e.g. CODE_BLOCK or FOOTNOTE_BLOCK; unset for plain items
    annotation; // marks the item as added/removed by a transformation
    parsedElements; // presumably the elements found by TextItemCombiner - verify against callers

    // Copies type, annotation and parsedElements from options.
    // Throws a TypeError when PageItem itself (rather than a subclass) is instantiated.
    constructor(options) {
        if (this.constructor === PageItem) {
            throw new TypeError("Can not construct abstract class.");
        }
        // Tolerate a subclass calling super() without an options object.
        const opts = options || {};
        this.type = opts.type;
        this.annotation = opts.annotation;
        this.parsedElements = opts.parsedElements;
    }

}
|
@ -2,7 +2,7 @@
|
||||
export default class ParseResult {
|
||||
|
||||
constructor(options) {
|
||||
this.content = options.content; // like PdfPages[]
|
||||
this.pages = options.pages; // like Page[]
|
||||
this.globals = options.globals; // properties accasable for all the following transformations in debug mode
|
||||
this.messages = options.messages; // something to show only for the transformation in debug mode
|
||||
}
|
||||
|
@ -1,11 +0,0 @@
|
||||
// A block within a PdfPage
|
||||
export default class PdfBlock {
|
||||
|
||||
constructor(options) {
|
||||
this.textItems = options.textItems;
|
||||
this.type = options.type;
|
||||
this.annotation = options.annotation;
|
||||
this.parsedElements = options.parsedElements;
|
||||
}
|
||||
|
||||
}
|
@ -1,9 +0,0 @@
|
||||
// A page which holds TextItems grouped by block displayable via PdfPageBlockView
|
||||
export default class PdfBlockPage {
|
||||
|
||||
constructor(options) {
|
||||
this.index = options.index;
|
||||
this.blocks = options.blocks;
|
||||
}
|
||||
|
||||
}
|
@ -1,9 +0,0 @@
|
||||
// A page which holds TextItems displayable via PdfPageView
|
||||
export default class PdfPage {
|
||||
|
||||
constructor(options) {
|
||||
this.index = options.index;
|
||||
this.textItems = []
|
||||
}
|
||||
|
||||
}
|
@ -1,7 +1,10 @@
|
||||
import PageItem from './PageItem.jsx'
|
||||
|
||||
//A text item, i.e. a line or a word within a page
|
||||
export default class TextItem {
|
||||
export default class TextItem extends PageItem {
|
||||
|
||||
constructor(options) {
|
||||
super(options);
|
||||
this.x = options.x;
|
||||
this.y = options.y;
|
||||
this.width = options.width;
|
||||
@ -10,8 +13,6 @@ export default class TextItem {
|
||||
this.font = options.font;
|
||||
this.fontAscent = options.fontAscent;
|
||||
this.fontDescent = options.fontDescent;
|
||||
this.annotation = options.annotation;
|
||||
this.markdownElement = options.markdownElement;
|
||||
}
|
||||
|
||||
}
|
||||
|
11
src/javascript/models/TextItemBlock.jsx
Normal file
11
src/javascript/models/TextItemBlock.jsx
Normal file
@ -0,0 +1,11 @@
|
||||
import PageItem from './PageItem.jsx'
|
||||
|
||||
// A PageItem which groups several TextItems of a Page into one block.
export default class TextItemBlock extends PageItem {

    // options.textItems - the TextItem[] forming this block;
    // all other options are handled by PageItem.
    constructor(options) {
        super(options);
        const {textItems} = options;
        this.textItems = textItems;
    }

}
|
@ -1,9 +0,0 @@
|
||||
// A page which holds TextItems displayable via PdfPageView
|
||||
export default class TextPage {
|
||||
|
||||
constructor(options) {
|
||||
this.index = options.index;
|
||||
this.text = options.text;
|
||||
}
|
||||
|
||||
}
|
@ -1,17 +0,0 @@
|
||||
import MarkdownElement from './MarkdownElement.jsx';
|
||||
|
||||
export default class Headline extends MarkdownElement {
|
||||
|
||||
constructor(options) {
|
||||
super({
|
||||
newLineBefore: true,
|
||||
newLineAfter: true
|
||||
});
|
||||
this.level = options.level;
|
||||
}
|
||||
|
||||
transformText(text) {
|
||||
return '#'.repeat(this.level) + ' ' + text;
|
||||
}
|
||||
|
||||
}
|
@ -1,16 +0,0 @@
|
||||
// An text item detected as markdown element
|
||||
export default class MarkdownElement {
|
||||
|
||||
constructor(options) {
|
||||
if (this.constructor === MarkdownElement) {
|
||||
throw new TypeError("Can not construct abstract class.");
|
||||
}
|
||||
this.newLineBefore = options.newLineBefore;
|
||||
this.newLineAfter = options.newLineAfter;
|
||||
}
|
||||
|
||||
transformText(text) { // eslint-disable-line no-unused-vars
|
||||
throw new TypeError("Do not call abstract method foo from child.");
|
||||
}
|
||||
|
||||
}
|
@ -1,7 +1,7 @@
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
|
||||
export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
||||
export default class CalculateGlobalStats extends ToTextItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Calculate Statistics");
|
||||
@ -14,8 +14,8 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
||||
const fontToOccurrence = {};
|
||||
var maxHeight = 0;
|
||||
var maxHeightFont;
|
||||
parseResult.content.forEach(page => {
|
||||
page.textItems.forEach(item => {
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(item => {
|
||||
heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1;
|
||||
fontToOccurrence[item.font] = fontToOccurrence[item.font] ? fontToOccurrence[item.font] + 1 : 1;
|
||||
if (item.height > maxHeight) {
|
||||
@ -29,9 +29,9 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
||||
|
||||
// Parse line distances
|
||||
const distanceToOccurrence = {};
|
||||
parseResult.content.forEach(page => {
|
||||
parseResult.pages.forEach(page => {
|
||||
var lastItemOfMostUsedHeight;
|
||||
page.textItems.forEach(item => {
|
||||
page.items.forEach(item => {
|
||||
if (item.height == mostUsedHeight && item.text.trim().length > 0) {
|
||||
if (lastItemOfMostUsedHeight && item.y != lastItemOfMostUsedHeight.y) {
|
||||
const distance = lastItemOfMostUsedHeight.y - item.y;
|
||||
@ -49,10 +49,10 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
||||
|
||||
|
||||
//Make a copy of the originals so all following transformation don't modify them
|
||||
const newContent = parseResult.content.map(pdfPage => {
|
||||
const newPages = parseResult.pages.map(page => {
|
||||
return {
|
||||
...pdfPage,
|
||||
textItems: pdfPage.textItems.map(textItem => {
|
||||
...page,
|
||||
items: page.items.map(textItem => {
|
||||
return {
|
||||
...textItem,
|
||||
}
|
||||
@ -61,7 +61,7 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
||||
});
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: newContent,
|
||||
pages: newPages,
|
||||
globals: {
|
||||
mostUsedHeight: mostUsedHeight,
|
||||
mostUsedFont: mostUsedFont,
|
||||
|
@ -1,4 +1,4 @@
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
@ -40,7 +40,7 @@ function combineTextItems(textItems:TextItem[]) {
|
||||
});
|
||||
}
|
||||
|
||||
export default class CombineSameY extends ToPdfViewTransformation {
|
||||
export default class CombineSameY extends ToTextItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Combine Text On Same Y");
|
||||
|
@ -1,13 +1,13 @@
|
||||
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import PdfBlock from '../PdfBlock.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { CODE_BLOCK } from '../MarkdownElements.jsx';
|
||||
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
||||
|
||||
//Detect quotes, code etc.. which is transformed to markdown code syntax
|
||||
export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
|
||||
export default class DetectCodeBlocks extends ToTextItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Code/Quotes");
|
||||
@ -21,8 +21,8 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
|
||||
mostUsedDistance: mostUsedDistance
|
||||
});
|
||||
|
||||
parseResult.content.forEach(page => {
|
||||
var minX = minXFromBlocks(page.blocks);
|
||||
parseResult.pages.forEach(page => {
|
||||
var minX = minXFromBlocks(page.items);
|
||||
if (minX) {
|
||||
const itemAreSuitable = (items) => {
|
||||
for ( let item of items ) {
|
||||
@ -37,7 +37,7 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
|
||||
};
|
||||
const newBlocks = [];
|
||||
var preceedingCodeBlock;
|
||||
page.blocks.forEach(block => {
|
||||
page.items.forEach(block => {
|
||||
if (block.type) {
|
||||
newBlocks.push(block);
|
||||
preceedingCodeBlock = null;
|
||||
@ -54,7 +54,7 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
|
||||
preceedingCodeBlock.textItems = preceedingCodeBlock.textItems.concat(combineResult.textItems);
|
||||
preceedingCodeBlock.parsedElements.add(combineResult.parsedElements);
|
||||
} else {
|
||||
preceedingCodeBlock = new PdfBlock({
|
||||
preceedingCodeBlock = new TextItemBlock({
|
||||
type: CODE_BLOCK,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
textItems: combineResult.textItems,
|
||||
@ -69,7 +69,7 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
|
||||
}
|
||||
}
|
||||
});
|
||||
page.blocks = newBlocks;
|
||||
page.items = newBlocks;
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -1,12 +1,12 @@
|
||||
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import PdfBlock from '../PdfBlock.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { FOOTNOTE_BLOCK } from '../MarkdownElements.jsx';
|
||||
|
||||
//Detect footnotes and wrap them into blocks of type FOOTNOTE_BLOCK
|
||||
export default class DetectFootnotes extends ToPdfBlockViewTransformation {
|
||||
export default class DetectFootnotes extends ToTextItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Footnotes");
|
||||
@ -19,17 +19,17 @@ export default class DetectFootnotes extends ToPdfBlockViewTransformation {
|
||||
mostUsedDistance: mostUsedDistance,
|
||||
});
|
||||
|
||||
parseResult.content.forEach(page => {
|
||||
parseResult.pages.forEach(page => {
|
||||
const newBlocks = [];
|
||||
var lastFootnote;
|
||||
page.blocks.forEach(block => {
|
||||
page.items.forEach(block => {
|
||||
newBlocks.push(block);
|
||||
if (!block.type && block.textItems[0].y < 200) {
|
||||
const combineResult = textCombiner.combine(block.textItems);
|
||||
if (combineResult.parsedElements.footnotes.length > 0) {
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
foundFootnotes.push.apply(foundFootnotes, combineResult.parsedElements.footnotes);
|
||||
lastFootnote = new PdfBlock({
|
||||
lastFootnote = new TextItemBlock({
|
||||
textItems: combineResult.textItems,
|
||||
type: FOOTNOTE_BLOCK,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
@ -48,7 +48,7 @@ export default class DetectFootnotes extends ToPdfBlockViewTransformation {
|
||||
lastFootnote = null;
|
||||
}
|
||||
});
|
||||
page.blocks = newBlocks;
|
||||
page.items = newBlocks;
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
|
198
src/javascript/models/transformations/DetectHeadlines.jsx
Normal file
198
src/javascript/models/transformations/DetectHeadlines.jsx
Normal file
@ -0,0 +1,198 @@
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { HEADLINE1, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
|
||||
|
||||
//Detect headlines
|
||||
export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Headlines");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
var foundHeadlines = 0;
|
||||
const {mostUsedHeight, mostUsedDistance, maxHeight, tocPages} = parseResult.globals;
|
||||
|
||||
const textCombiner = new TextItemCombiner({
|
||||
mostUsedDistance: mostUsedDistance,
|
||||
});
|
||||
|
||||
//Set max headlines (all headers on the same page are max level 2)
|
||||
const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight, textCombiner);
|
||||
|
||||
var headlineHeightFlowBeforeToc = [];
|
||||
var headlineHeightsOccurenceBeforeToc = {};
|
||||
var firstPageAfterToc = 0;
|
||||
if (tocPages && tocPages.length > 0) {
|
||||
[headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], textCombiner, mostUsedHeight, maxHeaderPages);
|
||||
firstPageAfterToc = tocPages[tocPages.length - 1] + 1;
|
||||
}
|
||||
|
||||
const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, textCombiner, mostUsedHeight, maxHeaderPages);
|
||||
|
||||
|
||||
// TODO ==> do flow analysis (remove out of flow or snap, start with 2nd)
|
||||
// TODO ==> parse separately between beforeToc and after
|
||||
// TODO ==> Kala chakra, all uppercase
|
||||
// TODO ==> TOC headlines
|
||||
|
||||
//var topHeadlinePassed = false;
|
||||
const headlineHeightMap = {};
|
||||
const headlineSizePerLevel = {};
|
||||
var currentHeadlineLevel;
|
||||
parseResult.pages.forEach(page => {
|
||||
const newBlocks = [];
|
||||
page.items.forEach(block => {
|
||||
newBlocks.push(block);
|
||||
if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
|
||||
const combineResult = textCombiner.combine(block.textItems);
|
||||
if (combineResult.textItems.length == 1) {
|
||||
const height = combineResult.textItems[0].height;
|
||||
if (height == maxHeight) {
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
currentHeadlineLevel = 1;
|
||||
headlineSizePerLevel[currentHeadlineLevel] = height
|
||||
addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
|
||||
}
|
||||
// else if (currentHeadlineLevel) {
|
||||
// const currentLevelSize = headlineSizePerLevel[currentHeadlineLevel];
|
||||
// if (height < currentLevelSize) {
|
||||
// const nextLevelSize = headlineSizePerLevel[currentHeadlineLevel + 1];
|
||||
// // if(!nextLevelSize)
|
||||
// if (currentHeadlineLevel < 6) {
|
||||
// currentHeadlineLevel++;
|
||||
// }
|
||||
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
|
||||
// headlineSizePerLevel[currentHeadlineLevel] = height;
|
||||
// } else if (height > currentLevelSize) {
|
||||
// const preLevelSize = headlineSizePerLevel[currentHeadlineLevel - 1];
|
||||
// if (currentHeadlineLevel > 1) {
|
||||
// currentHeadlineLevel--;
|
||||
// }
|
||||
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
|
||||
// headlineSizePerLevel[currentHeadlineLevel] = height;
|
||||
// } else {
|
||||
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
|
||||
// }
|
||||
// }
|
||||
}
|
||||
}
|
||||
});
|
||||
page.items = newBlocks;
|
||||
});
|
||||
|
||||
const heightToOccurrence = {};
|
||||
const fontToOccurrence = {};
|
||||
// parseResult.content.forEach(page => {
|
||||
// const newBlocks = [];
|
||||
// page.blocks.forEach(block => {
|
||||
// newBlocks.push(block);
|
||||
// if (!block.type && block.textItems[0].height > mostUsedHeight) {
|
||||
// foundHeadlines++;
|
||||
// block.annotation = REMOVED_ANNOTATION;
|
||||
// const combineResult = textCombiner.combine(block.textItems);
|
||||
// const height = combineResult.textItems[0].height;
|
||||
// const font = combineResult.textItems[0].font;
|
||||
// heightToOccurrence[height] = heightToOccurrence[height] ? heightToOccurrence[height] + 1 : 1;
|
||||
// fontToOccurrence[font] = fontToOccurrence[font] ? fontToOccurrence[font] + 1 : 1;
|
||||
// newBlocks.push(new PdfBlock({
|
||||
// textItems: combineResult.textItems,
|
||||
// type: HEADLINE1,
|
||||
// annotation: ADDED_ANNOTATION,
|
||||
// parsedElements: combineResult.parsedElements
|
||||
// }));
|
||||
// }
|
||||
// });
|
||||
// page.blocks = newBlocks;
|
||||
// });
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: [
|
||||
'Found headlines: ' + foundHeadlines,
|
||||
'Height repetition: ' + JSON.stringify(heightToOccurrence),
|
||||
'Font repetition: ' + JSON.stringify(fontToOccurrence),
|
||||
'Pages with max Header: ' + maxHeaderPages,
|
||||
'Headline Height Flow (before TOC): ' + headlineHeightFlowBeforeToc,
|
||||
'Headline Heights Occurence (before TOC): ' + JSON.stringify(headlineHeightsOccurenceBeforeToc),
|
||||
'Headline Height Flow: ' + headlineHeightFlowAfterToc,
|
||||
'Headline Heights Occurence: ' + JSON.stringify(headlineHeightsOccurenceAfterToc),
|
||||
]
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Builds a new TextItemBlock of the given headline type from a combine result
// (its text items and parsed elements), marks it as ADDED and appends it to `newBlocks`.
function addNewBlock(newBlocks, combineResult, headlineLevel) {
    const { textItems, parsedElements } = combineResult;
    const headlineBlock = new TextItemBlock({
        textItems,
        type: headlineLevel,
        annotation: ADDED_ANNOTATION,
        parsedElements
    });
    newBlocks.push(headlineBlock);
}
|
||||
|
||||
// Converts large, still untyped blocks into headlines on every page that contains
// a block of the maximum text height.
//
// pages          - page objects; each has `items` (blocks with `textItems`) and `index`
// maxHeight      - the height that identifies a top-level headline
// mostUsedHeight - the height of regular body text
// textCombiner   - object whose combine(textItems) returns { textItems, parsedElements }
//
// On those pages a block whose first text item has exactly `maxHeight` becomes a
// HEADLINE1, and any other block taller than the 2nd-level threshold that combines
// into a single text item becomes a HEADLINE2. The source block is annotated as
// REMOVED and the headline block is inserted right behind it.
//
// Returns the 1-based numbers (page.index + 1) of the pages that held a max-height block.
function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) {
    // Find pages with max height
    const maxHeaderPagesSet = new Set();
    pages.forEach(page => {
        const hasMaxHeader = page.items.some(block => !block.type && block.textItems[0].height == maxHeight);
        if (hasMaxHeader) {
            maxHeaderPagesSet.add(page);
        }
    });

    // Now convert those pages to headlines
    // Threshold: a quarter of the way from the body-text height up to the max height.
    const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
    maxHeaderPagesSet.forEach(pageWithMaxHeader => {
        const newBlocks = [];
        pageWithMaxHeader.items.forEach(block => {
            newBlocks.push(block);
            if (block.type) {
                // Already classified by an earlier step, leave it alone.
                return;
            }
            const height = block.textItems[0].height;
            if (!(height > min2ndLevelHeaderHeigthOnMaxPage)) {
                return;
            }
            const combineResult = textCombiner.combine(block.textItems);
            // Only annotate the source block as removed when a headline really replaces it.
            // Otherwise a tall multi-line block would be dropped by the cleanup step
            // without any substitute and its text would be lost.
            if (height == maxHeight) {
                block.annotation = REMOVED_ANNOTATION;
                addNewBlock(newBlocks, combineResult, HEADLINE1);
            } else if (combineResult.textItems.length == 1) {
                block.annotation = REMOVED_ANNOTATION;
                addNewBlock(newBlocks, combineResult, HEADLINE2);
            }
        });
        pageWithMaxHeader.items = newBlocks;
    });

    return Array.from(maxHeaderPagesSet).map(page => page.index + 1);
}
|
||||
|
||||
// Collects headline-height statistics for the pages in the index window [from, to).
//
// pages          - page objects; each has `items` (blocks with `textItems`) and `index`
// from, to       - start (inclusive) and end (exclusive) page index; clamped to the page list
// textCombiner   - object whose combine(textItems) returns { textItems, ... }
// mostUsedHeight - height of regular body text; only taller blocks are considered
// maxHeaderPages - 1-based page numbers (page.index + 1) to leave out
//
// A block counts as a headline candidate when it has neither a type nor an annotation,
// its first text item is taller than `mostUsedHeight`, and it combines into exactly one
// text item.
//
// Returns [headlineHeightFlow, headlineHeightsOccurences]:
//   headlineHeightFlow        - candidate heights in reading order, consecutive repeats collapsed
//   headlineHeightsOccurences - map of height -> number of candidates with that height
function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeight, maxHeaderPages) {
    const headlineHeightFlow = [];
    const headlineHeightsOccurences = {};
    // Set lookup instead of a linear includes() for every page.
    const skippedPageNumbers = new Set(maxHeaderPages);
    // Never step outside the page list, whatever window the caller derived.
    const firstIndex = Math.max(0, from);
    const endIndex = Math.min(to, pages.length);
    var lastHeadlineHeight;
    for (var i = firstIndex; i < endIndex; i++) {
        const page = pages[i];
        if (skippedPageNumbers.has(page.index + 1)) {
            continue;
        }
        page.items.forEach(block => {
            // Blocks without any text item have no height to inspect.
            if (block.type || block.annotation || block.textItems.length === 0) {
                return;
            }
            if (!(block.textItems[0].height > mostUsedHeight)) {
                return;
            }
            const combineResult = textCombiner.combine(block.textItems);
            if (combineResult.textItems.length != 1) {
                return;
            }
            const height = combineResult.textItems[0].height;
            headlineHeightsOccurences[height] = (headlineHeightsOccurences[height] || 0) + 1;
            // Record a height in the flow only when it differs from the previous headline.
            if (!lastHeadlineHeight || height != lastHeadlineHeight) {
                headlineHeightFlow.push(height);
                lastHeadlineHeight = height;
            }
        });
    }

    return [headlineHeightFlow, headlineHeightsOccurences];
}
|
||||
|
@ -1,14 +1,14 @@
|
||||
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfBlock from '../PdfBlock.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { PARAGRAPH, LIST_BLOCK } from '../MarkdownElements.jsx';
|
||||
import { minXFromBlocks } from '../../textItemFunctions.jsx';
|
||||
|
||||
//Detect quotes, code etc.. which is transformed to markdown code syntax
|
||||
export default class DetectLists extends ToPdfBlockViewTransformation {
|
||||
export default class DetectLists extends ToTextItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Lists");
|
||||
@ -21,11 +21,11 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
|
||||
mostUsedDistance: mostUsedDistance
|
||||
});
|
||||
|
||||
parseResult.content.forEach(page => {
|
||||
var minX = minXFromBlocks(page.blocks);
|
||||
parseResult.pages.forEach(page => {
|
||||
var minX = minXFromBlocks(page.items);
|
||||
if (minX) {
|
||||
const newBlocks = [];
|
||||
page.blocks.forEach(block => {
|
||||
page.items.forEach(block => {
|
||||
newBlocks.push(block);
|
||||
if (!block.type) {
|
||||
const combineResult = textCombiner.combine(block.textItems);
|
||||
@ -81,14 +81,14 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
|
||||
});
|
||||
|
||||
if (itemsBeforeFirstLineItem.length > 0) {
|
||||
newBlocks.push(new PdfBlock({
|
||||
newBlocks.push(new TextItemBlock({
|
||||
textItems: itemsBeforeFirstLineItem,
|
||||
type: PARAGRAPH,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
}
|
||||
//TODO display with whitespace pre support
|
||||
newBlocks.push(new PdfBlock({
|
||||
newBlocks.push(new TextItemBlock({
|
||||
textItems: listBlockItems,
|
||||
type: LIST_BLOCK,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
@ -97,7 +97,7 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
|
||||
}
|
||||
}
|
||||
});
|
||||
page.blocks = newBlocks;
|
||||
page.items = newBlocks;
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -1,10 +1,10 @@
|
||||
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import Page from '../Page.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import PdfBlockPage from '../PdfBlockPage.jsx';
|
||||
import PdfBlock from '../PdfBlock.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import { minXFromTextItems } from '../../textItemFunctions.jsx';
|
||||
|
||||
export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
|
||||
export default class DetectPdfBlocks extends ToTextItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Blocks");
|
||||
@ -13,20 +13,20 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var createdBlocks = 0;
|
||||
const newContent = parseResult.content.map(page => {
|
||||
var minX = minXFromTextItems(page.textItems);
|
||||
const newPages = parseResult.pages.map(page => {
|
||||
var minX = minXFromTextItems(page.items);
|
||||
const blocks = [];
|
||||
var textItemsInBlock = [];
|
||||
const completBlock = () => {
|
||||
if (textItemsInBlock.length > 0) { //can happen on empty page
|
||||
blocks.push(new PdfBlock({
|
||||
blocks.push(new TextItemBlock({
|
||||
textItems: textItemsInBlock
|
||||
}));
|
||||
textItemsInBlock = [];
|
||||
}
|
||||
};
|
||||
var lastItem;
|
||||
page.textItems.forEach(item => {
|
||||
page.items.forEach(item => {
|
||||
|
||||
if (lastItem) {
|
||||
if (shouldSplit(lastItem, item, minX, mostUsedDistance)) {
|
||||
@ -39,16 +39,16 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
|
||||
completBlock();
|
||||
|
||||
createdBlocks += blocks.length;
|
||||
return new PdfBlockPage({
|
||||
return new Page({
|
||||
...page,
|
||||
blocks: blocks
|
||||
items: blocks
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: newContent,
|
||||
pages: newPages,
|
||||
messages: ['Splitted into ' + createdBlocks + ' blocks']
|
||||
});
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfBlock from '../PdfBlock.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import HeadlineFinder from '../HeadlineFinder.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
@ -9,16 +9,16 @@ import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
|
||||
import { isDigit } from '../../functions.jsx'
|
||||
|
||||
//Detect table of contents pages
|
||||
export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
export default class DetectTOC extends ToTextItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Table of Contents");
|
||||
super("Detect TOC");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
const tocPages = [];
|
||||
const maxPagesToEvaluate = Math.min(20, parseResult.content.length);
|
||||
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
|
||||
const textCombiner = new TextItemCombiner({
|
||||
mostUsedDistance: mostUsedDistance
|
||||
});
|
||||
@ -26,14 +26,14 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
const linkLeveler = new LinkLeveler();
|
||||
var tocLinks = [];
|
||||
var lastTocPage;
|
||||
parseResult.content.slice(0, maxPagesToEvaluate).forEach(page => {
|
||||
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
|
||||
var linesCount = 0;
|
||||
var linesWithDigitsCount = 0;
|
||||
var lineItemsWithDigits = [];
|
||||
const unknownBlocks = new Set();
|
||||
var headlineBlock;
|
||||
const pageTocLinks = [];
|
||||
page.blocks.forEach(block => {
|
||||
page.items.forEach(block => {
|
||||
var blockHasLinesWithDigits = false;
|
||||
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
|
||||
var lastLineTextWithoutNumber;
|
||||
@ -87,20 +87,20 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
tocLinks = tocLinks.concat(pageTocLinks);
|
||||
|
||||
const newBlocks = [];
|
||||
page.blocks.forEach((block) => {
|
||||
page.items.forEach((block) => {
|
||||
if (!unknownBlocks.has(block)) {
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
}
|
||||
newBlocks.push(block);
|
||||
if (block === headlineBlock) {
|
||||
newBlocks.push(new PdfBlock({
|
||||
newBlocks.push(new TextItemBlock({
|
||||
textItems: textCombiner.combine(block.textItems).textItems,
|
||||
type: HEADLINE2,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
}
|
||||
});
|
||||
page.blocks = newBlocks;
|
||||
page.items = newBlocks;
|
||||
}
|
||||
});
|
||||
|
||||
@ -109,12 +109,12 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
const notFoundHeadlines = [];
|
||||
if (tocPages.length > 0) {
|
||||
tocLinks.forEach(tocLink => {
|
||||
var linkedPage = parseResult.content[tocLink.pageNumber - 1];
|
||||
var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
|
||||
var foundHeadline = false;
|
||||
if (linkedPage) {
|
||||
foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
|
||||
if (!foundHeadline) { // pages are off by 1 ?
|
||||
linkedPage = parseResult.content[tocLink.pageNumber];
|
||||
linkedPage = parseResult.pages[tocLink.pageNumber];
|
||||
if (linkedPage) {
|
||||
foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
|
||||
}
|
||||
@ -126,7 +126,7 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
notFoundHeadlines.push(tocLink);
|
||||
}
|
||||
});
|
||||
lastTocPage.blocks.push(new PdfBlock({
|
||||
lastTocPage.items.push(new TextItemBlock({
|
||||
textItems: tocLinks.map(tocLink => {
|
||||
tocLink.textItem.text = ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text;
|
||||
return tocLink.textItem
|
||||
@ -164,7 +164,7 @@ function findHeadline(page, tocLink, textCombiner) {
|
||||
});
|
||||
var blockIndex = 0;
|
||||
var lastBlock;
|
||||
for ( var block of page.blocks ) {
|
||||
for ( var block of page.items ) {
|
||||
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
|
||||
for ( var item of itemsGroupedByY ) {
|
||||
const headlineItems = headlineFinder.consume(item);
|
||||
@ -175,7 +175,7 @@ function findHeadline(page, tocLink, textCombiner) {
|
||||
// 2 line headline
|
||||
lastBlock.annotation = REMOVED_ANNOTATION;
|
||||
}
|
||||
page.blocks.splice(blockIndex + 1, 0, new PdfBlock({
|
||||
page.items.splice(blockIndex + 1, 0, new TextItemBlock({
|
||||
textItems: [new TextItem({
|
||||
...usedItems[0],
|
||||
text: headline
|
||||
|
@ -1,4 +1,4 @@
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
@ -20,7 +20,7 @@ function hashCodeIgnoringSpacesAndNumbers(string) {
|
||||
|
||||
|
||||
// Remove elements with similar content on same page positions, like page numbers, license information, etc...
|
||||
export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
|
||||
export default class RemoveRepetitiveElements extends ToTextItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Remove Repetitive Elements");
|
||||
@ -36,8 +36,8 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
|
||||
const pageStore = [];
|
||||
const minLineHashRepetitions = {};
|
||||
const maxLineHashRepetitions = {};
|
||||
parseResult.content.forEach(pdfPage => {
|
||||
const minMaxItems = pdfPage.textItems.reduce((itemStore, item) => {
|
||||
parseResult.pages.forEach(page => {
|
||||
const minMaxItems = page.items.reduce((itemStore, item) => {
|
||||
if (item.y < itemStore.minY) {
|
||||
itemStore.minElements = [item];
|
||||
itemStore.minY = item.y;
|
||||
@ -73,14 +73,14 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
|
||||
// now annotate all removed items
|
||||
var removedHeader = 0;
|
||||
var removedFooter = 0;
|
||||
parseResult.content.forEach((pdfPage, i) => {
|
||||
if (minLineHashRepetitions[pageStore[i].minLineHash] >= Math.max(3, parseResult.content.length * 2 / 3)) {
|
||||
parseResult.pages.forEach((page, i) => {
|
||||
if (minLineHashRepetitions[pageStore[i].minLineHash] >= Math.max(3, parseResult.pages.length * 2 / 3)) {
|
||||
pageStore[i].minElements.forEach(item => {
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
});
|
||||
removedFooter++;
|
||||
}
|
||||
if (maxLineHashRepetitions[pageStore[i].maxLineHash] >= Math.max(3, parseResult.content.length * 2 / 3)) {
|
||||
if (maxLineHashRepetitions[pageStore[i].maxLineHash] >= Math.max(3, parseResult.pages.length * 2 / 3)) {
|
||||
pageStore[i].maxElements.forEach(item => {
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
});
|
||||
|
@ -2,7 +2,6 @@ import React from 'react';
|
||||
import MarkdownPageView from '../../components/debug/MarkdownPageView.jsx';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextPage from '../TextPage.jsx';
|
||||
|
||||
export default class ToMarkdown extends Transformation {
|
||||
|
||||
@ -15,18 +14,15 @@ export default class ToMarkdown extends Transformation {
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
var text = '';
|
||||
parseResult.content.forEach(page => {
|
||||
page.blocks.forEach((block) => {
|
||||
parseResult.pages.forEach(page => {
|
||||
var text = '';
|
||||
page.items.forEach(block => {
|
||||
text += block.text + '\n';
|
||||
});
|
||||
page.items = [text];
|
||||
});
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: [new TextPage({
|
||||
index: 0,
|
||||
text: text
|
||||
})],
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -1,45 +0,0 @@
|
||||
import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import PdfPageView from '../../components/debug/PdfPageView.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
// Abstract class for transformations producing a PdfPage to be shown in the PdfView
|
||||
export default class ToPdfViewTransformation extends Transformation {
|
||||
|
||||
constructor(name) {
|
||||
super(name);
|
||||
if (this.constructor === ToPdfViewTransformation) {
|
||||
throw new TypeError("Can not construct abstract class.");
|
||||
}
|
||||
this.showWhitespaces = false;
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return true;
|
||||
}
|
||||
|
||||
showModificationCheckbox() {
|
||||
return true;
|
||||
}
|
||||
|
||||
createPageView(page, modificationsOnly) {
|
||||
return <PdfPageView
|
||||
key={ page.index }
|
||||
pdfPage={ page }
|
||||
modificationsOnly={ modificationsOnly }
|
||||
showWhitespaces={ this.showWhitespaces } />;
|
||||
}
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
// The usual cleanup
|
||||
parseResult.messages = [];
|
||||
parseResult.content.forEach(page => {
|
||||
page.textItems = page.textItems.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION);
|
||||
page.textItems.forEach(block => block.annotation = null);
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -1,8 +1,7 @@
|
||||
import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import BlockPageView from '../../components/debug/BlockPageView.jsx';
|
||||
import TextPageView from '../../components/debug/TextPageView.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import BlockPage from '../BlockPage.jsx';
|
||||
import { blockToText } from '../MarkdownElements.jsx';
|
||||
|
||||
export default class ToTextBlocks extends Transformation {
|
||||
@ -12,27 +11,23 @@ export default class ToTextBlocks extends Transformation {
|
||||
}
|
||||
|
||||
createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars
|
||||
return <BlockPageView key={ page.index } page={ page } />;
|
||||
return <TextPageView key={ page.index } page={ page } />;
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const blocks = [];
|
||||
parseResult.content.forEach(page => {
|
||||
page.blocks.forEach(block => {
|
||||
parseResult.pages.forEach(page => {
|
||||
const textItems = [];
|
||||
page.items.forEach(block => {
|
||||
const category = block.type ? block.type : 'Unknown';
|
||||
blocks.push({
|
||||
textItems.push({
|
||||
category: category,
|
||||
text: blockToText(block)
|
||||
});
|
||||
});
|
||||
|
||||
page.items = textItems;
|
||||
});
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: [new BlockPage({
|
||||
index: 0,
|
||||
blocks: blocks
|
||||
})],
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,44 @@
|
||||
import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItemBlockPageView from '../../components/debug/TextItemBlockPageView.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
// Abstract class for transformations producing TextItemBlock(s) to be shown in the TextItemBlockPageView
|
||||
export default class ToTextItemBlockTransformation extends Transformation {
|
||||
|
||||
constructor(name) {
|
||||
super(name);
|
||||
if (this.constructor === ToTextItemBlockTransformation) {
|
||||
throw new TypeError("Can not construct abstract class.");
|
||||
}
|
||||
this.showWhitespaces = false;
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return true;
|
||||
}
|
||||
|
||||
showModificationCheckbox() {
|
||||
return true;
|
||||
}
|
||||
|
||||
createPageView(page, modificationsOnly) {
|
||||
return <TextItemBlockPageView
|
||||
key={ page.index }
|
||||
page={ page }
|
||||
modificationsOnly={ modificationsOnly }
|
||||
showWhitespaces={ this.showWhitespaces } />;
|
||||
}
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
// The usual cleanup
|
||||
parseResult.messages = [];
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items = page.items.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION);
|
||||
page.items.forEach(item => item.annotation = null);
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
}
|
@ -1,15 +1,15 @@
|
||||
import React from 'react';
|
||||
import Transformation from './Transformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import PdfBlockPageView from '../../components/debug/PdfBlockPageView.jsx';
|
||||
import TextItemPageView from '../../components/debug/TextItemPageView.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
// Abstract class for transformations producing a PdfBlockPage to be shown in the PdfBlockView
|
||||
export default class ToPdfBlockViewTransformation extends Transformation {
|
||||
// Abstract class for transformations producing TextItem(s) to be shown in the TextItemPageView
|
||||
export default class ToTextItemTransformation extends Transformation {
|
||||
|
||||
constructor(name) {
|
||||
super(name);
|
||||
if (this.constructor === ToPdfBlockViewTransformation) {
|
||||
if (this.constructor === ToTextItemTransformation) {
|
||||
throw new TypeError("Can not construct abstract class.");
|
||||
}
|
||||
this.showWhitespaces = false;
|
||||
@ -24,9 +24,9 @@ export default class ToPdfBlockViewTransformation extends Transformation {
|
||||
}
|
||||
|
||||
createPageView(page, modificationsOnly) {
|
||||
return <PdfBlockPageView
|
||||
return <TextItemPageView
|
||||
key={ page.index }
|
||||
pdfPage={ page }
|
||||
page={ page }
|
||||
modificationsOnly={ modificationsOnly }
|
||||
showWhitespaces={ this.showWhitespaces } />;
|
||||
}
|
||||
@ -34,11 +34,12 @@ export default class ToPdfBlockViewTransformation extends Transformation {
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
// The usual cleanup
|
||||
parseResult.messages = [];
|
||||
parseResult.content.forEach(page => {
|
||||
page.blocks = page.blocks.filter(block => !block.annotation || block.annotation !== REMOVED_ANNOTATION);
|
||||
page.blocks.forEach(block => block.annotation = null);
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items = page.items.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION);
|
||||
page.items.forEach(item => item.annotation = null);
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -14,7 +14,7 @@ export default class Transformation {
|
||||
}
|
||||
|
||||
showPageSelection() {
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
showModificationCheckbox() {
|
||||
|
@ -1,10 +1,10 @@
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
// Converts vertical text to horizontal
|
||||
export default class VerticalToHorizontal extends ToPdfViewTransformation {
|
||||
export default class VerticalToHorizontal extends ToTextItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Vertical to Horizontal Text");
|
||||
@ -12,7 +12,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
var foundVerticals = 0;
|
||||
const newContent = parseResult.content.map(page => {
|
||||
const newPages = parseResult.pages.map(page => {
|
||||
const newTextItems = [];
|
||||
// var oneCharacterItems = [];
|
||||
|
||||
@ -33,7 +33,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
|
||||
|
||||
//TODO generic state machine code ?
|
||||
|
||||
const leftOver = page.textItems.reduce((oneCharacterItems, item) => {
|
||||
const leftOver = page.items.reduce((oneCharacterItems, item) => {
|
||||
if (item.text.trim().length == 1) {
|
||||
if (oneCharacterItems.length == 0) {
|
||||
oneCharacterItems.push(item);
|
||||
@ -84,12 +84,12 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
|
||||
|
||||
return {
|
||||
...page,
|
||||
textItems: newTextItems
|
||||
items: newTextItems
|
||||
};
|
||||
});
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: newContent,
|
||||
pages: newPages,
|
||||
messages: ["Converted " + foundVerticals + " verticals"]
|
||||
});
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
import PdfBlock from './models/PdfBlock.jsx';
|
||||
import TextItemBlock from './models/TextItemBlock.jsx';
|
||||
import TextItem from './models/TextItem.jsx';
|
||||
|
||||
export function minXFromBlocks(blocks:PdfBlock[]) {
|
||||
export function minXFromBlocks(blocks:TextItemBlock[]) {
|
||||
var minX = 999;
|
||||
blocks.forEach(block => {
|
||||
block.textItems.forEach(item => {
|
||||
|
Loading…
Reference in New Issue
Block a user