diff --git a/src/javascript/components/App.jsx b/src/javascript/components/App.jsx
index 080e1a5..747b043 100644
--- a/src/javascript/components/App.jsx
+++ b/src/javascript/components/App.jsx
@@ -28,10 +28,10 @@ export default class App extends React.Component {
mainView =
break;
case View.RESULT:
- mainView =
+ mainView =
break;
case View.DEBUG:
- mainView =
+ mainView =
break;
default:
throw `View ${this.props.appState.mainView} not supported!`;
@@ -46,7 +46,7 @@ export default class App extends React.Component {
- );
+ );
}
}
diff --git a/src/javascript/components/DebugView.jsx b/src/javascript/components/DebugView.jsx
index 5aaa42e..906c793 100644
--- a/src/javascript/components/DebugView.jsx
+++ b/src/javascript/components/DebugView.jsx
@@ -17,7 +17,7 @@ import ParseResult from '../models/ParseResult.jsx';
export default class DebugView extends React.Component {
static propTypes = {
- pdfPages: React.PropTypes.array.isRequired,
+ pages: React.PropTypes.array.isRequired,
transformations: React.PropTypes.array.isRequired,
};
@@ -71,12 +71,12 @@ export default class DebugView extends React.Component {
render() {
const {currentTransformation, pageNr} = this.state;
- const {pdfPages, transformations} = this.props;
+ const {pages, transformations} = this.props;
const currentTransformationName = transformations[currentTransformation].name;
var parseResult = new ParseResult({
- content: pdfPages
+ pages: pages
});
var lastTransformation;
for (var i = 0; i <= currentTransformation; i++) {
@@ -87,8 +87,8 @@ export default class DebugView extends React.Component {
lastTransformation = transformations[i];
}
- parseResult.content = parseResult.content.filter((elem, i) => pageNr == -1 || i == pageNr);
- const pageComponents = parseResult.content.map(page => lastTransformation.createPageView(page, this.state.modificationsOnly));
+ parseResult.pages = parseResult.pages.filter((elem, i) => pageNr == -1 || i == pageNr);
+ const pageComponents = parseResult.pages.map(page => lastTransformation.createPageView(page, this.state.modificationsOnly));
const showModificationCheckbox = lastTransformation.showModificationCheckbox();
const statisticsAsList = Object.keys(parseResult.globals).map((key, i) => {
return
@@ -121,7 +121,7 @@ export default class DebugView extends React.Component {
last
ellipsis
boundaryLinks
- items={ pdfPages.length }
+ items={ pages.length }
maxButtons={ 17 }
activePage={ this.state.pageNr + 1 }
onSelect={ this.selectPage.bind(this) } />
@@ -194,6 +194,6 @@ export default class DebugView extends React.Component {
{ pageComponents }
- );
+ );
}
}
\ No newline at end of file
diff --git a/src/javascript/components/LoadingView.jsx b/src/javascript/components/LoadingView.jsx
index 4d1c44a..b0748f3 100644
--- a/src/javascript/components/LoadingView.jsx
+++ b/src/javascript/components/LoadingView.jsx
@@ -3,7 +3,7 @@ import React from 'react';
import pdfjs from 'pdfjs-dist'; // eslint-disable-line no-unused-vars
import { Line } from 'rc-progress';
-import PdfPage from '../models/PdfPage.jsx';
+import Page from '../models/Page.jsx';
import TextItem from '../models/TextItem.jsx';
export default class LoadingView extends React.Component {
@@ -17,19 +17,19 @@ export default class LoadingView extends React.Component {
super(props);
this.state = {
parsedPages: 0,
- pdfPages: []
+ pages: []
};
}
- anounceInitialParse(pdfPages) {
+ anounceInitialParse(pages) {
this.setState({
- pdfPages: pdfPages
+ pages: pages
});
}
anouncePageParsed(index, textItems) {
//TODO might make problems.. concat unordered and order at the end ?
- this.state.pdfPages[index].textItems = textItems; // eslint-disable-line react/no-direct-mutation-state
+ this.state.pages[index].items = textItems; // eslint-disable-line react/no-direct-mutation-state
this.setState({
parsedPages: this.state.parsedPages + 1
});
@@ -44,13 +44,13 @@ export default class LoadingView extends React.Component {
// console.debug(pdfDocument);
const numPages = pdfDocument.numPages;
// const numPages = 4; // hack
- var pdfPages = [];
+ var pages = [];
for (var i = 0; i < numPages; i++) {
- pdfPages.push(new PdfPage({
+ pages.push(new Page({
index: i
}));
}
- anounceInitialParseFunction(pdfPages);
+ anounceInitialParseFunction(pages);
for (var j = 1; j <= numPages; j++) {
pdfDocument.getPage(j).then(function(page) {
var scale = 1.0;
@@ -96,14 +96,14 @@ export default class LoadingView extends React.Component {
}
render() {
- const {parsedPages, pdfPages} = this.state;
+ const {parsedPages, pages} = this.state;
var percentDone = 0;
var details = '';
- if (pdfPages.length > 0) {
- percentDone = parsedPages / pdfPages.length * 100;
- details = parsedPages + ' / ' + pdfPages.length
- if (parsedPages == pdfPages.length) {
- this.props.storePdfPagesFunction(this.state.pdfPages);
+ if (pages.length > 0) {
+ percentDone = parsedPages / pages.length * 100;
+ details = parsedPages + ' / ' + pages.length
+ if (parsedPages == pages.length) {
+ this.props.storePdfPagesFunction(this.state.pages);
}
}
return (
diff --git a/src/javascript/components/ResultView.jsx b/src/javascript/components/ResultView.jsx
index 2111286..ab4489a 100644
--- a/src/javascript/components/ResultView.jsx
+++ b/src/javascript/components/ResultView.jsx
@@ -10,7 +10,7 @@ import ParseResult from '../models/ParseResult.jsx';
export default class ResultView extends React.Component {
static propTypes = {
- pdfPages: React.PropTypes.array.isRequired,
+ pages: React.PropTypes.array.isRequired,
transformations: React.PropTypes.array.isRequired,
};
@@ -19,9 +19,9 @@ export default class ResultView extends React.Component {
}
componentWillMount() {
- const {pdfPages, transformations} = this.props;
+ const {pages, transformations} = this.props;
var parseResult = new ParseResult({
- content: pdfPages
+ pages: pages
});
var lastTransformation;
transformations.forEach(transformation => {
@@ -32,10 +32,15 @@ export default class ResultView extends React.Component {
lastTransformation = transformation;
});
+ var text = '';
+ parseResult.pages.forEach(page => {
+ page.items.forEach(item => {
+ text += item + '\n';
+ });
+ });
this.state = {
preview: true,
- text: parseResult.content[0].text
-
+ text: text
};
}
@@ -90,7 +95,7 @@ export default class ResultView extends React.Component {
{ textComponent }
- );
+ );
}
}
\ No newline at end of file
diff --git a/src/javascript/components/debug/BlockPageView.jsx b/src/javascript/components/debug/BlockPageView.jsx
deleted file mode 100644
index 6d35114..0000000
--- a/src/javascript/components/debug/BlockPageView.jsx
+++ /dev/null
@@ -1,49 +0,0 @@
-import React from 'react';
-
-import Table from 'react-bootstrap/lib/Table'
-
-export default class BlockPageView extends React.Component {
-
- static propTypes = {
- page: React.PropTypes.object.isRequired,
- };
-
- render() {
- var blocks = this.props.page.blocks;
-
- const content =
-
-
-
-
- #
- |
-
- Category
- |
-
- Text
- |
-
-
-
- { blocks.map((block, i) =>
-
- { i }
- |
-
- { block.category }
- |
-
- { block.text }
- |
-
- ) }
-
-
-
- return (
- content
- );
- }
-}
\ No newline at end of file
diff --git a/src/javascript/components/debug/MarkdownPageView.jsx b/src/javascript/components/debug/MarkdownPageView.jsx
index 0f60503..1b979ea 100644
--- a/src/javascript/components/debug/MarkdownPageView.jsx
+++ b/src/javascript/components/debug/MarkdownPageView.jsx
@@ -1,23 +1,17 @@
import React from 'react';
+import PageView from './PageView.jsx';
import Remarkable from 'remarkable';
-export default class MarkdownPageView extends React.Component {
+export default class MarkdownPageView extends PageView {
- static propTypes = {
- page: React.PropTypes.object.isRequired,
- };
-
- render() {
+ createItemViews(items, showWhitespaces) { // eslint-disable-line no-unused-vars
const remarkable = new Remarkable({
breaks: true
});
-
- const html = remarkable.render(this.props.page.text);
- return (
-
- );
+ const html = remarkable.render(items[0]);
+ return
}
}
\ No newline at end of file
diff --git a/src/javascript/components/debug/PageView.jsx b/src/javascript/components/debug/PageView.jsx
new file mode 100644
index 0000000..b79f13c
--- /dev/null
+++ b/src/javascript/components/debug/PageView.jsx
@@ -0,0 +1,41 @@
+import React from 'react';
+
+// Abstract view for a Page
+export default class PageView extends React.Component {
+
+ static propTypes = {
+ page: React.PropTypes.object.isRequired,
+ modificationsOnly: React.PropTypes.bool,
+ showWhitespaces: React.PropTypes.bool
+ };
+
+ createItemViews(items, showWhitespaces) { // eslint-disable-line no-unused-vars
+ throw new TypeError("Do not call abstract method foo from child.");
+ }
+
+ render() {
+ const {page, modificationsOnly, showWhitespaces} = this.props;
+
+ var items = page.items;
+ if (modificationsOnly) {
+ items = items.filter(block => block.annotation);
+ }
+
+
+ var content;
+ if (items.length == 0 && modificationsOnly) {
+ content =
+ } else {
+ const itemViews = this.createItemViews(items, showWhitespaces);
+ const header = "Page " + (page.index + 1);
+ content =
+
{ header }
+
+ { itemViews }
+
+ }
+ return (
+ content
+ );
+ }
+}
\ No newline at end of file
diff --git a/src/javascript/components/debug/PdfPageView.jsx b/src/javascript/components/debug/PdfPageView.jsx
deleted file mode 100644
index a6f71aa..0000000
--- a/src/javascript/components/debug/PdfPageView.jsx
+++ /dev/null
@@ -1,35 +0,0 @@
-import React from 'react';
-import TextItemTable from './TextItemTable.jsx';
-
-// View for a PdfPage
-export default class PdfPageView extends React.Component {
-
- static propTypes = {
- pdfPage: React.PropTypes.object.isRequired,
- modificationsOnly: React.PropTypes.bool.isRequired,
- showWhitespaces: React.PropTypes.bool
- };
-
- render() {
- const {pdfPage, modificationsOnly, showWhitespaces} = this.props;
- const header = "Page " + (pdfPage.index + 1);
- var textItems = pdfPage.textItems;
- if (modificationsOnly) {
- textItems = textItems.filter(item => item.annotation);
- }
-
- var content;
- if (textItems.length == 0 && modificationsOnly) {
- content =
- } else {
- content =
-
{ header }
-
-
- }
-
- return (
- content
- );
- }
-}
\ No newline at end of file
diff --git a/src/javascript/components/debug/PdfBlockPageView.jsx b/src/javascript/components/debug/TextItemBlockPageView.jsx
similarity index 64%
rename from src/javascript/components/debug/PdfBlockPageView.jsx
rename to src/javascript/components/debug/TextItemBlockPageView.jsx
index 706d57b..c0bb89c 100644
--- a/src/javascript/components/debug/PdfBlockPageView.jsx
+++ b/src/javascript/components/debug/TextItemBlockPageView.jsx
@@ -1,24 +1,12 @@
import React from 'react';
+import PageView from './PageView.jsx';
import TextItemTable from './TextItemTable.jsx';
-// View for a PdfBlockPage
-export default class PdfBlockPageView extends React.Component {
+// View for a Page whose items are of kind TextItemBlock
+export default class TextItemBlockPageView extends PageView {
- static propTypes = {
- pdfPage: React.PropTypes.object.isRequired,
- modificationsOnly: React.PropTypes.bool.isRequired,
- showWhitespaces: React.PropTypes.bool
- };
-
- render() {
- const {pdfPage, modificationsOnly, showWhitespaces} = this.props;
-
- var blocks = pdfPage.blocks;
- if (modificationsOnly) {
- blocks = blocks.filter(block => block.annotation);
- }
-
- const blockTables = blocks.map((block, i) => {
+ createItemViews(items, showWhitespaces) {
+ const blockTables = items.map((block, i) => {
var textItems = block.textItems;
const blockType = block.type ? ' - ' + block.type : null;
const blockAnnotation = block.annotation ? { ' - ' + block.annotation.category }
@@ -56,19 +44,7 @@ export default class PdfBlockPageView extends React.Component {
});
-
- var content;
- if (blocks.length == 0 && modificationsOnly) {
- content =
- } else {
- const header = "Page " + (pdfPage.index + 1);
- content =
-
{ header }
- { blockTables }
-
- }
- return (
- content
- );
+ return blockTables;
}
+
}
\ No newline at end of file
diff --git a/src/javascript/components/debug/TextItemPageView.jsx b/src/javascript/components/debug/TextItemPageView.jsx
new file mode 100644
index 0000000..7a3d628
--- /dev/null
+++ b/src/javascript/components/debug/TextItemPageView.jsx
@@ -0,0 +1,12 @@
+import React from 'react';
+import PageView from './PageView.jsx';
+import TextItemTable from './TextItemTable.jsx';
+
+// View for a Page whose items are of kind TextItem
+export default class TextItemPageView extends PageView {
+
+ createItemViews(items, showWhitespaces) {
+ return
+ }
+
+}
\ No newline at end of file
diff --git a/src/javascript/components/debug/TextPageView.jsx b/src/javascript/components/debug/TextPageView.jsx
new file mode 100644
index 0000000..90badd9
--- /dev/null
+++ b/src/javascript/components/debug/TextPageView.jsx
@@ -0,0 +1,41 @@
+import React from 'react';
+import PageView from './PageView.jsx';
+import Table from 'react-bootstrap/lib/Table'
+
+export default class TextPageView extends PageView {
+
+ createItemViews(items, showWhitespaces) { // eslint-disable-line no-unused-vars
+ return
+
+
+
+
+ #
+ |
+
+ Category
+ |
+
+ Text
+ |
+
+
+
+ { items.map((block, i) =>
+
+ { i }
+ |
+
+ { block.category }
+ |
+
+ { block.text }
+ |
+
+ ) }
+
+
+
+ }
+
+}
\ No newline at end of file
diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx
index f608d90..b4fea84 100644
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -26,7 +26,7 @@ export default class AppState {
this.renderFunction = options.renderFunction;
this.mainView = View.UPLOAD;
this.fileBuffer;
- this.pdfPages = [];
+ this.pages = [];
this.transformations = [
new CalculateGlobalStats(),
new RemoveRepetitiveElements(),
@@ -66,8 +66,8 @@ export default class AppState {
this.render()
}
- storePdfPages(pdfPages) {
- this.pdfPages = pdfPages;
+ storePdfPages(pages) {
+ this.pages = pages;
this.fileBuffer = null;
this.mainView = View.RESULT;
this.render();
diff --git a/src/javascript/models/BlockPage.jsx b/src/javascript/models/BlockPage.jsx
deleted file mode 100644
index a61970f..0000000
--- a/src/javascript/models/BlockPage.jsx
+++ /dev/null
@@ -1,9 +0,0 @@
-// A page which holds blocks displayable via BlockPageView
-export default class BlockPage {
-
- constructor(options) {
- this.index = options.index;
- this.blocks = options.blocks;
- }
-
-}
diff --git a/src/javascript/models/MarkdownElements.jsx b/src/javascript/models/MarkdownElements.jsx
index 3c5371b..1002220 100644
--- a/src/javascript/models/MarkdownElements.jsx
+++ b/src/javascript/models/MarkdownElements.jsx
@@ -1,4 +1,4 @@
-import PdfBlock from './BlockPage.jsx';
+import TextItemBlock from './TextItemBlock.jsx';
import TextItemCombiner from './TextItemCombiner.jsx';
import TextItem from './TextItem.jsx';
@@ -31,7 +31,7 @@ export function headlineByLevel(level) {
throw "Unsupported headline level: " + level;
}
-export function blockToText(block: PdfBlock) {
+export function blockToText(block: TextItemBlock) {
switch (block.type) {
case CODE_BLOCK:
return '```\n' + concatTextItems(block.textItems) + '```'
diff --git a/src/javascript/models/Page.jsx b/src/javascript/models/Page.jsx
new file mode 100644
index 0000000..63a4820
--- /dev/null
+++ b/src/javascript/models/Page.jsx
@@ -0,0 +1,9 @@
+// A page which holds PageItems displayable via a PageView subclass
+export default class Page {
+
+ constructor(options) {
+ this.index = options.index;
+ this.items = options.items || []; //PageItem
+ }
+
+}
diff --git a/src/javascript/models/PageItem.jsx b/src/javascript/models/PageItem.jsx
new file mode 100644
index 0000000..cb01549
--- /dev/null
+++ b/src/javascript/models/PageItem.jsx
@@ -0,0 +1,13 @@
+// An abstract PageItem class; concrete subclasses are TextItem and TextItemBlock
+export default class PageItem {
+
+ constructor(options) {
+ if (this.constructor === PageItem) {
+ throw new TypeError("Can not construct abstract class.");
+ }
+ this.type = options.type;
+ this.annotation = options.annotation;
+ this.parsedElements = options.parsedElements;
+ }
+
+}
diff --git a/src/javascript/models/ParseResult.jsx b/src/javascript/models/ParseResult.jsx
index 709e1a6..eb2bdc1 100644
--- a/src/javascript/models/ParseResult.jsx
+++ b/src/javascript/models/ParseResult.jsx
@@ -2,7 +2,7 @@
export default class ParseResult {
constructor(options) {
- this.content = options.content; // like PdfPages[]
+ this.pages = options.pages; // like Page[]
this.globals = options.globals; // properties accasable for all the following transformations in debug mode
this.messages = options.messages; // something to show only for the transformation in debug mode
}
diff --git a/src/javascript/models/PdfBlock.jsx b/src/javascript/models/PdfBlock.jsx
deleted file mode 100644
index 3423cce..0000000
--- a/src/javascript/models/PdfBlock.jsx
+++ /dev/null
@@ -1,11 +0,0 @@
-// A block within a PdfPage
-export default class PdfBlock {
-
- constructor(options) {
- this.textItems = options.textItems;
- this.type = options.type;
- this.annotation = options.annotation;
- this.parsedElements = options.parsedElements;
- }
-
-}
diff --git a/src/javascript/models/PdfBlockPage.jsx b/src/javascript/models/PdfBlockPage.jsx
deleted file mode 100644
index aca6ab8..0000000
--- a/src/javascript/models/PdfBlockPage.jsx
+++ /dev/null
@@ -1,9 +0,0 @@
-// A page which holds TextItems grouped by block displayable via PdfPageBlockView
-export default class PdfBlockPage {
-
- constructor(options) {
- this.index = options.index;
- this.blocks = options.blocks;
- }
-
-}
diff --git a/src/javascript/models/PdfPage.jsx b/src/javascript/models/PdfPage.jsx
deleted file mode 100644
index 36a163b..0000000
--- a/src/javascript/models/PdfPage.jsx
+++ /dev/null
@@ -1,9 +0,0 @@
-// A page which holds TextItems displayable via PdfPageView
-export default class PdfPage {
-
- constructor(options) {
- this.index = options.index;
- this.textItems = []
- }
-
-}
diff --git a/src/javascript/models/TextItem.jsx b/src/javascript/models/TextItem.jsx
index 2fa1a33..fb1ab2c 100644
--- a/src/javascript/models/TextItem.jsx
+++ b/src/javascript/models/TextItem.jsx
@@ -1,7 +1,10 @@
+import PageItem from './PageItem.jsx'
+
//A text item, i.e. a line or a word within a page
-export default class TextItem {
+export default class TextItem extends PageItem {
constructor(options) {
+ super(options);
this.x = options.x;
this.y = options.y;
this.width = options.width;
@@ -10,8 +13,6 @@ export default class TextItem {
this.font = options.font;
this.fontAscent = options.fontAscent;
this.fontDescent = options.fontDescent;
- this.annotation = options.annotation;
- this.markdownElement = options.markdownElement;
}
}
diff --git a/src/javascript/models/TextItemBlock.jsx b/src/javascript/models/TextItemBlock.jsx
new file mode 100644
index 0000000..614c422
--- /dev/null
+++ b/src/javascript/models/TextItemBlock.jsx
@@ -0,0 +1,11 @@
+import PageItem from './PageItem.jsx'
+
+// A block of TextItem[] within a Page
+export default class TextItemBlock extends PageItem {
+
+ constructor(options) {
+ super(options);
+ this.textItems = options.textItems;
+ }
+
+}
diff --git a/src/javascript/models/TextPage.jsx b/src/javascript/models/TextPage.jsx
deleted file mode 100644
index 88f2806..0000000
--- a/src/javascript/models/TextPage.jsx
+++ /dev/null
@@ -1,9 +0,0 @@
-// A page which holds TextItems displayable via PdfPageView
-export default class TextPage {
-
- constructor(options) {
- this.index = options.index;
- this.text = options.text;
- }
-
-}
diff --git a/src/javascript/models/markdown/Headline.jsx b/src/javascript/models/markdown/Headline.jsx
deleted file mode 100644
index c633a23..0000000
--- a/src/javascript/models/markdown/Headline.jsx
+++ /dev/null
@@ -1,17 +0,0 @@
-import MarkdownElement from './MarkdownElement.jsx';
-
-export default class Headline extends MarkdownElement {
-
- constructor(options) {
- super({
- newLineBefore: true,
- newLineAfter: true
- });
- this.level = options.level;
- }
-
- transformText(text) {
- return '#'.repeat(this.level) + ' ' + text;
- }
-
-}
diff --git a/src/javascript/models/markdown/MarkdownElement.jsx b/src/javascript/models/markdown/MarkdownElement.jsx
deleted file mode 100644
index c860e7d..0000000
--- a/src/javascript/models/markdown/MarkdownElement.jsx
+++ /dev/null
@@ -1,16 +0,0 @@
-// An text item detected as markdown element
-export default class MarkdownElement {
-
- constructor(options) {
- if (this.constructor === MarkdownElement) {
- throw new TypeError("Can not construct abstract class.");
- }
- this.newLineBefore = options.newLineBefore;
- this.newLineAfter = options.newLineAfter;
- }
-
- transformText(text) { // eslint-disable-line no-unused-vars
- throw new TypeError("Do not call abstract method foo from child.");
- }
-
-}
diff --git a/src/javascript/models/transformations/CalculateGlobalStats.jsx b/src/javascript/models/transformations/CalculateGlobalStats.jsx
index 4d6c8a3..b34d080 100644
--- a/src/javascript/models/transformations/CalculateGlobalStats.jsx
+++ b/src/javascript/models/transformations/CalculateGlobalStats.jsx
@@ -1,7 +1,7 @@
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
+import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
-export default class CalculateGlobalStats extends ToPdfViewTransformation {
+export default class CalculateGlobalStats extends ToTextItemTransformation {
constructor() {
super("Calculate Statistics");
@@ -14,8 +14,8 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
const fontToOccurrence = {};
var maxHeight = 0;
var maxHeightFont;
- parseResult.content.forEach(page => {
- page.textItems.forEach(item => {
+ parseResult.pages.forEach(page => {
+ page.items.forEach(item => {
heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1;
fontToOccurrence[item.font] = fontToOccurrence[item.font] ? fontToOccurrence[item.font] + 1 : 1;
if (item.height > maxHeight) {
@@ -29,9 +29,9 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
// Parse line distances
const distanceToOccurrence = {};
- parseResult.content.forEach(page => {
+ parseResult.pages.forEach(page => {
var lastItemOfMostUsedHeight;
- page.textItems.forEach(item => {
+ page.items.forEach(item => {
if (item.height == mostUsedHeight && item.text.trim().length > 0) {
if (lastItemOfMostUsedHeight && item.y != lastItemOfMostUsedHeight.y) {
const distance = lastItemOfMostUsedHeight.y - item.y;
@@ -49,10 +49,10 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
//Make a copy of the originals so all following transformation don't modify them
- const newContent = parseResult.content.map(pdfPage => {
+ const newPages = parseResult.pages.map(page => {
return {
- ...pdfPage,
- textItems: pdfPage.textItems.map(textItem => {
+ ...page,
+ items: page.items.map(textItem => {
return {
...textItem,
}
@@ -61,7 +61,7 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
});
return new ParseResult({
...parseResult,
- content: newContent,
+ pages: newPages,
globals: {
mostUsedHeight: mostUsedHeight,
mostUsedFont: mostUsedFont,
diff --git a/src/javascript/models/transformations/CombineSameY.jsx b/src/javascript/models/transformations/CombineSameY.jsx
index 402067f..eb3e2b8 100644
--- a/src/javascript/models/transformations/CombineSameY.jsx
+++ b/src/javascript/models/transformations/CombineSameY.jsx
@@ -1,4 +1,4 @@
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
+import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import TextItem from '../TextItem.jsx';
import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
@@ -40,7 +40,7 @@ function combineTextItems(textItems:TextItem[]) {
});
}
-export default class CombineSameY extends ToPdfViewTransformation {
+export default class CombineSameY extends ToTextItemTransformation {
constructor() {
super("Combine Text On Same Y");
diff --git a/src/javascript/models/transformations/DetectCodeBlocks.jsx b/src/javascript/models/transformations/DetectCodeBlocks.jsx
index ff8c4b2..486b220 100644
--- a/src/javascript/models/transformations/DetectCodeBlocks.jsx
+++ b/src/javascript/models/transformations/DetectCodeBlocks.jsx
@@ -1,13 +1,13 @@
-import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
+import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
-import PdfBlock from '../PdfBlock.jsx';
+import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { CODE_BLOCK } from '../MarkdownElements.jsx';
import { minXFromBlocks } from '../../textItemFunctions.jsx';
//Detect quotes, code etc.. which is transformed to markdown code syntax
-export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
+export default class DetectCodeBlocks extends ToTextItemBlockTransformation {
constructor() {
super("Detect Code/Quotes");
@@ -21,8 +21,8 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
mostUsedDistance: mostUsedDistance
});
- parseResult.content.forEach(page => {
- var minX = minXFromBlocks(page.blocks);
+ parseResult.pages.forEach(page => {
+ var minX = minXFromBlocks(page.items);
if (minX) {
const itemAreSuitable = (items) => {
for ( let item of items ) {
@@ -37,7 +37,7 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
};
const newBlocks = [];
var preceedingCodeBlock;
- page.blocks.forEach(block => {
+ page.items.forEach(block => {
if (block.type) {
newBlocks.push(block);
preceedingCodeBlock = null;
@@ -54,7 +54,7 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
preceedingCodeBlock.textItems = preceedingCodeBlock.textItems.concat(combineResult.textItems);
preceedingCodeBlock.parsedElements.add(combineResult.parsedElements);
} else {
- preceedingCodeBlock = new PdfBlock({
+ preceedingCodeBlock = new TextItemBlock({
type: CODE_BLOCK,
annotation: ADDED_ANNOTATION,
textItems: combineResult.textItems,
@@ -69,7 +69,7 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
}
}
});
- page.blocks = newBlocks;
+ page.items = newBlocks;
}
});
diff --git a/src/javascript/models/transformations/DetectFootnotes.jsx b/src/javascript/models/transformations/DetectFootnotes.jsx
index 96282a8..6d9c98c 100644
--- a/src/javascript/models/transformations/DetectFootnotes.jsx
+++ b/src/javascript/models/transformations/DetectFootnotes.jsx
@@ -1,12 +1,12 @@
-import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
+import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
-import PdfBlock from '../PdfBlock.jsx';
+import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { FOOTNOTE_BLOCK } from '../MarkdownElements.jsx';
//Detect quotes, code etc.. which is transformed to markdown code syntax
-export default class DetectFootnotes extends ToPdfBlockViewTransformation {
+export default class DetectFootnotes extends ToTextItemBlockTransformation {
constructor() {
super("Detect Footnotes");
@@ -19,17 +19,17 @@ export default class DetectFootnotes extends ToPdfBlockViewTransformation {
mostUsedDistance: mostUsedDistance,
});
- parseResult.content.forEach(page => {
+ parseResult.pages.forEach(page => {
const newBlocks = [];
var lastFootnote;
- page.blocks.forEach(block => {
+ page.items.forEach(block => {
newBlocks.push(block);
if (!block.type && block.textItems[0].y < 200) {
const combineResult = textCombiner.combine(block.textItems);
if (combineResult.parsedElements.footnotes.length > 0) {
block.annotation = REMOVED_ANNOTATION;
foundFootnotes.push.apply(foundFootnotes, combineResult.parsedElements.footnotes);
- lastFootnote = new PdfBlock({
+ lastFootnote = new TextItemBlock({
textItems: combineResult.textItems,
type: FOOTNOTE_BLOCK,
annotation: ADDED_ANNOTATION,
@@ -48,7 +48,7 @@ export default class DetectFootnotes extends ToPdfBlockViewTransformation {
lastFootnote = null;
}
});
- page.blocks = newBlocks;
+ page.items = newBlocks;
});
return new ParseResult({
diff --git a/src/javascript/models/transformations/DetectHeadlines.jsx b/src/javascript/models/transformations/DetectHeadlines.jsx
new file mode 100644
index 0000000..44c847a
--- /dev/null
+++ b/src/javascript/models/transformations/DetectHeadlines.jsx
@@ -0,0 +1,198 @@
+import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
+import ParseResult from '../ParseResult.jsx';
+import TextItemBlock from '../TextItemBlock.jsx';
+import TextItemCombiner from '../TextItemCombiner.jsx';
+import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
+import { HEADLINE1, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
+
+// Detects headlines: turns max-height text into headline blocks and reports headline-height statistics as messages
+export default class DetectHeadlines extends ToTextItemBlockTransformation {
+
+ constructor() {
+ super("Detect Headlines");
+ }
+
+ transform(parseResult:ParseResult) { // reassigns page.items in place; returns a new ParseResult with fresh messages
+ var foundHeadlines = 0; // NOTE(review): never incremented in the active code, so the 'Found headlines' message always shows 0
+ const {mostUsedHeight, mostUsedDistance, maxHeight, tocPages} = parseResult.globals;
+
+ const textCombiner = new TextItemCombiner({
+ mostUsedDistance: mostUsedDistance,
+ });
+
+ //Convert pages holding a max-height block first: on those pages headlines are level 1 or 2 only
+ const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight, textCombiner); // page.index + 1 of each converted page
+
+ var headlineHeightFlowBeforeToc = [];
+ var headlineHeightsOccurenceBeforeToc = {};
+ var firstPageAfterToc = 0;
+ if (tocPages && tocPages.length > 0) {
+ [headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], textCombiner, mostUsedHeight, maxHeaderPages); // scans pages[0 .. tocPages[0])
+ firstPageAfterToc = tocPages[tocPages.length - 1] + 1; // resume scanning right behind the last TOC entry
+ }
+
+ const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, textCombiner, mostUsedHeight, maxHeaderPages); // scans pages[firstPageAfterToc .. end)
+
+
+ // TODO ==> do flow analysis (remove out of flow or snap, start with 2nd)
+ // TODO ==> parse separately between beforeToc and after
+ // TODO ==> Kala chakra, all uppercase
+ // TODO ==> TOC headlines
+
+ //var topHeadlinePassed = false;
+ const headlineHeightMap = {}; // NOTE(review): not referenced anywhere else in this method
+ const headlineSizePerLevel = {}; // written below; only read by the commented-out branch
+ var currentHeadlineLevel;
+ parseResult.pages.forEach(page => {
+ const newBlocks = [];
+ page.items.forEach(block => {
+ newBlocks.push(block); // the original block always stays in the list, possibly flagged as removed
+ if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) { // untyped, un-annotated and taller than the most used height
+ const combineResult = textCombiner.combine(block.textItems);
+ if (combineResult.textItems.length == 1) { // only blocks that combine to a single text item are considered
+ const height = combineResult.textItems[0].height;
+ if (height == maxHeight) { // maxHeight is read from parseResult.globals (presumably the largest text height - TODO confirm)
+ block.annotation = REMOVED_ANNOTATION; // the headline block added two lines below takes its place
+ currentHeadlineLevel = 1;
+ headlineSizePerLevel[currentHeadlineLevel] = height
+ addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
+ }
+ // else if (currentHeadlineLevel) {
+ // const currentLevelSize = headlineSizePerLevel[currentHeadlineLevel];
+ // if (height < currentLevelSize) {
+ // const nextLevelSize = headlineSizePerLevel[currentHeadlineLevel + 1];
+ // // if(!nextLevelSize)
+ // if (currentHeadlineLevel < 6) {
+ // currentHeadlineLevel++;
+ // }
+ // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
+ // headlineSizePerLevel[currentHeadlineLevel] = height;
+ // } else if (height > currentLevelSize) {
+ // const preLevelSize = headlineSizePerLevel[currentHeadlineLevel - 1];
+ // if (currentHeadlineLevel > 1) {
+ // currentHeadlineLevel--;
+ // }
+ // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
+ // headlineSizePerLevel[currentHeadlineLevel] = height;
+ // } else {
+ // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
+ // }
+ // }
+ }
+ }
+ });
+ page.items = newBlocks;
+ });
+
+ const heightToOccurrence = {}; // NOTE(review): only filled by the commented-out code below, so it is reported as {}
+ const fontToOccurrence = {}; // NOTE(review): same as heightToOccurrence - always {} in the active code
+ // parseResult.content.forEach(page => {
+ // const newBlocks = [];
+ // page.blocks.forEach(block => {
+ // newBlocks.push(block);
+ // if (!block.type && block.textItems[0].height > mostUsedHeight) {
+ // foundHeadlines++;
+ // block.annotation = REMOVED_ANNOTATION;
+ // const combineResult = textCombiner.combine(block.textItems);
+ // const height = combineResult.textItems[0].height;
+ // const font = combineResult.textItems[0].font;
+ // heightToOccurrence[height] = heightToOccurrence[height] ? heightToOccurrence[height] + 1 : 1;
+ // fontToOccurrence[font] = fontToOccurrence[font] ? fontToOccurrence[font] + 1 : 1;
+ // newBlocks.push(new PdfBlock({
+ // textItems: combineResult.textItems,
+ // type: HEADLINE1,
+ // annotation: ADDED_ANNOTATION,
+ // parsedElements: combineResult.parsedElements
+ // }));
+ // }
+ // });
+ // page.blocks = newBlocks;
+ // });
+
+ return new ParseResult({
+ ...parseResult, // carry over every existing field; only messages is replaced
+ messages: [
+ 'Found headlines: ' + foundHeadlines,
+ 'Height repetition: ' + JSON.stringify(heightToOccurrence),
+ 'Font repetition: ' + JSON.stringify(fontToOccurrence),
+ 'Pages with max Header: ' + maxHeaderPages,
+ 'Headline Height Flow (before TOC): ' + headlineHeightFlowBeforeToc,
+ 'Headline Heights Occurence (before TOC): ' + JSON.stringify(headlineHeightsOccurenceBeforeToc),
+ 'Headline Height Flow: ' + headlineHeightFlowAfterToc,
+ 'Headline Heights Occurence: ' + JSON.stringify(headlineHeightsOccurenceAfterToc),
+ ]
+ });
+ }
+
+}
+
+/**
+ * Appends a freshly created TextItemBlock to the given block list.
+ *
+ * The new block takes its text items and parsed elements from the combine
+ * result, is typed with the given headline level and is flagged with
+ * ADDED_ANNOTATION.
+ *
+ * @param newBlocks      array the new block is pushed onto (mutated)
+ * @param combineResult  object providing `textItems` and `parsedElements`
+ * @param headlineLevel  value stored as the new block's `type`
+ */
+function addNewBlock(newBlocks, combineResult, headlineLevel) {
+    const {textItems, parsedElements} = combineResult;
+    const headlineBlock = new TextItemBlock({
+        textItems: textItems,
+        type: headlineLevel,
+        annotation: ADDED_ANNOTATION,
+        parsedElements: parsedElements
+    });
+    newBlocks.push(headlineBlock);
+}
+
+/**
+ * Converts the large text blocks of every page that contains a max-height block into headlines.
+ *
+ * A page qualifies when one of its untyped blocks starts with a text item whose height equals
+ * maxHeight. On a qualifying page, each untyped block whose first item is taller than
+ * mostUsedHeight + (maxHeight - mostUsedHeight) / 4 is handled as follows:
+ *  - first item has maxHeight               -> a HEADLINE1 block is added behind it
+ *  - block combines to a single text item   -> a HEADLINE2 block is added behind it
+ *  - anything else                          -> the block is left untouched
+ * A block that received a headline stays in the list, flagged with REMOVED_ANNOTATION.
+ *
+ * @param pages           pages to scan; `items` of qualifying pages is reassigned
+ * @param maxHeight       height that marks a level 1 headline
+ * @param mostUsedHeight  height used as the base of the level 2 threshold
+ * @param textCombiner    object whose combine(textItems) yields `textItems` and `parsedElements`
+ * @returns array holding `page.index + 1` of every qualifying page, in page order
+ */
+function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) {
+    // Find pages with max height
+    const maxHeaderPagesSet = new Set();
+    pages.forEach(page => {
+        page.items.forEach(block => {
+            if (!block.type && block.textItems[0].height == maxHeight) {
+                maxHeaderPagesSet.add(page);
+            }
+        });
+    });
+
+    // Now convert those pages to headlines
+    const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
+    maxHeaderPagesSet.forEach(pageWithMaxHeader => {
+        const newBlocks = [];
+        pageWithMaxHeader.items.forEach(block => {
+            newBlocks.push(block);
+            const height = block.textItems[0].height;
+            if (!block.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
+                const combineResult = textCombiner.combine(block.textItems);
+                const isTopLevel = height == maxHeight;
+                // Flag the original as removed only when a headline really replaces it.
+                // Otherwise (multi-line block below max height) the text would be dropped
+                // by the cleanup step without any replacement.
+                if (isTopLevel || combineResult.textItems.length == 1) {
+                    block.annotation = REMOVED_ANNOTATION;
+                    addNewBlock(newBlocks, combineResult, isTopLevel ? HEADLINE1 : HEADLINE2);
+                }
+            }
+        });
+        pageWithMaxHeader.items = newBlocks;
+    });
+
+    return Array.from(maxHeaderPagesSet).map(page => page.index + 1);
+}
+
+/**
+ * Gathers height statistics of headline candidates on pages[from .. to).
+ *
+ * Pages whose `index + 1` is listed in maxHeaderPages are skipped. A block counts as a
+ * candidate when it has neither type nor annotation, its first text item is taller than
+ * mostUsedHeight and it combines to exactly one text item.
+ *
+ * @param pages           all pages
+ * @param from            index of the first page to scan (inclusive)
+ * @param to              index of the page to stop at (exclusive)
+ * @param textCombiner    object whose combine(textItems) yields `textItems`
+ * @param mostUsedHeight  candidates must be taller than this
+ * @param maxHeaderPages  page numbers (`index + 1`) to leave out
+ * @returns [flow, occurrences]: `flow` lists candidate heights in encounter order with
+ *          directly repeated heights collapsed; `occurrences` maps height to its count
+ */
+function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeight, maxHeaderPages) {
+    const flow = [];
+    const occurrences = {};
+    var previousHeight;
+    for (var pageIdx = from; pageIdx < to; pageIdx++) {
+        const page = pages[pageIdx];
+        if (maxHeaderPages.includes(page.index + 1)) {
+            continue;
+        }
+        page.items.forEach(block => {
+            const isCandidate = !block.type && !block.annotation && block.textItems[0].height > mostUsedHeight;
+            if (!isCandidate) {
+                return;
+            }
+            const combinedItems = textCombiner.combine(block.textItems).textItems;
+            if (combinedItems.length != 1) {
+                return;
+            }
+            const height = combinedItems[0].height;
+            occurrences[height] = (occurrences[height] || 0) + 1;
+            const repeatsPrevious = previousHeight && height == previousHeight;
+            if (!repeatsPrevious) {
+                flow.push(height);
+                //headlineFontFlow.push(combineResult.textItems[0].font)
+                previousHeight = height;
+            }
+        });
+    }
+
+    return [flow, occurrences];
+}
+
diff --git a/src/javascript/models/transformations/DetectLists.jsx b/src/javascript/models/transformations/DetectLists.jsx
index e2bd6f6..1ac0459 100644
--- a/src/javascript/models/transformations/DetectLists.jsx
+++ b/src/javascript/models/transformations/DetectLists.jsx
@@ -1,14 +1,14 @@
-import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
+import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
-import PdfBlock from '../PdfBlock.jsx';
+import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { PARAGRAPH, LIST_BLOCK } from '../MarkdownElements.jsx';
import { minXFromBlocks } from '../../textItemFunctions.jsx';
//Detect quotes, code etc.. which is transformed to markdown code syntax
-export default class DetectLists extends ToPdfBlockViewTransformation {
+export default class DetectLists extends ToTextItemBlockTransformation {
constructor() {
super("Detect Lists");
@@ -21,11 +21,11 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
mostUsedDistance: mostUsedDistance
});
- parseResult.content.forEach(page => {
- var minX = minXFromBlocks(page.blocks);
+ parseResult.pages.forEach(page => {
+ var minX = minXFromBlocks(page.items);
if (minX) {
const newBlocks = [];
- page.blocks.forEach(block => {
+ page.items.forEach(block => {
newBlocks.push(block);
if (!block.type) {
const combineResult = textCombiner.combine(block.textItems);
@@ -81,14 +81,14 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
});
if (itemsBeforeFirstLineItem.length > 0) {
- newBlocks.push(new PdfBlock({
+ newBlocks.push(new TextItemBlock({
textItems: itemsBeforeFirstLineItem,
type: PARAGRAPH,
annotation: ADDED_ANNOTATION
}));
}
//TODO display with whitespace pre support
- newBlocks.push(new PdfBlock({
+ newBlocks.push(new TextItemBlock({
textItems: listBlockItems,
type: LIST_BLOCK,
annotation: ADDED_ANNOTATION,
@@ -97,7 +97,7 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
}
}
});
- page.blocks = newBlocks;
+ page.items = newBlocks;
}
});
diff --git a/src/javascript/models/transformations/DetectPdfBlocks.jsx b/src/javascript/models/transformations/DetectPdfBlocks.jsx
index 34e9d12..78d5fce 100644
--- a/src/javascript/models/transformations/DetectPdfBlocks.jsx
+++ b/src/javascript/models/transformations/DetectPdfBlocks.jsx
@@ -1,10 +1,10 @@
-import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
+import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
+import Page from '../Page.jsx';
import ParseResult from '../ParseResult.jsx';
-import PdfBlockPage from '../PdfBlockPage.jsx';
-import PdfBlock from '../PdfBlock.jsx';
+import TextItemBlock from '../TextItemBlock.jsx';
import { minXFromTextItems } from '../../textItemFunctions.jsx';
-export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
+export default class DetectPdfBlocks extends ToTextItemBlockTransformation {
constructor() {
super("Detect Blocks");
@@ -13,20 +13,20 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
var createdBlocks = 0;
- const newContent = parseResult.content.map(page => {
- var minX = minXFromTextItems(page.textItems);
+ const newPages = parseResult.pages.map(page => {
+ var minX = minXFromTextItems(page.items);
const blocks = [];
var textItemsInBlock = [];
const completBlock = () => {
if (textItemsInBlock.length > 0) { //can happen on empty page
- blocks.push(new PdfBlock({
+ blocks.push(new TextItemBlock({
textItems: textItemsInBlock
}));
textItemsInBlock = [];
}
};
var lastItem;
- page.textItems.forEach(item => {
+ page.items.forEach(item => {
if (lastItem) {
if (shouldSplit(lastItem, item, minX, mostUsedDistance)) {
@@ -39,16 +39,16 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
completBlock();
createdBlocks += blocks.length;
- return new PdfBlockPage({
+ return new Page({
...page,
- blocks: blocks
+ items: blocks
});
});
return new ParseResult({
...parseResult,
- content: newContent,
+ pages: newPages,
messages: ['Splitted into ' + createdBlocks + ' blocks']
});
}
diff --git a/src/javascript/models/transformations/DetectTOC.jsx b/src/javascript/models/transformations/DetectTOC.jsx
index b13a6ea..b39f17a 100644
--- a/src/javascript/models/transformations/DetectTOC.jsx
+++ b/src/javascript/models/transformations/DetectTOC.jsx
@@ -1,7 +1,7 @@
-import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
+import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
-import PdfBlock from '../PdfBlock.jsx';
+import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import HeadlineFinder from '../HeadlineFinder.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
@@ -9,16 +9,16 @@ import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
import { isDigit } from '../../functions.jsx'
//Detect table of contents pages
-export default class DetectTOC extends ToPdfBlockViewTransformation {
+export default class DetectTOC extends ToTextItemBlockTransformation {
constructor() {
- super("Detect Table of Contents");
+ super("Detect TOC");
}
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
const tocPages = [];
- const maxPagesToEvaluate = Math.min(20, parseResult.content.length);
+ const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance
});
@@ -26,14 +26,14 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
const linkLeveler = new LinkLeveler();
var tocLinks = [];
var lastTocPage;
- parseResult.content.slice(0, maxPagesToEvaluate).forEach(page => {
+ parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
var linesCount = 0;
var linesWithDigitsCount = 0;
var lineItemsWithDigits = [];
const unknownBlocks = new Set();
var headlineBlock;
const pageTocLinks = [];
- page.blocks.forEach(block => {
+ page.items.forEach(block => {
var blockHasLinesWithDigits = false;
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
var lastLineTextWithoutNumber;
@@ -87,20 +87,20 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
tocLinks = tocLinks.concat(pageTocLinks);
const newBlocks = [];
- page.blocks.forEach((block) => {
+ page.items.forEach((block) => {
if (!unknownBlocks.has(block)) {
block.annotation = REMOVED_ANNOTATION;
}
newBlocks.push(block);
if (block === headlineBlock) {
- newBlocks.push(new PdfBlock({
+ newBlocks.push(new TextItemBlock({
textItems: textCombiner.combine(block.textItems).textItems,
type: HEADLINE2,
annotation: ADDED_ANNOTATION
}));
}
});
- page.blocks = newBlocks;
+ page.items = newBlocks;
}
});
@@ -109,12 +109,12 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
const notFoundHeadlines = [];
if (tocPages.length > 0) {
tocLinks.forEach(tocLink => {
- var linkedPage = parseResult.content[tocLink.pageNumber - 1];
+ var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
var foundHeadline = false;
if (linkedPage) {
foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
if (!foundHeadline) { // pages are off by 1 ?
- linkedPage = parseResult.content[tocLink.pageNumber];
+ linkedPage = parseResult.pages[tocLink.pageNumber];
if (linkedPage) {
foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
}
@@ -126,7 +126,7 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
notFoundHeadlines.push(tocLink);
}
});
- lastTocPage.blocks.push(new PdfBlock({
+ lastTocPage.items.push(new TextItemBlock({
textItems: tocLinks.map(tocLink => {
tocLink.textItem.text = ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text;
return tocLink.textItem
@@ -164,7 +164,7 @@ function findHeadline(page, tocLink, textCombiner) {
});
var blockIndex = 0;
var lastBlock;
- for ( var block of page.blocks ) {
+ for ( var block of page.items ) {
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
for ( var item of itemsGroupedByY ) {
const headlineItems = headlineFinder.consume(item);
@@ -175,7 +175,7 @@ function findHeadline(page, tocLink, textCombiner) {
// 2 line headline
lastBlock.annotation = REMOVED_ANNOTATION;
}
- page.blocks.splice(blockIndex + 1, 0, new PdfBlock({
+ page.items.splice(blockIndex + 1, 0, new TextItemBlock({
textItems: [new TextItem({
...usedItems[0],
text: headline
diff --git a/src/javascript/models/transformations/RemoveRepetitiveElements.jsx b/src/javascript/models/transformations/RemoveRepetitiveElements.jsx
index bcb258c..b5e3a0c 100644
--- a/src/javascript/models/transformations/RemoveRepetitiveElements.jsx
+++ b/src/javascript/models/transformations/RemoveRepetitiveElements.jsx
@@ -1,4 +1,4 @@
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
+import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
@@ -20,7 +20,7 @@ function hashCodeIgnoringSpacesAndNumbers(string) {
// Remove elements with similar content on same page positions, like page numbers, licenes information, etc...
-export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
+export default class RemoveRepetitiveElements extends ToTextItemTransformation {
constructor() {
super("Remove Repetitive Elements");
@@ -36,8 +36,8 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
const pageStore = [];
const minLineHashRepetitions = {};
const maxLineHashRepetitions = {};
- parseResult.content.forEach(pdfPage => {
- const minMaxItems = pdfPage.textItems.reduce((itemStore, item) => {
+ parseResult.pages.forEach(page => {
+ const minMaxItems = page.items.reduce((itemStore, item) => {
if (item.y < itemStore.minY) {
itemStore.minElements = [item];
itemStore.minY = item.y;
@@ -73,14 +73,14 @@ export default class RemoveRepetitiveElements extends ToPdfViewTransformation {
// now annoate all removed items
var removedHeader = 0;
var removedFooter = 0;
- parseResult.content.forEach((pdfPage, i) => {
- if (minLineHashRepetitions[pageStore[i].minLineHash] >= Math.max(3, parseResult.content.length * 2 / 3)) {
+ parseResult.pages.forEach((page, i) => {
+ if (minLineHashRepetitions[pageStore[i].minLineHash] >= Math.max(3, parseResult.pages.length * 2 / 3)) {
pageStore[i].minElements.forEach(item => {
item.annotation = REMOVED_ANNOTATION;
});
removedFooter++;
}
- if (maxLineHashRepetitions[pageStore[i].maxLineHash] >= Math.max(3, parseResult.content.length * 2 / 3)) {
+ if (maxLineHashRepetitions[pageStore[i].maxLineHash] >= Math.max(3, parseResult.pages.length * 2 / 3)) {
pageStore[i].maxElements.forEach(item => {
item.annotation = REMOVED_ANNOTATION;
});
diff --git a/src/javascript/models/transformations/ToMarkdown.jsx b/src/javascript/models/transformations/ToMarkdown.jsx
index 7b1cd17..2d9415b 100644
--- a/src/javascript/models/transformations/ToMarkdown.jsx
+++ b/src/javascript/models/transformations/ToMarkdown.jsx
@@ -2,7 +2,6 @@ import React from 'react';
import MarkdownPageView from '../../components/debug/MarkdownPageView.jsx';
import Transformation from './Transformation.jsx';
import ParseResult from '../ParseResult.jsx';
-import TextPage from '../TextPage.jsx';
export default class ToMarkdown extends Transformation {
@@ -15,18 +14,15 @@ export default class ToMarkdown extends Transformation {
}
transform(parseResult:ParseResult) {
- var text = '';
- parseResult.content.forEach(page => {
- page.blocks.forEach((block) => {
+ parseResult.pages.forEach(page => {
+ var text = '';
+ page.items.forEach(block => {
text += block.text + '\n';
});
+ page.items = [text];
});
return new ParseResult({
...parseResult,
- content: [new TextPage({
- index: 0,
- text: text
- })],
});
}
diff --git a/src/javascript/models/transformations/ToPdfViewTransformation.jsx b/src/javascript/models/transformations/ToPdfViewTransformation.jsx
deleted file mode 100644
index 2bbd699..0000000
--- a/src/javascript/models/transformations/ToPdfViewTransformation.jsx
+++ /dev/null
@@ -1,45 +0,0 @@
-import React from 'react';
-import Transformation from './Transformation.jsx';
-import ParseResult from '../ParseResult.jsx';
-import PdfPageView from '../../components/debug/PdfPageView.jsx';
-import { REMOVED_ANNOTATION } from '../Annotation.jsx';
-
-// Abstract class for transformations producing a PdfPage to be shown in the PdfView
-export default class ToPdfViewTransformation extends Transformation {
-
- constructor(name) {
- super(name);
- if (this.constructor === ToPdfViewTransformation) {
- throw new TypeError("Can not construct abstract class.");
- }
- this.showWhitespaces = false;
- }
-
- showPageSelection() {
- return true;
- }
-
- showModificationCheckbox() {
- return true;
- }
-
- createPageView(page, modificationsOnly) {
- return ;
- }
-
- completeTransform(parseResult:ParseResult) {
- // The usual cleanup
- parseResult.messages = [];
- parseResult.content.forEach(page => {
- page.textItems = page.textItems.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION);
- page.textItems.forEach(block => block.annotation = null);
- });
- return parseResult;
- }
-
-
-}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/ToTextBlocks.jsx b/src/javascript/models/transformations/ToTextBlocks.jsx
index 315deaf..10d8b0f 100644
--- a/src/javascript/models/transformations/ToTextBlocks.jsx
+++ b/src/javascript/models/transformations/ToTextBlocks.jsx
@@ -1,8 +1,7 @@
import React from 'react';
import Transformation from './Transformation.jsx';
-import BlockPageView from '../../components/debug/BlockPageView.jsx';
+import TextPageView from '../../components/debug/TextPageView.jsx';
import ParseResult from '../ParseResult.jsx';
-import BlockPage from '../BlockPage.jsx';
import { blockToText } from '../MarkdownElements.jsx';
export default class ToTextBlocks extends Transformation {
@@ -12,27 +11,23 @@ export default class ToTextBlocks extends Transformation {
}
createPageView(page, modificationsOnly) { // eslint-disable-line no-unused-vars
- return ;
+ return ;
}
transform(parseResult:ParseResult) {
- const blocks = [];
- parseResult.content.forEach(page => {
- page.blocks.forEach(block => {
+ parseResult.pages.forEach(page => {
+ const textItems = [];
+ page.items.forEach(block => {
const category = block.type ? block.type : 'Unknown';
- blocks.push({
+ textItems.push({
category: category,
text: blockToText(block)
});
});
-
+ page.items = textItems;
});
return new ParseResult({
...parseResult,
- content: [new BlockPage({
- index: 0,
- blocks: blocks
- })],
});
}
diff --git a/src/javascript/models/transformations/ToTextItemBlockTransformation.jsx b/src/javascript/models/transformations/ToTextItemBlockTransformation.jsx
new file mode 100644
index 0000000..2687615
--- /dev/null
+++ b/src/javascript/models/transformations/ToTextItemBlockTransformation.jsx
@@ -0,0 +1,44 @@
+import React from 'react';
+import Transformation from './Transformation.jsx';
+import ParseResult from '../ParseResult.jsx';
+import TextItemBlockPageView from '../../components/debug/TextItemBlockPageView.jsx';
+import { REMOVED_ANNOTATION } from '../Annotation.jsx';
+
+// Abstract base class for transformations producing TextItemBlock(s) to be shown in the TextItemBlockPageView
+export default class ToTextItemBlockTransformation extends Transformation {
+
+ constructor(name) {
+ super(name);
+ if (this.constructor === ToTextItemBlockTransformation) { // direct instantiation is rejected; subclasses pass this check
+ throw new TypeError("Can not construct abstract class.");
+ }
+ this.showWhitespaces = false;
+ }
+
+ showPageSelection() {
+ return true;
+ }
+
+ showModificationCheckbox() {
+ return true;
+ }
+
+ createPageView(page, modificationsOnly) {
+ return ; // NOTE(review): the returned element is not visible in this excerpt - presumably a TextItemBlockPageView, confirm against the source file
+ }
+
+ completeTransform(parseResult:ParseResult) { // mutates parseResult and returns the same object
+ // The usual cleanup
+ parseResult.messages = [];
+ parseResult.pages.forEach(page => {
+ page.items = page.items.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION); // drop items flagged as removed
+ page.items.forEach(item => item.annotation = null); // clear the annotation of every remaining item
+ });
+ return parseResult;
+ }
+
+}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/ToPdfBlockViewTransformation.jsx b/src/javascript/models/transformations/ToTextItemTransformation.jsx
similarity index 59%
rename from src/javascript/models/transformations/ToPdfBlockViewTransformation.jsx
rename to src/javascript/models/transformations/ToTextItemTransformation.jsx
index 54db116..3514a56 100644
--- a/src/javascript/models/transformations/ToPdfBlockViewTransformation.jsx
+++ b/src/javascript/models/transformations/ToTextItemTransformation.jsx
@@ -1,15 +1,15 @@
import React from 'react';
import Transformation from './Transformation.jsx';
import ParseResult from '../ParseResult.jsx';
-import PdfBlockPageView from '../../components/debug/PdfBlockPageView.jsx';
+import TextItemPageView from '../../components/debug/TextItemPageView.jsx';
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
-// Abstract class for transformations producing a PdfBlockPage to be shown in the PdfBlockView
-export default class ToPdfBlockViewTransformation extends Transformation {
+// Abstract class for transformations producing TextItem(s) to be shown in the TextItemPageView
+export default class ToTextItemTransformation extends Transformation {
constructor(name) {
super(name);
- if (this.constructor === ToPdfBlockViewTransformation) {
+ if (this.constructor === ToTextItemTransformation) {
throw new TypeError("Can not construct abstract class.");
}
this.showWhitespaces = false;
@@ -24,9 +24,9 @@ export default class ToPdfBlockViewTransformation extends Transformation {
}
createPageView(page, modificationsOnly) {
- return ;
}
@@ -34,11 +34,12 @@ export default class ToPdfBlockViewTransformation extends Transformation {
completeTransform(parseResult:ParseResult) {
// The usual cleanup
parseResult.messages = [];
- parseResult.content.forEach(page => {
- page.blocks = page.blocks.filter(block => !block.annotation || block.annotation !== REMOVED_ANNOTATION);
- page.blocks.forEach(block => block.annotation = null);
+ parseResult.pages.forEach(page => {
+ page.items = page.items.filter(item => !item.annotation || item.annotation !== REMOVED_ANNOTATION);
+ page.items.forEach(item => item.annotation = null);
});
return parseResult;
}
+
}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/Transformation.jsx b/src/javascript/models/transformations/Transformation.jsx
index 58f2fb8..b5ff64b 100644
--- a/src/javascript/models/transformations/Transformation.jsx
+++ b/src/javascript/models/transformations/Transformation.jsx
@@ -14,7 +14,7 @@ export default class Transformation {
}
showPageSelection() {
- return false;
+ return true;
}
showModificationCheckbox() {
diff --git a/src/javascript/models/transformations/VerticalToHorizontal.jsx b/src/javascript/models/transformations/VerticalToHorizontal.jsx
index b1435db..96fed1f 100644
--- a/src/javascript/models/transformations/VerticalToHorizontal.jsx
+++ b/src/javascript/models/transformations/VerticalToHorizontal.jsx
@@ -1,10 +1,10 @@
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
+import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
// Converts vertical text to horizontal
-export default class VerticalToHorizontal extends ToPdfViewTransformation {
+export default class VerticalToHorizontal extends ToTextItemTransformation {
constructor() {
super("Vertical to Horizontal Text");
@@ -12,7 +12,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
transform(parseResult:ParseResult) {
var foundVerticals = 0;
- const newContent = parseResult.content.map(page => {
+ const newPages = parseResult.pages.map(page => {
const newTextItems = [];
// var oneCharacterItems = [];
@@ -33,7 +33,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
//TODO generic state machine code ?
- const leftOver = page.textItems.reduce((oneCharacterItems, item) => {
+ const leftOver = page.items.reduce((oneCharacterItems, item) => {
if (item.text.trim().length == 1) {
if (oneCharacterItems.length == 0) {
oneCharacterItems.push(item);
@@ -84,12 +84,12 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
return {
...page,
- textItems: newTextItems
+ items: newTextItems
};
});
return new ParseResult({
...parseResult,
- content: newContent,
+ pages: newPages,
messages: ["Converted " + foundVerticals + " verticals"]
});
}
diff --git a/src/javascript/textItemFunctions.jsx b/src/javascript/textItemFunctions.jsx
index a60eeca..8c33ffc 100644
--- a/src/javascript/textItemFunctions.jsx
+++ b/src/javascript/textItemFunctions.jsx
@@ -1,7 +1,7 @@
-import PdfBlock from './models/PdfBlock.jsx';
+import TextItemBlock from './models/TextItemBlock.jsx';
import TextItem from './models/TextItem.jsx';
-export function minXFromBlocks(blocks:PdfBlock[]) {
+export function minXFromBlocks(blocks:TextItemBlock[]) {
var minX = 999;
blocks.forEach(block => {
block.textItems.forEach(item => {