diff --git a/src/javascript/components/debug/TextItemBlockPageView.jsx b/src/javascript/components/debug/TextItemBlockPageView.jsx
index c0bb89c..261c16f 100644
--- a/src/javascript/components/debug/TextItemBlockPageView.jsx
+++ b/src/javascript/components/debug/TextItemBlockPageView.jsx
@@ -8,7 +8,7 @@ export default class TextItemBlockPageView extends PageView {
createItemViews(items, showWhitespaces) {
const blockTables = items.map((block, i) => {
var textItems = block.textItems;
- const blockType = block.type ? ' - ' + block.type : null;
+ const blockType = block.type ? ' - ' + block.type.name : null;
const blockAnnotation = block.annotation ? { ' - ' + block.annotation.category }
: null;
const borderStyle = block.annotation ? {
diff --git a/src/javascript/components/debug/TextItemTable.jsx b/src/javascript/components/debug/TextItemTable.jsx
index f8ed8f3..91b0ac7 100644
--- a/src/javascript/components/debug/TextItemTable.jsx
+++ b/src/javascript/components/debug/TextItemTable.jsx
@@ -50,7 +50,7 @@ export default class TextItemTable extends React.Component {
{ textItem.annotation ? textItem.annotation.category : '' }
- { textItem.type ? textItem.type : '' }
+ { textItem.type ? textItem.type.name : '' }
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx
index 3f91d19..d56c068 100644
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -6,7 +6,7 @@ import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
import DetectTOC from './transformations/DetectTOC.jsx'
-import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
+import GatherBlocks from './transformations/GatherBlocks.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import DetectLists from './transformations/DetectLists.jsx'
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
@@ -36,7 +36,7 @@ export default class AppState {
new VerticalToHorizontal(),
new DetectTOC(),
- new DetectPdfBlocks(),
+ new GatherBlocks(),
new DetectFootnotes(),
new DetectLists(),
new DetectCodeBlocks(),
diff --git a/src/javascript/models/ElementType.jsx b/src/javascript/models/ElementType.jsx
new file mode 100644
index 0000000..5879da9
--- /dev/null
+++ b/src/javascript/models/ElementType.jsx
@@ -0,0 +1,51 @@
+import { Enum } from 'enumify';
+
+/**
+ * Types a text item (and the block gathered from it) can be tagged with.
+ *
+ * A type may carry flags that are evaluated when items are gathered into blocks:
+ * - mergeToBlock: consecutive items of that same type end up in one block
+ * - mergeFollowingNonTypedItems: untyped items following a block of that type
+ *   are added to it instead of starting a new block
+ */
+export default class ElementType extends Enum {}
+
+ElementType.initEnum({
+    // headlines carry no flags
+    H1: {},
+    H2: {},
+    H3: {},
+    H4: {},
+    H5: {},
+    H6: {},
+
+    // table of contents
+    TOC: {
+        mergeToBlock: true,
+    },
+
+    // footnotes
+    FOOTNOTES: {
+        mergeToBlock: true,
+        mergeFollowingNonTypedItems: true,
+    },
+});
+
+//export default ElementType
+
+// Returns the headline ElementType (H1-H6) for the given level.
+// level: headline level, 1-6 (numeric strings such as '2' are accepted as well).
+// Throws an Error for any level outside 1-6.
+export function headlineByLevel(level) {
+    const headlines = [
+        ElementType.H1, ElementType.H2, ElementType.H3,
+        ElementType.H4, ElementType.H5, ElementType.H6
+    ];
+    // level - 1 coerces numeric strings, so '2' resolves to the same entry as 2
+    const headline = headlines[level - 1];
+    if (!headline) {
+        // a real Error (instead of a bare string) carries a stack trace
+        throw new Error("Unsupported headline level: " + level + " (supported are 1-6)");
+    }
+    return headline;
+}
\ No newline at end of file
diff --git a/src/javascript/models/TextItemBlock.jsx b/src/javascript/models/TextItemBlock.jsx
index 614c422..9e19266 100644
--- a/src/javascript/models/TextItemBlock.jsx
+++ b/src/javascript/models/TextItemBlock.jsx
@@ -1,11 +1,36 @@
import PageItem from './PageItem.jsx'
+import TextItem from './TextItem.jsx'
// A block of TextItem[] within a Page
export default class TextItemBlock extends PageItem {
constructor(options) {
super(options);
- this.textItems = options.textItems;
+        this.textItems = [];
+        (options.textItems || []).forEach(item => this.addTextItem(item));
+    }
+
+    // Appends a copy of textItem (with its type cleared) to this block and lifts the item's
+    // type and parsedElements up to the block. Throws an Error if the item's type conflicts.
+    addTextItem(textItem:TextItem) {
+        if (this.type && textItem.type && this.type !== textItem.type) {
+            // a real Error (instead of a bare string) carries a stack trace
+            throw new Error(`Adding text item of type ${textItem.type} to block of type ${this.type}`);
+        }
+        if (!this.type) {
+            this.type = textItem.type;
+        }
+        if (textItem.parsedElements) {
+            if (this.parsedElements) {
+                this.parsedElements.add(textItem.parsedElements);
+            } else {
+                // NOTE(review): aliases the item's object; if add() mutates in place, later adds change that item too - confirm
+                this.parsedElements = textItem.parsedElements;
+            }
+        }
+        const copiedTextItem = new TextItem({ ...textItem });
+        copiedTextItem.type = null;
+        this.textItems.push(copiedTextItem);
}
}
diff --git a/src/javascript/models/transformations/CompactLines.jsx b/src/javascript/models/transformations/CompactLines.jsx
index 0c55805..ac0ab35 100644
--- a/src/javascript/models/transformations/CompactLines.jsx
+++ b/src/javascript/models/transformations/CompactLines.jsx
@@ -4,8 +4,10 @@ import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItemLineGrouper from '../TextItemLineGrouper.jsx';
import TextItemLineCompactor from '../TextItemLineCompactor.jsx';
+import ElementType from '../ElementType.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
+
// gathers text items on the same y line to one text item
export default class CompactLines extends ToTextItemTransformation {
@@ -44,6 +46,7 @@ export default class CompactLines extends ToTextItemTransformation {
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
}
if (combinedItem.parsedElements.footnotes.length > 0) {
+ combinedItem.type = ElementType.FOOTNOTES;
const footnotes = combinedItem.parsedElements.footnotes.map(footnote =>
{ footnote },);
foundFootnotes.push.apply(foundFootnotes, footnotes);
}
diff --git a/src/javascript/models/transformations/DetectPdfBlocks.jsx b/src/javascript/models/transformations/DetectPdfBlocks.jsx
deleted file mode 100644
index 78d5fce..0000000
--- a/src/javascript/models/transformations/DetectPdfBlocks.jsx
+++ /dev/null
@@ -1,73 +0,0 @@
-import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
-import Page from '../Page.jsx';
-import ParseResult from '../ParseResult.jsx';
-import TextItemBlock from '../TextItemBlock.jsx';
-import { minXFromTextItems } from '../../textItemFunctions.jsx';
-
-export default class DetectPdfBlocks extends ToTextItemBlockTransformation {
-
- constructor() {
- super("Detect Blocks");
- }
-
- transform(parseResult:ParseResult) {
- const {mostUsedDistance} = parseResult.globals;
- var createdBlocks = 0;
- const newPages = parseResult.pages.map(page => {
- var minX = minXFromTextItems(page.items);
- const blocks = [];
- var textItemsInBlock = [];
- const completBlock = () => {
- if (textItemsInBlock.length > 0) { //can happen on empty page
- blocks.push(new TextItemBlock({
- textItems: textItemsInBlock
- }));
- textItemsInBlock = [];
- }
- };
- var lastItem;
- page.items.forEach(item => {
-
- if (lastItem) {
- if (shouldSplit(lastItem, item, minX, mostUsedDistance)) {
- completBlock();
- }
- }
- textItemsInBlock.push(item);
- lastItem = item;
- });
- completBlock();
-
- createdBlocks += blocks.length;
- return new Page({
- ...page,
- items: blocks
- });
-
- });
-
- return new ParseResult({
- ...parseResult,
- pages: newPages,
- messages: ['Splitted into ' + createdBlocks + ' blocks']
- });
- }
-
-}
-
-function shouldSplit(lastItem, item, minX, mostUsedDistance) {
- const distance = lastItem.y - item.y;
- if (distance < 0 - mostUsedDistance / 2) {
- //distance is negative - and not only a bit
- return true;
- }
- var allowedDisctance = mostUsedDistance + 1;
- if (lastItem.x == item.x && item.x > minX) {
- //intended elements like lists often have greater spacing
- allowedDisctance = mostUsedDistance + mostUsedDistance / 2;
- }
- if (distance > allowedDisctance) {
- return true;
- }
- return false;
-}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/DetectTOC.jsx b/src/javascript/models/transformations/DetectTOC.jsx
index edfe42f..25e6389 100644
--- a/src/javascript/models/transformations/DetectTOC.jsx
+++ b/src/javascript/models/transformations/DetectTOC.jsx
@@ -3,7 +3,8 @@ import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import HeadlineFinder from '../HeadlineFinder.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
-import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
+import ElementType from '../ElementType.jsx';
+import { headlineByLevel } from '../ElementType.jsx';
import { isDigit } from '../../functions.jsx'
//Detect table of contents pages
@@ -84,7 +85,7 @@ export default class DetectTOC extends ToTextItemTransformation {
if (line === headlineItem) {
newBlocks.push(new TextItem({
...line,
- type: HEADLINE2,
+ type: ElementType.H2,
annotation: ADDED_ANNOTATION
}));
}
@@ -120,7 +121,7 @@ export default class DetectTOC extends ToTextItemTransformation {
tocLinks.forEach(tocLink => {
lastTocPage.items.push(new TextItem({
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
- type: TOC_BLOCK,
+ type: ElementType.TOC,
annotation: ADDED_ANNOTATION
}));
});
diff --git a/src/javascript/models/transformations/GatherBlocks.jsx b/src/javascript/models/transformations/GatherBlocks.jsx
new file mode 100644
index 0000000..b1d4854
--- /dev/null
+++ b/src/javascript/models/transformations/GatherBlocks.jsx
@@ -0,0 +1,85 @@
+import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
+import ParseResult from '../ParseResult.jsx';
+import TextItemBlock from '../TextItemBlock.jsx';
+import { ADDED_ANNOTATION } from '../Annotation.jsx';
+import { minXFromTextItems } from '../../textItemFunctions.jsx';
+
+// Gathers the text items (lines) of each page into TextItemBlocks
+export default class GatherBlocks extends ToTextItemBlockTransformation {
+    constructor() {
+        super("Gather Blocks");
+    }
+
+    // Replaces every page's items (in place) with the blocks gathered from them and
+    // returns a copy of parseResult whose messages report the block / text item counts.
+    transform(parseResult:ParseResult) {
+        const {mostUsedDistance} = parseResult.globals;
+        let createdBlocks = 0;
+        let textItems = 0;
+        // forEach instead of map: the callback only has side effects, nothing is collected
+        parseResult.pages.forEach(page => {
+            textItems += page.items.length;
+            const blocks = [];
+            let stashedBlock = new TextItemBlock({});
+            const flushStashedItems = () => {
+                if (stashedBlock.textItems.length > 1) {
+                    stashedBlock.annotation = ADDED_ANNOTATION; // block holds several items
+                }
+
+                blocks.push(stashedBlock);
+                stashedBlock = new TextItemBlock({});
+                createdBlocks++;
+            };
+
+            const minX = minXFromTextItems(page.items);
+            page.items.forEach(item => {
+                if (stashedBlock.textItems.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) {
+                    flushStashedItems();
+                }
+                stashedBlock.addTextItem(item);
+            });
+            if (stashedBlock.textItems.length > 0) {
+                flushStashedItems();
+            }
+            page.items = blocks;
+        });
+
+        return new ParseResult({
+            ...parseResult,
+            messages: ['Gathered ' + createdBlocks + ' blocks out of ' + textItems + ' text items']
+        });
+    }
+}
+
+// Decides whether the stashed block has to be closed before item is added to it.
+function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
+    if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) {
+        return false; // such a block keeps collecting untyped items
+    }
+    if (item.type !== stashedBlock.type) {
+        return true;
+    }
+    if (item.type) {
+        return !item.type.mergeToBlock; // same type: keep merging only if the type allows it
+    }
+    // both untyped: decide by the distance between item and the block's last item
+    const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
+    return shouldSplit(lastItem, item, minX, mostUsedDistance);
+}
+
+
+// Tells whether the vertical distance between lastItem and item is too large
+// (or too far negative) for both to belong to the same block.
+function shouldSplit(lastItem, item, minX, mostUsedDistance) {
+    const yGap = lastItem.y - item.y;
+    // clearly negative gap: item.y exceeds lastItem.y by more than half the usual distance
+    if (yGap < 0 - mostUsedDistance / 2) {
+        return true;
+    }
+
+    // indented elements like lists often have greater spacing, so they get more slack
+    const sameIndentedColumn = lastItem.x == item.x && item.x > minX;
+    const maxGap = sameIndentedColumn
+        ? mostUsedDistance + mostUsedDistance / 2
+        : mostUsedDistance + 1;
+    return yGap > maxGap;
+}
\ No newline at end of file