diff --git a/src/javascript/components/debug/TextItemBlockPageView.jsx b/src/javascript/components/debug/TextItemBlockPageView.jsx index c0bb89c..261c16f 100644 --- a/src/javascript/components/debug/TextItemBlockPageView.jsx +++ b/src/javascript/components/debug/TextItemBlockPageView.jsx @@ -8,7 +8,7 @@ export default class TextItemBlockPageView extends PageView { createItemViews(items, showWhitespaces) { const blockTables = items.map((block, i) => { var textItems = block.textItems; - const blockType = block.type ? ' - ' + block.type : null; + const blockType = block.type ? ' - ' + block.type.name : null; const blockAnnotation = block.annotation ? { ' - ' + block.annotation.category } : null; const borderStyle = block.annotation ? { diff --git a/src/javascript/components/debug/TextItemTable.jsx b/src/javascript/components/debug/TextItemTable.jsx index f8ed8f3..91b0ac7 100644 --- a/src/javascript/components/debug/TextItemTable.jsx +++ b/src/javascript/components/debug/TextItemTable.jsx @@ -50,7 +50,7 @@ export default class TextItemTable extends React.Component { { textItem.annotation ? textItem.annotation.category : '' }
- { textItem.type ? textItem.type : '' } + { textItem.type ? textItem.type.name : '' }
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' } diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index 3f91d19..d56c068 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -6,7 +6,7 @@ import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx'; import DetectTOC from './transformations/DetectTOC.jsx' -import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx' +import GatherBlocks from './transformations/GatherBlocks.jsx' import DetectFootnotes from './transformations/DetectFootnotes.jsx' import DetectLists from './transformations/DetectLists.jsx' import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx' @@ -36,7 +36,7 @@ export default class AppState { new VerticalToHorizontal(), new DetectTOC(), - new DetectPdfBlocks(), + new GatherBlocks(), new DetectFootnotes(), new DetectLists(), new DetectCodeBlocks(), diff --git a/src/javascript/models/ElementType.jsx b/src/javascript/models/ElementType.jsx new file mode 100644 index 0000000..5879da9 --- /dev/null +++ b/src/javascript/models/ElementType.jsx @@ -0,0 +1,51 @@ +import { Enum } from 'enumify'; + +export default class ElementType extends Enum { +} + +ElementType.initEnum({ + H1: { + + }, + H2: { + + }, + H3: { + + }, + H4: { + + }, + H5: { + + }, + H6: { + + }, + TOC: { + mergeToBlock: true + }, + FOOTNOTES: { + mergeToBlock: true, + mergeFollowingNonTypedItems: true + } +}); + +//export default ElementType + +export function headlineByLevel(level) { + if (level == 1) { + return ElementType.H1; + } else if (level == 2) { + return ElementType.H2; + } else if (level == 3) { + return ElementType.H3; + } else if (level == 4) { + return ElementType.H4; + } else if (level == 5) { + return ElementType.H5; + } else if (level == 6) { + return ElementType.H6; + } + throw "Unsupported headline level: " + level + " (supported are 1-6)"; +} \ No newline at end of file diff --git a/src/javascript/models/TextItemBlock.jsx b/src/javascript/models/TextItemBlock.jsx index 614c422..9e19266 100644 --- a/src/javascript/models/TextItemBlock.jsx +++ b/src/javascript/models/TextItemBlock.jsx @@ -1,11 +1,36 @@ import PageItem from './PageItem.jsx' +import TextItem from './TextItem.jsx' // A block of TextItem[] within a Page export default class TextItemBlock extends PageItem { constructor(options) { super(options); - this.textItems = options.textItems; + this.textItems = []; + if (options.textItems) { + options.textItems.forEach(item => this.addTextItem(item)); + } + } + + addTextItem(textItem:TextItem) { + if (this.type && textItem.type && this.type !== textItem.type) { + throw `Adding text item of type ${textItem.type} to block of type ${this.type}` + } + if (!this.type) { + this.type = textItem.type; + } + if (textItem.parsedElements) { + if (this.parsedElements) { + this.parsedElements.add(textItem.parsedElements); + } else { + this.parsedElements = textItem.parsedElements; + } + } + const copiedTextItem = new TextItem({ + ...textItem + }); + copiedTextItem.type = null; + this.textItems.push(copiedTextItem); } } diff --git a/src/javascript/models/transformations/CompactLines.jsx b/src/javascript/models/transformations/CompactLines.jsx index 0c55805..ac0ab35 100644 --- a/src/javascript/models/transformations/CompactLines.jsx +++ b/src/javascript/models/transformations/CompactLines.jsx @@ -4,8 +4,10 @@ import ToTextItemTransformation from './ToTextItemTransformation.jsx'; import ParseResult from '../ParseResult.jsx'; import TextItemLineGrouper from '../TextItemLineGrouper.jsx'; import TextItemLineCompactor from '../TextItemLineCompactor.jsx'; +import ElementType from '../ElementType.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx'; + // gathers text items on the same y line to one text item export default class CompactLines extends ToTextItemTransformation { @@ -44,6 +46,7 @@ export default class CompactLines extends ToTextItemTransformation { foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks); } if (combinedItem.parsedElements.footnotes.length > 0) { + combinedItem.type = ElementType.FOOTNOTES; const footnotes = combinedItem.parsedElements.footnotes.map(footnote => { footnote },); foundFootnotes.push.apply(foundFootnotes, footnotes); } diff --git a/src/javascript/models/transformations/DetectPdfBlocks.jsx b/src/javascript/models/transformations/DetectPdfBlocks.jsx deleted file mode 100644 index 78d5fce..0000000 --- a/src/javascript/models/transformations/DetectPdfBlocks.jsx +++ /dev/null @@ -1,73 +0,0 @@ -import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; -import Page from '../Page.jsx'; -import ParseResult from '../ParseResult.jsx'; -import TextItemBlock from '../TextItemBlock.jsx'; -import { minXFromTextItems } from '../../textItemFunctions.jsx'; - -export default class DetectPdfBlocks extends ToTextItemBlockTransformation { - - constructor() { - super("Detect Blocks"); - } - - transform(parseResult:ParseResult) { - const {mostUsedDistance} = parseResult.globals; - var createdBlocks = 0; - const newPages = parseResult.pages.map(page => { - var minX = minXFromTextItems(page.items); - const blocks = []; - var textItemsInBlock = []; - const completBlock = () => { - if (textItemsInBlock.length > 0) { //can happen on empty page - blocks.push(new TextItemBlock({ - textItems: textItemsInBlock - })); - textItemsInBlock = []; - } - }; - var lastItem; - page.items.forEach(item => { - - if (lastItem) { - if (shouldSplit(lastItem, item, minX, mostUsedDistance)) { - completBlock(); - } - } - textItemsInBlock.push(item); - lastItem = item; - }); - completBlock(); - - createdBlocks += blocks.length; - return new Page({ - ...page, - items: blocks - }); - - }); - - return new ParseResult({ - ...parseResult, - pages: newPages, - messages: ['Splitted into ' + createdBlocks + ' blocks'] - }); - } - -} - -function shouldSplit(lastItem, item, minX, mostUsedDistance) { - const distance = lastItem.y - item.y; - if (distance < 0 - mostUsedDistance / 2) { - //distance is negative - and not only a bit - return true; - } - var allowedDisctance = mostUsedDistance + 1; - if (lastItem.x == item.x && item.x > minX) { - //intended elements like lists often have greater spacing - allowedDisctance = mostUsedDistance + mostUsedDistance / 2; - } - if (distance > allowedDisctance) { - return true; - } - return false; -} \ No newline at end of file diff --git a/src/javascript/models/transformations/DetectTOC.jsx b/src/javascript/models/transformations/DetectTOC.jsx index edfe42f..25e6389 100644 --- a/src/javascript/models/transformations/DetectTOC.jsx +++ b/src/javascript/models/transformations/DetectTOC.jsx @@ -3,7 +3,8 @@ import ParseResult from '../ParseResult.jsx'; import TextItem from '../TextItem.jsx'; import HeadlineFinder from '../HeadlineFinder.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx'; -import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx'; +import ElementType from '../ElementType.jsx'; +import { headlineByLevel } from '../ElementType.jsx'; import { isDigit } from '../../functions.jsx' //Detect table of contents pages @@ -84,7 +85,7 @@ export default class DetectTOC extends ToTextItemTransformation { if (line === headlineItem) { newBlocks.push(new TextItem({ ...line, - type: HEADLINE2, + type: ElementType.H2, annotation: ADDED_ANNOTATION })); } @@ -120,7 +121,7 @@ export default class DetectTOC extends ToTextItemTransformation { tocLinks.forEach(tocLink => { lastTocPage.items.push(new TextItem({ text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text, - type: TOC_BLOCK, + type: ElementType.TOC, annotation: ADDED_ANNOTATION })); }); diff --git a/src/javascript/models/transformations/GatherBlocks.jsx b/src/javascript/models/transformations/GatherBlocks.jsx new file mode 100644 index 0000000..b1d4854 --- /dev/null +++ b/src/javascript/models/transformations/GatherBlocks.jsx @@ -0,0 +1,85 @@ +import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; +import ParseResult from '../ParseResult.jsx'; +import TextItemBlock from '../TextItemBlock.jsx'; +import { ADDED_ANNOTATION } from '../Annotation.jsx'; +import { minXFromTextItems } from '../../textItemFunctions.jsx'; + +// Gathers lines to blocks +export default class GatherBlocks extends ToTextItemBlockTransformation { + + constructor() { + super("Gather Blocks"); + } + + transform(parseResult:ParseResult) { + const {mostUsedDistance} = parseResult.globals; + var createdBlocks = 0; + var textItems = 0; + parseResult.pages.map(page => { + textItems += page.items.length; + const blocks = []; + var stashedBlock = new TextItemBlock({}); + const flushStashedItems = () => { + if (stashedBlock.textItems.length > 1) { + stashedBlock.annotation = ADDED_ANNOTATION; + } + + blocks.push(stashedBlock); + stashedBlock = new TextItemBlock({}); + createdBlocks++; + }; + + var minX = minXFromTextItems(page.items); + page.items.forEach(item => { + if (stashedBlock.textItems.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) { + flushStashedItems(); + } + stashedBlock.addTextItem(item); + }); + if (stashedBlock.textItems.length > 0) { + flushStashedItems(); + } + page.items = blocks; + }); + + return new ParseResult({ + ...parseResult, + messages: ['Gathered ' + createdBlocks + ' blocks out of ' + textItems + ' text items'] + }); + } + +} + +function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) { + if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) { + return false; + } + if (item.type !== stashedBlock.type) { + return true; + } + if (item.type) { + return !item.type.mergeToBlock; + } else { + console.debug(item); + const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1]; + return shouldSplit(lastItem, item, minX, mostUsedDistance); + } +} + + +function shouldSplit(lastItem, item, minX, mostUsedDistance) { + const distance = lastItem.y - item.y; + if (distance < 0 - mostUsedDistance / 2) { + //distance is negative - and not only a bit + return true; + } + var allowedDisctance = mostUsedDistance + 1; + if (lastItem.x == item.x && item.x > minX) { + //intended elements like lists often have greater spacing + allowedDisctance = mostUsedDistance + mostUsedDistance / 2; + } + if (distance > allowedDisctance) { + return true; + } + return false; +} \ No newline at end of file