[WIP] change gather blocks transformation to new system

This commit is contained in:
Johannes Zillmann 2017-03-10 12:10:58 +01:00
parent bd4c207ae3
commit 68e3fd7a9f
9 changed files with 173 additions and 81 deletions

View File

@ -8,7 +8,7 @@ export default class TextItemBlockPageView extends PageView {
createItemViews(items, showWhitespaces) {
const blockTables = items.map((block, i) => {
var textItems = block.textItems;
const blockType = block.type ? ' - ' + block.type : null;
const blockType = block.type ? ' - ' + block.type.name : null;
const blockAnnotation = block.annotation ? <span>{ ' - ' + block.annotation.category }</span>
: null;
const borderStyle = block.annotation ? {

View File

@ -50,7 +50,7 @@ export default class TextItemTable extends React.Component {
{ textItem.annotation ? textItem.annotation.category : '' }
</div>
<div style={ { textAlign: 'center', color: 'brown' } }>
{ textItem.type ? textItem.type : '' }
{ textItem.type ? textItem.type.name : '' }
</div>
<div style={ { textAlign: 'center', color: 'orange' } }>
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }

View File

@ -6,7 +6,7 @@ import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
import DetectTOC from './transformations/DetectTOC.jsx'
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
import GatherBlocks from './transformations/GatherBlocks.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import DetectLists from './transformations/DetectLists.jsx'
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
@ -36,7 +36,7 @@ export default class AppState {
new VerticalToHorizontal(),
new DetectTOC(),
new DetectPdfBlocks(),
new GatherBlocks(),
new DetectFootnotes(),
new DetectLists(),
new DetectCodeBlocks(),

View File

@ -0,0 +1,51 @@
import { Enum } from 'enumify';
// Enumeration of the element types a text item or block can be tagged with.
// The per-value option objects passed to initEnum() below carry flags that the
// block-gathering code reads from the enum value (`type.mergeToBlock`,
// `type.mergeFollowingNonTypedItems`).
export default class ElementType extends Enum {
}
ElementType.initEnum({
// Headline levels 1-6 define no flags, so `mergeToBlock` is undefined (falsy) for them.
H1: {
},
H2: {
},
H3: {
},
H4: {
},
H5: {
},
H6: {
},
// Table of contents entries.
TOC: {
// consecutive items of this same type are kept together in one block
mergeToBlock: true
},
// Footnote lines.
FOOTNOTES: {
// consecutive items of this same type are kept together in one block
mergeToBlock: true,
// items without any type that follow a block of this type are merged into it
mergeFollowingNonTypedItems: true
}
});
//export default ElementType
export function headlineByLevel(level) {
if (level == 1) {
return ElementType.H1;
} else if (level == 2) {
return ElementType.H2;
} else if (level == 3) {
return ElementType.H3;
} else if (level == 4) {
return ElementType.H4;
} else if (level == 5) {
return ElementType.H5;
} else if (level == 6) {
return ElementType.H6;
}
throw "Unsupported headline level: " + level + " (supported are 1-6)";
}

View File

@ -1,11 +1,36 @@
import PageItem from './PageItem.jsx'
import TextItem from './TextItem.jsx'
// A block of TextItem[] within a Page
export default class TextItemBlock extends PageItem {
constructor(options) {
super(options);
this.textItems = options.textItems;
this.textItems = [];
if (options.textItems) {
options.textItems.forEach(item => this.addTextItem(item));
}
}
addTextItem(textItem:TextItem) {
if (this.type && textItem.type && this.type !== textItem.type) {
throw `Adding text item of type ${textItem.type} to block of type ${this.type}`
}
if (!this.type) {
this.type = textItem.type;
}
if (textItem.parsedElements) {
if (this.parsedElements) {
this.parsedElements.add(textItem.parsedElements);
} else {
this.parsedElements = textItem.parsedElements;
}
}
const copiedTextItem = new TextItem({
...textItem
});
copiedTextItem.type = null;
this.textItems.push(copiedTextItem);
}
}

View File

@ -4,8 +4,10 @@ import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItemLineGrouper from '../TextItemLineGrouper.jsx';
import TextItemLineCompactor from '../TextItemLineCompactor.jsx';
import ElementType from '../ElementType.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
// gathers text items on the same y line to one text item
export default class CompactLines extends ToTextItemTransformation {
@ -44,6 +46,7 @@ export default class CompactLines extends ToTextItemTransformation {
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
}
if (combinedItem.parsedElements.footnotes.length > 0) {
combinedItem.type = ElementType.FOOTNOTES;
const footnotes = combinedItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
foundFootnotes.push.apply(foundFootnotes, footnotes);
}

View File

@ -1,73 +0,0 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import Page from '../Page.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import { minXFromTextItems } from '../../textItemFunctions.jsx';
export default class DetectPdfBlocks extends ToTextItemBlockTransformation {
constructor() {
super("Detect Blocks");
}
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
var createdBlocks = 0;
const newPages = parseResult.pages.map(page => {
var minX = minXFromTextItems(page.items);
const blocks = [];
var textItemsInBlock = [];
const completBlock = () => {
if (textItemsInBlock.length > 0) { //can happen on empty page
blocks.push(new TextItemBlock({
textItems: textItemsInBlock
}));
textItemsInBlock = [];
}
};
var lastItem;
page.items.forEach(item => {
if (lastItem) {
if (shouldSplit(lastItem, item, minX, mostUsedDistance)) {
completBlock();
}
}
textItemsInBlock.push(item);
lastItem = item;
});
completBlock();
createdBlocks += blocks.length;
return new Page({
...page,
items: blocks
});
});
return new ParseResult({
...parseResult,
pages: newPages,
messages: ['Splitted into ' + createdBlocks + ' blocks']
});
}
}
/**
 * Decides whether `item` starts a new block after `lastItem`.
 *
 * @param lastItem the preceding item (its `x` and `y` are read)
 * @param item the current item (its `x` and `y` are read)
 * @param minX smallest x of the page's items
 * @param mostUsedDistance the most common line distance of the document
 * @returns true if the two items belong to different blocks
 */
function shouldSplit(lastItem, item, minX, mostUsedDistance) {
    const verticalGap = lastItem.y - item.y;

    // the gap is negative - and not only a bit
    const jumpedBack = verticalGap < 0 - mostUsedDistance / 2;

    // indented elements like lists often have greater spacing
    const indentedSameColumn = lastItem.x == item.x && item.x > minX;
    const maxGap = indentedSameColumn
        ? mostUsedDistance + mostUsedDistance / 2
        : mostUsedDistance + 1;

    return jumpedBack || verticalGap > maxGap;
}

View File

@ -3,7 +3,8 @@ import ParseResult from '../ParseResult.jsx';
import TextItem from '../TextItem.jsx';
import HeadlineFinder from '../HeadlineFinder.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
import ElementType from '../ElementType.jsx';
import { headlineByLevel } from '../ElementType.jsx';
import { isDigit } from '../../functions.jsx'
//Detect table of contents pages
@ -84,7 +85,7 @@ export default class DetectTOC extends ToTextItemTransformation {
if (line === headlineItem) {
newBlocks.push(new TextItem({
...line,
type: HEADLINE2,
type: ElementType.H2,
annotation: ADDED_ANNOTATION
}));
}
@ -120,7 +121,7 @@ export default class DetectTOC extends ToTextItemTransformation {
tocLinks.forEach(tocLink => {
lastTocPage.items.push(new TextItem({
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
type: TOC_BLOCK,
type: ElementType.TOC,
annotation: ADDED_ANNOTATION
}));
});

View File

@ -0,0 +1,85 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import { ADDED_ANNOTATION } from '../Annotation.jsx';
import { minXFromTextItems } from '../../textItemFunctions.jsx';
// Gathers lines to blocks
export default class GatherBlocks extends ToTextItemBlockTransformation {
constructor() {
super("Gather Blocks");
}
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
var createdBlocks = 0;
var textItems = 0;
parseResult.pages.map(page => {
textItems += page.items.length;
const blocks = [];
var stashedBlock = new TextItemBlock({});
const flushStashedItems = () => {
if (stashedBlock.textItems.length > 1) {
stashedBlock.annotation = ADDED_ANNOTATION;
}
blocks.push(stashedBlock);
stashedBlock = new TextItemBlock({});
createdBlocks++;
};
var minX = minXFromTextItems(page.items);
page.items.forEach(item => {
if (stashedBlock.textItems.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) {
flushStashedItems();
}
stashedBlock.addTextItem(item);
});
if (stashedBlock.textItems.length > 0) {
flushStashedItems();
}
page.items = blocks;
});
return new ParseResult({
...parseResult,
messages: ['Gathered ' + createdBlocks + ' blocks out of ' + textItems + ' text items']
});
}
}
function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) {
return false;
}
if (item.type !== stashedBlock.type) {
return true;
}
if (item.type) {
return !item.type.mergeToBlock;
} else {
console.debug(item);
const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
return shouldSplit(lastItem, item, minX, mostUsedDistance);
}
}
/**
 * Position-based check whether `item` belongs to a new block.
 *
 * @param lastItem the last item of the current block (its `x` and `y` are read)
 * @param item the item about to be added (its `x` and `y` are read)
 * @param minX smallest x of the page's items
 * @param mostUsedDistance the most common line distance of the document
 * @returns true if a block boundary lies between the two items
 */
function shouldSplit(lastItem, item, minX, mostUsedDistance) {
    const drop = lastItem.y - item.y;

    // the drop is negative - and not only a bit
    if (drop < 0 - mostUsedDistance / 2) {
        return true;
    }

    // indented elements like lists often have greater spacing
    const alignedAndIndented = lastItem.x == item.x && item.x > minX;
    const tolerance = mostUsedDistance + (alignedAndIndented ? mostUsedDistance / 2 : 1);

    return drop > tolerance;
}