mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-28 15:44:52 +01:00
[WIP] change gather blocks transformation to new system
This commit is contained in:
parent
bd4c207ae3
commit
68e3fd7a9f
@ -8,7 +8,7 @@ export default class TextItemBlockPageView extends PageView {
|
||||
createItemViews(items, showWhitespaces) {
|
||||
const blockTables = items.map((block, i) => {
|
||||
var textItems = block.textItems;
|
||||
const blockType = block.type ? ' - ' + block.type : null;
|
||||
const blockType = block.type ? ' - ' + block.type.name : null;
|
||||
const blockAnnotation = block.annotation ? <span>{ ' - ' + block.annotation.category }</span>
|
||||
: null;
|
||||
const borderStyle = block.annotation ? {
|
||||
|
@ -50,7 +50,7 @@ export default class TextItemTable extends React.Component {
|
||||
{ textItem.annotation ? textItem.annotation.category : '' }
|
||||
</div>
|
||||
<div style={ { textAlign: 'center', color: 'brown' } }>
|
||||
{ textItem.type ? textItem.type : '' }
|
||||
{ textItem.type ? textItem.type.name : '' }
|
||||
</div>
|
||||
<div style={ { textAlign: 'center', color: 'orange' } }>
|
||||
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
|
||||
|
@ -6,7 +6,7 @@ import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements
|
||||
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
||||
import DetectTOC from './transformations/DetectTOC.jsx'
|
||||
|
||||
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
|
||||
import GatherBlocks from './transformations/GatherBlocks.jsx'
|
||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||
import DetectLists from './transformations/DetectLists.jsx'
|
||||
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
||||
@ -36,7 +36,7 @@ export default class AppState {
|
||||
new VerticalToHorizontal(),
|
||||
new DetectTOC(),
|
||||
|
||||
new DetectPdfBlocks(),
|
||||
new GatherBlocks(),
|
||||
new DetectFootnotes(),
|
||||
new DetectLists(),
|
||||
new DetectCodeBlocks(),
|
||||
|
51
src/javascript/models/ElementType.jsx
Normal file
51
src/javascript/models/ElementType.jsx
Normal file
@ -0,0 +1,51 @@
|
||||
import { Enum } from 'enumify';
|
||||
|
||||
/**
 * Enumeration of the element types a text item or block can be tagged with.
 * The class body is intentionally empty: the instances and their per-type
 * flags are registered through the `initEnum` call below.
 */
export default class ElementType extends Enum {
}

ElementType.initEnum({
    // Headline levels; they carry no additional flags.
    H1: {},
    H2: {},
    H3: {},
    H4: {},
    H5: {},
    H6: {},
    // mergeToBlock: when set, an item of this type following a block of the
    // same type does not force a block flush (read by shouldFlushBlock in GatherBlocks).
    TOC: {
        mergeToBlock: true
    },
    // mergeFollowingNonTypedItems: when set on the current block's type, items
    // without a type never force a block flush (read by shouldFlushBlock in GatherBlocks).
    FOOTNOTES: {
        mergeToBlock: true,
        mergeFollowingNonTypedItems: true
    }
});
|
||||
|
||||
//export default ElementType
|
||||
|
||||
/**
 * Maps a headline level to the matching headline element type.
 *
 * @param level the headline level, 1 to 6
 * @returns ElementType.H1 for level 1 up to ElementType.H6 for level 6
 * @throws {Error} if the level is not one of 1-6
 */
export function headlineByLevel(level) {
    // Built on each call so the enum instances are read after ElementType.initEnum has run.
    const headlines = [
        ElementType.H1,
        ElementType.H2,
        ElementType.H3,
        ElementType.H4,
        ElementType.H5,
        ElementType.H6
    ];
    const headline = headlines[level - 1];
    if (!headline) {
        // Throw a real Error (not a bare string) so the failure carries a stack trace.
        throw new Error("Unsupported headline level: " + level + " (supported are 1-6)");
    }
    return headline;
}
|
@ -1,11 +1,36 @@
|
||||
import PageItem from './PageItem.jsx'
|
||||
import TextItem from './TextItem.jsx'
|
||||
|
||||
// A block of TextItem[] within a Page
|
||||
export default class TextItemBlock extends PageItem {
|
||||
|
||||
constructor(options) {
|
||||
super(options);
|
||||
this.textItems = options.textItems;
|
||||
this.textItems = [];
|
||||
if (options.textItems) {
|
||||
options.textItems.forEach(item => this.addTextItem(item));
|
||||
}
|
||||
}
|
||||
|
||||
addTextItem(textItem:TextItem) {
|
||||
if (this.type && textItem.type && this.type !== textItem.type) {
|
||||
throw `Adding text item of type ${textItem.type} to block of type ${this.type}`
|
||||
}
|
||||
if (!this.type) {
|
||||
this.type = textItem.type;
|
||||
}
|
||||
if (textItem.parsedElements) {
|
||||
if (this.parsedElements) {
|
||||
this.parsedElements.add(textItem.parsedElements);
|
||||
} else {
|
||||
this.parsedElements = textItem.parsedElements;
|
||||
}
|
||||
}
|
||||
const copiedTextItem = new TextItem({
|
||||
...textItem
|
||||
});
|
||||
copiedTextItem.type = null;
|
||||
this.textItems.push(copiedTextItem);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -4,8 +4,10 @@ import ToTextItemTransformation from './ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItemLineGrouper from '../TextItemLineGrouper.jsx';
|
||||
import TextItemLineCompactor from '../TextItemLineCompactor.jsx';
|
||||
import ElementType from '../ElementType.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
|
||||
// gathers text items on the same y line to one text item
|
||||
export default class CompactLines extends ToTextItemTransformation {
|
||||
|
||||
@ -44,6 +46,7 @@ export default class CompactLines extends ToTextItemTransformation {
|
||||
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
|
||||
}
|
||||
if (combinedItem.parsedElements.footnotes.length > 0) {
|
||||
combinedItem.type = ElementType.FOOTNOTES;
|
||||
const footnotes = combinedItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
||||
foundFootnotes.push.apply(foundFootnotes, footnotes);
|
||||
}
|
||||
|
@ -1,73 +0,0 @@
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import Page from '../Page.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import { minXFromTextItems } from '../../textItemFunctions.jsx';
|
||||
|
||||
export default class DetectPdfBlocks extends ToTextItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Blocks");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var createdBlocks = 0;
|
||||
const newPages = parseResult.pages.map(page => {
|
||||
var minX = minXFromTextItems(page.items);
|
||||
const blocks = [];
|
||||
var textItemsInBlock = [];
|
||||
const completBlock = () => {
|
||||
if (textItemsInBlock.length > 0) { //can happen on empty page
|
||||
blocks.push(new TextItemBlock({
|
||||
textItems: textItemsInBlock
|
||||
}));
|
||||
textItemsInBlock = [];
|
||||
}
|
||||
};
|
||||
var lastItem;
|
||||
page.items.forEach(item => {
|
||||
|
||||
if (lastItem) {
|
||||
if (shouldSplit(lastItem, item, minX, mostUsedDistance)) {
|
||||
completBlock();
|
||||
}
|
||||
}
|
||||
textItemsInBlock.push(item);
|
||||
lastItem = item;
|
||||
});
|
||||
completBlock();
|
||||
|
||||
createdBlocks += blocks.length;
|
||||
return new Page({
|
||||
...page,
|
||||
items: blocks
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
pages: newPages,
|
||||
messages: ['Splitted into ' + createdBlocks + ' blocks']
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
 * Decides whether `item` should start a new block rather than continue the
 * block that `lastItem` belongs to.
 *
 * @param lastItem the previously processed item; its `x` and `y` are read
 * @param item the current item; its `x` and `y` are read
 * @param minX the value the caller obtained from minXFromTextItems for the page
 * @param mostUsedDistance taken from parseResult.globals by the caller
 *        (presumably the most common vertical distance between lines - confirm)
 * @returns true if a new block should be started before `item`
 */
function shouldSplit(lastItem, item, minX, mostUsedDistance) {
    const distance = lastItem.y - item.y;
    if (distance < 0 - mostUsedDistance / 2) {
        // distance is negative - and not only a bit
        return true;
    }
    // Indented elements like lists often have greater spacing, so allow more
    // distance when both items share the same x and sit right of minX.
    const sameIndentedColumn = lastItem.x === item.x && item.x > minX;
    const allowedDistance = sameIndentedColumn
        ? mostUsedDistance + mostUsedDistance / 2
        : mostUsedDistance + 1;
    return distance > allowedDistance;
}
|
@ -3,7 +3,8 @@ import ParseResult from '../ParseResult.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import HeadlineFinder from '../HeadlineFinder.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
|
||||
import ElementType from '../ElementType.jsx';
|
||||
import { headlineByLevel } from '../ElementType.jsx';
|
||||
import { isDigit } from '../../functions.jsx'
|
||||
|
||||
//Detect table of contents pages
|
||||
@ -84,7 +85,7 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
if (line === headlineItem) {
|
||||
newBlocks.push(new TextItem({
|
||||
...line,
|
||||
type: HEADLINE2,
|
||||
type: ElementType.H2,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
}
|
||||
@ -120,7 +121,7 @@ export default class DetectTOC extends ToTextItemTransformation {
|
||||
tocLinks.forEach(tocLink => {
|
||||
lastTocPage.items.push(new TextItem({
|
||||
text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
|
||||
type: TOC_BLOCK,
|
||||
type: ElementType.TOC,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
});
|
||||
|
85
src/javascript/models/transformations/GatherBlocks.jsx
Normal file
85
src/javascript/models/transformations/GatherBlocks.jsx
Normal file
@ -0,0 +1,85 @@
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import { ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { minXFromTextItems } from '../../textItemFunctions.jsx';
|
||||
|
||||
// Gathers lines to blocks
|
||||
export default class GatherBlocks extends ToTextItemBlockTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Gather Blocks");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var createdBlocks = 0;
|
||||
var textItems = 0;
|
||||
parseResult.pages.map(page => {
|
||||
textItems += page.items.length;
|
||||
const blocks = [];
|
||||
var stashedBlock = new TextItemBlock({});
|
||||
const flushStashedItems = () => {
|
||||
if (stashedBlock.textItems.length > 1) {
|
||||
stashedBlock.annotation = ADDED_ANNOTATION;
|
||||
}
|
||||
|
||||
blocks.push(stashedBlock);
|
||||
stashedBlock = new TextItemBlock({});
|
||||
createdBlocks++;
|
||||
};
|
||||
|
||||
var minX = minXFromTextItems(page.items);
|
||||
page.items.forEach(item => {
|
||||
if (stashedBlock.textItems.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) {
|
||||
flushStashedItems();
|
||||
}
|
||||
stashedBlock.addTextItem(item);
|
||||
});
|
||||
if (stashedBlock.textItems.length > 0) {
|
||||
flushStashedItems();
|
||||
}
|
||||
page.items = blocks;
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: ['Gathered ' + createdBlocks + ' blocks out of ' + textItems + ' text items']
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
 * Decides whether the stashed block has to be flushed before `item` is added to it.
 *
 * @param stashedBlock the block currently being filled; its `type` and `textItems` are read
 * @param item the item about to be added; its `type` is read
 * @param minX passed through to shouldSplit
 * @param mostUsedDistance passed through to shouldSplit
 * @returns true if the stashed block should be flushed first
 */
function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
    // A block type flagged with mergeFollowingNonTypedItems keeps swallowing untyped items.
    if (stashedBlock.type && stashedBlock.type.mergeFollowingNonTypedItems && !item.type) {
        return false;
    }
    if (item.type !== stashedBlock.type) {
        return true;
    }
    if (item.type) {
        // Same type on both sides: keep gathering only if the type is flagged with mergeToBlock.
        return !item.type.mergeToBlock;
    }
    // Neither side has a type: decide by comparing with the last gathered item.
    const lastItem = stashedBlock.textItems[stashedBlock.textItems.length - 1];
    return shouldSplit(lastItem, item, minX, mostUsedDistance);
}
|
||||
|
||||
|
||||
/**
 * Decides whether `item` should start a new block rather than continue the
 * block that `lastItem` belongs to.
 *
 * @param lastItem the previously gathered item; its `x` and `y` are read
 * @param item the current item; its `x` and `y` are read
 * @param minX the value the caller obtained from minXFromTextItems for the page
 * @param mostUsedDistance taken from parseResult.globals by the caller
 *        (presumably the most common vertical distance between lines - confirm)
 * @returns true if a new block should be started before `item`
 */
function shouldSplit(lastItem, item, minX, mostUsedDistance) {
    const distance = lastItem.y - item.y;
    if (distance < 0 - mostUsedDistance / 2) {
        // distance is negative - and not only a bit
        return true;
    }
    // Indented elements like lists often have greater spacing, so allow more
    // distance when both items share the same x and sit right of minX.
    const sameIndentedColumn = lastItem.x === item.x && item.x > minX;
    const allowedDistance = sameIndentedColumn
        ? mostUsedDistance + mostUsedDistance / 2
        : mostUsedDistance + 1;
    return distance > allowedDistance;
}
|
Loading…
Reference in New Issue
Block a user