mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-07-12 03:55:06 +02:00
88 lines
3.0 KiB
JavaScript
88 lines
3.0 KiB
JavaScript
import ToLineItemBlockTransformation from '../ToLineItemBlockTransformation.jsx';
|
|
import ParseResult from '../../ParseResult.jsx';
|
|
import LineItemBlock from '../../LineItemBlock.jsx';
|
|
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
|
import { minXFromPageItems } from '../../../pageItemFunctions.jsx';
|
|
|
|
// Gathers the line items of each page into blocks
|
|
export default class GatherBlocks extends ToLineItemBlockTransformation {
|
|
|
|
constructor() {
|
|
super("Gather Blocks");
|
|
}
|
|
|
|
transform(parseResult:ParseResult) {
|
|
const {mostUsedDistance} = parseResult.globals;
|
|
var createdBlocks = 0;
|
|
var lineItemCount = 0;
|
|
parseResult.pages.map(page => {
|
|
lineItemCount += page.items.length;
|
|
const blocks = [];
|
|
var stashedBlock = new LineItemBlock({});
|
|
const flushStashedItems = () => {
|
|
if (stashedBlock.items.length > 1) {
|
|
stashedBlock.annotation = DETECTED_ANNOTATION;
|
|
}
|
|
|
|
blocks.push(stashedBlock);
|
|
stashedBlock = new LineItemBlock({});
|
|
createdBlocks++;
|
|
};
|
|
|
|
var minX = minXFromPageItems(page.items);
|
|
page.items.forEach(item => {
|
|
if (stashedBlock.items.length > 0 && shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance)) {
|
|
flushStashedItems();
|
|
}
|
|
stashedBlock.addItem(item);
|
|
});
|
|
if (stashedBlock.items.length > 0) {
|
|
flushStashedItems();
|
|
}
|
|
page.items = blocks;
|
|
});
|
|
|
|
return new ParseResult({
|
|
...parseResult,
|
|
messages: ['Gathered ' + createdBlocks + ' blocks out of ' + lineItemCount + ' line items']
|
|
});
|
|
}
|
|
|
|
}
|
|
|
|
/**
 * Decides whether the stashed block has to be closed before `item` is added.
 *
 * @param stashedBlock block under construction; expected to hold at least one item
 * @param item the next item that is about to be added
 * @param minX reference x value, passed through to bigDistance()
 * @param mostUsedDistance reference distance, passed through to bigDistance()
 * @returns true if `item` must start a new block, false if it joins the stashed one
 */
function shouldFlushBlock(stashedBlock, item, minX, mostUsedDistance) {
    const blockType = stashedBlock.type;

    // A block type may unconditionally swallow all following untyped items.
    if (blockType && blockType.mergeFollowingNonTypedItems && !item.type) {
        return false;
    }

    const previousItem = stashedBlock.items[stashedBlock.items.length - 1];
    const isFarAway = bigDistance(previousItem, item, minX, mostUsedDistance);

    // ... or swallow them only as long as they stay close to the previous item.
    if (blockType && blockType.mergeFollowingNonTypedItemsWithSmallDistance && !item.type && !isFarAway) {
        return false;
    }

    // Items of a different type never share a block.
    if (item.type !== blockType) {
        return true;
    }

    // Same type: typed items follow their type's mergeToBlock flag,
    // untyped items are split by distance only.
    return item.type ? !item.type.mergeToBlock : isFarAway;
}
|
|
|
|
|
|
/**
 * Tells whether the vertical gap between two consecutive items is too big
 * for them to be considered neighbours.
 *
 * @param lastItem the previous item (read: `x`, `y`)
 * @param item the current item (read: `x`, `y`)
 * @param minX x threshold; items with a greater x count as indented
 * @param mostUsedDistance reference distance the gap is compared against
 * @returns true if the gap is clearly negative or exceeds the allowed distance
 */
function bigDistance(lastItem, item, minX, mostUsedDistance) {
    const gap = lastItem.y - item.y;

    // The gap is negative - and not only a bit.
    const isClearlyNegative = gap < 0 - mostUsedDistance / 2;

    // Indented elements like lists often have greater spacing.
    const bothIndented = lastItem.x > minX && item.x > minX;
    const allowedDistance = bothIndented
        ? mostUsedDistance + mostUsedDistance / 2
        : mostUsedDistance + 1;

    return isClearlyNegative || gap > allowedDistance;
}