mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 15:23:26 +01:00
[WIP] Simplify major headline detections
This commit is contained in:
parent
5caf8154db
commit
739d20d83b
@ -1,170 +0,0 @@
|
||||
import TextItem from './TextItem.jsx';
|
||||
import { isNumber } from '../functions.jsx'
|
||||
import { sortByX } from '../textItemFunctions.jsx'
|
||||
|
||||
//Combines text items which are on the same Y at the same time doing inline transformations like
|
||||
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
||||
export default class TextItemCombiner {
|
||||
|
||||
constructor(options) {
|
||||
this.transformEmphasis = options.transformEmphasis || true;
|
||||
this.mostUsedDistance = options.mostUsedDistance || 12;
|
||||
}
|
||||
|
||||
// returns a CombineResult
|
||||
combine(textItems: TextItem[]) {
|
||||
if (textItems.length == 0) {
|
||||
return new CombineResult({
|
||||
textItems: resultItems,
|
||||
parsedElements: {}
|
||||
});
|
||||
}
|
||||
const resultItems = [];
|
||||
const [groupedItems, parsedElements] = this.groupByFollowingY(textItems);
|
||||
groupedItems.forEach(itemGroup => {
|
||||
if (itemGroup.length == 1) {
|
||||
resultItems.push(itemGroup[0]);
|
||||
} else {
|
||||
var text = '';
|
||||
var maxHeight = 0;
|
||||
var widthSum = 0;
|
||||
var lastItem;
|
||||
itemGroup.forEach(item => {
|
||||
if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
|
||||
const xDistance = item.x - lastItem.x - lastItem.width;
|
||||
if (xDistance >= 5) {
|
||||
text += ' ';
|
||||
}
|
||||
}
|
||||
text += item.text;
|
||||
widthSum += item.width;
|
||||
lastItem = item;
|
||||
maxHeight = Math.max(maxHeight, item.height);
|
||||
});
|
||||
resultItems.push(new TextItem({
|
||||
...itemGroup[0],
|
||||
text: text,
|
||||
height: maxHeight,
|
||||
width: widthSum
|
||||
}));
|
||||
}
|
||||
});
|
||||
|
||||
//TODO whitespace removal
|
||||
//TODO bold/emphasis
|
||||
|
||||
return new CombineResult({
|
||||
textItems: resultItems,
|
||||
parsedElements: parsedElements
|
||||
});
|
||||
}
|
||||
|
||||
groupByFollowingY(textItems) {
|
||||
const footnoteLinks = [];
|
||||
const footnotes = [];
|
||||
|
||||
|
||||
var lines = this.groupItemsByLine(textItems);
|
||||
lines = lines.map(lineItems => {
|
||||
const basicY = lineItems[0].y;
|
||||
const newLineItems = [];
|
||||
var stashedNumberItems = [];
|
||||
|
||||
|
||||
const commitStashedNumbers = (nextItem) => {
|
||||
if (stashedNumberItems.length > 0) {
|
||||
const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
|
||||
if (stashedNumberItems[0].y > basicY) { // footnote link
|
||||
newLineItems.push(new TextItem({
|
||||
...stashedNumberItems[0],
|
||||
//TODO make fomatting configurable
|
||||
// text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
|
||||
text: `^${joinedNumber}`
|
||||
}));
|
||||
footnoteLinks.push(parseInt(joinedNumber));
|
||||
} else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
|
||||
//TODO womb comp [29] => ydiff == 0
|
||||
newLineItems.push(new TextItem({
|
||||
...stashedNumberItems[0],
|
||||
text: `(^${ joinedNumber}):`
|
||||
}));
|
||||
footnotes.push(joinedNumber);
|
||||
} else {
|
||||
stashedNumberItems.forEach(number => newLineItems.push(number));
|
||||
}
|
||||
|
||||
stashedNumberItems = [];
|
||||
}
|
||||
};
|
||||
|
||||
lineItems.forEach(item => {
|
||||
if (newLineItems.length == 0 && item.text.trim().length == 0) {
|
||||
// skip whitespace on the beginning of a line
|
||||
} else {
|
||||
const isANumber = isNumber(item.text);
|
||||
if (isANumber) {
|
||||
stashedNumberItems.push(item);
|
||||
} else {
|
||||
if (stashedNumberItems.length > 0) {
|
||||
commitStashedNumbers(item);
|
||||
}
|
||||
newLineItems.push(item);
|
||||
}
|
||||
}
|
||||
});
|
||||
commitStashedNumbers();
|
||||
return newLineItems;
|
||||
});
|
||||
|
||||
|
||||
return [lines, new ParsedElements({
|
||||
footnoteLinks: footnoteLinks,
|
||||
footnotes: footnotes
|
||||
})];
|
||||
}
|
||||
|
||||
groupItemsByLine(textItems:TextItem[]) {
|
||||
const lines = [];
|
||||
var currentLine = [];
|
||||
textItems.forEach(item => {
|
||||
if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) {
|
||||
lines.push(currentLine);
|
||||
currentLine = [];
|
||||
}
|
||||
currentLine.push(item);
|
||||
});
|
||||
lines.push(currentLine);
|
||||
|
||||
lines.forEach(lineItems => {
|
||||
// we can't trust order of occurence, esp. footnoteLinks like to come last
|
||||
sortByX(lineItems);
|
||||
});
|
||||
return lines;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//Result of the TextItemCombiner#combine()
|
||||
export class CombineResult {
|
||||
|
||||
constructor(options) {
|
||||
this.textItems = options.textItems;
|
||||
this.parsedElements = options.parsedElements;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
export class ParsedElements {
|
||||
|
||||
constructor(options) {
|
||||
this.footnoteLinks = options.footnoteLinks;
|
||||
this.footnotes = options.footnotes;
|
||||
}
|
||||
|
||||
add(parsedElements:ParsedElements) {
|
||||
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
|
||||
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,8 +1,7 @@
|
||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItemBlock from '../TextItemBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../Annotation.jsx';
|
||||
import ElementType from '../ElementType.jsx';
|
||||
import { headlineByLevel } from '../ElementType.jsx';
|
||||
|
||||
@ -17,22 +16,19 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
||||
var foundHeadlines = 0;
|
||||
const {mostUsedHeight, mostUsedDistance, maxHeight, tocPages} = parseResult.globals;
|
||||
|
||||
const textCombiner = new TextItemCombiner({
|
||||
mostUsedDistance: mostUsedDistance,
|
||||
});
|
||||
|
||||
//Set max headlines (all headers on the same page are max level 2)
|
||||
const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight, textCombiner);
|
||||
const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight);
|
||||
|
||||
|
||||
var headlineHeightFlowBeforeToc = [];
|
||||
var headlineHeightsOccurenceBeforeToc = {};
|
||||
var firstPageAfterToc = 0;
|
||||
if (tocPages && tocPages.length > 0) {
|
||||
[headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], textCombiner, mostUsedHeight, maxHeaderPages);
|
||||
[headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], mostUsedHeight, maxHeaderPages);
|
||||
firstPageAfterToc = tocPages[tocPages.length - 1] + 1;
|
||||
}
|
||||
|
||||
const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, textCombiner, mostUsedHeight, maxHeaderPages);
|
||||
const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, mostUsedHeight, maxHeaderPages);
|
||||
|
||||
|
||||
// TODO ==> do flow analysis (remove out of flow or snap, start with 2nd)
|
||||
@ -49,15 +45,15 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
||||
page.items.forEach(block => {
|
||||
newBlocks.push(block);
|
||||
if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
|
||||
const combineResult = textCombiner.combine(block.textItems);
|
||||
if (combineResult.textItems.length == 1) {
|
||||
const height = combineResult.textItems[0].height;
|
||||
if (height == maxHeight) {
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
currentHeadlineLevel = 1;
|
||||
headlineSizePerLevel[currentHeadlineLevel] = height
|
||||
addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
|
||||
}
|
||||
// const combineResult = textCombiner.combine(block.textItems);
|
||||
// if (combineResult.textItems.length == 1) {
|
||||
// const height = combineResult.textItems[0].height;
|
||||
// if (height == maxHeight) {
|
||||
// // block.annotation = REMOVED_ANNOTATION;
|
||||
// currentHeadlineLevel = 1;
|
||||
// headlineSizePerLevel[currentHeadlineLevel] = height
|
||||
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
|
||||
// }
|
||||
// else if (currentHeadlineLevel) {
|
||||
// const currentLevelSize = headlineSizePerLevel[currentHeadlineLevel];
|
||||
// if (height < currentLevelSize) {
|
||||
@ -79,7 +75,7 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
||||
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
|
||||
// }
|
||||
// }
|
||||
}
|
||||
// }
|
||||
}
|
||||
});
|
||||
page.items = newBlocks;
|
||||
@ -127,16 +123,7 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
||||
|
||||
}
|
||||
|
||||
function addNewBlock(newBlocks, combineResult, headlineLevel) {
|
||||
newBlocks.push(new TextItemBlock({
|
||||
textItems: combineResult.textItems,
|
||||
type: headlineLevel,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
parsedElements: combineResult.parsedElements
|
||||
}));
|
||||
}
|
||||
|
||||
function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) {
|
||||
function convertMaxHeaders(pages, maxHeight, mostUsedHeight) {
|
||||
// Find pages with max height
|
||||
const maxHeaderPagesSet = new Set();
|
||||
pages.forEach(page => {
|
||||
@ -150,27 +137,24 @@ function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) {
|
||||
// Now convert those pages to headlines
|
||||
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
|
||||
maxHeaderPagesSet.forEach(pageWithMaxHeader => {
|
||||
const newBlocks = [];
|
||||
pageWithMaxHeader.items.forEach(block => {
|
||||
newBlocks.push(block);
|
||||
const height = block.textItems[0].height;
|
||||
if (!block.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
const combineResult = textCombiner.combine(block.textItems);
|
||||
if (height == maxHeight) {
|
||||
addNewBlock(newBlocks, combineResult, ElementType.H1);
|
||||
} else if (combineResult.textItems.length == 1) {
|
||||
addNewBlock(newBlocks, combineResult, ElementType.H2);
|
||||
if (block.textItems.length == 1) {
|
||||
const height = block.textItems[0].height;
|
||||
if (!block.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
|
||||
block.annotation = DETECTED_ANNOTATION;
|
||||
if (height == maxHeight) {
|
||||
block.type = ElementType.H1;
|
||||
} else {
|
||||
block.type = ElementType.H2;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
pageWithMaxHeader.items = newBlocks;
|
||||
});
|
||||
|
||||
return Array.from(maxHeaderPagesSet).map(page => page.index + 1);
|
||||
}
|
||||
|
||||
function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeight, maxHeaderPages) {
|
||||
function calculateHeadlineHeigthFlow(pages, from, to, mostUsedHeight, maxHeaderPages) {
|
||||
const headlineHeightFlow = [];
|
||||
const headlineHeightsOccurences = {};
|
||||
var lastHeadlineHeight;
|
||||
@ -179,9 +163,8 @@ function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeig
|
||||
if (!maxHeaderPages.includes(page.index + 1)) {
|
||||
page.items.forEach(block => {
|
||||
if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
|
||||
const combineResult = textCombiner.combine(block.textItems);
|
||||
if (combineResult.textItems.length == 1) {
|
||||
const height = combineResult.textItems[0].height;
|
||||
if (block.textItems.length == 1) {
|
||||
const height = block.textItems[0].height;
|
||||
headlineHeightsOccurences[height] = headlineHeightsOccurences[height] ? headlineHeightsOccurences[height] + 1 : 1 ;
|
||||
if (!lastHeadlineHeight || height != lastHeadlineHeight) {
|
||||
headlineHeightFlow.push(height);
|
||||
|
Loading…
Reference in New Issue
Block a user