mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 23:33:31 +01:00
[WIP] Simplify major headline detections
This commit is contained in:
parent
5caf8154db
commit
739d20d83b
@ -1,170 +0,0 @@
|
|||||||
import TextItem from './TextItem.jsx';
|
|
||||||
import { isNumber } from '../functions.jsx'
|
|
||||||
import { sortByX } from '../textItemFunctions.jsx'
|
|
||||||
|
|
||||||
//Combines text items which are on the same Y, while at the same time doing inline transformations like
|
|
||||||
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
|
||||||
export default class TextItemCombiner {
|
|
||||||
|
|
||||||
constructor(options) {
|
|
||||||
this.transformEmphasis = options.transformEmphasis || true;
|
|
||||||
this.mostUsedDistance = options.mostUsedDistance || 12;
|
|
||||||
}
|
|
||||||
|
|
||||||
// returns a CombineResult
|
|
||||||
combine(textItems: TextItem[]) {
|
|
||||||
if (textItems.length == 0) {
|
|
||||||
return new CombineResult({
|
|
||||||
textItems: resultItems,
|
|
||||||
parsedElements: {}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
const resultItems = [];
|
|
||||||
const [groupedItems, parsedElements] = this.groupByFollowingY(textItems);
|
|
||||||
groupedItems.forEach(itemGroup => {
|
|
||||||
if (itemGroup.length == 1) {
|
|
||||||
resultItems.push(itemGroup[0]);
|
|
||||||
} else {
|
|
||||||
var text = '';
|
|
||||||
var maxHeight = 0;
|
|
||||||
var widthSum = 0;
|
|
||||||
var lastItem;
|
|
||||||
itemGroup.forEach(item => {
|
|
||||||
if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
|
|
||||||
const xDistance = item.x - lastItem.x - lastItem.width;
|
|
||||||
if (xDistance >= 5) {
|
|
||||||
text += ' ';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
text += item.text;
|
|
||||||
widthSum += item.width;
|
|
||||||
lastItem = item;
|
|
||||||
maxHeight = Math.max(maxHeight, item.height);
|
|
||||||
});
|
|
||||||
resultItems.push(new TextItem({
|
|
||||||
...itemGroup[0],
|
|
||||||
text: text,
|
|
||||||
height: maxHeight,
|
|
||||||
width: widthSum
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
//TODO whitespace removal
|
|
||||||
//TODO bold/emphasis
|
|
||||||
|
|
||||||
return new CombineResult({
|
|
||||||
textItems: resultItems,
|
|
||||||
parsedElements: parsedElements
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
groupByFollowingY(textItems) {
|
|
||||||
const footnoteLinks = [];
|
|
||||||
const footnotes = [];
|
|
||||||
|
|
||||||
|
|
||||||
var lines = this.groupItemsByLine(textItems);
|
|
||||||
lines = lines.map(lineItems => {
|
|
||||||
const basicY = lineItems[0].y;
|
|
||||||
const newLineItems = [];
|
|
||||||
var stashedNumberItems = [];
|
|
||||||
|
|
||||||
|
|
||||||
const commitStashedNumbers = (nextItem) => {
|
|
||||||
if (stashedNumberItems.length > 0) {
|
|
||||||
const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
|
|
||||||
if (stashedNumberItems[0].y > basicY) { // footnote link
|
|
||||||
newLineItems.push(new TextItem({
|
|
||||||
...stashedNumberItems[0],
|
|
||||||
//TODO make fomatting configurable
|
|
||||||
// text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
|
|
||||||
text: `^${joinedNumber}`
|
|
||||||
}));
|
|
||||||
footnoteLinks.push(parseInt(joinedNumber));
|
|
||||||
} else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
|
|
||||||
//TODO womb comp [29] => ydiff == 0
|
|
||||||
newLineItems.push(new TextItem({
|
|
||||||
...stashedNumberItems[0],
|
|
||||||
text: `(^${ joinedNumber}):`
|
|
||||||
}));
|
|
||||||
footnotes.push(joinedNumber);
|
|
||||||
} else {
|
|
||||||
stashedNumberItems.forEach(number => newLineItems.push(number));
|
|
||||||
}
|
|
||||||
|
|
||||||
stashedNumberItems = [];
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
lineItems.forEach(item => {
|
|
||||||
if (newLineItems.length == 0 && item.text.trim().length == 0) {
|
|
||||||
// skip whitespace on the beginning of a line
|
|
||||||
} else {
|
|
||||||
const isANumber = isNumber(item.text);
|
|
||||||
if (isANumber) {
|
|
||||||
stashedNumberItems.push(item);
|
|
||||||
} else {
|
|
||||||
if (stashedNumberItems.length > 0) {
|
|
||||||
commitStashedNumbers(item);
|
|
||||||
}
|
|
||||||
newLineItems.push(item);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
commitStashedNumbers();
|
|
||||||
return newLineItems;
|
|
||||||
});
|
|
||||||
|
|
||||||
|
|
||||||
return [lines, new ParsedElements({
|
|
||||||
footnoteLinks: footnoteLinks,
|
|
||||||
footnotes: footnotes
|
|
||||||
})];
|
|
||||||
}
|
|
||||||
|
|
||||||
groupItemsByLine(textItems:TextItem[]) {
|
|
||||||
const lines = [];
|
|
||||||
var currentLine = [];
|
|
||||||
textItems.forEach(item => {
|
|
||||||
if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) {
|
|
||||||
lines.push(currentLine);
|
|
||||||
currentLine = [];
|
|
||||||
}
|
|
||||||
currentLine.push(item);
|
|
||||||
});
|
|
||||||
lines.push(currentLine);
|
|
||||||
|
|
||||||
lines.forEach(lineItems => {
|
|
||||||
// we can't trust order of occurence, esp. footnoteLinks like to come last
|
|
||||||
sortByX(lineItems);
|
|
||||||
});
|
|
||||||
return lines;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
//Result of the TextItemCombiner#combine()
|
|
||||||
export class CombineResult {
|
|
||||||
|
|
||||||
constructor(options) {
|
|
||||||
this.textItems = options.textItems;
|
|
||||||
this.parsedElements = options.parsedElements;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
export class ParsedElements {
|
|
||||||
|
|
||||||
constructor(options) {
|
|
||||||
this.footnoteLinks = options.footnoteLinks;
|
|
||||||
this.footnotes = options.footnotes;
|
|
||||||
}
|
|
||||||
|
|
||||||
add(parsedElements:ParsedElements) {
|
|
||||||
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
|
|
||||||
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,8 +1,7 @@
|
|||||||
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
|
||||||
import ParseResult from '../ParseResult.jsx';
|
import ParseResult from '../ParseResult.jsx';
|
||||||
import TextItemBlock from '../TextItemBlock.jsx';
|
import TextItemBlock from '../TextItemBlock.jsx';
|
||||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
import { ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../Annotation.jsx';
|
||||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
|
||||||
import ElementType from '../ElementType.jsx';
|
import ElementType from '../ElementType.jsx';
|
||||||
import { headlineByLevel } from '../ElementType.jsx';
|
import { headlineByLevel } from '../ElementType.jsx';
|
||||||
|
|
||||||
@ -17,22 +16,19 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
|||||||
var foundHeadlines = 0;
|
var foundHeadlines = 0;
|
||||||
const {mostUsedHeight, mostUsedDistance, maxHeight, tocPages} = parseResult.globals;
|
const {mostUsedHeight, mostUsedDistance, maxHeight, tocPages} = parseResult.globals;
|
||||||
|
|
||||||
const textCombiner = new TextItemCombiner({
|
|
||||||
mostUsedDistance: mostUsedDistance,
|
|
||||||
});
|
|
||||||
|
|
||||||
//Set max headlines (all headers on the same page are max level 2)
|
//Set max headlines (all headers on the same page are max level 2)
|
||||||
const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight, textCombiner);
|
const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight);
|
||||||
|
|
||||||
|
|
||||||
var headlineHeightFlowBeforeToc = [];
|
var headlineHeightFlowBeforeToc = [];
|
||||||
var headlineHeightsOccurenceBeforeToc = {};
|
var headlineHeightsOccurenceBeforeToc = {};
|
||||||
var firstPageAfterToc = 0;
|
var firstPageAfterToc = 0;
|
||||||
if (tocPages && tocPages.length > 0) {
|
if (tocPages && tocPages.length > 0) {
|
||||||
[headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], textCombiner, mostUsedHeight, maxHeaderPages);
|
[headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], mostUsedHeight, maxHeaderPages);
|
||||||
firstPageAfterToc = tocPages[tocPages.length - 1] + 1;
|
firstPageAfterToc = tocPages[tocPages.length - 1] + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, textCombiner, mostUsedHeight, maxHeaderPages);
|
const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, mostUsedHeight, maxHeaderPages);
|
||||||
|
|
||||||
|
|
||||||
// TODO ==> do flow analysis (remove out of flow or snap, start with 2nd)
|
// TODO ==> do flow analysis (remove out of flow or snap, start with 2nd)
|
||||||
@ -49,15 +45,15 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
|||||||
page.items.forEach(block => {
|
page.items.forEach(block => {
|
||||||
newBlocks.push(block);
|
newBlocks.push(block);
|
||||||
if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
|
if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
|
||||||
const combineResult = textCombiner.combine(block.textItems);
|
// const combineResult = textCombiner.combine(block.textItems);
|
||||||
if (combineResult.textItems.length == 1) {
|
// if (combineResult.textItems.length == 1) {
|
||||||
const height = combineResult.textItems[0].height;
|
// const height = combineResult.textItems[0].height;
|
||||||
if (height == maxHeight) {
|
// if (height == maxHeight) {
|
||||||
block.annotation = REMOVED_ANNOTATION;
|
// // block.annotation = REMOVED_ANNOTATION;
|
||||||
currentHeadlineLevel = 1;
|
// currentHeadlineLevel = 1;
|
||||||
headlineSizePerLevel[currentHeadlineLevel] = height
|
// headlineSizePerLevel[currentHeadlineLevel] = height
|
||||||
addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
|
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
|
||||||
}
|
// }
|
||||||
// else if (currentHeadlineLevel) {
|
// else if (currentHeadlineLevel) {
|
||||||
// const currentLevelSize = headlineSizePerLevel[currentHeadlineLevel];
|
// const currentLevelSize = headlineSizePerLevel[currentHeadlineLevel];
|
||||||
// if (height < currentLevelSize) {
|
// if (height < currentLevelSize) {
|
||||||
@ -79,7 +75,7 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
|||||||
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
|
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
page.items = newBlocks;
|
page.items = newBlocks;
|
||||||
@ -127,16 +123,7 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function addNewBlock(newBlocks, combineResult, headlineLevel) {
|
function convertMaxHeaders(pages, maxHeight, mostUsedHeight) {
|
||||||
newBlocks.push(new TextItemBlock({
|
|
||||||
textItems: combineResult.textItems,
|
|
||||||
type: headlineLevel,
|
|
||||||
annotation: ADDED_ANNOTATION,
|
|
||||||
parsedElements: combineResult.parsedElements
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
|
|
||||||
function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) {
|
|
||||||
// Find pages with max height
|
// Find pages with max height
|
||||||
const maxHeaderPagesSet = new Set();
|
const maxHeaderPagesSet = new Set();
|
||||||
pages.forEach(page => {
|
pages.forEach(page => {
|
||||||
@ -150,27 +137,24 @@ function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) {
|
|||||||
// Now convert those pages to headlines
|
// Now convert those pages to headlines
|
||||||
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
|
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
|
||||||
maxHeaderPagesSet.forEach(pageWithMaxHeader => {
|
maxHeaderPagesSet.forEach(pageWithMaxHeader => {
|
||||||
const newBlocks = [];
|
|
||||||
pageWithMaxHeader.items.forEach(block => {
|
pageWithMaxHeader.items.forEach(block => {
|
||||||
newBlocks.push(block);
|
if (block.textItems.length == 1) {
|
||||||
const height = block.textItems[0].height;
|
const height = block.textItems[0].height;
|
||||||
if (!block.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
|
if (!block.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
|
||||||
block.annotation = REMOVED_ANNOTATION;
|
block.annotation = DETECTED_ANNOTATION;
|
||||||
const combineResult = textCombiner.combine(block.textItems);
|
|
||||||
if (height == maxHeight) {
|
if (height == maxHeight) {
|
||||||
addNewBlock(newBlocks, combineResult, ElementType.H1);
|
block.type = ElementType.H1;
|
||||||
} else if (combineResult.textItems.length == 1) {
|
} else {
|
||||||
addNewBlock(newBlocks, combineResult, ElementType.H2);
|
block.type = ElementType.H2;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
pageWithMaxHeader.items = newBlocks;
|
|
||||||
});
|
});
|
||||||
|
|
||||||
return Array.from(maxHeaderPagesSet).map(page => page.index + 1);
|
return Array.from(maxHeaderPagesSet).map(page => page.index + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeight, maxHeaderPages) {
|
function calculateHeadlineHeigthFlow(pages, from, to, mostUsedHeight, maxHeaderPages) {
|
||||||
const headlineHeightFlow = [];
|
const headlineHeightFlow = [];
|
||||||
const headlineHeightsOccurences = {};
|
const headlineHeightsOccurences = {};
|
||||||
var lastHeadlineHeight;
|
var lastHeadlineHeight;
|
||||||
@ -179,9 +163,8 @@ function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeig
|
|||||||
if (!maxHeaderPages.includes(page.index + 1)) {
|
if (!maxHeaderPages.includes(page.index + 1)) {
|
||||||
page.items.forEach(block => {
|
page.items.forEach(block => {
|
||||||
if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
|
if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
|
||||||
const combineResult = textCombiner.combine(block.textItems);
|
if (block.textItems.length == 1) {
|
||||||
if (combineResult.textItems.length == 1) {
|
const height = block.textItems[0].height;
|
||||||
const height = combineResult.textItems[0].height;
|
|
||||||
headlineHeightsOccurences[height] = headlineHeightsOccurences[height] ? headlineHeightsOccurences[height] + 1 : 1 ;
|
headlineHeightsOccurences[height] = headlineHeightsOccurences[height] ? headlineHeightsOccurences[height] + 1 : 1 ;
|
||||||
if (!lastHeadlineHeight || height != lastHeadlineHeight) {
|
if (!lastHeadlineHeight || height != lastHeadlineHeight) {
|
||||||
headlineHeightFlow.push(height);
|
headlineHeightFlow.push(height);
|
||||||
|
Loading…
Reference in New Issue
Block a user