mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 23:33:31 +01:00
WIP Proper footnote link detection
This commit is contained in:
parent
a3b6a26437
commit
62fd0155ed
@ -59,7 +59,8 @@ export default class ResultView extends React.Component {
|
||||
|
||||
render() {
|
||||
const remarkable = new Remarkable({
|
||||
breaks: true
|
||||
breaks: true,
|
||||
html: true
|
||||
});
|
||||
const {preview, text} = this.state;
|
||||
|
||||
|
@ -30,12 +30,20 @@ export default class PdfBlockPageView extends React.Component {
|
||||
const colorStyle = block.annotation ? {
|
||||
color: block.annotation.color
|
||||
} : null;
|
||||
var footnotesElement;
|
||||
if (block.parsedElements) {
|
||||
if (block.parsedElements.footnotes.length > 0) {
|
||||
footnotesElement = 'Footnotes: ' + block.parsedElements.footnotes;
|
||||
}
|
||||
}
|
||||
|
||||
return <div key={ i }>
|
||||
<div style={ colorStyle }>
|
||||
<b>Block { i + 1 }</b><i>{ blockType } { blockAnnotation }</i>
|
||||
</div>
|
||||
<div style={ borderStyle }>
|
||||
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
|
||||
{ footnotesElement }
|
||||
</div>
|
||||
</div>
|
||||
});
|
||||
|
@ -1,4 +1,6 @@
|
||||
import PdfBlock from './BlockPage.jsx';
|
||||
import TextItemCombiner from './TextItemCombiner.jsx';
|
||||
import TextItem from './TextItem.jsx';
|
||||
|
||||
export const HEADLINE1 = "Headline 1";
|
||||
export const PARAGRAPH = "Paragraph";
|
||||
@ -9,7 +11,7 @@ export const TOC_BLOCK = "TOC";
|
||||
export function blockToText(block: PdfBlock) {
|
||||
switch (block.type) {
|
||||
case CODE_BLOCK:
|
||||
return '```\n' + concatTextItems(block) + '```'
|
||||
return '```\n' + concatTextItems(block.textItems) + '```'
|
||||
case TOC_BLOCK:
|
||||
//TODO 2nd level
|
||||
//TODO real links
|
||||
@ -19,16 +21,21 @@ export function blockToText(block: PdfBlock) {
|
||||
});
|
||||
return text;
|
||||
case HEADLINE1:
|
||||
return '#' + concatTextItems(block);
|
||||
return '#' + concatTextItems(block.textItems);
|
||||
default:
|
||||
return concatTextItems(block);
|
||||
var textItems = block.textItems;
|
||||
if (!block.type) {
|
||||
//TODO mostUsedDistance
|
||||
textItems = new TextItemCombiner({}).combine(textItems).textItems;
|
||||
}
|
||||
return concatTextItems(textItems);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function concatTextItems(block: PdfBlock) {
|
||||
function concatTextItems(textItems: TextItem[]) {
|
||||
var text = '';
|
||||
block.textItems.forEach(item => {
|
||||
textItems.forEach(item => {
|
||||
text += item.text + '\n';
|
||||
});
|
||||
return text;
|
||||
|
@ -5,6 +5,7 @@ export default class PdfBlock {
|
||||
this.textItems = options.textItems;
|
||||
this.type = options.type;
|
||||
this.annotation = options.annotation;
|
||||
this.parsedElements = options.parsedElements;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,18 +1,25 @@
|
||||
import TextItem from './TextItem.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from './Annotation.jsx';
|
||||
import { isNumber } from '../functions.jsx'
|
||||
|
||||
//Combines text items which are on the same Y at the same time doing inline transformations like 'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
||||
//Combines text items which are on the same Y at the same time doing inline transformations like
|
||||
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
||||
export default class TextItemCombiner {
|
||||
|
||||
// Creates a combiner from an options object. Recognised keys:
//   transformEmphasis - boolean switch, stored as-is for later use; enabled unless explicitly set to false
//   maxYDerivation    - y tolerance kept on the instance; defaults to 3
//   mostUsedDistance  - line distance of the document; half of it is the threshold
//                       groupByFollowingY uses to decide that a new line starts; defaults to 12
constructor(options = {}) {
    // `options.transformEmphasis || true` always evaluated to true, so the switch could never be
    // turned off. Only an explicit `false` disables it now; a missing value still means enabled.
    this.transformEmphasis = options.transformEmphasis !== false;
    // Previously read from options.transformEmphasis (copy-paste slip), which ignored a supplied
    // maxYDerivation and stored `true` here whenever transformEmphasis was passed.
    this.maxYDerivation = options.maxYDerivation || 3;
    this.mostUsedDistance = options.mostUsedDistance || 12;
}
|
||||
|
||||
// returns a TextItem array new items
|
||||
// returns a CombineResult
|
||||
combine(textItems: TextItem[]) {
|
||||
if (textItems.length == 0) {
|
||||
return new CombineResult({
|
||||
textItems: resultItems,
|
||||
parsedElements: {}
|
||||
});
|
||||
}
|
||||
const resultItems = [];
|
||||
const groupedItems = this.groupByFollowingY(textItems);
|
||||
const [groupedItems, parsedElements] = this.groupByFollowingY(textItems);
|
||||
groupedItems.forEach(itemGroup => {
|
||||
if (itemGroup.length == 1) {
|
||||
resultItems.push(itemGroup[0]);
|
||||
@ -22,8 +29,6 @@ export default class TextItemCombiner {
|
||||
var widthSum = 0;
|
||||
var lastItem;
|
||||
itemGroup.forEach(item => {
|
||||
// item.annotation = REMOVED_ANNOTATION;
|
||||
// resultItems.push(item);
|
||||
if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
|
||||
const xDistance = item.x - lastItem.x - lastItem.width;
|
||||
if (xDistance >= 5) {
|
||||
@ -34,12 +39,11 @@ export default class TextItemCombiner {
|
||||
widthSum += item.width;
|
||||
lastItem = item;
|
||||
});
|
||||
//TODO set other elements
|
||||
resultItems.push(new TextItem({
|
||||
...itemGroup[0],
|
||||
text: text,
|
||||
height: maxHeight,
|
||||
width: widthSum,
|
||||
width: widthSum
|
||||
}));
|
||||
}
|
||||
});
|
||||
@ -47,23 +51,95 @@ export default class TextItemCombiner {
|
||||
//TODO whitespace removal
|
||||
//TODO bold/emphasis
|
||||
|
||||
return resultItems;
|
||||
return new CombineResult({
|
||||
textItems: resultItems,
|
||||
parsedElements: parsedElements
|
||||
});
|
||||
}
|
||||
|
||||
groupByFollowingY(textItems) {
|
||||
const yArrays = [];
|
||||
const footnotes = [];
|
||||
var itemsWithSameY = [];
|
||||
var lastItem;
|
||||
|
||||
|
||||
const wrapUpLine = () => {
|
||||
// we can't trust the order of occurrence, esp. footnotes like to come last
|
||||
itemsWithSameY.sort((a, b) => {
|
||||
return a.x - b.x;
|
||||
});
|
||||
const finalArray = [];
|
||||
const basicY = itemsWithSameY[0].y;
|
||||
var savedFootnoteItems = [];
|
||||
const commitSavedFootnotes = () => {
|
||||
if (savedFootnoteItems.length > 0) {
|
||||
const footnoteNumber = savedFootnoteItems.map(footnoteItem => footnoteItem.text).join('');
|
||||
finalArray.push(new TextItem({
|
||||
...savedFootnoteItems[0],
|
||||
//TODO make formatting configurable
|
||||
// text: `<sup>[${footnoteNumber}](#${footnoteNumber})</sup>`
|
||||
text: `*${footnoteNumber}`
|
||||
}));
|
||||
savedFootnoteItems = [];
|
||||
footnotes.push(parseInt(footnoteNumber));
|
||||
}
|
||||
};
|
||||
|
||||
itemsWithSameY.forEach(item => {
|
||||
const isFootnote = item.y > basicY && isNumber(item.text);
|
||||
if (isFootnote) {
|
||||
savedFootnoteItems.push(item);
|
||||
} else {
|
||||
if (savedFootnoteItems.length > 0) {
|
||||
commitSavedFootnotes();
|
||||
}
|
||||
finalArray.push(item);
|
||||
}
|
||||
});
|
||||
commitSavedFootnotes();
|
||||
yArrays.push(finalArray);
|
||||
itemsWithSameY = [];
|
||||
};
|
||||
|
||||
textItems.forEach(item => {
|
||||
if (itemsWithSameY.length == 0 || Math.abs(lastItem.y - item.y) <= this.maxYDerivation) {
|
||||
itemsWithSameY.push(item);
|
||||
} else {
|
||||
yArrays.push(itemsWithSameY);
|
||||
itemsWithSameY = [item];
|
||||
if (lastItem) {
|
||||
if (Math.abs(lastItem.y - item.y) > this.mostUsedDistance / 2) {
|
||||
wrapUpLine();
|
||||
}
|
||||
}
|
||||
itemsWithSameY.push(item);
|
||||
lastItem = item;
|
||||
})
|
||||
yArrays.push(itemsWithSameY);
|
||||
return yArrays;
|
||||
// }
|
||||
});
|
||||
wrapUpLine();
|
||||
|
||||
return [yArrays, new ParsedElements({
|
||||
footnotes: footnotes
|
||||
})];
|
||||
}
|
||||
}
|
||||
|
||||
//Value object handed back by TextItemCombiner#combine().
//It copies three entries of the given options object onto itself without validation:
//  textItems      - the combined text items
//  footnotes      - taken verbatim from options.footnotes (undefined when the caller does not supply it)
//  parsedElements - the elements that were parsed while combining
export class CombineResult {

    constructor(options) {
        const { textItems, footnotes, parsedElements } = options;
        this.textItems = textItems;
        this.footnotes = footnotes;
        this.parsedElements = parsedElements;
    }

}
|
||||
|
||||
//Collects the elements that were recognised while combining text items.
//Currently this is only the list of footnote numbers.
export class ParsedElements {

    //options.footnotes: array of footnote numbers; a missing value is treated as "no footnotes"
    constructor(options) {
        // Fall back to an empty list so that add() and `.length` checks never hit undefined.
        this.footnotes = (options && options.footnotes) || [];
    }

    //Appends the footnotes of another ParsedElements (or of any object carrying a `footnotes` array).
    //Arguments without footnotes are ignored.
    add(parsedElements: ParsedElements) {
        const additionalFootnotes = parsedElements && parsedElements.footnotes;
        // Array#concat(undefined) would append a literal `undefined` entry, so skip
        // sources that carry no footnotes (e.g. a plain `{}`).
        if (additionalFootnotes && additionalFootnotes.length > 0) {
            this.footnotes = this.footnotes.concat(additionalFootnotes);
        }
    }

}
|
||||
|
||||
|
@ -25,7 +25,9 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
|
||||
const {mostUsedHeight, mostUsedDistance} = parseResult.globals;
|
||||
|
||||
var foundBlocks = 0;
|
||||
const textCombiner = new TextItemCombiner({});
|
||||
const textCombiner = new TextItemCombiner({
|
||||
mostUsedDistance: mostUsedDistance
|
||||
});
|
||||
|
||||
parseResult.content.forEach(page => {
|
||||
var minX = minXFromBlocks(page.blocks);
|
||||
@ -55,13 +57,16 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
|
||||
}
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
newBlocks.push(block);
|
||||
const combineResult = textCombiner.combine(block.textItems);
|
||||
if (mergeWithPreceedingCodeBlock) {
|
||||
preceedingCodeBlock.textItems = preceedingCodeBlock.textItems.concat(textCombiner.combine(block.textItems));
|
||||
preceedingCodeBlock.textItems = preceedingCodeBlock.textItems.concat(combineResult.textItems);
|
||||
preceedingCodeBlock.parsedElements.add(combineResult.parsedElements);
|
||||
} else {
|
||||
preceedingCodeBlock = new PdfBlock({
|
||||
type: CODE_BLOCK,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
textItems: textCombiner.combine(block.textItems)
|
||||
textItems: combineResult.textItems,
|
||||
parsedElements: combineResult.parsedElements
|
||||
});
|
||||
foundBlocks++;
|
||||
}
|
||||
|
@ -23,8 +23,11 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var foundBlocks = 0;
|
||||
const textCombiner = new TextItemCombiner({});
|
||||
const textCombiner = new TextItemCombiner({
|
||||
mostUsedDistance: mostUsedDistance
|
||||
});
|
||||
|
||||
parseResult.content.forEach(page => {
|
||||
var minX = minXFromBlocks(page.blocks);
|
||||
@ -33,8 +36,8 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
|
||||
page.blocks.forEach(block => {
|
||||
newBlocks.push(block);
|
||||
if (!block.type) {
|
||||
const yGroupedItems = textCombiner.combine(block.textItems);
|
||||
if (hasMoreThan2LineItems(yGroupedItems)) {
|
||||
const combineResult = textCombiner.combine(block.textItems);
|
||||
if (hasMoreThan2LineItems(combineResult.textItems)) {
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
foundBlocks++;
|
||||
|
||||
@ -65,7 +68,7 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
|
||||
|
||||
};
|
||||
|
||||
yGroupedItems.forEach(lineItem => {
|
||||
combineResult.textItems.forEach(lineItem => {
|
||||
if (isPlainListItem(lineItem.text)) {
|
||||
var text = lineItem.text;
|
||||
text = text.substring(1, text.length).trim();
|
||||
@ -96,7 +99,8 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
|
||||
newBlocks.push(new PdfBlock({
|
||||
textItems: listBlockItems,
|
||||
type: LIST_BLOCK,
|
||||
annotation: ADDED_ANNOTATION
|
||||
annotation: ADDED_ANNOTATION,
|
||||
parsedElements: combineResult.parsedElements
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
@ -26,10 +26,12 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
|
||||
const blocks = [];
|
||||
var textItemsInBlock = [];
|
||||
const completBlock = () => {
|
||||
blocks.push(new PdfBlock({
|
||||
textItems: textItemsInBlock
|
||||
}));
|
||||
textItemsInBlock = [];
|
||||
if (textItemsInBlock.length > 0) { //can happen on empty page
|
||||
blocks.push(new PdfBlock({
|
||||
textItems: textItemsInBlock
|
||||
}));
|
||||
textItemsInBlock = [];
|
||||
}
|
||||
};
|
||||
var lastItem;
|
||||
page.textItems.forEach(item => {
|
||||
|
@ -6,9 +6,7 @@ import PdfBlock from '../PdfBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { TOC_BLOCK } from '../MarkdownElements.jsx';
|
||||
import Annotation from '../Annotation.jsx';
|
||||
import { groupByFollowingY } from '../TextItemCombiner.jsx';
|
||||
import { isNumber, isDigit } from '../../functions.jsx'
|
||||
import { isDigit } from '../../functions.jsx'
|
||||
|
||||
//Detect table of contents pages
|
||||
export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
@ -26,10 +24,14 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var foundTocPages = 0;
|
||||
var x = Math.min(12, parseResult.content.length);
|
||||
const textCombiner = new TextItemCombiner({});
|
||||
const textCombiner = new TextItemCombiner({
|
||||
mostUsedDistance: mostUsedDistance
|
||||
});
|
||||
|
||||
|
||||
parseResult.content.slice(0, x).forEach(page => {
|
||||
var linesCount = 0;
|
||||
var linesWithDigitsCount = 0;
|
||||
@ -37,7 +39,7 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
|
||||
var headlineBlock;
|
||||
page.blocks.forEach(block => {
|
||||
var blockHasLinesWithDigits = false;
|
||||
const itemsGroupedByY = textCombiner.combine(block.textItems);
|
||||
const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
|
||||
itemsGroupedByY.forEach(lineItem => {
|
||||
linesCount++
|
||||
var lineText = lineItem.text.replace(/\./g, '').trim();
|
||||
|
Loading…
Reference in New Issue
Block a user