WIP Proper footnote link detection

This commit is contained in:
Johannes Zillmann 2017-02-20 21:58:37 +01:00
parent a3b6a26437
commit 62fd0155ed
9 changed files with 148 additions and 42 deletions

View File

@ -59,7 +59,8 @@ export default class ResultView extends React.Component {
render() { render() {
const remarkable = new Remarkable({ const remarkable = new Remarkable({
breaks: true breaks: true,
html: true
}); });
const {preview, text} = this.state; const {preview, text} = this.state;

View File

@ -30,12 +30,20 @@ export default class PdfBlockPageView extends React.Component {
const colorStyle = block.annotation ? { const colorStyle = block.annotation ? {
color: block.annotation.color color: block.annotation.color
} : null; } : null;
var footnotesElement;
if (block.parsedElements) {
if (block.parsedElements.footnotes.length > 0) {
footnotesElement = 'Footnotes: ' + block.parsedElements.footnotes;
}
}
return <div key={ i }> return <div key={ i }>
<div style={ colorStyle }> <div style={ colorStyle }>
<b>Block { i + 1 }</b><i>{ blockType } { blockAnnotation }</i> <b>Block { i + 1 }</b><i>{ blockType } { blockAnnotation }</i>
</div> </div>
<div style={ borderStyle }> <div style={ borderStyle }>
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } /> <TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
{ footnotesElement }
</div> </div>
</div> </div>
}); });

View File

@ -1,4 +1,6 @@
import PdfBlock from './BlockPage.jsx'; import PdfBlock from './BlockPage.jsx';
import TextItemCombiner from './TextItemCombiner.jsx';
import TextItem from './TextItem.jsx';
export const HEADLINE1 = "Headline 1"; export const HEADLINE1 = "Headline 1";
export const PARAGRAPH = "Paragraph"; export const PARAGRAPH = "Paragraph";
@ -9,7 +11,7 @@ export const TOC_BLOCK = "TOC";
export function blockToText(block: PdfBlock) { export function blockToText(block: PdfBlock) {
switch (block.type) { switch (block.type) {
case CODE_BLOCK: case CODE_BLOCK:
return '```\n' + concatTextItems(block) + '```' return '```\n' + concatTextItems(block.textItems) + '```'
case TOC_BLOCK: case TOC_BLOCK:
//TODO 2nd level //TODO 2nd level
//TODO real links //TODO real links
@ -19,16 +21,21 @@ export function blockToText(block: PdfBlock) {
}); });
return text; return text;
case HEADLINE1: case HEADLINE1:
return '#' + concatTextItems(block); return '#' + concatTextItems(block.textItems);
default: default:
return concatTextItems(block); var textItems = block.textItems;
if (!block.type) {
//TODO mostUsedDistance
textItems = new TextItemCombiner({}).combine(textItems).textItems;
}
return concatTextItems(textItems);
} }
} }
function concatTextItems(block: PdfBlock) { function concatTextItems(textItems: TextItem[]) {
var text = ''; var text = '';
block.textItems.forEach(item => { textItems.forEach(item => {
text += item.text + '\n'; text += item.text + '\n';
}); });
return text; return text;

View File

@ -5,6 +5,7 @@ export default class PdfBlock {
this.textItems = options.textItems; this.textItems = options.textItems;
this.type = options.type; this.type = options.type;
this.annotation = options.annotation; this.annotation = options.annotation;
this.parsedElements = options.parsedElements;
} }
} }

View File

@ -1,18 +1,25 @@
import TextItem from './TextItem.jsx'; import TextItem from './TextItem.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from './Annotation.jsx'; import { isNumber } from '../functions.jsx'
//Combines text items which are on the same Y at the same time doing inline transformations like 'whitespace removal', bold/emphasis annotation, link-detection, etc.. //Combines text items which are on the same Y at the same time doing inline transformations like
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
export default class TextItemCombiner { export default class TextItemCombiner {
constructor(options) { constructor(options) {
this.transformEmphasis = options.transformEmphasis || true; this.transformEmphasis = options.transformEmphasis || true;
this.maxYDerivation = options.transformEmphasis || 3; this.mostUsedDistance = options.mostUsedDistance || 12;
} }
// returns a TextItem array new items // returns a CombineResult
combine(textItems: TextItem[]) { combine(textItems: TextItem[]) {
if (textItems.length == 0) {
return new CombineResult({
textItems: resultItems,
parsedElements: {}
});
}
const resultItems = []; const resultItems = [];
const groupedItems = this.groupByFollowingY(textItems); const [groupedItems, parsedElements] = this.groupByFollowingY(textItems);
groupedItems.forEach(itemGroup => { groupedItems.forEach(itemGroup => {
if (itemGroup.length == 1) { if (itemGroup.length == 1) {
resultItems.push(itemGroup[0]); resultItems.push(itemGroup[0]);
@ -22,8 +29,6 @@ export default class TextItemCombiner {
var widthSum = 0; var widthSum = 0;
var lastItem; var lastItem;
itemGroup.forEach(item => { itemGroup.forEach(item => {
// item.annotation = REMOVED_ANNOTATION;
// resultItems.push(item);
if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) { if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
const xDistance = item.x - lastItem.x - lastItem.width; const xDistance = item.x - lastItem.x - lastItem.width;
if (xDistance >= 5) { if (xDistance >= 5) {
@ -34,12 +39,11 @@ export default class TextItemCombiner {
widthSum += item.width; widthSum += item.width;
lastItem = item; lastItem = item;
}); });
//TODO set other elements
resultItems.push(new TextItem({ resultItems.push(new TextItem({
...itemGroup[0], ...itemGroup[0],
text: text, text: text,
height: maxHeight, height: maxHeight,
width: widthSum, width: widthSum
})); }));
} }
}); });
@ -47,23 +51,95 @@ export default class TextItemCombiner {
//TODO whitespace removal //TODO whitespace removal
//TODO bold/emphasis //TODO bold/emphasis
return resultItems; return new CombineResult({
textItems: resultItems,
parsedElements: parsedElements
});
} }
groupByFollowingY(textItems) { groupByFollowingY(textItems) {
const yArrays = []; const yArrays = [];
const footnotes = [];
var itemsWithSameY = []; var itemsWithSameY = [];
var lastItem; var lastItem;
const wrapUpLine = () => {
// we can't trust order of occurrence, esp. footnotes like to come last
itemsWithSameY.sort((a, b) => {
return a.x - b.x;
});
const finalArray = [];
const basicY = itemsWithSameY[0].y;
var savedFootnoteItems = [];
const commitSavedFootnotes = () => {
if (savedFootnoteItems.length > 0) {
const footnoteNumber = savedFootnoteItems.map(footnoteItem => footnoteItem.text).join('');
finalArray.push(new TextItem({
...savedFootnoteItems[0],
//TODO make formatting configurable
// text: `<sup>[${footnoteNumber}](#${footnoteNumber})</sup>`
text: `*${footnoteNumber}`
}));
savedFootnoteItems = [];
footnotes.push(parseInt(footnoteNumber));
}
};
itemsWithSameY.forEach(item => {
const isFootnote = item.y > basicY && isNumber(item.text);
if (isFootnote) {
savedFootnoteItems.push(item);
} else {
if (savedFootnoteItems.length > 0) {
commitSavedFootnotes();
}
finalArray.push(item);
}
});
commitSavedFootnotes();
yArrays.push(finalArray);
itemsWithSameY = [];
};
textItems.forEach(item => { textItems.forEach(item => {
if (itemsWithSameY.length == 0 || Math.abs(lastItem.y - item.y) <= this.maxYDerivation) { if (lastItem) {
itemsWithSameY.push(item); if (Math.abs(lastItem.y - item.y) > this.mostUsedDistance / 2) {
} else { wrapUpLine();
yArrays.push(itemsWithSameY); }
itemsWithSameY = [item];
} }
itemsWithSameY.push(item);
lastItem = item; lastItem = item;
}) // }
yArrays.push(itemsWithSameY); });
return yArrays; wrapUpLine();
return [yArrays, new ParsedElements({
footnotes: footnotes
})];
} }
} }
// Value object handed back by TextItemCombiner#combine().
// It copies three fields from the options object as-is: `textItems`,
// `footnotes` and `parsedElements`. Fields missing from the options
// end up as `undefined` on the instance.
export class CombineResult {

    constructor(options) {
        const { textItems, footnotes, parsedElements } = options;
        // assignment order is kept so the own-property order stays the same
        this.textItems = textItems;
        this.footnotes = footnotes;
        this.parsedElements = parsedElements;
    }

}
// Elements recognised while text items are combined. Currently this is only
// the list of footnote numbers.
export class ParsedElements {

    // options.footnotes: array of footnote numbers to start with.
    // A missing options object or a missing `footnotes` entry gives an empty
    // list, so readers can always rely on `footnotes.length` / `concat`.
    constructor(options) {
        const initial = options ? options.footnotes : undefined;
        this.footnotes = initial == null ? [] : initial;
    }

    // Appends the footnotes of another ParsedElements-like object to this one.
    // Arguments without footnotes (undefined, or a plain `{}`) are ignored.
    // Passing `undefined` to Array#concat would otherwise append a literal
    // `undefined` entry to the list.
    add(parsedElements:ParsedElements) {
        const additional = parsedElements ? parsedElements.footnotes : undefined;
        if (additional == null) {
            return;
        }
        // concat keeps the previous array untouched and stores a new one
        this.footnotes = this.footnotes.concat(additional);
    }

}

View File

@ -25,7 +25,9 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
const {mostUsedHeight, mostUsedDistance} = parseResult.globals; const {mostUsedHeight, mostUsedDistance} = parseResult.globals;
var foundBlocks = 0; var foundBlocks = 0;
const textCombiner = new TextItemCombiner({}); const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance
});
parseResult.content.forEach(page => { parseResult.content.forEach(page => {
var minX = minXFromBlocks(page.blocks); var minX = minXFromBlocks(page.blocks);
@ -55,13 +57,16 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
} }
block.annotation = REMOVED_ANNOTATION; block.annotation = REMOVED_ANNOTATION;
newBlocks.push(block); newBlocks.push(block);
const combineResult = textCombiner.combine(block.textItems);
if (mergeWithPreceedingCodeBlock) { if (mergeWithPreceedingCodeBlock) {
preceedingCodeBlock.textItems = preceedingCodeBlock.textItems.concat(textCombiner.combine(block.textItems)); preceedingCodeBlock.textItems = preceedingCodeBlock.textItems.concat(combineResult.textItems);
preceedingCodeBlock.parsedElements.add(combineResult.parsedElements);
} else { } else {
preceedingCodeBlock = new PdfBlock({ preceedingCodeBlock = new PdfBlock({
type: CODE_BLOCK, type: CODE_BLOCK,
annotation: ADDED_ANNOTATION, annotation: ADDED_ANNOTATION,
textItems: textCombiner.combine(block.textItems) textItems: combineResult.textItems,
parsedElements: combineResult.parsedElements
}); });
foundBlocks++; foundBlocks++;
} }

View File

@ -23,8 +23,11 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
} }
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
var foundBlocks = 0; var foundBlocks = 0;
const textCombiner = new TextItemCombiner({}); const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance
});
parseResult.content.forEach(page => { parseResult.content.forEach(page => {
var minX = minXFromBlocks(page.blocks); var minX = minXFromBlocks(page.blocks);
@ -33,8 +36,8 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
page.blocks.forEach(block => { page.blocks.forEach(block => {
newBlocks.push(block); newBlocks.push(block);
if (!block.type) { if (!block.type) {
const yGroupedItems = textCombiner.combine(block.textItems); const combineResult = textCombiner.combine(block.textItems);
if (hasMoreThan2LineItems(yGroupedItems)) { if (hasMoreThan2LineItems(combineResult.textItems)) {
block.annotation = REMOVED_ANNOTATION; block.annotation = REMOVED_ANNOTATION;
foundBlocks++; foundBlocks++;
@ -65,7 +68,7 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
}; };
yGroupedItems.forEach(lineItem => { combineResult.textItems.forEach(lineItem => {
if (isPlainListItem(lineItem.text)) { if (isPlainListItem(lineItem.text)) {
var text = lineItem.text; var text = lineItem.text;
text = text.substring(1, text.length).trim(); text = text.substring(1, text.length).trim();
@ -96,7 +99,8 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
newBlocks.push(new PdfBlock({ newBlocks.push(new PdfBlock({
textItems: listBlockItems, textItems: listBlockItems,
type: LIST_BLOCK, type: LIST_BLOCK,
annotation: ADDED_ANNOTATION annotation: ADDED_ANNOTATION,
parsedElements: combineResult.parsedElements
})); }));
} }
} }

View File

@ -26,10 +26,12 @@ export default class DetectPdfBlocks extends ToPdfBlockViewTransformation {
const blocks = []; const blocks = [];
var textItemsInBlock = []; var textItemsInBlock = [];
const completBlock = () => { const completBlock = () => {
blocks.push(new PdfBlock({ if (textItemsInBlock.length > 0) { //can happen on empty page
textItems: textItemsInBlock blocks.push(new PdfBlock({
})); textItems: textItemsInBlock
textItemsInBlock = []; }));
textItemsInBlock = [];
}
}; };
var lastItem; var lastItem;
page.textItems.forEach(item => { page.textItems.forEach(item => {

View File

@ -6,9 +6,7 @@ import PdfBlock from '../PdfBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx'; import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { TOC_BLOCK } from '../MarkdownElements.jsx'; import { TOC_BLOCK } from '../MarkdownElements.jsx';
import Annotation from '../Annotation.jsx'; import { isDigit } from '../../functions.jsx'
import { groupByFollowingY } from '../TextItemCombiner.jsx';
import { isNumber, isDigit } from '../../functions.jsx'
//Detect table of contents pages //Detect table of contents pages
export default class DetectTOC extends ToPdfBlockViewTransformation { export default class DetectTOC extends ToPdfBlockViewTransformation {
@ -26,10 +24,14 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
var foundTocPages = 0; var foundTocPages = 0;
var x = Math.min(12, parseResult.content.length); var x = Math.min(12, parseResult.content.length);
const textCombiner = new TextItemCombiner({}); const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance
});
parseResult.content.slice(0, x).forEach(page => { parseResult.content.slice(0, x).forEach(page => {
var linesCount = 0; var linesCount = 0;
var linesWithDigitsCount = 0; var linesWithDigitsCount = 0;
@ -37,7 +39,7 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
var headlineBlock; var headlineBlock;
page.blocks.forEach(block => { page.blocks.forEach(block => {
var blockHasLinesWithDigits = false; var blockHasLinesWithDigits = false;
const itemsGroupedByY = textCombiner.combine(block.textItems); const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
itemsGroupedByY.forEach(lineItem => { itemsGroupedByY.forEach(lineItem => {
linesCount++ linesCount++
var lineText = lineItem.text.replace(/\./g, '').trim(); var lineText = lineItem.text.replace(/\./g, '').trim();