WIP footer detection

This commit is contained in:
Johannes Zillmann 2017-02-22 23:18:49 +01:00
parent b7db48af4b
commit 5827379d1b
7 changed files with 154 additions and 110 deletions

View File

@ -30,10 +30,18 @@ export default class PdfBlockPageView extends React.Component {
const colorStyle = block.annotation ? { const colorStyle = block.annotation ? {
color: block.annotation.color color: block.annotation.color
} : null; } : null;
var footnotesElement; var footnoteLinks;
var footnotes;
if (block.parsedElements) { if (block.parsedElements) {
if (block.parsedElements.footnoteLinks.length > 0) {
footnoteLinks = <div>
{ 'Footnote-Links: ' + block.parsedElements.footnoteLinks }
</div>;
}
if (block.parsedElements.footnotes.length > 0) { if (block.parsedElements.footnotes.length > 0) {
footnotesElement = 'Footnotes: ' + block.parsedElements.footnotes; footnotes = <div>
{ 'Footnotes: ' + block.parsedElements.footnotes }
</div>;
} }
} }
@ -43,7 +51,8 @@ export default class PdfBlockPageView extends React.Component {
</div> </div>
<div style={ borderStyle }> <div style={ borderStyle }>
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } /> <TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
{ footnotesElement } { footnoteLinks }
{ footnotes }
</div> </div>
</div> </div>
}); });

View File

@ -4,17 +4,17 @@ import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx' import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx'; import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx' import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import DetectTOC from './transformations/DetectTOC.jsx' import DetectTOC from './transformations/DetectTOC.jsx'
import DetectLists from './transformations/DetectLists.jsx' import DetectLists from './transformations/DetectLists.jsx'
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx' import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
import DetectFormats from './transformations/DetectFormats.jsx' // import DetectFormats from './transformations/DetectFormats.jsx'
import CombineSameY from './transformations/CombineSameY.jsx'; // import CombineSameY from './transformations/CombineSameY.jsx';
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx' // import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx' // import DetectLinks from './transformations/DetectLinks.jsx'
import DetectLinks from './transformations/DetectLinks.jsx' // import HeadlineDetector from './transformations/HeadlineDetector.jsx'
import HeadlineDetector from './transformations/HeadlineDetector.jsx' // import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx' // import ToBlockSystem from './transformations/ToBlockSystem.jsx';
import ToBlockSystem from './transformations/ToBlockSystem.jsx';
import ToTextBlocks from './transformations/ToTextBlocks.jsx'; import ToTextBlocks from './transformations/ToTextBlocks.jsx';
import ToMarkdown from './transformations/ToMarkdown.jsx' import ToMarkdown from './transformations/ToMarkdown.jsx'
@ -31,6 +31,7 @@ export default class AppState {
new RemoveRepetitiveElements(), new RemoveRepetitiveElements(),
new VerticalToHorizontal(), new VerticalToHorizontal(),
new DetectPdfBlocks(), new DetectPdfBlocks(),
new DetectFootnotes(),
new DetectTOC(), new DetectTOC(),
new DetectLists(), new DetectLists(),
new DetectCodeBlocks(), new DetectCodeBlocks(),
@ -38,7 +39,6 @@ export default class AppState {
// new DetectFormats(), // new DetectFormats(),
// new CombineSameY(), // new CombineSameY(),
// new RemoveWhitespaces(), // new RemoveWhitespaces(),
// new DetectFootnotes(),
// new DetectLinks(), // new DetectLinks(),
// new HeadlineDetector(), // new HeadlineDetector(),
// new HeadlineToUppercase(), // new HeadlineToUppercase(),

View File

@ -7,6 +7,7 @@ export const PARAGRAPH = "Paragraph";
export const LIST_BLOCK = "List"; export const LIST_BLOCK = "List";
export const CODE_BLOCK = "Code/Quote"; export const CODE_BLOCK = "Code/Quote";
export const TOC_BLOCK = "TOC"; export const TOC_BLOCK = "TOC";
export const FOOTNOTE_BLOCK = "Footnotes"
export function blockToText(block: PdfBlock) { export function blockToText(block: PdfBlock) {
switch (block.type) { switch (block.type) {

View File

@ -1,5 +1,6 @@
import TextItem from './TextItem.jsx'; import TextItem from './TextItem.jsx';
import { isNumber } from '../functions.jsx' import { isNumber, isDigit } from '../functions.jsx'
import { sortByX } from '../textItemFunctions.jsx'
//Combines text items which are on the same Y at the same time doing inline transformations like //Combines text items which are on the same Y at the same time doing inline transformations like
//'whitespace removal', bold/emphasis annotation, link-detection, etc.. //'whitespace removal', bold/emphasis annotation, link-detection, etc..
@ -58,66 +59,88 @@ export default class TextItemCombiner {
} }
groupByFollowingY(textItems) { groupByFollowingY(textItems) {
const yArrays = []; const footnoteLinks = [];
const footnotes = []; const footnotes = [];
var itemsWithSameY = [];
var lastItem;
const wrapUpLine = () => { var lines = this.groupItemsByLine(textItems);
// we can't trust order of occurence, esp. footnotes like to come last lines = lines.map(lineItems => {
itemsWithSameY.sort((a, b) => { const basicY = lineItems[0].y;
return a.x - b.x; const newLineItems = [];
}); var stashedNumberItems = [];
const finalArray = [];
const basicY = itemsWithSameY[0].y;
var savedFootnoteItems = []; const commitStashedNumbers = (nextItem) => {
const commitSavedFootnotes = () => { if (stashedNumberItems.length > 0) {
if (savedFootnoteItems.length > 0) { const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
const footnoteNumber = savedFootnoteItems.map(footnoteItem => footnoteItem.text).join(''); if (stashedNumberItems[0].y > basicY) { // footnote link
finalArray.push(new TextItem({ newLineItems.push(new TextItem({
...savedFootnoteItems[0], ...stashedNumberItems[0],
//TODO make formatting configurable //TODO make formatting configurable
// text: `<sup>[${footnoteNumber}](#${footnoteNumber})</sup>` // text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
text: `*${footnoteNumber}` text: `^${joinedNumber}`
})); }));
savedFootnoteItems = []; footnoteLinks.push(parseInt(joinedNumber));
footnotes.push(parseInt(footnoteNumber)); } else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
//TODO womb comp [29] => ydiff == 0
newLineItems.push(new TextItem({
...stashedNumberItems[0],
text: `(^${ joinedNumber}):`
}));
footnotes.push(joinedNumber);
} else {
stashedNumberItems.forEach(number => newLineItems.push(number));
}
stashedNumberItems = [];
} }
}; };
itemsWithSameY.forEach(item => { lineItems.forEach(item => {
const isFootnote = item.y > basicY && isNumber(item.text); if (newLineItems.length == 0 && item.text.trim().length == 0) {
if (isFootnote) { // skip whitespace on the beginning of a line
savedFootnoteItems.push(item);
} else { } else {
if (savedFootnoteItems.length > 0) { const isANumber = isNumber(item.text);
commitSavedFootnotes(); if (isANumber) {
stashedNumberItems.push(item);
} else {
if (stashedNumberItems.length > 0) {
commitStashedNumbers(item);
}
newLineItems.push(item);
} }
finalArray.push(item);
} }
}); });
commitSavedFootnotes(); commitStashedNumbers();
yArrays.push(finalArray); return newLineItems;
itemsWithSameY = [];
};
textItems.forEach(item => {
if (lastItem) {
if (Math.abs(lastItem.y - item.y) > this.mostUsedDistance / 2) {
wrapUpLine();
}
}
itemsWithSameY.push(item);
lastItem = item;
// }
}); });
wrapUpLine();
return [yArrays, new ParsedElements({
return [lines, new ParsedElements({
footnoteLinks: footnoteLinks,
footnotes: footnotes footnotes: footnotes
})]; })];
} }
// Splits the given text items into lines according to their y coordinate.
//
// Items are consumed in the order given. An item opens a new line as soon as its y differs
// from the y of the first item of the current line by at least half of `this.mostUsedDistance`.
// Each resulting line is then sorted in place with sortByX().
//
// Returns an array of lines, each line being a non-empty array of text items.
// An empty input yields an empty array (no lines).
groupItemsByLine(textItems:TextItem[]) {
    const lines = [];
    var currentLine = [];
    textItems.forEach(item => {
        // compare against the first item of the line, not the previous item
        if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) {
            lines.push(currentLine);
            currentLine = [];
        }
        currentLine.push(item);
    });
    // Only commit the trailing line if it holds items. For empty input this avoids returning
    // a single empty line, which callers reading lineItems[0] could not handle.
    if (currentLine.length > 0) {
        lines.push(currentLine);
    }
    lines.forEach(lineItems => {
        // we can't trust order of occurrence, esp. footnoteLinks like to come last
        sortByX(lineItems);
    });
    return lines;
}
} }
//Result of the TextItemCombiner#combine() //Result of the TextItemCombiner#combine()
@ -125,7 +148,6 @@ export class CombineResult {
constructor(options) { constructor(options) {
this.textItems = options.textItems; this.textItems = options.textItems;
this.footnotes = options.footnotes;
this.parsedElements = options.parsedElements; this.parsedElements = options.parsedElements;
} }
@ -134,10 +156,12 @@ export class CombineResult {
export class ParsedElements { export class ParsedElements {
constructor(options) { constructor(options) {
this.footnoteLinks = options.footnoteLinks;
this.footnotes = options.footnotes; this.footnotes = options.footnotes;
} }
add(parsedElements:ParsedElements) { add(parsedElements:ParsedElements) {
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes); this.footnotes = this.footnotes.concat(parsedElements.footnotes);
} }

View File

@ -1,70 +1,67 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx'; import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
import TextItem from '../TextItem.jsx';
import ParseResult from '../ParseResult.jsx'; import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx'; import PdfBlock from '../PdfBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { FOOTNOTE_BLOCK } from '../MarkdownElements.jsx';
import { isNumber } from '../../functions.jsx' //Detect quotes, code etc.. which is transformed to markdown code syntax
export default class DetectFootnotes extends ToPdfBlockViewTransformation {
export default class DetectFootnotes extends ToPdfViewTransformation {
constructor() { constructor() {
super("Detect Footnotes"); super("Detect Footnotes");
} }
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
var foundFootnotes = [];
const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance,
});
var nextFooterNumber = 1; parseResult.content.forEach(page => {
var potentialFootnoteItem; const newBlocks = [];
var foundFootnotes = 0; var lastFootnote;
page.blocks.forEach(block => {
const newContent = parseResult.content.map(page => { newBlocks.push(block);
const newTextItems = []; if (!block.type && block.textItems[0].y < 200) {
for (var i = 0; i < page.textItems.length; i++) { const combineResult = textCombiner.combine(block.textItems);
const item = page.textItems[i]; if (combineResult.parsedElements.footnotes.length > 0) {
if (potentialFootnoteItem) { block.annotation = REMOVED_ANNOTATION;
if (potentialFootnoteItem.y - item.y < item.height) { foundFootnotes.push.apply(foundFootnotes, combineResult.parsedElements.footnotes);
potentialFootnoteItem.annotation = REMOVED_ANNOTATION; lastFootnote = new PdfBlock({
item.annotation = REMOVED_ANNOTATION; textItems: combineResult.textItems,
newTextItems.push(potentialFootnoteItem); type: FOOTNOTE_BLOCK,
newTextItems.push(item); annotation: ADDED_ANNOTATION,
newTextItems.push(new TextItem({ parsedElements: combineResult.parsedElements
x: potentialFootnoteItem.x, })
y: item.y, newBlocks.push(lastFootnote);
width: potentialFootnoteItem.width + item.width, } else if (lastFootnote) {
height: item.height, // likely to be the second line of aboves footnote
text: '[' + potentialFootnoteItem.text + '] ' + item.text, block.annotation = REMOVED_ANNOTATION;
annotation: ADDED_ANNOTATION lastFootnote.textItems = lastFootnote.textItems.concat(combineResult.textItems);
})); lastFootnote.parsedElements.add(combineResult.parsedElements);
//TODO repsect multiline!! newBlocks[newBlocks.length - 2] = block;
nextFooterNumber++; newBlocks[newBlocks.length - 1] = lastFootnote;
foundFootnotes++;
} }
potentialFootnoteItem = null;
} else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
potentialFootnoteItem = item;
} else { } else {
newTextItems.push(item); lastFootnote = null;
} }
} });
return { page.blocks = newBlocks;
...page,
textItems: newTextItems
};
}); });
return new ParseResult({ return new ParseResult({
...parseResult, ...parseResult,
content: newContent, messages: [
messages: ['Detected ' + foundFootnotes + ' footnotes'] 'Detected ' + foundFootnotes.length + ' footnotes:',
foundFootnotes.join(', ')
]
}); });
} }
completeTransform(parseResult:ParseResult) { }
parseResult.content.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
page.textItems.forEach(textItem => textItem.annotation = null)
});
return parseResult;
}
}

View File

@ -33,7 +33,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
//TODO generic state machine code ? //TODO generic state machine code ?
page.textItems.reduce((oneCharacterItems, item) => { const leftOver = page.textItems.reduce((oneCharacterItems, item) => {
if (item.text.trim().length == 1) { if (item.text.trim().length == 1) {
if (oneCharacterItems.length == 0) { if (oneCharacterItems.length == 0) {
oneCharacterItems.push(item); oneCharacterItems.push(item);
@ -80,6 +80,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
} }
return oneCharacterItems; return oneCharacterItems;
}, []); }, []);
leftOver.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
return { return {
...page, ...page,

View File

@ -24,3 +24,15 @@ export function minXFromTextItems(items:TextItem) {
} }
return minX; return minX;
} }
// Sorts the given text items in place by ascending x coordinate.
// The passed array itself is reordered and nothing is returned;
// use sortCopyByX() when the original order must be kept.
export function sortByX(items:TextItem[]) {
    items.sort((a, b) => a.x - b.x);
}
// Returns a new array containing the given text items sorted by ascending x coordinate.
// The passed array keeps its order; only the array is copied, the items themselves are shared.
export function sortCopyByX(items:TextItem[]) {
    const copy = items.concat();
    sortByX(copy);
    return copy;
}