WIP footer detection

This commit is contained in:
Johannes Zillmann 2017-02-22 23:18:49 +01:00
parent b7db48af4b
commit 5827379d1b
7 changed files with 154 additions and 110 deletions

View File

@ -30,10 +30,18 @@ export default class PdfBlockPageView extends React.Component {
const colorStyle = block.annotation ? {
color: block.annotation.color
} : null;
var footnotesElement;
var footnoteLinks;
var footnotes;
if (block.parsedElements) {
if (block.parsedElements.footnoteLinks.length > 0) {
footnoteLinks = <div>
{ 'Footnote-Links: ' + block.parsedElements.footnoteLinks }
</div>;
}
if (block.parsedElements.footnotes.length > 0) {
footnotesElement = 'Footnotes: ' + block.parsedElements.footnotes;
footnotes = <div>
{ 'Footnotes: ' + block.parsedElements.footnotes }
</div>;
}
}
@ -43,7 +51,8 @@ export default class PdfBlockPageView extends React.Component {
</div>
<div style={ borderStyle }>
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
{ footnotesElement }
{ footnoteLinks }
{ footnotes }
</div>
</div>
});

View File

@ -4,17 +4,17 @@ import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import DetectTOC from './transformations/DetectTOC.jsx'
import DetectLists from './transformations/DetectLists.jsx'
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
import DetectFormats from './transformations/DetectFormats.jsx'
import CombineSameY from './transformations/CombineSameY.jsx';
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import DetectLinks from './transformations/DetectLinks.jsx'
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
import ToBlockSystem from './transformations/ToBlockSystem.jsx';
// import DetectFormats from './transformations/DetectFormats.jsx'
// import CombineSameY from './transformations/CombineSameY.jsx';
// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
// import DetectLinks from './transformations/DetectLinks.jsx'
// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
// import ToBlockSystem from './transformations/ToBlockSystem.jsx';
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
import ToMarkdown from './transformations/ToMarkdown.jsx'
@ -31,6 +31,7 @@ export default class AppState {
new RemoveRepetitiveElements(),
new VerticalToHorizontal(),
new DetectPdfBlocks(),
new DetectFootnotes(),
new DetectTOC(),
new DetectLists(),
new DetectCodeBlocks(),
@ -38,7 +39,6 @@ export default class AppState {
// new DetectFormats(),
// new CombineSameY(),
// new RemoveWhitespaces(),
// new DetectFootnotes(),
// new DetectLinks(),
// new HeadlineDetector(),
// new HeadlineToUppercase(),

View File

@ -7,6 +7,7 @@ export const PARAGRAPH = "Paragraph";
// Block type labels; a block carries one of these in its `type` field.
export const LIST_BLOCK = "List";
export const CODE_BLOCK = "Code/Quote";
export const TOC_BLOCK = "TOC";
export const FOOTNOTE_BLOCK = "Footnotes";
export function blockToText(block: PdfBlock) {
switch (block.type) {

View File

@ -1,5 +1,6 @@
import TextItem from './TextItem.jsx';
import { isNumber } from '../functions.jsx'
import { isNumber, isDigit } from '../functions.jsx'
import { sortByX } from '../textItemFunctions.jsx'
//Combines text items which are on the same Y at the same time doing inline transformations like
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
@ -58,66 +59,88 @@ export default class TextItemCombiner {
}
groupByFollowingY(textItems) {
const yArrays = [];
const footnoteLinks = [];
const footnotes = [];
var itemsWithSameY = [];
var lastItem;
const wrapUpLine = () => {
// we can't trust order of occurrence, esp. footnotes like to come last
itemsWithSameY.sort((a, b) => {
return a.x - b.x;
});
const finalArray = [];
const basicY = itemsWithSameY[0].y;
var savedFootnoteItems = [];
const commitSavedFootnotes = () => {
if (savedFootnoteItems.length > 0) {
const footnoteNumber = savedFootnoteItems.map(footnoteItem => footnoteItem.text).join('');
finalArray.push(new TextItem({
...savedFootnoteItems[0],
var lines = this.groupItemsByLine(textItems);
lines = lines.map(lineItems => {
const basicY = lineItems[0].y;
const newLineItems = [];
var stashedNumberItems = [];
const commitStashedNumbers = (nextItem) => {
if (stashedNumberItems.length > 0) {
const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
if (stashedNumberItems[0].y > basicY) { // footnote link
newLineItems.push(new TextItem({
...stashedNumberItems[0],
//TODO make formatting configurable
// text: `<sup>[${footnoteNumber}](#${footnoteNumber})</sup>`
text: `*${footnoteNumber}`
// text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
text: `^${joinedNumber}`
}));
savedFootnoteItems = [];
footnotes.push(parseInt(footnoteNumber));
}
};
itemsWithSameY.forEach(item => {
const isFootnote = item.y > basicY && isNumber(item.text);
if (isFootnote) {
savedFootnoteItems.push(item);
footnoteLinks.push(parseInt(joinedNumber));
} else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
//TODO womb comp [29] => ydiff == 0
newLineItems.push(new TextItem({
...stashedNumberItems[0],
text: `(^${ joinedNumber}):`
}));
footnotes.push(joinedNumber);
} else {
if (savedFootnoteItems.length > 0) {
commitSavedFootnotes();
stashedNumberItems.forEach(number => newLineItems.push(number));
}
finalArray.push(item);
stashedNumberItems = [];
}
});
commitSavedFootnotes();
yArrays.push(finalArray);
itemsWithSameY = [];
};
textItems.forEach(item => {
if (lastItem) {
if (Math.abs(lastItem.y - item.y) > this.mostUsedDistance / 2) {
wrapUpLine();
lineItems.forEach(item => {
if (newLineItems.length == 0 && item.text.trim().length == 0) {
// skip whitespace on the beginning of a line
} else {
const isANumber = isNumber(item.text);
if (isANumber) {
stashedNumberItems.push(item);
} else {
if (stashedNumberItems.length > 0) {
commitStashedNumbers(item);
}
newLineItems.push(item);
}
}
itemsWithSameY.push(item);
lastItem = item;
// }
});
wrapUpLine();
commitStashedNumbers();
return newLineItems;
});
return [yArrays, new ParsedElements({
return [lines, new ParsedElements({
footnoteLinks: footnoteLinks,
footnotes: footnotes
})];
}
/**
 * Groups text items into lines by their Y coordinate and sorts every line by X.
 *
 * A new line is started whenever an item's Y differs from the Y of the first item
 * of the current line by at least half of `this.mostUsedDistance`.
 *
 * @param textItems the text items, in their order of occurrence
 * @returns the lines, each a non-empty array of text items sorted by X;
 *          an empty array when `textItems` is empty
 */
groupItemsByLine(textItems:TextItem[]) {
    const lines = [];
    let currentLine = [];
    textItems.forEach(item => {
        if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) {
            lines.push(currentLine);
            currentLine = [];
        }
        currentLine.push(item);
    });
    // Only commit the trailing line when it holds items: for empty input there is no
    // line at all, and an empty line would break code that reads the first item of a line.
    if (currentLine.length > 0) {
        lines.push(currentLine);
    }
    lines.forEach(lineItems => {
        // we can't trust order of occurrence, esp. footnoteLinks like to come last
        sortByX(lineItems);
    });
    return lines;
}
}
//Result of the TextItemCombiner#combine()
@ -125,7 +148,6 @@ export class CombineResult {
constructor(options) {
this.textItems = options.textItems;
this.footnotes = options.footnotes;
this.parsedElements = options.parsedElements;
}
@ -134,10 +156,12 @@ export class CombineResult {
export class ParsedElements {
// Takes over the parsed footnote links and footnotes handed in via `options`.
constructor(options) {
    const {footnoteLinks, footnotes} = options;
    this.footnoteLinks = footnoteLinks;
    this.footnotes = footnotes;
}
// Appends the footnote links and footnotes of another ParsedElements to this one.
// The fields are replaced with new concatenated arrays; the argument is not modified.
add(parsedElements:ParsedElements) {
    const {footnoteLinks, footnotes} = parsedElements;
    this.footnoteLinks = this.footnoteLinks.concat(footnoteLinks);
    this.footnotes = this.footnotes.concat(footnotes);
}

View File

@ -1,70 +1,67 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import TextItem from '../TextItem.jsx';
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
import PdfBlock from '../PdfBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
import { FOOTNOTE_BLOCK } from '../MarkdownElements.jsx';
import { isNumber } from '../../functions.jsx'
export default class DetectFootnotes extends ToPdfViewTransformation {
//Detects footnotes and re-emits them as blocks of type FOOTNOTE_BLOCK
export default class DetectFootnotes extends ToPdfBlockViewTransformation {
constructor() {
super("Detect Footnotes");
}
transform(parseResult:ParseResult) {
const {mostUsedDistance} = parseResult.globals;
var foundFootnotes = [];
const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance,
});
var nextFooterNumber = 1;
var potentialFootnoteItem;
var foundFootnotes = 0;
const newContent = parseResult.content.map(page => {
const newTextItems = [];
for (var i = 0; i < page.textItems.length; i++) {
const item = page.textItems[i];
if (potentialFootnoteItem) {
if (potentialFootnoteItem.y - item.y < item.height) {
potentialFootnoteItem.annotation = REMOVED_ANNOTATION;
item.annotation = REMOVED_ANNOTATION;
newTextItems.push(potentialFootnoteItem);
newTextItems.push(item);
newTextItems.push(new TextItem({
x: potentialFootnoteItem.x,
y: item.y,
width: potentialFootnoteItem.width + item.width,
height: item.height,
text: '[' + potentialFootnoteItem.text + '] ' + item.text,
annotation: ADDED_ANNOTATION
}));
//TODO respect multiline!!
nextFooterNumber++;
foundFootnotes++;
parseResult.content.forEach(page => {
const newBlocks = [];
var lastFootnote;
page.blocks.forEach(block => {
newBlocks.push(block);
if (!block.type && block.textItems[0].y < 200) {
const combineResult = textCombiner.combine(block.textItems);
if (combineResult.parsedElements.footnotes.length > 0) {
block.annotation = REMOVED_ANNOTATION;
foundFootnotes.push.apply(foundFootnotes, combineResult.parsedElements.footnotes);
lastFootnote = new PdfBlock({
textItems: combineResult.textItems,
type: FOOTNOTE_BLOCK,
annotation: ADDED_ANNOTATION,
parsedElements: combineResult.parsedElements
})
newBlocks.push(lastFootnote);
} else if (lastFootnote) {
// likely to be the second line of the footnote above
block.annotation = REMOVED_ANNOTATION;
lastFootnote.textItems = lastFootnote.textItems.concat(combineResult.textItems);
lastFootnote.parsedElements.add(combineResult.parsedElements);
newBlocks[newBlocks.length - 2] = block;
newBlocks[newBlocks.length - 1] = lastFootnote;
}
potentialFootnoteItem = null;
} else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
potentialFootnoteItem = item;
} else {
newTextItems.push(item);
lastFootnote = null;
}
}
return {
...page,
textItems: newTextItems
};
});
page.blocks = newBlocks;
});
return new ParseResult({
...parseResult,
content: newContent,
messages: ['Detected ' + foundFootnotes + ' footnotes']
messages: [
'Detected ' + foundFootnotes.length + ' footnotes:',
foundFootnotes.join(', ')
]
});
}
// Finalizes the transformation: on every page, drops the text items annotated as
// removed and clears the annotation on the ones that remain.
completeTransform(parseResult:ParseResult) {
    const isKept = textItem => !(textItem.annotation && textItem.annotation === REMOVED_ANNOTATION);
    parseResult.content.forEach(page => {
        const remaining = page.textItems.filter(textItem => isKept(textItem));
        page.textItems = remaining;
        remaining.forEach(textItem => {
            textItem.annotation = null;
        });
    });
    return parseResult;
}
}

View File

@ -33,7 +33,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
//TODO generic state machine code ?
page.textItems.reduce((oneCharacterItems, item) => {
const leftOver = page.textItems.reduce((oneCharacterItems, item) => {
if (item.text.trim().length == 1) {
if (oneCharacterItems.length == 0) {
oneCharacterItems.push(item);
@ -80,6 +80,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
}
return oneCharacterItems;
}, []);
leftOver.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
return {
...page,

View File

@ -24,3 +24,15 @@ export function minXFromTextItems(items:TextItem) {
}
return minX;
}
/**
 * Sorts the given text items in place by ascending `x`.
 *
 * @param items the text items to sort; the array itself is reordered
 */
export function sortByX(items:TextItem[]): void {
    items.sort((left, right) => left.x - right.x);
}
/**
 * Returns a shallow copy of the given text items sorted by ascending `x`.
 * The input array is left in its original order.
 *
 * @param items the text items to sort
 * @returns a new array holding the same items, ordered by `x`
 */
export function sortCopyByX(items:TextItem[]): TextItem[] {
    const copy = items.concat();
    sortByX(copy);
    return copy;
}