mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-27 23:30:48 +01:00
WIP footer detection
This commit is contained in:
parent
b7db48af4b
commit
5827379d1b
@ -30,10 +30,18 @@ export default class PdfBlockPageView extends React.Component {
|
||||
const colorStyle = block.annotation ? {
|
||||
color: block.annotation.color
|
||||
} : null;
|
||||
var footnotesElement;
|
||||
var footnoteLinks;
|
||||
var footnotes;
|
||||
if (block.parsedElements) {
|
||||
if (block.parsedElements.footnoteLinks.length > 0) {
|
||||
footnoteLinks = <div>
|
||||
{ 'Footnote-Links: ' + block.parsedElements.footnoteLinks }
|
||||
</div>;
|
||||
}
|
||||
if (block.parsedElements.footnotes.length > 0) {
|
||||
footnotesElement = 'Footnotes: ' + block.parsedElements.footnotes;
|
||||
footnotes = <div>
|
||||
{ 'Footnotes: ' + block.parsedElements.footnotes }
|
||||
</div>;
|
||||
}
|
||||
}
|
||||
|
||||
@ -43,7 +51,8 @@ export default class PdfBlockPageView extends React.Component {
|
||||
</div>
|
||||
<div style={ borderStyle }>
|
||||
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
|
||||
{ footnotesElement }
|
||||
{ footnoteLinks }
|
||||
{ footnotes }
|
||||
</div>
|
||||
</div>
|
||||
});
|
||||
|
@ -4,17 +4,17 @@ import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
||||
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
|
||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||
import DetectTOC from './transformations/DetectTOC.jsx'
|
||||
import DetectLists from './transformations/DetectLists.jsx'
|
||||
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
||||
import DetectFormats from './transformations/DetectFormats.jsx'
|
||||
import CombineSameY from './transformations/CombineSameY.jsx';
|
||||
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||
import DetectLinks from './transformations/DetectLinks.jsx'
|
||||
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
||||
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||
import ToBlockSystem from './transformations/ToBlockSystem.jsx';
|
||||
// import DetectFormats from './transformations/DetectFormats.jsx'
|
||||
// import CombineSameY from './transformations/CombineSameY.jsx';
|
||||
// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
||||
// import DetectLinks from './transformations/DetectLinks.jsx'
|
||||
// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
||||
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||
// import ToBlockSystem from './transformations/ToBlockSystem.jsx';
|
||||
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
|
||||
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
||||
|
||||
@ -31,6 +31,7 @@ export default class AppState {
|
||||
new RemoveRepetitiveElements(),
|
||||
new VerticalToHorizontal(),
|
||||
new DetectPdfBlocks(),
|
||||
new DetectFootnotes(),
|
||||
new DetectTOC(),
|
||||
new DetectLists(),
|
||||
new DetectCodeBlocks(),
|
||||
@ -38,7 +39,6 @@ export default class AppState {
|
||||
// new DetectFormats(),
|
||||
// new CombineSameY(),
|
||||
// new RemoveWhitespaces(),
|
||||
// new DetectFootnotes(),
|
||||
// new DetectLinks(),
|
||||
// new HeadlineDetector(),
|
||||
// new HeadlineToUppercase(),
|
||||
|
@ -7,6 +7,7 @@ export const PARAGRAPH = "Paragraph";
|
||||
export const LIST_BLOCK = "List";
|
||||
export const CODE_BLOCK = "Code/Quote";
|
||||
export const TOC_BLOCK = "TOC";
|
||||
export const FOOTNOTE_BLOCK = "Footnotes"
|
||||
|
||||
export function blockToText(block: PdfBlock) {
|
||||
switch (block.type) {
|
||||
|
@ -1,5 +1,6 @@
|
||||
import TextItem from './TextItem.jsx';
|
||||
import { isNumber } from '../functions.jsx'
|
||||
import { isNumber, isDigit } from '../functions.jsx'
|
||||
import { sortByX } from '../textItemFunctions.jsx'
|
||||
|
||||
//Combines text items which are on the same Y at the same time doing inline transformations like
|
||||
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
||||
@ -58,66 +59,88 @@ export default class TextItemCombiner {
|
||||
}
|
||||
|
||||
groupByFollowingY(textItems) {
|
||||
const yArrays = [];
|
||||
const footnoteLinks = [];
|
||||
const footnotes = [];
|
||||
var itemsWithSameY = [];
|
||||
var lastItem;
|
||||
|
||||
|
||||
const wrapUpLine = () => {
|
||||
// we can't trust order of occurrence, esp. footnotes like to come last
|
||||
itemsWithSameY.sort((a, b) => {
|
||||
return a.x - b.x;
|
||||
});
|
||||
const finalArray = [];
|
||||
const basicY = itemsWithSameY[0].y;
|
||||
var savedFootnoteItems = [];
|
||||
const commitSavedFootnotes = () => {
|
||||
if (savedFootnoteItems.length > 0) {
|
||||
const footnoteNumber = savedFootnoteItems.map(footnoteItem => footnoteItem.text).join('');
|
||||
finalArray.push(new TextItem({
|
||||
...savedFootnoteItems[0],
|
||||
//TODO make formatting configurable
|
||||
// text: `<sup>[${footnoteNumber}](#${footnoteNumber})</sup>`
|
||||
text: `*${footnoteNumber}`
|
||||
}));
|
||||
savedFootnoteItems = [];
|
||||
footnotes.push(parseInt(footnoteNumber));
|
||||
var lines = this.groupItemsByLine(textItems);
|
||||
lines = lines.map(lineItems => {
|
||||
const basicY = lineItems[0].y;
|
||||
const newLineItems = [];
|
||||
var stashedNumberItems = [];
|
||||
|
||||
|
||||
const commitStashedNumbers = (nextItem) => {
|
||||
if (stashedNumberItems.length > 0) {
|
||||
const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
|
||||
if (stashedNumberItems[0].y > basicY) { // footnote link
|
||||
newLineItems.push(new TextItem({
|
||||
...stashedNumberItems[0],
|
||||
//TODO make formatting configurable
|
||||
// text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
|
||||
text: `^${joinedNumber}`
|
||||
}));
|
||||
footnoteLinks.push(parseInt(joinedNumber));
|
||||
} else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
|
||||
//TODO womb comp [29] => ydiff == 0
|
||||
newLineItems.push(new TextItem({
|
||||
...stashedNumberItems[0],
|
||||
text: `(^${ joinedNumber}):`
|
||||
}));
|
||||
footnotes.push(joinedNumber);
|
||||
} else {
|
||||
stashedNumberItems.forEach(number => newLineItems.push(number));
|
||||
}
|
||||
|
||||
stashedNumberItems = [];
|
||||
}
|
||||
};
|
||||
|
||||
itemsWithSameY.forEach(item => {
|
||||
const isFootnote = item.y > basicY && isNumber(item.text);
|
||||
if (isFootnote) {
|
||||
savedFootnoteItems.push(item);
|
||||
lineItems.forEach(item => {
|
||||
if (newLineItems.length == 0 && item.text.trim().length == 0) {
|
||||
// skip whitespace on the beginning of a line
|
||||
} else {
|
||||
if (savedFootnoteItems.length > 0) {
|
||||
commitSavedFootnotes();
|
||||
const isANumber = isNumber(item.text);
|
||||
if (isANumber) {
|
||||
stashedNumberItems.push(item);
|
||||
} else {
|
||||
if (stashedNumberItems.length > 0) {
|
||||
commitStashedNumbers(item);
|
||||
}
|
||||
newLineItems.push(item);
|
||||
}
|
||||
finalArray.push(item);
|
||||
}
|
||||
});
|
||||
commitSavedFootnotes();
|
||||
yArrays.push(finalArray);
|
||||
itemsWithSameY = [];
|
||||
};
|
||||
|
||||
textItems.forEach(item => {
|
||||
if (lastItem) {
|
||||
if (Math.abs(lastItem.y - item.y) > this.mostUsedDistance / 2) {
|
||||
wrapUpLine();
|
||||
}
|
||||
}
|
||||
itemsWithSameY.push(item);
|
||||
lastItem = item;
|
||||
// }
|
||||
commitStashedNumbers();
|
||||
return newLineItems;
|
||||
});
|
||||
wrapUpLine();
|
||||
|
||||
return [yArrays, new ParsedElements({
|
||||
|
||||
return [lines, new ParsedElements({
|
||||
footnoteLinks: footnoteLinks,
|
||||
footnotes: footnotes
|
||||
})];
|
||||
}
|
||||
|
||||
/**
 * Splits the given text items into lines and sorts every line by x.
 *
 * An item starts a new line when its y differs from the y of the first item
 * of the current line by at least half of `this.mostUsedDistance`.
 *
 * @param textItems the items to group, in their order of occurrence
 * @returns an array of lines, each line being an array of items sorted by x;
 *          empty when no items were given
 */
groupItemsByLine(textItems:TextItem[]) {
    const lines = [];
    let currentLine = [];
    textItems.forEach(item => {
        // the first item of a line is the reference y for the whole line
        const startsNewLine = currentLine.length > 0
            && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2;
        if (startsNewLine) {
            lines.push(currentLine);
            currentLine = [];
        }
        currentLine.push(item);
    });
    // Flush the trailing line only if it holds items. For empty input this
    // avoids returning [[]], which callers reading lineItems[0] would choke on.
    if (currentLine.length > 0) {
        lines.push(currentLine);
    }

    lines.forEach(lineItems => {
        // we can't trust order of occurrence, esp. footnoteLinks like to come last
        sortByX(lineItems);
    });
    return lines;
}
|
||||
|
||||
}
|
||||
|
||||
//Result of the TextItemCombiner#combine()
|
||||
@ -125,7 +148,6 @@ export class CombineResult {
|
||||
|
||||
constructor(options) {
|
||||
this.textItems = options.textItems;
|
||||
this.footnotes = options.footnotes;
|
||||
this.parsedElements = options.parsedElements;
|
||||
}
|
||||
|
||||
@ -134,10 +156,12 @@ export class CombineResult {
|
||||
export class ParsedElements {
|
||||
|
||||
// Takes over the footnote links and the footnotes from the given options object.
constructor(options) {
    const { footnoteLinks, footnotes } = options;
    this.footnoteLinks = footnoteLinks;
    this.footnotes = footnotes;
}
|
||||
|
||||
// Appends the footnote links and the footnotes of the given ParsedElements to the
// ones held by this instance (both arrays are replaced by the concatenated result).
add(parsedElements:ParsedElements) {
    const { footnoteLinks: otherLinks, footnotes: otherFootnotes } = parsedElements;
    this.footnoteLinks = this.footnoteLinks.concat(otherLinks);
    this.footnotes = this.footnotes.concat(otherFootnotes);
}
|
||||
|
||||
|
@ -1,70 +1,67 @@
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
import PdfBlock from '../PdfBlock.jsx';
|
||||
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
import { FOOTNOTE_BLOCK } from '../MarkdownElements.jsx';
|
||||
|
||||
import { isNumber } from '../../functions.jsx'
|
||||
|
||||
export default class DetectFootnotes extends ToPdfViewTransformation {
|
||||
//Detects footnotes and adds them as dedicated footnote blocks
|
||||
export default class DetectFootnotes extends ToPdfBlockViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Footnotes");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
var foundFootnotes = [];
|
||||
const textCombiner = new TextItemCombiner({
|
||||
mostUsedDistance: mostUsedDistance,
|
||||
});
|
||||
|
||||
var nextFooterNumber = 1;
|
||||
var potentialFootnoteItem;
|
||||
var foundFootnotes = 0;
|
||||
|
||||
const newContent = parseResult.content.map(page => {
|
||||
const newTextItems = [];
|
||||
for (var i = 0; i < page.textItems.length; i++) {
|
||||
const item = page.textItems[i];
|
||||
if (potentialFootnoteItem) {
|
||||
if (potentialFootnoteItem.y - item.y < item.height) {
|
||||
potentialFootnoteItem.annotation = REMOVED_ANNOTATION;
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
newTextItems.push(potentialFootnoteItem);
|
||||
newTextItems.push(item);
|
||||
newTextItems.push(new TextItem({
|
||||
x: potentialFootnoteItem.x,
|
||||
y: item.y,
|
||||
width: potentialFootnoteItem.width + item.width,
|
||||
height: item.height,
|
||||
text: '[' + potentialFootnoteItem.text + '] ' + item.text,
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
//TODO respect multiline!!
|
||||
nextFooterNumber++;
|
||||
foundFootnotes++;
|
||||
parseResult.content.forEach(page => {
|
||||
const newBlocks = [];
|
||||
var lastFootnote;
|
||||
page.blocks.forEach(block => {
|
||||
newBlocks.push(block);
|
||||
if (!block.type && block.textItems[0].y < 200) {
|
||||
const combineResult = textCombiner.combine(block.textItems);
|
||||
if (combineResult.parsedElements.footnotes.length > 0) {
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
foundFootnotes.push.apply(foundFootnotes, combineResult.parsedElements.footnotes);
|
||||
lastFootnote = new PdfBlock({
|
||||
textItems: combineResult.textItems,
|
||||
type: FOOTNOTE_BLOCK,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
parsedElements: combineResult.parsedElements
|
||||
})
|
||||
newBlocks.push(lastFootnote);
|
||||
} else if (lastFootnote) {
|
||||
// likely to be the second line of the footnote above
|
||||
block.annotation = REMOVED_ANNOTATION;
|
||||
lastFootnote.textItems = lastFootnote.textItems.concat(combineResult.textItems);
|
||||
lastFootnote.parsedElements.add(combineResult.parsedElements);
|
||||
newBlocks[newBlocks.length - 2] = block;
|
||||
newBlocks[newBlocks.length - 1] = lastFootnote;
|
||||
}
|
||||
potentialFootnoteItem = null;
|
||||
} else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
|
||||
potentialFootnoteItem = item;
|
||||
} else {
|
||||
newTextItems.push(item);
|
||||
lastFootnote = null;
|
||||
}
|
||||
}
|
||||
return {
|
||||
...page,
|
||||
textItems: newTextItems
|
||||
};
|
||||
});
|
||||
page.blocks = newBlocks;
|
||||
});
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: newContent,
|
||||
messages: ['Detected ' + foundFootnotes + ' footnotes']
|
||||
messages: [
|
||||
'Detected ' + foundFootnotes.length + ' footnotes:',
|
||||
foundFootnotes.join(', ')
|
||||
]
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
parseResult.content.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
@ -33,7 +33,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
|
||||
|
||||
//TODO generic state machine code ?
|
||||
|
||||
page.textItems.reduce((oneCharacterItems, item) => {
|
||||
const leftOver = page.textItems.reduce((oneCharacterItems, item) => {
|
||||
if (item.text.trim().length == 1) {
|
||||
if (oneCharacterItems.length == 0) {
|
||||
oneCharacterItems.push(item);
|
||||
@ -80,6 +80,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
|
||||
}
|
||||
return oneCharacterItems;
|
||||
}, []);
|
||||
leftOver.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
|
||||
|
||||
return {
|
||||
...page,
|
||||
|
@ -24,3 +24,15 @@ export function minXFromTextItems(items:TextItem) {
|
||||
}
|
||||
return minX;
|
||||
}
|
||||
|
||||
/**
 * Sorts the given text items in place by ascending x.
 *
 * @param items the items to sort; the array itself is reordered
 */
export function sortByX(items:TextItem[]): void {
    items.sort((a, b) => a.x - b.x);
}
|
||||
|
||||
/**
 * Returns a copy of the given text items sorted by ascending x.
 * The passed array is left untouched; the items themselves are not cloned.
 *
 * @param items the items to sort
 * @returns a new array holding the same items ordered by x
 */
export function sortCopyByX(items:TextItem[]): TextItem[] {
    // shallow copy, so the in-place sort does not reorder the caller's array
    const copy = items.slice();
    sortByX(copy);
    return copy;
}
|
Loading…
Reference in New Issue
Block a user