mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-28 07:40:47 +01:00
WIP footer detection
This commit is contained in:
parent
b7db48af4b
commit
5827379d1b
@ -30,10 +30,18 @@ export default class PdfBlockPageView extends React.Component {
|
|||||||
const colorStyle = block.annotation ? {
|
const colorStyle = block.annotation ? {
|
||||||
color: block.annotation.color
|
color: block.annotation.color
|
||||||
} : null;
|
} : null;
|
||||||
var footnotesElement;
|
var footnoteLinks;
|
||||||
|
var footnotes;
|
||||||
if (block.parsedElements) {
|
if (block.parsedElements) {
|
||||||
|
if (block.parsedElements.footnoteLinks.length > 0) {
|
||||||
|
footnoteLinks = <div>
|
||||||
|
{ 'Footnote-Links: ' + block.parsedElements.footnoteLinks }
|
||||||
|
</div>;
|
||||||
|
}
|
||||||
if (block.parsedElements.footnotes.length > 0) {
|
if (block.parsedElements.footnotes.length > 0) {
|
||||||
footnotesElement = 'Footnotes: ' + block.parsedElements.footnotes;
|
footnotes = <div>
|
||||||
|
{ 'Footnotes: ' + block.parsedElements.footnotes }
|
||||||
|
</div>;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -43,7 +51,8 @@ export default class PdfBlockPageView extends React.Component {
|
|||||||
</div>
|
</div>
|
||||||
<div style={ borderStyle }>
|
<div style={ borderStyle }>
|
||||||
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
|
<TextItemTable textItems={ textItems } showWhitespaces={ showWhitespaces } />
|
||||||
{ footnotesElement }
|
{ footnoteLinks }
|
||||||
|
{ footnotes }
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
});
|
});
|
||||||
|
@ -4,17 +4,17 @@ import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
|||||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||||
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
||||||
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
|
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
|
||||||
|
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||||
import DetectTOC from './transformations/DetectTOC.jsx'
|
import DetectTOC from './transformations/DetectTOC.jsx'
|
||||||
import DetectLists from './transformations/DetectLists.jsx'
|
import DetectLists from './transformations/DetectLists.jsx'
|
||||||
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
|
||||||
import DetectFormats from './transformations/DetectFormats.jsx'
|
// import DetectFormats from './transformations/DetectFormats.jsx'
|
||||||
import CombineSameY from './transformations/CombineSameY.jsx';
|
// import CombineSameY from './transformations/CombineSameY.jsx';
|
||||||
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
||||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
// import DetectLinks from './transformations/DetectLinks.jsx'
|
||||||
import DetectLinks from './transformations/DetectLinks.jsx'
|
// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
||||||
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||||
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
// import ToBlockSystem from './transformations/ToBlockSystem.jsx';
|
||||||
import ToBlockSystem from './transformations/ToBlockSystem.jsx';
|
|
||||||
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
|
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
|
||||||
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
||||||
|
|
||||||
@ -31,6 +31,7 @@ export default class AppState {
|
|||||||
new RemoveRepetitiveElements(),
|
new RemoveRepetitiveElements(),
|
||||||
new VerticalToHorizontal(),
|
new VerticalToHorizontal(),
|
||||||
new DetectPdfBlocks(),
|
new DetectPdfBlocks(),
|
||||||
|
new DetectFootnotes(),
|
||||||
new DetectTOC(),
|
new DetectTOC(),
|
||||||
new DetectLists(),
|
new DetectLists(),
|
||||||
new DetectCodeBlocks(),
|
new DetectCodeBlocks(),
|
||||||
@ -38,7 +39,6 @@ export default class AppState {
|
|||||||
// new DetectFormats(),
|
// new DetectFormats(),
|
||||||
// new CombineSameY(),
|
// new CombineSameY(),
|
||||||
// new RemoveWhitespaces(),
|
// new RemoveWhitespaces(),
|
||||||
// new DetectFootnotes(),
|
|
||||||
// new DetectLinks(),
|
// new DetectLinks(),
|
||||||
// new HeadlineDetector(),
|
// new HeadlineDetector(),
|
||||||
// new HeadlineToUppercase(),
|
// new HeadlineToUppercase(),
|
||||||
|
@ -7,6 +7,7 @@ export const PARAGRAPH = "Paragraph";
|
|||||||
export const LIST_BLOCK = "List";
|
export const LIST_BLOCK = "List";
|
||||||
export const CODE_BLOCK = "Code/Quote";
|
export const CODE_BLOCK = "Code/Quote";
|
||||||
export const TOC_BLOCK = "TOC";
|
export const TOC_BLOCK = "TOC";
|
||||||
|
export const FOOTNOTE_BLOCK = "Footnotes"
|
||||||
|
|
||||||
export function blockToText(block: PdfBlock) {
|
export function blockToText(block: PdfBlock) {
|
||||||
switch (block.type) {
|
switch (block.type) {
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import TextItem from './TextItem.jsx';
|
import TextItem from './TextItem.jsx';
|
||||||
import { isNumber } from '../functions.jsx'
|
import { isNumber, isDigit } from '../functions.jsx'
|
||||||
|
import { sortByX } from '../textItemFunctions.jsx'
|
||||||
|
|
||||||
//Combines text items which are on the same Y at the same time doing inline transformations like
|
//Combines text items which are on the same Y at the same time doing inline transformations like
|
||||||
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
||||||
@ -58,66 +59,88 @@ export default class TextItemCombiner {
|
|||||||
}
|
}
|
||||||
|
|
||||||
groupByFollowingY(textItems) {
|
groupByFollowingY(textItems) {
|
||||||
const yArrays = [];
|
const footnoteLinks = [];
|
||||||
const footnotes = [];
|
const footnotes = [];
|
||||||
var itemsWithSameY = [];
|
|
||||||
var lastItem;
|
|
||||||
|
|
||||||
|
|
||||||
const wrapUpLine = () => {
|
var lines = this.groupItemsByLine(textItems);
|
||||||
// we can't trust order of occurence, esp. footnotes like to come last
|
lines = lines.map(lineItems => {
|
||||||
itemsWithSameY.sort((a, b) => {
|
const basicY = lineItems[0].y;
|
||||||
return a.x - b.x;
|
const newLineItems = [];
|
||||||
});
|
var stashedNumberItems = [];
|
||||||
const finalArray = [];
|
|
||||||
const basicY = itemsWithSameY[0].y;
|
|
||||||
var savedFootnoteItems = [];
|
const commitStashedNumbers = (nextItem) => {
|
||||||
const commitSavedFootnotes = () => {
|
if (stashedNumberItems.length > 0) {
|
||||||
if (savedFootnoteItems.length > 0) {
|
const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
|
||||||
const footnoteNumber = savedFootnoteItems.map(footnoteItem => footnoteItem.text).join('');
|
if (stashedNumberItems[0].y > basicY) { // footnote link
|
||||||
finalArray.push(new TextItem({
|
newLineItems.push(new TextItem({
|
||||||
...savedFootnoteItems[0],
|
...stashedNumberItems[0],
|
||||||
//TODO make fomatting configurable
|
//TODO make fomatting configurable
|
||||||
// text: `<sup>[${footnoteNumber}](#${footnoteNumber})</sup>`
|
// text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
|
||||||
text: `*${footnoteNumber}`
|
text: `^${joinedNumber}`
|
||||||
}));
|
}));
|
||||||
savedFootnoteItems = [];
|
footnoteLinks.push(parseInt(joinedNumber));
|
||||||
footnotes.push(parseInt(footnoteNumber));
|
} else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
|
||||||
|
//TODO womb comp [29] => ydiff == 0
|
||||||
|
newLineItems.push(new TextItem({
|
||||||
|
...stashedNumberItems[0],
|
||||||
|
text: `(^${ joinedNumber}):`
|
||||||
|
}));
|
||||||
|
footnotes.push(joinedNumber);
|
||||||
|
} else {
|
||||||
|
stashedNumberItems.forEach(number => newLineItems.push(number));
|
||||||
|
}
|
||||||
|
|
||||||
|
stashedNumberItems = [];
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
itemsWithSameY.forEach(item => {
|
lineItems.forEach(item => {
|
||||||
const isFootnote = item.y > basicY && isNumber(item.text);
|
if (newLineItems.length == 0 && item.text.trim().length == 0) {
|
||||||
if (isFootnote) {
|
// skip whitespace on the beginning of a line
|
||||||
savedFootnoteItems.push(item);
|
|
||||||
} else {
|
} else {
|
||||||
if (savedFootnoteItems.length > 0) {
|
const isANumber = isNumber(item.text);
|
||||||
commitSavedFootnotes();
|
if (isANumber) {
|
||||||
|
stashedNumberItems.push(item);
|
||||||
|
} else {
|
||||||
|
if (stashedNumberItems.length > 0) {
|
||||||
|
commitStashedNumbers(item);
|
||||||
|
}
|
||||||
|
newLineItems.push(item);
|
||||||
}
|
}
|
||||||
finalArray.push(item);
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
commitSavedFootnotes();
|
commitStashedNumbers();
|
||||||
yArrays.push(finalArray);
|
return newLineItems;
|
||||||
itemsWithSameY = [];
|
|
||||||
};
|
|
||||||
|
|
||||||
textItems.forEach(item => {
|
|
||||||
if (lastItem) {
|
|
||||||
if (Math.abs(lastItem.y - item.y) > this.mostUsedDistance / 2) {
|
|
||||||
wrapUpLine();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
itemsWithSameY.push(item);
|
|
||||||
lastItem = item;
|
|
||||||
// }
|
|
||||||
});
|
});
|
||||||
wrapUpLine();
|
|
||||||
|
|
||||||
return [yArrays, new ParsedElements({
|
|
||||||
|
return [lines, new ParsedElements({
|
||||||
|
footnoteLinks: footnoteLinks,
|
||||||
footnotes: footnotes
|
footnotes: footnotes
|
||||||
})];
|
})];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
groupItemsByLine(textItems:TextItem[]) {
|
||||||
|
const lines = [];
|
||||||
|
var currentLine = [];
|
||||||
|
textItems.forEach(item => {
|
||||||
|
if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) {
|
||||||
|
lines.push(currentLine);
|
||||||
|
currentLine = [];
|
||||||
|
}
|
||||||
|
currentLine.push(item);
|
||||||
|
});
|
||||||
|
lines.push(currentLine);
|
||||||
|
|
||||||
|
lines.forEach(lineItems => {
|
||||||
|
// we can't trust order of occurence, esp. footnoteLinks like to come last
|
||||||
|
sortByX(lineItems);
|
||||||
|
});
|
||||||
|
return lines;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//Result of the TextItemCombiner#combine()
|
//Result of the TextItemCombiner#combine()
|
||||||
@ -125,7 +148,6 @@ export class CombineResult {
|
|||||||
|
|
||||||
constructor(options) {
|
constructor(options) {
|
||||||
this.textItems = options.textItems;
|
this.textItems = options.textItems;
|
||||||
this.footnotes = options.footnotes;
|
|
||||||
this.parsedElements = options.parsedElements;
|
this.parsedElements = options.parsedElements;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -134,10 +156,12 @@ export class CombineResult {
|
|||||||
export class ParsedElements {
|
export class ParsedElements {
|
||||||
|
|
||||||
constructor(options) {
|
constructor(options) {
|
||||||
|
this.footnoteLinks = options.footnoteLinks;
|
||||||
this.footnotes = options.footnotes;
|
this.footnotes = options.footnotes;
|
||||||
}
|
}
|
||||||
|
|
||||||
add(parsedElements:ParsedElements) {
|
add(parsedElements:ParsedElements) {
|
||||||
|
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
|
||||||
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
|
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,70 +1,67 @@
|
|||||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
|
||||||
import TextItem from '../TextItem.jsx';
|
|
||||||
import ParseResult from '../ParseResult.jsx';
|
import ParseResult from '../ParseResult.jsx';
|
||||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
import PdfBlock from '../PdfBlock.jsx';
|
||||||
|
import TextItemCombiner from '../TextItemCombiner.jsx';
|
||||||
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
import { FOOTNOTE_BLOCK } from '../MarkdownElements.jsx';
|
||||||
|
|
||||||
import { isNumber } from '../../functions.jsx'
|
//Detect quotes, code etc.. which is transformed to markdown code syntax
|
||||||
|
export default class DetectFootnotes extends ToPdfBlockViewTransformation {
|
||||||
export default class DetectFootnotes extends ToPdfViewTransformation {
|
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Detect Footnotes");
|
super("Detect Footnotes");
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
|
const {mostUsedDistance} = parseResult.globals;
|
||||||
|
var foundFootnotes = [];
|
||||||
|
const textCombiner = new TextItemCombiner({
|
||||||
|
mostUsedDistance: mostUsedDistance,
|
||||||
|
});
|
||||||
|
|
||||||
var nextFooterNumber = 1;
|
parseResult.content.forEach(page => {
|
||||||
var potentialFootnoteItem;
|
const newBlocks = [];
|
||||||
var foundFootnotes = 0;
|
var lastFootnote;
|
||||||
|
page.blocks.forEach(block => {
|
||||||
const newContent = parseResult.content.map(page => {
|
newBlocks.push(block);
|
||||||
const newTextItems = [];
|
if (!block.type && block.textItems[0].y < 200) {
|
||||||
for (var i = 0; i < page.textItems.length; i++) {
|
const combineResult = textCombiner.combine(block.textItems);
|
||||||
const item = page.textItems[i];
|
if (combineResult.parsedElements.footnotes.length > 0) {
|
||||||
if (potentialFootnoteItem) {
|
block.annotation = REMOVED_ANNOTATION;
|
||||||
if (potentialFootnoteItem.y - item.y < item.height) {
|
foundFootnotes.push.apply(foundFootnotes, combineResult.parsedElements.footnotes);
|
||||||
potentialFootnoteItem.annotation = REMOVED_ANNOTATION;
|
lastFootnote = new PdfBlock({
|
||||||
item.annotation = REMOVED_ANNOTATION;
|
textItems: combineResult.textItems,
|
||||||
newTextItems.push(potentialFootnoteItem);
|
type: FOOTNOTE_BLOCK,
|
||||||
newTextItems.push(item);
|
annotation: ADDED_ANNOTATION,
|
||||||
newTextItems.push(new TextItem({
|
parsedElements: combineResult.parsedElements
|
||||||
x: potentialFootnoteItem.x,
|
})
|
||||||
y: item.y,
|
newBlocks.push(lastFootnote);
|
||||||
width: potentialFootnoteItem.width + item.width,
|
} else if (lastFootnote) {
|
||||||
height: item.height,
|
// likely to be the second line of aboves footnote
|
||||||
text: '[' + potentialFootnoteItem.text + '] ' + item.text,
|
block.annotation = REMOVED_ANNOTATION;
|
||||||
annotation: ADDED_ANNOTATION
|
lastFootnote.textItems = lastFootnote.textItems.concat(combineResult.textItems);
|
||||||
}));
|
lastFootnote.parsedElements.add(combineResult.parsedElements);
|
||||||
//TODO repsect multiline!!
|
newBlocks[newBlocks.length - 2] = block;
|
||||||
nextFooterNumber++;
|
newBlocks[newBlocks.length - 1] = lastFootnote;
|
||||||
foundFootnotes++;
|
|
||||||
}
|
}
|
||||||
potentialFootnoteItem = null;
|
|
||||||
} else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
|
|
||||||
potentialFootnoteItem = item;
|
|
||||||
} else {
|
} else {
|
||||||
newTextItems.push(item);
|
lastFootnote = null;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
return {
|
page.blocks = newBlocks;
|
||||||
...page,
|
|
||||||
textItems: newTextItems
|
|
||||||
};
|
|
||||||
});
|
});
|
||||||
|
|
||||||
return new ParseResult({
|
return new ParseResult({
|
||||||
...parseResult,
|
...parseResult,
|
||||||
content: newContent,
|
messages: [
|
||||||
messages: ['Detected ' + foundFootnotes + ' footnotes']
|
'Detected ' + foundFootnotes.length + ' footnotes:',
|
||||||
|
foundFootnotes.join(', ')
|
||||||
|
]
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
completeTransform(parseResult:ParseResult) {
|
}
|
||||||
parseResult.content.forEach(page => {
|
|
||||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
|
||||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
|
||||||
});
|
|
||||||
return parseResult;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -33,7 +33,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
|
|||||||
|
|
||||||
//TODO generic state machine code ?
|
//TODO generic state machine code ?
|
||||||
|
|
||||||
page.textItems.reduce((oneCharacterItems, item) => {
|
const leftOver = page.textItems.reduce((oneCharacterItems, item) => {
|
||||||
if (item.text.trim().length == 1) {
|
if (item.text.trim().length == 1) {
|
||||||
if (oneCharacterItems.length == 0) {
|
if (oneCharacterItems.length == 0) {
|
||||||
oneCharacterItems.push(item);
|
oneCharacterItems.push(item);
|
||||||
@ -80,6 +80,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
|
|||||||
}
|
}
|
||||||
return oneCharacterItems;
|
return oneCharacterItems;
|
||||||
}, []);
|
}, []);
|
||||||
|
leftOver.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
|
||||||
|
|
||||||
return {
|
return {
|
||||||
...page,
|
...page,
|
||||||
|
@ -24,3 +24,15 @@ export function minXFromTextItems(items:TextItem) {
|
|||||||
}
|
}
|
||||||
return minX;
|
return minX;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export function sortByX(items:TextItem) {
|
||||||
|
items.sort((a, b) => {
|
||||||
|
return a.x - b.x;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
export function sortCopyByX(items:TextItem) {
|
||||||
|
const copy = items.concat();
|
||||||
|
sortByX(copy);
|
||||||
|
return copy;
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user