mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-20 17:47:47 +02:00
[WIP] Compact Lines
* Almost every transformer first combines the lines, so we can make it an explicit one time transformation in the beginning
This commit is contained in:
parent
e2ddf0312b
commit
e2481bdd2a
@ -29,7 +29,7 @@ export default class PageView extends React.Component {
|
|||||||
const itemViews = this.createItemViews(items, showWhitespaces);
|
const itemViews = this.createItemViews(items, showWhitespaces);
|
||||||
const header = "Page " + (page.index + 1);
|
const header = "Page " + (page.index + 1);
|
||||||
content = <div>
|
content = <div>
|
||||||
<h2>{ header }</h2>
|
<h2 id={ header }>{ header }</h2>
|
||||||
<hr/>
|
<hr/>
|
||||||
{ itemViews }
|
{ itemViews }
|
||||||
</div>
|
</div>
|
||||||
|
@ -49,6 +49,10 @@ export default class TextItemTable extends React.Component {
|
|||||||
<div style={ { textAlign: 'center' } }>
|
<div style={ { textAlign: 'center' } }>
|
||||||
{ textItem.annotation ? textItem.annotation.category : '' }
|
{ textItem.annotation ? textItem.annotation.category : '' }
|
||||||
</div>
|
</div>
|
||||||
|
<div style={ { textAlign: 'center', color: 'orange' } }>
|
||||||
|
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
|
||||||
|
{ textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }
|
||||||
|
</div>
|
||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
{ showWhitespaces ? (
|
{ showWhitespaces ? (
|
||||||
@ -87,6 +91,6 @@ export default class TextItemTable extends React.Component {
|
|||||||
{ textItemRows }
|
{ textItemRows }
|
||||||
</tbody>
|
</tbody>
|
||||||
</Table>
|
</Table>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -3,6 +3,8 @@ import { Enum } from 'enumify';
|
|||||||
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
||||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||||
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
||||||
|
import CompactLines from './transformations/CompactLines.jsx';
|
||||||
|
|
||||||
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
|
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
|
||||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||||
import DetectTOC from './transformations/DetectTOC.jsx'
|
import DetectTOC from './transformations/DetectTOC.jsx'
|
||||||
@ -29,8 +31,10 @@ export default class AppState {
|
|||||||
this.pages = [];
|
this.pages = [];
|
||||||
this.transformations = [
|
this.transformations = [
|
||||||
new CalculateGlobalStats(),
|
new CalculateGlobalStats(),
|
||||||
|
new CompactLines(),
|
||||||
new RemoveRepetitiveElements(),
|
new RemoveRepetitiveElements(),
|
||||||
new VerticalToHorizontal(),
|
new VerticalToHorizontal(),
|
||||||
|
|
||||||
new DetectPdfBlocks(),
|
new DetectPdfBlocks(),
|
||||||
new DetectFootnotes(),
|
new DetectFootnotes(),
|
||||||
new DetectTOC(),
|
new DetectTOC(),
|
||||||
|
@ -11,3 +11,17 @@ export default class PageItem {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export class ParsedElements {
|
||||||
|
|
||||||
|
constructor(options) {
|
||||||
|
this.footnoteLinks = options.footnoteLinks;
|
||||||
|
this.footnotes = options.footnotes;
|
||||||
|
}
|
||||||
|
|
||||||
|
add(parsedElements:ParsedElements) {
|
||||||
|
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
|
||||||
|
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
117
src/javascript/models/TextItemLineCompactor.jsx
Normal file
117
src/javascript/models/TextItemLineCompactor.jsx
Normal file
@ -0,0 +1,117 @@
|
|||||||
|
import TextItem from './TextItem.jsx';
|
||||||
|
import { ParsedElements } from './PageItem.jsx';
|
||||||
|
import { isNumber } from '../functions.jsx'
|
||||||
|
import { sortByX } from '../textItemFunctions.jsx'
|
||||||
|
|
||||||
|
// Compact text items which have been grouped to a line (through TextItemLineCompactor) to a single TextItem doing inline transformations like
|
||||||
|
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
||||||
|
export default class TextItemLineCompactor {
|
||||||
|
|
||||||
|
constructor(options) {
|
||||||
|
if (options) {
|
||||||
|
this.transformEmphasis = options.transformEmphasis || true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// returns a CombineResult
|
||||||
|
compact(lineItems: TextItem[]) {
|
||||||
|
if (lineItems.length < 2) {
|
||||||
|
throw "Must be at least 2 line items, but was " + lineItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
// we can't trust order of occurence, esp. footnoteLinks like to come last
|
||||||
|
sortByX(lineItems);
|
||||||
|
|
||||||
|
var combinedItem;
|
||||||
|
const [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems);
|
||||||
|
if (resolvedLineItems.length == 1) {
|
||||||
|
combinedItem = resolvedLineItems[0];
|
||||||
|
} else {
|
||||||
|
var text = '';
|
||||||
|
var maxHeight = 0;
|
||||||
|
var widthSum = 0;
|
||||||
|
var lastItem;
|
||||||
|
resolvedLineItems.forEach(item => {
|
||||||
|
if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
|
||||||
|
const xDistance = item.x - lastItem.x - lastItem.width;
|
||||||
|
if (xDistance >= 5) {
|
||||||
|
text += ' ';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
text += item.text;
|
||||||
|
widthSum += item.width;
|
||||||
|
lastItem = item;
|
||||||
|
maxHeight = Math.max(maxHeight, item.height);
|
||||||
|
});
|
||||||
|
combinedItem = new TextItem({
|
||||||
|
...resolvedLineItems[0],
|
||||||
|
text: text,
|
||||||
|
height: maxHeight,
|
||||||
|
width: widthSum
|
||||||
|
});
|
||||||
|
}
|
||||||
|
combinedItem.parsedElements = parsedElements;
|
||||||
|
|
||||||
|
//TODO whitespace removal
|
||||||
|
//TODO bold/emphasis
|
||||||
|
|
||||||
|
return combinedItem;
|
||||||
|
}
|
||||||
|
|
||||||
|
resolveSpecialElements(lineItems) {
|
||||||
|
const footnoteLinks = [];
|
||||||
|
const footnotes = [];
|
||||||
|
const basicY = lineItems[0].y;
|
||||||
|
const newLineItems = [];
|
||||||
|
var stashedNumberItems = [];
|
||||||
|
|
||||||
|
const commitStashedNumbers = (nextItem) => {
|
||||||
|
if (stashedNumberItems.length > 0) {
|
||||||
|
const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
|
||||||
|
if (stashedNumberItems[0].y > basicY) { // footnote link
|
||||||
|
newLineItems.push(new TextItem({
|
||||||
|
...stashedNumberItems[0],
|
||||||
|
//TODO make fomatting configurable
|
||||||
|
// text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
|
||||||
|
text: `^${joinedNumber}`
|
||||||
|
}));
|
||||||
|
footnoteLinks.push(parseInt(joinedNumber));
|
||||||
|
} else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
|
||||||
|
//TODO womb comp [29] => ydiff == 0
|
||||||
|
newLineItems.push(new TextItem({
|
||||||
|
...stashedNumberItems[0],
|
||||||
|
text: `(^${ joinedNumber}):`
|
||||||
|
}));
|
||||||
|
footnotes.push(joinedNumber);
|
||||||
|
} else {
|
||||||
|
stashedNumberItems.forEach(number => newLineItems.push(number));
|
||||||
|
}
|
||||||
|
|
||||||
|
stashedNumberItems = [];
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
lineItems.forEach(item => {
|
||||||
|
if (newLineItems.length == 0 && item.text.trim().length == 0) {
|
||||||
|
// skip whitespace on the beginning of a line
|
||||||
|
} else {
|
||||||
|
const isANumber = isNumber(item.text.trim());
|
||||||
|
if (isANumber) {
|
||||||
|
stashedNumberItems.push(item);
|
||||||
|
} else {
|
||||||
|
if (stashedNumberItems.length > 0) {
|
||||||
|
commitStashedNumbers(item);
|
||||||
|
}
|
||||||
|
newLineItems.push(item);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
commitStashedNumbers();
|
||||||
|
|
||||||
|
|
||||||
|
return [newLineItems, new ParsedElements({
|
||||||
|
footnoteLinks: footnoteLinks,
|
||||||
|
footnotes: footnotes
|
||||||
|
})];
|
||||||
|
}
|
||||||
|
}
|
36
src/javascript/models/TextItemLineGrouper.jsx
Normal file
36
src/javascript/models/TextItemLineGrouper.jsx
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
import TextItem from './TextItem.jsx';
|
||||||
|
import { sortByX } from '../textItemFunctions.jsx'
|
||||||
|
|
||||||
|
//Groups all text items which are on the same y line
|
||||||
|
export default class TextItemLineGrouper {
|
||||||
|
|
||||||
|
constructor(options) {
|
||||||
|
this.mostUsedDistance = options.mostUsedDistance || 12;
|
||||||
|
}
|
||||||
|
|
||||||
|
// returns a CombineResult
|
||||||
|
group(textItems: TextItem[]) {
|
||||||
|
return this.groupItemsByLine(textItems);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
groupItemsByLine(textItems:TextItem[]) {
|
||||||
|
const lines = [];
|
||||||
|
var currentLine = [];
|
||||||
|
textItems.forEach(item => {
|
||||||
|
if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) {
|
||||||
|
lines.push(currentLine);
|
||||||
|
currentLine = [];
|
||||||
|
}
|
||||||
|
currentLine.push(item);
|
||||||
|
});
|
||||||
|
lines.push(currentLine);
|
||||||
|
|
||||||
|
lines.forEach(lineItems => {
|
||||||
|
// we can't trust order of occurence, esp. footnoteLinks like to come last
|
||||||
|
sortByX(lineItems);
|
||||||
|
});
|
||||||
|
return lines;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
70
src/javascript/models/transformations/CompactLines.jsx
Normal file
70
src/javascript/models/transformations/CompactLines.jsx
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
import React from 'react';
|
||||||
|
|
||||||
|
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
|
||||||
|
import ParseResult from '../ParseResult.jsx';
|
||||||
|
import TextItemLineGrouper from '../TextItemLineGrouper.jsx';
|
||||||
|
import TextItemLineCompactor from '../TextItemLineCompactor.jsx';
|
||||||
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
|
// gathers text items on the same y line to one text item
|
||||||
|
export default class CompactLines extends ToTextItemTransformation {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super("Compact Lines");
|
||||||
|
}
|
||||||
|
|
||||||
|
transform(parseResult:ParseResult) {
|
||||||
|
const {mostUsedDistance} = parseResult.globals;
|
||||||
|
const foundFootnotes = [];
|
||||||
|
const foundFootnoteLinks = [];
|
||||||
|
const lineGrouper = new TextItemLineGrouper({
|
||||||
|
mostUsedDistance: mostUsedDistance,
|
||||||
|
});
|
||||||
|
const lineCompactor = new TextItemLineCompactor();
|
||||||
|
|
||||||
|
parseResult.pages.forEach(page => {
|
||||||
|
if (page.items.length > 0) {
|
||||||
|
const newItems = [];
|
||||||
|
const textItemsGroupedByLine = lineGrouper.group(page.items);
|
||||||
|
textItemsGroupedByLine.forEach(textItemsOfLine => {
|
||||||
|
if (textItemsOfLine.length == 1) {
|
||||||
|
newItems.push(textItemsOfLine[0]);
|
||||||
|
} else {
|
||||||
|
textItemsOfLine.forEach(item => {
|
||||||
|
item.annotation = REMOVED_ANNOTATION;
|
||||||
|
newItems.push(item);
|
||||||
|
});
|
||||||
|
|
||||||
|
const combinedItem = lineCompactor.compact(textItemsOfLine);
|
||||||
|
combinedItem.annotation = ADDED_ANNOTATION;
|
||||||
|
newItems.push(combinedItem);
|
||||||
|
|
||||||
|
if (combinedItem.parsedElements.footnoteLinks.length > 0) {
|
||||||
|
const footnoteLinks = combinedItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
|
||||||
|
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
|
||||||
|
}
|
||||||
|
if (combinedItem.parsedElements.footnotes.length > 0) {
|
||||||
|
const footnotes = combinedItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
||||||
|
foundFootnotes.push.apply(foundFootnotes, footnotes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
page.items = newItems;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
return new ParseResult({
|
||||||
|
...parseResult,
|
||||||
|
messages: [
|
||||||
|
// 'Detected ' + foundFootnoteLinks.length + ' footnote links: [' + foundFootnoteLinks.join(', ') + ']',
|
||||||
|
//'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes.join(', ') + ']',
|
||||||
|
// 'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes + ']',
|
||||||
|
<span>Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }]</span>,
|
||||||
|
<span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>,
|
||||||
|
]
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user