mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-20 17:47:47 +02:00
[WIP] Compact Lines
* Almost every transformer first combines the lines, so we can make it an explicit one time transformation in the beginning
This commit is contained in:
parent
e2ddf0312b
commit
e2481bdd2a
@ -29,7 +29,7 @@ export default class PageView extends React.Component {
|
||||
const itemViews = this.createItemViews(items, showWhitespaces);
|
||||
const header = "Page " + (page.index + 1);
|
||||
content = <div>
|
||||
<h2>{ header }</h2>
|
||||
<h2 id={ header }>{ header }</h2>
|
||||
<hr/>
|
||||
{ itemViews }
|
||||
</div>
|
||||
|
@ -49,6 +49,10 @@ export default class TextItemTable extends React.Component {
|
||||
<div style={ { textAlign: 'center' } }>
|
||||
{ textItem.annotation ? textItem.annotation.category : '' }
|
||||
</div>
|
||||
<div style={ { textAlign: 'center', color: 'orange' } }>
|
||||
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
|
||||
{ textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }
|
||||
</div>
|
||||
</td>
|
||||
<td>
|
||||
{ showWhitespaces ? (
|
||||
|
@ -3,6 +3,8 @@ import { Enum } from 'enumify';
|
||||
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
|
||||
import CompactLines from './transformations/CompactLines.jsx';
|
||||
|
||||
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
|
||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||
import DetectTOC from './transformations/DetectTOC.jsx'
|
||||
@ -29,8 +31,10 @@ export default class AppState {
|
||||
this.pages = [];
|
||||
this.transformations = [
|
||||
new CalculateGlobalStats(),
|
||||
new CompactLines(),
|
||||
new RemoveRepetitiveElements(),
|
||||
new VerticalToHorizontal(),
|
||||
|
||||
new DetectPdfBlocks(),
|
||||
new DetectFootnotes(),
|
||||
new DetectTOC(),
|
||||
|
@ -11,3 +11,17 @@ export default class PageItem {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
export class ParsedElements {
|
||||
|
||||
constructor(options) {
|
||||
this.footnoteLinks = options.footnoteLinks;
|
||||
this.footnotes = options.footnotes;
|
||||
}
|
||||
|
||||
add(parsedElements:ParsedElements) {
|
||||
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
|
||||
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
|
||||
}
|
||||
|
||||
}
|
117
src/javascript/models/TextItemLineCompactor.jsx
Normal file
117
src/javascript/models/TextItemLineCompactor.jsx
Normal file
@ -0,0 +1,117 @@
|
||||
import TextItem from './TextItem.jsx';
|
||||
import { ParsedElements } from './PageItem.jsx';
|
||||
import { isNumber } from '../functions.jsx'
|
||||
import { sortByX } from '../textItemFunctions.jsx'
|
||||
|
||||
// Compact text items which have been grouped to a line (through TextItemLineGrouper) to a single TextItem doing inline transformations like
|
||||
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
|
||||
export default class TextItemLineCompactor {
|
||||
|
||||
constructor(options) {
|
||||
if (options) {
|
||||
this.transformEmphasis = options.transformEmphasis || true;
|
||||
}
|
||||
}
|
||||
|
||||
// returns a CombineResult
|
||||
compact(lineItems: TextItem[]) {
|
||||
if (lineItems.length < 2) {
|
||||
throw "Must be at least 2 line items, but was " + lineItems;
|
||||
}
|
||||
|
||||
// we can't trust order of occurence, esp. footnoteLinks like to come last
|
||||
sortByX(lineItems);
|
||||
|
||||
var combinedItem;
|
||||
const [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems);
|
||||
if (resolvedLineItems.length == 1) {
|
||||
combinedItem = resolvedLineItems[0];
|
||||
} else {
|
||||
var text = '';
|
||||
var maxHeight = 0;
|
||||
var widthSum = 0;
|
||||
var lastItem;
|
||||
resolvedLineItems.forEach(item => {
|
||||
if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
|
||||
const xDistance = item.x - lastItem.x - lastItem.width;
|
||||
if (xDistance >= 5) {
|
||||
text += ' ';
|
||||
}
|
||||
}
|
||||
text += item.text;
|
||||
widthSum += item.width;
|
||||
lastItem = item;
|
||||
maxHeight = Math.max(maxHeight, item.height);
|
||||
});
|
||||
combinedItem = new TextItem({
|
||||
...resolvedLineItems[0],
|
||||
text: text,
|
||||
height: maxHeight,
|
||||
width: widthSum
|
||||
});
|
||||
}
|
||||
combinedItem.parsedElements = parsedElements;
|
||||
|
||||
//TODO whitespace removal
|
||||
//TODO bold/emphasis
|
||||
|
||||
return combinedItem;
|
||||
}
|
||||
|
||||
resolveSpecialElements(lineItems) {
|
||||
const footnoteLinks = [];
|
||||
const footnotes = [];
|
||||
const basicY = lineItems[0].y;
|
||||
const newLineItems = [];
|
||||
var stashedNumberItems = [];
|
||||
|
||||
const commitStashedNumbers = (nextItem) => {
|
||||
if (stashedNumberItems.length > 0) {
|
||||
const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
|
||||
if (stashedNumberItems[0].y > basicY) { // footnote link
|
||||
newLineItems.push(new TextItem({
|
||||
...stashedNumberItems[0],
|
||||
//TODO make fomatting configurable
|
||||
// text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
|
||||
text: `^${joinedNumber}`
|
||||
}));
|
||||
footnoteLinks.push(parseInt(joinedNumber));
|
||||
} else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
|
||||
//TODO womb comp [29] => ydiff == 0
|
||||
newLineItems.push(new TextItem({
|
||||
...stashedNumberItems[0],
|
||||
text: `(^${ joinedNumber}):`
|
||||
}));
|
||||
footnotes.push(joinedNumber);
|
||||
} else {
|
||||
stashedNumberItems.forEach(number => newLineItems.push(number));
|
||||
}
|
||||
|
||||
stashedNumberItems = [];
|
||||
}
|
||||
};
|
||||
|
||||
lineItems.forEach(item => {
|
||||
if (newLineItems.length == 0 && item.text.trim().length == 0) {
|
||||
// skip whitespace on the beginning of a line
|
||||
} else {
|
||||
const isANumber = isNumber(item.text.trim());
|
||||
if (isANumber) {
|
||||
stashedNumberItems.push(item);
|
||||
} else {
|
||||
if (stashedNumberItems.length > 0) {
|
||||
commitStashedNumbers(item);
|
||||
}
|
||||
newLineItems.push(item);
|
||||
}
|
||||
}
|
||||
});
|
||||
commitStashedNumbers();
|
||||
|
||||
|
||||
return [newLineItems, new ParsedElements({
|
||||
footnoteLinks: footnoteLinks,
|
||||
footnotes: footnotes
|
||||
})];
|
||||
}
|
||||
}
|
36
src/javascript/models/TextItemLineGrouper.jsx
Normal file
36
src/javascript/models/TextItemLineGrouper.jsx
Normal file
@ -0,0 +1,36 @@
|
||||
import TextItem from './TextItem.jsx';
|
||||
import { sortByX } from '../textItemFunctions.jsx'
|
||||
|
||||
//Groups all text items which are on the same y line
|
||||
export default class TextItemLineGrouper {
|
||||
|
||||
constructor(options) {
|
||||
this.mostUsedDistance = options.mostUsedDistance || 12;
|
||||
}
|
||||
|
||||
// returns a CombineResult
|
||||
group(textItems: TextItem[]) {
|
||||
return this.groupItemsByLine(textItems);
|
||||
}
|
||||
|
||||
|
||||
groupItemsByLine(textItems:TextItem[]) {
|
||||
const lines = [];
|
||||
var currentLine = [];
|
||||
textItems.forEach(item => {
|
||||
if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) {
|
||||
lines.push(currentLine);
|
||||
currentLine = [];
|
||||
}
|
||||
currentLine.push(item);
|
||||
});
|
||||
lines.push(currentLine);
|
||||
|
||||
lines.forEach(lineItems => {
|
||||
// we can't trust order of occurence, esp. footnoteLinks like to come last
|
||||
sortByX(lineItems);
|
||||
});
|
||||
return lines;
|
||||
}
|
||||
|
||||
}
|
70
src/javascript/models/transformations/CompactLines.jsx
Normal file
70
src/javascript/models/transformations/CompactLines.jsx
Normal file
@ -0,0 +1,70 @@
|
||||
import React from 'react';
|
||||
|
||||
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
import TextItemLineGrouper from '../TextItemLineGrouper.jsx';
|
||||
import TextItemLineCompactor from '../TextItemLineCompactor.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
// gathers text items on the same y line to one text item
|
||||
export default class CompactLines extends ToTextItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Compact Lines");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {mostUsedDistance} = parseResult.globals;
|
||||
const foundFootnotes = [];
|
||||
const foundFootnoteLinks = [];
|
||||
const lineGrouper = new TextItemLineGrouper({
|
||||
mostUsedDistance: mostUsedDistance,
|
||||
});
|
||||
const lineCompactor = new TextItemLineCompactor();
|
||||
|
||||
parseResult.pages.forEach(page => {
|
||||
if (page.items.length > 0) {
|
||||
const newItems = [];
|
||||
const textItemsGroupedByLine = lineGrouper.group(page.items);
|
||||
textItemsGroupedByLine.forEach(textItemsOfLine => {
|
||||
if (textItemsOfLine.length == 1) {
|
||||
newItems.push(textItemsOfLine[0]);
|
||||
} else {
|
||||
textItemsOfLine.forEach(item => {
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
newItems.push(item);
|
||||
});
|
||||
|
||||
const combinedItem = lineCompactor.compact(textItemsOfLine);
|
||||
combinedItem.annotation = ADDED_ANNOTATION;
|
||||
newItems.push(combinedItem);
|
||||
|
||||
if (combinedItem.parsedElements.footnoteLinks.length > 0) {
|
||||
const footnoteLinks = combinedItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
|
||||
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
|
||||
}
|
||||
if (combinedItem.parsedElements.footnotes.length > 0) {
|
||||
const footnotes = combinedItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
||||
foundFootnotes.push.apply(foundFootnotes, footnotes);
|
||||
}
|
||||
}
|
||||
});
|
||||
page.items = newItems;
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: [
|
||||
// 'Detected ' + foundFootnoteLinks.length + ' footnote links: [' + foundFootnoteLinks.join(', ') + ']',
|
||||
//'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes.join(', ') + ']',
|
||||
// 'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes + ']',
|
||||
<span>Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }]</span>,
|
||||
<span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>,
|
||||
]
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user