[WIP] Compact Lines

* Almost every transformer first combines the lines, so we can make it an explicit one-time transformation at the beginning
This commit is contained in:
Johannes Zillmann 2017-03-10 08:49:40 +01:00
parent e2ddf0312b
commit e2481bdd2a
7 changed files with 247 additions and 2 deletions

View File

@ -29,7 +29,7 @@ export default class PageView extends React.Component {
const itemViews = this.createItemViews(items, showWhitespaces);
const header = "Page " + (page.index + 1);
content = <div>
<h2>{ header }</h2>
<h2 id={ header }>{ header }</h2>
<hr/>
{ itemViews }
</div>

View File

@ -49,6 +49,10 @@ export default class TextItemTable extends React.Component {
<div style={ { textAlign: 'center' } }>
{ textItem.annotation ? textItem.annotation.category : '' }
</div>
<div style={ { textAlign: 'center', color: 'orange' } }>
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
{ textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }
</div>
</td>
<td>
{ showWhitespaces ? (
@ -87,6 +91,6 @@ export default class TextItemTable extends React.Component {
{ textItemRows }
</tbody>
</Table>
);
);
}
}

View File

@ -3,6 +3,8 @@ import { Enum } from 'enumify';
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
import CompactLines from './transformations/CompactLines.jsx';
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import DetectTOC from './transformations/DetectTOC.jsx'
@ -29,8 +31,10 @@ export default class AppState {
this.pages = [];
this.transformations = [
new CalculateGlobalStats(),
new CompactLines(),
new RemoveRepetitiveElements(),
new VerticalToHorizontal(),
new DetectPdfBlocks(),
new DetectFootnotes(),
new DetectTOC(),

View File

@ -11,3 +11,17 @@ export default class PageItem {
}
}
// Holds the special elements found while parsing a text item:
// its footnote links and its footnotes.
export class ParsedElements {

    // options: object providing the initial `footnoteLinks` and `footnotes` lists
    constructor(options) {
        const { footnoteLinks, footnotes } = options;
        this.footnoteLinks = footnoteLinks;
        this.footnotes = footnotes;
    }

    // Appends the footnote links and footnotes of another ParsedElements to this one.
    // New arrays are assigned; the previously held arrays are not modified.
    add(parsedElements:ParsedElements) {
        const { footnoteLinks: otherLinks, footnotes: otherNotes } = parsedElements;
        const mergedLinks = this.footnoteLinks.concat(otherLinks);
        const mergedNotes = this.footnotes.concat(otherNotes);
        this.footnoteLinks = mergedLinks;
        this.footnotes = mergedNotes;
    }

}

View File

@ -0,0 +1,117 @@
import TextItem from './TextItem.jsx';
import { ParsedElements } from './PageItem.jsx';
import { isNumber } from '../functions.jsx'
import { sortByX } from '../textItemFunctions.jsx'
// Compacts the text items of one line (as grouped by TextItemLineGrouper) into a single TextItem,
// doing inline transformations such as footnote / footnote-link detection.
// Whitespace removal and bold/emphasis annotation are still TODO.
export default class TextItemLineCompactor {

    // options (optional):
    //   transformEmphasis - defaults to true; only an explicitly passed value overrides it.
    //                       The flag is only stored here, nothing in this class reads it yet.
    constructor(options) {
        // `options.transformEmphasis || true` would always evaluate to true, so use a destructuring
        // default instead (it applies only when the value is undefined).
        const { transformEmphasis = true } = options || {};
        this.transformEmphasis = transformEmphasis;
    }

    // Combines the items of one line into a single, newly created TextItem.
    //
    // lineItems: the TextItems of one line (at least 2). The array is handed to sortByX before
    //            processing (presumably sorted in place - the return value is not used).
    // returns:   a new TextItem based on the first resolved item, carrying the joined text, the
    //            maximum height, the summed width and a `parsedElements` property.
    // throws:    Error if fewer than 2 items are passed.
    compact(lineItems: TextItem[]) {
        if (lineItems.length < 2) {
            throw new Error("Must be at least 2 line items, but was " + lineItems.length);
        }

        // we can't trust order of occurrence, esp. footnoteLinks like to come last
        sortByX(lineItems);

        const [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems);
        let combinedItem;
        if (resolvedLineItems.length === 1) {
            // Copy instead of returning the input object itself: callers keep the original items
            // around (with their own annotation) next to the combined one.
            combinedItem = new TextItem({
                ...resolvedLineItems[0]
            });
        } else {
            let text = '';
            let maxHeight = 0;
            let widthSum = 0;
            let lastItem;
            resolvedLineItems.forEach(item => {
                // insert a space between neighbours when neither side has one and the gap is wide enough
                if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
                    const xDistance = item.x - lastItem.x - lastItem.width;
                    if (xDistance >= 5) {
                        text += ' ';
                    }
                }
                text += item.text;
                widthSum += item.width;
                lastItem = item;
                maxHeight = Math.max(maxHeight, item.height);
            });
            combinedItem = new TextItem({
                // a line consisting only of whitespace items resolves to nothing; fall back to the
                // first raw item so the (empty) combined item still carries the line's properties
                ...(resolvedLineItems[0] || lineItems[0]),
                text: text,
                height: maxHeight,
                width: widthSum
            });
        }
        combinedItem.parsedElements = parsedElements;

        //TODO whitespace removal
        //TODO bold/emphasis
        return combinedItem;
    }

    // Walks over the (x-sorted) items of a line, drops leading whitespace-only items and rewrites
    // runs of number items into footnote links / footnotes where their y position indicates so.
    //
    // returns: [newLineItems, parsedElements] - the resulting items and a ParsedElements holding
    //          the detected footnote links (as integers) and footnotes (as joined text).
    resolveSpecialElements(lineItems) {
        const footnoteLinks = [];
        const footnotes = [];
        const basicY = lineItems[0].y;
        const newLineItems = [];
        let stashedNumberItems = [];

        // Flushes the currently stashed run of number items. `nextItem` is the non-number item
        // following the run, or undefined at the end of the line.
        const commitStashedNumbers = (nextItem) => {
            if (stashedNumberItems.length === 0) {
                return;
            }
            const firstNumberItem = stashedNumberItems[0];
            const joinedNumber = stashedNumberItems.map(numberItem => numberItem.text).join('');
            if (firstNumberItem.y > basicY) { // footnote link
                newLineItems.push(new TextItem({
                    ...firstNumberItem,
                    //TODO make formatting configurable
                    // text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
                    text: `^${joinedNumber}`
                }));
                footnoteLinks.push(parseInt(joinedNumber, 10));
            } else if (nextItem && nextItem.y < firstNumberItem.y) { // footnote
                //TODO womb comp [29] => ydiff == 0
                newLineItems.push(new TextItem({
                    ...firstNumberItem,
                    text: `(^${ joinedNumber}):`
                }));
                // NOTE: kept as the joined text, whereas footnote links are stored as integers
                footnotes.push(joinedNumber);
            } else {
                // plain numbers: keep the original items untouched
                stashedNumberItems.forEach(numberItem => newLineItems.push(numberItem));
            }
            stashedNumberItems = [];
        };

        lineItems.forEach(item => {
            if (newLineItems.length === 0 && item.text.trim().length === 0) {
                // skip whitespace at the beginning of a line
                return;
            }
            if (isNumber(item.text.trim())) {
                stashedNumberItems.push(item);
            } else {
                commitStashedNumbers(item);
                newLineItems.push(item);
            }
        });
        commitStashedNumbers();

        return [newLineItems, new ParsedElements({
            footnoteLinks: footnoteLinks,
            footnotes: footnotes
        })];
    }

}

View File

@ -0,0 +1,36 @@
import TextItem from './TextItem.jsx';
import { sortByX } from '../textItemFunctions.jsx'
// Groups all text items which are on the same y line.
export default class TextItemLineGrouper {

    // options (optional):
    //   mostUsedDistance - falls back to 12 when missing or falsy. Half of this value is the
    //                      y difference from which on an item starts a new line.
    constructor(options) {
        this.mostUsedDistance = (options && options.mostUsedDistance) || 12;
    }

    // returns an array of lines, each line being an array of the TextItems on it
    group(textItems: TextItem[]) {
        return this.groupItemsByLine(textItems);
    }

    // Splits the items (in their given order) into lines: an item whose y differs from the first
    // item of the current line by at least mostUsedDistance / 2 opens a new line.
    // Each resulting line is then handed to sortByX. Empty input yields an empty array.
    groupItemsByLine(textItems:TextItem[]) {
        const lines = [];
        let currentLine = [];
        textItems.forEach(item => {
            if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) {
                lines.push(currentLine);
                currentLine = [];
            }
            currentLine.push(item);
        });
        // only flush a line which actually has items, so empty input does not produce a phantom line
        if (currentLine.length > 0) {
            lines.push(currentLine);
        }

        lines.forEach(lineItems => {
            // we can't trust order of occurrence, esp. footnoteLinks like to come last
            sortByX(lineItems);
        });
        return lines;
    }

}

View File

@ -0,0 +1,70 @@
import React from 'react';
import ToTextItemTransformation from './ToTextItemTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItemLineGrouper from '../TextItemLineGrouper.jsx';
import TextItemLineCompactor from '../TextItemLineCompactor.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
// Gathers the text items which are on the same y line into one text item.
export default class CompactLines extends ToTextItemTransformation {

    constructor() {
        super("Compact Lines");
    }

    // Groups the items of every page by line (TextItemLineGrouper) and compacts each line with
    // more than one item (TextItemLineCompactor).
    //
    // For a compacted line the original items stay in the page, annotated with REMOVED_ANNOTATION,
    // followed by the combined item annotated with ADDED_ANNOTATION. `page.items` of the given
    // parseResult is replaced in place.
    //
    // returns: a new ParseResult built from the given one, with messages listing the detected
    //          footnote links and footnotes (each rendered as a link to its page).
    transform(parseResult:ParseResult) {
        const {mostUsedDistance} = parseResult.globals;
        const foundFootnotes = [];
        const foundFootnoteLinks = [];
        const lineGrouper = new TextItemLineGrouper({
            mostUsedDistance: mostUsedDistance,
        });
        const lineCompactor = new TextItemLineCompactor();

        // Appends one link element per value to `target`, pointing at the anchor of the given page
        // ("Page <n>" is the id PageView puts on the page header). The key is the position in
        // `target`, which stays unique even when the same number shows up on several pages.
        const collectPageLinks = (target, values, page) => {
            const pageAnchor = "#Page " + (page.index + 1);
            values.forEach(value => {
                target.push(<span key={ target.length }><a href={ pageAnchor }>{ value }</a>,</span>);
            });
        };

        parseResult.pages.forEach(page => {
            if (page.items.length > 0) {
                const newItems = [];
                const textItemsGroupedByLine = lineGrouper.group(page.items);
                textItemsGroupedByLine.forEach(textItemsOfLine => {
                    if (textItemsOfLine.length === 1) {
                        // nothing to compact
                        newItems.push(textItemsOfLine[0]);
                    } else {
                        textItemsOfLine.forEach(item => {
                            item.annotation = REMOVED_ANNOTATION;
                            newItems.push(item);
                        });

                        const combinedItem = lineCompactor.compact(textItemsOfLine);
                        combinedItem.annotation = ADDED_ANNOTATION;
                        newItems.push(combinedItem);

                        const {footnoteLinks, footnotes} = combinedItem.parsedElements;
                        collectPageLinks(foundFootnoteLinks, footnoteLinks, page);
                        collectPageLinks(foundFootnotes, footnotes, page);
                    }
                });
                page.items = newItems;
            }
        });

        return new ParseResult({
            ...parseResult,
            messages: [
                <span>Detected { foundFootnoteLinks.length } footnote links: [{ foundFootnoteLinks }]</span>,
                <span>Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }]</span>,
            ]
        });
    }

}