From e2481bdd2a7a34effda9902f673e0e3ba42c77a6 Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Fri, 10 Mar 2017 08:49:40 +0100 Subject: [PATCH] [WIP] Compact Lines * Almost every transformer first combines the lines, so we can make it an explicit one time transformation in the beginning --- src/javascript/components/debug/PageView.jsx | 2 +- .../components/debug/TextItemTable.jsx | 6 +- src/javascript/models/AppState.jsx | 4 + src/javascript/models/PageItem.jsx | 14 +++ .../models/TextItemLineCompactor.jsx | 117 ++++++++++++++++++ src/javascript/models/TextItemLineGrouper.jsx | 36 ++++++ .../models/transformations/CompactLines.jsx | 70 +++++++++++ 7 files changed, 247 insertions(+), 2 deletions(-) create mode 100644 src/javascript/models/TextItemLineCompactor.jsx create mode 100644 src/javascript/models/TextItemLineGrouper.jsx create mode 100644 src/javascript/models/transformations/CompactLines.jsx diff --git a/src/javascript/components/debug/PageView.jsx b/src/javascript/components/debug/PageView.jsx index b79f13c..2a0b042 100644 --- a/src/javascript/components/debug/PageView.jsx +++ b/src/javascript/components/debug/PageView.jsx @@ -29,7 +29,7 @@ export default class PageView extends React.Component { const itemViews = this.createItemViews(items, showWhitespaces); const header = "Page " + (page.index + 1); content =
-

{ header }

+

{ header }


{ itemViews }
diff --git a/src/javascript/components/debug/TextItemTable.jsx b/src/javascript/components/debug/TextItemTable.jsx index 7ac6e60..12d7b79 100644 --- a/src/javascript/components/debug/TextItemTable.jsx +++ b/src/javascript/components/debug/TextItemTable.jsx @@ -49,6 +49,10 @@ export default class TextItemTable extends React.Component {
{ textItem.annotation ? textItem.annotation.category : '' }
+
+ { textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' } + { textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' } +
{ showWhitespaces ? ( @@ -87,6 +91,6 @@ export default class TextItemTable extends React.Component { { textItemRows } - ); + ); } } \ No newline at end of file diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index b4fea84..36c227a 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -3,6 +3,8 @@ import { Enum } from 'enumify'; import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx'; import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx' import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx'; +import CompactLines from './transformations/CompactLines.jsx'; + import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx' import DetectFootnotes from './transformations/DetectFootnotes.jsx' import DetectTOC from './transformations/DetectTOC.jsx' @@ -29,8 +31,10 @@ export default class AppState { this.pages = []; this.transformations = [ new CalculateGlobalStats(), + new CompactLines(), new RemoveRepetitiveElements(), new VerticalToHorizontal(), + new DetectPdfBlocks(), new DetectFootnotes(), new DetectTOC(),
diff --git a/src/javascript/models/PageItem.jsx b/src/javascript/models/PageItem.jsx
index cb01549..77cfe55 100644
--- a/src/javascript/models/PageItem.jsx
+++ b/src/javascript/models/PageItem.jsx
@@ -11,3 +11,20 @@ export default class PageItem {
     }
 
 }
+
+// Holds the footnote links and footnotes which have been parsed out of a text item
+export class ParsedElements {
+
+    // options.footnoteLinks / options.footnotes are optional arrays, a missing one is treated as empty
+    constructor(options) {
+        this.footnoteLinks = (options && options.footnoteLinks) || [];
+        this.footnotes = (options && options.footnotes) || [];
+    }
+
+    // appends the elements of the passed ParsedElements to the ones of this instance
+    add(parsedElements:ParsedElements) {
+        this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
+        this.footnotes = this.footnotes.concat(parsedElements.footnotes);
+    }
+
+}
\ No newline at end of file
diff --git a/src/javascript/models/TextItemLineCompactor.jsx b/src/javascript/models/TextItemLineCompactor.jsx
new file mode 100644
index 0000000..74c69b8
--- /dev/null
+++ 
b/src/javascript/models/TextItemLineCompactor.jsx
@@ -0,0 +1,123 @@
+import TextItem from './TextItem.jsx';
+import { ParsedElements } from './PageItem.jsx';
+import { isNumber } from '../functions.jsx'
+import { sortByX } from '../textItemFunctions.jsx'
+
+// Compacts text items which have been grouped to a line (through TextItemLineGrouper) into a single TextItem,
+// doing inline transformations like 'whitespace removal', bold/emphasis annotation, link-detection, etc.
+export default class TextItemLineCompactor {
+
+    // options.transformEmphasis is optional and defaults to true. The flag is only stored, compact() does not read it yet.
+    constructor(options) {
+        // `options.transformEmphasis || true` would turn an explicit false into true, so test for undefined instead
+        this.transformEmphasis = options && options.transformEmphasis !== undefined ? options.transformEmphasis : true;
+    }
+
+    // Merges the items of one line into a single, newly created TextItem which has `parsedElements` attached.
+    // Sorts the passed array in place (sortByX) and throws an Error when fewer than 2 items are given.
+    compact(lineItems: TextItem[]) {
+        if (lineItems.length < 2) {
+            throw new Error("Must be at least 2 line items, but was " + lineItems.length);
+        }
+
+        // we can't trust order of occurrence, esp. footnoteLinks like to come last
+        sortByX(lineItems);
+
+        var combinedItem;
+        const [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems);
+        if (resolvedLineItems.length == 1) {
+            // copy the item, so that the returned TextItem is never one of the input items
+            combinedItem = new TextItem({ ...resolvedLineItems[0] });
+        } else {
+            var text = '';
+            var maxHeight = 0;
+            var widthSum = 0;
+            var lastItem;
+            resolvedLineItems.forEach(item => {
+                if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
+                    // a horizontal gap of 5 or more between two items is rendered as a single space
+                    const xDistance = item.x - lastItem.x - lastItem.width;
+                    if (xDistance >= 5) {
+                        text += ' ';
+                    }
+                }
+                text += item.text;
+                widthSum += item.width;
+                lastItem = item;
+                maxHeight = Math.max(maxHeight, item.height);
+            });
+            // all properties which are not overridden here are taken over from the first item of the line
+            combinedItem = new TextItem({
+                ...resolvedLineItems[0],
+                text: text,
+                height: maxHeight,
+                width: widthSum
+            });
+        }
+        combinedItem.parsedElements = parsedElements;
+
+        //TODO whitespace removal
+        //TODO bold/emphasis
+
+        return combinedItem;
+    }
+
+    // Replaces runs of consecutive number items by a footnote-link or footnote marker item, depending on their y position.
+    // Returns [newLineItems, parsedElements]; leading whitespace-only items are dropped from newLineItems.
+    resolveSpecialElements(lineItems) {
+        const footnoteLinks = [];
+        const footnotes = [];
+        const basicY = lineItems[0].y;
+        const newLineItems = [];
+        var stashedNumberItems = [];
+
+        const commitStashedNumbers = (nextItem) => {
+            if (stashedNumberItems.length > 0) {
+                const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
+                if (stashedNumberItems[0].y > basicY) { // footnote link
+                    newLineItems.push(new TextItem({
+                        ...stashedNumberItems[0],
+                        //TODO make formatting configurable
+                        // text: `[${joinedNumber}](#${joinedNumber})`
+                        text: `^${joinedNumber}`
+                    }));
+                    footnoteLinks.push(parseInt(joinedNumber, 10));
+                } else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
+                    //TODO womb comp [29] => ydiff == 0
+                    newLineItems.push(new TextItem({
+                        ...stashedNumberItems[0],
+                        text: `(^${ joinedNumber}):`
+                    }));
+                    footnotes.push(joinedNumber);
+                } else {
+                    stashedNumberItems.forEach(number => newLineItems.push(number));
+                }
+
+                stashedNumberItems = [];
+            }
+        };
+
+        lineItems.forEach(item => {
+            if (newLineItems.length == 0 && item.text.trim().length == 0) {
+                // skip whitespace on the beginning of a line
+            } else {
+                const isANumber = isNumber(item.text.trim());
+                if (isANumber) {
+                    stashedNumberItems.push(item);
+                } else {
+                    if (stashedNumberItems.length > 0) {
+                        commitStashedNumbers(item);
+                    }
+                    newLineItems.push(item);
+                }
+            }
+        });
+        commitStashedNumbers();
+
+
+        return [newLineItems, new ParsedElements({
+            footnoteLinks: footnoteLinks,
+            footnotes: footnotes
+        })];
+    }
+}
diff --git a/src/javascript/models/TextItemLineGrouper.jsx b/src/javascript/models/TextItemLineGrouper.jsx
new file mode 100644
index 0000000..54264b5
--- /dev/null
+++ b/src/javascript/models/TextItemLineGrouper.jsx
@@ -0,0 +1,42 @@
+import TextItem from './TextItem.jsx';
+import { sortByX } from '../textItemFunctions.jsx'
+
+//Groups all text items which are on the same y line
+export default class TextItemLineGrouper {
+
+    // options.mostUsedDistance is optional, 12 is used when it is missing or falsy
+    constructor(options) {
+        this.mostUsedDistance = (options && options.mostUsedDistance) || 12;
+    }
+
+    // returns an array of lines, each line being an array of text items which has been passed through sortByX
+    group(textItems: TextItem[]) {
+        return this.groupItemsByLine(textItems);
+    }
+
+
+    // Starts a new line whenever the y of an item differs from the y of the current line's first item
+    // by at least half of mostUsedDistance. Returns an empty array when no items are given.
+    groupItemsByLine(textItems:TextItem[]) {
+        const lines = [];
+        var currentLine = [];
+        textItems.forEach(item => {
+            if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) {
+                lines.push(currentLine);
+                currentLine = [];
+            }
+            currentLine.push(item);
+        });
+        // without this guard an empty input would produce one empty line
+        if (currentLine.length > 0) {
+            lines.push(currentLine);
+        }
+
+        lines.forEach(lineItems => {
+            // we can't trust order of occurrence, esp. footnoteLinks like to come last
+            sortByX(lineItems);
+        });
+        return lines;
+    }
+
+}
diff --git a/src/javascript/models/transformations/CompactLines.jsx b/src/javascript/models/transformations/CompactLines.jsx
new file mode 100644
index 0000000..0c55805
--- /dev/null
+++ b/src/javascript/models/transformations/CompactLines.jsx
@@ -0,0 +1,65 @@
+import React from 'react';
+
+import ToTextItemTransformation from './ToTextItemTransformation.jsx';
+import ParseResult from '../ParseResult.jsx';
+import TextItemLineGrouper from '../TextItemLineGrouper.jsx';
+import TextItemLineCompactor from '../TextItemLineCompactor.jsx';
+import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
+
+// gathers text items on the same y line to one text item
+export default class CompactLines extends ToTextItemTransformation {
+
+    constructor() {
+        super("Compact Lines");
+    }
+
+    // Replaces the items of every page: single-item lines are kept as they are, the items of multi-item lines are
+    // annotated as removed and followed by their combined item, which is annotated as added.
+    // Mutates the pages of the passed parseResult and returns a new ParseResult carrying two plain-string messages.
+    transform(parseResult:ParseResult) {
+        const {mostUsedDistance} = parseResult.globals;
+        const foundFootnotes = [];
+        const foundFootnoteLinks = [];
+        const lineGrouper = new TextItemLineGrouper({
+            mostUsedDistance: mostUsedDistance,
+        });
+        const lineCompactor = new TextItemLineCompactor();
+
+        parseResult.pages.forEach(page => {
+            if (page.items.length > 0) {
+                const newItems = [];
+                const textItemsGroupedByLine = lineGrouper.group(page.items);
+                textItemsGroupedByLine.forEach(textItemsOfLine => {
+                    if (textItemsOfLine.length == 1) {
+                        newItems.push(textItemsOfLine[0]);
+                    } else {
+                        textItemsOfLine.forEach(item => {
+                            item.annotation = REMOVED_ANNOTATION;
+                            newItems.push(item);
+                        });
+
+                        const combinedItem = lineCompactor.compact(textItemsOfLine);
+                        combinedItem.annotation = ADDED_ANNOTATION;
+                        newItems.push(combinedItem);
+
+                        // collect the raw values, they are joined into the messages below
+                        foundFootnoteLinks.push(...combinedItem.parsedElements.footnoteLinks);
+                        foundFootnotes.push(...combinedItem.parsedElements.footnotes);
+                    }
+                });
+                page.items = newItems;
+            }
+        });
+
+
+        return new ParseResult({
+            ...parseResult,
+            messages: [
+                'Detected ' + foundFootnoteLinks.length + ' footnote links: [' + foundFootnoteLinks.join(', ') + ']',
+                'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes.join(', ') + ']',
+            ]
+        });
+    }
+
+
+}