diff --git a/src/javascript/components/debug/PageView.jsx b/src/javascript/components/debug/PageView.jsx
index b79f13c..2a0b042 100644
--- a/src/javascript/components/debug/PageView.jsx
+++ b/src/javascript/components/debug/PageView.jsx
@@ -29,7 +29,7 @@ export default class PageView extends React.Component {
const itemViews = this.createItemViews(items, showWhitespaces);
const header = "Page " + (page.index + 1);
content =
-
{ header }
+ { header }
{ itemViews }
diff --git a/src/javascript/components/debug/TextItemTable.jsx b/src/javascript/components/debug/TextItemTable.jsx
index 7ac6e60..12d7b79 100644
--- a/src/javascript/components/debug/TextItemTable.jsx
+++ b/src/javascript/components/debug/TextItemTable.jsx
@@ -49,6 +49,10 @@ export default class TextItemTable extends React.Component {
{ textItem.annotation ? textItem.annotation.category : '' }
+
+ { textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
+ { textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }
+
{ showWhitespaces ? (
@@ -87,6 +91,6 @@ export default class TextItemTable extends React.Component {
{ textItemRows }
- );
+ );
}
}
\ No newline at end of file
diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx
index b4fea84..36c227a 100644
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -3,6 +3,8 @@ import { Enum } from 'enumify';
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
+import CompactLines from './transformations/CompactLines.jsx';
+
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import DetectTOC from './transformations/DetectTOC.jsx'
@@ -29,8 +31,10 @@ export default class AppState {
this.pages = [];
this.transformations = [
new CalculateGlobalStats(),
+ new CompactLines(),
new RemoveRepetitiveElements(),
new VerticalToHorizontal(),
+
new DetectPdfBlocks(),
new DetectFootnotes(),
new DetectTOC(),
diff --git a/src/javascript/models/PageItem.jsx b/src/javascript/models/PageItem.jsx
index cb01549..77cfe55 100644
--- a/src/javascript/models/PageItem.jsx
+++ b/src/javascript/models/PageItem.jsx
@@ -11,3 +11,17 @@ export default class PageItem {
}
}
+
+export class ParsedElements {
+
+ constructor(options) {
+ this.footnoteLinks = options.footnoteLinks;
+ this.footnotes = options.footnotes;
+ }
+
+ add(parsedElements:ParsedElements) {
+ this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
+ this.footnotes = this.footnotes.concat(parsedElements.footnotes);
+ }
+
+}
\ No newline at end of file
diff --git a/src/javascript/models/TextItemLineCompactor.jsx b/src/javascript/models/TextItemLineCompactor.jsx
new file mode 100644
index 0000000..74c69b8
--- /dev/null
+++ b/src/javascript/models/TextItemLineCompactor.jsx
@@ -0,0 +1,117 @@
+import TextItem from './TextItem.jsx';
+import { ParsedElements } from './PageItem.jsx';
+import { isNumber } from '../functions.jsx'
+import { sortByX } from '../textItemFunctions.jsx'
+
+// Compacts text items which have been grouped to a line (through TextItemLineGrouper) into a single TextItem, doing inline transformations like
+// 'whitespace removal', bold/emphasis annotation, link-detection, etc.
+export default class TextItemLineCompactor {
+
+ constructor(options) {
+ if (options) {
+ this.transformEmphasis = options.transformEmphasis || true;
+ }
+ }
+
+ // returns a CombineResult
+ compact(lineItems: TextItem[]) {
+ if (lineItems.length < 2) {
+ throw "Must be at least 2 line items, but was " + lineItems;
+ }
+
+ // we can't trust order of occurence, esp. footnoteLinks like to come last
+ sortByX(lineItems);
+
+ var combinedItem;
+ const [resolvedLineItems, parsedElements] = this.resolveSpecialElements(lineItems);
+ if (resolvedLineItems.length == 1) {
+ combinedItem = resolvedLineItems[0];
+ } else {
+ var text = '';
+ var maxHeight = 0;
+ var widthSum = 0;
+ var lastItem;
+ resolvedLineItems.forEach(item => {
+ if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
+ const xDistance = item.x - lastItem.x - lastItem.width;
+ if (xDistance >= 5) {
+ text += ' ';
+ }
+ }
+ text += item.text;
+ widthSum += item.width;
+ lastItem = item;
+ maxHeight = Math.max(maxHeight, item.height);
+ });
+ combinedItem = new TextItem({
+ ...resolvedLineItems[0],
+ text: text,
+ height: maxHeight,
+ width: widthSum
+ });
+ }
+ combinedItem.parsedElements = parsedElements;
+
+ //TODO whitespace removal
+ //TODO bold/emphasis
+
+ return combinedItem;
+ }
+
+ resolveSpecialElements(lineItems) {
+ const footnoteLinks = [];
+ const footnotes = [];
+ const basicY = lineItems[0].y;
+ const newLineItems = [];
+ var stashedNumberItems = [];
+
+ const commitStashedNumbers = (nextItem) => {
+ if (stashedNumberItems.length > 0) {
+ const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
+ if (stashedNumberItems[0].y > basicY) { // footnote link
+ newLineItems.push(new TextItem({
+ ...stashedNumberItems[0],
+ //TODO make fomatting configurable
+ // text: `[${joinedNumber}](#${joinedNumber})`
+ text: `^${joinedNumber}`
+ }));
+ footnoteLinks.push(parseInt(joinedNumber));
+ } else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
+ //TODO womb comp [29] => ydiff == 0
+ newLineItems.push(new TextItem({
+ ...stashedNumberItems[0],
+ text: `(^${ joinedNumber}):`
+ }));
+ footnotes.push(joinedNumber);
+ } else {
+ stashedNumberItems.forEach(number => newLineItems.push(number));
+ }
+
+ stashedNumberItems = [];
+ }
+ };
+
+ lineItems.forEach(item => {
+ if (newLineItems.length == 0 && item.text.trim().length == 0) {
+ // skip whitespace on the beginning of a line
+ } else {
+ const isANumber = isNumber(item.text.trim());
+ if (isANumber) {
+ stashedNumberItems.push(item);
+ } else {
+ if (stashedNumberItems.length > 0) {
+ commitStashedNumbers(item);
+ }
+ newLineItems.push(item);
+ }
+ }
+ });
+ commitStashedNumbers();
+
+
+ return [newLineItems, new ParsedElements({
+ footnoteLinks: footnoteLinks,
+ footnotes: footnotes
+ })];
+ }
+}
diff --git a/src/javascript/models/TextItemLineGrouper.jsx b/src/javascript/models/TextItemLineGrouper.jsx
new file mode 100644
index 0000000..54264b5
--- /dev/null
+++ b/src/javascript/models/TextItemLineGrouper.jsx
@@ -0,0 +1,36 @@
+import TextItem from './TextItem.jsx';
+import { sortByX } from '../textItemFunctions.jsx'
+
+//Groups all text items which are on the same y line
+export default class TextItemLineGrouper {
+
+ constructor(options) {
+ this.mostUsedDistance = options.mostUsedDistance || 12;
+ }
+
+ // returns a CombineResult
+ group(textItems: TextItem[]) {
+ return this.groupItemsByLine(textItems);
+ }
+
+
+ groupItemsByLine(textItems:TextItem[]) {
+ const lines = [];
+ var currentLine = [];
+ textItems.forEach(item => {
+ if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) {
+ lines.push(currentLine);
+ currentLine = [];
+ }
+ currentLine.push(item);
+ });
+ lines.push(currentLine);
+
+ lines.forEach(lineItems => {
+ // we can't trust order of occurence, esp. footnoteLinks like to come last
+ sortByX(lineItems);
+ });
+ return lines;
+ }
+
+}
diff --git a/src/javascript/models/transformations/CompactLines.jsx b/src/javascript/models/transformations/CompactLines.jsx
new file mode 100644
index 0000000..0c55805
--- /dev/null
+++ b/src/javascript/models/transformations/CompactLines.jsx
@@ -0,0 +1,70 @@
+import React from 'react';
+
+import ToTextItemTransformation from './ToTextItemTransformation.jsx';
+import ParseResult from '../ParseResult.jsx';
+import TextItemLineGrouper from '../TextItemLineGrouper.jsx';
+import TextItemLineCompactor from '../TextItemLineCompactor.jsx';
+import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
+
+// gathers text items on the same y line to one text item
+export default class CompactLines extends ToTextItemTransformation {
+ // Transformation that runs TextItemLineGrouper and TextItemLineCompactor over the items of every page.
+ constructor() {
+ super("Compact Lines");
+ }
+ // For each page with items: single-item lines are kept as-is; for longer lines the original items are kept with REMOVED_ANNOTATION, followed by the compacted item with ADDED_ANNOTATION. Assigns the new list to page.items and returns a new ParseResult.
+ transform(parseResult:ParseResult) {
+ const {mostUsedDistance} = parseResult.globals;
+ const foundFootnotes = [];
+ const foundFootnoteLinks = [];
+ const lineGrouper = new TextItemLineGrouper({
+ mostUsedDistance: mostUsedDistance,
+ });
+ const lineCompactor = new TextItemLineCompactor();
+
+ parseResult.pages.forEach(page => {
+ if (page.items.length > 0) {
+ const newItems = [];
+ const textItemsGroupedByLine = lineGrouper.group(page.items);
+ textItemsGroupedByLine.forEach(textItemsOfLine => {
+ if (textItemsOfLine.length == 1) {
+ newItems.push(textItemsOfLine[0]);
+ } else {
+ textItemsOfLine.forEach(item => {
+ item.annotation = REMOVED_ANNOTATION;
+ newItems.push(item);
+ });
+ // compact() also attaches the footnote links / footnotes it detected as combinedItem.parsedElements
+ const combinedItem = lineCompactor.compact(textItemsOfLine);
+ combinedItem.annotation = ADDED_ANNOTATION;
+ newItems.push(combinedItem);
+ // NOTE(review): the two map callbacks below have a block body without return, so as written they yield undefined entries; they look like they lost JSX markup in this diff — confirm against the real file
+ if (combinedItem.parsedElements.footnoteLinks.length > 0) {
+ const footnoteLinks = combinedItem.parsedElements.footnoteLinks.map(footnoteLink => { footnoteLink },);
+ foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
+ }
+ if (combinedItem.parsedElements.footnotes.length > 0) {
+ const footnotes = combinedItem.parsedElements.footnotes.map(footnote => { footnote },);
+ foundFootnotes.push.apply(foundFootnotes, footnotes);
+ }
+ }
+ });
+ page.items = newItems;
+ }
+ });
+ // NOTE(review): the two uncommented message entries below are not valid JavaScript as shown (presumably stripped JSX); the first one also says "footnotes" while reporting footnote links — confirm
+
+ return new ParseResult({
+ ...parseResult,
+ messages: [
+ // 'Detected ' + foundFootnoteLinks.length + ' footnote links: [' + foundFootnoteLinks.join(', ') + ']',
+ //'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes.join(', ') + ']',
+ // 'Detected ' + foundFootnotes.length + ' footnotes: [' + foundFootnotes + ']',
+ Detected { foundFootnoteLinks.length } footnotes: [{ foundFootnoteLinks }],
+ Detected { foundFootnotes.length } footnotes: [{ foundFootnotes }],
+ ]
+ });
+ }
+
+
+}
|