diff --git a/src/javascript/components/debug/PdfBlockPageView.jsx b/src/javascript/components/debug/PdfBlockPageView.jsx
index 884979a..706d57b 100644
--- a/src/javascript/components/debug/PdfBlockPageView.jsx
+++ b/src/javascript/components/debug/PdfBlockPageView.jsx
@@ -30,10 +30,18 @@ export default class PdfBlockPageView extends React.Component {
const colorStyle = block.annotation ? {
color: block.annotation.color
} : null;
- var footnotesElement;
+ var footnoteLinks;
+ var footnotes;
if (block.parsedElements) {
+ if (block.parsedElements.footnoteLinks.length > 0) {
+ footnoteLinks =
+ { 'Footnote-Links: ' + block.parsedElements.footnoteLinks }
+
;
+ }
if (block.parsedElements.footnotes.length > 0) {
- footnotesElement = 'Footnotes: ' + block.parsedElements.footnotes;
+ footnotes =
+ { 'Footnotes: ' + block.parsedElements.footnotes }
+
;
}
}
@@ -43,7 +51,8 @@ export default class PdfBlockPageView extends React.Component {
- { footnotesElement }
+ { footnoteLinks }
+ { footnotes }
});
diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx
index a29513e..d32a705 100644
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -4,17 +4,17 @@ import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
+import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import DetectTOC from './transformations/DetectTOC.jsx'
import DetectLists from './transformations/DetectLists.jsx'
import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
-import DetectFormats from './transformations/DetectFormats.jsx'
-import CombineSameY from './transformations/CombineSameY.jsx';
-import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
-import DetectFootnotes from './transformations/DetectFootnotes.jsx'
-import DetectLinks from './transformations/DetectLinks.jsx'
-import HeadlineDetector from './transformations/HeadlineDetector.jsx'
-import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
-import ToBlockSystem from './transformations/ToBlockSystem.jsx';
+// import DetectFormats from './transformations/DetectFormats.jsx'
+// import CombineSameY from './transformations/CombineSameY.jsx';
+// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
+// import DetectLinks from './transformations/DetectLinks.jsx'
+// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
+// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
+// import ToBlockSystem from './transformations/ToBlockSystem.jsx';
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
import ToMarkdown from './transformations/ToMarkdown.jsx'
@@ -31,6 +31,7 @@ export default class AppState {
new RemoveRepetitiveElements(),
new VerticalToHorizontal(),
new DetectPdfBlocks(),
+ new DetectFootnotes(),
new DetectTOC(),
new DetectLists(),
new DetectCodeBlocks(),
@@ -38,7 +39,6 @@ export default class AppState {
// new DetectFormats(),
// new CombineSameY(),
// new RemoveWhitespaces(),
- // new DetectFootnotes(),
// new DetectLinks(),
// new HeadlineDetector(),
// new HeadlineToUppercase(),
diff --git a/src/javascript/models/MarkdownElements.jsx b/src/javascript/models/MarkdownElements.jsx
index 4cbb559..e7f09c9 100644
--- a/src/javascript/models/MarkdownElements.jsx
+++ b/src/javascript/models/MarkdownElements.jsx
@@ -7,6 +7,7 @@ export const PARAGRAPH = "Paragraph";
export const LIST_BLOCK = "List";
export const CODE_BLOCK = "Code/Quote";
export const TOC_BLOCK = "TOC";
+export const FOOTNOTE_BLOCK = "Footnotes"
export function blockToText(block: PdfBlock) {
switch (block.type) {
diff --git a/src/javascript/models/TextItemCombiner.jsx b/src/javascript/models/TextItemCombiner.jsx
index fb1d1ce..07453cc 100644
--- a/src/javascript/models/TextItemCombiner.jsx
+++ b/src/javascript/models/TextItemCombiner.jsx
@@ -1,5 +1,6 @@
import TextItem from './TextItem.jsx';
-import { isNumber } from '../functions.jsx'
+import { isNumber, isDigit } from '../functions.jsx'
+import { sortByX } from '../textItemFunctions.jsx'
//Combines text items which are on the same Y at the same time doing inline transformations like
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
@@ -58,66 +59,88 @@ export default class TextItemCombiner {
}
groupByFollowingY(textItems) {
- const yArrays = [];
+ const footnoteLinks = [];
const footnotes = [];
- var itemsWithSameY = [];
- var lastItem;
- const wrapUpLine = () => {
- // we can't trust order of occurence, esp. footnotes like to come last
- itemsWithSameY.sort((a, b) => {
- return a.x - b.x;
- });
- const finalArray = [];
- const basicY = itemsWithSameY[0].y;
- var savedFootnoteItems = [];
- const commitSavedFootnotes = () => {
- if (savedFootnoteItems.length > 0) {
- const footnoteNumber = savedFootnoteItems.map(footnoteItem => footnoteItem.text).join('');
- finalArray.push(new TextItem({
- ...savedFootnoteItems[0],
- //TODO make fomatting configurable
- // text: `[${footnoteNumber}](#${footnoteNumber})`
- text: `*${footnoteNumber}`
- }));
- savedFootnoteItems = [];
- footnotes.push(parseInt(footnoteNumber));
+ var lines = this.groupItemsByLine(textItems);
+ lines = lines.map(lineItems => {
+ const basicY = lineItems[0].y;
+ const newLineItems = [];
+ var stashedNumberItems = [];
+
+
+ const commitStashedNumbers = (nextItem) => {
+ if (stashedNumberItems.length > 0) {
+ const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
+ if (stashedNumberItems[0].y > basicY) { // footnote link
+ newLineItems.push(new TextItem({
+ ...stashedNumberItems[0],
+ //TODO make formatting configurable
+ // text: `[${joinedNumber}](#${joinedNumber})`
+ text: `^${joinedNumber}`
+ }));
+ footnoteLinks.push(parseInt(joinedNumber));
+ } else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
+ //TODO womb comp [29] => ydiff == 0
+ newLineItems.push(new TextItem({
+ ...stashedNumberItems[0],
+ text: `(^${ joinedNumber}):`
+ }));
+ footnotes.push(joinedNumber);
+ } else {
+ stashedNumberItems.forEach(number => newLineItems.push(number));
+ }
+
+ stashedNumberItems = [];
}
};
- itemsWithSameY.forEach(item => {
- const isFootnote = item.y > basicY && isNumber(item.text);
- if (isFootnote) {
- savedFootnoteItems.push(item);
+ lineItems.forEach(item => {
+ if (newLineItems.length == 0 && item.text.trim().length == 0) {
+ // skip whitespace at the beginning of a line
} else {
- if (savedFootnoteItems.length > 0) {
- commitSavedFootnotes();
+ const isANumber = isNumber(item.text);
+ if (isANumber) {
+ stashedNumberItems.push(item);
+ } else {
+ if (stashedNumberItems.length > 0) {
+ commitStashedNumbers(item);
+ }
+ newLineItems.push(item);
}
- finalArray.push(item);
}
});
- commitSavedFootnotes();
- yArrays.push(finalArray);
- itemsWithSameY = [];
- };
-
- textItems.forEach(item => {
- if (lastItem) {
- if (Math.abs(lastItem.y - item.y) > this.mostUsedDistance / 2) {
- wrapUpLine();
- }
- }
- itemsWithSameY.push(item);
- lastItem = item;
- // }
+ commitStashedNumbers();
+ return newLineItems;
});
- wrapUpLine();
- return [yArrays, new ParsedElements({
+
+ return [lines, new ParsedElements({
+ footnoteLinks: footnoteLinks,
footnotes: footnotes
})];
}
+
+ // Splits the text items into lines: an item opens a new line as soon as its y differs from the
+ // y of the current line's first item by at least half of mostUsedDistance. The items of every
+ // line are then sorted by x. Returns an array of lines (arrays of text items); an empty input
+ // yields an empty array, so no returned line is ever empty.
+ groupItemsByLine(textItems:TextItem[]) {
+     const lines = [];
+     var currentLine = [];
+     textItems.forEach(item => {
+         if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) {
+             lines.push(currentLine);
+             currentLine = [];
+         }
+         currentLine.push(item);
+     });
+     // only flush a line that actually holds items; an empty line would break callers reading lineItems[0]
+     if (currentLine.length > 0) {
+         lines.push(currentLine);
+     }
+
+     lines.forEach(lineItems => {
+         // we can't trust order of occurrence, esp. footnoteLinks like to come last
+         sortByX(lineItems);
+     });
+     return lines;
+ }
+
}
//Result of the TextItemCombiner#combine()
@@ -125,7 +148,6 @@ export class CombineResult {
constructor(options) {
this.textItems = options.textItems;
- this.footnotes = options.footnotes;
this.parsedElements = options.parsedElements;
}
@@ -134,10 +156,12 @@ export class CombineResult {
export class ParsedElements {
constructor(options) {
+ this.footnoteLinks = options.footnoteLinks;
this.footnotes = options.footnotes;
}
add(parsedElements:ParsedElements) {
+ this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
}
diff --git a/src/javascript/models/transformations/DetectFootnotes.jsx b/src/javascript/models/transformations/DetectFootnotes.jsx
index a447c34..f3ba1af 100644
--- a/src/javascript/models/transformations/DetectFootnotes.jsx
+++ b/src/javascript/models/transformations/DetectFootnotes.jsx
@@ -1,70 +1,67 @@
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
-import TextItem from '../TextItem.jsx';
+import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
-import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
+import PdfBlock from '../PdfBlock.jsx';
+import TextItemCombiner from '../TextItemCombiner.jsx';
+import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
+import { FOOTNOTE_BLOCK } from '../MarkdownElements.jsx';
-import { isNumber } from '../../functions.jsx'
-
-export default class DetectFootnotes extends ToPdfViewTransformation {
+//Detects footnotes: blocks in which the text combiner finds footnotes are re-added as FOOTNOTE_BLOCK
+export default class DetectFootnotes extends ToPdfBlockViewTransformation {
constructor() {
super("Detect Footnotes");
}
transform(parseResult:ParseResult) {
+ const {mostUsedDistance} = parseResult.globals;
+ var foundFootnotes = [];
+ const textCombiner = new TextItemCombiner({
+ mostUsedDistance: mostUsedDistance,
+ });
- var nextFooterNumber = 1;
- var potentialFootnoteItem;
- var foundFootnotes = 0;
-
- const newContent = parseResult.content.map(page => {
- const newTextItems = [];
- for (var i = 0; i < page.textItems.length; i++) {
- const item = page.textItems[i];
- if (potentialFootnoteItem) {
- if (potentialFootnoteItem.y - item.y < item.height) {
- potentialFootnoteItem.annotation = REMOVED_ANNOTATION;
- item.annotation = REMOVED_ANNOTATION;
- newTextItems.push(potentialFootnoteItem);
- newTextItems.push(item);
- newTextItems.push(new TextItem({
- x: potentialFootnoteItem.x,
- y: item.y,
- width: potentialFootnoteItem.width + item.width,
- height: item.height,
- text: '[' + potentialFootnoteItem.text + '] ' + item.text,
- annotation: ADDED_ANNOTATION
- }));
- //TODO repsect multiline!!
- nextFooterNumber++;
- foundFootnotes++;
+ parseResult.content.forEach(page => {
+ const newBlocks = [];
+ var lastFootnote;
+ page.blocks.forEach(block => {
+ newBlocks.push(block);
+ if (!block.type && block.textItems[0].y < 200) {
+ const combineResult = textCombiner.combine(block.textItems);
+ if (combineResult.parsedElements.footnotes.length > 0) {
+ block.annotation = REMOVED_ANNOTATION;
+ foundFootnotes.push.apply(foundFootnotes, combineResult.parsedElements.footnotes);
+ lastFootnote = new PdfBlock({
+ textItems: combineResult.textItems,
+ type: FOOTNOTE_BLOCK,
+ annotation: ADDED_ANNOTATION,
+ parsedElements: combineResult.parsedElements
+ })
+ newBlocks.push(lastFootnote);
+ } else if (lastFootnote) {
+ // likely to be the second line of the footnote above
+ block.annotation = REMOVED_ANNOTATION;
+ lastFootnote.textItems = lastFootnote.textItems.concat(combineResult.textItems);
+ lastFootnote.parsedElements.add(combineResult.parsedElements);
+ newBlocks[newBlocks.length - 2] = block;
+ newBlocks[newBlocks.length - 1] = lastFootnote;
}
- potentialFootnoteItem = null;
- } else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
- potentialFootnoteItem = item;
} else {
- newTextItems.push(item);
+ lastFootnote = null;
}
- }
- return {
- ...page,
- textItems: newTextItems
- };
+ });
+ page.blocks = newBlocks;
});
return new ParseResult({
...parseResult,
- content: newContent,
- messages: ['Detected ' + foundFootnotes + ' footnotes']
+ messages: [
+ 'Detected ' + foundFootnotes.length + ' footnotes:',
+ foundFootnotes.join(', ')
+ ]
});
+
}
- completeTransform(parseResult:ParseResult) {
- parseResult.content.forEach(page => {
- page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
- page.textItems.forEach(textItem => textItem.annotation = null)
- });
- return parseResult;
- }
+}
+
+
-}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/VerticalToHorizontal.jsx b/src/javascript/models/transformations/VerticalToHorizontal.jsx
index 5ab8155..b1435db 100644
--- a/src/javascript/models/transformations/VerticalToHorizontal.jsx
+++ b/src/javascript/models/transformations/VerticalToHorizontal.jsx
@@ -33,7 +33,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
//TODO generic state machine code ?
- page.textItems.reduce((oneCharacterItems, item) => {
+ const leftOver = page.textItems.reduce((oneCharacterItems, item) => {
if (item.text.trim().length == 1) {
if (oneCharacterItems.length == 0) {
oneCharacterItems.push(item);
@@ -80,6 +80,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation {
}
return oneCharacterItems;
}, []);
+ leftOver.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem));
return {
...page,
diff --git a/src/javascript/textItemFunctions.jsx b/src/javascript/textItemFunctions.jsx
index 7629911..a60eeca 100644
--- a/src/javascript/textItemFunctions.jsx
+++ b/src/javascript/textItemFunctions.jsx
@@ -24,3 +24,15 @@ export function minXFromTextItems(items:TextItem) {
}
return minX;
}
+
+// Sorts the given text items in place by ascending x.
+// The array passed in is reordered; nothing is returned.
+export function sortByX(items:TextItem[]) {
+    items.sort((a, b) => {
+        return a.x - b.x;
+    });
+}
+
+// Returns a new array holding the same text items ordered by ascending x.
+// The array passed in is left untouched; only the shallow copy is sorted.
+export function sortCopyByX(items:TextItem[]) {
+    const copy = items.concat();
+    sortByX(copy);
+    return copy;
+}
\ No newline at end of file