diff --git a/src/javascript/components/debug/PdfBlockPageView.jsx b/src/javascript/components/debug/PdfBlockPageView.jsx index 884979a..706d57b 100644 --- a/src/javascript/components/debug/PdfBlockPageView.jsx +++ b/src/javascript/components/debug/PdfBlockPageView.jsx @@ -30,10 +30,18 @@ export default class PdfBlockPageView extends React.Component { const colorStyle = block.annotation ? { color: block.annotation.color } : null; - var footnotesElement; + var footnoteLinks; + var footnotes; if (block.parsedElements) { + if (block.parsedElements.footnoteLinks.length > 0) { + footnoteLinks =
+ { 'Footnote-Links: ' + block.parsedElements.footnoteLinks } +
; + } if (block.parsedElements.footnotes.length > 0) { - footnotesElement = 'Footnotes: ' + block.parsedElements.footnotes; + footnotes =
+ { 'Footnotes: ' + block.parsedElements.footnotes } +
; } } @@ -43,7 +51,8 @@ export default class PdfBlockPageView extends React.Component {
- { footnotesElement } + { footnoteLinks } + { footnotes }
}); diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index a29513e..d32a705 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -4,17 +4,17 @@ import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx'; import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx' import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx'; import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx' +import DetectFootnotes from './transformations/DetectFootnotes.jsx' import DetectTOC from './transformations/DetectTOC.jsx' import DetectLists from './transformations/DetectLists.jsx' import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx' -import DetectFormats from './transformations/DetectFormats.jsx' -import CombineSameY from './transformations/CombineSameY.jsx'; -import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx' -import DetectFootnotes from './transformations/DetectFootnotes.jsx' -import DetectLinks from './transformations/DetectLinks.jsx' -import HeadlineDetector from './transformations/HeadlineDetector.jsx' -import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx' -import ToBlockSystem from './transformations/ToBlockSystem.jsx'; +// import DetectFormats from './transformations/DetectFormats.jsx' +// import CombineSameY from './transformations/CombineSameY.jsx'; +// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx' +// import DetectLinks from './transformations/DetectLinks.jsx' +// import HeadlineDetector from './transformations/HeadlineDetector.jsx' +// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx' +// import ToBlockSystem from './transformations/ToBlockSystem.jsx'; import ToTextBlocks from './transformations/ToTextBlocks.jsx'; import ToMarkdown from './transformations/ToMarkdown.jsx' @@ -31,6 +31,7 @@ export default class AppState 
{ new RemoveRepetitiveElements(), new VerticalToHorizontal(), new DetectPdfBlocks(), + new DetectFootnotes(), new DetectTOC(), new DetectLists(), new DetectCodeBlocks(), @@ -38,7 +39,6 @@ export default class AppState { // new DetectFormats(), // new CombineSameY(), // new RemoveWhitespaces(), - // new DetectFootnotes(), // new DetectLinks(), // new HeadlineDetector(), // new HeadlineToUppercase(), diff --git a/src/javascript/models/MarkdownElements.jsx b/src/javascript/models/MarkdownElements.jsx index 4cbb559..e7f09c9 100644 --- a/src/javascript/models/MarkdownElements.jsx +++ b/src/javascript/models/MarkdownElements.jsx @@ -7,6 +7,7 @@ export const PARAGRAPH = "Paragraph"; export const LIST_BLOCK = "List"; export const CODE_BLOCK = "Code/Quote"; export const TOC_BLOCK = "TOC"; +export const FOOTNOTE_BLOCK = "Footnotes" export function blockToText(block: PdfBlock) { switch (block.type) { diff --git a/src/javascript/models/TextItemCombiner.jsx b/src/javascript/models/TextItemCombiner.jsx index fb1d1ce..07453cc 100644 --- a/src/javascript/models/TextItemCombiner.jsx +++ b/src/javascript/models/TextItemCombiner.jsx @@ -1,5 +1,6 @@ import TextItem from './TextItem.jsx'; -import { isNumber } from '../functions.jsx' +import { isNumber, isDigit } from '../functions.jsx' +import { sortByX } from '../textItemFunctions.jsx' //Combines text items which are on the same Y at the same time doing inline transformations like //'whitespace removal', bold/emphasis annotation, link-detection, etc.. @@ -58,66 +59,88 @@ export default class TextItemCombiner { } groupByFollowingY(textItems) { - const yArrays = []; + const footnoteLinks = []; const footnotes = []; - var itemsWithSameY = []; - var lastItem; - const wrapUpLine = () => { - // we can't trust order of occurence, esp. 
footnotes like to come last - itemsWithSameY.sort((a, b) => { - return a.x - b.x; - }); - const finalArray = []; - const basicY = itemsWithSameY[0].y; - var savedFootnoteItems = []; - const commitSavedFootnotes = () => { - if (savedFootnoteItems.length > 0) { - const footnoteNumber = savedFootnoteItems.map(footnoteItem => footnoteItem.text).join(''); - finalArray.push(new TextItem({ - ...savedFootnoteItems[0], - //TODO make fomatting configurable - // text: `[${footnoteNumber}](#${footnoteNumber})` - text: `*${footnoteNumber}` - })); - savedFootnoteItems = []; - footnotes.push(parseInt(footnoteNumber)); + var lines = this.groupItemsByLine(textItems); + lines = lines.map(lineItems => { + const basicY = lineItems[0].y; + const newLineItems = []; + var stashedNumberItems = []; + + + const commitStashedNumbers = (nextItem) => { + if (stashedNumberItems.length > 0) { + const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join(''); + if (stashedNumberItems[0].y > basicY) { // footnote link + newLineItems.push(new TextItem({ + ...stashedNumberItems[0], + //TODO make formatting configurable + // text: `[${joinedNumber}](#${joinedNumber})` + text: `^${joinedNumber}` + })); + footnoteLinks.push(parseInt(joinedNumber)); + } else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote + //TODO womb comp [29] => ydiff == 0 + newLineItems.push(new TextItem({ + ...stashedNumberItems[0], + text: `(^${ joinedNumber}):` + })); + footnotes.push(joinedNumber); + } else { + stashedNumberItems.forEach(number => newLineItems.push(number)); + } + + stashedNumberItems = []; } }; - itemsWithSameY.forEach(item => { - const isFootnote = item.y > basicY && isNumber(item.text); - if (isFootnote) { - savedFootnoteItems.push(item); + lineItems.forEach(item => { + if (newLineItems.length == 0 && item.text.trim().length == 0) { + // skip whitespace at the beginning of a line } else { - if (savedFootnoteItems.length > 0) { - commitSavedFootnotes(); + 
const isANumber = isNumber(item.text); + if (isANumber) { + stashedNumberItems.push(item); + } else { + if (stashedNumberItems.length > 0) { + commitStashedNumbers(item); + } + newLineItems.push(item); } - finalArray.push(item); } }); - commitSavedFootnotes(); - yArrays.push(finalArray); - itemsWithSameY = []; - }; - - textItems.forEach(item => { - if (lastItem) { - if (Math.abs(lastItem.y - item.y) > this.mostUsedDistance / 2) { - wrapUpLine(); - } - } - itemsWithSameY.push(item); - lastItem = item; - // } + commitStashedNumbers(); + return newLineItems; }); - wrapUpLine(); - return [yArrays, new ParsedElements({ + + return [lines, new ParsedElements({ + footnoteLinks: footnoteLinks, footnotes: footnotes })]; } + + groupItemsByLine(textItems:TextItem[]) { + const lines = []; + var currentLine = []; + textItems.forEach(item => { + if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) { + lines.push(currentLine); + currentLine = []; + } + currentLine.push(item); + }); + lines.push(currentLine); + + lines.forEach(lineItems => { + // we can't trust order of occurrence, esp. 
footnoteLinks like to come last + sortByX(lineItems); + }); + return lines; + } + } //Result of the TextItemCombiner#combine() @@ -125,7 +148,6 @@ export class CombineResult { constructor(options) { this.textItems = options.textItems; - this.footnotes = options.footnotes; this.parsedElements = options.parsedElements; } @@ -134,10 +156,12 @@ export class CombineResult { export class ParsedElements { constructor(options) { + this.footnoteLinks = options.footnoteLinks; this.footnotes = options.footnotes; } add(parsedElements:ParsedElements) { + this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks); this.footnotes = this.footnotes.concat(parsedElements.footnotes); } diff --git a/src/javascript/models/transformations/DetectFootnotes.jsx b/src/javascript/models/transformations/DetectFootnotes.jsx index a447c34..f3ba1af 100644 --- a/src/javascript/models/transformations/DetectFootnotes.jsx +++ b/src/javascript/models/transformations/DetectFootnotes.jsx @@ -1,70 +1,67 @@ -import ToPdfViewTransformation from './ToPdfViewTransformation.jsx'; -import TextItem from '../TextItem.jsx'; +import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx'; import ParseResult from '../ParseResult.jsx'; -import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx'; +import PdfBlock from '../PdfBlock.jsx'; +import TextItemCombiner from '../TextItemCombiner.jsx'; +import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx'; +import { FOOTNOTE_BLOCK } from '../MarkdownElements.jsx'; -import { isNumber } from '../../functions.jsx' - -export default class DetectFootnotes extends ToPdfViewTransformation { +//Detect footnotes and gather them into blocks of type FOOTNOTE_BLOCK
+export default class DetectFootnotes extends ToPdfBlockViewTransformation { constructor() { super("Detect Footnotes"); } transform(parseResult:ParseResult) { + const {mostUsedDistance} = parseResult.globals; + var foundFootnotes = []; + const textCombiner = new TextItemCombiner({ + mostUsedDistance: mostUsedDistance, + }); - var nextFooterNumber = 1; - var potentialFootnoteItem; - var foundFootnotes = 0; - - const newContent = parseResult.content.map(page => { - const newTextItems = []; - for (var i = 0; i < page.textItems.length; i++) { - const item = page.textItems[i]; - if (potentialFootnoteItem) { - if (potentialFootnoteItem.y - item.y < item.height) { - potentialFootnoteItem.annotation = REMOVED_ANNOTATION; - item.annotation = REMOVED_ANNOTATION; - newTextItems.push(potentialFootnoteItem); - newTextItems.push(item); - newTextItems.push(new TextItem({ - x: potentialFootnoteItem.x, - y: item.y, - width: potentialFootnoteItem.width + item.width, - height: item.height, - text: '[' + potentialFootnoteItem.text + '] ' + item.text, - annotation: ADDED_ANNOTATION - })); - //TODO repsect multiline!! 
- nextFooterNumber++; - foundFootnotes++; + parseResult.content.forEach(page => { + const newBlocks = []; + var lastFootnote; + page.blocks.forEach(block => { + newBlocks.push(block); + if (!block.type && block.textItems[0].y < 200) { + const combineResult = textCombiner.combine(block.textItems); + if (combineResult.parsedElements.footnotes.length > 0) { + block.annotation = REMOVED_ANNOTATION; + foundFootnotes.push.apply(foundFootnotes, combineResult.parsedElements.footnotes); + lastFootnote = new PdfBlock({ + textItems: combineResult.textItems, + type: FOOTNOTE_BLOCK, + annotation: ADDED_ANNOTATION, + parsedElements: combineResult.parsedElements + }) + newBlocks.push(lastFootnote); + } else if (lastFootnote) { + // likely to be the second line of the footnote above + block.annotation = REMOVED_ANNOTATION; + lastFootnote.textItems = lastFootnote.textItems.concat(combineResult.textItems); + lastFootnote.parsedElements.add(combineResult.parsedElements); + newBlocks[newBlocks.length - 2] = block; + newBlocks[newBlocks.length - 1] = lastFootnote; } - potentialFootnoteItem = null; - } else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) { - potentialFootnoteItem = item; } else { - newTextItems.push(item); + lastFootnote = null; } - } - return { - ...page, - textItems: newTextItems - }; + }); + page.blocks = newBlocks; }); return new ParseResult({ ...parseResult, - content: newContent, - messages: ['Detected ' + foundFootnotes + ' footnotes'] + messages: [ + 'Detected ' + foundFootnotes.length + ' footnotes:', + foundFootnotes.join(', ') + ] }); + } - completeTransform(parseResult:ParseResult) { - parseResult.content.forEach(page => { - page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION); - page.textItems.forEach(textItem => textItem.annotation = null) - }); - return parseResult; - } +} + + 
-} \ No newline at end of file diff --git a/src/javascript/models/transformations/VerticalToHorizontal.jsx b/src/javascript/models/transformations/VerticalToHorizontal.jsx index 5ab8155..b1435db 100644 --- a/src/javascript/models/transformations/VerticalToHorizontal.jsx +++ b/src/javascript/models/transformations/VerticalToHorizontal.jsx @@ -33,7 +33,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation { //TODO generic state machine code ? - page.textItems.reduce((oneCharacterItems, item) => { + const leftOver = page.textItems.reduce((oneCharacterItems, item) => { if (item.text.trim().length == 1) { if (oneCharacterItems.length == 0) { oneCharacterItems.push(item); @@ -80,6 +80,7 @@ export default class VerticalToHorizontal extends ToPdfViewTransformation { } return oneCharacterItems; }, []); + leftOver.forEach(oneCharacterItem => newTextItems.push(oneCharacterItem)); return { ...page, diff --git a/src/javascript/textItemFunctions.jsx b/src/javascript/textItemFunctions.jsx index 7629911..a60eeca 100644 --- a/src/javascript/textItemFunctions.jsx +++ b/src/javascript/textItemFunctions.jsx @@ -24,3 +24,15 @@ export function minXFromTextItems(items:TextItem) { } return minX; } + +export function sortByX(items:TextItem) { + items.sort((a, b) => { + return a.x - b.x; + }); +} + +export function sortCopyByX(items:TextItem) { + const copy = items.concat(); + sortByX(copy); + return copy; +} \ No newline at end of file