// Patch overview: adds a "Detect Footnotes" transformation, extracts shared
// digit helpers, and changes the text-item height stored while loading a PDF.
//
// NOTE(review): the hunks below appear to have lost their original line breaks
// (five file sections sit on three physical lines), so this text will probably
// not apply with `git apply` / `patch` as-is -- confirm against the commit.
//
// Files touched, in order:
//
// 1. src/javascript/components/LoadingView.jsx
//    - For every page, gets a viewport via page.getViewport(1.0).
//    - For every text item, computes
//        tx = PDFJS.Util.transform(viewport.transform, item.transform)
//        fontHeight = sqrt(tx[2]^2 + tx[3]^2)
//      and stores `height: item.height / fontHeight` instead of `item.height`.
//    - Removes a commented-out console.debug line.
//
// 2. src/javascript/functions.jsx (new file)
//    - isDigit(charCode): true when 48 <= charCode <= 57.
//    - isNumber(string): true when every char code in `string` passes isDigit.
//      NOTE(review): the loop body never runs for '', so isNumber('') is true.
//
// 3. src/javascript/models/AppState.jsx
//    - Imports DetectFootnotes and inserts `new DetectFootnotes()` into the
//      transformation list between CombineSameY and RemoveRepetitiveElements.
//
// 4. src/javascript/models/transformations/DetectFootnotes.jsx (new file)
//    - transform(pages): walks each page's textItems with a running counter
//      (nextFooterNumber, starting at 1) that is shared across all pages.
//      * An item becomes the candidate (potentialFootnoteItem) when its text is
//        all digits, parseInt(text) == nextFooterNumber, it is neither the first
//        nor the last item on the page, and the items before and after it have
//        different y values. The candidate is held back, not pushed.
//      * On the following item, if `candidate.y - item.y < item.height`, the
//        candidate and the item are both pushed carrying the 'removed'
//        annotation, followed by a new merged TextItem with text
//        '[' + candidate.text + '] ' + item.text and a 'footnote' annotation;
//        the counter is then incremented. The candidate is reset to null
//        either way.
//      * Returns new page objects ({...page, textItems: newTextItems}).
//    - processAnnotations(pages): drops items annotated 'removed' and clears
//      the annotation on the remaining items, mutating the pages in place.
//    NOTE(review): when the y-distance check fails, neither the held-back
//      candidate nor the current item is pushed to newTextItems, so both look
//      like they are silently dropped from the page -- confirm; probably both
//      should be pushed unchanged in that case.
//    NOTE(review): parseInt is called without a radix and compared with `==`.
//    NOTE(review): one removedAnnotation instance is shared by every removed
//      item, and the input items are mutated when it is assigned.
//
// 5. src/javascript/models/transformations/RemoveRepetitiveElements.jsx
//    - hashCodeIgnoringNumbers now calls the imported isDigit instead of its
//      inline `charCode >= 48 && charCode <= 57` check; the unused local
//      `isNumber` is removed from the var list.
diff --git a/src/javascript/components/LoadingView.jsx b/src/javascript/components/LoadingView.jsx index cefc99c..ac655da 100644 --- a/src/javascript/components/LoadingView.jsx +++ b/src/javascript/components/LoadingView.jsx @@ -53,14 +53,21 @@ export default class LoadingView extends React.Component { anounceInitialParseFunction(pdfPages); for (var j = 1; j <= numPages; j++) { pdfDocument.getPage(j).then(function(page) { + var scale = 1.0; + var viewport = page.getViewport(scale); page.getTextContent().then(function(textContent) { - // console.debug(textContent); const textItems = textContent.items.map(function(item) { + var tx = PDFJS.Util.transform( // eslint-disable-line no-undef + viewport.transform, + item.transform + ); + var fontHeight = Math.sqrt((tx[2] * tx[2]) + (tx[3] * tx[3])); + return new TextItem({ x: item.transform[4], y: item.transform[5], width: item.width, - height: item.height, + height: item.height / fontHeight, text: item.str }); }); diff --git a/src/javascript/functions.jsx b/src/javascript/functions.jsx new file mode 100644 index 0000000..bb19aee --- /dev/null +++ b/src/javascript/functions.jsx @@ -0,0 +1,13 @@ +export function isDigit(charCode) { + return charCode >= 48 && charCode <= 57; +} + +export function isNumber(string) { + for (var i = 0; i < string.length; i++) { + const charCode = string.charCodeAt(i); + if (!isDigit(charCode)) { + return false; + } + } + return true; +} \ No newline at end of file diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index 7436a04..8569125 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -3,6 +3,7 @@ import { Enum } from 'enumify'; import NoOp from './transformations/NoOp.jsx'; import RoundCoordinates from './transformations/RoundCoordinates.jsx'; import CombineSameY from './transformations/CombineSameY.jsx'; +import DetectFootnotes from './transformations/DetectFootnotes.jsx' import RemoveRepetitiveElements from 
'./transformations/RemoveRepetitiveElements.jsx' import ToTextPages from './transformations/ToTextPages.jsx'; import ToSingleTextPage from './transformations/ToSingleTextPage.jsx' @@ -19,6 +20,7 @@ export default class AppState { new NoOp, new RoundCoordinates(), new CombineSameY(), + new DetectFootnotes(), new RemoveRepetitiveElements(), new ToTextPages(), new ToSingleTextPage()]; diff --git a/src/javascript/models/transformations/DetectFootnotes.jsx b/src/javascript/models/transformations/DetectFootnotes.jsx new file mode 100644 index 0000000..e307361 --- /dev/null +++ b/src/javascript/models/transformations/DetectFootnotes.jsx @@ -0,0 +1,75 @@ +import Transformation from './Transformation.jsx'; +import TextItem from '../TextItem.jsx'; +import PdfPage from '../PdfPage.jsx'; +import ContentView from '../ContentView.jsx'; +import Annotation from '../Annotation.jsx'; + +import { isNumber } from '../../functions.jsx' + +export default class DetectFootnotes extends Transformation { + + constructor() { + super("Detect Footnotes"); + } + + contentView() { + return ContentView.PDF; + } + + transform(pages:PdfPage[]) { + + var nextFooterNumber = 1; + var potentialFootnoteItem; + + const removedAnnotation = new Annotation({ + category: 'removed', + color: 'red' + }); + + return pages.map(page => { + const newTextItems = []; + for (var i = 0; i < page.textItems.length; i++) { + const item = page.textItems[i]; + if (potentialFootnoteItem) { + if (potentialFootnoteItem.y - item.y < item.height) { + potentialFootnoteItem.annotation = removedAnnotation; + item.annotation = removedAnnotation; + newTextItems.push(potentialFootnoteItem); + newTextItems.push(item); + newTextItems.push(new TextItem({ + x: potentialFootnoteItem.x, + y: item.y, + width: potentialFootnoteItem.width + item.width, + height: item.height, + text: '[' + potentialFootnoteItem.text + '] ' + item.text, + annotation: new Annotation({ + category: 'footnote', + color: 'green' + }) + })); + //TODO respect 
multiline!! + nextFooterNumber++; + } + potentialFootnoteItem = null; + } else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) { + potentialFootnoteItem = item; + } else { + newTextItems.push(item); + } + } + return { + ...page, + textItems: newTextItems + }; + }); + } + + processAnnotations(pages:PdfPage[]) { + pages.forEach(page => { + page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed'); + page.textItems.forEach(textItem => textItem.annotation = null) + }); + return pages; + } + +} \ No newline at end of file diff --git a/src/javascript/models/transformations/RemoveRepetitiveElements.jsx b/src/javascript/models/transformations/RemoveRepetitiveElements.jsx index 8e84bb4..c7a34af 100644 --- a/src/javascript/models/transformations/RemoveRepetitiveElements.jsx +++ b/src/javascript/models/transformations/RemoveRepetitiveElements.jsx @@ -3,14 +3,15 @@ import PdfPage from '../PdfPage.jsx'; import ContentView from '../ContentView.jsx'; import Annotation from '../Annotation.jsx'; +import { isDigit } from '../../functions.jsx' + function hashCodeIgnoringNumbers(string) { - var hash = 0, i, charCode, len, isNumber; + var hash = 0, i, charCode, len; if (string.length === 0) return hash; for (i = 0, len = string.length; i < len; i++) { charCode = string.charCodeAt(i); - isNumber = charCode >= 48 && charCode <= 57; - if (!isNumber) { + if (!isDigit(charCode)) { hash = ((hash << 5) - hash) + charCode; hash |= 0; // Convert to 32bit integer }