Detect footnotes transformation

* transform height with fontHeight
This commit is contained in:
Johannes Zillmann 2017-02-03 12:36:56 +01:00
parent c2289421e4
commit 91087d550b
5 changed files with 103 additions and 5 deletions

View File

@ -53,14 +53,21 @@ export default class LoadingView extends React.Component {
anounceInitialParseFunction(pdfPages);
for (var j = 1; j <= numPages; j++) {
pdfDocument.getPage(j).then(function(page) {
var scale = 1.0;
var viewport = page.getViewport(scale);
page.getTextContent().then(function(textContent) {
// console.debug(textContent);
const textItems = textContent.items.map(function(item) {
var tx = PDFJS.Util.transform( // eslint-disable-line no-undef
viewport.transform,
item.transform
);
var fontHeight = Math.sqrt((tx[2] * tx[2]) + (tx[3] * tx[3]));
return new TextItem({
x: item.transform[4],
y: item.transform[5],
width: item.width,
height: item.height,
height: item.height / fontHeight,
text: item.str
});
});

View File

@ -0,0 +1,13 @@
/**
 * Tells whether a character code denotes an ASCII decimal digit.
 *
 * @param charCode the character code to inspect (e.g. from String.prototype.charCodeAt)
 * @returns true if the code lies in the range 48 ('0') to 57 ('9'), both inclusive
 */
export function isDigit(charCode) {
    const codeOfZero = 48; // '0'
    const codeOfNine = 57; // '9'
    if (charCode < codeOfZero) {
        return false;
    }
    return charCode <= codeOfNine;
}
/**
 * Tells whether a string consists solely of ASCII decimal digits ('0'-'9').
 *
 * Signs, decimal separators, whitespace and any other characters make the
 * check fail. The empty string is not considered a number.
 *
 * @param string the text to inspect
 * @returns true if the string is non-empty and every character is a digit
 */
export function isNumber(string) {
    // Without this guard the loop below would never run for '' and the
    // function would vacuously report an empty string as a number.
    if (string.length === 0) {
        return false;
    }
    for (let i = 0; i < string.length; i++) {
        if (!isDigit(string.charCodeAt(i))) {
            return false;
        }
    }
    return true;
}

View File

@ -3,6 +3,7 @@ import { Enum } from 'enumify';
import NoOp from './transformations/NoOp.jsx';
import RoundCoordinates from './transformations/RoundCoordinates.jsx';
import CombineSameY from './transformations/CombineSameY.jsx';
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import ToTextPages from './transformations/ToTextPages.jsx';
import ToSingleTextPage from './transformations/ToSingleTextPage.jsx'
@ -19,6 +20,7 @@ export default class AppState {
new NoOp,
new RoundCoordinates(),
new CombineSameY(),
new DetectFootnotes(),
new RemoveRepetitiveElements(),
new ToTextPages(),
new ToSingleTextPage()];

View File

@ -0,0 +1,75 @@
import Transformation from './Transformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ContentView from '../ContentView.jsx';
import Annotation from '../Annotation.jsx';
import { isNumber } from '../../functions.jsx'
export default class DetectFootnotes extends Transformation {
constructor() {
super("Detect Footnotes");
}
contentView() {
return ContentView.PDF;
}
transform(pages:PdfPage[]) {
var nextFooterNumber = 1;
var potentialFootnoteItem;
const removedAnnotation = new Annotation({
category: 'removed',
color: 'red'
});
return pages.map(page => {
const newTextItems = [];
for (var i = 0; i < page.textItems.length; i++) {
const item = page.textItems[i];
if (potentialFootnoteItem) {
if (potentialFootnoteItem.y - item.y < item.height) {
potentialFootnoteItem.annotation = removedAnnotation;
item.annotation = removedAnnotation;
newTextItems.push(potentialFootnoteItem);
newTextItems.push(item);
newTextItems.push(new TextItem({
x: potentialFootnoteItem.x,
y: item.y,
width: potentialFootnoteItem.width + item.width,
height: item.height,
text: '[' + potentialFootnoteItem.text + '] ' + item.text,
annotation: new Annotation({
category: 'footnote',
color: 'green'
})
}));
//TODO repsect multiline!!
nextFooterNumber++;
}
potentialFootnoteItem = null;
} else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
potentialFootnoteItem = item;
} else {
newTextItems.push(item);
}
}
return {
...page,
textItems: newTextItems
};
});
}
processAnnotations(pages:PdfPage[]) {
pages.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
page.textItems.forEach(textItem => textItem.annotation = null)
});
return pages;
}
}

View File

@ -3,14 +3,15 @@ import PdfPage from '../PdfPage.jsx';
import ContentView from '../ContentView.jsx';
import Annotation from '../Annotation.jsx';
import { isDigit } from '../../functions.jsx'
function hashCodeIgnoringNumbers(string) {
var hash = 0, i, charCode, len, isNumber;
var hash = 0, i, charCode, len;
if (string.length === 0) return hash;
for (i = 0, len = string.length; i < len; i++) {
charCode = string.charCodeAt(i);
isNumber = charCode >= 48 && charCode <= 57;
if (!isNumber) {
if (!isDigit(charCode)) {
hash = ((hash << 5) - hash) + charCode;
hash |= 0; // Convert to 32bit integer
}