mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-21 10:08:03 +02:00
Detect footnotes transformation
* transform height with fontHeight
This commit is contained in:
parent
c2289421e4
commit
91087d550b
@ -53,14 +53,21 @@ export default class LoadingView extends React.Component {
|
|||||||
anounceInitialParseFunction(pdfPages);
|
anounceInitialParseFunction(pdfPages);
|
||||||
for (var j = 1; j <= numPages; j++) {
|
for (var j = 1; j <= numPages; j++) {
|
||||||
pdfDocument.getPage(j).then(function(page) {
|
pdfDocument.getPage(j).then(function(page) {
|
||||||
|
var scale = 1.0;
|
||||||
|
var viewport = page.getViewport(scale);
|
||||||
page.getTextContent().then(function(textContent) {
|
page.getTextContent().then(function(textContent) {
|
||||||
// console.debug(textContent);
|
|
||||||
const textItems = textContent.items.map(function(item) {
|
const textItems = textContent.items.map(function(item) {
|
||||||
|
var tx = PDFJS.Util.transform( // eslint-disable-line no-undef
|
||||||
|
viewport.transform,
|
||||||
|
item.transform
|
||||||
|
);
|
||||||
|
var fontHeight = Math.sqrt((tx[2] * tx[2]) + (tx[3] * tx[3]));
|
||||||
|
|
||||||
return new TextItem({
|
return new TextItem({
|
||||||
x: item.transform[4],
|
x: item.transform[4],
|
||||||
y: item.transform[5],
|
y: item.transform[5],
|
||||||
width: item.width,
|
width: item.width,
|
||||||
height: item.height,
|
height: item.height / fontHeight,
|
||||||
text: item.str
|
text: item.str
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
13
src/javascript/functions.jsx
Normal file
13
src/javascript/functions.jsx
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
/**
 * Checks whether a character code denotes one of the ASCII digits '0' to '9'.
 *
 * @param {number} charCode character code, e.g. as returned by String.prototype.charCodeAt
 * @returns {boolean} true when charCode lies within 48 ('0') and 57 ('9'), both inclusive
 */
export function isDigit(charCode) {
    const CODE_OF_ZERO = 48;
    const CODE_OF_NINE = 57;
    return CODE_OF_ZERO <= charCode && charCode <= CODE_OF_NINE;
}
|
||||||
|
|
||||||
|
/**
 * Checks whether a string consists solely of ASCII digits.
 *
 * An empty string is not considered a number: without this guard the loop
 * below would never run and the function would report true for ''.
 *
 * @param {string} string the text to inspect
 * @returns {boolean} true when the string is non-empty and every character is a digit
 */
export function isNumber(string) {
    if (string.length === 0) {
        return false;
    }
    for (let i = 0; i < string.length; i++) {
        if (!isDigit(string.charCodeAt(i))) {
            return false;
        }
    }
    return true;
}
|
@ -3,6 +3,7 @@ import { Enum } from 'enumify';
|
|||||||
import NoOp from './transformations/NoOp.jsx';
|
import NoOp from './transformations/NoOp.jsx';
|
||||||
import RoundCoordinates from './transformations/RoundCoordinates.jsx';
|
import RoundCoordinates from './transformations/RoundCoordinates.jsx';
|
||||||
import CombineSameY from './transformations/CombineSameY.jsx';
|
import CombineSameY from './transformations/CombineSameY.jsx';
|
||||||
|
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||||
import ToTextPages from './transformations/ToTextPages.jsx';
|
import ToTextPages from './transformations/ToTextPages.jsx';
|
||||||
import ToSingleTextPage from './transformations/ToSingleTextPage.jsx'
|
import ToSingleTextPage from './transformations/ToSingleTextPage.jsx'
|
||||||
@ -19,6 +20,7 @@ export default class AppState {
|
|||||||
new NoOp,
|
new NoOp,
|
||||||
new RoundCoordinates(),
|
new RoundCoordinates(),
|
||||||
new CombineSameY(),
|
new CombineSameY(),
|
||||||
|
new DetectFootnotes(),
|
||||||
new RemoveRepetitiveElements(),
|
new RemoveRepetitiveElements(),
|
||||||
new ToTextPages(),
|
new ToTextPages(),
|
||||||
new ToSingleTextPage()];
|
new ToSingleTextPage()];
|
||||||
|
75
src/javascript/models/transformations/DetectFootnotes.jsx
Normal file
75
src/javascript/models/transformations/DetectFootnotes.jsx
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
import Transformation from './Transformation.jsx';
|
||||||
|
import TextItem from '../TextItem.jsx';
|
||||||
|
import PdfPage from '../PdfPage.jsx';
|
||||||
|
import ContentView from '../ContentView.jsx';
|
||||||
|
import Annotation from '../Annotation.jsx';
|
||||||
|
|
||||||
|
import { isNumber } from '../../functions.jsx'
|
||||||
|
|
||||||
|
export default class DetectFootnotes extends Transformation {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super("Detect Footnotes");
|
||||||
|
}
|
||||||
|
|
||||||
|
contentView() {
|
||||||
|
return ContentView.PDF;
|
||||||
|
}
|
||||||
|
|
||||||
|
transform(pages:PdfPage[]) {
|
||||||
|
|
||||||
|
var nextFooterNumber = 1;
|
||||||
|
var potentialFootnoteItem;
|
||||||
|
|
||||||
|
const removedAnnotation = new Annotation({
|
||||||
|
category: 'removed',
|
||||||
|
color: 'red'
|
||||||
|
});
|
||||||
|
|
||||||
|
return pages.map(page => {
|
||||||
|
const newTextItems = [];
|
||||||
|
for (var i = 0; i < page.textItems.length; i++) {
|
||||||
|
const item = page.textItems[i];
|
||||||
|
if (potentialFootnoteItem) {
|
||||||
|
if (potentialFootnoteItem.y - item.y < item.height) {
|
||||||
|
potentialFootnoteItem.annotation = removedAnnotation;
|
||||||
|
item.annotation = removedAnnotation;
|
||||||
|
newTextItems.push(potentialFootnoteItem);
|
||||||
|
newTextItems.push(item);
|
||||||
|
newTextItems.push(new TextItem({
|
||||||
|
x: potentialFootnoteItem.x,
|
||||||
|
y: item.y,
|
||||||
|
width: potentialFootnoteItem.width + item.width,
|
||||||
|
height: item.height,
|
||||||
|
text: '[' + potentialFootnoteItem.text + '] ' + item.text,
|
||||||
|
annotation: new Annotation({
|
||||||
|
category: 'footnote',
|
||||||
|
color: 'green'
|
||||||
|
})
|
||||||
|
}));
|
||||||
|
//TODO repsect multiline!!
|
||||||
|
nextFooterNumber++;
|
||||||
|
}
|
||||||
|
potentialFootnoteItem = null;
|
||||||
|
} else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
|
||||||
|
potentialFootnoteItem = item;
|
||||||
|
} else {
|
||||||
|
newTextItems.push(item);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return {
|
||||||
|
...page,
|
||||||
|
textItems: newTextItems
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
processAnnotations(pages:PdfPage[]) {
|
||||||
|
pages.forEach(page => {
|
||||||
|
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||||
|
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||||
|
});
|
||||||
|
return pages;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -3,14 +3,15 @@ import PdfPage from '../PdfPage.jsx';
|
|||||||
import ContentView from '../ContentView.jsx';
|
import ContentView from '../ContentView.jsx';
|
||||||
import Annotation from '../Annotation.jsx';
|
import Annotation from '../Annotation.jsx';
|
||||||
|
|
||||||
|
import { isDigit } from '../../functions.jsx'
|
||||||
|
|
||||||
|
|
||||||
function hashCodeIgnoringNumbers(string) {
|
function hashCodeIgnoringNumbers(string) {
|
||||||
var hash = 0, i, charCode, len, isNumber;
|
var hash = 0, i, charCode, len;
|
||||||
if (string.length === 0) return hash;
|
if (string.length === 0) return hash;
|
||||||
for (i = 0, len = string.length; i < len; i++) {
|
for (i = 0, len = string.length; i < len; i++) {
|
||||||
charCode = string.charCodeAt(i);
|
charCode = string.charCodeAt(i);
|
||||||
isNumber = charCode >= 48 && charCode <= 57;
|
if (!isDigit(charCode)) {
|
||||||
if (!isNumber) {
|
|
||||||
hash = ((hash << 5) - hash) + charCode;
|
hash = ((hash << 5) - hash) + charCode;
|
||||||
hash |= 0; // Convert to 32bit integer
|
hash |= 0; // Convert to 32bit integer
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user