mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-12-23 23:08:59 +01:00
Detect footnotes transformation
* transform height with fontHeight
This commit is contained in:
parent
c2289421e4
commit
91087d550b
@ -53,14 +53,21 @@ export default class LoadingView extends React.Component {
|
||||
anounceInitialParseFunction(pdfPages);
|
||||
for (var j = 1; j <= numPages; j++) {
|
||||
pdfDocument.getPage(j).then(function(page) {
|
||||
var scale = 1.0;
|
||||
var viewport = page.getViewport(scale);
|
||||
page.getTextContent().then(function(textContent) {
|
||||
// console.debug(textContent);
|
||||
const textItems = textContent.items.map(function(item) {
|
||||
var tx = PDFJS.Util.transform( // eslint-disable-line no-undef
|
||||
viewport.transform,
|
||||
item.transform
|
||||
);
|
||||
var fontHeight = Math.sqrt((tx[2] * tx[2]) + (tx[3] * tx[3]));
|
||||
|
||||
return new TextItem({
|
||||
x: item.transform[4],
|
||||
y: item.transform[5],
|
||||
width: item.width,
|
||||
height: item.height,
|
||||
height: item.height / fontHeight,
|
||||
text: item.str
|
||||
});
|
||||
});
|
||||
|
13
src/javascript/functions.jsx
Normal file
13
src/javascript/functions.jsx
Normal file
@ -0,0 +1,13 @@
|
||||
/**
 * Tells whether a UTF-16 code unit is an ASCII decimal digit ('0'-'9').
 *
 * @param {number} charCode - code unit, e.g. as returned by String.prototype.charCodeAt
 * @returns {boolean} true when charCode lies in the range 48 ('0') to 57 ('9'), inclusive
 */
export function isDigit(charCode) {
    // 48 is the code of '0' and 57 the code of '9'.
    return charCode >= 48 && charCode <= 57;
}
|
||||
|
||||
/**
 * Tells whether a string consists solely of ASCII decimal digits.
 *
 * Signs, decimal separators and whitespace are not accepted, so only
 * non-negative integer literals such as "7" or "042" qualify.
 *
 * @param {string} string - the text to inspect
 * @returns {boolean} true when the string is non-empty and every code unit is a digit
 */
export function isNumber(string) {
    // An empty string contains no digit at all, so it must not count as a number
    // (the loop below would otherwise accept it vacuously).
    if (string.length === 0) {
        return false;
    }
    for (let i = 0; i < string.length; i++) {
        if (!isDigit(string.charCodeAt(i))) {
            return false;
        }
    }
    return true;
}
|
@ -3,6 +3,7 @@ import { Enum } from 'enumify';
|
||||
import NoOp from './transformations/NoOp.jsx';
|
||||
import RoundCoordinates from './transformations/RoundCoordinates.jsx';
|
||||
import CombineSameY from './transformations/CombineSameY.jsx';
|
||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||
import ToTextPages from './transformations/ToTextPages.jsx';
|
||||
import ToSingleTextPage from './transformations/ToSingleTextPage.jsx'
|
||||
@ -19,6 +20,7 @@ export default class AppState {
|
||||
new NoOp,
|
||||
new RoundCoordinates(),
|
||||
new CombineSameY(),
|
||||
new DetectFootnotes(),
|
||||
new RemoveRepetitiveElements(),
|
||||
new ToTextPages(),
|
||||
new ToSingleTextPage()];
|
||||
|
75
src/javascript/models/transformations/DetectFootnotes.jsx
Normal file
75
src/javascript/models/transformations/DetectFootnotes.jsx
Normal file
@ -0,0 +1,75 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
import Annotation from '../Annotation.jsx';
|
||||
|
||||
import { isNumber } from '../../functions.jsx'
|
||||
|
||||
export default class DetectFootnotes extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Footnotes");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
|
||||
var nextFooterNumber = 1;
|
||||
var potentialFootnoteItem;
|
||||
|
||||
const removedAnnotation = new Annotation({
|
||||
category: 'removed',
|
||||
color: 'red'
|
||||
});
|
||||
|
||||
return pages.map(page => {
|
||||
const newTextItems = [];
|
||||
for (var i = 0; i < page.textItems.length; i++) {
|
||||
const item = page.textItems[i];
|
||||
if (potentialFootnoteItem) {
|
||||
if (potentialFootnoteItem.y - item.y < item.height) {
|
||||
potentialFootnoteItem.annotation = removedAnnotation;
|
||||
item.annotation = removedAnnotation;
|
||||
newTextItems.push(potentialFootnoteItem);
|
||||
newTextItems.push(item);
|
||||
newTextItems.push(new TextItem({
|
||||
x: potentialFootnoteItem.x,
|
||||
y: item.y,
|
||||
width: potentialFootnoteItem.width + item.width,
|
||||
height: item.height,
|
||||
text: '[' + potentialFootnoteItem.text + '] ' + item.text,
|
||||
annotation: new Annotation({
|
||||
category: 'footnote',
|
||||
color: 'green'
|
||||
})
|
||||
}));
|
||||
//TODO repsect multiline!!
|
||||
nextFooterNumber++;
|
||||
}
|
||||
potentialFootnoteItem = null;
|
||||
} else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
|
||||
potentialFootnoteItem = item;
|
||||
} else {
|
||||
newTextItems.push(item);
|
||||
}
|
||||
}
|
||||
return {
|
||||
...page,
|
||||
textItems: newTextItems
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
processAnnotations(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return pages;
|
||||
}
|
||||
|
||||
}
|
@ -3,14 +3,15 @@ import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
import Annotation from '../Annotation.jsx';
|
||||
|
||||
import { isDigit } from '../../functions.jsx'
|
||||
|
||||
|
||||
function hashCodeIgnoringNumbers(string) {
|
||||
var hash = 0, i, charCode, len, isNumber;
|
||||
var hash = 0, i, charCode, len;
|
||||
if (string.length === 0) return hash;
|
||||
for (i = 0, len = string.length; i < len; i++) {
|
||||
charCode = string.charCodeAt(i);
|
||||
isNumber = charCode >= 48 && charCode <= 57;
|
||||
if (!isNumber) {
|
||||
if (!isDigit(charCode)) {
|
||||
hash = ((hash << 5) - hash) + charCode;
|
||||
hash |= 0; // Convert to 32bit integer
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user