mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-22 15:53:34 +01:00
Detect Footnotes
- not yet converted in MD - detection should be same as old version
This commit is contained in:
parent
fab5d4649c
commit
5daa8aa45a
@ -16,6 +16,10 @@ The interesting thing is that rendering with pdfjs (online) looks good. So maybe
|
||||
- multiline headlines: [WoodUp](examples/WoodUp.pdf)
|
||||
- Detecting list of figures (and creating headlines) [Achieving-The-Paris-Climate-Agreement](Achieving-The-Paris-Climate-Agreement.pdf)
|
||||
|
||||
# Footnotes
|
||||
|
||||
- multiline footnotes (compressed.tracemonkey-pldi-09.pdf)
|
||||
|
||||
## Not yet reviewed test PDFS
|
||||
|
||||
# Achieving-The-Paris-Climate-Agreement.pdf
|
||||
|
@ -35,6 +35,8 @@ export default class Debugger {
|
||||
stageResult(stageIndex: number): StageResult {
|
||||
for (let idx = 0; idx < stageIndex + 1; idx++) {
|
||||
if (!this.stageResultCache[idx]) {
|
||||
console.log(this.transformers[idx - 1].name);
|
||||
|
||||
const evaluations = new EvaluationTracker();
|
||||
const transformer = this.transformers[idx - 1];
|
||||
const previousStageResult: StageResult = this.stageResultCache[idx - 1];
|
||||
|
15
src/Item.ts
15
src/Item.ts
@ -1,32 +1,41 @@
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import { TokenType } from './token-types';
|
||||
|
||||
/**
 * A single parsed item: the page it belongs to, a bag of column values, a
 * uuid, and the token types attached to it.
 *
 * The `with*` methods never modify the receiver; each returns a new Item
 * carrying the requested change.
 */
export default class Item {
  page: number;
  /** Column name to value mapping, read through {@link Item.value}. */
  data: object;
  uuid: string;
  tokenTypes: TokenType[] = [];

  /**
   * @param page the page this item belongs to
   * @param data the column values of the item
   * @param tokenTypes token types attached to the item (defaults to none)
   * @param uuid identifier of the item (defaults to a freshly generated v4 uuid)
   */
  constructor(page: number, data: object, tokenTypes: TokenType[] = [], uuid: string = uuidv4()) {
    this.page = page;
    this.data = data;
    this.uuid = uuid;
    this.tokenTypes = tokenTypes;
  }

  /**
   * @param column name of the column to read
   * @returns whatever is stored in `data` under that column
   */
  value(column: string): object {
    return this.data[column];
  }

  /**
   * @param tokenType the token type to append
   * @returns a new item with the same page, data and uuid whose token types
   *          are this item's token types followed by `tokenType`
   */
  withTokenType(tokenType: TokenType): Item {
    // Build a fresh array: handing `this.tokenTypes` to the new item and pushing
    // onto it would also change this item, which makes the before/after token
    // types indistinguishable for anything comparing input and output items.
    return new Item(this.page, this.data, [...this.tokenTypes, tokenType], this.uuid);
  }

  /**
   * @param data columns to add; on a name clash the given value wins
   * @returns a new item whose data is this item's data merged with `data`
   */
  withDataAddition(data: object): Item {
    return this.withData({ ...this.data, ...data });
  }

  /**
   * @param data the complete replacement data
   * @returns a new item with the given data and this item's page, token types and uuid
   */
  withData(data: object): Item {
    return new Item(this.page, data, this.tokenTypes, this.uuid);
  }

  /**
   * Returns the item without a uuid.
   */
  withoutUuid(): Item {
    return new Item(this.page, this.data, this.tokenTypes, '');
  }
}
|
||||
|
@ -55,6 +55,9 @@ function detectPageChanges(tracker: ChangeTracker, inputItems: Item[], outputIte
|
||||
if ((typesInInput || typesInOutput) && !arraysEqual(typesInInput, typesInOutput)) {
|
||||
tracker.trackContentChange(inputItem);
|
||||
}
|
||||
if (!arraysEqual(inputItem.tokenTypes, outputItems[positionInOutput].tokenTypes)) {
|
||||
tracker.trackContentChange(inputItem);
|
||||
}
|
||||
} else {
|
||||
// Handle items from the output which aren't in the input array
|
||||
for (let intermediateOutputIdx = outputIndex; intermediateOutputIdx < positionInOutput; intermediateOutputIdx++) {
|
||||
|
@ -18,6 +18,7 @@ import {type ParseConfig } from './parse';
|
||||
import DetectListItems from './transformer/DetectListItems';
|
||||
import DetectBlocks from './transformer/DetectBlocks';
|
||||
import DetectListLevels from './transformer/DetectListLevels';
|
||||
import DetectFootnotes from './transformer/DetectFootnotes';
|
||||
|
||||
export const transformers = [
|
||||
new AdjustHeight(),
|
||||
@ -27,6 +28,7 @@ export const transformers = [
|
||||
new CompactLines(),
|
||||
new SortXWithinLines(),
|
||||
new RemoveRepetitiveItems(),
|
||||
new DetectFootnotes(),
|
||||
new DetectToc(),
|
||||
new DetectHeaders(),
|
||||
new DetectListItems(),
|
||||
|
@ -59,3 +59,13 @@ export function isListItem(value: string) {
|
||||
/**
 * Tells whether a text starts like an entry of a numbered list: optional
 * leading whitespace, optional digits, a dot, and then either whitespace or
 * the end of the text.
 *
 * @param value the text to inspect
 * @returns true when the start of `value` matches that shape
 */
export function isNumberedListItem(value: string) {
  // The literal is created anew on every call, so the `g` flag cannot carry
  // a stale lastIndex from one invocation into the next.
  const numberedPrefix = /^[\s]*\d*\.(?:\s|$)/g;
  return numberedPrefix.test(value);
}
|
||||
|
||||
/**
 * Tells whether a text is made up of digits only, judged per UTF-16 code
 * unit by `isDigit`.
 *
 * Note that the empty text contains no non-digit and therefore yields true.
 *
 * @param value the text to inspect
 * @returns false as soon as one code unit is not a digit, true otherwise
 */
export function isNumber(value: string) {
  return value.split('').every((unit) => isDigit(unit.charCodeAt(0)));
}
|
||||
|
2
src/token-types.ts
Normal file
2
src/token-types.ts
Normal file
@ -0,0 +1,2 @@
|
||||
/** The font-related token types, 'BOLD' and 'OBLIQUE'; included in the TokenType union. */
export type FontType = 'BOLD' | 'OBLIQUE';
|
||||
/** The tags an Item can carry in its `tokenTypes` list: link and footnote markers plus every FontType. */
export type TokenType = 'LINK' | 'FOOTNOTE' | 'FOOTNOTE_LINK' | FontType;
|
@ -90,7 +90,6 @@ export default class CalculateStatistics extends ItemTransformer {
|
||||
maxHeightFont = itemFont;
|
||||
}
|
||||
});
|
||||
// TODO really need parseInt here ?
|
||||
const mostUsedHeight = to2DigitDecimalFromString(getMostUsedKey(heightToOccurrence));
|
||||
|
||||
const mostUsedFont = getMostUsedKey(fontToOccurrence);
|
||||
@ -103,7 +102,7 @@ export default class CalculateStatistics extends ItemTransformer {
|
||||
|
||||
let page = -1;
|
||||
let lastItemOfMostUsedHeight: Item | undefined;
|
||||
items.forEach((item, i) => {
|
||||
items.forEach((item) => {
|
||||
if (item.page !== page) lastItemOfMostUsedHeight = undefined;
|
||||
const itemHeight = to2DigitDecimalFromString(item.data['height']);
|
||||
const itemText = item.data['str'];
|
||||
|
84
src/transformer/DetectFootnotes.ts
Normal file
84
src/transformer/DetectFootnotes.ts
Normal file
@ -0,0 +1,84 @@
|
||||
import Item from '../Item';
|
||||
import ItemResult from '../ItemResult';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
import LineItemMerger from '../debug/LineItemMerger';
|
||||
import { groupByLine } from '../support/groupingUtils';
|
||||
import { isNumber } from '../support/stringFunctions';
|
||||
|
||||
/**
 * Transformer which tags purely numeric items as either a footnote link
 * (the reference inside the text) or a footnote (the number leading the
 * note itself). All other items pass through unchanged.
 */
export default class DetectFootnotes extends ItemTransformer {
  constructor() {
    super(
      'Detect Footnotes',
      'Detect footnotes in text and link them to the references',
      {
        requireColumns: ['str', 'y'],
        debug: {
          itemMerger: new LineItemMerger(false),
        },
      },
      // Output schema: the incoming columns with 'token types' inserted right before 'x'.
      (incomingSchema) => {
        return incomingSchema.reduce((schema, column) => {
          if (column === 'x') {
            return [...schema, 'token types', 'x'];
          }
          return [...schema, column];
        }, new Array<string>());
      },
    );
  }

  /**
   * Classifies every numeric item line by line:
   * - text before it on the line and a larger `y` than the line's first
   *   item: footnote link (presumably a raised number; confirm the
   *   direction of the y axis),
   * - otherwise, text after it on the line: footnote.
   *
   * @param context the transform context (not used by this transformer)
   * @param inputItems the items to inspect
   * @returns the items, with detected ones replaced by copies carrying the
   *          'FOOTNOTE_LINK' or 'FOOTNOTE' token type, plus a summary message
   */
  transform(context: TransformContext, inputItems: Item[]): ItemResult {
    const footnoteLinks = new Set<string>();
    const footnotes = new Set<string>();

    groupByLine(inputItems).forEach((lineItems) => {
      const firstY = lineItems[0].data['y'];
      lineItems.forEach((item, lineIndex) => {
        const itemText = item.data['str'].trim();
        // Whitespace-only items trim to '' which isNumber() accepts vacuously;
        // they are no footnote numbers and must not be tagged.
        if (itemText.length === 0 || !isNumber(itemText)) {
          return;
        }
        const itemY = item.data['y'];
        if (hasPreceedingText(lineItems, lineIndex) && itemY > firstY) {
          footnoteLinks.add(item.uuid);
        } else if (isFollowedByText(lineItems, lineIndex)) {
          footnotes.add(item.uuid);
        }
      });
    });

    return {
      items: inputItems.map((item) => {
        if (footnoteLinks.has(item.uuid)) {
          return item.withTokenType('FOOTNOTE_LINK');
        }
        if (footnotes.has(item.uuid)) {
          return item.withTokenType('FOOTNOTE');
        }
        return item;
      }),
      messages: [`Detected ${footnoteLinks.size}/${footnotes.size} footnotes.`],
    };
  }
}
|
||||
/**
 * Looks backwards from an item, nearest neighbour first, for an item of the
 * same line whose trimmed text is not a number.
 *
 * @param lineItems the items of one line
 * @param lineIndex position of the item in question within `lineItems`
 * @returns true when such an item exists before `lineIndex`
 */
function hasPreceedingText(lineItems: Item[], lineIndex: number) {
  let cursor = lineIndex;
  while (cursor-- > 0) {
    const text: string = lineItems[cursor].data['str'].trim();
    if (!isNumber(text)) return true;
  }
  return false;
}
|
||||
|
||||
/**
 * Looks forwards from an item for an item of the same line whose trimmed
 * text is not a number.
 *
 * @param lineItems the items of one line
 * @param lineIndex position of the item in question within `lineItems`
 * @returns true when such an item exists after `lineIndex`
 */
function isFollowedByText(lineItems: Item[], lineIndex: number) {
  const trailingItems = lineItems.slice(lineIndex + 1);
  return trailingItems.some((candidate) => {
    const text: string = candidate.data['str'].trim();
    return !isNumber(text);
  });
}
|
@ -12,7 +12,6 @@ import { HeadlineType, TextType, isHeadline, toHeadlineType } from '../text-type
|
||||
|
||||
const config = {
|
||||
// How much taller a text must be to be a headline (relative to mostUsedHeight)
|
||||
// TODO sync with DetectHeadline ??
|
||||
minHeadlineDistance: 1.3,
|
||||
};
|
||||
|
||||
|
@ -5,6 +5,9 @@ import TransformContext from './TransformContext';
|
||||
import LineItemMerger from '../debug/LineItemMerger';
|
||||
import { transformGroupedByPageAndLine } from '../support/groupingUtils';
|
||||
|
||||
/**
|
||||
* We can't trust the order of occurrence, esp. footnote links tend to come last
|
||||
*/
|
||||
export default class SortXWithinLines extends ItemTransformer {
|
||||
constructor() {
|
||||
super('Sort by X', 'Sorts the items of a line by the x coordinate', {
|
||||
|
Loading…
Reference in New Issue
Block a user