mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-23 00:03:39 +01:00
Detect Footnotes
- not yet converted in MD - detection should be same as old version
This commit is contained in:
parent
fab5d4649c
commit
5daa8aa45a
@ -16,6 +16,10 @@ The interesting thing is that rendering with pdfjs (online) looks good. So maybe
|
|||||||
- multiline headlines: [WoodUp](examples/WoodUp.pdf)
|
- multiline headlines: [WoodUp](examples/WoodUp.pdf)
|
||||||
- Detecting list of figures (and creating headlines) [Achieving-The-Paris-Climate-Agreement](Achieving-The-Paris-Climate-Agreement.pdf)
|
- Detecting list of figures (and creating headlines) [Achieving-The-Paris-Climate-Agreement](Achieving-The-Paris-Climate-Agreement.pdf)
|
||||||
|
|
||||||
|
# Footnotes
|
||||||
|
|
||||||
|
- multiline foot notes (compressed.tracemonkey-pldi-09.pdf)
|
||||||
|
|
||||||
## Not yet reviewed test PDFS
|
## Not yet reviewed test PDFS
|
||||||
|
|
||||||
# Achieving-The-Paris-Climate-Agreement.pdf
|
# Achieving-The-Paris-Climate-Agreement.pdf
|
||||||
|
@ -35,6 +35,8 @@ export default class Debugger {
|
|||||||
stageResult(stageIndex: number): StageResult {
|
stageResult(stageIndex: number): StageResult {
|
||||||
for (let idx = 0; idx < stageIndex + 1; idx++) {
|
for (let idx = 0; idx < stageIndex + 1; idx++) {
|
||||||
if (!this.stageResultCache[idx]) {
|
if (!this.stageResultCache[idx]) {
|
||||||
|
console.log(this.transformers[idx - 1].name);
|
||||||
|
|
||||||
const evaluations = new EvaluationTracker();
|
const evaluations = new EvaluationTracker();
|
||||||
const transformer = this.transformers[idx - 1];
|
const transformer = this.transformers[idx - 1];
|
||||||
const previousStageResult: StageResult = this.stageResultCache[idx - 1];
|
const previousStageResult: StageResult = this.stageResultCache[idx - 1];
|
||||||
|
15
src/Item.ts
15
src/Item.ts
@ -1,32 +1,41 @@
|
|||||||
import { v4 as uuidv4 } from 'uuid';
|
import { v4 as uuidv4 } from 'uuid';
|
||||||
|
import { TokenType } from './token-types';
|
||||||
|
|
||||||
/**
 * A single parsed element together with its page, its column data, a uuid and
 * the token types attached to it.
 *
 * All `with…` methods return a new `Item` and leave the receiver untouched.
 */
export default class Item {
  page: number;
  data: object;
  uuid: string;
  tokenTypes: TokenType[] = [];

  /**
   * @param page the page value stored on the item
   * @param data column name → value record
   * @param tokenTypes token types attached to the item (stored by reference, not copied)
   * @param uuid identifier of the item; a fresh v4 uuid is generated when omitted
   */
  constructor(page: number, data: object, tokenTypes: TokenType[] = [], uuid: string = uuidv4()) {
    this.page = page;
    this.data = data;
    this.uuid = uuid;
    this.tokenTypes = tokenTypes;
  }

  /**
   * Looks up the value stored under `column` in the item's data.
   */
  value(column: string): object {
    const record = this.data as Record<string, object>;
    return record[column];
  }

  /**
   * Returns a copy of this item (same page, data and uuid) with `tokenType` appended.
   *
   * The token type list is rebuilt as a new array: handing `this.tokenTypes`
   * to the copy and pushing onto it would also change the receiver, because
   * both items would then share one array.
   */
  withTokenType(tokenType: TokenType): Item {
    return new Item(this.page, this.data, [...this.tokenTypes, tokenType], this.uuid);
  }

  /**
   * Returns a copy whose data is the current data merged with `data`
   * (keys in `data` win on conflict).
   */
  withDataAddition(data: object): Item {
    return this.withData({ ...this.data, ...data });
  }

  /**
   * Returns a copy with the data replaced by `data`; page, token types and uuid are kept.
   */
  withData(data: object): Item {
    return new Item(this.page, data, this.tokenTypes, this.uuid);
  }

  /**
   * Returns the item without a uuid.
   */
  withoutUuid(): Item {
    return new Item(this.page, this.data, this.tokenTypes, '');
  }
}
|
||||||
|
@ -55,6 +55,9 @@ function detectPageChanges(tracker: ChangeTracker, inputItems: Item[], outputIte
|
|||||||
if ((typesInInput || typesInOutput) && !arraysEqual(typesInInput, typesInOutput)) {
|
if ((typesInInput || typesInOutput) && !arraysEqual(typesInInput, typesInOutput)) {
|
||||||
tracker.trackContentChange(inputItem);
|
tracker.trackContentChange(inputItem);
|
||||||
}
|
}
|
||||||
|
if (!arraysEqual(inputItem.tokenTypes, outputItems[positionInOutput].tokenTypes)) {
|
||||||
|
tracker.trackContentChange(inputItem);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// Handle items from the output which aren't in the input array
|
// Handle items from the output which aren't in the input array
|
||||||
for (let intermediateOutputIdx = outputIndex; intermediateOutputIdx < positionInOutput; intermediateOutputIdx++) {
|
for (let intermediateOutputIdx = outputIndex; intermediateOutputIdx < positionInOutput; intermediateOutputIdx++) {
|
||||||
|
@ -18,6 +18,7 @@ import {type ParseConfig } from './parse';
|
|||||||
import DetectListItems from './transformer/DetectListItems';
|
import DetectListItems from './transformer/DetectListItems';
|
||||||
import DetectBlocks from './transformer/DetectBlocks';
|
import DetectBlocks from './transformer/DetectBlocks';
|
||||||
import DetectListLevels from './transformer/DetectListLevels';
|
import DetectListLevels from './transformer/DetectListLevels';
|
||||||
|
import DetectFootnotes from './transformer/DetectFootnotes';
|
||||||
|
|
||||||
export const transformers = [
|
export const transformers = [
|
||||||
new AdjustHeight(),
|
new AdjustHeight(),
|
||||||
@ -27,6 +28,7 @@ export const transformers = [
|
|||||||
new CompactLines(),
|
new CompactLines(),
|
||||||
new SortXWithinLines(),
|
new SortXWithinLines(),
|
||||||
new RemoveRepetitiveItems(),
|
new RemoveRepetitiveItems(),
|
||||||
|
new DetectFootnotes(),
|
||||||
new DetectToc(),
|
new DetectToc(),
|
||||||
new DetectHeaders(),
|
new DetectHeaders(),
|
||||||
new DetectListItems(),
|
new DetectListItems(),
|
||||||
|
@ -59,3 +59,13 @@ export function isListItem(value: string) {
|
|||||||
/**
 * Tells whether `value` starts like a numbered list entry: optional leading
 * whitespace, zero or more digits, a dot, and then either whitespace or the
 * end of the string.
 */
export function isNumberedListItem(value: string) {
  // The pattern is created per call, so the global flag carries no state between calls.
  const numberedPrefix = /^[\s]*\d*\.(?:\s|$)/g;
  return numberedPrefix.test(value);
}
|
||||||
|
|
||||||
|
/**
 * Returns true when every UTF-16 code unit of `value` is accepted by `isDigit`.
 *
 * Note that an empty string has no code units to reject and therefore
 * yields true as well.
 */
export function isNumber(value: string) {
  let cursor = 0;
  while (cursor < value.length) {
    if (!isDigit(value.charCodeAt(cursor))) {
      return false;
    }
    cursor += 1;
  }
  return true;
}
|
||||||
|
2
src/token-types.ts
Normal file
2
src/token-types.ts
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
export type FontType = 'BOLD' | 'OBLIQUE';
|
||||||
|
export type TokenType = 'LINK' | 'FOOTNOTE' | 'FOOTNOTE_LINK' | FontType;
|
@ -90,7 +90,6 @@ export default class CalculateStatistics extends ItemTransformer {
|
|||||||
maxHeightFont = itemFont;
|
maxHeightFont = itemFont;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
// TODO really need parseInt here ?
|
|
||||||
const mostUsedHeight = to2DigitDecimalFromString(getMostUsedKey(heightToOccurrence));
|
const mostUsedHeight = to2DigitDecimalFromString(getMostUsedKey(heightToOccurrence));
|
||||||
|
|
||||||
const mostUsedFont = getMostUsedKey(fontToOccurrence);
|
const mostUsedFont = getMostUsedKey(fontToOccurrence);
|
||||||
@ -103,7 +102,7 @@ export default class CalculateStatistics extends ItemTransformer {
|
|||||||
|
|
||||||
let page = -1;
|
let page = -1;
|
||||||
let lastItemOfMostUsedHeight: Item | undefined;
|
let lastItemOfMostUsedHeight: Item | undefined;
|
||||||
items.forEach((item, i) => {
|
items.forEach((item) => {
|
||||||
if (item.page !== page) lastItemOfMostUsedHeight = undefined;
|
if (item.page !== page) lastItemOfMostUsedHeight = undefined;
|
||||||
const itemHeight = to2DigitDecimalFromString(item.data['height']);
|
const itemHeight = to2DigitDecimalFromString(item.data['height']);
|
||||||
const itemText = item.data['str'];
|
const itemText = item.data['str'];
|
||||||
|
84
src/transformer/DetectFootnotes.ts
Normal file
84
src/transformer/DetectFootnotes.ts
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
import Item from '../Item';
|
||||||
|
import ItemResult from '../ItemResult';
|
||||||
|
import ItemTransformer from './ItemTransformer';
|
||||||
|
import TransformContext from './TransformContext';
|
||||||
|
import LineItemMerger from '../debug/LineItemMerger';
|
||||||
|
import { groupByLine } from '../support/groupingUtils';
|
||||||
|
import { isNumber } from '../support/stringFunctions';
|
||||||
|
|
||||||
|
/**
 * Marks purely numeric items as footnote links or footnotes by attaching the
 * `FOOTNOTE_LINK` / `FOOTNOTE` token type to them. All other items pass
 * through unchanged.
 */
export default class DetectFootnotes extends ItemTransformer {
  constructor() {
    super(
      'Detect Footnotes',
      'Detect footnotes in text and link them to the references',
      {
        requireColumns: ['str', 'y'],
        debug: {
          itemMerger: new LineItemMerger(false),
        },
      },
      // Output schema: the incoming columns with 'token types' inserted right before 'x'.
      (incomingSchema) => {
        return incomingSchema.reduce((schema, column) => {
          if (column === 'x') {
            return [...schema, 'token types', 'x'];
          }
          return [...schema, column];
        }, new Array<string>());
      },
    );
  }

  /**
   * Classifies numeric items line by line and returns the input items with
   * the detected token types attached.
   *
   * @param context transform context (not used by this transformer)
   * @param inputItems the items to inspect
   * @returns the items in their original order, plus a summary message with
   *          the number of detected footnote links and footnotes
   */
  transform(context: TransformContext, inputItems: Item[]): ItemResult {
    const footnoteLinks = new Set<string>();
    const footnotes = new Set<string>();

    groupByLine(inputItems).forEach((lineItems) => {
      const firstY = lineItems[0].data['y'];
      lineItems.forEach((item, lineIndex) => {
        const itemText = item.data['str'].trim();
        const itemY = item.data['y'];
        // isNumber() accepts the empty string, so blank items have to be
        // excluded explicitly; otherwise whitespace would be tagged as a footnote.
        if (itemText.length === 0 || !isNumber(itemText)) {
          return;
        }
        // A number after some text whose y is greater than the y of the
        // line's first item is taken as a footnote link
        // (presumably a raised number; y-axis direction is not visible here).
        if (hasPreceedingText(lineItems, lineIndex) && itemY > firstY) {
          footnoteLinks.add(item.uuid);
        } else if (isFollowedByText(lineItems, lineIndex)) {
          // Otherwise a number that is followed by text is taken as the footnote itself.
          footnotes.add(item.uuid);
        }
      });
    });

    return {
      items: inputItems.map((item) => {
        if (footnoteLinks.has(item.uuid)) {
          return item.withTokenType('FOOTNOTE_LINK');
        }
        if (footnotes.has(item.uuid)) {
          return item.withTokenType('FOOTNOTE');
        }
        return item;
      }),
      messages: [`Detected ${footnoteLinks.size}/${footnotes.size} footnotes.`],
    };
  }
}
|
||||||
|
/**
 * Walks backwards from the item before `lineIndex` to the start of the line
 * and reports whether any of those items has trimmed text that `isNumber`
 * rejects.
 */
function hasPreceedingText(lineItems: Item[], lineIndex: number) {
  let cursor = lineIndex;
  while (cursor > 0) {
    cursor -= 1;
    const candidate = lineItems[cursor].data['str'].trim() as string;
    if (!isNumber(candidate)) {
      return true;
    }
  }
  return false;
}
|
||||||
|
|
||||||
|
/**
 * Reports whether any item after `lineIndex` on the same line has trimmed
 * text that `isNumber` rejects.
 */
function isFollowedByText(lineItems: Item[], lineIndex: number) {
  const trailingItems = lineItems.slice(lineIndex + 1);
  return trailingItems.some((trailingItem) => {
    const candidate = trailingItem.data['str'].trim() as string;
    return !isNumber(candidate);
  });
}
|
@ -12,7 +12,6 @@ import { HeadlineType, TextType, isHeadline, toHeadlineType } from '../text-type
|
|||||||
|
|
||||||
const config = {
|
const config = {
|
||||||
// How much taller a text must be to be a headline (relative to mostUsedHeight)
|
// How much taller a text must be to be a headline (relative to mostUsedHeight)
|
||||||
// TODO sync with DetectHeadline ??
|
|
||||||
minHeadlineDistance: 1.3,
|
minHeadlineDistance: 1.3,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -5,6 +5,9 @@ import TransformContext from './TransformContext';
|
|||||||
import LineItemMerger from '../debug/LineItemMerger';
|
import LineItemMerger from '../debug/LineItemMerger';
|
||||||
import { transformGroupedByPageAndLine } from '../support/groupingUtils';
|
import { transformGroupedByPageAndLine } from '../support/groupingUtils';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* We can't trust order of occurrence, esp. footnote links like to come last
|
||||||
|
*/
|
||||||
export default class SortXWithinLines extends ItemTransformer {
|
export default class SortXWithinLines extends ItemTransformer {
|
||||||
constructor() {
|
constructor() {
|
||||||
super('Sort by X', 'Sorts the items of a line by the x coordinate', {
|
super('Sort by X', 'Sorts the items of a line by the x coordinate', {
|
||||||
|
Loading…
Reference in New Issue
Block a user