Detect Footnotes

- not yet converted in MD
- detection should be same as old version
This commit is contained in:
Johannes Zillmann 2024-04-08 19:08:26 -06:00
parent fab5d4649c
commit 5daa8aa45a
11 changed files with 123 additions and 6 deletions

View File

@ -16,6 +16,10 @@ The interesting thing is that rendering with pdfjs (online) looks good. So maybe
- multiline headlines: [WoodUp](examples/WoodUp.pdf) - multiline headlines: [WoodUp](examples/WoodUp.pdf)
- Detecting list of figures (and creating headlines) [Achieving-The-Paris-Climate-Agreement](Achieving-The-Paris-Climate-Agreement.pdf) - Detecting list of figures (and creating headlines) [Achieving-The-Paris-Climate-Agreement](Achieving-The-Paris-Climate-Agreement.pdf)
# Footnotes
- multiline footnotes (compressed.tracemonkey-pldi-09.pdf)
## Not yet reviewed test PDFS ## Not yet reviewed test PDFS
# Achieving-The-Paris-Climate-Agreement.pdf # Achieving-The-Paris-Climate-Agreement.pdf

View File

@ -35,6 +35,8 @@ export default class Debugger {
stageResult(stageIndex: number): StageResult { stageResult(stageIndex: number): StageResult {
for (let idx = 0; idx < stageIndex + 1; idx++) { for (let idx = 0; idx < stageIndex + 1; idx++) {
if (!this.stageResultCache[idx]) { if (!this.stageResultCache[idx]) {
console.log(this.transformers[idx - 1].name);
const evaluations = new EvaluationTracker(); const evaluations = new EvaluationTracker();
const transformer = this.transformers[idx - 1]; const transformer = this.transformers[idx - 1];
const previousStageResult: StageResult = this.stageResultCache[idx - 1]; const previousStageResult: StageResult = this.stageResultCache[idx - 1];

View File

@ -1,32 +1,41 @@
import { v4 as uuidv4 } from 'uuid'; import { v4 as uuidv4 } from 'uuid';
import { TokenType } from './token-types';
export default class Item { export default class Item {
page: number; page: number;
data: object; data: object;
uuid: string; uuid: string;
tokenTypes: TokenType[] = [];
constructor(page: number, data: object, uuid: string = uuidv4()) { constructor(page: number, data: object, tokenTypes: TokenType[] = [], uuid: string = uuidv4()) {
this.page = page; this.page = page;
this.data = data; this.data = data;
this.uuid = uuid; this.uuid = uuid;
this.tokenTypes = tokenTypes;
} }
value(column: string): object { value(column: string): object {
return this.data[column]; return this.data[column];
} }
/**
 * Returns a copy of this item (same page, data and uuid) that additionally
 * carries the given token type.
 *
 * The copy receives its own `tokenTypes` array. Handing `this.tokenTypes` to
 * the new item and pushing onto it would also modify this item, so the
 * original and the derived item would always report identical token types
 * and any comparison between the two could never detect the addition.
 *
 * @param tokenType the token type to append
 * @returns a new Item; this item is left unmodified
 */
withTokenType(tokenType: TokenType): Item {
  return new Item(this.page, this.data, [...this.tokenTypes, tokenType], this.uuid);
}
withDataAddition(data: object): Item { withDataAddition(data: object): Item {
return this.withData({ ...this.data, ...data }); return this.withData({ ...this.data, ...data });
} }
withData(data: object): Item { withData(data: object): Item {
return new Item(this.page, data, this.uuid); return new Item(this.page, data, this.tokenTypes, this.uuid);
} }
/** /**
* Returns the item without a uuid. * Returns the item without a uuid.
*/ */
withoutUuid(): Item { withoutUuid(): Item {
return new Item(this.page, this.data, ''); return new Item(this.page, this.data, this.tokenTypes, '');
} }
} }

View File

@ -55,6 +55,9 @@ function detectPageChanges(tracker: ChangeTracker, inputItems: Item[], outputIte
if ((typesInInput || typesInOutput) && !arraysEqual(typesInInput, typesInOutput)) { if ((typesInInput || typesInOutput) && !arraysEqual(typesInInput, typesInOutput)) {
tracker.trackContentChange(inputItem); tracker.trackContentChange(inputItem);
} }
if (!arraysEqual(inputItem.tokenTypes, outputItems[positionInOutput].tokenTypes)) {
tracker.trackContentChange(inputItem);
}
} else { } else {
// Handle items from the output which aren't in the input array // Handle items from the output which aren't in the input array
for (let intermediateOutputIdx = outputIndex; intermediateOutputIdx < positionInOutput; intermediateOutputIdx++) { for (let intermediateOutputIdx = outputIndex; intermediateOutputIdx < positionInOutput; intermediateOutputIdx++) {

View File

@ -18,6 +18,7 @@ import {type ParseConfig } from './parse';
import DetectListItems from './transformer/DetectListItems'; import DetectListItems from './transformer/DetectListItems';
import DetectBlocks from './transformer/DetectBlocks'; import DetectBlocks from './transformer/DetectBlocks';
import DetectListLevels from './transformer/DetectListLevels'; import DetectListLevels from './transformer/DetectListLevels';
import DetectFootnotes from './transformer/DetectFootnotes';
export const transformers = [ export const transformers = [
new AdjustHeight(), new AdjustHeight(),
@ -27,6 +28,7 @@ export const transformers = [
new CompactLines(), new CompactLines(),
new SortXWithinLines(), new SortXWithinLines(),
new RemoveRepetitiveItems(), new RemoveRepetitiveItems(),
new DetectFootnotes(),
new DetectToc(), new DetectToc(),
new DetectHeaders(), new DetectHeaders(),
new DetectListItems(), new DetectListItems(),

View File

@ -59,3 +59,13 @@ export function isListItem(value: string) {
export function isNumberedListItem(value: string) { export function isNumberedListItem(value: string) {
return /^[\s]*\d*\.(?:\s|$)/g.test(value); return /^[\s]*\d*\.(?:\s|$)/g.test(value);
} }
/**
 * Tells whether every UTF-16 code unit of `value` passes `isDigit`.
 *
 * Note that the empty string yields `true`, since there is no code unit
 * that could fail the check.
 *
 * @param value the string to inspect
 * @returns `false` as soon as one code unit is not a digit, `true` otherwise
 */
export function isNumber(value: string) {
  let position = 0;
  while (position < value.length) {
    if (!isDigit(value.charCodeAt(position))) {
      return false;
    }
    position += 1;
  }
  return true;
}

2
src/token-types.ts Normal file
View File

@ -0,0 +1,2 @@
/** Font-related token types (presumably the font style of the text — confirm against the producers). */
export type FontType = 'BOLD' | 'OBLIQUE';
/**
 * Every token type that can be listed in an item's `tokenTypes`.
 * Includes all `FontType` values; 'FOOTNOTE' and 'FOOTNOTE_LINK' are the ones
 * assigned by the DetectFootnotes transformer.
 */
export type TokenType = 'LINK' | 'FOOTNOTE' | 'FOOTNOTE_LINK' | FontType;

View File

@ -90,7 +90,6 @@ export default class CalculateStatistics extends ItemTransformer {
maxHeightFont = itemFont; maxHeightFont = itemFont;
} }
}); });
// TODO really need parseInt here ?
const mostUsedHeight = to2DigitDecimalFromString(getMostUsedKey(heightToOccurrence)); const mostUsedHeight = to2DigitDecimalFromString(getMostUsedKey(heightToOccurrence));
const mostUsedFont = getMostUsedKey(fontToOccurrence); const mostUsedFont = getMostUsedKey(fontToOccurrence);
@ -103,7 +102,7 @@ export default class CalculateStatistics extends ItemTransformer {
let page = -1; let page = -1;
let lastItemOfMostUsedHeight: Item | undefined; let lastItemOfMostUsedHeight: Item | undefined;
items.forEach((item, i) => { items.forEach((item) => {
if (item.page !== page) lastItemOfMostUsedHeight = undefined; if (item.page !== page) lastItemOfMostUsedHeight = undefined;
const itemHeight = to2DigitDecimalFromString(item.data['height']); const itemHeight = to2DigitDecimalFromString(item.data['height']);
const itemText = item.data['str']; const itemText = item.data['str'];

View File

@ -0,0 +1,84 @@
import Item from '../Item';
import ItemResult from '../ItemResult';
import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext';
import LineItemMerger from '../debug/LineItemMerger';
import { groupByLine } from '../support/groupingUtils';
import { isNumber } from '../support/stringFunctions';
/**
 * Transformer that tags purely numeric items as footnote links or footnotes.
 *
 * Items are inspected line by line (as grouped by `groupByLine`). A numeric
 * item becomes
 * - a 'FOOTNOTE_LINK' when non-numeric text precedes it on the line and its
 *   `y` is greater than the `y` of the line's first item (presumably a
 *   superscript reference — confirm the direction of the y axis), or
 * - a 'FOOTNOTE' when it is not a link and non-numeric text follows it on
 *   the line.
 *
 * All other items are passed through unchanged.
 */
export default class DetectFootnotes extends ItemTransformer {
  constructor() {
    super(
      'Detect Footnotes',
      'Detect footnotes in text and link them to the references',
      {
        requireColumns: ['str', 'y'],
        debug: {
          itemMerger: new LineItemMerger(false),
        },
      },
      (incomingSchema) => {
        // Announce the new 'token types' column directly in front of 'x'.
        // A schema without an 'x' column is returned as is.
        return incomingSchema.reduce((schema, column) => {
          if (column === 'x') {
            return [...schema, 'token types', 'x'];
          }
          return [...schema, column];
        }, new Array<string>());
      },
    );
  }

  /**
   * Classifies the numeric items of every line and returns the input items
   * with the matching token type attached.
   *
   * @param context transform context (not used by this transformer)
   * @param inputItems the items to inspect
   * @returns the items in their original order, tagged where applicable, plus
   *          a message with the number of detected links and footnotes
   */
  transform(context: TransformContext, inputItems: Item[]): ItemResult {
    const footnoteLinks = new Set<string>();
    const footnotes = new Set<string>();

    groupByLine(inputItems).forEach((lineItems) => {
      const firstY = lineItems[0].data['y'];
      lineItems.forEach((item, lineIndex) => {
        const itemText = item.data['str'].trim();
        // isNumber() accepts the empty string, so a whitespace-only item
        // would otherwise be classified as a footnote (link). Skip those.
        if (itemText.length === 0 || !isNumber(itemText)) {
          return;
        }
        const itemY = item.data['y'];
        if (hasPreceedingText(lineItems, lineIndex) && itemY > firstY) {
          footnoteLinks.add(item.uuid);
        } else if (isFollowedByText(lineItems, lineIndex)) {
          footnotes.add(item.uuid);
        }
      });
    });

    return {
      items: inputItems.map((item) => {
        if (footnoteLinks.has(item.uuid)) {
          return item.withTokenType('FOOTNOTE_LINK');
        }
        if (footnotes.has(item.uuid)) {
          return item.withTokenType('FOOTNOTE');
        }
        return item;
      }),
      messages: [`Detected ${footnoteLinks.size}/${footnotes.size} footnotes.`],
    };
  }
}
/**
 * Checks whether any item in front of position `lineIndex` carries text that
 * `isNumber` rejects. The items are visited from the nearest predecessor
 * back to the start of the line.
 *
 * @param lineItems the items of one line
 * @param lineIndex position of the item under inspection
 * @returns `true` if a non-numeric predecessor exists, `false` otherwise
 */
function hasPreceedingText(lineItems: Item[], lineIndex: number) {
  let cursor = lineIndex;
  while (cursor-- > 0) {
    const candidate: string = lineItems[cursor].data['str'].trim();
    if (!isNumber(candidate)) {
      return true;
    }
  }
  return false;
}
/**
 * Checks whether any item behind position `lineIndex` carries text that
 * `isNumber` rejects. The items are visited from the nearest successor
 * to the end of the line.
 *
 * @param lineItems the items of one line
 * @param lineIndex position of the item under inspection
 * @returns `true` if a non-numeric successor exists, `false` otherwise
 */
function isFollowedByText(lineItems: Item[], lineIndex: number) {
  let cursor = lineIndex;
  while (++cursor < lineItems.length) {
    const candidate: string = lineItems[cursor].data['str'].trim();
    if (!isNumber(candidate)) {
      return true;
    }
  }
  return false;
}

View File

@ -12,7 +12,6 @@ import { HeadlineType, TextType, isHeadline, toHeadlineType } from '../text-type
const config = { const config = {
// How much taller a text must be to be a headline (relative to mostUsedHeight) // How much taller a text must be to be a headline (relative to mostUsedHeight)
// TODO sync with DetectHeadline ??
minHeadlineDistance: 1.3, minHeadlineDistance: 1.3,
}; };

View File

@ -5,6 +5,9 @@ import TransformContext from './TransformContext';
import LineItemMerger from '../debug/LineItemMerger'; import LineItemMerger from '../debug/LineItemMerger';
import { transformGroupedByPageAndLine } from '../support/groupingUtils'; import { transformGroupedByPageAndLine } from '../support/groupingUtils';
/**
* We can't trust the order of occurrence; footnote links in particular tend to come last.
*/
export default class SortXWithinLines extends ItemTransformer { export default class SortXWithinLines extends ItemTransformer {
constructor() { constructor() {
super('Sort by X', 'Sorts the items of a line by the x coordinate', { super('Sort by X', 'Sorts the items of a line by the x coordinate', {