diff --git a/KNOWN_ISSUES.md b/KNOWN_ISSUES.md
index 80ff5f2..a8dd3f6 100644
--- a/KNOWN_ISSUES.md
+++ b/KNOWN_ISSUES.md
@@ -16,6 +16,10 @@ The interesting thing is that rendering with pdfjs (online) looks good. So maybe
 - multiline headlines: [WoodUp](examples/WoodUp.pdf)
 - Detecting list of figures (and creating headlines) [Achieving-The-Paris-Climate-Agreement](Achieving-The-Paris-Climate-Agreement.pdf)
 
+# Footnotes
+
+- multiline footnotes (compressed.tracemonkey-pldi-09.pdf)
+
 ## Not yet reviewed test PDFS
 
 # Achieving-The-Paris-Climate-Agreement.pdf
diff --git a/src/Item.ts b/src/Item.ts
index deb61ec..26dc828 100644
--- a/src/Item.ts
+++ b/src/Item.ts
@@ -1,32 +1,43 @@
 import { v4 as uuidv4 } from 'uuid';
+import { TokenType } from './token-types';
 
 export default class Item {
   page: number;
   data: object;
   uuid: string;
+  tokenTypes: TokenType[] = [];
 
-  constructor(page: number, data: object, uuid: string = uuidv4()) {
+  constructor(page: number, data: object, tokenTypes: TokenType[] = [], uuid: string = uuidv4()) {
     this.page = page;
     this.data = data;
     this.uuid = uuid;
+    this.tokenTypes = tokenTypes;
   }
 
   value(column: string): object {
     return this.data[column];
   }
 
+  /**
+   * Returns a copy of this item which additionally carries the given token type.
+   * The token type list is copied, so the list of this item is left untouched.
+   */
+  withTokenType(tokenType: TokenType): Item {
+    return new Item(this.page, this.data, [...this.tokenTypes, tokenType], this.uuid);
+  }
+
   withDataAddition(data: object): Item {
     return this.withData({ ...this.data, ...data });
   }
 
   withData(data: object): Item {
-    return new Item(this.page, data, this.uuid);
+    return new Item(this.page, data, this.tokenTypes, this.uuid);
   }
 
   /**
    * Returns the item without a uuid.
    */
   withoutUuid(): Item {
-    return new Item(this.page, this.data, '');
+    return new Item(this.page, this.data, this.tokenTypes, '');
   }
 }
diff --git a/src/debug/detectChanges.ts b/src/debug/detectChanges.ts
index f97be85..9b94228 100644
--- a/src/debug/detectChanges.ts
+++ b/src/debug/detectChanges.ts
@@ -55,6 +55,9 @@ function detectPageChanges(tracker: ChangeTracker, inputItems: Item[], outputIte
       if ((typesInInput || typesInOutput) && !arraysEqual(typesInInput, typesInOutput)) {
         tracker.trackContentChange(inputItem);
       }
+      if (!arraysEqual(inputItem.tokenTypes, outputItems[positionInOutput].tokenTypes)) {
+        tracker.trackContentChange(inputItem);
+      }
     } else {
       // Handle items from the output with arn't in the input array
       for (let intermediateOutputIdx = outputIndex; intermediateOutputIdx < positionInOutput; intermediateOutputIdx++) {
diff --git a/src/index.ts b/src/index.ts
index c32d83c..9f18f1d 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -18,6 +18,7 @@ import {type ParseConfig } from './parse';
 import DetectListItems from './transformer/DetectListItems';
 import DetectBlocks from './transformer/DetectBlocks';
 import DetectListLevels from './transformer/DetectListLevels';
+import DetectFootnotes from './transformer/DetectFootnotes';
 
 export const transformers = [
   new AdjustHeight(),
@@ -27,6 +28,7 @@ export const transformers = [
   new CompactLines(),
   new SortXWithinLines(),
   new RemoveRepetitiveItems(),
+  new DetectFootnotes(),
   new DetectToc(),
   new DetectHeaders(),
   new DetectListItems(),
diff --git a/src/support/stringFunctions.ts b/src/support/stringFunctions.ts
index d70f6f1..08f41f8 100644
--- a/src/support/stringFunctions.ts
+++ b/src/support/stringFunctions.ts
@@ -59,3 +59,19 @@ export function isListItem(value: string) {
 export function isNumberedListItem(value: string) {
   return /^[\s]*\d*\.(?:\s|$)/g.test(value);
 }
+
+/**
+ * Returns true if the value is non-empty and consists of digits only.
+ * An empty string is not a number.
+ */
+export function isNumber(value: string) {
+  if (value.length === 0) {
+    return false;
+  }
+  for (let i = 0; i < value.length; i++) {
+    if (!isDigit(value.charCodeAt(i))) {
+      return false;
+    }
+  }
+  return true;
+}
diff --git a/src/token-types.ts b/src/token-types.ts
new file mode 100644
index 0000000..ca62e5a
--- /dev/null
+++ b/src/token-types.ts
@@ -0,0 +1,2 @@
+export type FontType = 'BOLD' | 'OBLIQUE';
+export type TokenType = 'LINK' | 'FOOTNOTE' | 'FOOTNOTE_LINK' | FontType;
diff --git a/src/transformer/CacluclateStatistics.ts b/src/transformer/CacluclateStatistics.ts
index 54c739d..7631e86 100644
--- a/src/transformer/CacluclateStatistics.ts
+++ b/src/transformer/CacluclateStatistics.ts
@@ -90,7 +90,6 @@ export default class CalculateStatistics extends ItemTransformer {
         maxHeightFont = itemFont;
       }
     });
-    // TODO really need parseInt here ?
     const mostUsedHeight = to2DigitDecimalFromString(getMostUsedKey(heightToOccurrence));
     const mostUsedFont = getMostUsedKey(fontToOccurrence);
 
@@ -103,7 +102,7 @@ export default class CalculateStatistics extends ItemTransformer {
 
     let page = -1;
     let lastItemOfMostUsedHeight: Item | undefined;
-    items.forEach((item, i) => {
+    items.forEach((item) => {
       if (item.page !== page) lastItemOfMostUsedHeight = undefined;
       const itemHeight = to2DigitDecimalFromString(item.data['height']);
       const itemText = item.data['str'];
diff --git a/src/transformer/DetectFootnotes.ts b/src/transformer/DetectFootnotes.ts
new file mode 100644
index 0000000..f9309dc
--- /dev/null
+++ b/src/transformer/DetectFootnotes.ts
@@ -0,0 +1,91 @@
+import Item from '../Item';
+import ItemResult from '../ItemResult';
+import ItemTransformer from './ItemTransformer';
+import TransformContext from './TransformContext';
+import LineItemMerger from '../debug/LineItemMerger';
+import { groupByLine } from '../support/groupingUtils';
+import { isNumber } from '../support/stringFunctions';
+
+/**
+ * Tags purely numeric items as footnote links or footnotes via their token types.
+ */
+export default class DetectFootnotes extends ItemTransformer {
+  constructor() {
+    super(
+      'Detect Footnotes',
+      'Detect footnotes in text and link them to the references',
+      {
+        requireColumns: ['str', 'y'],
+        debug: {
+          itemMerger: new LineItemMerger(false),
+        },
+      },
+      (incomingSchema) => {
+        // Place the 'token types' column directly in front of 'x'.
+        return incomingSchema.reduce((schema, column) => {
+          if (column === 'x') {
+            return [...schema, 'token types', 'x'];
+          }
+          return [...schema, column];
+        }, new Array());
+      },
+    );
+  }
+
+  transform(context: TransformContext, inputItems: Item[]): ItemResult {
+    const footnoteLinks = new Set<string>();
+    const footnotes = new Set<string>();
+
+    groupByLine(inputItems).forEach((lineItems) => {
+      const firstY = lineItems[0].data['y'];
+      lineItems.forEach((item, lineIndex) => {
+        const itemText = item.data['str'].trim();
+        if (!isNumber(itemText)) {
+          return;
+        }
+        // A number behind text with a greater y than the line's first item is a link,
+        // a number in front of text is the footnote itself.
+        if (hasPrecedingText(lineItems, lineIndex) && item.data['y'] > firstY) {
+          footnoteLinks.add(item.uuid);
+        } else if (isFollowedByText(lineItems, lineIndex)) {
+          footnotes.add(item.uuid);
+        }
+      });
+    });
+
+    return {
+      items: inputItems.map((item) => {
+        if (footnoteLinks.has(item.uuid)) {
+          return item.withTokenType('FOOTNOTE_LINK');
+        }
+        if (footnotes.has(item.uuid)) {
+          return item.withTokenType('FOOTNOTE');
+        }
+        return item;
+      }),
+      messages: [`Detected ${footnoteLinks.size}/${footnotes.size} footnotes.`],
+    };
+  }
+}
+
+/**
+ * True if the item holds text, i.e. it is neither blank nor a number.
+ */
+function isTextItem(item: Item): boolean {
+  const itemText = item.data['str'].trim() as string;
+  return itemText.length > 0 && !isNumber(itemText);
+}
+
+/**
+ * True if any item in front of the given index of the line holds text.
+ */
+function hasPrecedingText(lineItems: Item[], lineIndex: number): boolean {
+  return lineItems.slice(0, lineIndex).some(isTextItem);
+}
+
+/**
+ * True if any item behind the given index of the line holds text.
+ */
+function isFollowedByText(lineItems: Item[], lineIndex: number): boolean {
+  return lineItems.slice(lineIndex + 1).some(isTextItem);
+}
diff --git a/src/transformer/DetectHeaders.ts b/src/transformer/DetectHeaders.ts
index afd04ec..002f989 100644 --- a/src/transformer/DetectHeaders.ts +++ b/src/transformer/DetectHeaders.ts @@ -12,7 +12,6 @@ import { HeadlineType, TextType, isHeadline, toHeadlineType } from '../text-type const config = { // How much taller a text must be to be a headline (relative to mostUsedHeight) - // TODO sync with DetectHeadline ?? minHeadlineDistance: 1.3, }; diff --git a/src/transformer/SortXWithinLines.ts b/src/transformer/SortXWithinLines.ts index 3813ca6..5f243dc 100644 --- a/src/transformer/SortXWithinLines.ts +++ b/src/transformer/SortXWithinLines.ts @@ -5,6 +5,9 @@ import TransformContext from './TransformContext'; import LineItemMerger from '../debug/LineItemMerger'; import { transformGroupedByPageAndLine } from '../support/groupingUtils'; +/** + * We can't trust the order of occurrence; footnote links especially tend to come last. + */ export default class SortXWithinLines extends ItemTransformer { constructor() { super('Sort by X', 'Sorts the items of a line by the x coordinate', {