diff --git a/core/src/Debugger.ts b/core/src/Debugger.ts index a65d115..2e8abae 100644 --- a/core/src/Debugger.ts +++ b/core/src/Debugger.ts @@ -6,28 +6,25 @@ import ColumnAnnotation from './debug/ColumnAnnotation'; import AnnotatedColumn from './debug/AnnotatedColumn'; import { detectChanges } from './debug/detectChanges'; import { asPages } from './debug/Page'; +import EvaluationTracker from './transformer/EvaluationTracker'; import ChangeTracker from './debug/ChangeTracker'; +import PageViewport from './parse/PageViewport'; export default class Debugger { - private context: TransformContext; private transformers: ItemTransformer[]; private stageResultCache: StageResult[]; - pageCount: number; - fontMap: Map; stageNames: string[]; stageDescriptions: string[]; constructor( - pageCount: number, + public fontMap: Map, + private pageViewports: PageViewport[], + public pageCount: number, inputSchema: string[], inputItems: Item[], - context: TransformContext, transformers: ItemTransformer[], ) { this.transformers = transformers; - this.context = context; - this.fontMap = context.fontMap; - this.pageCount = pageCount; this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)]; this.stageDescriptions = ['Initial items as parsed by PDFjs', ...transformers.map((t) => t.description)]; this.stageResultCache = [initialStage(inputSchema, inputItems)]; @@ -36,19 +33,21 @@ export default class Debugger { stageResult(stageIndex: number): StageResult { for (let idx = 0; idx < stageIndex + 1; idx++) { if (!this.stageResultCache[idx]) { + const evaluations = new EvaluationTracker(); + const context = new TransformContext(this.fontMap, this.pageViewports, evaluations); const transformer = this.transformers[idx - 1]; const previousStageResult: StageResult = this.stageResultCache[idx - 1]; const previousItems = previousStageResult.itemsCleanedAndUnpacked(); const inputSchema = toSimpleSchema(previousStageResult); const outputSchema = 
transformer.schemaTransformer(inputSchema); - const itemResult = transformer.transform(this.context, [...previousItems]); + const itemResult = transformer.transform(context, [...previousItems]); - const changeTracker = new ChangeTracker(); - const items = detectChanges(changeTracker, previousItems, itemResult.items); - const pages = asPages(changeTracker, items, transformer.descriptor.debug?.itemMerger); + const changes = new ChangeTracker(); + const items = detectChanges(changes, previousItems, itemResult.items); + const pages = asPages(evaluations, changes, items, transformer.descriptor.debug?.itemMerger); const messages = itemResult.messages; - if (changeTracker.changeCount() > 0 && messages.length === 0) { - messages.unshift(`Detected ${changeTracker.changeCount()} changes`); + if (changes.changeCount() > 0 && messages.length === 0) { + messages.unshift(`Detected ${changes.changeCount()} changes`); } this.stageResultCache.push( @@ -56,7 +55,8 @@ export default class Debugger { transformer.descriptor, toAnnotatedSchema(inputSchema, outputSchema), pages, - changeTracker, + evaluations, + changes, messages, ), ); diff --git a/core/src/PdfPipeline.ts b/core/src/PdfPipeline.ts index 140f314..90b9928 100644 --- a/core/src/PdfPipeline.ts +++ b/core/src/PdfPipeline.ts @@ -5,6 +5,7 @@ import ItemTransformer from './transformer/ItemTransformer'; import ParseResult from './ParseResult'; import Debugger from './Debugger'; import { assert } from './assert'; +import TransformContext from './transformer/TransformContext'; export default class PdfPipeline { parser: PdfParser; @@ -27,9 +28,9 @@ export default class PdfPipeline { async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise { const parseResult = await this.parse(src, progressListener); this.verifyRequiredColumns(parseResult.schema, this.transformers); - const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports }; let items = parseResult.items; 
this.transformers.forEach((transformer) => { + const context = new TransformContext(parseResult.fontMap, parseResult.pageViewports); items = transformer.transform(context, items).items; }); parseResult.items = items; @@ -38,8 +39,14 @@ export default class PdfPipeline { async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise { const parseResult = await this.parse(src, progressListener); - const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports }; - return new Debugger(parseResult.pageCount, parseResult.schema, parseResult.items, context, this.transformers); + return new Debugger( + parseResult.fontMap, + parseResult.pageViewports, + parseResult.pageCount, + parseResult.schema, + parseResult.items, + this.transformers, + ); } /** diff --git a/core/src/debug/ItemMerger.ts b/core/src/debug/ItemMerger.ts index d18b9bb..b04e09a 100644 --- a/core/src/debug/ItemMerger.ts +++ b/core/src/debug/ItemMerger.ts @@ -1,3 +1,4 @@ +import type EvaluationTracker from '../transformer/EvaluationTracker'; import type ChangeTracker from './ChangeTracker'; import type Item from '../Item'; @@ -6,5 +7,5 @@ import type Item from '../Item'; */ export default abstract class ItemMerger { constructor(public groupKey: string) {} - abstract merge(tracker: ChangeTracker, items: Item[]): Item; + abstract merge(evaluationTracker: EvaluationTracker, changeTracker: ChangeTracker, items: Item[]): Item; } diff --git a/core/src/debug/LineItemMerger.ts b/core/src/debug/LineItemMerger.ts index 44251c8..4c05746 100644 --- a/core/src/debug/LineItemMerger.ts +++ b/core/src/debug/LineItemMerger.ts @@ -1,5 +1,6 @@ import ItemMerger from './ItemMerger'; import Item from '../Item'; +import EvaluationTracker from '../transformer/EvaluationTracker'; import ChangeTracker from './ChangeTracker'; export default class LineItemMerger extends ItemMerger { @@ -7,7 +8,7 @@ export default class LineItemMerger extends ItemMerger { super('line'); } - 
merge(tracker: ChangeTracker, items: Item[]): Item { + merge(evaluationTracker: EvaluationTracker, changeTracker: ChangeTracker, items: Item[]): Item { const page = items[0].page; const line = items[0].data['line']; const str = items.map((item) => item.data['str']).join(' '); @@ -28,12 +29,14 @@ export default class LineItemMerger extends ItemMerger { dir: directions, }); + if (items.find((item) => evaluationTracker.evaluated(item))) evaluationTracker.trackEvaluation(newItem); + if (this.trackAsNew) { - tracker.trackAddition(newItem); - } else if (items.every((item) => tracker.isRemoved(item))) { - tracker.trackRemoval(newItem); - } else if (items.find((item) => tracker.hasChanged(item))) { - tracker.trackContentChange(newItem); + changeTracker.trackAddition(newItem); + } else if (items.every((item) => changeTracker.isRemoved(item))) { + changeTracker.trackRemoval(newItem); + } else if (items.find((item) => changeTracker.hasChanged(item))) { + changeTracker.trackContentChange(newItem); } return newItem; } diff --git a/core/src/debug/Page.ts b/core/src/debug/Page.ts index e4aabf8..e1160dc 100644 --- a/core/src/debug/Page.ts +++ b/core/src/debug/Page.ts @@ -1,5 +1,6 @@ import Item from '../Item'; import { groupByElement, groupByPage } from '../support/groupingUtils'; +import EvaluationTracker from '../transformer/EvaluationTracker'; import ChangeTracker from './ChangeTracker'; import ItemGroup from './ItemGroup'; import ItemMerger from './ItemMerger'; @@ -9,13 +10,18 @@ export default interface Page { itemGroups: ItemGroup[]; } -export function asPages(tracker: ChangeTracker, items: Item[], itemMerger?: ItemMerger): Page[] { +export function asPages( + evaluationTracker: EvaluationTracker, + changeTracker: ChangeTracker, + items: Item[], + itemMerger?: ItemMerger, +): Page[] { return groupByPage(items).map((pageItems: Item[]) => { let itemGroups: ItemGroup[]; if (itemMerger) { itemGroups = groupByElement(pageItems, itemMerger.groupKey).map((groupItems) => { if 
(groupItems.length > 1) { - const top = itemMerger.merge(tracker, groupItems); + const top = itemMerger.merge(evaluationTracker, changeTracker, groupItems); return new ItemGroup(top, groupItems); } else { return new ItemGroup(groupItems[0]); diff --git a/core/src/debug/StageResult.ts b/core/src/debug/StageResult.ts index c55be70..5bcd921 100644 --- a/core/src/debug/StageResult.ts +++ b/core/src/debug/StageResult.ts @@ -5,12 +5,15 @@ import Page, { asPages } from './Page'; import ChangeIndex from './ChangeIndex'; import ChangeTracker from './ChangeTracker'; import ItemGroup from './ItemGroup'; +import EvaluationIndex from '../transformer/EvaluationIndex'; +import EvaluationTracker from '../transformer/EvaluationTracker'; export default class StageResult { constructor( public descriptor: TransformDescriptor, public schema: AnnotatedColumn[], public pages: Page[], + public evaluations: EvaluationIndex, public changes: ChangeIndex, public messages: string[], ) {} @@ -50,7 +53,9 @@ export default class StageResult { (page) => ({ ...page, - itemGroups: page.itemGroups.filter((itemGroup) => this.changes.hasChanged(itemGroup.top)), + itemGroups: page.itemGroups.filter( + (itemGroup) => this.evaluations.evaluated(itemGroup.top) || this.changes.hasChanged(itemGroup.top), + ), } as Page), ); } @@ -73,12 +78,13 @@ export default class StageResult { export function initialStage(inputSchema: string[], inputItems: Item[]): StageResult { const schema = inputSchema.map((column) => ({ name: column })); - const tracker = new ChangeTracker(); - const pages = asPages(tracker, inputItems); + const evaluations = new EvaluationTracker(); + const changes = new ChangeTracker(); + const pages = asPages(evaluations, changes, inputItems); const messages = [ `Parsed ${inputItems.length === 0 ? 
0 : inputItems[inputItems.length - 1].page + 1} pages with ${ inputItems.length } items`, ]; - return new StageResult(toDescriptor({ debug: { showAll: true } }), schema, pages, tracker, messages); + return new StageResult(toDescriptor({ debug: { showAll: true } }), schema, pages, evaluations, changes, messages); } diff --git a/core/src/support/groupingUtils.ts b/core/src/support/groupingUtils.ts index e1bde23..c643a3e 100644 --- a/core/src/support/groupingUtils.ts +++ b/core/src/support/groupingUtils.ts @@ -8,6 +8,10 @@ export function onlyUniques(value: T, index: number, self: T[]) { return self.indexOf(value) === index; } +export function ascending(a: number, b: number): number { + return a - b; +} + export function count(array: T[], find: (entry: T) => boolean): number { return array.reduce((count, entry) => (find(entry) ? count + 1 : count), 0); } diff --git a/core/src/transformer/EvaluationIndex.ts b/core/src/transformer/EvaluationIndex.ts new file mode 100644 index 0000000..6221248 --- /dev/null +++ b/core/src/transformer/EvaluationIndex.ts @@ -0,0 +1,14 @@ +import Item from '../Item'; + +export default interface EvaluationIndex { + /** + * Return the number of tracked evaluations. 
+ */ + evaluationCount(): number; + + /** + * Returns true if the given item has been evaluated + * @param item + */ + evaluated(item: Item): boolean; +} diff --git a/core/src/transformer/EvaluationTracker.ts b/core/src/transformer/EvaluationTracker.ts new file mode 100644 index 0000000..ed6fd51 --- /dev/null +++ b/core/src/transformer/EvaluationTracker.ts @@ -0,0 +1,23 @@ +import { assertDefined } from '../assert'; +import Item from '../Item'; +import EvaluationIndex from './EvaluationIndex'; + +export default class EvaluationTracker implements EvaluationIndex { + private evaluations: Set = new Set(); + + evaluationCount() { + return this.evaluations.size; + } + + evaluated(item: Item) { + return this.evaluations.has(_uuid(item)); + } + + trackEvaluation(item: Item) { + this.evaluations.add(_uuid(item)); + } +} + +function _uuid(item: Item): string { + return assertDefined(item.uuid, 'UUID is not set'); +} diff --git a/core/src/transformer/RemoveRepetitiveItems.ts b/core/src/transformer/RemoveRepetitiveItems.ts index b62bd9e..d6d7622 100644 --- a/core/src/transformer/RemoveRepetitiveItems.ts +++ b/core/src/transformer/RemoveRepetitiveItems.ts @@ -6,6 +6,7 @@ import ItemTransformer from './ItemTransformer'; import TransformContext from './TransformContext'; import LineItemMerger from '../debug/LineItemMerger'; import { + ascending, flatMap, groupByLine, groupByPage, @@ -43,14 +44,14 @@ export default class RemoveRepetitiveItems extends ItemTransformer { transform(context: TransformContext, inputItems: Item[]): ItemResult { const pageExtracts = buildExtracts(inputItems); - const uniqueYs = flatMap(pageExtracts, (extract) => extract.fringeLines) + const fringeYs = flatMap(pageExtracts, (extract) => extract.fringeLines) .map((line) => line.y) .filter(onlyUniques) - .sort((a, b) => a - b); + .sort(ascending); // console.log('uniqueYs', uniqueYs); - const yToRemove = uniqueYs.filter((y) => { + const yToRemove = fringeYs.filter((y) => { const yLines = pageExtracts 
.map((page) => page.lineByY(y)) .filter((line) => typeof line !== 'undefined') as Line[]; @@ -73,11 +74,20 @@ export default class RemoveRepetitiveItems extends ItemTransformer { //console.log('yToRemove', yToRemove); + let removalCount = 0; return { - items: transformGroupedByPageAndLine(inputItems, (_, __, items) => - yToRemove.includes(yFromLine(items)) ? [] : items, - ), - messages: [`Filtered out each item with y == ${yToRemove.join('||')}`], + items: transformGroupedByPageAndLine(inputItems, (_, __, lineItems) => { + const itemsY = yFromLine(lineItems); + if (fringeYs.includes(itemsY)) { + lineItems.forEach(context.trackEvaluation.bind(context)); + } + if (yToRemove.includes(itemsY)) { + removalCount++; + return []; + } + return lineItems; + }), + messages: [`Filtered out ${removalCount} items with y == ${yToRemove.join('||')}`], }; } } @@ -166,14 +176,14 @@ class Line { constructor(public y: number, public items: Item[]) {} - text() { + text(): string { if (!this._text) { this._text = this.items.reduce((all, item) => all + item.data['str'], ''); } return this._text; } - textWithoutNumbers() { + textWithoutNumbers(): string { if (!this._textWithoutNumbers) { this._textWithoutNumbers = filterOutDigits(this.text()); } diff --git a/core/src/transformer/TransformContext.ts b/core/src/transformer/TransformContext.ts index d8dcfbd..dceb73c 100644 --- a/core/src/transformer/TransformContext.ts +++ b/core/src/transformer/TransformContext.ts @@ -1,6 +1,15 @@ +import Item from '../Item'; import PageViewport from '../parse/PageViewport'; +import EvaluationTracker from './EvaluationTracker'; -export default interface TransformContext { - fontMap: Map; - pageViewports: PageViewport[]; +export default class TransformContext { + constructor( + public fontMap: Map, + public pageViewports: PageViewport[], + private evaluations = new EvaluationTracker(), + ) {} + + trackEvaluation(item: Item) { + this.evaluations.trackEvaluation(item); + } } diff --git 
a/ui/src/debug/ChangeSymbol.svelte b/ui/src/debug/ChangeSymbol.svelte index 4924cda..c3bc305 100644 --- a/ui/src/debug/ChangeSymbol.svelte +++ b/ui/src/debug/ChangeSymbol.svelte @@ -1,4 +1,5 @@ -{#if hasChanged} +{#if evaluated || hasChanged}
{#if iconComp} diff --git a/ui/src/debug/DebugView.svelte b/ui/src/debug/DebugView.svelte index cb2b735..879705f 100644 --- a/ui/src/debug/DebugView.svelte +++ b/ui/src/debug/DebugView.svelte @@ -49,7 +49,12 @@ {#if visiblePages.find((page) => page.itemGroups.length > 0)} - + {:else}
diff --git a/ui/src/debug/ItemRow.svelte b/ui/src/debug/ItemRow.svelte index 8880f86..afa2735 100644 --- a/ui/src/debug/ItemRow.svelte +++ b/ui/src/debug/ItemRow.svelte @@ -4,6 +4,7 @@ import { ArrowLeft, ArrowRight } from 'svelte-hero-icons'; import type ItemGroup from '@core/debug/ItemGroup'; + import type EvaluationIndex from '@core/transformer/EvaluationIndex'; import type ChangeIndex from '@core/debug/ChangeIndex'; import type AnnotatedColumn from '@core/debug/AnnotatedColumn'; @@ -17,6 +18,7 @@ export let itemIdx: number; export let schema: AnnotatedColumn[]; export let itemGroup: ItemGroup; + export let evaluations: EvaluationIndex; export let changes: ChangeIndex; let expandedItemGroup: { pageIndex: number; itemIndex: number }; @@ -45,7 +47,7 @@ {/if} - + itemGroup.hasMany() && toggleRow(pageIdx, itemIdx)}> @@ -72,7 +74,7 @@ class:changeMinus={changes.isMinusChange(child)}> - +
diff --git a/ui/src/debug/ItemTable.svelte b/ui/src/debug/ItemTable.svelte index d3e4531..43a1ada 100644 --- a/ui/src/debug/ItemTable.svelte +++ b/ui/src/debug/ItemTable.svelte @@ -4,6 +4,7 @@ import { PresentationChartLine } from 'svelte-hero-icons'; import type AnnotatedColumn from '@core/debug/AnnotatedColumn'; + import type EvaluationIndex from '@core/transformer/EvaluationIndex'; import type ChangeIndex from '@core/debug/ChangeIndex'; import type Page from '@core/debug/Page'; import ColumnAnnotation from '../../../core/src/debug/ColumnAnnotation'; @@ -15,6 +16,7 @@ export let schema: AnnotatedColumn[]; export let pages: Page[]; export let pageControl: PageControl; + export let evaluations: EvaluationIndex; export let changes: ChangeIndex; let { pagePinned } = pageControl; @@ -80,7 +82,7 @@ {#each page.itemGroups as itemGroup, itemIdx} - + {/each} {/each}