Track evaluations

This commit is contained in:
Johannes Zillmann 2021-03-23 07:25:17 +01:00
parent c98145a63c
commit 0be95e4bbc
15 changed files with 147 additions and 49 deletions

View File

@ -6,28 +6,25 @@ import ColumnAnnotation from './debug/ColumnAnnotation';
import AnnotatedColumn from './debug/AnnotatedColumn'; import AnnotatedColumn from './debug/AnnotatedColumn';
import { detectChanges } from './debug/detectChanges'; import { detectChanges } from './debug/detectChanges';
import { asPages } from './debug/Page'; import { asPages } from './debug/Page';
import EvaluationTracker from './transformer/EvaluationTracker';
import ChangeTracker from './debug/ChangeTracker'; import ChangeTracker from './debug/ChangeTracker';
import PageViewport from './parse/PageViewport';
export default class Debugger { export default class Debugger {
private context: TransformContext;
private transformers: ItemTransformer[]; private transformers: ItemTransformer[];
private stageResultCache: StageResult[]; private stageResultCache: StageResult[];
pageCount: number;
fontMap: Map<string, object>;
stageNames: string[]; stageNames: string[];
stageDescriptions: string[]; stageDescriptions: string[];
constructor( constructor(
pageCount: number, public fontMap: Map<string, object>,
private pageViewports: PageViewport[],
public pageCount: number,
inputSchema: string[], inputSchema: string[],
inputItems: Item[], inputItems: Item[],
context: TransformContext,
transformers: ItemTransformer[], transformers: ItemTransformer[],
) { ) {
this.transformers = transformers; this.transformers = transformers;
this.context = context;
this.fontMap = context.fontMap;
this.pageCount = pageCount;
this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)]; this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
this.stageDescriptions = ['Initial items as parsed by PDFjs', ...transformers.map((t) => t.description)]; this.stageDescriptions = ['Initial items as parsed by PDFjs', ...transformers.map((t) => t.description)];
this.stageResultCache = [initialStage(inputSchema, inputItems)]; this.stageResultCache = [initialStage(inputSchema, inputItems)];
@ -36,19 +33,21 @@ export default class Debugger {
stageResult(stageIndex: number): StageResult { stageResult(stageIndex: number): StageResult {
for (let idx = 0; idx < stageIndex + 1; idx++) { for (let idx = 0; idx < stageIndex + 1; idx++) {
if (!this.stageResultCache[idx]) { if (!this.stageResultCache[idx]) {
const evaluations = new EvaluationTracker();
const context = new TransformContext(this.fontMap, this.pageViewports, evaluations);
const transformer = this.transformers[idx - 1]; const transformer = this.transformers[idx - 1];
const previousStageResult: StageResult = this.stageResultCache[idx - 1]; const previousStageResult: StageResult = this.stageResultCache[idx - 1];
const previousItems = previousStageResult.itemsCleanedAndUnpacked(); const previousItems = previousStageResult.itemsCleanedAndUnpacked();
const inputSchema = toSimpleSchema(previousStageResult); const inputSchema = toSimpleSchema(previousStageResult);
const outputSchema = transformer.schemaTransformer(inputSchema); const outputSchema = transformer.schemaTransformer(inputSchema);
const itemResult = transformer.transform(this.context, [...previousItems]); const itemResult = transformer.transform(context, [...previousItems]);
const changeTracker = new ChangeTracker(); const changes = new ChangeTracker();
const items = detectChanges(changeTracker, previousItems, itemResult.items); const items = detectChanges(changes, previousItems, itemResult.items);
const pages = asPages(changeTracker, items, transformer.descriptor.debug?.itemMerger); const pages = asPages(evaluations, changes, items, transformer.descriptor.debug?.itemMerger);
const messages = itemResult.messages; const messages = itemResult.messages;
if (changeTracker.changeCount() > 0 && messages.length === 0) { if (changes.changeCount() > 0 && messages.length === 0) {
messages.unshift(`Detected ${changeTracker.changeCount()} changes`); messages.unshift(`Detected ${changes.changeCount()} changes`);
} }
this.stageResultCache.push( this.stageResultCache.push(
@ -56,7 +55,8 @@ export default class Debugger {
transformer.descriptor, transformer.descriptor,
toAnnotatedSchema(inputSchema, outputSchema), toAnnotatedSchema(inputSchema, outputSchema),
pages, pages,
changeTracker, evaluations,
changes,
messages, messages,
), ),
); );

View File

@ -5,6 +5,7 @@ import ItemTransformer from './transformer/ItemTransformer';
import ParseResult from './ParseResult'; import ParseResult from './ParseResult';
import Debugger from './Debugger'; import Debugger from './Debugger';
import { assert } from './assert'; import { assert } from './assert';
import TransformContext from './transformer/TransformContext';
export default class PdfPipeline { export default class PdfPipeline {
parser: PdfParser; parser: PdfParser;
@ -27,9 +28,9 @@ export default class PdfPipeline {
async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> { async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
const parseResult = await this.parse(src, progressListener); const parseResult = await this.parse(src, progressListener);
this.verifyRequiredColumns(parseResult.schema, this.transformers); this.verifyRequiredColumns(parseResult.schema, this.transformers);
const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports };
let items = parseResult.items; let items = parseResult.items;
this.transformers.forEach((transformer) => { this.transformers.forEach((transformer) => {
const context = new TransformContext(parseResult.fontMap, parseResult.pageViewports);
items = transformer.transform(context, items).items; items = transformer.transform(context, items).items;
}); });
parseResult.items = items; parseResult.items = items;
@ -38,8 +39,14 @@ export default class PdfPipeline {
async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<Debugger> { async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<Debugger> {
const parseResult = await this.parse(src, progressListener); const parseResult = await this.parse(src, progressListener);
const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports }; return new Debugger(
return new Debugger(parseResult.pageCount, parseResult.schema, parseResult.items, context, this.transformers); parseResult.fontMap,
parseResult.pageViewports,
parseResult.pageCount,
parseResult.schema,
parseResult.items,
this.transformers,
);
} }
/** /**

View File

@ -1,3 +1,4 @@
import type EvaluationTracker from '../transformer/EvaluationTracker';
import type ChangeTracker from './ChangeTracker'; import type ChangeTracker from './ChangeTracker';
import type Item from '../Item'; import type Item from '../Item';
@ -6,5 +7,5 @@ import type Item from '../Item';
*/ */
export default abstract class ItemMerger { export default abstract class ItemMerger {
constructor(public groupKey: string) {} constructor(public groupKey: string) {}
abstract merge(tracker: ChangeTracker, items: Item[]): Item; abstract merge(evaluationTracker: EvaluationTracker, changeTracker: ChangeTracker, items: Item[]): Item;
} }

View File

@ -1,5 +1,6 @@
import ItemMerger from './ItemMerger'; import ItemMerger from './ItemMerger';
import Item from '../Item'; import Item from '../Item';
import EvaluationTracker from '../transformer/EvaluationTracker';
import ChangeTracker from './ChangeTracker'; import ChangeTracker from './ChangeTracker';
export default class LineItemMerger extends ItemMerger { export default class LineItemMerger extends ItemMerger {
@ -7,7 +8,7 @@ export default class LineItemMerger extends ItemMerger {
super('line'); super('line');
} }
merge(tracker: ChangeTracker, items: Item[]): Item { merge(evaluationTracker: EvaluationTracker, changeTracker: ChangeTracker, items: Item[]): Item {
const page = items[0].page; const page = items[0].page;
const line = items[0].data['line']; const line = items[0].data['line'];
const str = items.map((item) => item.data['str']).join(' '); const str = items.map((item) => item.data['str']).join(' ');
@ -28,12 +29,14 @@ export default class LineItemMerger extends ItemMerger {
dir: directions, dir: directions,
}); });
if (items.find((item) => evaluationTracker.evaluated(item))) evaluationTracker.trackEvaluation(newItem);
if (this.trackAsNew) { if (this.trackAsNew) {
tracker.trackAddition(newItem); changeTracker.trackAddition(newItem);
} else if (items.every((item) => tracker.isRemoved(item))) { } else if (items.every((item) => changeTracker.isRemoved(item))) {
tracker.trackRemoval(newItem); changeTracker.trackRemoval(newItem);
} else if (items.find((item) => tracker.hasChanged(item))) { } else if (items.find((item) => changeTracker.hasChanged(item))) {
tracker.trackContentChange(newItem); changeTracker.trackContentChange(newItem);
} }
return newItem; return newItem;
} }

View File

@ -1,5 +1,6 @@
import Item from '../Item'; import Item from '../Item';
import { groupByElement, groupByPage } from '../support/groupingUtils'; import { groupByElement, groupByPage } from '../support/groupingUtils';
import EvaluationTracker from '../transformer/EvaluationTracker';
import ChangeTracker from './ChangeTracker'; import ChangeTracker from './ChangeTracker';
import ItemGroup from './ItemGroup'; import ItemGroup from './ItemGroup';
import ItemMerger from './ItemMerger'; import ItemMerger from './ItemMerger';
@ -9,13 +10,18 @@ export default interface Page {
itemGroups: ItemGroup[]; itemGroups: ItemGroup[];
} }
export function asPages(tracker: ChangeTracker, items: Item[], itemMerger?: ItemMerger): Page[] { export function asPages(
evaluationTracker: EvaluationTracker,
changeTracker: ChangeTracker,
items: Item[],
itemMerger?: ItemMerger,
): Page[] {
return groupByPage(items).map((pageItems: Item[]) => { return groupByPage(items).map((pageItems: Item[]) => {
let itemGroups: ItemGroup[]; let itemGroups: ItemGroup[];
if (itemMerger) { if (itemMerger) {
itemGroups = groupByElement(pageItems, itemMerger.groupKey).map((groupItems) => { itemGroups = groupByElement(pageItems, itemMerger.groupKey).map((groupItems) => {
if (groupItems.length > 1) { if (groupItems.length > 1) {
const top = itemMerger.merge(tracker, groupItems); const top = itemMerger.merge(evaluationTracker, changeTracker, groupItems);
return new ItemGroup(top, groupItems); return new ItemGroup(top, groupItems);
} else { } else {
return new ItemGroup(groupItems[0]); return new ItemGroup(groupItems[0]);

View File

@ -5,12 +5,15 @@ import Page, { asPages } from './Page';
import ChangeIndex from './ChangeIndex'; import ChangeIndex from './ChangeIndex';
import ChangeTracker from './ChangeTracker'; import ChangeTracker from './ChangeTracker';
import ItemGroup from './ItemGroup'; import ItemGroup from './ItemGroup';
import EvaluationIndex from '../transformer/EvaluationIndex';
import EvaluationTracker from '../transformer/EvaluationTracker';
export default class StageResult { export default class StageResult {
constructor( constructor(
public descriptor: TransformDescriptor, public descriptor: TransformDescriptor,
public schema: AnnotatedColumn[], public schema: AnnotatedColumn[],
public pages: Page[], public pages: Page[],
public evaluations: EvaluationIndex,
public changes: ChangeIndex, public changes: ChangeIndex,
public messages: string[], public messages: string[],
) {} ) {}
@ -50,7 +53,9 @@ export default class StageResult {
(page) => (page) =>
({ ({
...page, ...page,
itemGroups: page.itemGroups.filter((itemGroup) => this.changes.hasChanged(itemGroup.top)), itemGroups: page.itemGroups.filter(
(itemGroup) => this.evaluations.evaluated(itemGroup.top) || this.changes.hasChanged(itemGroup.top),
),
} as Page), } as Page),
); );
} }
@ -73,12 +78,13 @@ export default class StageResult {
export function initialStage(inputSchema: string[], inputItems: Item[]): StageResult { export function initialStage(inputSchema: string[], inputItems: Item[]): StageResult {
const schema = inputSchema.map((column) => ({ name: column })); const schema = inputSchema.map((column) => ({ name: column }));
const tracker = new ChangeTracker(); const evaluations = new EvaluationTracker();
const pages = asPages(tracker, inputItems); const changes = new ChangeTracker();
const pages = asPages(evaluations, changes, inputItems);
const messages = [ const messages = [
`Parsed ${inputItems.length === 0 ? 0 : inputItems[inputItems.length - 1].page + 1} pages with ${ `Parsed ${inputItems.length === 0 ? 0 : inputItems[inputItems.length - 1].page + 1} pages with ${
inputItems.length inputItems.length
} items`, } items`,
]; ];
return new StageResult(toDescriptor({ debug: { showAll: true } }), schema, pages, tracker, messages); return new StageResult(toDescriptor({ debug: { showAll: true } }), schema, pages, evaluations, changes, messages);
} }

View File

@ -8,6 +8,10 @@ export function onlyUniques<T>(value: T, index: number, self: T[]) {
return self.indexOf(value) === index; return self.indexOf(value) === index;
} }
/**
 * Comparator that orders numbers from smallest to largest.
 *
 * Intended to be passed to `Array.prototype.sort`, whose default ordering
 * compares elements as strings and therefore mis-sorts numbers.
 *
 * @param a - left-hand number
 * @param b - right-hand number
 * @returns a negative value if `a < b`, zero if they are equal, a positive value if `a > b`
 */
export function ascending(a: number, b: number): number {
  // The previous signature declared a type parameter that was never referenced; it has been dropped.
  return a - b;
}
export function count<T, S>(array: T[], find: (entry: T) => boolean): number { export function count<T, S>(array: T[], find: (entry: T) => boolean): number {
return array.reduce((count, entry) => (find(entry) ? count + 1 : count), 0); return array.reduce((count, entry) => (find(entry) ? count + 1 : count), 0);
} }

View File

@ -0,0 +1,14 @@
import Item from '../Item';
/**
* Read-only view on the evaluations tracked for a set of items.
*/
export default interface EvaluationIndex {
/**
* Returns the number of tracked evaluations.
*/
evaluationCount(): number;
/**
* Returns true if the given item has been evaluated.
* @param item the item to look up
*/
evaluated(item: Item): boolean;
}

View File

@ -0,0 +1,23 @@
import { assertDefined } from '../assert';
import Item from '../Item';
import EvaluationIndex from './EvaluationIndex';
/**
 * Set-backed {@link EvaluationIndex} that also allows recording evaluations.
 *
 * Items are keyed by the string returned from `_uuid`, so two item objects
 * sharing a UUID count as the same evaluation.
 */
export default class EvaluationTracker implements EvaluationIndex {
  /** UUIDs of all items recorded through {@link trackEvaluation}. */
  private readonly evaluations = new Set<string>();

  /** Number of distinct item UUIDs recorded so far. */
  evaluationCount(): number {
    const { size } = this.evaluations;
    return size;
  }

  /**
   * Tells whether the given item has been recorded as evaluated.
   * @param item the item to look up by its UUID
   */
  evaluated(item: Item): boolean {
    const id = _uuid(item);
    return this.evaluations.has(id);
  }

  /**
   * Records the given item as evaluated; recording the same UUID again has no further effect.
   * @param item the item to record by its UUID
   */
  trackEvaluation(item: Item): void {
    const id = _uuid(item);
    this.evaluations.add(id);
  }
}
/**
 * Resolves the UUID under which an item is tracked.
 *
 * The presence check is delegated to `assertDefined`, which presumably throws
 * with the message below when `item.uuid` is missing (confirm in '../assert').
 */
function _uuid(item: Item): string {
  const { uuid } = item;
  return assertDefined(uuid, 'UUID is not set');
}

View File

@ -6,6 +6,7 @@ import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext'; import TransformContext from './TransformContext';
import LineItemMerger from '../debug/LineItemMerger'; import LineItemMerger from '../debug/LineItemMerger';
import { import {
ascending,
flatMap, flatMap,
groupByLine, groupByLine,
groupByPage, groupByPage,
@ -43,14 +44,14 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
transform(context: TransformContext, inputItems: Item[]): ItemResult { transform(context: TransformContext, inputItems: Item[]): ItemResult {
const pageExtracts = buildExtracts(inputItems); const pageExtracts = buildExtracts(inputItems);
const uniqueYs = flatMap(pageExtracts, (extract) => extract.fringeLines) const fringeYs = flatMap(pageExtracts, (extract) => extract.fringeLines)
.map((line) => line.y) .map((line) => line.y)
.filter(onlyUniques) .filter(onlyUniques)
.sort((a, b) => a - b); .sort(ascending);
// console.log('uniqueYs', uniqueYs); // console.log('uniqueYs', uniqueYs);
const yToRemove = uniqueYs.filter((y) => { const yToRemove = fringeYs.filter((y) => {
const yLines = pageExtracts const yLines = pageExtracts
.map((page) => page.lineByY(y)) .map((page) => page.lineByY(y))
.filter((line) => typeof line !== 'undefined') as Line[]; .filter((line) => typeof line !== 'undefined') as Line[];
@ -73,11 +74,20 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
//console.log('yToRemove', yToRemove); //console.log('yToRemove', yToRemove);
let removalCount = 0;
return { return {
items: transformGroupedByPageAndLine(inputItems, (_, __, items) => items: transformGroupedByPageAndLine(inputItems, (_, __, lineItems) => {
yToRemove.includes(yFromLine(items)) ? [] : items, const itemsY = yFromLine(lineItems);
), if (fringeYs.includes(itemsY)) {
messages: [`Filtered out each item with y == ${yToRemove.join('||')}`], lineItems.forEach(context.trackEvaluation.bind(context));
}
if (yToRemove.includes(itemsY)) {
removalCount++;
return [];
}
return lineItems;
}),
messages: [`Filtered out ${removalCount} items with y == ${yToRemove.join('||')}`],
}; };
} }
} }
@ -166,14 +176,14 @@ class Line {
constructor(public y: number, public items: Item[]) {} constructor(public y: number, public items: Item[]) {}
text() { text(): string {
if (!this._text) { if (!this._text) {
this._text = this.items.reduce((all, item) => all + item.data['str'], ''); this._text = this.items.reduce((all, item) => all + item.data['str'], '');
} }
return this._text; return this._text;
} }
textWithoutNumbers() { textWithoutNumbers(): string {
if (!this._textWithoutNumbers) { if (!this._textWithoutNumbers) {
this._textWithoutNumbers = filterOutDigits(this.text()); this._textWithoutNumbers = filterOutDigits(this.text());
} }

View File

@ -1,6 +1,15 @@
import Item from '../Item';
import PageViewport from '../parse/PageViewport'; import PageViewport from '../parse/PageViewport';
import EvaluationTracker from './EvaluationTracker';
export default interface TransformContext { export default class TransformContext {
fontMap: Map<string, object>; constructor(
pageViewports: PageViewport[]; public fontMap: Map<string, object>,
public pageViewports: PageViewport[],
private evaluations = new EvaluationTracker(),
) {}
trackEvaluation(item: Item) {
this.evaluations.trackEvaluation(item);
}
} }

View File

@ -1,4 +1,5 @@
<script> <script>
import type EvaluationIndex from '@core/transformer/EvaluationIndex';
import type ChangeIndex from '@core/debug/ChangeIndex'; import type ChangeIndex from '@core/debug/ChangeIndex';
import type Item from '@core/Item'; import type Item from '@core/Item';
import { Addition, Removal, ContentChange, PositionChange, Direction } from '../../../core/src/debug/ChangeIndex'; import { Addition, Removal, ContentChange, PositionChange, Direction } from '../../../core/src/debug/ChangeIndex';
@ -10,17 +11,20 @@
MinusCircle as Minus, MinusCircle as Minus,
ArrowCircleUp as Up, ArrowCircleUp as Up,
ArrowCircleDown as Down, ArrowCircleDown as Down,
Eye,
} from 'svelte-hero-icons'; } from 'svelte-hero-icons';
export let evaluations: EvaluationIndex;
export let changes: ChangeIndex; export let changes: ChangeIndex;
export let item: Item; export let item: Item;
$: evaluated = evaluations.evaluated(item);
$: hasChanged = changes.hasChanged(item); $: hasChanged = changes.hasChanged(item);
let changeContent: string; let changeContent: string;
let iconComp: ComponentDefinition; let iconComp: ComponentDefinition;
$: { $: {
if (hasChanged) {
let args = { size: '14' }; let args = { size: '14' };
if (hasChanged) {
let change = changes.change(item); let change = changes.change(item);
switch (change.constructor.name) { switch (change.constructor.name) {
case PositionChange.name: case PositionChange.name:
@ -43,11 +47,13 @@
default: default:
throw new Error(`${change.constructor.name}: ${change}`); throw new Error(`${change.constructor.name}: ${change}`);
} }
} else if (evaluated) {
iconComp = new ComponentDefinition(Eye, args);
} }
} }
</script> </script>
{#if hasChanged} {#if evaluated || hasChanged}
<div class="flex space-x-0.5 items-center text-xs"> <div class="flex space-x-0.5 items-center text-xs">
{#if iconComp} {#if iconComp}
<svelte:component this={iconComp.component} {...iconComp.args} /> <svelte:component this={iconComp.component} {...iconComp.args} />

View File

@ -49,7 +49,12 @@
<!-- Items --> <!-- Items -->
{#if visiblePages.find((page) => page.itemGroups.length > 0)} {#if visiblePages.find((page) => page.itemGroups.length > 0)}
<ItemTable schema={stageResult.schema} pages={visiblePages} {pageControl} changes={stageResult.changes} /> <ItemTable
schema={stageResult.schema}
pages={visiblePages}
{pageControl}
evaluations={stageResult.evaluations}
changes={stageResult.changes} />
{:else} {:else}
<!-- No items visible --> <!-- No items visible -->
<div class="flex mt-8"> <div class="flex mt-8">

View File

@ -4,6 +4,7 @@
import { ArrowLeft, ArrowRight } from 'svelte-hero-icons'; import { ArrowLeft, ArrowRight } from 'svelte-hero-icons';
import type ItemGroup from '@core/debug/ItemGroup'; import type ItemGroup from '@core/debug/ItemGroup';
import type EvaluationIndex from '@core/transformer/EvaluationIndex';
import type ChangeIndex from '@core/debug/ChangeIndex'; import type ChangeIndex from '@core/debug/ChangeIndex';
import type AnnotatedColumn from '@core/debug/AnnotatedColumn'; import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
@ -17,6 +18,7 @@
export let itemIdx: number; export let itemIdx: number;
export let schema: AnnotatedColumn[]; export let schema: AnnotatedColumn[];
export let itemGroup: ItemGroup; export let itemGroup: ItemGroup;
export let evaluations: EvaluationIndex;
export let changes: ChangeIndex; export let changes: ChangeIndex;
let expandedItemGroup: { pageIndex: number; itemIndex: number }; let expandedItemGroup: { pageIndex: number; itemIndex: number };
@ -45,7 +47,7 @@
<td id="page" /> <td id="page" />
{/if} {/if}
<td class="align-middle"> <td class="align-middle">
<ChangeSymbol {changes} item={itemGroup.top} /> <ChangeSymbol {evaluations} {changes} item={itemGroup.top} />
</td> </td>
<span class="contents" on:click={() => itemGroup.hasMany() && toggleRow(pageIdx, itemIdx)}> <span class="contents" on:click={() => itemGroup.hasMany() && toggleRow(pageIdx, itemIdx)}>
<!-- ID & change marker column --> <!-- ID & change marker column -->
@ -72,7 +74,7 @@
class:changeMinus={changes.isMinusChange(child)}> class:changeMinus={changes.isMinusChange(child)}>
<td id="page" /> <td id="page" />
<td class="align-baseline"> <td class="align-baseline">
<ChangeSymbol {changes} item={child} /> <ChangeSymbol {evaluations} {changes} item={child} />
</td> </td>
<td class="whitespace-nowrap"> <td class="whitespace-nowrap">
<div class="flex space-x-1"> <div class="flex space-x-1">

View File

@ -4,6 +4,7 @@
import { PresentationChartLine } from 'svelte-hero-icons'; import { PresentationChartLine } from 'svelte-hero-icons';
import type AnnotatedColumn from '@core/debug/AnnotatedColumn'; import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
import type EvaluationIndex from '@core/transformer/EvaluationIndex';
import type ChangeIndex from '@core/debug/ChangeIndex'; import type ChangeIndex from '@core/debug/ChangeIndex';
import type Page from '@core/debug/Page'; import type Page from '@core/debug/Page';
import ColumnAnnotation from '../../../core/src/debug/ColumnAnnotation'; import ColumnAnnotation from '../../../core/src/debug/ColumnAnnotation';
@ -15,6 +16,7 @@
export let schema: AnnotatedColumn[]; export let schema: AnnotatedColumn[];
export let pages: Page[]; export let pages: Page[];
export let pageControl: PageControl; export let pageControl: PageControl;
export let evaluations: EvaluationIndex;
export let changes: ChangeIndex; export let changes: ChangeIndex;
let { pagePinned } = pageControl; let { pagePinned } = pageControl;
@ -80,7 +82,7 @@
<!-- Page items --> <!-- Page items -->
{#each page.itemGroups as itemGroup, itemIdx} {#each page.itemGroups as itemGroup, itemIdx}
<ItemRow pageIdx={page.index} {itemIdx} {schema} {itemGroup} {changes} {pageControl} /> <ItemRow pageIdx={page.index} {itemIdx} {schema} {itemGroup} {evaluations} {changes} {pageControl} />
{/each} {/each}
{/each} {/each}
</tbody> </tbody>