Track evaluations

This commit is contained in:
Johannes Zillmann 2021-03-23 07:25:17 +01:00
parent c98145a63c
commit 0be95e4bbc
15 changed files with 147 additions and 49 deletions

View File

@ -6,28 +6,25 @@ import ColumnAnnotation from './debug/ColumnAnnotation';
import AnnotatedColumn from './debug/AnnotatedColumn';
import { detectChanges } from './debug/detectChanges';
import { asPages } from './debug/Page';
import EvaluationTracker from './transformer/EvaluationTracker';
import ChangeTracker from './debug/ChangeTracker';
import PageViewport from './parse/PageViewport';
export default class Debugger {
private context: TransformContext;
private transformers: ItemTransformer[];
private stageResultCache: StageResult[];
pageCount: number;
fontMap: Map<string, object>;
stageNames: string[];
stageDescriptions: string[];
constructor(
pageCount: number,
public fontMap: Map<string, object>,
private pageViewports: PageViewport[],
public pageCount: number,
inputSchema: string[],
inputItems: Item[],
context: TransformContext,
transformers: ItemTransformer[],
) {
this.transformers = transformers;
this.context = context;
this.fontMap = context.fontMap;
this.pageCount = pageCount;
this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
this.stageDescriptions = ['Initial items as parsed by PDFjs', ...transformers.map((t) => t.description)];
this.stageResultCache = [initialStage(inputSchema, inputItems)];
@ -36,19 +33,21 @@ export default class Debugger {
stageResult(stageIndex: number): StageResult {
for (let idx = 0; idx < stageIndex + 1; idx++) {
if (!this.stageResultCache[idx]) {
const evaluations = new EvaluationTracker();
const context = new TransformContext(this.fontMap, this.pageViewports, evaluations);
const transformer = this.transformers[idx - 1];
const previousStageResult: StageResult = this.stageResultCache[idx - 1];
const previousItems = previousStageResult.itemsCleanedAndUnpacked();
const inputSchema = toSimpleSchema(previousStageResult);
const outputSchema = transformer.schemaTransformer(inputSchema);
const itemResult = transformer.transform(this.context, [...previousItems]);
const itemResult = transformer.transform(context, [...previousItems]);
const changeTracker = new ChangeTracker();
const items = detectChanges(changeTracker, previousItems, itemResult.items);
const pages = asPages(changeTracker, items, transformer.descriptor.debug?.itemMerger);
const changes = new ChangeTracker();
const items = detectChanges(changes, previousItems, itemResult.items);
const pages = asPages(evaluations, changes, items, transformer.descriptor.debug?.itemMerger);
const messages = itemResult.messages;
if (changeTracker.changeCount() > 0 && messages.length === 0) {
messages.unshift(`Detected ${changeTracker.changeCount()} changes`);
if (changes.changeCount() > 0 && messages.length === 0) {
messages.unshift(`Detected ${changes.changeCount()} changes`);
}
this.stageResultCache.push(
@ -56,7 +55,8 @@ export default class Debugger {
transformer.descriptor,
toAnnotatedSchema(inputSchema, outputSchema),
pages,
changeTracker,
evaluations,
changes,
messages,
),
);

View File

@ -5,6 +5,7 @@ import ItemTransformer from './transformer/ItemTransformer';
import ParseResult from './ParseResult';
import Debugger from './Debugger';
import { assert } from './assert';
import TransformContext from './transformer/TransformContext';
export default class PdfPipeline {
parser: PdfParser;
@ -27,9 +28,9 @@ export default class PdfPipeline {
async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
const parseResult = await this.parse(src, progressListener);
this.verifyRequiredColumns(parseResult.schema, this.transformers);
const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports };
let items = parseResult.items;
this.transformers.forEach((transformer) => {
const context = new TransformContext(parseResult.fontMap, parseResult.pageViewports);
items = transformer.transform(context, items).items;
});
parseResult.items = items;
@ -38,8 +39,14 @@ export default class PdfPipeline {
async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<Debugger> {
const parseResult = await this.parse(src, progressListener);
const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports };
return new Debugger(parseResult.pageCount, parseResult.schema, parseResult.items, context, this.transformers);
return new Debugger(
parseResult.fontMap,
parseResult.pageViewports,
parseResult.pageCount,
parseResult.schema,
parseResult.items,
this.transformers,
);
}
/**

View File

@ -1,3 +1,4 @@
import type EvaluationTracker from '../transformer/EvaluationTracker';
import type ChangeTracker from './ChangeTracker';
import type Item from '../Item';
@ -6,5 +7,5 @@ import type Item from '../Item';
*/
export default abstract class ItemMerger {
constructor(public groupKey: string) {}
abstract merge(tracker: ChangeTracker, items: Item[]): Item;
abstract merge(evaluationTracker: EvaluationTracker, changeTracker: ChangeTracker, items: Item[]): Item;
}

View File

@ -1,5 +1,6 @@
import ItemMerger from './ItemMerger';
import Item from '../Item';
import EvaluationTracker from '../transformer/EvaluationTracker';
import ChangeTracker from './ChangeTracker';
export default class LineItemMerger extends ItemMerger {
@ -7,7 +8,7 @@ export default class LineItemMerger extends ItemMerger {
super('line');
}
merge(tracker: ChangeTracker, items: Item[]): Item {
merge(evaluationTracker: EvaluationTracker, changeTracker: ChangeTracker, items: Item[]): Item {
const page = items[0].page;
const line = items[0].data['line'];
const str = items.map((item) => item.data['str']).join(' ');
@ -28,12 +29,14 @@ export default class LineItemMerger extends ItemMerger {
dir: directions,
});
if (items.find((item) => evaluationTracker.evaluated(item))) evaluationTracker.trackEvaluation(newItem);
if (this.trackAsNew) {
tracker.trackAddition(newItem);
} else if (items.every((item) => tracker.isRemoved(item))) {
tracker.trackRemoval(newItem);
} else if (items.find((item) => tracker.hasChanged(item))) {
tracker.trackContentChange(newItem);
changeTracker.trackAddition(newItem);
} else if (items.every((item) => changeTracker.isRemoved(item))) {
changeTracker.trackRemoval(newItem);
} else if (items.find((item) => changeTracker.hasChanged(item))) {
changeTracker.trackContentChange(newItem);
}
return newItem;
}

View File

@ -1,5 +1,6 @@
import Item from '../Item';
import { groupByElement, groupByPage } from '../support/groupingUtils';
import EvaluationTracker from '../transformer/EvaluationTracker';
import ChangeTracker from './ChangeTracker';
import ItemGroup from './ItemGroup';
import ItemMerger from './ItemMerger';
@ -9,13 +10,18 @@ export default interface Page {
itemGroups: ItemGroup[];
}
export function asPages(tracker: ChangeTracker, items: Item[], itemMerger?: ItemMerger): Page[] {
export function asPages(
evaluationTracker: EvaluationTracker,
changeTracker: ChangeTracker,
items: Item[],
itemMerger?: ItemMerger,
): Page[] {
return groupByPage(items).map((pageItems: Item[]) => {
let itemGroups: ItemGroup[];
if (itemMerger) {
itemGroups = groupByElement(pageItems, itemMerger.groupKey).map((groupItems) => {
if (groupItems.length > 1) {
const top = itemMerger.merge(tracker, groupItems);
const top = itemMerger.merge(evaluationTracker, changeTracker, groupItems);
return new ItemGroup(top, groupItems);
} else {
return new ItemGroup(groupItems[0]);

View File

@ -5,12 +5,15 @@ import Page, { asPages } from './Page';
import ChangeIndex from './ChangeIndex';
import ChangeTracker from './ChangeTracker';
import ItemGroup from './ItemGroup';
import EvaluationIndex from '../transformer/EvaluationIndex';
import EvaluationTracker from '../transformer/EvaluationTracker';
export default class StageResult {
constructor(
public descriptor: TransformDescriptor,
public schema: AnnotatedColumn[],
public pages: Page[],
public evaluations: EvaluationIndex,
public changes: ChangeIndex,
public messages: string[],
) {}
@ -50,7 +53,9 @@ export default class StageResult {
(page) =>
({
...page,
itemGroups: page.itemGroups.filter((itemGroup) => this.changes.hasChanged(itemGroup.top)),
itemGroups: page.itemGroups.filter(
(itemGroup) => this.evaluations.evaluated(itemGroup.top) || this.changes.hasChanged(itemGroup.top),
),
} as Page),
);
}
@ -73,12 +78,13 @@ export default class StageResult {
export function initialStage(inputSchema: string[], inputItems: Item[]): StageResult {
const schema = inputSchema.map((column) => ({ name: column }));
const tracker = new ChangeTracker();
const pages = asPages(tracker, inputItems);
const evaluations = new EvaluationTracker();
const changes = new ChangeTracker();
const pages = asPages(evaluations, changes, inputItems);
const messages = [
`Parsed ${inputItems.length === 0 ? 0 : inputItems[inputItems.length - 1].page + 1} pages with ${
inputItems.length
} items`,
];
return new StageResult(toDescriptor({ debug: { showAll: true } }), schema, pages, tracker, messages);
return new StageResult(toDescriptor({ debug: { showAll: true } }), schema, pages, evaluations, changes, messages);
}

View File

@ -8,6 +8,10 @@ export function onlyUniques<T>(value: T, index: number, self: T[]) {
return self.indexOf(value) === index;
}
/**
 * Numeric comparator for ascending order, suitable as the compare function of `Array.prototype.sort`.
 *
 * @param a the left operand
 * @param b the right operand
 * @returns `a - b`: negative when `a` is smaller, zero when equal, positive when `a` is larger
 */
export function ascending<T>(a: number, b: number): number {
  const difference = a - b;
  return difference;
}
/**
 * Counts the entries of an array for which the given predicate holds.
 *
 * @param array the entries to inspect
 * @param find predicate invoked with each entry; a truthy result counts the entry
 * @returns the number of entries accepted by `find` (0 for an empty array)
 */
export function count<T, S>(array: T[], find: (entry: T) => boolean): number {
  let matches = 0;
  array.forEach((entry) => {
    if (find(entry)) {
      matches += 1;
    }
  });
  return matches;
}

View File

@ -0,0 +1,14 @@
import Item from '../Item';
/**
* Query interface over the evaluations tracked for items.
*/
export default interface EvaluationIndex {
/**
* Returns the number of tracked evaluations.
*/
evaluationCount(): number;
/**
* Returns true if the given item has been evaluated.
* @param item the item to look up
*/
evaluated(item: Item): boolean;
}

View File

@ -0,0 +1,23 @@
import { assertDefined } from '../assert';
import Item from '../Item';
import EvaluationIndex from './EvaluationIndex';
/**
 * Set-backed implementation of `EvaluationIndex`.
 *
 * Evaluations are stored as item UUIDs (resolved through `_uuid`), so tracking
 * the same item more than once does not increase the count.
 */
export default class EvaluationTracker implements EvaluationIndex {
  private evaluations = new Set<string>();

  /** Returns the number of distinct item UUIDs tracked so far. */
  evaluationCount(): number {
    const { size } = this.evaluations;
    return size;
  }

  /** Returns true if the UUID of the given item has been tracked. */
  evaluated(item: Item): boolean {
    const key = _uuid(item);
    return this.evaluations.has(key);
  }

  /** Marks the given item as evaluated by storing its UUID. */
  trackEvaluation(item: Item): void {
    const key = _uuid(item);
    this.evaluations.add(key);
  }
}
/**
 * Resolves the UUID of an item by passing `item.uuid` through `assertDefined`
 * with the message 'UUID is not set'.
 * NOTE(review): presumably `assertDefined` fails when the UUID is missing — confirm against '../assert'.
 */
function _uuid(item: Item): string {
  const { uuid } = item;
  return assertDefined(uuid, 'UUID is not set');
}

View File

@ -6,6 +6,7 @@ import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext';
import LineItemMerger from '../debug/LineItemMerger';
import {
ascending,
flatMap,
groupByLine,
groupByPage,
@ -43,14 +44,14 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
transform(context: TransformContext, inputItems: Item[]): ItemResult {
const pageExtracts = buildExtracts(inputItems);
const uniqueYs = flatMap(pageExtracts, (extract) => extract.fringeLines)
const fringeYs = flatMap(pageExtracts, (extract) => extract.fringeLines)
.map((line) => line.y)
.filter(onlyUniques)
.sort((a, b) => a - b);
.sort(ascending);
// console.log('uniqueYs', uniqueYs);
const yToRemove = uniqueYs.filter((y) => {
const yToRemove = fringeYs.filter((y) => {
const yLines = pageExtracts
.map((page) => page.lineByY(y))
.filter((line) => typeof line !== 'undefined') as Line[];
@ -73,11 +74,20 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
//console.log('yToRemove', yToRemove);
let removalCount = 0;
return {
items: transformGroupedByPageAndLine(inputItems, (_, __, items) =>
yToRemove.includes(yFromLine(items)) ? [] : items,
),
messages: [`Filtered out each item with y == ${yToRemove.join('||')}`],
items: transformGroupedByPageAndLine(inputItems, (_, __, lineItems) => {
const itemsY = yFromLine(lineItems);
if (fringeYs.includes(itemsY)) {
lineItems.forEach(context.trackEvaluation.bind(context));
}
if (yToRemove.includes(itemsY)) {
removalCount++;
return [];
}
return lineItems;
}),
messages: [`Filtered out ${removalCount} items with y == ${yToRemove.join('||')}`],
};
}
}
@ -166,14 +176,14 @@ class Line {
constructor(public y: number, public items: Item[]) {}
text() {
text(): string {
if (!this._text) {
this._text = this.items.reduce((all, item) => all + item.data['str'], '');
}
return this._text;
}
textWithoutNumbers() {
textWithoutNumbers(): string {
if (!this._textWithoutNumbers) {
this._textWithoutNumbers = filterOutDigits(this.text());
}

View File

@ -1,6 +1,15 @@
import Item from '../Item';
import PageViewport from '../parse/PageViewport';
import EvaluationTracker from './EvaluationTracker';
export default interface TransformContext {
fontMap: Map<string, object>;
pageViewports: PageViewport[];
/**
 * Context object passed to transformers: exposes the font map and page viewports
 * and lets a transformer report which items it evaluated.
 */
export default class TransformContext {
  /**
   * @param fontMap fonts keyed by string identifier
   * @param pageViewports the page viewports
   * @param evaluations tracker receiving the reported evaluations; a fresh
   *   `EvaluationTracker` is created when none is supplied
   */
  constructor(
    public fontMap: Map<string, object>,
    public pageViewports: PageViewport[],
    private evaluations = new EvaluationTracker(),
  ) {}

  /** Forwards the item to the underlying evaluation tracker. */
  trackEvaluation(item: Item): void {
    const tracker = this.evaluations;
    tracker.trackEvaluation(item);
  }
}

View File

@ -1,4 +1,5 @@
<script>
import type EvaluationIndex from '@core/transformer/EvaluationIndex';
import type ChangeIndex from '@core/debug/ChangeIndex';
import type Item from '@core/Item';
import { Addition, Removal, ContentChange, PositionChange, Direction } from '../../../core/src/debug/ChangeIndex';
@ -10,17 +11,20 @@
MinusCircle as Minus,
ArrowCircleUp as Up,
ArrowCircleDown as Down,
Eye,
} from 'svelte-hero-icons';
export let evaluations: EvaluationIndex;
export let changes: ChangeIndex;
export let item: Item;
$: evaluated = evaluations.evaluated(item);
$: hasChanged = changes.hasChanged(item);
let changeContent: string;
let iconComp: ComponentDefinition;
$: {
let args = { size: '14' };
if (hasChanged) {
let args = { size: '14' };
let change = changes.change(item);
switch (change.constructor.name) {
case PositionChange.name:
@ -43,11 +47,13 @@
default:
throw new Error(`${change.constructor.name}: ${change}`);
}
} else if (evaluated) {
iconComp = new ComponentDefinition(Eye, args);
}
}
</script>
{#if hasChanged}
{#if evaluated || hasChanged}
<div class="flex space-x-0.5 items-center text-xs">
{#if iconComp}
<svelte:component this={iconComp.component} {...iconComp.args} />

View File

@ -49,7 +49,12 @@
<!-- Items -->
{#if visiblePages.find((page) => page.itemGroups.length > 0)}
<ItemTable schema={stageResult.schema} pages={visiblePages} {pageControl} changes={stageResult.changes} />
<ItemTable
schema={stageResult.schema}
pages={visiblePages}
{pageControl}
evaluations={stageResult.evaluations}
changes={stageResult.changes} />
{:else}
<!-- No items visible -->
<div class="flex mt-8">

View File

@ -4,6 +4,7 @@
import { ArrowLeft, ArrowRight } from 'svelte-hero-icons';
import type ItemGroup from '@core/debug/ItemGroup';
import type EvaluationIndex from '@core/transformer/EvaluationIndex';
import type ChangeIndex from '@core/debug/ChangeIndex';
import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
@ -17,6 +18,7 @@
export let itemIdx: number;
export let schema: AnnotatedColumn[];
export let itemGroup: ItemGroup;
export let evaluations: EvaluationIndex;
export let changes: ChangeIndex;
let expandedItemGroup: { pageIndex: number; itemIndex: number };
@ -45,7 +47,7 @@
<td id="page" />
{/if}
<td class="align-middle">
<ChangeSymbol {changes} item={itemGroup.top} />
<ChangeSymbol {evaluations} {changes} item={itemGroup.top} />
</td>
<span class="contents" on:click={() => itemGroup.hasMany() && toggleRow(pageIdx, itemIdx)}>
<!-- ID & change marker column -->
@ -72,7 +74,7 @@
class:changeMinus={changes.isMinusChange(child)}>
<td id="page" />
<td class="align-baseline">
<ChangeSymbol {changes} item={child} />
<ChangeSymbol {evaluations} {changes} item={child} />
</td>
<td class="whitespace-nowrap">
<div class="flex space-x-1">

View File

@ -4,6 +4,7 @@
import { PresentationChartLine } from 'svelte-hero-icons';
import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
import type EvaluationIndex from '@core/transformer/EvaluationIndex';
import type ChangeIndex from '@core/debug/ChangeIndex';
import type Page from '@core/debug/Page';
import ColumnAnnotation from '../../../core/src/debug/ColumnAnnotation';
@ -15,6 +16,7 @@
export let schema: AnnotatedColumn[];
export let pages: Page[];
export let pageControl: PageControl;
export let evaluations: EvaluationIndex;
export let changes: ChangeIndex;
let { pagePinned } = pageControl;
@ -80,7 +82,7 @@
<!-- Page items -->
{#each page.itemGroups as itemGroup, itemIdx}
<ItemRow pageIdx={page.index} {itemIdx} {schema} {itemGroup} {changes} {pageControl} />
<ItemRow pageIdx={page.index} {itemIdx} {schema} {itemGroup} {evaluations} {changes} {pageControl} />
{/each}
{/each}
</tbody>