From 6c72d615903fa7a6917d169004bb93a1551ea5f0 Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Sun, 14 Feb 2021 11:43:26 +0100 Subject: [PATCH] Annotated schema for debug --- core/src/Debugger.ts | 82 ++++++++--- core/src/PdfPipeline.ts | 33 ++++- core/src/debug/AnnotatedColumn.ts | 6 + core/src/debug/ColumnAnnotation.ts | 6 + core/src/debug/StageResult.ts | 8 ++ core/src/transformer/transformerUtil.ts | 46 ------- core/test/Debugger.test.ts | 128 +++++++++++++++++- ...formerUtil.test.ts => PdfPipeline.test.ts} | 36 ++--- ui/src/debug/DebugView.svelte | 5 +- ui/src/debug/ItemTable.svelte | 14 +- 10 files changed, 257 insertions(+), 107 deletions(-) create mode 100644 core/src/debug/AnnotatedColumn.ts create mode 100644 core/src/debug/ColumnAnnotation.ts create mode 100644 core/src/debug/StageResult.ts delete mode 100644 core/src/transformer/transformerUtil.ts rename core/test/{transformer/transformerUtil.test.ts => PdfPipeline.test.ts} (62%) diff --git a/core/src/Debugger.ts b/core/src/Debugger.ts index da579b5..a5e1e81 100644 --- a/core/src/Debugger.ts +++ b/core/src/Debugger.ts @@ -1,43 +1,81 @@ -import { assert } from './assert'; import Item from './Item'; import ItemResult from './ItemResult'; import ItemTransformer from './transformer/ItemTransformer'; -import { calculateSchemas } from './transformer/transformerUtil'; import TransformContext from './transformer/TransformContext'; +import StageResult from './debug/StageResult'; +import ColumnAnnotation from './debug/ColumnAnnotation'; +import AnnotatedColumn from './debug/AnnotatedColumn'; export default class Debugger { - // parseResult: ParseResult; - context: TransformContext; - transformers: ItemTransformer[]; + private context: TransformContext; + private transformers: ItemTransformer[]; + private stageResultCache: StageResult[]; stageNames: string[]; - stageSchema: string[][]; - private stageResultCache: ItemResult[]; - constructor( - initialSchema: string[], - initialItems: Item[], - context: TransformContext, - transformers: ItemTransformer[], - ) { - // this.parseResult = parseResult; + constructor(inputSchema: string[], inputItems: Item[], context: TransformContext, transformers: ItemTransformer[]) { this.transformers = transformers; this.context = context; this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)]; - this.stageResultCache = [{ items: initialItems, messages: [`Parsed ${initialItems[initialItems.length-1].page+1} pages with ${initialItems.length} items`] }]; - this.stageSchema = calculateSchemas(initialSchema, transformers); + this.stageResultCache = [ + { + schema: inputSchema.map((column) => ({ name: column })), + items: inputItems, + messages: [ + `Parsed ${inputItems.length === 0 ? 0 : inputItems[inputItems.length - 1].page + 1} pages with ${ + inputItems.length + } items`, + ], + }, + ]; } //TODO return MarkedItem ? (removed, added, etc..)? - //TODO StageResult == class with schema and marked items ? - stageResults(stageIndex: number): ItemResult { + stageResults(stageIndex: number): StageResult { for (let idx = 0; idx < stageIndex + 1; idx++) { if (!this.stageResultCache[idx]) { - const stageResult = this.transformers[idx - 1].transform(this.context, [ - ...this.stageResultCache[idx - 1].items, - ]); - this.stageResultCache.push(stageResult); + const transformer = this.transformers[idx - 1]; + const previousStageResult: StageResult = this.stageResultCache[idx - 1]; + const inputSchema = toSimpleSchema(previousStageResult); + const outputSchema = transformer.schemaTransformer(inputSchema); + const itemResult = transformer.transform(this.context, [...this.stageResultCache[idx - 1].items]); + this.stageResultCache.push({ + schema: toAnnotatedSchema(inputSchema, outputSchema), + ...itemResult, + }); } } return this.stageResultCache[stageIndex]; } } + +function toSimpleSchema(stageResult: StageResult): string[] { + return stageResult.schema + .filter((column) => !column.annotation || column.annotation !== ColumnAnnotation.REMOVED) + .map((column) => column.name); +} + +function toAnnotatedSchema(inputSchema: string[], outputSchema: string[]): AnnotatedColumn[] { + const annotatedSchema: AnnotatedColumn[] = []; + let out_idx = 0; + for (let in_idx = 0; in_idx < inputSchema.length; in_idx++) { + const nextInputColumn = inputSchema[in_idx]; + const indexInOut = outputSchema.indexOf(nextInputColumn); + if (indexInOut === -1) { + annotatedSchema.push({ name: nextInputColumn, annotation: ColumnAnnotation.REMOVED }); + } else if (indexInOut > out_idx) { + while (out_idx < indexInOut) { + annotatedSchema.push({ name: outputSchema[out_idx], annotation: ColumnAnnotation.ADDED }); + out_idx++; + } + annotatedSchema.push({ name: nextInputColumn }); + out_idx++; + } else { + annotatedSchema.push({ name: nextInputColumn }); + out_idx++; + } + } + for (let index = out_idx; index < outputSchema.length; index++) { + annotatedSchema.push({ name: outputSchema[index], annotation: ColumnAnnotation.ADDED }); + } + return annotatedSchema; +} diff --git a/core/src/PdfPipeline.ts b/core/src/PdfPipeline.ts index cfb2fb5..69b80a3 100644 --- a/core/src/PdfPipeline.ts +++ b/core/src/PdfPipeline.ts @@ -2,11 +2,9 @@ import PdfParser from './PdfParser'; import ProgressListenFunction from './ProgressListenFunction'; import ParseProgressReporter from './ParseProgressReporter'; import ItemTransformer from './transformer/ItemTransformer'; -import Item from './Item'; import ParseResult from './ParseResult'; import Debugger from './Debugger'; -import { verifyRequiredColumns } from './transformer/transformerUtil'; -import TransformContext from './transformer/TransformContext'; +import { assert } from './assert'; export default class PdfPipeline { parser: PdfParser; @@ -22,16 +20,17 @@ export default class PdfPipeline { progressListener: ProgressListenFunction, ): Promise { const parseResult = await this.parser.parse(src, new ParseProgressReporter(progressListener)); - verifyRequiredColumns(parseResult.schema, this.transformers); + this.verifyRequiredColumns(parseResult.schema, this.transformers); return parseResult; } //TODO PipelineResult async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise { const parseResult = await this.parse(src, progressListener); + this.verifyRequiredColumns(parseResult.schema, this.transformers); const context = { pageViewports: parseResult.pageViewports }; let items = parseResult.items; this.transformers.forEach((transformer) => { - items = transformer.transform(context, items); + items = transformer.transform(context, items).items; }); parseResult.items = items; return parseResult; @@ -42,4 +41,28 @@ export default class PdfPipeline { const context = { pageViewports: parseResult.pageViewports }; return new Debugger(parseResult.schema, parseResult.items, context, this.transformers); } + + /** + * Goes through all transformer and makes sure each required column is available in its predecessor schema. + * + * @param inputSchema + * @param transformers + */ + verifyRequiredColumns(inputSchema: string[], transformers: ItemTransformer[]) { + const schemas: string[][] = [inputSchema]; + for (let idx = 0; idx < transformers.length; idx++) { + const transformer = transformers[idx]; + const predecessorSchema = schemas[idx]; + transformer.descriptor.requireColumns?.forEach((column) => { + assert( + predecessorSchema.includes(column), + `Input schema [${predecessorSchema.join(', ')}] for transformer '${ + transformer.name + }' does not contain the required column '${column}'`, + ); + }); + const outputSchema = transformer.schemaTransformer(predecessorSchema); + schemas.push(outputSchema); + } + } } diff --git a/core/src/debug/AnnotatedColumn.ts b/core/src/debug/AnnotatedColumn.ts new file mode 100644 index 0000000..cc020db --- /dev/null +++ b/core/src/debug/AnnotatedColumn.ts @@ -0,0 +1,6 @@ +import ColumnAnnotation from './ColumnAnnotation'; + +export default interface AnnotatedColumn { + name: string; + annotation?: ColumnAnnotation; +} diff --git a/core/src/debug/ColumnAnnotation.ts b/core/src/debug/ColumnAnnotation.ts new file mode 100644 index 0000000..767a5d4 --- /dev/null +++ b/core/src/debug/ColumnAnnotation.ts @@ -0,0 +1,6 @@ +enum ColumnAnnotation { + ADDED = 'ADDED', + REMOVED = 'REMOVED', +} + +export default ColumnAnnotation; diff --git a/core/src/debug/StageResult.ts b/core/src/debug/StageResult.ts new file mode 100644 index 0000000..5070799 --- /dev/null +++ b/core/src/debug/StageResult.ts @@ -0,0 +1,8 @@ +import Item from '../Item'; +import AnnotatedColumn from './AnnotatedColumn'; + +export default interface StageResult { + schema: AnnotatedColumn[]; + items: Item[]; + messages: string[]; +} diff --git a/core/src/transformer/transformerUtil.ts b/core/src/transformer/transformerUtil.ts deleted file mode 100644 index 5bef901..0000000 --- a/core/src/transformer/transformerUtil.ts +++ /dev/null @@ -1,46 +0,0 @@ -import TransformerDescriptor from 'src/TransformerDescription'; -import { assert } from '../assert'; -import ItemTransformer from './ItemTransformer'; - -/** - * Goes through all transformer and makes sure each required column ({@link TransformerDescription#consumes}) is available in its predecessor schema. - * - * @param initialSchema - * @param transformers - */ -export function verifyRequiredColumns(initialSchema: string[], transformers: ItemTransformer[]) { - calculateSchemas(initialSchema, transformers); -} - -//TODO debug schema -// initial - all unanotated -// second - 2 removed, 1 added -// third - all as before without the removed - -export function calculateSchemas(initialSchema: string[], transformers: ItemTransformer[]): string[][] { - const schemas: string[][] = []; - schemas.push(initialSchema); - for (let idx = 0; idx < transformers.length; idx++) { - const transformer = transformers[idx]; - const inputSchema = schemas[idx]; - validateReferences(inputSchema, transformer.name, transformer.descriptor); - const outputSchema = transformer.schemaTransformer(inputSchema); - schemas.push(outputSchema); - } - return schemas; -} - -function validateReferences( - inputSchema: string[], - transformerName: string, - transformerDescriptor: TransformerDescriptor, -) { - transformerDescriptor.requireColumns?.forEach((column) => { - assert( - inputSchema.includes(column), - `Input schema [${inputSchema.join( - ', ', - )}] for transformer '${transformerName}' does not contain the required column '${column}' (consumes)`, - ); - }); -} diff --git a/core/test/Debugger.test.ts b/core/test/Debugger.test.ts index 92f6e33..1d9f611 100644 --- a/core/test/Debugger.test.ts +++ b/core/test/Debugger.test.ts @@ -4,6 +4,8 @@ import ItemTransformer from 'src/transformer/ItemTransformer'; import TransformerDescriptor from 'src/TransformerDescription'; import TransformContext from 'src/transformer/TransformContext'; import ItemResult from 'src/ItemResult'; +import ColumnAnnotation from 'src/debug/ColumnAnnotation'; +import AnnotatedColumn from 'src/debug/AnnotatedColumn'; class TestTransformer extends ItemTransformer { items: Item[]; @@ -31,7 +33,131 @@ test('basic debug', async () => { const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers); expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']); - expect(debug.stageSchema).toEqual([parsedSchema, ['C']]); + expect(debug.stageResults(0).schema).toEqual(parsedSchema.map((column) => ({ name: column }))); + expect(debug.stageResults(1).schema).toEqual([ + ...parsedSchema.map((column) => ({ name: column, annotation: ColumnAnnotation.REMOVED })), + { name: 'C', annotation: ColumnAnnotation.ADDED }, + ]); expect(debug.stageResults(0).items).toEqual(parsedItems); expect(debug.stageResults(1).items).toEqual(trans1Items); }); + +describe('build schemas', () => { + const items: Item[] = []; + + function calculateSchema(inputSchema: string[], outputSchema: string[]): AnnotatedColumn[] { + const transformers = [new TestTransformer('Trans1', {}, outputSchema, items)]; + const debug = new Debugger(inputSchema, items, { pageViewports: [] }, transformers); + return debug.stageResults(1).schema; + } + + test('Add', async () => { + const annotatedSchema = calculateSchema(['A', 'B'], ['A', 'B', 'C']); + expect(annotatedSchema).toEqual([{ name: 'A' }, { name: 'B' }, { name: 'C', annotation: ColumnAnnotation.ADDED }]); + }); + + test('Remove', async () => { + const annotatedSchema = calculateSchema(['A', 'B'], ['A']); + expect(annotatedSchema).toEqual([{ name: 'A' }, { name: 'B', annotation: ColumnAnnotation.REMOVED }]); + }); + + test('Replace first', async () => { + const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['X', 'B', 'C']); + expect(annotatedSchema).toEqual([ + { name: 'A', annotation: ColumnAnnotation.REMOVED }, + { name: 'X', annotation: ColumnAnnotation.ADDED }, + { name: 'B' }, + { name: 'C' }, + ]); + }); + + test('Replace middle', async () => { + const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['A', 'X', 'C']); + expect(annotatedSchema).toEqual([ + { name: 'A' }, + { name: 'B', annotation: ColumnAnnotation.REMOVED }, + { name: 'X', annotation: ColumnAnnotation.ADDED }, + { name: 'C' }, + ]); + }); + + test('Replace last', async () => { + const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['A', 'B', 'X']); + expect(annotatedSchema).toEqual([ + { name: 'A' }, + { name: 'B' }, + { name: 'C', annotation: ColumnAnnotation.REMOVED }, + { name: 'X', annotation: ColumnAnnotation.ADDED }, + ]); + }); + + test('Replace first with 2', async () => { + const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['X', 'Y', 'B', 'C']); + expect(annotatedSchema).toEqual([ + { name: 'A', annotation: ColumnAnnotation.REMOVED }, + { name: 'X', annotation: ColumnAnnotation.ADDED }, + { name: 'Y', annotation: ColumnAnnotation.ADDED }, + { name: 'B' }, + { name: 'C' }, + ]); + }); + + test('Replace middle with 2', async () => { + const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['A', 'X', 'Y', 'C']); + expect(annotatedSchema).toEqual([ + { name: 'A' }, + { name: 'B', annotation: ColumnAnnotation.REMOVED }, + { name: 'X', annotation: ColumnAnnotation.ADDED }, + { name: 'Y', annotation: ColumnAnnotation.ADDED }, + { name: 'C' }, + ]); + }); + + test('Replace last with 2', async () => { + const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['A', 'B', 'X', 'Y']); + expect(annotatedSchema).toEqual([ + { name: 'A' }, + { name: 'B' }, + { name: 'C', annotation: ColumnAnnotation.REMOVED }, + { name: 'X', annotation: ColumnAnnotation.ADDED }, + { name: 'Y', annotation: ColumnAnnotation.ADDED }, + ]); + }); + + test('Replace 2 with one', async () => { + const annotatedSchema = calculateSchema(['A', 'B', 'C', 'D'], ['A', 'X', 'D']); + expect(annotatedSchema).toEqual([ + { name: 'A' }, + { name: 'B', annotation: ColumnAnnotation.REMOVED }, + { name: 'C', annotation: ColumnAnnotation.REMOVED }, + { name: 'X', annotation: ColumnAnnotation.ADDED }, + { name: 'D' }, + ]); + }); + + test('Replace all', async () => { + const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['X']); + expect(annotatedSchema).toEqual([ + { name: 'A', annotation: ColumnAnnotation.REMOVED }, + { name: 'B', annotation: ColumnAnnotation.REMOVED }, + { name: 'C', annotation: ColumnAnnotation.REMOVED }, + { name: 'X', annotation: ColumnAnnotation.ADDED }, + ]); + }); + + test('Wild mix all', async () => { + const annotatedSchema = calculateSchema(['A', 'B', 'C', 'E', 'F', 'G'], ['B', 'X', 'E', 'Y', 'Z', 'G', 'XX']); + expect(annotatedSchema).toEqual([ + { name: 'A', annotation: ColumnAnnotation.REMOVED }, + { name: 'B' }, + { name: 'C', annotation: ColumnAnnotation.REMOVED }, + { name: 'X', annotation: ColumnAnnotation.ADDED }, + { name: 'E' }, + { name: 'F', annotation: ColumnAnnotation.REMOVED }, + { name: 'Y', annotation: ColumnAnnotation.ADDED }, + { name: 'Z', annotation: ColumnAnnotation.ADDED }, + { name: 'G' }, + { name: 'XX', annotation: ColumnAnnotation.ADDED }, + ]); + }); +}); diff --git a/core/test/transformer/transformerUtil.test.ts b/core/test/PdfPipeline.test.ts similarity index 62% rename from core/test/transformer/transformerUtil.test.ts rename to core/test/PdfPipeline.test.ts index 251d576..d25ee77 100644 --- a/core/test/transformer/transformerUtil.test.ts +++ b/core/test/PdfPipeline.test.ts @@ -3,7 +3,10 @@ import Item from 'src/Item'; import ItemResult from 'src/ItemResult'; import ItemTransformer from 'src/transformer/ItemTransformer'; import TransformContext from 'src/transformer/TransformContext'; -import { calculateSchemas, verifyRequiredColumns } from 'src/transformer/transformerUtil'; +import PdfParser from 'src/PdfParser'; +import * as pdfjs from 'pdfjs-dist/es5/build/pdf'; +import * as fs from 'fs'; +import PdfPipeline from 'src/PdfPipeline'; class TestSchemaTransformer extends ItemTransformer { constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[] | undefined = undefined) { @@ -26,36 +29,15 @@ test('verify valid transform', async () => { new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']), new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }, ['A', 'D', 'E']), ]; - verifyRequiredColumns(inputSchema, transformers); + const pipeline = new PdfPipeline(new PdfParser(pdfjs), transformers); + pipeline.verifyRequiredColumns(inputSchema, transformers); }); test('verify invalid consume', async () => { const inputSchema = ['A', 'B', 'C']; - const transformers = [new TestSchemaTransformer('Consumes X', { requireColumns: ['X'] })]; - expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError( - "Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X' (consumes)", + const pipeline = new PdfPipeline(new PdfParser(pdfjs), transformers); + expect(() => pipeline.verifyRequiredColumns(inputSchema, transformers)).toThrowError( + "Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X'", ); }); - -test('calculate schemas', async () => { - const inputSchema = ['A', 'B', 'C']; - - const transformers = [ - new TestSchemaTransformer( - 'Replace B & C with D', - { - requireColumns: ['B', 'C'], - }, - ['A', 'D'], - ), - new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']), - new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }), - ]; - expect(calculateSchemas(inputSchema, transformers)).toEqual([ - ['A', 'B', 'C'], - ['A', 'D'], - ['A', 'D', 'E'], - ['A', 'D', 'E'], - ]); -}); diff --git a/ui/src/debug/DebugView.svelte b/ui/src/debug/DebugView.svelte index 6d2cd00..676190a 100644 --- a/ui/src/debug/DebugView.svelte +++ b/ui/src/debug/DebugView.svelte @@ -13,7 +13,6 @@ let currentStage = 0; $: canNext = currentStage + 1 < stageNames.length; $: canPrev = currentStage > 0; - $: stageSchema = debug.stageSchema[currentStage]; $: stageResult = debug.stageResults(currentStage); $: pageFocus = !isNaN(focusedPage); $: pagesNumbers = new Set(stageResult.items.map((item) => item.page)); @@ -41,7 +40,7 @@
- @@ -99,7 +98,7 @@ - +