Annotated schema for debug

This commit is contained in:
Johannes Zillmann 2021-02-14 11:43:26 +01:00
parent 698562ab27
commit 6c72d61590
10 changed files with 257 additions and 107 deletions

View File

@ -1,43 +1,81 @@
import { assert } from './assert';
import Item from './Item'; import Item from './Item';
import ItemResult from './ItemResult'; import ItemResult from './ItemResult';
import ItemTransformer from './transformer/ItemTransformer'; import ItemTransformer from './transformer/ItemTransformer';
import { calculateSchemas } from './transformer/transformerUtil';
import TransformContext from './transformer/TransformContext'; import TransformContext from './transformer/TransformContext';
import StageResult from './debug/StageResult';
import ColumnAnnotation from './debug/ColumnAnnotation';
import AnnotatedColumn from './debug/AnnotatedColumn';
export default class Debugger { export default class Debugger {
// parseResult: ParseResult; private context: TransformContext;
context: TransformContext; private transformers: ItemTransformer[];
transformers: ItemTransformer[]; private stageResultCache: StageResult[];
stageNames: string[]; stageNames: string[];
stageSchema: string[][];
private stageResultCache: ItemResult[];
constructor( constructor(inputSchema: string[], inputItems: Item[], context: TransformContext, transformers: ItemTransformer[]) {
initialSchema: string[],
initialItems: Item[],
context: TransformContext,
transformers: ItemTransformer[],
) {
// this.parseResult = parseResult;
this.transformers = transformers; this.transformers = transformers;
this.context = context; this.context = context;
this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)]; this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
this.stageResultCache = [{ items: initialItems, messages: [`Parsed ${initialItems[initialItems.length-1].page+1} pages with ${initialItems.length} items`] }]; this.stageResultCache = [
this.stageSchema = calculateSchemas(initialSchema, transformers); {
schema: inputSchema.map((column) => ({ name: column })),
items: inputItems,
messages: [
`Parsed ${inputItems.length === 0 ? 0 : inputItems[inputItems.length - 1].page + 1} pages with ${
inputItems.length
} items`,
],
},
];
} }
//TODO return MarkedItem ? (removed, added, etc..)? //TODO return MarkedItem ? (removed, added, etc..)?
//TODO StageResult == class with schema and marked items ? stageResults(stageIndex: number): StageResult {
stageResults(stageIndex: number): ItemResult {
for (let idx = 0; idx < stageIndex + 1; idx++) { for (let idx = 0; idx < stageIndex + 1; idx++) {
if (!this.stageResultCache[idx]) { if (!this.stageResultCache[idx]) {
const stageResult = this.transformers[idx - 1].transform(this.context, [ const transformer = this.transformers[idx - 1];
...this.stageResultCache[idx - 1].items, const previousStageResult: StageResult = this.stageResultCache[idx - 1];
]); const inputSchema = toSimpleSchema(previousStageResult);
this.stageResultCache.push(stageResult); const outputSchema = transformer.schemaTransformer(inputSchema);
const itemResult = transformer.transform(this.context, [...this.stageResultCache[idx - 1].items]);
this.stageResultCache.push({
schema: toAnnotatedSchema(inputSchema, outputSchema),
...itemResult,
});
} }
} }
return this.stageResultCache[stageIndex]; return this.stageResultCache[stageIndex];
} }
} }
/**
 * Flattens the annotated schema of a stage into plain column names.
 *
 * Columns annotated as REMOVED are dropped; columns without annotation and columns
 * annotated as ADDED are kept, in schema order.
 *
 * @param stageResult the stage whose schema should be flattened
 * @returns the names of all columns that are not annotated as REMOVED
 */
function toSimpleSchema(stageResult: StageResult): string[] {
  const columnNames: string[] = [];
  stageResult.schema.forEach((column) => {
    // An absent annotation never equals REMOVED, so a single comparison covers both cases.
    if (column.annotation !== ColumnAnnotation.REMOVED) {
      columnNames.push(column.name);
    }
  });
  return columnNames;
}
/**
 * Merges the schema a transformer received with the schema it produced into a single
 * column list in which dropped columns are annotated as REMOVED and new columns as ADDED.
 *
 * Guarantee: the non-REMOVED entries of the result are exactly `outputSchema`, in order.
 * A column that the transformer moved to an earlier position therefore shows up twice:
 * as ADDED at its new position and as REMOVED at its old one.
 *
 * @param inputSchema column names handed to the transformer
 * @param outputSchema column names produced by the transformer
 * @returns the combined schema with REMOVED/ADDED annotations
 */
function toAnnotatedSchema(inputSchema: string[], outputSchema: string[]): AnnotatedColumn[] {
  const annotatedSchema: AnnotatedColumn[] = [];
  // Number of output columns already emitted: outputSchema[0..outIdx) is part of the result.
  let outIdx = 0;
  for (const inputColumn of inputSchema) {
    // Look only at output columns that were not emitted yet. A column that is missing there
    // was either dropped or moved to an earlier position (and is already listed as ADDED);
    // in both cases its old position is reported as REMOVED.
    const indexInOut = outputSchema.indexOf(inputColumn, outIdx);
    if (indexInOut === -1) {
      annotatedSchema.push({ name: inputColumn, annotation: ColumnAnnotation.REMOVED });
      continue;
    }
    // Everything the output lists before this column is new.
    for (; outIdx < indexInOut; outIdx++) {
      annotatedSchema.push({ name: outputSchema[outIdx], annotation: ColumnAnnotation.ADDED });
    }
    annotatedSchema.push({ name: inputColumn });
    outIdx = indexInOut + 1;
  }
  // Output columns after the last retained input column are new as well.
  for (; outIdx < outputSchema.length; outIdx++) {
    annotatedSchema.push({ name: outputSchema[outIdx], annotation: ColumnAnnotation.ADDED });
  }
  return annotatedSchema;
}

View File

@ -2,11 +2,9 @@ import PdfParser from './PdfParser';
import ProgressListenFunction from './ProgressListenFunction'; import ProgressListenFunction from './ProgressListenFunction';
import ParseProgressReporter from './ParseProgressReporter'; import ParseProgressReporter from './ParseProgressReporter';
import ItemTransformer from './transformer/ItemTransformer'; import ItemTransformer from './transformer/ItemTransformer';
import Item from './Item';
import ParseResult from './ParseResult'; import ParseResult from './ParseResult';
import Debugger from './Debugger'; import Debugger from './Debugger';
import { verifyRequiredColumns } from './transformer/transformerUtil'; import { assert } from './assert';
import TransformContext from './transformer/TransformContext';
export default class PdfPipeline { export default class PdfPipeline {
parser: PdfParser; parser: PdfParser;
@ -22,16 +20,17 @@ export default class PdfPipeline {
progressListener: ProgressListenFunction, progressListener: ProgressListenFunction,
): Promise<ParseResult> { ): Promise<ParseResult> {
const parseResult = await this.parser.parse(src, new ParseProgressReporter(progressListener)); const parseResult = await this.parser.parse(src, new ParseProgressReporter(progressListener));
verifyRequiredColumns(parseResult.schema, this.transformers); this.verifyRequiredColumns(parseResult.schema, this.transformers);
return parseResult; return parseResult;
} }
//TODO PipelineResult //TODO PipelineResult
async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> { async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
const parseResult = await this.parse(src, progressListener); const parseResult = await this.parse(src, progressListener);
this.verifyRequiredColumns(parseResult.schema, this.transformers);
const context = { pageViewports: parseResult.pageViewports }; const context = { pageViewports: parseResult.pageViewports };
let items = parseResult.items; let items = parseResult.items;
this.transformers.forEach((transformer) => { this.transformers.forEach((transformer) => {
items = transformer.transform(context, items); items = transformer.transform(context, items).items;
}); });
parseResult.items = items; parseResult.items = items;
return parseResult; return parseResult;
@ -42,4 +41,28 @@ export default class PdfPipeline {
const context = { pageViewports: parseResult.pageViewports }; const context = { pageViewports: parseResult.pageViewports };
return new Debugger(parseResult.schema, parseResult.items, context, this.transformers); return new Debugger(parseResult.schema, parseResult.items, context, this.transformers);
} }
/**
 * Walks the transformers in order and asserts that every column a transformer lists in
 * `descriptor.requireColumns` is present in the schema handed over by its predecessor
 * (the given input schema for the first transformer).
 *
 * @param inputSchema column names of the schema fed into the first transformer
 * @param transformers the transformers, in execution order
 */
verifyRequiredColumns(inputSchema: string[], transformers: ItemTransformer[]) {
  // Schema seen by the transformer currently being checked.
  let currentSchema = inputSchema;
  for (const transformer of transformers) {
    for (const column of transformer.descriptor.requireColumns ?? []) {
      assert(
        currentSchema.includes(column),
        `Input schema [${currentSchema.join(', ')}] for transformer '${
          transformer.name
        }' does not contain the required column '${column}'`,
      );
    }
    // The output of this transformer is the input of the next one.
    currentSchema = transformer.schemaTransformer(currentSchema);
  }
}
} }

View File

@ -0,0 +1,6 @@
import ColumnAnnotation from './ColumnAnnotation';
/**
 * A schema column together with an optional annotation describing what happened to it.
 */
export default interface AnnotatedColumn {
// Name of the column.
name: string;
// Optional marker (ADDED / REMOVED); left out when the column carries no annotation.
annotation?: ColumnAnnotation;
}

View File

@ -0,0 +1,6 @@
/**
 * Markers that can be attached to a schema column (see the optional `annotation` of an annotated column).
 * String-valued, so each member's runtime value equals its name.
 */
enum ColumnAnnotation {
// The column was added.
ADDED = 'ADDED',
// The column was removed.
REMOVED = 'REMOVED',
}
export default ColumnAnnotation;

View File

@ -0,0 +1,8 @@
import Item from '../Item';
import AnnotatedColumn from './AnnotatedColumn';
/**
 * Result of a single debug stage: an annotated schema plus the items and messages of that stage.
 */
export default interface StageResult {
// Columns of the stage, each optionally annotated.
schema: AnnotatedColumn[];
// Items belonging to the stage.
items: Item[];
// Messages belonging to the stage. NOTE(review): presumably human-readable info for the debug view — confirm.
messages: string[];
}

View File

@ -1,46 +0,0 @@
import TransformerDescriptor from 'src/TransformerDescription';
import { assert } from '../assert';
import ItemTransformer from './ItemTransformer';
/**
 * Goes through all transformers and makes sure each required column (`requireColumns` of the
 * transformer's descriptor) is available in its predecessor schema.
 *
 * Delegates to {@link calculateSchemas}, which performs the check for every transformer while
 * computing the schemas; the computed schemas themselves are discarded here.
 *
 * @param initialSchema schema handed to the first transformer
 * @param transformers transformers in the order they are applied
 */
export function verifyRequiredColumns(initialSchema: string[], transformers: ItemTransformer[]) {
calculateSchemas(initialSchema, transformers);
}
//TODO debug schema
// initial - all unannotated
// second - 2 removed, 1 added
// third - all as before without the removed
/**
 * Computes the schema at every point of the transformer chain.
 *
 * The first entry of the result is the initial schema; entry `i + 1` is the schema produced by
 * transformer `i`. Before a transformer's schema is computed, its required columns are validated
 * against the schema it receives.
 *
 * @param initialSchema schema handed to the first transformer
 * @param transformers transformers in the order they are applied
 * @returns `transformers.length + 1` schemas, starting with the initial one
 */
export function calculateSchemas(initialSchema: string[], transformers: ItemTransformer[]): string[][] {
  const schemas: string[][] = [initialSchema];
  let currentSchema = initialSchema;
  for (const transformer of transformers) {
    validateReferences(currentSchema, transformer.name, transformer.descriptor);
    currentSchema = transformer.schemaTransformer(currentSchema);
    schemas.push(currentSchema);
  }
  return schemas;
}
/**
 * Asserts that every column the descriptor lists in `requireColumns` exists in the input schema.
 * A descriptor without `requireColumns` passes without any check.
 *
 * @param inputSchema schema the transformer receives
 * @param transformerName name used in the assertion message
 * @param transformerDescriptor descriptor whose `requireColumns` are checked
 */
function validateReferences(
  inputSchema: string[],
  transformerName: string,
  transformerDescriptor: TransformerDescriptor,
) {
  for (const column of transformerDescriptor.requireColumns ?? []) {
    const message = `Input schema [${inputSchema.join(
      ', ',
    )}] for transformer '${transformerName}' does not contain the required column '${column}' (consumes)`;
    assert(inputSchema.includes(column), message);
  }
}

View File

@ -4,6 +4,8 @@ import ItemTransformer from 'src/transformer/ItemTransformer';
import TransformerDescriptor from 'src/TransformerDescription'; import TransformerDescriptor from 'src/TransformerDescription';
import TransformContext from 'src/transformer/TransformContext'; import TransformContext from 'src/transformer/TransformContext';
import ItemResult from 'src/ItemResult'; import ItemResult from 'src/ItemResult';
import ColumnAnnotation from 'src/debug/ColumnAnnotation';
import AnnotatedColumn from 'src/debug/AnnotatedColumn';
class TestTransformer extends ItemTransformer { class TestTransformer extends ItemTransformer {
items: Item[]; items: Item[];
@ -31,7 +33,131 @@ test('basic debug', async () => {
const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers); const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers);
expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']); expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
expect(debug.stageSchema).toEqual([parsedSchema, ['C']]); expect(debug.stageResults(0).schema).toEqual(parsedSchema.map((column) => ({ name: column })));
expect(debug.stageResults(1).schema).toEqual([
...parsedSchema.map((column) => ({ name: column, annotation: ColumnAnnotation.REMOVED })),
{ name: 'C', annotation: ColumnAnnotation.ADDED },
]);
expect(debug.stageResults(0).items).toEqual(parsedItems); expect(debug.stageResults(0).items).toEqual(parsedItems);
expect(debug.stageResults(1).items).toEqual(trans1Items); expect(debug.stageResults(1).items).toEqual(trans1Items);
}); });
// Checks the annotated schema the Debugger reports for stage 1 for various combinations of
// input schema and transformer output schema (added, removed and replaced columns).
describe('build schemas', () => {
const items: Item[] = [];
// Runs a Debugger with a single TestTransformer and returns the annotated schema of stage 1.
// NOTE(review): presumably the third constructor argument is the schema the transformer outputs — confirm.
function calculateSchema(inputSchema: string[], outputSchema: string[]): AnnotatedColumn[] {
const transformers = [new TestTransformer('Trans1', {}, outputSchema, items)];
const debug = new Debugger(inputSchema, items, { pageViewports: [] }, transformers);
return debug.stageResults(1).schema;
}
test('Add', async () => {
const annotatedSchema = calculateSchema(['A', 'B'], ['A', 'B', 'C']);
expect(annotatedSchema).toEqual([{ name: 'A' }, { name: 'B' }, { name: 'C', annotation: ColumnAnnotation.ADDED }]);
});
test('Remove', async () => {
const annotatedSchema = calculateSchema(['A', 'B'], ['A']);
expect(annotatedSchema).toEqual([{ name: 'A' }, { name: 'B', annotation: ColumnAnnotation.REMOVED }]);
});
// Replacements: the removed column is listed before the column(s) that take its place.
test('Replace first', async () => {
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['X', 'B', 'C']);
expect(annotatedSchema).toEqual([
{ name: 'A', annotation: ColumnAnnotation.REMOVED },
{ name: 'X', annotation: ColumnAnnotation.ADDED },
{ name: 'B' },
{ name: 'C' },
]);
});
test('Replace middle', async () => {
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['A', 'X', 'C']);
expect(annotatedSchema).toEqual([
{ name: 'A' },
{ name: 'B', annotation: ColumnAnnotation.REMOVED },
{ name: 'X', annotation: ColumnAnnotation.ADDED },
{ name: 'C' },
]);
});
test('Replace last', async () => {
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['A', 'B', 'X']);
expect(annotatedSchema).toEqual([
{ name: 'A' },
{ name: 'B' },
{ name: 'C', annotation: ColumnAnnotation.REMOVED },
{ name: 'X', annotation: ColumnAnnotation.ADDED },
]);
});
test('Replace first with 2', async () => {
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['X', 'Y', 'B', 'C']);
expect(annotatedSchema).toEqual([
{ name: 'A', annotation: ColumnAnnotation.REMOVED },
{ name: 'X', annotation: ColumnAnnotation.ADDED },
{ name: 'Y', annotation: ColumnAnnotation.ADDED },
{ name: 'B' },
{ name: 'C' },
]);
});
test('Replace middle with 2', async () => {
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['A', 'X', 'Y', 'C']);
expect(annotatedSchema).toEqual([
{ name: 'A' },
{ name: 'B', annotation: ColumnAnnotation.REMOVED },
{ name: 'X', annotation: ColumnAnnotation.ADDED },
{ name: 'Y', annotation: ColumnAnnotation.ADDED },
{ name: 'C' },
]);
});
test('Replace last with 2', async () => {
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['A', 'B', 'X', 'Y']);
expect(annotatedSchema).toEqual([
{ name: 'A' },
{ name: 'B' },
{ name: 'C', annotation: ColumnAnnotation.REMOVED },
{ name: 'X', annotation: ColumnAnnotation.ADDED },
{ name: 'Y', annotation: ColumnAnnotation.ADDED },
]);
});
test('Replace 2 with one', async () => {
const annotatedSchema = calculateSchema(['A', 'B', 'C', 'D'], ['A', 'X', 'D']);
expect(annotatedSchema).toEqual([
{ name: 'A' },
{ name: 'B', annotation: ColumnAnnotation.REMOVED },
{ name: 'C', annotation: ColumnAnnotation.REMOVED },
{ name: 'X', annotation: ColumnAnnotation.ADDED },
{ name: 'D' },
]);
});
test('Replace all', async () => {
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['X']);
expect(annotatedSchema).toEqual([
{ name: 'A', annotation: ColumnAnnotation.REMOVED },
{ name: 'B', annotation: ColumnAnnotation.REMOVED },
{ name: 'C', annotation: ColumnAnnotation.REMOVED },
{ name: 'X', annotation: ColumnAnnotation.ADDED },
]);
});
// Combination of removals, additions and retained columns in one schema change.
test('Wild mix all', async () => {
const annotatedSchema = calculateSchema(['A', 'B', 'C', 'E', 'F', 'G'], ['B', 'X', 'E', 'Y', 'Z', 'G', 'XX']);
expect(annotatedSchema).toEqual([
{ name: 'A', annotation: ColumnAnnotation.REMOVED },
{ name: 'B' },
{ name: 'C', annotation: ColumnAnnotation.REMOVED },
{ name: 'X', annotation: ColumnAnnotation.ADDED },
{ name: 'E' },
{ name: 'F', annotation: ColumnAnnotation.REMOVED },
{ name: 'Y', annotation: ColumnAnnotation.ADDED },
{ name: 'Z', annotation: ColumnAnnotation.ADDED },
{ name: 'G' },
{ name: 'XX', annotation: ColumnAnnotation.ADDED },
]);
});
});

View File

@ -3,7 +3,10 @@ import Item from 'src/Item';
import ItemResult from 'src/ItemResult'; import ItemResult from 'src/ItemResult';
import ItemTransformer from 'src/transformer/ItemTransformer'; import ItemTransformer from 'src/transformer/ItemTransformer';
import TransformContext from 'src/transformer/TransformContext'; import TransformContext from 'src/transformer/TransformContext';
import { calculateSchemas, verifyRequiredColumns } from 'src/transformer/transformerUtil'; import PdfParser from 'src/PdfParser';
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
import * as fs from 'fs';
import PdfPipeline from 'src/PdfPipeline';
class TestSchemaTransformer extends ItemTransformer { class TestSchemaTransformer extends ItemTransformer {
constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[] | undefined = undefined) { constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[] | undefined = undefined) {
@ -26,36 +29,15 @@ test('verify valid transform', async () => {
new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']), new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']),
new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }, ['A', 'D', 'E']), new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }, ['A', 'D', 'E']),
]; ];
verifyRequiredColumns(inputSchema, transformers); const pipeline = new PdfPipeline(new PdfParser(pdfjs), transformers);
pipeline.verifyRequiredColumns(inputSchema, transformers);
}); });
test('verify invalid consume', async () => { test('verify invalid consume', async () => {
const inputSchema = ['A', 'B', 'C']; const inputSchema = ['A', 'B', 'C'];
const transformers = [new TestSchemaTransformer('Consumes X', { requireColumns: ['X'] })]; const transformers = [new TestSchemaTransformer('Consumes X', { requireColumns: ['X'] })];
expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError( const pipeline = new PdfPipeline(new PdfParser(pdfjs), transformers);
"Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X' (consumes)", expect(() => pipeline.verifyRequiredColumns(inputSchema, transformers)).toThrowError(
"Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X'",
); );
}); });
// Expects calculateSchemas to return the input schema followed by one schema per transformer.
// NOTE(review): the last transformer gets no output schema, yet the expected schema equals the
// previous one — presumably TestSchemaTransformer passes the schema through in that case; confirm.
test('calculate schemas', async () => {
const inputSchema = ['A', 'B', 'C'];
const transformers = [
new TestSchemaTransformer(
'Replace B & C with D',
{
requireColumns: ['B', 'C'],
},
['A', 'D'],
),
new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']),
new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }),
];
expect(calculateSchemas(inputSchema, transformers)).toEqual([
['A', 'B', 'C'],
['A', 'D'],
['A', 'D', 'E'],
['A', 'D', 'E'],
]);
});

View File

@ -13,7 +13,6 @@
let currentStage = 0; let currentStage = 0;
$: canNext = currentStage + 1 < stageNames.length; $: canNext = currentStage + 1 < stageNames.length;
$: canPrev = currentStage > 0; $: canPrev = currentStage > 0;
$: stageSchema = debug.stageSchema[currentStage];
$: stageResult = debug.stageResults(currentStage); $: stageResult = debug.stageResults(currentStage);
$: pageFocus = !isNaN(focusedPage); $: pageFocus = !isNaN(focusedPage);
$: pagesNumbers = new Set(stageResult.items.map((item) => item.page)); $: pagesNumbers = new Set(stageResult.items.map((item) => item.page));
@ -41,7 +40,7 @@
</script> </script>
<div class="mx-4"> <div class="mx-4">
<!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div> <!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
<div>Title: {parseResult.metadata.title()}</div> <div>Title: {parseResult.metadata.title()}</div>
<div>Author: {parseResult.metadata.author()}</div> --> <div>Author: {parseResult.metadata.author()}</div> -->
@ -99,7 +98,7 @@
</ul> </ul>
<!-- Items --> <!-- Items -->
<ItemTable schema={stageSchema} itemsByPage={visiblePages} {maxPage} {pageFocus} /> <ItemTable schema={stageResult.schema} itemsByPage={visiblePages} {maxPage} {pageFocus} />
</div> </div>
<style> <style>

View File

@ -1,7 +1,9 @@
<script> <script>
import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
import type Item from '@core/Item'; import type Item from '@core/Item';
import ColumnAnnotation from '../../../core/src/debug/ColumnAnnotation';
export let schema: string[]; export let schema: AnnotatedColumn[];
export let itemsByPage: [number, Item[]][]; export let itemsByPage: [number, Item[]][];
export let maxPage: number; export let maxPage: number;
export let pageFocus: boolean; export let pageFocus: boolean;
@ -21,6 +23,9 @@
} }
return value; return value;
} }
//TODO if no ADDED/REMOVE cols
// - have highlight declarations in descriptor
</script> </script>
<!-- Item table --> <!-- Item table -->
@ -30,7 +35,10 @@
<th /> <th />
<th>#</th> <th>#</th>
{#each schema as column} {#each schema as column}
<th>{column}</th> <th
class={column.annotation === ColumnAnnotation.ADDED ? 'text-green-600' : column.annotation === ColumnAnnotation.REMOVED ? 'text-red-600' : ''}>
{column.name}
</th>
{/each} {/each}
</thead> </thead>
<tbody> <tbody>
@ -51,7 +59,7 @@
{/if} {/if}
<td>{itemIdx}</td> <td>{itemIdx}</td>
{#each schema as column} {#each schema as column}
<td class="select-all">{format(item.data[column])}</td> <td class="select-all">{format(item.data[column.name])}</td>
{/each} {/each}
</tr> </tr>
{/each} {/each}