Annotated schema for debug

This commit is contained in:
Johannes Zillmann 2021-02-14 11:43:26 +01:00
parent 698562ab27
commit 6c72d61590
10 changed files with 257 additions and 107 deletions

View File

@ -1,43 +1,81 @@
import { assert } from './assert';
import Item from './Item';
import ItemResult from './ItemResult';
import ItemTransformer from './transformer/ItemTransformer';
import { calculateSchemas } from './transformer/transformerUtil';
import TransformContext from './transformer/TransformContext';
import StageResult from './debug/StageResult';
import ColumnAnnotation from './debug/ColumnAnnotation';
import AnnotatedColumn from './debug/AnnotatedColumn';
/**
 * Steps through the transformer pipeline one stage at a time and caches every stage's
 * result, so that intermediate results can be inspected.
 */
export default class Debugger {
// parseResult: ParseResult;
context: TransformContext;
transformers: ItemTransformer[];
private context: TransformContext;
private transformers: ItemTransformer[];
private stageResultCache: StageResult[];
// Display names of all stages: 'Parse Result' followed by the name of each transformer.
stageNames: string[];
stageSchema: string[][];
private stageResultCache: ItemResult[];
constructor(
initialSchema: string[],
initialItems: Item[],
context: TransformContext,
transformers: ItemTransformer[],
) {
// this.parseResult = parseResult;
constructor(inputSchema: string[], inputItems: Item[], context: TransformContext, transformers: ItemTransformer[]) {
this.transformers = transformers;
this.context = context;
this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
this.stageResultCache = [{ items: initialItems, messages: [`Parsed ${initialItems[initialItems.length-1].page+1} pages with ${initialItems.length} items`] }];
this.stageSchema = calculateSchemas(initialSchema, transformers);
// Stage 0 is the parse result itself: the input columns without annotations, the input items,
// and a summary message. The page count is the last item's page + 1, or 0 when there are no items.
this.stageResultCache = [
{
schema: inputSchema.map((column) => ({ name: column })),
items: inputItems,
messages: [
`Parsed ${inputItems.length === 0 ? 0 : inputItems[inputItems.length - 1].page + 1} pages with ${
inputItems.length
} items`,
],
},
];
}
//TODO return MarkedItem ? (removed, added, etc..)?
//TODO StageResult == class with schema and marked items ?
/**
 * Returns the result of the stage at `stageIndex`. Stage 0 is the parse result; stage `n` is
 * the output of `transformers[n - 1]` applied to the items of stage `n - 1`. Stages up to and
 * including `stageIndex` that are not cached yet are computed in order and appended to the cache.
 */
stageResults(stageIndex: number): ItemResult {
stageResults(stageIndex: number): StageResult {
for (let idx = 0; idx < stageIndex + 1; idx++) {
if (!this.stageResultCache[idx]) {
// The transformer receives a copy of the previous stage's item array, not the cached array itself.
const stageResult = this.transformers[idx - 1].transform(this.context, [
...this.stageResultCache[idx - 1].items,
]);
this.stageResultCache.push(stageResult);
const transformer = this.transformers[idx - 1];
const previousStageResult: StageResult = this.stageResultCache[idx - 1];
// Columns the previous stage marked as REMOVED are not part of this stage's input schema.
const inputSchema = toSimpleSchema(previousStageResult);
const outputSchema = transformer.schemaTransformer(inputSchema);
const itemResult = transformer.transform(this.context, [...this.stageResultCache[idx - 1].items]);
this.stageResultCache.push({
schema: toAnnotatedSchema(inputSchema, outputSchema),
...itemResult,
});
}
}
return this.stageResultCache[stageIndex];
}
}
/**
 * Reduces the annotated schema of a stage to plain column names, leaving out every
 * column that is annotated as REMOVED.
 */
function toSimpleSchema(stageResult: StageResult): string[] {
  const remainingColumns: string[] = [];
  for (const { name, annotation } of stageResult.schema) {
    if (annotation !== ColumnAnnotation.REMOVED) {
      remainingColumns.push(name);
    }
  }
  return remainingColumns;
}
/**
 * Merges the schema a transformer received with the schema it produced into one annotated
 * column list.
 *
 * - Columns present in both schemas are emitted without annotation, in input order.
 * - Columns only present in `inputSchema` are emitted as REMOVED at their input position.
 * - Columns only present in `outputSchema` are emitted as ADDED, placed in front of the next
 *   retained column that follows them in the output (or at the end).
 *
 * Columns that exist in both schemas but were reordered by the transformer are emitted exactly
 * once and unannotated; they are never reported as ADDED, and they do not cause added columns
 * to be skipped.
 *
 * @param inputSchema column names fed into the transformer
 * @param outputSchema column names produced by the transformer
 * @returns the annotated union of both schemas
 */
function toAnnotatedSchema(inputSchema: string[], outputSchema: string[]): AnnotatedColumn[] {
  const annotatedSchema: AnnotatedColumn[] = [];
  const inputColumns = new Set(inputSchema);
  // Position of the first output column that has not been handled yet.
  let outIdx = 0;

  // Emits the not-yet-handled output columns in front of `end` as ADDED. Output columns that
  // also exist in the input are skipped here because the input loop emits them (unannotated).
  const pushAddedUpTo = (end: number): void => {
    for (; outIdx < end; outIdx++) {
      const name = outputSchema[outIdx];
      if (!inputColumns.has(name)) {
        annotatedSchema.push({ name, annotation: ColumnAnnotation.ADDED });
      }
    }
  };

  for (const column of inputSchema) {
    const indexInOut = outputSchema.indexOf(column);
    if (indexInOut === -1) {
      annotatedSchema.push({ name: column, annotation: ColumnAnnotation.REMOVED });
      continue;
    }
    // Only move the cursor forward. A column the cursor already passed (reordered output)
    // must not advance it, otherwise a later added column would be dropped.
    if (indexInOut >= outIdx) {
      pushAddedUpTo(indexInOut);
      outIdx = indexInOut + 1;
    }
    annotatedSchema.push({ name: column });
  }
  pushAddedUpTo(outputSchema.length);
  return annotatedSchema;
}

View File

@ -2,11 +2,9 @@ import PdfParser from './PdfParser';
import ProgressListenFunction from './ProgressListenFunction';
import ParseProgressReporter from './ParseProgressReporter';
import ItemTransformer from './transformer/ItemTransformer';
import Item from './Item';
import ParseResult from './ParseResult';
import Debugger from './Debugger';
import { verifyRequiredColumns } from './transformer/transformerUtil';
import TransformContext from './transformer/TransformContext';
import { assert } from './assert';
export default class PdfPipeline {
parser: PdfParser;
@ -22,16 +20,17 @@ export default class PdfPipeline {
progressListener: ProgressListenFunction,
): Promise<ParseResult> {
const parseResult = await this.parser.parse(src, new ParseProgressReporter(progressListener));
verifyRequiredColumns(parseResult.schema, this.transformers);
this.verifyRequiredColumns(parseResult.schema, this.transformers);
return parseResult;
}
//TODO PipelineResult
async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
const parseResult = await this.parse(src, progressListener);
this.verifyRequiredColumns(parseResult.schema, this.transformers);
const context = { pageViewports: parseResult.pageViewports };
let items = parseResult.items;
this.transformers.forEach((transformer) => {
items = transformer.transform(context, items);
items = transformer.transform(context, items).items;
});
parseResult.items = items;
return parseResult;
@ -42,4 +41,28 @@ export default class PdfPipeline {
const context = { pageViewports: parseResult.pageViewports };
return new Debugger(parseResult.schema, parseResult.items, context, this.transformers);
}
/**
 * Walks the transformer chain and asserts that every column a transformer requires is present
 * in the schema it receives: `inputSchema` for the first transformer, otherwise the schema
 * produced by the `schemaTransformer` of the transformer before it.
 *
 * @param inputSchema column names of the schema handed to the first transformer
 * @param transformers transformers in the order they are applied
 */
verifyRequiredColumns(inputSchema: string[], transformers: ItemTransformer[]) {
  let currentSchema = inputSchema;
  for (const transformer of transformers) {
    for (const column of transformer.descriptor.requireColumns ?? []) {
      const schemaListing = currentSchema.join(', ');
      assert(
        currentSchema.includes(column),
        `Input schema [${schemaListing}] for transformer '${transformer.name}' does not contain the required column '${column}'`,
      );
    }
    // The output schema of this transformer is the input schema of the next one.
    currentSchema = transformer.schemaTransformer(currentSchema);
  }
}
}

View File

@ -0,0 +1,6 @@
import ColumnAnnotation from './ColumnAnnotation';
/** A schema column together with an optional marker describing how the column changed. */
export default interface AnnotatedColumn {
// Name of the column.
name: string;
// Change marker of the column; optional, so a column may carry no annotation at all.
annotation?: ColumnAnnotation;
}

View File

@ -0,0 +1,6 @@
/** Marker describing how a schema column changed; every member is string-valued. */
enum ColumnAnnotation {
// The column is marked as added.
ADDED = 'ADDED',
// The column is marked as removed.
REMOVED = 'REMOVED',
}
export default ColumnAnnotation;

View File

@ -0,0 +1,8 @@
import Item from '../Item';
import AnnotatedColumn from './AnnotatedColumn';
/** Result of a single stage: its annotated schema, its items and its messages. */
export default interface StageResult {
// Columns of the stage, each with an optional change annotation.
schema: AnnotatedColumn[];
// Items belonging to the stage.
items: Item[];
// Messages reported for the stage.
messages: string[];
}

View File

@ -1,46 +0,0 @@
import TransformerDescriptor from 'src/TransformerDescription';
import { assert } from '../assert';
import ItemTransformer from './ItemTransformer';
/**
 * Goes through all transformers and makes sure each required column (the `requireColumns` of the
 * transformer's descriptor) is available in its predecessor schema.
 *
 * Delegates to `calculateSchemas`, which validates every transformer while computing the
 * schemas; the computed schemas themselves are discarded.
 *
 * @param initialSchema schema handed to the first transformer
 * @param transformers transformers in the order they are applied
 */
export function verifyRequiredColumns(initialSchema: string[], transformers: ItemTransformer[]) {
calculateSchemas(initialSchema, transformers);
}
//TODO debug schema
// initial - all unanotated
// second - 2 removed, 1 added
// third - all as before without the removed
/**
 * Computes the schema at every point of the transformer chain.
 *
 * The result starts with `initialSchema` and is followed by one output schema per transformer,
 * so it holds `transformers.length + 1` entries. Before a transformer's `schemaTransformer` is
 * applied, its required columns are validated against the schema it receives.
 *
 * @param initialSchema schema handed to the first transformer
 * @param transformers transformers in the order they are applied
 * @returns the initial schema followed by the output schema of each transformer
 */
export function calculateSchemas(initialSchema: string[], transformers: ItemTransformer[]): string[][] {
  const schemas: string[][] = [initialSchema];
  let currentSchema = initialSchema;
  for (const transformer of transformers) {
    validateReferences(currentSchema, transformer.name, transformer.descriptor);
    currentSchema = transformer.schemaTransformer(currentSchema);
    schemas.push(currentSchema);
  }
  return schemas;
}
/**
 * Asserts that every column listed in the descriptor's `requireColumns` is part of
 * `inputSchema`. A descriptor without `requireColumns` passes without any check.
 *
 * @param inputSchema column names available to the transformer
 * @param transformerName name used in the assertion message
 * @param transformerDescriptor descriptor whose `requireColumns` are checked
 */
function validateReferences(
  inputSchema: string[],
  transformerName: string,
  transformerDescriptor: TransformerDescriptor,
) {
  for (const column of transformerDescriptor.requireColumns ?? []) {
    const schemaListing = inputSchema.join(', ');
    const isAvailable = inputSchema.includes(column);
    assert(
      isAvailable,
      `Input schema [${schemaListing}] for transformer '${transformerName}' does not contain the required column '${column}' (consumes)`,
    );
  }
}

View File

@ -4,6 +4,8 @@ import ItemTransformer from 'src/transformer/ItemTransformer';
import TransformerDescriptor from 'src/TransformerDescription';
import TransformContext from 'src/transformer/TransformContext';
import ItemResult from 'src/ItemResult';
import ColumnAnnotation from 'src/debug/ColumnAnnotation';
import AnnotatedColumn from 'src/debug/AnnotatedColumn';
class TestTransformer extends ItemTransformer {
items: Item[];
@ -31,7 +33,131 @@ test('basic debug', async () => {
const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers);
expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
expect(debug.stageSchema).toEqual([parsedSchema, ['C']]);
expect(debug.stageResults(0).schema).toEqual(parsedSchema.map((column) => ({ name: column })));
expect(debug.stageResults(1).schema).toEqual([
...parsedSchema.map((column) => ({ name: column, annotation: ColumnAnnotation.REMOVED })),
{ name: 'C', annotation: ColumnAnnotation.ADDED },
]);
expect(debug.stageResults(0).items).toEqual(parsedItems);
expect(debug.stageResults(1).items).toEqual(trans1Items);
});
// Table-driven checks of the annotated schema the Debugger reports for a single transformer
// that turns `input` into `output`.
describe('build schemas', () => {
  const items: Item[] = [];

  // Shorthands for the three kinds of expected columns.
  const kept = (name: string): AnnotatedColumn => ({ name });
  const added = (name: string): AnnotatedColumn => ({ name, annotation: ColumnAnnotation.ADDED });
  const removed = (name: string): AnnotatedColumn => ({ name, annotation: ColumnAnnotation.REMOVED });

  // Runs one transformer with the given output schema and returns the schema of its stage.
  function calculateSchema(inputSchema: string[], outputSchema: string[]): AnnotatedColumn[] {
    const transformers = [new TestTransformer('Trans1', {}, outputSchema, items)];
    const debug = new Debugger(inputSchema, items, { pageViewports: [] }, transformers);
    return debug.stageResults(1).schema;
  }

  const cases: { title: string; input: string[]; output: string[]; expected: AnnotatedColumn[] }[] = [
    {
      title: 'Add',
      input: ['A', 'B'],
      output: ['A', 'B', 'C'],
      expected: [kept('A'), kept('B'), added('C')],
    },
    {
      title: 'Remove',
      input: ['A', 'B'],
      output: ['A'],
      expected: [kept('A'), removed('B')],
    },
    {
      title: 'Replace first',
      input: ['A', 'B', 'C'],
      output: ['X', 'B', 'C'],
      expected: [removed('A'), added('X'), kept('B'), kept('C')],
    },
    {
      title: 'Replace middle',
      input: ['A', 'B', 'C'],
      output: ['A', 'X', 'C'],
      expected: [kept('A'), removed('B'), added('X'), kept('C')],
    },
    {
      title: 'Replace last',
      input: ['A', 'B', 'C'],
      output: ['A', 'B', 'X'],
      expected: [kept('A'), kept('B'), removed('C'), added('X')],
    },
    {
      title: 'Replace first with 2',
      input: ['A', 'B', 'C'],
      output: ['X', 'Y', 'B', 'C'],
      expected: [removed('A'), added('X'), added('Y'), kept('B'), kept('C')],
    },
    {
      title: 'Replace middle with 2',
      input: ['A', 'B', 'C'],
      output: ['A', 'X', 'Y', 'C'],
      expected: [kept('A'), removed('B'), added('X'), added('Y'), kept('C')],
    },
    {
      title: 'Replace last with 2',
      input: ['A', 'B', 'C'],
      output: ['A', 'B', 'X', 'Y'],
      expected: [kept('A'), kept('B'), removed('C'), added('X'), added('Y')],
    },
    {
      title: 'Replace 2 with one',
      input: ['A', 'B', 'C', 'D'],
      output: ['A', 'X', 'D'],
      expected: [kept('A'), removed('B'), removed('C'), added('X'), kept('D')],
    },
    {
      title: 'Replace all',
      input: ['A', 'B', 'C'],
      output: ['X'],
      expected: [removed('A'), removed('B'), removed('C'), added('X')],
    },
    {
      title: 'Wild mix all',
      input: ['A', 'B', 'C', 'E', 'F', 'G'],
      output: ['B', 'X', 'E', 'Y', 'Z', 'G', 'XX'],
      expected: [
        removed('A'),
        kept('B'),
        removed('C'),
        added('X'),
        kept('E'),
        removed('F'),
        added('Y'),
        added('Z'),
        kept('G'),
        added('XX'),
      ],
    },
  ];

  for (const { title, input, output, expected } of cases) {
    test(title, async () => {
      const annotatedSchema = calculateSchema(input, output);
      expect(annotatedSchema).toEqual(expected);
    });
  }
});

View File

@ -3,7 +3,10 @@ import Item from 'src/Item';
import ItemResult from 'src/ItemResult';
import ItemTransformer from 'src/transformer/ItemTransformer';
import TransformContext from 'src/transformer/TransformContext';
import { calculateSchemas, verifyRequiredColumns } from 'src/transformer/transformerUtil';
import PdfParser from 'src/PdfParser';
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
import * as fs from 'fs';
import PdfPipeline from 'src/PdfPipeline';
class TestSchemaTransformer extends ItemTransformer {
constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[] | undefined = undefined) {
@ -26,36 +29,15 @@ test('verify valid transform', async () => {
new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']),
new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }, ['A', 'D', 'E']),
];
verifyRequiredColumns(inputSchema, transformers);
const pipeline = new PdfPipeline(new PdfParser(pdfjs), transformers);
pipeline.verifyRequiredColumns(inputSchema, transformers);
});
test('verify invalid consume', async () => {
const inputSchema = ['A', 'B', 'C'];
const transformers = [new TestSchemaTransformer('Consumes X', { requireColumns: ['X'] })];
expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError(
"Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X' (consumes)",
const pipeline = new PdfPipeline(new PdfParser(pdfjs), transformers);
expect(() => pipeline.verifyRequiredColumns(inputSchema, transformers)).toThrowError(
"Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X'",
);
});
// The schema list starts with the input schema and gains one entry per transformer; the last
// transformer is constructed without an explicit output schema.
test('calculate schemas', async () => {
  const inputSchema = ['A', 'B', 'C'];

  const replaceBAndC = new TestSchemaTransformer('Replace B & C with D', { requireColumns: ['B', 'C'] }, ['A', 'D']);
  const createE = new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']);
  const usesADE = new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] });

  const schemas = calculateSchemas(inputSchema, [replaceBAndC, createE, usesADE]);

  const expectedSchemas = [
    ['A', 'B', 'C'],
    ['A', 'D'],
    ['A', 'D', 'E'],
    ['A', 'D', 'E'],
  ];
  expect(schemas).toEqual(expectedSchemas);
});

View File

@ -13,7 +13,6 @@
let currentStage = 0;
$: canNext = currentStage + 1 < stageNames.length;
$: canPrev = currentStage > 0;
$: stageSchema = debug.stageSchema[currentStage];
$: stageResult = debug.stageResults(currentStage);
$: pageFocus = !isNaN(focusedPage);
$: pagesNumbers = new Set(stageResult.items.map((item) => item.page));
@ -41,7 +40,7 @@
</script>
<div class="mx-4">
<!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
<!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
<div>Title: {parseResult.metadata.title()}</div>
<div>Author: {parseResult.metadata.author()}</div> -->
@ -99,7 +98,7 @@
</ul>
<!-- Items -->
<ItemTable schema={stageSchema} itemsByPage={visiblePages} {maxPage} {pageFocus} />
<ItemTable schema={stageResult.schema} itemsByPage={visiblePages} {maxPage} {pageFocus} />
</div>
<style>

View File

@ -1,7 +1,9 @@
<script>
import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
import type Item from '@core/Item';
import ColumnAnnotation from '../../../core/src/debug/ColumnAnnotation';
export let schema: string[];
export let schema: AnnotatedColumn[];
export let itemsByPage: [number, Item[]][];
export let maxPage: number;
export let pageFocus: boolean;
@ -21,6 +23,9 @@
}
return value;
}
//TODO if no ADDED/REMOVE cols
// - have highlight declarations in descriptor
</script>
<!-- Item table -->
@ -30,7 +35,10 @@
<th />
<th>#</th>
{#each schema as column}
<th>{column}</th>
<th
class={column.annotation === ColumnAnnotation.ADDED ? 'text-green-600' : column.annotation === ColumnAnnotation.REMOVED ? 'text-red-600' : ''}>
{column.name}
</th>
{/each}
</thead>
<tbody>
@ -51,7 +59,7 @@
{/if}
<td>{itemIdx}</td>
{#each schema as column}
<td class="select-all">{format(item.data[column])}</td>
<td class="select-all">{format(item.data[column.name])}</td>
{/each}
</tr>
{/each}