mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-02 19:09:30 +01:00
Annotated schema for debug
This commit is contained in:
parent
698562ab27
commit
6c72d61590
@ -1,43 +1,81 @@
|
|||||||
import { assert } from './assert';
|
|
||||||
import Item from './Item';
|
import Item from './Item';
|
||||||
import ItemResult from './ItemResult';
|
import ItemResult from './ItemResult';
|
||||||
import ItemTransformer from './transformer/ItemTransformer';
|
import ItemTransformer from './transformer/ItemTransformer';
|
||||||
import { calculateSchemas } from './transformer/transformerUtil';
|
|
||||||
import TransformContext from './transformer/TransformContext';
|
import TransformContext from './transformer/TransformContext';
|
||||||
|
import StageResult from './debug/StageResult';
|
||||||
|
import ColumnAnnotation from './debug/ColumnAnnotation';
|
||||||
|
import AnnotatedColumn from './debug/AnnotatedColumn';
|
||||||
|
|
||||||
export default class Debugger {
|
export default class Debugger {
|
||||||
// parseResult: ParseResult;
|
private context: TransformContext;
|
||||||
context: TransformContext;
|
private transformers: ItemTransformer[];
|
||||||
transformers: ItemTransformer[];
|
private stageResultCache: StageResult[];
|
||||||
stageNames: string[];
|
stageNames: string[];
|
||||||
stageSchema: string[][];
|
|
||||||
private stageResultCache: ItemResult[];
|
|
||||||
|
|
||||||
constructor(
|
constructor(inputSchema: string[], inputItems: Item[], context: TransformContext, transformers: ItemTransformer[]) {
|
||||||
initialSchema: string[],
|
|
||||||
initialItems: Item[],
|
|
||||||
context: TransformContext,
|
|
||||||
transformers: ItemTransformer[],
|
|
||||||
) {
|
|
||||||
// this.parseResult = parseResult;
|
|
||||||
this.transformers = transformers;
|
this.transformers = transformers;
|
||||||
this.context = context;
|
this.context = context;
|
||||||
this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
|
this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
|
||||||
this.stageResultCache = [{ items: initialItems, messages: [`Parsed ${initialItems[initialItems.length-1].page+1} pages with ${initialItems.length} items`] }];
|
this.stageResultCache = [
|
||||||
this.stageSchema = calculateSchemas(initialSchema, transformers);
|
{
|
||||||
|
schema: inputSchema.map((column) => ({ name: column })),
|
||||||
|
items: inputItems,
|
||||||
|
messages: [
|
||||||
|
`Parsed ${inputItems.length === 0 ? 0 : inputItems[inputItems.length - 1].page + 1} pages with ${
|
||||||
|
inputItems.length
|
||||||
|
} items`,
|
||||||
|
],
|
||||||
|
},
|
||||||
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
//TODO return MarkedItem ? (removed, added, etc..)?
|
//TODO return MarkedItem ? (removed, added, etc..)?
|
||||||
//TODO StageResult == class with schema and marked items ?
|
stageResults(stageIndex: number): StageResult {
|
||||||
stageResults(stageIndex: number): ItemResult {
|
|
||||||
for (let idx = 0; idx < stageIndex + 1; idx++) {
|
for (let idx = 0; idx < stageIndex + 1; idx++) {
|
||||||
if (!this.stageResultCache[idx]) {
|
if (!this.stageResultCache[idx]) {
|
||||||
const stageResult = this.transformers[idx - 1].transform(this.context, [
|
const transformer = this.transformers[idx - 1];
|
||||||
...this.stageResultCache[idx - 1].items,
|
const previousStageResult: StageResult = this.stageResultCache[idx - 1];
|
||||||
]);
|
const inputSchema = toSimpleSchema(previousStageResult);
|
||||||
this.stageResultCache.push(stageResult);
|
const outputSchema = transformer.schemaTransformer(inputSchema);
|
||||||
|
const itemResult = transformer.transform(this.context, [...this.stageResultCache[idx - 1].items]);
|
||||||
|
this.stageResultCache.push({
|
||||||
|
schema: toAnnotatedSchema(inputSchema, outputSchema),
|
||||||
|
...itemResult,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return this.stageResultCache[stageIndex];
|
return this.stageResultCache[stageIndex];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reduces the annotated schema of a stage to the plain list of column names
 * that are still present after that stage.
 *
 * Columns annotated as {@link ColumnAnnotation.REMOVED} are dropped; columns
 * without an annotation and columns annotated as ADDED are kept, in their
 * original order.
 *
 * @param stageResult the stage whose `schema` should be simplified
 * @returns the names of all columns that are not marked as removed
 */
function toSimpleSchema(stageResult: StageResult): string[] {
  return (
    stageResult.schema
      // An absent annotation is `undefined`, which already differs from REMOVED,
      // so no separate "has no annotation" check is needed.
      .filter((column) => column.annotation !== ColumnAnnotation.REMOVED)
      .map((column) => column.name)
  );
}
|
||||||
|
|
||||||
|
/**
 * Merges the schema going into a transformer with the schema coming out of it
 * into one annotated column list for the debug view.
 *
 * - A column of `inputSchema` that is missing from `outputSchema` is kept at its
 *   input position and annotated as REMOVED.
 * - A column of `outputSchema` that is missing from `inputSchema` is annotated
 *   as ADDED.
 * - A column present in both carries no annotation.
 *
 * Every output column is emitted exactly once and in output order, so dropping
 * the REMOVED entries from the result yields `outputSchema` again. This also
 * holds when a transformer reorders columns: a column that moved forward is
 * emitted (un-annotated) at its new position and not repeated later.
 *
 * NOTE(review): assumes column names are unique within each schema - duplicate
 * names are located via their first occurrence only.
 *
 * @param inputSchema column names before the transformer ran
 * @param outputSchema column names after the transformer ran
 * @returns the combined, annotated column list
 */
function toAnnotatedSchema(inputSchema: string[], outputSchema: string[]): AnnotatedColumn[] {
  const annotatedSchema: AnnotatedColumn[] = [];
  const inputColumns = new Set(inputSchema);
  // Index of the next output column that has not been emitted yet.
  let outIdx = 0;

  // Emits the not-yet-emitted output columns up to (excluding) `end`.
  const emitOutputUntil = (end: number): void => {
    for (; outIdx < end; outIdx++) {
      const name = outputSchema[outIdx];
      // Only columns unknown to the input are "added"; a known column showing up
      // here has merely been moved in front of other input columns.
      annotatedSchema.push(inputColumns.has(name) ? { name } : { name, annotation: ColumnAnnotation.ADDED });
    }
  };

  for (const column of inputSchema) {
    const indexInOut = outputSchema.indexOf(column);
    if (indexInOut === -1) {
      annotatedSchema.push({ name: column, annotation: ColumnAnnotation.REMOVED });
    } else if (indexInOut >= outIdx) {
      // Flush the output columns in front of this one, then the column itself.
      emitOutputUntil(indexInOut + 1);
    }
    // Otherwise the column was already emitted while flushing for an earlier
    // input column (it was reordered); emitting it again would duplicate it and
    // push `outIdx` past an output column that still needs to be emitted.
  }

  // Whatever is left in the output had no counterpart in the input.
  emitOutputUntil(outputSchema.length);
  return annotatedSchema;
}
|
||||||
|
@ -2,11 +2,9 @@ import PdfParser from './PdfParser';
|
|||||||
import ProgressListenFunction from './ProgressListenFunction';
|
import ProgressListenFunction from './ProgressListenFunction';
|
||||||
import ParseProgressReporter from './ParseProgressReporter';
|
import ParseProgressReporter from './ParseProgressReporter';
|
||||||
import ItemTransformer from './transformer/ItemTransformer';
|
import ItemTransformer from './transformer/ItemTransformer';
|
||||||
import Item from './Item';
|
|
||||||
import ParseResult from './ParseResult';
|
import ParseResult from './ParseResult';
|
||||||
import Debugger from './Debugger';
|
import Debugger from './Debugger';
|
||||||
import { verifyRequiredColumns } from './transformer/transformerUtil';
|
import { assert } from './assert';
|
||||||
import TransformContext from './transformer/TransformContext';
|
|
||||||
|
|
||||||
export default class PdfPipeline {
|
export default class PdfPipeline {
|
||||||
parser: PdfParser;
|
parser: PdfParser;
|
||||||
@ -22,16 +20,17 @@ export default class PdfPipeline {
|
|||||||
progressListener: ProgressListenFunction,
|
progressListener: ProgressListenFunction,
|
||||||
): Promise<ParseResult> {
|
): Promise<ParseResult> {
|
||||||
const parseResult = await this.parser.parse(src, new ParseProgressReporter(progressListener));
|
const parseResult = await this.parser.parse(src, new ParseProgressReporter(progressListener));
|
||||||
verifyRequiredColumns(parseResult.schema, this.transformers);
|
this.verifyRequiredColumns(parseResult.schema, this.transformers);
|
||||||
return parseResult;
|
return parseResult;
|
||||||
}
|
}
|
||||||
//TODO PipelineResult
|
//TODO PipelineResult
|
||||||
async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
|
async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
|
||||||
const parseResult = await this.parse(src, progressListener);
|
const parseResult = await this.parse(src, progressListener);
|
||||||
|
this.verifyRequiredColumns(parseResult.schema, this.transformers);
|
||||||
const context = { pageViewports: parseResult.pageViewports };
|
const context = { pageViewports: parseResult.pageViewports };
|
||||||
let items = parseResult.items;
|
let items = parseResult.items;
|
||||||
this.transformers.forEach((transformer) => {
|
this.transformers.forEach((transformer) => {
|
||||||
items = transformer.transform(context, items);
|
items = transformer.transform(context, items).items;
|
||||||
});
|
});
|
||||||
parseResult.items = items;
|
parseResult.items = items;
|
||||||
return parseResult;
|
return parseResult;
|
||||||
@ -42,4 +41,28 @@ export default class PdfPipeline {
|
|||||||
const context = { pageViewports: parseResult.pageViewports };
|
const context = { pageViewports: parseResult.pageViewports };
|
||||||
return new Debugger(parseResult.schema, parseResult.items, context, this.transformers);
|
return new Debugger(parseResult.schema, parseResult.items, context, this.transformers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Walks through the transformers in order and makes sure that every column a
 * transformer requires (`descriptor.requireColumns`) is contained in the schema
 * produced by its predecessor. The first transformer is checked against
 * `inputSchema`; each following one against the result of the previous
 * transformer's `schemaTransformer`.
 *
 * @param inputSchema the column names the first transformer receives
 * @param transformers the transformers to verify, in execution order
 * @throws whatever `assert` raises when a required column is missing
 */
verifyRequiredColumns(inputSchema: string[], transformers: ItemTransformer[]): void {
  // Only the directly preceding schema is ever consulted, so a running value is
  // enough - there is no need to collect every intermediate schema.
  let currentSchema = inputSchema;
  for (const transformer of transformers) {
    const predecessorSchema = currentSchema;
    transformer.descriptor.requireColumns?.forEach((column) => {
      assert(
        predecessorSchema.includes(column),
        `Input schema [${predecessorSchema.join(', ')}] for transformer '${
          transformer.name
        }' does not contain the required column '${column}'`,
      );
    });
    currentSchema = transformer.schemaTransformer(predecessorSchema);
  }
}
|
||||||
}
|
}
|
||||||
|
6
core/src/debug/AnnotatedColumn.ts
Normal file
6
core/src/debug/AnnotatedColumn.ts
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
import ColumnAnnotation from './ColumnAnnotation';
|
||||||
|
|
||||||
|
export default interface AnnotatedColumn {
|
||||||
|
name: string;
|
||||||
|
annotation?: ColumnAnnotation;
|
||||||
|
}
|
6
core/src/debug/ColumnAnnotation.ts
Normal file
6
core/src/debug/ColumnAnnotation.ts
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
enum ColumnAnnotation {
|
||||||
|
ADDED = 'ADDED',
|
||||||
|
REMOVED = 'REMOVED',
|
||||||
|
}
|
||||||
|
|
||||||
|
export default ColumnAnnotation;
|
8
core/src/debug/StageResult.ts
Normal file
8
core/src/debug/StageResult.ts
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
import Item from '../Item';
|
||||||
|
import AnnotatedColumn from './AnnotatedColumn';
|
||||||
|
|
||||||
|
export default interface StageResult {
|
||||||
|
schema: AnnotatedColumn[];
|
||||||
|
items: Item[];
|
||||||
|
messages: string[];
|
||||||
|
}
|
@ -1,46 +0,0 @@
|
|||||||
import TransformerDescriptor from 'src/TransformerDescription';
|
|
||||||
import { assert } from '../assert';
|
|
||||||
import ItemTransformer from './ItemTransformer';
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Goes through all transformer and makes sure each required column ({@link TransformerDescription#consumes}) is available in its predecessor schema.
|
|
||||||
*
|
|
||||||
* @param initialSchema
|
|
||||||
* @param transformers
|
|
||||||
*/
|
|
||||||
export function verifyRequiredColumns(initialSchema: string[], transformers: ItemTransformer[]) {
|
|
||||||
calculateSchemas(initialSchema, transformers);
|
|
||||||
}
|
|
||||||
|
|
||||||
//TODO debug schema
|
|
||||||
// initial - all unanotated
|
|
||||||
// second - 2 removed, 1 added
|
|
||||||
// third - all as before without the removed
|
|
||||||
|
|
||||||
export function calculateSchemas(initialSchema: string[], transformers: ItemTransformer[]): string[][] {
|
|
||||||
const schemas: string[][] = [];
|
|
||||||
schemas.push(initialSchema);
|
|
||||||
for (let idx = 0; idx < transformers.length; idx++) {
|
|
||||||
const transformer = transformers[idx];
|
|
||||||
const inputSchema = schemas[idx];
|
|
||||||
validateReferences(inputSchema, transformer.name, transformer.descriptor);
|
|
||||||
const outputSchema = transformer.schemaTransformer(inputSchema);
|
|
||||||
schemas.push(outputSchema);
|
|
||||||
}
|
|
||||||
return schemas;
|
|
||||||
}
|
|
||||||
|
|
||||||
function validateReferences(
|
|
||||||
inputSchema: string[],
|
|
||||||
transformerName: string,
|
|
||||||
transformerDescriptor: TransformerDescriptor,
|
|
||||||
) {
|
|
||||||
transformerDescriptor.requireColumns?.forEach((column) => {
|
|
||||||
assert(
|
|
||||||
inputSchema.includes(column),
|
|
||||||
`Input schema [${inputSchema.join(
|
|
||||||
', ',
|
|
||||||
)}] for transformer '${transformerName}' does not contain the required column '${column}' (consumes)`,
|
|
||||||
);
|
|
||||||
});
|
|
||||||
}
|
|
@ -4,6 +4,8 @@ import ItemTransformer from 'src/transformer/ItemTransformer';
|
|||||||
import TransformerDescriptor from 'src/TransformerDescription';
|
import TransformerDescriptor from 'src/TransformerDescription';
|
||||||
import TransformContext from 'src/transformer/TransformContext';
|
import TransformContext from 'src/transformer/TransformContext';
|
||||||
import ItemResult from 'src/ItemResult';
|
import ItemResult from 'src/ItemResult';
|
||||||
|
import ColumnAnnotation from 'src/debug/ColumnAnnotation';
|
||||||
|
import AnnotatedColumn from 'src/debug/AnnotatedColumn';
|
||||||
|
|
||||||
class TestTransformer extends ItemTransformer {
|
class TestTransformer extends ItemTransformer {
|
||||||
items: Item[];
|
items: Item[];
|
||||||
@ -31,7 +33,131 @@ test('basic debug', async () => {
|
|||||||
const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers);
|
const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers);
|
||||||
|
|
||||||
expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
|
expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
|
||||||
expect(debug.stageSchema).toEqual([parsedSchema, ['C']]);
|
expect(debug.stageResults(0).schema).toEqual(parsedSchema.map((column) => ({ name: column })));
|
||||||
|
expect(debug.stageResults(1).schema).toEqual([
|
||||||
|
...parsedSchema.map((column) => ({ name: column, annotation: ColumnAnnotation.REMOVED })),
|
||||||
|
{ name: 'C', annotation: ColumnAnnotation.ADDED },
|
||||||
|
]);
|
||||||
expect(debug.stageResults(0).items).toEqual(parsedItems);
|
expect(debug.stageResults(0).items).toEqual(parsedItems);
|
||||||
expect(debug.stageResults(1).items).toEqual(trans1Items);
|
expect(debug.stageResults(1).items).toEqual(trans1Items);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe('build schemas', () => {
|
||||||
|
const items: Item[] = [];
|
||||||
|
|
||||||
|
function calculateSchema(inputSchema: string[], outputSchema: string[]): AnnotatedColumn[] {
|
||||||
|
const transformers = [new TestTransformer('Trans1', {}, outputSchema, items)];
|
||||||
|
const debug = new Debugger(inputSchema, items, { pageViewports: [] }, transformers);
|
||||||
|
return debug.stageResults(1).schema;
|
||||||
|
}
|
||||||
|
|
||||||
|
test('Add', async () => {
|
||||||
|
const annotatedSchema = calculateSchema(['A', 'B'], ['A', 'B', 'C']);
|
||||||
|
expect(annotatedSchema).toEqual([{ name: 'A' }, { name: 'B' }, { name: 'C', annotation: ColumnAnnotation.ADDED }]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('Remove', async () => {
|
||||||
|
const annotatedSchema = calculateSchema(['A', 'B'], ['A']);
|
||||||
|
expect(annotatedSchema).toEqual([{ name: 'A' }, { name: 'B', annotation: ColumnAnnotation.REMOVED }]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('Replace first', async () => {
|
||||||
|
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['X', 'B', 'C']);
|
||||||
|
expect(annotatedSchema).toEqual([
|
||||||
|
{ name: 'A', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'X', annotation: ColumnAnnotation.ADDED },
|
||||||
|
{ name: 'B' },
|
||||||
|
{ name: 'C' },
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('Replace middle', async () => {
|
||||||
|
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['A', 'X', 'C']);
|
||||||
|
expect(annotatedSchema).toEqual([
|
||||||
|
{ name: 'A' },
|
||||||
|
{ name: 'B', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'X', annotation: ColumnAnnotation.ADDED },
|
||||||
|
{ name: 'C' },
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('Replace last', async () => {
|
||||||
|
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['A', 'B', 'X']);
|
||||||
|
expect(annotatedSchema).toEqual([
|
||||||
|
{ name: 'A' },
|
||||||
|
{ name: 'B' },
|
||||||
|
{ name: 'C', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'X', annotation: ColumnAnnotation.ADDED },
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('Replace first with 2', async () => {
|
||||||
|
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['X', 'Y', 'B', 'C']);
|
||||||
|
expect(annotatedSchema).toEqual([
|
||||||
|
{ name: 'A', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'X', annotation: ColumnAnnotation.ADDED },
|
||||||
|
{ name: 'Y', annotation: ColumnAnnotation.ADDED },
|
||||||
|
{ name: 'B' },
|
||||||
|
{ name: 'C' },
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('Replace middle with 2', async () => {
|
||||||
|
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['A', 'X', 'Y', 'C']);
|
||||||
|
expect(annotatedSchema).toEqual([
|
||||||
|
{ name: 'A' },
|
||||||
|
{ name: 'B', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'X', annotation: ColumnAnnotation.ADDED },
|
||||||
|
{ name: 'Y', annotation: ColumnAnnotation.ADDED },
|
||||||
|
{ name: 'C' },
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('Replace last with 2', async () => {
|
||||||
|
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['A', 'B', 'X', 'Y']);
|
||||||
|
expect(annotatedSchema).toEqual([
|
||||||
|
{ name: 'A' },
|
||||||
|
{ name: 'B' },
|
||||||
|
{ name: 'C', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'X', annotation: ColumnAnnotation.ADDED },
|
||||||
|
{ name: 'Y', annotation: ColumnAnnotation.ADDED },
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('Replace 2 with one', async () => {
|
||||||
|
const annotatedSchema = calculateSchema(['A', 'B', 'C', 'D'], ['A', 'X', 'D']);
|
||||||
|
expect(annotatedSchema).toEqual([
|
||||||
|
{ name: 'A' },
|
||||||
|
{ name: 'B', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'C', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'X', annotation: ColumnAnnotation.ADDED },
|
||||||
|
{ name: 'D' },
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('Replace all', async () => {
|
||||||
|
const annotatedSchema = calculateSchema(['A', 'B', 'C'], ['X']);
|
||||||
|
expect(annotatedSchema).toEqual([
|
||||||
|
{ name: 'A', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'B', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'C', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'X', annotation: ColumnAnnotation.ADDED },
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('Wild mix all', async () => {
|
||||||
|
const annotatedSchema = calculateSchema(['A', 'B', 'C', 'E', 'F', 'G'], ['B', 'X', 'E', 'Y', 'Z', 'G', 'XX']);
|
||||||
|
expect(annotatedSchema).toEqual([
|
||||||
|
{ name: 'A', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'B' },
|
||||||
|
{ name: 'C', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'X', annotation: ColumnAnnotation.ADDED },
|
||||||
|
{ name: 'E' },
|
||||||
|
{ name: 'F', annotation: ColumnAnnotation.REMOVED },
|
||||||
|
{ name: 'Y', annotation: ColumnAnnotation.ADDED },
|
||||||
|
{ name: 'Z', annotation: ColumnAnnotation.ADDED },
|
||||||
|
{ name: 'G' },
|
||||||
|
{ name: 'XX', annotation: ColumnAnnotation.ADDED },
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
@ -3,7 +3,10 @@ import Item from 'src/Item';
|
|||||||
import ItemResult from 'src/ItemResult';
|
import ItemResult from 'src/ItemResult';
|
||||||
import ItemTransformer from 'src/transformer/ItemTransformer';
|
import ItemTransformer from 'src/transformer/ItemTransformer';
|
||||||
import TransformContext from 'src/transformer/TransformContext';
|
import TransformContext from 'src/transformer/TransformContext';
|
||||||
import { calculateSchemas, verifyRequiredColumns } from 'src/transformer/transformerUtil';
|
import PdfParser from 'src/PdfParser';
|
||||||
|
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
import PdfPipeline from 'src/PdfPipeline';
|
||||||
|
|
||||||
class TestSchemaTransformer extends ItemTransformer {
|
class TestSchemaTransformer extends ItemTransformer {
|
||||||
constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[] | undefined = undefined) {
|
constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[] | undefined = undefined) {
|
||||||
@ -26,36 +29,15 @@ test('verify valid transform', async () => {
|
|||||||
new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']),
|
new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']),
|
||||||
new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }, ['A', 'D', 'E']),
|
new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }, ['A', 'D', 'E']),
|
||||||
];
|
];
|
||||||
verifyRequiredColumns(inputSchema, transformers);
|
const pipeline = new PdfPipeline(new PdfParser(pdfjs), transformers);
|
||||||
|
pipeline.verifyRequiredColumns(inputSchema, transformers);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('verify invalid consume', async () => {
|
test('verify invalid consume', async () => {
|
||||||
const inputSchema = ['A', 'B', 'C'];
|
const inputSchema = ['A', 'B', 'C'];
|
||||||
|
|
||||||
const transformers = [new TestSchemaTransformer('Consumes X', { requireColumns: ['X'] })];
|
const transformers = [new TestSchemaTransformer('Consumes X', { requireColumns: ['X'] })];
|
||||||
expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError(
|
const pipeline = new PdfPipeline(new PdfParser(pdfjs), transformers);
|
||||||
"Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X' (consumes)",
|
expect(() => pipeline.verifyRequiredColumns(inputSchema, transformers)).toThrowError(
|
||||||
|
"Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X'",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('calculate schemas', async () => {
|
|
||||||
const inputSchema = ['A', 'B', 'C'];
|
|
||||||
|
|
||||||
const transformers = [
|
|
||||||
new TestSchemaTransformer(
|
|
||||||
'Replace B & C with D',
|
|
||||||
{
|
|
||||||
requireColumns: ['B', 'C'],
|
|
||||||
},
|
|
||||||
['A', 'D'],
|
|
||||||
),
|
|
||||||
new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']),
|
|
||||||
new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }),
|
|
||||||
];
|
|
||||||
expect(calculateSchemas(inputSchema, transformers)).toEqual([
|
|
||||||
['A', 'B', 'C'],
|
|
||||||
['A', 'D'],
|
|
||||||
['A', 'D', 'E'],
|
|
||||||
['A', 'D', 'E'],
|
|
||||||
]);
|
|
||||||
});
|
|
@ -13,7 +13,6 @@
|
|||||||
let currentStage = 0;
|
let currentStage = 0;
|
||||||
$: canNext = currentStage + 1 < stageNames.length;
|
$: canNext = currentStage + 1 < stageNames.length;
|
||||||
$: canPrev = currentStage > 0;
|
$: canPrev = currentStage > 0;
|
||||||
$: stageSchema = debug.stageSchema[currentStage];
|
|
||||||
$: stageResult = debug.stageResults(currentStage);
|
$: stageResult = debug.stageResults(currentStage);
|
||||||
$: pageFocus = !isNaN(focusedPage);
|
$: pageFocus = !isNaN(focusedPage);
|
||||||
$: pagesNumbers = new Set(stageResult.items.map((item) => item.page));
|
$: pagesNumbers = new Set(stageResult.items.map((item) => item.page));
|
||||||
@ -41,7 +40,7 @@
|
|||||||
</script>
|
</script>
|
||||||
|
|
||||||
<div class="mx-4">
|
<div class="mx-4">
|
||||||
<!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
|
<!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
|
||||||
<div>Title: {parseResult.metadata.title()}</div>
|
<div>Title: {parseResult.metadata.title()}</div>
|
||||||
<div>Author: {parseResult.metadata.author()}</div> -->
|
<div>Author: {parseResult.metadata.author()}</div> -->
|
||||||
|
|
||||||
@ -99,7 +98,7 @@
|
|||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
<!-- Items -->
|
<!-- Items -->
|
||||||
<ItemTable schema={stageSchema} itemsByPage={visiblePages} {maxPage} {pageFocus} />
|
<ItemTable schema={stageResult.schema} itemsByPage={visiblePages} {maxPage} {pageFocus} />
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<style>
|
<style>
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
<script>
|
<script>
|
||||||
|
import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
|
||||||
import type Item from '@core/Item';
|
import type Item from '@core/Item';
|
||||||
|
import ColumnAnnotation from '../../../core/src/debug/ColumnAnnotation';
|
||||||
|
|
||||||
export let schema: string[];
|
export let schema: AnnotatedColumn[];
|
||||||
export let itemsByPage: [number, Item[]][];
|
export let itemsByPage: [number, Item[]][];
|
||||||
export let maxPage: number;
|
export let maxPage: number;
|
||||||
export let pageFocus: boolean;
|
export let pageFocus: boolean;
|
||||||
@ -21,6 +23,9 @@
|
|||||||
}
|
}
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//TODO if no ADDED/REMOVE cols
|
||||||
|
// - have highlight declarations in descriptor
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<!-- Item table -->
|
<!-- Item table -->
|
||||||
@ -30,7 +35,10 @@
|
|||||||
<th />
|
<th />
|
||||||
<th>#</th>
|
<th>#</th>
|
||||||
{#each schema as column}
|
{#each schema as column}
|
||||||
<th>{column}</th>
|
<th
|
||||||
|
class={column.annotation === ColumnAnnotation.ADDED ? 'text-green-600' : column.annotation === ColumnAnnotation.REMOVED ? 'text-red-600' : ''}>
|
||||||
|
{column.name}
|
||||||
|
</th>
|
||||||
{/each}
|
{/each}
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
@ -51,7 +59,7 @@
|
|||||||
{/if}
|
{/if}
|
||||||
<td>{itemIdx}</td>
|
<td>{itemIdx}</td>
|
||||||
{#each schema as column}
|
{#each schema as column}
|
||||||
<td class="select-all">{format(item.data[column])}</td>
|
<td class="select-all">{format(item.data[column.name])}</td>
|
||||||
{/each}
|
{/each}
|
||||||
</tr>
|
</tr>
|
||||||
{/each}
|
{/each}
|
||||||
|
Loading…
Reference in New Issue
Block a user