Implement CalculateCoordinates + simplify schema transformation

This commit is contained in:
Johannes Zillmann 2021-02-13 11:09:34 +01:00
parent 3ebba083c2
commit 698562ab27
8 changed files with 100 additions and 143 deletions

View File

@ -9,7 +9,7 @@ import ParseResult from './ParseResult';
export default class PdfParser { export default class PdfParser {
pdfjs: any; pdfjs: any;
defaultParams: object; defaultParams: object;
schema = ['str', 'fontName', 'dir', 'width', 'height', 'transform']; schema = ['str', 'fontName', 'dir', 'transform', 'width', 'height'];
constructor(pdfjs: any, defaultParams = {}) { constructor(pdfjs: any, defaultParams = {}) {
this.pdfjs = pdfjs; this.pdfjs = pdfjs;
@ -20,10 +20,10 @@ export default class PdfParser {
const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) }; const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) };
return this.pdfjs return this.pdfjs
.getDocument(documentInitParameters) .getDocument(documentInitParameters)
.promise.then((pdfDocument) => { .promise.then((pdfDocument: any) => {
reporter.parsedDocumentHeader(pdfDocument.numPages); reporter.parsedDocumentHeader(pdfDocument.numPages);
return Promise.all([ return Promise.all([
pdfDocument.getMetadata().then((metadata) => { pdfDocument.getMetadata().then((metadata: any) => {
reporter.parsedMetadata(); reporter.parsedMetadata();
return metadata; return metadata;
}), }),
@ -31,9 +31,9 @@ export default class PdfParser {
]); ]);
}) })
.then(([metadata, pages]) => { .then(([metadata, pages]) => {
const pdfPages = pages.map((page) => page.page); const pdfPages = pages.map((page: any) => page.page);
const items = pages.reduce((allItems, page) => allItems.concat(page.items), []); const items = pages.reduce((allItems: any[], page: any) => allItems.concat(page.items), []);
const pageViewports = pdfPages.map((page) => { const pageViewports = pdfPages.map((page: any) => {
const viewPort = page.getViewport({ scale: 1.0 }); const viewPort = page.getViewport({ scale: 1.0 });
return { return {
transformFunction: (itemTransform: number[]) => transformFunction: (itemTransform: number[]) =>
@ -57,23 +57,22 @@ export default class PdfParser {
throw new Error('Invalid PDFjs parameter for getDocument. Need either Uint8Array, string or a parameter object'); throw new Error('Invalid PDFjs parameter for getDocument. Need either Uint8Array, string or a parameter object');
} }
private isArrayBuffer(object) { private isArrayBuffer(object: any) {
return typeof object === 'object' && object !== null && object.byteLength !== undefined; return typeof object === 'object' && object !== null && object.byteLength !== undefined;
} }
private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> { private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> {
return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => { return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
return accumulatorPromise.then((accumulatedResults) => { return accumulatorPromise.then((accumulatedResults) => {
return pdfDocument.getPage(index + 1).then((page) => { return pdfDocument.getPage(index + 1).then((page: any) => {
const viewport = page.getViewport({ scale: 1.0 });
return this.triggerFontRetrieval(page).then(() => return this.triggerFontRetrieval(page).then(() =>
page page
.getTextContent({ .getTextContent({
normalizeWhitespace: false, normalizeWhitespace: false,
disableCombineTextItems: true, disableCombineTextItems: true,
}) })
.then((textContent) => { .then((textContent: any) => {
const items = textContent.items.map((rawItem) => new Item(index, rawItem)); const items = textContent.items.map((rawItem: any) => new Item(index, rawItem));
reporter.parsedPage(index); reporter.parsedPage(index);
return [...accumulatedResults, { index, page, items }]; return [...accumulatedResults, { index, page, items }];
}), }),
@ -83,63 +82,9 @@ export default class PdfParser {
}, Promise.resolve([])); }, Promise.resolve([]));
} }
private triggerFontRetrieval(page): Promise<void> { private triggerFontRetrieval(page: any): Promise<void> {
return page.getOperatorList(); return page.getOperatorList();
} }
// async parseOld(data: Uint8Array): Promise<ParseResult> {
// return this.pdfjs
// .getDocument({
// data,
// cMapUrl: 'cmaps/',
// cMapPacked: true,
// })
// .promise.then((pdfDocument) => {
// // console.log('result', pdfDocument);
// const result = [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
// return accumulatorPromise.then((accumulatedResults) => {
// // console.log('Parsing page ' + index);
// return pdfDocument.getPage(index + 1).then((page) => {
// const viewport = page.getViewport({ scale: 1.0 });
// console.log(viewport);
// return this.triggerFontRetrieval(page).then(() =>
// page.getTextContent().then((textContent) => {
// // console.log(textContent);
// const textItems: TextItem[] = textContent.items.map((item) => {
// const tx = this.pdfjs.Util.transform(viewport.transform, item.transform);
// const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
// const dividedHeight = item.height / fontHeight;
// return {
// x: Math.round(item.transform[4]),
// y: Math.round(item.transform[5]),
// width: Math.round(item.width),
// height: Math.round(
// Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight,
// ),
// text: item.str,
// textDirection: TextDirection.fromPdfJs(item.dir),
// fontId: item.fontName,
// };
// });
// return [...accumulatedResults, ...textItems];
// }),
// );
// });
// });
// }, Promise.resolve([]));
// return Promise.all([pdfDocument.getMetadata(), result]);
// })
// .then(([metadata, r]) => {
// // console.log('Parsed metadata:', metadata);
// // console.log('Parsed result:', r.length);
// // console.log('Parsed result:', r);
// return {};
// });
// }
} }
interface ParsedPage { interface ParsedPage {

View File

@ -1,7 +1,5 @@
export default interface TransformerDescription { export default interface TransformerDescriptor {
readonly requireColumns?: string[];
readonly consumesGlobels?: string[]; readonly consumesGlobels?: string[];
readonly producesGlobels?: string[]; readonly producesGlobels?: string[];
readonly consumes?: string[];
readonly produces?: string[];
readonly removes?: string[];
} }

View File

@ -6,8 +6,8 @@ import TransformContext from './TransformContext';
export default class AdjustHeight extends ItemTransformer { export default class AdjustHeight extends ItemTransformer {
constructor() { constructor() {
super('Adjust Heights', { super('Adjust Heights', 'Corrects height with help of the page viewport', {
consumes: ['transform', 'height'], requireColumns: ['transform', 'height'],
}); });
} }

View File

@ -5,19 +5,30 @@ import TransformContext from './TransformContext';
export default class CalculateCoordinates extends ItemTransformer { export default class CalculateCoordinates extends ItemTransformer {
constructor() { constructor() {
super('Calculate Coordinates', { super(
consumes: ['transform'], 'Calculate Coordinates',
produces: ['X', 'Y'], 'Extracts X and Y out of the Transform array',
removes: ['transform'], {
}); requireColumns: ['transform'],
},
(incomingSchema) => {
return incomingSchema.reduce((schema, column) => {
if (column === 'transform') {
return [...schema, 'x', 'y'];
}
return [...schema, column];
}, new Array<string>());
},
);
} }
transform(context: TransformContext, items: Item[]): ItemResult { transform(_: TransformContext, items: Item[]): ItemResult {
// const transform: number[] = item.value['Transform']; const transformedItems = items.map((item) => {
items.shift(); const transform: number[] = item.data['transform'];
if(items[0]){ const x = transform[4];
items[0].data['fontName']='xxx'; const y = transform[5];
} return item.withDataAddition({ x, y });
return { items, messages: [] }; });
return { items: transformedItems, messages: [] };
} }
} }

View File

@ -1,26 +1,37 @@
import TransformerDescription from '../TransformerDescription'; import TransformerDescriptor from '../TransformerDescription';
import type Item from '../Item'; import type Item from '../Item';
import TransformContext from './TransformContext'; import TransformContext from './TransformContext';
import ItemResult from 'src/ItemResult'; import ItemResult from 'src/ItemResult';
/**
* Transforms the incoming schema to what the transformer produces.
*/
type SchemaTransformer = (incomingSchema: string[]) => string[];
export default abstract class ItemTransformer { export default abstract class ItemTransformer {
readonly name: string; readonly name: string;
readonly description: TransformerDescription; readonly description: string;
readonly descriptor: TransformerDescriptor;
readonly schemaTransformer: SchemaTransformer;
constructor(name: string, description: TransformerDescription) { constructor(
name: string,
description: string,
descriptor: TransformerDescriptor,
schemaTransformer: SchemaTransformer = (schema) => schema,
) {
this.name = name; this.name = name;
this.description = { this.description = description;
this.descriptor = {
...{ ...{
consumesGlobels: [], consumesGlobels: [],
producesGlobels: [], producesGlobels: [],
consumes: [], requireColumns: [],
produces: [],
removes: [],
}, },
...description, ...descriptor,
}; };
this.schemaTransformer = schemaTransformer;
} }
// columnar-changes: described
abstract transform(context: TransformContext, items: Item[]): ItemResult; abstract transform(context: TransformContext, items: Item[]): ItemResult;
} }

View File

@ -1,4 +1,4 @@
import TransformerDescription from 'src/TransformerDescription'; import TransformerDescriptor from 'src/TransformerDescription';
import { assert } from '../assert'; import { assert } from '../assert';
import ItemTransformer from './ItemTransformer'; import ItemTransformer from './ItemTransformer';
@ -23,9 +23,8 @@ export function calculateSchemas(initialSchema: string[], transformers: ItemTran
for (let idx = 0; idx < transformers.length; idx++) { for (let idx = 0; idx < transformers.length; idx++) {
const transformer = transformers[idx]; const transformer = transformers[idx];
const inputSchema = schemas[idx]; const inputSchema = schemas[idx];
validateReferences(inputSchema, transformer.name, transformer.description); validateReferences(inputSchema, transformer.name, transformer.descriptor);
const outputSchema = inputSchema.filter((column) => !transformer.description.removes?.includes(column)); const outputSchema = transformer.schemaTransformer(inputSchema);
transformer.description.produces?.forEach((column) => outputSchema.push(column));
schemas.push(outputSchema); schemas.push(outputSchema);
} }
return schemas; return schemas;
@ -34,9 +33,9 @@ export function calculateSchemas(initialSchema: string[], transformers: ItemTran
function validateReferences( function validateReferences(
inputSchema: string[], inputSchema: string[],
transformerName: string, transformerName: string,
transformerDescription: TransformerDescription, transformerDescriptor: TransformerDescriptor,
) { ) {
transformerDescription.consumes?.forEach((column) => { transformerDescriptor.requireColumns?.forEach((column) => {
assert( assert(
inputSchema.includes(column), inputSchema.includes(column),
`Input schema [${inputSchema.join( `Input schema [${inputSchema.join(
@ -44,12 +43,4 @@ function validateReferences(
)}] for transformer '${transformerName}' does not contain the required column '${column}' (consumes)`, )}] for transformer '${transformerName}' does not contain the required column '${column}' (consumes)`,
); );
}); });
transformerDescription.removes?.forEach((column) => {
assert(
inputSchema.includes(column),
`Input schema [${inputSchema.join(
', ',
)}] for transformer '${transformerName}' does not contain the required column '${column}' (removes)`,
);
});
} }

View File

@ -1,19 +1,21 @@
import Debugger from 'src/Debugger'; import Debugger from 'src/Debugger';
import Item from 'src/Item'; import Item from 'src/Item';
import ItemTransformer from 'src/transformer/ItemTransformer'; import ItemTransformer from 'src/transformer/ItemTransformer';
import Metadata from 'src/Metadata'; import TransformerDescriptor from 'src/TransformerDescription';
import ParseResult from 'src/ParseResult';
import TransformerDescription from 'src/TransformerDescription';
import TransformContext from 'src/transformer/TransformContext'; import TransformContext from 'src/transformer/TransformContext';
import ItemResult from 'src/ItemResult';
class TestTransformer extends ItemTransformer { class TestTransformer extends ItemTransformer {
items: Item[]; items: Item[];
constructor(name: string, description: TransformerDescription, items: Item[]) { constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[], items: Item[]) {
super(name, description); super(name, `Description for ${name}`, descriptor, (incomingSchema) => outputSchema);
this.items = items; this.items = items;
} }
transform(_: TransformContext, items: Item[]): Item[] { transform(_: TransformContext, items: Item[]): ItemResult {
return this.items; return {
items: this.items,
messages: [],
};
} }
} }
@ -21,18 +23,15 @@ test('basic debug', async () => {
const parsedSchema = ['A', 'B']; const parsedSchema = ['A', 'B'];
const parsedItems = [new Item(0, { A: 'a_row1', B: 'b_row1' }), new Item(0, { A: 'a_row2', B: 'b_row2' })]; const parsedItems = [new Item(0, { A: 'a_row1', B: 'b_row1' }), new Item(0, { A: 'a_row2', B: 'b_row2' })];
const trans1Desc = { consumes: ['A', 'B'], produces: ['C'], removes: ['A', 'B'] }; const trans1Desc = { requireColumns: ['A', 'B'] };
const trans1Schema = ['C'];
const trans1Items = parsedItems.map((item) => item.withData({ C: `c=${item.value('A')}+${item.value('B')}` })); const trans1Items = parsedItems.map((item) => item.withData({ C: `c=${item.value('A')}+${item.value('B')}` }));
const transformers = [new TestTransformer('Trans1', trans1Desc, trans1Items)]; const transformers = [new TestTransformer('Trans1', trans1Desc, trans1Schema, trans1Items)];
const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers); const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers);
expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']); expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
expect(debug.stageSchema).toEqual([parsedSchema, ['C']]); expect(debug.stageSchema).toEqual([parsedSchema, ['C']]);
for (let index = 0; index < debug.stageNames.length; index++) { expect(debug.stageResults(0).items).toEqual(parsedItems);
console.log(index, debug.stageResults(index)); expect(debug.stageResults(1).items).toEqual(trans1Items);
}
expect(debug.stageResults(0)).toEqual(parsedItems);
expect(debug.stageResults(1)).toEqual(trans1Items);
}); });

View File

@ -1,15 +1,20 @@
import TransformerDescription from 'src/TransformerDescription'; import TransformerDescriptor from 'src/TransformerDescription';
import Item from 'src/Item'; import Item from 'src/Item';
import ItemResult from 'src/ItemResult';
import ItemTransformer from 'src/transformer/ItemTransformer'; import ItemTransformer from 'src/transformer/ItemTransformer';
import TransformContext from 'src/transformer/TransformContext'; import TransformContext from 'src/transformer/TransformContext';
import { calculateSchemas, verifyRequiredColumns } from 'src/transformer/transformerUtil'; import { calculateSchemas, verifyRequiredColumns } from 'src/transformer/transformerUtil';
class TestSchemaTransformer extends ItemTransformer { class TestSchemaTransformer extends ItemTransformer {
constructor(name: string, description: TransformerDescription) { constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[] | undefined = undefined) {
super(name, description); if (outputSchema) {
super(name, `Description for ${name}`, descriptor, (_) => outputSchema);
} else {
super(name, `Description for ${name}`, descriptor);
} }
transform(_: TransformContext, items: Item[]): Item[] { }
return items; transform(_: TransformContext, items: Item[]): ItemResult {
return { items, messages: [] };
} }
} }
@ -17,9 +22,9 @@ test('verify valid transform', async () => {
const inputSchema = ['A', 'B', 'C']; const inputSchema = ['A', 'B', 'C'];
const transformers = [ const transformers = [
new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }), new TestSchemaTransformer('Replace B & C with D', { requireColumns: ['B', 'C'] }, ['A', 'D']),
new TestSchemaTransformer('Create E', { produces: ['E'] }), new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']),
new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }), new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }, ['A', 'D', 'E']),
]; ];
verifyRequiredColumns(inputSchema, transformers); verifyRequiredColumns(inputSchema, transformers);
}); });
@ -27,28 +32,25 @@ test('verify valid transform', async () => {
test('verify invalid consume', async () => { test('verify invalid consume', async () => {
const inputSchema = ['A', 'B', 'C']; const inputSchema = ['A', 'B', 'C'];
const transformers = [new TestSchemaTransformer('Consumes X', { consumes: ['X'] })]; const transformers = [new TestSchemaTransformer('Consumes X', { requireColumns: ['X'] })];
expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError( expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError(
"Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X' (consumes)", "Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X' (consumes)",
); );
}); });
test('verify invalid remove', async () => {
const inputSchema = ['A', 'B', 'C'];
const transformers = [new TestSchemaTransformer('Removes X', { removes: ['X'] })];
expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError(
"Input schema [A, B, C] for transformer 'Removes X' does not contain the required column 'X' (removes)",
);
});
test('calculate schemas', async () => { test('calculate schemas', async () => {
const inputSchema = ['A', 'B', 'C']; const inputSchema = ['A', 'B', 'C'];
const transformers = [ const transformers = [
new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }), new TestSchemaTransformer(
new TestSchemaTransformer('Create E', { produces: ['E'] }), 'Replace B & C with D',
new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }), {
requireColumns: ['B', 'C'],
},
['A', 'D'],
),
new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']),
new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }),
]; ];
expect(calculateSchemas(inputSchema, transformers)).toEqual([ expect(calculateSchemas(inputSchema, transformers)).toEqual([
['A', 'B', 'C'], ['A', 'B', 'C'],