Implement CalculateCoordinates + simplify schema transformation

This commit is contained in:
Johannes Zillmann 2021-02-13 11:09:34 +01:00
parent 3ebba083c2
commit 698562ab27
8 changed files with 100 additions and 143 deletions

View File

@ -9,7 +9,7 @@ import ParseResult from './ParseResult';
export default class PdfParser {
pdfjs: any;
defaultParams: object;
schema = ['str', 'fontName', 'dir', 'width', 'height', 'transform'];
schema = ['str', 'fontName', 'dir', 'transform', 'width', 'height'];
constructor(pdfjs: any, defaultParams = {}) {
this.pdfjs = pdfjs;
@ -20,10 +20,10 @@ export default class PdfParser {
const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) };
return this.pdfjs
.getDocument(documentInitParameters)
.promise.then((pdfDocument) => {
.promise.then((pdfDocument: any) => {
reporter.parsedDocumentHeader(pdfDocument.numPages);
return Promise.all([
pdfDocument.getMetadata().then((metadata) => {
pdfDocument.getMetadata().then((metadata: any) => {
reporter.parsedMetadata();
return metadata;
}),
@ -31,9 +31,9 @@ export default class PdfParser {
]);
})
.then(([metadata, pages]) => {
const pdfPages = pages.map((page) => page.page);
const items = pages.reduce((allItems, page) => allItems.concat(page.items), []);
const pageViewports = pdfPages.map((page) => {
const pdfPages = pages.map((page: any) => page.page);
const items = pages.reduce((allItems: any[], page: any) => allItems.concat(page.items), []);
const pageViewports = pdfPages.map((page: any) => {
const viewPort = page.getViewport({ scale: 1.0 });
return {
transformFunction: (itemTransform: number[]) =>
@ -57,23 +57,22 @@ export default class PdfParser {
throw new Error('Invalid PDFjs parameter for getDocument. Need either Uint8Array, string or a parameter object');
}
/**
 * Duck-typed buffer check: returns true for any non-null object that exposes a
 * `byteLength` property. It does not verify that the value is an actual ArrayBuffer.
 */
private isArrayBuffer(object) {
private isArrayBuffer(object: any) {
return typeof object === 'object' && object !== null && object.byteLength !== undefined;
}
/**
 * Reads all pages of the document one after another and collects, per page,
 * an entry of the form `{ index, page, items }`.
 *
 * @param pdfDocument document object providing `numPages` and `getPage(n)`
 * @param reporter notified through `parsedPage(index)` after each page's items are built
 * @returns promise of the accumulated per-page entries
 *
 * NOTE(review): the accumulator starts as `[]` and grows by one entry per page, so the
 * resolved value looks like an array although the declared type is `Promise<ParsedPage>` — confirm.
 */
private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> {
// One reduce step per page; each step waits for the previous step's promise before it starts.
return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
return accumulatorPromise.then((accumulatedResults) => {
// getPage is called with index + 1, while the zero-based index is what Item and the reporter receive.
return pdfDocument.getPage(index + 1).then((page) => {
const viewport = page.getViewport({ scale: 1.0 });
return pdfDocument.getPage(index + 1).then((page: any) => {
// Text content is requested only after the triggerFontRetrieval promise has resolved.
return this.triggerFontRetrieval(page).then(() =>
page
.getTextContent({
normalizeWhitespace: false,
disableCombineTextItems: true,
})
.then((textContent) => {
const items = textContent.items.map((rawItem) => new Item(index, rawItem));
.then((textContent: any) => {
// Every raw text item is wrapped in an Item tagged with the page index.
const items = textContent.items.map((rawItem: any) => new Item(index, rawItem));
reporter.parsedPage(index);
return [...accumulatedResults, { index, page, items }];
}),
@ -83,63 +82,9 @@ export default class PdfParser {
}, Promise.resolve([]));
}
/**
 * Requests the page's operator list and returns that promise.
 * NOTE(review): judging by the name, this is done for its side effect of making font data
 * available before text extraction — confirm against the pdf.js documentation.
 */
private triggerFontRetrieval(page): Promise<void> {
private triggerFontRetrieval(page: any): Promise<void> {
return page.getOperatorList();
}
// async parseOld(data: Uint8Array): Promise<ParseResult> {
// return this.pdfjs
// .getDocument({
// data,
// cMapUrl: 'cmaps/',
// cMapPacked: true,
// })
// .promise.then((pdfDocument) => {
// // console.log('result', pdfDocument);
// const result = [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
// return accumulatorPromise.then((accumulatedResults) => {
// // console.log('Parsing page ' + index);
// return pdfDocument.getPage(index + 1).then((page) => {
// const viewport = page.getViewport({ scale: 1.0 });
// console.log(viewport);
// return this.triggerFontRetrieval(page).then(() =>
// page.getTextContent().then((textContent) => {
// // console.log(textContent);
// const textItems: TextItem[] = textContent.items.map((item) => {
// const tx = this.pdfjs.Util.transform(viewport.transform, item.transform);
// const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
// const dividedHeight = item.height / fontHeight;
// return {
// x: Math.round(item.transform[4]),
// y: Math.round(item.transform[5]),
// width: Math.round(item.width),
// height: Math.round(
// Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight,
// ),
// text: item.str,
// textDirection: TextDirection.fromPdfJs(item.dir),
// fontId: item.fontName,
// };
// });
// return [...accumulatedResults, ...textItems];
// }),
// );
// });
// });
// }, Promise.resolve([]));
// return Promise.all([pdfDocument.getMetadata(), result]);
// })
// .then(([metadata, r]) => {
// // console.log('Parsed metadata:', metadata);
// // console.log('Parsed result:', r.length);
// // console.log('Parsed result:', r);
// return {};
// });
// }
}
interface ParsedPage {

View File

@ -1,7 +1,5 @@
/**
 * Optional, read-only declarations a transformer makes about itself.
 * `requireColumns` names the columns that must be present in the schema handed to the
 * transformer; schema validation asserts each of them against the input schema.
 */
export default interface TransformerDescription {
export default interface TransformerDescriptor {
readonly requireColumns?: string[];
readonly consumesGlobels?: string[];
readonly producesGlobels?: string[];
readonly consumes?: string[];
readonly produces?: string[];
readonly removes?: string[];
}

View File

@ -6,8 +6,8 @@ import TransformContext from './TransformContext';
export default class AdjustHeight extends ItemTransformer {
constructor() {
super('Adjust Heights', {
consumes: ['transform', 'height'],
super('Adjust Heights', 'Corrects height with help of the page viewport', {
requireColumns: ['transform', 'height'],
});
}

View File

@ -5,19 +5,30 @@ import TransformContext from './TransformContext';
/**
 * Item transformer that reads each item's `transform` array and adds `x` and `y`
 * values taken from it. Requires the `transform` column in the incoming schema.
 */
export default class CalculateCoordinates extends ItemTransformer {
constructor() {
super('Calculate Coordinates', {
consumes: ['transform'],
produces: ['X', 'Y'],
removes: ['transform'],
});
super(
'Calculate Coordinates',
'Extracts X and Y out of the Transform array',
{
requireColumns: ['transform'],
},
// Schema transformer: preserves column order and emits 'x', 'y' in the position of 'transform'.
(incomingSchema) => {
return incomingSchema.reduce((schema, column) => {
if (column === 'transform') {
return [...schema, 'x', 'y'];
}
return [...schema, column];
}, new Array<string>());
},
);
}
transform(context: TransformContext, items: Item[]): ItemResult {
// const transform: number[] = item.value['Transform'];
items.shift();
if(items[0]){
items[0].data['fontName']='xxx';
}
return { items, messages: [] };
/**
 * Maps every incoming item to the result of `item.withDataAddition({ x, y })` and returns
 * the mapped items with an empty message list. The context argument is unused.
 * NOTE(review): presumably `withDataAddition` returns a copy carrying the extra fields;
 * whether the `transform` entry is still present in the item data is not visible here — confirm in Item.
 */
transform(_: TransformContext, items: Item[]): ItemResult {
const transformedItems = items.map((item) => {
const transform: number[] = item.data['transform'];
// Elements 4 and 5 of the transform array are taken as x and y.
const x = transform[4];
const y = transform[5];
return item.withDataAddition({ x, y });
});
return { items: transformedItems, messages: [] };
}
}

View File

@ -1,26 +1,37 @@
import TransformerDescription from '../TransformerDescription';
import TransformerDescriptor from '../TransformerDescription';
import type Item from '../Item';
import TransformContext from './TransformContext';
import ItemResult from 'src/ItemResult';
/**
* Transforms the incoming schema to what the transformer produces.
*/
type SchemaTransformer = (incomingSchema: string[]) => string[];
/**
 * Base class for item transformers. Holds a display name, a textual description,
 * a descriptor of required columns and a function mapping the incoming schema to
 * the outgoing one. Subclasses implement `transform`.
 */
export default abstract class ItemTransformer {
readonly name: string;
readonly description: TransformerDescription;
readonly description: string;
readonly descriptor: TransformerDescriptor;
readonly schemaTransformer: SchemaTransformer;
constructor(name: string, description: TransformerDescription) {
/**
 * @param name name of the transformer
 * @param description free-text description of the transformer
 * @param descriptor declared requirements; missing fields fall back to the defaults below
 * @param schemaTransformer maps input schema to output schema; defaults to returning the schema unchanged
 */
constructor(
name: string,
description: string,
descriptor: TransformerDescriptor,
schemaTransformer: SchemaTransformer = (schema) => schema,
) {
this.name = name;
this.description = {
this.description = description;
// Defaults are spread first so that any field supplied by the caller overrides them.
this.descriptor = {
...{
consumesGlobels: [],
producesGlobels: [],
consumes: [],
produces: [],
removes: [],
requireColumns: [],
},
...description,
...descriptor,
};
this.schemaTransformer = schemaTransformer;
}
// columnar-changes: described
/** Produces the item result for the given items; implemented by each concrete transformer. */
abstract transform(context: TransformContext, items: Item[]): ItemResult;
}

View File

@ -1,4 +1,4 @@
import TransformerDescription from 'src/TransformerDescription';
import TransformerDescriptor from 'src/TransformerDescription';
import { assert } from '../assert';
import ItemTransformer from './ItemTransformer';
@ -23,9 +23,8 @@ export function calculateSchemas(initialSchema: string[], transformers: ItemTran
for (let idx = 0; idx < transformers.length; idx++) {
const transformer = transformers[idx];
const inputSchema = schemas[idx];
validateReferences(inputSchema, transformer.name, transformer.description);
const outputSchema = inputSchema.filter((column) => !transformer.description.removes?.includes(column));
transformer.description.produces?.forEach((column) => outputSchema.push(column));
validateReferences(inputSchema, transformer.name, transformer.descriptor);
const outputSchema = transformer.schemaTransformer(inputSchema);
schemas.push(outputSchema);
}
return schemas;
@ -34,9 +33,9 @@ export function calculateSchemas(initialSchema: string[], transformers: ItemTran
/**
 * Asserts that every column the transformer declares as required is contained in the
 * given input schema; the assertion message names the schema, the transformer and the column.
 *
 * NOTE(review): the message suffix still reads "(consumes)" although the field is now
 * `requireColumns`; the 'verify invalid consume' test expects this exact text, so both
 * would need to change together.
 */
function validateReferences(
inputSchema: string[],
transformerName: string,
transformerDescription: TransformerDescription,
transformerDescriptor: TransformerDescriptor,
) {
transformerDescription.consumes?.forEach((column) => {
transformerDescriptor.requireColumns?.forEach((column) => {
assert(
inputSchema.includes(column),
`Input schema [${inputSchema.join(
@ -44,12 +43,4 @@ function validateReferences(
)}] for transformer '${transformerName}' does not contain the required column '${column}' (consumes)`,
);
});
transformerDescription.removes?.forEach((column) => {
assert(
inputSchema.includes(column),
`Input schema [${inputSchema.join(
', ',
)}] for transformer '${transformerName}' does not contain the required column '${column}' (removes)`,
);
});
}

View File

@ -1,19 +1,21 @@
import Debugger from 'src/Debugger';
import Item from 'src/Item';
import ItemTransformer from 'src/transformer/ItemTransformer';
import Metadata from 'src/Metadata';
import ParseResult from 'src/ParseResult';
import TransformerDescription from 'src/TransformerDescription';
import TransformerDescriptor from 'src/TransformerDescription';
import TransformContext from 'src/transformer/TransformContext';
import ItemResult from 'src/ItemResult';
/**
 * Test double: a transformer with a fixed output. Its schema transformer always yields the
 * `outputSchema` passed to the constructor, and `transform` ignores its input and returns
 * the stored items with no messages.
 */
class TestTransformer extends ItemTransformer {
items: Item[];
constructor(name: string, description: TransformerDescription, items: Item[]) {
super(name, description);
constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[], items: Item[]) {
// The incoming schema is ignored; the preset output schema is returned as-is.
super(name, `Description for ${name}`, descriptor, (incomingSchema) => outputSchema);
this.items = items;
}
transform(_: TransformContext, items: Item[]): Item[] {
return this.items;
transform(_: TransformContext, items: Item[]): ItemResult {
return {
items: this.items,
messages: [],
};
}
}
@ -21,18 +23,15 @@ test('basic debug', async () => {
const parsedSchema = ['A', 'B'];
const parsedItems = [new Item(0, { A: 'a_row1', B: 'b_row1' }), new Item(0, { A: 'a_row2', B: 'b_row2' })];
const trans1Desc = { consumes: ['A', 'B'], produces: ['C'], removes: ['A', 'B'] };
const trans1Desc = { requireColumns: ['A', 'B'] };
const trans1Schema = ['C'];
const trans1Items = parsedItems.map((item) => item.withData({ C: `c=${item.value('A')}+${item.value('B')}` }));
const transformers = [new TestTransformer('Trans1', trans1Desc, trans1Items)];
const transformers = [new TestTransformer('Trans1', trans1Desc, trans1Schema, trans1Items)];
const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers);
expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
expect(debug.stageSchema).toEqual([parsedSchema, ['C']]);
for (let index = 0; index < debug.stageNames.length; index++) {
console.log(index, debug.stageResults(index));
}
expect(debug.stageResults(0)).toEqual(parsedItems);
expect(debug.stageResults(1)).toEqual(trans1Items);
expect(debug.stageResults(0).items).toEqual(parsedItems);
expect(debug.stageResults(1).items).toEqual(trans1Items);
});

View File

@ -1,15 +1,20 @@
import TransformerDescription from 'src/TransformerDescription';
import TransformerDescriptor from 'src/TransformerDescription';
import Item from 'src/Item';
import ItemResult from 'src/ItemResult';
import ItemTransformer from 'src/transformer/ItemTransformer';
import TransformContext from 'src/transformer/TransformContext';
import { calculateSchemas, verifyRequiredColumns } from 'src/transformer/transformerUtil';
/**
 * Test double for schema calculation. When `outputSchema` is given, the schema transformer
 * always returns it; otherwise the base-class default schema transformer is used.
 * `transform` passes the items through unchanged with an empty message list.
 */
class TestSchemaTransformer extends ItemTransformer {
constructor(name: string, description: TransformerDescription) {
super(name, description);
constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[] | undefined = undefined) {
if (outputSchema) {
super(name, `Description for ${name}`, descriptor, (_) => outputSchema);
} else {
// No fourth argument: the base constructor's default schema transformer applies.
super(name, `Description for ${name}`, descriptor);
}
}
transform(_: TransformContext, items: Item[]): Item[] {
return items;
transform(_: TransformContext, items: Item[]): ItemResult {
return { items, messages: [] };
}
}
@ -17,9 +22,9 @@ test('verify valid transform', async () => {
const inputSchema = ['A', 'B', 'C'];
const transformers = [
new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }),
new TestSchemaTransformer('Create E', { produces: ['E'] }),
new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }),
new TestSchemaTransformer('Replace B & C with D', { requireColumns: ['B', 'C'] }, ['A', 'D']),
new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']),
new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }, ['A', 'D', 'E']),
];
verifyRequiredColumns(inputSchema, transformers);
});
@ -27,28 +32,25 @@ test('verify valid transform', async () => {
test('verify invalid consume', async () => {
const inputSchema = ['A', 'B', 'C'];
const transformers = [new TestSchemaTransformer('Consumes X', { consumes: ['X'] })];
const transformers = [new TestSchemaTransformer('Consumes X', { requireColumns: ['X'] })];
expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError(
"Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X' (consumes)",
);
});
test('verify invalid remove', async () => {
const inputSchema = ['A', 'B', 'C'];
const transformers = [new TestSchemaTransformer('Removes X', { removes: ['X'] })];
expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError(
"Input schema [A, B, C] for transformer 'Removes X' does not contain the required column 'X' (removes)",
);
});
test('calculate schemas', async () => {
const inputSchema = ['A', 'B', 'C'];
const transformers = [
new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }),
new TestSchemaTransformer('Create E', { produces: ['E'] }),
new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }),
new TestSchemaTransformer(
'Replace B & C with D',
{
requireColumns: ['B', 'C'],
},
['A', 'D'],
),
new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']),
new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }),
];
expect(calculateSchemas(inputSchema, transformers)).toEqual([
['A', 'B', 'C'],