mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-23 05:10:47 +01:00
Implement CalculateCoordinates + simplify schema transformation
This commit is contained in:
parent
3ebba083c2
commit
698562ab27
@ -9,7 +9,7 @@ import ParseResult from './ParseResult';
|
||||
export default class PdfParser {
|
||||
pdfjs: any;
|
||||
defaultParams: object;
|
||||
schema = ['str', 'fontName', 'dir', 'width', 'height', 'transform'];
|
||||
schema = ['str', 'fontName', 'dir', 'transform', 'width', 'height'];
|
||||
|
||||
constructor(pdfjs: any, defaultParams = {}) {
|
||||
this.pdfjs = pdfjs;
|
||||
@ -20,10 +20,10 @@ export default class PdfParser {
|
||||
const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) };
|
||||
return this.pdfjs
|
||||
.getDocument(documentInitParameters)
|
||||
.promise.then((pdfDocument) => {
|
||||
.promise.then((pdfDocument: any) => {
|
||||
reporter.parsedDocumentHeader(pdfDocument.numPages);
|
||||
return Promise.all([
|
||||
pdfDocument.getMetadata().then((metadata) => {
|
||||
pdfDocument.getMetadata().then((metadata: any) => {
|
||||
reporter.parsedMetadata();
|
||||
return metadata;
|
||||
}),
|
||||
@ -31,9 +31,9 @@ export default class PdfParser {
|
||||
]);
|
||||
})
|
||||
.then(([metadata, pages]) => {
|
||||
const pdfPages = pages.map((page) => page.page);
|
||||
const items = pages.reduce((allItems, page) => allItems.concat(page.items), []);
|
||||
const pageViewports = pdfPages.map((page) => {
|
||||
const pdfPages = pages.map((page: any) => page.page);
|
||||
const items = pages.reduce((allItems: any[], page: any) => allItems.concat(page.items), []);
|
||||
const pageViewports = pdfPages.map((page: any) => {
|
||||
const viewPort = page.getViewport({ scale: 1.0 });
|
||||
return {
|
||||
transformFunction: (itemTransform: number[]) =>
|
||||
@ -57,23 +57,22 @@ export default class PdfParser {
|
||||
throw new Error('Invalid PDFjs parameter for getDocument. Need either Uint8Array, string or a parameter object');
|
||||
}
|
||||
|
||||
private isArrayBuffer(object) {
|
||||
private isArrayBuffer(object: any) {
|
||||
return typeof object === 'object' && object !== null && object.byteLength !== undefined;
|
||||
}
|
||||
|
||||
private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> {
|
||||
return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
||||
return accumulatorPromise.then((accumulatedResults) => {
|
||||
return pdfDocument.getPage(index + 1).then((page) => {
|
||||
const viewport = page.getViewport({ scale: 1.0 });
|
||||
return pdfDocument.getPage(index + 1).then((page: any) => {
|
||||
return this.triggerFontRetrieval(page).then(() =>
|
||||
page
|
||||
.getTextContent({
|
||||
normalizeWhitespace: false,
|
||||
disableCombineTextItems: true,
|
||||
})
|
||||
.then((textContent) => {
|
||||
const items = textContent.items.map((rawItem) => new Item(index, rawItem));
|
||||
.then((textContent: any) => {
|
||||
const items = textContent.items.map((rawItem: any) => new Item(index, rawItem));
|
||||
reporter.parsedPage(index);
|
||||
return [...accumulatedResults, { index, page, items }];
|
||||
}),
|
||||
@ -83,63 +82,9 @@ export default class PdfParser {
|
||||
}, Promise.resolve([]));
|
||||
}
|
||||
|
||||
private triggerFontRetrieval(page): Promise<void> {
|
||||
private triggerFontRetrieval(page: any): Promise<void> {
|
||||
return page.getOperatorList();
|
||||
}
|
||||
|
||||
// async parseOld(data: Uint8Array): Promise<ParseResult> {
|
||||
// return this.pdfjs
|
||||
// .getDocument({
|
||||
// data,
|
||||
// cMapUrl: 'cmaps/',
|
||||
// cMapPacked: true,
|
||||
// })
|
||||
// .promise.then((pdfDocument) => {
|
||||
// // console.log('result', pdfDocument);
|
||||
// const result = [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
||||
// return accumulatorPromise.then((accumulatedResults) => {
|
||||
// // console.log('Parsing page ' + index);
|
||||
// return pdfDocument.getPage(index + 1).then((page) => {
|
||||
// const viewport = page.getViewport({ scale: 1.0 });
|
||||
// console.log(viewport);
|
||||
|
||||
// return this.triggerFontRetrieval(page).then(() =>
|
||||
// page.getTextContent().then((textContent) => {
|
||||
// // console.log(textContent);
|
||||
// const textItems: TextItem[] = textContent.items.map((item) => {
|
||||
// const tx = this.pdfjs.Util.transform(viewport.transform, item.transform);
|
||||
// const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
|
||||
// const dividedHeight = item.height / fontHeight;
|
||||
|
||||
// return {
|
||||
// x: Math.round(item.transform[4]),
|
||||
// y: Math.round(item.transform[5]),
|
||||
// width: Math.round(item.width),
|
||||
// height: Math.round(
|
||||
// Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight,
|
||||
// ),
|
||||
// text: item.str,
|
||||
// textDirection: TextDirection.fromPdfJs(item.dir),
|
||||
// fontId: item.fontName,
|
||||
// };
|
||||
// });
|
||||
|
||||
// return [...accumulatedResults, ...textItems];
|
||||
// }),
|
||||
// );
|
||||
// });
|
||||
// });
|
||||
// }, Promise.resolve([]));
|
||||
// return Promise.all([pdfDocument.getMetadata(), result]);
|
||||
// })
|
||||
// .then(([metadata, r]) => {
|
||||
// // console.log('Parsed metadata:', metadata);
|
||||
// // console.log('Parsed result:', r.length);
|
||||
// // console.log('Parsed result:', r);
|
||||
|
||||
// return {};
|
||||
// });
|
||||
// }
|
||||
}
|
||||
|
||||
interface ParsedPage {
|
||||
|
@ -1,7 +1,5 @@
|
||||
export default interface TransformerDescription {
|
||||
export default interface TransformerDescriptor {
|
||||
readonly requireColumns?: string[];
|
||||
readonly consumesGlobels?: string[];
|
||||
readonly producesGlobels?: string[];
|
||||
readonly consumes?: string[];
|
||||
readonly produces?: string[];
|
||||
readonly removes?: string[];
|
||||
}
|
||||
|
@ -6,8 +6,8 @@ import TransformContext from './TransformContext';
|
||||
|
||||
export default class AdjustHeight extends ItemTransformer {
|
||||
constructor() {
|
||||
super('Adjust Heights', {
|
||||
consumes: ['transform', 'height'],
|
||||
super('Adjust Heights', 'Corrects height with help of the page viewport', {
|
||||
requireColumns: ['transform', 'height'],
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -5,19 +5,30 @@ import TransformContext from './TransformContext';
|
||||
|
||||
export default class CalculateCoordinates extends ItemTransformer {
|
||||
constructor() {
|
||||
super('Calculate Coordinates', {
|
||||
consumes: ['transform'],
|
||||
produces: ['X', 'Y'],
|
||||
removes: ['transform'],
|
||||
});
|
||||
super(
|
||||
'Calculate Coordinates',
|
||||
'Extracts X and Y out of the Transform array',
|
||||
{
|
||||
requireColumns: ['transform'],
|
||||
},
|
||||
(incomingSchema) => {
|
||||
return incomingSchema.reduce((schema, column) => {
|
||||
if (column === 'transform') {
|
||||
return [...schema, 'x', 'y'];
|
||||
}
|
||||
return [...schema, column];
|
||||
}, new Array<string>());
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
transform(context: TransformContext, items: Item[]): ItemResult {
|
||||
// const transform: number[] = item.value['Transform'];
|
||||
items.shift();
|
||||
if(items[0]){
|
||||
items[0].data['fontName']='xxx';
|
||||
}
|
||||
return { items, messages: [] };
|
||||
transform(_: TransformContext, items: Item[]): ItemResult {
|
||||
const transformedItems = items.map((item) => {
|
||||
const transform: number[] = item.data['transform'];
|
||||
const x = transform[4];
|
||||
const y = transform[5];
|
||||
return item.withDataAddition({ x, y });
|
||||
});
|
||||
return { items: transformedItems, messages: [] };
|
||||
}
|
||||
}
|
||||
|
@ -1,26 +1,37 @@
|
||||
import TransformerDescription from '../TransformerDescription';
|
||||
import TransformerDescriptor from '../TransformerDescription';
|
||||
import type Item from '../Item';
|
||||
import TransformContext from './TransformContext';
|
||||
import ItemResult from 'src/ItemResult';
|
||||
|
||||
/**
|
||||
* Transforms the incoming schema to what the transformer produces.
|
||||
*/
|
||||
type SchemaTransformer = (incomingSchema: string[]) => string[];
|
||||
|
||||
export default abstract class ItemTransformer {
|
||||
readonly name: string;
|
||||
readonly description: TransformerDescription;
|
||||
readonly description: string;
|
||||
readonly descriptor: TransformerDescriptor;
|
||||
readonly schemaTransformer: SchemaTransformer;
|
||||
|
||||
constructor(name: string, description: TransformerDescription) {
|
||||
constructor(
|
||||
name: string,
|
||||
description: string,
|
||||
descriptor: TransformerDescriptor,
|
||||
schemaTransformer: SchemaTransformer = (schema) => schema,
|
||||
) {
|
||||
this.name = name;
|
||||
this.description = {
|
||||
this.description = description;
|
||||
this.descriptor = {
|
||||
...{
|
||||
consumesGlobels: [],
|
||||
producesGlobels: [],
|
||||
consumes: [],
|
||||
produces: [],
|
||||
removes: [],
|
||||
requireColumns: [],
|
||||
},
|
||||
...description,
|
||||
...descriptor,
|
||||
};
|
||||
this.schemaTransformer = schemaTransformer;
|
||||
}
|
||||
|
||||
// columnar-changes: described
|
||||
abstract transform(context: TransformContext, items: Item[]): ItemResult;
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
import TransformerDescription from 'src/TransformerDescription';
|
||||
import TransformerDescriptor from 'src/TransformerDescription';
|
||||
import { assert } from '../assert';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
|
||||
@ -23,9 +23,8 @@ export function calculateSchemas(initialSchema: string[], transformers: ItemTran
|
||||
for (let idx = 0; idx < transformers.length; idx++) {
|
||||
const transformer = transformers[idx];
|
||||
const inputSchema = schemas[idx];
|
||||
validateReferences(inputSchema, transformer.name, transformer.description);
|
||||
const outputSchema = inputSchema.filter((column) => !transformer.description.removes?.includes(column));
|
||||
transformer.description.produces?.forEach((column) => outputSchema.push(column));
|
||||
validateReferences(inputSchema, transformer.name, transformer.descriptor);
|
||||
const outputSchema = transformer.schemaTransformer(inputSchema);
|
||||
schemas.push(outputSchema);
|
||||
}
|
||||
return schemas;
|
||||
@ -34,9 +33,9 @@ export function calculateSchemas(initialSchema: string[], transformers: ItemTran
|
||||
function validateReferences(
|
||||
inputSchema: string[],
|
||||
transformerName: string,
|
||||
transformerDescription: TransformerDescription,
|
||||
transformerDescriptor: TransformerDescriptor,
|
||||
) {
|
||||
transformerDescription.consumes?.forEach((column) => {
|
||||
transformerDescriptor.requireColumns?.forEach((column) => {
|
||||
assert(
|
||||
inputSchema.includes(column),
|
||||
`Input schema [${inputSchema.join(
|
||||
@ -44,12 +43,4 @@ function validateReferences(
|
||||
)}] for transformer '${transformerName}' does not contain the required column '${column}' (consumes)`,
|
||||
);
|
||||
});
|
||||
transformerDescription.removes?.forEach((column) => {
|
||||
assert(
|
||||
inputSchema.includes(column),
|
||||
`Input schema [${inputSchema.join(
|
||||
', ',
|
||||
)}] for transformer '${transformerName}' does not contain the required column '${column}' (removes)`,
|
||||
);
|
||||
});
|
||||
}
|
||||
|
@ -1,19 +1,21 @@
|
||||
import Debugger from 'src/Debugger';
|
||||
import Item from 'src/Item';
|
||||
import ItemTransformer from 'src/transformer/ItemTransformer';
|
||||
import Metadata from 'src/Metadata';
|
||||
import ParseResult from 'src/ParseResult';
|
||||
import TransformerDescription from 'src/TransformerDescription';
|
||||
import TransformerDescriptor from 'src/TransformerDescription';
|
||||
import TransformContext from 'src/transformer/TransformContext';
|
||||
import ItemResult from 'src/ItemResult';
|
||||
|
||||
class TestTransformer extends ItemTransformer {
|
||||
items: Item[];
|
||||
constructor(name: string, description: TransformerDescription, items: Item[]) {
|
||||
super(name, description);
|
||||
constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[], items: Item[]) {
|
||||
super(name, `Description for ${name}`, descriptor, (incomingSchema) => outputSchema);
|
||||
this.items = items;
|
||||
}
|
||||
transform(_: TransformContext, items: Item[]): Item[] {
|
||||
return this.items;
|
||||
transform(_: TransformContext, items: Item[]): ItemResult {
|
||||
return {
|
||||
items: this.items,
|
||||
messages: [],
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@ -21,18 +23,15 @@ test('basic debug', async () => {
|
||||
const parsedSchema = ['A', 'B'];
|
||||
const parsedItems = [new Item(0, { A: 'a_row1', B: 'b_row1' }), new Item(0, { A: 'a_row2', B: 'b_row2' })];
|
||||
|
||||
const trans1Desc = { consumes: ['A', 'B'], produces: ['C'], removes: ['A', 'B'] };
|
||||
const trans1Desc = { requireColumns: ['A', 'B'] };
|
||||
const trans1Schema = ['C'];
|
||||
const trans1Items = parsedItems.map((item) => item.withData({ C: `c=${item.value('A')}+${item.value('B')}` }));
|
||||
|
||||
const transformers = [new TestTransformer('Trans1', trans1Desc, trans1Items)];
|
||||
const transformers = [new TestTransformer('Trans1', trans1Desc, trans1Schema, trans1Items)];
|
||||
const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers);
|
||||
|
||||
expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
|
||||
expect(debug.stageSchema).toEqual([parsedSchema, ['C']]);
|
||||
for (let index = 0; index < debug.stageNames.length; index++) {
|
||||
console.log(index, debug.stageResults(index));
|
||||
}
|
||||
|
||||
expect(debug.stageResults(0)).toEqual(parsedItems);
|
||||
expect(debug.stageResults(1)).toEqual(trans1Items);
|
||||
expect(debug.stageResults(0).items).toEqual(parsedItems);
|
||||
expect(debug.stageResults(1).items).toEqual(trans1Items);
|
||||
});
|
||||
|
@ -1,15 +1,20 @@
|
||||
import TransformerDescription from 'src/TransformerDescription';
|
||||
import TransformerDescriptor from 'src/TransformerDescription';
|
||||
import Item from 'src/Item';
|
||||
import ItemResult from 'src/ItemResult';
|
||||
import ItemTransformer from 'src/transformer/ItemTransformer';
|
||||
import TransformContext from 'src/transformer/TransformContext';
|
||||
import { calculateSchemas, verifyRequiredColumns } from 'src/transformer/transformerUtil';
|
||||
|
||||
class TestSchemaTransformer extends ItemTransformer {
|
||||
constructor(name: string, description: TransformerDescription) {
|
||||
super(name, description);
|
||||
constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[] | undefined = undefined) {
|
||||
if (outputSchema) {
|
||||
super(name, `Description for ${name}`, descriptor, (_) => outputSchema);
|
||||
} else {
|
||||
super(name, `Description for ${name}`, descriptor);
|
||||
}
|
||||
}
|
||||
transform(_: TransformContext, items: Item[]): Item[] {
|
||||
return items;
|
||||
transform(_: TransformContext, items: Item[]): ItemResult {
|
||||
return { items, messages: [] };
|
||||
}
|
||||
}
|
||||
|
||||
@ -17,9 +22,9 @@ test('verify valid transform', async () => {
|
||||
const inputSchema = ['A', 'B', 'C'];
|
||||
|
||||
const transformers = [
|
||||
new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }),
|
||||
new TestSchemaTransformer('Create E', { produces: ['E'] }),
|
||||
new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }),
|
||||
new TestSchemaTransformer('Replace B & C with D', { requireColumns: ['B', 'C'] }, ['A', 'D']),
|
||||
new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']),
|
||||
new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }, ['A', 'D', 'E']),
|
||||
];
|
||||
verifyRequiredColumns(inputSchema, transformers);
|
||||
});
|
||||
@ -27,28 +32,25 @@ test('verify valid transform', async () => {
|
||||
test('verify invalid consume', async () => {
|
||||
const inputSchema = ['A', 'B', 'C'];
|
||||
|
||||
const transformers = [new TestSchemaTransformer('Consumes X', { consumes: ['X'] })];
|
||||
const transformers = [new TestSchemaTransformer('Consumes X', { requireColumns: ['X'] })];
|
||||
expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError(
|
||||
"Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X' (consumes)",
|
||||
);
|
||||
});
|
||||
|
||||
test('verify invalid remove', async () => {
|
||||
const inputSchema = ['A', 'B', 'C'];
|
||||
|
||||
const transformers = [new TestSchemaTransformer('Removes X', { removes: ['X'] })];
|
||||
expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError(
|
||||
"Input schema [A, B, C] for transformer 'Removes X' does not contain the required column 'X' (removes)",
|
||||
);
|
||||
});
|
||||
|
||||
test('calculate schemas', async () => {
|
||||
const inputSchema = ['A', 'B', 'C'];
|
||||
|
||||
const transformers = [
|
||||
new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }),
|
||||
new TestSchemaTransformer('Create E', { produces: ['E'] }),
|
||||
new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }),
|
||||
new TestSchemaTransformer(
|
||||
'Replace B & C with D',
|
||||
{
|
||||
requireColumns: ['B', 'C'],
|
||||
},
|
||||
['A', 'D'],
|
||||
),
|
||||
new TestSchemaTransformer('Create E', {}, ['A', 'D', 'E']),
|
||||
new TestSchemaTransformer('Uses A, D & E', { requireColumns: ['A', 'D', 'E'] }),
|
||||
];
|
||||
expect(calculateSchemas(inputSchema, transformers)).toEqual([
|
||||
['A', 'B', 'C'],
|
||||
|
Loading…
Reference in New Issue
Block a user