diff --git a/core/src/Config.ts b/core/src/Config.ts new file mode 100644 index 0000000..d98c54e --- /dev/null +++ b/core/src/Config.ts @@ -0,0 +1,8 @@ +import ItemTransformer from './transformer/ItemTransformer'; + +export default interface Config { + // See DocumentInitParameters from https://mozilla.github.io/pdf.js/api/draft/module-pdfjsLib.html#DocumentInitParameters + pdfjsParams?: object; + transformers?: ItemTransformer[]; + // TODO keep pdfPages ? +} diff --git a/core/src/Debugger.ts b/core/src/Debugger.ts new file mode 100644 index 0000000..294e673 --- /dev/null +++ b/core/src/Debugger.ts @@ -0,0 +1,44 @@ +import { assert } from './assert'; +import Item from './Item'; +import ItemTransformer from './transformer/ItemTransformer'; +import ParseResult from './ParseResult'; +import { calculateSchemas } from './transformer/transformerUtil'; +import TransformContext from './transformer/TransformContext'; + +/** Runs the transformers one stage at a time on demand and caches the items each stage produced. */ +export default class Debugger { + // parseResult: ParseResult; + context: TransformContext; + transformers: ItemTransformer[]; + stageNames: string[]; + stageSchema: string[][]; + private stageItems: Item[][]; + + constructor( + initialSchema: string[], + initialItems: Item[], + context: TransformContext, + transformers: ItemTransformer[], + ) { + // this.parseResult = parseResult; + this.transformers = transformers; + this.context = context; + this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)]; + this.stageItems = [initialItems]; + this.stageSchema = calculateSchemas(initialSchema, transformers); + } + + //TODO return MarkedItem ? (removed, added, etc..)? + //TODO StageResult == class with schema and marked items ? 
+ /** Returns the items of stage `stageIndex` (0 = parse result), running and caching missing stages. */ + stageResults(stageIndex: number): Item[] { + assert(stageIndex >= 0 && stageIndex < this.stageNames.length, `Unknown stage index: ${stageIndex}`); + for (let idx = 0; idx < stageIndex + 1; idx++) { + if (!this.stageItems[idx]) { + const stageItems = this.transformers[idx - 1].transform(this.context, this.stageItems[idx - 1]); + this.stageItems.push(stageItems); + } + } + return this.stageItems[stageIndex]; + } +} diff --git a/core/src/Item.ts b/core/src/Item.ts index 6ebc104..4421e14 100644 --- a/core/src/Item.ts +++ b/core/src/Item.ts @@ -1,13 +1,32 @@ +import { v4 as uuidv4 } from 'uuid'; + export default class Item { page: number; data: object; + uuid?: string; - constructor(page: number, data: object) { + constructor(page: number, data: object, uuid: string = uuidv4()) { this.page = page; this.data = data; + this.uuid = uuid; } value(column: string): object { return this.data[column]; } + + withDataAddition(data: object): Item { + return this.withData({ ...this.data, ...data }); + } + + withData(data: object): Item { + return new Item(this.page, data, this.uuid); + } + + /** + * Returns the item without a uuid. 
+ */ + withoutUuid(): Item { + return new Item(this.page, this.data, ''); + } } diff --git a/core/src/ParseResult.ts b/core/src/ParseResult.ts index 014fbff..b92d52a 100644 --- a/core/src/ParseResult.ts +++ b/core/src/ParseResult.ts @@ -1,16 +1,19 @@ -import Item from './Item'; +import type Item from './Item'; import type Metadata from './Metadata'; +import type PageViewport from './parse/PageViewport'; export default class ParseResult { pdfPages: any[]; + pageViewports: PageViewport[]; metadata: Metadata; - columns: string[]; + schema: string[]; items: Item[]; - constructor(pdfPages: any[], metadata: Metadata, columns: string[], items: Item[]) { + constructor(pdfPages: any[], pageViewports: PageViewport[], metadata: Metadata, schema: string[], items: Item[]) { this.pdfPages = pdfPages; + this.pageViewports = pageViewports; this.metadata = metadata; - this.columns = columns; + this.schema = schema; this.items = items; } diff --git a/core/src/ParsedPage.ts b/core/src/ParsedPage.ts deleted file mode 100644 index 244dd38..0000000 --- a/core/src/ParsedPage.ts +++ /dev/null @@ -1,13 +0,0 @@ -import type ParsedPageItem from './ParsedPageItem'; - -export default class ParsedPage { - index: number; - pdfPage: any; - items: ParsedPageItem[]; - - constructor(index: number, pdfPage: any, items: ParsedPageItem[]) { - this.index = index; - this.pdfPage = pdfPage; - this.items = items; - } -} diff --git a/core/src/PdfParser.ts b/core/src/PdfParser.ts index 8652815..09f5b56 100644 --- a/core/src/PdfParser.ts +++ b/core/src/PdfParser.ts @@ -1,6 +1,5 @@ import Item from './Item'; import Metadata from './Metadata'; -import ParsedPage from './ParsedPage'; import type ParseReporter from './ParseReporter'; import ParseResult from './ParseResult'; import TextDirection from './TextDirection'; @@ -11,31 +10,18 @@ import type TextItem from './TextItem'; */ export default class PdfParser { pdfjs: any; - columns = ['str', 'dir', 'width', 'height', 'transform', 'fontName']; + defaultParams: 
object; + schema = ['str', 'fontName', 'dir', 'width', 'height', 'transform']; - constructor(pdfjs: any) { + constructor(pdfjs: any, defaultParams = {}) { this.pdfjs = pdfjs; + this.defaultParams = defaultParams; } - async parseBytes(data: Uint8Array, reporter: ParseReporter): Promise { - return this.parse(this.params({ data }), reporter); - } - - async parseUrl(url: string, reporter: ParseReporter): Promise { - return this.parse(this.params({ url }), reporter); - } - - private params(dataSourceParams: object): object { - const defaultParams = { - cMapUrl: 'cmaps/', - cMapPacked: true, - }; - return { ...defaultParams, ...dataSourceParams }; - } - - async parse(parameter: object, reporter: ParseReporter): Promise { + async parse(src: string | Uint8Array | object, reporter: ParseReporter): Promise { + const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) }; return this.pdfjs - .getDocument(parameter) + .getDocument(documentInitParameters) .promise.then((pdfDocument) => { reporter.parsedDocumentHeader(pdfDocument.numPages); return Promise.all([ @@ -47,16 +33,38 @@ export default class PdfParser { ]); }) .then(([metadata, pages]) => { - const pdfPages = pages.map((page) => page.pdfPage); + const pdfPages = pages.map((page) => page.page); const items = pages.reduce((allItems, page) => allItems.concat(page.items), []); - return new ParseResult(pdfPages, new Metadata(metadata), this.columns, items); + const pageViewports = pdfPages.map((page) => { + const viewPort = page.getViewport({ scale: 1.0 }); + return { transformFunction: (itemTransform: number[]) => this.pdfjs.Util.transform(viewPort, itemTransform) }; + }); + return new ParseResult(pdfPages, pageViewports, new Metadata(metadata), this.schema, items); }); } + private documentInitParameters(src: string | Uint8Array | object): object { + if (typeof src === 'string') { + return { url: src }; + } + if (this.isArrayBuffer(src)) { + return { data: src }; + } + if (typeof src === 
'object') { + return src; + } + throw new Error('Invalid PDFjs parameter for getDocument. Need either Uint8Array, string or a parameter object'); + } + + private isArrayBuffer(object) { + return typeof object === 'object' && object !== null && object.byteLength !== undefined; + } + private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise { return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => { return accumulatorPromise.then((accumulatedResults) => { return pdfDocument.getPage(index + 1).then((page) => { + const viewport = page.getViewport({ scale: 1.0 }); return this.triggerFontRetrieval(page).then(() => page .getTextContent({ @@ -66,7 +74,7 @@ export default class PdfParser { .then((textContent) => { const items = textContent.items.map((rawItem) => new Item(index, rawItem)); reporter.parsedPage(index); - return [...accumulatedResults, new ParsedPage(index, page, items)]; + return [...accumulatedResults, { index, page, items }]; }), ); }); @@ -92,6 +100,8 @@ export default class PdfParser { // console.log('Parsing page ' + index); return pdfDocument.getPage(index + 1).then((page) => { const viewport = page.getViewport({ scale: 1.0 }); + console.log(viewport); + return this.triggerFontRetrieval(page).then(() => page.getTextContent().then((textContent) => { // console.log(textContent); @@ -126,7 +136,13 @@ export default class PdfParser { // console.log('Parsed result:', r.length); // console.log('Parsed result:', r); - return new ParseResult([], new Metadata(metadata), [], []); + return {}; }); } } + +interface ParsedPage { + index: number; + page: any; + items: Item[]; +} diff --git a/core/src/PdfPipeline.ts b/core/src/PdfPipeline.ts new file mode 100644 index 0000000..cfb2fb5 --- /dev/null +++ b/core/src/PdfPipeline.ts @@ -0,0 +1,45 @@ +import PdfParser from './PdfParser'; +import ProgressListenFunction from './ProgressListenFunction'; +import ParseProgressReporter from './ParseProgressReporter'; +import 
ItemTransformer from './transformer/ItemTransformer'; +import Item from './Item'; +import ParseResult from './ParseResult'; +import Debugger from './Debugger'; +import { verifyRequiredColumns } from './transformer/transformerUtil'; +import TransformContext from './transformer/TransformContext'; + +export default class PdfPipeline { + parser: PdfParser; + transformers: ItemTransformer[]; + + constructor(parser: PdfParser, transformers: ItemTransformer[]) { + this.parser = parser; + this.transformers = transformers; + } + + private async parse( + src: string | Uint8Array | object, + progressListener: ProgressListenFunction, + ): Promise { + const parseResult = await this.parser.parse(src, new ParseProgressReporter(progressListener)); + verifyRequiredColumns(parseResult.schema, this.transformers); + return parseResult; + } + //TODO PipelineResult + async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise { + const parseResult = await this.parse(src, progressListener); + const context = { pageViewports: parseResult.pageViewports }; + let items = parseResult.items; + this.transformers.forEach((transformer) => { + items = transformer.transform(context, items); + }); + parseResult.items = items; + return parseResult; + } + + async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise { + const parseResult = await this.parse(src, progressListener); + const context = { pageViewports: parseResult.pageViewports }; + return new Debugger(parseResult.schema, parseResult.items, context, this.transformers); + } +} diff --git a/core/src/TransformerDescription.ts b/core/src/TransformerDescription.ts new file mode 100644 index 0000000..f0b0f78 --- /dev/null +++ b/core/src/TransformerDescription.ts @@ -0,0 +1,7 @@ +export default interface TransformerDescription { + readonly consumesGlobels?: string[]; + readonly producesGlobels?: string[]; + readonly consumes?: string[]; + readonly produces?: string[]; + 
readonly removes?: string[]; +} diff --git a/core/src/assert.ts b/core/src/assert.ts new file mode 100644 index 0000000..2df6126 --- /dev/null +++ b/core/src/assert.ts @@ -0,0 +1,13 @@ +/** Throws an Error carrying `message` (or 'Assertion failed') when `condition` is false. */ +export function assert(condition: boolean, message?: string): asserts condition { + if (!condition) { + throw new Error(message || 'Assertion failed'); + } +} + +/** Returns `value` unchanged after asserting that it is neither null nor undefined. */ +export function assertDefined<T>(value: T, message?: string): T { + assert(value !== null, message); + assert(typeof value !== 'undefined', message); + return value; +} diff --git a/core/src/index.ts b/core/src/index.ts index a05e65e..a60d747 100644 --- a/core/src/index.ts +++ b/core/src/index.ts @@ -1,11 +1,33 @@ +import Config from './Config'; import type ProgressListenFunction from './ProgressListenFunction'; import ParseProgressReporter from './ParseProgressReporter'; import PdfParser from './PdfParser'; +import PdfPipeline from './PdfPipeline'; + +import AdjustHeight from './transformer/AdjustHeight'; +import CalculateCoordinates from './transformer/CalculateCoordinates'; + +const transformers = [new AdjustHeight(), new CalculateCoordinates()]; + +const defaultConfig: Config = { + pdfjsParams: { + // TODO check if that cmap thing makes sense since we don't bundle them + cMapUrl: 'cmaps/', + cMapPacked: true, + }, + transformers, +}; export function pdfParser(pdfJs: any) { - return new PdfParser(pdfJs); + return new PdfParser(pdfJs, defaultConfig.pdfjsParams); } export function parseReporter(progressListener: ProgressListenFunction) { return new ParseProgressReporter(progressListener); } + +export function createPipeline(pdfJs: any, config = defaultConfig): PdfPipeline { + // Fall back per field so a partial config (e.g. `{}`) still gets the default pdf.js parameters. + const parser = new PdfParser(pdfJs, config.pdfjsParams || defaultConfig.pdfjsParams); + return new PdfPipeline(parser, config.transformers || transformers); +} diff --git a/core/src/parse/PageViewport.ts b/core/src/parse/PageViewport.ts new file mode 100644 index 0000000..c178e05 --- /dev/null +++ b/core/src/parse/PageViewport.ts @@ -0,0 +1,5 @@ +type ItemTransformFunction = (itemTransform: number[]) => 
number[]; + +export default interface PageViewport { + transformFunction: ItemTransformFunction; +} diff --git a/core/src/transformer/AdjustHeight.ts b/core/src/transformer/AdjustHeight.ts new file mode 100644 index 0000000..3749dc2 --- /dev/null +++ b/core/src/transformer/AdjustHeight.ts @@ -0,0 +1,42 @@ +import PageViewport from 'src/parse/PageViewport'; +import Item from '../Item'; +import ItemTransformer from './ItemTransformer'; +import TransformContext from './TransformContext'; + +/** + * Divides each item's `height` by a font height derived from the page viewport transform. + * The original height is kept when the quotient is NaN or not greater than 1. + */ +export default class AdjustHeight extends ItemTransformer { + constructor() { + super('Adjust Heights', { + consumes: ['transform', 'height'], + }); + } + + transform(context: TransformContext, items: Item[]): Item[] { + const newItems: Item[] = []; + let page = -1; + let pageViewport: PageViewport; + //TODO groupBy page + items.forEach((item) => { + if (item.page !== page) { + pageViewport = context.pageViewports[item.page]; + page = item.page; + } + const itemTransform = item.data['transform']; + const itemHeight = item.data['height']; + const tx = pageViewport.transformFunction(itemTransform); + // Length of the (tx[2], tx[3]) vector of the viewport-transformed item matrix. + const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]); + const dividedHeight = itemHeight / fontHeight; + const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? 
itemHeight : dividedHeight; + if (newHeight !== itemHeight) { + newItems.push(item.withDataAddition({ height: newHeight })); + } else { + newItems.push(item); + } + }); + return newItems; + } +} diff --git a/core/src/transformer/CalculateCoordinates.ts b/core/src/transformer/CalculateCoordinates.ts new file mode 100644 index 0000000..9e30a00 --- /dev/null +++ b/core/src/transformer/CalculateCoordinates.ts @@ -0,0 +1,19 @@ +import Item from '../Item'; +import ItemTransformer from './ItemTransformer'; +import TransformContext from './TransformContext'; + +export default class CalculateCoordinates extends ItemTransformer { + constructor() { + super('Calculate Coordinates', { + consumes: ['transform'], + produces: ['X', 'Y'], + removes: ['transform'], + }); + } + + transform(context: TransformContext, items: Item[]): Item[] { + // const transform: number[] = item.value['Transform']; + // NOTE(review): dropping the first item looks like a placeholder - confirm. Copy so the caller's array stays intact. + return items.slice(1); + } +} diff --git a/core/src/transformer/ItemTransformer.ts b/core/src/transformer/ItemTransformer.ts new file mode 100644 index 0000000..d89284f --- /dev/null +++ b/core/src/transformer/ItemTransformer.ts @@ -0,0 +1,25 @@ +import TransformerDescription from '../TransformerDescription'; +import type Item from '../Item'; +import TransformContext from './TransformContext'; + +export default abstract class ItemTransformer { + readonly name: string; + readonly description: TransformerDescription; + + constructor(name: string, description: TransformerDescription) { + this.name = name; + this.description = { + ...{ + consumesGlobels: [], + producesGlobels: [], + consumes: [], + produces: [], + removes: [], + }, + ...description, + }; + } + + // columnar-changes: described + abstract transform(context: TransformContext, items: Item[]): Item[]; +} diff --git a/core/src/transformer/TransformContext.ts b/core/src/transformer/TransformContext.ts new file mode 100644 index 0000000..97fa2c3 --- /dev/null +++ b/core/src/transformer/TransformContext.ts @@ -0,0 +1,5 @@ +import 
PageViewport from 'src/parse/PageViewport'; + +export default interface TransformContext { + pageViewports: PageViewport[]; +} diff --git a/core/src/transformer/transformerUtil.ts b/core/src/transformer/transformerUtil.ts new file mode 100644 index 0000000..68fcc49 --- /dev/null +++ b/core/src/transformer/transformerUtil.ts @@ -0,0 +1,55 @@ +import TransformerDescription from 'src/TransformerDescription'; +import { assert } from '../assert'; +import ItemTransformer from './ItemTransformer'; + +/** + * Goes through all transformer and makes sure each required column ({@link TransformerDescription#consumes}) is available in its predecessor schema. + * + * @param initialSchema + * @param transformers + */ +export function verifyRequiredColumns(initialSchema: string[], transformers: ItemTransformer[]) { + calculateSchemas(initialSchema, transformers); +} + +//TODO debug schema +// initial - all unanotated +// second - 2 removed, 1 added +// third - all as before without the removed + +export function calculateSchemas(initialSchema: string[], transformers: ItemTransformer[]): string[][] { + const schemas: string[][] = []; + schemas.push(initialSchema); + for (let idx = 0; idx < transformers.length; idx++) { + const transformer = transformers[idx]; + const inputSchema = schemas[idx]; + validateReferences(inputSchema, transformer.name, transformer.description); + const outputSchema = inputSchema.filter((column) => !transformer.description.removes?.includes(column)); + transformer.description.produces?.forEach((column) => outputSchema.push(column)); + schemas.push(outputSchema); + } + return schemas; +} + +function validateReferences( + inputSchema: string[], + transformerName: string, + transformerDescription: TransformerDescription, +) { + transformerDescription.consumes?.forEach((column) => { + assert( + inputSchema.includes(column), + `Input schema [${inputSchema.join( + ', ', + )}] for transformer '${transformerName}' does not contain the required column '${column}' 
(consumes)`, + ); + }); + transformerDescription.removes?.forEach((column) => { + assert( + inputSchema.includes(column), + `Input schema [${inputSchema.join( + ', ', + )}] for transformer '${transformerName}' does not contain the required column '${column}' (removes)`, + ); + }); +} diff --git a/core/test/Debugger.test.ts b/core/test/Debugger.test.ts new file mode 100644 index 0000000..03623d5 --- /dev/null +++ b/core/test/Debugger.test.ts @@ -0,0 +1,38 @@ +import Debugger from 'src/Debugger'; +import Item from 'src/Item'; +import ItemTransformer from 'src/transformer/ItemTransformer'; +import Metadata from 'src/Metadata'; +import ParseResult from 'src/ParseResult'; +import TransformerDescription from 'src/TransformerDescription'; +import TransformContext from 'src/transformer/TransformContext'; + +class TestTransformer extends ItemTransformer { + items: Item[]; + constructor(name: string, description: TransformerDescription, items: Item[]) { + super(name, description); + this.items = items; + } + transform(_: TransformContext, items: Item[]): Item[] { + return this.items; + } +} + +test('basic debug', async () => { + const parsedSchema = ['A', 'B']; + const parsedItems = [new Item(0, { A: 'a_row1', B: 'b_row1' }), new Item(0, { A: 'a_row2', B: 'b_row2' })]; + + const trans1Desc = { consumes: ['A', 'B'], produces: ['C'], removes: ['A', 'B'] }; + const trans1Items = parsedItems.map((item) => item.withData({ C: `c=${item.value('A')}+${item.value('B')}` })); + + const transformers = [new TestTransformer('Trans1', trans1Desc, trans1Items)]; + const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers); + + expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']); + expect(debug.stageSchema).toEqual([parsedSchema, ['C']]); + for (let index = 0; index < debug.stageNames.length; index++) { + console.log(index, debug.stageResults(index)); + } + + expect(debug.stageResults(0)).toEqual(parsedItems); + 
expect(debug.stageResults(1)).toEqual(trans1Items); +}); diff --git a/core/test/PdfParser.test.ts b/core/test/PdfParser.test.ts index 67b619b..fc70717 100644 --- a/core/test/PdfParser.test.ts +++ b/core/test/PdfParser.test.ts @@ -12,7 +12,7 @@ test('basic example PDF parse', async () => { const data = fs.readFileSync('../examples/ExamplePdf.pdf', null); // to test - const result = await parser.parseBytes( + const result = await parser.parse( data, new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)), ); @@ -29,7 +29,7 @@ test('basic example PDF parse', async () => { expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]); // verify first n items - expect(result.items.slice(0, 16)).toEqual([ + expect(result.items.slice(0, 16).map((item) => item.withoutUuid())).toEqual([ new Item(0, { str: 'Mega Überschrift', dir: 'ltr', @@ -37,7 +37,7 @@ test('basic example PDF parse', async () => { height: 30, transform: [30, 0, 0, 30, 175, 756], fontName: 'g_d0_f1', - }), + }).withoutUuid(), new Item(0, { str: '2te Überschrift', dir: 'ltr', @@ -45,7 +45,7 @@ test('basic example PDF parse', async () => { height: 20, transform: [20, 0, 0, 20, 233, 665], fontName: 'g_d0_f2', - }), + }).withoutUuid(), new Item(0, { str: 'Dies ist eine Test-PDF', dir: 'ltr', @@ -53,7 +53,7 @@ test('basic example PDF parse', async () => { height: 11, transform: [11, 0, 0, 11, 240, 585], fontName: 'g_d0_f2', - }), + }).withoutUuid(), new Item(0, { str: '.', dir: 'ltr', @@ -61,7 +61,7 @@ test('basic example PDF parse', async () => { height: 11, transform: [11, 0, 0, 11, 352.6927, 585], fontName: 'g_d0_f2', - }), + }).withoutUuid(), new Item(0, { str: '1', dir: 'ltr', @@ -69,7 +69,7 @@ test('basic example PDF parse', async () => { height: 7.333334, transform: [7.333334, 0, 0, 7.333334, 348, 588], fontName: 'g_d0_f2', - }), + }).withoutUuid(), new Item(0, { str: 'Für’s Testen des ', dir: 'ltr', @@ -77,7 
+77,7 @@ test('basic example PDF parse', async () => { height: 11, transform: [11, 0, 0, 11, 208, 572], fontName: 'g_d0_f2', - }), + }).withoutUuid(), new Item(0, { str: 'Markdown Parsers', dir: 'ltr', @@ -85,7 +85,7 @@ test('basic example PDF parse', async () => { height: 11, transform: [11, 0, 0, 11, 291.77832, 572], fontName: 'g_d0_f2', - }), + }).withoutUuid(), new Item(0, { str: '.', dir: 'ltr', @@ -93,7 +93,7 @@ test('basic example PDF parse', async () => { height: 11, transform: [11, 0, 0, 11, 383.47360000000003, 572], fontName: 'g_d0_f2', - }), + }).withoutUuid(), new Item(0, { str: ' ', dir: 'ltr', @@ -101,7 +101,7 @@ test('basic example PDF parse', async () => { height: 11, transform: [11, 0, 0, 11, 61.078451, 59], fontName: 'g_d0_f2', - }), + }).withoutUuid(), new Item(0, { str: 'In Deutsch.', dir: 'ltr', @@ -109,7 +109,7 @@ test('basic example PDF parse', async () => { height: 11, transform: [11, 0, 0, 11, 64.134603, 59], fontName: 'g_d0_f2', - }), + }).withoutUuid(), new Item(0, { str: '1', dir: 'ltr', @@ -117,7 +117,7 @@ test('basic example PDF parse', async () => { height: 7.333334, transform: [7.333334, 0, 0, 7.333334, 57, 62], fontName: 'g_d0_f2', - }), + }).withoutUuid(), new Item(0, { str: '\x00', dir: 'ltr', @@ -125,7 +125,7 @@ test('basic example PDF parse', async () => { height: 12, transform: [12, 0, 0, 12, 294, 45], fontName: 'g_d0_f3', - }), + }).withoutUuid(), new Item(0, { str: '1', dir: 'ltr', @@ -133,7 +133,7 @@ test('basic example PDF parse', async () => { height: 12, transform: [12, 0, 0, 12, 294, 45], fontName: 'g_d0_f2', - }), + }).withoutUuid(), new Item(1, { str: '\x00', dir: 'ltr', @@ -141,7 +141,7 @@ test('basic example PDF parse', async () => { height: 12, transform: [12, 0, 0, 12, 294, 45], fontName: 'g_d0_f3', - }), + }).withoutUuid(), new Item(1, { str: '2', dir: 'ltr', @@ -149,7 +149,7 @@ test('basic example PDF parse', async () => { height: 12, transform: [12, 0, 0, 12, 294, 45], fontName: 'g_d0_f2', - }), + 
}).withoutUuid(), new Item(2, { str: 'Paragraphen', dir: 'ltr', @@ -157,7 +157,7 @@ test('basic example PDF parse', async () => { height: 18, transform: [18, 0, 0, 18, 57, 767], fontName: 'g_d0_f1', - }), + }).withoutUuid(), ]); // verify progress diff --git a/core/test/transformer/transformerUtil.test.ts b/core/test/transformer/transformerUtil.test.ts new file mode 100644 index 0000000..084eac7 --- /dev/null +++ b/core/test/transformer/transformerUtil.test.ts @@ -0,0 +1,59 @@ +import TransformerDescription from 'src/TransformerDescription'; +import Item from 'src/Item'; +import ItemTransformer from 'src/transformer/ItemTransformer'; +import TransformContext from 'src/transformer/TransformContext'; +import { calculateSchemas, verifyRequiredColumns } from 'src/transformer/transformerUtil'; + +class TestSchemaTransformer extends ItemTransformer { + constructor(name: string, description: TransformerDescription) { + super(name, description); + } + transform(_: TransformContext, items: Item[]): Item[] { + return items; + } +} + +test('verify valid transform', async () => { + const inputSchema = ['A', 'B', 'C']; + + const transformers = [ + new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }), + new TestSchemaTransformer('Create E', { produces: ['E'] }), + new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }), + ]; + verifyRequiredColumns(inputSchema, transformers); +}); + +test('verify invalid consume', async () => { + const inputSchema = ['A', 'B', 'C']; + + const transformers = [new TestSchemaTransformer('Consumes X', { consumes: ['X'] })]; + expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError( + "Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X' (consumes)", + ); +}); + +test('verify invalid remove', async () => { + const inputSchema = ['A', 'B', 'C']; + + const transformers = [new TestSchemaTransformer('Removes X', { 
removes: ['X'] })]; + expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError( + "Input schema [A, B, C] for transformer 'Removes X' does not contain the required column 'X' (removes)", + ); +}); + +test('calculate schemas', async () => { + const inputSchema = ['A', 'B', 'C']; + + const transformers = [ + new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }), + new TestSchemaTransformer('Create E', { produces: ['E'] }), + new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }), + ]; + expect(calculateSchemas(inputSchema, transformers)).toEqual([ + ['A', 'B', 'C'], + ['A', 'D'], + ['A', 'D', 'E'], + ['A', 'D', 'E'], + ]); +}); diff --git a/ui/package-lock.json b/ui/package-lock.json index ffc5d62..1f5c85c 100644 --- a/ui/package-lock.json +++ b/ui/package-lock.json @@ -5137,8 +5137,7 @@ "uuid": { "version": "8.3.2", "resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz", - "integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==", - "dev": true + "integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==" }, "v8-to-istanbul": { "version": "7.0.0", diff --git a/ui/package.json b/ui/package.json index 32da37a..779b526 100644 --- a/ui/package.json +++ b/ui/package.json @@ -21,7 +21,8 @@ }, "dependencies": { "pdfjs-dist": "^2.5.207", - "svelte-file-dropzone": "0.0.15" + "svelte-file-dropzone": "0.0.15", + "uuid": "^8.3.2" }, "devDependencies": { "@snowpack/plugin-dotenv": "^2.0.5", diff --git a/ui/src/App.svelte b/ui/src/App.svelte index 14d2b9c..626fa5a 100644 --- a/ui/src/App.svelte +++ b/ui/src/App.svelte @@ -1,14 +1,14 @@
PDF to Markdown Converter
- {#if $parseResult} - + {#if $debug} + {:else} {/if} diff --git a/ui/src/Result.svelte b/ui/src/Result.svelte deleted file mode 100644 index e4a8697..0000000 --- a/ui/src/Result.svelte +++ /dev/null @@ -1,16 +0,0 @@ - - -
-
-
Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items
-
Title: {parseResult.metadata.title()}
-
Author: {parseResult.metadata.author()}
-
- - - diff --git a/ui/src/Table.svelte b/ui/src/Table.svelte deleted file mode 100644 index 0d23cf4..0000000 --- a/ui/src/Table.svelte +++ /dev/null @@ -1,172 +0,0 @@ - - - -
-
- - (openedPageIndex = !openedPageIndex)}> - - - - - {#if openedPageIndex} -
- - - -
- {#each new Array(maxPage + 1) as _, idx} -
itemsGroupedByPage.has(idx) && focusOnPage(idx)} - class="px-2 border border-gray-300 rounded-full text-center {itemsGroupedByPage.has(idx) ? 'hover:text-green-700 hover:border-green-700 cursor-pointer' : 'opacity-50'}"> - {idx} -
- {/each} -
-
- {/if} -
- -
|
-
Transformation:
- -
Parse Result
- -
-
- - -
- - - - {#each columns as column} - - {/each} - - - {#each [...itemsGroupedByPage].filter(([page]) => !focused || page === focusedPage) as [pageNumber, items], pageIdx} - - {#if pageIdx > 0} - - {/if} - {#each items as item, itemIdx} - - - {#if itemIdx === 0} - - {:else} - - {#each columns as column} - - {/each} - - {/each} - {/each} - -
- #{column}
-
Page {pageNumber} {focused ? '' : ' / ' + maxPage}
-
- {#if !focused} - focusOnPage(pageNumber)}> - - - {:else} - - - - {/if} -
-
- {/if} - {itemIdx}{format(item.data[column])}
- - diff --git a/ui/src/debug/DebugView.svelte b/ui/src/debug/DebugView.svelte new file mode 100644 index 0000000..b5fa76d --- /dev/null +++ b/ui/src/debug/DebugView.svelte @@ -0,0 +1,107 @@ + + +
+
+ +
+ + +
+
+ {#if pageFocus} + + + + {/if} + + (openedPageIndex = !openedPageIndex)}> + + + + + {#if openedPageIndex} +
+ + + +
+ {#each new Array(maxPage + 1) as _, idx} +
pagesNumbers.has(idx) && focusOnPage(idx)} + class="px-2 border border-gray-300 rounded-full text-center {pagesNumbers.has(idx) ? 'hover:text-green-700 hover:border-green-700 cursor-pointer' : 'opacity-50'}"> + {idx} +
+ {/each} +
+
+ {/if} +
+ +
|
+
Transformation:
+ canPrev && currentStage--}> + + + canNext && currentStage++}> + + +
{stageNames[currentStage]}
+
+
+ + +
+ + diff --git a/ui/src/debug/ItemTable.svelte b/ui/src/debug/ItemTable.svelte new file mode 100644 index 0000000..73f5cf5 --- /dev/null +++ b/ui/src/debug/ItemTable.svelte @@ -0,0 +1,93 @@ + + + + + + + + {#each schema as column} + + {/each} + + + {#each itemsByPage as [pageNumber, items], pageIdx} + + {#if pageIdx > 0} + + {/if} + {#each items as item, itemIdx} + + + {#if itemIdx === 0} + + {:else} + + {#each schema as column} + + {/each} + + {/each} + {/each} + +
+ #{column}
+
Page {pageNumber} {pageFocus ? '' : ' / ' + maxPage}
+
+ {/if} + {itemIdx}{format(item.data[column])}
+ + diff --git a/ui/src/store.ts b/ui/src/store.ts index 1a523be..3d98a38 100644 --- a/ui/src/store.ts +++ b/ui/src/store.ts @@ -1,21 +1,23 @@ -import { pdfParser, parseReporter } from '@core'; +import { pdfParser, createPipeline, parseReporter } from '@core'; import type ProgressListenFunction from '@core/ProgressListenFunction'; import type ParseResult from '@core/ParseResult'; +import type Debugger from '@core/Debugger'; import * as pdfjs from 'pdfjs-dist/es5/build/pdf'; import { Writable, writable } from 'svelte/store'; +export let debug: Writable<Debugger> = writable(undefined); export let parseResult: Writable = writable(undefined); pdfjs.GlobalWorkerOptions.workerSrc = 'worker/pdf.worker.min.js'; -const parser = pdfParser(pdfjs); +const pdfPipeline = createPipeline(pdfjs, {}); -export async function loadExample(progressListener: ProgressListenFunction): Promise { - return parsePdf(parser.parseUrl('/ExamplePdf.pdf', parseReporter(progressListener))); +export async function loadExample(progressListener: ProgressListenFunction): Promise<Debugger> { + return parsePdf('/ExamplePdf.pdf', progressListener); } -export async function processUpload(file: File, progressListener: ProgressListenFunction): Promise { +export async function processUpload(file: File, progressListener: ProgressListenFunction): Promise<Debugger> { return new Promise((resolve, reject) => { const reader = new FileReader(); reader.onerror = reject; @@ -25,13 +27,19 @@ export async function processUpload(file: File, progressListener: ProgressListen reader.readAsArrayBuffer(file); }).then((buffer) => { const data = new Uint8Array(buffer as ArrayBuffer); - return parsePdf(parser.parseBytes(data, parseReporter(progressListener))); + return parsePdf(data, progressListener); }); } -async function parsePdf(parsePromise: Promise): Promise { - return parsePromise.then((result) => { - parseResult.set(result); - return result; +/** Runs the pipeline in debug mode, publishes the Debugger to the `debug` store and resolves with it. */ +async function parsePdf(src: string | Uint8Array, progressListener: ProgressListenFunction): Promise<Debugger> { + 
return pdfPipeline.debug(src, progressListener).then((debugInstance) => { + debug.set(debugInstance); + return debugInstance; }); + //TODO without debug-flag + // return pdfPipeline.execute(src, progressListener).then((result) => { + // parseResult.set(result); + // return result; + // }); }