From 1b530c6c2986399227231d9537249c8c28e909f4 Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Sat, 20 Feb 2021 13:06:29 +0100 Subject: [PATCH] Fetch fontObjects --- core/src/ParseProgressReporter.ts | 3 +- core/src/ParseResult.ts | 17 ++++- core/src/PdfParser.ts | 95 +++++++++++++++--------- core/src/PdfPipeline.ts | 4 +- core/src/transformer/TransformContext.ts | 1 + core/test/PdfParser.test.ts | 13 ++-- 6 files changed, 84 insertions(+), 49 deletions(-) diff --git a/core/src/ParseProgressReporter.ts b/core/src/ParseProgressReporter.ts index 9770f86..6bcd668 100644 --- a/core/src/ParseProgressReporter.ts +++ b/core/src/ParseProgressReporter.ts @@ -3,7 +3,7 @@ import type ProgressListenFunction from './ProgressListenFunction'; import Progress from './Progress'; export default class ParseProgressReporter implements ParseReporter { - progress = new Progress(['Document Header', 'Metadata', 'Pages', 'Fonts'], [0.1, 0.1, 0.7, 0.1]); + progress = new Progress(['Document Header', 'Metadata', 'Pages', 'Fonts'], [0.01, 0.01, 0.97, 0.01]); pagesToParse = 0; progressListenFunction: ProgressListenFunction; @@ -32,5 +32,6 @@ export default class ParseProgressReporter implements ParseReporter { parsedFonts(): void { this.progress.stageProgress[3] = 1; + this.progressListenFunction(this.progress); } } diff --git a/core/src/ParseResult.ts b/core/src/ParseResult.ts index b92d52a..161bdfc 100644 --- a/core/src/ParseResult.ts +++ b/core/src/ParseResult.ts @@ -3,14 +3,23 @@ import type Metadata from './Metadata'; import type PageViewport from './parse/PageViewport'; export default class ParseResult { - pdfPages: any[]; + fontMap: Map; + pdfjsPages: any[]; pageViewports: PageViewport[]; metadata: Metadata; schema: string[]; items: Item[]; - constructor(pdfPages: any[], pageViewports: PageViewport[], metadata: Metadata, schema: string[], items: Item[]) { - this.pdfPages = pdfPages; + constructor( + fontMap: Map, + pdfjsPages: any[], + pageViewports: PageViewport[], + metadata: Metadata, + schema: string[], + items: Item[], + ) { + this.fontMap = fontMap; + this.pdfjsPages = pdfjsPages; this.pageViewports = pageViewports; this.metadata = metadata; this.schema = schema; @@ -18,6 +27,6 @@ export default class ParseResult { } pageCount(): number { - return this.pdfPages.length; + return this.pdfjsPages.length; } } diff --git a/core/src/PdfParser.ts b/core/src/PdfParser.ts index 7406062..fe07abe 100644 --- a/core/src/PdfParser.ts +++ b/core/src/PdfParser.ts @@ -20,30 +20,80 @@ export default class PdfParser { const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) }; return this.pdfjs .getDocument(documentInitParameters) - .promise.then((pdfDocument: any) => { - reporter.parsedDocumentHeader(pdfDocument.numPages); + .promise.then((pdfjsDocument: any) => { + reporter.parsedDocumentHeader(pdfjsDocument.numPages); return Promise.all([ - pdfDocument.getMetadata().then((metadata: any) => { + pdfjsDocument.getMetadata().then((pdfjsMetadata: any) => { reporter.parsedMetadata(); - return metadata; + return new Metadata(pdfjsMetadata); }), - this.extractPagesSequentially(pdfDocument, reporter), + this.extractPagesSequentially(pdfjsDocument, reporter), ]); }) - .then(([metadata, pages]) => { - const pdfPages = pages.map((page: any) => page.page); + .then(([metadata, pages]: [Metadata, ParsedPage[]]) => { + return Promise.all([metadata, pages, this.gatherFontObjects(pages).finally(() => reporter.parsedFonts())]); + }) + .then(([metadata, pages, fontMap]: [Metadata, ParsedPage[], Map]) => { + const pdfjsPages = pages.map((page: any) => page.pdfjsPage); const items = pages.reduce((allItems: any[], page: any) => allItems.concat(page.items), []); - const pageViewports = pdfPages.map((page: any) => { + const pageViewports = pdfjsPages.map((page: any) => { const viewPort = page.getViewport({ scale: 1.0 }); return { transformFunction: (itemTransform: number[]) => this.pdfjs.Util.transform(viewPort.transform, itemTransform), }; }); - return new ParseResult(pdfPages, pageViewports, new Metadata(metadata), this.schema, items); + return new ParseResult(fontMap, pdfjsPages, pageViewports, metadata, this.schema, items); }); } + private extractPagesSequentially(pdfjsDocument: any, reporter: ParseReporter): Promise { + return [...Array(pdfjsDocument.numPages)].reduce((accumulatorPromise, _, index) => { + return accumulatorPromise.then((accumulatedResults: ParsedPage[]) => { + return pdfjsDocument.getPage(index + 1).then((pdfjsPage: any) => { + return pdfjsPage + .getTextContent({ + normalizeWhitespace: false, + disableCombineTextItems: true, + }) + .then((textContent: any) => { + const items = textContent.items.map((pdfjsItem: any) => new Item(index, pdfjsItem)); + reporter.parsedPage(index); + return [...accumulatedResults, { index, pdfjsPage, items }]; + }); + }); + }); + }, Promise.resolve([])); + } + + private gatherFontObjects(pages: ParsedPage[]): Promise> { + let result = Promise.resolve(new Map()); + const uniqueFontIds = new Set(); + pages.forEach((page) => { + const unknownPageFonts: string[] = []; + page.items.forEach((item) => { + const fontId = item.data['fontName']; + if (!uniqueFontIds.has(fontId) && fontId.startsWith('g_d')) { + uniqueFontIds.add(fontId); + unknownPageFonts.push(fontId); + } + }); + if (unknownPageFonts.length > 0) { + // console.log(`Fetch fonts ${unknownPageFonts} for page ${page.index}`); + result = result.then((fontMap) => { + return page.pdfjsPage.getOperatorList().then(() => { + unknownPageFonts.forEach((fontId) => { + const fontObject = page.pdfjsPage.commonObjs.get(fontId); + fontMap.set(fontId, fontObject); + }); + return fontMap; + }); + }); + } + }); + return result; + } + private documentInitParameters(src: string | Uint8Array | object): object { if (typeof src === 'string') { return { url: src }; @@ -60,35 +110,10 @@ export default class PdfParser { private isArrayBuffer(object: any) { return typeof object === 'object' && object !== null && object.byteLength !== undefined; } - - private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise { - return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => { - return accumulatorPromise.then((accumulatedResults) => { - return pdfDocument.getPage(index + 1).then((page: any) => { - return this.triggerFontRetrieval(page).then(() => - page - .getTextContent({ - normalizeWhitespace: false, - disableCombineTextItems: true, - }) - .then((textContent: any) => { - const items = textContent.items.map((rawItem: any) => new Item(index, rawItem)); - reporter.parsedPage(index); - return [...accumulatedResults, { index, page, items }]; - }), - ); - }); - }); - }, Promise.resolve([])); - } - - private triggerFontRetrieval(page: any): Promise { - return page.getOperatorList(); - } } interface ParsedPage { index: number; - page: any; + pdfjsPage: any; items: Item[]; } diff --git a/core/src/PdfPipeline.ts b/core/src/PdfPipeline.ts index 69b80a3..18b2d60 100644 --- a/core/src/PdfPipeline.ts +++ b/core/src/PdfPipeline.ts @@ -27,7 +27,7 @@ export default class PdfPipeline { async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise { const parseResult = await this.parse(src, progressListener); this.verifyRequiredColumns(parseResult.schema, this.transformers); - const context = { pageViewports: parseResult.pageViewports }; + const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports }; let items = parseResult.items; this.transformers.forEach((transformer) => { items = transformer.transform(context, items).items; @@ -38,7 +38,7 @@ export default class PdfPipeline { async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise { const parseResult = await this.parse(src, progressListener); - const context = { pageViewports: parseResult.pageViewports }; + const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports }; return new Debugger(parseResult.schema, parseResult.items, context, this.transformers); } diff --git a/core/src/transformer/TransformContext.ts b/core/src/transformer/TransformContext.ts index 2740beb..d8dcfbd 100644 --- a/core/src/transformer/TransformContext.ts +++ b/core/src/transformer/TransformContext.ts @@ -1,5 +1,6 @@ import PageViewport from '../parse/PageViewport'; export default interface TransformContext { + fontMap: Map; pageViewports: PageViewport[]; } diff --git a/core/test/PdfParser.test.ts b/core/test/PdfParser.test.ts index fc70717..08de437 100644 --- a/core/test/PdfParser.test.ts +++ b/core/test/PdfParser.test.ts @@ -22,11 +22,11 @@ test('basic example PDF parse', async () => { expect(result.metadata.title()).toEqual('ExamplePdf'); expect(result.metadata.author()).toEqual('Johannes Zillmann'); expect(result.pageCount()).toBe(expectedPages); - result.pdfPages.forEach((pdfPage, i) => { + result.pdfjsPages.forEach((pdfPage, i) => { expect(pdfPage._pageIndex).toBe(i); }); - expect(result.pdfPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]); - expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]); + expect(result.pdfjsPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]); + expect(result.pdfjsPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]); // verify first n items expect(result.items.slice(0, 16).map((item) => item.withoutUuid())).toEqual([ @@ -161,7 +161,7 @@ test('basic example PDF parse', async () => { ]); // verify progress - expect(progressUpdates.length).toBe(expectedPages + 2); + expect(progressUpdates.length).toBe(expectedPages + 3); progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts'])); expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]); expect(progressUpdates[0].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]); @@ -183,7 +183,6 @@ test('basic example PDF parse', async () => { expect(progressUpdates[7].stageDetails).toEqual([null, null, `6 / ${expectedPages}`, null]); expect(progressUpdates[8].stageProgress).toEqual([1, 1, 7 / expectedPages, 0]); expect(progressUpdates[8].stageDetails).toEqual([null, null, `7 / ${expectedPages}`, null]); - - // expect(progressUpdates[9].stagePercents).toEqual([1, 1, 1, 0]); - // expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]); + expect(progressUpdates[9].stageProgress).toEqual([1, 1, 1, 1]); + expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]); });