Fetch fontObjects

Johannes Zillmann 2021-02-20 13:06:29 +01:00
parent a5b65b5d85
commit 1b530c6c29
6 changed files with 84 additions and 49 deletions

View File

@@ -3,7 +3,7 @@ import type ProgressListenFunction from './ProgressListenFunction';
 import Progress from './Progress';

 export default class ParseProgressReporter implements ParseReporter {
-  progress = new Progress(['Document Header', 'Metadata', 'Pages', 'Fonts'], [0.1, 0.1, 0.7, 0.1]);
+  progress = new Progress(['Document Header', 'Metadata', 'Pages', 'Fonts'], [0.01, 0.01, 0.97, 0.01]);
   pagesToParse = 0;
   progressListenFunction: ProgressListenFunction;
@@ -32,5 +32,6 @@ export default class ParseProgressReporter implements ParseReporter {
   parsedFonts(): void {
     this.progress.stageProgress[3] = 1;
+    this.progressListenFunction(this.progress);
   }
 }
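The reweighting above shifts almost all of the reported progress onto the 'Pages' stage. As a minimal sketch of how such weighted stage progress can be folded into one overall value (the `combinedProgress` helper below is hypothetical and not part of this commit; the real `Progress` class may combine its stages differently):

// Hypothetical helper (not part of this commit): fold per-stage progress
// values (each 0..1) and their weights into one overall progress figure.
function combinedProgress(stageProgress: number[], weights: number[]): number {
  return stageProgress.reduce((sum, value, i) => sum + value * weights[i], 0);
}

// With the new weights, a document whose pages are half parsed reports ~0.505:
combinedProgress([1, 1, 0.5, 0], [0.01, 0.01, 0.97, 0.01]); // 0.505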

View File

@@ -3,14 +3,23 @@ import type Metadata from './Metadata';
 import type PageViewport from './parse/PageViewport';

 export default class ParseResult {
-  pdfPages: any[];
+  fontMap: Map<string, object>;
+  pdfjsPages: any[];
   pageViewports: PageViewport[];
   metadata: Metadata;
   schema: string[];
   items: Item[];

-  constructor(pdfPages: any[], pageViewports: PageViewport[], metadata: Metadata, schema: string[], items: Item[]) {
-    this.pdfPages = pdfPages;
+  constructor(
+    fontMap: Map<string, object>,
+    pdfjsPages: any[],
+    pageViewports: PageViewport[],
+    metadata: Metadata,
+    schema: string[],
+    items: Item[],
+  ) {
+    this.fontMap = fontMap;
+    this.pdfjsPages = pdfjsPages;
     this.pageViewports = pageViewports;
     this.metadata = metadata;
     this.schema = schema;
@@ -18,6 +27,6 @@ export default class ParseResult {
   }

   pageCount(): number {
-    return this.pdfPages.length;
+    return this.pdfjsPages.length;
   }
 }
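ParseResult now carries the fetched font objects alongside the renamed pdfjsPages. A small consumer sketch (hypothetical helper, not part of this commit; it only touches the members added or renamed here):

// Hypothetical consumer (not part of this commit): summarize the parts of a
// ParseResult that this commit adds or renames.
function describeParseResult(result: { fontMap: Map<string, object>; pageCount(): number }): string {
  return `${result.pageCount()} page(s), ${result.fontMap.size} font object(s) fetched`;
}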

View File

@@ -20,30 +20,80 @@ export default class PdfParser {
     const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) };
     return this.pdfjs
       .getDocument(documentInitParameters)
-      .promise.then((pdfDocument: any) => {
-        reporter.parsedDocumentHeader(pdfDocument.numPages);
+      .promise.then((pdfjsDocument: any) => {
+        reporter.parsedDocumentHeader(pdfjsDocument.numPages);
         return Promise.all([
-          pdfDocument.getMetadata().then((metadata: any) => {
+          pdfjsDocument.getMetadata().then((pdfjsMetadata: any) => {
             reporter.parsedMetadata();
-            return metadata;
+            return new Metadata(pdfjsMetadata);
           }),
-          this.extractPagesSequentially(pdfDocument, reporter),
+          this.extractPagesSequentially(pdfjsDocument, reporter),
         ]);
       })
-      .then(([metadata, pages]) => {
-        const pdfPages = pages.map((page: any) => page.page);
+      .then(([metadata, pages]: [Metadata, ParsedPage[]]) => {
+        return Promise.all([metadata, pages, this.gatherFontObjects(pages).finally(() => reporter.parsedFonts())]);
+      })
+      .then(([metadata, pages, fontMap]: [Metadata, ParsedPage[], Map<string, object>]) => {
+        const pdfjsPages = pages.map((page: any) => page.pdfjsPage);
         const items = pages.reduce((allItems: any[], page: any) => allItems.concat(page.items), []);
-        const pageViewports = pdfPages.map((page: any) => {
+        const pageViewports = pdfjsPages.map((page: any) => {
           const viewPort = page.getViewport({ scale: 1.0 });
           return {
             transformFunction: (itemTransform: number[]) =>
               this.pdfjs.Util.transform(viewPort.transform, itemTransform),
           };
         });
-        return new ParseResult(pdfPages, pageViewports, new Metadata(metadata), this.schema, items);
+        return new ParseResult(fontMap, pdfjsPages, pageViewports, metadata, this.schema, items);
       });
   }

+  private extractPagesSequentially(pdfjsDocument: any, reporter: ParseReporter): Promise<ParsedPage[]> {
+    return [...Array(pdfjsDocument.numPages)].reduce((accumulatorPromise, _, index) => {
+      return accumulatorPromise.then((accumulatedResults: ParsedPage[]) => {
+        return pdfjsDocument.getPage(index + 1).then((pdfjsPage: any) => {
+          return pdfjsPage
+            .getTextContent({
+              normalizeWhitespace: false,
+              disableCombineTextItems: true,
+            })
+            .then((textContent: any) => {
+              const items = textContent.items.map((pdfjsItem: any) => new Item(index, pdfjsItem));
+              reporter.parsedPage(index);
+              return [...accumulatedResults, { index, pdfjsPage, items }];
+            });
+        });
+      });
+    }, Promise.resolve([]));
+  }
+
+  private gatherFontObjects(pages: ParsedPage[]): Promise<Map<string, object>> {
+    let result = Promise.resolve(new Map<string, object>());
+    const uniqueFontIds = new Set<string>();
+    pages.forEach((page) => {
+      const unknownPageFonts: string[] = [];
+      page.items.forEach((item) => {
+        const fontId = item.data['fontName'];
+        if (!uniqueFontIds.has(fontId) && fontId.startsWith('g_d')) {
+          uniqueFontIds.add(fontId);
+          unknownPageFonts.push(fontId);
+        }
+      });
+      if (unknownPageFonts.length > 0) {
+        // console.log(`Fetch fonts ${unknownPageFonts} for page ${page.index}`);
+        result = result.then((fontMap) => {
+          return page.pdfjsPage.getOperatorList().then(() => {
+            unknownPageFonts.forEach((fontId) => {
+              const fontObject = page.pdfjsPage.commonObjs.get(fontId);
+              fontMap.set(fontId, fontObject);
+            });
+            return fontMap;
+          });
+        });
+      }
+    });
+    return result;
+  }
+
   private documentInitParameters(src: string | Uint8Array | object): object {
     if (typeof src === 'string') {
       return { url: src };
@@ -60,35 +60,10 @@ export default class PdfParser {
   private isArrayBuffer(object: any) {
     return typeof object === 'object' && object !== null && object.byteLength !== undefined;
   }
-
-  private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> {
-    return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
-      return accumulatorPromise.then((accumulatedResults) => {
-        return pdfDocument.getPage(index + 1).then((page: any) => {
-          return this.triggerFontRetrieval(page).then(() =>
-            page
-              .getTextContent({
-                normalizeWhitespace: false,
-                disableCombineTextItems: true,
-              })
-              .then((textContent: any) => {
-                const items = textContent.items.map((rawItem: any) => new Item(index, rawItem));
-                reporter.parsedPage(index);
-                return [...accumulatedResults, { index, page, items }];
-              }),
-          );
-        });
-      });
-    }, Promise.resolve([]));
-  }
-
-  private triggerFontRetrieval(page: any): Promise<void> {
-    return page.getOperatorList();
-  }
 }

 interface ParsedPage {
   index: number;
-  page: any;
+  pdfjsPage: any;
   items: Item[];
 }
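Both the new extractPagesSequentially and gatherFontObjects rely on the same reduce-over-promises idiom so that pdf.js pages are processed strictly one after another. A stripped-down, generic sketch of that idiom (names are illustrative, not part of the commit):

// Generic sketch of the reduce-over-promises idiom used above: each input is
// handled only after the previous promise resolves, and results accumulate.
function mapSequentially<T, R>(inputs: T[], work: (input: T) => Promise<R>): Promise<R[]> {
  return inputs.reduce(
    (chain, input) => chain.then((results) => work(input).then((result) => [...results, result])),
    Promise.resolve([] as R[]),
  );
}

// mapSequentially([1, 2, 3], (n) => Promise.resolve(n * 2)) resolves to [2, 4, 6].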

View File

@@ -27,7 +27,7 @@ export default class PdfPipeline {
   async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
     const parseResult = await this.parse(src, progressListener);
     this.verifyRequiredColumns(parseResult.schema, this.transformers);
-    const context = { pageViewports: parseResult.pageViewports };
+    const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports };
     let items = parseResult.items;
     this.transformers.forEach((transformer) => {
       items = transformer.transform(context, items).items;
@@ -38,7 +38,7 @@ export default class PdfPipeline {
   async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<Debugger> {
     const parseResult = await this.parse(src, progressListener);
-    const context = { pageViewports: parseResult.pageViewports };
+    const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports };
     return new Debugger(parseResult.schema, parseResult.items, context, this.transformers);
   }

View File

@@ -1,5 +1,6 @@
 import PageViewport from '../parse/PageViewport';

 export default interface TransformContext {
+  fontMap: Map<string, object>;
   pageViewports: PageViewport[];
 }
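With fontMap exposed on the context, a transformer can resolve an item's pdf.js font ID to the fetched font object. A sketch (hypothetical helper; it assumes items keep the pdf.js fontName in item.data, as gatherFontObjects does above, and that the stored font object exposes a 'name' property, which is an assumption about pdf.js internals rather than something this commit guarantees):

// Hypothetical helper (not part of this commit): look up the font object for
// an item via the fontMap now carried in the TransformContext.
function fontNameOf(context: { fontMap: Map<string, object> }, item: { data: any }): string | undefined {
  const fontObject = context.fontMap.get(item.data['fontName']) as { name?: string } | undefined;
  return fontObject?.name; // 'name' is an assumed property of the pdf.js font object
}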

View File

@@ -22,11 +22,11 @@ test('basic example PDF parse', async () => {
   expect(result.metadata.title()).toEqual('ExamplePdf');
   expect(result.metadata.author()).toEqual('Johannes Zillmann');
   expect(result.pageCount()).toBe(expectedPages);
-  result.pdfPages.forEach((pdfPage, i) => {
+  result.pdfjsPages.forEach((pdfPage, i) => {
     expect(pdfPage._pageIndex).toBe(i);
   });
-  expect(result.pdfPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]);
-  expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
+  expect(result.pdfjsPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]);
+  expect(result.pdfjsPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);

   // verify first n items
   expect(result.items.slice(0, 16).map((item) => item.withoutUuid())).toEqual([
@@ -161,7 +161,7 @@ test('basic example PDF parse', async () => {
   ]);

   // verify progress
-  expect(progressUpdates.length).toBe(expectedPages + 2);
+  expect(progressUpdates.length).toBe(expectedPages + 3);
   progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts']));
   expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]);
   expect(progressUpdates[0].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);
@@ -183,7 +183,6 @@ test('basic example PDF parse', async () => {
   expect(progressUpdates[7].stageDetails).toEqual([null, null, `6 / ${expectedPages}`, null]);
   expect(progressUpdates[8].stageProgress).toEqual([1, 1, 7 / expectedPages, 0]);
   expect(progressUpdates[8].stageDetails).toEqual([null, null, `7 / ${expectedPages}`, null]);
-  // expect(progressUpdates[9].stagePercents).toEqual([1, 1, 1, 0]);
-  // expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]);
+  expect(progressUpdates[9].stageProgress).toEqual([1, 1, 1, 1]);
+  expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]);
 });