Fetch fontObjects

This commit is contained in:
Johannes Zillmann 2021-02-20 13:06:29 +01:00
parent a5b65b5d85
commit 1b530c6c29
6 changed files with 84 additions and 49 deletions

View File

@ -3,7 +3,7 @@ import type ProgressListenFunction from './ProgressListenFunction';
import Progress from './Progress';
export default class ParseProgressReporter implements ParseReporter {
progress = new Progress(['Document Header', 'Metadata', 'Pages', 'Fonts'], [0.1, 0.1, 0.7, 0.1]);
progress = new Progress(['Document Header', 'Metadata', 'Pages', 'Fonts'], [0.01, 0.01, 0.97, 0.01]);
pagesToParse = 0;
progressListenFunction: ProgressListenFunction;
@ -32,5 +32,6 @@ export default class ParseProgressReporter implements ParseReporter {
/**
 * Flags the 'Fonts' stage as complete and hands the updated progress to the listener.
 */
parsedFonts(): void {
  const { progress } = this;
  // Slot 3 corresponds to the 'Fonts' entry of the stage list passed to Progress.
  progress.stageProgress[3] = 1;
  this.progressListenFunction(progress);
}
}

View File

@ -3,14 +3,23 @@ import type Metadata from './Metadata';
import type PageViewport from './parse/PageViewport';
export default class ParseResult {
pdfPages: any[];
fontMap: Map<string, object>;
pdfjsPages: any[];
pageViewports: PageViewport[];
metadata: Metadata;
schema: string[];
items: Item[];
constructor(pdfPages: any[], pageViewports: PageViewport[], metadata: Metadata, schema: string[], items: Item[]) {
this.pdfPages = pdfPages;
constructor(
fontMap: Map<string, object>,
pdfjsPages: any[],
pageViewports: PageViewport[],
metadata: Metadata,
schema: string[],
items: Item[],
) {
this.fontMap = fontMap;
this.pdfjsPages = pdfjsPages;
this.pageViewports = pageViewports;
this.metadata = metadata;
this.schema = schema;
@ -18,6 +27,6 @@ export default class ParseResult {
}
pageCount(): number {
return this.pdfPages.length;
return this.pdfjsPages.length;
}
}

View File

@ -20,30 +20,80 @@ export default class PdfParser {
const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) };
return this.pdfjs
.getDocument(documentInitParameters)
.promise.then((pdfDocument: any) => {
reporter.parsedDocumentHeader(pdfDocument.numPages);
.promise.then((pdfjsDocument: any) => {
reporter.parsedDocumentHeader(pdfjsDocument.numPages);
return Promise.all([
pdfDocument.getMetadata().then((metadata: any) => {
pdfjsDocument.getMetadata().then((pdfjsMetadata: any) => {
reporter.parsedMetadata();
return metadata;
return new Metadata(pdfjsMetadata);
}),
this.extractPagesSequentially(pdfDocument, reporter),
this.extractPagesSequentially(pdfjsDocument, reporter),
]);
})
.then(([metadata, pages]) => {
const pdfPages = pages.map((page: any) => page.page);
.then(([metadata, pages]: [Metadata, ParsedPage[]]) => {
return Promise.all([metadata, pages, this.gatherFontObjects(pages).finally(() => reporter.parsedFonts())]);
})
.then(([metadata, pages, fontMap]: [Metadata, ParsedPage[], Map<string, object>]) => {
const pdfjsPages = pages.map((page: any) => page.pdfjsPage);
const items = pages.reduce((allItems: any[], page: any) => allItems.concat(page.items), []);
const pageViewports = pdfPages.map((page: any) => {
const pageViewports = pdfjsPages.map((page: any) => {
const viewPort = page.getViewport({ scale: 1.0 });
return {
transformFunction: (itemTransform: number[]) =>
this.pdfjs.Util.transform(viewPort.transform, itemTransform),
};
});
return new ParseResult(pdfPages, pageViewports, new Metadata(metadata), this.schema, items);
return new ParseResult(fontMap, pdfjsPages, pageViewports, metadata, this.schema, items);
});
}
/**
 * Reads the pages of the document strictly one after another and collects their text items.
 *
 * Every step waits for the previous page before requesting the next one, so the
 * reporter is notified in page order.
 *
 * @param pdfjsDocument the loaded pdf.js document; `numPages` and `getPage()` are used
 * @param reporter gets a `parsedPage(index)` call once a page's items are extracted
 * @returns a promise resolving with one entry per page, in page order
 */
private extractPagesSequentially(pdfjsDocument: any, reporter: ParseReporter): Promise<ParsedPage[]> {
  let chain: Promise<ParsedPage[]> = Promise.resolve([]);
  for (const pageIndex of Array(pdfjsDocument.numPages).keys()) {
    chain = chain.then(async (parsedSoFar) => {
      // The index used here is zero-based, getPage() is called with index + 1.
      const pdfjsPage = await pdfjsDocument.getPage(pageIndex + 1);
      const textContent = await pdfjsPage.getTextContent({
        normalizeWhitespace: false,
        disableCombineTextItems: true,
      });
      const items = textContent.items.map((pdfjsItem: any) => new Item(pageIndex, pdfjsItem));
      reporter.parsedPage(pageIndex);
      return [...parsedSoFar, { index: pageIndex, pdfjsPage, items }];
    });
  }
  return chain;
}
/**
 * Collects the font objects for the fonts referenced by the parsed items.
 *
 * Only font ids starting with 'g_d' are looked up, and every id is fetched at most
 * once (from the first page whose items mention it). Pages that introduce no new
 * font id are skipped entirely. The lookups are chained, so the pages are processed
 * one after another.
 *
 * @param pages the parsed pages whose items carry a 'fontName' value
 * @returns a promise resolving with a map from font id to the object returned by
 *          `pdfjsPage.commonObjs.get(fontId)`
 */
private gatherFontObjects(pages: ParsedPage[]): Promise<Map<string, object>> {
  let result = Promise.resolve(new Map<string, object>());
  const uniqueFontIds = new Set<string>();
  pages.forEach((page) => {
    const unknownPageFonts: string[] = [];
    page.items.forEach((item) => {
      const fontId: unknown = item.data['fontName'];
      // Items without a string font name cannot reference a font object; skip them
      // instead of failing on `startsWith`.
      if (typeof fontId !== 'string' || !fontId.startsWith('g_d') || uniqueFontIds.has(fontId)) {
        return;
      }
      uniqueFontIds.add(fontId);
      unknownPageFonts.push(fontId);
    });
    if (unknownPageFonts.length > 0) {
      result = result.then((fontMap) =>
        // commonObjs is only read after getOperatorList() has resolved.
        // NOTE(review): presumably pdf.js resolves the page's font objects while
        // building the operator list — confirm against the pdf.js documentation.
        page.pdfjsPage.getOperatorList().then(() => {
          unknownPageFonts.forEach((fontId) => {
            fontMap.set(fontId, page.pdfjsPage.commonObjs.get(fontId));
          });
          return fontMap;
        }),
      );
    }
  });
  return result;
}
private documentInitParameters(src: string | Uint8Array | object): object {
if (typeof src === 'string') {
return { url: src };
@ -60,35 +110,10 @@ export default class PdfParser {
/**
 * Tells whether the given value is a non-null object that exposes a `byteLength` property.
 */
private isArrayBuffer(object: any) {
  if (object === null || typeof object !== 'object') {
    return false;
  }
  return object.byteLength !== undefined;
}
/**
 * Loads the pages of the document one after another and extracts their text items.
 *
 * For each page the operator list is requested first (via triggerFontRetrieval),
 * then the text content is read and converted into Item instances.
 *
 * @param pdfDocument the loaded pdf.js document; `numPages` and `getPage()` are used
 * @param reporter gets a `parsedPage(index)` call once a page's items are extracted
 * @returns a promise resolving with one entry per page, in page order. The declared
 *          type is an array because the reduce accumulates a list of results.
 */
private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage[]> {
  return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
    return accumulatorPromise.then((accumulatedResults: ParsedPage[]) => {
      // The index used here is zero-based, getPage() is called with index + 1.
      return pdfDocument.getPage(index + 1).then((page: any) => {
        return this.triggerFontRetrieval(page).then(() =>
          page
            .getTextContent({
              normalizeWhitespace: false,
              disableCombineTextItems: true,
            })
            .then((textContent: any) => {
              const items = textContent.items.map((rawItem: any) => new Item(index, rawItem));
              reporter.parsedPage(index);
              return [...accumulatedResults, { index, page, items }];
            }),
        );
      });
    });
  }, Promise.resolve([]));
}
/**
 * Requests the operator list of the given page and returns the resulting promise.
 *
 * NOTE(review): judging by the name, this is called for its side effect of making
 * pdf.js load the page's fonts — confirm against the pdf.js documentation.
 */
private triggerFontRetrieval(page: any): Promise<void> {
return page.getOperatorList();
}
}
/**
 * Intermediate result for a single page, produced by extractPagesSequentially.
 */
interface ParsedPage {
// Zero-based page index (the page is requested with getPage(index + 1)).
index: number;
// NOTE(review): this listing shows both `page` and `pdfjsPage`; `page` is the key used by
// the extractPagesSequentially variant that also calls triggerFontRetrieval — confirm which one is current.
page: any;
// The page object resolved by pdfjsDocument.getPage().
pdfjsPage: any;
// The Item instances created from the page's text content items.
items: Item[];
}

View File

@ -27,7 +27,7 @@ export default class PdfPipeline {
async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
const parseResult = await this.parse(src, progressListener);
this.verifyRequiredColumns(parseResult.schema, this.transformers);
const context = { pageViewports: parseResult.pageViewports };
const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports };
let items = parseResult.items;
this.transformers.forEach((transformer) => {
items = transformer.transform(context, items).items;
@ -38,7 +38,7 @@ export default class PdfPipeline {
async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<Debugger> {
const parseResult = await this.parse(src, progressListener);
const context = { pageViewports: parseResult.pageViewports };
const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports };
return new Debugger(parseResult.schema, parseResult.items, context, this.transformers);
}

View File

@ -1,5 +1,6 @@
import PageViewport from '../parse/PageViewport';
/**
 * Shared data handed to every transformer alongside the items.
 */
export default interface TransformContext {
// Font objects keyed by font id, taken from ParseResult.fontMap.
fontMap: Map<string, object>;
// The page viewports taken from ParseResult.pageViewports.
pageViewports: PageViewport[];
}

View File

@ -22,11 +22,11 @@ test('basic example PDF parse', async () => {
expect(result.metadata.title()).toEqual('ExamplePdf');
expect(result.metadata.author()).toEqual('Johannes Zillmann');
expect(result.pageCount()).toBe(expectedPages);
result.pdfPages.forEach((pdfPage, i) => {
result.pdfjsPages.forEach((pdfPage, i) => {
expect(pdfPage._pageIndex).toBe(i);
});
expect(result.pdfPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]);
expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
expect(result.pdfjsPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]);
expect(result.pdfjsPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
// verify first n items
expect(result.items.slice(0, 16).map((item) => item.withoutUuid())).toEqual([
@ -161,7 +161,7 @@ test('basic example PDF parse', async () => {
]);
// verify progress
expect(progressUpdates.length).toBe(expectedPages + 2);
expect(progressUpdates.length).toBe(expectedPages + 3);
progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts']));
expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]);
expect(progressUpdates[0].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);
@ -183,7 +183,6 @@ test('basic example PDF parse', async () => {
expect(progressUpdates[7].stageDetails).toEqual([null, null, `6 / ${expectedPages}`, null]);
expect(progressUpdates[8].stageProgress).toEqual([1, 1, 7 / expectedPages, 0]);
expect(progressUpdates[8].stageDetails).toEqual([null, null, `7 / ${expectedPages}`, null]);
// expect(progressUpdates[9].stagePercents).toEqual([1, 1, 1, 0]);
// expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]);
expect(progressUpdates[9].stageProgress).toEqual([1, 1, 1, 1]);
expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]);
});