mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-30 09:29:43 +01:00
Fetch fontObjects
This commit is contained in:
parent
a5b65b5d85
commit
1b530c6c29
@ -3,7 +3,7 @@ import type ProgressListenFunction from './ProgressListenFunction';
|
||||
import Progress from './Progress';
|
||||
|
||||
export default class ParseProgressReporter implements ParseReporter {
|
||||
progress = new Progress(['Document Header', 'Metadata', 'Pages', 'Fonts'], [0.1, 0.1, 0.7, 0.1]);
|
||||
progress = new Progress(['Document Header', 'Metadata', 'Pages', 'Fonts'], [0.01, 0.01, 0.97, 0.01]);
|
||||
pagesToParse = 0;
|
||||
progressListenFunction: ProgressListenFunction;
|
||||
|
||||
@ -32,5 +32,6 @@ export default class ParseProgressReporter implements ParseReporter {
|
||||
|
||||
parsedFonts(): void {
|
||||
this.progress.stageProgress[3] = 1;
|
||||
this.progressListenFunction(this.progress);
|
||||
}
|
||||
}
|
||||
|
@ -3,14 +3,23 @@ import type Metadata from './Metadata';
|
||||
import type PageViewport from './parse/PageViewport';
|
||||
|
||||
export default class ParseResult {
|
||||
pdfPages: any[];
|
||||
fontMap: Map<string, object>;
|
||||
pdfjsPages: any[];
|
||||
pageViewports: PageViewport[];
|
||||
metadata: Metadata;
|
||||
schema: string[];
|
||||
items: Item[];
|
||||
|
||||
constructor(pdfPages: any[], pageViewports: PageViewport[], metadata: Metadata, schema: string[], items: Item[]) {
|
||||
this.pdfPages = pdfPages;
|
||||
constructor(
|
||||
fontMap: Map<string, object>,
|
||||
pdfjsPages: any[],
|
||||
pageViewports: PageViewport[],
|
||||
metadata: Metadata,
|
||||
schema: string[],
|
||||
items: Item[],
|
||||
) {
|
||||
this.fontMap = fontMap;
|
||||
this.pdfjsPages = pdfjsPages;
|
||||
this.pageViewports = pageViewports;
|
||||
this.metadata = metadata;
|
||||
this.schema = schema;
|
||||
@ -18,6 +27,6 @@ export default class ParseResult {
|
||||
}
|
||||
|
||||
pageCount(): number {
|
||||
return this.pdfPages.length;
|
||||
return this.pdfjsPages.length;
|
||||
}
|
||||
}
|
||||
|
@ -20,30 +20,80 @@ export default class PdfParser {
|
||||
const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) };
|
||||
return this.pdfjs
|
||||
.getDocument(documentInitParameters)
|
||||
.promise.then((pdfDocument: any) => {
|
||||
reporter.parsedDocumentHeader(pdfDocument.numPages);
|
||||
.promise.then((pdfjsDocument: any) => {
|
||||
reporter.parsedDocumentHeader(pdfjsDocument.numPages);
|
||||
return Promise.all([
|
||||
pdfDocument.getMetadata().then((metadata: any) => {
|
||||
pdfjsDocument.getMetadata().then((pdfjsMetadata: any) => {
|
||||
reporter.parsedMetadata();
|
||||
return metadata;
|
||||
return new Metadata(pdfjsMetadata);
|
||||
}),
|
||||
this.extractPagesSequentially(pdfDocument, reporter),
|
||||
this.extractPagesSequentially(pdfjsDocument, reporter),
|
||||
]);
|
||||
})
|
||||
.then(([metadata, pages]) => {
|
||||
const pdfPages = pages.map((page: any) => page.page);
|
||||
.then(([metadata, pages]: [Metadata, ParsedPage[]]) => {
|
||||
return Promise.all([metadata, pages, this.gatherFontObjects(pages).finally(() => reporter.parsedFonts())]);
|
||||
})
|
||||
.then(([metadata, pages, fontMap]: [Metadata, ParsedPage[], Map<string, object>]) => {
|
||||
const pdfjsPages = pages.map((page: any) => page.pdfjsPage);
|
||||
const items = pages.reduce((allItems: any[], page: any) => allItems.concat(page.items), []);
|
||||
const pageViewports = pdfPages.map((page: any) => {
|
||||
const pageViewports = pdfjsPages.map((page: any) => {
|
||||
const viewPort = page.getViewport({ scale: 1.0 });
|
||||
return {
|
||||
transformFunction: (itemTransform: number[]) =>
|
||||
this.pdfjs.Util.transform(viewPort.transform, itemTransform),
|
||||
};
|
||||
});
|
||||
return new ParseResult(pdfPages, pageViewports, new Metadata(metadata), this.schema, items);
|
||||
return new ParseResult(fontMap, pdfjsPages, pageViewports, metadata, this.schema, items);
|
||||
});
|
||||
}
|
||||
|
||||
private extractPagesSequentially(pdfjsDocument: any, reporter: ParseReporter): Promise<ParsedPage[]> {
|
||||
return [...Array(pdfjsDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
||||
return accumulatorPromise.then((accumulatedResults: ParsedPage[]) => {
|
||||
return pdfjsDocument.getPage(index + 1).then((pdfjsPage: any) => {
|
||||
return pdfjsPage
|
||||
.getTextContent({
|
||||
normalizeWhitespace: false,
|
||||
disableCombineTextItems: true,
|
||||
})
|
||||
.then((textContent: any) => {
|
||||
const items = textContent.items.map((pdfjsItem: any) => new Item(index, pdfjsItem));
|
||||
reporter.parsedPage(index);
|
||||
return [...accumulatedResults, { index, pdfjsPage, items }];
|
||||
});
|
||||
});
|
||||
});
|
||||
}, Promise.resolve([]));
|
||||
}
|
||||
|
||||
private gatherFontObjects(pages: ParsedPage[]): Promise<Map<string, object>> {
|
||||
let result = Promise.resolve(new Map<string, object>());
|
||||
const uniqueFontIds = new Set<string>();
|
||||
pages.forEach((page) => {
|
||||
const unknownPageFonts: string[] = [];
|
||||
page.items.forEach((item) => {
|
||||
const fontId = item.data['fontName'];
|
||||
if (!uniqueFontIds.has(fontId) && fontId.startsWith('g_d')) {
|
||||
uniqueFontIds.add(fontId);
|
||||
unknownPageFonts.push(fontId);
|
||||
}
|
||||
});
|
||||
if (unknownPageFonts.length > 0) {
|
||||
// console.log(`Fetch fonts ${unknownPageFonts} for page ${page.index}`);
|
||||
result = result.then((fontMap) => {
|
||||
return page.pdfjsPage.getOperatorList().then(() => {
|
||||
unknownPageFonts.forEach((fontId) => {
|
||||
const fontObject = page.pdfjsPage.commonObjs.get(fontId);
|
||||
fontMap.set(fontId, fontObject);
|
||||
});
|
||||
return fontMap;
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
private documentInitParameters(src: string | Uint8Array | object): object {
|
||||
if (typeof src === 'string') {
|
||||
return { url: src };
|
||||
@ -60,35 +110,10 @@ export default class PdfParser {
|
||||
private isArrayBuffer(object: any) {
|
||||
return typeof object === 'object' && object !== null && object.byteLength !== undefined;
|
||||
}
|
||||
|
||||
private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> {
|
||||
return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
||||
return accumulatorPromise.then((accumulatedResults) => {
|
||||
return pdfDocument.getPage(index + 1).then((page: any) => {
|
||||
return this.triggerFontRetrieval(page).then(() =>
|
||||
page
|
||||
.getTextContent({
|
||||
normalizeWhitespace: false,
|
||||
disableCombineTextItems: true,
|
||||
})
|
||||
.then((textContent: any) => {
|
||||
const items = textContent.items.map((rawItem: any) => new Item(index, rawItem));
|
||||
reporter.parsedPage(index);
|
||||
return [...accumulatedResults, { index, page, items }];
|
||||
}),
|
||||
);
|
||||
});
|
||||
});
|
||||
}, Promise.resolve([]));
|
||||
}
|
||||
|
||||
private triggerFontRetrieval(page: any): Promise<void> {
|
||||
return page.getOperatorList();
|
||||
}
|
||||
}
|
||||
|
||||
interface ParsedPage {
|
||||
index: number;
|
||||
page: any;
|
||||
pdfjsPage: any;
|
||||
items: Item[];
|
||||
}
|
||||
|
@ -27,7 +27,7 @@ export default class PdfPipeline {
|
||||
async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
|
||||
const parseResult = await this.parse(src, progressListener);
|
||||
this.verifyRequiredColumns(parseResult.schema, this.transformers);
|
||||
const context = { pageViewports: parseResult.pageViewports };
|
||||
const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports };
|
||||
let items = parseResult.items;
|
||||
this.transformers.forEach((transformer) => {
|
||||
items = transformer.transform(context, items).items;
|
||||
@ -38,7 +38,7 @@ export default class PdfPipeline {
|
||||
|
||||
async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<Debugger> {
|
||||
const parseResult = await this.parse(src, progressListener);
|
||||
const context = { pageViewports: parseResult.pageViewports };
|
||||
const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports };
|
||||
return new Debugger(parseResult.schema, parseResult.items, context, this.transformers);
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
import PageViewport from '../parse/PageViewport';
|
||||
|
||||
export default interface TransformContext {
|
||||
fontMap: Map<string, object>;
|
||||
pageViewports: PageViewport[];
|
||||
}
|
||||
|
@ -22,11 +22,11 @@ test('basic example PDF parse', async () => {
|
||||
expect(result.metadata.title()).toEqual('ExamplePdf');
|
||||
expect(result.metadata.author()).toEqual('Johannes Zillmann');
|
||||
expect(result.pageCount()).toBe(expectedPages);
|
||||
result.pdfPages.forEach((pdfPage, i) => {
|
||||
result.pdfjsPages.forEach((pdfPage, i) => {
|
||||
expect(pdfPage._pageIndex).toBe(i);
|
||||
});
|
||||
expect(result.pdfPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]);
|
||||
expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
|
||||
expect(result.pdfjsPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]);
|
||||
expect(result.pdfjsPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
|
||||
|
||||
// verify first n items
|
||||
expect(result.items.slice(0, 16).map((item) => item.withoutUuid())).toEqual([
|
||||
@ -161,7 +161,7 @@ test('basic example PDF parse', async () => {
|
||||
]);
|
||||
|
||||
// verify progress
|
||||
expect(progressUpdates.length).toBe(expectedPages + 2);
|
||||
expect(progressUpdates.length).toBe(expectedPages + 3);
|
||||
progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts']));
|
||||
expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]);
|
||||
expect(progressUpdates[0].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);
|
||||
@ -183,7 +183,6 @@ test('basic example PDF parse', async () => {
|
||||
expect(progressUpdates[7].stageDetails).toEqual([null, null, `6 / ${expectedPages}`, null]);
|
||||
expect(progressUpdates[8].stageProgress).toEqual([1, 1, 7 / expectedPages, 0]);
|
||||
expect(progressUpdates[8].stageDetails).toEqual([null, null, `7 / ${expectedPages}`, null]);
|
||||
|
||||
// expect(progressUpdates[9].stagePercents).toEqual([1, 1, 1, 0]);
|
||||
// expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]);
|
||||
expect(progressUpdates[9].stageProgress).toEqual([1, 1, 1, 1]);
|
||||
expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]);
|
||||
});
|
||||
|
Loading…
Reference in New Issue
Block a user