mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-11 23:39:14 +01:00
Fetch fontObjects
This commit is contained in:
parent
a5b65b5d85
commit
1b530c6c29
@ -3,7 +3,7 @@ import type ProgressListenFunction from './ProgressListenFunction';
|
|||||||
import Progress from './Progress';
|
import Progress from './Progress';
|
||||||
|
|
||||||
export default class ParseProgressReporter implements ParseReporter {
|
export default class ParseProgressReporter implements ParseReporter {
|
||||||
progress = new Progress(['Document Header', 'Metadata', 'Pages', 'Fonts'], [0.1, 0.1, 0.7, 0.1]);
|
progress = new Progress(['Document Header', 'Metadata', 'Pages', 'Fonts'], [0.01, 0.01, 0.97, 0.01]);
|
||||||
pagesToParse = 0;
|
pagesToParse = 0;
|
||||||
progressListenFunction: ProgressListenFunction;
|
progressListenFunction: ProgressListenFunction;
|
||||||
|
|
||||||
@ -32,5 +32,6 @@ export default class ParseProgressReporter implements ParseReporter {
|
|||||||
|
|
||||||
parsedFonts(): void {
|
parsedFonts(): void {
|
||||||
this.progress.stageProgress[3] = 1;
|
this.progress.stageProgress[3] = 1;
|
||||||
|
this.progressListenFunction(this.progress);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,14 +3,23 @@ import type Metadata from './Metadata';
|
|||||||
import type PageViewport from './parse/PageViewport';
|
import type PageViewport from './parse/PageViewport';
|
||||||
|
|
||||||
export default class ParseResult {
|
export default class ParseResult {
|
||||||
pdfPages: any[];
|
fontMap: Map<string, object>;
|
||||||
|
pdfjsPages: any[];
|
||||||
pageViewports: PageViewport[];
|
pageViewports: PageViewport[];
|
||||||
metadata: Metadata;
|
metadata: Metadata;
|
||||||
schema: string[];
|
schema: string[];
|
||||||
items: Item[];
|
items: Item[];
|
||||||
|
|
||||||
constructor(pdfPages: any[], pageViewports: PageViewport[], metadata: Metadata, schema: string[], items: Item[]) {
|
constructor(
|
||||||
this.pdfPages = pdfPages;
|
fontMap: Map<string, object>,
|
||||||
|
pdfjsPages: any[],
|
||||||
|
pageViewports: PageViewport[],
|
||||||
|
metadata: Metadata,
|
||||||
|
schema: string[],
|
||||||
|
items: Item[],
|
||||||
|
) {
|
||||||
|
this.fontMap = fontMap;
|
||||||
|
this.pdfjsPages = pdfjsPages;
|
||||||
this.pageViewports = pageViewports;
|
this.pageViewports = pageViewports;
|
||||||
this.metadata = metadata;
|
this.metadata = metadata;
|
||||||
this.schema = schema;
|
this.schema = schema;
|
||||||
@ -18,6 +27,6 @@ export default class ParseResult {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pageCount(): number {
|
pageCount(): number {
|
||||||
return this.pdfPages.length;
|
return this.pdfjsPages.length;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -20,30 +20,80 @@ export default class PdfParser {
|
|||||||
const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) };
|
const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) };
|
||||||
return this.pdfjs
|
return this.pdfjs
|
||||||
.getDocument(documentInitParameters)
|
.getDocument(documentInitParameters)
|
||||||
.promise.then((pdfDocument: any) => {
|
.promise.then((pdfjsDocument: any) => {
|
||||||
reporter.parsedDocumentHeader(pdfDocument.numPages);
|
reporter.parsedDocumentHeader(pdfjsDocument.numPages);
|
||||||
return Promise.all([
|
return Promise.all([
|
||||||
pdfDocument.getMetadata().then((metadata: any) => {
|
pdfjsDocument.getMetadata().then((pdfjsMetadata: any) => {
|
||||||
reporter.parsedMetadata();
|
reporter.parsedMetadata();
|
||||||
return metadata;
|
return new Metadata(pdfjsMetadata);
|
||||||
}),
|
}),
|
||||||
this.extractPagesSequentially(pdfDocument, reporter),
|
this.extractPagesSequentially(pdfjsDocument, reporter),
|
||||||
]);
|
]);
|
||||||
})
|
})
|
||||||
.then(([metadata, pages]) => {
|
.then(([metadata, pages]: [Metadata, ParsedPage[]]) => {
|
||||||
const pdfPages = pages.map((page: any) => page.page);
|
return Promise.all([metadata, pages, this.gatherFontObjects(pages).finally(() => reporter.parsedFonts())]);
|
||||||
|
})
|
||||||
|
.then(([metadata, pages, fontMap]: [Metadata, ParsedPage[], Map<string, object>]) => {
|
||||||
|
const pdfjsPages = pages.map((page: any) => page.pdfjsPage);
|
||||||
const items = pages.reduce((allItems: any[], page: any) => allItems.concat(page.items), []);
|
const items = pages.reduce((allItems: any[], page: any) => allItems.concat(page.items), []);
|
||||||
const pageViewports = pdfPages.map((page: any) => {
|
const pageViewports = pdfjsPages.map((page: any) => {
|
||||||
const viewPort = page.getViewport({ scale: 1.0 });
|
const viewPort = page.getViewport({ scale: 1.0 });
|
||||||
return {
|
return {
|
||||||
transformFunction: (itemTransform: number[]) =>
|
transformFunction: (itemTransform: number[]) =>
|
||||||
this.pdfjs.Util.transform(viewPort.transform, itemTransform),
|
this.pdfjs.Util.transform(viewPort.transform, itemTransform),
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
return new ParseResult(pdfPages, pageViewports, new Metadata(metadata), this.schema, items);
|
return new ParseResult(fontMap, pdfjsPages, pageViewports, metadata, this.schema, items);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private extractPagesSequentially(pdfjsDocument: any, reporter: ParseReporter): Promise<ParsedPage[]> {
|
||||||
|
return [...Array(pdfjsDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
||||||
|
return accumulatorPromise.then((accumulatedResults: ParsedPage[]) => {
|
||||||
|
return pdfjsDocument.getPage(index + 1).then((pdfjsPage: any) => {
|
||||||
|
return pdfjsPage
|
||||||
|
.getTextContent({
|
||||||
|
normalizeWhitespace: false,
|
||||||
|
disableCombineTextItems: true,
|
||||||
|
})
|
||||||
|
.then((textContent: any) => {
|
||||||
|
const items = textContent.items.map((pdfjsItem: any) => new Item(index, pdfjsItem));
|
||||||
|
reporter.parsedPage(index);
|
||||||
|
return [...accumulatedResults, { index, pdfjsPage, items }];
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}, Promise.resolve([]));
|
||||||
|
}
|
||||||
|
|
||||||
|
private gatherFontObjects(pages: ParsedPage[]): Promise<Map<string, object>> {
|
||||||
|
let result = Promise.resolve(new Map<string, object>());
|
||||||
|
const uniqueFontIds = new Set<string>();
|
||||||
|
pages.forEach((page) => {
|
||||||
|
const unknownPageFonts: string[] = [];
|
||||||
|
page.items.forEach((item) => {
|
||||||
|
const fontId = item.data['fontName'];
|
||||||
|
if (!uniqueFontIds.has(fontId) && fontId.startsWith('g_d')) {
|
||||||
|
uniqueFontIds.add(fontId);
|
||||||
|
unknownPageFonts.push(fontId);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (unknownPageFonts.length > 0) {
|
||||||
|
// console.log(`Fetch fonts ${unknownPageFonts} for page ${page.index}`);
|
||||||
|
result = result.then((fontMap) => {
|
||||||
|
return page.pdfjsPage.getOperatorList().then(() => {
|
||||||
|
unknownPageFonts.forEach((fontId) => {
|
||||||
|
const fontObject = page.pdfjsPage.commonObjs.get(fontId);
|
||||||
|
fontMap.set(fontId, fontObject);
|
||||||
|
});
|
||||||
|
return fontMap;
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
private documentInitParameters(src: string | Uint8Array | object): object {
|
private documentInitParameters(src: string | Uint8Array | object): object {
|
||||||
if (typeof src === 'string') {
|
if (typeof src === 'string') {
|
||||||
return { url: src };
|
return { url: src };
|
||||||
@ -60,35 +110,10 @@ export default class PdfParser {
|
|||||||
private isArrayBuffer(object: any) {
|
private isArrayBuffer(object: any) {
|
||||||
return typeof object === 'object' && object !== null && object.byteLength !== undefined;
|
return typeof object === 'object' && object !== null && object.byteLength !== undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> {
|
|
||||||
return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
|
||||||
return accumulatorPromise.then((accumulatedResults) => {
|
|
||||||
return pdfDocument.getPage(index + 1).then((page: any) => {
|
|
||||||
return this.triggerFontRetrieval(page).then(() =>
|
|
||||||
page
|
|
||||||
.getTextContent({
|
|
||||||
normalizeWhitespace: false,
|
|
||||||
disableCombineTextItems: true,
|
|
||||||
})
|
|
||||||
.then((textContent: any) => {
|
|
||||||
const items = textContent.items.map((rawItem: any) => new Item(index, rawItem));
|
|
||||||
reporter.parsedPage(index);
|
|
||||||
return [...accumulatedResults, { index, page, items }];
|
|
||||||
}),
|
|
||||||
);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}, Promise.resolve([]));
|
|
||||||
}
|
|
||||||
|
|
||||||
private triggerFontRetrieval(page: any): Promise<void> {
|
|
||||||
return page.getOperatorList();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
interface ParsedPage {
|
interface ParsedPage {
|
||||||
index: number;
|
index: number;
|
||||||
page: any;
|
pdfjsPage: any;
|
||||||
items: Item[];
|
items: Item[];
|
||||||
}
|
}
|
||||||
|
@ -27,7 +27,7 @@ export default class PdfPipeline {
|
|||||||
async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
|
async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
|
||||||
const parseResult = await this.parse(src, progressListener);
|
const parseResult = await this.parse(src, progressListener);
|
||||||
this.verifyRequiredColumns(parseResult.schema, this.transformers);
|
this.verifyRequiredColumns(parseResult.schema, this.transformers);
|
||||||
const context = { pageViewports: parseResult.pageViewports };
|
const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports };
|
||||||
let items = parseResult.items;
|
let items = parseResult.items;
|
||||||
this.transformers.forEach((transformer) => {
|
this.transformers.forEach((transformer) => {
|
||||||
items = transformer.transform(context, items).items;
|
items = transformer.transform(context, items).items;
|
||||||
@ -38,7 +38,7 @@ export default class PdfPipeline {
|
|||||||
|
|
||||||
async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<Debugger> {
|
async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<Debugger> {
|
||||||
const parseResult = await this.parse(src, progressListener);
|
const parseResult = await this.parse(src, progressListener);
|
||||||
const context = { pageViewports: parseResult.pageViewports };
|
const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports };
|
||||||
return new Debugger(parseResult.schema, parseResult.items, context, this.transformers);
|
return new Debugger(parseResult.schema, parseResult.items, context, this.transformers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import PageViewport from '../parse/PageViewport';
|
import PageViewport from '../parse/PageViewport';
|
||||||
|
|
||||||
export default interface TransformContext {
|
export default interface TransformContext {
|
||||||
|
fontMap: Map<string, object>;
|
||||||
pageViewports: PageViewport[];
|
pageViewports: PageViewport[];
|
||||||
}
|
}
|
||||||
|
@ -22,11 +22,11 @@ test('basic example PDF parse', async () => {
|
|||||||
expect(result.metadata.title()).toEqual('ExamplePdf');
|
expect(result.metadata.title()).toEqual('ExamplePdf');
|
||||||
expect(result.metadata.author()).toEqual('Johannes Zillmann');
|
expect(result.metadata.author()).toEqual('Johannes Zillmann');
|
||||||
expect(result.pageCount()).toBe(expectedPages);
|
expect(result.pageCount()).toBe(expectedPages);
|
||||||
result.pdfPages.forEach((pdfPage, i) => {
|
result.pdfjsPages.forEach((pdfPage, i) => {
|
||||||
expect(pdfPage._pageIndex).toBe(i);
|
expect(pdfPage._pageIndex).toBe(i);
|
||||||
});
|
});
|
||||||
expect(result.pdfPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]);
|
expect(result.pdfjsPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]);
|
||||||
expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
|
expect(result.pdfjsPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
|
||||||
|
|
||||||
// verify first n items
|
// verify first n items
|
||||||
expect(result.items.slice(0, 16).map((item) => item.withoutUuid())).toEqual([
|
expect(result.items.slice(0, 16).map((item) => item.withoutUuid())).toEqual([
|
||||||
@ -161,7 +161,7 @@ test('basic example PDF parse', async () => {
|
|||||||
]);
|
]);
|
||||||
|
|
||||||
// verify progress
|
// verify progress
|
||||||
expect(progressUpdates.length).toBe(expectedPages + 2);
|
expect(progressUpdates.length).toBe(expectedPages + 3);
|
||||||
progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts']));
|
progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts']));
|
||||||
expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]);
|
expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]);
|
||||||
expect(progressUpdates[0].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);
|
expect(progressUpdates[0].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);
|
||||||
@ -183,7 +183,6 @@ test('basic example PDF parse', async () => {
|
|||||||
expect(progressUpdates[7].stageDetails).toEqual([null, null, `6 / ${expectedPages}`, null]);
|
expect(progressUpdates[7].stageDetails).toEqual([null, null, `6 / ${expectedPages}`, null]);
|
||||||
expect(progressUpdates[8].stageProgress).toEqual([1, 1, 7 / expectedPages, 0]);
|
expect(progressUpdates[8].stageProgress).toEqual([1, 1, 7 / expectedPages, 0]);
|
||||||
expect(progressUpdates[8].stageDetails).toEqual([null, null, `7 / ${expectedPages}`, null]);
|
expect(progressUpdates[8].stageDetails).toEqual([null, null, `7 / ${expectedPages}`, null]);
|
||||||
|
expect(progressUpdates[9].stageProgress).toEqual([1, 1, 1, 1]);
|
||||||
// expect(progressUpdates[9].stagePercents).toEqual([1, 1, 1, 0]);
|
expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]);
|
||||||
// expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]);
|
|
||||||
});
|
});
|
||||||
|
Loading…
Reference in New Issue
Block a user