diff --git a/core/src/ParseProgressReporter.ts b/core/src/ParseProgressReporter.ts
new file mode 100644
index 0000000..9770f86
--- /dev/null
+++ b/core/src/ParseProgressReporter.ts
@@ -0,0 +1,51 @@
+import type ParseReporter from './ParseReporter';
+import type ProgressListenFunction from './ProgressListenFunction';
+import Progress from './Progress';
+
+/**
+ * ParseReporter that tracks parsing in a four-stage Progress ('Document Header', 'Metadata', 'Pages', 'Fonts')
+ * and hands that Progress object to the listener after every reported event.
+ */
+export default class ParseProgressReporter implements ParseReporter {
+  progress = new Progress(['Document Header', 'Metadata', 'Pages', 'Fonts'], [0.1, 0.1, 0.7, 0.1]);
+  pagesToParse = 0;
+  progressListenFunction: ProgressListenFunction;
+
+  constructor(progressListenFunction: ProgressListenFunction) {
+    this.progressListenFunction = progressListenFunction;
+  }
+
+  /** Marks the header stage complete and stores the page count used by parsedPage(). */
+  parsedDocumentHeader(numberOfPages: number): void {
+    this.pagesToParse = numberOfPages;
+    this.progress.stageProgress[0] = 1;
+    this.progress.stageDetails[2] = `0 / ${numberOfPages}`;
+    this.progressListenFunction(this.progress);
+  }
+
+  /** Marks the metadata stage complete. */
+  parsedMetadata(): void {
+    this.progress.stageProgress[1] = 1;
+    this.progressListenFunction(this.progress);
+  }
+
+  /**
+   * Advances the pages stage.
+   *
+   * @param index zero-based index of the page that has just been parsed
+   */
+  parsedPage(index: number): void {
+    const pagesParsed = index + 1;
+    // Clamp to 1: with an unset (zero) page count the plain ratio would be Infinity.
+    this.progress.stageProgress[2] = Math.min(1, pagesParsed / this.pagesToParse);
+    this.progress.stageDetails[2] = `${pagesParsed} / ${this.pagesToParse}`;
+    this.progressListenFunction(this.progress);
+  }
+
+  /** Marks the fonts stage complete. */
+  parsedFonts(): void {
+    this.progress.stageProgress[3] = 1;
+    // Notify the listener like the other stages do, so the final state is observable.
+    this.progressListenFunction(this.progress);
+  }
+}
diff --git a/core/src/ParseReporter.ts b/core/src/ParseReporter.ts
new file mode 100644
index 0000000..df12458
--- /dev/null
+++ b/core/src/ParseReporter.ts
@@ -0,0 +1,9 @@
+/**
+ * Progress listener for PdfParser.
+ */
+export default interface ParseReporter {
+  parsedDocumentHeader(numberOfPages: number): void;
+  parsedMetadata(): void;
+  parsedPage(index: number): void;
+  parsedFonts(): void;
+}
diff --git a/core/src/PdfParser.ts b/core/src/PdfParser.ts
index ce8c22b..21f6a79 100644
--- a/core/src/PdfParser.ts
+++ b/core/src/PdfParser.ts
@@ -1,21 +1,27 @@
 import Metadata from './Metadata';
 import ParsedPage from './ParsedPage';
+import type ParseReporter from './ParseReporter';
 import ParseResult from './ParseResult';
 import TextDirection from './TextDirection';
 import type TextItem from './TextItem';
 
+/**
+ * Parses a PDF via PDFJS and returns a ParseResult which contains more or less the original data from PDFJS.
+ */
 export default class PdfParser {
   pdfjs: any;
   constructor(pdfjs: any) {
     this.pdfjs = pdfjs;
   }
 
-  async parseBytes(data: Uint8Array): Promise<ParseResult> {
-    return this.parse(this.params({ data }));
+  /** Parses a PDF from its raw bytes; `reporter` is notified as parsing advances. */
+  async parseBytes(data: Uint8Array, reporter: ParseReporter): Promise<ParseResult> {
+    return this.parse(this.params({ data }), reporter);
   }
 
-  async parseUrl(url: string): Promise<ParseResult> {
-    return this.parse(this.params({ url }));
+  /** Parses the PDF found at `url`; `reporter` is notified as parsing advances. */
+  async parseUrl(url: string, reporter: ParseReporter): Promise<ParseResult> {
+    return this.parse(this.params({ url }), reporter);
   }
 
   private params(dataSourceParams: object): object {
@@ -26,27 +32,41 @@ export default class PdfParser {
     return { ...defaultParams, ...dataSourceParams };
   }
 
-  async parse(parameter: object): Promise<ParseResult> {
+  /**
+   * Opens the document with the given PDFJS parameters, then fetches its metadata and extracts its pages.
+   * The reporter is told about the document header first; metadata and pages are reported as they complete.
+   */
+  async parse(parameter: object, reporter: ParseReporter): Promise<ParseResult> {
     return this.pdfjs
       .getDocument(parameter)
       .promise.then((pdfDocument) => {
-        return Promise.all([pdfDocument.getMetadata(), this.extractPagesSequentially(pdfDocument)]);
+        reporter.parsedDocumentHeader(pdfDocument.numPages);
+        return Promise.all([
+          pdfDocument.getMetadata().then((metadata) => {
+            reporter.parsedMetadata();
+            return metadata;
+          }),
+          this.extractPagesSequentially(pdfDocument, reporter),
+        ]);
       })
       .then(([metadata, pages]) => new ParseResult(new Metadata(metadata),
pages));
   }
 
-  private extractPagesSequentially(pdfDocument: any): Promise<ParsedPage[]> {
+  private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage[]> {
     return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
       return accumulatorPromise.then((accumulatedResults) => {
         return pdfDocument.getPage(index + 1).then((page) => {
           const viewport = page.getViewport({ scale: 1.0 });
           return this.triggerFontRetrieval(page).then(() =>
             page
-              .getTextContent()
-              .then((textContent) => [
-                ...accumulatedResults,
-                new ParsedPage(index, viewport.transform, textContent.items),
-              ]),
+              .getTextContent({
+                normalizeWhitespace: false,
+                disableCombineTextItems: true,
+              })
+              .then((textContent) => {
+                reporter.parsedPage(index);
+                return [...accumulatedResults, new ParsedPage(index, viewport.transform, textContent.items)];
+              }),
           );
         });
       });
diff --git a/core/src/Progress.ts b/core/src/Progress.ts
new file mode 100644
index 0000000..53515b3
--- /dev/null
+++ b/core/src/Progress.ts
@@ -0,0 +1,69 @@
+/**
+ * Multi-stage progress. Progress is expressed in a number between 0 and 1.
+ *
+ * Every stage has a name, an optional detail text, a progress value of its own and a weight that defines its share
+ * of the total progress.
+ */
+export default class Progress {
+  stages: string[];
+  stageDetails: string[];
+  stageProgress: number[];
+  stageWeights: number[];
+
+  /**
+   * @param stages names of the stages, in processing order
+   * @param weights share of each stage in the total progress, one entry per stage, summing up to 1;
+   *   when omitted or empty all stages are weighted equally
+   * @throws Error if the number of weights differs from the number of stages or the weights do not sum up to 1
+   */
+  constructor(stages: string[], weights: number[] = []) {
+    this.stages = stages;
+    this.stageDetails = new Array(stages.length);
+    this.stageProgress = new Array(stages.length).fill(0);
+    if (weights.length === 0) {
+      this.stageWeights = new Array(stages.length).fill(1 / stages.length);
+    } else {
+      if (weights.length !== stages.length) {
+        // "only" fits the too-few case alone; with surplus weights it would be misleading.
+        const only = weights.length < stages.length ? 'only ' : '';
+        throw new Error(
+          `Provided ${only}${weights.length} weights but expected ${stages.length} for ${stages.length} stages`,
+        );
+      }
+      // Rounding after each addition keeps floating point noise (e.g. 0.1 + 0.2) out of the comparison with 1.
+      const weightsSummed = weights.reduce((sum, weight) => +(sum + weight).toFixed(12), 0);
+      if (weightsSummed !== 1)
+        throw new Error(`Weights [${weights.join(', ')}] should sum up to 1, but did to ${weightsSummed}`);
+      // Store a copy so that later changes to the caller's array cannot alter the weights.
+      this.stageWeights = [...weights];
+    }
+  }
+
+  /** Tells whether the progress of the given stage is exactly 1. */
+  isComplete(stageIndex: number): boolean {
+    return this.stageProgress[stageIndex] === 1;
+  }
+
+  /**
+   * Tells whether the given stage is the active one: it is below 1 and it is either the first stage or its
+   * predecessor is complete.
+   */
+  isProgressing(stageIndex: number): boolean {
+    const previousComplete = stageIndex === 0 || this.isComplete(stageIndex - 1);
+    return previousComplete && this.stageProgress[stageIndex] < 1;
+  }
+
+  /** Computes the overall progress as the sum of all stage progresses, each multiplied by its weight. */
+  totalProgress(): number {
+    const stageCount = this.stages.length;
+    // No stages: the division below would produce NaN.
+    if (stageCount === 0) return 0;
+    // The terms are scaled by the stage count and the sum is scaled back again; the arithmetic is left as it was
+    // so that results stay bit-identical.
+    const stageProgressSummed = this.stageProgress.reduce(
+      (sum, stageProgress, index) => sum + stageProgress * this.stageWeights[index] * stageCount,
+      0,
+    );
+    return stageProgressSummed / stageCount;
+  }
+}
diff --git a/core/src/ProgressListenFunction.ts b/core/src/ProgressListenFunction.ts
new file mode 100644
index 0000000..a5bdccf
--- /dev/null
+++ b/core/src/ProgressListenFunction.ts
@@ -0,0 +1,6 @@
+import type Progress from './Progress';
+
+/** Callback that is handed a Progress object. */
+type ProgressListenFunction = (progressUpdate: Progress) => void;
+
+export default ProgressListenFunction;
diff --git a/core/src/index.ts b/core/src/index.ts
index c4e74e1..a05e65e 100644
--- a/core/src/index.ts
+++ b/core/src/index.ts
@@ -1,6 +1,11 @@
-import ParseResult from './ParseResult';
+import type ProgressListenFunction
from './ProgressListenFunction';
+import ParseProgressReporter from './ParseProgressReporter';
 import PdfParser from './PdfParser';
 
 export function pdfParser(pdfJs: any) {
   return new PdfParser(pdfJs);
 }
+
+export function parseReporter(progressListener: ProgressListenFunction) {
+  return new ParseProgressReporter(progressListener);
+}
diff --git a/core/test/PdfParser.test.ts b/core/test/PdfParser.test.ts
index 4804d44..6ed7ce0 100644
--- a/core/test/PdfParser.test.ts
+++ b/core/test/PdfParser.test.ts
@@ -1,15 +1,22 @@
 import PdfParser from 'src/PdfParser';
 import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
 import * as fs from 'fs';
+import ParseProgressReporter from 'src/ParseProgressReporter';
+import Progress from 'src/Progress';
 
 const parser = new PdfParser(pdfjs);
 
-test('testIt', async () => {
+test('basic example PDF parse', async () => {
+  const progressUpdates: Progress[] = [];
   const data = fs.readFileSync('../examples/ExamplePdf.pdf', null);
-  const result = await parser.parseBytes(data);
+  const result = await parser.parseBytes(
+    data,
+    new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)),
+  );
+  const expectedPages = 7;
   expect(result.metadata.title()).toEqual('ExamplePdf');
   expect(result.metadata.author()).toEqual('Johannes Zillmann');
-  expect(result.pages.length).toBe(7);
+  expect(result.pages.length).toBe(expectedPages);
   expect(result.pages[0].index).toBe(0);
   expect(result.pages[0].viewPortTransform).toEqual([1, 0, 0, -1, 0, 841.8898]);
   expect(result.pages[0].items).toEqual([
@@ -118,4 +125,22 @@ test('testIt', async () => {
       fontName: 'g_d0_f2',
     },
   ]);
+
+  expect(progressUpdates.length).toBe(expectedPages + 2);
+  progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts']));
+  expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]);
+  expect(progressUpdates[0].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);
+
+  expect(progressUpdates[1].stageProgress).toEqual([1, 1, 0, 0]);
+  expect(progressUpdates[1].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);
+
+  // one update per page, following the header and metadata updates
+  for (let pagesParsed = 1; pagesParsed <= expectedPages; pagesParsed++) {
+    const update = progressUpdates[pagesParsed + 1];
+    expect(update.stageProgress).toEqual([1, 1, pagesParsed / expectedPages, 0]);
+    expect(update.stageDetails).toEqual([null, null, `${pagesParsed} / ${expectedPages}`, null]);
+  }
+
+  // expect(progressUpdates[9].stagePercents).toEqual([1, 1, 1, 0]);
+  // expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]);
 });
diff --git a/core/test/Progress.test.ts b/core/test/Progress.test.ts
new file mode 100644
index 0000000..4f1b4fb
--- /dev/null
+++ b/core/test/Progress.test.ts
@@ -0,0 +1,106 @@
+import Progress from 'src/Progress';
+
+test('basic progress', () => {
+  const progress = new Progress(['Stage0', 'Stage1', 'Stage2']);
+
+  // nothing yet
+  expectTotalProgress(progress, 0);
+  expectStageInProgress(progress, 0);
+
+  // stage 0 progress
+  progress.stageProgress[0] = 0.3;
+  expectTotalProgress(progress, 10);
+  expectStageInProgress(progress, 0);
+
+  // stage 0 completed
+  progress.stageProgress[0] = 1;
+  expectTotalProgress(progress, 33);
+  expectStageInProgress(progress, 1);
+
+  // stage 1 progress
+  progress.stageProgress[1] = 0.3;
+  expectTotalProgress(progress, 43);
+  expectStageInProgress(progress, 1);
+
+  // stage 1 completed
+  progress.stageProgress[1] = 1;
+  expectTotalProgress(progress, 67);
+  expectStageInProgress(progress, 2);
+
+  // stage 2 completed
+  progress.stageProgress[2] = 1;
+  expectTotalProgress(progress, 100);
+  expectStageInProgress(progress, 3);
+});
+
+test('number of stage weights must match the number of stages', () => {
+  // an Error instance makes toThrow compare the whole message, not just a substring
+  expect(() => new Progress(['Stage0', 'Stage1', 'Stage2'], [0.5, 0.5])).toThrow(
+    new Error('Provided only 2 weights but expected 3 for 3 stages'),
+  );
+});
+
+test('stage weights must sum up', () => {
+  expect(() => new Progress(['Stage0', 'Stage1', 'Stage2'], [0.5, 0.5, 0.5])).toThrow(
+    new Error('Weights [0.5, 0.5, 0.5] should sum up to 1, but did to 1.5'),
+  );
+});
+
+test('weighted progress', () => {
+  const progress = new Progress(['Stage0', 'Stage1', 'Stage2'], [0, 0.7, 0.3]);
+
+  // nothing yet
+  expectTotalProgress(progress, 0);
+
+  // stage 0 progress
+  progress.stageProgress[0] = 0.9;
+  expectTotalProgress(progress, 0);
+
+  // stage 0 completed
+  progress.stageProgress[0] = 1;
+  expectTotalProgress(progress, 0);
+
+  // stage 1 progress
+  progress.stageProgress[1] = 0.3;
+  expectTotalProgress(progress, 21);
+
+  // stage 1 more progress
+  progress.stageProgress[1] = 0.6;
+  expectTotalProgress(progress, 42);
+
+  // stage 1 completed
+  progress.stageProgress[1] = 1;
+  expectTotalProgress(progress, 70);
+
+  // stage 2 progress
+  progress.stageProgress[2] = 0.3;
+  expectTotalProgress(progress, 79);
+
+  // stage 2 completed
+  progress.stageProgress[2] = 1;
+  expectTotalProgress(progress, 100);
+});
+
+/** Asserts the total progress, rounded to whole percent. */
+function expectTotalProgress(progress: Progress, expected: number) {
+  expect(Math.round(progress.totalProgress() * 100)).toBe(expected);
+}
+
+/**
+ * Asserts that the stage at stageIndex is the one in progress, that all earlier stages are complete
+ * and that all later stages are neither progressing nor complete.
+ */
+function expectStageInProgress(progress: Progress, stageIndex: number) {
+  for (let index = 0; index < progress.stageProgress.length; index++) {
+    if (index < stageIndex) {
+      expect(progress.isProgressing(index)).toBe(false);
+      expect(progress.isComplete(index)).toBe(true);
+    } else if (index === stageIndex) {
+      expect(progress.isProgressing(index)).toBe(true);
+      expect(progress.isComplete(index)).toBe(false);
+    } else {
+      expect(progress.isProgressing(index)).toBe(false);
+      expect(progress.isComplete(index)).toBe(false);
+    }
+  }
+}
diff --git a/ui/snowpack.config.js b/ui/snowpack.config.js
index a042b8e..62ed5e2 100644
--- a/ui/snowpack.config.js
+++ b/ui/snowpack.config.js
@@ -31,5 +31,6 @@ module.exports = {
   },
   alias: {
     '@core': '../core/src/index.js',
+    '@core/*': '../core/src/*',
   },
 };
diff --git a/ui/src/ProgressRing.svelte b/ui/src/ProgressRing.svelte
new file mode 100644
index 0000000..0d92291
--- /dev/null
+++ b/ui/src/ProgressRing.svelte
@@ -0,0 +1,35 @@
+ + + + + + {Math.round($progressTweened)}% + +
diff --git a/ui/src/Upload.svelte b/ui/src/Upload.svelte
index ec8a39c..833b6ce 100644
--- a/ui/src/Upload.svelte
+++ b/ui/src/Upload.svelte
@@ -1,31 +1,44 @@
@@ -33,11 +46,11 @@
Load Example
-
Debug
+
Debug
-
+
(dragover = true)} @@ -49,7 +62,7 @@ -
+
Drop your PDF file here...
Or click the box to select one...
Note: Your data stays locally in your browser.
@@ -64,22 +77,55 @@
-
- {#await upload} -
Parsing {specifiedFileName}...
- {:catch error} -
Failed to parse '{specifiedFileName}': {error.message}
- {/await} - {#if rejectionError} -
{rejectionError}
- {/if} + +
+
+ {#if specifiedFileName} +
Parsing {specifiedFileName} ...
+ {/if} + {#if parseProgress} +
+ +
+ {#each parseProgress.stages as stage, index} + {#if parseProgress.isProgressing(index)} +
+
+ Parsing + {stage} + {parseProgress.stageDetails[index] ? parseProgress.stageDetails[index] : ''} +
+
+ {:else if parseProgress.isComplete(index)} +
+
+ Parsing + {stage} + {parseProgress.stageDetails[index] ? parseProgress.stageDetails[index] : ''} +
+ +
+ {/if} + {/each} +
+
+ {/if} + {#if rejectionError} +
{rejectionError}
+ {/if} + {#await upload} + + {:catch error} +
Failed to parse '{specifiedFileName}': {error?.message}
+ {/await} +
diff --git a/ui/src/store.ts b/ui/src/store.ts index 8ae19a1..1a523be 100644 --- a/ui/src/store.ts +++ b/ui/src/store.ts @@ -1,4 +1,5 @@ -import { pdfParser } from '@core'; +import { pdfParser, parseReporter } from '@core'; +import type ProgressListenFunction from '@core/ProgressListenFunction'; import type ParseResult from '@core/ParseResult'; import * as pdfjs from 'pdfjs-dist/es5/build/pdf'; @@ -10,11 +11,11 @@ pdfjs.GlobalWorkerOptions.workerSrc = 'worker/pdf.worker.min.js'; const parser = pdfParser(pdfjs); -export async function loadExample(): Promise { - return parsePdf(parser.parseUrl('/ExamplePdf.pdf')); +export async function loadExample(progressListener: ProgressListenFunction): Promise { + return parsePdf(parser.parseUrl('/ExamplePdf.pdf', parseReporter(progressListener))); } -export async function processUpload(file: File): Promise { +export async function processUpload(file: File, progressListener: ProgressListenFunction): Promise { return new Promise((resolve, reject) => { const reader = new FileReader(); reader.onerror = reject; @@ -24,7 +25,7 @@ export async function processUpload(file: File): Promise { reader.readAsArrayBuffer(file); }).then((buffer) => { const data = new Uint8Array(buffer as ArrayBuffer); - return parsePdf(parser.parseBytes(data)); + return parsePdf(parser.parseBytes(data, parseReporter(progressListener))); }); }