Progress Infrastructure

This commit is contained in:
Johannes Zillmann 2021-01-12 22:54:22 +01:00
parent 21ecfd35bd
commit ee7d686ba6
12 changed files with 378 additions and 41 deletions

View File

@ -0,0 +1,36 @@
import type ParseReporter from './ParseReporter';
import type ProgressListenFunction from './ProgressListenFunction';
import Progress from './Progress';
/**
 * ParseReporter implementation which folds the parser callbacks into a single
 * four-stage Progress ('Document Header', 'Metadata', 'Pages', 'Fonts', weighted
 * 0.1 / 0.1 / 0.7 / 0.1) and hands it to the supplied listener after every change.
 *
 * The very same Progress instance is mutated and passed on each notification,
 * so a listener that wants to keep a history has to copy what it receives.
 */
export default class ParseProgressReporter implements ParseReporter {
  progress = new Progress(['Document Header', 'Metadata', 'Pages', 'Fonts'], [0.1, 0.1, 0.7, 0.1]);
  pagesToParse = 0;
  progressListenFunction: ProgressListenFunction;

  /**
   * @param progressListenFunction callback invoked with the current progress after each update
   */
  constructor(progressListenFunction: ProgressListenFunction) {
    this.progressListenFunction = progressListenFunction;
  }

  /**
   * Marks the 'Document Header' stage as complete, remembers the page count and
   * seeds the detail text of the 'Pages' stage with `0 / <numberOfPages>`.
   *
   * @param numberOfPages total number of pages the document contains
   */
  parsedDocumentHeader(numberOfPages: number): void {
    this.pagesToParse = numberOfPages;
    this.progress.stageProgress[0] = 1;
    this.progress.stageDetails[2] = `0 / ${numberOfPages}`;
    this.progressListenFunction(this.progress);
  }

  /** Marks the 'Metadata' stage as complete. */
  parsedMetadata(): void {
    this.progress.stageProgress[1] = 1;
    this.progressListenFunction(this.progress);
  }

  /**
   * Updates the 'Pages' stage to `(index + 1) / pagesToParse`.
   *
   * Relies on parsedDocumentHeader() having stored a non-zero page count before;
   * with the initial count of 0 the division does not produce a usable fraction.
   *
   * @param index zero-based index of the page which has just been parsed
   */
  parsedPage(index: number): void {
    const pagesParsed = index + 1;
    this.progress.stageProgress[2] = pagesParsed / this.pagesToParse;
    this.progress.stageDetails[2] = `${pagesParsed} / ${this.pagesToParse}`;
    this.progressListenFunction(this.progress);
  }

  /** Marks the 'Fonts' stage as complete. */
  parsedFonts(): void {
    this.progress.stageProgress[3] = 1;
    // Notify like every other callback does, otherwise the final state never reaches the listener.
    this.progressListenFunction(this.progress);
  }
}

View File

@ -0,0 +1,9 @@
/**
 * Progress listener for PdfParser.
 *
 * An implementation is handed to the parser, which calls back into it while it
 * works through a document.
 */
export default interface ParseReporter {
/**
 * Called with the total page count once the document itself has been loaded.
 */
parsedDocumentHeader(numberOfPages: number): void;
/**
 * Called after the document metadata has been retrieved.
 */
parsedMetadata(): void;
/**
 * Called after the text content of one page has been retrieved.
 * `index` is the zero-based page index.
 */
parsedPage(index: number): void;
/**
 * NOTE(review): presumably signals that font parsing has finished; no call
 * site is visible alongside this interface, confirm against the parser.
 */
parsedFonts(): void;
}

View File

@ -1,21 +1,25 @@
import Metadata from './Metadata';
import ParsedPage from './ParsedPage';
import type ParseReporter from './ParseReporter';
import ParseResult from './ParseResult';
import TextDirection from './TextDirection';
import type TextItem from './TextItem';
/**
* Parses a PDF via PDFJS and returns a ParseResult which contains more or less the original data from PDFJS.
*/
export default class PdfParser {
pdfjs: any;
constructor(pdfjs: any) {
this.pdfjs = pdfjs;
}
async parseBytes(data: Uint8Array): Promise<ParseResult> {
return this.parse(this.params({ data }));
async parseBytes(data: Uint8Array, reporter: ParseReporter): Promise<ParseResult> {
return this.parse(this.params({ data }), reporter);
}
async parseUrl(url: string): Promise<ParseResult> {
return this.parse(this.params({ url }));
async parseUrl(url: string, reporter: ParseReporter): Promise<ParseResult> {
return this.parse(this.params({ url }), reporter);
}
private params(dataSourceParams: object): object {
@ -26,27 +30,37 @@ export default class PdfParser {
return { ...defaultParams, ...dataSourceParams };
}
async parse(parameter: object): Promise<ParseResult> {
async parse(parameter: object, reporter: ParseReporter): Promise<ParseResult> {
return this.pdfjs
.getDocument(parameter)
.promise.then((pdfDocument) => {
return Promise.all([pdfDocument.getMetadata(), this.extractPagesSequentially(pdfDocument)]);
reporter.parsedDocumentHeader(pdfDocument.numPages);
return Promise.all([
pdfDocument.getMetadata().then((metadata) => {
reporter.parsedMetadata();
return metadata;
}),
this.extractPagesSequentially(pdfDocument, reporter),
]);
})
.then(([metadata, pages]) => new ParseResult(new Metadata(metadata), pages));
}
private extractPagesSequentially(pdfDocument: any): Promise<ParsedPage> {
private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> {
return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
return accumulatorPromise.then((accumulatedResults) => {
return pdfDocument.getPage(index + 1).then((page) => {
const viewport = page.getViewport({ scale: 1.0 });
return this.triggerFontRetrieval(page).then(() =>
page
.getTextContent()
.then((textContent) => [
...accumulatedResults,
new ParsedPage(index, viewport.transform, textContent.items),
]),
.getTextContent({
normalizeWhitespace: false,
disableCombineTextItems: true,
})
.then((textContent) => {
reporter.parsedPage(index);
return [...accumulatedResults, new ParsedPage(index, viewport.transform, textContent.items)];
}),
);
});
});

45
core/src/Progress.ts Normal file
View File

@ -0,0 +1,45 @@
/**
 * Multi-stage progress. Progress is expressed in a number between 0 and 1.
 *
 * Every stage has a name, an optional detail text, its own progress value and
 * a weight. The weights decide how much a stage contributes to the total.
 */
export default class Progress {
  stages: string[];
  stageDetails: string[];
  stageProgress: number[];
  stageWeights: number[];

  /**
   * @param stages names of the stages, in the order they are worked through
   * @param weights optional weight per stage; when omitted (or empty) every stage
   *   gets the same weight of `1 / stages.length`
   * @throws Error when the number of weights differs from the number of stages
   * @throws Error when the weights do not sum up to 1
   */
  constructor(stages: string[], weights: number[] = []) {
    this.stages = stages;
    this.stageDetails = new Array(stages.length);
    this.stageProgress = new Array(stages.length).fill(0);
    if (weights.length === 0) {
      this.stageWeights = new Array(stages.length).fill(1 / stages.length);
      return;
    }
    if (weights.length !== stages.length) {
      throw new Error(
        `Provided only ${weights.length} weights but expected ${stages.length} for ${stages.length} stages`,
      );
    }
    // Round after every addition so floating point noise (0.7 + 0.3 !== 1) does not fail the check.
    const weightsSummed = weights.reduce((sum, weight) => +(sum + weight).toFixed(12), 0);
    if (weightsSummed !== 1) {
      throw new Error(`Weights [${weights.join(', ')}] should sum up to 1, but did to ${weightsSummed}`);
    }
    // Keep a private copy so later changes to the caller's array cannot invalidate the validated weights.
    this.stageWeights = [...weights];
  }

  /** Returns true when the progress of the given stage is exactly 1. */
  isComplete(stageIndex: number): boolean {
    return this.stageProgress[stageIndex] === 1;
  }

  /**
   * Returns true when the given stage is below 1 and the stage right before it
   * (if there is one) is complete.
   */
  isProgressing(stageIndex: number): boolean {
    const previousComplete = stageIndex === 0 || this.isComplete(stageIndex - 1);
    return previousComplete && this.stageProgress[stageIndex] < 1;
  }

  /**
   * Returns the overall progress: the sum of every stage's progress multiplied
   * by its weight. Yields 0 when there are no stages.
   */
  totalProgress(): number {
    return this.stageProgress.reduce(
      (sum, stageProgress, index) => sum + stageProgress * this.stageWeights[index],
      0,
    );
  }
}

View File

@ -0,0 +1,5 @@
import type Progress from './Progress';
/**
 * Signature of a callback which receives a Progress update and returns nothing.
 */
type ProgressListenFunction = (progressUpdate: Progress) => void;
export default ProgressListenFunction;

View File

@ -1,6 +1,11 @@
import ParseResult from './ParseResult';
import type ProgressListenFunction from './ProgressListenFunction';
import ParseProgressReporter from './ParseProgressReporter';
import PdfParser from './PdfParser';
/**
 * Factory for a PdfParser.
 *
 * @param pdfJs value which is passed straight to the PdfParser constructor
 */
export function pdfParser(pdfJs: any) {
  const parser = new PdfParser(pdfJs);
  return parser;
}
/**
 * Factory for a ParseProgressReporter.
 *
 * @param progressListener callback which is passed straight to the ParseProgressReporter constructor
 */
export function parseReporter(progressListener: ProgressListenFunction) {
  const reporter = new ParseProgressReporter(progressListener);
  return reporter;
}

View File

@ -1,15 +1,22 @@
import PdfParser from 'src/PdfParser';
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
import * as fs from 'fs';
import ParseProgressReporter from 'src/ParseProgressReporter';
import Progress from 'src/Progress';
const parser = new PdfParser(pdfjs);
test('testIt', async () => {
test('basic example PDF parse', async () => {
const progressUpdates: Progress[] = [];
const data = fs.readFileSync('../examples/ExamplePdf.pdf', null);
const result = await parser.parseBytes(data);
const result = await parser.parseBytes(
data,
new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)),
);
const expectedPages = 7;
expect(result.metadata.title()).toEqual('ExamplePdf');
expect(result.metadata.author()).toEqual('Johannes Zillmann');
expect(result.pages.length).toBe(7);
expect(result.pages.length).toBe(expectedPages);
expect(result.pages[0].index).toBe(0);
expect(result.pages[0].viewPortTransform).toEqual([1, 0, 0, -1, 0, 841.8898]);
expect(result.pages[0].items).toEqual([
@ -118,4 +125,30 @@ test('testIt', async () => {
fontName: 'g_d0_f2',
},
]);
expect(progressUpdates.length).toBe(expectedPages + 2);
progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts']));
expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]);
expect(progressUpdates[0].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);
expect(progressUpdates[1].stageProgress).toEqual([1, 1, 0, 0]);
expect(progressUpdates[1].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);
expect(progressUpdates[2].stageProgress).toEqual([1, 1, 1 / expectedPages, 0]);
expect(progressUpdates[2].stageDetails).toEqual([null, null, `1 / ${expectedPages}`, null]);
expect(progressUpdates[3].stageProgress).toEqual([1, 1, 2 / expectedPages, 0]);
expect(progressUpdates[3].stageDetails).toEqual([null, null, `2 / ${expectedPages}`, null]);
expect(progressUpdates[4].stageProgress).toEqual([1, 1, 3 / expectedPages, 0]);
expect(progressUpdates[4].stageDetails).toEqual([null, null, `3 / ${expectedPages}`, null]);
expect(progressUpdates[5].stageProgress).toEqual([1, 1, 4 / expectedPages, 0]);
expect(progressUpdates[5].stageDetails).toEqual([null, null, `4 / ${expectedPages}`, null]);
expect(progressUpdates[6].stageProgress).toEqual([1, 1, 5 / expectedPages, 0]);
expect(progressUpdates[6].stageDetails).toEqual([null, null, `5 / ${expectedPages}`, null]);
expect(progressUpdates[7].stageProgress).toEqual([1, 1, 6 / expectedPages, 0]);
expect(progressUpdates[7].stageDetails).toEqual([null, null, `6 / ${expectedPages}`, null]);
expect(progressUpdates[8].stageProgress).toEqual([1, 1, 7 / expectedPages, 0]);
expect(progressUpdates[8].stageDetails).toEqual([null, null, `7 / ${expectedPages}`, null]);
// expect(progressUpdates[9].stagePercents).toEqual([1, 1, 1, 0]);
// expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]);
});

107
core/test/Progress.test.ts Normal file
View File

@ -0,0 +1,107 @@
import Progress from 'src/Progress';
// Walks three equally weighted stages from 0% to 100% and checks the total
// progress plus the "in progress"/"complete" state of every stage on the way.
test('basic progress', () => {
  const progress = new Progress(['Stage0', 'Stage1', 'Stage2']);

  // nothing yet
  expectTotalProgress(progress, 0);
  expectStageInProgress(progress, 0);

  // stage 0 progress
  progress.stageProgress[0] = 0.3;
  expectTotalProgress(progress, 10);
  expectStageInProgress(progress, 0);

  // stage 0 completed
  progress.stageProgress[0] = 1;
  expectTotalProgress(progress, 33);
  expectStageInProgress(progress, 1);

  // stage 1 progress
  progress.stageProgress[1] = 0.3;
  expectTotalProgress(progress, 43);
  expectStageInProgress(progress, 1);

  // stage 1 completed
  progress.stageProgress[1] = 1;
  expectTotalProgress(progress, 67);
  expectStageInProgress(progress, 2);

  // stage 2 completed, index 3 is past the last stage so every stage counts as complete
  progress.stageProgress[2] = 1;
  expectTotalProgress(progress, 100);
  expectStageInProgress(progress, 3);
});
// Two weights for three stages must be rejected by the constructor.
// toThrow with an Error instance compares the thrown error's message for equality.
test('number of stage weights must match the number of stages', () => {
  expect(() => new Progress(['Stage0', 'Stage1', 'Stage2'], [0.5, 0.5])).toThrow(
    new Error('Provided only 2 weights but expected 3 for 3 stages'),
  );
});
// Weights which sum up to 1.5 instead of 1 must be rejected by the constructor.
// toThrow with an Error instance compares the thrown error's message for equality.
test('stage weights must sum up', () => {
  expect(() => new Progress(['Stage0', 'Stage1', 'Stage2'], [0.5, 0.5, 0.5])).toThrow(
    new Error('Weights [0.5, 0.5, 0.5] should sum up to 1, but did to 1.5'),
  );
});
// Uses the weights 0 / 0.7 / 0.3: the first stage must not move the total at
// all, the other two contribute according to their weight.
test('weighted progress', () => {
  const progress = new Progress(['Stage0', 'Stage1', 'Stage2'], [0, 0.7, 0.3]);

  // nothing yet
  expectTotalProgress(progress, 0);

  // stage 0 progress
  progress.stageProgress[0] = 0.9;
  expectTotalProgress(progress, 0);

  // stage 0 completed
  progress.stageProgress[0] = 1;
  expectTotalProgress(progress, 0);

  // stage 1 progress
  progress.stageProgress[1] = 0.3;
  expectTotalProgress(progress, 21);

  // stage 1 more progress
  progress.stageProgress[1] = 0.6;
  expectTotalProgress(progress, 42);

  // stage 1 completed
  progress.stageProgress[1] = 1;
  expectTotalProgress(progress, 70);

  // stage 2 progress
  progress.stageProgress[2] = 0.3;
  expectTotalProgress(progress, 79);

  // stage 2 completed
  progress.stageProgress[2] = 1;
  expectTotalProgress(progress, 100);
});
/**
 * Asserts the total progress, converted to a whole-number percentage.
 *
 * @param progress progress object under test
 * @param expected expected percentage (0 - 100) after rounding
 */
function expectTotalProgress(progress: Progress, expected: number) {
  const roundedPercent = Math.round(progress.totalProgress() * 100);
  expect(roundedPercent).toBe(expected);
}
/**
 * Asserts that exactly the stage at `stageIndex` is progressing, that every
 * stage before it is complete and that every stage after it is neither.
 *
 * Passing an index past the last stage asserts that all stages are complete.
 *
 * @param progress progress object under test
 * @param stageIndex index of the stage which is expected to be in progress
 */
function expectStageInProgress(progress: Progress, stageIndex: number) {
  for (let index = 0; index < progress.stageProgress.length; index++) {
    expect(progress.isProgressing(index)).toBe(index === stageIndex);
    expect(progress.isComplete(index)).toBe(index < stageIndex);
  }
}

View File

@ -31,5 +31,6 @@ module.exports = {
},
alias: {
'@core': '../core/src/index.js',
'@core/*': '../core/src/*',
},
};

View File

@ -0,0 +1,35 @@
<script>
// Circular progress indicator: an SVG ring whose drawn arc follows `progress`,
// with the rounded value rendered as text in the middle.
import { tweened } from 'svelte/motion';
import { cubicOut } from 'svelte/easing';
// Outer radius; the svg is rendered radius * 2 wide and high.
export let radius: number;
// Stroke width of the ring.
export let stroke: number;
// Progress in percent: it is divided by 100 below and printed with a '%' sign.
export let progress: number;
// Shrink the circle so the stroke fits into the svg box.
// These two are plain constants, not reactive statements, so they are computed once from the initial props.
const normalizedRadius = radius - stroke * 2;
const circumference = normalizedRadius * 2 * Math.PI;
// Animated copy of `progress`: starts at 0 and eases towards each new value within 400ms.
const progressTweened = tweened(0, {
duration: 400,
easing: cubicOut,
});
// Feed every change of the `progress` prop into the tween.
$: progressTweened.set(progress);
// Dash offset of the ring: the full circumference at 0, nothing at 100.
$: strokeDashoffset = circumference - ($progressTweened / 100) * circumference;
</script>
<!-- The filter moves with the progress: brightness 0.5 to 1, sepia 0.5 to 0, blur 0.6px down to about 0.27px. -->
<svg
height={radius * 2}
width={radius * 2}
class="text-green-600 stroke-current"
style="filter: brightness({$progressTweened / 100 / 2 + 0.5}) sepia({0.5 - $progressTweened / 100 / 2}) blur({0.6 - $progressTweened / 100 / 3}px)">
<!-- Dash and gap both span the whole circumference; the dash offset decides how much of the ring is visible. -->
<circle
fill="transparent"
stroke-width={stroke}
stroke-dasharray={circumference + ' ' + circumference}
stroke-dashoffset={strokeDashoffset}
r={normalizedRadius}
cx={radius}
cy={radius} />
<!-- Rounded, animated value in the middle of the ring. -->
<text x="50%" y="53%" text-anchor="middle" class="text-gray-800 fill-current" stroke-width="1px" dy=".2em">
{Math.round($progressTweened)}%
</text>
</svg>

View File

@ -1,31 +1,44 @@
<script>
import { blur, slide } from 'svelte/transition';
import Dropzone from 'svelte-file-dropzone';
import { Download } from 'svelte-hero-icons';
import { Download, Check } from 'svelte-hero-icons';
import { processUpload, loadExample } from './store';
import type Progress from '@core/Progress';
import ProgressRing from './ProgressRing.svelte';
let specifiedFileName: string;
let dragover = false;
let upload: Promise<any>;
let rejectionError: string;
let parseProgress: Progress;
function handleExampleLoad() {
rejectionError = undefined;
dragover = true;
specifiedFileName = 'ExamplePdf.pdf';
upload = loadExample();
}
function handleFilesSelect(e) {
rejectionError = undefined;
parseProgress = undefined;
upload = loadExample(handleProgress);
}
function handleFilesSelect(e) {
specifiedFileName = undefined;
rejectionError = undefined;
parseProgress = undefined;
const { acceptedFiles, fileRejections } = e.detail;
if (acceptedFiles.length === 1) {
const specifiedFile = acceptedFiles[0];
specifiedFileName = specifiedFile.name;
upload = processUpload(specifiedFile);
upload = processUpload(specifiedFile, handleProgress);
}
if (fileRejections.length > 1) {
const fileNames = fileRejections.map((r) => r.file.name);
rejectionError = `Only one file at a time allowed! Rejected ${fileRejections.length} files: '${fileNames}'.`;
}
}
function handleProgress(progress: Progress) {
parseProgress = progress;
}
</script>
<!-- Options -->
@ -33,11 +46,11 @@
<div class="py-0.5 border-2 border-gray-50 hover:underline cursor-pointer" on:click={handleExampleLoad}>
Load Example
</div>
<div class="py-0.5 px-1 border-2 border-gray-50 hover:border-blue-600 cursor-pointer">Debug</div>
<div class="py-0.5 px-1 border-2 border-gray-50 hover:border-green-600 cursor-pointer">Debug</div>
</div>
<!-- Upload Box -->
<div class="pb-5 border-2 border-dashed border-gray-400 hover:border-blue-800" class:dragover>
<div class="mb-5 border-2 border-dashed border-gray-400 hover:border-green-800" class:dragover>
<Dropzone
on:drop={handleFilesSelect}
on:dragenter={() => (dragover = true)}
@ -49,7 +62,7 @@
<span class:dragoverItem={dragover}>
<Download size="21x" />
</span>
<div class="px-5">
<div class="px-5 mb-5">
<div class="text-5xl font-bold my-4">Drop your PDF file here...</div>
<div class="text-2xl font-bold">Or click the box to select one...</div>
<div class="mt-14"><strong>Note:</strong> Your data stays locally in your browser.</div>
@ -64,22 +77,55 @@
</Dropzone>
</div>
<div class="mt-5 text-center font-bold">
{#await upload}
<div>Parsing {specifiedFileName}...</div>
{:catch error}
<div class="text-red-700">Failed to parse '{specifiedFileName}': {error.message}</div>
{/await}
{#if rejectionError}
<div class="text-red-700">{rejectionError}</div>
{/if}
<!-- Progress Info -->
<div class="mt-5 text-xl font-bold">
<div style="min-width: 70%;">
{#if specifiedFileName}
<div in:blur class="text-2xl mb-2">Parsing {specifiedFileName} ...</div>
{/if}
{#if parseProgress}
<div in:blur class="flex space-x-4">
<ProgressRing radius={50} stroke={7} progress={parseProgress?.totalProgress() * 100} />
<div>
{#each parseProgress.stages as stage, index}
{#if parseProgress.isProgressing(index)}
<div class="flex space-x-2 items-center">
<div>
Parsing
{stage}
{parseProgress.stageDetails[index] ? parseProgress.stageDetails[index] : ''}
</div>
</div>
{:else if parseProgress.isComplete(index)}
<div class="flex space-x-2 items-center ">
<div>
Parsing
{stage}
{parseProgress.stageDetails[index] ? parseProgress.stageDetails[index] : ''}
</div>
<Check size="1.5x" class="text-green-700" />
</div>
{/if}
{/each}
</div>
</div>
{/if}
{#if rejectionError}
<div in:slide class="text-red-700">{rejectionError}</div>
{/if}
{#await upload}
<!-- -->
{:catch error}
<div class="text-red-700">Failed to parse '{specifiedFileName}': {error?.message}</div>
{/await}
</div>
</div>
<style>
.dragover {
@apply border-purple-600;
@apply border-green-600;
}
.dragoverItem {
@apply text-purple-600;
@apply text-green-600;
}
</style>

View File

@ -1,4 +1,5 @@
import { pdfParser } from '@core';
import { pdfParser, parseReporter } from '@core';
import type ProgressListenFunction from '@core/ProgressListenFunction';
import type ParseResult from '@core/ParseResult';
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
@ -10,11 +11,11 @@ pdfjs.GlobalWorkerOptions.workerSrc = 'worker/pdf.worker.min.js';
const parser = pdfParser(pdfjs);
export async function loadExample(): Promise<ParseResult> {
return parsePdf(parser.parseUrl('/ExamplePdf.pdf'));
export async function loadExample(progressListener: ProgressListenFunction): Promise<ParseResult> {
return parsePdf(parser.parseUrl('/ExamplePdf.pdf', parseReporter(progressListener)));
}
export async function processUpload(file: File): Promise<ParseResult> {
export async function processUpload(file: File, progressListener: ProgressListenFunction): Promise<ParseResult> {
return new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onerror = reject;
@ -24,7 +25,7 @@ export async function processUpload(file: File): Promise<ParseResult> {
reader.readAsArrayBuffer(file);
}).then((buffer) => {
const data = new Uint8Array(buffer as ArrayBuffer);
return parsePdf(parser.parseBytes(data));
return parsePdf(parser.parseBytes(data, parseReporter(progressListener)));
});
}