Show parse results

This commit is contained in:
Johannes Zillmann 2021-01-03 20:09:35 +01:00
parent 26e5b785cd
commit e6ce7132ce
7 changed files with 122 additions and 28 deletions

View File

@ -2,7 +2,7 @@
"name": "pdf-to-markdown",
"version": "0.2.0",
"description": "A PDF to Markdown Converter",
"keywords": [
"keywords": [
"PDF",
"Markdown",
"Converter"

View File

@ -1,10 +1,17 @@
<script>
import Upload from './Upload.svelte';
import { parseResult } from './store';
import Result from './Result.svelte';
</script>
<div class="text-2xl font-semibold font-serif text-center bg-gray-400">PDF to Markdown Converter</div>
<main class="container mx-auto mt-5 h-full">
<Upload />
{#if $parseResult}
<Result parseResult={$parseResult} />
{:else}
<Upload />
{/if}
</main>
<style>

15
ui/src/Result.svelte Normal file
View File

@ -0,0 +1,15 @@
<script>
    import type ParseResult from 'pdf-to-markdown-core/lib/src/ParseResult';
    import Table from './Table.svelte';

    // The parse result to display; supplied by the parent component.
    export let parseResult: ParseResult;

    // Dumps the metadata object to the browser console when the component is created.
    console.log(parseResult.metadata);
</script>

<!-- Document-level metadata: title and author are read through accessor methods. -->
<div>Title: {parseResult.metadata.title()}</div>
<div>Author: {parseResult.metadata.author()}</div>

<!-- One heading plus one item table per parsed page. -->
{#each parseResult.pages as { index, items }}
    <div class="text-2xl font-semibold my-5">Page {index}</div>
    <Table {items} />
{/each}

52
ui/src/Table.svelte Normal file
View File

@ -0,0 +1,52 @@
<script>
    import type ParsedPageItem from 'pdf-to-markdown-core/lib/src/ParsedPageItem';

    // Items to list; each one becomes a row of the grid.
    export let items: ParsedPageItem[];

    // Column titles. Their count also drives the number of grid columns below.
    const columnTitles = ['ID', 'Text', 'Font', 'Direction', 'Width', 'Height', 'Transform'];
</script>

<!--
    Grid-based table of page items: a sticky header row followed by one row per item.
    Rows use the `contents` class so their cells are laid out directly by the grid.
-->
<div class="static">
    <div class="grid" style="grid-template-columns:repeat({columnTitles.length}, auto)">
        {#each columnTitles as title}
            <div class="header">{title}</div>
        {/each}
        {#each items as entry, rowIndex}
            <div class="row contents">
                <!-- The ID column is the 1-based position of the item in the list. -->
                <div class="cell">{rowIndex + 1}</div>
                <div class="cell">{entry.str}</div>
                <div class="cell">{entry.fontName}</div>
                <div class="cell">{entry.dir}</div>
                <div class="cell">{entry.width}</div>
                <div class="cell">{entry.height}</div>
                <div class="cell">{entry.transform.join(', ')}</div>
            </div>
        {/each}
    </div>
</div>

<style>
    /* Scrollable grid container, capped at the viewport height. */
    .grid {
        display: grid;
        grid-auto-rows: min-content;
        width: 100%;
        max-height: 100vh;
        overflow-y: auto;
        /* Full border first, then drop the left edge: each cell draws its own left border. */
        border: 1px solid #e3e4e4;
        border-left: none;
    }

    /* Header cells stay pinned to the top of the grid while it scrolls. */
    .header {
        @apply bg-gray-300;
        position: sticky;
        top: 0;
        padding: 5px;
        border-bottom: 1px solid #e3e4e4;
    }

    /* Highlight every cell of the hovered row. */
    .row:hover > div {
        @apply bg-gray-200;
    }

    .cell {
        @apply pl-1;
        border-left: 1px solid #e3e4e4;
    }
</style>

View File

@ -32,7 +32,9 @@
noClick={false}
disableDefaultStyles={true}>
<div class="grid grid-cols-1 md:grid-cols-2 justify-items-center">
<Download size="21x" />
<span class:dragoverItem={dragover}>
<Download size="21x" />
</span>
<div class="mt-4">
<div class="text-5xl font-bold my-4">Drop your PDF file here...</div>
<div class="text-2xl font-bold">Or click the box to select one...</div>
@ -63,4 +65,7 @@
.dragover {
@apply border-purple-600;
}
.dragoverItem {
@apply text-purple-600;
}
</style>

View File

@ -2,6 +2,10 @@ import { pdfParser } from 'pdf-to-markdown-core';
import type ParseResult from 'pdf-to-markdown-core/lib/src/ParseResult';
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
import { Writable, writable } from 'svelte/store';
/**
 * Store holding the most recent parse result.
 *
 * Starts out as `undefined`; `processUpload` sets it after a file has been
 * parsed successfully. Exported as `const` because only the store's value
 * changes over time; no visible code reassigns the binding itself.
 */
export const parseResult: Writable<ParseResult> = writable(undefined);
// TODO: this sets up a fake worker because getMainThreadWorkerMessageHandler isn't null
import pdfjsWorker from 'pdfjs-dist//es5/build/pdf.worker.entry';
pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorker;
@ -16,8 +20,13 @@ export function processUpload(file: File): Promise<ParseResult> {
resolve(reader.result as ArrayBuffer);
};
reader.readAsArrayBuffer(file);
}).then((buffer) => {
const uintArray = new Uint8Array(buffer as ArrayBuffer);
return parser.parse(uintArray);
});
})
.then((buffer) => {
const uintArray = new Uint8Array(buffer as ArrayBuffer);
return parser.parse(uintArray);
})
.then((result) => {
parseResult.set(result);
return result;
});
}

View File

@ -1,23 +1,29 @@
{
"include": ["src", "types"],
"compilerOptions": {
"module": "esnext",
"target": "esnext",
"moduleResolution": "node",
"jsx": "preserve",
"baseUrl": "./",
/* paths - If you configure Snowpack import aliases, add them here. */
"paths": {},
/* noEmit - Snowpack builds (emits) files, not tsc. */
"noEmit": true,
/* Additional Options */
"strict": true,
"noImplicitAny": false,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"resolveJsonModule": true,
"useDefineForClassFields": true,
"allowSyntheticDefaultImports": true,
"importsNotUsedAsValues": "error"
}
"include": ["src", "types"],
"compilerOptions": {
"module": "esnext",
"target": "esnext",
"moduleResolution": "node",
"jsx": "preserve",
"baseUrl": "./",
/* paths - If you configure Snowpack import aliases, add them here. */
"paths": {},
/* noEmit - Snowpack builds (emits) files, not tsc. */
"noEmit": true,
/* Additional Options */
"strict": true,
"noImplicitAny": false,
// "noImplicitThis": false,
// "alwaysStrict": false,
// "strictBindCallApply": false,
"strictNullChecks": false,
// "strictFunctionTypes": false,
// "strictPropertyInitialization": false,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"resolveJsonModule": true,
"useDefineForClassFields": true,
"allowSyntheticDefaultImports": true,
"importsNotUsedAsValues": "error"
}
}