Flexible Debug table foundation

This commit is contained in:
Johannes Zillmann 2021-01-28 23:06:37 +01:00
parent ee7d686ba6
commit 8783e3cf9e
7 changed files with 160 additions and 90 deletions

13
core/src/Item.ts Normal file
View File

@ -0,0 +1,13 @@
/**
 * A single parsed item together with the page it belongs to.
 *
 * `data` is the raw key/value payload of the item; `value()` looks up one of
 * its entries by column name.
 */
export default class Item {
  /** Page identifier handed in by whoever created the item. */
  page: number;
  /** Raw payload; its property names are the column names. */
  data: object;

  /**
   * @param page page identifier the item belongs to
   * @param data raw payload whose properties are addressed by column name
   */
  constructor(page: number, data: object) {
    this.page = page;
    this.data = data;
  }

  /**
   * Returns the payload entry stored under `column`.
   *
   * Entries may be primitives (strings, numbers) as well as arrays/objects, so
   * the result is typed `unknown` and must be narrowed by the caller.
   *
   * @param column property name to look up in `data`
   * @returns the stored value, or `undefined` when `data` has no such property
   */
  value(column: string): unknown {
    // `object` has no index signature; view it as a string-keyed record so the
    // lookup type-checks under `strict` instead of relying on implicit `any`.
    return (this.data as Record<string, unknown>)[column];
  }
}

View File

@ -1,12 +1,20 @@
import Item from './Item';
import type Metadata from './Metadata'; import type Metadata from './Metadata';
import type ParsedPage from './ParsedPage';
export default class ParseResult { export default class ParseResult {
pdfPages: any[];
metadata: Metadata; metadata: Metadata;
pages: ParsedPage[]; columns: string[];
items: Item[];
constructor(metadata: Metadata, pages: ParsedPage[]) { constructor(pdfPages: any[], metadata: Metadata, columns: string[], items: Item[]) {
this.pdfPages = pdfPages;
this.metadata = metadata; this.metadata = metadata;
this.pages = pages; this.columns = columns;
this.items = items;
}
pageCount(): number {
return this.pdfPages.length;
} }
} }

View File

@ -2,12 +2,12 @@ import type ParsedPageItem from './ParsedPageItem';
export default class ParsedPage { export default class ParsedPage {
index: number; index: number;
viewPortTransform: number[]; pdfPage: any;
items: ParsedPageItem[]; items: ParsedPageItem[];
constructor(index: number, viewPortTransform: number[], items: ParsedPageItem[]) { constructor(index: number, pdfPage: any, items: ParsedPageItem[]) {
this.index = index; this.index = index;
this.viewPortTransform = viewPortTransform; this.pdfPage = pdfPage;
this.items = items; this.items = items;
} }
} }

View File

@ -1,3 +1,4 @@
import Item from './Item';
import Metadata from './Metadata'; import Metadata from './Metadata';
import ParsedPage from './ParsedPage'; import ParsedPage from './ParsedPage';
import type ParseReporter from './ParseReporter'; import type ParseReporter from './ParseReporter';
@ -10,6 +11,7 @@ import type TextItem from './TextItem';
*/ */
export default class PdfParser { export default class PdfParser {
pdfjs: any; pdfjs: any;
// Property names of the raw text items that are exposed as table columns; each
// entry must match a key of the item payload exactly, otherwise its column is empty.
columns = ['str', 'dir', 'width', 'height', 'transform', 'fontName'];
constructor(pdfjs: any) { constructor(pdfjs: any) {
this.pdfjs = pdfjs; this.pdfjs = pdfjs;
} }
@ -43,14 +45,17 @@ export default class PdfParser {
this.extractPagesSequentially(pdfDocument, reporter), this.extractPagesSequentially(pdfDocument, reporter),
]); ]);
}) })
.then(([metadata, pages]) => new ParseResult(new Metadata(metadata), pages)); .then(([metadata, pages]) => {
const pdfPages = pages.map((page) => page.pdfPage);
const items = pages.reduce((allItems, page) => allItems.concat(page.items), []);
return new ParseResult(pdfPages, new Metadata(metadata), this.columns, items);
});
} }
private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> { private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> {
return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => { return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
return accumulatorPromise.then((accumulatedResults) => { return accumulatorPromise.then((accumulatedResults) => {
return pdfDocument.getPage(index + 1).then((page) => { return pdfDocument.getPage(index + 1).then((page) => {
const viewport = page.getViewport({ scale: 1.0 });
return this.triggerFontRetrieval(page).then(() => return this.triggerFontRetrieval(page).then(() =>
page page
.getTextContent({ .getTextContent({
@ -58,8 +63,9 @@ export default class PdfParser {
disableCombineTextItems: true, disableCombineTextItems: true,
}) })
.then((textContent) => { .then((textContent) => {
const items = textContent.items.map((rawItem) => new Item(index, rawItem));
reporter.parsedPage(index); reporter.parsedPage(index);
return [...accumulatedResults, new ParsedPage(index, viewport.transform, textContent.items)]; return [...accumulatedResults, new ParsedPage(index, page, items)];
}), }),
); );
}); });
@ -119,7 +125,7 @@ export default class PdfParser {
// console.log('Parsed result:', r.length); // console.log('Parsed result:', r.length);
// console.log('Parsed result:', r); // console.log('Parsed result:', r);
return new ParseResult(new Metadata(metadata), r); return new ParseResult([], new Metadata(metadata), [], []);
}); });
} }
} }

View File

@ -3,129 +3,164 @@ import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
import * as fs from 'fs'; import * as fs from 'fs';
import ParseProgressReporter from 'src/ParseProgressReporter'; import ParseProgressReporter from 'src/ParseProgressReporter';
import Progress from 'src/Progress'; import Progress from 'src/Progress';
import Item from 'src/Item';
const parser = new PdfParser(pdfjs); const parser = new PdfParser(pdfjs);
test('basic example PDF parse', async () => { test('basic example PDF parse', async () => {
const progressUpdates: Progress[] = []; const progressUpdates: Progress[] = [];
const data = fs.readFileSync('../examples/ExamplePdf.pdf', null); const data = fs.readFileSync('../examples/ExamplePdf.pdf', null);
// to test
const result = await parser.parseBytes( const result = await parser.parseBytes(
data, data,
new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)), new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)),
); );
// verify pages
const expectedPages = 7; const expectedPages = 7;
expect(result.metadata.title()).toEqual('ExamplePdf'); expect(result.metadata.title()).toEqual('ExamplePdf');
expect(result.metadata.author()).toEqual('Johannes Zillmann'); expect(result.metadata.author()).toEqual('Johannes Zillmann');
expect(result.pages.length).toBe(expectedPages); expect(result.pageCount()).toBe(expectedPages);
expect(result.pages[0].index).toBe(0); result.pdfPages.forEach((pdfPage, i) => {
expect(result.pages[0].viewPortTransform).toEqual([1, 0, 0, -1, 0, 841.8898]); expect(pdfPage._pageIndex).toBe(i);
expect(result.pages[0].items).toEqual([ });
{ expect(result.pdfPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]);
expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
// verify first n items
expect(result.items.slice(0, 16)).toEqual([
new Item(0, {
str: 'Mega Überschrift', str: 'Mega Überschrift',
dir: 'ltr', dir: 'ltr',
width: 245.05800000000005, width: 245.05800000000005,
height: 30, height: 30,
transform: [30, 0, 0, 30, 175, 756], transform: [30, 0, 0, 30, 175, 756],
fontName: 'g_d0_f1', fontName: 'g_d0_f1',
}, }),
{ new Item(0, {
str: '2te Überschrift', str: '2te Überschrift',
dir: 'ltr', dir: 'ltr',
width: 130.056, width: 130.056,
height: 20, height: 20,
transform: [20, 0, 0, 20, 233, 665], transform: [20, 0, 0, 20, 233, 665],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}, }),
{ new Item(0, {
str: 'Dies ist eine Test-PDF', str: 'Dies ist eine Test-PDF',
dir: 'ltr', dir: 'ltr',
width: 108.61950000000003, width: 108.61950000000003,
height: 11, height: 11,
transform: [11, 0, 0, 11, 240, 585], transform: [11, 0, 0, 11, 240, 585],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}, }),
{ new Item(0, {
str: '.', str: '.',
dir: 'ltr', dir: 'ltr',
width: 3.0580000000000003, width: 3.0580000000000003,
height: 11, height: 11,
transform: [11, 0, 0, 11, 352.6927, 585], transform: [11, 0, 0, 11, 352.6927, 585],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}, }),
{ new Item(0, {
str: '1', str: '1',
dir: 'ltr', dir: 'ltr',
width: 4.077333704, width: 4.077333704,
height: 7.333334, height: 7.333334,
transform: [7.333334, 0, 0, 7.333334, 348, 588], transform: [7.333334, 0, 0, 7.333334, 348, 588],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}, }),
{ new Item(0, {
str: 'Fürs Testen des ', str: 'Fürs Testen des ',
dir: 'ltr', dir: 'ltr',
width: 83.7826, width: 83.7826,
height: 11, height: 11,
transform: [11, 0, 0, 11, 208, 572], transform: [11, 0, 0, 11, 208, 572],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}, }),
{ new Item(0, {
str: 'Markdown Parsers', str: 'Markdown Parsers',
dir: 'ltr', dir: 'ltr',
width: 91.6982, width: 91.6982,
height: 11, height: 11,
transform: [11, 0, 0, 11, 291.77832, 572], transform: [11, 0, 0, 11, 291.77832, 572],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}, }),
{ new Item(0, {
str: '.', str: '.',
dir: 'ltr', dir: 'ltr',
width: 3.0580000000000003, width: 3.0580000000000003,
height: 11, height: 11,
transform: [11, 0, 0, 11, 383.47360000000003, 572], transform: [11, 0, 0, 11, 383.47360000000003, 572],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}, }),
{ new Item(0, {
str: ' ', str: ' ',
dir: 'ltr', dir: 'ltr',
width: 3.0580000000000003, width: 3.0580000000000003,
height: 11, height: 11,
transform: [11, 0, 0, 11, 61.078451, 59], transform: [11, 0, 0, 11, 61.078451, 59],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}, }),
{ new Item(0, {
str: 'In Deutsch.', str: 'In Deutsch.',
dir: 'ltr', dir: 'ltr',
width: 55.64240000000001, width: 55.64240000000001,
height: 11, height: 11,
transform: [11, 0, 0, 11, 64.134603, 59], transform: [11, 0, 0, 11, 64.134603, 59],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}, }),
{ new Item(0, {
str: '1', str: '1',
dir: 'ltr', dir: 'ltr',
width: 4.077333704, width: 4.077333704,
height: 7.333334, height: 7.333334,
transform: [7.333334, 0, 0, 7.333334, 57, 62], transform: [7.333334, 0, 0, 7.333334, 57, 62],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}, }),
{ new Item(0, {
str: '\x00', str: '\x00',
dir: 'ltr', dir: 'ltr',
width: 0, width: 0,
height: 12, height: 12,
transform: [12, 0, 0, 12, 294, 45], transform: [12, 0, 0, 12, 294, 45],
fontName: 'g_d0_f3', fontName: 'g_d0_f3',
}, }),
{ new Item(0, {
str: '1', str: '1',
dir: 'ltr', dir: 'ltr',
width: 6.672000000000001, width: 6.672000000000001,
height: 12, height: 12,
transform: [12, 0, 0, 12, 294, 45], transform: [12, 0, 0, 12, 294, 45],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}, }),
new Item(1, {
str: '\x00',
dir: 'ltr',
width: 0,
height: 12,
transform: [12, 0, 0, 12, 294, 45],
fontName: 'g_d0_f3',
}),
new Item(1, {
str: '2',
dir: 'ltr',
width: 6.672000000000001,
height: 12,
transform: [12, 0, 0, 12, 294, 45],
fontName: 'g_d0_f2',
}),
new Item(2, {
str: 'Paragraphen',
dir: 'ltr',
width: 110.04479999999998,
height: 18,
transform: [18, 0, 0, 18, 57, 767],
fontName: 'g_d0_f1',
}),
]); ]);
// verify progress
expect(progressUpdates.length).toBe(expectedPages + 2); expect(progressUpdates.length).toBe(expectedPages + 2);
progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts'])); progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts']));
expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]); expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]);

View File

@ -1,22 +1,15 @@
<script> <script>
import type ParseResult from 'pdf-to-markdown-core/lib/src/ParseResult'; import type ParseResult from 'pdf-to-markdown-core/lib/src/ParseResult';
import type Item from '@core/Item';
import Table from './Table.svelte'; import Table from './Table.svelte';
export let parseResult: ParseResult; export let parseResult: ParseResult;
console.log(parseResult.metadata);
</script> </script>
<div> <div class="mb-4">
Parsed <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
{parseResult.pages.length} <div>Title: {parseResult.metadata.title()}</div>
pages with <div>Author: {parseResult.metadata.author()}</div>
{parseResult.pages.reduce((count, page) => count + page.items.length, 0)}
items
</div> </div>
<div>Title: {parseResult.metadata.title()}</div>
<div>Author: {parseResult.metadata.author()}</div>
{#each parseResult.pages as page} <Table columns={parseResult.columns} items={parseResult.items} />
<div class="text-2xl font-semibold my-5">Page {page.index}</div>
<Table items={page.items} />
{/each}

View File

@ -1,52 +1,67 @@
<script> <script>
import type ParsedPageItem from '@core/ParsedPageItem'; import type Item from '@core/Item';
const headers = ['ID', 'Text', 'Font', 'Direction', 'Width', 'Height', 'Transform']; export let columns: string[];
export let items: ParsedPageItem[]; export let items: Item[];
</script> </script>
<div class="static"> <table class="w-full text-left">
<div class="grid" style="grid-template-columns:repeat({headers.length}, auto)"> <thead class=" ">
{#each headers as header} <th />
<div class="header">{header}</div> <th>#</th>
{#each columns as column}
<th>{column}</th>
{/each} {/each}
{#each items as item, i} </thead>
<div class="row contents"> <tbody>
<div class="cell">{i + 1}</div> {#each items as item, idx}
<div class="cell">{item.str}</div> {#if idx > 0 && item.page !== items[idx - 1].page}
<div class="cell">{item.fontName}</div> <tr class="h-5 bg-blue-200" />
<div class="cell">{item.dir}</div> {/if}
<div class="cell">{item.width}</div> <tr class="">
<div class="cell">{item.height}</div> {#if idx === 0 || item.page !== items[idx - 1].page}
<div class="cell">{item.transform.join(', ')}</div> <td class="page bg-gray-50">Page {item.page}</td>
</div> {:else}
<td />
{/if}
<td class="">{idx}</td>
{#each columns as column}
<td class="borde2r">{item.data[column]}</td>
{/each} {/each}
</div> </tr>
</div> {/each}
</tbody>
</table>
<style> <style>
.grid { th {
width: 100%; @apply px-1;
max-height: 100vh; position: -webkit-sticky;
display: grid;
grid-auto-rows: min-content;
overflow-y: auto;
border: 1px solid #e3e4e4;
border-left: none;
}
.header {
@apply bg-gray-300;
position: sticky; position: sticky;
top: 0; top: 0;
padding: 5px; z-index: 2;
border-bottom: 1px solid #e3e4e4;
} }
.row:hover > div { th:not(:first-child) {
@apply bg-gray-300;
@apply shadow;
}
td:not(:first-child) {
@apply px-1;
@apply border-b;
}
tr:hover td:not(:first-child) {
@apply bg-gray-200; @apply bg-gray-200;
} }
.cell { .page {
@apply pl-1; @apply text-lg;
border-left: 1px solid #e3e4e4; @apply font-semibold;
@apply pr-4;
@apply whitespace-nowrap;
position: -webkit-sticky;
position: sticky;
top: 0;
z-index: 2;
} }
</style> </style>