mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-25 12:01:45 +02:00
Flexible Debug table foundation
This commit is contained in:
parent
ee7d686ba6
commit
8783e3cf9e
13
core/src/Item.ts
Normal file
13
core/src/Item.ts
Normal file
@ -0,0 +1,13 @@
|
||||
/**
 * One parsed element together with the page it was created for.
 *
 * `data` is kept as an opaque key/value object; `value()` reads a single
 * entry of it by column name.
 */
export default class Item {
  /** Page value handed to the constructor for this item. */
  page: number;
  /** Raw key/value payload handed to the constructor. */
  data: object;

  /**
   * @param page - page value to associate with this item
   * @param data - key/value payload; stored by reference, not copied
   */
  constructor(page: number, data: object) {
    this.page = page;
    this.data = data;
  }

  /**
   * Reads the entry stored under `column` in `data`.
   *
   * @param column - key to look up in `data`
   * @returns whatever `data` holds under that key; at runtime this is
   *   `undefined` when the key is absent
   */
  value(column: string): object {
    // `object` has no index signature, so a direct `this.data[column]` is an
    // implicit-any element access that strict mode rejects. Read it through an
    // explicit string-keyed view instead; the runtime lookup is unchanged.
    const record = this.data as Record<string, unknown>;
    return record[column] as object;
  }
}
|
@ -1,12 +1,20 @@
|
||||
import Item from './Item';
|
||||
import type Metadata from './Metadata';
|
||||
import type ParsedPage from './ParsedPage';
|
||||
|
||||
export default class ParseResult {
|
||||
pdfPages: any[];
|
||||
metadata: Metadata;
|
||||
pages: ParsedPage[];
|
||||
columns: string[];
|
||||
items: Item[];
|
||||
|
||||
constructor(metadata: Metadata, pages: ParsedPage[]) {
|
||||
constructor(pdfPages: any[], metadata: Metadata, columns: string[], items: Item[]) {
|
||||
this.pdfPages = pdfPages;
|
||||
this.metadata = metadata;
|
||||
this.pages = pages;
|
||||
this.columns = columns;
|
||||
this.items = items;
|
||||
}
|
||||
|
||||
pageCount(): number {
|
||||
return this.pdfPages.length;
|
||||
}
|
||||
}
|
||||
|
@ -2,12 +2,12 @@ import type ParsedPageItem from './ParsedPageItem';
|
||||
|
||||
export default class ParsedPage {
|
||||
index: number;
|
||||
viewPortTransform: number[];
|
||||
pdfPage: any;
|
||||
items: ParsedPageItem[];
|
||||
|
||||
constructor(index: number, viewPortTransform: number[], items: ParsedPageItem[]) {
|
||||
constructor(index: number, pdfPage: any, items: ParsedPageItem[]) {
|
||||
this.index = index;
|
||||
this.viewPortTransform = viewPortTransform;
|
||||
this.pdfPage = pdfPage;
|
||||
this.items = items;
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
import Item from './Item';
|
||||
import Metadata from './Metadata';
|
||||
import ParsedPage from './ParsedPage';
|
||||
import type ParseReporter from './ParseReporter';
|
||||
@ -10,6 +11,7 @@ import type TextItem from './TextItem';
|
||||
*/
|
||||
export default class PdfParser {
|
||||
pdfjs: any;
|
||||
// Keys handed to ParseResult as its column list; each must match a key of the item data ('transform', not 'transfom').
columns = ['str', 'dir', 'width', 'height', 'transform', 'fontName'];
|
||||
constructor(pdfjs: any) {
|
||||
this.pdfjs = pdfjs;
|
||||
}
|
||||
@ -43,14 +45,17 @@ export default class PdfParser {
|
||||
this.extractPagesSequentially(pdfDocument, reporter),
|
||||
]);
|
||||
})
|
||||
.then(([metadata, pages]) => new ParseResult(new Metadata(metadata), pages));
|
||||
.then(([metadata, pages]) => {
|
||||
const pdfPages = pages.map((page) => page.pdfPage);
|
||||
const items = pages.reduce((allItems, page) => allItems.concat(page.items), []);
|
||||
return new ParseResult(pdfPages, new Metadata(metadata), this.columns, items);
|
||||
});
|
||||
}
|
||||
|
||||
private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> {
|
||||
return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
||||
return accumulatorPromise.then((accumulatedResults) => {
|
||||
return pdfDocument.getPage(index + 1).then((page) => {
|
||||
const viewport = page.getViewport({ scale: 1.0 });
|
||||
return this.triggerFontRetrieval(page).then(() =>
|
||||
page
|
||||
.getTextContent({
|
||||
@ -58,8 +63,9 @@ export default class PdfParser {
|
||||
disableCombineTextItems: true,
|
||||
})
|
||||
.then((textContent) => {
|
||||
const items = textContent.items.map((rawItem) => new Item(index, rawItem));
|
||||
reporter.parsedPage(index);
|
||||
return [...accumulatedResults, new ParsedPage(index, viewport.transform, textContent.items)];
|
||||
return [...accumulatedResults, new ParsedPage(index, page, items)];
|
||||
}),
|
||||
);
|
||||
});
|
||||
@ -119,7 +125,7 @@ export default class PdfParser {
|
||||
// console.log('Parsed result:', r.length);
|
||||
// console.log('Parsed result:', r);
|
||||
|
||||
return new ParseResult(new Metadata(metadata), r);
|
||||
return new ParseResult([], new Metadata(metadata), [], []);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
@ -3,129 +3,164 @@ import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
|
||||
import * as fs from 'fs';
|
||||
import ParseProgressReporter from 'src/ParseProgressReporter';
|
||||
import Progress from 'src/Progress';
|
||||
import Item from 'src/Item';
|
||||
|
||||
const parser = new PdfParser(pdfjs);
|
||||
|
||||
test('basic example PDF parse', async () => {
|
||||
const progressUpdates: Progress[] = [];
|
||||
const data = fs.readFileSync('../examples/ExamplePdf.pdf', null);
|
||||
|
||||
// to test
|
||||
const result = await parser.parseBytes(
|
||||
data,
|
||||
new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)),
|
||||
);
|
||||
|
||||
// verify pages
|
||||
const expectedPages = 7;
|
||||
expect(result.metadata.title()).toEqual('ExamplePdf');
|
||||
expect(result.metadata.author()).toEqual('Johannes Zillmann');
|
||||
expect(result.pages.length).toBe(expectedPages);
|
||||
expect(result.pages[0].index).toBe(0);
|
||||
expect(result.pages[0].viewPortTransform).toEqual([1, 0, 0, -1, 0, 841.8898]);
|
||||
expect(result.pages[0].items).toEqual([
|
||||
{
|
||||
expect(result.pageCount()).toBe(expectedPages);
|
||||
result.pdfPages.forEach((pdfPage, i) => {
|
||||
expect(pdfPage._pageIndex).toBe(i);
|
||||
});
|
||||
expect(result.pdfPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]);
|
||||
expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
|
||||
|
||||
// verify first n items
|
||||
expect(result.items.slice(0, 16)).toEqual([
|
||||
new Item(0, {
|
||||
str: 'Mega Überschrift',
|
||||
dir: 'ltr',
|
||||
width: 245.05800000000005,
|
||||
height: 30,
|
||||
transform: [30, 0, 0, 30, 175, 756],
|
||||
fontName: 'g_d0_f1',
|
||||
},
|
||||
{
|
||||
}),
|
||||
new Item(0, {
|
||||
str: '2te Überschrift',
|
||||
dir: 'ltr',
|
||||
width: 130.056,
|
||||
height: 20,
|
||||
transform: [20, 0, 0, 20, 233, 665],
|
||||
fontName: 'g_d0_f2',
|
||||
},
|
||||
{
|
||||
}),
|
||||
new Item(0, {
|
||||
str: 'Dies ist eine Test-PDF',
|
||||
dir: 'ltr',
|
||||
width: 108.61950000000003,
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 240, 585],
|
||||
fontName: 'g_d0_f2',
|
||||
},
|
||||
{
|
||||
}),
|
||||
new Item(0, {
|
||||
str: '.',
|
||||
dir: 'ltr',
|
||||
width: 3.0580000000000003,
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 352.6927, 585],
|
||||
fontName: 'g_d0_f2',
|
||||
},
|
||||
{
|
||||
}),
|
||||
new Item(0, {
|
||||
str: '1',
|
||||
dir: 'ltr',
|
||||
width: 4.077333704,
|
||||
height: 7.333334,
|
||||
transform: [7.333334, 0, 0, 7.333334, 348, 588],
|
||||
fontName: 'g_d0_f2',
|
||||
},
|
||||
{
|
||||
}),
|
||||
new Item(0, {
|
||||
str: 'Für’s Testen des ',
|
||||
dir: 'ltr',
|
||||
width: 83.7826,
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 208, 572],
|
||||
fontName: 'g_d0_f2',
|
||||
},
|
||||
{
|
||||
}),
|
||||
new Item(0, {
|
||||
str: 'Markdown Parsers',
|
||||
dir: 'ltr',
|
||||
width: 91.6982,
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 291.77832, 572],
|
||||
fontName: 'g_d0_f2',
|
||||
},
|
||||
{
|
||||
}),
|
||||
new Item(0, {
|
||||
str: '.',
|
||||
dir: 'ltr',
|
||||
width: 3.0580000000000003,
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 383.47360000000003, 572],
|
||||
fontName: 'g_d0_f2',
|
||||
},
|
||||
{
|
||||
}),
|
||||
new Item(0, {
|
||||
str: ' ',
|
||||
dir: 'ltr',
|
||||
width: 3.0580000000000003,
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 61.078451, 59],
|
||||
fontName: 'g_d0_f2',
|
||||
},
|
||||
{
|
||||
}),
|
||||
new Item(0, {
|
||||
str: 'In Deutsch.',
|
||||
dir: 'ltr',
|
||||
width: 55.64240000000001,
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 64.134603, 59],
|
||||
fontName: 'g_d0_f2',
|
||||
},
|
||||
{
|
||||
}),
|
||||
new Item(0, {
|
||||
str: '1',
|
||||
dir: 'ltr',
|
||||
width: 4.077333704,
|
||||
height: 7.333334,
|
||||
transform: [7.333334, 0, 0, 7.333334, 57, 62],
|
||||
fontName: 'g_d0_f2',
|
||||
},
|
||||
{
|
||||
}),
|
||||
new Item(0, {
|
||||
str: '\x00',
|
||||
dir: 'ltr',
|
||||
width: 0,
|
||||
height: 12,
|
||||
transform: [12, 0, 0, 12, 294, 45],
|
||||
fontName: 'g_d0_f3',
|
||||
},
|
||||
{
|
||||
}),
|
||||
new Item(0, {
|
||||
str: '1',
|
||||
dir: 'ltr',
|
||||
width: 6.672000000000001,
|
||||
height: 12,
|
||||
transform: [12, 0, 0, 12, 294, 45],
|
||||
fontName: 'g_d0_f2',
|
||||
},
|
||||
}),
|
||||
new Item(1, {
|
||||
str: '\x00',
|
||||
dir: 'ltr',
|
||||
width: 0,
|
||||
height: 12,
|
||||
transform: [12, 0, 0, 12, 294, 45],
|
||||
fontName: 'g_d0_f3',
|
||||
}),
|
||||
new Item(1, {
|
||||
str: '2',
|
||||
dir: 'ltr',
|
||||
width: 6.672000000000001,
|
||||
height: 12,
|
||||
transform: [12, 0, 0, 12, 294, 45],
|
||||
fontName: 'g_d0_f2',
|
||||
}),
|
||||
new Item(2, {
|
||||
str: 'Paragraphen',
|
||||
dir: 'ltr',
|
||||
width: 110.04479999999998,
|
||||
height: 18,
|
||||
transform: [18, 0, 0, 18, 57, 767],
|
||||
fontName: 'g_d0_f1',
|
||||
}),
|
||||
]);
|
||||
|
||||
// verify progress
|
||||
expect(progressUpdates.length).toBe(expectedPages + 2);
|
||||
progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts']));
|
||||
expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]);
|
||||
|
@ -1,22 +1,15 @@
|
||||
<script>
|
||||
import type ParseResult from 'pdf-to-markdown-core/lib/src/ParseResult';
|
||||
import type Item from '@core/Item';
|
||||
import Table from './Table.svelte';
|
||||
|
||||
export let parseResult: ParseResult;
|
||||
console.log(parseResult.metadata);
|
||||
</script>
|
||||
|
||||
<div>
|
||||
Parsed
|
||||
{parseResult.pages.length}
|
||||
pages with
|
||||
{parseResult.pages.reduce((count, page) => count + page.items.length, 0)}
|
||||
items
|
||||
</div>
|
||||
<div class="mb-4">
|
||||
<div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
|
||||
<div>Title: {parseResult.metadata.title()}</div>
|
||||
<div>Author: {parseResult.metadata.author()}</div>
|
||||
</div>
|
||||
|
||||
{#each parseResult.pages as page}
|
||||
<div class="text-2xl font-semibold my-5">Page {page.index}</div>
|
||||
<Table items={page.items} />
|
||||
{/each}
|
||||
<Table columns={parseResult.columns} items={parseResult.items} />
|
||||
|
@ -1,52 +1,67 @@
|
||||
<script>
|
||||
import type ParsedPageItem from '@core/ParsedPageItem';
|
||||
import type Item from '@core/Item';
|
||||
|
||||
const headers = ['ID', 'Text', 'Font', 'Direction', 'Width', 'Height', 'Transform'];
|
||||
export let items: ParsedPageItem[];
|
||||
export let columns: string[];
|
||||
export let items: Item[];
|
||||
</script>
|
||||
|
||||
<div class="static">
|
||||
<div class="grid" style="grid-template-columns:repeat({headers.length}, auto)">
|
||||
{#each headers as header}
|
||||
<div class="header">{header}</div>
|
||||
<table class="w-full text-left">
|
||||
<thead class=" ">
|
||||
<th />
|
||||
<th>#</th>
|
||||
{#each columns as column}
|
||||
<th>{column}</th>
|
||||
{/each}
|
||||
{#each items as item, i}
|
||||
<div class="row contents">
|
||||
<div class="cell">{i + 1}</div>
|
||||
<div class="cell">{item.str}</div>
|
||||
<div class="cell">{item.fontName}</div>
|
||||
<div class="cell">{item.dir}</div>
|
||||
<div class="cell">{item.width}</div>
|
||||
<div class="cell">{item.height}</div>
|
||||
<div class="cell">{item.transform.join(', ')}</div>
|
||||
</div>
|
||||
</thead>
|
||||
<tbody>
|
||||
{#each items as item, idx}
|
||||
{#if idx > 0 && item.page !== items[idx - 1].page}
|
||||
<tr class="h-5 bg-blue-200" />
|
||||
{/if}
|
||||
<tr class="">
|
||||
{#if idx === 0 || item.page !== items[idx - 1].page}
|
||||
<td class="page bg-gray-50">Page {item.page}</td>
|
||||
{:else}
|
||||
<td />
|
||||
{/if}
|
||||
<td class="">{idx}</td>
|
||||
{#each columns as column}
|
||||
<td class="borde2r">{item.data[column]}</td>
|
||||
{/each}
|
||||
</div>
|
||||
</div>
|
||||
</tr>
|
||||
{/each}
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<style>
|
||||
.grid {
|
||||
width: 100%;
|
||||
max-height: 100vh;
|
||||
display: grid;
|
||||
grid-auto-rows: min-content;
|
||||
overflow-y: auto;
|
||||
border: 1px solid #e3e4e4;
|
||||
border-left: none;
|
||||
}
|
||||
.header {
|
||||
@apply bg-gray-300;
|
||||
th {
|
||||
@apply px-1;
|
||||
position: -webkit-sticky;
|
||||
position: sticky;
|
||||
top: 0;
|
||||
padding: 5px;
|
||||
border-bottom: 1px solid #e3e4e4;
|
||||
z-index: 2;
|
||||
}
|
||||
.row:hover > div {
|
||||
th:not(:first-child) {
|
||||
@apply bg-gray-300;
|
||||
@apply shadow;
|
||||
}
|
||||
td:not(:first-child) {
|
||||
@apply px-1;
|
||||
@apply border-b;
|
||||
}
|
||||
|
||||
tr:hover td:not(:first-child) {
|
||||
@apply bg-gray-200;
|
||||
}
|
||||
|
||||
.cell {
|
||||
@apply pl-1;
|
||||
border-left: 1px solid #e3e4e4;
|
||||
.page {
|
||||
@apply text-lg;
|
||||
@apply font-semibold;
|
||||
@apply pr-4;
|
||||
@apply whitespace-nowrap;
|
||||
position: -webkit-sticky;
|
||||
position: sticky;
|
||||
top: 0;
|
||||
z-index: 2;
|
||||
}
|
||||
</style>
|
||||
|
Loading…
x
Reference in New Issue
Block a user