mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-31 18:09:16 +01:00
Transform messages
This commit is contained in:
parent
4401f1fb5c
commit
a98a862a58
@ -1,7 +1,7 @@
|
||||
import { assert } from './assert';
|
||||
import Item from './Item';
|
||||
import ItemResult from './ItemResult';
|
||||
import ItemTransformer from './transformer/ItemTransformer';
|
||||
import ParseResult from './ParseResult';
|
||||
import { calculateSchemas } from './transformer/transformerUtil';
|
||||
import TransformContext from './transformer/TransformContext';
|
||||
|
||||
@ -11,7 +11,7 @@ export default class Debugger {
|
||||
transformers: ItemTransformer[];
|
||||
stageNames: string[];
|
||||
stageSchema: string[][];
|
||||
private stageItems: Item[][];
|
||||
private stageResultCache: ItemResult[];
|
||||
|
||||
constructor(
|
||||
initialSchema: string[],
|
||||
@ -23,19 +23,21 @@ export default class Debugger {
|
||||
this.transformers = transformers;
|
||||
this.context = context;
|
||||
this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
|
||||
this.stageItems = [initialItems];
|
||||
this.stageResultCache = [{ items: initialItems, messages: [`Parsed ${initialItems[initialItems.length-1].page+1} pages with ${initialItems.length} items`] }];
|
||||
this.stageSchema = calculateSchemas(initialSchema, transformers);
|
||||
}
|
||||
|
||||
//TODO return MarkedItem ? (removed, added, etc..)?
|
||||
//TODO StageResult == class with schema and marked items ?
|
||||
stageResults(stageIndex: number): Item[] {
|
||||
stageResults(stageIndex: number): ItemResult {
|
||||
for (let idx = 0; idx < stageIndex + 1; idx++) {
|
||||
if (!this.stageItems[idx]) {
|
||||
const stageItems = this.transformers[idx - 1].transform(this.context, this.stageItems[idx - 1]);
|
||||
this.stageItems.push(stageItems);
|
||||
if (!this.stageResultCache[idx]) {
|
||||
const stageResult = this.transformers[idx - 1].transform(this.context, [
|
||||
...this.stageResultCache[idx - 1].items,
|
||||
]);
|
||||
this.stageResultCache.push(stageResult);
|
||||
}
|
||||
}
|
||||
return this.stageItems[stageIndex];
|
||||
return this.stageResultCache[stageIndex];
|
||||
}
|
||||
}
|
||||
|
6
core/src/ItemResult.ts
Normal file
6
core/src/ItemResult.ts
Normal file
@ -0,0 +1,6 @@
|
||||
import type Item from './Item';
|
||||
|
||||
export default interface ItemResult {
|
||||
items: Item[];
|
||||
messages: string[];
|
||||
}
|
@ -2,8 +2,6 @@ import Item from './Item';
|
||||
import Metadata from './Metadata';
|
||||
import type ParseReporter from './ParseReporter';
|
||||
import ParseResult from './ParseResult';
|
||||
import TextDirection from './TextDirection';
|
||||
import type TextItem from './TextItem';
|
||||
|
||||
/**
|
||||
* Parses a PDF via PDFJS and returns a ParseResult which contains more or less the original data from PDFJS.
|
||||
@ -86,59 +84,59 @@ export default class PdfParser {
|
||||
return page.getOperatorList();
|
||||
}
|
||||
|
||||
async parseOld(data: Uint8Array): Promise<ParseResult> {
|
||||
return this.pdfjs
|
||||
.getDocument({
|
||||
data,
|
||||
cMapUrl: 'cmaps/',
|
||||
cMapPacked: true,
|
||||
})
|
||||
.promise.then((pdfDocument) => {
|
||||
// console.log('result', pdfDocument);
|
||||
const result = [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
||||
return accumulatorPromise.then((accumulatedResults) => {
|
||||
// console.log('Parsing page ' + index);
|
||||
return pdfDocument.getPage(index + 1).then((page) => {
|
||||
const viewport = page.getViewport({ scale: 1.0 });
|
||||
console.log(viewport);
|
||||
// async parseOld(data: Uint8Array): Promise<ParseResult> {
|
||||
// return this.pdfjs
|
||||
// .getDocument({
|
||||
// data,
|
||||
// cMapUrl: 'cmaps/',
|
||||
// cMapPacked: true,
|
||||
// })
|
||||
// .promise.then((pdfDocument) => {
|
||||
// // console.log('result', pdfDocument);
|
||||
// const result = [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
||||
// return accumulatorPromise.then((accumulatedResults) => {
|
||||
// // console.log('Parsing page ' + index);
|
||||
// return pdfDocument.getPage(index + 1).then((page) => {
|
||||
// const viewport = page.getViewport({ scale: 1.0 });
|
||||
// console.log(viewport);
|
||||
|
||||
return this.triggerFontRetrieval(page).then(() =>
|
||||
page.getTextContent().then((textContent) => {
|
||||
// console.log(textContent);
|
||||
const textItems: TextItem[] = textContent.items.map((item) => {
|
||||
const tx = this.pdfjs.Util.transform(viewport.transform, item.transform);
|
||||
const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
|
||||
const dividedHeight = item.height / fontHeight;
|
||||
// return this.triggerFontRetrieval(page).then(() =>
|
||||
// page.getTextContent().then((textContent) => {
|
||||
// // console.log(textContent);
|
||||
// const textItems: TextItem[] = textContent.items.map((item) => {
|
||||
// const tx = this.pdfjs.Util.transform(viewport.transform, item.transform);
|
||||
// const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
|
||||
// const dividedHeight = item.height / fontHeight;
|
||||
|
||||
return {
|
||||
x: Math.round(item.transform[4]),
|
||||
y: Math.round(item.transform[5]),
|
||||
width: Math.round(item.width),
|
||||
height: Math.round(
|
||||
Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight,
|
||||
),
|
||||
text: item.str,
|
||||
textDirection: TextDirection.fromPdfJs(item.dir),
|
||||
fontId: item.fontName,
|
||||
};
|
||||
});
|
||||
// return {
|
||||
// x: Math.round(item.transform[4]),
|
||||
// y: Math.round(item.transform[5]),
|
||||
// width: Math.round(item.width),
|
||||
// height: Math.round(
|
||||
// Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight,
|
||||
// ),
|
||||
// text: item.str,
|
||||
// textDirection: TextDirection.fromPdfJs(item.dir),
|
||||
// fontId: item.fontName,
|
||||
// };
|
||||
// });
|
||||
|
||||
return [...accumulatedResults, ...textItems];
|
||||
}),
|
||||
);
|
||||
});
|
||||
});
|
||||
}, Promise.resolve([]));
|
||||
return Promise.all([pdfDocument.getMetadata(), result]);
|
||||
})
|
||||
.then(([metadata, r]) => {
|
||||
// console.log('Parsed metadata:', metadata);
|
||||
// console.log('Parsed result:', r.length);
|
||||
// console.log('Parsed result:', r);
|
||||
// return [...accumulatedResults, ...textItems];
|
||||
// }),
|
||||
// );
|
||||
// });
|
||||
// });
|
||||
// }, Promise.resolve([]));
|
||||
// return Promise.all([pdfDocument.getMetadata(), result]);
|
||||
// })
|
||||
// .then(([metadata, r]) => {
|
||||
// // console.log('Parsed metadata:', metadata);
|
||||
// // console.log('Parsed result:', r.length);
|
||||
// // console.log('Parsed result:', r);
|
||||
|
||||
return {};
|
||||
});
|
||||
}
|
||||
// return {};
|
||||
// });
|
||||
// }
|
||||
}
|
||||
|
||||
interface ParsedPage {
|
||||
|
@ -1,5 +1,6 @@
|
||||
import PageViewport from 'src/parse/PageViewport';
|
||||
import Item from '../Item';
|
||||
import ItemResult from '../ItemResult';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
|
||||
@ -10,11 +11,12 @@ export default class AdjustHeight extends ItemTransformer {
|
||||
});
|
||||
}
|
||||
|
||||
transform(context: TransformContext, items: Item[]): Item[] {
|
||||
transform(context: TransformContext, items: Item[]): ItemResult {
|
||||
const newItems: Item[] = [];
|
||||
let page = -1;
|
||||
let pageViewport: PageViewport;
|
||||
//TODO groupBy page
|
||||
let correctedHeights = 0;
|
||||
items.forEach((item) => {
|
||||
if (item.page !== page) {
|
||||
pageViewport = context.pageViewports[item.page];
|
||||
@ -27,11 +29,12 @@ export default class AdjustHeight extends ItemTransformer {
|
||||
const dividedHeight = itemHeight / fontHeight;
|
||||
const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight;
|
||||
if (newHeight !== itemHeight) {
|
||||
correctedHeights++;
|
||||
newItems.push(item.withDataAddition({ height: newHeight }));
|
||||
} else {
|
||||
newItems.push(item);
|
||||
}
|
||||
});
|
||||
return items;
|
||||
return { items, messages: [`${correctedHeights} corrected heights`] };
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
import Item from '../Item';
|
||||
import ItemResult from '../ItemResult';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
|
||||
@ -11,9 +12,12 @@ export default class CalculateCoordinates extends ItemTransformer {
|
||||
});
|
||||
}
|
||||
|
||||
transform(context: TransformContext, items: Item[]): Item[] {
|
||||
transform(context: TransformContext, items: Item[]): ItemResult {
|
||||
// const transform: number[] = item.value['Transform'];
|
||||
items.shift();
|
||||
return items;
|
||||
if(items[0]){
|
||||
items[0].data['fontName']='xxx';
|
||||
}
|
||||
return { items, messages: [] };
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
import TransformerDescription from '../TransformerDescription';
|
||||
import type Item from '../Item';
|
||||
import TransformContext from './TransformContext';
|
||||
import ItemResult from 'src/ItemResult';
|
||||
|
||||
export default abstract class ItemTransformer {
|
||||
readonly name: string;
|
||||
@ -21,5 +22,5 @@ export default abstract class ItemTransformer {
|
||||
}
|
||||
|
||||
// columnar-changes: described
|
||||
abstract transform(context: TransformContext, items: Item[]): Item[];
|
||||
abstract transform(context: TransformContext, items: Item[]): ItemResult;
|
||||
}
|
||||
|
@ -6,7 +6,7 @@
|
||||
</script>
|
||||
|
||||
<div class="text-2xl font-semibold font-serif text-center bg-gray-400">PDF to Markdown Converter</div>
|
||||
<main class="mt-5 h-full">
|
||||
<main class="mt-2 h-full">
|
||||
{#if $debug}
|
||||
<DebugView debug={$debug} />
|
||||
{:else}
|
||||
|
@ -14,12 +14,12 @@
|
||||
$: canNext = currentStage + 1 < stageNames.length;
|
||||
$: canPrev = currentStage > 0;
|
||||
$: stageSchema = debug.stageSchema[currentStage];
|
||||
$: stageItems = debug.stageResults(currentStage);
|
||||
$: stageResult = debug.stageResults(currentStage);
|
||||
$: pageFocus = !isNaN(focusedPage);
|
||||
$: pagesNumbers = new Set(stageItems.map((item) => item.page));
|
||||
$: pagesNumbers = new Set(stageResult.items.map((item) => item.page));
|
||||
$: maxPage = Math.max(...pagesNumbers);
|
||||
$: itemsByPage = [
|
||||
...stageItems.reduce((map, item) => {
|
||||
...stageResult.items.reduce((map, item) => {
|
||||
if (!map.has(item.page)) {
|
||||
map.set(item.page, []);
|
||||
}
|
||||
@ -41,11 +41,9 @@
|
||||
</script>
|
||||
|
||||
<div class="mx-4">
|
||||
<div class="mb-4">
|
||||
<!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
|
||||
<div>Title: {parseResult.metadata.title()}</div>
|
||||
<div>Author: {parseResult.metadata.author()}</div> -->
|
||||
</div>
|
||||
|
||||
<!-- Sticky Controls -->
|
||||
<div class="controls py-2">
|
||||
@ -89,10 +87,18 @@
|
||||
<span on:click={() => canNext && currentStage++}>
|
||||
<ArrowRight size="1x" class={canNext ? 'hover:text-green-700 cursor-pointer' : 'opacity-50'} />
|
||||
</span>
|
||||
<div>{stageNames[currentStage]}</div>
|
||||
<div class="cursor-pointer hover:underline">{stageNames[currentStage]}</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Stage Messages -->
|
||||
<ul class="list-disc list-inside mb-2 p-2 bg-yellow-100 rounded shadow text-sm">
|
||||
{#each stageResult.messages as message}
|
||||
<li>{message}</li>
|
||||
{/each}
|
||||
</ul>
|
||||
|
||||
<!-- Items -->
|
||||
<ItemTable schema={stageSchema} itemsByPage={visiblePages} {maxPage} {pageFocus} />
|
||||
</div>
|
||||
|
||||
|
@ -51,7 +51,7 @@
|
||||
{/if}
|
||||
<td>{itemIdx}</td>
|
||||
{#each schema as column}
|
||||
<td>{format(item.data[column])}</td>
|
||||
<td class="select-all">{format(item.data[column])}</td>
|
||||
{/each}
|
||||
</tr>
|
||||
{/each}
|
||||
|
Loading…
Reference in New Issue
Block a user