mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-31 18:09:16 +01:00
Transform messages
This commit is contained in:
parent
4401f1fb5c
commit
a98a862a58
@ -1,7 +1,7 @@
|
|||||||
import { assert } from './assert';
|
import { assert } from './assert';
|
||||||
import Item from './Item';
|
import Item from './Item';
|
||||||
|
import ItemResult from './ItemResult';
|
||||||
import ItemTransformer from './transformer/ItemTransformer';
|
import ItemTransformer from './transformer/ItemTransformer';
|
||||||
import ParseResult from './ParseResult';
|
|
||||||
import { calculateSchemas } from './transformer/transformerUtil';
|
import { calculateSchemas } from './transformer/transformerUtil';
|
||||||
import TransformContext from './transformer/TransformContext';
|
import TransformContext from './transformer/TransformContext';
|
||||||
|
|
||||||
@ -11,7 +11,7 @@ export default class Debugger {
|
|||||||
transformers: ItemTransformer[];
|
transformers: ItemTransformer[];
|
||||||
stageNames: string[];
|
stageNames: string[];
|
||||||
stageSchema: string[][];
|
stageSchema: string[][];
|
||||||
private stageItems: Item[][];
|
private stageResultCache: ItemResult[];
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
initialSchema: string[],
|
initialSchema: string[],
|
||||||
@ -23,19 +23,21 @@ export default class Debugger {
|
|||||||
this.transformers = transformers;
|
this.transformers = transformers;
|
||||||
this.context = context;
|
this.context = context;
|
||||||
this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
|
this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
|
||||||
this.stageItems = [initialItems];
|
this.stageResultCache = [{ items: initialItems, messages: [`Parsed ${initialItems[initialItems.length-1].page+1} pages with ${initialItems.length} items`] }];
|
||||||
this.stageSchema = calculateSchemas(initialSchema, transformers);
|
this.stageSchema = calculateSchemas(initialSchema, transformers);
|
||||||
}
|
}
|
||||||
|
|
||||||
//TODO return MarkedItem ? (removed, added, etc..)?
|
//TODO return MarkedItem ? (removed, added, etc..)?
|
||||||
//TODO StageResult == class with schema and marked items ?
|
//TODO StageResult == class with schema and marked items ?
|
||||||
stageResults(stageIndex: number): Item[] {
|
stageResults(stageIndex: number): ItemResult {
|
||||||
for (let idx = 0; idx < stageIndex + 1; idx++) {
|
for (let idx = 0; idx < stageIndex + 1; idx++) {
|
||||||
if (!this.stageItems[idx]) {
|
if (!this.stageResultCache[idx]) {
|
||||||
const stageItems = this.transformers[idx - 1].transform(this.context, this.stageItems[idx - 1]);
|
const stageResult = this.transformers[idx - 1].transform(this.context, [
|
||||||
this.stageItems.push(stageItems);
|
...this.stageResultCache[idx - 1].items,
|
||||||
|
]);
|
||||||
|
this.stageResultCache.push(stageResult);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return this.stageItems[stageIndex];
|
return this.stageResultCache[stageIndex];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
6
core/src/ItemResult.ts
Normal file
6
core/src/ItemResult.ts
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
import type Item from './Item';
|
||||||
|
|
||||||
|
export default interface ItemResult {
|
||||||
|
items: Item[];
|
||||||
|
messages: string[];
|
||||||
|
}
|
@ -2,8 +2,6 @@ import Item from './Item';
|
|||||||
import Metadata from './Metadata';
|
import Metadata from './Metadata';
|
||||||
import type ParseReporter from './ParseReporter';
|
import type ParseReporter from './ParseReporter';
|
||||||
import ParseResult from './ParseResult';
|
import ParseResult from './ParseResult';
|
||||||
import TextDirection from './TextDirection';
|
|
||||||
import type TextItem from './TextItem';
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parses a PDF via PDFJS and returns a ParseResult which contains more or less the original data from PDFJS.
|
* Parses a PDF via PDFJS and returns a ParseResult which contains more or less the original data from PDFJS.
|
||||||
@ -86,59 +84,59 @@ export default class PdfParser {
|
|||||||
return page.getOperatorList();
|
return page.getOperatorList();
|
||||||
}
|
}
|
||||||
|
|
||||||
async parseOld(data: Uint8Array): Promise<ParseResult> {
|
// async parseOld(data: Uint8Array): Promise<ParseResult> {
|
||||||
return this.pdfjs
|
// return this.pdfjs
|
||||||
.getDocument({
|
// .getDocument({
|
||||||
data,
|
// data,
|
||||||
cMapUrl: 'cmaps/',
|
// cMapUrl: 'cmaps/',
|
||||||
cMapPacked: true,
|
// cMapPacked: true,
|
||||||
})
|
// })
|
||||||
.promise.then((pdfDocument) => {
|
// .promise.then((pdfDocument) => {
|
||||||
// console.log('result', pdfDocument);
|
// // console.log('result', pdfDocument);
|
||||||
const result = [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
// const result = [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
||||||
return accumulatorPromise.then((accumulatedResults) => {
|
// return accumulatorPromise.then((accumulatedResults) => {
|
||||||
// console.log('Parsing page ' + index);
|
// // console.log('Parsing page ' + index);
|
||||||
return pdfDocument.getPage(index + 1).then((page) => {
|
// return pdfDocument.getPage(index + 1).then((page) => {
|
||||||
const viewport = page.getViewport({ scale: 1.0 });
|
// const viewport = page.getViewport({ scale: 1.0 });
|
||||||
console.log(viewport);
|
// console.log(viewport);
|
||||||
|
|
||||||
return this.triggerFontRetrieval(page).then(() =>
|
// return this.triggerFontRetrieval(page).then(() =>
|
||||||
page.getTextContent().then((textContent) => {
|
// page.getTextContent().then((textContent) => {
|
||||||
// console.log(textContent);
|
// // console.log(textContent);
|
||||||
const textItems: TextItem[] = textContent.items.map((item) => {
|
// const textItems: TextItem[] = textContent.items.map((item) => {
|
||||||
const tx = this.pdfjs.Util.transform(viewport.transform, item.transform);
|
// const tx = this.pdfjs.Util.transform(viewport.transform, item.transform);
|
||||||
const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
|
// const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
|
||||||
const dividedHeight = item.height / fontHeight;
|
// const dividedHeight = item.height / fontHeight;
|
||||||
|
|
||||||
return {
|
// return {
|
||||||
x: Math.round(item.transform[4]),
|
// x: Math.round(item.transform[4]),
|
||||||
y: Math.round(item.transform[5]),
|
// y: Math.round(item.transform[5]),
|
||||||
width: Math.round(item.width),
|
// width: Math.round(item.width),
|
||||||
height: Math.round(
|
// height: Math.round(
|
||||||
Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight,
|
// Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight,
|
||||||
),
|
// ),
|
||||||
text: item.str,
|
// text: item.str,
|
||||||
textDirection: TextDirection.fromPdfJs(item.dir),
|
// textDirection: TextDirection.fromPdfJs(item.dir),
|
||||||
fontId: item.fontName,
|
// fontId: item.fontName,
|
||||||
};
|
// };
|
||||||
});
|
// });
|
||||||
|
|
||||||
return [...accumulatedResults, ...textItems];
|
// return [...accumulatedResults, ...textItems];
|
||||||
}),
|
// }),
|
||||||
);
|
// );
|
||||||
});
|
// });
|
||||||
});
|
// });
|
||||||
}, Promise.resolve([]));
|
// }, Promise.resolve([]));
|
||||||
return Promise.all([pdfDocument.getMetadata(), result]);
|
// return Promise.all([pdfDocument.getMetadata(), result]);
|
||||||
})
|
// })
|
||||||
.then(([metadata, r]) => {
|
// .then(([metadata, r]) => {
|
||||||
// console.log('Parsed metadata:', metadata);
|
// // console.log('Parsed metadata:', metadata);
|
||||||
// console.log('Parsed result:', r.length);
|
// // console.log('Parsed result:', r.length);
|
||||||
// console.log('Parsed result:', r);
|
// // console.log('Parsed result:', r);
|
||||||
|
|
||||||
return {};
|
// return {};
|
||||||
});
|
// });
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
interface ParsedPage {
|
interface ParsedPage {
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import PageViewport from 'src/parse/PageViewport';
|
import PageViewport from 'src/parse/PageViewport';
|
||||||
import Item from '../Item';
|
import Item from '../Item';
|
||||||
|
import ItemResult from '../ItemResult';
|
||||||
import ItemTransformer from './ItemTransformer';
|
import ItemTransformer from './ItemTransformer';
|
||||||
import TransformContext from './TransformContext';
|
import TransformContext from './TransformContext';
|
||||||
|
|
||||||
@ -10,11 +11,12 @@ export default class AdjustHeight extends ItemTransformer {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(context: TransformContext, items: Item[]): Item[] {
|
transform(context: TransformContext, items: Item[]): ItemResult {
|
||||||
const newItems: Item[] = [];
|
const newItems: Item[] = [];
|
||||||
let page = -1;
|
let page = -1;
|
||||||
let pageViewport: PageViewport;
|
let pageViewport: PageViewport;
|
||||||
//TODO groupBy page
|
//TODO groupBy page
|
||||||
|
let correctedHeights = 0;
|
||||||
items.forEach((item) => {
|
items.forEach((item) => {
|
||||||
if (item.page !== page) {
|
if (item.page !== page) {
|
||||||
pageViewport = context.pageViewports[item.page];
|
pageViewport = context.pageViewports[item.page];
|
||||||
@ -27,11 +29,12 @@ export default class AdjustHeight extends ItemTransformer {
|
|||||||
const dividedHeight = itemHeight / fontHeight;
|
const dividedHeight = itemHeight / fontHeight;
|
||||||
const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight;
|
const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight;
|
||||||
if (newHeight !== itemHeight) {
|
if (newHeight !== itemHeight) {
|
||||||
|
correctedHeights++;
|
||||||
newItems.push(item.withDataAddition({ height: newHeight }));
|
newItems.push(item.withDataAddition({ height: newHeight }));
|
||||||
} else {
|
} else {
|
||||||
newItems.push(item);
|
newItems.push(item);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return items;
|
return { items, messages: [`${correctedHeights} corrected heights`] };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import Item from '../Item';
|
import Item from '../Item';
|
||||||
|
import ItemResult from '../ItemResult';
|
||||||
import ItemTransformer from './ItemTransformer';
|
import ItemTransformer from './ItemTransformer';
|
||||||
import TransformContext from './TransformContext';
|
import TransformContext from './TransformContext';
|
||||||
|
|
||||||
@ -11,9 +12,12 @@ export default class CalculateCoordinates extends ItemTransformer {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(context: TransformContext, items: Item[]): Item[] {
|
transform(context: TransformContext, items: Item[]): ItemResult {
|
||||||
// const transform: number[] = item.value['Transform'];
|
// const transform: number[] = item.value['Transform'];
|
||||||
items.shift();
|
items.shift();
|
||||||
return items;
|
if(items[0]){
|
||||||
|
items[0].data['fontName']='xxx';
|
||||||
|
}
|
||||||
|
return { items, messages: [] };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import TransformerDescription from '../TransformerDescription';
|
import TransformerDescription from '../TransformerDescription';
|
||||||
import type Item from '../Item';
|
import type Item from '../Item';
|
||||||
import TransformContext from './TransformContext';
|
import TransformContext from './TransformContext';
|
||||||
|
import ItemResult from 'src/ItemResult';
|
||||||
|
|
||||||
export default abstract class ItemTransformer {
|
export default abstract class ItemTransformer {
|
||||||
readonly name: string;
|
readonly name: string;
|
||||||
@ -21,5 +22,5 @@ export default abstract class ItemTransformer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// columnar-changes: described
|
// columnar-changes: described
|
||||||
abstract transform(context: TransformContext, items: Item[]): Item[];
|
abstract transform(context: TransformContext, items: Item[]): ItemResult;
|
||||||
}
|
}
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
</script>
|
</script>
|
||||||
|
|
||||||
<div class="text-2xl font-semibold font-serif text-center bg-gray-400">PDF to Markdown Converter</div>
|
<div class="text-2xl font-semibold font-serif text-center bg-gray-400">PDF to Markdown Converter</div>
|
||||||
<main class="mt-5 h-full">
|
<main class="mt-2 h-full">
|
||||||
{#if $debug}
|
{#if $debug}
|
||||||
<DebugView debug={$debug} />
|
<DebugView debug={$debug} />
|
||||||
{:else}
|
{:else}
|
||||||
|
@ -14,12 +14,12 @@
|
|||||||
$: canNext = currentStage + 1 < stageNames.length;
|
$: canNext = currentStage + 1 < stageNames.length;
|
||||||
$: canPrev = currentStage > 0;
|
$: canPrev = currentStage > 0;
|
||||||
$: stageSchema = debug.stageSchema[currentStage];
|
$: stageSchema = debug.stageSchema[currentStage];
|
||||||
$: stageItems = debug.stageResults(currentStage);
|
$: stageResult = debug.stageResults(currentStage);
|
||||||
$: pageFocus = !isNaN(focusedPage);
|
$: pageFocus = !isNaN(focusedPage);
|
||||||
$: pagesNumbers = new Set(stageItems.map((item) => item.page));
|
$: pagesNumbers = new Set(stageResult.items.map((item) => item.page));
|
||||||
$: maxPage = Math.max(...pagesNumbers);
|
$: maxPage = Math.max(...pagesNumbers);
|
||||||
$: itemsByPage = [
|
$: itemsByPage = [
|
||||||
...stageItems.reduce((map, item) => {
|
...stageResult.items.reduce((map, item) => {
|
||||||
if (!map.has(item.page)) {
|
if (!map.has(item.page)) {
|
||||||
map.set(item.page, []);
|
map.set(item.page, []);
|
||||||
}
|
}
|
||||||
@ -41,11 +41,9 @@
|
|||||||
</script>
|
</script>
|
||||||
|
|
||||||
<div class="mx-4">
|
<div class="mx-4">
|
||||||
<div class="mb-4">
|
|
||||||
<!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
|
<!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
|
||||||
<div>Title: {parseResult.metadata.title()}</div>
|
<div>Title: {parseResult.metadata.title()}</div>
|
||||||
<div>Author: {parseResult.metadata.author()}</div> -->
|
<div>Author: {parseResult.metadata.author()}</div> -->
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Sticky Controls -->
|
<!-- Sticky Controls -->
|
||||||
<div class="controls py-2">
|
<div class="controls py-2">
|
||||||
@ -89,10 +87,18 @@
|
|||||||
<span on:click={() => canNext && currentStage++}>
|
<span on:click={() => canNext && currentStage++}>
|
||||||
<ArrowRight size="1x" class={canNext ? 'hover:text-green-700 cursor-pointer' : 'opacity-50'} />
|
<ArrowRight size="1x" class={canNext ? 'hover:text-green-700 cursor-pointer' : 'opacity-50'} />
|
||||||
</span>
|
</span>
|
||||||
<div>{stageNames[currentStage]}</div>
|
<div class="cursor-pointer hover:underline">{stageNames[currentStage]}</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- Stage Messages -->
|
||||||
|
<ul class="list-disc list-inside mb-2 p-2 bg-yellow-100 rounded shadow text-sm">
|
||||||
|
{#each stageResult.messages as message}
|
||||||
|
<li>{message}</li>
|
||||||
|
{/each}
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<!-- Items -->
|
||||||
<ItemTable schema={stageSchema} itemsByPage={visiblePages} {maxPage} {pageFocus} />
|
<ItemTable schema={stageSchema} itemsByPage={visiblePages} {maxPage} {pageFocus} />
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -51,7 +51,7 @@
|
|||||||
{/if}
|
{/if}
|
||||||
<td>{itemIdx}</td>
|
<td>{itemIdx}</td>
|
||||||
{#each schema as column}
|
{#each schema as column}
|
||||||
<td>{format(item.data[column])}</td>
|
<td class="select-all">{format(item.data[column])}</td>
|
||||||
{/each}
|
{/each}
|
||||||
</tr>
|
</tr>
|
||||||
{/each}
|
{/each}
|
||||||
|
Loading…
Reference in New Issue
Block a user