mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-01 03:09:01 +01:00
Rudimentary explicit debug support
This commit is contained in:
parent
95a7e3e93b
commit
4401f1fb5c
8
core/src/Config.ts
Normal file
8
core/src/Config.ts
Normal file
@ -0,0 +1,8 @@
|
||||
import ItemTransformer from './transformer/ItemTransformer';
|
||||
|
||||
/**
 * Optional settings used when building a parser / pipeline.
 * Every member may be omitted.
 */
export default interface Config {
  /**
   * Parameters handed to pdf.js when a document is opened.
   * See DocumentInitParameters from https://mozilla.github.io/pdf.js/api/draft/module-pdfjsLib.html#DocumentInitParameters
   */
  pdfjsParams?: object;

  /** Transformers to run, in order, on the parsed items. */
  transformers?: ItemTransformer[];

  // TODO keep pdfPages ?
}
|
41
core/src/Debugger.ts
Normal file
41
core/src/Debugger.ts
Normal file
@ -0,0 +1,41 @@
|
||||
import { assert } from './assert';
|
||||
import Item from './Item';
|
||||
import ItemTransformer from './transformer/ItemTransformer';
|
||||
import ParseResult from './ParseResult';
|
||||
import { calculateSchemas } from './transformer/transformerUtil';
|
||||
import TransformContext from './transformer/TransformContext';
|
||||
|
||||
/**
 * Replays a transformer chain one stage at a time so that every intermediate
 * result can be inspected.
 *
 * Stage 0 is the untouched parse result; stage `n` (n >= 1) is the output of
 * `transformers[n - 1]`. Stage results are computed lazily and cached.
 */
export default class Debugger {
  context: TransformContext;
  transformers: ItemTransformer[];
  /** Display name per stage: 'Parse Result' followed by the transformer names. */
  stageNames: string[];
  /** Schema per stage, as computed by `calculateSchemas`. */
  stageSchema: string[][];
  /** Cache of already computed stage results; index 0 holds the initial items. */
  private stageItems: Item[][];

  /**
   * @param initialSchema column names of the parse result
   * @param initialItems items of the parse result (stage 0)
   * @param context context passed to every transformer
   * @param transformers the transformers to step through, in execution order
   */
  constructor(
    initialSchema: string[],
    initialItems: Item[],
    context: TransformContext,
    transformers: ItemTransformer[],
  ) {
    this.transformers = transformers;
    this.context = context;
    this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
    this.stageItems = [initialItems];
    this.stageSchema = calculateSchemas(initialSchema, transformers);
  }

  //TODO return MarkedItem ? (removed, added, etc..)?
  //TODO StageResult == class with schema and marked items ?
  /**
   * Returns the items as they look after the given stage, running any
   * not-yet-executed transformers up to and including that stage.
   *
   * @param stageIndex index into `stageNames` (0 = parse result)
   * @throws Error if `stageIndex` is not an integer within `[0, stageNames.length - 1]`
   */
  stageResults(stageIndex: number): Item[] {
    const lastStage = this.stageNames.length - 1;
    // Without this guard an out-of-range index would dereference a missing transformer.
    assert(
      Number.isInteger(stageIndex) && stageIndex >= 0 && stageIndex <= lastStage,
      `Stage index ${stageIndex} is out of range [0, ${lastStage}]`,
    );
    // Stages are filled contiguously, so continue right after the last cached one.
    for (let idx = this.stageItems.length; idx <= stageIndex; idx++) {
      const previousItems = this.stageItems[idx - 1];
      this.stageItems.push(this.transformers[idx - 1].transform(this.context, previousItems));
    }
    return this.stageItems[stageIndex];
  }
}
|
@ -1,13 +1,32 @@
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
|
||||
/**
 * A single data row that belongs to a page.
 *
 * Items are treated as immutable by the `with…` helpers: each of them returns
 * a new `Item` and leaves the receiver untouched.
 */
export default class Item {
  page: number;
  data: object;
  uuid?: string;

  /**
   * @param page index of the page the item belongs to
   * @param data column name → value mapping
   * @param uuid identity of the item; a fresh v4 uuid is generated when omitted
   */
  constructor(page: number, data: object, uuid: string = uuidv4()) {
    this.page = page;
    this.data = data;
    this.uuid = uuid;
  }

  /** Returns the value stored under `column` (`undefined` if the column is absent). */
  value(column: string): object {
    return (this.data as Record<string, object>)[column];
  }

  /** Returns a copy whose data is the current data overlaid with `data`; the uuid is kept. */
  withDataAddition(data: object): Item {
    return this.withData({ ...this.data, ...data });
  }

  /** Returns a copy with the data replaced by `data`; page and uuid are kept. */
  withData(data: object): Item {
    return new Item(this.page, data, this.uuid);
  }

  /**
   * Returns the item without a uuid.
   *
   * The uuid is set to the empty string rather than `undefined`, because
   * passing `undefined` would trigger the constructor default and generate a new one.
   */
  withoutUuid(): Item {
    return new Item(this.page, this.data, '');
  }
}
|
||||
|
@ -1,16 +1,19 @@
|
||||
import Item from './Item';
|
||||
import type Item from './Item';
|
||||
import type Metadata from './Metadata';
|
||||
import type PageViewport from './parse/PageViewport';
|
||||
|
||||
export default class ParseResult {
|
||||
pdfPages: any[];
|
||||
pageViewports: PageViewport[];
|
||||
metadata: Metadata;
|
||||
columns: string[];
|
||||
schema: string[];
|
||||
items: Item[];
|
||||
|
||||
constructor(pdfPages: any[], metadata: Metadata, columns: string[], items: Item[]) {
|
||||
constructor(pdfPages: any[], pageViewports: PageViewport[], metadata: Metadata, schema: string[], items: Item[]) {
|
||||
this.pdfPages = pdfPages;
|
||||
this.pageViewports = pageViewports;
|
||||
this.metadata = metadata;
|
||||
this.columns = columns;
|
||||
this.schema = schema;
|
||||
this.items = items;
|
||||
}
|
||||
|
||||
|
@ -1,13 +0,0 @@
|
||||
import type ParsedPageItem from './ParsedPageItem';
|
||||
|
||||
/**
 * Plain holder for the parse outcome of one page: its index, the underlying
 * pdf.js page object and the items extracted from it.
 */
export default class ParsedPage {
  constructor(public index: number, public pdfPage: any, public items: ParsedPageItem[]) {}
}
|
@ -1,6 +1,5 @@
|
||||
import Item from './Item';
|
||||
import Metadata from './Metadata';
|
||||
import ParsedPage from './ParsedPage';
|
||||
import type ParseReporter from './ParseReporter';
|
||||
import ParseResult from './ParseResult';
|
||||
import TextDirection from './TextDirection';
|
||||
@ -11,31 +10,18 @@ import type TextItem from './TextItem';
|
||||
*/
|
||||
export default class PdfParser {
|
||||
pdfjs: any;
|
||||
columns = ['str', 'dir', 'width', 'height', 'transform', 'fontName'];
|
||||
defaultParams: object;
|
||||
schema = ['str', 'fontName', 'dir', 'width', 'height', 'transform'];
|
||||
|
||||
constructor(pdfjs: any) {
|
||||
constructor(pdfjs: any, defaultParams = {}) {
|
||||
this.pdfjs = pdfjs;
|
||||
this.defaultParams = defaultParams;
|
||||
}
|
||||
|
||||
async parseBytes(data: Uint8Array, reporter: ParseReporter): Promise<ParseResult> {
|
||||
return this.parse(this.params({ data }), reporter);
|
||||
}
|
||||
|
||||
async parseUrl(url: string, reporter: ParseReporter): Promise<ParseResult> {
|
||||
return this.parse(this.params({ url }), reporter);
|
||||
}
|
||||
|
||||
private params(dataSourceParams: object): object {
|
||||
const defaultParams = {
|
||||
cMapUrl: 'cmaps/',
|
||||
cMapPacked: true,
|
||||
};
|
||||
return { ...defaultParams, ...dataSourceParams };
|
||||
}
|
||||
|
||||
async parse(parameter: object, reporter: ParseReporter): Promise<ParseResult> {
|
||||
async parse(src: string | Uint8Array | object, reporter: ParseReporter): Promise<ParseResult> {
|
||||
const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) };
|
||||
return this.pdfjs
|
||||
.getDocument(parameter)
|
||||
.getDocument(documentInitParameters)
|
||||
.promise.then((pdfDocument) => {
|
||||
reporter.parsedDocumentHeader(pdfDocument.numPages);
|
||||
return Promise.all([
|
||||
@ -47,16 +33,38 @@ export default class PdfParser {
|
||||
]);
|
||||
})
|
||||
.then(([metadata, pages]) => {
|
||||
const pdfPages = pages.map((page) => page.pdfPage);
|
||||
const pdfPages = pages.map((page) => page.page);
|
||||
const items = pages.reduce((allItems, page) => allItems.concat(page.items), []);
|
||||
return new ParseResult(pdfPages, new Metadata(metadata), this.columns, items);
|
||||
const pageViewports = pdfPages.map((page) => {
|
||||
const viewPort = page.getViewport({ scale: 1.0 });
|
||||
return { transformFunction: (itemTransform: number[]) => this.pdfjs.Util.transform(viewPort, itemTransform) };
|
||||
});
|
||||
return new ParseResult(pdfPages, pageViewports, new Metadata(metadata), this.schema, items);
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Normalizes the accepted source shapes into a pdf.js parameter object:
 * a string becomes `{ url }`, anything exposing `byteLength` becomes `{ data }`,
 * and any other non-null object is passed through unchanged.
 *
 * @throws Error if `src` is none of the above (including `null`)
 */
private documentInitParameters(src: string | Uint8Array | object): object {
  if (typeof src === 'string') {
    return { url: src };
  }
  if (this.isArrayBuffer(src)) {
    return { data: src };
  }
  // `typeof null` is 'object' too, so null has to be excluded explicitly.
  if (typeof src === 'object' && src !== null) {
    return src;
  }
  throw new Error('Invalid PDFjs parameter for getDocument. Need either Uint8Array, string or a parameter object');
}
|
||||
|
||||
/**
 * Duck-type check: true for any non-null object that has a defined
 * `byteLength` property. Despite the name, no `instanceof ArrayBuffer`
 * test is made.
 */
private isArrayBuffer(object: unknown): boolean {
  if (typeof object !== 'object' || object === null) {
    return false;
  }
  return (object as { byteLength?: unknown }).byteLength !== undefined;
}
|
||||
|
||||
private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> {
|
||||
return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
||||
return accumulatorPromise.then((accumulatedResults) => {
|
||||
return pdfDocument.getPage(index + 1).then((page) => {
|
||||
const viewport = page.getViewport({ scale: 1.0 });
|
||||
return this.triggerFontRetrieval(page).then(() =>
|
||||
page
|
||||
.getTextContent({
|
||||
@ -66,7 +74,7 @@ export default class PdfParser {
|
||||
.then((textContent) => {
|
||||
const items = textContent.items.map((rawItem) => new Item(index, rawItem));
|
||||
reporter.parsedPage(index);
|
||||
return [...accumulatedResults, new ParsedPage(index, page, items)];
|
||||
return [...accumulatedResults, { index, page, items }];
|
||||
}),
|
||||
);
|
||||
});
|
||||
@ -92,6 +100,8 @@ export default class PdfParser {
|
||||
// console.log('Parsing page ' + index);
|
||||
return pdfDocument.getPage(index + 1).then((page) => {
|
||||
const viewport = page.getViewport({ scale: 1.0 });
|
||||
console.log(viewport);
|
||||
|
||||
return this.triggerFontRetrieval(page).then(() =>
|
||||
page.getTextContent().then((textContent) => {
|
||||
// console.log(textContent);
|
||||
@ -126,7 +136,13 @@ export default class PdfParser {
|
||||
// console.log('Parsed result:', r.length);
|
||||
// console.log('Parsed result:', r);
|
||||
|
||||
return new ParseResult([], new Metadata(metadata), [], []);
|
||||
return {};
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/** Intermediate per-page result collected while the pages are extracted. */
interface ParsedPage {
  /** Zero-based page index (the page is fetched from pdf.js with `index + 1`). */
  index: number;
  /** The pdf.js page object. */
  page: any;
  /** Items created from the text content of the page. */
  items: Item[];
}
|
||||
|
45
core/src/PdfPipeline.ts
Normal file
45
core/src/PdfPipeline.ts
Normal file
@ -0,0 +1,45 @@
|
||||
import PdfParser from './PdfParser';
|
||||
import ProgressListenFunction from './ProgressListenFunction';
|
||||
import ParseProgressReporter from './ParseProgressReporter';
|
||||
import ItemTransformer from './transformer/ItemTransformer';
|
||||
import Item from './Item';
|
||||
import ParseResult from './ParseResult';
|
||||
import Debugger from './Debugger';
|
||||
import { verifyRequiredColumns } from './transformer/transformerUtil';
|
||||
import TransformContext from './transformer/TransformContext';
|
||||
|
||||
/**
 * Combines a parser with an ordered list of transformers.
 *
 * `execute` runs the whole chain at once, `debug` hands back a `Debugger`
 * that allows stepping through the chain stage by stage.
 */
export default class PdfPipeline {
  parser: PdfParser;
  transformers: ItemTransformer[];

  constructor(parser: PdfParser, transformers: ItemTransformer[]) {
    this.parser = parser;
    this.transformers = transformers;
  }

  /**
   * Parses the source and verifies that the resulting schema satisfies the
   * column requirements of all configured transformers.
   */
  private async parse(
    src: string | Uint8Array | object,
    progressListener: ProgressListenFunction,
  ): Promise<ParseResult> {
    const reporter = new ParseProgressReporter(progressListener);
    const parseResult = await this.parser.parse(src, reporter);
    verifyRequiredColumns(parseResult.schema, this.transformers);
    return parseResult;
  }

  //TODO PipelineResult
  /**
   * Parses the source and pipes the items through every transformer in order.
   * The returned parse result carries the transformed items.
   */
  async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
    const parseResult = await this.parse(src, progressListener);
    const context: TransformContext = { pageViewports: parseResult.pageViewports };
    // Fold the items through the chain; the result is only stored once every transformer succeeded.
    parseResult.items = this.transformers.reduce(
      (currentItems: Item[], transformer) => transformer.transform(context, currentItems),
      parseResult.items,
    );
    return parseResult;
  }

  /** Parses the source and returns a `Debugger` primed with the untransformed items. */
  async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<Debugger> {
    const parseResult = await this.parse(src, progressListener);
    const context: TransformContext = { pageViewports: parseResult.pageViewports };
    return new Debugger(parseResult.schema, parseResult.items, context, this.transformers);
  }
}
|
7
core/src/TransformerDescription.ts
Normal file
7
core/src/TransformerDescription.ts
Normal file
@ -0,0 +1,7 @@
|
||||
/**
 * Declares what a transformer reads from and does to the item schema.
 * All members are optional.
 *
 * NOTE(review): "Globels" is presumably a misspelling of "Globals"; the
 * property names are kept as-is because they are part of the public interface.
 */
export default interface TransformerDescription {
  readonly consumesGlobels?: string[];
  readonly producesGlobels?: string[];
  /** Columns that must be present in the input schema. */
  readonly consumes?: string[];
  /** Columns appended to the output schema. */
  readonly produces?: string[];
  /** Columns dropped from the output schema; they must be present in the input schema. */
  readonly removes?: string[];
}
|
11
core/src/assert.ts
Normal file
11
core/src/assert.ts
Normal file
@ -0,0 +1,11 @@
|
||||
/**
 * Throws if `condition` is false. Declared as an assertion function so that
 * the compiler narrows types after a successful call.
 *
 * @param condition the condition expected to hold
 * @param message error message; an empty message falls back to 'Assertion failed'
 * @throws Error when `condition` is false
 */
export function assert(condition: boolean, message: string): asserts condition {
  if (condition) {
    return;
  }
  throw new Error(message || 'Assertion failed');
}
|
||||
|
||||
/**
 * Ensures that `value` is neither `null` nor `undefined` and returns it,
 * typed without the nullish part.
 *
 * @param value the value to check
 * @param message error message used when the value is missing
 * @throws Error (via `assert`) when `value` is `null` or `undefined`
 */
export function assertDefined<T>(value: T, message: string): NonNullable<T> {
  assert(value !== null && typeof value !== 'undefined', message);
  return value as NonNullable<T>;
}
|
@ -1,11 +1,32 @@
|
||||
import Config from './Config';
|
||||
import type ProgressListenFunction from './ProgressListenFunction';
|
||||
import ParseProgressReporter from './ParseProgressReporter';
|
||||
import PdfParser from './PdfParser';
|
||||
import PdfPipeline from './PdfPipeline';
|
||||
|
||||
import AdjustHeight from './transformer/AdjustHeight';
|
||||
import CalculateCoordinates from './transformer/CalculateCoordinates';
|
||||
|
||||
// Transformers used when a config does not bring its own.
const transformers = [new AdjustHeight(), new CalculateCoordinates()];

// Configuration applied when the caller does not supply one.
const defaultConfig: Config = {
  pdfjsParams: {
    // TODO check if that cmap thing makes sense since we don't bundle them
    cMapUrl: 'cmaps/',
    cMapPacked: true,
  },
  transformers,
};
|
||||
|
||||
/**
 * Creates a parser that uses the default pdf.js parameters.
 *
 * @param pdfJs the pdf.js library object
 */
export function pdfParser(pdfJs: any) {
  return new PdfParser(pdfJs, defaultConfig.pdfjsParams);
}
|
||||
|
||||
/**
 * Wraps a progress listener function into a `ParseProgressReporter`.
 *
 * @param progressListener callback handed to the reporter
 */
export function parseReporter(progressListener: ProgressListenFunction) {
  const reporter = new ParseProgressReporter(progressListener);
  return reporter;
}
|
||||
|
||||
/**
 * Builds a pipeline from the given configuration.
 *
 * Members missing from `config` fall back to the defaults individually, so a
 * config that only sets `transformers` still gets the default pdf.js
 * parameters (and vice versa).
 *
 * @param pdfJs the pdf.js library object
 * @param config optional configuration; defaults to the built-in one
 */
export function createPipeline(pdfJs: any, config = defaultConfig): PdfPipeline {
  const pdfjsParams = config.pdfjsParams ?? defaultConfig.pdfjsParams;
  const parser = new PdfParser(pdfJs, pdfjsParams);
  return new PdfPipeline(parser, config.transformers ?? transformers);
}
|
||||
|
5
core/src/parse/PageViewport.ts
Normal file
5
core/src/parse/PageViewport.ts
Normal file
@ -0,0 +1,5 @@
|
||||
/** Maps an item's transform array to another transform array. */
type ItemTransformFunction = (itemTransform: number[]) => number[];

/** Per-page viewport abstraction exposing only the transform function. */
export default interface PageViewport {
  transformFunction: ItemTransformFunction;
}
|
37
core/src/transformer/AdjustHeight.ts
Normal file
37
core/src/transformer/AdjustHeight.ts
Normal file
@ -0,0 +1,37 @@
|
||||
import PageViewport from 'src/parse/PageViewport';
|
||||
import Item from '../Item';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
|
||||
/**
 * Rescales item heights by the font height derived from the page viewport.
 *
 * For every item the `transform` is run through the viewport's transform
 * function; the item's `height` is then divided by the resulting font height.
 * The divided value is only adopted when it is a number greater than 1,
 * otherwise the original height is kept.
 */
export default class AdjustHeight extends ItemTransformer {
  constructor() {
    super('Adjust Heights', {
      consumes: ['transform', 'height'],
    });
  }

  /**
   * @param context supplies one viewport per page index
   * @param items the items to adjust; the array and its items are not modified
   * @returns a new array holding adjusted copies where the height changed and the original items otherwise
   */
  transform(context: TransformContext, items: Item[]): Item[] {
    let page = -1;
    let pageViewport: PageViewport;
    //TODO groupBy page
    return items.map((item) => {
      // Look the viewport up only when the page changes.
      if (item.page !== page) {
        pageViewport = context.pageViewports[item.page];
        page = item.page;
      }
      const itemTransform = item.data['transform'];
      const itemHeight = item.data['height'];
      const tx = pageViewport.transformFunction(itemTransform);
      const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
      const dividedHeight = itemHeight / fontHeight;
      const keepOriginal = Number.isNaN(dividedHeight) || dividedHeight <= 1;
      const newHeight = keepOriginal ? itemHeight : dividedHeight;
      return newHeight !== itemHeight ? item.withDataAddition({ height: newHeight }) : item;
    });
  }
}
|
19
core/src/transformer/CalculateCoordinates.ts
Normal file
19
core/src/transformer/CalculateCoordinates.ts
Normal file
@ -0,0 +1,19 @@
|
||||
import Item from '../Item';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
|
||||
/**
 * Declared to turn the `transform` column into `X` / `Y` columns.
 *
 * NOTE(review): the body is still a placeholder — it only drops the first
 * item and does not yet produce `X`/`Y` or remove `transform` as described.
 */
export default class CalculateCoordinates extends ItemTransformer {
  constructor() {
    super('Calculate Coordinates', {
      consumes: ['transform'],
      produces: ['X', 'Y'],
      removes: ['transform'],
    });
  }

  /**
   * @param context unused for now
   * @param items input items; left untouched
   * @returns a new array containing all items except the first
   */
  transform(context: TransformContext, items: Item[]): Item[] {
    // Copy instead of shift(): the input array may be a cached earlier stage
    // (see Debugger) and must not lose its first element.
    return items.slice(1);
  }
}
|
25
core/src/transformer/ItemTransformer.ts
Normal file
25
core/src/transformer/ItemTransformer.ts
Normal file
@ -0,0 +1,25 @@
|
||||
import TransformerDescription from '../TransformerDescription';
|
||||
import type Item from '../Item';
|
||||
import TransformContext from './TransformContext';
|
||||
|
||||
/**
 * Base class of all item transformers: a named step with a description of
 * the columns it consumes, produces and removes.
 */
export default abstract class ItemTransformer {
  readonly name: string;
  readonly description: TransformerDescription;

  /**
   * @param name display name of the transformer
   * @param description column contract; every member that is missing (or
   *   explicitly `undefined`) defaults to an empty list
   */
  constructor(name: string, description: TransformerDescription) {
    this.name = name;
    this.description = {
      consumesGlobels: description.consumesGlobels ?? [],
      producesGlobels: description.producesGlobels ?? [],
      consumes: description.consumes ?? [],
      produces: description.produces ?? [],
      removes: description.removes ?? [],
    };
  }

  // columnar-changes: described
  /** Transforms the given items and returns the resulting items. */
  abstract transform(context: TransformContext, items: Item[]): Item[];
}
|
5
core/src/transformer/TransformContext.ts
Normal file
5
core/src/transformer/TransformContext.ts
Normal file
@ -0,0 +1,5 @@
|
||||
import PageViewport from 'src/parse/PageViewport';
|
||||
|
||||
/** Shared, read-only information handed to every transformer. */
export default interface TransformContext {
  /** One viewport per page, indexed by page index. */
  pageViewports: PageViewport[];
}
|
55
core/src/transformer/transformerUtil.ts
Normal file
55
core/src/transformer/transformerUtil.ts
Normal file
@ -0,0 +1,55 @@
|
||||
import TransformerDescription from 'src/TransformerDescription';
|
||||
import { assert } from '../assert';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
|
||||
/**
 * Goes through all transformers and makes sure each required column
 * ({@link TransformerDescription#consumes}) is available in the schema left
 * behind by its predecessor.
 *
 * The check is done by computing the schemas; the computed result is discarded.
 *
 * @param initialSchema columns available before the first transformer runs
 * @param transformers the transformers in execution order
 * @throws Error if a transformer references a column missing from its input schema
 */
export function verifyRequiredColumns(initialSchema: string[], transformers: ItemTransformer[]) {
  calculateSchemas(initialSchema, transformers);
}
|
||||
|
||||
//TODO debug schema
|
||||
// initial - all unanotated
|
||||
// second - 2 removed, 1 added
|
||||
// third - all as before without the removed
|
||||
|
||||
/**
 * Computes the schema after every pipeline stage.
 *
 * Element 0 of the result is `initialSchema` itself; element `n` is the
 * schema after `transformers[n - 1]`: the input schema without the removed
 * columns, followed by the produced columns. A produced column that is
 * already part of the schema is not added a second time.
 *
 * @param initialSchema columns available before the first transformer runs
 * @param transformers the transformers in execution order
 * @returns `transformers.length + 1` schemas
 * @throws Error if a transformer consumes or removes a column missing from its input schema
 */
export function calculateSchemas(initialSchema: string[], transformers: ItemTransformer[]): string[][] {
  const schemas: string[][] = [initialSchema];
  transformers.forEach((transformer, idx) => {
    const inputSchema = schemas[idx];
    validateReferences(inputSchema, transformer.name, transformer.description);
    const { removes = [], produces = [] } = transformer.description;
    const outputSchema = inputSchema.filter((column) => !removes.includes(column));
    produces.forEach((column) => {
      if (!outputSchema.includes(column)) {
        outputSchema.push(column);
      }
    });
    schemas.push(outputSchema);
  });
  return schemas;
}

/**
 * Asserts that every column a transformer consumes or removes exists in its input schema.
 */
function validateReferences(
  inputSchema: string[],
  transformerName: string,
  transformerDescription: TransformerDescription,
) {
  // Checks one list of referenced columns; `usage` ends up in the error message.
  const requireColumns = (columns: string[] | undefined, usage: string) => {
    columns?.forEach((column) => {
      assert(
        inputSchema.includes(column),
        `Input schema [${inputSchema.join(
          ', ',
        )}] for transformer '${transformerName}' does not contain the required column '${column}' (${usage})`,
      );
    });
  };
  requireColumns(transformerDescription.consumes, 'consumes');
  requireColumns(transformerDescription.removes, 'removes');
}
|
38
core/test/Debugger.test.ts
Normal file
38
core/test/Debugger.test.ts
Normal file
@ -0,0 +1,38 @@
|
||||
import Debugger from 'src/Debugger';
|
||||
import Item from 'src/Item';
|
||||
import ItemTransformer from 'src/transformer/ItemTransformer';
|
||||
import Metadata from 'src/Metadata';
|
||||
import ParseResult from 'src/ParseResult';
|
||||
import TransformerDescription from 'src/TransformerDescription';
|
||||
import TransformContext from 'src/transformer/TransformContext';
|
||||
|
||||
/** Test double that ignores its input and always returns the items it was constructed with. */
class TestTransformer extends ItemTransformer {
  items: Item[];

  constructor(name: string, description: TransformerDescription, items: Item[]) {
    super(name, description);
    this.items = items;
  }

  transform(_: TransformContext, items: Item[]): Item[] {
    return this.items;
  }
}
|
||||
|
||||
// Steps through a one-transformer pipeline and checks names, schemas and items of both stages.
test('basic debug', () => {
  const parsedSchema = ['A', 'B'];
  const parsedItems = [new Item(0, { A: 'a_row1', B: 'b_row1' }), new Item(0, { A: 'a_row2', B: 'b_row2' })];

  const trans1Desc = { consumes: ['A', 'B'], produces: ['C'], removes: ['A', 'B'] };
  const trans1Items = parsedItems.map((item) => item.withData({ C: `c=${item.value('A')}+${item.value('B')}` }));

  const transformers = [new TestTransformer('Trans1', trans1Desc, trans1Items)];
  const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers);

  expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
  expect(debug.stageSchema).toEqual([parsedSchema, ['C']]);

  expect(debug.stageResults(0)).toEqual(parsedItems);
  expect(debug.stageResults(1)).toEqual(trans1Items);
});
|
@ -12,7 +12,7 @@ test('basic example PDF parse', async () => {
|
||||
const data = fs.readFileSync('../examples/ExamplePdf.pdf', null);
|
||||
|
||||
// to test
|
||||
const result = await parser.parseBytes(
|
||||
const result = await parser.parse(
|
||||
data,
|
||||
new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)),
|
||||
);
|
||||
@ -29,7 +29,7 @@ test('basic example PDF parse', async () => {
|
||||
expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
|
||||
|
||||
// verify first n items
|
||||
expect(result.items.slice(0, 16)).toEqual([
|
||||
expect(result.items.slice(0, 16).map((item) => item.withoutUuid())).toEqual([
|
||||
new Item(0, {
|
||||
str: 'Mega Überschrift',
|
||||
dir: 'ltr',
|
||||
@ -37,7 +37,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 30,
|
||||
transform: [30, 0, 0, 30, 175, 756],
|
||||
fontName: 'g_d0_f1',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(0, {
|
||||
str: '2te Überschrift',
|
||||
dir: 'ltr',
|
||||
@ -45,7 +45,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 20,
|
||||
transform: [20, 0, 0, 20, 233, 665],
|
||||
fontName: 'g_d0_f2',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(0, {
|
||||
str: 'Dies ist eine Test-PDF',
|
||||
dir: 'ltr',
|
||||
@ -53,7 +53,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 240, 585],
|
||||
fontName: 'g_d0_f2',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(0, {
|
||||
str: '.',
|
||||
dir: 'ltr',
|
||||
@ -61,7 +61,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 352.6927, 585],
|
||||
fontName: 'g_d0_f2',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(0, {
|
||||
str: '1',
|
||||
dir: 'ltr',
|
||||
@ -69,7 +69,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 7.333334,
|
||||
transform: [7.333334, 0, 0, 7.333334, 348, 588],
|
||||
fontName: 'g_d0_f2',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(0, {
|
||||
str: 'Für’s Testen des ',
|
||||
dir: 'ltr',
|
||||
@ -77,7 +77,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 208, 572],
|
||||
fontName: 'g_d0_f2',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(0, {
|
||||
str: 'Markdown Parsers',
|
||||
dir: 'ltr',
|
||||
@ -85,7 +85,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 291.77832, 572],
|
||||
fontName: 'g_d0_f2',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(0, {
|
||||
str: '.',
|
||||
dir: 'ltr',
|
||||
@ -93,7 +93,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 383.47360000000003, 572],
|
||||
fontName: 'g_d0_f2',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(0, {
|
||||
str: ' ',
|
||||
dir: 'ltr',
|
||||
@ -101,7 +101,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 61.078451, 59],
|
||||
fontName: 'g_d0_f2',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(0, {
|
||||
str: 'In Deutsch.',
|
||||
dir: 'ltr',
|
||||
@ -109,7 +109,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 11,
|
||||
transform: [11, 0, 0, 11, 64.134603, 59],
|
||||
fontName: 'g_d0_f2',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(0, {
|
||||
str: '1',
|
||||
dir: 'ltr',
|
||||
@ -117,7 +117,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 7.333334,
|
||||
transform: [7.333334, 0, 0, 7.333334, 57, 62],
|
||||
fontName: 'g_d0_f2',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(0, {
|
||||
str: '\x00',
|
||||
dir: 'ltr',
|
||||
@ -125,7 +125,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 12,
|
||||
transform: [12, 0, 0, 12, 294, 45],
|
||||
fontName: 'g_d0_f3',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(0, {
|
||||
str: '1',
|
||||
dir: 'ltr',
|
||||
@ -133,7 +133,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 12,
|
||||
transform: [12, 0, 0, 12, 294, 45],
|
||||
fontName: 'g_d0_f2',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(1, {
|
||||
str: '\x00',
|
||||
dir: 'ltr',
|
||||
@ -141,7 +141,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 12,
|
||||
transform: [12, 0, 0, 12, 294, 45],
|
||||
fontName: 'g_d0_f3',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(1, {
|
||||
str: '2',
|
||||
dir: 'ltr',
|
||||
@ -149,7 +149,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 12,
|
||||
transform: [12, 0, 0, 12, 294, 45],
|
||||
fontName: 'g_d0_f2',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
new Item(2, {
|
||||
str: 'Paragraphen',
|
||||
dir: 'ltr',
|
||||
@ -157,7 +157,7 @@ test('basic example PDF parse', async () => {
|
||||
height: 18,
|
||||
transform: [18, 0, 0, 18, 57, 767],
|
||||
fontName: 'g_d0_f1',
|
||||
}),
|
||||
}).withoutUuid(),
|
||||
]);
|
||||
|
||||
// verify progress
|
||||
|
59
core/test/transformer/transformerUtil.test.ts
Normal file
59
core/test/transformer/transformerUtil.test.ts
Normal file
@ -0,0 +1,59 @@
|
||||
import TransformerDescription from 'src/TransformerDescription';
|
||||
import Item from 'src/Item';
|
||||
import ItemTransformer from 'src/transformer/ItemTransformer';
|
||||
import TransformContext from 'src/transformer/TransformContext';
|
||||
import { calculateSchemas, verifyRequiredColumns } from 'src/transformer/transformerUtil';
|
||||
|
||||
/** Pass-through transformer; only its name and description matter for the schema tests. */
class TestSchemaTransformer extends ItemTransformer {
  transform(_: TransformContext, items: Item[]): Item[] {
    return items;
  }
}
|
||||
|
||||
// A chain whose column requirements are all satisfied must pass verification.
test('verify valid transform', () => {
  const inputSchema = ['A', 'B', 'C'];

  const transformers = [
    new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }),
    new TestSchemaTransformer('Create E', { produces: ['E'] }),
    new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }),
  ];
  expect(() => verifyRequiredColumns(inputSchema, transformers)).not.toThrow();
});

// Consuming a column that is not in the input schema is rejected.
test('verify invalid consume', () => {
  const inputSchema = ['A', 'B', 'C'];

  const transformers = [new TestSchemaTransformer('Consumes X', { consumes: ['X'] })];
  expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError(
    "Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X' (consumes)",
  );
});

// Removing a column that is not in the input schema is rejected.
test('verify invalid remove', () => {
  const inputSchema = ['A', 'B', 'C'];

  const transformers = [new TestSchemaTransformer('Removes X', { removes: ['X'] })];
  expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError(
    "Input schema [A, B, C] for transformer 'Removes X' does not contain the required column 'X' (removes)",
  );
});

// The result holds the initial schema plus one schema per transformer.
test('calculate schemas', () => {
  const inputSchema = ['A', 'B', 'C'];

  const transformers = [
    new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }),
    new TestSchemaTransformer('Create E', { produces: ['E'] }),
    new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }),
  ];
  expect(calculateSchemas(inputSchema, transformers)).toEqual([
    ['A', 'B', 'C'],
    ['A', 'D'],
    ['A', 'D', 'E'],
    ['A', 'D', 'E'],
  ]);
});
|
3
ui/package-lock.json
generated
3
ui/package-lock.json
generated
@ -5137,8 +5137,7 @@
|
||||
"uuid": {
|
||||
"version": "8.3.2",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz",
|
||||
"integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==",
|
||||
"dev": true
|
||||
"integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg=="
|
||||
},
|
||||
"v8-to-istanbul": {
|
||||
"version": "7.0.0",
|
||||
|
@ -21,7 +21,8 @@
|
||||
},
|
||||
"dependencies": {
|
||||
"pdfjs-dist": "^2.5.207",
|
||||
"svelte-file-dropzone": "0.0.15"
|
||||
"svelte-file-dropzone": "0.0.15",
|
||||
"uuid": "^8.3.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@snowpack/plugin-dotenv": "^2.0.5",
|
||||
|
@ -1,14 +1,14 @@
|
||||
<script>
|
||||
import Upload from './Upload.svelte';
|
||||
|
||||
import { parseResult } from './store';
|
||||
import Result from './Result.svelte';
|
||||
import { parseResult, debug } from './store';
|
||||
import DebugView from './debug/DebugView.svelte';
|
||||
</script>
|
||||
|
||||
<div class="text-2xl font-semibold font-serif text-center bg-gray-400">PDF to Markdown Converter</div>
|
||||
<main class="mt-5 h-full">
|
||||
{#if $parseResult}
|
||||
<Result parseResult={$parseResult} />
|
||||
{#if $debug}
|
||||
<DebugView debug={$debug} />
|
||||
{:else}
|
||||
<Upload />
|
||||
{/if}
|
||||
|
@ -1,16 +0,0 @@
|
||||
<script>
|
||||
import type ParseResult from 'pdf-to-markdown-core/lib/src/ParseResult';
|
||||
import Table from './Table.svelte';
|
||||
|
||||
export let parseResult: ParseResult;
|
||||
</script>
|
||||
|
||||
<div class="mx-4">
|
||||
<div class="mb-4">
|
||||
<div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
|
||||
<div>Title: {parseResult.metadata.title()}</div>
|
||||
<div>Author: {parseResult.metadata.author()}</div>
|
||||
</div>
|
||||
|
||||
<Table columns={parseResult.columns} items={parseResult.items} />
|
||||
</div>
|
@ -1,172 +0,0 @@
|
||||
<script>
|
||||
import type Item from '@core/Item';
|
||||
import { Collection, BookOpen, Support, ArrowLeft, ArrowRight } from 'svelte-hero-icons';
|
||||
|
||||
export let columns: string[];
|
||||
export let items: Item[];
|
||||
const pages = [...new Set(items.map((item) => item.page))];
|
||||
const maxPage = Math.max(...pages);
|
||||
let focusedPage: number;
|
||||
$: focused = typeof focusedPage === 'number';
|
||||
let openedPageIndex = false;
|
||||
|
||||
const itemsGroupedByPage = items.reduce((map, item) => {
|
||||
if (!map.has(item.page)) {
|
||||
map.set(item.page, []);
|
||||
}
|
||||
map.get(item.page).push(item);
|
||||
return map;
|
||||
}, new Map<number, Item[]>());
|
||||
|
||||
/** Restricts the view to the given page and closes the page selection popup. */
function focusOnPage(pageNumber: number) {
    focusedPage = pageNumber;
    openedPageIndex = false;
}
|
||||
|
||||
/** Drops the page focus so every page is listed again and closes the page selection popup. */
function showAllPages() {
    focusedPage = undefined;
    openedPageIndex = false;
}
|
||||
|
||||
/**
 * Renders a cell value for display in the item table.
 *
 * - A number is fixed to two decimals.
 * - An array is rendered as `[a, b, c]`; if every element is a number the
 *   elements are fixed to two decimals as well.
 * - Anything else (strings, null, undefined, plain objects) is returned untouched.
 *
 * @param value the raw cell value
 * @returns the display string, or the value itself when no formatting applies
 */
function format(value: unknown): unknown {
    if (typeof value === 'number') {
        return value.toFixed(2);
    }
    // Array.isArray is the real guard: null and plain objects are 'object' too,
    // but they have neither length nor join and must not reach this branch.
    if (Array.isArray(value)) {
        const elements: unknown[] = value;
        // Only touch the elements when the whole array is numeric, so a mixed
        // array can never hit toFixed on a non-number.
        const allNumeric = elements.length > 0 && elements.every((element) => typeof element === 'number');
        const rendered = elements.map((element) =>
            allNumeric && typeof element === 'number' ? element.toFixed(2) : element,
        );
        return '[' + rendered.join(', ') + ']';
    }
    return value;
}
|
||||
</script>
|
||||
|
||||
<!-- Sticky Controls -->
|
||||
<div class="controls py-2">
|
||||
<div class="flex items-center space-x-2">
|
||||
<span>
|
||||
<span on:click={() => (openedPageIndex = !openedPageIndex)}>
|
||||
<BookOpen size="1x" class="hover:text-green-700 cursor-pointer" />
|
||||
</span>
|
||||
|
||||
<!-- Page selection popup-->
|
||||
{#if openedPageIndex}
|
||||
<div class="absolute mt-2 p-2 flex bg-gray-200 shadow-lg rounded-sm overflow-auto max-h-96">
|
||||
<span class="mt-1 pr-2" on:click={showAllPages}>
|
||||
<Collection size="1x" class="hover:text-green-700 cursor-pointer" />
|
||||
</span>
|
||||
<div
|
||||
class="grid gap-3"
|
||||
style="grid-template-columns: repeat({Math.min(20, maxPage + 1)}, minmax(0, 1fr));">
|
||||
{#each new Array(maxPage + 1) as _, idx}
|
||||
<div
|
||||
on:click={() => itemsGroupedByPage.has(idx) && focusOnPage(idx)}
|
||||
class="px-2 border border-gray-300 rounded-full text-center {itemsGroupedByPage.has(idx) ? 'hover:text-green-700 hover:border-green-700 cursor-pointer' : 'opacity-50'}">
|
||||
{idx}
|
||||
</div>
|
||||
{/each}
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
</span>
|
||||
|
||||
<div>|</div>
|
||||
<div>Transformation:</div>
|
||||
<ArrowLeft size="1x" class="hover:text-green-700 cursor-pointer opacity-50" />
|
||||
<div>Parse Result</div>
|
||||
<ArrowRight size="1x" class="hover:text-green-700 cursor-pointer" />
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Item table -->
|
||||
<table class="w-full text-left">
|
||||
<!-- Sticky header -->
|
||||
<thead class=" ">
|
||||
<th />
|
||||
<th>#</th>
|
||||
{#each columns as column}
|
||||
<th>{column}</th>
|
||||
{/each}
|
||||
</thead>
|
||||
<tbody>
|
||||
{#each [...itemsGroupedByPage].filter(([page]) => !focused || page === focusedPage) as [pageNumber, items], pageIdx}
|
||||
<!-- Separator between pages -->
|
||||
{#if pageIdx > 0}
|
||||
<tr class="h-5" />
|
||||
{/if}
|
||||
{#each items as item, itemIdx}
|
||||
<tr>
|
||||
<!-- Page number in first page item row -->
|
||||
{#if itemIdx === 0}
|
||||
<td class="page bg-gray-50">
|
||||
<div>Page {pageNumber} {focused ? '' : ' / ' + maxPage}</div>
|
||||
<div class="absolute flex">
|
||||
{#if !focused}
|
||||
<span on:click={() => focusOnPage(pageNumber)}>
|
||||
<Support size="1x" class="hover:text-green-700 cursor-pointer opacity-75" />
|
||||
</span>
|
||||
{:else}
|
||||
<span on:click={showAllPages}>
|
||||
<Collection size="1x" class="hover:text-green-700 cursor-pointer opacity-75" />
|
||||
</span>
|
||||
{/if}
|
||||
</div>
|
||||
</td>
|
||||
{:else}
|
||||
<td />
|
||||
{/if}
|
||||
<td>{itemIdx}</td>
|
||||
{#each columns as column}
|
||||
<td>{format(item.data[column])}</td>
|
||||
{/each}
|
||||
</tr>
|
||||
{/each}
|
||||
{/each}
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<style>
|
||||
.controls {
|
||||
@apply bg-gray-50;
|
||||
position: -webkit-sticky;
|
||||
position: sticky;
|
||||
top: 0;
|
||||
z-index: 3;
|
||||
}
|
||||
|
||||
.page {
|
||||
@apply text-lg;
|
||||
@apply font-semibold;
|
||||
@apply pr-4;
|
||||
@apply whitespace-nowrap;
|
||||
position: -webkit-sticky;
|
||||
position: sticky;
|
||||
top: 2em;
|
||||
z-index: 2;
|
||||
}
|
||||
|
||||
th {
|
||||
@apply px-1;
|
||||
position: -webkit-sticky;
|
||||
position: sticky;
|
||||
top: 2.4em;
|
||||
z-index: 2;
|
||||
}
|
||||
th:not(:first-child) {
|
||||
@apply bg-gray-300;
|
||||
@apply shadow;
|
||||
}
|
||||
td:not(:first-child) {
|
||||
@apply px-1;
|
||||
@apply border-b;
|
||||
}
|
||||
|
||||
tr:hover td:not(:first-child) {
|
||||
@apply bg-gray-200;
|
||||
}
|
||||
</style>
|
107
ui/src/debug/DebugView.svelte
Normal file
107
ui/src/debug/DebugView.svelte
Normal file
@ -0,0 +1,107 @@
|
||||
<script>
|
||||
import type Debugger from '@core/Debugger';
|
||||
import type Item from '@core/Item';
|
||||
import { Collection, BookOpen, ArrowLeft, ArrowRight } from 'svelte-hero-icons';
|
||||
import ItemTable from './ItemTable.svelte';
|
||||
|
||||
export let debug: Debugger;
|
||||
|
||||
// Display names of all stages, taken once from the debugger.
const stageNames = debug.stageNames;
// Toggles the page selection popup.
let openedPageIndex = false;
// Stays undefined while all pages are shown.
let focusedPage: number;

// Index of the stage currently displayed; the arrows move it within bounds.
let currentStage = 0;
$: canNext = currentStage + 1 < stageNames.length;
$: canPrev = currentStage > 0;
$: stageSchema = debug.stageSchema[currentStage];
$: stageItems = debug.stageResults(currentStage);
// isNaN(undefined) is true, so this is false until a page number is assigned.
$: pageFocus = !isNaN(focusedPage);
$: pagesNumbers = new Set(stageItems.map((item) => item.page));
// Math.max() without arguments yields -Infinity, which would make
// `new Array(maxPage + 1)` in the page popup throw a RangeError for a stage
// without items. Fall back to -1 so the popup simply lists no pages.
$: maxPage = pagesNumbers.size > 0 ? Math.max(...pagesNumbers) : -1;
// [page, items] pairs in the order in which pages first appear.
$: itemsByPage = [
    ...stageItems.reduce((map, item) => {
        if (!map.has(item.page)) {
            map.set(item.page, []);
        }
        map.get(item.page).push(item);
        return map;
    }, new Map<number, Item[]>()),
];
$: visiblePages = pageFocus ? itemsByPage.filter(([page]) => page === focusedPage) : itemsByPage;
|
||||
|
||||
/** Restricts the view to the given page and closes the page selection popup. */
function focusOnPage(pageNumber: number) {
    focusedPage = pageNumber;
    openedPageIndex = false;
}
|
||||
|
||||
/** Drops the page focus so every page is listed again and closes the page selection popup. */
function showAllPages() {
    focusedPage = undefined;
    openedPageIndex = false;
}
|
||||
</script>
|
||||
|
||||
<div class="mx-4">
|
||||
<div class="mb-4">
|
||||
<!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
|
||||
<div>Title: {parseResult.metadata.title()}</div>
|
||||
<div>Author: {parseResult.metadata.author()}</div> -->
|
||||
</div>
|
||||
|
||||
<!-- Sticky Controls -->
|
||||
<div class="controls py-2">
|
||||
<div class="flex items-center space-x-2">
|
||||
{#if pageFocus}
|
||||
<span on:click={showAllPages}>
|
||||
<Collection size="1x" class="hover:text-green-700 cursor-pointer opacity-75" />
|
||||
</span>
|
||||
{/if}
|
||||
<span>
|
||||
<span on:click={() => (openedPageIndex = !openedPageIndex)}>
|
||||
<BookOpen size="1x" class="hover:text-green-700 cursor-pointer" />
|
||||
</span>
|
||||
|
||||
<!-- Page selection popup-->
|
||||
{#if openedPageIndex}
|
||||
<div class="absolute mt-2 p-2 flex bg-gray-200 shadow-lg rounded-sm overflow-auto max-h-96">
|
||||
<span class="mt-1 pr-2" on:click={showAllPages}>
|
||||
<Collection size="1x" class="hover:text-green-700 cursor-pointer" />
|
||||
</span>
|
||||
<div
|
||||
class="grid gap-3"
|
||||
style="grid-template-columns: repeat({Math.min(20, maxPage + 1)}, minmax(0, 1fr));">
|
||||
{#each new Array(maxPage + 1) as _, idx}
|
||||
<div
|
||||
on:click={() => pagesNumbers.has(idx) && focusOnPage(idx)}
|
||||
class="px-2 border border-gray-300 rounded-full text-center {pagesNumbers.has(idx) ? 'hover:text-green-700 hover:border-green-700 cursor-pointer' : 'opacity-50'}">
|
||||
{idx}
|
||||
</div>
|
||||
{/each}
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
</span>
|
||||
|
||||
<div>|</div>
|
||||
<div>Transformation:</div>
|
||||
<span on:click={() => canPrev && currentStage--}>
|
||||
<ArrowLeft size="1x" class={canPrev ? 'hover:text-green-700 cursor-pointer' : 'opacity-50'} />
|
||||
</span>
|
||||
<span on:click={() => canNext && currentStage++}>
|
||||
<ArrowRight size="1x" class={canNext ? 'hover:text-green-700 cursor-pointer' : 'opacity-50'} />
|
||||
</span>
|
||||
<div>{stageNames[currentStage]}</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<ItemTable schema={stageSchema} itemsByPage={visiblePages} {maxPage} {pageFocus} />
|
||||
</div>
|
||||
|
||||
<style>
|
||||
.controls {
|
||||
@apply bg-gray-50;
|
||||
position: -webkit-sticky;
|
||||
position: sticky;
|
||||
top: 0;
|
||||
z-index: 3;
|
||||
}
|
||||
</style>
|
93
ui/src/debug/ItemTable.svelte
Normal file
93
ui/src/debug/ItemTable.svelte
Normal file
@ -0,0 +1,93 @@
|
||||
<script>
|
||||
import type Item from '@core/Item';
|
||||
|
||||
export let schema: string[];
|
||||
export let itemsByPage: [number, Item[]][];
|
||||
export let maxPage: number;
|
||||
export let pageFocus: boolean;
|
||||
|
||||
/**
 * Renders a cell value for display in the item table.
 *
 * - A number is fixed to two decimals.
 * - An array is rendered as `[a, b, c]`; if every element is a number the
 *   elements are fixed to two decimals as well.
 * - Anything else (strings, null, undefined, plain objects) is returned untouched.
 *
 * @param value the raw cell value
 * @returns the display string, or the value itself when no formatting applies
 */
function format(value: unknown): unknown {
    if (typeof value === 'number') {
        return value.toFixed(2);
    }
    // Array.isArray is the real guard: null and plain objects are 'object' too,
    // but they have neither length nor join and must not reach this branch.
    if (Array.isArray(value)) {
        const elements: unknown[] = value;
        // Only touch the elements when the whole array is numeric, so a mixed
        // array can never hit toFixed on a non-number.
        const allNumeric = elements.length > 0 && elements.every((element) => typeof element === 'number');
        const rendered = elements.map((element) =>
            allNumeric && typeof element === 'number' ? element.toFixed(2) : element,
        );
        return '[' + rendered.join(', ') + ']';
    }
    return value;
}
|
||||
</script>
|
||||
|
||||
<!-- Item table -->
|
||||
<table class="w-full text-left">
|
||||
<!-- Sticky header -->
|
||||
<thead class=" ">
|
||||
<th />
|
||||
<th>#</th>
|
||||
{#each schema as column}
|
||||
<th>{column}</th>
|
||||
{/each}
|
||||
</thead>
|
||||
<tbody>
|
||||
{#each itemsByPage as [pageNumber, items], pageIdx}
|
||||
<!-- Separator between pages -->
|
||||
{#if pageIdx > 0}
|
||||
<tr class="h-5" />
|
||||
{/if}
|
||||
{#each items as item, itemIdx}
|
||||
<tr>
|
||||
<!-- Page number in first page item row -->
|
||||
{#if itemIdx === 0}
|
||||
<td class="page bg-gray-50">
|
||||
<div>Page {pageNumber} {pageFocus ? '' : ' / ' + maxPage}</div>
|
||||
</td>
|
||||
{:else}
|
||||
<td />
|
||||
{/if}
|
||||
<td>{itemIdx}</td>
|
||||
{#each schema as column}
|
||||
<td>{format(item.data[column])}</td>
|
||||
{/each}
|
||||
</tr>
|
||||
{/each}
|
||||
{/each}
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<style>
|
||||
.page {
|
||||
@apply text-lg;
|
||||
@apply font-semibold;
|
||||
@apply pr-4;
|
||||
@apply whitespace-nowrap;
|
||||
position: -webkit-sticky;
|
||||
position: sticky;
|
||||
top: 2em;
|
||||
z-index: 2;
|
||||
}
|
||||
|
||||
th {
|
||||
@apply px-1;
|
||||
position: -webkit-sticky;
|
||||
position: sticky;
|
||||
top: 2.4em;
|
||||
z-index: 2;
|
||||
}
|
||||
th:not(:first-child) {
|
||||
@apply bg-gray-300;
|
||||
@apply shadow;
|
||||
}
|
||||
td:not(:first-child) {
|
||||
@apply px-1;
|
||||
@apply border-b;
|
||||
}
|
||||
|
||||
tr:hover td:not(:first-child) {
|
||||
@apply bg-gray-200;
|
||||
}
|
||||
</style>
|
@ -1,21 +1,23 @@
|
||||
import { pdfParser, parseReporter } from '@core';
|
||||
import { pdfParser, createPipeline, parseReporter } from '@core';
|
||||
import type ProgressListenFunction from '@core/ProgressListenFunction';
|
||||
import type ParseResult from '@core/ParseResult';
|
||||
import type Debugger from '@core/Debugger';
|
||||
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
|
||||
|
||||
import { Writable, writable } from 'svelte/store';
|
||||
|
||||
export let debug: Writable<Debugger> = writable(undefined);
|
||||
export let parseResult: Writable<ParseResult> = writable(undefined);
|
||||
|
||||
pdfjs.GlobalWorkerOptions.workerSrc = 'worker/pdf.worker.min.js';
|
||||
|
||||
const parser = pdfParser(pdfjs);
|
||||
const pdfPipeline = createPipeline(pdfjs, {});
|
||||
|
||||
export async function loadExample(progressListener: ProgressListenFunction): Promise<ParseResult> {
|
||||
return parsePdf(parser.parseUrl('/ExamplePdf.pdf', parseReporter(progressListener)));
|
||||
/**
 * Runs the bundled example PDF through `parsePdf`.
 *
 * @param progressListener forwarded unchanged to `parsePdf`
 * @returns whatever `parsePdf` resolves to
 */
export async function loadExample(progressListener: ProgressListenFunction): Promise<any> {
    const exampleUrl = '/ExamplePdf.pdf';
    return parsePdf(exampleUrl, progressListener);
}
|
||||
|
||||
export async function processUpload(file: File, progressListener: ProgressListenFunction): Promise<ParseResult> {
|
||||
export async function processUpload(file: File, progressListener: ProgressListenFunction): Promise<any> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const reader = new FileReader();
|
||||
reader.onerror = reject;
|
||||
@ -25,13 +27,18 @@ export async function processUpload(file: File, progressListener: ProgressListen
|
||||
reader.readAsArrayBuffer(file);
|
||||
}).then((buffer) => {
|
||||
const data = new Uint8Array(buffer as ArrayBuffer);
|
||||
return parsePdf(parser.parseBytes(data, parseReporter(progressListener)));
|
||||
return parsePdf(data, progressListener);
|
||||
});
|
||||
}
|
||||
|
||||
async function parsePdf(parsePromise: Promise<ParseResult>): Promise<ParseResult> {
|
||||
return parsePromise.then((result) => {
|
||||
parseResult.set(result);
|
||||
return result;
|
||||
/**
 * Runs the pipeline in debug mode for the given source and publishes the
 * resulting debugger instance through the `debug` store.
 *
 * The pipeline promise is awaited so that callers resolve only once the store
 * has been updated, and so that a pipeline failure rejects the returned
 * promise instead of becoming an unhandled rejection.
 *
 * @param src URL of the PDF or its raw bytes
 * @param progressListener forwarded unchanged to the pipeline
 * @returns the debugger instance that was put into the `debug` store
 */
async function parsePdf(src: string | Uint8Array, progressListener: ProgressListenFunction): Promise<any> {
    //TODO without debug-flag
    // return pdfPipeline.execute(src, progressListener).then((result) => {
    //   parseResult.set(result);
    //   return result;
    // });
    const debugInstance = await pdfPipeline.debug(src, progressListener);
    debug.set(debugInstance);
    return debugInstance;
}
|
||||
|
Loading…
Reference in New Issue
Block a user