Rudimentary explicit debug support

This commit is contained in:
Johannes Zillmann 2021-02-05 18:28:04 +01:00
parent 95a7e3e93b
commit 4401f1fb5c
27 changed files with 687 additions and 267 deletions

8
core/src/Config.ts Normal file
View File

@ -0,0 +1,8 @@
import ItemTransformer from './transformer/ItemTransformer';
/**
 * Options accepted when creating a pipeline. Both fields are optional.
 */
export default interface Config {
// See DocumentInitParameters from https://mozilla.github.io/pdf.js/api/draft/module-pdfjsLib.html#DocumentInitParameters
// Handed to the PdfParser as the default parameters for loading a document.
pdfjsParams?: object;
// Transformers the pipeline should run over the parsed items.
transformers?: ItemTransformer[];
// TODO keep pdfPages ?
}

41
core/src/Debugger.ts Normal file
View File

@ -0,0 +1,41 @@
import { assert } from './assert';
import Item from './Item';
import ItemTransformer from './transformer/ItemTransformer';
import ParseResult from './ParseResult';
import { calculateSchemas } from './transformer/transformerUtil';
import TransformContext from './transformer/TransformContext';
/**
 * Gives stage-by-stage access to a transformation run.
 *
 * Stage 0 holds the items as they came out of the parser; stage n holds the
 * output of the n-th transformer. Stages are computed lazily on first access
 * and cached afterwards.
 */
export default class Debugger {
  // parseResult: ParseResult;
  context: TransformContext;
  transformers: ItemTransformer[];
  stageNames: string[];
  stageSchema: string[][];
  private stageItems: Item[][];

  /**
   * @param initialSchema columns of the parsed items (schema of stage 0)
   * @param initialItems the parsed items (items of stage 0)
   * @param context context handed to every transformer
   * @param transformers transformers in the order they are applied
   */
  constructor(
    initialSchema: string[],
    initialItems: Item[],
    context: TransformContext,
    transformers: ItemTransformer[],
  ) {
    // this.parseResult = parseResult;
    this.transformers = transformers;
    this.context = context;
    this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
    this.stageItems = [initialItems];
    this.stageSchema = calculateSchemas(initialSchema, transformers);
  }

  //TODO return MarkedItem ? (removed, added, etc..)?
  //TODO StageResult == class with schema and marked items ?
  /**
   * Returns the items of the given stage, running every transformer up to
   * that stage which has not been run yet.
   *
   * @param stageIndex 0 for the parse result, n for the output of the n-th transformer
   * @returns the (cached) items of that stage
   * @throws Error if stageIndex is not an integer between 0 and the number of transformers
   */
  stageResults(stageIndex: number): Item[] {
    const lastStage = this.transformers.length;
    assert(
      Number.isInteger(stageIndex) && stageIndex >= 0 && stageIndex <= lastStage,
      `Invalid stage index ${stageIndex}, expected an integer between 0 and ${lastStage}`,
    );
    // Stages are filled in order, so everything below stageItems.length is already cached.
    for (let idx = this.stageItems.length; idx <= stageIndex; idx++) {
      const previousItems = this.stageItems[idx - 1];
      // Hand over a copy: a transformer that mutates its input array must not
      // corrupt the cached result of the previous stage.
      const stageItems = this.transformers[idx - 1].transform(this.context, [...previousItems]);
      this.stageItems.push(stageItems);
    }
    return this.stageItems[stageIndex];
  }
}

View File

@ -1,13 +1,32 @@
import { v4 as uuidv4 } from 'uuid';
export default class Item { export default class Item {
page: number; page: number;
data: object; data: object;
uuid?: string;
constructor(page: number, data: object) { constructor(page: number, data: object, uuid: string = uuidv4()) {
this.page = page; this.page = page;
this.data = data; this.data = data;
this.uuid = uuid;
} }
value(column: string): object { value(column: string): object {
return this.data[column]; return this.data[column];
} }
withDataAddition(data: object): Item {
return this.withData({ ...this.data, ...data });
}
withData(data: object): Item {
return new Item(this.page, data, this.uuid);
}
/**
* Returns the item without a uuid.
*/
withoutUuid(): Item {
return new Item(this.page, this.data, '');
}
} }

View File

@ -1,16 +1,19 @@
import Item from './Item'; import type Item from './Item';
import type Metadata from './Metadata'; import type Metadata from './Metadata';
import type PageViewport from './parse/PageViewport';
export default class ParseResult { export default class ParseResult {
pdfPages: any[]; pdfPages: any[];
pageViewports: PageViewport[];
metadata: Metadata; metadata: Metadata;
columns: string[]; schema: string[];
items: Item[]; items: Item[];
constructor(pdfPages: any[], metadata: Metadata, columns: string[], items: Item[]) { constructor(pdfPages: any[], pageViewports: PageViewport[], metadata: Metadata, schema: string[], items: Item[]) {
this.pdfPages = pdfPages; this.pdfPages = pdfPages;
this.pageViewports = pageViewports;
this.metadata = metadata; this.metadata = metadata;
this.columns = columns; this.schema = schema;
this.items = items; this.items = items;
} }

View File

@ -1,13 +0,0 @@
import type ParsedPageItem from './ParsedPageItem';
/**
 * Bundles what was extracted from a single page: its index, the underlying
 * pdf page object and the items found on it.
 */
export default class ParsedPage {
  constructor(public index: number, public pdfPage: any, public items: ParsedPageItem[]) {}
}

View File

@ -1,6 +1,5 @@
import Item from './Item'; import Item from './Item';
import Metadata from './Metadata'; import Metadata from './Metadata';
import ParsedPage from './ParsedPage';
import type ParseReporter from './ParseReporter'; import type ParseReporter from './ParseReporter';
import ParseResult from './ParseResult'; import ParseResult from './ParseResult';
import TextDirection from './TextDirection'; import TextDirection from './TextDirection';
@ -11,31 +10,18 @@ import type TextItem from './TextItem';
*/ */
export default class PdfParser { export default class PdfParser {
pdfjs: any; pdfjs: any;
columns = ['str', 'dir', 'width', 'height', 'transform', 'fontName']; defaultParams: object;
schema = ['str', 'fontName', 'dir', 'width', 'height', 'transform'];
constructor(pdfjs: any) { constructor(pdfjs: any, defaultParams = {}) {
this.pdfjs = pdfjs; this.pdfjs = pdfjs;
this.defaultParams = defaultParams;
} }
async parseBytes(data: Uint8Array, reporter: ParseReporter): Promise<ParseResult> { async parse(src: string | Uint8Array | object, reporter: ParseReporter): Promise<ParseResult> {
return this.parse(this.params({ data }), reporter); const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) };
}
async parseUrl(url: string, reporter: ParseReporter): Promise<ParseResult> {
return this.parse(this.params({ url }), reporter);
}
private params(dataSourceParams: object): object {
const defaultParams = {
cMapUrl: 'cmaps/',
cMapPacked: true,
};
return { ...defaultParams, ...dataSourceParams };
}
async parse(parameter: object, reporter: ParseReporter): Promise<ParseResult> {
return this.pdfjs return this.pdfjs
.getDocument(parameter) .getDocument(documentInitParameters)
.promise.then((pdfDocument) => { .promise.then((pdfDocument) => {
reporter.parsedDocumentHeader(pdfDocument.numPages); reporter.parsedDocumentHeader(pdfDocument.numPages);
return Promise.all([ return Promise.all([
@ -47,16 +33,38 @@ export default class PdfParser {
]); ]);
}) })
.then(([metadata, pages]) => { .then(([metadata, pages]) => {
const pdfPages = pages.map((page) => page.pdfPage); const pdfPages = pages.map((page) => page.page);
const items = pages.reduce((allItems, page) => allItems.concat(page.items), []); const items = pages.reduce((allItems, page) => allItems.concat(page.items), []);
return new ParseResult(pdfPages, new Metadata(metadata), this.columns, items); const pageViewports = pdfPages.map((page) => {
const viewPort = page.getViewport({ scale: 1.0 });
return { transformFunction: (itemTransform: number[]) => this.pdfjs.Util.transform(viewPort, itemTransform) };
}); });
return new ParseResult(pdfPages, pageViewports, new Metadata(metadata), this.schema, items);
});
}
private documentInitParameters(src: string | Uint8Array | object): object {
if (typeof src === 'string') {
return { url: src };
}
if (this.isArrayBuffer(src)) {
return { data: src };
}
if (typeof src === 'object') {
return src;
}
throw new Error('Invalid PDFjs parameter for getDocument. Need either Uint8Array, string or a parameter object');
}
private isArrayBuffer(object) {
return typeof object === 'object' && object !== null && object.byteLength !== undefined;
} }
private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> { private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> {
return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => { return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
return accumulatorPromise.then((accumulatedResults) => { return accumulatorPromise.then((accumulatedResults) => {
return pdfDocument.getPage(index + 1).then((page) => { return pdfDocument.getPage(index + 1).then((page) => {
const viewport = page.getViewport({ scale: 1.0 });
return this.triggerFontRetrieval(page).then(() => return this.triggerFontRetrieval(page).then(() =>
page page
.getTextContent({ .getTextContent({
@ -66,7 +74,7 @@ export default class PdfParser {
.then((textContent) => { .then((textContent) => {
const items = textContent.items.map((rawItem) => new Item(index, rawItem)); const items = textContent.items.map((rawItem) => new Item(index, rawItem));
reporter.parsedPage(index); reporter.parsedPage(index);
return [...accumulatedResults, new ParsedPage(index, page, items)]; return [...accumulatedResults, { index, page, items }];
}), }),
); );
}); });
@ -92,6 +100,8 @@ export default class PdfParser {
// console.log('Parsing page ' + index); // console.log('Parsing page ' + index);
return pdfDocument.getPage(index + 1).then((page) => { return pdfDocument.getPage(index + 1).then((page) => {
const viewport = page.getViewport({ scale: 1.0 }); const viewport = page.getViewport({ scale: 1.0 });
console.log(viewport);
return this.triggerFontRetrieval(page).then(() => return this.triggerFontRetrieval(page).then(() =>
page.getTextContent().then((textContent) => { page.getTextContent().then((textContent) => {
// console.log(textContent); // console.log(textContent);
@ -126,7 +136,13 @@ export default class PdfParser {
// console.log('Parsed result:', r.length); // console.log('Parsed result:', r.length);
// console.log('Parsed result:', r); // console.log('Parsed result:', r);
return new ParseResult([], new Metadata(metadata), [], []); return {};
}); });
} }
} }
/**
 * Per-page record collected while the pages of a document are extracted.
 */
interface ParsedPage {
// Zero-based page position (the page is fetched with getPage(index + 1)).
index: number;
// The page object returned by pdfDocument.getPage().
page: any;
// The text items extracted from this page.
items: Item[];
}

45
core/src/PdfPipeline.ts Normal file
View File

@ -0,0 +1,45 @@
import PdfParser from './PdfParser';
import ProgressListenFunction from './ProgressListenFunction';
import ParseProgressReporter from './ParseProgressReporter';
import ItemTransformer from './transformer/ItemTransformer';
import Item from './Item';
import ParseResult from './ParseResult';
import Debugger from './Debugger';
import { verifyRequiredColumns } from './transformer/transformerUtil';
import TransformContext from './transformer/TransformContext';
/**
 * Couples a parser with an ordered list of transformers. A source can either
 * be run through the whole chain (execute) or be prepared for stage-by-stage
 * inspection (debug).
 */
export default class PdfPipeline {
  constructor(public parser: PdfParser, public transformers: ItemTransformer[]) {}

  /**
   * Parses the source and checks that the parsed schema satisfies the column
   * requirements of all configured transformers.
   */
  private async parse(
    src: string | Uint8Array | object,
    progressListener: ProgressListenFunction,
  ): Promise<ParseResult> {
    const parsed = await this.parser.parse(src, new ParseProgressReporter(progressListener));
    verifyRequiredColumns(parsed.schema, this.transformers);
    return parsed;
  }

  //TODO PipelineResult
  /**
   * Parses the source, feeds the items through every transformer in order and
   * returns the parse result with its items replaced by the final output.
   */
  async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
    const parsed = await this.parse(src, progressListener);
    const context = { pageViewports: parsed.pageViewports };
    parsed.items = this.transformers.reduce(
      (currentItems, transformer) => transformer.transform(context, currentItems),
      parsed.items,
    );
    return parsed;
  }

  /**
   * Parses the source and returns a Debugger over the parsed items instead of
   * running the transformers right away.
   */
  async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<Debugger> {
    const { schema, items, pageViewports } = await this.parse(src, progressListener);
    return new Debugger(schema, items, { pageViewports }, this.transformers);
  }
}

View File

@ -0,0 +1,7 @@
/**
 * Declares what a transformer needs from its input and how it changes the
 * schema. All fields are optional.
 */
export default interface TransformerDescription {
// NOTE(review): "Globels" is presumably a misspelling of "Globals"; how these two lists are used is not visible here.
readonly consumesGlobels?: string[];
readonly producesGlobels?: string[];
// Columns the transformer reads.
readonly consumes?: string[];
// Columns the transformer adds.
readonly produces?: string[];
// Columns the transformer drops.
readonly removes?: string[];
}

11
core/src/assert.ts Normal file
View File

@ -0,0 +1,11 @@
/**
 * Throws if the given condition does not hold.
 *
 * @param condition the condition that is expected to be true
 * @param message error message; falls back to 'Assertion failed' when omitted or empty
 * @throws Error if condition is false
 */
export function assert(condition: boolean, message?: string): void {
  if (!condition) {
    throw new Error(message || 'Assertion failed');
  }
}
/**
 * Passes the value through unchanged after making sure it is neither null nor
 * undefined.
 *
 * @param value the value to check
 * @param message error message used when the check fails
 * @returns the given value
 */
export function assertDefined<T>(value: T, message: string): T {
  const isPresent = value !== null && value !== undefined;
  assert(isPresent, message);
  return value;
}

View File

@ -1,11 +1,32 @@
import Config from './Config';
import type ProgressListenFunction from './ProgressListenFunction'; import type ProgressListenFunction from './ProgressListenFunction';
import ParseProgressReporter from './ParseProgressReporter'; import ParseProgressReporter from './ParseProgressReporter';
import PdfParser from './PdfParser'; import PdfParser from './PdfParser';
import PdfPipeline from './PdfPipeline';
import AdjustHeight from './transformer/AdjustHeight';
import CalculateCoordinates from './transformer/CalculateCoordinates';
// Transformers used when a config does not bring its own, in the order they are applied.
const transformers = [new AdjustHeight(), new CalculateCoordinates()];
// Configuration used when the caller does not supply one.
const defaultConfig: Config = {
pdfjsParams: {
// TODO check if that cmap thing makes sense since we don't bundle them
cMapUrl: 'cmaps/',
cMapPacked: true,
},
transformers,
};
export function pdfParser(pdfJs: any) { export function pdfParser(pdfJs: any) {
return new PdfParser(pdfJs); return new PdfParser(pdfJs, defaultConfig.pdfjsParams);
} }
export function parseReporter(progressListener: ProgressListenFunction) { export function parseReporter(progressListener: ProgressListenFunction) {
return new ParseProgressReporter(progressListener); return new ParseProgressReporter(progressListener);
} }
/**
 * Builds a pipeline around the given PDF.js instance.
 *
 * @param pdfJs the PDF.js library object
 * @param config parser parameters and transformers; the default config is used when omitted
 * @returns a pipeline using the configured transformers, or the default ones when the config has none
 */
export function createPipeline(pdfJs: any, config = defaultConfig): PdfPipeline {
  const { pdfjsParams, transformers: configuredTransformers } = config;
  return new PdfPipeline(new PdfParser(pdfJs, pdfjsParams), configuredTransformers || transformers);
}

View File

@ -0,0 +1,5 @@
// Maps an item's transform array to another number array.
// NOTE(review): presumably both are 6-element PDF transformation matrices — confirm against the parser.
type ItemTransformFunction = (itemTransform: number[]) => number[];
/**
 * Per-page viewport information made available to transformers.
 */
export default interface PageViewport {
// Applies the page's viewport to an item transform.
transformFunction: ItemTransformFunction;
}

View File

@ -0,0 +1,37 @@
import PageViewport from 'src/parse/PageViewport';
import Item from '../Item';
import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext';
/**
 * Rescales the 'height' of items by the font height derived from the item's
 * viewport-transformed 'transform' matrix.
 */
export default class AdjustHeight extends ItemTransformer {
  constructor() {
    super('Adjust Heights', {
      consumes: ['transform', 'height'],
    });
  }

  /**
   * @param context provides the viewport of every page
   * @param items the items to adjust; neither the array nor the items are modified
   * @returns a new array holding, per input item, either the unchanged item or a copy with the adjusted height
   */
  transform(context: TransformContext, items: Item[]): Item[] {
    let page = -1;
    let pageViewport: PageViewport;
    //TODO groupBy page
    return items.map((item) => {
      // Look the viewport up only when the page changes.
      if (item.page !== page) {
        pageViewport = context.pageViewports[item.page];
        page = item.page;
      }
      const itemTransform = item.data['transform'];
      const itemHeight = item.data['height'];
      const tx = pageViewport.transformFunction(itemTransform);
      const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
      const dividedHeight = itemHeight / fontHeight;
      // Keep the original height when the division is not meaningful (NaN, or
      // Infinity for a zero font height) or would not exceed 1.
      const newHeight = Number.isFinite(dividedHeight) && dividedHeight > 1 ? dividedHeight : itemHeight;
      return newHeight !== itemHeight ? item.withDataAddition({ height: newHeight }) : item;
    });
  }
}

View File

@ -0,0 +1,19 @@
import Item from '../Item';
import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext';
/**
 * Placeholder for the transformer that is meant to turn the 'transform'
 * column into 'X' and 'Y' columns.
 *
 * NOTE(review): the description promises X/Y and the removal of 'transform',
 * but the implementation is still a stub that only drops the first item.
 */
export default class CalculateCoordinates extends ItemTransformer {
  constructor() {
    super('Calculate Coordinates', {
      consumes: ['transform'],
      produces: ['X', 'Y'],
      removes: ['transform'],
    });
  }

  /**
   * @param context unused by the current stub
   * @param items the input items; the array is left untouched
   * @returns a new array containing all items but the first
   */
  transform(context: TransformContext, items: Item[]): Item[] {
    // const transform: number[] = item.value['Transform'];
    // slice() instead of shift(): the input array may be cached by the caller
    // (e.g. as the result of the previous stage) and must not be modified.
    return items.slice(1);
  }
}

View File

@ -0,0 +1,25 @@
import TransformerDescription from '../TransformerDescription';
import type Item from '../Item';
import TransformContext from './TransformContext';
/**
 * Base class of all transformers: a named step that turns a list of items
 * into another list of items and describes its effect on the schema.
 */
export default abstract class ItemTransformer {
  readonly name: string;
  readonly description: TransformerDescription;

  /**
   * @param name display name of the transformer
   * @param description schema effects; every list that is left out defaults to an empty array
   */
  constructor(name: string, description: TransformerDescription) {
    this.name = name;
    this.description = {
      consumesGlobels: [],
      producesGlobels: [],
      consumes: [],
      produces: [],
      removes: [],
      ...description,
    };
  }

  // columnar-changes: described
  /** Produces the output items for the given input items. */
  abstract transform(context: TransformContext, items: Item[]): Item[];
}

View File

@ -0,0 +1,5 @@
import PageViewport from 'src/parse/PageViewport';
/**
 * Document-level information handed to every transformer alongside the items.
 */
export default interface TransformContext {
// One viewport per page, indexed by an item's page number.
pageViewports: PageViewport[];
}

View File

@ -0,0 +1,55 @@
import TransformerDescription from 'src/TransformerDescription';
import { assert } from '../assert';
import ItemTransformer from './ItemTransformer';
/**
 * Walks through all transformers and makes sure that every column a transformer
 * requires ({@link TransformerDescription#consumes}) is present in the schema
 * produced by its predecessor. Throws on the first violation.
 *
 * @param initialSchema the columns available before the first transformer runs
 * @param transformers the transformers in the order they are applied
 */
export function verifyRequiredColumns(initialSchema: string[], transformers: ItemTransformer[]): void {
  // Calculating the schemas validates every step; the schemas themselves are not needed here.
  void calculateSchemas(initialSchema, transformers);
}
//TODO debug schema
// initial - all unannotated
// second - 2 removed, 1 added
// third - all as before without the removed
/**
 * Derives the schema of every stage: the initial schema followed by one
 * schema per transformer, each being its input schema minus the removed
 * columns plus the produced ones. Each transformer's column references are
 * validated against its input schema on the way.
 *
 * @param initialSchema the columns available before the first transformer runs
 * @param transformers the transformers in the order they are applied
 * @returns transformers.length + 1 schemas, the first one being initialSchema itself
 */
export function calculateSchemas(initialSchema: string[], transformers: ItemTransformer[]): string[][] {
  const schemas: string[][] = [initialSchema];
  for (const transformer of transformers) {
    const inputSchema = schemas[schemas.length - 1];
    validateReferences(inputSchema, transformer.name, transformer.description);
    const remaining = inputSchema.filter((column) => !transformer.description.removes?.includes(column));
    schemas.push([...remaining, ...(transformer.description.produces ?? [])]);
  }
  return schemas;
}
/**
 * Makes sure every column a transformer consumes or removes exists in its
 * input schema; throws with a descriptive message on the first missing one.
 */
function validateReferences(
  inputSchema: string[],
  transformerName: string,
  transformerDescription: TransformerDescription,
) {
  for (const column of transformerDescription.consumes ?? []) {
    assert(
      inputSchema.includes(column),
      `Input schema [${inputSchema.join(
        ', ',
      )}] for transformer '${transformerName}' does not contain the required column '${column}' (consumes)`,
    );
  }
  for (const column of transformerDescription.removes ?? []) {
    assert(
      inputSchema.includes(column),
      `Input schema [${inputSchema.join(
        ', ',
      )}] for transformer '${transformerName}' does not contain the required column '${column}' (removes)`,
    );
  }
}

View File

@ -0,0 +1,38 @@
import Debugger from 'src/Debugger';
import Item from 'src/Item';
import ItemTransformer from 'src/transformer/ItemTransformer';
import Metadata from 'src/Metadata';
import ParseResult from 'src/ParseResult';
import TransformerDescription from 'src/TransformerDescription';
import TransformContext from 'src/transformer/TransformContext';
/**
 * Test double that ignores its input and always answers with the items it
 * was constructed with.
 */
class TestTransformer extends ItemTransformer {
  constructor(name: string, description: TransformerDescription, public items: Item[]) {
    super(name, description);
  }

  transform(_: TransformContext, items: Item[]): Item[] {
    return this.items;
  }
}
// A single transformer replaces columns A and B with C; the debugger must
// expose names, schemas and items of both stages.
test('basic debug', async () => {
  const parsedSchema = ['A', 'B'];
  const parsedItems = [new Item(0, { A: 'a_row1', B: 'b_row1' }), new Item(0, { A: 'a_row2', B: 'b_row2' })];

  const trans1Desc = { consumes: ['A', 'B'], produces: ['C'], removes: ['A', 'B'] };
  const trans1Items = parsedItems.map((item) => item.withData({ C: `c=${item.value('A')}+${item.value('B')}` }));

  const transformers = [new TestTransformer('Trans1', trans1Desc, trans1Items)];
  const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers);

  expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
  expect(debug.stageSchema).toEqual([parsedSchema, ['C']]);

  expect(debug.stageResults(0)).toEqual(parsedItems);
  expect(debug.stageResults(1)).toEqual(trans1Items);
});

View File

@ -12,7 +12,7 @@ test('basic example PDF parse', async () => {
const data = fs.readFileSync('../examples/ExamplePdf.pdf', null); const data = fs.readFileSync('../examples/ExamplePdf.pdf', null);
// to test // to test
const result = await parser.parseBytes( const result = await parser.parse(
data, data,
new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)), new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)),
); );
@ -29,7 +29,7 @@ test('basic example PDF parse', async () => {
expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]); expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
// verify first n items // verify first n items
expect(result.items.slice(0, 16)).toEqual([ expect(result.items.slice(0, 16).map((item) => item.withoutUuid())).toEqual([
new Item(0, { new Item(0, {
str: 'Mega Überschrift', str: 'Mega Überschrift',
dir: 'ltr', dir: 'ltr',
@ -37,7 +37,7 @@ test('basic example PDF parse', async () => {
height: 30, height: 30,
transform: [30, 0, 0, 30, 175, 756], transform: [30, 0, 0, 30, 175, 756],
fontName: 'g_d0_f1', fontName: 'g_d0_f1',
}), }).withoutUuid(),
new Item(0, { new Item(0, {
str: '2te Überschrift', str: '2te Überschrift',
dir: 'ltr', dir: 'ltr',
@ -45,7 +45,7 @@ test('basic example PDF parse', async () => {
height: 20, height: 20,
transform: [20, 0, 0, 20, 233, 665], transform: [20, 0, 0, 20, 233, 665],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}), }).withoutUuid(),
new Item(0, { new Item(0, {
str: 'Dies ist eine Test-PDF', str: 'Dies ist eine Test-PDF',
dir: 'ltr', dir: 'ltr',
@ -53,7 +53,7 @@ test('basic example PDF parse', async () => {
height: 11, height: 11,
transform: [11, 0, 0, 11, 240, 585], transform: [11, 0, 0, 11, 240, 585],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}), }).withoutUuid(),
new Item(0, { new Item(0, {
str: '.', str: '.',
dir: 'ltr', dir: 'ltr',
@ -61,7 +61,7 @@ test('basic example PDF parse', async () => {
height: 11, height: 11,
transform: [11, 0, 0, 11, 352.6927, 585], transform: [11, 0, 0, 11, 352.6927, 585],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}), }).withoutUuid(),
new Item(0, { new Item(0, {
str: '1', str: '1',
dir: 'ltr', dir: 'ltr',
@ -69,7 +69,7 @@ test('basic example PDF parse', async () => {
height: 7.333334, height: 7.333334,
transform: [7.333334, 0, 0, 7.333334, 348, 588], transform: [7.333334, 0, 0, 7.333334, 348, 588],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}), }).withoutUuid(),
new Item(0, { new Item(0, {
str: 'Fürs Testen des ', str: 'Fürs Testen des ',
dir: 'ltr', dir: 'ltr',
@ -77,7 +77,7 @@ test('basic example PDF parse', async () => {
height: 11, height: 11,
transform: [11, 0, 0, 11, 208, 572], transform: [11, 0, 0, 11, 208, 572],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}), }).withoutUuid(),
new Item(0, { new Item(0, {
str: 'Markdown Parsers', str: 'Markdown Parsers',
dir: 'ltr', dir: 'ltr',
@ -85,7 +85,7 @@ test('basic example PDF parse', async () => {
height: 11, height: 11,
transform: [11, 0, 0, 11, 291.77832, 572], transform: [11, 0, 0, 11, 291.77832, 572],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}), }).withoutUuid(),
new Item(0, { new Item(0, {
str: '.', str: '.',
dir: 'ltr', dir: 'ltr',
@ -93,7 +93,7 @@ test('basic example PDF parse', async () => {
height: 11, height: 11,
transform: [11, 0, 0, 11, 383.47360000000003, 572], transform: [11, 0, 0, 11, 383.47360000000003, 572],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}), }).withoutUuid(),
new Item(0, { new Item(0, {
str: ' ', str: ' ',
dir: 'ltr', dir: 'ltr',
@ -101,7 +101,7 @@ test('basic example PDF parse', async () => {
height: 11, height: 11,
transform: [11, 0, 0, 11, 61.078451, 59], transform: [11, 0, 0, 11, 61.078451, 59],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}), }).withoutUuid(),
new Item(0, { new Item(0, {
str: 'In Deutsch.', str: 'In Deutsch.',
dir: 'ltr', dir: 'ltr',
@ -109,7 +109,7 @@ test('basic example PDF parse', async () => {
height: 11, height: 11,
transform: [11, 0, 0, 11, 64.134603, 59], transform: [11, 0, 0, 11, 64.134603, 59],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}), }).withoutUuid(),
new Item(0, { new Item(0, {
str: '1', str: '1',
dir: 'ltr', dir: 'ltr',
@ -117,7 +117,7 @@ test('basic example PDF parse', async () => {
height: 7.333334, height: 7.333334,
transform: [7.333334, 0, 0, 7.333334, 57, 62], transform: [7.333334, 0, 0, 7.333334, 57, 62],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}), }).withoutUuid(),
new Item(0, { new Item(0, {
str: '\x00', str: '\x00',
dir: 'ltr', dir: 'ltr',
@ -125,7 +125,7 @@ test('basic example PDF parse', async () => {
height: 12, height: 12,
transform: [12, 0, 0, 12, 294, 45], transform: [12, 0, 0, 12, 294, 45],
fontName: 'g_d0_f3', fontName: 'g_d0_f3',
}), }).withoutUuid(),
new Item(0, { new Item(0, {
str: '1', str: '1',
dir: 'ltr', dir: 'ltr',
@ -133,7 +133,7 @@ test('basic example PDF parse', async () => {
height: 12, height: 12,
transform: [12, 0, 0, 12, 294, 45], transform: [12, 0, 0, 12, 294, 45],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}), }).withoutUuid(),
new Item(1, { new Item(1, {
str: '\x00', str: '\x00',
dir: 'ltr', dir: 'ltr',
@ -141,7 +141,7 @@ test('basic example PDF parse', async () => {
height: 12, height: 12,
transform: [12, 0, 0, 12, 294, 45], transform: [12, 0, 0, 12, 294, 45],
fontName: 'g_d0_f3', fontName: 'g_d0_f3',
}), }).withoutUuid(),
new Item(1, { new Item(1, {
str: '2', str: '2',
dir: 'ltr', dir: 'ltr',
@ -149,7 +149,7 @@ test('basic example PDF parse', async () => {
height: 12, height: 12,
transform: [12, 0, 0, 12, 294, 45], transform: [12, 0, 0, 12, 294, 45],
fontName: 'g_d0_f2', fontName: 'g_d0_f2',
}), }).withoutUuid(),
new Item(2, { new Item(2, {
str: 'Paragraphen', str: 'Paragraphen',
dir: 'ltr', dir: 'ltr',
@ -157,7 +157,7 @@ test('basic example PDF parse', async () => {
height: 18, height: 18,
transform: [18, 0, 0, 18, 57, 767], transform: [18, 0, 0, 18, 57, 767],
fontName: 'g_d0_f1', fontName: 'g_d0_f1',
}), }).withoutUuid(),
]); ]);
// verify progress // verify progress

View File

@ -0,0 +1,59 @@
import TransformerDescription from 'src/TransformerDescription';
import Item from 'src/Item';
import ItemTransformer from 'src/transformer/ItemTransformer';
import TransformContext from 'src/transformer/TransformContext';
import { calculateSchemas, verifyRequiredColumns } from 'src/transformer/transformerUtil';
/**
 * Transformer used for schema tests only: it carries a name and a
 * description (via the inherited constructor) and hands items through untouched.
 */
class TestSchemaTransformer extends ItemTransformer {
  transform(_: TransformContext, items: Item[]): Item[] {
    return items;
  }
}
// A chain in which every consumed or removed column is available must pass verification.
test('verify valid transform', async () => {
  const schema = ['A', 'B', 'C'];
  const chain = [
    new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }),
    new TestSchemaTransformer('Create E', { produces: ['E'] }),
    new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }),
  ];
  verifyRequiredColumns(schema, chain);
});

// Consuming a column that is not part of the input schema is reported.
test('verify invalid consume', async () => {
  const schema = ['A', 'B', 'C'];
  const chain = [new TestSchemaTransformer('Consumes X', { consumes: ['X'] })];
  const verify = () => verifyRequiredColumns(schema, chain);
  expect(verify).toThrowError(
    "Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X' (consumes)",
  );
});

// Removing a column that is not part of the input schema is reported.
test('verify invalid remove', async () => {
  const schema = ['A', 'B', 'C'];
  const chain = [new TestSchemaTransformer('Removes X', { removes: ['X'] })];
  const verify = () => verifyRequiredColumns(schema, chain);
  expect(verify).toThrowError(
    "Input schema [A, B, C] for transformer 'Removes X' does not contain the required column 'X' (removes)",
  );
});

// The result holds the initial schema plus one schema per transformer.
test('calculate schemas', async () => {
  const schema = ['A', 'B', 'C'];
  const chain = [
    new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }),
    new TestSchemaTransformer('Create E', { produces: ['E'] }),
    new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }),
  ];
  const expectedSchemas = [
    ['A', 'B', 'C'],
    ['A', 'D'],
    ['A', 'D', 'E'],
    ['A', 'D', 'E'],
  ];
  expect(calculateSchemas(schema, chain)).toEqual(expectedSchemas);
});

3
ui/package-lock.json generated
View File

@ -5137,8 +5137,7 @@
"uuid": { "uuid": {
"version": "8.3.2", "version": "8.3.2",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz", "resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz",
"integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==", "integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg=="
"dev": true
}, },
"v8-to-istanbul": { "v8-to-istanbul": {
"version": "7.0.0", "version": "7.0.0",

View File

@ -21,7 +21,8 @@
}, },
"dependencies": { "dependencies": {
"pdfjs-dist": "^2.5.207", "pdfjs-dist": "^2.5.207",
"svelte-file-dropzone": "0.0.15" "svelte-file-dropzone": "0.0.15",
"uuid": "^8.3.2"
}, },
"devDependencies": { "devDependencies": {
"@snowpack/plugin-dotenv": "^2.0.5", "@snowpack/plugin-dotenv": "^2.0.5",

View File

@ -1,14 +1,14 @@
<script> <script>
import Upload from './Upload.svelte'; import Upload from './Upload.svelte';
import { parseResult } from './store'; import { parseResult, debug } from './store';
import Result from './Result.svelte'; import DebugView from './debug/DebugView.svelte';
</script> </script>
<div class="text-2xl font-semibold font-serif text-center bg-gray-400">PDF to Markdown Converter</div> <div class="text-2xl font-semibold font-serif text-center bg-gray-400">PDF to Markdown Converter</div>
<main class="mt-5 h-full"> <main class="mt-5 h-full">
{#if $parseResult} {#if $debug}
<Result parseResult={$parseResult} /> <DebugView debug={$debug} />
{:else} {:else}
<Upload /> <Upload />
{/if} {/if}

View File

@ -1,16 +0,0 @@
<script>
import type ParseResult from 'pdf-to-markdown-core/lib/src/ParseResult';
import Table from './Table.svelte';
export let parseResult: ParseResult;
</script>
<div class="mx-4">
<div class="mb-4">
<div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
<div>Title: {parseResult.metadata.title()}</div>
<div>Author: {parseResult.metadata.author()}</div>
</div>
<Table columns={parseResult.columns} items={parseResult.items} />
</div>

View File

@ -1,172 +0,0 @@
<script>
import type Item from '@core/Item';
import { Collection, BookOpen, Support, ArrowLeft, ArrowRight } from 'svelte-hero-icons';
// Component props: the column names to render and the items to list.
export let columns: string[];
export let items: Item[];
// Distinct page numbers occurring in the items, and the highest of them.
const pages = [...new Set(items.map((item) => item.page))];
const maxPage = Math.max(...pages);
// Page the view is restricted to; not a number while all pages are shown.
let focusedPage: number;
$: focused = typeof focusedPage === 'number';
// Whether the page selection popup is open.
let openedPageIndex = false;
// Items bucketed by page number, in order of first occurrence.
const itemsGroupedByPage = items.reduce((map, item) => {
if (!map.has(item.page)) {
map.set(item.page, []);
}
map.get(item.page).push(item);
return map;
}, new Map<number, Item[]>());
// Closes the page selection popup and restricts the view to the given page.
function focusOnPage(pageNumber: number) {
openedPageIndex = false;
focusedPage = pageNumber;
}
// Closes the page selection popup and clears the page focus so all pages are shown.
function showAllPages() {
openedPageIndex = false;
focusedPage = undefined;
}
/**
 * Renders a cell value for display.
 *
 * - numbers are shown with two decimals
 * - arrays are shown as '[a, b, ...]'; when the first element is a number,
 *   numeric elements are shown with two decimals
 * - everything else (including null and plain objects) is returned unchanged
 */
function format(value: object) {
  if (typeof value === 'number') {
    return (value as number).toFixed(2);
  }
  // Array.isArray (not typeof) so that null and plain objects fall through
  // instead of failing on .length / .join.
  if (Array.isArray(value)) {
    let elements: unknown[] = value;
    if (elements.length > 0 && typeof elements[0] === 'number') {
      elements = elements.map((element) => (typeof element === 'number' ? element.toFixed(2) : element));
    }
    return '[' + elements.join(', ') + ']';
  }
  return value;
}
</script>
<!-- Sticky Controls -->
<div class="controls py-2">
<div class="flex items-center space-x-2">
<span>
<span on:click={() => (openedPageIndex = !openedPageIndex)}>
<BookOpen size="1x" class="hover:text-green-700 cursor-pointer" />
</span>
<!-- Page selection popup-->
{#if openedPageIndex}
<div class="absolute mt-2 p-2 flex bg-gray-200 shadow-lg rounded-sm overflow-auto max-h-96">
<span class="mt-1 pr-2" on:click={showAllPages}>
<Collection size="1x" class="hover:text-green-700 cursor-pointer" />
</span>
<div
class="grid gap-3"
style="grid-template-columns: repeat({Math.min(20, maxPage + 1)}, minmax(0, 1fr));">
{#each new Array(maxPage + 1) as _, idx}
<div
on:click={() => itemsGroupedByPage.has(idx) && focusOnPage(idx)}
class="px-2 border border-gray-300 rounded-full text-center {itemsGroupedByPage.has(idx) ? 'hover:text-green-700 hover:border-green-700 cursor-pointer' : 'opacity-50'}">
{idx}
</div>
{/each}
</div>
</div>
{/if}
</span>
<div>|</div>
<div>Transformation:</div>
<ArrowLeft size="1x" class="hover:text-green-700 cursor-pointer opacity-50" />
<div>Parse Result</div>
<ArrowRight size="1x" class="hover:text-green-700 cursor-pointer" />
</div>
</div>
<!-- Item table -->
<table class="w-full text-left">
<!-- Sticky header -->
<thead class=" ">
<th />
<th>#</th>
{#each columns as column}
<th>{column}</th>
{/each}
</thead>
<tbody>
{#each [...itemsGroupedByPage].filter(([page]) => !focused || page === focusedPage) as [pageNumber, items], pageIdx}
<!-- Separator between pages -->
{#if pageIdx > 0}
<tr class="h-5" />
{/if}
{#each items as item, itemIdx}
<tr>
<!-- Page number in first page item row -->
{#if itemIdx === 0}
<td class="page bg-gray-50">
<div>Page {pageNumber} {focused ? '' : ' / ' + maxPage}</div>
<div class="absolute flex">
{#if !focused}
<span on:click={() => focusOnPage(pageNumber)}>
<Support size="1x" class="hover:text-green-700 cursor-pointer opacity-75" />
</span>
{:else}
<span on:click={showAllPages}>
<Collection size="1x" class="hover:text-green-700 cursor-pointer opacity-75" />
</span>
{/if}
</div>
</td>
{:else}
<td />
{/if}
<td>{itemIdx}</td>
{#each columns as column}
<td>{format(item.data[column])}</td>
{/each}
</tr>
{/each}
{/each}
</tbody>
</table>
<style>
.controls {
@apply bg-gray-50;
position: -webkit-sticky;
position: sticky;
top: 0;
z-index: 3;
}
.page {
@apply text-lg;
@apply font-semibold;
@apply pr-4;
@apply whitespace-nowrap;
position: -webkit-sticky;
position: sticky;
top: 2em;
z-index: 2;
}
th {
@apply px-1;
position: -webkit-sticky;
position: sticky;
top: 2.4em;
z-index: 2;
}
th:not(:first-child) {
@apply bg-gray-300;
@apply shadow;
}
td:not(:first-child) {
@apply px-1;
@apply border-b;
}
tr:hover td:not(:first-child) {
@apply bg-gray-200;
}
</style>

View File

@ -0,0 +1,107 @@
<script>
import type Debugger from '@core/Debugger';
import type Item from '@core/Item';
import { Collection, BookOpen, ArrowLeft, ArrowRight } from 'svelte-hero-icons';
import ItemTable from './ItemTable.svelte';
export let debug: Debugger;
// Names of the stages offered by the debugger.
const stageNames = debug.stageNames;
// Whether the page selection popup is open.
let openedPageIndex = false;
// Page currently focused; left undefined while all pages are shown.
let focusedPage: number;
// Index of the stage being displayed.
let currentStage = 0;
// Navigation guards for the next / previous stage arrows.
$: canNext = currentStage + 1 < stageNames.length;
$: canPrev = currentStage > 0;
// Schema and items of the currently selected stage.
$: stageSchema = debug.stageSchema[currentStage];
$: stageItems = debug.stageResults(currentStage);
// isNaN(undefined) is true, so this stays false until a page number is assigned.
$: pageFocus = !isNaN(focusedPage);
// Page numbers occurring in the current stage.
$: pagesNumbers = new Set(stageItems.map((item) => item.page));
// Math.max() with no arguments yields -Infinity (stage without items).
$: maxPage = Math.max(...pagesNumbers);
// Items grouped by page as [page, items] pairs, in first-seen page order.
$: itemsByPage = [
...stageItems.reduce((map, item) => {
if (!map.has(item.page)) {
map.set(item.page, []);
}
map.get(item.page).push(item);
return map;
}, new Map<number, Item[]>()),
];
// With a focused page only that page's group is shown, otherwise every group.
$: visiblePages = pageFocus ? itemsByPage.filter(([page]) => page === focusedPage) : itemsByPage;
/**
 * Restricts the view to a single page and closes the page selection popup.
 *
 * @param pageNumber page to focus on
 */
function focusOnPage(pageNumber: number) {
  focusedPage = pageNumber;
  openedPageIndex = false;
}
/** Clears the page focus so every page is listed again, and closes the page selection popup. */
function showAllPages() {
  focusedPage = undefined;
  openedPageIndex = false;
}
</script>
<div class="mx-4">
<div class="mb-4">
<!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
<div>Title: {parseResult.metadata.title()}</div>
<div>Author: {parseResult.metadata.author()}</div> -->
</div>
<!-- Sticky Controls -->
<div class="controls py-2">
<div class="flex items-center space-x-2">
{#if pageFocus}
<span on:click={showAllPages}>
<Collection size="1x" class="hover:text-green-700 cursor-pointer opacity-75" />
</span>
{/if}
<span>
<span on:click={() => (openedPageIndex = !openedPageIndex)}>
<BookOpen size="1x" class="hover:text-green-700 cursor-pointer" />
</span>
<!-- Page selection popup-->
{#if openedPageIndex}
<div class="absolute mt-2 p-2 flex bg-gray-200 shadow-lg rounded-sm overflow-auto max-h-96">
<span class="mt-1 pr-2" on:click={showAllPages}>
<Collection size="1x" class="hover:text-green-700 cursor-pointer" />
</span>
<div
class="grid gap-3"
style="grid-template-columns: repeat({Math.min(20, maxPage + 1)}, minmax(0, 1fr));">
{#each new Array(maxPage + 1) as _, idx}
<div
on:click={() => pagesNumbers.has(idx) && focusOnPage(idx)}
class="px-2 border border-gray-300 rounded-full text-center {pagesNumbers.has(idx) ? 'hover:text-green-700 hover:border-green-700 cursor-pointer' : 'opacity-50'}">
{idx}
</div>
{/each}
</div>
</div>
{/if}
</span>
<div>|</div>
<div>Transformation:</div>
<span on:click={() => canPrev && currentStage--}>
<ArrowLeft size="1x" class={canPrev ? 'hover:text-green-700 cursor-pointer' : 'opacity-50'} />
</span>
<span on:click={() => canNext && currentStage++}>
<ArrowRight size="1x" class={canNext ? 'hover:text-green-700 cursor-pointer' : 'opacity-50'} />
</span>
<div>{stageNames[currentStage]}</div>
</div>
</div>
<ItemTable schema={stageSchema} itemsByPage={visiblePages} {maxPage} {pageFocus} />
</div>
<style>
.controls {
@apply bg-gray-50;
position: -webkit-sticky;
position: sticky;
top: 0;
z-index: 3;
}
</style>

View File

@ -0,0 +1,93 @@
<script>
import type Item from '@core/Item';
export let schema: string[];
export let itemsByPage: [number, Item[]][];
export let maxPage: number;
export let pageFocus: boolean;
/**
 * Renders a cell value for display in the item table.
 *
 * - A number is printed with two decimals.
 * - An array is printed as `[a, b, c]`; when its first element is a number,
 *   every numeric element is printed with two decimals.
 * - Anything else (strings, plain objects, null, undefined) is returned unchanged.
 *
 * @param value raw value taken from an item's data
 * @returns the formatted string, or the value itself when no formatting applies
 */
function format(value: unknown) {
  if (typeof value === 'number') {
    return value.toFixed(2);
  }
  // Array.isArray itself must be the test: `typeof Array.isArray(value)` is the
  // string 'boolean', which is always truthy and lets null and plain objects
  // fall into this branch, where `.length` / `.join` then throw.
  if (Array.isArray(value)) {
    const numeric = value.length > 0 && typeof value[0] === 'number';
    // Guard each element so a mixed array does not throw on `.toFixed`.
    const parts = numeric
      ? value.map((element) => (typeof element === 'number' ? element.toFixed(2) : element))
      : value;
    return '[' + parts.join(', ') + ']';
  }
  return value;
}
</script>
<!-- Item table -->
<table class="w-full text-left">
<!-- Sticky header -->
<thead class=" ">
<th />
<th>#</th>
{#each schema as column}
<th>{column}</th>
{/each}
</thead>
<tbody>
{#each itemsByPage as [pageNumber, items], pageIdx}
<!-- Separator between pages -->
{#if pageIdx > 0}
<tr class="h-5" />
{/if}
{#each items as item, itemIdx}
<tr>
<!-- Page number in first page item row -->
{#if itemIdx === 0}
<td class="page bg-gray-50">
<div>Page {pageNumber} {pageFocus ? '' : ' / ' + maxPage}</div>
</td>
{:else}
<td />
{/if}
<td>{itemIdx}</td>
{#each schema as column}
<td>{format(item.data[column])}</td>
{/each}
</tr>
{/each}
{/each}
</tbody>
</table>
<style>
.page {
@apply text-lg;
@apply font-semibold;
@apply pr-4;
@apply whitespace-nowrap;
position: -webkit-sticky;
position: sticky;
top: 2em;
z-index: 2;
}
th {
@apply px-1;
position: -webkit-sticky;
position: sticky;
top: 2.4em;
z-index: 2;
}
th:not(:first-child) {
@apply bg-gray-300;
@apply shadow;
}
td:not(:first-child) {
@apply px-1;
@apply border-b;
}
tr:hover td:not(:first-child) {
@apply bg-gray-200;
}
</style>

View File

@ -1,21 +1,23 @@
import { pdfParser, parseReporter } from '@core'; import { pdfParser, createPipeline, parseReporter } from '@core';
import type ProgressListenFunction from '@core/ProgressListenFunction'; import type ProgressListenFunction from '@core/ProgressListenFunction';
import type ParseResult from '@core/ParseResult'; import type ParseResult from '@core/ParseResult';
import type Debugger from '@core/Debugger';
import * as pdfjs from 'pdfjs-dist/es5/build/pdf'; import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
import { Writable, writable } from 'svelte/store'; import { Writable, writable } from 'svelte/store';
export let debug: Writable<Debugger> = writable(undefined);
export let parseResult: Writable<ParseResult> = writable(undefined); export let parseResult: Writable<ParseResult> = writable(undefined);
pdfjs.GlobalWorkerOptions.workerSrc = 'worker/pdf.worker.min.js'; pdfjs.GlobalWorkerOptions.workerSrc = 'worker/pdf.worker.min.js';
const parser = pdfParser(pdfjs); const pdfPipeline = createPipeline(pdfjs, {});
export async function loadExample(progressListener: ProgressListenFunction): Promise<ParseResult> { export async function loadExample(progressListener: ProgressListenFunction): Promise<any> {
return parsePdf(parser.parseUrl('/ExamplePdf.pdf', parseReporter(progressListener))); return parsePdf('/ExamplePdf.pdf', progressListener);
} }
export async function processUpload(file: File, progressListener: ProgressListenFunction): Promise<ParseResult> { export async function processUpload(file: File, progressListener: ProgressListenFunction): Promise<any> {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
const reader = new FileReader(); const reader = new FileReader();
reader.onerror = reject; reader.onerror = reject;
@ -25,13 +27,18 @@ export async function processUpload(file: File, progressListener: ProgressListen
reader.readAsArrayBuffer(file); reader.readAsArrayBuffer(file);
}).then((buffer) => { }).then((buffer) => {
const data = new Uint8Array(buffer as ArrayBuffer); const data = new Uint8Array(buffer as ArrayBuffer);
return parsePdf(parser.parseBytes(data, parseReporter(progressListener))); return parsePdf(data, progressListener);
}); });
} }
/**
 * Runs the PDF pipeline in debug mode and publishes the resulting debugger
 * through the `debug` store.
 *
 * @param src URL of the PDF, or its raw bytes
 * @param progressListener forwarded unchanged to `pdfPipeline.debug`
 * @returns promise that settles only after the pipeline has finished; it resolves
 *          with the debugger instance and rejects when the pipeline fails
 */
async function parsePdf(src: string | Uint8Array, progressListener: ProgressListenFunction): Promise<any> {
  //TODO without debug-flag
  // return pdfPipeline.execute(src, progressListener).then((result) => {
  //   parseResult.set(result);
  //   return result;
  // });

  // Await the pipeline: previously the promise was neither returned nor awaited,
  // so callers were resolved with undefined before parsing had finished and any
  // failure surfaced as an unhandled rejection.
  const debugInstance = await pdfPipeline.debug(src, progressListener);
  debug.set(debugInstance);
  return debugInstance;
}