mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-29 19:23:42 +01:00
Rudimentary explicit debug support
This commit is contained in:
parent
95a7e3e93b
commit
4401f1fb5c
8
core/src/Config.ts
Normal file
8
core/src/Config.ts
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
import ItemTransformer from './transformer/ItemTransformer';
|
||||||
|
|
||||||
|
/**
 * Options used to assemble a parser and its transformer pipeline.
 * Every field is optional.
 */
export default interface Config {
  /**
   * Parameters forwarded to PDF.js when a document is opened.
   * See DocumentInitParameters from https://mozilla.github.io/pdf.js/api/draft/module-pdfjsLib.html#DocumentInitParameters
   */
  pdfjsParams?: object;

  /** Transformers handed to the pipeline. */
  transformers?: ItemTransformer[];

  // TODO keep pdfPages ?
}
|
41
core/src/Debugger.ts
Normal file
41
core/src/Debugger.ts
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
import { assert } from './assert';
|
||||||
|
import Item from './Item';
|
||||||
|
import ItemTransformer from './transformer/ItemTransformer';
|
||||||
|
import ParseResult from './ParseResult';
|
||||||
|
import { calculateSchemas } from './transformer/transformerUtil';
|
||||||
|
import TransformContext from './transformer/TransformContext';
|
||||||
|
|
||||||
|
/**
 * Exposes the items of every pipeline stage for inspection.
 *
 * Stage 0 holds the items the debugger was created with; stage `n` (n >= 1)
 * holds the output of `transformers[n - 1]` applied to stage `n - 1`.
 * Stage results are computed on first request and cached afterwards.
 */
export default class Debugger {
  // parseResult: ParseResult;
  context: TransformContext;
  transformers: ItemTransformer[];
  /** Display name per stage: 'Parse Result' followed by the transformer names. */
  stageNames: string[];
  /** Column schema per stage, as computed by `calculateSchemas`. */
  stageSchema: string[][];
  /** Cache of already computed stage results, indexed by stage. */
  private stageItems: Item[][];

  /**
   * @param initialSchema columns of the initial items
   * @param initialItems items of stage 0
   * @param context context passed to every transformer
   * @param transformers transformers in execution order
   */
  constructor(
    initialSchema: string[],
    initialItems: Item[],
    context: TransformContext,
    transformers: ItemTransformer[],
  ) {
    // this.parseResult = parseResult;
    this.transformers = transformers;
    this.context = context;
    this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
    this.stageItems = [initialItems];
    this.stageSchema = calculateSchemas(initialSchema, transformers);
  }

  //TODO return MarkedItem ? (removed, added, etc..)?
  //TODO StageResult == class with schema and marked items ?
  /**
   * Returns the items of the given stage, running any transformer that has
   * not been executed yet.
   *
   * @param stageIndex index into `stageNames` (0 = initial items)
   * @returns the items produced by that stage
   * @throws Error when `stageIndex` is not an integer within the stage range
   */
  stageResults(stageIndex: number): Item[] {
    const lastStage = this.stageNames.length - 1;
    assert(
      Number.isInteger(stageIndex) && stageIndex >= 0 && stageIndex <= lastStage,
      `Invalid stage index ${stageIndex}, expected an integer between 0 and ${lastStage}`,
    );
    // Stages are filled strictly in order, so everything below
    // `stageItems.length` is already cached.
    for (let idx = this.stageItems.length; idx <= stageIndex; idx++) {
      this.stageItems[idx] = this.transformers[idx - 1].transform(this.context, this.stageItems[idx - 1]);
    }
    return this.stageItems[stageIndex];
  }
}
|
@ -1,13 +1,32 @@
|
|||||||
|
import { v4 as uuidv4 } from 'uuid';
|
||||||
|
|
||||||
/**
 * A row of data attached to a page.
 *
 * The `with…` methods never change the receiver; they return a new item.
 */
export default class Item {
  page: number;
  data: object;
  uuid?: string;

  /**
   * @param page stored as `page`
   * @param data stored as `data` (column name -> value)
   * @param uuid identifier of the item; a fresh v4 uuid when omitted
   */
  constructor(page: number, data: object, uuid: string = uuidv4()) {
    this.uuid = uuid;
    this.data = data;
    this.page = page;
  }

  /** Looks up the value stored in `data` under `column`. */
  value(column: string): object {
    const row = this.data;
    return row[column];
  }

  /**
   * Returns a copy whose data is the current data overlaid with `data`
   * (entries of `data` win on key clashes).
   */
  withDataAddition(data: object): Item {
    const merged = { ...this.data, ...data };
    return this.withData(merged);
  }

  /** Returns a copy with the same page and uuid but `data` as its data. */
  withData(data: object): Item {
    const { page, uuid } = this;
    return new Item(page, data, uuid);
  }

  /**
   * Returns the item without a uuid (the uuid becomes the empty string).
   */
  withoutUuid(): Item {
    const { page, data } = this;
    return new Item(page, data, '');
  }
}
|
||||||
|
@ -1,16 +1,19 @@
|
|||||||
import Item from './Item';
|
import type Item from './Item';
|
||||||
import type Metadata from './Metadata';
|
import type Metadata from './Metadata';
|
||||||
|
import type PageViewport from './parse/PageViewport';
|
||||||
|
|
||||||
/**
 * Bundles everything a parse run yields: the PDF.js page objects, one
 * viewport per page, the document metadata, the column schema and the items.
 */
export default class ParseResult {
  /**
   * @param pdfPages page objects of the parsed document
   * @param pageViewports viewports belonging to the pages
   * @param metadata metadata of the document
   * @param schema column names of the items
   * @param items the parsed items
   */
  constructor(
    public pdfPages: any[],
    public pageViewports: PageViewport[],
    public metadata: Metadata,
    public schema: string[],
    public items: Item[],
  ) {}
}
||||||
|
@ -1,13 +0,0 @@
|
|||||||
import type ParsedPageItem from './ParsedPageItem';
|
|
||||||
|
|
||||||
/**
 * Groups the items of one page together with its index and the page object.
 */
export default class ParsedPage {
  /**
   * @param index stored as `index`
   * @param pdfPage stored as `pdfPage`
   * @param items stored as `items`
   */
  constructor(
    public index: number,
    public pdfPage: any,
    public items: ParsedPageItem[],
  ) {}
}
|
|
@ -1,6 +1,5 @@
|
|||||||
import Item from './Item';
|
import Item from './Item';
|
||||||
import Metadata from './Metadata';
|
import Metadata from './Metadata';
|
||||||
import ParsedPage from './ParsedPage';
|
|
||||||
import type ParseReporter from './ParseReporter';
|
import type ParseReporter from './ParseReporter';
|
||||||
import ParseResult from './ParseResult';
|
import ParseResult from './ParseResult';
|
||||||
import TextDirection from './TextDirection';
|
import TextDirection from './TextDirection';
|
||||||
@ -11,31 +10,18 @@ import type TextItem from './TextItem';
|
|||||||
*/
|
*/
|
||||||
export default class PdfParser {
|
export default class PdfParser {
|
||||||
pdfjs: any;
|
pdfjs: any;
|
||||||
columns = ['str', 'dir', 'width', 'height', 'transform', 'fontName'];
|
defaultParams: object;
|
||||||
|
schema = ['str', 'fontName', 'dir', 'width', 'height', 'transform'];
|
||||||
|
|
||||||
constructor(pdfjs: any) {
|
constructor(pdfjs: any, defaultParams = {}) {
|
||||||
this.pdfjs = pdfjs;
|
this.pdfjs = pdfjs;
|
||||||
|
this.defaultParams = defaultParams;
|
||||||
}
|
}
|
||||||
|
|
||||||
async parseBytes(data: Uint8Array, reporter: ParseReporter): Promise<ParseResult> {
|
async parse(src: string | Uint8Array | object, reporter: ParseReporter): Promise<ParseResult> {
|
||||||
return this.parse(this.params({ data }), reporter);
|
const documentInitParameters = { ...this.defaultParams, ...this.documentInitParameters(src) };
|
||||||
}
|
|
||||||
|
|
||||||
async parseUrl(url: string, reporter: ParseReporter): Promise<ParseResult> {
|
|
||||||
return this.parse(this.params({ url }), reporter);
|
|
||||||
}
|
|
||||||
|
|
||||||
private params(dataSourceParams: object): object {
|
|
||||||
const defaultParams = {
|
|
||||||
cMapUrl: 'cmaps/',
|
|
||||||
cMapPacked: true,
|
|
||||||
};
|
|
||||||
return { ...defaultParams, ...dataSourceParams };
|
|
||||||
}
|
|
||||||
|
|
||||||
async parse(parameter: object, reporter: ParseReporter): Promise<ParseResult> {
|
|
||||||
return this.pdfjs
|
return this.pdfjs
|
||||||
.getDocument(parameter)
|
.getDocument(documentInitParameters)
|
||||||
.promise.then((pdfDocument) => {
|
.promise.then((pdfDocument) => {
|
||||||
reporter.parsedDocumentHeader(pdfDocument.numPages);
|
reporter.parsedDocumentHeader(pdfDocument.numPages);
|
||||||
return Promise.all([
|
return Promise.all([
|
||||||
@ -47,16 +33,38 @@ export default class PdfParser {
|
|||||||
]);
|
]);
|
||||||
})
|
})
|
||||||
.then(([metadata, pages]) => {
|
.then(([metadata, pages]) => {
|
||||||
const pdfPages = pages.map((page) => page.pdfPage);
|
const pdfPages = pages.map((page) => page.page);
|
||||||
const items = pages.reduce((allItems, page) => allItems.concat(page.items), []);
|
const items = pages.reduce((allItems, page) => allItems.concat(page.items), []);
|
||||||
return new ParseResult(pdfPages, new Metadata(metadata), this.columns, items);
|
const pageViewports = pdfPages.map((page) => {
|
||||||
|
const viewPort = page.getViewport({ scale: 1.0 });
|
||||||
|
return { transformFunction: (itemTransform: number[]) => this.pdfjs.Util.transform(viewPort, itemTransform) };
|
||||||
});
|
});
|
||||||
|
return new ParseResult(pdfPages, pageViewports, new Metadata(metadata), this.schema, items);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Converts the accepted source variants into a parameter object for
 * `getDocument`.
 *
 * - a string becomes `{ url: src }`
 * - anything `isArrayBuffer` accepts becomes `{ data: src }`
 * - any other non-null object is returned unchanged
 *
 * @throws Error for every other value, including `null`
 */
private documentInitParameters(src: string | Uint8Array | object): object {
  if (typeof src === 'string') {
    return { url: src };
  }
  if (this.isArrayBuffer(src)) {
    return { data: src };
  }
  // `typeof null` is 'object', so null has to be rejected explicitly;
  // otherwise it would be returned as if it were a parameter object.
  if (typeof src === 'object' && src !== null) {
    return src;
  }
  throw new Error('Invalid PDFjs parameter for getDocument. Need either Uint8Array, string or a parameter object');
}
|
||||||
|
|
||||||
|
/**
 * Duck-type check: true for any non-null object that exposes a
 * `byteLength` property.
 */
private isArrayBuffer(object) {
  if (object === null || typeof object !== 'object') {
    return false;
  }
  return object.byteLength !== undefined;
}
|
||||||
|
|
||||||
private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> {
|
private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise<ParsedPage> {
|
||||||
return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
|
||||||
return accumulatorPromise.then((accumulatedResults) => {
|
return accumulatorPromise.then((accumulatedResults) => {
|
||||||
return pdfDocument.getPage(index + 1).then((page) => {
|
return pdfDocument.getPage(index + 1).then((page) => {
|
||||||
|
const viewport = page.getViewport({ scale: 1.0 });
|
||||||
return this.triggerFontRetrieval(page).then(() =>
|
return this.triggerFontRetrieval(page).then(() =>
|
||||||
page
|
page
|
||||||
.getTextContent({
|
.getTextContent({
|
||||||
@ -66,7 +74,7 @@ export default class PdfParser {
|
|||||||
.then((textContent) => {
|
.then((textContent) => {
|
||||||
const items = textContent.items.map((rawItem) => new Item(index, rawItem));
|
const items = textContent.items.map((rawItem) => new Item(index, rawItem));
|
||||||
reporter.parsedPage(index);
|
reporter.parsedPage(index);
|
||||||
return [...accumulatedResults, new ParsedPage(index, page, items)];
|
return [...accumulatedResults, { index, page, items }];
|
||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
@ -92,6 +100,8 @@ export default class PdfParser {
|
|||||||
// console.log('Parsing page ' + index);
|
// console.log('Parsing page ' + index);
|
||||||
return pdfDocument.getPage(index + 1).then((page) => {
|
return pdfDocument.getPage(index + 1).then((page) => {
|
||||||
const viewport = page.getViewport({ scale: 1.0 });
|
const viewport = page.getViewport({ scale: 1.0 });
|
||||||
|
console.log(viewport);
|
||||||
|
|
||||||
return this.triggerFontRetrieval(page).then(() =>
|
return this.triggerFontRetrieval(page).then(() =>
|
||||||
page.getTextContent().then((textContent) => {
|
page.getTextContent().then((textContent) => {
|
||||||
// console.log(textContent);
|
// console.log(textContent);
|
||||||
@ -126,7 +136,13 @@ export default class PdfParser {
|
|||||||
// console.log('Parsed result:', r.length);
|
// console.log('Parsed result:', r.length);
|
||||||
// console.log('Parsed result:', r);
|
// console.log('Parsed result:', r);
|
||||||
|
|
||||||
return new ParseResult([], new Metadata(metadata), [], []);
|
return {};
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Intermediate per-page result collected while pages are extracted. */
interface ParsedPage {
  /** Zero-based position of the page. */
  index: number;
  /** Page object returned by `pdfDocument.getPage`. */
  page: any;
  /** Items created from the text content of the page. */
  items: Item[];
}
|
||||||
|
45
core/src/PdfPipeline.ts
Normal file
45
core/src/PdfPipeline.ts
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
import PdfParser from './PdfParser';
|
||||||
|
import ProgressListenFunction from './ProgressListenFunction';
|
||||||
|
import ParseProgressReporter from './ParseProgressReporter';
|
||||||
|
import ItemTransformer from './transformer/ItemTransformer';
|
||||||
|
import Item from './Item';
|
||||||
|
import ParseResult from './ParseResult';
|
||||||
|
import Debugger from './Debugger';
|
||||||
|
import { verifyRequiredColumns } from './transformer/transformerUtil';
|
||||||
|
import TransformContext from './transformer/TransformContext';
|
||||||
|
|
||||||
|
/**
 * Couples a parser with an ordered list of transformers.
 *
 * `execute` runs the full chain and returns the final result;
 * `debug` returns a `Debugger` for stepping through the stages.
 */
export default class PdfPipeline {
  parser: PdfParser;
  transformers: ItemTransformer[];

  constructor(parser: PdfParser, transformers: ItemTransformer[]) {
    this.transformers = transformers;
    this.parser = parser;
  }

  /**
   * Parses the source and checks the parsed schema against the columns the
   * transformers require (`verifyRequiredColumns` throws on a mismatch).
   */
  private async parse(
    src: string | Uint8Array | object,
    progressListener: ProgressListenFunction,
  ): Promise<ParseResult> {
    const parsed = await this.parser.parse(src, new ParseProgressReporter(progressListener));
    verifyRequiredColumns(parsed.schema, this.transformers);
    return parsed;
  }

  //TODO PipelineResult
  /**
   * Parses the source, feeds the items through every transformer in order and
   * stores the outcome back on the parse result before returning it.
   */
  async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
    const parsed = await this.parse(src, progressListener);
    const context = { pageViewports: parsed.pageViewports };
    parsed.items = this.transformers.reduce(
      (currentItems, transformer) => transformer.transform(context, currentItems),
      parsed.items,
    );
    return parsed;
  }

  /**
   * Parses the source and returns a debugger over the parsed items and this
   * pipeline's transformers; no transformer is run here.
   */
  async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<Debugger> {
    const { schema, items, pageViewports } = await this.parse(src, progressListener);
    return new Debugger(schema, items, { pageViewports }, this.transformers);
  }
}
|
7
core/src/TransformerDescription.ts
Normal file
7
core/src/TransformerDescription.ts
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
/**
 * Declares how a transformer relates to the column schema of its input.
 */
export default interface TransformerDescription {
  // NOTE(review): the two "Globels" fields are not read by any code visible here — confirm their meaning.
  readonly consumesGlobels?: string[];
  readonly producesGlobels?: string[];

  /** Columns that must be present in the input schema. */
  readonly consumes?: string[];

  /** Columns appended to the output schema. */
  readonly produces?: string[];

  /** Columns that must be present in the input schema and are dropped from the output schema. */
  readonly removes?: string[];
}
|
11
core/src/assert.ts
Normal file
11
core/src/assert.ts
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
/**
 * Throws when `condition` is false; lets TypeScript narrow on the condition
 * for the code that follows the call.
 *
 * @param condition expression expected to be true
 * @param message error message; 'Assertion failed' is used when omitted or empty
 * @throws Error when `condition` is false
 */
export function assert(condition: boolean, message?: string): asserts condition {
  if (!condition) {
    throw new Error(message || 'Assertion failed');
  }
}
|
||||||
|
|
||||||
|
/**
 * Returns `value` unchanged after verifying it is neither `null` nor
 * `undefined`. The return type reflects that check, so callers do not have to
 * narrow again.
 *
 * @param value value to check
 * @param message error message used when the check fails
 * @returns the given value
 * @throws Error when `value` is `null` or `undefined`
 */
export function assertDefined<T>(value: T, message: string): NonNullable<T> {
  assert(value !== null && typeof value !== 'undefined', message);
  return value as NonNullable<T>;
}
|
@ -1,11 +1,32 @@
|
|||||||
|
import Config from './Config';
|
||||||
import type ProgressListenFunction from './ProgressListenFunction';
|
import type ProgressListenFunction from './ProgressListenFunction';
|
||||||
import ParseProgressReporter from './ParseProgressReporter';
|
import ParseProgressReporter from './ParseProgressReporter';
|
||||||
import PdfParser from './PdfParser';
|
import PdfParser from './PdfParser';
|
||||||
|
import PdfPipeline from './PdfPipeline';
|
||||||
|
|
||||||
|
import AdjustHeight from './transformer/AdjustHeight';
|
||||||
|
import CalculateCoordinates from './transformer/CalculateCoordinates';
|
||||||
|
|
||||||
|
// Transformers used when a config does not bring its own, in execution order.
const transformers = [new AdjustHeight(), new CalculateCoordinates()];

// Configuration used by `pdfParser` and as the default of `createPipeline`.
const defaultConfig: Config = {
  transformers,
  pdfjsParams: {
    // TODO check if that cmap thing makes sense since we don't bundle them
    cMapUrl: 'cmaps/',
    cMapPacked: true,
  },
};
|
||||||
|
|
||||||
/**
 * Creates a parser on top of the given PDF.js instance, using the PDF.js
 * parameters of the default config.
 */
export function pdfParser(pdfJs: any) {
  const { pdfjsParams } = defaultConfig;
  return new PdfParser(pdfJs, pdfjsParams);
}
|
||||||
|
|
||||||
/** Wraps a progress listener function in a `ParseProgressReporter`. */
export function parseReporter(progressListener: ProgressListenFunction) {
  const reporter = new ParseProgressReporter(progressListener);
  return reporter;
}
|
||||||
|
|
||||||
|
/**
 * Builds a pipeline (parser plus transformers) for the given PDF.js instance.
 *
 * Each config field falls back to the default config independently, so a
 * partial config (e.g. only `transformers`) keeps the default PDF.js
 * parameters and vice versa.
 *
 * @param pdfJs the PDF.js library object
 * @param config optional configuration; defaults to the default config
 * @returns the assembled pipeline
 */
export function createPipeline(pdfJs: any, config = defaultConfig): PdfPipeline {
  const pdfjsParams = config.pdfjsParams ?? defaultConfig.pdfjsParams;
  const parser = new PdfParser(pdfJs, pdfjsParams);
  return new PdfPipeline(parser, config.transformers ?? transformers);
}
|
||||||
|
5
core/src/parse/PageViewport.ts
Normal file
5
core/src/parse/PageViewport.ts
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
/** Maps an item's transform array to another transform array. */
type ItemTransformFunction = (itemTransform: number[]) => number[];

/** Per-page viewport information made available to transformers. */
export default interface PageViewport {
  /** Function applied to an item's `transform` for this page. */
  transformFunction: ItemTransformFunction;
}
|
37
core/src/transformer/AdjustHeight.ts
Normal file
37
core/src/transformer/AdjustHeight.ts
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
import PageViewport from 'src/parse/PageViewport';
|
||||||
|
import Item from '../Item';
|
||||||
|
import ItemTransformer from './ItemTransformer';
|
||||||
|
import TransformContext from './TransformContext';
|
||||||
|
|
||||||
|
/**
 * Rescales the `height` of items relative to the font height derived from
 * the item's transform in page-viewport space.
 */
export default class AdjustHeight extends ItemTransformer {
  constructor() {
    super('Adjust Heights', {
      consumes: ['transform', 'height'],
    });
  }

  /**
   * Returns a new array in which every item whose height changes is replaced
   * by a copy carrying the adjusted height; all other items are passed
   * through as-is. The input array is not modified.
   *
   * @param context supplies one `PageViewport` per page, indexed by `item.page`
   * @param items items to adjust
   * @returns the adjusted items, in input order
   */
  transform(context: TransformContext, items: Item[]): Item[] {
    //TODO groupBy page
    return items.map((item) => {
      const pageViewport: PageViewport = context.pageViewports[item.page];
      const itemTransform = item.data['transform'];
      const itemHeight = item.data['height'];
      const tx = pageViewport.transformFunction(itemTransform);
      const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
      const dividedHeight = itemHeight / fontHeight;
      // Keep the original height when the ratio is unusable (NaN, or infinite
      // because fontHeight is 0) or would not enlarge the value.
      const newHeight = !Number.isFinite(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight;
      return newHeight !== itemHeight ? item.withDataAddition({ height: newHeight }) : item;
    });
  }
}
|
19
core/src/transformer/CalculateCoordinates.ts
Normal file
19
core/src/transformer/CalculateCoordinates.ts
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
import Item from '../Item';
|
||||||
|
import ItemTransformer from './ItemTransformer';
|
||||||
|
import TransformContext from './TransformContext';
|
||||||
|
|
||||||
|
/**
 * Transformer declared to turn the `transform` column into `X` / `Y` columns.
 */
export default class CalculateCoordinates extends ItemTransformer {
  constructor() {
    super('Calculate Coordinates', {
      consumes: ['transform'],
      produces: ['X', 'Y'],
      removes: ['transform'],
    });
  }

  /**
   * Currently returns all items except the first one, without touching the
   * input array, so results cached for earlier stages stay intact.
   *
   * NOTE(review): this looks like a placeholder — no `X` / `Y` values are
   * produced and `transform` is not removed, although the description
   * declares both. Confirm the intended calculation.
   */
  transform(context: TransformContext, items: Item[]): Item[] {
    // const transform: number[] = item.value['Transform'];
    return items.slice(1);
  }
}
|
25
core/src/transformer/ItemTransformer.ts
Normal file
25
core/src/transformer/ItemTransformer.ts
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
import TransformerDescription from '../TransformerDescription';
|
||||||
|
import type Item from '../Item';
|
||||||
|
import TransformContext from './TransformContext';
|
||||||
|
|
||||||
|
/**
 * Base class of all pipeline steps: a named transformation over items plus a
 * description of the columns it touches.
 */
export default abstract class ItemTransformer {
  readonly name: string;
  readonly description: TransformerDescription;

  /**
   * @param name display name of the transformer
   * @param description column declarations; fields left out default to empty arrays
   */
  constructor(name: string, description: TransformerDescription) {
    const defaults: TransformerDescription = {
      consumesGlobels: [],
      producesGlobels: [],
      consumes: [],
      produces: [],
      removes: [],
    };
    this.name = name;
    this.description = { ...defaults, ...description };
  }

  // columnar-changes: described
  /** Produces the output items of this step from the given input items. */
  abstract transform(context: TransformContext, items: Item[]): Item[];
}
|
5
core/src/transformer/TransformContext.ts
Normal file
5
core/src/transformer/TransformContext.ts
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
import PageViewport from 'src/parse/PageViewport';
|
||||||
|
|
||||||
|
/** Data shared with every transformer in addition to the items. */
export default interface TransformContext {
  /** One viewport per page; transformers look it up by an item's `page`. */
  pageViewports: PageViewport[];
}
|
55
core/src/transformer/transformerUtil.ts
Normal file
55
core/src/transformer/transformerUtil.ts
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
import TransformerDescription from 'src/TransformerDescription';
|
||||||
|
import { assert } from '../assert';
|
||||||
|
import ItemTransformer from './ItemTransformer';
|
||||||
|
|
||||||
|
/**
 * Walks through all transformers and makes sure that every column a
 * transformer requires ({@link TransformerDescription#consumes}) is available
 * in the schema produced by its predecessor.
 *
 * The check is a side effect of computing the schemas; the computed schemas
 * themselves are discarded.
 *
 * @param initialSchema columns available before the first transformer
 * @param transformers transformers in execution order
 */
export function verifyRequiredColumns(initialSchema: string[], transformers: ItemTransformer[]): void {
  void calculateSchemas(initialSchema, transformers);
}
|
||||||
|
|
||||||
|
//TODO debug schema
|
||||||
|
// initial - all unannotated
|
||||||
|
// second - 2 removed, 1 added
|
||||||
|
// third - all as before without the removed
|
||||||
|
|
||||||
|
/**
 * Computes the column schema after every pipeline stage.
 *
 * Entry 0 of the result is `initialSchema`; entry `n` is the schema after
 * `transformers[n - 1]`: its input schema minus the removed columns, followed
 * by the produced columns. Every transformer's references are validated
 * against its input schema on the way.
 *
 * @returns one schema per stage (`transformers.length + 1` entries)
 */
export function calculateSchemas(initialSchema: string[], transformers: ItemTransformer[]): string[][] {
  const schemas: string[][] = [initialSchema];
  for (const transformer of transformers) {
    const incoming = schemas[schemas.length - 1];
    validateReferences(incoming, transformer.name, transformer.description);
    const kept = incoming.filter((column) => !transformer.description.removes?.includes(column));
    const produced = transformer.description.produces ?? [];
    schemas.push([...kept, ...produced]);
  }
  return schemas;
}
|
||||||
|
|
||||||
|
/**
 * Asserts that every column a transformer consumes or removes exists in its
 * input schema. Consumed columns are checked first, then removed ones; the
 * first missing column raises an error naming schema, transformer and column.
 */
function validateReferences(
  inputSchema: string[],
  transformerName: string,
  transformerDescription: TransformerDescription,
) {
  const requireColumns = (columns: string[] | undefined, role: string) => {
    columns?.forEach((column) => {
      const schemaText = inputSchema.join(', ');
      assert(
        inputSchema.includes(column),
        `Input schema [${schemaText}] for transformer '${transformerName}' does not contain the required column '${column}' (${role})`,
      );
    });
  };

  requireColumns(transformerDescription.consumes, 'consumes');
  requireColumns(transformerDescription.removes, 'removes');
}
|
38
core/test/Debugger.test.ts
Normal file
38
core/test/Debugger.test.ts
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
import Debugger from 'src/Debugger';
|
||||||
|
import Item from 'src/Item';
|
||||||
|
import ItemTransformer from 'src/transformer/ItemTransformer';
|
||||||
|
import Metadata from 'src/Metadata';
|
||||||
|
import ParseResult from 'src/ParseResult';
|
||||||
|
import TransformerDescription from 'src/TransformerDescription';
|
||||||
|
import TransformContext from 'src/transformer/TransformContext';
|
||||||
|
|
||||||
|
/** Transformer stub that ignores its input and always returns preset items. */
class TestTransformer extends ItemTransformer {
  constructor(name: string, description: TransformerDescription, public items: Item[]) {
    super(name, description);
  }

  transform(_: TransformContext, items: Item[]): Item[] {
    return this.items;
  }
}
|
||||||
|
|
||||||
|
// A single transformer that replaces columns A and B with C: checks stage
// names, stage schemas and the items reported for both stages.
test('basic debug', async () => {
  const schema = ['A', 'B'];
  const rows = [new Item(0, { A: 'a_row1', B: 'b_row1' }), new Item(0, { A: 'a_row2', B: 'b_row2' })];

  const description = { consumes: ['A', 'B'], produces: ['C'], removes: ['A', 'B'] };
  const transformedRows = rows.map((row) => row.withData({ C: `c=${row.value('A')}+${row.value('B')}` }));

  const debug = new Debugger(schema, rows, { pageViewports: [] }, [
    new TestTransformer('Trans1', description, transformedRows),
  ]);

  expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
  expect(debug.stageSchema).toEqual([schema, ['C']]);
  debug.stageNames.forEach((_, index) => {
    console.log(index, debug.stageResults(index));
  });

  expect(debug.stageResults(0)).toEqual(rows);
  expect(debug.stageResults(1)).toEqual(transformedRows);
});
|
@ -12,7 +12,7 @@ test('basic example PDF parse', async () => {
|
|||||||
const data = fs.readFileSync('../examples/ExamplePdf.pdf', null);
|
const data = fs.readFileSync('../examples/ExamplePdf.pdf', null);
|
||||||
|
|
||||||
// to test
|
// to test
|
||||||
const result = await parser.parseBytes(
|
const result = await parser.parse(
|
||||||
data,
|
data,
|
||||||
new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)),
|
new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)),
|
||||||
);
|
);
|
||||||
@ -29,7 +29,7 @@ test('basic example PDF parse', async () => {
|
|||||||
expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
|
expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
|
||||||
|
|
||||||
// verify first n items
|
// verify first n items
|
||||||
expect(result.items.slice(0, 16)).toEqual([
|
expect(result.items.slice(0, 16).map((item) => item.withoutUuid())).toEqual([
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
str: 'Mega Überschrift',
|
str: 'Mega Überschrift',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -37,7 +37,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 30,
|
height: 30,
|
||||||
transform: [30, 0, 0, 30, 175, 756],
|
transform: [30, 0, 0, 30, 175, 756],
|
||||||
fontName: 'g_d0_f1',
|
fontName: 'g_d0_f1',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
str: '2te Überschrift',
|
str: '2te Überschrift',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -45,7 +45,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 20,
|
height: 20,
|
||||||
transform: [20, 0, 0, 20, 233, 665],
|
transform: [20, 0, 0, 20, 233, 665],
|
||||||
fontName: 'g_d0_f2',
|
fontName: 'g_d0_f2',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
str: 'Dies ist eine Test-PDF',
|
str: 'Dies ist eine Test-PDF',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -53,7 +53,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 11,
|
height: 11,
|
||||||
transform: [11, 0, 0, 11, 240, 585],
|
transform: [11, 0, 0, 11, 240, 585],
|
||||||
fontName: 'g_d0_f2',
|
fontName: 'g_d0_f2',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
str: '.',
|
str: '.',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -61,7 +61,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 11,
|
height: 11,
|
||||||
transform: [11, 0, 0, 11, 352.6927, 585],
|
transform: [11, 0, 0, 11, 352.6927, 585],
|
||||||
fontName: 'g_d0_f2',
|
fontName: 'g_d0_f2',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
str: '1',
|
str: '1',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -69,7 +69,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 7.333334,
|
height: 7.333334,
|
||||||
transform: [7.333334, 0, 0, 7.333334, 348, 588],
|
transform: [7.333334, 0, 0, 7.333334, 348, 588],
|
||||||
fontName: 'g_d0_f2',
|
fontName: 'g_d0_f2',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
str: 'Für’s Testen des ',
|
str: 'Für’s Testen des ',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -77,7 +77,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 11,
|
height: 11,
|
||||||
transform: [11, 0, 0, 11, 208, 572],
|
transform: [11, 0, 0, 11, 208, 572],
|
||||||
fontName: 'g_d0_f2',
|
fontName: 'g_d0_f2',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
str: 'Markdown Parsers',
|
str: 'Markdown Parsers',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -85,7 +85,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 11,
|
height: 11,
|
||||||
transform: [11, 0, 0, 11, 291.77832, 572],
|
transform: [11, 0, 0, 11, 291.77832, 572],
|
||||||
fontName: 'g_d0_f2',
|
fontName: 'g_d0_f2',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
str: '.',
|
str: '.',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -93,7 +93,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 11,
|
height: 11,
|
||||||
transform: [11, 0, 0, 11, 383.47360000000003, 572],
|
transform: [11, 0, 0, 11, 383.47360000000003, 572],
|
||||||
fontName: 'g_d0_f2',
|
fontName: 'g_d0_f2',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
str: ' ',
|
str: ' ',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -101,7 +101,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 11,
|
height: 11,
|
||||||
transform: [11, 0, 0, 11, 61.078451, 59],
|
transform: [11, 0, 0, 11, 61.078451, 59],
|
||||||
fontName: 'g_d0_f2',
|
fontName: 'g_d0_f2',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
str: 'In Deutsch.',
|
str: 'In Deutsch.',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -109,7 +109,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 11,
|
height: 11,
|
||||||
transform: [11, 0, 0, 11, 64.134603, 59],
|
transform: [11, 0, 0, 11, 64.134603, 59],
|
||||||
fontName: 'g_d0_f2',
|
fontName: 'g_d0_f2',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
str: '1',
|
str: '1',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -117,7 +117,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 7.333334,
|
height: 7.333334,
|
||||||
transform: [7.333334, 0, 0, 7.333334, 57, 62],
|
transform: [7.333334, 0, 0, 7.333334, 57, 62],
|
||||||
fontName: 'g_d0_f2',
|
fontName: 'g_d0_f2',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
str: '\x00',
|
str: '\x00',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -125,7 +125,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 12,
|
height: 12,
|
||||||
transform: [12, 0, 0, 12, 294, 45],
|
transform: [12, 0, 0, 12, 294, 45],
|
||||||
fontName: 'g_d0_f3',
|
fontName: 'g_d0_f3',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
str: '1',
|
str: '1',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -133,7 +133,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 12,
|
height: 12,
|
||||||
transform: [12, 0, 0, 12, 294, 45],
|
transform: [12, 0, 0, 12, 294, 45],
|
||||||
fontName: 'g_d0_f2',
|
fontName: 'g_d0_f2',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(1, {
|
new Item(1, {
|
||||||
str: '\x00',
|
str: '\x00',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -141,7 +141,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 12,
|
height: 12,
|
||||||
transform: [12, 0, 0, 12, 294, 45],
|
transform: [12, 0, 0, 12, 294, 45],
|
||||||
fontName: 'g_d0_f3',
|
fontName: 'g_d0_f3',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(1, {
|
new Item(1, {
|
||||||
str: '2',
|
str: '2',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -149,7 +149,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 12,
|
height: 12,
|
||||||
transform: [12, 0, 0, 12, 294, 45],
|
transform: [12, 0, 0, 12, 294, 45],
|
||||||
fontName: 'g_d0_f2',
|
fontName: 'g_d0_f2',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
new Item(2, {
|
new Item(2, {
|
||||||
str: 'Paragraphen',
|
str: 'Paragraphen',
|
||||||
dir: 'ltr',
|
dir: 'ltr',
|
||||||
@ -157,7 +157,7 @@ test('basic example PDF parse', async () => {
|
|||||||
height: 18,
|
height: 18,
|
||||||
transform: [18, 0, 0, 18, 57, 767],
|
transform: [18, 0, 0, 18, 57, 767],
|
||||||
fontName: 'g_d0_f1',
|
fontName: 'g_d0_f1',
|
||||||
}),
|
}).withoutUuid(),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
// verify progress
|
// verify progress
|
||||||
|
59
core/test/transformer/transformerUtil.test.ts
Normal file
59
core/test/transformer/transformerUtil.test.ts
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
import TransformerDescription from 'src/TransformerDescription';
|
||||||
|
import Item from 'src/Item';
|
||||||
|
import ItemTransformer from 'src/transformer/ItemTransformer';
|
||||||
|
import TransformContext from 'src/transformer/TransformContext';
|
||||||
|
import { calculateSchemas, verifyRequiredColumns } from 'src/transformer/transformerUtil';
|
||||||
|
|
||||||
|
class TestSchemaTransformer extends ItemTransformer {
|
||||||
|
constructor(name: string, description: TransformerDescription) {
|
||||||
|
super(name, description);
|
||||||
|
}
|
||||||
|
transform(_: TransformContext, items: Item[]): Item[] {
|
||||||
|
return items;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
test('verify valid transform', async () => {
|
||||||
|
const inputSchema = ['A', 'B', 'C'];
|
||||||
|
|
||||||
|
const transformers = [
|
||||||
|
new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }),
|
||||||
|
new TestSchemaTransformer('Create E', { produces: ['E'] }),
|
||||||
|
new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }),
|
||||||
|
];
|
||||||
|
verifyRequiredColumns(inputSchema, transformers);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('verify invalid consume', async () => {
|
||||||
|
const inputSchema = ['A', 'B', 'C'];
|
||||||
|
|
||||||
|
const transformers = [new TestSchemaTransformer('Consumes X', { consumes: ['X'] })];
|
||||||
|
expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError(
|
||||||
|
"Input schema [A, B, C] for transformer 'Consumes X' does not contain the required column 'X' (consumes)",
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('verify invalid remove', async () => {
|
||||||
|
const inputSchema = ['A', 'B', 'C'];
|
||||||
|
|
||||||
|
const transformers = [new TestSchemaTransformer('Removes X', { removes: ['X'] })];
|
||||||
|
expect(() => verifyRequiredColumns(inputSchema, transformers)).toThrowError(
|
||||||
|
"Input schema [A, B, C] for transformer 'Removes X' does not contain the required column 'X' (removes)",
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('calculate schemas', async () => {
|
||||||
|
const inputSchema = ['A', 'B', 'C'];
|
||||||
|
|
||||||
|
const transformers = [
|
||||||
|
new TestSchemaTransformer('Replace B & C with D', { consumes: ['B', 'C'], produces: ['D'], removes: ['B', 'C'] }),
|
||||||
|
new TestSchemaTransformer('Create E', { produces: ['E'] }),
|
||||||
|
new TestSchemaTransformer('Uses A, D & E', { consumes: ['A', 'D', 'E'] }),
|
||||||
|
];
|
||||||
|
expect(calculateSchemas(inputSchema, transformers)).toEqual([
|
||||||
|
['A', 'B', 'C'],
|
||||||
|
['A', 'D'],
|
||||||
|
['A', 'D', 'E'],
|
||||||
|
['A', 'D', 'E'],
|
||||||
|
]);
|
||||||
|
});
|
3
ui/package-lock.json
generated
3
ui/package-lock.json
generated
@ -5137,8 +5137,7 @@
|
|||||||
"uuid": {
|
"uuid": {
|
||||||
"version": "8.3.2",
|
"version": "8.3.2",
|
||||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz",
|
"resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz",
|
||||||
"integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==",
|
"integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg=="
|
||||||
"dev": true
|
|
||||||
},
|
},
|
||||||
"v8-to-istanbul": {
|
"v8-to-istanbul": {
|
||||||
"version": "7.0.0",
|
"version": "7.0.0",
|
||||||
|
@ -21,7 +21,8 @@
|
|||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"pdfjs-dist": "^2.5.207",
|
"pdfjs-dist": "^2.5.207",
|
||||||
"svelte-file-dropzone": "0.0.15"
|
"svelte-file-dropzone": "0.0.15",
|
||||||
|
"uuid": "^8.3.2"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@snowpack/plugin-dotenv": "^2.0.5",
|
"@snowpack/plugin-dotenv": "^2.0.5",
|
||||||
|
@ -1,14 +1,14 @@
|
|||||||
<script>
|
<script>
|
||||||
import Upload from './Upload.svelte';
|
import Upload from './Upload.svelte';
|
||||||
|
|
||||||
import { parseResult } from './store';
|
import { parseResult, debug } from './store';
|
||||||
import Result from './Result.svelte';
|
import DebugView from './debug/DebugView.svelte';
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<div class="text-2xl font-semibold font-serif text-center bg-gray-400">PDF to Markdown Converter</div>
|
<div class="text-2xl font-semibold font-serif text-center bg-gray-400">PDF to Markdown Converter</div>
|
||||||
<main class="mt-5 h-full">
|
<main class="mt-5 h-full">
|
||||||
{#if $parseResult}
|
{#if $debug}
|
||||||
<Result parseResult={$parseResult} />
|
<DebugView debug={$debug} />
|
||||||
{:else}
|
{:else}
|
||||||
<Upload />
|
<Upload />
|
||||||
{/if}
|
{/if}
|
||||||
|
@ -1,16 +0,0 @@
|
|||||||
<script>
|
|
||||||
import type ParseResult from 'pdf-to-markdown-core/lib/src/ParseResult';
|
|
||||||
import Table from './Table.svelte';
|
|
||||||
|
|
||||||
export let parseResult: ParseResult;
|
|
||||||
</script>
|
|
||||||
|
|
||||||
<div class="mx-4">
|
|
||||||
<div class="mb-4">
|
|
||||||
<div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
|
|
||||||
<div>Title: {parseResult.metadata.title()}</div>
|
|
||||||
<div>Author: {parseResult.metadata.author()}</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<Table columns={parseResult.columns} items={parseResult.items} />
|
|
||||||
</div>
|
|
@ -1,172 +0,0 @@
|
|||||||
<script>
|
|
||||||
import type Item from '@core/Item';
|
|
||||||
import { Collection, BookOpen, Support, ArrowLeft, ArrowRight } from 'svelte-hero-icons';
|
|
||||||
|
|
||||||
export let columns: string[];
|
|
||||||
export let items: Item[];
|
|
||||||
const pages = [...new Set(items.map((item) => item.page))];
|
|
||||||
const maxPage = Math.max(...pages);
|
|
||||||
let focusedPage: number;
|
|
||||||
$: focused = typeof focusedPage === 'number';
|
|
||||||
let openedPageIndex = false;
|
|
||||||
|
|
||||||
const itemsGroupedByPage = items.reduce((map, item) => {
|
|
||||||
if (!map.has(item.page)) {
|
|
||||||
map.set(item.page, []);
|
|
||||||
}
|
|
||||||
map.get(item.page).push(item);
|
|
||||||
return map;
|
|
||||||
}, new Map<number, Item[]>());
|
|
||||||
|
|
||||||
function focusOnPage(pageNumber: number) {
|
|
||||||
openedPageIndex = false;
|
|
||||||
focusedPage = pageNumber;
|
|
||||||
}
|
|
||||||
|
|
||||||
function showAllPages() {
|
|
||||||
openedPageIndex = false;
|
|
||||||
focusedPage = undefined;
|
|
||||||
}
|
|
||||||
|
|
||||||
function format(value: object) {
|
|
||||||
const type = typeof value;
|
|
||||||
if (typeof value === 'number') {
|
|
||||||
return (value as number).toFixed(2);
|
|
||||||
}
|
|
||||||
if (typeof value === 'object' && typeof Array.isArray(value)) {
|
|
||||||
let array = value as Array<object>;
|
|
||||||
if (array.length > 0 && typeof array[0] === 'number') {
|
|
||||||
array = (array.map((element) =>
|
|
||||||
((element as unknown) as number).toFixed(2)
|
|
||||||
) as unknown) as Array<object>;
|
|
||||||
}
|
|
||||||
return '[' + array.join(', ') + ']';
|
|
||||||
}
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
</script>
|
|
||||||
|
|
||||||
<!-- Sticky Controls -->
|
|
||||||
<div class="controls py-2">
|
|
||||||
<div class="flex items-center space-x-2">
|
|
||||||
<span>
|
|
||||||
<span on:click={() => (openedPageIndex = !openedPageIndex)}>
|
|
||||||
<BookOpen size="1x" class="hover:text-green-700 cursor-pointer" />
|
|
||||||
</span>
|
|
||||||
|
|
||||||
<!-- Page selection popup-->
|
|
||||||
{#if openedPageIndex}
|
|
||||||
<div class="absolute mt-2 p-2 flex bg-gray-200 shadow-lg rounded-sm overflow-auto max-h-96">
|
|
||||||
<span class="mt-1 pr-2" on:click={showAllPages}>
|
|
||||||
<Collection size="1x" class="hover:text-green-700 cursor-pointer" />
|
|
||||||
</span>
|
|
||||||
<div
|
|
||||||
class="grid gap-3"
|
|
||||||
style="grid-template-columns: repeat({Math.min(20, maxPage + 1)}, minmax(0, 1fr));">
|
|
||||||
{#each new Array(maxPage + 1) as _, idx}
|
|
||||||
<div
|
|
||||||
on:click={() => itemsGroupedByPage.has(idx) && focusOnPage(idx)}
|
|
||||||
class="px-2 border border-gray-300 rounded-full text-center {itemsGroupedByPage.has(idx) ? 'hover:text-green-700 hover:border-green-700 cursor-pointer' : 'opacity-50'}">
|
|
||||||
{idx}
|
|
||||||
</div>
|
|
||||||
{/each}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
{/if}
|
|
||||||
</span>
|
|
||||||
|
|
||||||
<div>|</div>
|
|
||||||
<div>Transformation:</div>
|
|
||||||
<ArrowLeft size="1x" class="hover:text-green-700 cursor-pointer opacity-50" />
|
|
||||||
<div>Parse Result</div>
|
|
||||||
<ArrowRight size="1x" class="hover:text-green-700 cursor-pointer" />
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Item table -->
|
|
||||||
<table class="w-full text-left">
|
|
||||||
<!-- Sticky header -->
|
|
||||||
<thead class=" ">
|
|
||||||
<th />
|
|
||||||
<th>#</th>
|
|
||||||
{#each columns as column}
|
|
||||||
<th>{column}</th>
|
|
||||||
{/each}
|
|
||||||
</thead>
|
|
||||||
<tbody>
|
|
||||||
{#each [...itemsGroupedByPage].filter(([page]) => !focused || page === focusedPage) as [pageNumber, items], pageIdx}
|
|
||||||
<!-- Separator between pages -->
|
|
||||||
{#if pageIdx > 0}
|
|
||||||
<tr class="h-5" />
|
|
||||||
{/if}
|
|
||||||
{#each items as item, itemIdx}
|
|
||||||
<tr>
|
|
||||||
<!-- Page number in first page item row -->
|
|
||||||
{#if itemIdx === 0}
|
|
||||||
<td class="page bg-gray-50">
|
|
||||||
<div>Page {pageNumber} {focused ? '' : ' / ' + maxPage}</div>
|
|
||||||
<div class="absolute flex">
|
|
||||||
{#if !focused}
|
|
||||||
<span on:click={() => focusOnPage(pageNumber)}>
|
|
||||||
<Support size="1x" class="hover:text-green-700 cursor-pointer opacity-75" />
|
|
||||||
</span>
|
|
||||||
{:else}
|
|
||||||
<span on:click={showAllPages}>
|
|
||||||
<Collection size="1x" class="hover:text-green-700 cursor-pointer opacity-75" />
|
|
||||||
</span>
|
|
||||||
{/if}
|
|
||||||
</div>
|
|
||||||
</td>
|
|
||||||
{:else}
|
|
||||||
<td />
|
|
||||||
{/if}
|
|
||||||
<td>{itemIdx}</td>
|
|
||||||
{#each columns as column}
|
|
||||||
<td>{format(item.data[column])}</td>
|
|
||||||
{/each}
|
|
||||||
</tr>
|
|
||||||
{/each}
|
|
||||||
{/each}
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<style>
|
|
||||||
.controls {
|
|
||||||
@apply bg-gray-50;
|
|
||||||
position: -webkit-sticky;
|
|
||||||
position: sticky;
|
|
||||||
top: 0;
|
|
||||||
z-index: 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
.page {
|
|
||||||
@apply text-lg;
|
|
||||||
@apply font-semibold;
|
|
||||||
@apply pr-4;
|
|
||||||
@apply whitespace-nowrap;
|
|
||||||
position: -webkit-sticky;
|
|
||||||
position: sticky;
|
|
||||||
top: 2em;
|
|
||||||
z-index: 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
th {
|
|
||||||
@apply px-1;
|
|
||||||
position: -webkit-sticky;
|
|
||||||
position: sticky;
|
|
||||||
top: 2.4em;
|
|
||||||
z-index: 2;
|
|
||||||
}
|
|
||||||
th:not(:first-child) {
|
|
||||||
@apply bg-gray-300;
|
|
||||||
@apply shadow;
|
|
||||||
}
|
|
||||||
td:not(:first-child) {
|
|
||||||
@apply px-1;
|
|
||||||
@apply border-b;
|
|
||||||
}
|
|
||||||
|
|
||||||
tr:hover td:not(:first-child) {
|
|
||||||
@apply bg-gray-200;
|
|
||||||
}
|
|
||||||
</style>
|
|
107
ui/src/debug/DebugView.svelte
Normal file
107
ui/src/debug/DebugView.svelte
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
<script>
|
||||||
|
import type Debugger from '@core/Debugger';
|
||||||
|
import type Item from '@core/Item';
|
||||||
|
import { Collection, BookOpen, ArrowLeft, ArrowRight } from 'svelte-hero-icons';
|
||||||
|
import ItemTable from './ItemTable.svelte';
|
||||||
|
|
||||||
|
export let debug: Debugger;
|
||||||
|
|
||||||
|
const stageNames = debug.stageNames;
|
||||||
|
let openedPageIndex = false;
|
||||||
|
let focusedPage: number;
|
||||||
|
|
||||||
|
let currentStage = 0;
|
||||||
|
$: canNext = currentStage + 1 < stageNames.length;
|
||||||
|
$: canPrev = currentStage > 0;
|
||||||
|
$: stageSchema = debug.stageSchema[currentStage];
|
||||||
|
$: stageItems = debug.stageResults(currentStage);
|
||||||
|
$: pageFocus = !isNaN(focusedPage);
|
||||||
|
$: pagesNumbers = new Set(stageItems.map((item) => item.page));
|
||||||
|
$: maxPage = Math.max(...pagesNumbers);
|
||||||
|
$: itemsByPage = [
|
||||||
|
...stageItems.reduce((map, item) => {
|
||||||
|
if (!map.has(item.page)) {
|
||||||
|
map.set(item.page, []);
|
||||||
|
}
|
||||||
|
map.get(item.page).push(item);
|
||||||
|
return map;
|
||||||
|
}, new Map<number, Item[]>()),
|
||||||
|
];
|
||||||
|
$: visiblePages = pageFocus ? itemsByPage.filter(([page]) => page === focusedPage) : itemsByPage;
|
||||||
|
|
||||||
|
function focusOnPage(pageNumber: number) {
|
||||||
|
openedPageIndex = false;
|
||||||
|
focusedPage = pageNumber;
|
||||||
|
}
|
||||||
|
|
||||||
|
function showAllPages() {
|
||||||
|
openedPageIndex = false;
|
||||||
|
focusedPage = undefined;
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<div class="mx-4">
|
||||||
|
<div class="mb-4">
|
||||||
|
<!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
|
||||||
|
<div>Title: {parseResult.metadata.title()}</div>
|
||||||
|
<div>Author: {parseResult.metadata.author()}</div> -->
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Sticky Controls -->
|
||||||
|
<div class="controls py-2">
|
||||||
|
<div class="flex items-center space-x-2">
|
||||||
|
{#if pageFocus}
|
||||||
|
<span on:click={showAllPages}>
|
||||||
|
<Collection size="1x" class="hover:text-green-700 cursor-pointer opacity-75" />
|
||||||
|
</span>
|
||||||
|
{/if}
|
||||||
|
<span>
|
||||||
|
<span on:click={() => (openedPageIndex = !openedPageIndex)}>
|
||||||
|
<BookOpen size="1x" class="hover:text-green-700 cursor-pointer" />
|
||||||
|
</span>
|
||||||
|
|
||||||
|
<!-- Page selection popup-->
|
||||||
|
{#if openedPageIndex}
|
||||||
|
<div class="absolute mt-2 p-2 flex bg-gray-200 shadow-lg rounded-sm overflow-auto max-h-96">
|
||||||
|
<span class="mt-1 pr-2" on:click={showAllPages}>
|
||||||
|
<Collection size="1x" class="hover:text-green-700 cursor-pointer" />
|
||||||
|
</span>
|
||||||
|
<div
|
||||||
|
class="grid gap-3"
|
||||||
|
style="grid-template-columns: repeat({Math.min(20, maxPage + 1)}, minmax(0, 1fr));">
|
||||||
|
{#each new Array(maxPage + 1) as _, idx}
|
||||||
|
<div
|
||||||
|
on:click={() => pagesNumbers.has(idx) && focusOnPage(idx)}
|
||||||
|
class="px-2 border border-gray-300 rounded-full text-center {pagesNumbers.has(idx) ? 'hover:text-green-700 hover:border-green-700 cursor-pointer' : 'opacity-50'}">
|
||||||
|
{idx}
|
||||||
|
</div>
|
||||||
|
{/each}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{/if}
|
||||||
|
</span>
|
||||||
|
|
||||||
|
<div>|</div>
|
||||||
|
<div>Transformation:</div>
|
||||||
|
<span on:click={() => canPrev && currentStage--}>
|
||||||
|
<ArrowLeft size="1x" class={canPrev ? 'hover:text-green-700 cursor-pointer' : 'opacity-50'} />
|
||||||
|
</span>
|
||||||
|
<span on:click={() => canNext && currentStage++}>
|
||||||
|
<ArrowRight size="1x" class={canNext ? 'hover:text-green-700 cursor-pointer' : 'opacity-50'} />
|
||||||
|
</span>
|
||||||
|
<div>{stageNames[currentStage]}</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<ItemTable schema={stageSchema} itemsByPage={visiblePages} {maxPage} {pageFocus} />
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.controls {
|
||||||
|
@apply bg-gray-50;
|
||||||
|
position: -webkit-sticky;
|
||||||
|
position: sticky;
|
||||||
|
top: 0;
|
||||||
|
z-index: 3;
|
||||||
|
}
|
||||||
|
</style>
|
93
ui/src/debug/ItemTable.svelte
Normal file
93
ui/src/debug/ItemTable.svelte
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
<script>
|
||||||
|
import type Item from '@core/Item';
|
||||||
|
|
||||||
|
export let schema: string[];
|
||||||
|
export let itemsByPage: [number, Item[]][];
|
||||||
|
export let maxPage: number;
|
||||||
|
export let pageFocus: boolean;
|
||||||
|
|
||||||
|
/**
 * Renders a single item cell value for display in the table.
 *
 * - Numbers are rendered with two fraction digits.
 * - Arrays are rendered as `[a, b, c]`; when the array is non-empty and every
 *   element is a number, each element is rendered with two fraction digits.
 * - Everything else (strings, `null`, `undefined`, plain objects, ...) is
 *   returned untouched so the template can render it as-is.
 *
 * @param value the raw cell value taken from an item's data
 * @returns the formatted string for numbers/arrays, otherwise `value` itself
 */
function format(value: unknown): unknown {
  if (typeof value === 'number') {
    return value.toFixed(2);
  }
  // Array.isArray must be used directly: wrapping it in `typeof` yields the
  // (always truthy) string 'boolean', which sent null and plain objects into
  // the array branch where they crashed on `.length` / `.join`.
  if (Array.isArray(value)) {
    const elements: unknown[] = value;
    // Only apply number formatting when *all* elements are numbers; checking
    // just the first one would call toFixed on non-numbers in mixed arrays.
    if (elements.length > 0 && elements.every((element): element is number => typeof element === 'number')) {
      return '[' + elements.map((element) => element.toFixed(2)).join(', ') + ']';
    }
    return '[' + elements.join(', ') + ']';
  }
  return value;
}
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<!-- Item table -->
|
||||||
|
<table class="w-full text-left">
|
||||||
|
<!-- Sticky header -->
|
||||||
|
<thead class=" ">
|
||||||
|
<th />
|
||||||
|
<th>#</th>
|
||||||
|
{#each schema as column}
|
||||||
|
<th>{column}</th>
|
||||||
|
{/each}
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{#each itemsByPage as [pageNumber, items], pageIdx}
|
||||||
|
<!-- Separator between pages -->
|
||||||
|
{#if pageIdx > 0}
|
||||||
|
<tr class="h-5" />
|
||||||
|
{/if}
|
||||||
|
{#each items as item, itemIdx}
|
||||||
|
<tr>
|
||||||
|
<!-- Page number in first page item row -->
|
||||||
|
{#if itemIdx === 0}
|
||||||
|
<td class="page bg-gray-50">
|
||||||
|
<div>Page {pageNumber} {pageFocus ? '' : ' / ' + maxPage}</div>
|
||||||
|
</td>
|
||||||
|
{:else}
|
||||||
|
<td />
|
||||||
|
{/if}
|
||||||
|
<td>{itemIdx}</td>
|
||||||
|
{#each schema as column}
|
||||||
|
<td>{format(item.data[column])}</td>
|
||||||
|
{/each}
|
||||||
|
</tr>
|
||||||
|
{/each}
|
||||||
|
{/each}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.page {
|
||||||
|
@apply text-lg;
|
||||||
|
@apply font-semibold;
|
||||||
|
@apply pr-4;
|
||||||
|
@apply whitespace-nowrap;
|
||||||
|
position: -webkit-sticky;
|
||||||
|
position: sticky;
|
||||||
|
top: 2em;
|
||||||
|
z-index: 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
th {
|
||||||
|
@apply px-1;
|
||||||
|
position: -webkit-sticky;
|
||||||
|
position: sticky;
|
||||||
|
top: 2.4em;
|
||||||
|
z-index: 2;
|
||||||
|
}
|
||||||
|
th:not(:first-child) {
|
||||||
|
@apply bg-gray-300;
|
||||||
|
@apply shadow;
|
||||||
|
}
|
||||||
|
td:not(:first-child) {
|
||||||
|
@apply px-1;
|
||||||
|
@apply border-b;
|
||||||
|
}
|
||||||
|
|
||||||
|
tr:hover td:not(:first-child) {
|
||||||
|
@apply bg-gray-200;
|
||||||
|
}
|
||||||
|
</style>
|
@ -1,21 +1,23 @@
|
|||||||
import { pdfParser, parseReporter } from '@core';
|
import { pdfParser, createPipeline, parseReporter } from '@core';
|
||||||
import type ProgressListenFunction from '@core/ProgressListenFunction';
|
import type ProgressListenFunction from '@core/ProgressListenFunction';
|
||||||
import type ParseResult from '@core/ParseResult';
|
import type ParseResult from '@core/ParseResult';
|
||||||
|
import type Debugger from '@core/Debugger';
|
||||||
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
|
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
|
||||||
|
|
||||||
import { Writable, writable } from 'svelte/store';
|
import { Writable, writable } from 'svelte/store';
|
||||||
|
|
||||||
|
export let debug: Writable<Debugger> = writable(undefined);
|
||||||
export let parseResult: Writable<ParseResult> = writable(undefined);
|
export let parseResult: Writable<ParseResult> = writable(undefined);
|
||||||
|
|
||||||
pdfjs.GlobalWorkerOptions.workerSrc = 'worker/pdf.worker.min.js';
|
pdfjs.GlobalWorkerOptions.workerSrc = 'worker/pdf.worker.min.js';
|
||||||
|
|
||||||
const parser = pdfParser(pdfjs);
|
const pdfPipeline = createPipeline(pdfjs, {});
|
||||||
|
|
||||||
export async function loadExample(progressListener: ProgressListenFunction): Promise<ParseResult> {
|
export async function loadExample(progressListener: ProgressListenFunction): Promise<any> {
|
||||||
return parsePdf(parser.parseUrl('/ExamplePdf.pdf', parseReporter(progressListener)));
|
return parsePdf('/ExamplePdf.pdf', progressListener);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function processUpload(file: File, progressListener: ProgressListenFunction): Promise<ParseResult> {
|
export async function processUpload(file: File, progressListener: ProgressListenFunction): Promise<any> {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
const reader = new FileReader();
|
const reader = new FileReader();
|
||||||
reader.onerror = reject;
|
reader.onerror = reject;
|
||||||
@ -25,13 +27,18 @@ export async function processUpload(file: File, progressListener: ProgressListen
|
|||||||
reader.readAsArrayBuffer(file);
|
reader.readAsArrayBuffer(file);
|
||||||
}).then((buffer) => {
|
}).then((buffer) => {
|
||||||
const data = new Uint8Array(buffer as ArrayBuffer);
|
const data = new Uint8Array(buffer as ArrayBuffer);
|
||||||
return parsePdf(parser.parseBytes(data, parseReporter(progressListener)));
|
return parsePdf(data, progressListener);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Runs the PDF pipeline in debug mode for the given source and publishes the
 * resulting debugger instance to the `debug` store.
 *
 * The pipeline promise is awaited so that callers of `loadExample` /
 * `processUpload` only resolve once parsing has finished, and so that a
 * pipeline failure rejects the returned promise instead of surfacing as an
 * unhandled rejection.
 *
 * @param src URL of the PDF or its raw bytes
 * @param progressListener callback handed to the pipeline
 * @returns the debugger instance that was written to the `debug` store
 */
async function parsePdf(src: string | Uint8Array, progressListener: ProgressListenFunction): Promise<any> {
  const debugInstance = await pdfPipeline.debug(src, progressListener);
  debug.set(debugInstance);
  return debugInstance;
  //TODO without debug-flag
  // return pdfPipeline.execute(src, progressListener).then((result) => {
  //   parseResult.set(result);
  //   return result;
  // });
}
|
||||||
|
Loading…
Reference in New Issue
Block a user