Add sparse support for final convert

This commit is contained in:
Johannes Zillmann 2024-03-25 10:21:27 -06:00
parent 5bf4988da2
commit 075639979e
6 changed files with 170 additions and 20 deletions

View File

@ -1,8 +1,5 @@
import ItemTransformer from './transformer/ItemTransformer';
export default interface Config {
// See DocumentInitParameters from https://mozilla.github.io/pdf.js/api/draft/module-pdfjsLib.html#DocumentInitParameters
pdfjsParams?: object;
export default interface TransformConfig {
transformers?: ItemTransformer[];
// TODO keep pdfPages ?
}

View File

@ -3,7 +3,7 @@ import ItemTransformer from './transformer/ItemTransformer';
import TransformContext from './transformer/TransformContext';
import StageResult, { initialStage } from './debug/StageResult';
import ColumnAnnotation from './debug/ColumnAnnotation';
import AnnotatedColumn from './debug/AnnotatedColumn';
import type AnnotatedColumn from './debug/AnnotatedColumn';
import { detectChanges } from './debug/detectChanges';
import { asPages } from './debug/Page';
import EvaluationTracker from './debug/EvaluationTracker';

115
core/src/convert.ts Normal file
View File

@ -0,0 +1,115 @@
import Globals from './Globals';
import Item from './Item';
import ParseProgressReporter from './ParseProgressReporter';
import ParseResult from './ParseResult';
import PdfParser from './PdfParser';
import type ProgressListenFunction from './ProgressListenFunction';
import { assert } from './assert';
import ItemTransformer from './transformer/ItemTransformer';
import TransformContext from './transformer/TransformContext';
/**
 * Input accepted as the PDF source. The value is handed unchanged to `PdfParser.parse`.
 * NOTE(review): presumably a URL/path string, raw PDF bytes, or a pdf.js parameter object — confirm against PdfParser.
 */
export type PdfSource = string | Uint8Array | object;
/**
 * Last stage of a conversion: renders a list of items into a single output string.
 */
export interface Converter {
// Produces the output text for the given items.
convert: (items: Item[]) => string;
}
/**
 * Settings accepted by `convert`.
 */
interface Options {
// NOTE(review): not read by any code visible in this file — confirm whether it is still needed.
debug: boolean;
// Wrapped in a ParseProgressReporter and handed to the parser.
progressListener: ProgressListenFunction;
}
/**
 * Options used by `convert` when the caller supplies none: debug off and a progress listener that does nothing.
 */
const defaultOptions: Options = {
debug: false,
progressListener: () => {},
};
/**
 * Value resolved by `parseAndTransform`. It lets the caller render the already transformed items
 * with a converter of their choice.
 */
export interface TransformationResult {
// Runs the given converter and returns the string it produces.
convert(converter: Converter): string;
}
/**
 * Parses the PDF source, checks the transformer chain against the parsed schema and runs all
 * transformers over the parsed items.
 *
 * @param src the PDF source handed to the parser
 * @param parser parser used to read the source
 * @param transformers transformers applied in order to the parsed items
 * @param progressListener callback wrapped in the parse progress reporter
 * @returns a result whose `convert` feeds the transformed items into a converter
 * @throws if parsing fails or a transformer requires a column its predecessor schema lacks
 */
export async function parseAndTransform(
  src: PdfSource,
  parser: PdfParser,
  transformers: ItemTransformer[],
  progressListener: ProgressListenFunction,
): Promise<TransformationResult> {
  const parseResult = await parseAndVerifyTransformers(src, parser, transformers, progressListener);
  const transformedItems = transform(parseResult, transformers);
  // An async function already wraps its return value in a promise, so no Promise.resolve is needed.
  return {
    convert: (converter: Converter) => converter.convert(transformedItems),
  };
}
/**
 * Runs the whole pipeline in one call: parse the source, verify and apply the transformers,
 * then render the resulting items with the given converter.
 *
 * @param src the PDF source handed to the parser
 * @param parser parser used to read the source
 * @param transformers transformers applied in order to the parsed items
 * @param converter renders the transformed items into the output string
 * @param options optional settings; only `progressListener` is used here
 * @returns the string produced by the converter
 * @throws if parsing fails or a transformer requires a column its predecessor schema lacks
 */
export async function convert(
  src: PdfSource,
  parser: PdfParser,
  transformers: ItemTransformer[],
  converter: Converter,
  options: Options = defaultOptions,
): Promise<string> {
  // Reuse the helpers shared with parseAndTransform instead of duplicating the
  // parse/verify and transform steps inline.
  const parseResult = await parseAndVerifyTransformers(src, parser, transformers, options.progressListener);
  const items = transform(parseResult, transformers);
  return converter.convert(items);
}
/**
 * Parses the source and makes sure the transformer chain is compatible with the parsed schema.
 *
 * @param src the PDF source handed to the parser
 * @param parser parser used to read the source
 * @param transformers transformers whose required columns are checked
 * @param progressListener callback wrapped in the parse progress reporter
 * @returns the parse result, once the column check has passed
 */
async function parseAndVerifyTransformers(
  src: PdfSource,
  parser: PdfParser,
  transformers: ItemTransformer[],
  progressListener: ProgressListenFunction,
): Promise<ParseResult> {
  const reporter = new ParseProgressReporter(progressListener);
  const parsed = await parser.parse(src, reporter);
  verifyRequiredColumns(parsed.schema, transformers);
  return parsed;
}
/**
 * Walks the transformer chain in order and asserts that every column a transformer requires
 * is present in the schema produced by its predecessor (the input schema for the first one).
 *
 * @param inputSchema column names available before the first transformer runs
 * @param transformers the chain to check, in execution order
 */
function verifyRequiredColumns(inputSchema: string[], transformers: ItemTransformer[]) {
  // Only the schema feeding the current transformer is needed, so track it as a running value.
  let currentSchema = inputSchema;
  for (const transformer of transformers) {
    for (const column of transformer.descriptor.requireColumns ?? []) {
      assert(
        currentSchema.includes(column),
        `Input schema [${currentSchema.join(', ')}] for transformer '${
          transformer.name
        }' does not contain the required column '${column}'`,
      );
    }
    // The output schema of this transformer is the input schema of the next one.
    currentSchema = transformer.schemaTransformer(currentSchema);
  }
}
/**
 * Feeds the parsed items through every transformer in order and returns the final items.
 *
 * @param parseResult parse output providing the items, font map and page viewports
 * @param transformers transformers applied one after another
 * @returns the items returned by the last transformer (the parsed items if there are none)
 */
function transform(parseResult: ParseResult, transformers: ItemTransformer[]) {
  let currentItems = parseResult.items;
  let collectedGlobals = new Globals();
  // The context is built once, with the initial globals instance.
  // NOTE(review): the merged globals below are never handed back to the context — confirm this is intended.
  const context = new TransformContext(parseResult.fontMap, parseResult.pageViewports, collectedGlobals);
  for (const transformer of transformers) {
    const stepResult = transformer.transform(context, currentItems);
    collectedGlobals = collectedGlobals.withValues(stepResult.globals);
    currentItems = stepResult.items;
  }
  return currentItems;
}

View File

@ -0,0 +1,24 @@
import Item from 'src/Item';
import { Converter } from 'src/convert';
/**
 * Converter that renders items as Markdown text.
 *
 * Each item contributes its `str` value on its own line. Items whose `types` include
 * 'H1', 'H2' or 'H3' are written as headings of the matching level, followed by an empty line.
 */
export default class MarkdownConverter implements Converter {
  /**
   * @param items items to render, in output order
   * @returns the concatenated Markdown; an empty string for an empty list
   */
  convert(items: Item[]): string {
    let content = '';
    for (const item of items) {
      // Items without a 'types' entry are treated as plain text.
      const types = item.data['types'] || [];
      const itemText = item.data['str'];
      if (types.includes('H1')) {
        content += '# ' + itemText + '\n';
      } else if (types.includes('H2')) {
        content += '## ' + itemText + '\n';
      } else if (types.includes('H3')) {
        // Three hashes for a level-3 heading; two would render it as level 2.
        content += '### ' + itemText + '\n';
      } else {
        content += itemText;
      }
      content += '\n';
    }
    return content;
  }
}

View File

@ -1,4 +1,4 @@
import Config from './Config';
import TransformConfig from './Config';
import type ProgressListenFunction from './ProgressListenFunction';
import ParseProgressReporter from './ParseProgressReporter';
import PdfParser from './PdfParser';
@ -12,7 +12,9 @@ import CompactLines from './transformer/CompactLines';
import SortXWithinLines from './transformer/SortXWithinLines';
import RemoveRepetitiveItems from './transformer/RemoveRepetitiveItems';
import DetectToc from './transformer/DetectToc';
import DetectHeaders from './transformer/DetectHeaders';
import NoOpTransformer from './transformer/NoOpTransformer';
import {type ParseConfig } from './parse';
export const transformers = [
new AdjustHeight(),
@ -23,27 +25,20 @@ export const transformers = [
new SortXWithinLines(),
new RemoveRepetitiveItems(),
new DetectToc(),
new DetectHeaders(),
new NoOpTransformer(),
];
const defaultConfig: Config = {
pdfjsParams: {
// TODO check if that cmap thing makes sense since we don't bundle them
cMapUrl: 'cmaps/',
cMapPacked: true,
},
transformers,
};
export function pdfParser(pdfJs: any) {
return new PdfParser(pdfJs, defaultConfig.pdfjsParams);
export interface Options {
parseConfig?: ParseConfig;
transformConfig?: TransformConfig;
}
/**
 * Creates a ParseProgressReporter for the given progress listener.
 *
 * @param progressListener callback passed to the reporter's constructor
 * @returns the newly created reporter
 */
export function parseReporter(progressListener: ProgressListenFunction) {
  const reporter = new ParseProgressReporter(progressListener);
  return reporter;
}
export function createPipeline(pdfJs: any, config = defaultConfig): PdfPipeline {
const parser = new PdfParser(pdfJs, config.pdfjsParams);
return new PdfPipeline(parser, config.transformers || transformers);
/**
 * Builds a PdfPipeline from a pdf.js instance.
 *
 * @param pdfJs the pdf.js library object handed to the parser
 * @param options optional parse and transform configuration
 * @returns a pipeline using the configured transformers, or the default `transformers` list
 */
export function createPipeline(pdfJs: any, options: Options = {}): PdfPipeline {
  // Forward the caller's pdf.js parameters; previously `parseConfig` was accepted but ignored.
  // When none are given this passes `undefined`, which is equivalent to omitting the argument.
  const parser = new PdfParser(pdfJs, options.parseConfig?.pdfjsParams);
  return new PdfPipeline(parser, options.transformConfig?.transformers ?? transformers);
}

19
core/src/parse.ts Normal file
View File

@ -0,0 +1,19 @@
import PdfParser from './PdfParser';
/**
 * Configuration for creating a parser via `parser`.
 */
export interface ParseConfig {
// Parameters passed to the PdfParser constructor.
// See DocumentInitParameters from https://mozilla.github.io/pdf.js/api/draft/module-pdfjsLib.html#DocumentInitParameters
pdfjsParams?: object;
// TODO keep pdfPages ?
}
/**
 * Configuration used by `parser` when the caller passes no options.
 */
const defaultConfig: ParseConfig = {
pdfjsParams: {
// TODO check if that cmap thing makes sense since we don't bundle them
cMapUrl: 'cmaps/',
cMapPacked: true,
},
};
/**
 * Creates a PdfParser for the given pdf.js instance.
 *
 * @param pdfJs the pdf.js library object handed to the parser
 * @param options parse configuration; the module default is used when omitted
 * @returns the newly created parser
 */
export function parser(pdfJs: any, options: ParseConfig = defaultConfig) {
  // NOTE(review): a config passed without `pdfjsParams` does not fall back to the default params — confirm intended.
  const { pdfjsParams } = options;
  return new PdfParser(pdfJs, pdfjsParams);
}