mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-27 15:20:47 +01:00
Add sparse support for final convert
This commit is contained in:
parent
5bf4988da2
commit
075639979e
@ -1,8 +1,5 @@
|
||||
import ItemTransformer from './transformer/ItemTransformer';
|
||||
|
||||
export default interface Config {
|
||||
// See DocumentInitParameters from https://mozilla.github.io/pdf.js/api/draft/module-pdfjsLib.html#DocumentInitParameters
|
||||
pdfjsParams?: object;
|
||||
export default interface TransformConfig {
|
||||
transformers?: ItemTransformer[];
|
||||
// TODO keep pdfPages ?
|
||||
}
|
||||
|
@ -3,7 +3,7 @@ import ItemTransformer from './transformer/ItemTransformer';
|
||||
import TransformContext from './transformer/TransformContext';
|
||||
import StageResult, { initialStage } from './debug/StageResult';
|
||||
import ColumnAnnotation from './debug/ColumnAnnotation';
|
||||
import AnnotatedColumn from './debug/AnnotatedColumn';
|
||||
import type AnnotatedColumn from './debug/AnnotatedColumn';
|
||||
import { detectChanges } from './debug/detectChanges';
|
||||
import { asPages } from './debug/Page';
|
||||
import EvaluationTracker from './debug/EvaluationTracker';
|
||||
|
115
core/src/convert.ts
Normal file
115
core/src/convert.ts
Normal file
@ -0,0 +1,115 @@
|
||||
import Globals from './Globals';
|
||||
import Item from './Item';
|
||||
import ParseProgressReporter from './ParseProgressReporter';
|
||||
import ParseResult from './ParseResult';
|
||||
import PdfParser from './PdfParser';
|
||||
import type ProgressListenFunction from './ProgressListenFunction';
|
||||
import { assert } from './assert';
|
||||
import ItemTransformer from './transformer/ItemTransformer';
|
||||
import TransformContext from './transformer/TransformContext';
|
||||
|
||||
export type PdfSource = string | Uint8Array | object;
|
||||
|
||||
export interface Converter {
|
||||
convert: (items: Item[]) => string;
|
||||
}
|
||||
|
||||
interface Options {
|
||||
debug: boolean;
|
||||
progressListener: ProgressListenFunction;
|
||||
}
|
||||
|
||||
const defaultOptions: Options = {
|
||||
debug: false,
|
||||
progressListener: () => {},
|
||||
};
|
||||
|
||||
export interface TransformationResult {
|
||||
convert(converter: Converter): string;
|
||||
}
|
||||
|
||||
/**
 * Parses the given PDF source and runs the transformer pipeline over the parsed items.
 *
 * The transformed items are captured by the returned object, so `convert` can be invoked
 * with any {@link Converter} (and more than once) without parsing or transforming again.
 *
 * @param src the PDF source handed to `parser.parse`
 * @param parser the parser used to read `src`
 * @param transformers transformers to apply, in order
 * @param progressListener callback wrapped in a `ParseProgressReporter` for the parse step
 * @returns an object whose `convert` passes the transformed items to the supplied converter
 */
export async function parseAndTransform(
  src: PdfSource,
  parser: PdfParser,
  transformers: ItemTransformer[],
  progressListener: ProgressListenFunction,
): Promise<TransformationResult> {
  const parseResult = await parseAndVerifyTransformers(src, parser, transformers, progressListener);
  const transformedItems = transform(parseResult, transformers);
  // An async function already wraps its return value in a promise; no Promise.resolve needed.
  return {
    convert: (converter: Converter) => converter.convert(transformedItems),
  };
}
|
||||
|
||||
/**
 * Parses a PDF, applies the transformer pipeline and renders the result in a single call.
 *
 * The parse/verify and transform steps are delegated to the same helpers that
 * `parseAndTransform` uses, so both entry points always run the identical pipeline.
 *
 * @param src the PDF source handed to `parser.parse`
 * @param parser the parser used to read `src`
 * @param transformers transformers to apply, in order
 * @param converter renders the transformed items into the final string
 * @param options optional settings; only `progressListener` is read here
 * @returns the string produced by `converter.convert`
 */
export async function convert(
  src: PdfSource,
  parser: PdfParser,
  transformers: ItemTransformer[],
  converter: Converter,
  options: Options = defaultOptions,
): Promise<string> {
  // parse (and check that every transformer finds the columns it requires)
  const parseResult = await parseAndVerifyTransformers(src, parser, transformers, options.progressListener);

  // transform
  const items = transform(parseResult, transformers);

  // convert
  return converter.convert(items);
}
|
||||
|
||||
/**
 * Parses `src` and checks the transformer chain against the schema of the parse result.
 *
 * @param src the PDF source handed to `parser.parse`
 * @param parser the parser used to read `src`
 * @param transformers transformers whose required columns are verified
 * @param progressListener callback wrapped in a `ParseProgressReporter` for the parse step
 * @returns the parse result, once verification has passed
 */
async function parseAndVerifyTransformers(
  src: PdfSource,
  parser: PdfParser,
  transformers: ItemTransformer[],
  progressListener: ProgressListenFunction,
): Promise<ParseResult> {
  const reporter = new ParseProgressReporter(progressListener);
  const parsed = await parser.parse(src, reporter);
  verifyRequiredColumns(parsed.schema, transformers);
  return parsed;
}
|
||||
|
||||
/**
 * Walks the transformer chain and asserts that every column a transformer requires is present
 * in the schema it receives: the input schema for the first transformer, and the output of the
 * preceding transformer's `schemaTransformer` for each one after that.
 *
 * @param inputSchema column names available to the first transformer
 * @param transformers transformers in execution order
 */
function verifyRequiredColumns(inputSchema: string[], transformers: ItemTransformer[]) {
  let currentSchema = inputSchema;
  for (const transformer of transformers) {
    const requiredColumns = transformer.descriptor.requireColumns ?? [];
    for (const column of requiredColumns) {
      assert(
        currentSchema.includes(column),
        `Input schema [${currentSchema.join(', ')}] for transformer '${
          transformer.name
        }' does not contain the required column '${column}'`,
      );
    }
    // The schema this transformer emits is what its successor gets to see.
    currentSchema = transformer.schemaTransformer(currentSchema);
  }
}
|
||||
|
||||
/**
 * Feeds the parsed items through each transformer in turn.
 *
 * All transformers share one `TransformContext`, built from the parse result's font map and
 * page viewports plus a fresh `Globals`. After each step the globals reported by the
 * transformer are passed to `Globals.withValues`, and the step's items become the input of
 * the next transformer.
 *
 * @param parseResult result of the parse step
 * @param transformers transformers to apply, in order
 * @returns the items returned by the last transformer, or the parsed items if there are none
 */
function transform(parseResult: ParseResult, transformers: ItemTransformer[]) {
  let currentItems = parseResult.items;
  let globals = new Globals();
  const context = new TransformContext(parseResult.fontMap, parseResult.pageViewports, globals);
  for (const transformer of transformers) {
    const step = transformer.transform(context, currentItems);
    globals = globals.withValues(step.globals);
    currentItems = step.items;
  }
  return currentItems;
}
|
24
core/src/convert/MarkdownConverter.ts
Normal file
24
core/src/convert/MarkdownConverter.ts
Normal file
@ -0,0 +1,24 @@
|
||||
import Item from 'src/Item';
|
||||
import { Converter } from 'src/convert';
|
||||
|
||||
/**
 * Renders items as Markdown text.
 *
 * Every item contributes its `str` value followed by a newline. Items whose `types` contain
 * `H1`, `H2` or `H3` (checked in that order) are written as a heading of the matching level
 * and are followed by one additional newline.
 */
export default class MarkdownConverter implements Converter {
  /**
   * @param items the items to render, in output order
   * @returns the concatenated Markdown text; an empty string for no items
   */
  convert(items: Item[]): string {
    let content = '';

    items.forEach((item) => {
      // Items without a `types` entry are treated as plain text.
      const types = item.data['types'] || [];
      const itemText = item.data['str'];
      if (types.includes('H1')) {
        content += '# ' + itemText + '\n';
      } else if (types.includes('H2')) {
        content += '## ' + itemText + '\n';
      } else if (types.includes('H3')) {
        // A level-3 heading needs three hashes; two would render it as a second H2.
        content += '### ' + itemText + '\n';
      } else {
        content += itemText;
      }
      content += '\n';
    });
    return content;
  }
}
|
@ -1,4 +1,4 @@
|
||||
import Config from './Config';
|
||||
import TransformConfig from './Config';
|
||||
import type ProgressListenFunction from './ProgressListenFunction';
|
||||
import ParseProgressReporter from './ParseProgressReporter';
|
||||
import PdfParser from './PdfParser';
|
||||
@ -12,7 +12,9 @@ import CompactLines from './transformer/CompactLines';
|
||||
import SortXWithinLines from './transformer/SortXWithinLines';
|
||||
import RemoveRepetitiveItems from './transformer/RemoveRepetitiveItems';
|
||||
import DetectToc from './transformer/DetectToc';
|
||||
import DetectHeaders from './transformer/DetectHeaders';
|
||||
import NoOpTransformer from './transformer/NoOpTransformer';
|
||||
import {type ParseConfig } from './parse';
|
||||
|
||||
export const transformers = [
|
||||
new AdjustHeight(),
|
||||
@ -23,27 +25,20 @@ export const transformers = [
|
||||
new SortXWithinLines(),
|
||||
new RemoveRepetitiveItems(),
|
||||
new DetectToc(),
|
||||
new DetectHeaders(),
|
||||
new NoOpTransformer(),
|
||||
];
|
||||
|
||||
const defaultConfig: Config = {
|
||||
pdfjsParams: {
|
||||
// TODO check if that cmap thing makes sense since we don't bundle them
|
||||
cMapUrl: 'cmaps/',
|
||||
cMapPacked: true,
|
||||
},
|
||||
transformers,
|
||||
};
|
||||
|
||||
export function pdfParser(pdfJs: any) {
|
||||
return new PdfParser(pdfJs, defaultConfig.pdfjsParams);
|
||||
export interface Options {
|
||||
parseConfig?: ParseConfig;
|
||||
transformConfig?: TransformConfig;
|
||||
}
|
||||
|
||||
/**
 * Wraps a progress listener in a `ParseProgressReporter`.
 *
 * @param progressListener the callback handed to the reporter's constructor
 * @returns the newly created reporter
 */
export function parseReporter(progressListener: ProgressListenFunction) {
  const reporter = new ParseProgressReporter(progressListener);
  return reporter;
}
|
||||
|
||||
export function createPipeline(pdfJs: any, config = defaultConfig): PdfPipeline {
|
||||
const parser = new PdfParser(pdfJs, config.pdfjsParams);
|
||||
return new PdfPipeline(parser, config.transformers || transformers);
|
||||
export function createPipeline(pdfJs: any, options: Options = {}): PdfPipeline {
|
||||
const parser = new PdfParser(pdfJs);
|
||||
return new PdfPipeline(parser, options.transformConfig?.transformers || transformers);
|
||||
}
|
||||
|
19
core/src/parse.ts
Normal file
19
core/src/parse.ts
Normal file
@ -0,0 +1,19 @@
|
||||
import PdfParser from './PdfParser';
|
||||
|
||||
export interface ParseConfig {
|
||||
// See DocumentInitParameters from https://mozilla.github.io/pdf.js/api/draft/module-pdfjsLib.html#DocumentInitParameters
|
||||
pdfjsParams?: object;
|
||||
// TODO keep pdfPages ?
|
||||
}
|
||||
|
||||
const defaultConfig: ParseConfig = {
|
||||
pdfjsParams: {
|
||||
// TODO check if that cmap thing makes sense since we don't bundle them
|
||||
cMapUrl: 'cmaps/',
|
||||
cMapPacked: true,
|
||||
},
|
||||
};
|
||||
|
||||
/**
 * Creates a `PdfParser` for the given pdf.js module.
 *
 * @param pdfJs the pdf.js module/object passed through to the `PdfParser` constructor
 * @param options parse configuration; when omitted, `defaultConfig` is used. Only its
 *   `pdfjsParams` are forwarded, exactly as given (no merging with the defaults).
 * @returns the newly created parser
 */
export function parser(pdfJs: any, options: ParseConfig = defaultConfig) {
  const { pdfjsParams } = options;
  return new PdfParser(pdfJs, pdfjsParams);
}
|
Loading…
Reference in New Issue
Block a user