mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-25 03:51:33 +02:00
Add sparse support for final convert
This commit is contained in:
parent
5bf4988da2
commit
075639979e
@ -1,8 +1,5 @@
|
|||||||
import ItemTransformer from './transformer/ItemTransformer';
|
import ItemTransformer from './transformer/ItemTransformer';
|
||||||
|
|
||||||
export default interface Config {
|
export default interface TransformConfig {
|
||||||
// See DocumentInitParameters from https://mozilla.github.io/pdf.js/api/draft/module-pdfjsLib.html#DocumentInitParameters
|
|
||||||
pdfjsParams?: object;
|
|
||||||
transformers?: ItemTransformer[];
|
transformers?: ItemTransformer[];
|
||||||
// TODO keep pdfPages ?
|
|
||||||
}
|
}
|
||||||
|
@ -3,7 +3,7 @@ import ItemTransformer from './transformer/ItemTransformer';
|
|||||||
import TransformContext from './transformer/TransformContext';
|
import TransformContext from './transformer/TransformContext';
|
||||||
import StageResult, { initialStage } from './debug/StageResult';
|
import StageResult, { initialStage } from './debug/StageResult';
|
||||||
import ColumnAnnotation from './debug/ColumnAnnotation';
|
import ColumnAnnotation from './debug/ColumnAnnotation';
|
||||||
import AnnotatedColumn from './debug/AnnotatedColumn';
|
import type AnnotatedColumn from './debug/AnnotatedColumn';
|
||||||
import { detectChanges } from './debug/detectChanges';
|
import { detectChanges } from './debug/detectChanges';
|
||||||
import { asPages } from './debug/Page';
|
import { asPages } from './debug/Page';
|
||||||
import EvaluationTracker from './debug/EvaluationTracker';
|
import EvaluationTracker from './debug/EvaluationTracker';
|
||||||
|
115
core/src/convert.ts
Normal file
115
core/src/convert.ts
Normal file
@ -0,0 +1,115 @@
|
|||||||
|
import Globals from './Globals';
|
||||||
|
import Item from './Item';
|
||||||
|
import ParseProgressReporter from './ParseProgressReporter';
|
||||||
|
import ParseResult from './ParseResult';
|
||||||
|
import PdfParser from './PdfParser';
|
||||||
|
import type ProgressListenFunction from './ProgressListenFunction';
|
||||||
|
import { assert } from './assert';
|
||||||
|
import ItemTransformer from './transformer/ItemTransformer';
|
||||||
|
import TransformContext from './transformer/TransformContext';
|
||||||
|
|
||||||
|
/**
 * A PDF input as accepted by this module's entry points; the value is handed
 * unchanged to `PdfParser.parse`.
 */
export type PdfSource =
  | string
  | Uint8Array
  | object;
|
||||||
|
|
||||||
|
/**
 * Final stage of a conversion: turns the transformed items into the output text.
 */
export interface Converter {
  /**
   * @param items - the items left after all transformers have run
   * @returns the rendered output
   */
  convert: (items: Item[]) => string;
}
|
||||||
|
|
||||||
|
/**
 * Optional settings for `convert`.
 */
interface Options {
  /** Debug switch. NOTE(review): not read anywhere in the visible part of this file. */
  debug: boolean;
  /** Callback that is wrapped in a `ParseProgressReporter` and passed to the parser. */
  progressListener: ProgressListenFunction;
}
|
||||||
|
|
||||||
|
/**
 * Options used by `convert` when the caller supplies none: debugging off and
 * progress notifications discarded.
 */
const defaultOptions: Options = {
  debug: false,
  progressListener: () => {
    // Intentionally empty: progress is ignored unless the caller provides a listener.
  },
};
|
||||||
|
|
||||||
|
/**
 * Result of `parseAndTransform`: holds the transformed items and renders them
 * with whichever `Converter` the caller passes in.
 */
export interface TransformationResult {
  /**
   * @param converter - the converter used to render the transformed items
   * @returns whatever `converter.convert` returns for those items
   */
  convert(converter: Converter): string;
}
|
||||||
|
|
||||||
|
/**
 * Parses `src`, verifies that the transformer chain is schema-compatible and
 * runs the transformers over the parsed items.
 *
 * The conversion step is deferred: the returned object can be asked to render
 * the (already transformed) items with any `Converter`.
 *
 * @param src - the PDF input, passed on to `parser.parse`
 * @param parser - the parser producing the initial items
 * @param transformers - transformers to apply, in order
 * @param progressListener - receives parse progress notifications
 * @returns an object whose `convert` renders the transformed items
 */
export async function parseAndTransform(
  src: PdfSource,
  parser: PdfParser,
  transformers: ItemTransformer[],
  progressListener: ProgressListenFunction,
): Promise<TransformationResult> {
  const parseResult = await parseAndVerifyTransformers(src, parser, transformers, progressListener);
  const transformedItems = transform(parseResult, transformers);
  // An async function already wraps its return value in a promise,
  // so no explicit Promise.resolve is needed here.
  return {
    convert: (converter: Converter) => converter.convert(transformedItems),
  };
}
|
||||||
|
|
||||||
|
/**
 * Runs the complete pipeline in one go: parse `src`, verify the transformer
 * chain against the parsed schema, apply the transformers and render the
 * result with `converter`.
 *
 * Delegates to `parseAndVerifyTransformers` and `transform` so that this entry
 * point and `parseAndTransform` share one implementation of each stage.
 *
 * @param src - the PDF input, passed on to `parser.parse`
 * @param parser - the parser producing the initial items
 * @param transformers - transformers to apply, in order
 * @param converter - renders the transformed items
 * @param options - only `progressListener` is read here; `debug` is currently unused
 * @returns the text produced by `converter.convert`
 */
export async function convert(
  src: PdfSource,
  parser: PdfParser,
  transformers: ItemTransformer[],
  converter: Converter,
  options: Options = defaultOptions,
): Promise<string> {
  // parse (+ schema verification)
  const parseResult = await parseAndVerifyTransformers(src, parser, transformers, options.progressListener);

  // transform
  const items = transform(parseResult, transformers);

  // convert
  return converter.convert(items);
}
|
||||||
|
|
||||||
|
/**
 * Parses `src` and checks the transformer chain against the schema of the
 * parse result before handing that result back.
 *
 * @param src - the PDF input, passed on to `parser.parse`
 * @param parser - the parser to run
 * @param transformers - transformers whose required columns are verified
 * @param progressListener - wrapped in a `ParseProgressReporter` for the parser
 * @returns the unmodified parse result
 */
async function parseAndVerifyTransformers(
  src: PdfSource,
  parser: PdfParser,
  transformers: ItemTransformer[],
  progressListener: ProgressListenFunction,
): Promise<ParseResult> {
  const reporter = new ParseProgressReporter(progressListener);
  const parsed = await parser.parse(src, reporter);
  verifyRequiredColumns(parsed.schema, transformers);
  return parsed;
}
|
||||||
|
|
||||||
|
/**
 * Goes through all transformers and makes sure each required column is available in its predecessor's schema.
 *
 * The first transformer is checked against `inputSchema`; every following one is checked against
 * the schema returned by its predecessor's `schemaTransformer`.
 *
 * @param inputSchema - column names available before the first transformer runs
 * @param transformers - transformers in execution order
 * @throws presumably whatever `assert` raises when a required column is missing (depends on the `assert` helper)
 */
function verifyRequiredColumns(inputSchema: string[], transformers: ItemTransformer[]) {
  // Only the most recent schema is ever consulted, so track it directly
  // instead of collecting every intermediate schema in an array.
  let schema = inputSchema;
  for (const transformer of transformers) {
    const predecessorSchema = schema;
    transformer.descriptor.requireColumns?.forEach((column) => {
      assert(
        predecessorSchema.includes(column),
        `Input schema [${predecessorSchema.join(', ')}] for transformer '${
          transformer.name
        }' does not contain the required column '${column}'`,
      );
    });
    schema = transformer.schemaTransformer(predecessorSchema);
  }
}
|
||||||
|
|
||||||
|
/**
 * Applies the transformers one after another, feeding each the items produced
 * by its predecessor, and returns the items of the last one.
 *
 * @param parseResult - supplies the initial items plus the font map and page viewports for the context
 * @param transformers - transformers to apply, in order
 * @returns the items after the final transformer (the parsed items if there are no transformers)
 */
function transform(parseResult: ParseResult, transformers: ItemTransformer[]) {
  let currentItems = parseResult.items;
  let accumulatedGlobals = new Globals();
  // NOTE(review): the context receives the initial Globals instance; the values merged below are
  // only kept in the local variable. Confirm whether transformers are meant to see them.
  const context = new TransformContext(parseResult.fontMap, parseResult.pageViewports, accumulatedGlobals);
  for (const transformer of transformers) {
    const { globals, items } = transformer.transform(context, currentItems);
    accumulatedGlobals = accumulatedGlobals.withValues(globals);
    currentItems = items;
  }
  return currentItems;
}
|
24
core/src/convert/MarkdownConverter.ts
Normal file
24
core/src/convert/MarkdownConverter.ts
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
import Item from 'src/Item';
|
||||||
|
import { Converter } from 'src/convert';
|
||||||
|
|
||||||
|
/**
 * Renders items as markdown text.
 *
 * Every item contributes the value of `data['str']` followed by a newline.
 * Items whose `data['types']` list contains `H1`, `H2` or `H3` are written as
 * a heading of that level and are followed by an additional blank line.
 */
export default class MarkdownConverter implements Converter {
  /**
   * @param items - the items to render, in output order
   * @returns the concatenated markdown text
   */
  convert(items: Item[]) {
    let content = '';

    items.forEach((item) => {
      const types = item.data['types'] || [];
      const itemText = item.data['str'];
      if (types.includes('H1')) {
        content += '# ' + itemText + '\n';
      } else if (types.includes('H2')) {
        content += '## ' + itemText + '\n';
      } else if (types.includes('H3')) {
        // A level-3 heading needs three hashes; '## ' would render it as H2.
        content += '### ' + itemText + '\n';
      } else {
        content += itemText;
      }
      content += '\n';
    });
    return content;
  }
}
|
@ -1,4 +1,4 @@
|
|||||||
import Config from './Config';
|
import TransformConfig from './Config';
|
||||||
import type ProgressListenFunction from './ProgressListenFunction';
|
import type ProgressListenFunction from './ProgressListenFunction';
|
||||||
import ParseProgressReporter from './ParseProgressReporter';
|
import ParseProgressReporter from './ParseProgressReporter';
|
||||||
import PdfParser from './PdfParser';
|
import PdfParser from './PdfParser';
|
||||||
@ -12,7 +12,9 @@ import CompactLines from './transformer/CompactLines';
|
|||||||
import SortXWithinLines from './transformer/SortXWithinLines';
|
import SortXWithinLines from './transformer/SortXWithinLines';
|
||||||
import RemoveRepetitiveItems from './transformer/RemoveRepetitiveItems';
|
import RemoveRepetitiveItems from './transformer/RemoveRepetitiveItems';
|
||||||
import DetectToc from './transformer/DetectToc';
|
import DetectToc from './transformer/DetectToc';
|
||||||
|
import DetectHeaders from './transformer/DetectHeaders';
|
||||||
import NoOpTransformer from './transformer/NoOpTransformer';
|
import NoOpTransformer from './transformer/NoOpTransformer';
|
||||||
|
import {type ParseConfig } from './parse';
|
||||||
|
|
||||||
export const transformers = [
|
export const transformers = [
|
||||||
new AdjustHeight(),
|
new AdjustHeight(),
|
||||||
@ -23,27 +25,20 @@ export const transformers = [
|
|||||||
new SortXWithinLines(),
|
new SortXWithinLines(),
|
||||||
new RemoveRepetitiveItems(),
|
new RemoveRepetitiveItems(),
|
||||||
new DetectToc(),
|
new DetectToc(),
|
||||||
|
new DetectHeaders(),
|
||||||
new NoOpTransformer(),
|
new NoOpTransformer(),
|
||||||
];
|
];
|
||||||
|
|
||||||
const defaultConfig: Config = {
|
export interface Options {
|
||||||
pdfjsParams: {
|
parseConfig?: ParseConfig;
|
||||||
// TODO check if that cmap thing makes sense since we don't bundle them
|
transformConfig?: TransformConfig;
|
||||||
cMapUrl: 'cmaps/',
|
|
||||||
cMapPacked: true,
|
|
||||||
},
|
|
||||||
transformers,
|
|
||||||
};
|
|
||||||
|
|
||||||
export function pdfParser(pdfJs: any) {
|
|
||||||
return new PdfParser(pdfJs, defaultConfig.pdfjsParams);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export function parseReporter(progressListener: ProgressListenFunction) {
|
export function parseReporter(progressListener: ProgressListenFunction) {
|
||||||
return new ParseProgressReporter(progressListener);
|
return new ParseProgressReporter(progressListener);
|
||||||
}
|
}
|
||||||
|
|
||||||
export function createPipeline(pdfJs: any, config = defaultConfig): PdfPipeline {
|
export function createPipeline(pdfJs: any, options: Options = {}): PdfPipeline {
|
||||||
const parser = new PdfParser(pdfJs, config.pdfjsParams);
|
const parser = new PdfParser(pdfJs);
|
||||||
return new PdfPipeline(parser, config.transformers || transformers);
|
return new PdfPipeline(parser, options.transformConfig?.transformers || transformers);
|
||||||
}
|
}
|
||||||
|
19
core/src/parse.ts
Normal file
19
core/src/parse.ts
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
import PdfParser from './PdfParser';
|
||||||
|
|
||||||
|
/**
 * Settings for the parse stage.
 */
export interface ParseConfig {
  /**
   * Parameters handed to the `PdfParser` constructor.
   * See DocumentInitParameters from https://mozilla.github.io/pdf.js/api/draft/module-pdfjsLib.html#DocumentInitParameters
   */
  pdfjsParams?: object;
  // TODO keep pdfPages ?
}
|
||||||
|
|
||||||
|
/**
 * Parse configuration used by `parser` when the caller passes no options.
 */
const defaultConfig: ParseConfig = {
  pdfjsParams: {
    // TODO check if that cmap thing makes sense since we don't bundle them
    cMapUrl: 'cmaps/',
    cMapPacked: true,
  },
};
|
||||||
|
|
||||||
|
/**
 * Creates a `PdfParser` for the given pdf.js instance.
 *
 * @param pdfJs - the pdf.js module/object the parser should use
 * @param options - parse settings; `defaultConfig` is used only when the argument is omitted entirely
 * @returns a new `PdfParser` constructed with `options.pdfjsParams`
 */
export function parser(pdfJs: any, options: ParseConfig = defaultConfig): PdfParser {
  const { pdfjsParams } = options;
  return new PdfParser(pdfJs, pdfjsParams);
}
|
Loading…
x
Reference in New Issue
Block a user