Initial pdfJs integration

This commit is contained in:
Johannes Zillmann 2020-12-20 19:01:03 +01:00
parent f988bd565e
commit a3695a4a56
14 changed files with 338 additions and 19 deletions

27
core/README.md Normal file
View File

@ -0,0 +1,27 @@
# PDF-To-Markdown Converter Core
JavaScript library to parse PDF files and convert them into Markdown format. The UI version is available online at http://pdf2md.morethan.io!
## Use
//TBD
## Contribute
Use the [issue tracker](https://github.com/jzillmann/pdf-to-markdown/issues) and/or open [pull requests](https://github.com/jzillmann/pdf-to-markdown/pulls)!
## Build
- `npm install` Download all necessary npm packages
- `npm test` Run the tests
- `npm run lint` Lint the TypeScript files
- `npm run format` Run the prettier formatter
- `npm run build` Compile the TypeScript files to the `lib` folder
## Release
//TBD
## Credits
[pdf.js](https://mozilla.github.io/pdf.js/) - Mozilla's PDF parsing & rendering platform which is used as a raw parser

View File

@ -1,7 +1,7 @@
module.exports = {
preset: 'ts-jest',
testEnvironment: 'node',
roots: ['./'],
roots: ['./test'],
transform: { '\\.ts$': ['ts-jest'] },
testRegex: '(/test/.*|(\\.|/)(test|spec))\\.(ts)$',
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],

View File

@ -4488,6 +4488,12 @@
"integrity": "sha512-GSmOT2EbHrINBf9SR7CDELwlJ8AENk3Qn7OikK4nFYAu3Ote2+JYNVvkpAEQm3/TLNEJFD/xZJjzyxg3KBWOzw==",
"dev": true
},
"pdfjs-dist": {
"version": "2.5.207",
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-2.5.207.tgz",
"integrity": "sha512-xGDUhnCYPfHy+unMXCLCJtlpZaaZ17Ew3WIL0tnSgKFUZXHAPD49GO9xScyszSsQMoutNDgRb+rfBXIaX/lJbw==",
"dev": true
},
"performance-now": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz",

View File

@ -1,17 +1,7 @@
{
"name": "pdf-to-markdown-core",
"version": "1.0.0",
"version": "0.5.0",
"description": "Core logic for parsing a PDF and transforming it to Markdown",
"main": "index.js",
"files": [
"lib/src/**/*"
],
"scripts": {
"test": "jest",
"build": "tsc",
"format": "prettier --write \"src/**/*.ts\" \"src/**/*.js\"",
"lint": "tslint -p tsconfig.json"
},
"keywords": [
"PDF",
"Markdown",
@ -19,9 +9,24 @@
],
"author": "Johannes Zillmann",
"license": "AGPL-3.0",
"repository": {
"type": "git",
"url": "https://github.com/jzillmann/pdf-to-markdown"
},
"main": "index.js",
"files": [
"lib/src/**/*"
],
"scripts": {
"test": "jest",
"build": "tsc",
"format": "prettier --write \"src/**/*.ts\"",
"lint": "tslint -p tsconfig.json"
},
"devDependencies": {
"@types/jest": "^26.0.19",
"jest": "^26.6.3",
"pdfjs-dist": "^2.5.207",
"prettier": "^2.2.1",
"ts-jest": "^26.4.4",
"tslint": "^6.1.3",

23
core/src/Metadata.ts Normal file
View File

@ -0,0 +1,23 @@
/**
 * Read-only view on the raw metadata object of a parsed PDF.
 *
 * The wrapped object is expected to carry an optional `metadata` entry (keyed
 * by names such as `dc:title`) and an optional `info` entry (keyed by names
 * such as `Title`). Values are looked up in `metadata` first and in `info`
 * second.
 */
export default class Metadata {
  /** The raw, unmodified metadata object this instance was created with. */
  original: object;

  /**
   * @param original the raw metadata object to wrap
   */
  constructor(original: object) {
    this.original = original;
  }

  /**
   * @returns the document title, or `undefined` if neither source provides one
   */
  title() {
    return this.extract('Title', 'dc:title');
  }

  /**
   * @returns the document author, or `undefined` if neither source provides one
   */
  author() {
    return this.extract('Author', 'dc:creator');
  }

  /**
   * Looks a value up in `metadata` and falls back to `info`.
   *
   * @param infoName key to read from the `info` entry
   * @param metadataKey key to read from the `metadata` entry
   * @returns the first value found, or `undefined` when both sources lack it
   */
  private extract(infoName: string, metadataKey: string) {
    const { metadata, info } = this.original as { metadata?: any; info?: any };
    if (metadata) {
      // pdf.js hands out a Metadata object whose values are read via get(name);
      // plain objects are indexed directly.
      const value = typeof metadata.get === 'function' ? metadata.get(metadataKey) : metadata[metadataKey];
      if (value !== undefined && value !== null) {
        return value;
      }
    }
    // A metadata entry that lacks the key must not hide the value kept in the
    // info entry, and a missing info entry must not throw.
    return info ? info[infoName] : undefined;
  }
}

12
core/src/ParseResult.ts Normal file
View File

@ -0,0 +1,12 @@
import Metadata from './Metadata';
import ParsedPage from './ParsedPage';
/**
 * Outcome of parsing one PDF: the document's metadata together with its pages.
 */
export default class ParseResult {
  /**
   * @param metadata wrapper around the document's metadata
   * @param pages the parsed pages
   */
  constructor(public metadata: Metadata, public pages: ParsedPage[]) {}
}

13
core/src/ParsedPage.ts Normal file
View File

@ -0,0 +1,13 @@
import ParsedPageItem from './ParsedPageItem';
/**
 * Everything extracted from a single PDF page.
 */
export default class ParsedPage {
  /**
   * @param index position of the page as assigned by the parser
   * @param viewPortTransform transform array of the page's viewport
   * @param items the items found on the page
   */
  constructor(
    public index: number,
    public viewPortTransform: number[],
    public items: ParsedPageItem[],
  ) {}
}

View File

@ -0,0 +1,8 @@
/**
 * Shape of one entry of `ParsedPage.items`.
 *
 * NOTE(review): PdfParser passes the `items` of a page's text content through
 * unchanged, so the exact meaning of each field is defined by pdf.js - confirm
 * against the pdf.js text content documentation.
 */
export default interface ParsedPageItem {
/** The text of the item. */
str: string;
/** Text direction, e.g. 'ltr'. */
dir: string;
/** Width of the item - presumably in PDF units; TODO confirm. */
width: number;
/** Height of the item - presumably in PDF units; TODO confirm. */
height: number;
/** Transform array of the item; PdfParser.parseOld reads indices 4 and 5 as x and y. */
transform: number[];
/** Name of the font used by the item, e.g. 'g_d0_f1'. */
fontName: string;
}

99
core/src/PdfParser.ts Normal file
View File

@ -0,0 +1,99 @@
import Metadata from './Metadata';
import ParsedPage from './ParsedPage';
import ParseResult from './ParseResult';
import TextDirection from './TextDirection';
import TextItem from './TextItem';
/**
 * Parses PDF binary data with the help of an injected pdf.js module.
 *
 * The pdf.js module is passed in rather than imported so that callers decide
 * which pdf.js build is used.
 */
export default class PdfParser {
  /** The pdf.js module used for all parsing. */
  pdfjs: any;

  /**
   * @param pdfjs the pdf.js module; it must provide `getDocument()` and, for
   *   `parseOld()`, `Util.transform()`
   */
  constructor(pdfjs: any) {
    this.pdfjs = pdfjs;
  }

  /**
   * Parses a PDF into its metadata and one `ParsedPage` per page, each page
   * holding the text content items exactly as pdf.js returned them.
   *
   * @param data the binary content of the PDF
   * @returns the metadata and pages of the document
   */
  async parse(data: Uint8Array): Promise<ParseResult> {
    const pdfDocument = await this.loadDocument(data);
    const [metadata, pages] = await Promise.all([
      pdfDocument.getMetadata(),
      this.extractPagesSequentially(pdfDocument),
    ]);
    return new ParseResult(new Metadata(metadata), pages);
  }

  /**
   * Opens the document with the cmap settings shared by all parse variants.
   *
   * @param data the binary content of the PDF
   * @returns the pdf.js document
   */
  private loadDocument(data: Uint8Array): Promise<any> {
    return this.pdfjs.getDocument({
      data,
      cMapUrl: 'cmaps/',
      cMapPacked: true,
    }).promise;
  }

  /**
   * Reads the pages one after another, so a page is only requested once the
   * previous one has been processed completely.
   *
   * @param pdfDocument the pdf.js document
   * @returns the parsed pages in document order, with zero-based `index`
   */
  private async extractPagesSequentially(pdfDocument: any): Promise<ParsedPage[]> {
    const pages: ParsedPage[] = [];
    for (let index = 0; index < pdfDocument.numPages; index++) {
      // pdf.js page numbers start at 1
      const page = await pdfDocument.getPage(index + 1);
      const viewport = page.getViewport({ scale: 1.0 });
      await this.triggerFontRetrieval(page);
      const textContent = await page.getTextContent();
      pages.push(new ParsedPage(index, viewport.transform, textContent.items));
    }
    return pages;
  }

  /**
   * Requests the operator list of the page and discards it.
   *
   * NOTE(review): going by the method name this is done to make pdf.js load
   * the page's fonts before the text is read - confirm against pdf.js.
   *
   * @param page the pdf.js page
   */
  private async triggerFontRetrieval(page: any): Promise<void> {
    await page.getOperatorList();
  }

  /**
   * Variant of `parse()` that converts every text content item into a
   * `TextItem` and collects the items of all pages in one flat list.
   *
   * NOTE(review): the flat item list is stored where `ParseResult` declares
   * `ParsedPage[]`; this looks like a legacy path kept next to `parse()` -
   * confirm whether it is still needed.
   *
   * @param data the binary content of the PDF
   * @returns the metadata plus the flat item list in the `pages` slot
   */
  async parseOld(data: Uint8Array): Promise<ParseResult> {
    const pdfDocument = await this.loadDocument(data);
    const collectItems = async (): Promise<any[]> => {
      // Typed loosely because the list ends up in ParseResult's `pages` slot.
      const collected: any[] = [];
      for (let index = 0; index < pdfDocument.numPages; index++) {
        const page = await pdfDocument.getPage(index + 1);
        const viewport = page.getViewport({ scale: 1.0 });
        await this.triggerFontRetrieval(page);
        const textContent = await page.getTextContent();
        const textItems: TextItem[] = textContent.items.map((item: any) =>
          this.toTextItem(item, viewport.transform),
        );
        for (const textItem of textItems) {
          collected.push(textItem);
        }
      }
      return collected;
    };
    const [metadata, items] = await Promise.all([pdfDocument.getMetadata(), collectItems()]);
    return new ParseResult(new Metadata(metadata), items);
  }

  /**
   * Converts one pdf.js text content item into a `TextItem` with rounded
   * coordinates and dimensions.
   *
   * @param item the pdf.js text content item
   * @param viewportTransform transform array of the page's viewport
   * @returns the converted item
   */
  private toTextItem(item: any, viewportTransform: number[]): TextItem {
    const tx = this.pdfjs.Util.transform(viewportTransform, item.transform);
    const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
    const dividedHeight = item.height / fontHeight;
    // Keep the raw height when the quotient is unusable (NaN) or not above 1.
    const height = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight;
    return {
      x: Math.round(item.transform[4]),
      y: Math.round(item.transform[5]),
      width: Math.round(item.width),
      height: Math.round(height),
      text: item.str,
      textDirection: TextDirection.fromPdfJs(item.dir),
      fontId: item.fontName,
    };
  }
}

View File

@ -1 +1,6 @@
/** Builds the greeting "Hello <name>" for the given name. */
export const Greeter = (name: string): string => {
  return 'Hello '.concat(name);
};
import ParseResult from './ParseResult';
import PdfParser from './PdfParser';
/**
 * Creates a parser that works on top of the given pdf.js module.
 *
 * @param pdfJs the pdf.js module handed to the `PdfParser` constructor
 * @returns a new `PdfParser` instance
 */
export function pdfParser(pdfJs: any): PdfParser {
  const parser = new PdfParser(pdfJs);
  return parser;
}

121
core/test/PdfParser.test.ts Normal file
View File

@ -0,0 +1,121 @@
import PdfParser from 'src/PdfParser';
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
import * as fs from 'fs';
// Parser under test, wired to the pdf.js build imported above.
const parser = new PdfParser(pdfjs);

/** Builds one expected left-to-right page item. */
const ltrItem = (str: string, width: number, height: number, transform: number[], fontName: string) => ({
  str,
  dir: 'ltr',
  width,
  height,
  transform,
  fontName,
});

// The items the test expects on the first page of the example PDF, in order.
const expectedFirstPageItems = [
  ltrItem('Mega Überschrift', 245.05800000000005, 30, [30, 0, 0, 30, 175, 756], 'g_d0_f1'),
  ltrItem('2te Überschrift', 130.056, 20, [20, 0, 0, 20, 233, 665], 'g_d0_f2'),
  ltrItem('Dies ist eine Test-PDF', 108.61950000000003, 11, [11, 0, 0, 11, 240, 585], 'g_d0_f2'),
  ltrItem('.', 3.0580000000000003, 11, [11, 0, 0, 11, 352.6927, 585], 'g_d0_f2'),
  ltrItem('1', 4.077333704, 7.333334, [7.333334, 0, 0, 7.333334, 348, 588], 'g_d0_f2'),
  ltrItem('Fürs Testen des ', 83.7826, 11, [11, 0, 0, 11, 208, 572], 'g_d0_f2'),
  ltrItem('Markdown Parsers', 91.6982, 11, [11, 0, 0, 11, 291.77832, 572], 'g_d0_f2'),
  ltrItem('.', 3.0580000000000003, 11, [11, 0, 0, 11, 383.47360000000003, 572], 'g_d0_f2'),
  ltrItem(' ', 3.0580000000000003, 11, [11, 0, 0, 11, 61.078451, 59], 'g_d0_f2'),
  ltrItem('In Deutsch.', 55.64240000000001, 11, [11, 0, 0, 11, 64.134603, 59], 'g_d0_f2'),
  ltrItem('1', 4.077333704, 7.333334, [7.333334, 0, 0, 7.333334, 57, 62], 'g_d0_f2'),
  ltrItem('\x00', 0, 12, [12, 0, 0, 12, 294, 45], 'g_d0_f3'),
  ltrItem('1', 6.672000000000001, 12, [12, 0, 0, 12, 294, 45], 'g_d0_f2'),
];

// Parses the example PDF and checks its metadata, page count and first page.
test('testIt', async () => {
  const pdfBytes = fs.readFileSync('../examples/ExamplePdf.pdf', null);
  const { metadata, pages } = await parser.parse(pdfBytes);

  expect(metadata.title()).toEqual('ExamplePdf');
  expect(metadata.author()).toEqual('Johannes Zillmann');

  expect(pages.length).toBe(7);

  const firstPage = pages[0];
  expect(firstPage.index).toBe(0);
  expect(firstPage.viewPortTransform).toEqual([1, 0, 0, -1, 0, 841.8898]);
  expect(firstPage.items).toEqual(expectedFirstPageItems);
});

View File

@ -1,5 +0,0 @@
import { Greeter } from 'src/index';
// Checks the greeting produced for a sample name.
test('My Greeter', () => {
  const greeting = Greeter('Carl');
  expect(greeting).toBe('Hello Carl');
});

View File

@ -5,6 +5,7 @@
"declaration": true,
"outDir": "./lib",
"strict": true,
"noImplicitAny": false,
"baseUrl": "./",
"paths": {
"src/*": ["src/*"],

View File

@ -1,3 +1,7 @@
{
"extends": ["tslint:recommended", "tslint-config-prettier"]
"extends": ["tslint:recommended", "tslint-config-prettier"],
"rules": {
"no-string-literal" : false,
"no-namespace" : false
}
}