mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-12-29 09:49:00 +01:00
Initial pdfJs integration
This commit is contained in:
parent
f988bd565e
commit
a3695a4a56
27
core/README.md
Normal file
27
core/README.md
Normal file
@ -0,0 +1,27 @@
|
||||
# PDF-To-Markdown Converter Core
|
||||
|
||||
JavaScript library to parse PDF files and convert them into Markdown format. An online UI version is available at http://pdf2md.morethan.io!
|
||||
|
||||
## Use
|
||||
|
||||
//TBD
|
||||
|
||||
## Contribute
|
||||
|
||||
Use the [issue tracker](https://github.com/jzillmann/pdf-to-markdown/issues) and/or open [pull requests](https://github.com/jzillmann/pdf-to-markdown/pulls)!
|
||||
|
||||
## Build
|
||||
|
||||
- `npm install` Download all necessary npm packages
|
||||
- `npm test` Run the tests
|
||||
- `npm run lint` Lint the TypeScript files
|
||||
- `npm run format` Run the prettier formatter
|
||||
- `npm run build` Compile the TypeScript files to the `lib` folder
|
||||
|
||||
## Release
|
||||
|
||||
//TBD
|
||||
|
||||
## Credits
|
||||
|
||||
[pdf.js](https://mozilla.github.io/pdf.js/) - Mozilla's PDF parsing & rendering platform which is used as a raw parser
|
@ -1,7 +1,7 @@
|
||||
module.exports = {
|
||||
preset: 'ts-jest',
|
||||
testEnvironment: 'node',
|
||||
roots: ['./'],
|
||||
roots: ['./test'],
|
||||
transform: { '\\.ts$': ['ts-jest'] },
|
||||
testRegex: '(/test/.*|(\\.|/)(test|spec))\\.(ts)$',
|
||||
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
|
||||
|
6
core/package-lock.json
generated
6
core/package-lock.json
generated
@ -4488,6 +4488,12 @@
|
||||
"integrity": "sha512-GSmOT2EbHrINBf9SR7CDELwlJ8AENk3Qn7OikK4nFYAu3Ote2+JYNVvkpAEQm3/TLNEJFD/xZJjzyxg3KBWOzw==",
|
||||
"dev": true
|
||||
},
|
||||
"pdfjs-dist": {
|
||||
"version": "2.5.207",
|
||||
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-2.5.207.tgz",
|
||||
"integrity": "sha512-xGDUhnCYPfHy+unMXCLCJtlpZaaZ17Ew3WIL0tnSgKFUZXHAPD49GO9xScyszSsQMoutNDgRb+rfBXIaX/lJbw==",
|
||||
"dev": true
|
||||
},
|
||||
"performance-now": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz",
|
||||
|
@ -1,17 +1,7 @@
|
||||
{
|
||||
"name": "pdf-to-markdown-core",
|
||||
"version": "1.0.0",
|
||||
"version": "0.5.0",
|
||||
"description": "Core logic for parsing a PDF and transforming it to Markdown",
|
||||
"main": "index.js",
|
||||
"files": [
|
||||
"lib/src/**/*"
|
||||
],
|
||||
"scripts": {
|
||||
"test": "jest",
|
||||
"build": "tsc",
|
||||
"format": "prettier --write \"src/**/*.ts\" \"src/**/*.js\"",
|
||||
"lint": "tslint -p tsconfig.json"
|
||||
},
|
||||
"keywords": [
|
||||
"PDF",
|
||||
"Markdown",
|
||||
@ -19,9 +9,24 @@
|
||||
],
|
||||
"author": "Johannes Zillmann",
|
||||
"license": "AGPL-3.0",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/jzillmann/pdf-to-markdown"
|
||||
},
|
||||
"main": "index.js",
|
||||
"files": [
|
||||
"lib/src/**/*"
|
||||
],
|
||||
"scripts": {
|
||||
"test": "jest",
|
||||
"build": "tsc",
|
||||
"format": "prettier --write \"src/**/*.ts\"",
|
||||
"lint": "tslint -p tsconfig.json"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/jest": "^26.0.19",
|
||||
"jest": "^26.6.3",
|
||||
"pdfjs-dist": "^2.5.207",
|
||||
"prettier": "^2.2.1",
|
||||
"ts-jest": "^26.4.4",
|
||||
"tslint": "^6.1.3",
|
||||
|
23
core/src/Metadata.ts
Normal file
23
core/src/Metadata.ts
Normal file
@ -0,0 +1,23 @@
|
||||
/**
 * Thin accessor around the raw metadata object obtained for a PDF document.
 *
 * The wrapped object is read through two optional entries: `metadata`
 * (looked up with keys such as `dc:title`) and `info` (looked up with keys
 * such as `Title`). `metadata` is preferred; `info` is the fallback.
 */
export default class Metadata {
  /** The raw, unmodified object this wrapper reads from. */
  original: object;

  /**
   * @param original raw metadata object; its `metadata` and `info` entries are read lazily
   */
  constructor(original: object) {
    this.original = original;
  }

  /**
   * @returns the document title (`dc:title` from `metadata`, else `Title` from `info`),
   *          or `undefined` when neither source provides one
   */
  title() {
    return this.extract('Title', 'dc:title');
  }

  /**
   * @returns the document author (`dc:creator` from `metadata`, else `Author` from `info`),
   *          or `undefined` when neither source provides one
   */
  author() {
    return this.extract('Author', 'dc:creator');
  }

  /**
   * Looks a value up in `metadata` first and falls back to `info`.
   *
   * @param infoName key used in the `info` dictionary
   * @param metadataKey key used in the `metadata` entry
   * @returns the first value that is neither `undefined` nor `null`, else `undefined`
   */
  private extract(infoName: string, metadataKey: string) {
    const { metadata, info } = (this.original || {}) as { metadata?: any; info?: any };
    if (metadata) {
      // NOTE(review): pdf.js appears to expose parsed XMP metadata through a
      // `get(name)` accessor rather than plain properties - confirm against the
      // pdf.js version in use. Plain-object lookups keep working as before.
      const value = typeof metadata.get === 'function' ? metadata.get(metadataKey) : metadata[metadataKey];
      if (value !== undefined && value !== null) {
        return value;
      }
    }
    // Fall back to the info dictionary when `metadata` is missing or lacks the
    // key; a missing `info` yields undefined instead of throwing.
    return info ? info[infoName] : undefined;
  }
}
|
12
core/src/ParseResult.ts
Normal file
12
core/src/ParseResult.ts
Normal file
@ -0,0 +1,12 @@
|
||||
import Metadata from './Metadata';
|
||||
import ParsedPage from './ParsedPage';
|
||||
|
||||
/**
 * Outcome of parsing one PDF: the document-level metadata plus the parsed
 * pages, in the order they were handed to the constructor.
 */
export default class ParseResult {
  /**
   * @param metadata wrapper around the document's metadata
   * @param pages the parsed pages of the document
   */
  constructor(public metadata: Metadata, public pages: ParsedPage[]) {}
}
|
13
core/src/ParsedPage.ts
Normal file
13
core/src/ParsedPage.ts
Normal file
@ -0,0 +1,13 @@
|
||||
import ParsedPageItem from './ParsedPageItem';
|
||||
|
||||
/**
 * A single parsed PDF page: its index, the transform of the viewport it was
 * read with, and the items found on it.
 */
export default class ParsedPage {
  /**
   * @param index page index as assigned by the parser
   * @param viewPortTransform transform array taken from the page's viewport
   * @param items the items extracted from the page
   */
  constructor(public index: number, public viewPortTransform: number[], public items: ParsedPageItem[]) {}
}
|
8
core/src/ParsedPageItem.ts
Normal file
8
core/src/ParsedPageItem.ts
Normal file
@ -0,0 +1,8 @@
|
||||
/**
 * Shape of one text item of a parsed page.
 *
 * NOTE(review): field meanings below are inferred from the field names and
 * how the values are used elsewhere in this change - confirm against the
 * pdf.js text-content documentation.
 */
export default interface ParsedPageItem {
  /** The text of the item. */
  str: string;

  /** Text direction marker, e.g. `'ltr'`. */
  dir: string;

  /** Width of the item. */
  width: number;

  /** Height of the item. */
  height: number;

  /** Transform array of the item; presumably indices 4 and 5 hold the x/y position. */
  transform: number[];

  /** Identifier of the font the item is rendered with. */
  fontName: string;
}
|
99
core/src/PdfParser.ts
Normal file
99
core/src/PdfParser.ts
Normal file
@ -0,0 +1,99 @@
|
||||
import Metadata from './Metadata';
|
||||
import ParsedPage from './ParsedPage';
|
||||
import ParseResult from './ParseResult';
|
||||
import TextDirection from './TextDirection';
|
||||
import TextItem from './TextItem';
|
||||
|
||||
/**
 * Parses PDF data with a caller-supplied pdf.js module.
 *
 * The pdf.js module is injected so that callers decide which build of pdf.js
 * is used.
 */
export default class PdfParser {
  /** The injected pdf.js module (untyped). */
  pdfjs: any;

  /**
   * @param pdfjs the pdf.js module; `getDocument` is used by all parse methods,
   *              `Util.transform` only by the legacy `parseOld`
   */
  constructor(pdfjs: any) {
    this.pdfjs = pdfjs;
  }

  /**
   * Parses the given PDF bytes into metadata plus one `ParsedPage` per page.
   *
   * @param data raw bytes of the PDF document
   * @returns the document metadata and the parsed pages in page order
   */
  async parse(data: Uint8Array): Promise<ParseResult> {
    const pdfDocument = await this.loadDocument(data);
    const [metadata, pages] = await Promise.all([
      pdfDocument.getMetadata(),
      this.extractPagesSequentially(pdfDocument),
    ]);
    return new ParseResult(new Metadata(metadata), pages);
  }

  /** Opens the document with pdf.js using the packed character maps under `cmaps/`. */
  private loadDocument(data: Uint8Array): Promise<any> {
    return this.pdfjs.getDocument({
      data,
      cMapUrl: 'cmaps/',
      cMapPacked: true,
    }).promise;
  }

  /**
   * Reads all pages one after another (never concurrently), preserving page order.
   *
   * @param pdfDocument the loaded pdf.js document
   * @returns one `ParsedPage` per page; the page index is zero-based
   */
  private async extractPagesSequentially(pdfDocument: any): Promise<ParsedPage[]> {
    const pages: ParsedPage[] = [];
    for (let index = 0; index < pdfDocument.numPages; index++) {
      // The page number passed to getPage is one-based.
      const page = await pdfDocument.getPage(index + 1);
      const viewport = page.getViewport({ scale: 1.0 });
      await this.triggerFontRetrieval(page);
      const textContent = await page.getTextContent();
      pages.push(new ParsedPage(index, viewport.transform, textContent.items));
    }
    return pages;
  }

  /**
   * Requests the page's operator list before its text content is read.
   * NOTE(review): judging by the method name this is done to make pdf.js load
   * the page's fonts - confirm.
   */
  private triggerFontRetrieval(page: any): Promise<void> {
    return page.getOperatorList();
  }

  /**
   * Legacy parse variant that flattens the text items of all pages into a
   * single list of `TextItem`s.
   *
   * @deprecated prefer `parse`. The returned `ParseResult` carries the flat
   *             `TextItem` list in its `pages` slot, not `ParsedPage`s.
   * @param data raw bytes of the PDF document
   */
  async parseOld(data: Uint8Array): Promise<ParseResult> {
    const pdfDocument = await this.loadDocument(data);
    const [metadata, textItems] = await Promise.all([
      pdfDocument.getMetadata(),
      this.extractTextItemsSequentially(pdfDocument),
    ]);
    // The cast keeps the historical runtime shape (flat text items) although
    // ParseResult declares ParsedPage[].
    return new ParseResult(new Metadata(metadata), textItems as any);
  }

  /** Reads the pages one after another and concatenates their converted text items. */
  private async extractTextItemsSequentially(pdfDocument: any): Promise<TextItem[]> {
    const textItems: TextItem[] = [];
    for (let index = 0; index < pdfDocument.numPages; index++) {
      const page = await pdfDocument.getPage(index + 1);
      const viewport = page.getViewport({ scale: 1.0 });
      await this.triggerFontRetrieval(page);
      const textContent = await page.getTextContent();
      const pageItems: TextItem[] = textContent.items.map((item) => this.toTextItem(item, viewport));
      pageItems.forEach((textItem) => textItems.push(textItem));
    }
    return textItems;
  }

  /** Converts one pdf.js text item into a `TextItem` with rounded geometry. */
  private toTextItem(item: any, viewport: any): TextItem {
    const tx = this.pdfjs.Util.transform(viewport.transform, item.transform);
    const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
    const dividedHeight = item.height / fontHeight;

    return {
      x: Math.round(item.transform[4]),
      y: Math.round(item.transform[5]),
      width: Math.round(item.width),
      // Use the raw item height when the ratio is NaN or not above 1.
      height: Math.round(Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight),
      text: item.str,
      textDirection: TextDirection.fromPdfJs(item.dir),
      fontId: item.fontName,
    };
  }
}
|
@ -1 +1,6 @@
|
||||
export const Greeter = (name: string) => `Hello ${name}`;
|
||||
import ParseResult from './ParseResult';
|
||||
import PdfParser from './PdfParser';
|
||||
|
||||
/**
 * Factory for a `PdfParser` bound to the given pdf.js module.
 *
 * @param pdfJs the pdf.js module the parser should use
 * @returns a new parser instance
 */
export function pdfParser(pdfJs: any): PdfParser {
  const parser = new PdfParser(pdfJs);
  return parser;
}
|
||||
|
121
core/test/PdfParser.test.ts
Normal file
121
core/test/PdfParser.test.ts
Normal file
@ -0,0 +1,121 @@
|
||||
import PdfParser from 'src/PdfParser';
|
||||
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
|
||||
import * as fs from 'fs';
|
||||
|
||||
// Parser under test, backed by the pdf.js build imported above.
const parser = new PdfParser(pdfjs);

// Builds one expected page item; every item in the fixture is left-to-right.
const ltrItem = (str: string, width: number, height: number, transform: number[], fontName: string) => ({
  str,
  dir: 'ltr',
  width,
  height,
  transform,
  fontName,
});

// Items expected on the first page of ExamplePdf.pdf, in extraction order.
const expectedFirstPageItems = [
  ltrItem('Mega Überschrift', 245.05800000000005, 30, [30, 0, 0, 30, 175, 756], 'g_d0_f1'),
  ltrItem('2te Überschrift', 130.056, 20, [20, 0, 0, 20, 233, 665], 'g_d0_f2'),
  ltrItem('Dies ist eine Test-PDF', 108.61950000000003, 11, [11, 0, 0, 11, 240, 585], 'g_d0_f2'),
  ltrItem('.', 3.0580000000000003, 11, [11, 0, 0, 11, 352.6927, 585], 'g_d0_f2'),
  ltrItem('1', 4.077333704, 7.333334, [7.333334, 0, 0, 7.333334, 348, 588], 'g_d0_f2'),
  ltrItem('Für’s Testen des ', 83.7826, 11, [11, 0, 0, 11, 208, 572], 'g_d0_f2'),
  ltrItem('Markdown Parsers', 91.6982, 11, [11, 0, 0, 11, 291.77832, 572], 'g_d0_f2'),
  ltrItem('.', 3.0580000000000003, 11, [11, 0, 0, 11, 383.47360000000003, 572], 'g_d0_f2'),
  ltrItem(' ', 3.0580000000000003, 11, [11, 0, 0, 11, 61.078451, 59], 'g_d0_f2'),
  ltrItem('In Deutsch.', 55.64240000000001, 11, [11, 0, 0, 11, 64.134603, 59], 'g_d0_f2'),
  ltrItem('1', 4.077333704, 7.333334, [7.333334, 0, 0, 7.333334, 57, 62], 'g_d0_f2'),
  ltrItem('\x00', 0, 12, [12, 0, 0, 12, 294, 45], 'g_d0_f3'),
  ltrItem('1', 6.672000000000001, 12, [12, 0, 0, 12, 294, 45], 'g_d0_f2'),
];

// Parses the example PDF and checks metadata, page count and the first page's content.
test('testIt', async () => {
  const pdfBytes = fs.readFileSync('../examples/ExamplePdf.pdf', null);
  const parsed = await parser.parse(pdfBytes);
  const firstPage = parsed.pages[0];

  expect(parsed.metadata.title()).toEqual('ExamplePdf');
  expect(parsed.metadata.author()).toEqual('Johannes Zillmann');
  expect(parsed.pages.length).toBe(7);
  expect(firstPage.index).toBe(0);
  expect(firstPage.viewPortTransform).toEqual([1, 0, 0, -1, 0, 841.8898]);
  expect(firstPage.items).toEqual(expectedFirstPageItems);
});
|
@ -1,5 +0,0 @@
|
||||
import { Greeter } from 'src/index';
|
||||
|
||||
test('My Greeter', () => {
|
||||
expect(Greeter('Carl')).toBe('Hello Carl');
|
||||
});
|
@ -5,6 +5,7 @@
|
||||
"declaration": true,
|
||||
"outDir": "./lib",
|
||||
"strict": true,
|
||||
"noImplicitAny": false,
|
||||
"baseUrl": "./",
|
||||
"paths": {
|
||||
"src/*": ["src/*"],
|
||||
|
@ -1,3 +1,7 @@
|
||||
{
|
||||
"extends": ["tslint:recommended", "tslint-config-prettier"]
|
||||
"extends": ["tslint:recommended", "tslint-config-prettier"],
|
||||
"rules": {
|
||||
"no-string-literal" : false,
|
||||
"no-namespace" : false
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user