diff --git a/core/README.md b/core/README.md new file mode 100644 index 0000000..a5806df --- /dev/null +++ b/core/README.md @@ -0,0 +1,27 @@ +# PDF-To-Markdown Converter Core + +JavaScript library to parse PDF files and convert them into Markdown format. A UI version is available online at http://pdf2md.morethan.io! + +## Use + +//TBD + +## Contribute + +Use the [issue tracker](https://github.com/jzillmann/pdf-to-markdown/issues) and/or open [pull requests](https://github.com/jzillmann/pdf-to-markdown/pulls)! + +## Build + +- `npm install` Download all necessary npm packages +- `npm test` Run the tests +- `npm run lint` Lint the TypeScript files +- `npm run format` Run the prettier formatter +- `npm run build` Compile the TypeScript files to the `lib` folder + +## Release + +//TBD + +## Credits + +[pdf.js](https://mozilla.github.io/pdf.js/) - Mozilla's PDF parsing & rendering platform which is used as a raw parser diff --git a/core/jest.config.js b/core/jest.config.js index 1d71251..0b73a74 100644 --- a/core/jest.config.js +++ b/core/jest.config.js @@ -1,7 +1,7 @@ module.exports = { preset: 'ts-jest', testEnvironment: 'node', - roots: ['./'], + roots: ['./test'], transform: { '\\.ts$': ['ts-jest'] }, testRegex: '(/test/.*|(\\.|/)(test|spec))\\.(ts)$', moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'], diff --git a/core/package-lock.json b/core/package-lock.json index 30ba1d2..ccb8a45 100644 --- a/core/package-lock.json +++ b/core/package-lock.json @@ -4488,6 +4488,12 @@ "integrity": "sha512-GSmOT2EbHrINBf9SR7CDELwlJ8AENk3Qn7OikK4nFYAu3Ote2+JYNVvkpAEQm3/TLNEJFD/xZJjzyxg3KBWOzw==", "dev": true }, + "pdfjs-dist": { + "version": "2.5.207", + "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-2.5.207.tgz", + "integrity": "sha512-xGDUhnCYPfHy+unMXCLCJtlpZaaZ17Ew3WIL0tnSgKFUZXHAPD49GO9xScyszSsQMoutNDgRb+rfBXIaX/lJbw==", + "dev": true + }, "performance-now": { "version": "2.1.0", "resolved":
/**
 * Read-only accessor over the raw metadata object a PDF was parsed with.
 *
 * The wrapped object is expected to carry an optional `metadata` entry (looked up by
 * XMP-style keys such as `dc:title`) and an optional `info` entry (looked up by
 * document-info names such as `Title`).
 */
export default class Metadata {
  /** The raw, unmodified object this accessor reads from. */
  original: object;

  /**
   * @param original raw metadata object; kept as-is and only read lazily by the accessors
   */
  constructor(original: object) {
    this.original = original;
  }

  /**
   * @returns the document title (`dc:title`, falling back to the info entry `Title`),
   *   or `undefined` when neither is present
   */
  title() {
    return this.extract('Title', 'dc:title');
  }

  /**
   * @returns the document author (`dc:creator`, falling back to the info entry `Author`),
   *   or `undefined` when neither is present
   */
  author() {
    return this.extract('Author', 'dc:creator');
  }

  /**
   * Looks a value up in the `metadata` entry first and in the `info` entry second.
   *
   * @param infoName key to read from the `info` entry
   * @param metadataKey key to read from the `metadata` entry
   * @returns the first value found, or `undefined`
   */
  private extract(infoName: string, metadataKey: string) {
    const source = (this.original || {}) as { metadata?: any; info?: any };
    const metadata = source.metadata;
    if (metadata) {
      // The metadata entry may be a Map-like object exposing `get(name)` (as pdf.js'
      // metadata object does) or a plain record; support both access styles.
      const value = typeof metadata.get === 'function' ? metadata.get(metadataKey) : metadata[metadataKey];
      // Only stop here when the key is actually present; otherwise fall through to `info`.
      if (value !== undefined && value !== null) {
        return value;
      }
    }
    const info = source.info;
    // Guard against a missing info entry instead of throwing a TypeError.
    return info ? info[infoName] : undefined;
  }
}
/**
 * Parses PDF bytes with an injected pdf.js module and exposes the document metadata
 * together with the text content of every page.
 */
export default class PdfParser {
  /** The pdf.js module used for parsing (untyped; supplied by the caller). */
  pdfjs: any;

  /**
   * @param pdfjs the pdf.js module to parse with; injected so the caller decides which build is used
   */
  constructor(pdfjs: any) {
    this.pdfjs = pdfjs;
  }

  /**
   * Parses the given PDF bytes.
   *
   * Metadata retrieval and page extraction are started together; the pages themselves
   * are read one after another.
   *
   * @param data raw bytes of the PDF file
   * @returns the wrapped metadata plus one `ParsedPage` per page, in page order
   */
  async parse(data: Uint8Array): Promise<ParseResult> {
    const pdfDocument = await this.loadDocument(data);
    const [metadata, pages] = await Promise.all([
      pdfDocument.getMetadata(),
      this.extractPagesSequentially(pdfDocument),
    ]);
    return new ParseResult(new Metadata(metadata), pages);
  }

  /** Opens the document with the cMap settings shared by `parse` and `parseOld`. */
  private loadDocument(data: Uint8Array): Promise<any> {
    return this.pdfjs.getDocument({
      data,
      cMapUrl: 'cmaps/',
      cMapPacked: true,
    }).promise;
  }

  /**
   * Reads every page strictly one after another and collects the untouched text items.
   *
   * @param pdfDocument the loaded pdf.js document
   * @returns one `ParsedPage` per page; `index` is zero-based while pdf.js pages are one-based
   */
  private async extractPagesSequentially(pdfDocument: any): Promise<ParsedPage[]> {
    const pages: ParsedPage[] = [];
    for (let index = 0; index < pdfDocument.numPages; index++) {
      const page = await pdfDocument.getPage(index + 1);
      const viewport = page.getViewport({ scale: 1.0 });
      await this.triggerFontRetrieval(page);
      const textContent = await page.getTextContent();
      pages.push(new ParsedPage(index, viewport.transform, textContent.items));
    }
    return pages;
  }

  /**
   * Requests the page's operator list before its text content is read.
   * NOTE(review): presumably this makes pdf.js load the page's fonts — confirm against pdf.js.
   */
  private triggerFontRetrieval(page: any): Promise<unknown> {
    return page.getOperatorList();
  }

  /**
   * Older parsing path that flattens the text items of all pages into one list of
   * rounded `TextItem`s instead of producing `ParsedPage`s.
   *
   * NOTE(review): the flat list is handed to `ParseResult` in place of pages, exactly as
   * before; this looks like a legacy path kept next to `parse` — confirm whether it is still needed.
   *
   * @param data raw bytes of the PDF file
   * @returns a `ParseResult` whose `pages` actually holds the flat text-item list
   */
  async parseOld(data: Uint8Array): Promise<ParseResult> {
    const pdfDocument = await this.loadDocument(data);
    const [metadata, textItems] = await Promise.all([
      pdfDocument.getMetadata(),
      this.extractTextItemsSequentially(pdfDocument),
    ]);
    // Shape mismatch preserved for backward compatibility (see method comment).
    return new ParseResult(new Metadata(metadata), (textItems as unknown) as ParsedPage[]);
  }

  /** Reads every page one after another and concatenates their converted text items. */
  private async extractTextItemsSequentially(pdfDocument: any): Promise<TextItem[]> {
    const collected: TextItem[] = [];
    for (let index = 0; index < pdfDocument.numPages; index++) {
      const page = await pdfDocument.getPage(index + 1);
      const viewport = page.getViewport({ scale: 1.0 });
      await this.triggerFontRetrieval(page);
      const textContent = await page.getTextContent();
      for (const item of textContent.items) {
        collected.push(this.toTextItem(viewport.transform, item));
      }
    }
    return collected;
  }

  /** Converts one raw pdf.js text item into a `TextItem` with rounded geometry. */
  private toTextItem(viewportTransform: number[], item: any): TextItem {
    const tx = this.pdfjs.Util.transform(viewportTransform, item.transform);
    const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
    const dividedHeight = item.height / fontHeight;
    // A zero font height yields NaN or Infinity; fall back to the raw height in both cases.
    const height = !Number.isFinite(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight;

    return {
      x: Math.round(item.transform[4]),
      y: Math.round(item.transform[5]),
      width: Math.round(item.width),
      height: Math.round(height),
      text: item.str,
      textDirection: TextDirection.fromPdfJs(item.dir),
      fontId: item.fontName,
    };
  }
}
+ { + str: 'Dies ist eine Test-PDF', + dir: 'ltr', + width: 108.61950000000003, + height: 11, + transform: [11, 0, 0, 11, 240, 585], + fontName: 'g_d0_f2', + }, + { + str: '.', + dir: 'ltr', + width: 3.0580000000000003, + height: 11, + transform: [11, 0, 0, 11, 352.6927, 585], + fontName: 'g_d0_f2', + }, + { + str: '1', + dir: 'ltr', + width: 4.077333704, + height: 7.333334, + transform: [7.333334, 0, 0, 7.333334, 348, 588], + fontName: 'g_d0_f2', + }, + { + str: 'Für’s Testen des ', + dir: 'ltr', + width: 83.7826, + height: 11, + transform: [11, 0, 0, 11, 208, 572], + fontName: 'g_d0_f2', + }, + { + str: 'Markdown Parsers', + dir: 'ltr', + width: 91.6982, + height: 11, + transform: [11, 0, 0, 11, 291.77832, 572], + fontName: 'g_d0_f2', + }, + { + str: '.', + dir: 'ltr', + width: 3.0580000000000003, + height: 11, + transform: [11, 0, 0, 11, 383.47360000000003, 572], + fontName: 'g_d0_f2', + }, + { + str: ' ', + dir: 'ltr', + width: 3.0580000000000003, + height: 11, + transform: [11, 0, 0, 11, 61.078451, 59], + fontName: 'g_d0_f2', + }, + { + str: 'In Deutsch.', + dir: 'ltr', + width: 55.64240000000001, + height: 11, + transform: [11, 0, 0, 11, 64.134603, 59], + fontName: 'g_d0_f2', + }, + { + str: '1', + dir: 'ltr', + width: 4.077333704, + height: 7.333334, + transform: [7.333334, 0, 0, 7.333334, 57, 62], + fontName: 'g_d0_f2', + }, + { + str: '\x00', + dir: 'ltr', + width: 0, + height: 12, + transform: [12, 0, 0, 12, 294, 45], + fontName: 'g_d0_f3', + }, + { + str: '1', + dir: 'ltr', + width: 6.672000000000001, + height: 12, + transform: [12, 0, 0, 12, 294, 45], + fontName: 'g_d0_f2', + }, + ]); +}); diff --git a/core/test/index.test.ts b/core/test/index.test.ts deleted file mode 100644 index 17298a4..0000000 --- a/core/test/index.test.ts +++ /dev/null @@ -1,5 +0,0 @@ -import { Greeter } from 'src/index'; - -test('My Greeter', () => { - expect(Greeter('Carl')).toBe('Hello Carl'); -}); diff --git a/core/tsconfig.json b/core/tsconfig.json index 701cfc2..570b0d1 
100644 --- a/core/tsconfig.json +++ b/core/tsconfig.json @@ -5,6 +5,7 @@ "declaration": true, "outDir": "./lib", "strict": true, + "noImplicitAny": false, "baseUrl": "./", "paths": { "src/*": ["src/*"], diff --git a/core/tslint.json b/core/tslint.json index 85e60a4..2a217ec 100644 --- a/core/tslint.json +++ b/core/tslint.json @@ -1,3 +1,7 @@ { - "extends": ["tslint:recommended", "tslint-config-prettier"] + "extends": ["tslint:recommended", "tslint-config-prettier"], + "rules": { + "no-string-literal" : false, + "no-namespace" : false + } }