From c98145a63cc6097cc25c45cb8c8b8dba17951418 Mon Sep 17 00:00:00 2001
From: Johannes Zillmann
Date: Mon, 22 Mar 2021 09:03:26 +0100
Subject: [PATCH] Test for remote PDFs

---
 .gitignore                               |   3 +-
 core/src/Debugger.ts                     |   2 +-
 core/test/Debugger.test.ts               |  32 ++--
 core/test/Files.test.ts                  | 143 ++++++++++++++++++-----
 examples/README.md                       |  29 ++++-
 examples/dict/removeRepetitiveItems.json |  32 +++++
 ui/src/debug/DebugView.svelte            |   2 +-
 7 files changed, 194 insertions(+), 49 deletions(-)
 create mode 100644 examples/dict/removeRepetitiveItems.json

diff --git a/.gitignore b/.gitignore
index 025a419..cfbd496 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
+.DS_Store
 node_modules/
 build/
 npm-debug.log
-.eslintcache
\ No newline at end of file
+.eslintcache
diff --git a/core/src/Debugger.ts b/core/src/Debugger.ts
index 3ec00a2..a65d115 100644
--- a/core/src/Debugger.ts
+++ b/core/src/Debugger.ts
@@ -33,7 +33,7 @@ export default class Debugger {
     this.stageResultCache = [initialStage(inputSchema, inputItems)];
   }
 
-  stageResults(stageIndex: number): StageResult {
+  stageResult(stageIndex: number): StageResult {
    for (let idx = 0; idx < stageIndex + 1; idx++) {
      if (!this.stageResultCache[idx]) {
        const transformer = this.transformers[idx - 1];
diff --git a/core/test/Debugger.test.ts b/core/test/Debugger.test.ts
index c8d1673..6dc6d5c 100644
--- a/core/test/Debugger.test.ts
+++ b/core/test/Debugger.test.ts
@@ -39,14 +39,14 @@ describe('Transform Items', () => {
     const debug = new Debugger(1, parsedSchema, parsedItems, { fontMap: new Map(), pageViewports: [] }, transformers);
 
     expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
 
-    expect(debug.stageResults(0).schema).toEqual(parsedSchema.map((column) => ({ name: column })));
-    expect(debug.stageResults(1).schema).toEqual([
+    expect(debug.stageResult(0).schema).toEqual(parsedSchema.map((column) => ({ name: column })));
+    expect(debug.stageResult(1).schema).toEqual([
       ...parsedSchema.map((column) => ({ name: column, annotation: ColumnAnnotation.REMOVED })),
       { name: 'C', annotation: ColumnAnnotation.ADDED },
     ]);
-    expect(debug.stageResults(0).itemsUnpacked()).toEqual(parsedItems);
-    expect(debug.stageResults(1).itemsUnpacked()).toEqual(trans1Items);
+    expect(debug.stageResult(0).itemsUnpacked()).toEqual(parsedItems);
+    expect(debug.stageResult(1).itemsUnpacked()).toEqual(trans1Items);
   });
 
   test('Line Merge', async () => {
@@ -65,17 +65,17 @@
     const debug = new Debugger(1, parsedSchema, parsedItems, { fontMap: new Map(), pageViewports: [] }, transformers);
 
     expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
 
-    expect(debug.stageResults(0).schema).toEqual([{ name: 'id' }, { name: 'y' }]);
-    expect(debug.stageResults(1).schema).toEqual([
+    expect(debug.stageResult(0).schema).toEqual([{ name: 'id' }, { name: 'y' }]);
+    expect(debug.stageResult(1).schema).toEqual([
       { name: 'id' },
       { name: 'y', annotation: ColumnAnnotation.REMOVED },
       { name: 'line', annotation: ColumnAnnotation.ADDED },
     ]);
-    expect(debug.stageResults(0).itemsUnpacked()).toEqual(parsedItems);
-    expect(debug.stageResults(1).itemsUnpacked()).toEqual(trans1Items);
+    expect(debug.stageResult(0).itemsUnpacked()).toEqual(parsedItems);
+    expect(debug.stageResult(1).itemsUnpacked()).toEqual(trans1Items);
 
-    const lineMergingStage = debug.stageResults(1);
+    const lineMergingStage = debug.stageResult(1);
     const { changes, pages } = lineMergingStage;
 
     //verify item groups
@@ -103,12 +103,12 @@ test('Change inside of Line', async () => {
   const debug = new Debugger(1, parsedSchema, parsedItems, { fontMap: new Map(), pageViewports: [] }, transformers);
 
   expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
-  expect(debug.stageResults(0).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
-  expect(debug.stageResults(1).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
-  expect(debug.stageResults(0).itemsUnpacked()).toEqual(parsedItems);
-  expect(debug.stageResults(1).itemsUnpacked()).toEqual(trans1Items);
+  expect(debug.stageResult(0).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
+  expect(debug.stageResult(1).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
+  expect(debug.stageResult(0).itemsUnpacked()).toEqual(parsedItems);
+  expect(debug.stageResult(1).itemsUnpacked()).toEqual(trans1Items);
 
-  const { changes, pages } = debug.stageResults(1);
+  const { changes, pages } = debug.stageResult(1);
 
   //verify item groups
   expect(pages[0].itemGroups.map((itemGroup) => changes.hasChanged(itemGroup.top))).toEqual([true, false]);
@@ -116,7 +116,7 @@ test('Change inside of Line', async () => {
   //verify unpacked items
   expect(
     debug
-      .stageResults(1)
+      .stageResult(1)
       .itemsUnpacked()
       .map((item) => changes.hasChanged(item)),
   ).toEqual([true, true, false, false]);
@@ -135,7 +135,7 @@ describe('build schemas', () => {
   function calculateSchema(inputSchema: string[], outputSchema: string[]): AnnotatedColumn[] {
     const transformers = [new TestTransformer('Trans1', {}, outputSchema, items)];
     const debug = new Debugger(1, inputSchema, items, { fontMap: new Map(), pageViewports: [] }, transformers);
-    return debug.stageResults(1).schema;
+    return debug.stageResult(1).schema;
   }
 
   test('Add', async () => {
diff --git a/core/test/Files.test.ts b/core/test/Files.test.ts
index 723c642..607f3a7 100644
--- a/core/test/Files.test.ts
+++ b/core/test/Files.test.ts
@@ -2,21 +2,34 @@ import { toMatchFile } from 'jest-file-snapshot';
 import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
 import * as fs from 'fs';
+import * as path from 'path';
+import * as http from 'http';
+import * as https from 'https';
 
 import PdfParser from 'src/PdfParser';
 import PdfPipeline from 'src/PdfPipeline';
 import { transformers } from 'src/index';
 import Debugger from 'src/Debugger';
 import Item from 'src/Item';
+import RemoveRepetitiveItems from 'src/transformer/RemoveRepetitiveItems';
+import StageResult from 'src/debug/StageResult';
 
 const parser = new PdfParser(pdfjs);
 const pipeline = new PdfPipeline(parser, transformers);
 
 const folder = '../examples';
 const files = fs.readdirSync(folder).filter((file) => file.endsWith('.pdf'));
+const urls = ['https://homepages.cwi.nl/~lex/files/dict.pdf'];
+const downloadCache = 'node_modules/.cache/files';
 
 expect.extend({ toMatchFile });
 
+// Test is for debugging purposes
+test.skip('Debug', async () => {
+  const data = fs.readFileSync(`${folder}/Adventures-Of-Sherlock-Holmes.pdf`);
+  await pipeline.execute(data, () => {});
+});
+
 describe.each(files)('Test %p', (file) => {
   const data = fs.readFileSync(`${folder}/${file}`);
@@ -26,15 +39,15 @@ describe.each(files)('Test %p', (file) => {
   test.each(transformers.map((t) => t.name).filter((name) => name !== 'Does nothing'))(
     'stage %p',
     (transformerName) => {
-      const stageResults = debug.stageResults(debug.stageNames.indexOf(transformerName));
+      const stageResult = debug.stageResult(debug.stageNames.indexOf(transformerName));
       const chunkedLines: string[][] = [[]];
       let resultIndex = 0;
       let collectedItems = 0;
 
-      stageResults.selectPages(true, true).forEach((page) => {
+      stageResult.selectPages(true, true).forEach((page) => {
         page.itemGroups.forEach((itemGroup) => {
-          const change = stageResults.changes.change(itemGroup.top);
-          if (change || stageResults.descriptor.debug?.showAll) {
+          const change = stageResult.changes.change(itemGroup.top);
+          if (change || stageResult.descriptor.debug?.showAll) {
             const item = itemGroup.top;
             const changeType = change?.constructor.name || 'none';
             chunkedLines[resultIndex].push(itemToString(debug.fontMap, item, changeType));
@@ -50,36 +63,70 @@ describe.each(files)('Test %p', (file) => {
       });
 
       // Global characteristics
-      let groupedItemCount = stageResults
-        .selectPages(false, true)
-        .reduce((itemCount, page) => itemCount + page.itemGroups.length, 0);
-      chunkedLines[0].unshift(
-        JSON.stringify(
-          {
-            pages: stageResults.pages.length,
-            items: stageResults.itemsUnpacked().length,
-            groupedItems: groupedItemCount,
-            changes: stageResults.changes.changeCount(),
-            schema: stageResults.schema,
-            // messages: stageResults.messages,
-          },
-          null,
-          2,
-        ),
-      );
+      chunkedLines[0].unshift(toHeader(stageResult));
 
       chunkedLines.forEach((lines, idx) => {
         const transformerResultAsString = lines.join('\n') || '{}';
-        const resultFolder = `${folder}/${file.substr(0, file.length - 4)}`;
-        const fileName = `${transformerName[0].toLowerCase() + transformerName.slice(1).replace(/\s/g, '')}`;
-        const fileIndex = chunkedLines.length > 1 ? `.${idx}` : '';
-        const resultFile = `${resultFolder}/${fileName}${fileIndex}.json`;
-        expect(transformerResultAsString).toMatchFile(resultFile);
+        expect(transformerResultAsString).toMatchFile(matchFilePath(file, transformerName, chunkedLines.length, idx));
       });
     },
   );
 });
+function matchFilePath(pdfFileName: string, transformerName: string, chunkCount = 1, chunkIndex = 0): string {
+  const pdfFileNameWithoutExtension = pdfFileName.substr(0, pdfFileName.length - 4);
+  const resultFileName = `${transformerName[0].toLowerCase() + transformerName.slice(1).replace(/\s/g, '')}`;
+  const fileIndex = chunkCount > 1 ? `.${chunkIndex}` : '';
+  return `${folder}/${pdfFileNameWithoutExtension}/${resultFileName}${fileIndex}.json`;
+}
+
+describe('Remove repetitive items from online resources', () => {
+  const transformerName = new RemoveRepetitiveItems().name;
+  test.each(urls)('URL %p', async (url) => {
+    console.log(url);
+    const { fileName, data } = await download(url);
+    const debug = await pipeline.debug(data, () => {});
+    const stageResult = debug.stageResult(debug.stageNames.indexOf(transformerName));
+    const pages = stageResult.selectPages(true, true);
+
+    const lines: string[] = [];
+    lines.push(toHeader(stageResult));
+
+    pages.forEach((page) =>
+      page.itemGroups.forEach((itemGroup) => {
+        const change = stageResult.changes.change(itemGroup.top);
+        if (change) {
+          const item = itemGroup.top;
+          const changeType = change?.constructor.name || 'none';
+          lines.push(itemToString(debug.fontMap, item, changeType));
+        }
+      }),
+    );
+
+    console.log(lines);
+    const transformerResultAsString = lines.join('\n') || '{}';
+    expect(transformerResultAsString).toMatchFile(matchFilePath(fileName, transformerName));
+  });
+});
+
+function toHeader(stageResult: StageResult): string {
+  let groupedItemCount = stageResult
+    .selectPages(false, true)
+    .reduce((itemCount, page) => itemCount + page.itemGroups.length, 0);
+  return JSON.stringify(
+    {
+      pages: stageResult.pages.length,
+      items: stageResult.itemsUnpacked().length,
+      groupedItems: groupedItemCount,
+      changes: stageResult.changes.changeCount(),
+      schema: stageResult.schema,
+      // messages: stageResult.messages,
+    },
+    null,
+    2,
+  );
+}
+
 function itemToString(fontMap: Map<string, object>, item: Item, changeType: string): string {
   const fontName: string | Array<string> = item.data['fontName'];
   let newFontName: string | Array<string> | undefined = undefined;
@@ -91,7 +138,7 @@ function itemToString(fontMap: Map<string, object>, item: Item, changeType: stri
     }
   }
   const transform: undefined | number[] = item.data['transform'];
-  let newTransform;
+  let newTransform: undefined | string[];
   if (transform) {
     newTransform = transform.map((num) => num.toFixed(2));
   }
@@ -105,3 +152,45 @@ function itemToString(fontMap: Map<string, object>, item: Item, changeType: stri
     transform: newTransform,
   });
 }
+
+async function download(url: string): Promise<{ fileName: string; data: Buffer }> {
+  const fileName = path.basename(new URL(url).pathname);
+  const localFilePath = `${downloadCache}/${fileName}`;
+  console.log(localFilePath);
+  if (!fs.existsSync(localFilePath)) {
+    fs.mkdirSync(downloadCache, { recursive: true });
+    await downloadToFile(url, localFilePath);
+  }
+  return {
+    fileName,
+    data: fs.readFileSync(localFilePath),
+  };
+}
+
+function downloadToFile(url: string, dest: string): Promise<void> {
+  const uri = new URL(url);
+  const pkg = url.toLowerCase().startsWith('https:') ? https : http;
+
+  return new Promise((resolve, reject) => {
+    pkg.get(uri.href).on('response', (res) => {
+      if (res.statusCode === 200) {
+        const file = fs.createWriteStream(dest, { flags: 'wx' });
+        res
+          .on('end', () => {
+            file.end();
+            resolve();
+          })
+          .on('error', (err) => {
+            file.destroy();
+            fs.unlink(dest, () => reject(err));
+          })
+          .pipe(file);
+      } else if (res.statusCode === 302 || res.statusCode === 301) {
+        // Recursively follow redirects, only a 200 will resolve.
+        downloadToFile(res.headers.location as string, dest).then(() => resolve());
+      } else {
+        reject(new Error(`Download request failed, response status: ${res.statusCode} ${res.statusMessage}`));
+      }
+    });
+  });
+}
diff --git a/examples/README.md b/examples/README.md
index c09ba91..1b4aa1c 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,10 +1,21 @@
-These PDFs are used in the parsers's test suite.
+# Test PDFs
 
-Self-generated files are:
+This folder contains PDFs for testing purposes and their parse results. Generally, there are three types of PDF test setups:
+
+1. Self-generated PDFs
+2. PDFs which have entered the `public domain` or have an otherwise permissive license like `Creative Commons SA`
+3. PDFs where the license is unclear
+
+For (1) and (2) we track the end result and all transformation steps.
+For (3) we only track the results of some transformation stages (those that don't leak too much of the content).
+
+## Self-generated PDFs
 
 - [ExamplePdf](ExamplePdf.pdf)
 
-All other PDFs are either entered `public domain` or have a otherwise permissive license like `Creative Commons SA`.
+## Included Public PDFs
+
+_(PDFs which have entered the `public domain` or have an otherwise permissive license like `Creative Commons SA`)_
 
 | File                                                                 | Source                                 | Author                                            | License Information       |
 | -------------------------------------------------------------------- | -------------------------------------- | ------------------------------------------------- | ------------------------- |
@@ -21,3 +32,15 @@ All other PDFs are either entered `public domain` or have a otherwise permissive
 | [The-War-of-the-Worlds](The-War-of-the-Worlds.pdf)                   | http://www.planetpdf.com/              | H.G Wells                                         | Public Domain             |
 | [Tragedy-Of-The-Commons](Tragedy-Of-The-Commons.pdf)                 | https://science.sciencemag.org         | Garrett Hardin                                    | Public Domain             |
 | [WoodUp](WoodUp.pdf)                                                 | https://bupress.unibz.it/              | Freie Universität Bozen-Bolzano / Giustino Tonon  | Creative Commons BY 4.0   |
+
+## PDFs not stored but partially tested
+
+- https://homepages.cwi.nl/~lex/files/dict.pdf
+
+## Known transformation problems
+
+_Tracks known problems with parsing and transforming certain PDFs._
+
+- `Remove Repetitive Elements`
+  - https://homepages.cwi.nl/~lex/files/dict.pdf
+  - Nothing gets detected because the page-number line contains the current chapter
diff --git a/examples/dict/removeRepetitiveItems.json b/examples/dict/removeRepetitiveItems.json
new file mode 100644
index 0000000..03015f5
--- /dev/null
+++ b/examples/dict/removeRepetitiveItems.json
@@ -0,0 +1,32 @@
+{
+  "pages": 221,
+  "items": 51638,
+  "groupedItems": 8465,
+  "changes": 0,
+  "schema": [
+    {
+      "name": "line"
+    },
+    {
+      "name": "x"
+    },
+    {
+      "name": "y"
+    },
+    {
+      "name": "str"
+    },
+    {
+      "name": "fontName"
+    },
+    {
+      "name": "dir"
+    },
+    {
+      "name": "width"
+    },
+    {
+      "name": "height"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/ui/src/debug/DebugView.svelte b/ui/src/debug/DebugView.svelte
index 3171403..cb2b735 100644
--- a/ui/src/debug/DebugView.svelte
+++ b/ui/src/debug/DebugView.svelte
@@ -18,7 +18,7 @@
   let groupingEnabled = true;
   let onlyRelevantItems = true;
 
-  $: stageResult = debug.stageResults($debugStage);
+  $: stageResult = debug.stageResult($debugStage);
   $: supportsGrouping = !!stageResult.descriptor?.debug?.itemMerger;
   $: supportsRelevanceFiltering = !stageResult.descriptor?.debug?.showAll;
   $: visiblePages = pageControl.selectPages(stageResult, onlyRelevantItems, groupingEnabled, $pinnedPageIndex);
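
For reference, a minimal sketch of how the debug pipeline exercised by these tests can be driven end to end. The imports and the `pipeline.debug()` / `Debugger.stageResult()` calls mirror the ones in `core/test/Files.test.ts` above; the `summarizeStages` helper and the example PDF path are illustrative assumptions, not code from the repository.

```ts
// Hypothetical helper (not part of the patch): print a one-line summary for every debug stage.
import * as fs from 'fs';
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
import PdfParser from 'src/PdfParser';
import PdfPipeline from 'src/PdfPipeline';
import { transformers } from 'src/index';

async function summarizeStages(pdfPath: string): Promise<void> {
  // Same pipeline wiring as the test file.
  const pipeline = new PdfPipeline(new PdfParser(pdfjs), transformers);
  const debug = await pipeline.debug(fs.readFileSync(pdfPath), () => {});
  debug.stageNames.forEach((name, index) => {
    // stageResult(index) lazily computes and caches each transformer stage.
    const stage = debug.stageResult(index);
    console.log(`${name}: ${stage.itemsUnpacked().length} items, ${stage.changes.changeCount()} changes`);
  });
}

// Example invocation against the self-generated test PDF listed in examples/README.md.
summarizeStages('../examples/ExamplePdf.pdf').catch(console.error);
```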
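A second, purely illustrative sketch relates to the `Remove Repetitive Elements` note in `examples/README.md`: if repetitive-line detection depended on a page line recurring with (near-)identical text on most pages, a page-number line that embeds the current chapter would never match across chapters. The footer strings and the exact-match heuristic below are made up for illustration and are not the actual `RemoveRepetitiveItems` algorithm.

```ts
// Illustration only: made-up page-number lines and a simplified heuristic,
// not the real RemoveRepetitiveItems logic.
const pageNumberLines = ['3  Preface', '17  Introduction', '42  Exploring the data'];

// Normalize digits away so differing page numbers alone don't break the comparison...
const normalized = pageNumberLines.map((line) => line.replace(/\d+/g, '#').trim());

// ...but the embedded chapter title still differs from page to page,
// so an exact-match check across pages finds no repetitive line to remove.
const repeatsEverywhere = normalized.every((line) => line === normalized[0]);
console.log(repeatsEverywhere); // false
```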