mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-24 19:41:24 +02:00
Test for remote PDFS
This commit is contained in:
parent
f5a180113d
commit
c98145a63c
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,4 +1,5 @@
|
||||
.DS_Store
|
||||
node_modules/
|
||||
build/
|
||||
npm-debug.log
|
||||
.eslintcache
|
||||
.eslintcache
|
||||
|
@ -33,7 +33,7 @@ export default class Debugger {
|
||||
this.stageResultCache = [initialStage(inputSchema, inputItems)];
|
||||
}
|
||||
|
||||
stageResults(stageIndex: number): StageResult {
|
||||
stageResult(stageIndex: number): StageResult {
|
||||
for (let idx = 0; idx < stageIndex + 1; idx++) {
|
||||
if (!this.stageResultCache[idx]) {
|
||||
const transformer = this.transformers[idx - 1];
|
||||
|
@ -39,14 +39,14 @@ describe('Transform Items', () => {
|
||||
const debug = new Debugger(1, parsedSchema, parsedItems, { fontMap: new Map(), pageViewports: [] }, transformers);
|
||||
|
||||
expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
|
||||
expect(debug.stageResults(0).schema).toEqual(parsedSchema.map((column) => ({ name: column })));
|
||||
expect(debug.stageResults(1).schema).toEqual([
|
||||
expect(debug.stageResult(0).schema).toEqual(parsedSchema.map((column) => ({ name: column })));
|
||||
expect(debug.stageResult(1).schema).toEqual([
|
||||
...parsedSchema.map((column) => ({ name: column, annotation: ColumnAnnotation.REMOVED })),
|
||||
{ name: 'C', annotation: ColumnAnnotation.ADDED },
|
||||
]);
|
||||
|
||||
expect(debug.stageResults(0).itemsUnpacked()).toEqual(parsedItems);
|
||||
expect(debug.stageResults(1).itemsUnpacked()).toEqual(trans1Items);
|
||||
expect(debug.stageResult(0).itemsUnpacked()).toEqual(parsedItems);
|
||||
expect(debug.stageResult(1).itemsUnpacked()).toEqual(trans1Items);
|
||||
});
|
||||
|
||||
test('Line Merge', async () => {
|
||||
@ -65,17 +65,17 @@ describe('Transform Items', () => {
|
||||
const debug = new Debugger(1, parsedSchema, parsedItems, { fontMap: new Map(), pageViewports: [] }, transformers);
|
||||
|
||||
expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
|
||||
expect(debug.stageResults(0).schema).toEqual([{ name: 'id' }, { name: 'y' }]);
|
||||
expect(debug.stageResults(1).schema).toEqual([
|
||||
expect(debug.stageResult(0).schema).toEqual([{ name: 'id' }, { name: 'y' }]);
|
||||
expect(debug.stageResult(1).schema).toEqual([
|
||||
{ name: 'id' },
|
||||
{ name: 'y', annotation: ColumnAnnotation.REMOVED },
|
||||
{ name: 'line', annotation: ColumnAnnotation.ADDED },
|
||||
]);
|
||||
|
||||
expect(debug.stageResults(0).itemsUnpacked()).toEqual(parsedItems);
|
||||
expect(debug.stageResults(1).itemsUnpacked()).toEqual(trans1Items);
|
||||
expect(debug.stageResult(0).itemsUnpacked()).toEqual(parsedItems);
|
||||
expect(debug.stageResult(1).itemsUnpacked()).toEqual(trans1Items);
|
||||
|
||||
const lineMergingStage = debug.stageResults(1);
|
||||
const lineMergingStage = debug.stageResult(1);
|
||||
const { changes, pages } = lineMergingStage;
|
||||
|
||||
//verify item groups
|
||||
@ -103,12 +103,12 @@ test('Change inside of Line', async () => {
|
||||
const debug = new Debugger(1, parsedSchema, parsedItems, { fontMap: new Map(), pageViewports: [] }, transformers);
|
||||
|
||||
expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
|
||||
expect(debug.stageResults(0).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
|
||||
expect(debug.stageResults(1).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
|
||||
expect(debug.stageResults(0).itemsUnpacked()).toEqual(parsedItems);
|
||||
expect(debug.stageResults(1).itemsUnpacked()).toEqual(trans1Items);
|
||||
expect(debug.stageResult(0).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
|
||||
expect(debug.stageResult(1).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
|
||||
expect(debug.stageResult(0).itemsUnpacked()).toEqual(parsedItems);
|
||||
expect(debug.stageResult(1).itemsUnpacked()).toEqual(trans1Items);
|
||||
|
||||
const { changes, pages } = debug.stageResults(1);
|
||||
const { changes, pages } = debug.stageResult(1);
|
||||
|
||||
//verify item groups
|
||||
expect(pages[0].itemGroups.map((itemGroup) => changes.hasChanged(itemGroup.top))).toEqual([true, false]);
|
||||
@ -116,7 +116,7 @@ test('Change inside of Line', async () => {
|
||||
//verify unpacked items
|
||||
expect(
|
||||
debug
|
||||
.stageResults(1)
|
||||
.stageResult(1)
|
||||
.itemsUnpacked()
|
||||
.map((item) => changes.hasChanged(item)),
|
||||
).toEqual([true, true, false, false]);
|
||||
@ -135,7 +135,7 @@ describe('build schemas', () => {
|
||||
function calculateSchema(inputSchema: string[], outputSchema: string[]): AnnotatedColumn[] {
|
||||
const transformers = [new TestTransformer('Trans1', {}, outputSchema, items)];
|
||||
const debug = new Debugger(1, inputSchema, items, { fontMap: new Map(), pageViewports: [] }, transformers);
|
||||
return debug.stageResults(1).schema;
|
||||
return debug.stageResult(1).schema;
|
||||
}
|
||||
|
||||
test('Add', async () => {
|
||||
|
@ -2,21 +2,34 @@ import { toMatchFile } from 'jest-file-snapshot';
|
||||
|
||||
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as http from 'http';
|
||||
import * as https from 'https';
|
||||
|
||||
import PdfParser from 'src/PdfParser';
|
||||
import PdfPipeline from 'src/PdfPipeline';
|
||||
import { transformers } from 'src/index';
|
||||
import Debugger from 'src/Debugger';
|
||||
import Item from 'src/Item';
|
||||
import RemoveRepetitiveItems from 'src/transformer/RemoveRepetitiveItems';
|
||||
import StageResult from 'src/debug/StageResult';
|
||||
|
||||
const parser = new PdfParser(pdfjs);
|
||||
const pipeline = new PdfPipeline(parser, transformers);
|
||||
|
||||
const folder = '../examples';
|
||||
const files = fs.readdirSync(folder).filter((file) => file.endsWith('.pdf'));
|
||||
const urls = ['https://homepages.cwi.nl/~lex/files/dict.pdf'];
|
||||
const downloadCache = 'node_modules/.cache/files';
|
||||
|
||||
expect.extend({ toMatchFile });
|
||||
|
||||
// Test is for debugging purpose
|
||||
test.skip('Debug', async () => {
|
||||
const data = fs.readFileSync(`${folder}/Adventures-Of-Sherlock-Holmes.pdf`);
|
||||
await pipeline.execute(data, () => {});
|
||||
});
|
||||
|
||||
describe.each(files)('Test %p', (file) => {
|
||||
const data = fs.readFileSync(`${folder}/${file}`);
|
||||
|
||||
@ -26,15 +39,15 @@ describe.each(files)('Test %p', (file) => {
|
||||
test.each(transformers.map((t) => t.name).filter((name) => name !== 'Does nothing'))(
|
||||
'stage %p',
|
||||
(transformerName) => {
|
||||
const stageResults = debug.stageResults(debug.stageNames.indexOf(transformerName));
|
||||
const stageResult = debug.stageResult(debug.stageNames.indexOf(transformerName));
|
||||
|
||||
const chunkedLines: string[][] = [[]];
|
||||
let resultIndex = 0;
|
||||
let collectedItems = 0;
|
||||
stageResults.selectPages(true, true).forEach((page) => {
|
||||
stageResult.selectPages(true, true).forEach((page) => {
|
||||
page.itemGroups.forEach((itemGroup) => {
|
||||
const change = stageResults.changes.change(itemGroup.top);
|
||||
if (change || stageResults.descriptor.debug?.showAll) {
|
||||
const change = stageResult.changes.change(itemGroup.top);
|
||||
if (change || stageResult.descriptor.debug?.showAll) {
|
||||
const item = itemGroup.top;
|
||||
const changeType = change?.constructor.name || 'none';
|
||||
chunkedLines[resultIndex].push(itemToString(debug.fontMap, item, changeType));
|
||||
@ -50,36 +63,70 @@ describe.each(files)('Test %p', (file) => {
|
||||
});
|
||||
|
||||
// Global characteristics
|
||||
let groupedItemCount = stageResults
|
||||
.selectPages(false, true)
|
||||
.reduce((itemCount, page) => itemCount + page.itemGroups.length, 0);
|
||||
chunkedLines[0].unshift(
|
||||
JSON.stringify(
|
||||
{
|
||||
pages: stageResults.pages.length,
|
||||
items: stageResults.itemsUnpacked().length,
|
||||
groupedItems: groupedItemCount,
|
||||
changes: stageResults.changes.changeCount(),
|
||||
schema: stageResults.schema,
|
||||
// messages: stageResults.messages,
|
||||
},
|
||||
null,
|
||||
2,
|
||||
),
|
||||
);
|
||||
chunkedLines[0].unshift(toHeader(stageResult));
|
||||
|
||||
chunkedLines.forEach((lines, idx) => {
|
||||
const transformerResultAsString = lines.join('\n') || '{}';
|
||||
const resultFolder = `${folder}/${file.substr(0, file.length - 4)}`;
|
||||
const fileName = `${transformerName[0].toLowerCase() + transformerName.slice(1).replace(/\s/g, '')}`;
|
||||
const fileIndex = chunkedLines.length > 1 ? `.${idx}` : '';
|
||||
const resultFile = `${resultFolder}/${fileName}${fileIndex}.json`;
|
||||
expect(transformerResultAsString).toMatchFile(resultFile);
|
||||
expect(transformerResultAsString).toMatchFile(matchFilePath(file, transformerName, chunkedLines.length, idx));
|
||||
});
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
/**
 * Builds the path of the snapshot file a transformer stage result is compared against.
 *
 * The result has the shape `<folder>/<pdf name without extension>/<stageName>[.<chunkIndex>].json`,
 * where `<stageName>` is the transformer name with its first character lower-cased and all
 * whitespace after the first character removed. The chunk index is only appended when the
 * output was split into more than one chunk.
 *
 * @param pdfFileName file name of the PDF, e.g. `dict.pdf`
 * @param transformerName display name of the transformer, e.g. `Remove Repetitive Items`
 * @param chunkCount total number of chunks the stage output was split into
 * @param chunkIndex index of the chunk this path is for
 * @returns path of the `.json` snapshot file
 */
function matchFilePath(pdfFileName: string, transformerName: string, chunkCount = 1, chunkIndex = 0): string {
  // Strip the actual extension rather than blindly cutting the last 4 characters,
  // so names like `scan.jpeg` or names without any extension are not mangled.
  const extension = path.extname(pdfFileName);
  const pdfFileNameWithoutExtension = extension ? pdfFileName.slice(0, -extension.length) : pdfFileName;
  // charAt() yields '' for an empty name, where indexing with [0] would throw on undefined.
  const resultFileName = transformerName.charAt(0).toLowerCase() + transformerName.slice(1).replace(/\s/g, '');
  const fileIndex = chunkCount > 1 ? `.${chunkIndex}` : '';
  return `${folder}/${pdfFileNameWithoutExtension}/${resultFileName}${fileIndex}.json`;
}
|
||||
|
||||
// Runs the pipeline on PDFs fetched from `urls` and snapshots only the result of the
// RemoveRepetitiveItems stage: the stage header followed by one line per changed item group.
describe('Remove repetitive items from online resources', () => {
  const transformerName = new RemoveRepetitiveItems().name;

  test.each(urls)('URL %p', async (url) => {
    console.log(url);
    const { fileName, data } = download(url);
    const debug = await pipeline.debug(data, () => {});
    const stageIndex = debug.stageNames.indexOf(transformerName);
    const stageResult = debug.stageResult(stageIndex);
    const pages = stageResult.selectPages(true, true);

    // The header always comes first, the changed items follow in page order.
    const lines: string[] = [toHeader(stageResult)];
    for (const page of pages) {
      for (const itemGroup of page.itemGroups) {
        const topItem = itemGroup.top;
        const change = stageResult.changes.change(topItem);
        if (!change) {
          // Unchanged item groups are not part of the snapshot.
          continue;
        }
        const changeType = change.constructor.name || 'none';
        lines.push(itemToString(debug.fontMap, topItem, changeType));
      }
    }

    console.log(lines);
    const transformerResultAsString = lines.join('\n') || '{}';
    const expectedFile = matchFilePath(fileName, transformerName);
    expect(transformerResultAsString).toMatchFile(expectedFile);
  });
});
|
||||
|
||||
/**
 * Summarises a stage result as a pretty-printed (2-space indented) JSON string.
 *
 * The summary holds the number of pages, the number of unpacked items, the number of
 * item groups over the pages returned by `selectPages(false, true)`, the change count
 * and the stage schema.
 *
 * @param stageResult the stage result to summarise
 * @returns the JSON text of the summary
 */
function toHeader(stageResult: StageResult): string {
  // Sum up the item groups of every selected page.
  let groupedItems = 0;
  for (const page of stageResult.selectPages(false, true)) {
    groupedItems += page.itemGroups.length;
  }

  const header = {
    pages: stageResult.pages.length,
    items: stageResult.itemsUnpacked().length,
    groupedItems,
    changes: stageResult.changes.changeCount(),
    schema: stageResult.schema,
  };
  return JSON.stringify(header, null, 2);
}
|
||||
|
||||
function itemToString(fontMap: Map<string, object>, item: Item, changeType: string): string {
|
||||
const fontName: string | Array<string> = item.data['fontName'];
|
||||
let newFontName: string | Array<string> | undefined = undefined;
|
||||
@ -91,7 +138,7 @@ function itemToString(fontMap: Map<string, object>, item: Item, changeType: stri
|
||||
}
|
||||
}
|
||||
const transform: undefined | number[] = item.data['transform'];
|
||||
let newTransform;
|
||||
let newTransform: undefined | string[];
|
||||
if (transform) {
|
||||
newTransform = transform.map((num) => num.toFixed(2));
|
||||
}
|
||||
@ -105,3 +152,45 @@ function itemToString(fontMap: Map<string, object>, item: Item, changeType: stri
|
||||
transform: newTransform,
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Returns the content of the file behind `url`, using `downloadCache` as a local cache.
 *
 * The cache entry is named after the last segment of the URL path. When no such file
 * exists yet, the cache folder is created and a download is started.
 *
 * NOTE(review): `downloadToFile` returns a promise that is neither awaited nor handled here,
 * so on a cache miss the file is read while the download may still be running (or before the
 * file exists at all). Fixing this requires making this function async and awaiting it at
 * the call site.
 *
 * @param url address of the file to fetch
 * @returns the cache file name and the file content
 */
function download(url: string): { fileName: string; data: Buffer } {
  const { pathname } = new URL(url);
  const fileName = path.basename(pathname);
  const cachedFile = `${downloadCache}/${fileName}`;
  console.log(cachedFile);

  const alreadyCached = fs.existsSync(cachedFile);
  if (!alreadyCached) {
    fs.mkdirSync(downloadCache, { recursive: true });
    downloadToFile(url, cachedFile);
  }

  const data = fs.readFileSync(cachedFile);
  return { fileName, data };
}
|
||||
|
||||
/**
 * Downloads `url` into the new file `dest`, following HTTP 301/302 redirects.
 *
 * The file is created exclusively (`wx`), so an already existing `dest` makes the
 * download fail instead of being overwritten. The returned promise
 * - resolves once the whole body has been flushed to `dest`,
 * - rejects on request/response/file errors, on a prematurely closed response,
 *   on any status other than 200/301/302, on a redirect without `Location` header
 *   and when more than `maxRedirects` redirects are encountered.
 * A partially written file is removed before rejecting.
 *
 * @param url address to download (http or https)
 * @param dest path of the file to create
 * @param maxRedirects number of redirects that may still be followed
 */
function downloadToFile(url: string, dest: string, maxRedirects = 10): Promise<void> {
  const uri = new URL(url);
  const pkg = uri.protocol === 'https:' ? https : http;

  return new Promise((resolve, reject) => {
    pkg
      .get(uri.href)
      .on('response', (res) => {
        if (res.statusCode === 200) {
          const file = fs.createWriteStream(dest, { flags: 'wx' });
          // Drop the partial file before reporting the failure.
          const failAndCleanUp = (err: Error): void => {
            file.destroy();
            fs.unlink(dest, () => reject(err));
          };
          res.on('error', failAndCleanUp);
          // A response closed before completion does not necessarily emit 'error'.
          res.on('close', () => {
            if (!res.complete) {
              failAndCleanUp(new Error(`Download of ${uri.href} was terminated before completion`));
            }
          });
          file.on('error', (err: NodeJS.ErrnoException) => {
            res.unpipe(file);
            res.resume();
            if (err.code === 'EEXIST') {
              // The file was not created by this call, so it must not be deleted.
              reject(err);
            } else {
              failAndCleanUp(err);
            }
          });
          // Resolve only when all data has been flushed, not merely when the response ended.
          file.on('finish', () => resolve());
          res.pipe(file);
        } else if (res.statusCode === 302 || res.statusCode === 301) {
          // Discard the redirect body so the socket is released.
          res.resume();
          const location = res.headers.location;
          if (!location) {
            reject(new Error(`Download request failed, redirect (${res.statusCode}) without location header`));
          } else if (maxRedirects <= 0) {
            reject(new Error(`Download request failed, too many redirects for ${uri.href}`));
          } else {
            // Recursively follow redirects (the location may be relative), only a 200 will resolve.
            downloadToFile(new URL(location, uri).href, dest, maxRedirects - 1).then(resolve, reject);
          }
        } else {
          res.resume();
          reject(new Error(`Download request failed, response status: ${res.statusCode} ${res.statusMessage}`));
        }
      })
      // Connection level failures (DNS, refused connection, TLS, ...) never produce a response.
      .on('error', reject);
  });
}
|
||||
|
@ -1,10 +1,21 @@
|
||||
These PDFs are used in the parsers's test suite.
|
||||
# Test PDFs
|
||||
|
||||
Self-generated files are:
|
||||
This folder contains PDFs for testing purposes and the parse results of the PDFs. Generally there are 3 types of PDF test setups:
|
||||
|
||||
1. Self generated PDFs
|
||||
2. PDFs which entered `public domain` or have an otherwise permissive license like `Creative Commons SA`
|
||||
3. PDFs where the license is unclear
|
||||
|
||||
For (1) and (2) we track the end-result and all transformation steps.
|
||||
For (3) we only track the results of some transformation stages (those which don't leak too much of the content).
|
||||
|
||||
## Self-generated PDFs
|
||||
|
||||
- [ExamplePdf](ExamplePdf.pdf)
|
||||
|
||||
All other PDFs are either entered `public domain` or have a otherwise permissive license like `Creative Commons SA`.
|
||||
## Included Public PDFs
|
||||
|
||||
_(PDFs which entered `public domain` or have an otherwise permissive license like `Creative Commons SA`)_
|
||||
|
||||
| File | Source | Author | License Information |
|
||||
| ------------------------------------------------------------------ | ------------------------------------- | ------------------------------------------------ | ------------------------- |
|
||||
@ -21,3 +32,15 @@ All other PDFs are either entered `public domain` or have a otherwise permissive
|
||||
| [The-War-of-the-Worlds](The-War-of-the-Worlds.pdf) | http://www.planetpdf.com/ | H.G Wells | Public Domain |
|
||||
| [Tragedy-Of-The-Commons](Tragedy-Of-The-Commons.pdf) | https://science.sciencemag.org | Garrett Hardin | Public Domain |
|
||||
| [WoodUp](WoodUp.pdf) | https://bupress.unibz.it/ | Freie Universität Bozen-Bolzano / Giustino Tonon | Creative Commons BY 4.0 |
|
||||
|
||||
## PDFs not stored but partially tested
|
||||
|
||||
- https://homepages.cwi.nl/~lex/files/dict.pdf
|
||||
|
||||
# Known transformation problems
|
||||
|
||||
_Tracks known problems with parsing and transforming certain PDFs._
|
||||
|
||||
- `Remove Repetitive Items`
|
||||
- https://homepages.cwi.nl/~lex/files/dict.pdf
|
||||
- Nothing gets detected because the page-number line contains the current chapter
|
||||
|
32
examples/dict/removeRepetitiveItems.json
Normal file
32
examples/dict/removeRepetitiveItems.json
Normal file
@ -0,0 +1,32 @@
|
||||
{
|
||||
"pages": 221,
|
||||
"items": 51638,
|
||||
"groupedItems": 8465,
|
||||
"changes": 0,
|
||||
"schema": [
|
||||
{
|
||||
"name": "line"
|
||||
},
|
||||
{
|
||||
"name": "x"
|
||||
},
|
||||
{
|
||||
"name": "y"
|
||||
},
|
||||
{
|
||||
"name": "str"
|
||||
},
|
||||
{
|
||||
"name": "fontName"
|
||||
},
|
||||
{
|
||||
"name": "dir"
|
||||
},
|
||||
{
|
||||
"name": "width"
|
||||
},
|
||||
{
|
||||
"name": "height"
|
||||
}
|
||||
]
|
||||
}
|
@ -18,7 +18,7 @@
|
||||
let groupingEnabled = true;
|
||||
let onlyRelevantItems = true;
|
||||
|
||||
$: stageResult = debug.stageResults($debugStage);
|
||||
$: stageResult = debug.stageResult($debugStage);
|
||||
$: supportsGrouping = !!stageResult.descriptor?.debug?.itemMerger;
|
||||
$: supportsRelevanceFiltering = !stageResult.descriptor?.debug?.showAll;
|
||||
$: visiblePages = pageControl.selectPages(stageResult, onlyRelevantItems, groupingEnabled, $pinnedPageIndex);
|
||||
|
Loading…
x
Reference in New Issue
Block a user