Test for remote PDFS

2025-06-24 19:41:24 +02:00 · 2021-03-22 09:03:26 +01:00 · 2021-03-22 09:03:26 +01:00 · c98145a63c
commit c98145a63c
parent f5a180113d
7 changed files with 194 additions and 49 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,5 @@
+.DS_Store
 node_modules/
 build/
 npm-debug.log
-.eslintcache
+.eslintcache
--- a/core/src/Debugger.ts
+++ b/core/src/Debugger.ts
@ -33,7 +33,7 @@ export default class Debugger {
    this.stageResultCache = [initialStage(inputSchema, inputItems)];
  }

-  stageResults(stageIndex: number): StageResult {
+  stageResult(stageIndex: number): StageResult {
    for (let idx = 0; idx < stageIndex + 1; idx++) {
      if (!this.stageResultCache[idx]) {
        const transformer = this.transformers[idx - 1];
--- a/core/test/Debugger.test.ts
+++ b/core/test/Debugger.test.ts
@ -39,14 +39,14 @@ describe('Transform Items', () => {
    const debug = new Debugger(1, parsedSchema, parsedItems, { fontMap: new Map(), pageViewports: [] }, transformers);

    expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
-    expect(debug.stageResults(0).schema).toEqual(parsedSchema.map((column) => ({ name: column })));
-    expect(debug.stageResults(1).schema).toEqual([
+    expect(debug.stageResult(0).schema).toEqual(parsedSchema.map((column) => ({ name: column })));
+    expect(debug.stageResult(1).schema).toEqual([
      ...parsedSchema.map((column) => ({ name: column, annotation: ColumnAnnotation.REMOVED })),
      { name: 'C', annotation: ColumnAnnotation.ADDED },
    ]);

-    expect(debug.stageResults(0).itemsUnpacked()).toEqual(parsedItems);
-    expect(debug.stageResults(1).itemsUnpacked()).toEqual(trans1Items);
+    expect(debug.stageResult(0).itemsUnpacked()).toEqual(parsedItems);
+    expect(debug.stageResult(1).itemsUnpacked()).toEqual(trans1Items);
  });

  test('Line Merge', async () => {
@ -65,17 +65,17 @@ describe('Transform Items', () => {
    const debug = new Debugger(1, parsedSchema, parsedItems, { fontMap: new Map(), pageViewports: [] }, transformers);

    expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
-    expect(debug.stageResults(0).schema).toEqual([{ name: 'id' }, { name: 'y' }]);
-    expect(debug.stageResults(1).schema).toEqual([
+    expect(debug.stageResult(0).schema).toEqual([{ name: 'id' }, { name: 'y' }]);
+    expect(debug.stageResult(1).schema).toEqual([
      { name: 'id' },
      { name: 'y', annotation: ColumnAnnotation.REMOVED },
      { name: 'line', annotation: ColumnAnnotation.ADDED },
    ]);

-    expect(debug.stageResults(0).itemsUnpacked()).toEqual(parsedItems);
-    expect(debug.stageResults(1).itemsUnpacked()).toEqual(trans1Items);
+    expect(debug.stageResult(0).itemsUnpacked()).toEqual(parsedItems);
+    expect(debug.stageResult(1).itemsUnpacked()).toEqual(trans1Items);

-    const lineMergingStage = debug.stageResults(1);
+    const lineMergingStage = debug.stageResult(1);
    const { changes, pages } = lineMergingStage;

    //verify item groups
@ -103,12 +103,12 @@ test('Change inside of Line', async () => {
  const debug = new Debugger(1, parsedSchema, parsedItems, { fontMap: new Map(), pageViewports: [] }, transformers);

  expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
-  expect(debug.stageResults(0).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
-  expect(debug.stageResults(1).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
-  expect(debug.stageResults(0).itemsUnpacked()).toEqual(parsedItems);
-  expect(debug.stageResults(1).itemsUnpacked()).toEqual(trans1Items);
+  expect(debug.stageResult(0).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
+  expect(debug.stageResult(1).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
+  expect(debug.stageResult(0).itemsUnpacked()).toEqual(parsedItems);
+  expect(debug.stageResult(1).itemsUnpacked()).toEqual(trans1Items);

-  const { changes, pages } = debug.stageResults(1);
+  const { changes, pages } = debug.stageResult(1);

  //verify item groups
  expect(pages[0].itemGroups.map((itemGroup) => changes.hasChanged(itemGroup.top))).toEqual([true, false]);
@ -116,7 +116,7 @@ test('Change inside of Line', async () => {
  //verify unpacked items
  expect(
    debug
-      .stageResults(1)
+      .stageResult(1)
      .itemsUnpacked()
      .map((item) => changes.hasChanged(item)),
  ).toEqual([true, true, false, false]);
@ -135,7 +135,7 @@ describe('build schemas', () => {
  function calculateSchema(inputSchema: string[], outputSchema: string[]): AnnotatedColumn[] {
    const transformers = [new TestTransformer('Trans1', {}, outputSchema, items)];
    const debug = new Debugger(1, inputSchema, items, { fontMap: new Map(), pageViewports: [] }, transformers);
-    return debug.stageResults(1).schema;
+    return debug.stageResult(1).schema;
  }

  test('Add', async () => {
--- a/core/test/Files.test.ts
+++ b/core/test/Files.test.ts
@ -2,21 +2,34 @@ import { toMatchFile } from 'jest-file-snapshot';

 import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
 import * as fs from 'fs';
+import * as path from 'path';
+import * as http from 'http';
+import * as https from 'https';

 import PdfParser from 'src/PdfParser';
 import PdfPipeline from 'src/PdfPipeline';
 import { transformers } from 'src/index';
 import Debugger from 'src/Debugger';
 import Item from 'src/Item';
+import RemoveRepetitiveItems from 'src/transformer/RemoveRepetitiveItems';
+import StageResult from 'src/debug/StageResult';

 const parser = new PdfParser(pdfjs);
 const pipeline = new PdfPipeline(parser, transformers);

 const folder = '../examples';
 const files = fs.readdirSync(folder).filter((file) => file.endsWith('.pdf'));
+const urls = ['https://homepages.cwi.nl/~lex/files/dict.pdf'];
+const downloadCache = 'node_modules/.cache/files';

 expect.extend({ toMatchFile });

+// Manual debugging aid, skipped by default: change `test.skip` to `test` to run
+// `pipeline.execute` on a single example PDF.
+test.skip('Debug', async () => {
+  const data = fs.readFileSync(`${folder}/Adventures-Of-Sherlock-Holmes.pdf`);
+  await pipeline.execute(data, () => {});
+});
+
 describe.each(files)('Test %p', (file) => {
  const data = fs.readFileSync(`${folder}/${file}`);

@ -26,15 +39,15 @@ describe.each(files)('Test %p', (file) => {
  test.each(transformers.map((t) => t.name).filter((name) => name !== 'Does nothing'))(
    'stage %p',
    (transformerName) => {
-      const stageResults = debug.stageResults(debug.stageNames.indexOf(transformerName));
+      const stageResult = debug.stageResult(debug.stageNames.indexOf(transformerName));

      const chunkedLines: string[][] = [[]];
      let resultIndex = 0;
      let collectedItems = 0;
-      stageResults.selectPages(true, true).forEach((page) => {
+      stageResult.selectPages(true, true).forEach((page) => {
        page.itemGroups.forEach((itemGroup) => {
-          const change = stageResults.changes.change(itemGroup.top);
-          if (change || stageResults.descriptor.debug?.showAll) {
+          const change = stageResult.changes.change(itemGroup.top);
+          if (change || stageResult.descriptor.debug?.showAll) {
            const item = itemGroup.top;
            const changeType = change?.constructor.name || 'none';
            chunkedLines[resultIndex].push(itemToString(debug.fontMap, item, changeType));
@ -50,36 +63,70 @@ describe.each(files)('Test %p', (file) => {
      });

      // Global characteristics
-      let groupedItemCount = stageResults
-        .selectPages(false, true)
-        .reduce((itemCount, page) => itemCount + page.itemGroups.length, 0);
-      chunkedLines[0].unshift(
-        JSON.stringify(
-          {
-            pages: stageResults.pages.length,
-            items: stageResults.itemsUnpacked().length,
-            groupedItems: groupedItemCount,
-            changes: stageResults.changes.changeCount(),
-            schema: stageResults.schema,
-            // messages: stageResults.messages,
-          },
-          null,
-          2,
-        ),
-      );
+      chunkedLines[0].unshift(toHeader(stageResult));

      chunkedLines.forEach((lines, idx) => {
        const transformerResultAsString = lines.join('\n') || '{}';
-        const resultFolder = `${folder}/${file.substr(0, file.length - 4)}`;
-        const fileName = `${transformerName[0].toLowerCase() + transformerName.slice(1).replace(/\s/g, '')}`;
-        const fileIndex = chunkedLines.length > 1 ? `.${idx}` : '';
-        const resultFile = `${resultFolder}/${fileName}${fileIndex}.json`;
-        expect(transformerResultAsString).toMatchFile(resultFile);
+        expect(transformerResultAsString).toMatchFile(matchFilePath(file, transformerName, chunkedLines.length, idx));
      });
    },
  );
 });

+/**
+ * Builds the path of the snapshot file a stage result is compared against:
+ * `<folder>/<pdf name without its last 4 characters>/<transformer name>[.<chunkIndex>].json`.
+ *
+ * The transformer name gets a lower-cased first letter and has all whitespace
+ * removed from the remainder. The chunk index is only part of the file name when
+ * there is more than one chunk.
+ */
+function matchFilePath(pdfFileName: string, transformerName: string, chunkCount = 1, chunkIndex = 0): string {
+  // Cut off the trailing four characters (the `.pdf` suffix of the example files).
+  const resultFolder = pdfFileName.slice(0, -4);
+  const firstLetter = transformerName[0].toLowerCase();
+  const remainder = transformerName.slice(1).replace(/\s/g, '');
+  const chunkSuffix = chunkCount > 1 ? `.${chunkIndex}` : '';
+  return [folder, resultFolder, `${firstLetter}${remainder}${chunkSuffix}.json`].join('/');
+}
+
+// Snapshot test for PDFs referenced by URL: for each URL the header plus every
+// item group changed by the `RemoveRepetitiveItems` stage is written to a match file.
+describe('Remove repetitive items from online resources', () => {
+  const transformerName = new RemoveRepetitiveItems().name;
+  test.each(urls)('URL %p', async (url) => {
+    console.log(url);
+    const { fileName, data } = download(url);
+    const debug = await pipeline.debug(data, () => {});
+    const stageIndex = debug.stageNames.indexOf(transformerName);
+    const stageResult = debug.stageResult(stageIndex);
+    const pages = stageResult.selectPages(true, true);
+
+    // First line is the stage summary, followed by one line per changed item group.
+    const lines: string[] = [toHeader(stageResult)];
+
+    pages.forEach((page) => {
+      page.itemGroups.forEach((itemGroup) => {
+        const change = stageResult.changes.change(itemGroup.top);
+        if (!change) {
+          return;
+        }
+        const changeType = change.constructor.name || 'none';
+        lines.push(itemToString(debug.fontMap, itemGroup.top, changeType));
+      });
+    });
+
+    console.log(lines);
+    const transformerResultAsString = lines.join('\n') || '{}';
+    const expectedFile = matchFilePath(fileName, transformerName);
+    expect(transformerResultAsString).toMatchFile(expectedFile);
+  });
+});
+
+/**
+ * Serializes the global characteristics of a stage result (page count, unpacked
+ * item count, grouped item count, change count and schema) as JSON indented by
+ * two spaces.
+ */
+function toHeader(stageResult: StageResult): string {
+  // Sum the item groups over the pages returned by `selectPages(false, true)`.
+  let groupedItems = 0;
+  for (const page of stageResult.selectPages(false, true)) {
+    groupedItems += page.itemGroups.length;
+  }
+
+  const header = {
+    pages: stageResult.pages.length,
+    items: stageResult.itemsUnpacked().length,
+    groupedItems,
+    changes: stageResult.changes.changeCount(),
+    schema: stageResult.schema,
+  };
+  return JSON.stringify(header, null, 2);
+}
+
 function itemToString(fontMap: Map<string, object>, item: Item, changeType: string): string {
  const fontName: string | Array<string> = item.data['fontName'];
  let newFontName: string | Array<string> | undefined = undefined;
@ -91,7 +138,7 @@ function itemToString(fontMap: Map<string, object>, item: Item, changeType: stri
    }
  }
  const transform: undefined | number[] = item.data['transform'];
-  let newTransform;
+  let newTransform: undefined | string[];
  if (transform) {
    newTransform = transform.map((num) => num.toFixed(2));
  }
@ -105,3 +152,45 @@ function itemToString(fontMap: Map<string, object>, item: Item, changeType: stri
    transform: newTransform,
  });
 }
+
+/**
+ * Returns the base name and the content of the file behind `url`, using the
+ * `downloadCache` folder as an on-disk cache keyed by the base name of the URL path.
+ *
+ * NOTE(review): `downloadToFile` returns a Promise that is not awaited here, so on a
+ * cache miss `fs.readFileSync` below can run before the download has written the
+ * file. Fixing this means making this function async and awaiting it in the caller.
+ *
+ * @param url address of the file to fetch
+ * @returns the file name derived from the URL and the file content read from the cache
+ */
+function download(url: string): { fileName: string; data: Buffer } {
+  const fileName = path.basename(new URL(url).pathname);
+  const localFilePath = `${downloadCache}/${fileName}`;
+  console.log(localFilePath);
+  // Only hit the network when the file is not cached yet.
+  if (!fs.existsSync(localFilePath)) {
+    fs.mkdirSync(downloadCache, { recursive: true });
+    downloadToFile(url, localFilePath);
+  }
+  return {
+    fileName,
+    data: fs.readFileSync(localFilePath),
+  };
+}
+
+/**
+ * Downloads `url` into the new file `dest`, following HTTP redirects.
+ *
+ * The returned Promise resolves once the write stream has flushed all data
+ * (`'finish'`), and rejects on request errors, response errors, write errors,
+ * non-success status codes, redirects without a `Location` header and when the
+ * redirect budget is exhausted. A partially written file is removed on failure.
+ *
+ * @param url http(s) address to download
+ * @param dest path of the file to create; it must not exist yet (opened with `wx`)
+ * @param redirectsLeft maximum number of redirects still allowed to be followed
+ */
+function downloadToFile(url: string, dest: string, redirectsLeft = 5): Promise<void> {
+  const uri = new URL(url);
+  const pkg = uri.protocol === 'https:' ? https : http;
+  const redirectCodes = [301, 302, 303, 307, 308];
+
+  return new Promise((resolve, reject) => {
+    pkg
+      .get(uri.href)
+      // Without this handler a DNS/connection failure would leave the Promise pending forever.
+      .on('error', reject)
+      .on('response', (res) => {
+        const status = res.statusCode ?? 0;
+        if (status === 200) {
+          const file = fs.createWriteStream(dest, { flags: 'wx' });
+          const abort = (err: NodeJS.ErrnoException) => {
+            res.destroy();
+            file.destroy();
+            if (err.code === 'EEXIST') {
+              // The file was not created by us, so it must not be deleted.
+              reject(err);
+              return;
+            }
+            fs.unlink(dest, () => reject(err));
+          };
+          // Resolve only after everything has been flushed to disk, not when the response ends.
+          file.on('finish', () => resolve()).on('error', abort);
+          res.on('error', abort).pipe(file);
+        } else if (redirectCodes.includes(status)) {
+          // Drain the body so the socket is released.
+          res.resume();
+          const location = res.headers.location;
+          if (!location) {
+            reject(new Error(`Download request failed, redirect (${status}) without location header`));
+          } else if (redirectsLeft <= 0) {
+            reject(new Error(`Download request failed, too many redirects for ${url}`));
+          } else {
+            // `Location` may be relative, so resolve it against the current URL.
+            // Propagate both outcomes of the followed request.
+            downloadToFile(new URL(location, uri).href, dest, redirectsLeft - 1).then(resolve, reject);
+          }
+        } else {
+          res.resume();
+          reject(new Error(`Download request failed, response status: ${res.statusCode} ${res.statusMessage}`));
+        }
+      });
+  });
+}
--- a/examples/README.md
+++ b/examples/README.md
@ -1,10 +1,21 @@
-These PDFs are used in the parsers's test suite.
+# Test PDFs

-Self-generated files are:
+This folder contains PDFs for testing purposes and the parse results of those PDFs. Generally there are 3 types of PDF test setups:
+
+1. Self-generated PDFs
+2. PDFs which entered `public domain` or have an otherwise permissive license like `Creative Commons SA`
+3. PDFs where the license is unclear
+
+For (1) and (2) we track the end-result and all transformation steps.
+For (3) we only track the results of some transformation stages (those which don't leak too much of the content).
+
+## Self-generated PDFs

 - [ExamplePdf](ExamplePdf.pdf)

-All other PDFs are either entered `public domain` or have a otherwise permissive license like `Creative Commons SA`.
+## Included Public PDFs
+
+_(PDFs which entered `public domain` or have an otherwise permissive license like `Creative Commons SA`)_

 | File                                                               | Source                                | Author                                           | License Information       |
 | ------------------------------------------------------------------ | ------------------------------------- | ------------------------------------------------ | ------------------------- |
@ -21,3 +32,15 @@ All other PDFs are either entered `public domain` or have a otherwise permissive
 | [The-War-of-the-Worlds](The-War-of-the-Worlds.pdf)                 | http://www.planetpdf.com/             | H.G Wells                                        | Public Domain             |
 | [Tragedy-Of-The-Commons](Tragedy-Of-The-Commons.pdf)               | https://science.sciencemag.org        | Garrett Hardin                                   | Public Domain             |
 | [WoodUp](WoodUp.pdf)                                               | https://bupress.unibz.it/             | Freie Universität Bozen-Bolzano / Giustino Tonon | Creative Commons BY 4.0   |
+
+## PDFs not stored but partially tested
+
+- https://homepages.cwi.nl/~lex/files/dict.pdf
+
+# Known transformation problems
+
+_Tracks known problems with parsing and transforming certain PDFs._
+
+- `Remove Repetitive Elements`
+  - https://homepages.cwi.nl/~lex/files/dict.pdf
+    - Nothing gets detected because the page-number line contains the current chapter
--- a/examples/dict/removeRepetitiveItems.json
+++ b/examples/dict/removeRepetitiveItems.json
@ -0,0 +1,32 @@
+{
+  "pages": 221,
+  "items": 51638,
+  "groupedItems": 8465,
+  "changes": 0,
+  "schema": [
+    {
+      "name": "line"
+    },
+    {
+      "name": "x"
+    },
+    {
+      "name": "y"
+    },
+    {
+      "name": "str"
+    },
+    {
+      "name": "fontName"
+    },
+    {
+      "name": "dir"
+    },
+    {
+      "name": "width"
+    },
+    {
+      "name": "height"
+    }
+  ]
+}
--- a/ui/src/debug/DebugView.svelte
+++ b/ui/src/debug/DebugView.svelte
@ -18,7 +18,7 @@
    let groupingEnabled = true;
    let onlyRelevantItems = true;

-    $: stageResult = debug.stageResults($debugStage);
+    $: stageResult = debug.stageResult($debugStage);
    $: supportsGrouping = !!stageResult.descriptor?.debug?.itemMerger;
    $: supportsRelevanceFiltering = !stageResult.descriptor?.debug?.showAll;
    $: visiblePages = pageControl.selectPages(stageResult, onlyRelevantItems, groupingEnabled, $pinnedPageIndex);