From c98145a63cc6097cc25c45cb8c8b8dba17951418 Mon Sep 17 00:00:00 2001
From: Johannes Zillmann
Date: Mon, 22 Mar 2021 09:03:26 +0100
Subject: [PATCH] Test for remote PDFs

---
 .gitignore                               |   3 +-
 core/src/Debugger.ts                     |   2 +-
 core/test/Debugger.test.ts               |  32 ++--
 core/test/Files.test.ts                  | 143 ++++++++++++++++++-----
 examples/README.md                       |  29 ++++-
 examples/dict/removeRepetitiveItems.json |  32 +++++
 ui/src/debug/DebugView.svelte            |   2 +-
 7 files changed, 194 insertions(+), 49 deletions(-)
 create mode 100644 examples/dict/removeRepetitiveItems.json

diff --git a/.gitignore b/.gitignore
index 025a419..cfbd496 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
+.DS_Store
 node_modules/
 build/
 npm-debug.log
-.eslintcache
\ No newline at end of file
+.eslintcache
diff --git a/core/src/Debugger.ts b/core/src/Debugger.ts
index 3ec00a2..a65d115 100644
--- a/core/src/Debugger.ts
+++ b/core/src/Debugger.ts
@@ -33,7 +33,7 @@ export default class Debugger {
     this.stageResultCache = [initialStage(inputSchema, inputItems)];
   }
 
-  stageResults(stageIndex: number): StageResult {
+  stageResult(stageIndex: number): StageResult {
    for (let idx = 0; idx < stageIndex + 1; idx++) {
      if (!this.stageResultCache[idx]) {
        const transformer = this.transformers[idx - 1];
diff --git a/core/test/Debugger.test.ts b/core/test/Debugger.test.ts
index c8d1673..6dc6d5c 100644
--- a/core/test/Debugger.test.ts
+++ b/core/test/Debugger.test.ts
@@ -39,14 +39,14 @@ describe('Transform Items', () => {
     const debug = new Debugger(1, parsedSchema, parsedItems, { fontMap: new Map(), pageViewports: [] }, transformers);
 
     expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
 
-    expect(debug.stageResults(0).schema).toEqual(parsedSchema.map((column) => ({ name: column })));
-    expect(debug.stageResults(1).schema).toEqual([
+    expect(debug.stageResult(0).schema).toEqual(parsedSchema.map((column) => ({ name: column })));
+    expect(debug.stageResult(1).schema).toEqual([
       ...parsedSchema.map((column) => ({ name: column, annotation: ColumnAnnotation.REMOVED })),
       { name: 'C', annotation: ColumnAnnotation.ADDED },
     ]);
-    expect(debug.stageResults(0).itemsUnpacked()).toEqual(parsedItems);
-    expect(debug.stageResults(1).itemsUnpacked()).toEqual(trans1Items);
+    expect(debug.stageResult(0).itemsUnpacked()).toEqual(parsedItems);
+    expect(debug.stageResult(1).itemsUnpacked()).toEqual(trans1Items);
   });
 
   test('Line Merge', async () => {
@@ -65,17 +65,17 @@
     const debug = new Debugger(1, parsedSchema, parsedItems, { fontMap: new Map(), pageViewports: [] }, transformers);
 
     expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
 
-    expect(debug.stageResults(0).schema).toEqual([{ name: 'id' }, { name: 'y' }]);
-    expect(debug.stageResults(1).schema).toEqual([
+    expect(debug.stageResult(0).schema).toEqual([{ name: 'id' }, { name: 'y' }]);
+    expect(debug.stageResult(1).schema).toEqual([
       { name: 'id' },
       { name: 'y', annotation: ColumnAnnotation.REMOVED },
       { name: 'line', annotation: ColumnAnnotation.ADDED },
     ]);
-    expect(debug.stageResults(0).itemsUnpacked()).toEqual(parsedItems);
-    expect(debug.stageResults(1).itemsUnpacked()).toEqual(trans1Items);
+    expect(debug.stageResult(0).itemsUnpacked()).toEqual(parsedItems);
+    expect(debug.stageResult(1).itemsUnpacked()).toEqual(trans1Items);
 
-    const lineMergingStage = debug.stageResults(1);
+    const lineMergingStage = debug.stageResult(1);
     const { changes, pages } = lineMergingStage;
 
     //verify item groups
@@ -103,12 +103,12 @@ test('Change inside of Line', async () => {
   const debug = new Debugger(1, parsedSchema, parsedItems, { fontMap: new Map(), pageViewports: [] }, transformers);
 
   expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']);
-  expect(debug.stageResults(0).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
-  expect(debug.stageResults(1).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
-  expect(debug.stageResults(0).itemsUnpacked()).toEqual(parsedItems);
-  expect(debug.stageResults(1).itemsUnpacked()).toEqual(trans1Items);
+  expect(debug.stageResult(0).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
+  expect(debug.stageResult(1).schema).toEqual([{ name: 'id' }, { name: 'line' }]);
+  expect(debug.stageResult(0).itemsUnpacked()).toEqual(parsedItems);
+  expect(debug.stageResult(1).itemsUnpacked()).toEqual(trans1Items);
 
-  const { changes, pages } = debug.stageResults(1);
+  const { changes, pages } = debug.stageResult(1);
 
   //verify item groups
   expect(pages[0].itemGroups.map((itemGroup) => changes.hasChanged(itemGroup.top))).toEqual([true, false]);
@@ -116,7 +116,7 @@ test('Change inside of Line', async () => {
   //verify unpacked items
   expect(
     debug
-      .stageResults(1)
+      .stageResult(1)
       .itemsUnpacked()
       .map((item) => changes.hasChanged(item)),
   ).toEqual([true, true, false, false]);
@@ -135,7 +135,7 @@ describe('build schemas', () => {
   function calculateSchema(inputSchema: string[], outputSchema: string[]): AnnotatedColumn[] {
     const transformers = [new TestTransformer('Trans1', {}, outputSchema, items)];
     const debug = new Debugger(1, inputSchema, items, { fontMap: new Map(), pageViewports: [] }, transformers);
-    return debug.stageResults(1).schema;
+    return debug.stageResult(1).schema;
   }
 
   test('Add', async () => {
diff --git a/core/test/Files.test.ts b/core/test/Files.test.ts
index 723c642..607f3a7 100644
--- a/core/test/Files.test.ts
+++ b/core/test/Files.test.ts
@@ -2,21 +2,34 @@ import { toMatchFile } from 'jest-file-snapshot';
 import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
 import * as fs from 'fs';
+import * as path from 'path';
+import * as http from 'http';
+import * as https from 'https';
 
 import PdfParser from 'src/PdfParser';
 import PdfPipeline from 'src/PdfPipeline';
 import { transformers } from 'src/index';
 import Debugger from 'src/Debugger';
 import Item from 'src/Item';
+import RemoveRepetitiveItems from 'src/transformer/RemoveRepetitiveItems';
+import StageResult from 'src/debug/StageResult';
 
 const parser = new PdfParser(pdfjs);
 const pipeline = new PdfPipeline(parser, transformers);
 
 const folder = '../examples';
 const files = fs.readdirSync(folder).filter((file) => file.endsWith('.pdf'));
+const urls = ['https://homepages.cwi.nl/~lex/files/dict.pdf'];
+const downloadCache = 'node_modules/.cache/files';
 
 expect.extend({ toMatchFile });
 
+// Test is for debugging purposes
+test.skip('Debug', async () => {
+  const data = fs.readFileSync(`${folder}/Adventures-Of-Sherlock-Holmes.pdf`);
+  await pipeline.execute(data, () => {});
+});
+
 describe.each(files)('Test %p', (file) => {
   const data = fs.readFileSync(`${folder}/${file}`);
@@ -26,15 +39,15 @@ describe.each(files)('Test %p', (file) => {
   test.each(transformers.map((t) => t.name).filter((name) => name !== 'Does nothing'))(
     'stage %p',
     (transformerName) => {
-      const stageResults = debug.stageResults(debug.stageNames.indexOf(transformerName));
+      const stageResult = debug.stageResult(debug.stageNames.indexOf(transformerName));
       const chunkedLines: string[][] = [[]];
       let resultIndex = 0;
       let collectedItems = 0;
 
-      stageResults.selectPages(true, true).forEach((page) => {
+      stageResult.selectPages(true, true).forEach((page) => {
         page.itemGroups.forEach((itemGroup) => {
-          const change = stageResults.changes.change(itemGroup.top);
-          if (change || stageResults.descriptor.debug?.showAll) {
+          const change = stageResult.changes.change(itemGroup.top);
+          if (change || stageResult.descriptor.debug?.showAll) {
             const item = itemGroup.top;
             const changeType = change?.constructor.name || 'none';
             chunkedLines[resultIndex].push(itemToString(debug.fontMap, item, changeType));
@@ -50,36 +63,70 @@ describe.each(files)('Test %p', (file) => {
       });
 
       // Global characteristics
-      let groupedItemCount = stageResults
-        .selectPages(false, true)
-        .reduce((itemCount, page) => itemCount + page.itemGroups.length, 0);
-      chunkedLines[0].unshift(
-        JSON.stringify(
-          {
-            pages: stageResults.pages.length,
-            items: stageResults.itemsUnpacked().length,
-            groupedItems: groupedItemCount,
-            changes: stageResults.changes.changeCount(),
-            schema: stageResults.schema,
-            // messages: stageResults.messages,
-          },
-          null,
-          2,
-        ),
-      );
+      chunkedLines[0].unshift(toHeader(stageResult));
 
       chunkedLines.forEach((lines, idx) => {
         const transformerResultAsString = lines.join('\n') || '{}';
-        const resultFolder = `${folder}/${file.substr(0, file.length - 4)}`;
-        const fileName = `${transformerName[0].toLowerCase() + transformerName.slice(1).replace(/\s/g, '')}`;
-        const fileIndex = chunkedLines.length > 1 ? `.${idx}` : '';
-        const resultFile = `${resultFolder}/${fileName}${fileIndex}.json`;
-        expect(transformerResultAsString).toMatchFile(resultFile);
+        expect(transformerResultAsString).toMatchFile(matchFilePath(file, transformerName, chunkedLines.length, idx));
       });
     },
   );
 });
+function matchFilePath(pdfFileName: string, transformerName: string, chunkCount = 1, chunkIndex = 0): string {
+  const pdfFileNameWithoutExtension = pdfFileName.substr(0, pdfFileName.length - 4);
+  const resultFileName = `${transformerName[0].toLowerCase() + transformerName.slice(1).replace(/\s/g, '')}`;
+  const fileIndex = chunkCount > 1 ? `.${chunkIndex}` : '';
+  return `${folder}/${pdfFileNameWithoutExtension}/${resultFileName}${fileIndex}.json`;
+}
+
+describe('Remove repetitive items from online resources', () => {
+  const transformerName = new RemoveRepetitiveItems().name;
+  test.each(urls)('URL %p', async (url) => {
+    console.log(url);
+    const { fileName, data } = await download(url);
+    const debug = await pipeline.debug(data, () => {});
+    const stageResult = debug.stageResult(debug.stageNames.indexOf(transformerName));
+    const pages = stageResult.selectPages(true, true);
+
+    const lines: string[] = [];
+    lines.push(toHeader(stageResult));
+
+    pages.forEach((page) =>
+      page.itemGroups.forEach((itemGroup) => {
+        const change = stageResult.changes.change(itemGroup.top);
+        if (change) {
+          const item = itemGroup.top;
+          const changeType = change?.constructor.name || 'none';
+          lines.push(itemToString(debug.fontMap, item, changeType));
+        }
+      }),
+    );
+
+    console.log(lines);
+    const transformerResultAsString = lines.join('\n') || '{}';
+    expect(transformerResultAsString).toMatchFile(matchFilePath(fileName, transformerName));
+  });
+});
+
+function toHeader(stageResult: StageResult): string {
+  let groupedItemCount = stageResult
+    .selectPages(false, true)
+    .reduce((itemCount, page) => itemCount + page.itemGroups.length, 0);
+  return JSON.stringify(
+    {
+      pages: stageResult.pages.length,
+      items: stageResult.itemsUnpacked().length,
+      groupedItems: groupedItemCount,
+      changes: stageResult.changes.changeCount(),
+      schema: stageResult.schema,
+      // messages: stageResult.messages,
+    },
+    null,
+    2,
+  );
+}
+
 function itemToString(fontMap: Map<string, object>, item: Item, changeType: string): string {
   const fontName: string | Array<string> = item.data['fontName'];
   let newFontName: string | Array<string> | undefined = undefined;
@@ -91,7 +138,7 @@ function itemToString(fontMap: Map<string, object>, item: Item, changeType: stri
     }
   }
   const transform: undefined | number[] = item.data['transform'];
-  let newTransform;
+  let newTransform: undefined | string[];
   if (transform) {
     newTransform = transform.map((num) => num.toFixed(2));
   }
@@ -105,3 +152,45 @@ function itemToString(fontMap: Map<string, object>, item: Item, changeType: stri
     transform: newTransform,
   });
 }
+
+async function download(url: string): Promise<{ fileName: string; data: Buffer }> {
+  const fileName = path.basename(new URL(url).pathname);
+  const localFilePath = `${downloadCache}/${fileName}`;
+  console.log(localFilePath);
+  if (!fs.existsSync(localFilePath)) {
+    fs.mkdirSync(downloadCache, { recursive: true });
+    await downloadToFile(url, localFilePath);
+  }
+  return {
+    fileName,
+    data: fs.readFileSync(localFilePath),
+  };
+}
+
+function downloadToFile(url: string, dest: string): Promise<void> {
+  const uri = new URL(url);
+  const pkg = url.toLowerCase().startsWith('https:') ? https : http;
+
+  return new Promise((resolve, reject) => {
+    pkg.get(uri.href).on('response', (res) => {
+      if (res.statusCode === 200) {
+        const file = fs.createWriteStream(dest, { flags: 'wx' });
+        res
+          .on('end', () => {
+            file.end();
+            resolve();
+          })
+          .on('error', (err) => {
+            file.destroy();
+            fs.unlink(dest, () => reject(err));
+          })
+          .pipe(file);
+      } else if (res.statusCode === 302 || res.statusCode === 301) {
+        // Recursively follow redirects, only a 200 will resolve.
+        downloadToFile(res.headers.location as string, dest).then(() => resolve());
+      } else {
+        reject(new Error(`Download request failed, response status: ${res.statusCode} ${res.statusMessage}`));
+      }
+    });
+  });
+}
diff --git a/examples/README.md b/examples/README.md
index c09ba91..1b4aa1c 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,10 +1,21 @@
-These PDFs are used in the parsers's test suite.
+# Test PDFs
 
-Self-generated files are:
+This folder contains PDFs for testing purposes and their parse results. Generally, there are three types of PDF test setups:
+
+1. Self-generated PDFs
+2. PDFs which have entered the `public domain` or have an otherwise permissive license like `Creative Commons SA`
+3. PDFs where the license is unclear
+
+For (1) and (2) we track the end result and all transformation steps.
+For (3) we only track the results of some transformation stages (those that don't leak too much of the content).
+
+## Self-generated PDFs
 
 - [ExamplePdf](ExamplePdf.pdf)
 
-All other PDFs are either entered `public domain` or have a otherwise permissive license like `Creative Commons SA`.
+## Included Public PDFs
+
+_(PDFs which have entered the `public domain` or have an otherwise permissive license like `Creative Commons SA`)_
 
 | File                                                                 | Source                                 | Author                                            | License Information       |
 | -------------------------------------------------------------------- | -------------------------------------- | ------------------------------------------------- | ------------------------- |
@@ -21,3 +32,15 @@ All other PDFs are either entered `public domain` or have a otherwise permissive
 | [The-War-of-the-Worlds](The-War-of-the-Worlds.pdf)                   | http://www.planetpdf.com/              | H.G Wells                                         | Public Domain             |
 | [Tragedy-Of-The-Commons](Tragedy-Of-The-Commons.pdf)                 | https://science.sciencemag.org         | Garrett Hardin                                    | Public Domain             |
 | [WoodUp](WoodUp.pdf)                                                 | https://bupress.unibz.it/              | Freie Universität Bozen-Bolzano / Giustino Tonon  | Creative Commons BY 4.0   |
+
+## PDFs not stored but partially tested
+
+- https://homepages.cwi.nl/~lex/files/dict.pdf
+
+## Known transformation problems
+
+_Tracks known problems with parsing and transforming certain PDFs._
+
+- `Remove Repetitive Elements`
+  - https://homepages.cwi.nl/~lex/files/dict.pdf
+  - Nothing gets detected because the page-number line contains the current chapter
diff --git a/examples/dict/removeRepetitiveItems.json b/examples/dict/removeRepetitiveItems.json
new file mode 100644
index 0000000..03015f5
--- /dev/null
+++ b/examples/dict/removeRepetitiveItems.json
@@ -0,0 +1,32 @@
+{
+  "pages": 221,
+  "items": 51638,
+  "groupedItems": 8465,
+  "changes": 0,
+  "schema": [
+    {
+      "name": "line"
+    },
+    {
+      "name": "x"
+    },
+    {
+      "name": "y"
+    },
+    {
+      "name": "str"
+    },
+    {
+      "name": "fontName"
+    },
+    {
+      "name": "dir"
+    },
+    {
+      "name": "width"
+    },
+    {
+      "name": "height"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/ui/src/debug/DebugView.svelte b/ui/src/debug/DebugView.svelte
index 3171403..cb2b735 100644
--- a/ui/src/debug/DebugView.svelte
+++ b/ui/src/debug/DebugView.svelte
@@ -18,7 +18,7 @@
   let groupingEnabled = true;
   let onlyRelevantItems = true;
 
-  $: stageResult = debug.stageResults($debugStage);
+  $: stageResult = debug.stageResult($debugStage);
   $: supportsGrouping = !!stageResult.descriptor?.debug?.itemMerger;
   $: supportsRelevanceFiltering = !stageResult.descriptor?.debug?.showAll;
   $: visiblePages = pageControl.selectPages(stageResult, onlyRelevantItems, groupingEnabled, $pinnedPageIndex);
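
For reference, a minimal sketch of how the debug pipeline exercised by these tests can be driven end to end. The imports and the `pipeline.debug()` / `Debugger.stageResult()` calls mirror the ones in `core/test/Files.test.ts` above; the `summarizeStages` helper and the example PDF path are illustrative assumptions, not code from the repository.

```ts
// Hypothetical helper (not part of the patch): print a one-line summary for every debug stage.
import * as fs from 'fs';
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
import PdfParser from 'src/PdfParser';
import PdfPipeline from 'src/PdfPipeline';
import { transformers } from 'src/index';

async function summarizeStages(pdfPath: string): Promise<void> {
  // Same pipeline wiring as the test file.
  const pipeline = new PdfPipeline(new PdfParser(pdfjs), transformers);
  const debug = await pipeline.debug(fs.readFileSync(pdfPath), () => {});
  debug.stageNames.forEach((name, index) => {
    // stageResult(index) lazily computes and caches each transformer stage.
    const stage = debug.stageResult(index);
    console.log(`${name}: ${stage.itemsUnpacked().length} items, ${stage.changes.changeCount()} changes`);
  });
}

// Example invocation against the self-generated test PDF listed in examples/README.md.
summarizeStages('../examples/ExamplePdf.pdf').catch(console.error);
```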
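A second, purely illustrative sketch relates to the `Remove Repetitive Elements` note in `examples/README.md`: if repetitive-line detection depended on a page line recurring with (near-)identical text on most pages, a page-number line that embeds the current chapter would never match across chapters. The footer strings and the exact-match heuristic below are made up for illustration and are not the actual `RemoveRepetitiveItems` algorithm.

```ts
// Illustration only: made-up page-number lines and a simplified heuristic,
// not the real RemoveRepetitiveItems logic.
const pageNumberLines = ['3  Preface', '17  Introduction', '42  Exploring the data'];

// Normalize digits away so differing page numbers alone don't break the comparison...
const normalized = pageNumberLines.map((line) => line.replace(/\d+/g, '#').trim());

// ...but the embedded chapter title still differs from page to page,
// so an exact-match check across pages finds no repetitive line to remove.
const repeatsEverywhere = normalized.every((line) => line === normalized[0]);
console.log(repeatsEverywhere); // false
```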