From d7d3502a256bf132a3d2f07d6fdda387b07b8efa Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Sun, 28 Mar 2021 10:21:26 +0200 Subject: [PATCH] Fix processing pdfs with no page numbers --- core/src/transformer/RemoveRepetitiveItems.ts | 10 +++--- core/test/Files.test.ts | 5 ++- examples/README.md | 3 ++ .../removeRepetitiveItems.json | 32 +++++++++++++++++++ 4 files changed, 45 insertions(+), 5 deletions(-) create mode 100644 examples/compressed.tracemonkey-pldi-09/removeRepetitiveItems.json diff --git a/core/src/transformer/RemoveRepetitiveItems.ts b/core/src/transformer/RemoveRepetitiveItems.ts index 4fb0325..6b62dfa 100644 --- a/core/src/transformer/RemoveRepetitiveItems.ts +++ b/core/src/transformer/RemoveRepetitiveItems.ts @@ -161,10 +161,12 @@ function filterOutIncompatibleVariant(options: PageNumber[], nextPageLines: Page let remainingOptions = [...options]; while (remainingOptions.length > 1 && index < nextPageLines.length) { const nextPageNumbers = possiblePageNumbers(nextPageLines[index]); - remainingOptions = remainingOptions.filter((option) => { - const maxDistance = nextPageNumbers[0].pageIndex - option.pageIndex; - return nextPageNumbers.find((nextPageNum) => nextPageNum.pageNumber - option.pageNumber <= maxDistance); - }); + if (nextPageNumbers.length > 0) { + remainingOptions = remainingOptions.filter((option) => { + const maxDistance = nextPageNumbers[0].pageIndex - option.pageIndex; + return nextPageNumbers.find((nextPageNum) => nextPageNum.pageNumber - option.pageNumber <= maxDistance); + }); + } index++; } return remainingOptions; diff --git a/core/test/Files.test.ts b/core/test/Files.test.ts index 1335427..4f556b6 100644 --- a/core/test/Files.test.ts +++ b/core/test/Files.test.ts @@ -19,7 +19,10 @@ const pipeline = new PdfPipeline(parser, transformers); const folder = '../examples'; const files = fs.readdirSync(folder).filter((file) => file.endsWith('.pdf')); -const urls = ['https://homepages.cwi.nl/~lex/files/dict.pdf']; +const urls = [ + 'https://homepages.cwi.nl/~lex/files/dict.pdf', + 'https://github.com/mozilla/pdf.js/raw/master/web/compressed.tracemonkey-pldi-09.pdf', +]; const downloadCache = 'node_modules/.cache/files'; expect.extend({ toMatchFile }); diff --git a/examples/README.md b/examples/README.md index 1f9c5f1..1650b09 100644 --- a/examples/README.md +++ b/examples/README.md @@ -36,6 +36,9 @@ _(PDFs which entered `public domain` or have a otherwise permissive license like ## PDFs not stored but paritally tested - https://homepages.cwi.nl/~lex/files/dict.pdf + - Page numbers with current chapter +- https://github.com/mozilla/pdf.js/raw/master/web/compressed.tracemonkey-pldi-09.pdf + - No page numbers # Known transformatino problems diff --git a/examples/compressed.tracemonkey-pldi-09/removeRepetitiveItems.json b/examples/compressed.tracemonkey-pldi-09/removeRepetitiveItems.json new file mode 100644 index 0000000..3a27574 --- /dev/null +++ b/examples/compressed.tracemonkey-pldi-09/removeRepetitiveItems.json @@ -0,0 +1,32 @@ +{ + "pages": 14, + "items": 2430, + "groupedItems": 1497, + "changes": 0, + "schema": [ + { + "name": "line" + }, + { + "name": "x" + }, + { + "name": "y" + }, + { + "name": "str" + }, + { + "name": "fontName" + }, + { + "name": "dir" + }, + { + "name": "width" + }, + { + "name": "height" + } + ] +} \ No newline at end of file