Fix processing pdfs with no page numbers

This commit is contained in:
Johannes Zillmann 2021-03-28 10:21:26 +02:00
parent 202da9b005
commit d7d3502a25
4 changed files with 45 additions and 5 deletions

View File

@ -161,10 +161,12 @@ function filterOutIncompatibleVariant(options: PageNumber[], nextPageLines: Page
let remainingOptions = [...options]; let remainingOptions = [...options];
while (remainingOptions.length > 1 && index < nextPageLines.length) { while (remainingOptions.length > 1 && index < nextPageLines.length) {
const nextPageNumbers = possiblePageNumbers(nextPageLines[index]); const nextPageNumbers = possiblePageNumbers(nextPageLines[index]);
remainingOptions = remainingOptions.filter((option) => { if (nextPageNumbers.length > 0) {
const maxDistance = nextPageNumbers[0].pageIndex - option.pageIndex; remainingOptions = remainingOptions.filter((option) => {
return nextPageNumbers.find((nextPageNum) => nextPageNum.pageNumber - option.pageNumber <= maxDistance); const maxDistance = nextPageNumbers[0].pageIndex - option.pageIndex;
}); return nextPageNumbers.find((nextPageNum) => nextPageNum.pageNumber - option.pageNumber <= maxDistance);
});
}
index++; index++;
} }
return remainingOptions; return remainingOptions;

View File

@ -19,7 +19,10 @@ const pipeline = new PdfPipeline(parser, transformers);
const folder = '../examples'; const folder = '../examples';
const files = fs.readdirSync(folder).filter((file) => file.endsWith('.pdf')); const files = fs.readdirSync(folder).filter((file) => file.endsWith('.pdf'));
const urls = ['https://homepages.cwi.nl/~lex/files/dict.pdf']; const urls = [
'https://homepages.cwi.nl/~lex/files/dict.pdf',
'https://github.com/mozilla/pdf.js/raw/master/web/compressed.tracemonkey-pldi-09.pdf',
];
const downloadCache = 'node_modules/.cache/files'; const downloadCache = 'node_modules/.cache/files';
expect.extend({ toMatchFile }); expect.extend({ toMatchFile });

View File

@ -36,6 +36,9 @@ _(PDFs which entered `public domain` or have an otherwise permissive license like
## PDFs not stored but partially tested ## PDFs not stored but partially tested
- https://homepages.cwi.nl/~lex/files/dict.pdf - https://homepages.cwi.nl/~lex/files/dict.pdf
- Page numbers with current chapter
- https://github.com/mozilla/pdf.js/raw/master/web/compressed.tracemonkey-pldi-09.pdf
- No page numbers
# Known transformation problems # Known transformation problems

View File

@ -0,0 +1,32 @@
{
"pages": 14,
"items": 2430,
"groupedItems": 1497,
"changes": 0,
"schema": [
{
"name": "line"
},
{
"name": "x"
},
{
"name": "y"
},
{
"name": "str"
},
{
"name": "fontName"
},
{
"name": "dir"
},
{
"name": "width"
},
{
"name": "height"
}
]
}