mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-12-27 08:48:51 +01:00
Fix processing pdfs with no page numbers
This commit is contained in:
parent
202da9b005
commit
d7d3502a25
@ -161,10 +161,12 @@ function filterOutIncompatibleVariant(options: PageNumber[], nextPageLines: Page
|
||||
let remainingOptions = [...options];
|
||||
while (remainingOptions.length > 1 && index < nextPageLines.length) {
|
||||
const nextPageNumbers = possiblePageNumbers(nextPageLines[index]);
|
||||
remainingOptions = remainingOptions.filter((option) => {
|
||||
const maxDistance = nextPageNumbers[0].pageIndex - option.pageIndex;
|
||||
return nextPageNumbers.find((nextPageNum) => nextPageNum.pageNumber - option.pageNumber <= maxDistance);
|
||||
});
|
||||
if (nextPageNumbers.length > 0) {
|
||||
remainingOptions = remainingOptions.filter((option) => {
|
||||
const maxDistance = nextPageNumbers[0].pageIndex - option.pageIndex;
|
||||
return nextPageNumbers.find((nextPageNum) => nextPageNum.pageNumber - option.pageNumber <= maxDistance);
|
||||
});
|
||||
}
|
||||
index++;
|
||||
}
|
||||
return remainingOptions;
|
||||
|
@ -19,7 +19,10 @@ const pipeline = new PdfPipeline(parser, transformers);
|
||||
|
||||
const folder = '../examples';
|
||||
const files = fs.readdirSync(folder).filter((file) => file.endsWith('.pdf'));
|
||||
const urls = ['https://homepages.cwi.nl/~lex/files/dict.pdf'];
|
||||
const urls = [
|
||||
'https://homepages.cwi.nl/~lex/files/dict.pdf',
|
||||
'https://github.com/mozilla/pdf.js/raw/master/web/compressed.tracemonkey-pldi-09.pdf',
|
||||
];
|
||||
const downloadCache = 'node_modules/.cache/files';
|
||||
|
||||
expect.extend({ toMatchFile });
|
||||
|
@ -36,6 +36,9 @@ _(PDFs which entered `public domain` or have a otherwise permissive license like
|
||||
## PDFs not stored but partially tested
|
||||
|
||||
- https://homepages.cwi.nl/~lex/files/dict.pdf
|
||||
- Page numbers with current chapter
|
||||
- https://github.com/mozilla/pdf.js/raw/master/web/compressed.tracemonkey-pldi-09.pdf
|
||||
- No page numbers
|
||||
|
||||
# Known transformation problems
|
||||
|
||||
|
@ -0,0 +1,32 @@
|
||||
{
|
||||
"pages": 14,
|
||||
"items": 2430,
|
||||
"groupedItems": 1497,
|
||||
"changes": 0,
|
||||
"schema": [
|
||||
{
|
||||
"name": "line"
|
||||
},
|
||||
{
|
||||
"name": "x"
|
||||
},
|
||||
{
|
||||
"name": "y"
|
||||
},
|
||||
{
|
||||
"name": "str"
|
||||
},
|
||||
{
|
||||
"name": "fontName"
|
||||
},
|
||||
{
|
||||
"name": "dir"
|
||||
},
|
||||
{
|
||||
"name": "width"
|
||||
},
|
||||
{
|
||||
"name": "height"
|
||||
}
|
||||
]
|
||||
}
|
Loading…
Reference in New Issue
Block a user