mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-24 19:41:24 +02:00
Filter out some impossible page numbers
This commit is contained in:
parent
a6a21c9ed2
commit
ab40466ca8
@ -28,8 +28,6 @@ const config = {
|
|||||||
// Chosen number might be more effective for PDFs with a strong odd/even page difference.
|
// Chosen number might be more effective for PDFs with a strong odd/even page difference.
|
||||||
neighbourReach: 2,
|
neighbourReach: 2,
|
||||||
|
|
||||||
minSimilarity: 0.8,
|
|
||||||
|
|
||||||
minScore: 0.9,
|
minScore: 0.9,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -86,8 +84,6 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
|
|||||||
// '=',
|
// '=',
|
||||||
// totalScore,
|
// totalScore,
|
||||||
// );
|
// );
|
||||||
// console.log(y, 'numbers', allNumbers);
|
|
||||||
// console.log(y, 'regularNumbers', regularNumbers);
|
|
||||||
|
|
||||||
// TODO more checks
|
// TODO more checks
|
||||||
// - exclude headlines (higher height, e.g art of speaking)
|
// - exclude headlines (higher height, e.g art of speaking)
|
||||||
@ -121,14 +117,17 @@ function consecutiveNumbers(lines: PageLine[]): number {
|
|||||||
const allNumbersJoined = flatMap(
|
const allNumbersJoined = flatMap(
|
||||||
lines
|
lines
|
||||||
.map((line) => {
|
.map((line) => {
|
||||||
const match = line.text().match(/\d+/g);
|
const numbersInLine = (line.text().match(/\d+/g) || []).map(Number);
|
||||||
return match?.map(Number) as number[];
|
return numbersInLine.filter((number) => number >= 0 && number <= line.page);
|
||||||
})
|
})
|
||||||
.filter((match) => typeof match !== 'undefined'),
|
.filter((match) => typeof match !== 'undefined'),
|
||||||
(e) => e,
|
(e) => e,
|
||||||
).join('-');
|
).join('-');
|
||||||
const regularNumbersJoined = Array.from({ length: lines.length }, (_, i) => i + 1).join('-');
|
const regularNumbersJoined = Array.from({ length: lines.length }, (_, i) => i + 1).join('-');
|
||||||
|
|
||||||
|
// console.log(lines[0].y, 'numbers', allNumbersJoined);
|
||||||
|
// console.log(lines[0].y, 'regularNumbers', regularNumbersJoined);
|
||||||
|
|
||||||
//TODO OR... reduce (compare last with current == pre-1 100 points, current > pre 50 points, otherwise 0 points and reset. Then add them up.)
|
//TODO OR... reduce (compare last with current == pre-1 100 points, current > pre 50 points, otherwise 0 points and reset. Then add them up.)
|
||||||
return compareTwoStrings(allNumbersJoined, regularNumbersJoined);
|
return compareTwoStrings(allNumbersJoined, regularNumbersJoined);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user