Filter out some impossible page numbers

This commit is contained in:
Johannes Zillmann 2021-03-24 22:27:59 +01:00
parent a6a21c9ed2
commit ab40466ca8

View File

@ -28,8 +28,6 @@ const config = {
// Chosen number might be more effective for PDFs with a strong odd/even page difference. // Chosen number might be more effective for PDFs with a strong odd/even page difference.
neighbourReach: 2, neighbourReach: 2,
minSimilarity: 0.8,
minScore: 0.9, minScore: 0.9,
}; };
@ -86,8 +84,6 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
// '=', // '=',
// totalScore, // totalScore,
// ); // );
// console.log(y, 'numbers', allNumbers);
// console.log(y, 'regularNumbers', regularNumbers);
// TODO more checks // TODO more checks
// - exclude headlines (higher height, e.g art of speaking) // - exclude headlines (higher height, e.g art of speaking)
@ -121,14 +117,17 @@ function consecutiveNumbers(lines: PageLine[]): number {
const allNumbersJoined = flatMap( const allNumbersJoined = flatMap(
lines lines
.map((line) => { .map((line) => {
const match = line.text().match(/\d+/g); const numbersInLine = (line.text().match(/\d+/g) || []).map(Number);
return match?.map(Number) as number[]; return numbersInLine.filter((number) => number >= 0 && number <= line.page);
}) })
.filter((match) => typeof match !== 'undefined'), .filter((match) => typeof match !== 'undefined'),
(e) => e, (e) => e,
).join('-'); ).join('-');
const regularNumbersJoined = Array.from({ length: lines.length }, (_, i) => i + 1).join('-'); const regularNumbersJoined = Array.from({ length: lines.length }, (_, i) => i + 1).join('-');
// console.log(lines[0].y, 'numbers', allNumbersJoined);
// console.log(lines[0].y, 'regularNumbers', regularNumbersJoined);
//TODO OR... reduce (compare last with current: current == pre-1 gives 100 points, current > pre gives 50 points, otherwise 0 points and reset. Then sum up.) //TODO OR... reduce (compare last with current: current == pre-1 gives 100 points, current > pre gives 50 points, otherwise 0 points and reset. Then sum up.)
return compareTwoStrings(allNumbersJoined, regularNumbersJoined); return compareTwoStrings(allNumbersJoined, regularNumbersJoined);
} }