More accurate page number detection

This commit is contained in:
Johannes Zillmann 2021-03-26 08:42:31 +01:00
parent 4340acb758
commit 0b096faa0c
5 changed files with 149 additions and 53 deletions

View File

@ -0,0 +1,21 @@
/**
 * Maps every element of `array` to a list and joins those lists into one flat array.
 *
 * @param array - The elements to transform.
 * @param func - Produces the list of results for an element; also receives the element's index.
 * @returns A new array holding the results of all `func` calls, in input order.
 */
export function flatMap<T, S>(array: T[], func: (element: T, idx: number) => S[]): S[] {
  const result: S[] = [];
  // Append into one accumulator: calling `concat` per element would re-copy
  // everything collected so far on each step (quadratic for long inputs).
  array.forEach((entry, idx) => {
    for (const mapped of func(entry, idx)) {
      result.push(mapped);
    }
  });
  return result;
}
/**
 * Partitions `array` into groups of elements that share the same key.
 *
 * Keys are compared the way `Map` compares them, so primitives group by value
 * and objects by identity. Groups appear in the order their key was first seen,
 * and the elements inside a group keep their input order.
 *
 * @param array - The elements to group.
 * @param groupKey - Computes the grouping key of an element.
 * @returns One array per distinct key.
 */
export function groupBy<T>(array: T[], groupKey: (element: T) => unknown): T[][] {
  // Keys may be any value (numbers, strings, objects), hence `unknown` rather than `object`.
  const groups = new Map<unknown, T[]>();
  array.forEach((element) => {
    const key = groupKey(element);
    const elementsInGroup = groups.get(key);
    if (elementsInGroup) {
      elementsInGroup.push(element);
    } else {
      groups.set(key, [element]);
    }
  });
  return Array.from(groups.values());
}
/**
 * Collapses an array of arrays into a single array, one level deep.
 *
 * @param array - The nested lists to merge.
 * @returns All inner elements, in their original order.
 */
export function flatten<T>(array: T[][]): T[] {
  const identity = (inner: T[]): T[] => inner;
  return flatMap(array, identity);
}

View File

@ -16,6 +16,7 @@ import {
transformGroupedByPageAndLine,
} from '../support/groupingUtils';
import { filterOutDigits } from '../support/stringFunctions';
import { flatten, groupBy } from '../support/functional';
const config = {
// Max number of lines at top/bottom (per page) which are getting evaluated for eviction
@ -53,9 +54,10 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
{ minY: 999, maxY: 0 },
);
// console.log('min', minY, 'max', maxY);
const bottomMaxY = minY + config.maxDistanceFromFringeElements;
const topMinY = maxY - config.maxDistanceFromFringeElements;
// console.log('bottomMaxY', bottomMaxY, 'topMinY', topMinY);
const fringeItems = inputItems.filter((item) => {
const y = item.data['y'];
return y <= bottomMaxY || y >= topMinY;
@ -73,6 +75,7 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
(e) => e,
);
const pageNumber = detectAPageNumber(fringeLines);
const fringeYs = fringeLines
.map((line) => line.y)
.filter(onlyUniques)
@ -82,28 +85,24 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
const yToRemove = fringeYs.filter((y) => {
const yLines = fringeLines.filter((line) => line.y == y);
if (yLines.length < 2) {
return false;
}
//TODO OR... reduce (compare last with current: == pre-1 gives 100 points, current > pre gives 50 points, otherwise 0 points and reset. Then add the points up.)
const consecutiveNumberScores = consecutiveNumbers(yLines);
const allNumberScore: number = isAllNumbers(yLines) ? 1 : 0;
const pageNumberScore: number = pageNumber ? calculatePageNumerScore(context.pageCount, pageNumber, yLines) : 0;
const textSimilarityScore: number = textSimilarity(yLines);
const totalScore = consecutiveNumberScores + allNumberScore + textSimilarityScore;
const totalScore = pageNumberScore + textSimilarityScore;
// console.log(
// y,
// yLines.map((l) => l.text()),
// consecutiveNumberScores,
// allNumberScore,
// pageNumberScore,
// textSimilarityScore,
// '=',
// totalScore,
// );
// TODO more checks
// - magnetic y
// - exclude headlines (higher height, e.g art of speaking)
// - better odd/even handling (e.g war of worlds || dict)
// - same x structure
@ -131,23 +130,70 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
}
}
function consecutiveNumbers(lines: PageLine[]): number {
const allNumbersJoined = flatMap(
lines
.map((line) => {
const numbersInLine = (line.text().match(/\d+/g) || []).map(Number);
return numbersInLine.filter((number) => number >= 0 && number <= line.page);
})
.filter((match) => typeof match !== 'undefined'),
(e) => e,
).join('-');
const regularNumbersJoined = Array.from({ length: lines.length }, (_, i) => i + 1).join('-');
/**
 * Scores how consistently the given lines carry the page number expected for their page.
 *
 * The detected `pageNumber` fixes the offset between a page's index and its printed
 * number. A line counts as a hit when one of the numbers in its text equals the
 * line's own page index plus that offset.
 *
 * @param pageCount - Total number of pages in the document.
 * @param pageNumber - The detected reference page number.
 * @param lines - The lines to check.
 * @returns Number of hits divided by the smaller of `lines.length` and `pageCount + offset`.
 */
function calculatePageNumerScore(pageCount: number, pageNumber: PageNumber, lines: PageLine[]): number {
  const offset = pageNumber.pageNumber - pageNumber.pageIndex;
  let hits = 0;
  for (const line of lines) {
    const expectedNumber = line.page + offset;
    if (extractNumbers(line.text()).includes(expectedNumber)) {
      hits++;
    }
  }
  const bestPossibleHits = Math.min(pageCount + offset, lines.length);
  return hits / bestPossibleHits;
}
// console.log(lines[0].y, 'numbers', allNumbersJoined);
// console.log(lines[0].y, 'regularNumbers', regularNumbersJoined);
/**
 * Tries to pin down one unambiguous page-number candidate.
 *
 * Takes the number candidates of the page group in the middle of `lines` and
 * checks them against the page groups that follow it.
 *
 * @param lines - Lines to search for page numbers.
 * @returns The single remaining candidate, or `undefined` when zero or several remain.
 */
function detectAPageNumber(lines: PageLine[]): PageNumber | undefined {
// Bucket the lines per page and order the buckets by ascending page.
const linesByPage = groupBy(lines, (line) => line.page).sort((a, b) => a[0].page - b[0].page);
// NOTE(review): Math.round(length / 2) equals `length` for a single page group (and 0 for none),
// so linesByPage[pageIndexInTheMiddle] is undefined in both cases — confirm callers never pass
// that few pages, otherwise guard here.
const pageIndexInTheMiddle = Math.round(linesByPage.length / 2);
const possiblePageNumbersForMiddle = possiblePageNumbers(linesByPage[pageIndexInTheMiddle]);
// Only the page groups after the middle one are used to rule candidates out.
const remainingOptions = filterOutIncompatibleVariant(
possiblePageNumbersForMiddle,
linesByPage.slice(pageIndexInTheMiddle + 1, linesByPage.length),
);
//TODO do the same filtering upstream !?
//TODO OR... reduce (compare last with current: == pre-1 gives 100 points, current > pre gives 50 points, otherwise 0 points and reset. Then add the points up.)
// NOTE(review): allNumbersJoined / regularNumbersJoined are not defined in this function, and this
// return would make everything below unreachable — presumably a leftover from the removed
// consecutiveNumbers(); confirm against the actual file.
return compareTwoStrings(allNumbersJoined, regularNumbersJoined);
// Accept the result only when exactly one candidate is left.
if (remainingOptions.length == 1) {
return remainingOptions[0];
}
return undefined;
}
/**
 * Narrows page-number candidates by checking them against the numbers found on following pages.
 *
 * Pages are consulted in order until at most one candidate is left or the pages are
 * exhausted. A candidate survives a page when that page holds a number that exceeds
 * the candidate's number by no more than the distance between the two pages.
 *
 * @param options - The candidates to narrow down; the array itself is not modified.
 * @param nextPageLines - Lines of the following pages, one array per page.
 * @returns The candidates still compatible with every consulted page.
 */
function filterOutIncompatibleVariant(options: PageNumber[], nextPageLines: PageLine[][]): PageNumber[] {
  let remainingOptions = [...options];
  for (let index = 0; remainingOptions.length > 1 && index < nextPageLines.length; index++) {
    const nextPageNumbers = possiblePageNumbers(nextPageLines[index]);
    if (nextPageNumbers.length === 0) {
      // A page without any number candidate carries no evidence either way; skip it
      // instead of dereferencing a missing first element.
      continue;
    }
    // All candidates of one page share the same page index, so the first one is representative.
    const nextPageIndex = nextPageNumbers[0].pageIndex;
    remainingOptions = remainingOptions.filter((option) => {
      const maxDistance = nextPageIndex - option.pageIndex;
      return nextPageNumbers.some((nextPageNum) => nextPageNum.pageNumber - option.pageNumber <= maxDistance);
    });
  }
  return remainingOptions;
}
/** A number found in a line's text that is a candidate for the printed page number. */
interface PageNumber {
/** The `page` value of the line the number was found in. */
pageIndex: number;
/** The numeric value extracted from the line's text. */
pageNumber: number;
/** The `y` value of the line the number was found in. */
y: number;
}
/**
 * Collects every number on the given lines that could be a page number.
 *
 * Per line, a number qualifies when it is non-negative and no greater than
 * `line.page + 1`; the qualifying numbers of a line are then passed through `onlyUniques`.
 *
 * @param lines - The lines to scan.
 * @returns One candidate per qualifying number, carrying the line's page and y.
 */
function possiblePageNumbers(lines: PageLine[]): PageNumber[] {
  const candidatesPerLine = lines.map((line) => {
    const plausibleNumbers = extractNumbers(line.text()).filter(
      (value) => value >= 0 && value <= line.page + 1,
    );
    return plausibleNumbers.filter(onlyUniques).map((value) => {
      return { pageIndex: line.page, pageNumber: value, y: line.y };
    });
  });
  return flatten(candidatesPerLine);
}
/**
 * Pulls every run of consecutive digits out of `text` and converts each run to a number.
 *
 * @param text - The text to scan.
 * @returns The numbers in order of appearance; empty when the text holds no digits.
 */
function extractNumbers(text: string): number[] {
  const digitRuns = text.match(/\d+/g);
  if (!digitRuns) {
    return [];
  }
  return digitRuns.map((run) => Number(run));
}
function textSimilarity(lines: PageLine[]): number {
@ -157,17 +203,6 @@ function textSimilarity(lines: PageLine[]): number {
return median(similarities);
}
/**
 * Tells whether the trimmed text of every line converts to a number.
 *
 * Conversion uses `Number(...)`, so an empty or whitespace-only text (which converts to 0)
 * counts as a number as well.
 *
 * @param lines - The lines to inspect.
 * @returns `false` as soon as one line's text converts to `NaN`, otherwise `true`.
 */
function isAllNumbers(lines: PageLine[]): boolean {
  return lines.every((line) => {
    const trimmedText = line.text().trim();
    return !isNaN(Number(trimmedText));
  });
}
/**
 * Compares two lines by their text with the numbers taken out.
 *
 * @param line1 - First line.
 * @param line2 - Second line.
 * @returns The result of `compareTwoStrings` for the two number-free texts.
 */
function calculateSimilarity(line1: PageLine, line2: PageLine): number {
  const firstText = line1.textWithoutNumbers();
  const secondText = line2.textWithoutNumbers();
  return compareTwoStrings(firstText, secondText);
}

View File

@ -0,0 +1,35 @@
import Item from 'src/Item';
import { flatMap, flatten, groupBy } from 'src/support/functional';
import { items } from 'test/testItems';
// flatMap: empty input, identity over nested arrays, and projection of a nested property.
// The callback is synchronous, so it is not declared `async`.
test('flatMap', () => {
  expect(flatMap([], (e) => e)).toEqual([]);
  expect(flatMap([[1, 2], [3], [4, 5, 6]], (e) => e)).toEqual([1, 2, 3, 4, 5, 6]);
  expect(flatMap([{ x: [1, 2] }, { x: [3] }, { x: [4, 5, 6] }], (e) => e.x)).toEqual([1, 2, 3, 4, 5, 6]);
});
// flatten: empty input and one level of nesting.
// The callback is synchronous, so it is not declared `async`.
test('flatten', () => {
  expect(flatten([])).toEqual([]);
  expect(flatten([[1, 2], [3], [4, 5, 6]])).toEqual([1, 2, 3, 4, 5, 6]);
});
// groupBy: empty input, grouping primitives by identity, and grouping objects by a property.
// The callback is synchronous, so it is not declared `async`.
test('groupBy', () => {
  expect(groupBy([], (e) => e)).toEqual([]);
  expect(groupBy([1, 2, 1, 3, 2, 4, 4], (e) => e)).toEqual([[1, 1], [2, 2], [3], [4, 4]]);
  expect(
    groupBy(
      [
        { k: 'a', v: 1 },
        { k: 'a', v: 2 },
        { k: 'b', v: 3 },
      ],
      (e) => e.k,
    ),
  ).toEqual([
    [
      { k: 'a', v: 1 },
      { k: 'a', v: 2 },
    ],
    [{ k: 'b', v: 3 }],
  ]);
});

View File

@ -2,7 +2,7 @@
"pages": 140,
"items": 25968,
"groupedItems": 3294,
"changes": 170,
"changes": 192,
"schema": [
{
"name": "line"
@ -31,7 +31,8 @@
]
}
{"page":13,"change":"Removal","str":"\\-iii Preface.","line":0,"x":63.50214,"y":400.98296999999997,"width":"50.42","height":"7.99","fontName":[null],"dir":["ltr"]}
{"page":24,"change":"Removal","str":"in the Soul of Man,","line":0,"x":84.04695,"y":396.96020999999996,"width":"77.72","height":"10.00","fontName":[null],"dir":["ltr"]}
{"page":15,"change":"Removal","str":"Contents.","dir":"ltr","width":"40.95","height":"7.58","transform":["7.58","0.00","0.00","7.58","152.29","405.15"],"x":152.2902,"y":405.14939999999996,"line":0}
{"page":25,"change":"Removal","str":"The Life of Gcd","line":0,"x":120.6828,"y":403.28168999999997,"width":"66.37","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":27,"change":"Removal","str":"lO The Life of God","line":0,"x":42.95733,"y":399.25892999999996,"width":"76.43","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":28,"change":"Removal","str":"/;/ the Soul of Man. ii","line":0,"x":97.55193,"y":398.68424999999996,"width":"87.06","height":"10.18","fontName":[null],"dir":["ltr"]}
{"page":29,"change":"Removal","str":"12 The Life of God","line":0,"x":48.27312,"y":400.12095,"width":"75.00","height":"10.22","fontName":[null],"dir":["ltr"]}
@ -39,25 +40,28 @@
{"page":35,"change":"Removal","str":"The Life of God","line":2,"x":117.66573,"y":399.54627,"width":"66.23","height":"10.30","fontName":[null],"dir":["ltr"]}
{"page":37,"change":"Removal","str":"20 The Life of God","line":0,"x":46.405409999999996,"y":399.97727999999995,"width":"76.15","height":"10.30","fontName":[null],"dir":["ltr"]}
{"page":38,"change":"Removal","str":"in the Soul of A fan. 21","line":0,"x":94.96587,"y":398.82792,"width":"88.79","height":"10.54","fontName":[null],"dir":["ltr"]}
{"page":42,"change":"Removal","str":"/;/ the Soul of Man. 25","line":0,"x":94.10385,"y":397.24755,"width":"87.78","height":"10.24","fontName":[null],"dir":["ltr"]}
{"page":39,"change":"Removal","str":"22 The Life of God","line":0,"x":47.69844,"y":401.98866,"width":"75.86","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":40,"change":"Removal","str":"/;/ ihc So III of Man. 23","line":0,"x":93.24183,"y":403.28168999999997,"width":"85.77","height":"10.12","fontName":[null],"dir":["ltr"]}
{"page":41,"change":"Removal","str":"24 The Life of God","line":0,"x":47.12376,"y":401.84499,"width":"76.87","height":"10.30","fontName":[null],"dir":["ltr"]}
{"page":43,"change":"Removal","str":"26 The Life of God","line":0,"x":58.33002,"y":403.56903,"width":"76.57","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":44,"change":"Removal","str":"in the Soul of Man. 27","line":0,"x":96.54624,"y":402.56334,"width":"86.78","height":"10.12","fontName":[null],"dir":["ltr"]}
{"page":45,"change":"Removal","str":"28 The Life of God","line":0,"x":53.58891,"y":402.99435,"width":"76.58","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":46,"change":"Removal","str":"in the Soul of Man. 29","line":0,"x":91.37411999999999,"y":404.00003999999996,"width":"88.21","height":"10.18","fontName":[null],"dir":["ltr"]}
{"page":47,"change":"Removal","str":"30 The Life of God","line":0,"x":53.87625,"y":403.85636999999997,"width":"77.29","height":"10.30","fontName":[null],"dir":["ltr"]}
{"page":48,"change":"Removal","str":"in the Soul of Man. 31","line":0,"x":91.80512999999999,"y":404.28738,"width":"88.50","height":"10.24","fontName":[null],"dir":["ltr"]}
{"page":49,"change":"Removal","str":"32 The Lifo of God","line":0,"x":53.30157,"y":403.56903,"width":"76.58","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":50,"change":"Removal","str":"3","dir":"ltr","width":"3.59","height":"5.99","transform":["5.99","0.00","0.00","5.99","140.51","30.89"],"x":140.50925999999998,"y":30.889049999999997,"line":27}
{"page":52,"change":"Removal","str":"in the Soul of Man. 35","line":0,"x":93.81651,"y":395.52351,"width":"87.35","height":"10.18","fontName":[null],"dir":["ltr"]}
{"page":67,"change":"Removal","str":"50 The Life of God","line":0,"x":51.577529999999996,"y":396.5292,"width":"76.00","height":"10.46","fontName":[null],"dir":["ltr"]}
{"page":68,"change":"Removal","str":"/;/ the Sotil of Man. 51","line":0,"x":91.66145999999999,"y":395.95452,"width":"86.35","height":"9.76","fontName":[null],"dir":["ltr"]}
{"page":69,"change":"Removal","str":"The Life of God","line":0,"x":121.68848999999999,"y":396.67287,"width":"66.66","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":72,"change":"Removal","str":"in the Soul of Man. 55","line":0,"x":95.82789,"y":397.10388,"width":"87.49","height":"10.18","fontName":[null],"dir":["ltr"]}
{"page":82,"change":"Removal","str":"5","dir":"ltr","width":"3.45","height":"5.75","transform":["5.75","0.00","0.00","5.75","140.08","25.43"],"x":140.07825,"y":25.429589999999997,"line":24}
{"page":87,"change":"Removal","str":"70 The Life of God","line":0,"x":48.27312,"y":396.5292,"width":"76.72","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":97,"change":"Removal","str":"8o The Life of God","line":0,"x":47.98578,"y":396.67287,"width":"77.01","height":"10.30","fontName":[null],"dir":["ltr"]}
{"page":102,"change":"Removal","str":"in the Soul of Man, 85","line":0,"x":98.12661,"y":404.86206,"width":"88.21","height":"10.18","fontName":[null],"dir":["ltr"]}
{"page":104,"change":"Removal","str":"in the Sold oj Man. 87","line":0,"x":98.41395,"y":402.70700999999997,"width":"86.35","height":"10.24","fontName":[null],"dir":["ltr"]}
{"page":105,"change":"Removal","str":"The Life of God","line":0,"x":114.07397999999999,"y":402.13232999999997,"width":"66.66","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":106,"change":"Removal","str":"111 the Sold of Man. 89","line":0,"x":94.10385,"y":403.42535999999996,"width":"88.79","height":"10.30","fontName":[null],"dir":["ltr"]}
{"page":107,"change":"Removal","str":"90 The Life of God","line":0,"x":49.70982,"y":399.54627,"width":"77.44","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":108,"change":"Removal","str":"in the Sotd of Man. 91","line":0,"x":93.24183,"y":401.70131999999995,"width":"87.21","height":"10.06","fontName":[null],"dir":["ltr"]}
{"page":114,"change":"Removal","str":"tn the Soul of 3 fail. 97","line":0,"x":92.23613999999999,"y":397.24755,"width":"88.22","height":"9.34","fontName":[null],"dir":["ltr"]}
{"page":114,"change":"Removal","str":"7","dir":"ltr","width":"3.45","height":"5.75","transform":["5.75","0.00","0.00","5.75","139.22","30.75"],"x":139.21623,"y":30.745379999999997,"line":26}
{"page":115,"change":"Removal","str":"98 The Life of God","line":0,"x":52.58322,"y":396.81654,"width":"76.58","height":"10.30","fontName":[null],"dir":["ltr"]}
{"page":116,"change":"Removal","str":"in the Sotil of Alan. 99","line":0,"x":95.25321,"y":397.67856,"width":"88.36","height":"9.10","fontName":[null],"dir":["ltr"]}
{"page":117,"change":"Removal","str":"lOO The Life of God","line":0,"x":56.031299999999995,"y":396.81654,"width":"82.04","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":120,"change":"Removal","str":"in the Soul of Alan. 103","line":0,"x":95.54055,"y":397.24755,"width":"92.38","height":"9.10","fontName":[null],"dir":["ltr"]}
{"page":128,"change":"Removal","str":"in the Soul of j\\Ian. iii","line":0,"x":99.56331,"y":398.39691,"width":"92.23","height":"9.34","fontName":[null],"dir":["ltr"]}
{"page":130,"change":"Removal","str":"8","dir":"ltr","width":"3.59","height":"5.99","transform":["5.99","0.00","0.00","5.99","137.20","25.00"],"x":137.20485,"y":24.998579999999997,"line":24}
{"page":138,"change":"Removal","str":"in the Soul of Alan. 121","line":0,"x":108.03984,"y":396.5292,"width":"91.95","height":"9.22","fontName":[null],"dir":["ltr"]}
{"page":142,"change":"Removal","str":"in the Soul of Man. 125","line":0,"x":115.94169,"y":398.82792,"width":"92.09","height":"10.18","fontName":[null],"dir":["ltr"]}
{"page":143,"change":"Removal","str":"126 The Life of God","line":0,"x":36.92319,"y":396.96020999999996,"width":"81.89","height":"10.30","fontName":[null],"dir":["ltr"]}
{"page":142,"change":"Removal","str":"in the Soul of Man. 125","line":0,"x":115.94169,"y":398.82792,"width":"92.09","height":"10.18","fontName":[null],"dir":["ltr"]}

View File

@ -24,7 +24,7 @@ _(PDFs which entered `public domain` or have a otherwise permissive license like
| [Closed-Syllables](Closed-Syllables.pdf) | ? | Susan Jones | Creative Commons BY 4.0 |
| [Flash-Masques-Temperature](Flash-Masques-Temperature.pdf) | https://www.techtera.org/ | ? | Creative Commons BY 4.0 |
| [Grammar-Matters](Grammar-Matters.pdf) | ? | Debbie Kuhlmann | Creative Commons BY 4.0 |
| [Life-Of-God-In-Soul-Of-Man](lLife-Of-God-In-Soul-Of-Man.pdf) | https://archive.org/ | Henry Scougal | Public Domain |
| [Life-Of-God-In-Soul-Of-Man](Life-Of-God-In-Soul-Of-Man.pdf) | https://archive.org/ | Henry Scougal | Public Domain |
| [Safe-Communication](Safe-Communication.pdf) | https://www.england.nhs.uk/ | Nicola Davey & Ali Cole | Creative Commons BY 4.0 |
| [St-Mary-Witney-Social-Audit](St-Mary-Witney-Social-Audit.pdf) | https://catrionarobertson.com/ | Catriona Robertson | Creative Commons BY 4.0 |
| [The-Art-of-Public-Speaking](The-Art-of-Public-Speaking.pdf) | http://www.gutenberg.org/ebooks/16317 | Dale Carnagey, J. Berg Esenwein | Project Gutenberg License |
@ -42,5 +42,6 @@ _(PDFs which entered `public domain` or have a otherwise permissive license like
_Tracks known problems with parsing and transforming certain PDFs._
- `Remove Repetitive Elements`
- https://homepages.cwi.nl/~lex/files/dict.pdf
- Nothing gets detected because the page-number line contains the current chapter
- [Life-Of-God-In-Soul-Of-Man](Life-Of-God-In-Soul-Of-Man.pdf)
- page numbers are often cryptic text (e.g. `lO` instead of `10`)
- high variance in Y