Simplify code

This commit is contained in:
Johannes Zillmann 2021-03-24 23:08:36 +01:00
parent ab40466ca8
commit 4340acb758
2 changed files with 60 additions and 60 deletions

View File

@ -22,7 +22,7 @@ const config = {
maxNumberOffTopOrBottomLines: 3,
// How far (in y) an item may deviate from the absolute fringe elements (min/max y) before being disregarded.
maxDistanceFromFringeElements: 30,
maxDistanceFromFringeElements: 35,
// Max neighbour taken (in one direction) for detecting neighbour similarity.
// The chosen number might be more effective for PDFs with a strong odd/even page difference.
@ -42,7 +42,36 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
}
transform(context: TransformContext, inputItems: Item[]): ItemResult {
const fringeLines = extractFringeLines(inputItems);
const { minY, maxY } = inputItems.reduce(
({ minY, maxY }, item) => {
const y = item.data['y'];
return {
minY: Math.min(minY, y),
maxY: Math.max(maxY, y),
};
},
{ minY: 999, maxY: 0 },
);
// console.log('min', minY, 'max', maxY);
const bottomMaxY = minY + config.maxDistanceFromFringeElements;
const topMinY = maxY - config.maxDistanceFromFringeElements;
const fringeItems = inputItems.filter((item) => {
const y = item.data['y'];
return y <= bottomMaxY || y >= topMinY;
});
const fringeLines = flatMap(
groupByPage(fringeItems).map((pageItems) =>
groupByLine(pageItems)
.map((lineItems) => {
const lineY = yFromLineItems(lineItems);
return new PageLine(pageItems[0].page, lineY, lineItems);
})
.sort((a, b) => a.y - b.y),
),
(e) => e,
);
const fringeYs = fringeLines
.map((line) => line.y)
@ -58,17 +87,6 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
return false;
}
const allNumbersJoined = flatMap(
yLines
.map((line) => {
const match = line.text().match(/\d+/g);
return match?.map(Number) as number[];
})
.filter((match) => typeof match !== 'undefined'),
(e) => e,
).join('-');
const regularNumbersJoined = Array.from({ length: yLines.length }, (_, i) => i + 1).join('-');
//TODO Or... use reduce (compare last with current: current == pre-1 -> 100 points, current > pre -> 50 points, otherwise 0 points and reset. Then sum up the points.)
const consecutiveNumberScores = consecutiveNumbers(yLines);
const allNumberScore: number = isAllNumbers(yLines) ? 1 : 0;
@ -168,45 +186,6 @@ function adiacentLines(lines: PageLine[], index: number): PageLine[] {
return neighbours;
}
/**
 * Collects the lines located at the top and bottom fringes of the pages.
 *
 * Steps:
 *  1. Items are grouped by page and by line; every line becomes a `PageLine`
 *     and the lines of a page are sorted by ascending y.
 *  2. The smallest and the largest line y over all pages are determined.
 *  3. Per page, at most `config.maxNumberOffTopOrBottomLines` lines are kept
 *     from each end of the sorted list.
 *  4. Of those, only lines within `config.maxDistanceFromFringeElements` of the
 *     global smallest / largest y are returned.
 *
 * @param inputItems the items of all pages
 * @returns the fringe lines, in page order and sorted by ascending y within a page
 */
function extractFringeLines(inputItems: Item[]): PageLine[] {
  const linesPerPage = groupByPage(inputItems)
    .map((pageItems) =>
      groupByLine(pageItems)
        .map((lineItems) => new PageLine(pageItems[0].page, yFromLineItems(lineItems), lineItems))
        .sort((a, b) => a.y - b.y),
    )
    // A page without any line has no fringe; dropping it here avoids reading `.y` of undefined below.
    .filter((pageLines) => pageLines.length > 0);

  // Global extremes over all pages. Start from +/-Infinity instead of magic sentinels (999 / 0)
  // so that documents whose coordinates lie completely outside of [0, 999] are handled as well.
  let bottomY = Number.POSITIVE_INFINITY;
  let topY = Number.NEGATIVE_INFINITY;
  for (const pageLines of linesPerPage) {
    const lowest = pageLines[0].y;
    const highest = pageLines[pageLines.length - 1].y;
    if (lowest < bottomY) {
      bottomY = lowest;
    }
    if (highest > topY) {
      topY = highest;
    }
  }

  // Keep only the first and last N lines of every page.
  const fringeCandidates = flatMap(
    linesPerPage.map((pageLines) => {
      const numberOfFringeElements = Math.min(pageLines.length, config.maxNumberOffTopOrBottomLines);
      const bottomN = pageLines.slice(0, numberOfFringeElements);
      const topN = pageLines.slice(pageLines.length - numberOfFringeElements);
      // NOTE(review): presumably `onlyUniques` drops lines contained in both slices
      // (short pages where the two slices overlap) - confirm against its definition.
      return [...bottomN, ...topN].filter(onlyUniques);
    }),
    (e) => e,
  );

  // Now that we know the global top and bottom y, cut the lines which sit in the middle
  // of the page and are not really on the fringes.
  const maxTopDistance = config.maxDistanceFromFringeElements;
  const maxBottomDistance = config.maxDistanceFromFringeElements;
  return fringeCandidates.filter(
    (line) => line.y <= bottomY + maxBottomDistance || line.y >= topY - maxTopDistance,
  );
}
/**
 * Determines the y value representing a whole line.
 *
 * Takes the result of `mostFrequent(lineItems, 'y')` (presumably the y value shared
 * by most items of the line) and rounds it to the nearest integer.
 *
 * @param lineItems the items forming one line
 * @returns the rounded y of the line
 */
function yFromLineItems(lineItems: Item[]): number {
  const dominantY = mostFrequent(lineItems, 'y') as number;
  return Math.round(dominantY);
}

View File

@ -2,7 +2,7 @@
"pages": 140,
"items": 25968,
"groupedItems": 3294,
"changes": 40,
"changes": 170,
"schema": [
{
"name": "line"
@ -30,13 +30,34 @@
}
]
}
{"page":23,"change":"Removal","str":"The Life of God","line":0,"x":118.52775,"y":400.26462,"width":"66.38","height":"10.30","fontName":[null],"dir":["ltr"]}
{"page":13,"change":"Removal","str":"\\-iii Preface.","line":0,"x":63.50214,"y":400.98296999999997,"width":"50.42","height":"7.99","fontName":[null],"dir":["ltr"]}
{"page":24,"change":"Removal","str":"in the Soul of Man,","line":0,"x":84.04695,"y":396.96020999999996,"width":"77.72","height":"10.00","fontName":[null],"dir":["ltr"]}
{"page":27,"change":"Removal","str":"lO The Life of God","line":0,"x":42.95733,"y":399.25892999999996,"width":"76.43","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":28,"change":"Removal","str":"/;/ the Soul of Man. ii","line":0,"x":97.55193,"y":398.68424999999996,"width":"87.06","height":"10.18","fontName":[null],"dir":["ltr"]}
{"page":29,"change":"Removal","str":"12 The Life of God","line":0,"x":48.27312,"y":400.12095,"width":"75.00","height":"10.22","fontName":[null],"dir":["ltr"]}
{"page":35,"change":"Removal","str":"1","dir":"ltr","width":"2.87","height":"4.79","transform":["4.79","0.00","0.00","4.79","51.86","400.41"],"x":51.864869999999996,"y":400.40828999999997,"line":0}
{"page":35,"change":"Removal","str":"The Life of God","line":2,"x":117.66573,"y":399.54627,"width":"66.23","height":"10.30","fontName":[null],"dir":["ltr"]}
{"page":37,"change":"Removal","str":"20 The Life of God","line":0,"x":46.405409999999996,"y":399.97727999999995,"width":"76.15","height":"10.30","fontName":[null],"dir":["ltr"]}
{"page":38,"change":"Removal","str":"in the Soul of A fan. 21","line":0,"x":94.96587,"y":398.82792,"width":"88.79","height":"10.54","fontName":[null],"dir":["ltr"]}
{"page":42,"change":"Removal","str":"/;/ the Soul of Man. 25","line":0,"x":94.10385,"y":397.24755,"width":"87.78","height":"10.24","fontName":[null],"dir":["ltr"]}
{"page":50,"change":"Removal","str":"3","dir":"ltr","width":"3.59","height":"5.99","transform":["5.99","0.00","0.00","5.99","140.51","30.89"],"x":140.50925999999998,"y":30.889049999999997,"line":27}
{"page":51,"change":"Removal","str":"34 The Life of God","line":0,"x":52.29588,"y":399.40259999999995,"width":"77.30","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":62,"change":"Removal","str":"tn the Soul of Man. 45","line":0,"x":90.79943999999999,"y":398.68424999999996,"width":"88.07","height":"10.24","fontName":[null],"dir":["ltr"]}
{"page":63,"change":"Removal","str":"46 The Life of God","line":0,"x":47.55477,"y":400.69563,"width":"77.01","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":52,"change":"Removal","str":"in the Soul of Man. 35","line":0,"x":93.81651,"y":395.52351,"width":"87.35","height":"10.18","fontName":[null],"dir":["ltr"]}
{"page":67,"change":"Removal","str":"50 The Life of God","line":0,"x":51.577529999999996,"y":396.5292,"width":"76.00","height":"10.46","fontName":[null],"dir":["ltr"]}
{"page":68,"change":"Removal","str":"/;/ the Sotil of Man. 51","line":0,"x":91.66145999999999,"y":395.95452,"width":"86.35","height":"9.76","fontName":[null],"dir":["ltr"]}
{"page":69,"change":"Removal","str":"The Life of God","line":0,"x":121.68848999999999,"y":396.67287,"width":"66.66","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":72,"change":"Removal","str":"in the Soul of Man. 55","line":0,"x":95.82789,"y":397.10388,"width":"87.49","height":"10.18","fontName":[null],"dir":["ltr"]}
{"page":82,"change":"Removal","str":"5","dir":"ltr","width":"3.45","height":"5.75","transform":["5.75","0.00","0.00","5.75","140.08","25.43"],"x":140.07825,"y":25.429589999999997,"line":24}
{"page":103,"change":"Removal","str":"^6 The Life of God","line":0,"x":46.261739999999996,"y":399.25892999999996,"width":"77.15","height":"10.22","fontName":[null],"dir":["ltr"]}
{"page":109,"change":"Removal","str":"92 The Life of God","line":0,"x":51.146519999999995,"y":400.8393,"width":"76.58","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":87,"change":"Removal","str":"70 The Life of God","line":0,"x":48.27312,"y":396.5292,"width":"76.72","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":97,"change":"Removal","str":"8o The Life of God","line":0,"x":47.98578,"y":396.67287,"width":"77.01","height":"10.30","fontName":[null],"dir":["ltr"]}
{"page":107,"change":"Removal","str":"90 The Life of God","line":0,"x":49.70982,"y":399.54627,"width":"77.44","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":114,"change":"Removal","str":"tn the Soul of 3 fail. 97","line":0,"x":92.23613999999999,"y":397.24755,"width":"88.22","height":"9.34","fontName":[null],"dir":["ltr"]}
{"page":114,"change":"Removal","str":"7","dir":"ltr","width":"3.45","height":"5.75","transform":["5.75","0.00","0.00","5.75","139.22","30.75"],"x":139.21623,"y":30.745379999999997,"line":26}
{"page":115,"change":"Removal","str":"98 The Life of God","line":0,"x":52.58322,"y":396.81654,"width":"76.58","height":"10.30","fontName":[null],"dir":["ltr"]}
{"page":116,"change":"Removal","str":"in the Sotil of Alan. 99","line":0,"x":95.25321,"y":397.67856,"width":"88.36","height":"9.10","fontName":[null],"dir":["ltr"]}
{"page":117,"change":"Removal","str":"lOO The Life of God","line":0,"x":56.031299999999995,"y":396.81654,"width":"82.04","height":"10.38","fontName":[null],"dir":["ltr"]}
{"page":120,"change":"Removal","str":"in the Soul of Alan. 103","line":0,"x":95.54055,"y":397.24755,"width":"92.38","height":"9.10","fontName":[null],"dir":["ltr"]}
{"page":128,"change":"Removal","str":"in the Soul of j\\Ian. iii","line":0,"x":99.56331,"y":398.39691,"width":"92.23","height":"9.34","fontName":[null],"dir":["ltr"]}
{"page":130,"change":"Removal","str":"8","dir":"ltr","width":"3.59","height":"5.99","transform":["5.99","0.00","0.00","5.99","137.20","25.00"],"x":137.20485,"y":24.998579999999997,"line":24}
{"page":138,"change":"Removal","str":"in the Soul of Alan. 121","line":0,"x":108.03984,"y":396.5292,"width":"91.95","height":"9.22","fontName":[null],"dir":["ltr"]}
{"page":142,"change":"Removal","str":"in the Soul of Man. 125","line":0,"x":115.94169,"y":398.82792,"width":"92.09","height":"10.18","fontName":[null],"dir":["ltr"]}
{"page":143,"change":"Removal","str":"126 The Life of God","line":0,"x":36.92319,"y":396.96020999999996,"width":"81.89","height":"10.30","fontName":[null],"dir":["ltr"]}