Improve removal

* Always compare in one direction
This commit is contained in:
Johannes Zillmann 2021-03-21 08:09:41 +01:00
parent cd6689f141
commit 17290cf746

View File

@ -6,7 +6,6 @@ import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext'; import TransformContext from './TransformContext';
import LineItemMerger from '../debug/LineItemMerger'; import LineItemMerger from '../debug/LineItemMerger';
import { import {
count,
flatMap, flatMap,
groupByLine, groupByLine,
groupByPage, groupByPage,
@ -21,11 +20,12 @@ const config = {
// Max number of lines at top/bottom (per page) which are getting evaluated for eviction // Max number of lines at top/bottom (per page) which are getting evaluated for eviction
maxNumberOffTopOrBottomLines: 3, maxNumberOffTopOrBottomLines: 3,
// Minimum number of times in percent that the y has to appear as fringe element in a page // From the absolute fringe elements (min/max y), how far an item's y can deviate before being disregarded.
minYOccurence: 0.6, maxDistanceFromFringeElements:30,
// Max neighbour hops in both direction when checking for neighbour similarity // Max neighbour taken (in one direction) for detecting neighbour similarity.
neighbourReach: 3, // Chosen number might be more effective for PDFs with a strong odd/even page difference.
neighbourReach: 2,
minSimilarity: 0.8, minSimilarity: 0.8,
}; };
@ -45,34 +45,33 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
const uniqueYs = flatMap(pageExtracts, (extract) => extract.fringeLines) const uniqueYs = flatMap(pageExtracts, (extract) => extract.fringeLines)
.map((line) => line.y) .map((line) => line.y)
.filter(onlyUniques); .filter(onlyUniques)
console.log(uniqueYs.sort((a, b) => a - b)); .sort((a, b) => a - b);
const numberOfPages = context.pageViewports.length; // console.log('uniqueYs', uniqueYs);
const yToRemove = uniqueYs.filter((y) => { const yToRemove = uniqueYs.filter((y) => {
// First check how often an element occurs on the given 'y'. const yLines = pageExtracts
// Repetetive elements tend to be on the same y all the time or half the time. .map((page) => page.lineByY(y))
const pageOccurence = count(pageExtracts, (extraxt) => extraxt.hasY(y)); .filter((line) => typeof line !== 'undefined') as Line[];
const texts = yLines.map((line) => line.text());
const simis = pageExtracts.map((extraxt, idx) => { const similarities = flatMap(yLines, (line, idx) =>
const line = extraxt.lineByY(y); adiacentLines(yLines, idx).map((adiacentLine) => calculateSimilarity(line, adiacentLine)),
if (line) { );
const neighbours = neighbourLines(pageExtracts, idx, y);
const similarities = neighbours.map((nLine) => calculateSimilarity(line, nLine));
return median(similarities);
}
return 0;
});
// TODO more checks // TODO more checks
// - exclude headlines (higher height, e.g art of speaking)
// - better odd/even handling (e.g war of worlds || dict)
// - same x structure // - same x structure
// - contain chapter highlights // - contain chapter highlights
// - contains rising number // - contains rising number
return pageOccurence >= numberOfPages * config.minYOccurence && median(simis) >= config.minSimilarity; // console.log('y' + y, texts, similarities, median(similarities));
return median(similarities) >= config.minSimilarity;
}); });
//console.log('yToRemove', yToRemove);
return { return {
items: transformGroupedByPageAndLine(inputItems, (_, __, items) => items: transformGroupedByPageAndLine(inputItems, (_, __, items) =>
yToRemove.includes(yFromLine(items)) ? [] : items, yToRemove.includes(yFromLine(items)) ? [] : items,
@ -86,30 +85,25 @@ function calculateSimilarity(line1: Line, line2: Line): number {
return compareTwoStrings(line1.textWithoutNumbers(), line2.textWithoutNumbers()); return compareTwoStrings(line1.textWithoutNumbers(), line2.textWithoutNumbers());
} }
function neighbourLines(pages: PageExtract[], pageIndex: number, y: number): Line[] { function adiacentLines(lines: Line[], index: number): Line[] {
const neighbourLines: Line[] = []; // Prefer to either collect x downstream OR x upstream neighbours (not a mix) in order to better catch odd/even page differences
let neighbours: Line[];
//Upstream if (index + config.neighbourReach < lines.length) {
for (let index = pageIndex - 1; index > -1 && index >= pageIndex - config.neighbourReach; index--) { neighbours = lines.slice(index + 1, index + config.neighbourReach + 1);
const neighbourLine = pages[index].lineByY(y); } else if (index - config.neighbourReach >= 0) {
if (neighbourLine) { neighbours = lines.slice(index - config.neighbourReach - 1, index - 1);
neighbourLines.push(neighbourLine); } else {
} neighbours = lines.filter((_, idx) => idx !== index);
} }
//Downstream return neighbours;
for (let index = pageIndex + 1; index < pages.length && index <= pageIndex + config.neighbourReach; index++) {
const neighbourLine = pages[index].lineByY(y);
if (neighbourLine) {
neighbourLines.push(neighbourLine);
}
}
return neighbourLines;
} }
function buildExtracts(inputItems: Item[]): PageExtract[] { function buildExtracts(inputItems: Item[]): PageExtract[] {
return groupByPage(inputItems).map((pageItems) => { let bottomY = 999;
let topY = 0;
const pages = groupByPage(inputItems).map((pageItems) => {
const lines = groupByLine(pageItems) const lines = groupByLine(pageItems)
.map((lineItems) => { .map((lineItems) => {
const lineY = yFromLine(lineItems); const lineY = yFromLine(lineItems);
@ -117,13 +111,36 @@ function buildExtracts(inputItems: Item[]): PageExtract[] {
}) })
.sort((a, b) => a.y - b.y); .sort((a, b) => a.y - b.y);
const numberOfFringeElements = Math.min(lines.length, config.maxNumberOffTopOrBottomLines); // Keep globals up to date
const topN = lines.slice(0, numberOfFringeElements); if (lines[0].y < bottomY) {
const lastN = lines.slice(lines.length - numberOfFringeElements, lines.length); bottomY = lines[0].y;
}
if (lines[lines.length - 1].y > topY) {
topY = lines[lines.length - 1].y;
}
const fringeLines = [...topN, ...lastN].filter(onlyUniques); // keep top and bottom fringes
const numberOfFringeElements = Math.min(lines.length, config.maxNumberOffTopOrBottomLines);
const bottomN = lines.slice(0, numberOfFringeElements);
const topN = lines.slice(lines.length - numberOfFringeElements, lines.length);
const fringeLines = [...bottomN, ...topN].filter(onlyUniques);
return new PageExtract(pageItems[0].page, fringeLines); return new PageExtract(pageItems[0].page, fringeLines);
}); });
// console.log('bottom', bottomY);
// console.log('top', topY);
//Now that we know the global top and bottom y, we cut those y which are in the middle and not really on the fringes
const maxTopDistance = config.maxDistanceFromFringeElements;
const maxBottomDistance = config.maxDistanceFromFringeElements;
return pages.map(
(page) =>
new PageExtract(
page.page,
page.fringeLines.filter((line) => line.y <= bottomY + maxBottomDistance || line.y >= topY - maxTopDistance),
),
);
} }
function yFromLine(lineItems: Item[]): number { function yFromLine(lineItems: Item[]): number {