mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-03 03:19:32 +01:00
Improve removal
* Always compare in one direction
This commit is contained in:
parent
cd6689f141
commit
17290cf746
@ -6,7 +6,6 @@ import ItemTransformer from './ItemTransformer';
|
|||||||
import TransformContext from './TransformContext';
|
import TransformContext from './TransformContext';
|
||||||
import LineItemMerger from '../debug/LineItemMerger';
|
import LineItemMerger from '../debug/LineItemMerger';
|
||||||
import {
|
import {
|
||||||
count,
|
|
||||||
flatMap,
|
flatMap,
|
||||||
groupByLine,
|
groupByLine,
|
||||||
groupByPage,
|
groupByPage,
|
||||||
@ -21,11 +20,12 @@ const config = {
|
|||||||
// Max number of lines at top/bottom (per page) which are getting evaluated for eviction
|
// Max number of lines at top/bottom (per page) which are getting evaluated for eviction
|
||||||
maxNumberOffTopOrBottomLines: 3,
|
maxNumberOffTopOrBottomLines: 3,
|
||||||
|
|
||||||
// Minumum number of times in percent that the y has to appear as fringe element in a page
|
// How far (in y) an item may deviate from the absolute fringe elements (min/max y) before being disregarded.
|
||||||
minYOccurence: 0.6,
|
maxDistanceFromFringeElements:30,
|
||||||
|
|
||||||
// Max neighbour hops in both direction when checking for neighbour similarity
|
// Max neighbour taken (in one direction) for detecting neighbour similarity.
|
||||||
neighbourReach: 3,
|
// The chosen number might be more effective for PDFs with a strong odd/even page difference.
|
||||||
|
neighbourReach: 2,
|
||||||
|
|
||||||
minSimilarity: 0.8,
|
minSimilarity: 0.8,
|
||||||
};
|
};
|
||||||
@ -45,34 +45,33 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
|
|||||||
|
|
||||||
const uniqueYs = flatMap(pageExtracts, (extract) => extract.fringeLines)
|
const uniqueYs = flatMap(pageExtracts, (extract) => extract.fringeLines)
|
||||||
.map((line) => line.y)
|
.map((line) => line.y)
|
||||||
.filter(onlyUniques);
|
.filter(onlyUniques)
|
||||||
console.log(uniqueYs.sort((a, b) => a - b));
|
.sort((a, b) => a - b);
|
||||||
|
|
||||||
const numberOfPages = context.pageViewports.length;
|
// console.log('uniqueYs', uniqueYs);
|
||||||
|
|
||||||
const yToRemove = uniqueYs.filter((y) => {
|
const yToRemove = uniqueYs.filter((y) => {
|
||||||
// First check how often an element occurs on the given 'y'.
|
const yLines = pageExtracts
|
||||||
// Repetetive elements tend to be on the same y all the time or half the time.
|
.map((page) => page.lineByY(y))
|
||||||
const pageOccurence = count(pageExtracts, (extraxt) => extraxt.hasY(y));
|
.filter((line) => typeof line !== 'undefined') as Line[];
|
||||||
|
const texts = yLines.map((line) => line.text());
|
||||||
const simis = pageExtracts.map((extraxt, idx) => {
|
const similarities = flatMap(yLines, (line, idx) =>
|
||||||
const line = extraxt.lineByY(y);
|
adiacentLines(yLines, idx).map((adiacentLine) => calculateSimilarity(line, adiacentLine)),
|
||||||
if (line) {
|
);
|
||||||
const neighbours = neighbourLines(pageExtracts, idx, y);
|
|
||||||
const similarities = neighbours.map((nLine) => calculateSimilarity(line, nLine));
|
|
||||||
return median(similarities);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
});
|
|
||||||
|
|
||||||
// TODO more checks
|
// TODO more checks
|
||||||
|
// - exclude headlines (higher height, e.g art of speaking)
|
||||||
|
// - better odd/even handling (e.g war of worlds || dict)
|
||||||
// - same x structure
|
// - same x structure
|
||||||
// - contain chapter highlights
|
// - contain chapter highlights
|
||||||
// - contains rising number
|
// - contains rising number
|
||||||
|
|
||||||
return pageOccurence >= numberOfPages * config.minYOccurence && median(simis) >= config.minSimilarity;
|
// console.log('y' + y, texts, similarities, median(similarities));
|
||||||
|
return median(similarities) >= config.minSimilarity;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
//console.log('yToRemove', yToRemove);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
items: transformGroupedByPageAndLine(inputItems, (_, __, items) =>
|
items: transformGroupedByPageAndLine(inputItems, (_, __, items) =>
|
||||||
yToRemove.includes(yFromLine(items)) ? [] : items,
|
yToRemove.includes(yFromLine(items)) ? [] : items,
|
||||||
@ -86,30 +85,25 @@ function calculateSimilarity(line1: Line, line2: Line): number {
|
|||||||
return compareTwoStrings(line1.textWithoutNumbers(), line2.textWithoutNumbers());
|
return compareTwoStrings(line1.textWithoutNumbers(), line2.textWithoutNumbers());
|
||||||
}
|
}
|
||||||
|
|
||||||
function neighbourLines(pages: PageExtract[], pageIndex: number, y: number): Line[] {
|
function adiacentLines(lines: Line[], index: number): Line[] {
|
||||||
const neighbourLines: Line[] = [];
|
// Prefer to either collect x downstream OR x upstream neighbours (not a mix) in order to better catch odd/even page differences
|
||||||
|
let neighbours: Line[];
|
||||||
//Upstream
|
if (index + config.neighbourReach < lines.length) {
|
||||||
for (let index = pageIndex - 1; index > -1 && index >= pageIndex - config.neighbourReach; index--) {
|
neighbours = lines.slice(index + 1, index + config.neighbourReach + 1);
|
||||||
const neighbourLine = pages[index].lineByY(y);
|
} else if (index - config.neighbourReach >= 0) {
|
||||||
if (neighbourLine) {
|
neighbours = lines.slice(index - config.neighbourReach - 1, index - 1);
|
||||||
neighbourLines.push(neighbourLine);
|
} else {
|
||||||
}
|
neighbours = lines.filter((_, idx) => idx !== index);
|
||||||
}
|
}
|
||||||
|
|
||||||
//Downstream
|
return neighbours;
|
||||||
for (let index = pageIndex + 1; index < pages.length && index <= pageIndex + config.neighbourReach; index++) {
|
|
||||||
const neighbourLine = pages[index].lineByY(y);
|
|
||||||
if (neighbourLine) {
|
|
||||||
neighbourLines.push(neighbourLine);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return neighbourLines;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildExtracts(inputItems: Item[]): PageExtract[] {
|
function buildExtracts(inputItems: Item[]): PageExtract[] {
|
||||||
return groupByPage(inputItems).map((pageItems) => {
|
let bottomY = 999;
|
||||||
|
let topY = 0;
|
||||||
|
|
||||||
|
const pages = groupByPage(inputItems).map((pageItems) => {
|
||||||
const lines = groupByLine(pageItems)
|
const lines = groupByLine(pageItems)
|
||||||
.map((lineItems) => {
|
.map((lineItems) => {
|
||||||
const lineY = yFromLine(lineItems);
|
const lineY = yFromLine(lineItems);
|
||||||
@ -117,13 +111,36 @@ function buildExtracts(inputItems: Item[]): PageExtract[] {
|
|||||||
})
|
})
|
||||||
.sort((a, b) => a.y - b.y);
|
.sort((a, b) => a.y - b.y);
|
||||||
|
|
||||||
const numberOfFringeElements = Math.min(lines.length, config.maxNumberOffTopOrBottomLines);
|
// Keep globals up to date
|
||||||
const topN = lines.slice(0, numberOfFringeElements);
|
if (lines[0].y < bottomY) {
|
||||||
const lastN = lines.slice(lines.length - numberOfFringeElements, lines.length);
|
bottomY = lines[0].y;
|
||||||
|
}
|
||||||
|
if (lines[lines.length - 1].y > topY) {
|
||||||
|
topY = lines[lines.length - 1].y;
|
||||||
|
}
|
||||||
|
|
||||||
const fringeLines = [...topN, ...lastN].filter(onlyUniques);
|
// keep top and bottom fringes
|
||||||
|
const numberOfFringeElements = Math.min(lines.length, config.maxNumberOffTopOrBottomLines);
|
||||||
|
const bottomN = lines.slice(0, numberOfFringeElements);
|
||||||
|
const topN = lines.slice(lines.length - numberOfFringeElements, lines.length);
|
||||||
|
|
||||||
|
const fringeLines = [...bottomN, ...topN].filter(onlyUniques);
|
||||||
return new PageExtract(pageItems[0].page, fringeLines);
|
return new PageExtract(pageItems[0].page, fringeLines);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// console.log('bottom', bottomY);
|
||||||
|
// console.log('top', topY);
|
||||||
|
|
||||||
|
// Now that we know the global top and bottom y, we cut those y which are in the middle and not really on the fringes
|
||||||
|
const maxTopDistance = config.maxDistanceFromFringeElements;
|
||||||
|
const maxBottomDistance = config.maxDistanceFromFringeElements;
|
||||||
|
return pages.map(
|
||||||
|
(page) =>
|
||||||
|
new PageExtract(
|
||||||
|
page.page,
|
||||||
|
page.fringeLines.filter((line) => line.y <= bottomY + maxBottomDistance || line.y >= topY - maxTopDistance),
|
||||||
|
),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
function yFromLine(lineItems: Item[]): number {
|
function yFromLine(lineItems: Item[]): number {
|
||||||
|
Loading…
Reference in New Issue
Block a user