simplify code (and keep information) through flattening page lines

This commit is contained in:
Johannes Zillmann 2021-03-24 07:45:19 +01:00
parent 4c77274d16
commit a6a21c9ed2
2 changed files with 46 additions and 58 deletions

View File

@ -44,9 +44,9 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
} }
transform(context: TransformContext, inputItems: Item[]): ItemResult { transform(context: TransformContext, inputItems: Item[]): ItemResult {
const pageExtracts = buildExtracts(inputItems); const fringeLines = extractFringeLines(inputItems);
const fringeYs = flatMap(pageExtracts, (extract) => extract.fringeLines) const fringeYs = fringeLines
.map((line) => line.y) .map((line) => line.y)
.filter(onlyUniques) .filter(onlyUniques)
.sort(ascending); .sort(ascending);
@ -54,9 +54,7 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
// console.log('uniqueYs', uniqueYs); // console.log('uniqueYs', uniqueYs);
const yToRemove = fringeYs.filter((y) => { const yToRemove = fringeYs.filter((y) => {
const yLines = pageExtracts const yLines = fringeLines.filter((line) => line.y == y);
.map((page) => page.lineByY(y))
.filter((line) => typeof line !== 'undefined') as Line[];
if (yLines.length < 2) { if (yLines.length < 2) {
return false; return false;
@ -104,7 +102,7 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
let removalCount = 0; let removalCount = 0;
return { return {
items: transformGroupedByPageAndLine(inputItems, (_, __, lineItems) => { items: transformGroupedByPageAndLine(inputItems, (_, __, lineItems) => {
const itemsY = yFromLine(lineItems); const itemsY = yFromLineItems(lineItems);
if (fringeYs.includes(itemsY)) { if (fringeYs.includes(itemsY)) {
lineItems.forEach(context.trackEvaluation.bind(context)); lineItems.forEach(context.trackEvaluation.bind(context));
} }
@ -119,7 +117,7 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
} }
} }
function consecutiveNumbers(lines: Line[]): number { function consecutiveNumbers(lines: PageLine[]): number {
const allNumbersJoined = flatMap( const allNumbersJoined = flatMap(
lines lines
.map((line) => { .map((line) => {
@ -135,14 +133,14 @@ function consecutiveNumbers(lines: Line[]): number {
return compareTwoStrings(allNumbersJoined, regularNumbersJoined); return compareTwoStrings(allNumbersJoined, regularNumbersJoined);
} }
function textSimilarity(lines: Line[]): number { function textSimilarity(lines: PageLine[]): number {
const similarities = flatMap(lines, (line, idx) => const similarities = flatMap(lines, (line, idx) =>
adiacentLines(lines, idx).map((adiacentLine) => calculateSimilarity(line, adiacentLine)), adiacentLines(lines, idx).map((adiacentLine) => calculateSimilarity(line, adiacentLine)),
); );
return median(similarities); return median(similarities);
} }
function isAllNumbers(lines: Line[]): boolean { function isAllNumbers(lines: PageLine[]): boolean {
for (let index = 0; index < lines.length; index++) { for (let index = 0; index < lines.length; index++) {
const string = lines[index].text().trim(); const string = lines[index].text().trim();
const asNumber = Number(string); const asNumber = Number(string);
@ -153,13 +151,13 @@ function isAllNumbers(lines: Line[]): boolean {
return true; return true;
} }
function calculateSimilarity(line1: Line, line2: Line): number { function calculateSimilarity(line1: PageLine, line2: PageLine): number {
return compareTwoStrings(line1.textWithoutNumbers(), line2.textWithoutNumbers()); return compareTwoStrings(line1.textWithoutNumbers(), line2.textWithoutNumbers());
} }
function adiacentLines(lines: Line[], index: number): Line[] { function adiacentLines(lines: PageLine[], index: number): PageLine[] {
// Prefer to either collect x downstream OR x upstream neighbours (not a mix) in order to better catch odd/even page differences // Prefer to either collect x downstream OR x upstream neighbours (not a mix) in order to better catch odd/even page differences
let neighbours: Line[]; let neighbours: PageLine[];
if (index + config.neighbourReach < lines.length) { if (index + config.neighbourReach < lines.length) {
neighbours = lines.slice(index + 1, index + config.neighbourReach + 1); neighbours = lines.slice(index + 1, index + config.neighbourReach + 1);
} else if (index - config.neighbourReach >= 0) { } else if (index - config.neighbourReach >= 0) {
@ -171,34 +169,35 @@ function adiacentLines(lines: Line[], index: number): Line[] {
return neighbours; return neighbours;
} }
function buildExtracts(inputItems: Item[]): PageExtract[] { function extractFringeLines(inputItems: Item[]): PageLine[] {
let bottomY = 999; let bottomY = 999;
let topY = 0; let topY = 0;
const pages = groupByPage(inputItems).map((pageItems) => { const fringLines = flatMap(
const lines = groupByLine(pageItems) groupByPage(inputItems).map((pageItems) => {
const pageLines = groupByLine(pageItems)
.map((lineItems) => { .map((lineItems) => {
const lineY = yFromLine(lineItems); const lineY = yFromLineItems(lineItems);
return new Line(lineY, lineItems); return new PageLine(pageItems[0].page, lineY, lineItems);
}) })
.sort((a, b) => a.y - b.y); .sort((a, b) => a.y - b.y);
// Keep globals up to date // Keep globals up to date
if (lines[0].y < bottomY) { if (pageLines[0].y < bottomY) {
bottomY = lines[0].y; bottomY = pageLines[0].y;
} }
if (lines[lines.length - 1].y > topY) { if (pageLines[pageLines.length - 1].y > topY) {
topY = lines[lines.length - 1].y; topY = pageLines[pageLines.length - 1].y;
} }
// keep top and bottom fringes // keep only top and bottom fringes
const numberOfFringeElements = Math.min(lines.length, config.maxNumberOffTopOrBottomLines); const numberOfFringeElements = Math.min(pageLines.length, config.maxNumberOffTopOrBottomLines);
const bottomN = lines.slice(0, numberOfFringeElements); const bottomN = pageLines.slice(0, numberOfFringeElements);
const topN = lines.slice(lines.length - numberOfFringeElements, lines.length); const topN = pageLines.slice(pageLines.length - numberOfFringeElements, pageLines.length);
return [...bottomN, ...topN].filter(onlyUniques);
const fringeLines = [...bottomN, ...topN].filter(onlyUniques); }),
return new PageExtract(pageItems[0].page, fringeLines); (e) => e,
}); );
// console.log('bottom', bottomY); // console.log('bottom', bottomY);
// console.log('top', topY); // console.log('top', topY);
@ -206,36 +205,21 @@ function buildExtracts(inputItems: Item[]): PageExtract[] {
//Now that we know the global top and bottom y, we cut those y which are in the middle and not really on the fringes //Now that we know the global top and bottom y, we cut those y which are in the middle and not really on the fringes
const maxTopDistance = config.maxDistanceFromFringeElements; const maxTopDistance = config.maxDistanceFromFringeElements;
const maxBottomDistance = config.maxDistanceFromFringeElements; const maxBottomDistance = config.maxDistanceFromFringeElements;
return pages.map( return fringLines.filter((line) => line.y <= bottomY + maxBottomDistance || line.y >= topY - maxTopDistance);
(page) =>
new PageExtract(
page.page,
page.fringeLines.filter((line) => line.y <= bottomY + maxBottomDistance || line.y >= topY - maxTopDistance),
),
);
} }
function yFromLine(lineItems: Item[]): number { function yFromLineItems(lineItems: Item[]): number {
return Math.round(mostFrequent(lineItems, 'y') as number); return Math.round(mostFrequent(lineItems, 'y') as number);
} }
class PageExtract { /**
constructor(public page: number, public fringeLines: Line[]) {} * A number of Items on a line (~same y) on a page.
*/
hasY(y: number): boolean { class PageLine {
return this.fringeLines.findIndex((line) => line.y === y) >= 0;
}
lineByY(y: number): Line | undefined {
return this.fringeLines.find((line) => line.y === y);
}
}
class Line {
private _text: string | undefined; private _text: string | undefined;
private _textWithoutNumbers: string | undefined; private _textWithoutNumbers: string | undefined;
constructor(public y: number, public items: Item[]) {} constructor(public page: number, public y: number, public items: Item[]) {}
text(): string { text(): string {
if (!this._text) { if (!this._text) {

View File

@ -3,11 +3,15 @@ import PageViewport from '../parse/PageViewport';
import EvaluationTracker from './EvaluationTracker'; import EvaluationTracker from './EvaluationTracker';
export default class TransformContext { export default class TransformContext {
pageCount: number;
constructor( constructor(
public fontMap: Map<string, object>, public fontMap: Map<string, object>,
public pageViewports: PageViewport[], public pageViewports: PageViewport[],
private evaluations = new EvaluationTracker(), private evaluations = new EvaluationTracker(),
) {} ) {
this.pageCount = pageViewports.length;
}
trackEvaluation(item: Item) { trackEvaluation(item: Item) {
this.evaluations.trackEvaluation(item); this.evaluations.trackEvaluation(item);