Fix previous commit and re-use page mapping

This commit is contained in:
Johannes Zillmann 2021-03-29 07:24:20 +02:00
parent 388e8cc6b1
commit 898af7bbc8
21 changed files with 125 additions and 117 deletions

View File

@ -16,3 +16,7 @@ export function toCharcodes(text: string): number[] {
/**
 * Returns `text` with every digit character removed.
 *
 * Characters are obtained through `toCharcodes` and classified with `isDigit`.
 * The result is assembled one char code at a time rather than by spreading all
 * codes into a single `String.fromCharCode(...)` call: spreading a very long
 * array into call arguments can exceed the engine's argument limit and throw
 * a RangeError. For inputs that worked before, the output is unchanged.
 *
 * @param text the string to strip digits from
 * @returns the remaining characters, in their original order
 */
export function filterOutDigits(text: string): string {
  return toCharcodes(text)
    .filter((code) => !isDigit(code))
    .map((code) => String.fromCharCode(code))
    .join('');
}
/**
 * Collects every run of consecutive decimal digits in `text` and converts each
 * run to a number, preserving the order of appearance.
 *
 * @param text the string to scan
 * @returns the numbers found; an empty array when `text` contains no digits
 */
export function extractNumbers(text: string): number[] {
  const digitRuns = text.match(/\d+/g);
  if (digitRuns === null) {
    return [];
  }
  return digitRuns.map((run) => Number(run));
}

View File

@ -4,8 +4,23 @@ import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext';
import FontType from '../FontType';
import GlobalDefinition from './GlobalDefinition';
import PageMapping from '../PageMapping';
import PageFactorFinder from '../support/PageFactorFinder';
import { groupByPage, onlyUniques } from '../support/groupingUtils';
import { flatten } from '../support/functional';
import { extractNumbers } from '../support/stringFunctions';
// Global definitions whose values are published by CalculateStatistics in its result,
// so that other transformers can read them from the transform context
// (RemoveRepetitiveItems, for instance, reads MIN_Y, MAX_Y and PAGE_MAPPING).
// Smallest / largest x coordinate seen across all items.
export const MIN_X = new GlobalDefinition<number>('minX');
export const MAX_X = new GlobalDefinition<number>('maxX');
// Smallest / largest y coordinate seen across all items.
export const MIN_Y = new GlobalDefinition<number>('minY');
export const MAX_Y = new GlobalDefinition<number>('maxY');
// Largest item height seen across all items.
export const MAX_HEIGHT = new GlobalDefinition<number>('maxHeight');
// Result of the page number detection (see parsePageMapping).
export const PAGE_MAPPING = new GlobalDefinition<PageMapping>('pageMapping');
// Tuning values for the statistics calculation.
const config = {
// Maximum distance an item may have from the overall min/max x or y
// and still be considered a fringe item (a candidate for carrying a page number).
maxDistanceToFringe: 50,
};
export default class CalculateStatistics extends ItemTransformer {
constructor() {
@ -30,11 +45,21 @@ export default class CalculateStatistics extends ItemTransformer {
const heightToOccurrence = {};
const fontToOccurrence = {};
let maxHeight = 0;
let maxHeightFont;
let maxHeightFont: string;
let minX = 999;
let maxX = 0;
let minY = 999;
let maxY = 0;
items.forEach((inputItems) => {
const itemHeight = inputItems.data['height'];
const itemFont = inputItems.data['fontName'];
items.forEach((item) => {
const itemHeight = item.data['height'];
const itemFont = item.data['fontName'];
const x = item.data['x'];
const y = item.data['y'];
minX = Math.min(minX, x);
maxX = Math.max(maxX, x);
minY = Math.min(minY, y);
maxY = Math.max(maxY, y);
heightToOccurrence[itemHeight] = heightToOccurrence[itemHeight] ? heightToOccurrence[itemHeight] + 1 : 1;
fontToOccurrence[itemFont] = fontToOccurrence[itemFont] ? fontToOccurrence[itemFont] + 1 : 1;
if (itemHeight > maxHeight) {
@ -46,6 +71,10 @@ export default class CalculateStatistics extends ItemTransformer {
const mostUsedHeight = parseInt(getMostUsedKey(heightToOccurrence));
const mostUsedFont = getMostUsedKey(fontToOccurrence);
const groupedByPage = groupByPage(items);
const pageMapping = parsePageMapping(groupedByPage, minX, maxX, minY, maxY);
console.log(pageMapping);
// Parse line distances
const distanceToOccurrence = {};
@ -85,7 +114,14 @@ export default class CalculateStatistics extends ItemTransformer {
return {
items: items,
globals: [MAX_HEIGHT.value(maxHeight)],
globals: [
MAX_HEIGHT.value(maxHeight),
MIN_X.value(minX),
MAX_X.value(maxX),
MIN_Y.value(minY),
MAX_Y.value(maxY),
PAGE_MAPPING.value(pageMapping),
],
// globals2: {
// mostUsedHeight: mostUsedHeight,
// mostUsedFont: mostUsedFont,
@ -103,6 +139,35 @@ export default class CalculateStatistics extends ItemTransformer {
}
}
/**
 * Tries to work out how page indexes relate to printed page numbers.
 *
 * For each page, only the items lying within `config.maxDistanceToFringe` of one of the
 * overall extremes (`minX`, `maxX`, `minY`, `maxY`) are considered; the numbers contained
 * in their text are handed to a `PageFactorFinder` as candidates.
 *
 * @param groupedByPage the items, one array per page
 * @param minX smallest x coordinate across all items
 * @param maxX largest x coordinate across all items
 * @param minY smallest y coordinate across all items
 * @param maxY largest y coordinate across all items
 * @returns a mapping built from the detected factor and flagged as detected, or a mapping
 *   with factor 0 flagged as not detected when the finder returns `undefined`
 */
function parsePageMapping(
  groupedByPage: Item[][],
  minX: number,
  maxX: number,
  minY: number,
  maxY: number,
): PageMapping {
  // An item is a fringe item when it is close enough to any of the four extremes.
  const isFringeItem = (item: Item): boolean => {
    const x = item.data['x'];
    const y = item.data['y'];
    const nearLeft = x <= minX + config.maxDistanceToFringe;
    const nearRight = x >= maxX - config.maxDistanceToFringe;
    const nearBottom = y <= minY + config.maxDistanceToFringe;
    const nearTop = y >= maxY - config.maxDistanceToFringe;
    return nearLeft || nearRight || nearBottom || nearTop;
  };

  const finder = new PageFactorFinder();
  const pageFactor = finder.find(
    groupedByPage,
    (items) => {
      const pageIndex = items[0].page;
      const fringeItems = items.filter((item: Item) => isFringeItem(item));
      return { index: pageIndex, numbers: possiblePageNumbers(fringeItems) };
    },
    { sampleCount: 20, minFulfillment: 0.8 },
  );

  if (typeof pageFactor === 'undefined') {
    return new PageMapping(0, false);
  }
  return new PageMapping(pageFactor, true);
}
function getMostUsedKey(keyToOccurrence): any {
var maxOccurence = 0;
var maxKey: string | undefined;
@ -138,3 +203,16 @@ function getFormatType(
return FontType.BOLD;
}
}
/**
 * Gathers the numbers that occur in the text (`data['str']`) of the given items.
 *
 * Numbers are extracted item by item; within a single item negative values are dropped
 * and duplicates are removed via `onlyUniques`. The per-item results are then flattened
 * into one list.
 *
 * @param items the items whose text is scanned
 * @returns all candidate numbers of all items in a single array
 */
function possiblePageNumbers(items: Item[]): number[] {
  // An upper bound relative to the page index (`<= page + 1`) is intentionally not applied here.
  const numbersPerItem = items.map((item) =>
    extractNumbers(item.data['str'])
      .filter((candidate) => candidate >= 0)
      .filter(onlyUniques),
  );
  return flatten(numbersPerItem);
}

View File

@ -5,6 +5,7 @@ import ItemResult from '../ItemResult';
import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext';
import LineItemMerger from '../debug/LineItemMerger';
import { MIN_Y, MAX_Y, PAGE_MAPPING } from './CacluclateStatistics';
import {
ascending,
flatMap,
@ -16,16 +17,9 @@ import {
transformGroupedByPageAndLine,
} from '../support/groupingUtils';
import { filterOutDigits } from '../support/stringFunctions';
import { flatten, groupBy } from '../support/functional';
import { MIN_Y, MAX_Y } from './CacluclateStatistics';
import GlobalDefinition from './GlobalDefinition';
export const PAGE_FACTOR = new GlobalDefinition<string>('pageFactor');
import { extractNumbers } from '../support/stringFunctions';
const config = {
// Max number of lines at top/bottom (per page) which are getting evaluated for eviction
maxNumberOffTopOrBottomLines: 3,
// How far (in y) an item may deviate from the absolute fringe elements (min/max y) before being disregarded.
maxDistanceFromFringeElements: 35,
@ -49,6 +43,7 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
transform(context: TransformContext, inputItems: Item[]): ItemResult {
const minY = context.getGlobal(MIN_Y);
const maxY = context.getGlobal(MAX_Y);
const pageMapping = context.getGlobal(PAGE_MAPPING);
const bottomMaxY = minY + config.maxDistanceFromFringeElements;
const topMinY = maxY - config.maxDistanceFromFringeElements;
// console.log('bottomMaxY', bottomMaxY, 'topMinY', topMinY);
@ -70,9 +65,6 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
(e) => e,
);
const pageNumber = detectAPageNumber(fringeLines);
const globuly = pageNumber ? `${pageNumber.pageNumber - pageNumber.pageIndex}` : 'n/a';
const fringeYs = fringeLines
.map((line) => line.y)
.filter(onlyUniques)
@ -86,7 +78,9 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
return false;
}
const pageNumberScore: number = pageNumber ? calculatePageNumerScore(context.pageCount, pageNumber, yLines) : 0;
const pageNumberScore: number = pageMapping.detectedOnPage
? calculatePageNumerScore(context.pageCount, pageMapping.pageFactor, yLines)
: 0;
const textSimilarityScore: number = textSimilarity(yLines);
const totalScore = pageNumberScore + textSimilarityScore;
// console.log(
@ -123,80 +117,17 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
return lineItems;
}),
messages: [`Filtered out ${removalCount} items with y == ${yToRemove.join('||')}`],
globals: [PAGE_FACTOR.value(globuly)],
};
}
}
function calculatePageNumerScore(pageCount: number, pageNumber: PageNumber, lines: PageLine[]): number {
const pageNumberFactor = pageNumber.pageNumber - pageNumber.pageIndex;
const maxPageNumbers = pageCount + pageNumberFactor;
const linesWithPageNumbers = lines.filter((line) =>
extractNumbers(line.text()).includes(line.page + pageNumberFactor),
).length;
/**
 * Scores how consistently the given lines contain the page number expected for their page.
 *
 * The number expected on a line is `line.page + pageFactor`. The count of lines containing
 * their expected number is divided by the smaller of `pageCount + pageFactor` and
 * `lines.length`.
 *
 * @param pageCount page count of the document, as supplied by the caller
 * @param pageFactor offset added to a line's page index to obtain its expected page number
 * @param lines the lines to inspect
 * @returns the share of lines carrying their expected page number; 0 when the denominator
 *   is not positive (no lines, or `pageCount + pageFactor <= 0`)
 */
function calculatePageNumerScore(pageCount: number, pageFactor: number, lines: PageLine[]): number {
  const maxPageNumbers = pageCount + pageFactor;
  const denominator = Math.min(maxPageNumbers, lines.length);
  if (denominator <= 0) {
    // Dividing here would yield NaN, Infinity or a negative score; treat it as "no evidence".
    return 0;
  }
  const linesWithPageNumbers = lines.filter((line) =>
    extractNumbers(line.text()).includes(line.page + pageFactor),
  ).length;
  return linesWithPageNumbers / denominator;
}
/**
 * Tries to identify a single unambiguous page number among the given lines.
 *
 * The lines are grouped by page and ordered by page index. The candidates of a page in the
 * middle are then narrowed down using the pages that follow it.
 *
 * @param lines the lines to search for page numbers
 * @returns the one remaining candidate, or `undefined` when there are no lines or when
 *   zero or several candidates remain
 */
function detectAPageNumber(lines: PageLine[]): PageNumber | undefined {
  const linesByPage = groupBy(lines, (line) => line.page).sort((a, b) => a[0].page - b[0].page);
  if (linesByPage.length === 0) {
    return undefined;
  }

  // Math.round(length / 2) equals `length` for a single page, which is out of range,
  // so the index is clamped to the last page.
  const pageIndexInTheMiddle = Math.min(Math.round(linesByPage.length / 2), linesByPage.length - 1);
  const possiblePageNumbersForMiddle = possiblePageNumbers(linesByPage[pageIndexInTheMiddle]);
  const remainingOptions = filterOutIncompatibleVariant(
    possiblePageNumbersForMiddle,
    linesByPage.slice(pageIndexInTheMiddle + 1, linesByPage.length),
  );

  //TODO do the same filtering upstream !?
  if (remainingOptions.length == 1) {
    return remainingOptions[0];
  }
  return undefined;
}
/**
 * Narrows a list of page number candidates by checking them against the following pages.
 *
 * Pages are visited in order for as long as more than one candidate is left. A page without
 * any number is skipped. Otherwise a candidate is kept only if some number on that page
 * exceeds the candidate's number by no more than the page index distance between the
 * page's first number and the candidate.
 *
 * @param options the initial candidates (not modified)
 * @param nextPageLines the lines of the following pages, one array per page
 * @returns the candidates that were not ruled out
 */
function filterOutIncompatibleVariant(options: PageNumber[], nextPageLines: PageLine[][]): PageNumber[] {
  let candidates = [...options];
  for (let pageIdx = 0; pageIdx < nextPageLines.length; pageIdx++) {
    if (candidates.length <= 1) {
      break;
    }
    const followingNumbers = possiblePageNumbers(nextPageLines[pageIdx]);
    if (followingNumbers.length === 0) {
      continue;
    }
    candidates = candidates.filter((candidate) => {
      const maxDistance = followingNumbers[0].pageIndex - candidate.pageIndex;
      return followingNumbers.some((following) => following.pageNumber - candidate.pageNumber <= maxDistance);
    });
  }
  return candidates;
}
// A number found in a line that is a candidate for being a printed page number.
interface PageNumber {
// page index of the line the number was found in
pageIndex: number;
// the number as extracted from the line's text
pageNumber: number;
// y coordinate of the line the number was found in
y: number;
}
/**
 * Lists the page number candidates contained in the given lines.
 *
 * For every line the numbers in its text are extracted; a number is kept when it lies
 * between 0 and `line.page + 1` (inclusive). Duplicates within a line are dropped via
 * `onlyUniques`. Each survivor is recorded together with the line's page index and y.
 *
 * @param lines the lines to scan
 * @returns the candidates of all lines in a single array
 */
function possiblePageNumbers(lines: PageLine[]): PageNumber[] {
  const candidatesPerLine = lines.map((line) => {
    const plausibleNumbers = extractNumbers(line.text())
      .filter((num) => num >= 0 && num <= line.page + 1)
      .filter(onlyUniques);
    return plausibleNumbers.map((num) => ({
      pageIndex: line.page,
      pageNumber: num,
      y: line.y,
    }));
  });
  return flatten(candidatesPerLine);
}
/**
 * Finds every run of consecutive digits in `text` and returns the runs as numbers,
 * in the order they appear. Returns an empty array when there is no digit at all.
 */
function extractNumbers(text: string): number[] {
  const digitRuns = text.match(/\d+/g);
  return digitRuns ? digitRuns.map((run) => Number(run)) : [];
}
function textSimilarity(lines: PageLine[]): number {
const similarities = flatMap(lines, (line, idx) =>
adiacentLines(lines, idx).map((adiacentLine) => calculateSimilarity(line, adiacentLine)),

View File

@ -58,6 +58,8 @@ test('distraction free - defered', () => {
{ index: 5, numbers: [4] },
{ index: 6, numbers: [5] },
{ index: 7, numbers: [6] },
{ index: 8, numbers: [7] },
{ index: 9, numbers: [8] },
];
expect(finder.find(containers, extractor)).toEqual(-1);
@ -88,6 +90,8 @@ test('distraction loaden - defered', () => {
{ index: 5, numbers: [4, 5, 65, 8] },
{ index: 6, numbers: [5, 9] },
{ index: 7, numbers: [6] },
{ index: 8, numbers: [27, 7, 19] },
{ index: 9, numbers: [-4, 2016, 8] },
];
expect(finder.find(containers, extractor)).toEqual(-1);

View File

@ -1,7 +1,14 @@
import { filterOutDigits } from 'src/support/stringFunctions';
import { filterOutDigits, extractNumbers } from 'src/support/stringFunctions';
// filterOutDigits removes digit characters and leaves everything else untouched.
test('filterOutDigits', async () => {
  const cases: Array<[string, string]> = [
    ['', ''],
    ['a b c', 'a b c'],
    ['a1b 2c 3', 'ab c '],
  ];
  cases.forEach(([input, expected]) => {
    expect(filterOutDigits(input)).toEqual(expected);
  });
});
// extractNumbers returns every run of digits as a number, in order of appearance.
test('extractNumbers', async () => {
  const cases: Array<[string, number[]]> = [
    ['', []],
    ['a b c', []],
    ['a1b 2c 3', [1, 2, 3]],
    ['a12 21 304', [12, 21, 304]],
  ];
  cases.forEach(([input, expected]) => {
    expect(extractNumbers(input)).toEqual(expected);
  });
});

View File

@ -38,8 +38,7 @@
"pageMapping": {
"pageFactor": -1,
"detectedOnPage": true
},
"pageFactor": "-1"
}
}
}
{"page":3,"change":"Removal","str":"2","dir":"ltr","width":"5.45","height":"10.91","transform":["10.91","0.00","0.00","10.91","294.43","95.28"],"fontName":"KKLGKN+NimbusRomNo9L-Regu","x":294.428,"y":95.28300000000016,"line":13}

View File

@ -38,8 +38,7 @@
"pageMapping": {
"pageFactor": 0,
"detectedOnPage": true
},
"pageFactor": "0"
}
}
}
{"page":1,"change":"Removal","str":"1","dir":"ltr","width":"5.98","height":"11.96","transform":["11.96","0.00","0.00","11.96","294.17","95.55"],"fontName":"FZVLIH+NimbusRomNo9L-Regu","x":294.167,"y":95.545,"line":14}

View File

@ -38,8 +38,7 @@
"pageMapping": {
"pageFactor": 1,
"detectedOnPage": true
},
"pageFactor": "n/a"
}
}
}
{"page":0,"change":"Removal","str":"Closed syllable word lists 1","line":0,"x":420.79,"y":745.56,"width":"119.31","height":"11.04","fontName":["ABCDEE+Calibri"],"dir":["ltr"]}

View File

@ -38,8 +38,7 @@
"pageMapping": {
"pageFactor": 1,
"detectedOnPage": true
},
"pageFactor": "1"
}
}
}
{"page":0,"change":"Removal","str":"\u0000 1","line":5,"x":294,"y":45,"width":"6.67","height":"12.00","fontName":["QACXPP+Helvetica","JBRMKS+Helvetica"],"dir":["ltr"]}

View File

@ -38,8 +38,7 @@
"pageMapping": {
"pageFactor": 0,
"detectedOnPage": false
},
"pageFactor": "n/a"
}
}
}
{"page":0,"change":"Removal","str":"s h a l F - M a s q u e","line":4,"x":37.1206,"y":758.8381,"width":"99.35","height":"17.64","fontName":["NRVUEW+Futura-Light"],"dir":["ltr"]}

View File

@ -38,8 +38,7 @@
"pageMapping": {
"pageFactor": 1,
"detectedOnPage": true
},
"pageFactor": "1"
}
}
}
{"page":0,"change":"Removal","str":"1","dir":"ltr","width":"4.08","height":"8.04","transform":["8.04","0.00","0.00","8.04","304.01","22.68"],"fontName":"NTKUYH+Calibri","x":304.01,"y":22.6801,"line":0}

View File

@ -38,8 +38,7 @@
"pageMapping": {
"pageFactor": -17,
"detectedOnPage": true
},
"pageFactor": "-17"
}
}
}
{"page":13,"change":"Removal","str":"\\-iii Preface.","line":0,"x":63.50214,"y":400.98296999999997,"width":"50.42","height":"7.99","fontName":[null],"dir":["ltr"]}

View File

@ -38,8 +38,7 @@
"pageMapping": {
"pageFactor": 1,
"detectedOnPage": true
},
"pageFactor": "1"
}
}
}
{"page":1,"change":"Removal","str":"Quality Improvement Clinic Ltd. P a g e | 2 August 2015","line":0,"x":99.264,"y":30.84,"width":"261.24","height":"11.04","fontName":[null],"dir":["ltr"]}

View File

@ -38,8 +38,7 @@
"pageMapping": {
"pageFactor": 1,
"detectedOnPage": true
},
"pageFactor": "1"
}
}
}
{"page":2,"change":"Removal","str":"3","dir":"ltr","width":"6.02","height":"12.00","transform":["12.00","0.00","0.00","12.00","812.96","16.35"],"fontName":"Gill Sans MT","x":812.962,"y":16.346,"line":16}

View File

@ -38,8 +38,7 @@
"pageMapping": {
"pageFactor": 0,
"detectedOnPage": false
},
"pageFactor": "n/a"
}
}
}
{"page":87,"change":"Removal","str":"F O O T N O T E S :","line":0,"x":251.741232,"y":687.6,"width":"108.58","height":"16.56","fontName":["AAAAAB+LiberationSerif-Bold"],"dir":["ltr"]}

View File

@ -38,7 +38,6 @@
"pageMapping": {
"pageFactor": 0,
"detectedOnPage": false
},
"pageFactor": "n/a"
}
}
}

View File

@ -38,8 +38,7 @@
"pageMapping": {
"pageFactor": 1,
"detectedOnPage": true
},
"pageFactor": "1"
}
}
}
{"page":1,"change":"Removal","str":"The War of the Worlds ","dir":"ltr","width":"102.96","height":"10.98","transform":["10.98","0.00","0.00","10.98","57.60","493.80"],"x":57.6,"y":493.8,"line":0}

View File

@ -38,7 +38,6 @@
"pageMapping": {
"pageFactor": 1243,
"detectedOnPage": true
},
"pageFactor": "n/a"
}
}
}

View File

@ -38,8 +38,7 @@
"pageMapping": {
"pageFactor": -6,
"detectedOnPage": true
},
"pageFactor": "-6"
}
}
}
{"page":5,"change":"Removal","str":"V ","dir":"ltr","width":"7.68","height":"7.98","transform":["7.98","0.00","0.00","7.98","382.86","52.74"],"fontName":"CRDKGT+ArialMT","x":382.8617,"y":52.741,"line":0}

View File

@ -38,7 +38,6 @@
"pageMapping": {
"pageFactor": 0,
"detectedOnPage": false
},
"pageFactor": "n/a"
}
}
}

View File

@ -38,8 +38,7 @@
"pageMapping": {
"pageFactor": 1,
"detectedOnPage": true
},
"pageFactor": "1"
}
}
}
{"page":4,"change":"Removal","str":"5","dir":"ltr","width":"5.85","height":"11.96","transform":["11.96","0.00","0.00","11.96","526.49","738.02"],"fontName":"LERRTL+CMR12","x":526.491,"y":738.022,"line":0}