mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-22 21:00:48 +01:00
Fix previous commit and re-use page mapping
This commit is contained in:
parent
388e8cc6b1
commit
898af7bbc8
@ -16,3 +16,7 @@ export function toCharcodes(text: string): number[] {
|
||||
/**
 * Returns `text` with every character that `isDigit` classifies as a digit removed.
 *
 * The result is assembled one character at a time instead of spreading all char
 * codes into a single `String.fromCharCode(...)` call: a spread passes each code
 * as its own argument, and very long inputs can exceed the engine's argument
 * limit and throw a RangeError.
 *
 * @param text the string to strip digits from
 * @returns the remaining characters, in their original order
 */
export function filterOutDigits(text: string): string {
  return toCharcodes(text)
    .filter((code) => !isDigit(code))
    .map((code) => String.fromCharCode(code))
    .join('');
}
|
||||
|
||||
/**
 * Extracts every run of consecutive decimal digits from `text` and converts
 * each run to a number.
 *
 * Only the digits themselves are matched, so a leading sign is ignored
 * (`'-4'` yields `4`) and leading zeros are dropped (`'007'` yields `7`).
 *
 * @param text the string to scan
 * @returns the numbers in order of appearance; an empty array when there are none
 */
export function extractNumbers(text: string): number[] {
  // String#match returns null (not an empty array) when nothing matches.
  return (text.match(/\d+/g) || []).map(Number);
}
|
||||
|
@ -4,8 +4,23 @@ import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
import FontType from '../FontType';
|
||||
import GlobalDefinition from './GlobalDefinition';
|
||||
import PageMapping from '../PageMapping';
|
||||
import PageFactorFinder from '../support/PageFactorFinder';
|
||||
import { groupByPage, onlyUniques } from '../support/groupingUtils';
|
||||
import { flatten } from '../support/functional';
|
||||
import { extractNumbers } from '../support/stringFunctions';
|
||||
|
||||
// Globals published by CalculateStatistics#transform.

/** Smallest `x` coordinate seen across all items. */
export const MIN_X = new GlobalDefinition<number>('minX');
/** Largest `x` coordinate seen across all items. */
export const MAX_X = new GlobalDefinition<number>('maxX');
/** Smallest `y` coordinate seen across all items. */
export const MIN_Y = new GlobalDefinition<number>('minY');
/** Largest `y` coordinate seen across all items. */
export const MAX_Y = new GlobalDefinition<number>('maxY');
/** The `maxHeight` value computed while scanning the items. */
export const MAX_HEIGHT = new GlobalDefinition<number>('maxHeight');
/** The page mapping produced by `parsePageMapping`. */
export const PAGE_MAPPING = new GlobalDefinition<PageMapping>('pageMapping');
|
||||
|
||||
const config = {
  // Maximum distance (in item x/y coordinates) an item may have from the
  // document-wide min/max x or y and still be considered a fringe item.
  maxDistanceToFringe: 50,
};
|
||||
|
||||
export default class CalculateStatistics extends ItemTransformer {
|
||||
constructor() {
|
||||
@ -30,11 +45,21 @@ export default class CalculateStatistics extends ItemTransformer {
|
||||
const heightToOccurrence = {};
|
||||
const fontToOccurrence = {};
|
||||
let maxHeight = 0;
|
||||
let maxHeightFont;
|
||||
let maxHeightFont: string;
|
||||
let minX = 999;
|
||||
let maxX = 0;
|
||||
let minY = 999;
|
||||
let maxY = 0;
|
||||
|
||||
items.forEach((inputItems) => {
|
||||
const itemHeight = inputItems.data['height'];
|
||||
const itemFont = inputItems.data['fontName'];
|
||||
items.forEach((item) => {
|
||||
const itemHeight = item.data['height'];
|
||||
const itemFont = item.data['fontName'];
|
||||
const x = item.data['x'];
|
||||
const y = item.data['y'];
|
||||
minX = Math.min(minX, x);
|
||||
maxX = Math.max(maxX, x);
|
||||
minY = Math.min(minY, y);
|
||||
maxY = Math.max(maxY, y);
|
||||
heightToOccurrence[itemHeight] = heightToOccurrence[itemHeight] ? heightToOccurrence[itemHeight] + 1 : 1;
|
||||
fontToOccurrence[itemFont] = fontToOccurrence[itemFont] ? fontToOccurrence[itemFont] + 1 : 1;
|
||||
if (itemHeight > maxHeight) {
|
||||
@ -46,6 +71,10 @@ export default class CalculateStatistics extends ItemTransformer {
|
||||
const mostUsedHeight = parseInt(getMostUsedKey(heightToOccurrence));
|
||||
const mostUsedFont = getMostUsedKey(fontToOccurrence);
|
||||
|
||||
const groupedByPage = groupByPage(items);
|
||||
const pageMapping = parsePageMapping(groupedByPage, minX, maxX, minY, maxY);
|
||||
console.log(pageMapping);
|
||||
|
||||
// Parse line distances
|
||||
const distanceToOccurrence = {};
|
||||
|
||||
@ -85,7 +114,14 @@ export default class CalculateStatistics extends ItemTransformer {
|
||||
|
||||
return {
|
||||
items: items,
|
||||
globals: [MAX_HEIGHT.value(maxHeight)],
|
||||
globals: [
|
||||
MAX_HEIGHT.value(maxHeight),
|
||||
MIN_X.value(minX),
|
||||
MAX_X.value(maxX),
|
||||
MIN_Y.value(minY),
|
||||
MAX_Y.value(maxY),
|
||||
PAGE_MAPPING.value(pageMapping),
|
||||
],
|
||||
// globals2: {
|
||||
// mostUsedHeight: mostUsedHeight,
|
||||
// mostUsedFont: mostUsedFont,
|
||||
@ -103,6 +139,35 @@ export default class CalculateStatistics extends ItemTransformer {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Tries to derive the page mapping from numbers printed near the page edges.
 *
 * For every page, only items within `config.maxDistanceToFringe` of the
 * document-wide min/max x or y are kept; the numbers found in those items are
 * handed to `PageFactorFinder` as candidates for that page.
 *
 * @param groupedByPage items grouped per page; the first item of each group supplies the page index
 * @param minX smallest item x across the document
 * @param maxX largest item x across the document
 * @param minY smallest item y across the document
 * @param maxY largest item y across the document
 * @returns `PageMapping(pageFactor, true)` when the finder returns a factor,
 *          otherwise `PageMapping(0, false)`
 */
function parsePageMapping(
  groupedByPage: Item[][],
  minX: number,
  maxX: number,
  minY: number,
  maxY: number,
): PageMapping {
  // An item is "fringe" when it is close to any of the four document-wide extremes.
  const isFringeItem = (item: Item): boolean => {
    const x = item.data['x'];
    const y = item.data['y'];
    return (
      x <= minX + config.maxDistanceToFringe ||
      x >= maxX - config.maxDistanceToFringe ||
      y <= minY + config.maxDistanceToFringe ||
      y >= maxY - config.maxDistanceToFringe
    );
  };

  const pageFactor = new PageFactorFinder().find(
    groupedByPage,
    (items) => ({
      index: items[0].page,
      numbers: possiblePageNumbers(items.filter(isFringeItem)),
    }),
    { sampleCount: 20, minFulfillment: 0.8 },
  );

  // A factor of 0 is a legitimate result, so test for undefined explicitly rather than for falsiness.
  return typeof pageFactor === 'undefined' ? new PageMapping(0, false) : new PageMapping(pageFactor, true);
}
|
||||
|
||||
function getMostUsedKey(keyToOccurrence): any {
|
||||
var maxOccurence = 0;
|
||||
var maxKey: string | undefined;
|
||||
@ -138,3 +203,16 @@ function getFormatType(
|
||||
return FontType.BOLD;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Collects the numbers appearing in the text (`data['str']`) of the given items.
 *
 * Each item's numbers are passed through the `onlyUniques` filter on their own
 * (per item, not across items) and the per-item lists are flattened into one.
 *
 * @param items the items whose text is scanned
 * @returns all candidate numbers as a single flat list
 */
function possiblePageNumbers(items: Item[]): number[] {
  return flatten(
    items.map((item) =>
      extractNumbers(item.data['str'])
        .filter((number) => number >= 0)
        // .filter((number) => number <= line.page + 1)
        .filter(onlyUniques),
    ),
  );
}
|
||||
|
@ -5,6 +5,7 @@ import ItemResult from '../ItemResult';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
import LineItemMerger from '../debug/LineItemMerger';
|
||||
import { MIN_Y, MAX_Y, PAGE_MAPPING } from './CacluclateStatistics';
|
||||
import {
|
||||
ascending,
|
||||
flatMap,
|
||||
@ -16,16 +17,9 @@ import {
|
||||
transformGroupedByPageAndLine,
|
||||
} from '../support/groupingUtils';
|
||||
import { filterOutDigits } from '../support/stringFunctions';
|
||||
import { flatten, groupBy } from '../support/functional';
|
||||
import { MIN_Y, MAX_Y } from './CacluclateStatistics';
|
||||
import GlobalDefinition from './GlobalDefinition';
|
||||
|
||||
export const PAGE_FACTOR = new GlobalDefinition<string>('pageFactor');
|
||||
import { extractNumbers } from '../support/stringFunctions';
|
||||
|
||||
const config = {
|
||||
// Max number of lines at top/bottom (per page) which are getting evaluated for eviction
|
||||
maxNumberOffTopOrBottomLines: 3,
|
||||
|
||||
// From the absolute fringe elements (min/max y) how much y can item deviate before beeing disregarded.
|
||||
maxDistanceFromFringeElements: 35,
|
||||
|
||||
@ -49,6 +43,7 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
|
||||
transform(context: TransformContext, inputItems: Item[]): ItemResult {
|
||||
const minY = context.getGlobal(MIN_Y);
|
||||
const maxY = context.getGlobal(MAX_Y);
|
||||
const pageMapping = context.getGlobal(PAGE_MAPPING);
|
||||
const bottomMaxY = minY + config.maxDistanceFromFringeElements;
|
||||
const topMinY = maxY - config.maxDistanceFromFringeElements;
|
||||
// console.log('bottomMaxY', bottomMaxY, 'topMinY', topMinY);
|
||||
@ -70,9 +65,6 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
|
||||
(e) => e,
|
||||
);
|
||||
|
||||
const pageNumber = detectAPageNumber(fringeLines);
|
||||
const globuly = pageNumber ? `${pageNumber.pageNumber - pageNumber.pageIndex}` : 'n/a';
|
||||
|
||||
const fringeYs = fringeLines
|
||||
.map((line) => line.y)
|
||||
.filter(onlyUniques)
|
||||
@ -86,7 +78,9 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
|
||||
return false;
|
||||
}
|
||||
|
||||
const pageNumberScore: number = pageNumber ? calculatePageNumerScore(context.pageCount, pageNumber, yLines) : 0;
|
||||
const pageNumberScore: number = pageMapping.detectedOnPage
|
||||
? calculatePageNumerScore(context.pageCount, pageMapping.pageFactor, yLines)
|
||||
: 0;
|
||||
const textSimilarityScore: number = textSimilarity(yLines);
|
||||
const totalScore = pageNumberScore + textSimilarityScore;
|
||||
// console.log(
|
||||
@ -123,80 +117,17 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
|
||||
return lineItems;
|
||||
}),
|
||||
messages: [`Filtered out ${removalCount} items with y == ${yToRemove.join('||')}`],
|
||||
globals: [PAGE_FACTOR.value(globuly)],
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
function calculatePageNumerScore(pageCount: number, pageNumber: PageNumber, lines: PageLine[]): number {
|
||||
const pageNumberFactor = pageNumber.pageNumber - pageNumber.pageIndex;
|
||||
const maxPageNumbers = pageCount + pageNumberFactor;
|
||||
const linesWithPageNumbers = lines.filter((line) =>
|
||||
extractNumbers(line.text()).includes(line.page + pageNumberFactor),
|
||||
).length;
|
||||
/**
 * Scores how consistently the given lines carry the page number expected for
 * their page, where the expected number is `line.page + pageFactor`.
 *
 * The score is the count of matching lines divided by
 * `min(pageCount + pageFactor, lines.length)`.
 *
 * @param pageCount total number of pages in the document
 * @param pageFactor offset added to a line's page index to obtain its expected page number
 * @param lines the lines to inspect
 * @returns the matching ratio, or 0 when there is nothing to divide by
 */
function calculatePageNumerScore(pageCount: number, pageFactor: number, lines: PageLine[]): number {
  const maxPageNumbers = pageCount + pageFactor;
  const denominator = Math.min(maxPageNumbers, lines.length);
  // With no lines, or a negative factor that cancels out the page count, the
  // division would yield NaN, Infinity or a negative score.
  if (denominator <= 0) {
    return 0;
  }
  const linesWithPageNumbers = lines.filter((line) =>
    extractNumbers(line.text()).includes(line.page + pageFactor),
  ).length;
  return linesWithPageNumbers / denominator;
}
|
||||
|
||||
/**
 * Tries to pin down a single page-number candidate.
 *
 * Lines are grouped by page and ordered by page index. The candidates found
 * on the page in the middle are then narrowed down using the pages after it.
 *
 * @param lines the lines to search for page numbers
 * @returns the one remaining candidate, or `undefined` when there are no lines
 *          or the candidates cannot be narrowed down to exactly one
 */
function detectAPageNumber(lines: PageLine[]): PageNumber | undefined {
  const linesByPage = groupBy(lines, (line) => line.page).sort((a, b) => a[0].page - b[0].page);
  if (linesByPage.length === 0) {
    return undefined;
  }

  // Math.round(1 / 2) is 1, which would index past the end for a single page, so clamp to the last page.
  const pageIndexInTheMiddle = Math.min(Math.round(linesByPage.length / 2), linesByPage.length - 1);

  const possiblePageNumbersForMiddle = possiblePageNumbers(linesByPage[pageIndexInTheMiddle]);
  const remainingOptions = filterOutIncompatibleVariant(
    possiblePageNumbersForMiddle,
    linesByPage.slice(pageIndexInTheMiddle + 1),
  );
  //TODO do the same filtering upstream !?

  return remainingOptions.length === 1 ? remainingOptions[0] : undefined;
}
|
||||
|
||||
/**
 * Narrows down page-number candidates using the pages that follow.
 *
 * Pages are visited in order until at most one option is left or the pages run
 * out. On a page that yields candidates, an option survives only if some
 * candidate's page number exceeds the option's page number by no more than the
 * page-index distance between the two.
 *
 * @param options the candidates to narrow down (not mutated)
 * @param nextPageLines the lines of each following page, in page order
 * @returns the options that remain compatible
 */
function filterOutIncompatibleVariant(options: PageNumber[], nextPageLines: PageLine[][]): PageNumber[] {
  let remainingOptions = [...options];
  for (let index = 0; remainingOptions.length > 1 && index < nextPageLines.length; index++) {
    const nextPageNumbers = possiblePageNumbers(nextPageLines[index]);
    if (nextPageNumbers.length === 0) {
      // A page without any candidate cannot rule anything out.
      continue;
    }
    const nextPageIndex = nextPageNumbers[0].pageIndex;
    remainingOptions = remainingOptions.filter((option) => {
      const maxDistance = nextPageIndex - option.pageIndex;
      return nextPageNumbers.some((nextPageNum) => nextPageNum.pageNumber - option.pageNumber <= maxDistance);
    });
  }
  return remainingOptions;
}
|
||||
|
||||
/** A number found in a line's text that might be the printed page number of that line's page. */
interface PageNumber {
  /** The `page` value of the line the number was found on. */
  pageIndex: number;
  /** The number extracted from the line's text. */
  pageNumber: number;
  /** The `y` value of the line the number was found on. */
  y: number;
}
|
||||
|
||||
/**
 * Collects page-number candidates from the text of the given lines.
 *
 * For each line, the numbers in its text are kept only when they are no larger
 * than `line.page + 1`, are passed through the `onlyUniques` filter (per line),
 * and are then tagged with the line's page and y.
 *
 * @param lines the lines to scan
 * @returns all candidates as one flat list
 */
function possiblePageNumbers(lines: PageLine[]): PageNumber[] {
  return flatten(
    lines.map((line) =>
      extractNumbers(line.text())
        .filter((number) => number >= 0)
        .filter((number) => number <= line.page + 1)
        .filter(onlyUniques)
        .map((num) => ({
          pageIndex: line.page,
          pageNumber: num,
          y: line.y,
        })),
    ),
  );
}
|
||||
|
||||
/**
 * Extracts every run of consecutive decimal digits from `text` and converts
 * each run to a number. Signs and other non-digit characters are ignored.
 *
 * @param text the string to scan
 * @returns the numbers in order of appearance; an empty array when there are none
 */
function extractNumbers(text: string): number[] {
  // String#match returns null (not an empty array) when nothing matches.
  return (text.match(/\d+/g) || []).map(Number);
}
|
||||
|
||||
function textSimilarity(lines: PageLine[]): number {
|
||||
const similarities = flatMap(lines, (line, idx) =>
|
||||
adiacentLines(lines, idx).map((adiacentLine) => calculateSimilarity(line, adiacentLine)),
|
||||
|
@ -58,6 +58,8 @@ test('distraction free - defered', () => {
|
||||
{ index: 5, numbers: [4] },
|
||||
{ index: 6, numbers: [5] },
|
||||
{ index: 7, numbers: [6] },
|
||||
{ index: 8, numbers: [7] },
|
||||
{ index: 9, numbers: [8] },
|
||||
];
|
||||
|
||||
expect(finder.find(containers, extractor)).toEqual(-1);
|
||||
@ -88,6 +90,8 @@ test('distraction loaden - defered', () => {
|
||||
{ index: 5, numbers: [4, 5, 65, 8] },
|
||||
{ index: 6, numbers: [5, 9] },
|
||||
{ index: 7, numbers: [6] },
|
||||
{ index: 8, numbers: [27, 7, 19] },
|
||||
{ index: 9, numbers: [-4, 2016, 8] },
|
||||
];
|
||||
|
||||
expect(finder.find(containers, extractor)).toEqual(-1);
|
||||
|
@ -1,7 +1,14 @@
|
||||
import { filterOutDigits } from 'src/support/stringFunctions';
|
||||
import { filterOutDigits, extractNumbers } from 'src/support/stringFunctions';
|
||||
|
||||
// filterOutDigits removes digits and leaves every other character, including whitespace, untouched.
test('filterOutDigits', () => {
  expect(filterOutDigits('')).toEqual('');
  expect(filterOutDigits('a b c')).toEqual('a b c');
  expect(filterOutDigits('a1b 2c 3')).toEqual('ab c ');
});
|
||||
|
||||
// extractNumbers returns each run of digits as one number, in order of appearance.
test('extractNumbers', () => {
  expect(extractNumbers('')).toEqual([]);
  expect(extractNumbers('a b c')).toEqual([]);
  expect(extractNumbers('a1b 2c 3')).toEqual([1, 2, 3]);
  expect(extractNumbers('a12 21 304')).toEqual([12, 21, 304]);
});
|
||||
|
@ -38,8 +38,7 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": -1,
|
||||
"detectedOnPage": true
|
||||
},
|
||||
"pageFactor": "-1"
|
||||
}
|
||||
}
|
||||
}
|
||||
{"page":3,"change":"Removal","str":"2","dir":"ltr","width":"5.45","height":"10.91","transform":["10.91","0.00","0.00","10.91","294.43","95.28"],"fontName":"KKLGKN+NimbusRomNo9L-Regu","x":294.428,"y":95.28300000000016,"line":13}
|
||||
|
@ -38,8 +38,7 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": 0,
|
||||
"detectedOnPage": true
|
||||
},
|
||||
"pageFactor": "0"
|
||||
}
|
||||
}
|
||||
}
|
||||
{"page":1,"change":"Removal","str":"1","dir":"ltr","width":"5.98","height":"11.96","transform":["11.96","0.00","0.00","11.96","294.17","95.55"],"fontName":"FZVLIH+NimbusRomNo9L-Regu","x":294.167,"y":95.545,"line":14}
|
||||
|
@ -38,8 +38,7 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": 1,
|
||||
"detectedOnPage": true
|
||||
},
|
||||
"pageFactor": "n/a"
|
||||
}
|
||||
}
|
||||
}
|
||||
{"page":0,"change":"Removal","str":"Closed syllable word lists 1","line":0,"x":420.79,"y":745.56,"width":"119.31","height":"11.04","fontName":["ABCDEE+Calibri"],"dir":["ltr"]}
|
||||
|
@ -38,8 +38,7 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": 1,
|
||||
"detectedOnPage": true
|
||||
},
|
||||
"pageFactor": "1"
|
||||
}
|
||||
}
|
||||
}
|
||||
{"page":0,"change":"Removal","str":"\u0000 1","line":5,"x":294,"y":45,"width":"6.67","height":"12.00","fontName":["QACXPP+Helvetica","JBRMKS+Helvetica"],"dir":["ltr"]}
|
||||
|
@ -38,8 +38,7 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": 0,
|
||||
"detectedOnPage": false
|
||||
},
|
||||
"pageFactor": "n/a"
|
||||
}
|
||||
}
|
||||
}
|
||||
{"page":0,"change":"Removal","str":"s h a l F - M a s q u e","line":4,"x":37.1206,"y":758.8381,"width":"99.35","height":"17.64","fontName":["NRVUEW+Futura-Light"],"dir":["ltr"]}
|
||||
|
@ -38,8 +38,7 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": 1,
|
||||
"detectedOnPage": true
|
||||
},
|
||||
"pageFactor": "1"
|
||||
}
|
||||
}
|
||||
}
|
||||
{"page":0,"change":"Removal","str":"1","dir":"ltr","width":"4.08","height":"8.04","transform":["8.04","0.00","0.00","8.04","304.01","22.68"],"fontName":"NTKUYH+Calibri","x":304.01,"y":22.6801,"line":0}
|
||||
|
@ -38,8 +38,7 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": -17,
|
||||
"detectedOnPage": true
|
||||
},
|
||||
"pageFactor": "-17"
|
||||
}
|
||||
}
|
||||
}
|
||||
{"page":13,"change":"Removal","str":"\\-iii Preface.","line":0,"x":63.50214,"y":400.98296999999997,"width":"50.42","height":"7.99","fontName":[null],"dir":["ltr"]}
|
||||
|
@ -38,8 +38,7 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": 1,
|
||||
"detectedOnPage": true
|
||||
},
|
||||
"pageFactor": "1"
|
||||
}
|
||||
}
|
||||
}
|
||||
{"page":1,"change":"Removal","str":"Quality Improvement Clinic Ltd. P a g e | 2 August 2015","line":0,"x":99.264,"y":30.84,"width":"261.24","height":"11.04","fontName":[null],"dir":["ltr"]}
|
||||
|
@ -38,8 +38,7 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": 1,
|
||||
"detectedOnPage": true
|
||||
},
|
||||
"pageFactor": "1"
|
||||
}
|
||||
}
|
||||
}
|
||||
{"page":2,"change":"Removal","str":"3","dir":"ltr","width":"6.02","height":"12.00","transform":["12.00","0.00","0.00","12.00","812.96","16.35"],"fontName":"Gill Sans MT","x":812.962,"y":16.346,"line":16}
|
||||
|
@ -38,8 +38,7 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": 0,
|
||||
"detectedOnPage": false
|
||||
},
|
||||
"pageFactor": "n/a"
|
||||
}
|
||||
}
|
||||
}
|
||||
{"page":87,"change":"Removal","str":"F O O T N O T E S :","line":0,"x":251.741232,"y":687.6,"width":"108.58","height":"16.56","fontName":["AAAAAB+LiberationSerif-Bold"],"dir":["ltr"]}
|
||||
|
@ -38,7 +38,6 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": 0,
|
||||
"detectedOnPage": false
|
||||
},
|
||||
"pageFactor": "n/a"
|
||||
}
|
||||
}
|
||||
}
|
@ -38,8 +38,7 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": 1,
|
||||
"detectedOnPage": true
|
||||
},
|
||||
"pageFactor": "1"
|
||||
}
|
||||
}
|
||||
}
|
||||
{"page":1,"change":"Removal","str":"The War of the Worlds ","dir":"ltr","width":"102.96","height":"10.98","transform":["10.98","0.00","0.00","10.98","57.60","493.80"],"x":57.6,"y":493.8,"line":0}
|
||||
|
@ -38,7 +38,6 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": 1243,
|
||||
"detectedOnPage": true
|
||||
},
|
||||
"pageFactor": "n/a"
|
||||
}
|
||||
}
|
||||
}
|
@ -38,8 +38,7 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": -6,
|
||||
"detectedOnPage": true
|
||||
},
|
||||
"pageFactor": "-6"
|
||||
}
|
||||
}
|
||||
}
|
||||
{"page":5,"change":"Removal","str":"V ","dir":"ltr","width":"7.68","height":"7.98","transform":["7.98","0.00","0.00","7.98","382.86","52.74"],"fontName":"CRDKGT+ArialMT","x":382.8617,"y":52.741,"line":0}
|
||||
|
@ -38,7 +38,6 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": 0,
|
||||
"detectedOnPage": false
|
||||
},
|
||||
"pageFactor": "n/a"
|
||||
}
|
||||
}
|
||||
}
|
@ -38,8 +38,7 @@
|
||||
"pageMapping": {
|
||||
"pageFactor": 1,
|
||||
"detectedOnPage": true
|
||||
},
|
||||
"pageFactor": "1"
|
||||
}
|
||||
}
|
||||
}
|
||||
{"page":4,"change":"Removal","str":"5","dir":"ltr","width":"5.85","height":"11.96","transform":["11.96","0.00","0.00","11.96","526.49","738.02"],"fontName":"LERRTL+CMR12","x":526.491,"y":738.022,"line":0}
|
||||
|
Loading…
Reference in New Issue
Block a user