mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-19 03:58:24 +01:00
simplify code (and keep information) through flattening page lines
This commit is contained in:
parent
4c77274d16
commit
a6a21c9ed2
@ -44,9 +44,9 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
transform(context: TransformContext, inputItems: Item[]): ItemResult {
|
transform(context: TransformContext, inputItems: Item[]): ItemResult {
|
||||||
const pageExtracts = buildExtracts(inputItems);
|
const fringeLines = extractFringeLines(inputItems);
|
||||||
|
|
||||||
const fringeYs = flatMap(pageExtracts, (extract) => extract.fringeLines)
|
const fringeYs = fringeLines
|
||||||
.map((line) => line.y)
|
.map((line) => line.y)
|
||||||
.filter(onlyUniques)
|
.filter(onlyUniques)
|
||||||
.sort(ascending);
|
.sort(ascending);
|
||||||
@ -54,9 +54,7 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
|
|||||||
// console.log('uniqueYs', uniqueYs);
|
// console.log('uniqueYs', uniqueYs);
|
||||||
|
|
||||||
const yToRemove = fringeYs.filter((y) => {
|
const yToRemove = fringeYs.filter((y) => {
|
||||||
const yLines = pageExtracts
|
const yLines = fringeLines.filter((line) => line.y == y);
|
||||||
.map((page) => page.lineByY(y))
|
|
||||||
.filter((line) => typeof line !== 'undefined') as Line[];
|
|
||||||
|
|
||||||
if (yLines.length < 2) {
|
if (yLines.length < 2) {
|
||||||
return false;
|
return false;
|
||||||
@ -104,7 +102,7 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
|
|||||||
let removalCount = 0;
|
let removalCount = 0;
|
||||||
return {
|
return {
|
||||||
items: transformGroupedByPageAndLine(inputItems, (_, __, lineItems) => {
|
items: transformGroupedByPageAndLine(inputItems, (_, __, lineItems) => {
|
||||||
const itemsY = yFromLine(lineItems);
|
const itemsY = yFromLineItems(lineItems);
|
||||||
if (fringeYs.includes(itemsY)) {
|
if (fringeYs.includes(itemsY)) {
|
||||||
lineItems.forEach(context.trackEvaluation.bind(context));
|
lineItems.forEach(context.trackEvaluation.bind(context));
|
||||||
}
|
}
|
||||||
@ -119,7 +117,7 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function consecutiveNumbers(lines: Line[]): number {
|
function consecutiveNumbers(lines: PageLine[]): number {
|
||||||
const allNumbersJoined = flatMap(
|
const allNumbersJoined = flatMap(
|
||||||
lines
|
lines
|
||||||
.map((line) => {
|
.map((line) => {
|
||||||
@ -135,14 +133,14 @@ function consecutiveNumbers(lines: Line[]): number {
|
|||||||
return compareTwoStrings(allNumbersJoined, regularNumbersJoined);
|
return compareTwoStrings(allNumbersJoined, regularNumbersJoined);
|
||||||
}
|
}
|
||||||
|
|
||||||
function textSimilarity(lines: Line[]): number {
|
function textSimilarity(lines: PageLine[]): number {
|
||||||
const similarities = flatMap(lines, (line, idx) =>
|
const similarities = flatMap(lines, (line, idx) =>
|
||||||
adiacentLines(lines, idx).map((adiacentLine) => calculateSimilarity(line, adiacentLine)),
|
adiacentLines(lines, idx).map((adiacentLine) => calculateSimilarity(line, adiacentLine)),
|
||||||
);
|
);
|
||||||
return median(similarities);
|
return median(similarities);
|
||||||
}
|
}
|
||||||
|
|
||||||
function isAllNumbers(lines: Line[]): boolean {
|
function isAllNumbers(lines: PageLine[]): boolean {
|
||||||
for (let index = 0; index < lines.length; index++) {
|
for (let index = 0; index < lines.length; index++) {
|
||||||
const string = lines[index].text().trim();
|
const string = lines[index].text().trim();
|
||||||
const asNumber = Number(string);
|
const asNumber = Number(string);
|
||||||
@ -153,13 +151,13 @@ function isAllNumbers(lines: Line[]): boolean {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
function calculateSimilarity(line1: Line, line2: Line): number {
|
function calculateSimilarity(line1: PageLine, line2: PageLine): number {
|
||||||
return compareTwoStrings(line1.textWithoutNumbers(), line2.textWithoutNumbers());
|
return compareTwoStrings(line1.textWithoutNumbers(), line2.textWithoutNumbers());
|
||||||
}
|
}
|
||||||
|
|
||||||
function adiacentLines(lines: Line[], index: number): Line[] {
|
function adiacentLines(lines: PageLine[], index: number): PageLine[] {
|
||||||
// Prefer to either collect x downstream OR x upstream neighbours (not a mix) in order to better catch odd/even page differences
|
// Prefer to either collect x downstream OR x upstream neighbours (not a mix) in order to better catch odd/even page differences
|
||||||
let neighbours: Line[];
|
let neighbours: PageLine[];
|
||||||
if (index + config.neighbourReach < lines.length) {
|
if (index + config.neighbourReach < lines.length) {
|
||||||
neighbours = lines.slice(index + 1, index + config.neighbourReach + 1);
|
neighbours = lines.slice(index + 1, index + config.neighbourReach + 1);
|
||||||
} else if (index - config.neighbourReach >= 0) {
|
} else if (index - config.neighbourReach >= 0) {
|
||||||
@ -171,34 +169,35 @@ function adiacentLines(lines: Line[], index: number): Line[] {
|
|||||||
return neighbours;
|
return neighbours;
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildExtracts(inputItems: Item[]): PageExtract[] {
|
function extractFringeLines(inputItems: Item[]): PageLine[] {
|
||||||
let bottomY = 999;
|
let bottomY = 999;
|
||||||
let topY = 0;
|
let topY = 0;
|
||||||
|
|
||||||
const pages = groupByPage(inputItems).map((pageItems) => {
|
const fringLines = flatMap(
|
||||||
const lines = groupByLine(pageItems)
|
groupByPage(inputItems).map((pageItems) => {
|
||||||
.map((lineItems) => {
|
const pageLines = groupByLine(pageItems)
|
||||||
const lineY = yFromLine(lineItems);
|
.map((lineItems) => {
|
||||||
return new Line(lineY, lineItems);
|
const lineY = yFromLineItems(lineItems);
|
||||||
})
|
return new PageLine(pageItems[0].page, lineY, lineItems);
|
||||||
.sort((a, b) => a.y - b.y);
|
})
|
||||||
|
.sort((a, b) => a.y - b.y);
|
||||||
|
|
||||||
// Keep globals up to date
|
// Keep globals up to date
|
||||||
if (lines[0].y < bottomY) {
|
if (pageLines[0].y < bottomY) {
|
||||||
bottomY = lines[0].y;
|
bottomY = pageLines[0].y;
|
||||||
}
|
}
|
||||||
if (lines[lines.length - 1].y > topY) {
|
if (pageLines[pageLines.length - 1].y > topY) {
|
||||||
topY = lines[lines.length - 1].y;
|
topY = pageLines[pageLines.length - 1].y;
|
||||||
}
|
}
|
||||||
|
|
||||||
// keep top and bottom fringes
|
// keep only top and bottom fringes
|
||||||
const numberOfFringeElements = Math.min(lines.length, config.maxNumberOffTopOrBottomLines);
|
const numberOfFringeElements = Math.min(pageLines.length, config.maxNumberOffTopOrBottomLines);
|
||||||
const bottomN = lines.slice(0, numberOfFringeElements);
|
const bottomN = pageLines.slice(0, numberOfFringeElements);
|
||||||
const topN = lines.slice(lines.length - numberOfFringeElements, lines.length);
|
const topN = pageLines.slice(pageLines.length - numberOfFringeElements, pageLines.length);
|
||||||
|
return [...bottomN, ...topN].filter(onlyUniques);
|
||||||
const fringeLines = [...bottomN, ...topN].filter(onlyUniques);
|
}),
|
||||||
return new PageExtract(pageItems[0].page, fringeLines);
|
(e) => e,
|
||||||
});
|
);
|
||||||
|
|
||||||
// console.log('bottom', bottomY);
|
// console.log('bottom', bottomY);
|
||||||
// console.log('top', topY);
|
// console.log('top', topY);
|
||||||
@ -206,36 +205,21 @@ function buildExtracts(inputItems: Item[]): PageExtract[] {
|
|||||||
//Now that we know the global top and bottom y, we cut those y which are in the middle and not really on the fringes
|
//Now that we know the global top and bottom y, we cut those y which are in the middle and not really on the fringes
|
||||||
const maxTopDistance = config.maxDistanceFromFringeElements;
|
const maxTopDistance = config.maxDistanceFromFringeElements;
|
||||||
const maxBottomDistance = config.maxDistanceFromFringeElements;
|
const maxBottomDistance = config.maxDistanceFromFringeElements;
|
||||||
return pages.map(
|
return fringLines.filter((line) => line.y <= bottomY + maxBottomDistance || line.y >= topY - maxTopDistance);
|
||||||
(page) =>
|
|
||||||
new PageExtract(
|
|
||||||
page.page,
|
|
||||||
page.fringeLines.filter((line) => line.y <= bottomY + maxBottomDistance || line.y >= topY - maxTopDistance),
|
|
||||||
),
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function yFromLine(lineItems: Item[]): number {
|
function yFromLineItems(lineItems: Item[]): number {
|
||||||
return Math.round(mostFrequent(lineItems, 'y') as number);
|
return Math.round(mostFrequent(lineItems, 'y') as number);
|
||||||
}
|
}
|
||||||
|
|
||||||
class PageExtract {
|
/**
|
||||||
constructor(public page: number, public fringeLines: Line[]) {}
|
* A number of Items on a line (~same y) on a page.
|
||||||
|
*/
|
||||||
hasY(y: number): boolean {
|
class PageLine {
|
||||||
return this.fringeLines.findIndex((line) => line.y === y) >= 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
lineByY(y: number): Line | undefined {
|
|
||||||
return this.fringeLines.find((line) => line.y === y);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class Line {
|
|
||||||
private _text: string | undefined;
|
private _text: string | undefined;
|
||||||
private _textWithoutNumbers: string | undefined;
|
private _textWithoutNumbers: string | undefined;
|
||||||
|
|
||||||
constructor(public y: number, public items: Item[]) {}
|
constructor(public page: number, public y: number, public items: Item[]) {}
|
||||||
|
|
||||||
text(): string {
|
text(): string {
|
||||||
if (!this._text) {
|
if (!this._text) {
|
||||||
|
@ -3,11 +3,15 @@ import PageViewport from '../parse/PageViewport';
|
|||||||
import EvaluationTracker from './EvaluationTracker';
|
import EvaluationTracker from './EvaluationTracker';
|
||||||
|
|
||||||
export default class TransformContext {
|
export default class TransformContext {
|
||||||
|
pageCount: number;
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
public fontMap: Map<string, object>,
|
public fontMap: Map<string, object>,
|
||||||
public pageViewports: PageViewport[],
|
public pageViewports: PageViewport[],
|
||||||
private evaluations = new EvaluationTracker(),
|
private evaluations = new EvaluationTracker(),
|
||||||
) {}
|
) {
|
||||||
|
this.pageCount = pageViewports.length;
|
||||||
|
}
|
||||||
|
|
||||||
trackEvaluation(item: Item) {
|
trackEvaluation(item: Item) {
|
||||||
this.evaluations.trackEvaluation(item);
|
this.evaluations.trackEvaluation(item);
|
||||||
|
Loading…
Reference in New Issue
Block a user