mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-01 03:09:01 +01:00
Begin calculate statistics
This commit is contained in:
parent
0498832d10
commit
a5b65b5d85
@ -3,4 +3,5 @@ import type Item from './Item';
|
||||
/**
 * Result object returned by an item transformer's `transform` step.
 */
export default interface ItemResult {
  /** The items as they stand after the transformation. */
  items: Item[];

  /** Human-readable notes produced while transforming (e.g. summaries of what was counted). */
  messages: string[];

  /** Optional bag of document-wide values the transformer publishes alongside the items. */
  globals?: object;
}
|
||||
|
@ -6,9 +6,10 @@ import PdfPipeline from './PdfPipeline';
|
||||
|
||||
import AdjustHeight from './transformer/AdjustHeight';
|
||||
import CalculateCoordinates from './transformer/CalculateCoordinates';
|
||||
import CalculateStatistics from './transformer/CacluclateStatistics';
|
||||
import CompactLines from './transformer/CompactLines';
|
||||
|
||||
const transformers = [new AdjustHeight(), new CalculateCoordinates(), new CompactLines()];
|
||||
const transformers = [new AdjustHeight(), new CalculateCoordinates(), new CalculateStatistics(), new CompactLines()];
|
||||
|
||||
const defaultConfig: Config = {
|
||||
pdfjsParams: {
|
||||
|
120
core/src/transformer/CacluclateStatistics.ts
Normal file
120
core/src/transformer/CacluclateStatistics.ts
Normal file
@ -0,0 +1,120 @@
|
||||
import Item from '../Item';
|
||||
import ItemResult from '../ItemResult';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
|
||||
/**
 * Collects document-wide statistics about the items and publishes them as globals.
 *
 * Produced globals:
 * - `mostUsedHeight`: the most frequent `height` value, parsed as an integer.
 * - `mostUsedFont`: the most frequent `fontName` value.
 * - `mostUsedDistance`: the most frequent positive `y` difference between two directly
 *   consecutive, non-blank items of the most used height on the same page, parsed as an integer.
 * - `maxHeight` / `maxHeightFont`: the largest `height` seen and the `fontName` of the
 *   first item that had it.
 *
 * The items themselves are passed through unchanged.
 */
export default class CalculateStatistics extends ItemTransformer {
  constructor() {
    super('Calculate Statistics', 'Calculate global statistics that are used in downstream transformers', {
      requireColumns: ['str', 'fontName', 'y', 'height'],
      producesGlobels: [
        'mostUsedHeight',
        'mostUsedFont',
        'mostUsedDistance',
        'maxHeight',
        'maxHeightFont',
        // 'fontToFormats',
      ],
    });
  }

  /**
   * Computes the statistics in two passes over `items`.
   *
   * @param _ - Transform context; not used by this transformer.
   * @param items - Items carrying the `str`, `fontName`, `y` and `height` columns.
   * @returns The unchanged items, the computed globals, and one message per occurrence table.
   *   For an empty `items` array `mostUsedHeight` and `mostUsedDistance` are `NaN` and
   *   `mostUsedFont` / `maxHeightFont` are `undefined`.
   */
  transform(_: TransformContext, items: Item[]): ItemResult {
    // Pass 1: count heights and fonts, and track the tallest item.
    const heightToOccurrence: Record<string, number> = {};
    const fontToOccurrence: Record<string, number> = {};
    let maxHeight = 0;
    let maxHeightFont: string | undefined;

    for (const item of items) {
      const itemHeight = item.data['height'];
      const itemFont = item.data['fontName'];
      heightToOccurrence[itemHeight] = (heightToOccurrence[itemHeight] ?? 0) + 1;
      fontToOccurrence[itemFont] = (fontToOccurrence[itemFont] ?? 0) + 1;
      // Strict `>` keeps the font of the first item that reached the maximum.
      if (itemHeight > maxHeight) {
        maxHeight = itemHeight;
        maxHeightFont = itemFont;
      }
    }

    // NOTE(review): parseInt truncates fractional keys (e.g. "11.5" -> 11). If heights can be
    // fractional at this stage, the equality check in pass 2 will not match them - confirm.
    const mostUsedHeightKey = getMostUsedKey(heightToOccurrence);
    const mostUsedHeight = mostUsedHeightKey === undefined ? NaN : parseInt(mostUsedHeightKey, 10);
    const mostUsedFont = getMostUsedKey(fontToOccurrence);

    // Pass 2: parse line distances between consecutive items of the most used height.
    const distanceToOccurrence: Record<string, number> = {};

    let page = -1;
    let lastItemOfMostUsedHeight: Item | undefined;
    for (const item of items) {
      // Distances are never measured across a page break.
      if (item.page !== page) lastItemOfMostUsedHeight = undefined;

      const itemHeight = item.data['height'];
      const itemText = item.data['str'];
      const itemY = item.data['y'];

      // NOTE(review): loose equality is kept from the original because the column values are
      // untyped here; switch to `===` once `height` is known to always be a number.
      if (itemHeight == mostUsedHeight && itemText.trim().length > 0) {
        if (lastItemOfMostUsedHeight) {
          const distance = lastItemOfMostUsedHeight.data['y'] - itemY;
          // Only a positive difference counts; items on the same `y` yield 0 and are skipped.
          if (distance > 0) {
            distanceToOccurrence[distance] = (distanceToOccurrence[distance] ?? 0) + 1;
          }
        }
        lastItemOfMostUsedHeight = item;
      } else {
        // Any other item (different height or blank text) breaks the chain.
        lastItemOfMostUsedHeight = undefined;
      }
      page = item.page;
    }

    const mostUsedDistanceKey = getMostUsedKey(distanceToOccurrence);
    const mostUsedDistance = mostUsedDistanceKey === undefined ? NaN : parseInt(mostUsedDistanceKey, 10);

    // TODO: font-to-format detection is not ported yet; the previous implementation is kept for reference.
    // const fontIdToName = [];
    // const fontToFormats = new Map();
    // this.fontMap.forEach(function (value, key) {
    //   fontIdToName.push(key + ' = ' + value.name);
    //   const fontName = value.name.toLowerCase();
    //   var format;
    //   if (key == mostUsedFont) {
    //     format = null;
    //   } else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
    //     format = WordFormat.BOLD_OBLIQUE;
    //   } else if (fontName.includes('bold')) {
    //     format = WordFormat.BOLD;
    //   } else if (fontName.includes('oblique') || fontName.includes('italic')) {
    //     format = WordFormat.OBLIQUE;
    //   } else if (fontName === maxHeightFont) {
    //     format = WordFormat.BOLD;
    //   }
    //   if (format) {
    //     fontToFormats.set(key, format.name);
    //   }
    // });
    // fontIdToName.sort();

    return {
      items: items,
      globals: {
        mostUsedHeight: mostUsedHeight,
        mostUsedFont: mostUsedFont,
        mostUsedDistance: mostUsedDistance,
        maxHeight: maxHeight,
        maxHeightFont: maxHeightFont,
        // fontToFormats: fontToFormats,
      },
      messages: [
        'Items per height: ' + JSON.stringify(heightToOccurrence),
        'Items per font: ' + JSON.stringify(fontToOccurrence),
        'Items per distance: ' + JSON.stringify(distanceToOccurrence),
        // 'Fonts:' + JSON.stringify(fontIdToName),
      ],
    };
  }
}
|
||||
|
||||
/**
 * Returns the key with the highest occurrence count.
 *
 * Ties are resolved in favour of the key that `Object.keys` yields first.
 *
 * @param keyToOccurrence - Object mapping a key to the number of times it was seen.
 * @returns The most frequent key, or `undefined` when the object has no own enumerable keys.
 */
function getMostUsedKey(keyToOccurrence: Record<string, number>): string | undefined {
  let maxOccurrence = 0;
  let maxKey: string | undefined;
  for (const key of Object.keys(keyToOccurrence)) {
    const occurrence = keyToOccurrence[key];
    // Test for `undefined` instead of truthiness: an empty-string key is a valid winner
    // and must not be treated as "nothing selected yet".
    if (maxKey === undefined || occurrence > maxOccurrence) {
      maxOccurrence = occurrence;
      maxKey = key;
    }
  }
  return maxKey;
}
|
Loading…
Reference in New Issue
Block a user