Files
pdf-to-markdown/oldSrc/javascript/models/transformations/textitem/CalculateGlobalStats.jsx
2024-03-26 10:52:54 -06:00

121 lines
4.5 KiB
JavaScript

import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import WordFormat from '../../markdown/WordFormat.jsx';
export default class CalculateGlobalStats extends ToTextItemTransformation {
constructor(fontMap) {
super("Calculate Statistics");
this.fontMap = fontMap;
}
transform(parseResult:ParseResult) {
// Parse heights
const heightToOccurrence = {};
const fontToOccurrence = {};
var maxHeight = 0;
var maxHeightFont;
parseResult.pages.forEach(page => {
page.items.forEach(item => {
heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1;
fontToOccurrence[item.font] = fontToOccurrence[item.font] ? fontToOccurrence[item.font] + 1 : 1;
if (item.height > maxHeight) {
maxHeight = item.height;
maxHeightFont = item.font;
}
});
});
const mostUsedHeight = parseInt(getMostUsedKey(heightToOccurrence));
const mostUsedFont = getMostUsedKey(fontToOccurrence);
// Parse line distances
const distanceToOccurrence = {};
parseResult.pages.forEach(page => {
var lastItemOfMostUsedHeight;
page.items.forEach(item => {
if (item.height == mostUsedHeight && item.text.trim().length > 0) {
if (lastItemOfMostUsedHeight && item.y != lastItemOfMostUsedHeight.y) {
const distance = lastItemOfMostUsedHeight.y - item.y;
if (distance > 0) {
distanceToOccurrence[distance] = distanceToOccurrence[distance] ? distanceToOccurrence[distance] + 1 : 1;
}
}
lastItemOfMostUsedHeight = item;
} else {
lastItemOfMostUsedHeight = null;
}
});
});
const mostUsedDistance = parseInt(getMostUsedKey(distanceToOccurrence));
const fontIdToName = [];
const fontToFormats = new Map();
this.fontMap.forEach(function(value, key) {
fontIdToName.push(key + " = " + value.name)
const fontName = value.name.toLowerCase();
var format;
if (key == mostUsedFont) {
format = null;
} else if (fontName.includes('bold') && (fontName.includes('oblique') || fontName.includes('italic'))) {
format = WordFormat.BOLD_OBLIQUE;
} else if (fontName.includes('bold')) {
format = WordFormat.BOLD;
} else if (fontName.includes('oblique') || fontName.includes('italic')) {
format = WordFormat.OBLIQUE;
} else if (fontName === maxHeightFont) {
format = WordFormat.BOLD;
}
if (format) {
fontToFormats.set(key, format.name);
}
});
fontIdToName.sort();
//Make a copy of the originals so all following transformation don't modify them
const newPages = parseResult.pages.map(page => {
return {
...page,
items: page.items.map(textItem => {
return {
...textItem,
}
})
};
});
return new ParseResult({
...parseResult,
pages: newPages,
globals: {
mostUsedHeight: mostUsedHeight,
mostUsedFont: mostUsedFont,
mostUsedDistance: mostUsedDistance,
maxHeight: maxHeight,
maxHeightFont: maxHeightFont,
fontToFormats: fontToFormats
},
messages: [
'Items per height: ' + JSON.stringify(heightToOccurrence),
'Items per font: ' + JSON.stringify(fontToOccurrence),
'Items per distance: ' + JSON.stringify(distanceToOccurrence),
'Fonts:' + JSON.stringify(fontIdToName)
]
});
}
}
function getMostUsedKey(keyToOccurrence) {
var maxOccurence = 0;
var maxKey;
Object.keys(keyToOccurrence).map((element) => {
if (!maxKey || keyToOccurrence[element] > maxOccurence) {
maxOccurence = keyToOccurrence[element];
maxKey = element;
}
});
return maxKey;
}