mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-22 07:43:46 +01:00
Calculate most used distance
* round coordinates on construction
This commit is contained in:
parent
b7393fc806
commit
a92e384249
@ -78,10 +78,10 @@ export default class LoadingView extends React.Component {
|
||||
|
||||
const style = textContent.styles[item.fontName];
|
||||
return new TextItem({
|
||||
x: item.transform[4],
|
||||
y: item.transform[5],
|
||||
width: item.width,
|
||||
height: dividedHeight <= 1 ? item.height : dividedHeight,
|
||||
x: Math.round(item.transform[4]),
|
||||
y: Math.round(item.transform[5]),
|
||||
width: Math.round(item.width),
|
||||
height: Math.round(dividedHeight <= 1 ? item.height : dividedHeight),
|
||||
text: item.str,
|
||||
font: item.fontName,
|
||||
fontAscent: style.ascent,
|
||||
|
@ -1,7 +1,6 @@
|
||||
import { Enum } from 'enumify';
|
||||
|
||||
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
||||
import RoundCoordinates from './transformations/RoundCoordinates.jsx';
|
||||
import DetectFormats from './transformations/DetectFormats.jsx'
|
||||
import CombineSameY from './transformations/CombineSameY.jsx';
|
||||
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
||||
@ -23,7 +22,6 @@ export default class AppState {
|
||||
this.pdfPages = [];
|
||||
this.transformations = [
|
||||
new CalculateGlobalStats(),
|
||||
new RoundCoordinates(),
|
||||
new DetectFormats(),
|
||||
new CombineSameY(),
|
||||
new RemoveWhitespaces(),
|
||||
|
@ -17,6 +17,9 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
||||
<li>
|
||||
{ 'Most-used font: ' + parseResult.globals.mostUsedFont + ' ' }
|
||||
</li>
|
||||
<li>
|
||||
{ 'Most-used distance: ' + parseResult.globals.mostUsedDistance + ' ' }
|
||||
</li>
|
||||
<li>
|
||||
{ 'Max height: ' + parseResult.globals.maxHeight + ' ' }
|
||||
</li>
|
||||
@ -30,11 +33,16 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
||||
<li>
|
||||
{ 'Items per font: ' + JSON.stringify(parseResult.summary.fontToOccurrence) + ' ' }
|
||||
</li>
|
||||
<li>
|
||||
{ 'Items per distance: ' + JSON.stringify(parseResult.summary.distanceToOccurrence) + ' ' }
|
||||
</li>
|
||||
</ul>
|
||||
</div>;
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
|
||||
// Parse heights
|
||||
const heightToOccurrence = {};
|
||||
const fontToOccurrence = {};
|
||||
var maxHeight = 0;
|
||||
@ -51,15 +59,39 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
||||
});
|
||||
const mostUsedHeight = parseInt(getMostUsedKey(heightToOccurrence));
|
||||
const mostUsedFont = getMostUsedKey(fontToOccurrence);
|
||||
|
||||
// Parse line distances
|
||||
const distanceToOccurrence = {};
|
||||
parseResult.content.forEach(page => {
|
||||
var lastItemOfMostUsedHeight;
|
||||
page.textItems.forEach(item => {
|
||||
if (item.height == mostUsedHeight) {
|
||||
if (lastItemOfMostUsedHeight && item.y != lastItemOfMostUsedHeight.y) {
|
||||
const distance = lastItemOfMostUsedHeight.y - item.y;
|
||||
if (distance > 0) {
|
||||
distanceToOccurrence[distance] = distanceToOccurrence[distance] ? distanceToOccurrence[distance] + 1 : 1;
|
||||
}
|
||||
}
|
||||
lastItemOfMostUsedHeight = item;
|
||||
} else {
|
||||
lastItemOfMostUsedHeight = null;
|
||||
}
|
||||
});
|
||||
});
|
||||
const mostUsedDistance = parseInt(getMostUsedKey(distanceToOccurrence));
|
||||
|
||||
|
||||
parseResult.globals = {
|
||||
mostUsedHeight: mostUsedHeight,
|
||||
mostUsedFont: mostUsedFont,
|
||||
mostUsedDistance: mostUsedDistance,
|
||||
maxHeight: maxHeight,
|
||||
maxHeightFont: maxHeightFont
|
||||
}
|
||||
parseResult.summary = {
|
||||
heightToOccurrence: heightToOccurrence,
|
||||
fontToOccurrence: fontToOccurrence
|
||||
fontToOccurrence: fontToOccurrence,
|
||||
distanceToOccurrence: distanceToOccurrence,
|
||||
}
|
||||
return parseResult;
|
||||
}
|
||||
|
@ -1,31 +0,0 @@
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
|
||||
export default class RoundCoordinates extends ToPdfViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Round Coordinates");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const newContent = parseResult.content.map(pdfPage => {
|
||||
return {
|
||||
...pdfPage,
|
||||
textItems: pdfPage.textItems.map(textItem => {
|
||||
return {
|
||||
...textItem,
|
||||
x: Math.round(textItem.x),
|
||||
y: Math.round(textItem.y),
|
||||
width: Math.round(textItem.width),
|
||||
height: Math.round(textItem.height)
|
||||
}
|
||||
})
|
||||
};
|
||||
});
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
content: newContent,
|
||||
});
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user