mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-16 18:00:46 +01:00
Calculate most used distance
* round coordinates on construction
This commit is contained in:
parent
b7393fc806
commit
a92e384249
@ -78,10 +78,10 @@ export default class LoadingView extends React.Component {
|
|||||||
|
|
||||||
const style = textContent.styles[item.fontName];
|
const style = textContent.styles[item.fontName];
|
||||||
return new TextItem({
|
return new TextItem({
|
||||||
x: item.transform[4],
|
x: Math.round(item.transform[4]),
|
||||||
y: item.transform[5],
|
y: Math.round(item.transform[5]),
|
||||||
width: item.width,
|
width: Math.round(item.width),
|
||||||
height: dividedHeight <= 1 ? item.height : dividedHeight,
|
height: Math.round(dividedHeight <= 1 ? item.height : dividedHeight),
|
||||||
text: item.str,
|
text: item.str,
|
||||||
font: item.fontName,
|
font: item.fontName,
|
||||||
fontAscent: style.ascent,
|
fontAscent: style.ascent,
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
import { Enum } from 'enumify';
|
import { Enum } from 'enumify';
|
||||||
|
|
||||||
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
|
||||||
import RoundCoordinates from './transformations/RoundCoordinates.jsx';
|
|
||||||
import DetectFormats from './transformations/DetectFormats.jsx'
|
import DetectFormats from './transformations/DetectFormats.jsx'
|
||||||
import CombineSameY from './transformations/CombineSameY.jsx';
|
import CombineSameY from './transformations/CombineSameY.jsx';
|
||||||
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
||||||
@ -23,7 +22,6 @@ export default class AppState {
|
|||||||
this.pdfPages = [];
|
this.pdfPages = [];
|
||||||
this.transformations = [
|
this.transformations = [
|
||||||
new CalculateGlobalStats(),
|
new CalculateGlobalStats(),
|
||||||
new RoundCoordinates(),
|
|
||||||
new DetectFormats(),
|
new DetectFormats(),
|
||||||
new CombineSameY(),
|
new CombineSameY(),
|
||||||
new RemoveWhitespaces(),
|
new RemoveWhitespaces(),
|
||||||
|
@ -17,6 +17,9 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
|||||||
<li>
|
<li>
|
||||||
{ 'Most-used font: ' + parseResult.globals.mostUsedFont + ' ' }
|
{ 'Most-used font: ' + parseResult.globals.mostUsedFont + ' ' }
|
||||||
</li>
|
</li>
|
||||||
|
<li>
|
||||||
|
{ 'Most-used distance: ' + parseResult.globals.mostUsedDistance + ' ' }
|
||||||
|
</li>
|
||||||
<li>
|
<li>
|
||||||
{ 'Max height: ' + parseResult.globals.maxHeight + ' ' }
|
{ 'Max height: ' + parseResult.globals.maxHeight + ' ' }
|
||||||
</li>
|
</li>
|
||||||
@ -30,11 +33,16 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
|||||||
<li>
|
<li>
|
||||||
{ 'Items per font: ' + JSON.stringify(parseResult.summary.fontToOccurrence) + ' ' }
|
{ 'Items per font: ' + JSON.stringify(parseResult.summary.fontToOccurrence) + ' ' }
|
||||||
</li>
|
</li>
|
||||||
|
<li>
|
||||||
|
{ 'Items per distance: ' + JSON.stringify(parseResult.summary.distanceToOccurrence) + ' ' }
|
||||||
|
</li>
|
||||||
</ul>
|
</ul>
|
||||||
</div>;
|
</div>;
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
|
|
||||||
|
// Parse heights
|
||||||
const heightToOccurrence = {};
|
const heightToOccurrence = {};
|
||||||
const fontToOccurrence = {};
|
const fontToOccurrence = {};
|
||||||
var maxHeight = 0;
|
var maxHeight = 0;
|
||||||
@ -51,15 +59,39 @@ export default class CalculateGlobalStats extends ToPdfViewTransformation {
|
|||||||
});
|
});
|
||||||
const mostUsedHeight = parseInt(getMostUsedKey(heightToOccurrence));
|
const mostUsedHeight = parseInt(getMostUsedKey(heightToOccurrence));
|
||||||
const mostUsedFont = getMostUsedKey(fontToOccurrence);
|
const mostUsedFont = getMostUsedKey(fontToOccurrence);
|
||||||
|
|
||||||
|
// Parse line distances
|
||||||
|
const distanceToOccurrence = {};
|
||||||
|
parseResult.content.forEach(page => {
|
||||||
|
var lastItemOfMostUsedHeight;
|
||||||
|
page.textItems.forEach(item => {
|
||||||
|
if (item.height == mostUsedHeight) {
|
||||||
|
if (lastItemOfMostUsedHeight && item.y != lastItemOfMostUsedHeight.y) {
|
||||||
|
const distance = lastItemOfMostUsedHeight.y - item.y;
|
||||||
|
if (distance > 0) {
|
||||||
|
distanceToOccurrence[distance] = distanceToOccurrence[distance] ? distanceToOccurrence[distance] + 1 : 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lastItemOfMostUsedHeight = item;
|
||||||
|
} else {
|
||||||
|
lastItemOfMostUsedHeight = null;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
const mostUsedDistance = parseInt(getMostUsedKey(distanceToOccurrence));
|
||||||
|
|
||||||
|
|
||||||
parseResult.globals = {
|
parseResult.globals = {
|
||||||
mostUsedHeight: mostUsedHeight,
|
mostUsedHeight: mostUsedHeight,
|
||||||
mostUsedFont: mostUsedFont,
|
mostUsedFont: mostUsedFont,
|
||||||
|
mostUsedDistance: mostUsedDistance,
|
||||||
maxHeight: maxHeight,
|
maxHeight: maxHeight,
|
||||||
maxHeightFont: maxHeightFont
|
maxHeightFont: maxHeightFont
|
||||||
}
|
}
|
||||||
parseResult.summary = {
|
parseResult.summary = {
|
||||||
heightToOccurrence: heightToOccurrence,
|
heightToOccurrence: heightToOccurrence,
|
||||||
fontToOccurrence: fontToOccurrence
|
fontToOccurrence: fontToOccurrence,
|
||||||
|
distanceToOccurrence: distanceToOccurrence,
|
||||||
}
|
}
|
||||||
return parseResult;
|
return parseResult;
|
||||||
}
|
}
|
||||||
|
@ -1,31 +0,0 @@
|
|||||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
|
||||||
import ParseResult from '../ParseResult.jsx';
|
|
||||||
|
|
||||||
export default class RoundCoordinates extends ToPdfViewTransformation {
|
|
||||||
|
|
||||||
constructor() {
|
|
||||||
super("Round Coordinates");
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
|
||||||
const newContent = parseResult.content.map(pdfPage => {
|
|
||||||
return {
|
|
||||||
...pdfPage,
|
|
||||||
textItems: pdfPage.textItems.map(textItem => {
|
|
||||||
return {
|
|
||||||
...textItem,
|
|
||||||
x: Math.round(textItem.x),
|
|
||||||
y: Math.round(textItem.y),
|
|
||||||
width: Math.round(textItem.width),
|
|
||||||
height: Math.round(textItem.height)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
};
|
|
||||||
});
|
|
||||||
return new ParseResult({
|
|
||||||
...parseResult,
|
|
||||||
content: newContent,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
Loading…
Reference in New Issue
Block a user