mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 23:33:31 +01:00
add combine-Y transformation
This commit is contained in:
parent
92a904a0a6
commit
9ae32e02b5
@ -15,7 +15,6 @@ export default class PdfView extends React.Component {
|
||||
constructor(props) {
|
||||
super(props);
|
||||
this.state = {
|
||||
transformations: this.props.transformations,
|
||||
currentTransformation: 0,
|
||||
pageNr: -1
|
||||
};
|
||||
@ -29,25 +28,21 @@ export default class PdfView extends React.Component {
|
||||
}
|
||||
|
||||
nextTransformation() {
|
||||
console.debug("nextTransformation");
|
||||
this.setState({
|
||||
currentTransformation: this.state.currentTransformation + 1
|
||||
});
|
||||
console.debug(this.state.currentTransformation);
|
||||
}
|
||||
|
||||
prevTransformation() {
|
||||
console.debug("prevTransformation");
|
||||
this.setState({
|
||||
currentTransformation: this.state.currentTransformation - 1
|
||||
});
|
||||
console.debug(this.state.currentTransformation);
|
||||
}
|
||||
|
||||
|
||||
render() {
|
||||
const {transformations, currentTransformation, pageNr} = this.state;
|
||||
const {pdfPages} = this.props;
|
||||
const {currentTransformation, pageNr} = this.state;
|
||||
const {pdfPages, transformations} = this.props;
|
||||
|
||||
const header = "Parsed " + pdfPages.length + " pages!"
|
||||
|
||||
@ -65,21 +60,14 @@ export default class PdfView extends React.Component {
|
||||
{ '==>' }
|
||||
</a>;
|
||||
|
||||
|
||||
//TODO only transform selected page ?
|
||||
const transformedPdfPages = pdfPages.map(pdfPage => {
|
||||
const transformedPdfPages = pdfPages.filter((elem, i) => pageNr == -1 || i == pageNr).map(pdfPage => {
|
||||
for (var i = 0; i <= currentTransformation; i++) {
|
||||
pdfPage = transformations[i].transform(pdfPage);
|
||||
}
|
||||
return pdfPage;
|
||||
});
|
||||
|
||||
var pageComponents;
|
||||
if (pageNr >= 0) {
|
||||
pageComponents = <PdfPageView key={ pageNr } pdfPage={ transformedPdfPages[pageNr] } />;
|
||||
} else {
|
||||
pageComponents = transformedPdfPages.map((page) => <PdfPageView key={ page.index } pdfPage={ page } />);
|
||||
}
|
||||
var pageComponents = transformedPdfPages.map(page => <PdfPageView key={ page.index } pdfPage={ page } />);
|
||||
|
||||
return (
|
||||
<div>
|
||||
|
@ -4,7 +4,8 @@ import { pdfToTextItemsAsync } from '../functions/pdfToTextItems.jsx'
|
||||
import PdfPage from './PdfPage.jsx';
|
||||
|
||||
import NoOpTransformation from './transformations/NoOpTransformation.jsx';
|
||||
import RoundYTransformation from './transformations/RoundYTransformation.jsx';
|
||||
import RoundCoordinatesTransformation from './transformations/RoundCoordinatesTransformation.jsx';
|
||||
import CombineSameYTransformation from './transformations/CombineSameYTransformation.jsx';
|
||||
|
||||
// Holds the state of the Application
|
||||
export default class AppState {
|
||||
@ -15,7 +16,7 @@ export default class AppState {
|
||||
this.pagesToUpload = 0;
|
||||
this.uploadedPages = 0;
|
||||
this.pdfPages = [];
|
||||
this.transformations = [new NoOpTransformation(), new RoundYTransformation()];
|
||||
this.transformations = [new NoOpTransformation(), new RoundCoordinatesTransformation(), new CombineSameYTransformation()];
|
||||
|
||||
//bind functions
|
||||
this.render = this.render.bind(this);
|
||||
|
@ -0,0 +1,56 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
|
||||
export default class CombineSameYTransformation extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("Combine text on same Y");
|
||||
}
|
||||
|
||||
transform(pdfPage:PdfPage) {
|
||||
|
||||
const newTextItems = [];
|
||||
var lastTextItem;
|
||||
pdfPage.textItems.forEach(textItem => {
|
||||
if (!lastTextItem) {
|
||||
lastTextItem = textItem;
|
||||
} else {
|
||||
if (textItem.y == lastTextItem.y) {
|
||||
//combine
|
||||
|
||||
console.debug("last=" + lastTextItem.text + ", x=" + lastTextItem.x + ", width=" + lastTextItem.width);
|
||||
console.debug("new=" + textItem.text + ", x=" + textItem.x + ", width=" + textItem.width);
|
||||
console.debug("diff=" + (textItem.x - lastTextItem.x - lastTextItem.width));
|
||||
|
||||
var combinedText = lastTextItem.text;
|
||||
//TODO make 5 dependent on text size or biggest gap?
|
||||
if (textItem.x - lastTextItem.x - lastTextItem.width > 7) {
|
||||
combinedText += ' ';
|
||||
}
|
||||
combinedText += textItem.text;
|
||||
|
||||
lastTextItem = new TextItem({
|
||||
x: lastTextItem.x,
|
||||
y: lastTextItem.y,
|
||||
width: textItem.x - lastTextItem.x + textItem.width,
|
||||
height: lastTextItem.height, //might this cause problems ?
|
||||
text: combinedText
|
||||
});
|
||||
} else {
|
||||
//rotate
|
||||
newTextItems.push(lastTextItem);
|
||||
lastTextItem = textItem;
|
||||
}
|
||||
}
|
||||
});
|
||||
if (lastTextItem) {
|
||||
newTextItems.push(lastTextItem);
|
||||
}
|
||||
|
||||
return {
|
||||
...pdfPage,
|
||||
textItems: newTextItems
|
||||
};
|
||||
}
|
||||
|
||||
}
|
@ -1,9 +1,9 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
|
||||
export default class RoundYTransformation extends Transformation {
|
||||
export default class RoundCoordinatesTransformation extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("Round all Y");
|
||||
super("Round coordinates");
|
||||
}
|
||||
|
||||
transform(pdfPage:PdfPage) {
|
||||
@ -12,7 +12,10 @@ export default class RoundYTransformation extends Transformation {
|
||||
textItems: pdfPage.textItems.map(textItem => {
|
||||
return {
|
||||
...textItem,
|
||||
y: Math.round(textItem.y)
|
||||
x: Math.round(textItem.x),
|
||||
y: Math.round(textItem.y),
|
||||
width: Math.round(textItem.width),
|
||||
height: Math.round(textItem.height)
|
||||
}
|
||||
})
|
||||
};
|
Loading…
Reference in New Issue
Block a user