mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-25 01:03:59 +01:00
add combine-Y transformation
This commit is contained in:
parent
92a904a0a6
commit
9ae32e02b5
@ -15,7 +15,6 @@ export default class PdfView extends React.Component {
|
|||||||
constructor(props) {
|
constructor(props) {
|
||||||
super(props);
|
super(props);
|
||||||
this.state = {
|
this.state = {
|
||||||
transformations: this.props.transformations,
|
|
||||||
currentTransformation: 0,
|
currentTransformation: 0,
|
||||||
pageNr: -1
|
pageNr: -1
|
||||||
};
|
};
|
||||||
@ -29,25 +28,21 @@ export default class PdfView extends React.Component {
|
|||||||
}
|
}
|
||||||
|
|
||||||
nextTransformation() {
|
nextTransformation() {
|
||||||
console.debug("nextTransformation");
|
|
||||||
this.setState({
|
this.setState({
|
||||||
currentTransformation: this.state.currentTransformation + 1
|
currentTransformation: this.state.currentTransformation + 1
|
||||||
});
|
});
|
||||||
console.debug(this.state.currentTransformation);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
prevTransformation() {
|
prevTransformation() {
|
||||||
console.debug("prevTransformation");
|
|
||||||
this.setState({
|
this.setState({
|
||||||
currentTransformation: this.state.currentTransformation - 1
|
currentTransformation: this.state.currentTransformation - 1
|
||||||
});
|
});
|
||||||
console.debug(this.state.currentTransformation);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
render() {
|
render() {
|
||||||
const {transformations, currentTransformation, pageNr} = this.state;
|
const {currentTransformation, pageNr} = this.state;
|
||||||
const {pdfPages} = this.props;
|
const {pdfPages, transformations} = this.props;
|
||||||
|
|
||||||
const header = "Parsed " + pdfPages.length + " pages!"
|
const header = "Parsed " + pdfPages.length + " pages!"
|
||||||
|
|
||||||
@ -65,21 +60,14 @@ export default class PdfView extends React.Component {
|
|||||||
{ '==>' }
|
{ '==>' }
|
||||||
</a>;
|
</a>;
|
||||||
|
|
||||||
|
const transformedPdfPages = pdfPages.filter((elem, i) => pageNr == -1 || i == pageNr).map(pdfPage => {
|
||||||
//TODO only transform selected page ?
|
|
||||||
const transformedPdfPages = pdfPages.map(pdfPage => {
|
|
||||||
for (var i = 0; i <= currentTransformation; i++) {
|
for (var i = 0; i <= currentTransformation; i++) {
|
||||||
pdfPage = transformations[i].transform(pdfPage);
|
pdfPage = transformations[i].transform(pdfPage);
|
||||||
}
|
}
|
||||||
return pdfPage;
|
return pdfPage;
|
||||||
});
|
});
|
||||||
|
|
||||||
var pageComponents;
|
var pageComponents = transformedPdfPages.map(page => <PdfPageView key={ page.index } pdfPage={ page } />);
|
||||||
if (pageNr >= 0) {
|
|
||||||
pageComponents = <PdfPageView key={ pageNr } pdfPage={ transformedPdfPages[pageNr] } />;
|
|
||||||
} else {
|
|
||||||
pageComponents = transformedPdfPages.map((page) => <PdfPageView key={ page.index } pdfPage={ page } />);
|
|
||||||
}
|
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div>
|
<div>
|
||||||
|
@ -4,7 +4,8 @@ import { pdfToTextItemsAsync } from '../functions/pdfToTextItems.jsx'
|
|||||||
import PdfPage from './PdfPage.jsx';
|
import PdfPage from './PdfPage.jsx';
|
||||||
|
|
||||||
import NoOpTransformation from './transformations/NoOpTransformation.jsx';
|
import NoOpTransformation from './transformations/NoOpTransformation.jsx';
|
||||||
import RoundYTransformation from './transformations/RoundYTransformation.jsx';
|
import RoundCoordinatesTransformation from './transformations/RoundCoordinatesTransformation.jsx';
|
||||||
|
import CombineSameYTransformation from './transformations/CombineSameYTransformation.jsx';
|
||||||
|
|
||||||
// Holds the state of the Application
|
// Holds the state of the Application
|
||||||
export default class AppState {
|
export default class AppState {
|
||||||
@ -15,7 +16,7 @@ export default class AppState {
|
|||||||
this.pagesToUpload = 0;
|
this.pagesToUpload = 0;
|
||||||
this.uploadedPages = 0;
|
this.uploadedPages = 0;
|
||||||
this.pdfPages = [];
|
this.pdfPages = [];
|
||||||
this.transformations = [new NoOpTransformation(), new RoundYTransformation()];
|
this.transformations = [new NoOpTransformation(), new RoundCoordinatesTransformation(), new CombineSameYTransformation()];
|
||||||
|
|
||||||
//bind functions
|
//bind functions
|
||||||
this.render = this.render.bind(this);
|
this.render = this.render.bind(this);
|
||||||
|
@ -0,0 +1,56 @@
|
|||||||
|
import Transformation from './Transformation.jsx';
|
||||||
|
import TextItem from '../TextItem.jsx';
|
||||||
|
|
||||||
|
// Horizontal gap (same units as the text items' x / width) above which a
// space is inserted between two items that are merged onto one line.
// TODO make this dependent on text size or biggest gap?
const MIN_GAP_FOR_SPACE = 7;

/**
 * Transformation that merges consecutive text items sharing the same y
 * coordinate into a single text item.
 *
 * Only directly neighbouring items (in the order of `pdfPage.textItems`) are
 * compared; items with equal y that are separated by an item with a
 * different y are not merged.
 */
export default class CombineSameYTransformation extends Transformation {

    constructor() {
        super("Combine text on same Y");
    }

    /**
     * Returns a shallow copy of the page whose `textItems` have been merged
     * line by line. The given page and its items are not mutated.
     *
     * @param {PdfPage} pdfPage - page whose `textItems` each expose
     *     `x`, `y`, `width`, `height` and `text`.
     * @returns {Object} copy of `pdfPage` with `textItems` replaced by the
     *     combined items.
     */
    transform(pdfPage) {
        const newTextItems = [];
        // Item currently being accumulated; flushed whenever y changes.
        let lastTextItem;

        pdfPage.textItems.forEach(textItem => {
            if (!lastTextItem) {
                lastTextItem = textItem;
                return;
            }

            if (textItem.y !== lastTextItem.y) {
                // Line changed: emit the accumulated item and start a new one.
                newTextItems.push(lastTextItem);
                lastTextItem = textItem;
                return;
            }

            // Same line: append the text, separated by a blank when the
            // horizontal gap between the two items is large enough.
            const gap = textItem.x - lastTextItem.x - lastTextItem.width;
            const separator = gap > MIN_GAP_FOR_SPACE ? ' ' : '';

            lastTextItem = new TextItem({
                x: lastTextItem.x,
                y: lastTextItem.y,
                // Span from the start of the first item to the end of the new one.
                width: textItem.x - lastTextItem.x + textItem.width,
                // NOTE(review): the height of the first item on the line wins;
                // items with differing heights may need Math.max here - confirm.
                height: lastTextItem.height,
                text: lastTextItem.text + separator + textItem.text
            });
        });

        // Flush the last accumulated item (absent only for an empty page).
        if (lastTextItem) {
            newTextItems.push(lastTextItem);
        }

        return {
            ...pdfPage,
            textItems: newTextItems
        };
    }

}
|
@ -1,9 +1,9 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import Transformation from './Transformation.jsx';
|
||||||
|
|
||||||
export default class RoundYTransformation extends Transformation {
|
export default class RoundCoordinatesTransformation extends Transformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Round all Y");
|
super("Round coordinates");
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(pdfPage:PdfPage) {
|
transform(pdfPage:PdfPage) {
|
||||||
@ -12,7 +12,10 @@ export default class RoundYTransformation extends Transformation {
|
|||||||
textItems: pdfPage.textItems.map(textItem => {
|
textItems: pdfPage.textItems.map(textItem => {
|
||||||
return {
|
return {
|
||||||
...textItem,
|
...textItem,
|
||||||
y: Math.round(textItem.y)
|
x: Math.round(textItem.x),
|
||||||
|
y: Math.round(textItem.y),
|
||||||
|
width: Math.round(textItem.width),
|
||||||
|
height: Math.round(textItem.height)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
};
|
};
|
Loading…
Reference in New Issue
Block a user