mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-24 16:54:12 +01:00
CombineSameY => Switch to annotation strategy
This commit is contained in:
parent
201753a2e0
commit
df07968c4d
@ -2,6 +2,7 @@ import Transformation from './Transformation.jsx';
|
|||||||
import TextItem from '../TextItem.jsx';
|
import TextItem from '../TextItem.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
import ContentView from '../ContentView.jsx';
|
||||||
|
import Annotation from '../Annotation.jsx';
|
||||||
|
|
||||||
export default class CombineSameYTransformation extends Transformation {
|
export default class CombineSameYTransformation extends Transformation {
|
||||||
|
|
||||||
@ -14,6 +15,16 @@ export default class CombineSameYTransformation extends Transformation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
transform(pages:PdfPage[]) {
|
transform(pages:PdfPage[]) {
|
||||||
|
|
||||||
|
const removedAnnotation = new Annotation({
|
||||||
|
category: 'removed',
|
||||||
|
color: 'red'
|
||||||
|
});
|
||||||
|
const combinedAnnotation = new Annotation({
|
||||||
|
category: 'combined',
|
||||||
|
color: 'green'
|
||||||
|
});
|
||||||
|
|
||||||
return pages.map(pdfPage => {
|
return pages.map(pdfPage => {
|
||||||
const newTextItems = [];
|
const newTextItems = [];
|
||||||
var lastTextItem;
|
var lastTextItem;
|
||||||
@ -21,12 +32,14 @@ export default class CombineSameYTransformation extends Transformation {
|
|||||||
if (!lastTextItem) {
|
if (!lastTextItem) {
|
||||||
lastTextItem = textItem;
|
lastTextItem = textItem;
|
||||||
} else {
|
} else {
|
||||||
if (textItem.y == lastTextItem.y) {
|
if (textItem.y == lastTextItem.y) { //combine
|
||||||
//combine
|
|
||||||
|
|
||||||
// console.debug("last=" + lastTextItem.text + ", x=" + lastTextItem.x + ", width=" + lastTextItem.width);
|
if (!lastTextItem.annotation) {
|
||||||
// console.debug("new=" + textItem.text + ", x=" + textItem.x + ", width=" + textItem.width);
|
lastTextItem.annotation = removedAnnotation;
|
||||||
// console.debug("diff=" + (textItem.x - lastTextItem.x - lastTextItem.width));
|
newTextItems.push(lastTextItem);
|
||||||
|
}
|
||||||
|
textItem.annotation = removedAnnotation;
|
||||||
|
newTextItems.push(textItem);
|
||||||
|
|
||||||
var combinedText = lastTextItem.text;
|
var combinedText = lastTextItem.text;
|
||||||
//TODO make 5 dependent on text size or biggest gap?
|
//TODO make 5 dependent on text size or biggest gap?
|
||||||
@ -40,10 +53,10 @@ export default class CombineSameYTransformation extends Transformation {
|
|||||||
y: lastTextItem.y,
|
y: lastTextItem.y,
|
||||||
width: textItem.x - lastTextItem.x + textItem.width,
|
width: textItem.x - lastTextItem.x + textItem.width,
|
||||||
height: lastTextItem.height, //might this cause problems ?
|
height: lastTextItem.height, //might this cause problems ?
|
||||||
text: combinedText
|
text: combinedText,
|
||||||
|
annotation: combinedAnnotation
|
||||||
});
|
});
|
||||||
} else {
|
} else { //rotate
|
||||||
//rotate
|
|
||||||
newTextItems.push(lastTextItem);
|
newTextItems.push(lastTextItem);
|
||||||
lastTextItem = textItem;
|
lastTextItem = textItem;
|
||||||
}
|
}
|
||||||
@ -60,4 +73,12 @@ export default class CombineSameYTransformation extends Transformation {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
processAnnotations(pages:PdfPage[]) {
|
||||||
|
pages.forEach(page => {
|
||||||
|
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||||
|
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||||
|
});
|
||||||
|
return pages;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -1,7 +1,7 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import Transformation from './Transformation.jsx';
|
||||||
import Annotation from '../Annotation.jsx';
|
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
import ContentView from '../ContentView.jsx';
|
||||||
|
import Annotation from '../Annotation.jsx';
|
||||||
|
|
||||||
|
|
||||||
function hashCodeIgnoringNumbers(string) {
|
function hashCodeIgnoringNumbers(string) {
|
||||||
|
Loading…
Reference in New Issue
Block a user