mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-21 10:08:03 +02:00
Outsource annotation definitions
This commit is contained in:
parent
996e5fae62
commit
1ca9fa4362
@ -7,3 +7,18 @@ export default class Annotation {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
export const ADDED_ANNOTATION = new Annotation({
|
||||
category: 'Added',
|
||||
color: 'green'
|
||||
});
|
||||
|
||||
export const REMOVED_ANNOTATION = new Annotation({
|
||||
category: 'Removed',
|
||||
color: 'red'
|
||||
});
|
||||
|
||||
export const UNCHANGED_ANNOTATION = new Annotation({
|
||||
category: 'Unchanged',
|
||||
color: 'brown'
|
||||
})
|
||||
|
@ -2,7 +2,7 @@ import Transformation from './Transformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
import Annotation from '../Annotation.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
function combineTextItems(textItems:TextItem[]) {
|
||||
var numChars = 0;
|
||||
@ -37,11 +37,7 @@ function combineTextItems(textItems:TextItem[]) {
|
||||
width: sumWidthWithWhitespaces,
|
||||
height: maxHeight,
|
||||
text: combinedText,
|
||||
annotation: new Annotation({
|
||||
category: 'combined',
|
||||
color: 'green'
|
||||
})
|
||||
|
||||
annotation: ADDED_ANNOTATION
|
||||
});
|
||||
}
|
||||
|
||||
@ -57,11 +53,6 @@ export default class CombineSameY extends Transformation {
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
|
||||
const removedAnnotation = new Annotation({
|
||||
category: 'removed',
|
||||
color: 'red'
|
||||
});
|
||||
|
||||
return pages.map(pdfPage => {
|
||||
const newTextItems = [];
|
||||
var textItemsWithSameY = [];
|
||||
@ -72,7 +63,7 @@ export default class CombineSameY extends Transformation {
|
||||
} else {
|
||||
// add removed text-items
|
||||
textItemsWithSameY.forEach(textItem => {
|
||||
textItem.annotation = removedAnnotation;
|
||||
textItem.annotation = REMOVED_ANNOTATION;
|
||||
newTextItems.push(textItem);
|
||||
});
|
||||
newTextItems.push(combineTextItems(textItemsWithSameY));
|
||||
@ -102,7 +93,7 @@ export default class CombineSameY extends Transformation {
|
||||
|
||||
processAnnotations(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return pages;
|
||||
|
@ -2,7 +2,7 @@ import Transformation from './Transformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
import Annotation from '../Annotation.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
import { isNumber } from '../../functions.jsx'
|
||||
|
||||
@ -21,19 +21,14 @@ export default class DetectFootnotes extends Transformation {
|
||||
var nextFooterNumber = 1;
|
||||
var potentialFootnoteItem;
|
||||
|
||||
const removedAnnotation = new Annotation({
|
||||
category: 'removed',
|
||||
color: 'red'
|
||||
});
|
||||
|
||||
return pages.map(page => {
|
||||
const newTextItems = [];
|
||||
for (var i = 0; i < page.textItems.length; i++) {
|
||||
const item = page.textItems[i];
|
||||
if (potentialFootnoteItem) {
|
||||
if (potentialFootnoteItem.y - item.y < item.height) {
|
||||
potentialFootnoteItem.annotation = removedAnnotation;
|
||||
item.annotation = removedAnnotation;
|
||||
potentialFootnoteItem.annotation = REMOVED_ANNOTATION;
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
newTextItems.push(potentialFootnoteItem);
|
||||
newTextItems.push(item);
|
||||
newTextItems.push(new TextItem({
|
||||
@ -42,10 +37,7 @@ export default class DetectFootnotes extends Transformation {
|
||||
width: potentialFootnoteItem.width + item.width,
|
||||
height: item.height,
|
||||
text: '[' + potentialFootnoteItem.text + '] ' + item.text,
|
||||
annotation: new Annotation({
|
||||
category: 'footnote',
|
||||
color: 'green'
|
||||
})
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
//TODO respect multiline!!
|
||||
nextFooterNumber++;
|
||||
@ -66,7 +58,7 @@ export default class DetectFootnotes extends Transformation {
|
||||
|
||||
processAnnotations(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return pages;
|
||||
|
@ -3,7 +3,7 @@ import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
import Annotation from '../Annotation.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
export default class DetectLinks extends Transformation {
|
||||
|
||||
@ -16,15 +16,6 @@ export default class DetectLinks extends Transformation {
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
const addedAnnotation = new Annotation({
|
||||
category: 'added',
|
||||
color: 'green'
|
||||
});
|
||||
const removedAnnotation = new Annotation({
|
||||
category: 'removed',
|
||||
color: 'red'
|
||||
});
|
||||
|
||||
pages.forEach(page => {
|
||||
const newTextItems = [];
|
||||
page.textItems.forEach(item => {
|
||||
@ -47,9 +38,9 @@ export default class DetectLinks extends Transformation {
|
||||
newTextItems.push(new TextItem({
|
||||
...item,
|
||||
text: changedWords.join(' '),
|
||||
annotation: addedAnnotation,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
}));
|
||||
item.annotation = removedAnnotation;
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
}
|
||||
});
|
||||
page.textItems = newTextItems;
|
||||
@ -59,7 +50,7 @@ export default class DetectLinks extends Transformation {
|
||||
|
||||
processAnnotations(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return pages;
|
||||
|
@ -132,7 +132,7 @@ export default class HeadlineDetector extends Transformation {
|
||||
...item,
|
||||
text: item.text,
|
||||
annotation: new Annotation({
|
||||
category: "Headline " + headlineLevel,
|
||||
category: "Headline-" + headlineLevel,
|
||||
color: 'green'
|
||||
}),
|
||||
markdownElement: new Headline({
|
||||
|
@ -2,7 +2,7 @@ import Transformation from './Transformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
import Annotation from '../Annotation.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION, UNCHANGED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
import { hasUpperCaseCharacterInMiddleOfWord } from '../../functions.jsx'
|
||||
|
||||
@ -27,24 +27,15 @@ export default class HeadlineToUppercase extends Transformation {
|
||||
if (item.markdownElement && item.markdownElement.constructor.name === 'Headline') {
|
||||
const headline = item.text.trim();
|
||||
if (hasUpperCaseCharacterInMiddleOfWord(headline)) {
|
||||
item.annotation = new Annotation({
|
||||
category: 'removed',
|
||||
color: 'red'
|
||||
});
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
newTextItems.push(item);
|
||||
newTextItems.push(new TextItem({
|
||||
...item,
|
||||
text: item.text.toUpperCase(),
|
||||
annotation: new Annotation({
|
||||
category: "Uppercased",
|
||||
color: 'green'
|
||||
})
|
||||
annotation: ADDED_ANNOTATION
|
||||
}));
|
||||
} else {
|
||||
item.annotation = new Annotation({
|
||||
category: 'Untouched',
|
||||
color: 'brown'
|
||||
});
|
||||
item.annotation = UNCHANGED_ANNOTATION;
|
||||
newTextItems.push(item);
|
||||
}
|
||||
} else {
|
||||
@ -60,7 +51,7 @@ export default class HeadlineToUppercase extends Transformation {
|
||||
|
||||
processAnnotations(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return pages;
|
||||
|
@ -1,7 +1,7 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
import Annotation from '../Annotation.jsx';
|
||||
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
import { isDigit } from '../../functions.jsx'
|
||||
|
||||
@ -51,10 +51,7 @@ export default class RemoveRepetitiveElements extends Transformation {
|
||||
var combinedCoordinates = combineCoordinates(textItem);
|
||||
if (repetitionCounts[combinedCoordinates] > 1) {
|
||||
// console.debug("page " + pdfPage.index + " removed :" + repetitionCounts[combinedCoordinates] + " :" + textItem.text);
|
||||
textItem.annotation = new Annotation({
|
||||
category: 'removed',
|
||||
color: 'red'
|
||||
});
|
||||
textItem.annotation = REMOVED_ANNOTATION;
|
||||
}
|
||||
});
|
||||
});
|
||||
@ -63,7 +60,7 @@ export default class RemoveRepetitiveElements extends Transformation {
|
||||
|
||||
processAnnotations(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||
});
|
||||
return pages;
|
||||
}
|
||||
|
@ -3,7 +3,7 @@ import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
import Annotation from '../Annotation.jsx';
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
export default class RemoveWhitespaces extends Transformation {
|
||||
|
||||
@ -16,15 +16,6 @@ export default class RemoveWhitespaces extends Transformation {
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
const addedAnnotation = new Annotation({
|
||||
category: 'added',
|
||||
color: 'green'
|
||||
});
|
||||
const removedAnnotation = new Annotation({
|
||||
category: 'removed',
|
||||
color: 'red'
|
||||
});
|
||||
|
||||
pages.forEach(page => {
|
||||
const newTextItems = [];
|
||||
page.textItems.forEach(item => {
|
||||
@ -43,9 +34,9 @@ export default class RemoveWhitespaces extends Transformation {
|
||||
newTextItems.push(new TextItem({
|
||||
...item,
|
||||
text: changedWords.join(' '),
|
||||
annotation: addedAnnotation,
|
||||
annotation: ADDED_ANNOTATION,
|
||||
}));
|
||||
item.annotation = removedAnnotation;
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
}
|
||||
});
|
||||
page.textItems = newTextItems;
|
||||
@ -55,7 +46,7 @@ export default class RemoveWhitespaces extends Transformation {
|
||||
|
||||
processAnnotations(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return pages;
|
||||
|
Loading…
x
Reference in New Issue
Block a user