mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-21 10:08:03 +02:00
Outsource annotation definitions
This commit is contained in:
parent
996e5fae62
commit
1ca9fa4362
@ -7,3 +7,18 @@ export default class Annotation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export const ADDED_ANNOTATION = new Annotation({
|
||||||
|
category: 'Added',
|
||||||
|
color: 'green'
|
||||||
|
});
|
||||||
|
|
||||||
|
export const REMOVED_ANNOTATION = new Annotation({
|
||||||
|
category: 'Removed',
|
||||||
|
color: 'red'
|
||||||
|
});
|
||||||
|
|
||||||
|
export const UNCHANGED_ANNOTATION = new Annotation({
|
||||||
|
category: 'Unchanged',
|
||||||
|
color: 'brown'
|
||||||
|
})
|
||||||
|
@ -2,7 +2,7 @@ import Transformation from './Transformation.jsx';
|
|||||||
import TextItem from '../TextItem.jsx';
|
import TextItem from '../TextItem.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
import ContentView from '../ContentView.jsx';
|
||||||
import Annotation from '../Annotation.jsx';
|
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
function combineTextItems(textItems:TextItem[]) {
|
function combineTextItems(textItems:TextItem[]) {
|
||||||
var numChars = 0;
|
var numChars = 0;
|
||||||
@ -37,11 +37,7 @@ function combineTextItems(textItems:TextItem[]) {
|
|||||||
width: sumWidthWithWhitespaces,
|
width: sumWidthWithWhitespaces,
|
||||||
height: maxHeight,
|
height: maxHeight,
|
||||||
text: combinedText,
|
text: combinedText,
|
||||||
annotation: new Annotation({
|
annotation: ADDED_ANNOTATION
|
||||||
category: 'combined',
|
|
||||||
color: 'green'
|
|
||||||
})
|
|
||||||
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -57,11 +53,6 @@ export default class CombineSameY extends Transformation {
|
|||||||
|
|
||||||
transform(pages:PdfPage[]) {
|
transform(pages:PdfPage[]) {
|
||||||
|
|
||||||
const removedAnnotation = new Annotation({
|
|
||||||
category: 'removed',
|
|
||||||
color: 'red'
|
|
||||||
});
|
|
||||||
|
|
||||||
return pages.map(pdfPage => {
|
return pages.map(pdfPage => {
|
||||||
const newTextItems = [];
|
const newTextItems = [];
|
||||||
var textItemsWithSameY = [];
|
var textItemsWithSameY = [];
|
||||||
@ -72,7 +63,7 @@ export default class CombineSameY extends Transformation {
|
|||||||
} else {
|
} else {
|
||||||
// add removed text-items
|
// add removed text-items
|
||||||
textItemsWithSameY.forEach(textItem => {
|
textItemsWithSameY.forEach(textItem => {
|
||||||
textItem.annotation = removedAnnotation;
|
textItem.annotation = REMOVED_ANNOTATION;
|
||||||
newTextItems.push(textItem);
|
newTextItems.push(textItem);
|
||||||
});
|
});
|
||||||
newTextItems.push(combineTextItems(textItemsWithSameY));
|
newTextItems.push(combineTextItems(textItemsWithSameY));
|
||||||
@ -102,7 +93,7 @@ export default class CombineSameY extends Transformation {
|
|||||||
|
|
||||||
processAnnotations(pages:PdfPage[]) {
|
processAnnotations(pages:PdfPage[]) {
|
||||||
pages.forEach(page => {
|
pages.forEach(page => {
|
||||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||||
});
|
});
|
||||||
return pages;
|
return pages;
|
||||||
|
@ -2,7 +2,7 @@ import Transformation from './Transformation.jsx';
|
|||||||
import TextItem from '../TextItem.jsx';
|
import TextItem from '../TextItem.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
import ContentView from '../ContentView.jsx';
|
||||||
import Annotation from '../Annotation.jsx';
|
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
import { isNumber } from '../../functions.jsx'
|
import { isNumber } from '../../functions.jsx'
|
||||||
|
|
||||||
@ -21,19 +21,14 @@ export default class DetectFootnotes extends Transformation {
|
|||||||
var nextFooterNumber = 1;
|
var nextFooterNumber = 1;
|
||||||
var potentialFootnoteItem;
|
var potentialFootnoteItem;
|
||||||
|
|
||||||
const removedAnnotation = new Annotation({
|
|
||||||
category: 'removed',
|
|
||||||
color: 'red'
|
|
||||||
});
|
|
||||||
|
|
||||||
return pages.map(page => {
|
return pages.map(page => {
|
||||||
const newTextItems = [];
|
const newTextItems = [];
|
||||||
for (var i = 0; i < page.textItems.length; i++) {
|
for (var i = 0; i < page.textItems.length; i++) {
|
||||||
const item = page.textItems[i];
|
const item = page.textItems[i];
|
||||||
if (potentialFootnoteItem) {
|
if (potentialFootnoteItem) {
|
||||||
if (potentialFootnoteItem.y - item.y < item.height) {
|
if (potentialFootnoteItem.y - item.y < item.height) {
|
||||||
potentialFootnoteItem.annotation = removedAnnotation;
|
potentialFootnoteItem.annotation = REMOVED_ANNOTATION;
|
||||||
item.annotation = removedAnnotation;
|
item.annotation = REMOVED_ANNOTATION;
|
||||||
newTextItems.push(potentialFootnoteItem);
|
newTextItems.push(potentialFootnoteItem);
|
||||||
newTextItems.push(item);
|
newTextItems.push(item);
|
||||||
newTextItems.push(new TextItem({
|
newTextItems.push(new TextItem({
|
||||||
@ -42,10 +37,7 @@ export default class DetectFootnotes extends Transformation {
|
|||||||
width: potentialFootnoteItem.width + item.width,
|
width: potentialFootnoteItem.width + item.width,
|
||||||
height: item.height,
|
height: item.height,
|
||||||
text: '[' + potentialFootnoteItem.text + '] ' + item.text,
|
text: '[' + potentialFootnoteItem.text + '] ' + item.text,
|
||||||
annotation: new Annotation({
|
annotation: ADDED_ANNOTATION
|
||||||
category: 'footnote',
|
|
||||||
color: 'green'
|
|
||||||
})
|
|
||||||
}));
|
}));
|
||||||
//TODO respect multiline!!
|
//TODO respect multiline!!
|
||||||
nextFooterNumber++;
|
nextFooterNumber++;
|
||||||
@ -66,7 +58,7 @@ export default class DetectFootnotes extends Transformation {
|
|||||||
|
|
||||||
processAnnotations(pages:PdfPage[]) {
|
processAnnotations(pages:PdfPage[]) {
|
||||||
pages.forEach(page => {
|
pages.forEach(page => {
|
||||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||||
});
|
});
|
||||||
return pages;
|
return pages;
|
||||||
|
@ -3,7 +3,7 @@ import TextItem from '../TextItem.jsx';
|
|||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
import ContentView from '../ContentView.jsx';
|
||||||
|
|
||||||
import Annotation from '../Annotation.jsx';
|
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
export default class DetectLinks extends Transformation {
|
export default class DetectLinks extends Transformation {
|
||||||
|
|
||||||
@ -16,15 +16,6 @@ export default class DetectLinks extends Transformation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
transform(pages:PdfPage[]) {
|
transform(pages:PdfPage[]) {
|
||||||
const addedAnnotation = new Annotation({
|
|
||||||
category: 'added',
|
|
||||||
color: 'green'
|
|
||||||
});
|
|
||||||
const removedAnnotation = new Annotation({
|
|
||||||
category: 'removed',
|
|
||||||
color: 'red'
|
|
||||||
});
|
|
||||||
|
|
||||||
pages.forEach(page => {
|
pages.forEach(page => {
|
||||||
const newTextItems = [];
|
const newTextItems = [];
|
||||||
page.textItems.forEach(item => {
|
page.textItems.forEach(item => {
|
||||||
@ -47,9 +38,9 @@ export default class DetectLinks extends Transformation {
|
|||||||
newTextItems.push(new TextItem({
|
newTextItems.push(new TextItem({
|
||||||
...item,
|
...item,
|
||||||
text: changedWords.join(' '),
|
text: changedWords.join(' '),
|
||||||
annotation: addedAnnotation,
|
annotation: ADDED_ANNOTATION,
|
||||||
}));
|
}));
|
||||||
item.annotation = removedAnnotation;
|
item.annotation = REMOVED_ANNOTATION;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
page.textItems = newTextItems;
|
page.textItems = newTextItems;
|
||||||
@ -59,7 +50,7 @@ export default class DetectLinks extends Transformation {
|
|||||||
|
|
||||||
processAnnotations(pages:PdfPage[]) {
|
processAnnotations(pages:PdfPage[]) {
|
||||||
pages.forEach(page => {
|
pages.forEach(page => {
|
||||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||||
});
|
});
|
||||||
return pages;
|
return pages;
|
||||||
|
@ -132,7 +132,7 @@ export default class HeadlineDetector extends Transformation {
|
|||||||
...item,
|
...item,
|
||||||
text: item.text,
|
text: item.text,
|
||||||
annotation: new Annotation({
|
annotation: new Annotation({
|
||||||
category: "Headline " + headlineLevel,
|
category: "Headline-" + headlineLevel,
|
||||||
color: 'green'
|
color: 'green'
|
||||||
}),
|
}),
|
||||||
markdownElement: new Headline({
|
markdownElement: new Headline({
|
||||||
|
@ -2,7 +2,7 @@ import Transformation from './Transformation.jsx';
|
|||||||
import TextItem from '../TextItem.jsx';
|
import TextItem from '../TextItem.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
import ContentView from '../ContentView.jsx';
|
||||||
import Annotation from '../Annotation.jsx';
|
import { ADDED_ANNOTATION, REMOVED_ANNOTATION, UNCHANGED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
import { hasUpperCaseCharacterInMiddleOfWord } from '../../functions.jsx'
|
import { hasUpperCaseCharacterInMiddleOfWord } from '../../functions.jsx'
|
||||||
|
|
||||||
@ -27,24 +27,15 @@ export default class HeadlineToUppercase extends Transformation {
|
|||||||
if (item.markdownElement && item.markdownElement.constructor.name === 'Headline') {
|
if (item.markdownElement && item.markdownElement.constructor.name === 'Headline') {
|
||||||
const headline = item.text.trim();
|
const headline = item.text.trim();
|
||||||
if (hasUpperCaseCharacterInMiddleOfWord(headline)) {
|
if (hasUpperCaseCharacterInMiddleOfWord(headline)) {
|
||||||
item.annotation = new Annotation({
|
item.annotation = REMOVED_ANNOTATION;
|
||||||
category: 'removed',
|
|
||||||
color: 'red'
|
|
||||||
});
|
|
||||||
newTextItems.push(item);
|
newTextItems.push(item);
|
||||||
newTextItems.push(new TextItem({
|
newTextItems.push(new TextItem({
|
||||||
...item,
|
...item,
|
||||||
text: item.text.toUpperCase(),
|
text: item.text.toUpperCase(),
|
||||||
annotation: new Annotation({
|
annotation: ADDED_ANNOTATION
|
||||||
category: "Uppercased",
|
|
||||||
color: 'green'
|
|
||||||
})
|
|
||||||
}));
|
}));
|
||||||
} else {
|
} else {
|
||||||
item.annotation = new Annotation({
|
item.annotation = UNCHANGED_ANNOTATION;
|
||||||
category: 'Untouched',
|
|
||||||
color: 'brown'
|
|
||||||
});
|
|
||||||
newTextItems.push(item);
|
newTextItems.push(item);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -60,7 +51,7 @@ export default class HeadlineToUppercase extends Transformation {
|
|||||||
|
|
||||||
processAnnotations(pages:PdfPage[]) {
|
processAnnotations(pages:PdfPage[]) {
|
||||||
pages.forEach(page => {
|
pages.forEach(page => {
|
||||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||||
});
|
});
|
||||||
return pages;
|
return pages;
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import Transformation from './Transformation.jsx';
|
import Transformation from './Transformation.jsx';
|
||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
import ContentView from '../ContentView.jsx';
|
||||||
import Annotation from '../Annotation.jsx';
|
import { REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
import { isDigit } from '../../functions.jsx'
|
import { isDigit } from '../../functions.jsx'
|
||||||
|
|
||||||
@ -51,10 +51,7 @@ export default class RemoveRepetitiveElements extends Transformation {
|
|||||||
var combinedCoordinates = combineCoordinates(textItem);
|
var combinedCoordinates = combineCoordinates(textItem);
|
||||||
if (repetitionCounts[combinedCoordinates] > 1) {
|
if (repetitionCounts[combinedCoordinates] > 1) {
|
||||||
// console.debug("page " + pdfPage.index + " removed :" + repetitionCounts[combinedCoordinates] + " :" + textItem.text);
|
// console.debug("page " + pdfPage.index + " removed :" + repetitionCounts[combinedCoordinates] + " :" + textItem.text);
|
||||||
textItem.annotation = new Annotation({
|
textItem.annotation = REMOVED_ANNOTATION;
|
||||||
category: 'removed',
|
|
||||||
color: 'red'
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -63,7 +60,7 @@ export default class RemoveRepetitiveElements extends Transformation {
|
|||||||
|
|
||||||
processAnnotations(pages:PdfPage[]) {
|
processAnnotations(pages:PdfPage[]) {
|
||||||
pages.forEach(page => {
|
pages.forEach(page => {
|
||||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||||
});
|
});
|
||||||
return pages;
|
return pages;
|
||||||
}
|
}
|
||||||
|
@ -3,7 +3,7 @@ import TextItem from '../TextItem.jsx';
|
|||||||
import PdfPage from '../PdfPage.jsx';
|
import PdfPage from '../PdfPage.jsx';
|
||||||
import ContentView from '../ContentView.jsx';
|
import ContentView from '../ContentView.jsx';
|
||||||
|
|
||||||
import Annotation from '../Annotation.jsx';
|
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||||
|
|
||||||
export default class RemoveWhitespaces extends Transformation {
|
export default class RemoveWhitespaces extends Transformation {
|
||||||
|
|
||||||
@ -16,15 +16,6 @@ export default class RemoveWhitespaces extends Transformation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
transform(pages:PdfPage[]) {
|
transform(pages:PdfPage[]) {
|
||||||
const addedAnnotation = new Annotation({
|
|
||||||
category: 'added',
|
|
||||||
color: 'green'
|
|
||||||
});
|
|
||||||
const removedAnnotation = new Annotation({
|
|
||||||
category: 'removed',
|
|
||||||
color: 'red'
|
|
||||||
});
|
|
||||||
|
|
||||||
pages.forEach(page => {
|
pages.forEach(page => {
|
||||||
const newTextItems = [];
|
const newTextItems = [];
|
||||||
page.textItems.forEach(item => {
|
page.textItems.forEach(item => {
|
||||||
@ -43,9 +34,9 @@ export default class RemoveWhitespaces extends Transformation {
|
|||||||
newTextItems.push(new TextItem({
|
newTextItems.push(new TextItem({
|
||||||
...item,
|
...item,
|
||||||
text: changedWords.join(' '),
|
text: changedWords.join(' '),
|
||||||
annotation: addedAnnotation,
|
annotation: ADDED_ANNOTATION,
|
||||||
}));
|
}));
|
||||||
item.annotation = removedAnnotation;
|
item.annotation = REMOVED_ANNOTATION;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
page.textItems = newTextItems;
|
page.textItems = newTextItems;
|
||||||
@ -55,7 +46,7 @@ export default class RemoveWhitespaces extends Transformation {
|
|||||||
|
|
||||||
processAnnotations(pages:PdfPage[]) {
|
processAnnotations(pages:PdfPage[]) {
|
||||||
pages.forEach(page => {
|
pages.forEach(page => {
|
||||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||||
});
|
});
|
||||||
return pages;
|
return pages;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user