Detect Links & Remove Whitespaces

This commit is contained in:
Johannes Zillmann 2017-02-11 15:23:01 +01:00
parent fc0aafebdd
commit 996e5fae62
3 changed files with 136 additions and 0 deletions

View File

@ -3,7 +3,9 @@ import { Enum } from 'enumify';
import NoOp from './transformations/NoOp.jsx'; import NoOp from './transformations/NoOp.jsx';
import RoundCoordinates from './transformations/RoundCoordinates.jsx'; import RoundCoordinates from './transformations/RoundCoordinates.jsx';
import CombineSameY from './transformations/CombineSameY.jsx'; import CombineSameY from './transformations/CombineSameY.jsx';
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx' import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import DetectLinks from './transformations/DetectLinks.jsx'
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx' import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import HeadlineDetector from './transformations/HeadlineDetector.jsx' import HeadlineDetector from './transformations/HeadlineDetector.jsx'
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx' import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
@ -22,7 +24,9 @@ export default class AppState {
new NoOp, new NoOp,
new RoundCoordinates(), new RoundCoordinates(),
new CombineSameY(), new CombineSameY(),
new RemoveWhitespaces(),
new DetectFootnotes(), new DetectFootnotes(),
new DetectLinks(),
new RemoveRepetitiveElements(), new RemoveRepetitiveElements(),
new HeadlineDetector(), new HeadlineDetector(),
new HeadlineToUppercase(), new HeadlineToUppercase(),

View File

@ -0,0 +1,68 @@
import Transformation from './Transformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ContentView from '../ContentView.jsx';
import Annotation from '../Annotation.jsx';
/**
 * Transformation that rewrites bare URLs found in text items as markdown links.
 *
 * Each text item's text is split on single spaces and every word is inspected:
 *  - words starting with `http:` or `https:` become `[word](word)`
 *  - words starting with `www.` become `[http://word](http://word)`
 *
 * When at least one word of an item was rewritten, the original item is kept
 * in place but flagged with a 'removed' annotation, and a copy carrying the
 * rewritten text and an 'added' annotation is inserted directly after it.
 */
export default class DetectLinks extends Transformation {

    constructor() {
        super("Detect Links");
    }

    contentView() {
        return ContentView.PDF;
    }

    /**
     * Wraps link-like words of every text item in markdown link syntax.
     *
     * @param pages the pages whose `textItems` are scanned; each page's
     *              `textItems` array is replaced, and replaced items get
     *              their `annotation` overwritten
     * @returns the same `pages` array that was passed in
     */
    transform(pages:PdfPage[]) {
        const addedAnnotation = new Annotation({
            category: 'added',
            color: 'green'
        });
        const removedAnnotation = new Annotation({
            category: 'removed',
            color: 'red'
        });

        pages.forEach(page => {
            const newTextItems = [];
            page.textItems.forEach(item => {
                // The original item always stays first so the added copy follows it.
                newTextItems.push(item);

                let change = false;
                const changedWords = item.text.split(' ').map(word => {
                    // Accept both plain and TLS URLs; previously only 'http:' was
                    // recognised, so 'https://…' words were left untouched.
                    if (word.startsWith('http:') || word.startsWith('https:')) {
                        change = true;
                        return `[${word}](${word})`;
                    }
                    // Scheme-less 'www.' hosts get an explicit http:// prefix.
                    if (word.startsWith('www.')) {
                        change = true;
                        return `[http://${word}](http://${word})`;
                    }
                    return word;
                });

                if (change) {
                    newTextItems.push(new TextItem({
                        ...item,
                        text: changedWords.join(' '),
                        annotation: addedAnnotation,
                    }));
                    item.annotation = removedAnnotation;
                }
            });
            page.textItems = newTextItems;
        });
        return pages;
    }

    /**
     * Applies the annotations set by `transform`: items whose annotation
     * category is 'removed' are dropped, and the annotation of every
     * remaining item is reset to `null`.
     *
     * @param pages the pages to clean up (modified in place)
     * @returns the same `pages` array that was passed in
     */
    processAnnotations(pages:PdfPage[]) {
        pages.forEach(page => {
            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
            page.textItems.forEach(textItem => textItem.annotation = null)
        });
        return pages;
    }

}

View File

@ -0,0 +1,64 @@
import Transformation from './Transformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ContentView from '../ContentView.jsx';
import Annotation from '../Annotation.jsx';
/**
 * Transformation that collapses runs of spaces inside text items.
 *
 * An item's trimmed text is split on single spaces; empty fragments (which
 * appear wherever two or more spaces were adjacent) are discarded. Only when
 * at least one empty fragment was found is the item replaced: the original
 * is flagged 'removed' and a copy with the re-joined text is flagged 'added'.
 */
export default class RemoveWhitespaces extends Transformation {

    constructor() {
        super("Remove Whitespaces");
    }

    contentView() {
        return ContentView.PDF;
    }

    /**
     * Produces, for every page, a new `textItems` list in which items that
     * contained consecutive spaces are followed by a cleaned-up copy.
     *
     * @param pages the pages to process (their `textItems` are replaced)
     * @returns the same `pages` array that was passed in
     */
    transform(pages:PdfPage[]) {
        const markAdded = new Annotation({
            category: 'added',
            color: 'green'
        });
        const markRemoved = new Annotation({
            category: 'removed',
            color: 'red'
        });

        for (const page of pages) {
            const rebuilt = [];
            for (const item of page.textItems) {
                rebuilt.push(item);

                const fragments = item.text.trim().split(' ');
                const nonEmpty = fragments.filter(fragment => fragment.length != 0);

                // No fragment was dropped, so there is nothing to collapse.
                if (nonEmpty.length === fragments.length) {
                    continue;
                }

                const cleaned = new TextItem({
                    ...item,
                    text: nonEmpty.join(' '),
                    annotation: markAdded,
                });
                rebuilt.push(cleaned);
                item.annotation = markRemoved;
            }
            page.textItems = rebuilt;
        }
        return pages;
    }

    /**
     * Finalises the result of `transform`: discards every item annotated with
     * category 'removed' and sets the annotation of the survivors to `null`.
     *
     * @param pages the pages to clean up (modified in place)
     * @returns the same `pages` array that was passed in
     */
    processAnnotations(pages:PdfPage[]) {
        for (const page of pages) {
            const survivors = page.textItems.filter(textItem => {
                const flaggedRemoved = textItem.annotation && textItem.annotation.category === 'removed';
                return !flaggedRemoved;
            });
            for (const textItem of survivors) {
                textItem.annotation = null;
            }
            page.textItems = survivors;
        }
        return pages;
    }

}