Detect Links & Remove Whitespaces

This commit is contained in:
Johannes Zillmann 2017-02-11 15:23:01 +01:00
parent fc0aafebdd
commit 996e5fae62
3 changed files with 136 additions and 0 deletions

View File

@ -3,7 +3,9 @@ import { Enum } from 'enumify';
import NoOp from './transformations/NoOp.jsx';
import RoundCoordinates from './transformations/RoundCoordinates.jsx';
import CombineSameY from './transformations/CombineSameY.jsx';
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
import DetectLinks from './transformations/DetectLinks.jsx'
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
@ -22,7 +24,9 @@ export default class AppState {
new NoOp,
new RoundCoordinates(),
new CombineSameY(),
new RemoveWhitespaces(),
new DetectFootnotes(),
new DetectLinks(),
new RemoveRepetitiveElements(),
new HeadlineDetector(),
new HeadlineToUppercase(),

View File

@ -0,0 +1,68 @@
import Transformation from './Transformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ContentView from '../ContentView.jsx';
import Annotation from '../Annotation.jsx';
/**
 * Transformation that rewrites bare URLs found in text items as markdown
 * links of the form `[url](url)`.
 *
 * A word (text split on single spaces) is treated as a URL when it starts
 * with `http:`, `https:` or `www.`; `www.` words get an `http://` prefix.
 */
export default class DetectLinks extends Transformation {

    constructor() {
        super("Detect Links");
    }

    // This transformation reports ContentView.PDF as its content view.
    contentView() {
        return ContentView.PDF;
    }

    /**
     * For every text item containing at least one URL-like word, appends a
     * new TextItem (annotated as 'added') holding the linkified text directly
     * after the original item, and annotates the original as 'removed'.
     * Items without URL-like words are passed through untouched.
     *
     * @param pages the pages whose `textItems` are rewritten in place
     * @returns the same `pages` array
     */
    transform(pages:PdfPage[]) {
        const addedAnnotation = new Annotation({
            category: 'added',
            color: 'green'
        });
        const removedAnnotation = new Annotation({
            category: 'removed',
            color: 'red'
        });

        pages.forEach(page => {
            const newTextItems = [];
            page.textItems.forEach(item => {
                // The original item is always kept; it is only marked as
                // 'removed' further down when a replacement gets created.
                newTextItems.push(item);

                let change = false;
                const changedWords = item.text.split(' ').map(word => {
                    // Accept both plain and secure http URLs (the https
                    // prefix used to be missed and left as plain text).
                    if (word.startsWith('http:') || word.startsWith('https:')) {
                        change = true;
                        return `[${word}](${word})`;
                    }
                    // Scheme-less web addresses get an explicit http:// scheme.
                    if (word.startsWith('www.')) {
                        change = true;
                        return `[http://${word}](http://${word})`;
                    }
                    return word;
                });

                if (change) {
                    newTextItems.push(new TextItem({
                        ...item,
                        text: changedWords.join(' '),
                        annotation: addedAnnotation,
                    }));
                    // Set after the spread above so the copy does not carry
                    // the 'removed' marker.
                    item.annotation = removedAnnotation;
                }
            });
            page.textItems = newTextItems;
        });
        return pages;
    }

    /**
     * Drops every text item whose annotation category is 'removed' and resets
     * the annotation of all remaining items to null.
     *
     * @param pages the pages whose `textItems` are filtered in place
     * @returns the same `pages` array
     */
    processAnnotations(pages:PdfPage[]) {
        pages.forEach(page => {
            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
            page.textItems.forEach(textItem => textItem.annotation = null)
        });
        return pages;
    }

}

View File

@ -0,0 +1,64 @@
import Transformation from './Transformation.jsx';
import TextItem from '../TextItem.jsx';
import PdfPage from '../PdfPage.jsx';
import ContentView from '../ContentView.jsx';
import Annotation from '../Annotation.jsx';
/**
 * Transformation that collapses runs of consecutive spaces inside a text
 * item into single spaces.
 */
export default class RemoveWhitespaces extends Transformation {

    constructor() {
        super("Remove Whitespaces");
    }

    // This transformation reports ContentView.PDF as its content view.
    contentView() {
        return ContentView.PDF;
    }

    /**
     * Splits each item's trimmed text on single spaces. When that yields at
     * least one empty piece (i.e. the trimmed text contained consecutive
     * spaces), a cleaned copy annotated as 'added' is inserted right after
     * the original item and the original is annotated as 'removed'.
     *
     * @param pages the pages whose `textItems` are rewritten in place
     * @returns the same `pages` array
     */
    transform(pages:PdfPage[]) {
        const addedAnnotation = new Annotation({ category: 'added', color: 'green' });
        const removedAnnotation = new Annotation({ category: 'removed', color: 'red' });

        for (const page of pages) {
            const collected = [];
            for (const original of page.textItems) {
                collected.push(original);

                const pieces = original.text.trim().split(' ');
                const nonEmptyPieces = pieces.filter(piece => piece.length !== 0);

                // Nothing was dropped => no doubled spaces => leave item as is.
                if (nonEmptyPieces.length === pieces.length) {
                    continue;
                }

                const cleaned = new TextItem({
                    ...original,
                    text: nonEmptyPieces.join(' '),
                    annotation: addedAnnotation,
                });
                collected.push(cleaned);
                // Mark the original only after it has been copied.
                original.annotation = removedAnnotation;
            }
            page.textItems = collected;
        }
        return pages;
    }

    /**
     * Discards items annotated with category 'removed' and sets the
     * annotation of every remaining item to null.
     *
     * @param pages the pages whose `textItems` are filtered in place
     * @returns the same `pages` array
     */
    processAnnotations(pages:PdfPage[]) {
        const isRemoved = textItem => Boolean(textItem.annotation && textItem.annotation.category === 'removed');

        for (const page of pages) {
            page.textItems = page.textItems.filter(textItem => !isRemoved(textItem));
            for (const textItem of page.textItems) {
                textItem.annotation = null;
            }
        }
        return pages;
    }

}