diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx
index f2d4a39..32fd036 100644
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -3,7 +3,9 @@ import { Enum } from 'enumify';
 import NoOp from './transformations/NoOp.jsx';
 import RoundCoordinates from './transformations/RoundCoordinates.jsx';
 import CombineSameY from './transformations/CombineSameY.jsx';
+import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
 import DetectFootnotes from './transformations/DetectFootnotes.jsx'
+import DetectLinks from './transformations/DetectLinks.jsx'
 import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
 import HeadlineDetector from './transformations/HeadlineDetector.jsx'
 import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
@@ -22,7 +24,9 @@ export default class AppState {
             new NoOp,
             new RoundCoordinates(),
             new CombineSameY(),
+            new RemoveWhitespaces(),
             new DetectFootnotes(),
+            new DetectLinks(),
             new RemoveRepetitiveElements(),
             new HeadlineDetector(),
             new HeadlineToUppercase(),
diff --git a/src/javascript/models/transformations/DetectLinks.jsx b/src/javascript/models/transformations/DetectLinks.jsx
new file mode 100644
--- /dev/null
+++ b/src/javascript/models/transformations/DetectLinks.jsx
@@ -0,0 +1,94 @@
+import Transformation from './Transformation.jsx';
+import TextItem from '../TextItem.jsx';
+import PdfPage from '../PdfPage.jsx';
+import ContentView from '../ContentView.jsx';
+
+import Annotation from '../Annotation.jsx';
+
+/**
+ * Returns `word` wrapped as a Markdown link when it looks like a web address,
+ * otherwise returns `word` unchanged.
+ *
+ * Words starting with `http:` or `https:` link to themselves; words starting
+ * with `www.` get an `http://` prefix in both the label and the target.
+ */
+function toMarkdownLink(word) {
+    if (word.startsWith('http:') || word.startsWith('https:')) {
+        return `[${word}](${word})`;
+    }
+    if (word.startsWith('www.')) {
+        return `[http://${word}](http://${word})`;
+    }
+    return word;
+}
+
+/**
+ * Rewrites web addresses found in text items as Markdown links.
+ */
+export default class DetectLinks extends Transformation {
+
+    constructor() {
+        super("Detect Links");
+    }
+
+    contentView() {
+        return ContentView.PDF;
+    }
+
+    /**
+     * Splits each item's text on single spaces and links every word that
+     * looks like a web address. When at least one word was linked, the
+     * original item is annotated as 'removed' and a rewritten copy,
+     * annotated as 'added', is inserted directly after it.
+     *
+     * @param pages pages to scan; each page's `textItems` array is replaced
+     * @returns the same `pages` array
+     */
+    transform(pages:PdfPage[]) {
+        const addedAnnotation = new Annotation({
+            category: 'added',
+            color: 'green'
+        });
+        const removedAnnotation = new Annotation({
+            category: 'removed',
+            color: 'red'
+        });
+
+        pages.forEach(page => {
+            const newTextItems = [];
+            page.textItems.forEach(item => {
+                newTextItems.push(item);
+                const words = item.text.split(' ');
+                const changedWords = words.map(word => toMarkdownLink(word));
+                // A linked word always differs from its source word.
+                const change = changedWords.some((word, index) => word !== words[index]);
+                if (change) {
+                    newTextItems.push(new TextItem({
+                        ...item,
+                        text: changedWords.join(' '),
+                        annotation: addedAnnotation,
+                    }));
+                    item.annotation = removedAnnotation;
+                }
+            });
+            page.textItems = newTextItems;
+        });
+        return pages;
+    }
+
+    /**
+     * Drops every text item annotated as 'removed' and clears the annotation
+     * of the items that remain.
+     *
+     * @param pages pages; each page's `textItems` array is replaced by the filtered one
+     * @returns the same `pages` array
+     */
+    processAnnotations(pages:PdfPage[]) {
+        pages.forEach(page => {
+            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
+            page.textItems.forEach(textItem => textItem.annotation = null);
+        });
+        return pages;
+    }
+
+}
diff --git a/src/javascript/models/transformations/RemoveWhitespaces.jsx b/src/javascript/models/transformations/RemoveWhitespaces.jsx
new file mode 100644
--- /dev/null
+++ b/src/javascript/models/transformations/RemoveWhitespaces.jsx
@@ -0,0 +1,77 @@
+import Transformation from './Transformation.jsx';
+import TextItem from '../TextItem.jsx';
+import PdfPage from '../PdfPage.jsx';
+import ContentView from '../ContentView.jsx';
+
+import Annotation from '../Annotation.jsx';
+
+/**
+ * Collapses runs of consecutive spaces inside text items to a single space.
+ */
+export default class RemoveWhitespaces extends Transformation {
+
+    constructor() {
+        super("Remove Whitespaces");
+    }
+
+    contentView() {
+        return ContentView.PDF;
+    }
+
+    /**
+     * Trims each item's text and splits it on single spaces; every surplus
+     * space shows up as an empty word. When at least one empty word exists,
+     * the original item is annotated as 'removed' and a copy whose text is
+     * the non-empty words joined by single spaces, annotated as 'added', is
+     * inserted directly after it. Items without empty words are kept as
+     * they are, including any leading or trailing whitespace.
+     *
+     * @param pages pages to scan; each page's `textItems` array is replaced
+     * @returns the same `pages` array
+     */
+    transform(pages:PdfPage[]) {
+        const addedAnnotation = new Annotation({
+            category: 'added',
+            color: 'green'
+        });
+        const removedAnnotation = new Annotation({
+            category: 'removed',
+            color: 'red'
+        });
+
+        pages.forEach(page => {
+            const newTextItems = [];
+            page.textItems.forEach(item => {
+                newTextItems.push(item);
+                const words = item.text.trim().split(' ');
+                const changedWords = words.filter(word => word.length > 0);
+                if (changedWords.length !== words.length) {
+                    newTextItems.push(new TextItem({
+                        ...item,
+                        text: changedWords.join(' '),
+                        annotation: addedAnnotation,
+                    }));
+                    item.annotation = removedAnnotation;
+                }
+            });
+            page.textItems = newTextItems;
+        });
+        return pages;
+    }
+
+    /**
+     * Drops every text item annotated as 'removed' and clears the annotation
+     * of the items that remain.
+     *
+     * @param pages pages; each page's `textItems` array is replaced by the filtered one
+     * @returns the same `pages` array
+     */
+    processAnnotations(pages:PdfPage[]) {
+        pages.forEach(page => {
+            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
+            page.textItems.forEach(textItem => textItem.annotation = null);
+        });
+        return pages;
+    }
+
+}