mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 23:33:31 +01:00
Detect Links & Remove Whitespaces
This commit is contained in:
parent
fc0aafebdd
commit
996e5fae62
@ -3,7 +3,9 @@ import { Enum } from 'enumify';
|
||||
import NoOp from './transformations/NoOp.jsx';
|
||||
import RoundCoordinates from './transformations/RoundCoordinates.jsx';
|
||||
import CombineSameY from './transformations/CombineSameY.jsx';
|
||||
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||
import DetectLinks from './transformations/DetectLinks.jsx'
|
||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
||||
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||
@ -22,7 +24,9 @@ export default class AppState {
|
||||
new NoOp,
|
||||
new RoundCoordinates(),
|
||||
new CombineSameY(),
|
||||
new RemoveWhitespaces(),
|
||||
new DetectFootnotes(),
|
||||
new DetectLinks(),
|
||||
new RemoveRepetitiveElements(),
|
||||
new HeadlineDetector(),
|
||||
new HeadlineToUppercase(),
|
||||
|
68
src/javascript/models/transformations/DetectLinks.jsx
Normal file
68
src/javascript/models/transformations/DetectLinks.jsx
Normal file
@ -0,0 +1,68 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
import Annotation from '../Annotation.jsx';
|
||||
|
||||
export default class DetectLinks extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Links");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
const addedAnnotation = new Annotation({
|
||||
category: 'added',
|
||||
color: 'green'
|
||||
});
|
||||
const removedAnnotation = new Annotation({
|
||||
category: 'removed',
|
||||
color: 'red'
|
||||
});
|
||||
|
||||
pages.forEach(page => {
|
||||
const newTextItems = [];
|
||||
page.textItems.forEach(item => {
|
||||
newTextItems.push(item);
|
||||
var words = item.text.split(' ');
|
||||
var changedWords = [];
|
||||
var change = false;
|
||||
words.forEach(word => {
|
||||
if (word.startsWith('http:')) {
|
||||
changedWords.push(`[${word}](${word})`);
|
||||
change = true;
|
||||
} else if (word.startsWith('www.')) {
|
||||
changedWords.push(`[http://${word}](http://${word})`);
|
||||
change = true;
|
||||
} else {
|
||||
changedWords.push(word);
|
||||
}
|
||||
});
|
||||
if (change) {
|
||||
newTextItems.push(new TextItem({
|
||||
...item,
|
||||
text: changedWords.join(' '),
|
||||
annotation: addedAnnotation,
|
||||
}));
|
||||
item.annotation = removedAnnotation;
|
||||
}
|
||||
});
|
||||
page.textItems = newTextItems;
|
||||
});
|
||||
return pages;
|
||||
}
|
||||
|
||||
processAnnotations(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return pages;
|
||||
}
|
||||
|
||||
}
|
64
src/javascript/models/transformations/RemoveWhitespaces.jsx
Normal file
64
src/javascript/models/transformations/RemoveWhitespaces.jsx
Normal file
@ -0,0 +1,64 @@
|
||||
import Transformation from './Transformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import PdfPage from '../PdfPage.jsx';
|
||||
import ContentView from '../ContentView.jsx';
|
||||
|
||||
import Annotation from '../Annotation.jsx';
|
||||
|
||||
export default class RemoveWhitespaces extends Transformation {
|
||||
|
||||
constructor() {
|
||||
super("Remove Whitespaces");
|
||||
}
|
||||
|
||||
contentView() {
|
||||
return ContentView.PDF;
|
||||
}
|
||||
|
||||
transform(pages:PdfPage[]) {
|
||||
const addedAnnotation = new Annotation({
|
||||
category: 'added',
|
||||
color: 'green'
|
||||
});
|
||||
const removedAnnotation = new Annotation({
|
||||
category: 'removed',
|
||||
color: 'red'
|
||||
});
|
||||
|
||||
pages.forEach(page => {
|
||||
const newTextItems = [];
|
||||
page.textItems.forEach(item => {
|
||||
newTextItems.push(item);
|
||||
var words = item.text.trim().split(' ');
|
||||
var changedWords = [];
|
||||
var change = false;
|
||||
words.forEach(word => {
|
||||
if (word.length == 0) {
|
||||
change = true;
|
||||
} else {
|
||||
changedWords.push(word);
|
||||
}
|
||||
});
|
||||
if (change) {
|
||||
newTextItems.push(new TextItem({
|
||||
...item,
|
||||
text: changedWords.join(' '),
|
||||
annotation: addedAnnotation,
|
||||
}));
|
||||
item.annotation = removedAnnotation;
|
||||
}
|
||||
});
|
||||
page.textItems = newTextItems;
|
||||
});
|
||||
return pages;
|
||||
}
|
||||
|
||||
processAnnotations(pages:PdfPage[]) {
|
||||
pages.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return pages;
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user