mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-25 03:51:33 +02:00
Detect Links & Remove Whitespaces
This commit is contained in:
parent
fc0aafebdd
commit
996e5fae62
@ -3,7 +3,9 @@ import { Enum } from 'enumify';
|
|||||||
import NoOp from './transformations/NoOp.jsx';
|
import NoOp from './transformations/NoOp.jsx';
|
||||||
import RoundCoordinates from './transformations/RoundCoordinates.jsx';
|
import RoundCoordinates from './transformations/RoundCoordinates.jsx';
|
||||||
import CombineSameY from './transformations/CombineSameY.jsx';
|
import CombineSameY from './transformations/CombineSameY.jsx';
|
||||||
|
import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
||||||
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
import DetectFootnotes from './transformations/DetectFootnotes.jsx'
|
||||||
|
import DetectLinks from './transformations/DetectLinks.jsx'
|
||||||
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
|
||||||
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
||||||
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||||
@ -22,7 +24,9 @@ export default class AppState {
|
|||||||
new NoOp,
|
new NoOp,
|
||||||
new RoundCoordinates(),
|
new RoundCoordinates(),
|
||||||
new CombineSameY(),
|
new CombineSameY(),
|
||||||
|
new RemoveWhitespaces(),
|
||||||
new DetectFootnotes(),
|
new DetectFootnotes(),
|
||||||
|
new DetectLinks(),
|
||||||
new RemoveRepetitiveElements(),
|
new RemoveRepetitiveElements(),
|
||||||
new HeadlineDetector(),
|
new HeadlineDetector(),
|
||||||
new HeadlineToUppercase(),
|
new HeadlineToUppercase(),
|
||||||
|
68
src/javascript/models/transformations/DetectLinks.jsx
Normal file
68
src/javascript/models/transformations/DetectLinks.jsx
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
import Transformation from './Transformation.jsx';
|
||||||
|
import TextItem from '../TextItem.jsx';
|
||||||
|
import PdfPage from '../PdfPage.jsx';
|
||||||
|
import ContentView from '../ContentView.jsx';
|
||||||
|
|
||||||
|
import Annotation from '../Annotation.jsx';
|
||||||
|
|
||||||
|
export default class DetectLinks extends Transformation {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super("Detect Links");
|
||||||
|
}
|
||||||
|
|
||||||
|
contentView() {
|
||||||
|
return ContentView.PDF;
|
||||||
|
}
|
||||||
|
|
||||||
|
transform(pages:PdfPage[]) {
|
||||||
|
const addedAnnotation = new Annotation({
|
||||||
|
category: 'added',
|
||||||
|
color: 'green'
|
||||||
|
});
|
||||||
|
const removedAnnotation = new Annotation({
|
||||||
|
category: 'removed',
|
||||||
|
color: 'red'
|
||||||
|
});
|
||||||
|
|
||||||
|
pages.forEach(page => {
|
||||||
|
const newTextItems = [];
|
||||||
|
page.textItems.forEach(item => {
|
||||||
|
newTextItems.push(item);
|
||||||
|
var words = item.text.split(' ');
|
||||||
|
var changedWords = [];
|
||||||
|
var change = false;
|
||||||
|
words.forEach(word => {
|
||||||
|
if (word.startsWith('http:')) {
|
||||||
|
changedWords.push(`[${word}](${word})`);
|
||||||
|
change = true;
|
||||||
|
} else if (word.startsWith('www.')) {
|
||||||
|
changedWords.push(`[http://${word}](http://${word})`);
|
||||||
|
change = true;
|
||||||
|
} else {
|
||||||
|
changedWords.push(word);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (change) {
|
||||||
|
newTextItems.push(new TextItem({
|
||||||
|
...item,
|
||||||
|
text: changedWords.join(' '),
|
||||||
|
annotation: addedAnnotation,
|
||||||
|
}));
|
||||||
|
item.annotation = removedAnnotation;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
page.textItems = newTextItems;
|
||||||
|
});
|
||||||
|
return pages;
|
||||||
|
}
|
||||||
|
|
||||||
|
processAnnotations(pages:PdfPage[]) {
|
||||||
|
pages.forEach(page => {
|
||||||
|
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||||
|
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||||
|
});
|
||||||
|
return pages;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
64
src/javascript/models/transformations/RemoveWhitespaces.jsx
Normal file
64
src/javascript/models/transformations/RemoveWhitespaces.jsx
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
import Transformation from './Transformation.jsx';
|
||||||
|
import TextItem from '../TextItem.jsx';
|
||||||
|
import PdfPage from '../PdfPage.jsx';
|
||||||
|
import ContentView from '../ContentView.jsx';
|
||||||
|
|
||||||
|
import Annotation from '../Annotation.jsx';
|
||||||
|
|
||||||
|
export default class RemoveWhitespaces extends Transformation {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super("Remove Whitespaces");
|
||||||
|
}
|
||||||
|
|
||||||
|
contentView() {
|
||||||
|
return ContentView.PDF;
|
||||||
|
}
|
||||||
|
|
||||||
|
transform(pages:PdfPage[]) {
|
||||||
|
const addedAnnotation = new Annotation({
|
||||||
|
category: 'added',
|
||||||
|
color: 'green'
|
||||||
|
});
|
||||||
|
const removedAnnotation = new Annotation({
|
||||||
|
category: 'removed',
|
||||||
|
color: 'red'
|
||||||
|
});
|
||||||
|
|
||||||
|
pages.forEach(page => {
|
||||||
|
const newTextItems = [];
|
||||||
|
page.textItems.forEach(item => {
|
||||||
|
newTextItems.push(item);
|
||||||
|
var words = item.text.trim().split(' ');
|
||||||
|
var changedWords = [];
|
||||||
|
var change = false;
|
||||||
|
words.forEach(word => {
|
||||||
|
if (word.length == 0) {
|
||||||
|
change = true;
|
||||||
|
} else {
|
||||||
|
changedWords.push(word);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (change) {
|
||||||
|
newTextItems.push(new TextItem({
|
||||||
|
...item,
|
||||||
|
text: changedWords.join(' '),
|
||||||
|
annotation: addedAnnotation,
|
||||||
|
}));
|
||||||
|
item.annotation = removedAnnotation;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
page.textItems = newTextItems;
|
||||||
|
});
|
||||||
|
return pages;
|
||||||
|
}
|
||||||
|
|
||||||
|
processAnnotations(pages:PdfPage[]) {
|
||||||
|
pages.forEach(page => {
|
||||||
|
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
|
||||||
|
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||||
|
});
|
||||||
|
return pages;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user