mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 15:23:26 +01:00
[WIP] Add remove whitespace and detect links again
This commit is contained in:
parent
4600dc6ee7
commit
07e7fbb505
@ -54,7 +54,7 @@ export default class TextItemTable extends React.Component {
|
||||
</div>
|
||||
<div style={ { textAlign: 'center', color: 'orange' } }>
|
||||
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
|
||||
{ textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }
|
||||
{ textItem.parsedElements && textItem.parsedElements.containLinks ? 'Link' : '' }
|
||||
</div>
|
||||
</td>
|
||||
<td>
|
||||
|
@ -4,6 +4,7 @@ import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStat
|
||||
import CompactLines from './transformations/textitem/CompactLines.jsx';
|
||||
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
|
||||
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
||||
import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
|
||||
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
|
||||
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
|
||||
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
||||
@ -11,11 +12,7 @@ import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
||||
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
|
||||
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
|
||||
import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx'
|
||||
// import DetectHeadlines from './transformations/textitemblock/DetectHeadlines.jsx'
|
||||
// import DetectFormats from './transformations/DetectFormats.jsx'
|
||||
// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
||||
// import DetectLinks from './transformations/DetectLinks.jsx'
|
||||
// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
||||
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
|
||||
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
||||
@ -33,6 +30,7 @@ export default class AppState {
|
||||
new CompactLines(),
|
||||
new RemoveRepetitiveElements(),
|
||||
new VerticalToHorizontal(),
|
||||
new PostprocessLines(),
|
||||
new DetectTOC(),
|
||||
new DetectListItems(),
|
||||
new DetectHeaders(),
|
||||
@ -40,12 +38,8 @@ export default class AppState {
|
||||
new GatherBlocks(),
|
||||
new DetectCodeQuoteBlocks(),
|
||||
new DetectListLevels(),
|
||||
// new DetectHeadlines(),
|
||||
|
||||
// new DetectFormats(),
|
||||
// new RemoveWhitespaces(),
|
||||
// new DetectLinks(),
|
||||
// new HeadlineDetector(),
|
||||
// new HeadlineToUppercase(),
|
||||
new ToTextBlocks(),
|
||||
new ToMarkdown()];
|
||||
|
@ -15,13 +15,15 @@ export default class PageItem {
|
||||
/**
 * Bag of elements recognised inside a page item.
 *
 * Fields:
 *  - footnoteLinks: array, never undefined after construction.
 *  - footnotes:     array, never undefined after construction.
 *  - containLinks:  taken from the options as-is (may be undefined) and
 *                   consumed as a truthy/falsy flag.
 *
 * NOTE(review): the element type of both arrays is not visible here;
 * presumably footnote numbers - confirm against the code that fills them.
 */
export class ParsedElements {

    /**
     * @param options optional object with `footnoteLinks`, `footnotes` and
     *                `containLinks`; a missing object or missing arrays fall
     *                back to empty arrays.
     */
    constructor(options) {
        // Tolerate `new ParsedElements()` / `new ParsedElements(null)`.
        const source = options || {};
        this.footnoteLinks = source.footnoteLinks || [];
        this.footnotes = source.footnotes || [];
        this.containLinks = source.containLinks;
    }

    /**
     * Merges another set of parsed elements into this one (in place).
     * Arrays are appended, `containLinks` is OR-ed.
     *
     * @param parsedElements the elements to merge in; a missing value is a no-op
     */
    add(parsedElements) {
        if (!parsedElements) {
            return;
        }
        // `concat(undefined)` would append `undefined` as an element, so
        // missing arrays are replaced by empty ones before concatenating.
        this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks || []);
        this.footnotes = this.footnotes.concat(parsedElements.footnotes || []);
        this.containLinks = this.containLinks || parsedElements.containLinks;
    }

}
|
@ -51,10 +51,6 @@ export default class TextItemLineCompactor {
|
||||
});
|
||||
}
|
||||
combinedItem.parsedElements = parsedElements;
|
||||
|
||||
//TODO whitespace removal
|
||||
//TODO bold/emphasis
|
||||
|
||||
return combinedItem;
|
||||
}
|
||||
|
||||
@ -80,7 +76,7 @@ export default class TextItemLineCompactor {
|
||||
//TODO womb comp [29] => ydiff == 0
|
||||
newLineItems.push(new TextItem({
|
||||
...stashedNumberItems[0],
|
||||
text: `(^${ joinedNumber}):`
|
||||
text: `(^${ joinedNumber}): `
|
||||
}));
|
||||
footnotes.push(joinedNumber);
|
||||
} else {
|
||||
|
@ -1,54 +0,0 @@
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
export default class DetectLinks extends ToPdfViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Links");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
parseResult.content.forEach(page => {
|
||||
const newTextItems = [];
|
||||
page.textItems.forEach(item => {
|
||||
newTextItems.push(item);
|
||||
var words = item.text.split(' ');
|
||||
var changedWords = [];
|
||||
var change = false;
|
||||
words.forEach(word => {
|
||||
if (word.startsWith('http:')) {
|
||||
changedWords.push(`[${word}](${word})`);
|
||||
change = true;
|
||||
} else if (word.startsWith('www.')) {
|
||||
changedWords.push(`[http://${word}](http://${word})`);
|
||||
change = true;
|
||||
} else {
|
||||
changedWords.push(word);
|
||||
}
|
||||
});
|
||||
if (change) {
|
||||
newTextItems.push(new TextItem({
|
||||
...item,
|
||||
text: changedWords.join(' '),
|
||||
annotation: ADDED_ANNOTATION,
|
||||
}));
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
}
|
||||
});
|
||||
page.textItems = newTextItems;
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
parseResult.content.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
}
|
@ -1,51 +0,0 @@
|
||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
||||
import TextItem from '../TextItem.jsx';
|
||||
import ParseResult from '../ParseResult.jsx';
|
||||
|
||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
||||
|
||||
export default class RemoveWhitespaces extends ToPdfViewTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Remove Whitespaces");
|
||||
this.showWhitespaces = true;
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
parseResult.content.forEach(page => {
|
||||
const newTextItems = [];
|
||||
page.textItems.forEach(item => {
|
||||
newTextItems.push(item);
|
||||
var words = item.text.trim().split(' ');
|
||||
var changedWords = [];
|
||||
var change = false;
|
||||
words.forEach(word => {
|
||||
if (word.length == 0) {
|
||||
change = true;
|
||||
} else {
|
||||
changedWords.push(word);
|
||||
}
|
||||
});
|
||||
if (change) {
|
||||
newTextItems.push(new TextItem({
|
||||
...item,
|
||||
text: changedWords.join(' '),
|
||||
annotation: ADDED_ANNOTATION,
|
||||
}));
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
}
|
||||
});
|
||||
page.textItems = newTextItems;
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
completeTransform(parseResult:ParseResult) {
|
||||
parseResult.content.forEach(page => {
|
||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
||||
});
|
||||
return parseResult;
|
||||
}
|
||||
|
||||
}
|
@ -12,7 +12,7 @@ import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||
export default class CompactLines extends ToTextItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Compact Lines");
|
||||
super("Compact To Lines");
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
@ -29,28 +29,30 @@ export default class CompactLines extends ToTextItemTransformation {
|
||||
const newItems = [];
|
||||
const textItemsGroupedByLine = lineGrouper.group(page.items);
|
||||
textItemsGroupedByLine.forEach(textItemsOfLine => {
|
||||
var lineItem;
|
||||
if (textItemsOfLine.length == 1) {
|
||||
newItems.push(textItemsOfLine[0]);
|
||||
lineItem = textItemsOfLine[0];
|
||||
} else {
|
||||
textItemsOfLine.forEach(item => {
|
||||
item.annotation = REMOVED_ANNOTATION;
|
||||
newItems.push(item);
|
||||
});
|
||||
|
||||
const combinedItem = lineCompactor.compact(textItemsOfLine);
|
||||
combinedItem.annotation = ADDED_ANNOTATION;
|
||||
newItems.push(combinedItem);
|
||||
lineItem = lineCompactor.compact(textItemsOfLine);
|
||||
lineItem.annotation = ADDED_ANNOTATION;
|
||||
|
||||
if (combinedItem.parsedElements.footnoteLinks.length > 0) {
|
||||
const footnoteLinks = combinedItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
|
||||
if (lineItem.parsedElements.footnoteLinks.length > 0) {
|
||||
const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
|
||||
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
|
||||
}
|
||||
if (combinedItem.parsedElements.footnotes.length > 0) {
|
||||
combinedItem.type = ElementType.FOOTNOTES;
|
||||
const footnotes = combinedItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
||||
if (lineItem.parsedElements.footnotes.length > 0) {
|
||||
lineItem.type = ElementType.FOOTNOTES;
|
||||
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
||||
foundFootnotes.push.apply(foundFootnotes, footnotes);
|
||||
}
|
||||
}
|
||||
lineItem.text = lineItem.text.trim();
|
||||
newItems.push(lineItem);
|
||||
});
|
||||
page.items = newItems;
|
||||
}
|
||||
|
@ -0,0 +1,75 @@
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import TextItem from '../../TextItem.jsx';
|
||||
import { ParsedElements } from '../../PageItem.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||
|
||||
|
||||
// Remove whitespace, detect links, etc...
|
||||
export default class PostprocessLines extends ToTextItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Remove Whitespace & Detect Links");
|
||||
this.showWhitespaces = true;
|
||||
}
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
var strippedWhitespace = 0;
|
||||
var foundLinks = 0;
|
||||
|
||||
parseResult.pages.forEach(page => {
|
||||
const newItems = [];
|
||||
page.items.forEach(lineItem => {
|
||||
newItems.push(lineItem);
|
||||
var words = lineItem.text.split(' ');
|
||||
var newWords = [];
|
||||
var foundSuperflousNewLine = false;
|
||||
var foundLink = false;
|
||||
words.forEach(word => {
|
||||
if (word.trim().length == 0) {
|
||||
foundSuperflousNewLine = true;
|
||||
strippedWhitespace++;
|
||||
} else {
|
||||
if (word.startsWith('http:')) {
|
||||
foundLinks++;
|
||||
foundLink = true;
|
||||
newWords.push(`[${word}](${word})`);
|
||||
} else if (word.startsWith('www.')) {
|
||||
foundLinks++;
|
||||
foundLink = true;
|
||||
newWords.push(`[http://${word}](http://${word})`);
|
||||
} else {
|
||||
newWords.push(word);
|
||||
}
|
||||
}
|
||||
});
|
||||
if (foundSuperflousNewLine || foundLink) {
|
||||
lineItem.annotation = REMOVED_ANNOTATION;
|
||||
if (newWords.length > 0) {
|
||||
newItems.push(new TextItem({
|
||||
...lineItem,
|
||||
text: newWords.join(' '),
|
||||
annotation: ADDED_ANNOTATION,
|
||||
parsedElements: new ParsedElements({
|
||||
...lineItem.parsedElements,
|
||||
containLinks: foundLink
|
||||
})
|
||||
}));
|
||||
}
|
||||
}
|
||||
});
|
||||
page.items = newItems;
|
||||
});
|
||||
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: [
|
||||
'Stripped ' + strippedWhitespace + ' superflous whitespaces',
|
||||
'Found ' + foundLinks + ' links',
|
||||
]
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user