mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-25 01:03:59 +01:00
[WIP] Add remove whitespace and detect links again
This commit is contained in:
parent
4600dc6ee7
commit
07e7fbb505
@ -54,7 +54,7 @@ export default class TextItemTable extends React.Component {
|
|||||||
</div>
|
</div>
|
||||||
<div style={ { textAlign: 'center', color: 'orange' } }>
|
<div style={ { textAlign: 'center', color: 'orange' } }>
|
||||||
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
|
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
|
||||||
{ textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }
|
{ textItem.parsedElements && textItem.parsedElements.containLinks ? 'Link' : '' }
|
||||||
</div>
|
</div>
|
||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
|
@ -4,6 +4,7 @@ import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStat
|
|||||||
import CompactLines from './transformations/textitem/CompactLines.jsx';
|
import CompactLines from './transformations/textitem/CompactLines.jsx';
|
||||||
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
|
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
|
||||||
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
|
||||||
|
import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
|
||||||
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
|
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
|
||||||
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
|
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
|
||||||
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
||||||
@ -11,11 +12,7 @@ import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
|
|||||||
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
|
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
|
||||||
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
|
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
|
||||||
import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx'
|
import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx'
|
||||||
// import DetectHeadlines from './transformations/textitemblock/DetectHeadlines.jsx'
|
|
||||||
// import DetectFormats from './transformations/DetectFormats.jsx'
|
// import DetectFormats from './transformations/DetectFormats.jsx'
|
||||||
// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
|
|
||||||
// import DetectLinks from './transformations/DetectLinks.jsx'
|
|
||||||
// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
|
|
||||||
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
|
||||||
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
|
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
|
||||||
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
import ToMarkdown from './transformations/ToMarkdown.jsx'
|
||||||
@ -33,6 +30,7 @@ export default class AppState {
|
|||||||
new CompactLines(),
|
new CompactLines(),
|
||||||
new RemoveRepetitiveElements(),
|
new RemoveRepetitiveElements(),
|
||||||
new VerticalToHorizontal(),
|
new VerticalToHorizontal(),
|
||||||
|
new PostprocessLines(),
|
||||||
new DetectTOC(),
|
new DetectTOC(),
|
||||||
new DetectListItems(),
|
new DetectListItems(),
|
||||||
new DetectHeaders(),
|
new DetectHeaders(),
|
||||||
@ -40,12 +38,8 @@ export default class AppState {
|
|||||||
new GatherBlocks(),
|
new GatherBlocks(),
|
||||||
new DetectCodeQuoteBlocks(),
|
new DetectCodeQuoteBlocks(),
|
||||||
new DetectListLevels(),
|
new DetectListLevels(),
|
||||||
// new DetectHeadlines(),
|
|
||||||
|
|
||||||
// new DetectFormats(),
|
// new DetectFormats(),
|
||||||
// new RemoveWhitespaces(),
|
|
||||||
// new DetectLinks(),
|
|
||||||
// new HeadlineDetector(),
|
|
||||||
// new HeadlineToUppercase(),
|
// new HeadlineToUppercase(),
|
||||||
new ToTextBlocks(),
|
new ToTextBlocks(),
|
||||||
new ToMarkdown()];
|
new ToMarkdown()];
|
||||||
|
@ -15,13 +15,15 @@ export default class PageItem {
|
|||||||
export class ParsedElements {
|
export class ParsedElements {
|
||||||
|
|
||||||
constructor(options) {
|
constructor(options) {
|
||||||
this.footnoteLinks = options.footnoteLinks;
|
this.footnoteLinks = options.footnoteLinks || [];
|
||||||
this.footnotes = options.footnotes;
|
this.footnotes = options.footnotes || [];
|
||||||
|
this.containLinks = options.containLinks;
|
||||||
}
|
}
|
||||||
|
|
||||||
add(parsedElements) {
|
add(parsedElements) {
|
||||||
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
|
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
|
||||||
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
|
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
|
||||||
|
this.containLinks = this.containLinks || parsedElements.containLinks;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -51,10 +51,6 @@ export default class TextItemLineCompactor {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
combinedItem.parsedElements = parsedElements;
|
combinedItem.parsedElements = parsedElements;
|
||||||
|
|
||||||
//TODO whitespace removal
|
|
||||||
//TODO bold/emphasis
|
|
||||||
|
|
||||||
return combinedItem;
|
return combinedItem;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,54 +0,0 @@
|
|||||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
|
||||||
import TextItem from '../TextItem.jsx';
|
|
||||||
import ParseResult from '../ParseResult.jsx';
|
|
||||||
|
|
||||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
|
||||||
|
|
||||||
export default class DetectLinks extends ToPdfViewTransformation {
|
|
||||||
|
|
||||||
constructor() {
|
|
||||||
super("Detect Links");
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
|
||||||
parseResult.content.forEach(page => {
|
|
||||||
const newTextItems = [];
|
|
||||||
page.textItems.forEach(item => {
|
|
||||||
newTextItems.push(item);
|
|
||||||
var words = item.text.split(' ');
|
|
||||||
var changedWords = [];
|
|
||||||
var change = false;
|
|
||||||
words.forEach(word => {
|
|
||||||
if (word.startsWith('http:')) {
|
|
||||||
changedWords.push(`[${word}](${word})`);
|
|
||||||
change = true;
|
|
||||||
} else if (word.startsWith('www.')) {
|
|
||||||
changedWords.push(`[http://${word}](http://${word})`);
|
|
||||||
change = true;
|
|
||||||
} else {
|
|
||||||
changedWords.push(word);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
if (change) {
|
|
||||||
newTextItems.push(new TextItem({
|
|
||||||
...item,
|
|
||||||
text: changedWords.join(' '),
|
|
||||||
annotation: ADDED_ANNOTATION,
|
|
||||||
}));
|
|
||||||
item.annotation = REMOVED_ANNOTATION;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
page.textItems = newTextItems;
|
|
||||||
});
|
|
||||||
return parseResult;
|
|
||||||
}
|
|
||||||
|
|
||||||
completeTransform(parseResult:ParseResult) {
|
|
||||||
parseResult.content.forEach(page => {
|
|
||||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
|
||||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
|
||||||
});
|
|
||||||
return parseResult;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,51 +0,0 @@
|
|||||||
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
|
|
||||||
import TextItem from '../TextItem.jsx';
|
|
||||||
import ParseResult from '../ParseResult.jsx';
|
|
||||||
|
|
||||||
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
|
|
||||||
|
|
||||||
export default class RemoveWhitespaces extends ToPdfViewTransformation {
|
|
||||||
|
|
||||||
constructor() {
|
|
||||||
super("Remove Whitespaces");
|
|
||||||
this.showWhitespaces = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
|
||||||
parseResult.content.forEach(page => {
|
|
||||||
const newTextItems = [];
|
|
||||||
page.textItems.forEach(item => {
|
|
||||||
newTextItems.push(item);
|
|
||||||
var words = item.text.trim().split(' ');
|
|
||||||
var changedWords = [];
|
|
||||||
var change = false;
|
|
||||||
words.forEach(word => {
|
|
||||||
if (word.length == 0) {
|
|
||||||
change = true;
|
|
||||||
} else {
|
|
||||||
changedWords.push(word);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
if (change) {
|
|
||||||
newTextItems.push(new TextItem({
|
|
||||||
...item,
|
|
||||||
text: changedWords.join(' '),
|
|
||||||
annotation: ADDED_ANNOTATION,
|
|
||||||
}));
|
|
||||||
item.annotation = REMOVED_ANNOTATION;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
page.textItems = newTextItems;
|
|
||||||
});
|
|
||||||
return parseResult;
|
|
||||||
}
|
|
||||||
|
|
||||||
completeTransform(parseResult:ParseResult) {
|
|
||||||
parseResult.content.forEach(page => {
|
|
||||||
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
|
|
||||||
page.textItems.forEach(textItem => textItem.annotation = null)
|
|
||||||
});
|
|
||||||
return parseResult;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -12,7 +12,7 @@ import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
|||||||
export default class CompactLines extends ToTextItemTransformation {
|
export default class CompactLines extends ToTextItemTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Compact Lines");
|
super("Compact To Lines");
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
@ -29,28 +29,30 @@ export default class CompactLines extends ToTextItemTransformation {
|
|||||||
const newItems = [];
|
const newItems = [];
|
||||||
const textItemsGroupedByLine = lineGrouper.group(page.items);
|
const textItemsGroupedByLine = lineGrouper.group(page.items);
|
||||||
textItemsGroupedByLine.forEach(textItemsOfLine => {
|
textItemsGroupedByLine.forEach(textItemsOfLine => {
|
||||||
|
var lineItem;
|
||||||
if (textItemsOfLine.length == 1) {
|
if (textItemsOfLine.length == 1) {
|
||||||
newItems.push(textItemsOfLine[0]);
|
lineItem = textItemsOfLine[0];
|
||||||
} else {
|
} else {
|
||||||
textItemsOfLine.forEach(item => {
|
textItemsOfLine.forEach(item => {
|
||||||
item.annotation = REMOVED_ANNOTATION;
|
item.annotation = REMOVED_ANNOTATION;
|
||||||
newItems.push(item);
|
newItems.push(item);
|
||||||
});
|
});
|
||||||
|
|
||||||
const combinedItem = lineCompactor.compact(textItemsOfLine);
|
lineItem = lineCompactor.compact(textItemsOfLine);
|
||||||
combinedItem.annotation = ADDED_ANNOTATION;
|
lineItem.annotation = ADDED_ANNOTATION;
|
||||||
newItems.push(combinedItem);
|
|
||||||
|
|
||||||
if (combinedItem.parsedElements.footnoteLinks.length > 0) {
|
if (lineItem.parsedElements.footnoteLinks.length > 0) {
|
||||||
const footnoteLinks = combinedItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
|
const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
|
||||||
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
|
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
|
||||||
}
|
}
|
||||||
if (combinedItem.parsedElements.footnotes.length > 0) {
|
if (lineItem.parsedElements.footnotes.length > 0) {
|
||||||
combinedItem.type = ElementType.FOOTNOTES;
|
lineItem.type = ElementType.FOOTNOTES;
|
||||||
const footnotes = combinedItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
|
||||||
foundFootnotes.push.apply(foundFootnotes, footnotes);
|
foundFootnotes.push.apply(foundFootnotes, footnotes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
lineItem.text = lineItem.text.trim();
|
||||||
|
newItems.push(lineItem);
|
||||||
});
|
});
|
||||||
page.items = newItems;
|
page.items = newItems;
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,75 @@
|
|||||||
|
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||||
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
|
import TextItem from '../../TextItem.jsx';
|
||||||
|
import { ParsedElements } from '../../PageItem.jsx';
|
||||||
|
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
|
|
||||||
|
|
||||||
|
// Remove whitespace, detect links, etc...
|
||||||
|
export default class PostprocessLines extends ToTextItemTransformation {
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
super("Remove Whitespace & Detect Links");
|
||||||
|
this.showWhitespaces = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
transform(parseResult:ParseResult) {
|
||||||
|
var strippedWhitespace = 0;
|
||||||
|
var foundLinks = 0;
|
||||||
|
|
||||||
|
parseResult.pages.forEach(page => {
|
||||||
|
const newItems = [];
|
||||||
|
page.items.forEach(lineItem => {
|
||||||
|
newItems.push(lineItem);
|
||||||
|
var words = lineItem.text.split(' ');
|
||||||
|
var newWords = [];
|
||||||
|
var foundSuperflousNewLine = false;
|
||||||
|
var foundLink = false;
|
||||||
|
words.forEach(word => {
|
||||||
|
if (word.trim().length == 0) {
|
||||||
|
foundSuperflousNewLine = true;
|
||||||
|
strippedWhitespace++;
|
||||||
|
} else {
|
||||||
|
if (word.startsWith('http:')) {
|
||||||
|
foundLinks++;
|
||||||
|
foundLink = true;
|
||||||
|
newWords.push(`[${word}](${word})`);
|
||||||
|
} else if (word.startsWith('www.')) {
|
||||||
|
foundLinks++;
|
||||||
|
foundLink = true;
|
||||||
|
newWords.push(`[http://${word}](http://${word})`);
|
||||||
|
} else {
|
||||||
|
newWords.push(word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (foundSuperflousNewLine || foundLink) {
|
||||||
|
lineItem.annotation = REMOVED_ANNOTATION;
|
||||||
|
if (newWords.length > 0) {
|
||||||
|
newItems.push(new TextItem({
|
||||||
|
...lineItem,
|
||||||
|
text: newWords.join(' '),
|
||||||
|
annotation: ADDED_ANNOTATION,
|
||||||
|
parsedElements: new ParsedElements({
|
||||||
|
...lineItem.parsedElements,
|
||||||
|
containLinks: foundLink
|
||||||
|
})
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
page.items = newItems;
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
return new ParseResult({
|
||||||
|
...parseResult,
|
||||||
|
messages: [
|
||||||
|
'Stripped ' + strippedWhitespace + ' superflous whitespaces',
|
||||||
|
'Found ' + foundLinks + ' links',
|
||||||
|
]
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user