[WIP] Add remove whitespace and detect links again

This commit is contained in:
Johannes Zillmann 2017-03-18 08:56:08 +01:00
parent 4600dc6ee7
commit 07e7fbb505
8 changed files with 95 additions and 131 deletions

View File

@ -54,7 +54,7 @@ export default class TextItemTable extends React.Component {
</div>
<div style={ { textAlign: 'center', color: 'orange' } }>
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
{ textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }
{ textItem.parsedElements && textItem.parsedElements.containLinks ? 'Link' : '' }
</div>
</td>
<td>

View File

@ -4,6 +4,7 @@ import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStat
import CompactLines from './transformations/textitem/CompactLines.jsx';
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
@ -11,11 +12,7 @@ import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx'
// import DetectHeadlines from './transformations/textitemblock/DetectHeadlines.jsx'
// import DetectFormats from './transformations/DetectFormats.jsx'
// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
// import DetectLinks from './transformations/DetectLinks.jsx'
// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
import ToMarkdown from './transformations/ToMarkdown.jsx'
@ -33,6 +30,7 @@ export default class AppState {
new CompactLines(),
new RemoveRepetitiveElements(),
new VerticalToHorizontal(),
new PostprocessLines(),
new DetectTOC(),
new DetectListItems(),
new DetectHeaders(),
@ -40,12 +38,8 @@ export default class AppState {
new GatherBlocks(),
new DetectCodeQuoteBlocks(),
new DetectListLevels(),
// new DetectHeadlines(),
// new DetectFormats(),
// new RemoveWhitespaces(),
// new DetectLinks(),
// new HeadlineDetector(),
// new HeadlineToUppercase(),
new ToTextBlocks(),
new ToMarkdown()];

View File

@ -15,13 +15,15 @@ export default class PageItem {
export class ParsedElements {
constructor(options) {
this.footnoteLinks = options.footnoteLinks;
this.footnotes = options.footnotes;
this.footnoteLinks = options.footnoteLinks || [];
this.footnotes = options.footnotes || [];
this.containLinks = options.containLinks;
}
add(parsedElements) {
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
this.containLinks = this.containLinks || parsedElements.containLinks;
}
}

View File

@ -51,10 +51,6 @@ export default class TextItemLineCompactor {
});
}
combinedItem.parsedElements = parsedElements;
//TODO whitespace removal
//TODO bold/emphasis
return combinedItem;
}

View File

@ -1,54 +0,0 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import TextItem from '../TextItem.jsx';
import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
export default class DetectLinks extends ToPdfViewTransformation {
constructor() {
super("Detect Links");
}
transform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
const newTextItems = [];
page.textItems.forEach(item => {
newTextItems.push(item);
var words = item.text.split(' ');
var changedWords = [];
var change = false;
words.forEach(word => {
if (word.startsWith('http:')) {
changedWords.push(`[${word}](${word})`);
change = true;
} else if (word.startsWith('www.')) {
changedWords.push(`[http://${word}](http://${word})`);
change = true;
} else {
changedWords.push(word);
}
});
if (change) {
newTextItems.push(new TextItem({
...item,
text: changedWords.join(' '),
annotation: ADDED_ANNOTATION,
}));
item.annotation = REMOVED_ANNOTATION;
}
});
page.textItems = newTextItems;
});
return parseResult;
}
completeTransform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
page.textItems.forEach(textItem => textItem.annotation = null)
});
return parseResult;
}
}

View File

@ -1,51 +0,0 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import TextItem from '../TextItem.jsx';
import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
export default class RemoveWhitespaces extends ToPdfViewTransformation {
constructor() {
super("Remove Whitespaces");
this.showWhitespaces = true;
}
transform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
const newTextItems = [];
page.textItems.forEach(item => {
newTextItems.push(item);
var words = item.text.trim().split(' ');
var changedWords = [];
var change = false;
words.forEach(word => {
if (word.length == 0) {
change = true;
} else {
changedWords.push(word);
}
});
if (change) {
newTextItems.push(new TextItem({
...item,
text: changedWords.join(' '),
annotation: ADDED_ANNOTATION,
}));
item.annotation = REMOVED_ANNOTATION;
}
});
page.textItems = newTextItems;
});
return parseResult;
}
completeTransform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
page.textItems.forEach(textItem => textItem.annotation = null)
});
return parseResult;
}
}

View File

@ -12,7 +12,7 @@ import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
export default class CompactLines extends ToTextItemTransformation {
constructor() {
super("Compact Lines");
super("Compact To Lines");
}
transform(parseResult:ParseResult) {
@ -29,28 +29,30 @@ export default class CompactLines extends ToTextItemTransformation {
const newItems = [];
const textItemsGroupedByLine = lineGrouper.group(page.items);
textItemsGroupedByLine.forEach(textItemsOfLine => {
var lineItem;
if (textItemsOfLine.length == 1) {
newItems.push(textItemsOfLine[0]);
lineItem = textItemsOfLine[0];
} else {
textItemsOfLine.forEach(item => {
item.annotation = REMOVED_ANNOTATION;
newItems.push(item);
});
const combinedItem = lineCompactor.compact(textItemsOfLine);
combinedItem.annotation = ADDED_ANNOTATION;
newItems.push(combinedItem);
lineItem = lineCompactor.compact(textItemsOfLine);
lineItem.annotation = ADDED_ANNOTATION;
if (combinedItem.parsedElements.footnoteLinks.length > 0) {
const footnoteLinks = combinedItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
if (lineItem.parsedElements.footnoteLinks.length > 0) {
const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
}
if (combinedItem.parsedElements.footnotes.length > 0) {
combinedItem.type = ElementType.FOOTNOTES;
const footnotes = combinedItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
if (lineItem.parsedElements.footnotes.length > 0) {
lineItem.type = ElementType.FOOTNOTES;
const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
foundFootnotes.push.apply(foundFootnotes, footnotes);
}
}
lineItem.text = lineItem.text.trim();
newItems.push(lineItem);
});
page.items = newItems;
}

View File

@ -0,0 +1,75 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import TextItem from '../../TextItem.jsx';
import { ParsedElements } from '../../PageItem.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
// Remove whitespace, detect links, etc...
export default class PostprocessLines extends ToTextItemTransformation {
constructor() {
super("Remove Whitespace & Detect Links");
this.showWhitespaces = true;
}
transform(parseResult:ParseResult) {
var strippedWhitespace = 0;
var foundLinks = 0;
parseResult.pages.forEach(page => {
const newItems = [];
page.items.forEach(lineItem => {
newItems.push(lineItem);
var words = lineItem.text.split(' ');
var newWords = [];
var foundSuperflousNewLine = false;
var foundLink = false;
words.forEach(word => {
if (word.trim().length == 0) {
foundSuperflousNewLine = true;
strippedWhitespace++;
} else {
if (word.startsWith('http:')) {
foundLinks++;
foundLink = true;
newWords.push(`[${word}](${word})`);
} else if (word.startsWith('www.')) {
foundLinks++;
foundLink = true;
newWords.push(`[http://${word}](http://${word})`);
} else {
newWords.push(word);
}
}
});
if (foundSuperflousNewLine || foundLink) {
lineItem.annotation = REMOVED_ANNOTATION;
if (newWords.length > 0) {
newItems.push(new TextItem({
...lineItem,
text: newWords.join(' '),
annotation: ADDED_ANNOTATION,
parsedElements: new ParsedElements({
...lineItem.parsedElements,
containLinks: foundLink
})
}));
}
}
});
page.items = newItems;
});
return new ParseResult({
...parseResult,
messages: [
'Stripped ' + strippedWhitespace + ' superflous whitespaces',
'Found ' + foundLinks + ' links',
]
});
}
}